Merge git://git.kernel.org/pub/scm/linux/kernel/git/bunk/trivial
* git://git.kernel.org/pub/scm/linux/kernel/git/bunk/trivial: (25 commits)
sound: convert "sound" subdirectory to UTF-8
MAINTAINERS: Add cxacru website/mailing list
include files: convert "include" subdirectory to UTF-8
general: convert "kernel" subdirectory to UTF-8
documentation: convert the Documentation directory to UTF-8
Convert the toplevel files CREDITS and MAINTAINERS to UTF-8.
remove broken URLs from net drivers' output
Magic number prefix consistency change to Documentation/magic-number.txt
trivial: s/i_sem /i_mutex/
fix file specification in comments
drivers/base/platform.c: fix small typo in doc
misc doc and kconfig typos
Remove obsolete fat_cvf help text
Fix occurrences of "the the "
Fix minor typoes in kernel/module.c
Kconfig: Remove reference to external mqueue library
Kconfig: A couple of grammatical fixes in arch/i386/Kconfig
Correct comments in genrtc.c to refer to correct /proc file.
Fix more "deprecated" spellos.
Fix "deprecated" typoes.
...
Fix trivial comment conflict in kernel/relay.c.
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index bd23dc0..6491b2c 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -80,3 +80,7 @@
23: Tested after it has been merged into the -mm patchset to make sure
that it still works with all of the other queued patches and various
changes in the VM, VFS, and other subsystems.
+
+24: Avoid whitespace damage such as indenting with spaces or whitespace
+ at the end of lines. You can test this by feeding the patch to
+ "git apply --check --whitespace=error-all"
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index b0d0043..a417b25 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -363,7 +363,8 @@
The "summary phrase" in the email's Subject should concisely
describe the patch which that email contains. The "summary
phrase" should not be a filename. Do not use the same "summary
-phrase" for every patch in a whole patch series.
+phrase" for every patch in a whole patch series (where a "patch
+series" is an ordered sequence of multiple, related patches).
Bear in mind that the "summary phrase" of your email becomes
a globally-unique identifier for that patch. It propagates
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index cc60d29..b6d24c2 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -217,14 +217,17 @@
A: The following happen, listed in no particular order :-)
- A notification is sent to in-kernel registered modules by sending an event
- CPU_DOWN_PREPARE
+ CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
+ CPU is being offlined while tasks are frozen due to a suspend operation in
+ progress
- All process is migrated away from this outgoing CPU to a new CPU
- All interrupts targeted to this CPU is migrated to a new CPU
- timers/bottom half/task lets are also migrated to a new CPU
- Once all services are migrated, kernel calls an arch specific routine
__cpu_disable() to perform arch specific cleanup.
- Once this is successful, an event for successful cleanup is sent by an event
- CPU_DEAD.
+ CPU_DEAD (or CPU_DEAD_FROZEN if tasks are frozen due to a suspend while the
+ CPU is being offlined).
"It is expected that each service cleans up when the CPU_DOWN_PREPARE
notifier is called, when CPU_DEAD is called its expected there is nothing
@@ -242,9 +245,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
foobar_online_action(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
foobar_dead_action(cpu);
break;
}
diff --git a/Documentation/device-mapper/delay.txt b/Documentation/device-mapper/delay.txt
new file mode 100644
index 0000000..15adc55
--- /dev/null
+++ b/Documentation/device-mapper/delay.txt
@@ -0,0 +1,26 @@
+dm-delay
+========
+
+Device-Mapper's "delay" target delays reads and/or writes
+and maps them to different devices.
+
+Parameters:
+ <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+
+With separate write parameters, the first set is only used for reads.
+Delays are specified in milliseconds.
+
+Example scripts
+===============
+[[
+#!/bin/sh
+# Create device delaying rw operation for 500ms
+echo "0 `blockdev --getsize $1` delay $1 0 500" | dmsetup create delayed
+]]
+
+[[
+#!/bin/sh
+# Create device delaying only write operation for 500ms and
+# splitting reads and writes to different devices $1 $2
+echo "0 `blockdev --getsize $1` delay $1 0 0 $2 0 500" | dmsetup create delayed
+]]
diff --git a/Documentation/fb/arkfb.txt b/Documentation/fb/arkfb.txt
new file mode 100644
index 0000000..e8487a9
--- /dev/null
+++ b/Documentation/fb/arkfb.txt
@@ -0,0 +1,68 @@
+
+ arkfb - fbdev driver for ARK Logic chips
+ ========================================
+
+
+Supported Hardware
+==================
+
+ ARK 2000PV chip
+ ICS 5342 ramdac
+
+ - only BIOS initialized VGA devices supported
+ - probably not working on big endian
+
+
+Supported Features
+==================
+
+ * 4 bpp pseudocolor modes (with 18bit palette, two variants)
+ * 8 bpp pseudocolor mode (with 18bit palette)
+ * 16 bpp truecolor modes (RGB 555 and RGB 565)
+ * 24 bpp truecolor mode (RGB 888)
+ * 32 bpp truecolor mode (RGB 888)
+ * text mode (activated by bpp = 0)
+ * doublescan mode variant (not available in text mode)
+ * panning in both directions
+ * suspend/resume support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (i got maximum about 70 MHz, it is dependent on specific
+hardware). This limitation is not enforced by driver. Text mode supports 8bit
+wide fonts only (hardware limitation) and 16bit tall fonts (driver
+limitation). Unfortunately character attributes (like color) in text mode are
+broken for unknown reason, so its usefulness is limited.
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+ * secondary (not initialized by BIOS) device support
+ * big endian support
+ * DPMS support
+ * MMIO support
+ * interlaced mode variant
+ * support for fontwidths != 8 in 4 bpp modes
+ * support for fontheight != 16 in text mode
+ * hardware cursor
+ * vsync synchronization
+ * feature connector support
+ * acceleration support (8514-like 2D)
+
+
+Known bugs
+==========
+
+ * character attributes (and cursor) in text mode are broken
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/fb/vt8623fb.txt b/Documentation/fb/vt8623fb.txt
new file mode 100644
index 0000000..f654576
--- /dev/null
+++ b/Documentation/fb/vt8623fb.txt
@@ -0,0 +1,64 @@
+
+ vt8623fb - fbdev driver for graphics core in VIA VT8623 chipset
+ ===============================================================
+
+
+Supported Hardware
+==================
+
+ VIA VT8623 [CLE266] chipset and its graphics core
+ (known as CastleRock or Unichrome)
+
+I tested vt8623fb on VIA EPIA ML-6000
+
+
+Supported Features
+==================
+
+ * 4 bpp pseudocolor modes (with 18bit palette, two variants)
+ * 8 bpp pseudocolor mode (with 18bit palette)
+ * 16 bpp truecolor mode (RGB 565)
+ * 32 bpp truecolor mode (RGB 888)
+ * text mode (activated by bpp = 0)
+ * doublescan mode variant (not available in text mode)
+ * panning in both directions
+ * suspend/resume support
+ * DPMS support
+
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum about 100 MHz). This limitation is not enforced by
+driver. Text mode supports 8bit wide fonts only (hardware limitation) and
+16bit tall fonts (driver limitation).
+
+There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
+packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
+with interleaved planes (1 byte interleave), MSB first. Both modes support
+8bit wide fonts only (driver limitation).
+
+Suspend/resume works on systems that initialize video card during resume and
+if device is active (for example used by fbcon).
+
+
+Missing Features
+================
+(alias TODO list)
+
+ * secondary (not initialized by BIOS) device support
+ * MMIO support
+ * interlaced mode variant
+ * support for fontwidths != 8 in 4 bpp modes
+ * support for fontheight != 16 in text mode
+ * hardware cursor
+ * video overlay support
+ * vsync synchronization
+ * acceleration support (8514-like 2D, busmaster transfers)
+
+
+Known bugs
+==========
+
+ * cursor disable in text mode doesn't work
+
+
+--
+Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 2202f5d..5818628 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -178,6 +178,21 @@
The size should be at least PAGE_SIZE (4k) and should be a power
of 2. This can only be set while assembling an array
+ layout
+ The "layout" for the array for the particular level. This is
+ simply a number that is interpretted differently by different
+ levels. It can be written while assembling an array.
+
+ reshape_position
+ This is either "none" or a sector number within the devices of
+ the array where "reshape" is up to. If this is set, the three
+ attributes mentioned above (raid_disks, chunk_size, layout) can
+ potentially have 2 values, an old and a new value. If these
+ values differ, reading the attribute returns
+ new (old)
+ and writing will effect the 'new' value, leaving the 'old'
+ unchanged.
+
component_size
For arrays with data redundancy (i.e. not raid0, linear, faulty,
multipath), all components must be the same size - or at least
@@ -193,11 +208,6 @@
1.2 (newer format in varying locations) or "none" indicating that
the kernel isn't managing metadata at all.
- layout
- The "layout" for the array for the particular level. This is
- simply a number that is interpretted differently by different
- levels. It can be written while assembling an array.
-
resync_start
The point at which resync should start. If no resync is needed,
this will be a very large number. At array creation it will
@@ -259,29 +269,6 @@
like active, but no writes have been seen for a while (safe_mode_delay).
- sync_speed_min
- sync_speed_max
- This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
- however they only apply to the particular array.
- If no value has been written to these, of if the word 'system'
- is written, then the system-wide value is used. If a value,
- in kibibytes-per-second is written, then it is used.
- When the files are read, they show the currently active value
- followed by "(local)" or "(system)" depending on whether it is
- a locally set or system-wide value.
-
- sync_completed
- This shows the number of sectors that have been completed of
- whatever the current sync_action is, followed by the number of
- sectors in total that could need to be processed. The two
- numbers are separated by a '/' thus effectively showing one
- value, a fraction of the process that is complete.
-
- sync_speed
- This shows the current actual speed, in K/sec, of the current
- sync_action. It is averaged over the last 30 seconds.
-
-
As component devices are added to an md array, they appear in the 'md'
directory as new directories named
dev-XXX
@@ -412,6 +399,35 @@
Note that the numbers are 'bit' numbers, not 'block' numbers.
They should be scaled by the bitmap_chunksize.
+ sync_speed_min
+ sync_speed_max
+ This are similar to /proc/sys/dev/raid/speed_limit_{min,max}
+ however they only apply to the particular array.
+ If no value has been written to these, of if the word 'system'
+ is written, then the system-wide value is used. If a value,
+ in kibibytes-per-second is written, then it is used.
+ When the files are read, they show the currently active value
+ followed by "(local)" or "(system)" depending on whether it is
+ a locally set or system-wide value.
+
+ sync_completed
+ This shows the number of sectors that have been completed of
+ whatever the current sync_action is, followed by the number of
+ sectors in total that could need to be processed. The two
+ numbers are separated by a '/' thus effectively showing one
+ value, a fraction of the process that is complete.
+
+ sync_speed
+ This shows the current actual speed, in K/sec, of the current
+ sync_action. It is averaged over the last 30 seconds.
+
+ suspend_lo
+ suspend_hi
+ The two values, given as numbers of sectors, indicate a range
+ within the array where IO will be blocked. This is currently
+ only supported for raid4/5/6.
+
+
Each active md device may also have attributes specific to the
personality module that manages it.
These are specific to the implementation of the module and could
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index 000556c..e00c6cf 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -93,21 +93,23 @@
to resume the system from RAM if there's enough battery power or restore
its state on the basis of the saved suspend image otherwise)
-SNAPSHOT_PMOPS - enable the usage of the pmops->prepare, pmops->enter and
- pmops->finish methods (the in-kernel swsusp knows these as the "platform
- method") which are needed on many machines to (among others) speed up
- the resume by letting the BIOS skip some steps or to let the system
- recognise the correct state of the hardware after the resume (in
- particular on many machines this ensures that unplugged AC
- adapters get correctly detected and that kacpid does not run wild after
- the resume). The last ioctl() argument can take one of the three
- values, defined in kernel/power/power.h:
+SNAPSHOT_PMOPS - enable the usage of the hibernation_ops->prepare,
+ hibernate_ops->enter and hibernation_ops->finish methods (the in-kernel
+ swsusp knows these as the "platform method") which are needed on many
+ machines to (among others) speed up the resume by letting the BIOS skip
+ some steps or to let the system recognise the correct state of the
+ hardware after the resume (in particular on many machines this ensures
+ that unplugged AC adapters get correctly detected and that kacpid does
+ not run wild after the resume). The last ioctl() argument can take one
+ of the three values, defined in kernel/power/power.h:
PMOPS_PREPARE - make the kernel carry out the
- pm_ops->prepare(PM_SUSPEND_DISK) operation
+ hibernation_ops->prepare() operation
PMOPS_ENTER - make the kernel power off the system by calling
- pm_ops->enter(PM_SUSPEND_DISK)
+ hibernation_ops->enter()
PMOPS_FINISH - make the kernel carry out the
- pm_ops->finish(PM_SUSPEND_DISK) operation
+ hibernation_ops->finish() operation
+ Note that the actual constants are misnamed because they surface
+ internal kernel implementation details that have changed.
The device's read() operation can be used to transfer the snapshot image from
the kernel. It has the following limitations:
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index 41710cc..686a8e0 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -16,6 +16,7 @@
#include <stdarg.h>
#include <getopt.h>
#include <regex.h>
+#include <errno.h>
#define MAX_SLABS 500
#define MAX_ALIASES 500
@@ -41,12 +42,15 @@
} aliasinfo[MAX_ALIASES];
int slabs = 0;
+int actual_slabs = 0;
int aliases = 0;
int alias_targets = 0;
int highest_node = 0;
char buffer[4096];
+int show_empty = 0;
+int show_report = 0;
int show_alias = 0;
int show_slab = 0;
int skip_zero = 1;
@@ -59,6 +63,15 @@
int show_single_ref = 0;
int show_totals = 0;
int sort_size = 0;
+int set_debug = 0;
+int show_ops = 0;
+
+/* Debug options */
+int sanity = 0;
+int redzone = 0;
+int poison = 0;
+int tracking = 0;
+int tracing = 0;
int page_size;
@@ -76,20 +89,33 @@
void usage(void)
{
- printf("slabinfo [-ahnpvtsz] [slab-regexp]\n"
+ printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n"
+ "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
"-a|--aliases Show aliases\n"
+ "-d<options>|--debug=<options> Set/Clear Debug options\n"
+ "-e|--empty Show empty slabs\n"
+ "-f|--first-alias Show first alias\n"
"-h|--help Show usage information\n"
+ "-i|--inverted Inverted list\n"
+ "-l|--slabs Show slabs\n"
"-n|--numa Show NUMA information\n"
+ "-o|--ops Show kmem_cache_ops\n"
"-s|--shrink Shrink slabs\n"
- "-v|--validate Validate slabs\n"
+ "-r|--report Detailed report on single slabs\n"
+ "-S|--Size Sort by size\n"
"-t|--tracking Show alloc/free information\n"
"-T|--Totals Show summary information\n"
- "-l|--slabs Show slabs\n"
- "-S|--Size Sort by size\n"
+ "-v|--validate Validate slabs\n"
"-z|--zero Include empty slabs\n"
- "-f|--first-alias Show first alias\n"
- "-i|--inverted Inverted list\n"
"-1|--1ref Single reference\n"
+ "\nValid debug options (FZPUT may be combined)\n"
+ "a / A Switch on all debug options (=FZUP)\n"
+ "- Switch off all debug options\n"
+ "f / F Sanity Checks (SLAB_DEBUG_FREE)\n"
+ "z / Z Redzoning\n"
+ "p / P Poisoning\n"
+ "u / U Tracking\n"
+ "t / T Tracing\n"
);
}
@@ -143,11 +169,10 @@
void set_obj(struct slabinfo *s, char *name, int n)
{
char x[100];
+ FILE *f;
sprintf(x, "%s/%s", s->name, name);
-
- FILE *f = fopen(x, "w");
-
+ f = fopen(x, "w");
if (!f)
fatal("Cannot write to %s\n", x);
@@ -155,6 +180,26 @@
fclose(f);
}
+unsigned long read_slab_obj(struct slabinfo *s, char *name)
+{
+ char x[100];
+ FILE *f;
+ int l;
+
+ sprintf(x, "%s/%s", s->name, name);
+ f = fopen(x, "r");
+ if (!f) {
+ buffer[0] = 0;
+ l = 0;
+ } else {
+ l = fread(buffer, 1, sizeof(buffer), f);
+ buffer[l] = 0;
+ fclose(f);
+ }
+ return l;
+}
+
+
/*
* Put a size string together
*/
@@ -226,7 +271,7 @@
void first_line(void)
{
- printf("Name Objects Objsize Space "
+ printf("Name Objects Objsize Space "
"Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
}
@@ -246,10 +291,7 @@
return best;
}
}
- if (best)
- return best;
- fatal("Cannot find alias for %s\n", find->name);
- return NULL;
+ return best;
}
unsigned long slab_size(struct slabinfo *s)
@@ -257,6 +299,126 @@
return s->slabs * (page_size << s->order);
}
+void slab_numa(struct slabinfo *s, int mode)
+{
+ int node;
+
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (!highest_node) {
+ printf("\n%s: No NUMA information available.\n", s->name);
+ return;
+ }
+
+ if (skip_zero && !s->slabs)
+ return;
+
+ if (!line) {
+ printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+ for(node = 0; node <= highest_node; node++)
+ printf(" %4d", node);
+ printf("\n----------------------");
+ for(node = 0; node <= highest_node; node++)
+ printf("-----");
+ printf("\n");
+ }
+ printf("%-21s ", mode ? "All slabs" : s->name);
+ for(node = 0; node <= highest_node; node++) {
+ char b[20];
+
+ store_size(b, s->numa[node]);
+ printf(" %4s", b);
+ }
+ printf("\n");
+ if (mode) {
+ printf("%-21s ", "Partial slabs");
+ for(node = 0; node <= highest_node; node++) {
+ char b[20];
+
+ store_size(b, s->numa_partial[node]);
+ printf(" %4s", b);
+ }
+ printf("\n");
+ }
+ line++;
+}
+
+void show_tracking(struct slabinfo *s)
+{
+ printf("\n%s: Kernel object allocation\n", s->name);
+ printf("-----------------------------------------------------------------------\n");
+ if (read_slab_obj(s, "alloc_calls"))
+ printf(buffer);
+ else
+ printf("No Data\n");
+
+ printf("\n%s: Kernel object freeing\n", s->name);
+ printf("------------------------------------------------------------------------\n");
+ if (read_slab_obj(s, "free_calls"))
+ printf(buffer);
+ else
+ printf("No Data\n");
+
+}
+
+void ops(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (read_slab_obj(s, "ops")) {
+ printf("\n%s: kmem_cache operations\n", s->name);
+ printf("--------------------------------------------\n");
+ printf(buffer);
+ } else
+ printf("\n%s has no kmem_cache operations\n", s->name);
+}
+
+const char *onoff(int x)
+{
+ if (x)
+ return "On ";
+ return "Off";
+}
+
+void report(struct slabinfo *s)
+{
+ if (strcmp(s->name, "*") == 0)
+ return;
+ printf("\nSlabcache: %-20s Aliases: %2d Order : %2d\n", s->name, s->aliases, s->order);
+ if (s->hwcache_align)
+ printf("** Hardware cacheline aligned\n");
+ if (s->cache_dma)
+ printf("** Memory is allocated in a special DMA zone\n");
+ if (s->destroy_by_rcu)
+ printf("** Slabs are destroyed via RCU\n");
+ if (s->reclaim_account)
+ printf("** Reclaim accounting active\n");
+
+ printf("\nSizes (bytes) Slabs Debug Memory\n");
+ printf("------------------------------------------------------------------------\n");
+ printf("Object : %7d Total : %7ld Sanity Checks : %s Total: %7ld\n",
+ s->object_size, s->slabs, onoff(s->sanity_checks),
+ s->slabs * (page_size << s->order));
+ printf("SlabObj: %7d Full : %7ld Redzoning : %s Used : %7ld\n",
+ s->slab_size, s->slabs - s->partial - s->cpu_slabs,
+ onoff(s->red_zone), s->objects * s->object_size);
+ printf("SlabSiz: %7d Partial: %7ld Poisoning : %s Loss : %7ld\n",
+ page_size << s->order, s->partial, onoff(s->poison),
+ s->slabs * (page_size << s->order) - s->objects * s->object_size);
+ printf("Loss : %7d CpuSlab: %7d Tracking : %s Lalig: %7ld\n",
+ s->slab_size - s->object_size, s->cpu_slabs, onoff(s->store_user),
+ (s->slab_size - s->object_size) * s->objects);
+ printf("Align : %7d Objects: %7d Tracing : %s Lpadd: %7ld\n",
+ s->align, s->objs_per_slab, onoff(s->trace),
+ ((page_size << s->order) - s->objs_per_slab * s->slab_size) *
+ s->slabs);
+
+ ops(s);
+ show_tracking(s);
+ slab_numa(s, 1);
+}
void slabcache(struct slabinfo *s)
{
@@ -265,7 +427,18 @@
char flags[20];
char *p = flags;
- if (skip_zero && !s->slabs)
+ if (strcmp(s->name, "*") == 0)
+ return;
+
+ if (actual_slabs == 1) {
+ report(s);
+ return;
+ }
+
+ if (skip_zero && !show_empty && !s->slabs)
+ return;
+
+ if (show_empty && s->slabs)
return;
store_size(size_str, slab_size(s));
@@ -303,48 +476,128 @@
flags);
}
-void slab_numa(struct slabinfo *s)
+/*
+ * Analyze debug options. Return false if something is amiss.
+ */
+int debug_opt_scan(char *opt)
{
- int node;
+ if (!opt || !opt[0] || strcmp(opt, "-") == 0)
+ return 1;
- if (!highest_node)
- fatal("No NUMA information available.\n");
-
- if (skip_zero && !s->slabs)
- return;
-
- if (!line) {
- printf("\nSlab Node ");
- for(node = 0; node <= highest_node; node++)
- printf(" %4d", node);
- printf("\n----------------------");
- for(node = 0; node <= highest_node; node++)
- printf("-----");
- printf("\n");
+ if (strcasecmp(opt, "a") == 0) {
+ sanity = 1;
+ poison = 1;
+ redzone = 1;
+ tracking = 1;
+ return 1;
}
- printf("%-21s ", s->name);
- for(node = 0; node <= highest_node; node++) {
- char b[20];
- store_size(b, s->numa[node]);
- printf(" %4s", b);
- }
- printf("\n");
- line++;
+ for ( ; *opt; opt++)
+ switch (*opt) {
+ case 'F' : case 'f':
+ if (sanity)
+ return 0;
+ sanity = 1;
+ break;
+ case 'P' : case 'p':
+ if (poison)
+ return 0;
+ poison = 1;
+ break;
+
+ case 'Z' : case 'z':
+ if (redzone)
+ return 0;
+ redzone = 1;
+ break;
+
+ case 'U' : case 'u':
+ if (tracking)
+ return 0;
+ tracking = 1;
+ break;
+
+ case 'T' : case 't':
+ if (tracing)
+ return 0;
+ tracing = 1;
+ break;
+ default:
+ return 0;
+ }
+ return 1;
}
-void show_tracking(struct slabinfo *s)
+int slab_empty(struct slabinfo *s)
{
- printf("\n%s: Calls to allocate a slab object\n", s->name);
- printf("---------------------------------------------------\n");
- if (read_obj("alloc_calls"))
- printf(buffer);
+ if (s->objects > 0)
+ return 0;
- printf("%s: Calls to free a slab object\n", s->name);
- printf("-----------------------------------------------\n");
- if (read_obj("free_calls"))
- printf(buffer);
+ /*
+ * We may still have slabs even if there are no objects. Shrinking will
+ * remove them.
+ */
+ if (s->slabs != 0)
+ set_obj(s, "shrink", 1);
+ return 1;
+}
+
+void slab_debug(struct slabinfo *s)
+{
+ if (sanity && !s->sanity_checks) {
+ set_obj(s, "sanity", 1);
+ }
+ if (!sanity && s->sanity_checks) {
+ if (slab_empty(s))
+ set_obj(s, "sanity", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable sanity checks\n", s->name);
+ }
+ if (redzone && !s->red_zone) {
+ if (slab_empty(s))
+ set_obj(s, "red_zone", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable redzoning\n", s->name);
+ }
+ if (!redzone && s->red_zone) {
+ if (slab_empty(s))
+ set_obj(s, "red_zone", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable redzoning\n", s->name);
+ }
+ if (poison && !s->poison) {
+ if (slab_empty(s))
+ set_obj(s, "poison", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable poisoning\n", s->name);
+ }
+ if (!poison && s->poison) {
+ if (slab_empty(s))
+ set_obj(s, "poison", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable poisoning\n", s->name);
+ }
+ if (tracking && !s->store_user) {
+ if (slab_empty(s))
+ set_obj(s, "store_user", 1);
+ else
+ fprintf(stderr, "%s not empty cannot enable tracking\n", s->name);
+ }
+ if (!tracking && s->store_user) {
+ if (slab_empty(s))
+ set_obj(s, "store_user", 0);
+ else
+ fprintf(stderr, "%s not empty cannot disable tracking\n", s->name);
+ }
+ if (tracing && !s->trace) {
+ if (slabs == 1)
+ set_obj(s, "trace", 1);
+ else
+ fprintf(stderr, "%s can only enable trace for one slab at a time\n", s->name);
+ }
+ if (!tracing && s->trace)
+ set_obj(s, "trace", 1);
}
void totals(void)
@@ -673,7 +926,7 @@
for (a = aliasinfo; a < aliasinfo + aliases; a++) {
- for(s = slabinfo; s < slabinfo + slabs; s++)
+ for (s = slabinfo; s < slabinfo + slabs; s++)
if (strcmp(a->ref, s->name) == 0) {
a->slab = s;
s->refs++;
@@ -704,7 +957,7 @@
continue;
}
}
- printf("\n%-20s <- %s", a->slab->name, a->name);
+ printf("\n%-12s <- %s", a->slab->name, a->name);
active = a->slab->name;
}
else
@@ -729,7 +982,12 @@
a = find_one_alias(s);
- s->name = a->name;
+ if (a)
+ s->name = a->name;
+ else {
+ s->name = "*";
+ actual_slabs--;
+ }
}
}
@@ -748,11 +1006,14 @@
char *t;
int count;
+ if (chdir("/sys/slab"))
+ fatal("SYSFS support for SLUB not active\n");
+
dir = opendir(".");
while ((de = readdir(dir))) {
if (de->d_name[0] == '.' ||
- slab_mismatch(de->d_name))
- continue;
+ (de->d_name[0] != ':' && slab_mismatch(de->d_name)))
+ continue;
switch (de->d_type) {
case DT_LNK:
alias->name = strdup(de->d_name);
@@ -807,6 +1068,7 @@
}
closedir(dir);
slabs = slab - slabinfo;
+ actual_slabs = slabs;
aliases = alias - aliasinfo;
if (slabs > MAX_SLABS)
fatal("Too many slabs\n");
@@ -825,34 +1087,37 @@
if (show_numa)
- slab_numa(slab);
- else
- if (show_track)
+ slab_numa(slab, 0);
+ else if (show_track)
show_tracking(slab);
- else
- if (validate)
+ else if (validate)
slab_validate(slab);
- else
- if (shrink)
+ else if (shrink)
slab_shrink(slab);
- else {
- if (show_slab)
- slabcache(slab);
- }
+ else if (set_debug)
+ slab_debug(slab);
+ else if (show_ops)
+ ops(slab);
+ else if (show_slab)
+ slabcache(slab);
}
}
struct option opts[] = {
{ "aliases", 0, NULL, 'a' },
- { "slabs", 0, NULL, 'l' },
- { "numa", 0, NULL, 'n' },
- { "zero", 0, NULL, 'z' },
- { "help", 0, NULL, 'h' },
- { "validate", 0, NULL, 'v' },
+ { "debug", 2, NULL, 'd' },
+ { "empty", 0, NULL, 'e' },
{ "first-alias", 0, NULL, 'f' },
- { "shrink", 0, NULL, 's' },
- { "track", 0, NULL, 't'},
+ { "help", 0, NULL, 'h' },
{ "inverted", 0, NULL, 'i'},
+ { "numa", 0, NULL, 'n' },
+ { "ops", 0, NULL, 'o' },
+ { "report", 0, NULL, 'r' },
+ { "shrink", 0, NULL, 's' },
+ { "slabs", 0, NULL, 'l' },
+ { "track", 0, NULL, 't'},
+ { "validate", 0, NULL, 'v' },
+ { "zero", 0, NULL, 'z' },
{ "1ref", 0, NULL, '1'},
{ NULL, 0, NULL, 0 }
};
@@ -864,10 +1129,9 @@
char *pattern_source;
page_size = getpagesize();
- if (chdir("/sys/slab"))
- fatal("This kernel does not have SLUB support.\n");
- while ((c = getopt_long(argc, argv, "afhil1npstvzTS", opts, NULL)) != -1)
+ while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
+ opts, NULL)) != -1)
switch(c) {
case '1':
show_single_ref = 1;
@@ -875,6 +1139,14 @@
case 'a':
show_alias = 1;
break;
+ case 'd':
+ set_debug = 1;
+ if (!debug_opt_scan(optarg))
+ fatal("Invalid debug option '%s'\n", optarg);
+ break;
+ case 'e':
+ show_empty = 1;
+ break;
case 'f':
show_first_alias = 1;
break;
@@ -887,6 +1159,12 @@
case 'n':
show_numa = 1;
break;
+ case 'o':
+ show_ops = 1;
+ break;
+ case 'r':
+ show_report = 1;
+ break;
case 's':
shrink = 1;
break;
@@ -914,8 +1192,8 @@
}
- if (!show_slab && !show_alias && !show_track
- && !validate && !shrink)
+ if (!show_slab && !show_alias && !show_track && !show_report
+ && !validate && !shrink && !set_debug && !show_ops)
show_slab = 1;
if (argc > optind)
diff --git a/arch/avr32/Makefile b/arch/avr32/Makefile
index 6115fc1..dc6bc01 100644
--- a/arch/avr32/Makefile
+++ b/arch/avr32/Makefile
@@ -16,7 +16,7 @@
CFLAGS_MODULE += -mno-relax
LDFLAGS_vmlinux += --relax
-cpuflags-$(CONFIG_CPU_AP7000) += -mcpu=ap7000
+cpuflags-$(CONFIG_CPU_AT32AP7000) += -mcpu=ap7000
CFLAGS += $(cpuflags-y)
AFLAGS += $(cpuflags-y)
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 4e4181e..13f9884 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -330,13 +330,13 @@
{
struct pt_regs *childregs;
- childregs = ((struct pt_regs *)(THREAD_SIZE + (unsigned long)p->thread_info)) - 1;
+ childregs = ((struct pt_regs *)(THREAD_SIZE + (unsigned long)task_stack_page(p))) - 1;
*childregs = *regs;
if (user_mode(regs))
childregs->sp = usp;
else
- childregs->sp = (unsigned long)p->thread_info + THREAD_SIZE;
+ childregs->sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs->r12 = 0; /* Set return value for child */
@@ -403,7 +403,7 @@
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
- stack_page = (unsigned long)p->thread_info;
+ stack_page = (unsigned long)task_stack_page(p);
BUG_ON(!stack_page);
/*
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
index 8ac74dd..3c36c2d 100644
--- a/arch/avr32/kernel/ptrace.c
+++ b/arch/avr32/kernel/ptrace.c
@@ -24,7 +24,7 @@
static struct pt_regs *get_user_regs(struct task_struct *tsk)
{
- return (struct pt_regs *)((unsigned long) tsk->thread_info +
+ return (struct pt_regs *)((unsigned long)task_stack_page(tsk) +
THREAD_SIZE - sizeof(struct pt_regs));
}
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
index 7c27958..07f6a6f 100644
--- a/arch/avr32/kernel/syscall_table.S
+++ b/arch/avr32/kernel/syscall_table.S
@@ -291,4 +291,5 @@
.long sys_shmget /* 275 */
.long sys_shmdt
.long sys_shmctl
+ .long sys_utimensat
.long sys_ni_syscall /* r8 is saturated at nr_syscalls */
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
index 4de9edf..86d1075 100644
--- a/arch/avr32/kernel/traps.c
+++ b/arch/avr32/kernel/traps.c
@@ -123,7 +123,7 @@
/* This way of handling undefined instructions is stolen from ARM */
static LIST_HEAD(undef_hook);
-static spinlock_t undef_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(undef_lock);
void register_undef_hook(struct undef_hook *hook)
{
diff --git a/arch/avr32/kernel/vmlinux.lds.c b/arch/avr32/kernel/vmlinux.lds.c
index 7ad20cf..e7f72c9 100644
--- a/arch/avr32/kernel/vmlinux.lds.c
+++ b/arch/avr32/kernel/vmlinux.lds.c
@@ -35,7 +35,7 @@
_einittext = .;
. = ALIGN(4);
__tagtable_begin = .;
- *(.taglist)
+ *(.taglist.init)
__tagtable_end = .;
*(.init.data)
. = ALIGN(16);
diff --git a/arch/avr32/mach-at32ap/clock.c b/arch/avr32/mach-at32ap/clock.c
index 00c4354..0f8c89c 100644
--- a/arch/avr32/mach-at32ap/clock.c
+++ b/arch/avr32/mach-at32ap/clock.c
@@ -18,7 +18,7 @@
#include "clock.h"
-static spinlock_t clk_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(clk_lock);
struct clk *clk_get(struct device *dev, const char *id)
{
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index b68d669..099212d 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -112,16 +112,21 @@
}
EXPORT_SYMBOL(dma_free_coherent);
-#if 0
void *dma_alloc_writecombine(struct device *dev, size_t size,
dma_addr_t *handle, gfp_t gfp)
{
struct page *page;
+ dma_addr_t phys;
page = __dma_alloc(dev, size, handle, gfp);
+ if (!page)
+ return NULL;
+
+ phys = page_to_phys(page);
+ *handle = phys;
/* Now, map the page into P3 with write-combining turned on */
- return __ioremap(page_to_phys(page), size, _PAGE_BUFFER);
+ return __ioremap(phys, size, _PAGE_BUFFER);
}
EXPORT_SYMBOL(dma_alloc_writecombine);
@@ -132,8 +137,7 @@
iounmap(cpu_addr);
- page = bus_to_page(handle);
+ page = phys_to_page(handle);
__dma_free(dev, size, page, handle);
}
EXPORT_SYMBOL(dma_free_writecombine);
-#endif
diff --git a/arch/blackfin/kernel/asm-offsets.c b/arch/blackfin/kernel/asm-offsets.c
index 41d9a9f..e455f45 100644
--- a/arch/blackfin/kernel/asm-offsets.c
+++ b/arch/blackfin/kernel/asm-offsets.c
@@ -46,7 +46,7 @@
DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
- DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+ DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
DEFINE(TASK_MM, offsetof(struct task_struct, mm));
DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
DEFINE(TASK_SIGPENDING, offsetof(struct task_struct, pending));
diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c
index d7c8e51..e718bb4 100644
--- a/arch/blackfin/kernel/ptrace.c
+++ b/arch/blackfin/kernel/ptrace.c
@@ -73,7 +73,7 @@
static inline struct pt_regs *get_user_regs(struct task_struct *task)
{
return (struct pt_regs *)
- ((unsigned long)task->thread_info +
+ ((unsigned long)task_stack_page(task) +
(THREAD_SIZE - sizeof(struct pt_regs)));
}
@@ -99,7 +99,7 @@
unsigned char *reg_ptr;
struct pt_regs *regs =
- (struct pt_regs *)((unsigned long)task->thread_info +
+ (struct pt_regs *)((unsigned long)task_stack_page(task) +
(THREAD_SIZE - sizeof(struct pt_regs)));
reg_ptr = (char *)regs;
@@ -125,7 +125,7 @@
char * reg_ptr;
struct pt_regs *regs =
- (struct pt_regs *)((unsigned long)task->thread_info +
+ (struct pt_regs *)((unsigned long)task_stack_page(task) +
(THREAD_SIZE - sizeof(struct pt_regs)));
reg_ptr = (char *)regs;
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index eed6943..114738a 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -45,6 +45,10 @@
bool
default y
+config QUICKLIST
+ bool
+ default y
+
config ARCH_HAS_ILOG2_U32
bool
default y
@@ -53,10 +57,6 @@
bool
default y
-config ARCH_USES_SLAB_PAGE_STRUCT
- bool
- default y
-
mainmenu "Fujitsu FR-V Kernel Configuration"
source "init/Kconfig"
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 515a5ce..9583a33 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -25,12 +25,14 @@
#include <linux/elf.h>
#include <linux/reboot.h>
#include <linux/interrupt.h>
+#include <linux/pagemap.h>
#include <asm/asm-offsets.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/setup.h>
#include <asm/pgtable.h>
+#include <asm/tlb.h>
#include <asm/gdb-stub.h>
#include <asm/mb-regs.h>
@@ -88,6 +90,8 @@
while (!need_resched()) {
irq_stat[cpu].idle_timestamp = jiffies;
+ check_pgt_cache();
+
if (!frv_dma_inprogress && idle)
idle();
}
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index 598a26a..7787c3c 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -13,12 +13,12 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/quicklist.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE)));
-struct kmem_cache *pgd_cache;
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
@@ -100,7 +100,7 @@
set_page_private(next, (unsigned long) pprev);
}
-void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
+void pgd_ctor(void *pgd)
{
unsigned long flags;
@@ -120,7 +120,7 @@
}
/* never called when PTRS_PER_PMD > 1 */
-void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
+void pgd_dtor(void *pgd)
{
unsigned long flags; /* can be called from interrupt context */
@@ -133,7 +133,7 @@
{
pgd_t *pgd;
- pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+ pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
if (!pgd)
return pgd;
@@ -143,15 +143,15 @@
void pgd_free(pgd_t *pgd)
{
/* in the non-PAE case, clear_page_tables() clears user pgd entries */
- kmem_cache_free(pgd_cache, pgd);
+ quicklist_free(0, pgd_dtor, pgd);
}
void __init pgtable_cache_init(void)
{
- pgd_cache = kmem_cache_create("pgd",
- PTRS_PER_PGD * sizeof(pgd_t),
- PTRS_PER_PGD * sizeof(pgd_t),
- SLAB_PANIC,
- pgd_ctor,
- pgd_dtor);
}
+
+void check_pgt_cache(void)
+{
+ quicklist_trim(0, pgd_dtor, 25, 16);
+}
+
diff --git a/arch/h8300/kernel/asm-offsets.c b/arch/h8300/kernel/asm-offsets.c
index b78b82a..fc30b4f 100644
--- a/arch/h8300/kernel/asm-offsets.c
+++ b/arch/h8300/kernel/asm-offsets.c
@@ -30,7 +30,7 @@
DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
- DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+ DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
DEFINE(TASK_MM, offsetof(struct task_struct, mm));
DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 80b4c5d..e5be819 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -733,9 +733,11 @@
sys_dev = get_cpu_sysdev(cpu);
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cache_add_dev(sys_dev);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
cache_remove_dev(sys_dev);
break;
}
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 2f28540..7ba7c3a 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -137,10 +137,12 @@
mutex_lock(&therm_cpu_lock);
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
err = thermal_throttle_add_dev(sys_dev);
WARN_ON(err);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
thermal_throttle_remove_dev(sys_dev);
break;
}
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 6471a5a..200fb3f 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -77,8 +77,10 @@
set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
/* If we can run i686 user-space code, call us an i686 */
-#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
- if ( c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686 )
+#define USER686 ((1 << X86_FEATURE_TSC)|\
+ (1 << X86_FEATURE_CX8)|\
+ (1 << X86_FEATURE_CMOV))
+ if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
c->x86 = 6;
#ifdef CONFIG_SYSCTL
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index eeae0d9..5c2faa1 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -169,9 +169,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpuid_device_create(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
break;
}
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index cbe7ec8..83f825f 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -567,7 +567,7 @@
return error;
}
-static int apply_microcode_on_cpu(int cpu)
+static int apply_microcode_check_cpu(int cpu)
{
struct cpuinfo_x86 *c = cpu_data + cpu;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -575,8 +575,9 @@
unsigned int val[2];
int err = 0;
+ /* Check if the microcode is available */
if (!uci->mc)
- return -EINVAL;
+ return 0;
old = current->cpus_allowed;
set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -614,7 +615,7 @@
return err;
}
-static void microcode_init_cpu(int cpu)
+static void microcode_init_cpu(int cpu, int resume)
{
cpumask_t old;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -624,8 +625,7 @@
set_cpus_allowed(current, cpumask_of_cpu(cpu));
mutex_lock(µcode_mutex);
collect_cpu_info(cpu);
- if (uci->valid && system_state == SYSTEM_RUNNING &&
- !suspend_cpu_hotplug)
+ if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
cpu_request_microcode(cpu);
mutex_unlock(µcode_mutex);
set_cpus_allowed(current, old);
@@ -702,7 +702,7 @@
.name = "microcode",
};
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
{
int err, cpu = sys_dev->id;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -711,39 +711,31 @@
return 0;
pr_debug("Microcode:CPU %d added\n", cpu);
- /* If suspend_cpu_hotplug is set, the system is resuming and we should
- * use the data from before the suspend.
- */
- if (suspend_cpu_hotplug) {
- err = apply_microcode_on_cpu(cpu);
- if (err)
- microcode_fini_cpu(cpu);
- }
- if (!uci->valid)
- memset(uci, 0, sizeof(*uci));
+ memset(uci, 0, sizeof(*uci));
err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
if (err)
return err;
- if (!uci->valid)
- microcode_init_cpu(cpu);
+ microcode_init_cpu(cpu, resume);
return 0;
}
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+ return __mc_sysdev_add(sys_dev, 0);
+}
+
static int mc_sysdev_remove(struct sys_device *sys_dev)
{
int cpu = sys_dev->id;
if (!cpu_online(cpu))
return 0;
+
pr_debug("Microcode:CPU %d removed\n", cpu);
- /* If suspend_cpu_hotplug is set, the system is suspending and we should
- * keep the microcode in memory for the resume.
- */
- if (!suspend_cpu_hotplug)
- microcode_fini_cpu(cpu);
+ microcode_fini_cpu(cpu);
sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
return 0;
}
@@ -774,13 +766,34 @@
sys_dev = get_cpu_sysdev(cpu);
switch (action) {
+ case CPU_UP_CANCELED_FROZEN:
+ /* The CPU refused to come up during a system resume */
+ microcode_fini_cpu(cpu);
+ break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
mc_sysdev_add(sys_dev);
break;
+ case CPU_ONLINE_FROZEN:
+ /* System-wide resume is in progress, try to apply microcode */
+ if (apply_microcode_check_cpu(cpu)) {
+ /* The application of microcode failed */
+ microcode_fini_cpu(cpu);
+ __mc_sysdev_add(sys_dev, 1);
+ break;
+ }
+ case CPU_DOWN_FAILED_FROZEN:
+ if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+ printk(KERN_ERR "Microcode: Failed to create the sysfs "
+ "group for CPU%d\n", cpu);
+ break;
case CPU_DOWN_PREPARE:
mc_sysdev_remove(sys_dev);
break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ break;
}
return NOTIFY_OK;
}
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 8cd0a91..0c1069b 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -153,9 +153,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
msr_device_create(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
break;
}
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4bec0cb..c05e7e8 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -305,7 +305,7 @@
regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
TASK_COMM_LEN, current->comm, current->pid,
- current_thread_info(), current, current->thread_info);
+ current_thread_info(), current, task_thread_info(current));
/*
* When in-kernel, we also print out the stack and code at the
* time of the fault..
diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index a7b3999..74f3da6 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -119,9 +119,7 @@
return 0;
}
-#ifdef CONFIG_SMP
int hard_smp_processor_id(void)
{
return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID));
}
-#endif
diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c
index 8fe7e45..9b77b39 100644
--- a/arch/i386/mach-voyager/voyager_basic.c
+++ b/arch/i386/mach-voyager/voyager_basic.c
@@ -292,8 +292,8 @@
void
mca_nmi_hook(void)
{
- __u8 dumpval __attribute__((unused)) = inb(0xf823);
- __u8 swnmi __attribute__((unused)) = inb(0xf813);
+ __u8 dumpval __maybe_unused = inb(0xf823);
+ __u8 swnmi __maybe_unused = inb(0xf813);
/* FIXME: assume dump switch pressed */
/* check to see if the dump switch was pressed */
diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c
index 1cf11af..3de9f9b 100644
--- a/arch/i386/pci/init.c
+++ b/arch/i386/pci/init.c
@@ -6,7 +6,7 @@
in the right sequence from here. */
static __init int pci_access_init(void)
{
- int type __attribute__((unused)) = 0;
+ int type __maybe_unused = 0;
#ifdef CONFIG_PCI_DIRECT
type = pci_direct_probe();
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index d3e9f33..6a49600 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -236,9 +236,11 @@
sys_dev = get_cpu_sysdev(cpu);
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
err_inject_add_dev(sys_dev);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
err_inject_remove_dev(sys_dev);
break;
}
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 1d7cc7e..f8ae709 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1689,7 +1689,7 @@
ti->preempt_count = 1;
ti->task = p;
ti->cpu = cpu;
- p->thread_info = ti;
+ p->stack = ti;
p->state = TASK_UNINTERRUPTIBLE;
cpu_set(cpu, p->cpus_allowed);
INIT_LIST_HEAD(&p->tasks);
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index a71df9a..85829e2 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -975,9 +975,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
create_palinfo_proc_entries(hotcpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
remove_palinfo_proc_entries(hotcpu);
break;
}
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index a51f1d0..89f6b13 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -582,6 +582,7 @@
struct salinfo_data *data;
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
spin_lock_irqsave(&data_saved_lock, flags);
for (i = 0, data = salinfo_data;
i < ARRAY_SIZE(salinfo_data);
@@ -592,6 +593,7 @@
spin_unlock_irqrestore(&data_saved_lock, flags);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
spin_lock_irqsave(&data_saved_lock, flags);
for (i = 0, data = salinfo_data;
i < ARRAY_SIZE(salinfo_data);
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 687500d..94ae3c8 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -412,9 +412,11 @@
sys_dev = get_cpu_sysdev(cpu);
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cache_add_dev(sys_dev);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
cache_remove_dev(sys_dev);
break;
}
diff --git a/arch/m68knommu/kernel/asm-offsets.c b/arch/m68knommu/kernel/asm-offsets.c
index b988c7b..7cd183d 100644
--- a/arch/m68knommu/kernel/asm-offsets.c
+++ b/arch/m68knommu/kernel/asm-offsets.c
@@ -31,7 +31,7 @@
DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace));
DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
DEFINE(TASK_THREAD, offsetof(struct task_struct, thread));
- DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+ DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
DEFINE(TASK_MM, offsetof(struct task_struct, mm));
DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 761a779..3b27309 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -82,7 +82,7 @@
{
text("/* MIPS task_struct offsets. */");
offset("#define TASK_STATE ", struct task_struct, state);
- offset("#define TASK_THREAD_INFO ", struct task_struct, thread_info);
+ offset("#define TASK_THREAD_INFO ", struct task_struct, stack);
offset("#define TASK_FLAGS ", struct task_struct, flags);
offset("#define TASK_MM ", struct task_struct, mm);
offset("#define TASK_PID ", struct task_struct, pid);
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 5dcfab6..b361edb 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -560,7 +560,7 @@
write_tc_gpr_sp(__KSTK_TOS(idle));
/* global pointer */
- write_tc_gpr_gp((unsigned long)idle->thread_info);
+ write_tc_gpr_gp((unsigned long)task_thread_info(idle));
smtc_status |= SMTC_MTC_ACTIVE;
write_tc_c0_tchalt(0);
diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
index 54fdb95..d3b7917 100644
--- a/arch/parisc/kernel/asm-offsets.c
+++ b/arch/parisc/kernel/asm-offsets.c
@@ -54,7 +54,7 @@
int main(void)
{
- DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, thread_info));
+ DEFINE(TASK_THREAD_INFO, offsetof(struct task_struct, stack));
DEFINE(TASK_STATE, offsetof(struct task_struct, state));
DEFINE(TASK_FLAGS, offsetof(struct task_struct, flags));
DEFINE(TASK_SIGPENDING, offsetof(struct task_struct, pending));
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8f48560..37bc35e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -58,7 +58,7 @@
#ifdef CONFIG_PPC64
DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
#else
- DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info));
+ DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
DEFINE(PTRACE, offsetof(struct task_struct, ptrace));
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index cae39d9..68991c2 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -342,10 +342,12 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
register_cpu_online(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
unregister_cpu_online(cpu);
break;
#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b3a592b..de45aa8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -252,12 +252,15 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
numa_setup_cpu(lcpu);
ret = NOTIFY_OK;
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
unmap_cpu_from_node(lcpu);
break;
ret = NOTIFY_OK;
diff --git a/arch/ppc/kernel/asm-offsets.c b/arch/ppc/kernel/asm-offsets.c
index c5850a2..e8e9432 100644
--- a/arch/ppc/kernel/asm-offsets.c
+++ b/arch/ppc/kernel/asm-offsets.c
@@ -35,7 +35,7 @@
main(void)
{
DEFINE(THREAD, offsetof(struct task_struct, thread));
- DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info));
+ DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
DEFINE(MM, offsetof(struct task_struct, mm));
DEFINE(PTRACE, offsetof(struct task_struct, ptrace));
DEFINE(KSP, offsetof(struct thread_struct, ksp));
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index ee89b33..81a2b92 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -567,9 +567,11 @@
{
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
appldata_online_cpu((long) hcpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
appldata_offline_cpu((long) hcpu);
break;
default:
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ec514fe..1375f8a 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -15,7 +15,7 @@
int main(void)
{
- DEFINE(__THREAD_info, offsetof(struct task_struct, thread_info),);
+ DEFINE(__THREAD_info, offsetof(struct task_struct, stack),);
DEFINE(__THREAD_ksp, offsetof(struct task_struct, thread.ksp),);
DEFINE(__THREAD_per, offsetof(struct task_struct, thread.per_info),);
DEFINE(__THREAD_mm_segment,
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b797702..09f028a 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -789,10 +789,12 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
if (sysdev_create_file(s, &attr_capability))
return NOTIFY_BAD;
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
sysdev_remove_file(s, &attr_capability);
break;
}
diff --git a/arch/sparc/kernel/asm-offsets.c b/arch/sparc/kernel/asm-offsets.c
index 29d7cfd..6773ed7 100644
--- a/arch/sparc/kernel/asm-offsets.c
+++ b/arch/sparc/kernel/asm-offsets.c
@@ -28,7 +28,7 @@
DEFINE(AOFF_task_gid, offsetof(struct task_struct, gid));
DEFINE(AOFF_task_euid, offsetof(struct task_struct, euid));
DEFINE(AOFF_task_egid, offsetof(struct task_struct, egid));
- /* DEFINE(THREAD_INFO, offsetof(struct task_struct, thread_info)); */
+ /* DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); */
DEFINE(ASIZ_task_uid, sizeof(current->uid));
DEFINE(ASIZ_task_gid, sizeof(current->gid));
DEFINE(ASIZ_task_euid, sizeof(current->euid));
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index dc652f2..d0fde36 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/kdebug.h>
+#include <asm/smp.h>
#include <asm/delay.h>
#include <asm/system.h>
#include <asm/ptrace.h>
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 354cc6b..b9c0f30 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -320,21 +320,7 @@
source "lib/Kconfig"
-menu "SCSI support"
-depends on BROKEN
-
-config SCSI
- tristate "SCSI support"
-
-# This gives us free_dma, which scsi.c wants.
-config GENERIC_ISA_DMA
- bool
- depends on SCSI
- default y
-
-source "arch/um/Kconfig.scsi"
-
-endmenu
+source "drivers/scsi/Kconfig"
source "drivers/md/Kconfig"
diff --git a/arch/um/Kconfig.scsi b/arch/um/Kconfig.scsi
deleted file mode 100644
index c291c94..0000000
--- a/arch/um/Kconfig.scsi
+++ /dev/null
@@ -1,58 +0,0 @@
-comment "SCSI support type (disk, tape, CD-ROM)"
- depends on SCSI
-
-config BLK_DEV_SD
- tristate "SCSI disk support"
- depends on SCSI
-
-config SD_EXTRA_DEVS
- int "Maximum number of SCSI disks that can be loaded as modules"
- depends on BLK_DEV_SD
- default "40"
-
-config CHR_DEV_ST
- tristate "SCSI tape support"
- depends on SCSI
-
-config BLK_DEV_SR
- tristate "SCSI CD-ROM support"
- depends on SCSI
-
-config BLK_DEV_SR_VENDOR
- bool "Enable vendor-specific extensions (for SCSI CDROM)"
- depends on BLK_DEV_SR
-
-config SR_EXTRA_DEVS
- int "Maximum number of CDROM devices that can be loaded as modules"
- depends on BLK_DEV_SR
- default "2"
-
-config CHR_DEV_SG
- tristate "SCSI generic support"
- depends on SCSI
-
-comment "Some SCSI devices (e.g. CD jukebox) support multiple LUNs"
- depends on SCSI
-
-#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-config SCSI_DEBUG_QUEUES
- bool "Enable extra checks in new queueing code"
- depends on SCSI
-
-#fi
-config SCSI_MULTI_LUN
- bool "Probe all LUNs on each SCSI device"
- depends on SCSI
-
-config SCSI_CONSTANTS
- bool "Verbose SCSI error reporting (kernel size +=12K)"
- depends on SCSI
-
-config SCSI_LOGGING
- bool "SCSI logging facility"
- depends on SCSI
-
-config SCSI_DEBUG
- tristate "SCSI debugging host simulator (EXPERIMENTAL)"
- depends on SCSI
-
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index ef36fac..a96ae1a 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -178,20 +178,23 @@
int external_pid_skas(struct task_struct *task)
{
-#warning Need to look up userspace_pid by cpu
+ /* FIXME: Need to look up userspace_pid by cpu */
return(userspace_pid[0]);
}
int thread_pid_skas(struct task_struct *task)
{
-#warning Need to look up userspace_pid by cpu
+ /* FIXME: Need to look up userspace_pid by cpu */
return(userspace_pid[0]);
}
void kill_off_processes_skas(void)
{
if(proc_mm)
-#warning need to loop over userspace_pids in kill_off_processes_skas
+ /*
+ * FIXME: need to loop over userspace_pids in
+ * kill_off_processes_skas
+ */
os_kill_ptraced_process(userspace_pid[0], 1);
else {
struct task_struct *p;
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 92a7b59..2d9d2ca 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -239,6 +239,7 @@
return ok;
}
+#ifdef UML_CONFIG_MODE_TT
void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int))
{
int flags = 0, pages;
@@ -260,6 +261,7 @@
"errno = %d\n", errno);
}
}
+#endif
void init_new_thread_signals(void)
{
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index 8e490ff..5c89463 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -68,7 +68,7 @@
int err, pid = mm_idp->u.pid;
if(proc_mm)
-#warning Need to look up userspace_pid by cpu
+ /* FIXME: Need to look up userspace_pid by cpu */
pid = userspace_pid[0];
multi_count++;
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 5c088a5..6a0e466 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -586,7 +586,7 @@
{
int err;
-#warning need cpu pid in switch_mm_skas
+ /* FIXME: need cpu pid in switch_mm_skas */
if(proc_mm){
err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0,
mm_idp->u.mm_fd);
diff --git a/arch/v850/kernel/asm-offsets.c b/arch/v850/kernel/asm-offsets.c
index 24f2913..cee5c31 100644
--- a/arch/v850/kernel/asm-offsets.c
+++ b/arch/v850/kernel/asm-offsets.c
@@ -29,7 +29,7 @@
DEFINE (TASK_PTRACE, offsetof (struct task_struct, ptrace));
DEFINE (TASK_BLOCKED, offsetof (struct task_struct, blocked));
DEFINE (TASK_THREAD, offsetof (struct task_struct, thread));
- DEFINE (TASK_THREAD_INFO, offsetof (struct task_struct, thread_info));
+ DEFINE (TASK_THREAD_INFO, offsetof (struct task_struct, stack));
DEFINE (TASK_MM, offsetof (struct task_struct, mm));
DEFINE (TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm));
DEFINE (TASK_PID, offsetof (struct task_struct, pid));
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 3bc30d2..3eaceac 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -32,7 +32,7 @@
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
- u64 curbase = (u64) current->thread_info;
+ u64 curbase = (u64)task_stack_page(current);
static unsigned long warned = -60*HZ;
if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 4421696..a14375d 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -720,9 +720,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
mce_create_device(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
mce_remove_device(cpu);
break;
}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d0bd5d6..03356e6 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -654,9 +654,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
threshold_create_device(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
threshold_remove_device(cpu);
break;
default:
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index dc32cef..51d4c6f 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -327,7 +327,7 @@
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
- if (action == CPU_ONLINE)
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
return NOTIFY_DONE;
}
diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c
index b256cfb..698079b 100644
--- a/arch/xtensa/kernel/asm-offsets.c
+++ b/arch/xtensa/kernel/asm-offsets.c
@@ -70,7 +70,7 @@
DEFINE(TASK_ACTIVE_MM, offsetof (struct task_struct, active_mm));
DEFINE(TASK_PID, offsetof (struct task_struct, pid));
DEFINE(TASK_THREAD, offsetof (struct task_struct, thread));
- DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, thread_info));
+ DEFINE(TASK_THREAD_INFO, offsetof (struct task_struct, stack));
DEFINE(TASK_STRUCT_SIZE, sizeof (struct task_struct));
BLANK();
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 640aa83..109e91b 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1306,7 +1306,7 @@
struct as_data *ad = e->elevator_data;
del_timer_sync(&ad->antic_timer);
- kblockd_flush();
+ kblockd_flush_work(&ad->antic_work);
BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
diff --git a/block/genhd.c b/block/genhd.c
index b566444..93a2cf6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -213,6 +213,59 @@
return kobj ? to_disk(kobj) : NULL;
}
+/*
+ * print a full list of all partitions - intended for places where the root
+ * filesystem can't be mounted and thus to give the victim some idea of what
+ * went wrong
+ */
+void __init printk_all_partitions(void)
+{
+ int n;
+ struct gendisk *sgp;
+
+ mutex_lock(&block_subsys_lock);
+ /* For each block device... */
+ list_for_each_entry(sgp, &block_subsys.list, kobj.entry) {
+ char buf[BDEVNAME_SIZE];
+ /*
+ * Don't show empty devices or things that have been surpressed
+ */
+ if (get_capacity(sgp) == 0 ||
+ (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+ continue;
+
+ /*
+ * Note, unlike /proc/partitions, I am showing the numbers in
+ * hex - the same format as the root= option takes.
+ */
+ printk("%02x%02x %10llu %s",
+ sgp->major, sgp->first_minor,
+ (unsigned long long)get_capacity(sgp) >> 1,
+ disk_name(sgp, 0, buf));
+ if (sgp->driverfs_dev != NULL &&
+ sgp->driverfs_dev->driver != NULL)
+ printk(" driver: %s\n",
+ sgp->driverfs_dev->driver->name);
+ else
+ printk(" (driver?)\n");
+
+ /* now show the partitions */
+ for (n = 0; n < sgp->minors - 1; ++n) {
+ if (sgp->part[n] == NULL)
+ continue;
+ if (sgp->part[n]->nr_sects == 0)
+ continue;
+ printk(" %02x%02x %10llu %s\n",
+ sgp->major, n + 1 + sgp->first_minor,
+ (unsigned long long)sgp->part[n]->nr_sects >> 1,
+ disk_name(sgp, n + 1, buf));
+ } /* partition subloop */
+ } /* Block device loop */
+
+ mutex_unlock(&block_subsys_lock);
+ return;
+}
+
#ifdef CONFIG_PROC_FS
/* iterator */
static void *part_start(struct seq_file *part, loff_t *pos)
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index f294f15..17e1889 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1712,7 +1712,6 @@
void blk_sync_queue(struct request_queue *q)
{
del_timer_sync(&q->unplug_timer);
- kblockd_flush();
}
EXPORT_SYMBOL(blk_sync_queue);
@@ -3508,7 +3507,7 @@
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
- if (action == CPU_DEAD) {
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
int cpu = (unsigned long) hcpu;
local_irq_disable();
@@ -3632,11 +3631,11 @@
EXPORT_SYMBOL(kblockd_schedule_work);
-void kblockd_flush(void)
+void kblockd_flush_work(struct work_struct *work)
{
- flush_workqueue(kblockd_workqueue);
+ cancel_work_sync(work);
}
-EXPORT_SYMBOL(kblockd_flush);
+EXPORT_SYMBOL(kblockd_flush_work);
int __init blk_dev_init(void)
{
diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index f8c6341..52b2347 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -29,7 +29,6 @@
[PM_SUSPEND_ON] = ACPI_STATE_S0,
[PM_SUSPEND_STANDBY] = ACPI_STATE_S1,
[PM_SUSPEND_MEM] = ACPI_STATE_S3,
- [PM_SUSPEND_DISK] = ACPI_STATE_S4,
[PM_SUSPEND_MAX] = ACPI_STATE_S5
};
@@ -94,14 +93,6 @@
do_suspend_lowlevel();
break;
- case PM_SUSPEND_DISK:
- if (acpi_pm_ops.pm_disk_mode == PM_DISK_PLATFORM)
- status = acpi_enter_sleep_state(acpi_state);
- break;
- case PM_SUSPEND_MAX:
- acpi_power_off();
- break;
-
default:
return -EINVAL;
}
@@ -157,12 +148,13 @@
suspend_state_t states[] = {
[1] = PM_SUSPEND_STANDBY,
[3] = PM_SUSPEND_MEM,
- [4] = PM_SUSPEND_DISK,
[5] = PM_SUSPEND_MAX
};
if (acpi_state < 6 && states[acpi_state])
return pm_suspend(states[acpi_state]);
+ if (acpi_state == 4)
+ return hibernate();
return -EINVAL;
}
@@ -189,6 +181,49 @@
.finish = acpi_pm_finish,
};
+#ifdef CONFIG_SOFTWARE_SUSPEND
+static int acpi_hibernation_prepare(void)
+{
+ return acpi_sleep_prepare(ACPI_STATE_S4);
+}
+
+static int acpi_hibernation_enter(void)
+{
+ acpi_status status = AE_OK;
+ unsigned long flags = 0;
+
+ ACPI_FLUSH_CPU_CACHE();
+
+ local_irq_save(flags);
+ acpi_enable_wakeup_device(ACPI_STATE_S4);
+ /* This shouldn't return. If it returns, we have a problem */
+ status = acpi_enter_sleep_state(ACPI_STATE_S4);
+ local_irq_restore(flags);
+
+ return ACPI_SUCCESS(status) ? 0 : -EFAULT;
+}
+
+static void acpi_hibernation_finish(void)
+{
+ acpi_leave_sleep_state(ACPI_STATE_S4);
+ acpi_disable_wakeup_device(ACPI_STATE_S4);
+
+ /* reset firmware waking vector */
+ acpi_set_firmware_waking_vector((acpi_physical_address) 0);
+
+ if (init_8259A_after_S1) {
+ printk("Broken toshiba laptop -> kicking interrupts\n");
+ init_8259A(0);
+ }
+}
+
+static struct hibernation_ops acpi_hibernation_ops = {
+ .prepare = acpi_hibernation_prepare,
+ .enter = acpi_hibernation_enter,
+ .finish = acpi_hibernation_finish,
+};
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
/*
* Toshiba fails to preserve interrupts over S1, reinitialization
* of 8259 is needed after S1 resume.
@@ -227,14 +262,18 @@
sleep_states[i] = 1;
printk(" S%d", i);
}
- if (i == ACPI_STATE_S4) {
- if (sleep_states[i])
- acpi_pm_ops.pm_disk_mode = PM_DISK_PLATFORM;
- }
}
printk(")\n");
pm_set_ops(&acpi_pm_ops);
+
+#ifdef CONFIG_SOFTWARE_SUSPEND
+ if (sleep_states[ACPI_STATE_S4])
+ hibernation_set_ops(&acpi_hibernation_ops);
+#else
+ sleep_states[ACPI_STATE_S4] = 0;
+#endif
+
return 0;
}
diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c
index 5a76e5b..76b45f0 100644
--- a/drivers/acpi/sleep/proc.c
+++ b/drivers/acpi/sleep/proc.c
@@ -60,7 +60,7 @@
state = simple_strtoul(str, NULL, 0);
#ifdef CONFIG_SOFTWARE_SUSPEND
if (state == 4) {
- error = pm_suspend(PM_SUSPEND_DISK);
+ error = hibernate();
goto Done;
}
#endif
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index a795088..fef87dd 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1316,7 +1316,7 @@
spin_unlock_irqrestore(ap->lock, flags);
DPRINTK("flush #1\n");
- flush_workqueue(ata_wq);
+ cancel_work_sync(&ap->port_task.work); /* akpm: seems unneeded */
/*
* At this point, if a task is running, it's guaranteed to see
@@ -1327,7 +1327,7 @@
if (ata_msg_ctl(ap))
ata_port_printk(ap, KERN_DEBUG, "%s: flush #2\n",
__FUNCTION__);
- flush_workqueue(ata_wq);
+ cancel_work_sync(&ap->port_task.work);
}
spin_lock_irqsave(ap->lock, flags);
@@ -6475,9 +6475,9 @@
/* Flush hotplug task. The sequence is similar to
* ata_port_flush_task().
*/
- flush_workqueue(ata_aux_wq);
+ cancel_work_sync(&ap->hotplug_task.work); /* akpm: why? */
cancel_delayed_work(&ap->hotplug_task);
- flush_workqueue(ata_aux_wq);
+ cancel_work_sync(&ap->hotplug_task.work);
skip_eh:
/* remove the associated SCSI host */
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 067a9e8..8d8cdfe 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -126,10 +126,13 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
rc = topology_add_dev(cpu);
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
topology_remove_dev(cpu);
break;
}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index af6d727..18cdd8c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -243,17 +243,13 @@
transfer_result = lo_do_transfer(lo, WRITE, page, offset,
bvec->bv_page, bv_offs, size, IV);
if (unlikely(transfer_result)) {
- char *kaddr;
-
/*
* The transfer failed, but we still write the data to
* keep prepare/commit calls balanced.
*/
printk(KERN_ERR "loop: transfer error block %llu\n",
(unsigned long long)index);
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, size);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, size, KM_USER0);
}
flush_dcache_page(page);
ret = aops->commit_write(file, page, offset,
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 090796b..069ae39 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -366,20 +366,25 @@
.show = pid_show,
};
-static void nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *lo)
{
struct request *req;
+ int ret;
BUG_ON(lo->magic != LO_MAGIC);
lo->pid = current->pid;
- sysfs_create_file(&lo->disk->kobj, &pid_attr.attr);
+ ret = sysfs_create_file(&lo->disk->kobj, &pid_attr.attr);
+ if (ret) {
+ printk(KERN_ERR "nbd: sysfs_create_file failed!");
+ return ret;
+ }
while ((req = nbd_read_stat(lo)) != NULL)
nbd_end_request(req);
sysfs_remove_file(&lo->disk->kobj, &pid_attr.attr);
- return;
+ return 0;
}
static void nbd_clear_que(struct nbd_device *lo)
@@ -569,7 +574,9 @@
case NBD_DO_IT:
if (!lo->file)
return -EINVAL;
- nbd_do_it(lo);
+ error = nbd_do_it(lo);
+ if (error)
+ return error;
/* on return tidy up in case we have a signal */
/* Forcibly shutdown the socket causing all listeners
* to error
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 5f3acd8..7cda04b 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -91,3 +91,17 @@
module will be called omap-rng.
If unsure, say Y.
+
+config HW_RANDOM_PASEMI
+ tristate "PA Semi HW Random Number Generator support"
+ depends on HW_RANDOM && PPC_PASEMI
+ default HW_RANDOM
+ ---help---
+ This driver provides kernel-side support for the Random Number
+ Generator hardware found on PA6T-1682M processor.
+
+ To compile this driver as a module, choose M here: the
+ module will be called pasemi-rng.
+
+ If unsure, say Y.
+
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index c41fa19..c8b7300 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -10,3 +10,4 @@
obj-$(CONFIG_HW_RANDOM_VIA) += via-rng.o
obj-$(CONFIG_HW_RANDOM_IXP4XX) += ixp4xx-rng.o
obj-$(CONFIG_HW_RANDOM_OMAP) += omap-rng.o
+obj-$(CONFIG_HW_RANDOM_PASEMI) += pasemi-rng.o
diff --git a/drivers/char/hw_random/pasemi-rng.c b/drivers/char/hw_random/pasemi-rng.c
new file mode 100644
index 0000000..fa6040b
--- /dev/null
+++ b/drivers/char/hw_random/pasemi-rng.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2006-2007 PA Semi, Inc
+ *
+ * Maintained by: Olof Johansson <olof@lixom.net>
+ *
+ * Driver for the PWRficient onchip rng
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/hw_random.h>
+#include <asm/of_platform.h>
+#include <asm/io.h>
+
+#define SDCRNG_CTL_REG 0x00
+#define SDCRNG_CTL_FVLD_M 0x0000f000
+#define SDCRNG_CTL_FVLD_S 12
+#define SDCRNG_CTL_KSZ 0x00000800
+#define SDCRNG_CTL_RSRC_CRG 0x00000010
+#define SDCRNG_CTL_RSRC_RRG 0x00000000
+#define SDCRNG_CTL_CE 0x00000004
+#define SDCRNG_CTL_RE 0x00000002
+#define SDCRNG_CTL_DR 0x00000001
+#define SDCRNG_CTL_SELECT_RRG_RNG (SDCRNG_CTL_RE | SDCRNG_CTL_RSRC_RRG)
+#define SDCRNG_CTL_SELECT_CRG_RNG (SDCRNG_CTL_CE | SDCRNG_CTL_RSRC_CRG)
+#define SDCRNG_VAL_REG 0x20
+
+#define MODULE_NAME "pasemi_rng"
+
+static int pasemi_rng_data_present(struct hwrng *rng)
+{
+ void __iomem *rng_regs = (void __iomem *)rng->priv;
+
+ return (in_le32(rng_regs + SDCRNG_CTL_REG)
+ & SDCRNG_CTL_FVLD_M) ? 1 : 0;
+}
+
+static int pasemi_rng_data_read(struct hwrng *rng, u32 *data)
+{
+ void __iomem *rng_regs = (void __iomem *)rng->priv;
+ *data = in_le32(rng_regs + SDCRNG_VAL_REG);
+ return 4;
+}
+
+static int pasemi_rng_init(struct hwrng *rng)
+{
+ void __iomem *rng_regs = (void __iomem *)rng->priv;
+ u32 ctl;
+
+ ctl = SDCRNG_CTL_DR | SDCRNG_CTL_SELECT_RRG_RNG | SDCRNG_CTL_KSZ;
+ out_le32(rng_regs + SDCRNG_CTL_REG, ctl);
+ out_le32(rng_regs + SDCRNG_CTL_REG, ctl & ~SDCRNG_CTL_DR);
+
+ return 0;
+}
+
+static void pasemi_rng_cleanup(struct hwrng *rng)
+{
+ void __iomem *rng_regs = (void __iomem *)rng->priv;
+ u32 ctl;
+
+ ctl = SDCRNG_CTL_RE | SDCRNG_CTL_CE;
+ out_le32(rng_regs + SDCRNG_CTL_REG,
+ in_le32(rng_regs + SDCRNG_CTL_REG) & ~ctl);
+}
+
+static struct hwrng pasemi_rng = {
+ .name = MODULE_NAME,
+ .init = pasemi_rng_init,
+ .cleanup = pasemi_rng_cleanup,
+ .data_present = pasemi_rng_data_present,
+ .data_read = pasemi_rng_data_read,
+};
+
+static int __devinit rng_probe(struct of_device *ofdev,
+ const struct of_device_id *match)
+{
+ void __iomem *rng_regs;
+ struct device_node *rng_np = ofdev->node;
+ struct resource res;
+ int err = 0;
+
+ err = of_address_to_resource(rng_np, 0, &res);
+ if (err)
+ return -ENODEV;
+
+ rng_regs = ioremap(res.start, 0x100);
+
+ if (!rng_regs)
+ return -ENOMEM;
+
+ pasemi_rng.priv = (unsigned long)rng_regs;
+
+ printk(KERN_INFO "Registering PA Semi RNG\n");
+
+ err = hwrng_register(&pasemi_rng);
+
+ if (err)
+ iounmap(rng_regs);
+
+ return err;
+}
+
+static int __devexit rng_remove(struct of_device *dev)
+{
+ void __iomem *rng_regs = (void __iomem *)pasemi_rng.priv;
+
+ hwrng_unregister(&pasemi_rng);
+ iounmap(rng_regs);
+
+ return 0;
+}
+
+static struct of_device_id rng_match[] = {
+ {
+ .compatible = "1682m-rng",
+ },
+ {},
+};
+
+static struct of_platform_driver rng_driver = {
+ .name = "pasemi-rng",
+ .match_table = rng_match,
+ .probe = rng_probe,
+ .remove = rng_remove,
+};
+
+static int __init rng_init(void)
+{
+ return of_register_platform_driver(&rng_driver);
+}
+module_init(rng_init);
+
+static void __exit rng_exit(void)
+{
+ of_unregister_platform_driver(&rng_driver);
+}
+module_exit(rng_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Egor Martovetsky <egor@pasemi.com>");
+MODULE_DESCRIPTION("H/W RNG driver for PA Semi processor");
diff --git a/drivers/char/pcmcia/Kconfig b/drivers/char/pcmcia/Kconfig
index 27c1179..f25facd 100644
--- a/drivers/char/pcmcia/Kconfig
+++ b/drivers/char/pcmcia/Kconfig
@@ -21,6 +21,7 @@
config CARDMAN_4000
tristate "Omnikey Cardman 4000 support"
depends on PCMCIA
+ select BITREVERSE
help
Enable support for the Omnikey Cardman 4000 PCMCIA Smartcard
reader.
diff --git a/drivers/char/pcmcia/cm4000_cs.c b/drivers/char/pcmcia/cm4000_cs.c
index 4ea5879..fee58e0 100644
--- a/drivers/char/pcmcia/cm4000_cs.c
+++ b/drivers/char/pcmcia/cm4000_cs.c
@@ -31,6 +31,7 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/delay.h>
+#include <linux/bitrev.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -194,41 +195,17 @@
}
#endif
-#define b_0000 15
-#define b_0001 14
-#define b_0010 13
-#define b_0011 12
-#define b_0100 11
-#define b_0101 10
-#define b_0110 9
-#define b_0111 8
-#define b_1000 7
-#define b_1001 6
-#define b_1010 5
-#define b_1011 4
-#define b_1100 3
-#define b_1101 2
-#define b_1110 1
-#define b_1111 0
-
-static unsigned char irtab[16] = {
- b_0000, b_1000, b_0100, b_1100,
- b_0010, b_1010, b_0110, b_1110,
- b_0001, b_1001, b_0101, b_1101,
- b_0011, b_1011, b_0111, b_1111
-};
+static inline unsigned char invert_revert(unsigned char ch)
+{
+ return bitrev8(~ch);
+}
static void str_invert_revert(unsigned char *b, int len)
{
int i;
for (i = 0; i < len; i++)
- b[i] = (irtab[b[i] & 0x0f] << 4) | irtab[b[i] >> 4];
-}
-
-static unsigned char invert_revert(unsigned char ch)
-{
- return (irtab[ch & 0x0f] << 4) | irtab[ch >> 4];
+ b[i] = invert_revert(b[i]);
}
#define ATRLENCK(dev,pos) \
@@ -1881,8 +1858,11 @@
init_waitqueue_head(&dev->readq);
ret = cm4000_config(link, i);
- if (ret)
+ if (ret) {
+ dev_table[i] = NULL;
+ kfree(dev);
return ret;
+ }
class_device_create(cmm_class, NULL, MKDEV(major, i), NULL,
"cmm%d", i);
@@ -1907,7 +1887,7 @@
cm4000_release(link);
dev_table[devno] = NULL;
- kfree(dev);
+ kfree(dev);
class_device_destroy(cmm_class, MKDEV(major, devno));
@@ -1956,12 +1936,14 @@
if (major < 0) {
printk(KERN_WARNING MODULE_NAME
": could not get major number\n");
+ class_destroy(cmm_class);
return major;
}
rc = pcmcia_register_driver(&cm4000_driver);
if (rc < 0) {
unregister_chrdev(major, DEVICE_NAME);
+ class_destroy(cmm_class);
return rc;
}
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index f2e4ec4..af88181 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -636,8 +636,11 @@
setup_timer(&dev->poll_timer, cm4040_do_poll, 0);
ret = reader_config(link, i);
- if (ret)
+ if (ret) {
+ dev_table[i] = NULL;
+ kfree(dev);
return ret;
+ }
class_device_create(cmx_class, NULL, MKDEV(major, i), NULL,
"cmx%d", i);
@@ -708,12 +711,14 @@
if (major < 0) {
printk(KERN_WARNING MODULE_NAME
": could not get major number\n");
+ class_destroy(cmx_class);
return major;
}
rc = pcmcia_register_driver(&reader_driver);
if (rc < 0) {
unregister_chrdev(major, DEVICE_NAME);
+ class_destroy(cmx_class);
return rc;
}
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index f6ac1d3..fc662e4 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -934,13 +934,6 @@
return -EINVAL;
/*
- * No more input please, we are switching. The new ldisc
- * will update this value in the ldisc open function
- */
-
- tty->receive_room = 0;
-
- /*
* Problem: What do we do if this blocks ?
*/
@@ -951,6 +944,13 @@
return 0;
}
+ /*
+ * No more input please, we are switching. The new ldisc
+ * will update this value in the ldisc open function
+ */
+
+ tty->receive_room = 0;
+
o_ldisc = tty->ldisc;
o_tty = tty->link;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 893dbaf..eb37fba 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1685,9 +1685,11 @@
if (sys_dev) {
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpufreq_add_dev(sys_dev);
break;
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
if (unlikely(lock_policy_rwsem_write(cpu)))
BUG();
@@ -1699,6 +1701,7 @@
__cpufreq_remove_dev(sys_dev);
break;
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
cpufreq_add_dev(sys_dev);
break;
}
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index d1c7cac..d2f0cbd 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -313,9 +313,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpufreq_update_policy(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
cpufreq_stats_free_table(cpu);
break;
}
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 03b1f65..75e3911 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -309,9 +309,11 @@
switch (action) {
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
coretemp_device_add(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
coretemp_device_remove(cpu);
break;
}
diff --git a/drivers/i2c/chips/tps65010.c b/drivers/i2c/chips/tps65010.c
index 7ed92dc..3c3f2eb 100644
--- a/drivers/i2c/chips/tps65010.c
+++ b/drivers/i2c/chips/tps65010.c
@@ -354,7 +354,7 @@
* also needs to get error handling and probably
* an #ifdef CONFIG_SOFTWARE_SUSPEND
*/
- pm_suspend(PM_SUSPEND_DISK);
+ hibernate();
#endif
poll = 1;
}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index f284be1..82dda2f 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -745,6 +745,7 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_PREPARE)", cpu);
if(!create_comp_task(pool, cpu)) {
ehca_gen_err("Can't create comp_task for cpu: %x", cpu);
@@ -752,24 +753,29 @@
}
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_CANCELED)", cpu);
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
kthread_bind(cct->task, any_online_cpu(cpu_online_map));
destroy_comp_task(pool, cpu);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_ONLINE)", cpu);
cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
kthread_bind(cct->task, cpu);
wake_up_process(cct->task);
break;
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_DOWN_PREPARE)", cpu);
break;
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_DOWN_FAILED)", cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
ehca_gen_dbg("CPU: %x (CPU_DEAD)", cpu);
destroy_comp_task(pool, cpu);
take_over_work(pool, cpu);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index c8b8cfa..0d89260 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2889,7 +2889,9 @@
switch (val) {
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu);
decache_vcpus_on_cpu(cpu);
@@ -2897,6 +2899,7 @@
NULL, 0, 1);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
diff --git a/drivers/mca/mca-bus.c b/drivers/mca/mca-bus.c
index da862e4..67b8e94 100644
--- a/drivers/mca/mca-bus.c
+++ b/drivers/mca/mca-bus.c
@@ -47,19 +47,25 @@
{
struct mca_device *mca_dev = to_mca_device (dev);
struct mca_driver *mca_drv = to_mca_driver (drv);
- const short *mca_ids = mca_drv->id_table;
- int i;
+ const unsigned short *mca_ids = mca_drv->id_table;
+ int i = 0;
- if (!mca_ids)
- return 0;
-
- for(i = 0; mca_ids[i]; i++) {
- if (mca_ids[i] == mca_dev->pos_id) {
- mca_dev->index = i;
- return 1;
+ if (mca_ids) {
+ for(i = 0; mca_ids[i]; i++) {
+ if (mca_ids[i] == mca_dev->pos_id) {
+ mca_dev->index = i;
+ return 1;
+ }
}
}
-
+ /* If the integrated id is present, treat it as though it were an
+ * additional id in the id_table (it can't be because by definition,
+ * integrated id's overflow a short */
+ if (mca_drv->integrated_id && mca_dev->pos_id ==
+ mca_drv->integrated_id) {
+ mca_dev->index = i;
+ return 1;
+ }
return 0;
}
diff --git a/drivers/mca/mca-driver.c b/drivers/mca/mca-driver.c
index 2223466..32cd39b 100644
--- a/drivers/mca/mca-driver.c
+++ b/drivers/mca/mca-driver.c
@@ -36,12 +36,25 @@
mca_drv->driver.bus = &mca_bus_type;
if ((r = driver_register(&mca_drv->driver)) < 0)
return r;
+ mca_drv->integrated_id = 0;
}
return 0;
}
EXPORT_SYMBOL(mca_register_driver);
+int mca_register_driver_integrated(struct mca_driver *mca_driver,
+ int integrated_id)
+{
+ int r = mca_register_driver(mca_driver);
+
+ if (!r)
+ mca_driver->integrated_id = integrated_id;
+
+ return r;
+}
+EXPORT_SYMBOL(mca_register_driver_integrated);
+
void mca_unregister_driver(struct mca_driver *mca_drv)
{
if (MCA_bus)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4540ade..7df934d 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -262,6 +262,15 @@
---help---
Multipath support for EMC CX/AX series hardware.
+config DM_DELAY
+ tristate "I/O delaying target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ A target that delays reads and/or writes and can send
+ them to different devices. Useful for testing.
+
+ If unsure, say N.
+
endmenu
endif
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 34957a6..3875408 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -31,6 +31,7 @@
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
+obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
index da43496..c6be888 100644
--- a/drivers/md/dm-bio-list.h
+++ b/drivers/md/dm-bio-list.h
@@ -8,17 +8,43 @@
#define DM_BIO_LIST_H
#include <linux/bio.h>
+#include <linux/prefetch.h>
struct bio_list {
struct bio *head;
struct bio *tail;
};
+static inline int bio_list_empty(const struct bio_list *bl)
+{
+ return bl->head == NULL;
+}
+
+#define BIO_LIST_INIT { .head = NULL, .tail = NULL }
+
+#define BIO_LIST(bl) \
+ struct bio_list bl = BIO_LIST_INIT
+
static inline void bio_list_init(struct bio_list *bl)
{
bl->head = bl->tail = NULL;
}
+#define bio_list_for_each(bio, bl) \
+ for (bio = (bl)->head; bio && ({ prefetch(bio->bi_next); 1; }); \
+ bio = bio->bi_next)
+
+static inline unsigned bio_list_size(const struct bio_list *bl)
+{
+ unsigned sz = 0;
+ struct bio *bio;
+
+ bio_list_for_each(bio, bl)
+ sz++;
+
+ return sz;
+}
+
static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
{
bio->bi_next = NULL;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d812123..7b0fcfc 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@
struct crypt_io {
struct dm_target *target;
struct bio *base_bio;
- struct bio *first_clone;
struct work_struct work;
atomic_t pending;
int error;
@@ -107,6 +106,8 @@
static struct kmem_cache *_crypt_io_pool;
+static void clone_init(struct crypt_io *, struct bio *);
+
/*
* Different IV generation algorithms:
*
@@ -120,6 +121,9 @@
* benbi: the 64-bit "big-endian 'narrow block'-count", starting at 1
* (needed for LRW-32-AES and possible other narrow block modes)
*
+ * null: the initial vector is always zero. Provides compatibility with
+ * obsolete loop_fish2 devices. Do not use for new devices.
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
@@ -256,6 +260,13 @@
return 0;
}
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+{
+ memset(iv, 0, cc->iv_size);
+
+ return 0;
+}
+
static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -272,6 +283,10 @@
.generator = crypt_iv_benbi_gen
};
+static struct crypt_iv_operations crypt_iv_null_ops = {
+ .generator = crypt_iv_null_gen
+};
+
static int
crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
struct scatterlist *in, unsigned int length,
@@ -378,36 +393,21 @@
* This should never violate the device limitations
* May return a smaller bio when running out of pages
*/
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
- struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size)
{
+ struct crypt_config *cc = io->target->private;
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
unsigned int i;
- if (base_bio) {
- clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
- __bio_clone(clone, base_bio);
- } else
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
return NULL;
- clone->bi_destructor = dm_crypt_bio_destructor;
+ clone_init(io, clone);
- /* if the last bio was not complete, continue where that one ended */
- clone->bi_idx = *bio_vec_idx;
- clone->bi_vcnt = *bio_vec_idx;
- clone->bi_size = 0;
- clone->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- /* clone->bi_idx pages have already been allocated */
- size -= clone->bi_idx * PAGE_SIZE;
-
- for (i = clone->bi_idx; i < nr_iovecs; i++) {
+ for (i = 0; i < nr_iovecs; i++) {
struct bio_vec *bv = bio_iovec_idx(clone, i);
bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
@@ -419,7 +419,7 @@
* return a partially allocated bio, the caller will then try
* to allocate additional bios while submitting this partial bio
*/
- if ((i - clone->bi_idx) == (MIN_BIO_PAGES - 1))
+ if (i == (MIN_BIO_PAGES - 1))
gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
bv->bv_offset = 0;
@@ -438,12 +438,6 @@
return NULL;
}
- /*
- * Remember the last bio_vec allocated to be able
- * to correctly continue after the splitting.
- */
- *bio_vec_idx = clone->bi_vcnt;
-
return clone;
}
@@ -495,9 +489,6 @@
if (!atomic_dec_and_test(&io->pending))
return;
- if (io->first_clone)
- bio_put(io->first_clone);
-
bio_endio(io->base_bio, io->base_bio->bi_size, io->error);
mempool_free(io, cc->io_pool);
@@ -562,6 +553,7 @@
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
clone->bi_rw = io->base_bio->bi_rw;
+ clone->bi_destructor = dm_crypt_bio_destructor;
}
static void process_read(struct crypt_io *io)
@@ -585,7 +577,6 @@
}
clone_init(io, clone);
- clone->bi_destructor = dm_crypt_bio_destructor;
clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio);
clone->bi_size = base_bio->bi_size;
@@ -604,7 +595,6 @@
struct convert_context ctx;
unsigned remaining = base_bio->bi_size;
sector_t sector = base_bio->bi_sector - io->target->begin;
- unsigned bvec_idx = 0;
atomic_inc(&io->pending);
@@ -615,14 +605,14 @@
* so repeat the whole process until all the data can be handled.
*/
while (remaining) {
- clone = crypt_alloc_buffer(cc, base_bio->bi_size,
- io->first_clone, &bvec_idx);
+ clone = crypt_alloc_buffer(io, remaining);
if (unlikely(!clone)) {
dec_pending(io, -ENOMEM);
return;
}
ctx.bio_out = clone;
+ ctx.idx_out = 0;
if (unlikely(crypt_convert(cc, &ctx) < 0)) {
crypt_free_buffer_pages(cc, clone, clone->bi_size);
@@ -631,31 +621,26 @@
return;
}
- clone_init(io, clone);
+ /* crypt_convert should have filled the clone bio */
+ BUG_ON(ctx.idx_out < clone->bi_vcnt);
+
clone->bi_sector = cc->start + sector;
-
- if (!io->first_clone) {
- /*
- * hold a reference to the first clone, because it
- * holds the bio_vec array and that can't be freed
- * before all other clones are released
- */
- bio_get(clone);
- io->first_clone = clone;
- }
-
remaining -= clone->bi_size;
sector += bio_sectors(clone);
- /* prevent bio_put of first_clone */
+ /* Grab another reference to the io struct
+ * before we kick off the request */
if (remaining)
atomic_inc(&io->pending);
generic_make_request(clone);
+ /* Do not reference clone after this - it
+ * may be gone already. */
+
/* out of memory -> run queues */
if (remaining)
- congestion_wait(bio_data_dir(clone), HZ/100);
+ congestion_wait(WRITE, HZ/100);
}
}
@@ -832,6 +817,8 @@
cc->iv_gen_ops = &crypt_iv_essiv_ops;
else if (strcmp(ivmode, "benbi") == 0)
cc->iv_gen_ops = &crypt_iv_benbi_ops;
+ else if (strcmp(ivmode, "null") == 0)
+ cc->iv_gen_ops = &crypt_iv_null_ops;
else {
ti->error = "Invalid IV mode";
goto bad2;
@@ -954,10 +941,12 @@
struct crypt_config *cc = ti->private;
struct crypt_io *io;
+ if (bio_barrier(bio))
+ return -EOPNOTSUPP;
+
io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti;
io->base_bio = bio;
- io->first_clone = NULL;
io->error = io->post_process = 0;
atomic_set(&io->pending, 0);
kcryptd_queue_io(io);
@@ -1057,7 +1046,7 @@
static struct target_type crypt_target = {
.name = "crypt",
- .version= {1, 3, 0},
+ .version= {1, 5, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
new file mode 100644
index 0000000..52c7cf9
--- /dev/null
+++ b/drivers/md/dm-delay.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (C) 2005-2007 Red Hat GmbH
+ *
+ * A target that delays reads and/or writes and can send
+ * them to different devices.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+
+#define DM_MSG_PREFIX "delay"
+
+struct delay_c {
+ struct timer_list delay_timer;
+ struct semaphore timer_lock;
+ struct work_struct flush_expired_bios;
+ struct list_head delayed_bios;
+ atomic_t may_delay;
+ mempool_t *delayed_pool;
+
+ struct dm_dev *dev_read;
+ sector_t start_read;
+ unsigned read_delay;
+ unsigned reads;
+
+ struct dm_dev *dev_write;
+ sector_t start_write;
+ unsigned write_delay;
+ unsigned writes;
+};
+
+struct delay_info {
+ struct delay_c *context;
+ struct list_head list;
+ struct bio *bio;
+ unsigned long expires;
+};
+
+static DEFINE_MUTEX(delayed_bios_lock);
+
+static struct workqueue_struct *kdelayd_wq;
+static struct kmem_cache *delayed_cache;
+
+static void handle_delayed_timer(unsigned long data)
+{
+ struct delay_c *dc = (struct delay_c *)data;
+
+ queue_work(kdelayd_wq, &dc->flush_expired_bios);
+}
+
+static void queue_timeout(struct delay_c *dc, unsigned long expires)
+{
+ down(&dc->timer_lock);
+
+ if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
+ mod_timer(&dc->delay_timer, expires);
+
+ up(&dc->timer_lock);
+}
+
+static void flush_bios(struct bio *bio)
+{
+ struct bio *n;
+
+ while (bio) {
+ n = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = n;
+ }
+}
+
+static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
+{
+ struct delay_info *delayed, *next;
+ unsigned long next_expires = 0;
+ int start_timer = 0;
+ BIO_LIST(flush_bios);
+
+ mutex_lock(&delayed_bios_lock);
+ list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+ if (flush_all || time_after_eq(jiffies, delayed->expires)) {
+ list_del(&delayed->list);
+ bio_list_add(&flush_bios, delayed->bio);
+ if ((bio_data_dir(delayed->bio) == WRITE))
+ delayed->context->writes--;
+ else
+ delayed->context->reads--;
+ mempool_free(delayed, dc->delayed_pool);
+ continue;
+ }
+
+ if (!start_timer) {
+ start_timer = 1;
+ next_expires = delayed->expires;
+ } else
+ next_expires = min(next_expires, delayed->expires);
+ }
+
+ mutex_unlock(&delayed_bios_lock);
+
+ if (start_timer)
+ queue_timeout(dc, next_expires);
+
+ return bio_list_get(&flush_bios);
+}
+
+static void flush_expired_bios(struct work_struct *work)
+{
+ struct delay_c *dc;
+
+ dc = container_of(work, struct delay_c, flush_expired_bios);
+ flush_bios(flush_delayed_bios(dc, 0));
+}
+
+/*
+ * Mapping parameters:
+ * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+ *
+ * With separate write parameters, the first set is only used for reads.
+ * Delays are specified in milliseconds.
+ */
+static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct delay_c *dc;
+ unsigned long long tmpll;
+
+ if (argc != 3 && argc != 6) {
+ ti->error = "requires exactly 3 or 6 arguments";
+ return -EINVAL;
+ }
+
+ dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+ if (!dc) {
+ ti->error = "Cannot allocate context";
+ return -ENOMEM;
+ }
+
+ dc->reads = dc->writes = 0;
+
+ if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+ ti->error = "Invalid device sector";
+ goto bad;
+ }
+ dc->start_read = tmpll;
+
+ if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+ ti->error = "Invalid delay";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[0], dc->start_read, ti->len,
+ dm_table_get_mode(ti->table), &dc->dev_read)) {
+ ti->error = "Device lookup failed";
+ goto bad;
+ }
+
+ if (argc == 3) {
+ dc->dev_write = NULL;
+ goto out;
+ }
+
+ if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ ti->error = "Invalid write device sector";
+ goto bad;
+ }
+ dc->start_write = tmpll;
+
+ if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+ ti->error = "Invalid write delay";
+ goto bad;
+ }
+
+ if (dm_get_device(ti, argv[3], dc->start_write, ti->len,
+ dm_table_get_mode(ti->table), &dc->dev_write)) {
+ ti->error = "Write device lookup failed";
+ dm_put_device(ti, dc->dev_read);
+ goto bad;
+ }
+
+out:
+ dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
+ if (!dc->delayed_pool) {
+ DMERR("Couldn't create delayed bio pool.");
+ goto bad;
+ }
+
+ init_timer(&dc->delay_timer);
+ dc->delay_timer.function = handle_delayed_timer;
+ dc->delay_timer.data = (unsigned long)dc;
+
+ INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+ INIT_LIST_HEAD(&dc->delayed_bios);
+ init_MUTEX(&dc->timer_lock);
+ atomic_set(&dc->may_delay, 1);
+
+ ti->private = dc;
+ return 0;
+
+bad:
+ kfree(dc);
+ return -EINVAL;
+}
+
+static void delay_dtr(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ flush_workqueue(kdelayd_wq);
+
+ dm_put_device(ti, dc->dev_read);
+
+ if (dc->dev_write)
+ dm_put_device(ti, dc->dev_write);
+
+ mempool_destroy(dc->delayed_pool);
+ kfree(dc);
+}
+
+static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
+{
+ struct delay_info *delayed;
+ unsigned long expires = 0;
+
+ if (!delay || !atomic_read(&dc->may_delay))
+ return 1;
+
+ delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO);
+
+ delayed->context = dc;
+ delayed->bio = bio;
+ delayed->expires = expires = jiffies + (delay * HZ / 1000);
+
+ mutex_lock(&delayed_bios_lock);
+
+ if (bio_data_dir(bio) == WRITE)
+ dc->writes++;
+ else
+ dc->reads++;
+
+ list_add_tail(&delayed->list, &dc->delayed_bios);
+
+ mutex_unlock(&delayed_bios_lock);
+
+ queue_timeout(dc, expires);
+
+ return 0;
+}
+
+static void delay_presuspend(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ atomic_set(&dc->may_delay, 0);
+ del_timer_sync(&dc->delay_timer);
+ flush_bios(flush_delayed_bios(dc, 1));
+}
+
+static void delay_resume(struct dm_target *ti)
+{
+ struct delay_c *dc = ti->private;
+
+ atomic_set(&dc->may_delay, 1);
+}
+
+static int delay_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct delay_c *dc = ti->private;
+
+ if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
+ bio->bi_bdev = dc->dev_write->bdev;
+ bio->bi_sector = dc->start_write +
+ (bio->bi_sector - ti->begin);
+
+ return delay_bio(dc, dc->write_delay, bio);
+ }
+
+ bio->bi_bdev = dc->dev_read->bdev;
+ bio->bi_sector = dc->start_read +
+ (bio->bi_sector - ti->begin);
+
+ return delay_bio(dc, dc->read_delay, bio);
+}
+
+static int delay_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ struct delay_c *dc = ti->private;
+ int sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%u %u", dc->reads, dc->writes);
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %llu %u", dc->dev_read->name,
+ (unsigned long long) dc->start_read,
+ dc->read_delay);
+ if (dc->dev_write)
+ DMEMIT("%s %llu %u", dc->dev_write->name,
+ (unsigned long long) dc->start_write,
+ dc->write_delay);
+ break;
+ }
+
+ return 0;
+}
+
+static struct target_type delay_target = {
+ .name = "delay",
+ .version = {1, 0, 2},
+ .module = THIS_MODULE,
+ .ctr = delay_ctr,
+ .dtr = delay_dtr,
+ .map = delay_map,
+ .presuspend = delay_presuspend,
+ .resume = delay_resume,
+ .status = delay_status,
+};
+
+static int __init dm_delay_init(void)
+{
+ int r = -ENOMEM;
+
+ kdelayd_wq = create_workqueue("kdelayd");
+ if (!kdelayd_wq) {
+ DMERR("Couldn't start kdelayd");
+ goto bad_queue;
+ }
+
+ delayed_cache = kmem_cache_create("dm-delay",
+ sizeof(struct delay_info),
+ __alignof__(struct delay_info),
+ 0, NULL, NULL);
+ if (!delayed_cache) {
+ DMERR("Couldn't create delayed bio cache.");
+ goto bad_memcache;
+ }
+
+ r = dm_register_target(&delay_target);
+ if (r < 0) {
+ DMERR("register failed %d", r);
+ goto bad_register;
+ }
+
+ return 0;
+
+bad_register:
+ kmem_cache_destroy(delayed_cache);
+bad_memcache:
+ destroy_workqueue(kdelayd_wq);
+bad_queue:
+ return r;
+}
+
+static void __exit dm_delay_exit(void)
+{
+ int r = dm_unregister_target(&delay_target);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+
+ kmem_cache_destroy(delayed_cache);
+ destroy_workqueue(kdelayd_wq);
+}
+
+/* Module hooks */
+module_init(dm_delay_init);
+module_exit(dm_delay_exit);
+
+MODULE_DESCRIPTION(DM_NAME " delay target");
+MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 99cdffa..07e0a0c 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -1,7 +1,8 @@
/*
- * dm-snapshot.c
+ * dm-exception-store.c
*
* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*/
@@ -123,6 +124,7 @@
atomic_t pending_count;
uint32_t callback_count;
struct commit_callback *callbacks;
+ struct dm_io_client *io_client;
};
static inline unsigned int sectors_to_pages(unsigned int sectors)
@@ -159,14 +161,20 @@
*/
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
- struct io_region where;
- unsigned long bits;
+ struct io_region where = {
+ .bdev = ps->snap->cow->bdev,
+ .sector = ps->snap->chunk_size * chunk,
+ .count = ps->snap->chunk_size,
+ };
+ struct dm_io_request io_req = {
+ .bi_rw = rw,
+ .mem.type = DM_IO_VMA,
+ .mem.ptr.vma = ps->area,
+ .client = ps->io_client,
+ .notify.fn = NULL,
+ };
- where.bdev = ps->snap->cow->bdev;
- where.sector = ps->snap->chunk_size * chunk;
- where.count = ps->snap->chunk_size;
-
- return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
+ return dm_io(&io_req, 1, &where, NULL);
}
/*
@@ -213,17 +221,18 @@
chunk_size_supplied = 0;
}
- r = dm_io_get(sectors_to_pages(ps->snap->chunk_size));
- if (r)
- return r;
+ ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+ chunk_size));
+ if (IS_ERR(ps->io_client))
+ return PTR_ERR(ps->io_client);
r = alloc_area(ps);
if (r)
- goto bad1;
+ return r;
r = chunk_io(ps, 0, READ);
if (r)
- goto bad2;
+ goto bad;
dh = (struct disk_header *) ps->area;
@@ -235,7 +244,7 @@
if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
DMWARN("Invalid or corrupt snapshot");
r = -ENXIO;
- goto bad2;
+ goto bad;
}
*new_snapshot = 0;
@@ -252,27 +261,22 @@
(unsigned long long)ps->snap->chunk_size);
/* We had a bogus chunk_size. Fix stuff up. */
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
free_area(ps);
ps->snap->chunk_size = chunk_size;
ps->snap->chunk_mask = chunk_size - 1;
ps->snap->chunk_shift = ffs(chunk_size) - 1;
- r = dm_io_get(sectors_to_pages(chunk_size));
+ r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+ ps->io_client);
if (r)
return r;
r = alloc_area(ps);
- if (r)
- goto bad1;
+ return r;
- return 0;
-
-bad2:
+bad:
free_area(ps);
-bad1:
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
return r;
}
@@ -405,7 +409,7 @@
{
struct pstore *ps = get_info(store);
- dm_io_put(sectors_to_pages(ps->snap->chunk_size));
+ dm_io_client_destroy(ps->io_client);
vfree(ps->callbacks);
free_area(ps);
kfree(ps);
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
index 32eff28..e0832e6 100644
--- a/drivers/md/dm-hw-handler.h
+++ b/drivers/md/dm-hw-handler.h
@@ -16,6 +16,7 @@
struct hw_handler_type;
struct hw_handler {
struct hw_handler_type *type;
+ struct mapped_device *md;
void *context;
};
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 8bdc8a8..352c6fb 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*/
@@ -12,13 +13,17 @@
#include <linux/sched.h>
#include <linux/slab.h>
-static struct bio_set *_bios;
+struct dm_io_client {
+ mempool_t *pool;
+ struct bio_set *bios;
+};
/* FIXME: can we shrink this ? */
struct io {
unsigned long error;
atomic_t count;
struct task_struct *sleeper;
+ struct dm_io_client *client;
io_notify_fn callback;
void *context;
};
@@ -26,63 +31,58 @@
/*
* io contexts are only dynamically allocated for asynchronous
* io. Since async io is likely to be the majority of io we'll
- * have the same number of io contexts as buffer heads ! (FIXME:
- * must reduce this).
+ * have the same number of io contexts as bios! (FIXME: must reduce this).
*/
-static unsigned _num_ios;
-static mempool_t *_io_pool;
static unsigned int pages_to_ios(unsigned int pages)
{
return 4 * pages; /* too many ? */
}
-static int resize_pool(unsigned int new_ios)
+/*
+ * Create a client with mempool and bioset.
+ */
+struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
- int r = 0;
+ unsigned ios = pages_to_ios(num_pages);
+ struct dm_io_client *client;
- if (_io_pool) {
- if (new_ios == 0) {
- /* free off the pool */
- mempool_destroy(_io_pool);
- _io_pool = NULL;
- bioset_free(_bios);
+ client = kmalloc(sizeof(*client), GFP_KERNEL);
+ if (!client)
+ return ERR_PTR(-ENOMEM);
- } else {
- /* resize the pool */
- r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
- }
+ client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
+ if (!client->pool)
+ goto bad;
- } else {
- /* create new pool */
- _io_pool = mempool_create_kmalloc_pool(new_ios,
- sizeof(struct io));
- if (!_io_pool)
- return -ENOMEM;
+ client->bios = bioset_create(16, 16);
+ if (!client->bios)
+ goto bad;
- _bios = bioset_create(16, 16);
- if (!_bios) {
- mempool_destroy(_io_pool);
- _io_pool = NULL;
- return -ENOMEM;
- }
- }
+ return client;
- if (!r)
- _num_ios = new_ios;
-
- return r;
+ bad:
+ if (client->pool)
+ mempool_destroy(client->pool);
+ kfree(client);
+ return ERR_PTR(-ENOMEM);
}
+EXPORT_SYMBOL(dm_io_client_create);
-int dm_io_get(unsigned int num_pages)
+int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
- return resize_pool(_num_ios + pages_to_ios(num_pages));
+ return mempool_resize(client->pool, pages_to_ios(num_pages),
+ GFP_KERNEL);
}
+EXPORT_SYMBOL(dm_io_client_resize);
-void dm_io_put(unsigned int num_pages)
+void dm_io_client_destroy(struct dm_io_client *client)
{
- resize_pool(_num_ios - pages_to_ios(num_pages));
+ mempool_destroy(client->pool);
+ bioset_free(client->bios);
+ kfree(client);
}
+EXPORT_SYMBOL(dm_io_client_destroy);
/*-----------------------------------------------------------------
* We need to keep track of which region a bio is doing io for.
@@ -118,7 +118,7 @@
io_notify_fn fn = io->callback;
void *context = io->context;
- mempool_free(io, _io_pool);
+ mempool_free(io, io->client->pool);
fn(r, context);
}
}
@@ -126,7 +126,8 @@
static int endio(struct bio *bio, unsigned int done, int error)
{
- struct io *io = (struct io *) bio->bi_private;
+ struct io *io;
+ unsigned region;
/* keep going until we've finished */
if (bio->bi_size)
@@ -135,10 +136,17 @@
if (error && bio_data_dir(bio) == READ)
zero_fill_bio(bio);
- dec_count(io, bio_get_region(bio), error);
+ /*
+ * The bio destructor in bio_put() may use the io object.
+ */
+ io = bio->bi_private;
+ region = bio_get_region(bio);
+
bio->bi_max_vecs++;
bio_put(bio);
+ dec_count(io, region, error);
+
return 0;
}
@@ -209,6 +217,9 @@
dp->context_ptr = bvec;
}
+/*
+ * Functions for getting the pages from a VMA.
+ */
static void vm_get_page(struct dpages *dp,
struct page **p, unsigned long *len, unsigned *offset)
{
@@ -233,7 +244,34 @@
static void dm_bio_destructor(struct bio *bio)
{
- bio_free(bio, _bios);
+ struct io *io = bio->bi_private;
+
+ bio_free(bio, io->client->bios);
+}
+
+/*
+ * Functions for getting the pages from kernel memory.
+ */
+static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
+ unsigned *offset)
+{
+ *p = virt_to_page(dp->context_ptr);
+ *offset = dp->context_u;
+ *len = PAGE_SIZE - dp->context_u;
+}
+
+static void km_next_page(struct dpages *dp)
+{
+ dp->context_ptr += PAGE_SIZE - dp->context_u;
+ dp->context_u = 0;
+}
+
+static void km_dp_init(struct dpages *dp, void *data)
+{
+ dp->get_page = km_get_page;
+ dp->next_page = km_next_page;
+ dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
+ dp->context_ptr = data;
}
/*-----------------------------------------------------------------
@@ -256,7 +294,7 @@
* to hide it from bio_add_page().
*/
num_bvecs = (remaining / (PAGE_SIZE >> SECTOR_SHIFT)) + 2;
- bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
+ bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
@@ -311,8 +349,9 @@
dec_count(io, 0, 0);
}
-static int sync_io(unsigned int num_regions, struct io_region *where,
- int rw, struct dpages *dp, unsigned long *error_bits)
+static int sync_io(struct dm_io_client *client, unsigned int num_regions,
+ struct io_region *where, int rw, struct dpages *dp,
+ unsigned long *error_bits)
{
struct io io;
@@ -324,6 +363,7 @@
io.error = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
io.sleeper = current;
+ io.client = client;
dispatch_io(rw, num_regions, where, dp, &io, 1);
@@ -340,12 +380,15 @@
if (atomic_read(&io.count))
return -EINTR;
- *error_bits = io.error;
+ if (error_bits)
+ *error_bits = io.error;
+
return io.error ? -EIO : 0;
}
-static int async_io(unsigned int num_regions, struct io_region *where, int rw,
- struct dpages *dp, io_notify_fn fn, void *context)
+static int async_io(struct dm_io_client *client, unsigned int num_regions,
+ struct io_region *where, int rw, struct dpages *dp,
+ io_notify_fn fn, void *context)
{
struct io *io;
@@ -355,10 +398,11 @@
return -EIO;
}
- io = mempool_alloc(_io_pool, GFP_NOIO);
+ io = mempool_alloc(client->pool, GFP_NOIO);
io->error = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = NULL;
+ io->client = client;
io->callback = fn;
io->context = context;
@@ -366,61 +410,51 @@
return 0;
}
-int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- unsigned long *error_bits)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
- struct dpages dp;
- list_dp_init(&dp, pl, offset);
- return sync_io(num_regions, where, rw, &dp, error_bits);
+ /* Set up dpages based on memory type */
+ switch (io_req->mem.type) {
+ case DM_IO_PAGE_LIST:
+ list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
+ break;
+
+ case DM_IO_BVEC:
+ bvec_dp_init(dp, io_req->mem.ptr.bvec);
+ break;
+
+ case DM_IO_VMA:
+ vm_dp_init(dp, io_req->mem.ptr.vma);
+ break;
+
+ case DM_IO_KMEM:
+ km_dp_init(dp, io_req->mem.ptr.addr);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
}
-int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, unsigned long *error_bits)
+/*
+ * New collapsed (a)synchronous interface
+ */
+int dm_io(struct dm_io_request *io_req, unsigned num_regions,
+ struct io_region *where, unsigned long *sync_error_bits)
{
+ int r;
struct dpages dp;
- bvec_dp_init(&dp, bvec);
- return sync_io(num_regions, where, rw, &dp, error_bits);
-}
-int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, unsigned long *error_bits)
-{
- struct dpages dp;
- vm_dp_init(&dp, data);
- return sync_io(num_regions, where, rw, &dp, error_bits);
-}
+ r = dp_init(io_req, &dp);
+ if (r)
+ return r;
-int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- io_notify_fn fn, void *context)
-{
- struct dpages dp;
- list_dp_init(&dp, pl, offset);
- return async_io(num_regions, where, rw, &dp, fn, context);
-}
+ if (!io_req->notify.fn)
+ return sync_io(io_req->client, num_regions, where,
+ io_req->bi_rw, &dp, sync_error_bits);
-int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, io_notify_fn fn, void *context)
-{
- struct dpages dp;
- bvec_dp_init(&dp, bvec);
- return async_io(num_regions, where, rw, &dp, fn, context);
+ return async_io(io_req->client, num_regions, where, io_req->bi_rw,
+ &dp, io_req->notify.fn, io_req->notify.context);
}
-
-int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, io_notify_fn fn, void *context)
-{
- struct dpages dp;
- vm_dp_init(&dp, data);
- return async_io(num_regions, where, rw, &dp, fn, context);
-}
-
-EXPORT_SYMBOL(dm_io_get);
-EXPORT_SYMBOL(dm_io_put);
-EXPORT_SYMBOL(dm_io_sync);
-EXPORT_SYMBOL(dm_io_async);
-EXPORT_SYMBOL(dm_io_sync_bvec);
-EXPORT_SYMBOL(dm_io_async_bvec);
-EXPORT_SYMBOL(dm_io_sync_vm);
-EXPORT_SYMBOL(dm_io_async_vm);
+EXPORT_SYMBOL(dm_io);
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
index f9035bf..f647e2c 100644
--- a/drivers/md/dm-io.h
+++ b/drivers/md/dm-io.h
@@ -12,7 +12,7 @@
struct io_region {
struct block_device *bdev;
sector_t sector;
- sector_t count;
+ sector_t count; /* If this is zero the region is ignored. */
};
struct page_list {
@@ -20,55 +20,60 @@
struct page *page;
};
-
-/*
- * 'error' is a bitset, with each bit indicating whether an error
- * occurred doing io to the corresponding region.
- */
typedef void (*io_notify_fn)(unsigned long error, void *context);
+enum dm_io_mem_type {
+ DM_IO_PAGE_LIST,/* Page list */
+ DM_IO_BVEC, /* Bio vector */
+ DM_IO_VMA, /* Virtual memory area */
+ DM_IO_KMEM, /* Kernel memory */
+};
+
+struct dm_io_memory {
+ enum dm_io_mem_type type;
+
+ union {
+ struct page_list *pl;
+ struct bio_vec *bvec;
+ void *vma;
+ void *addr;
+ } ptr;
+
+ unsigned offset;
+};
+
+struct dm_io_notify {
+ io_notify_fn fn; /* Callback for asynchronous requests */
+ void *context; /* Passed to callback */
+};
/*
- * Before anyone uses the IO interface they should call
- * dm_io_get(), specifying roughly how many pages they are
- * expecting to perform io on concurrently.
- *
- * This function may block.
+ * IO request structure
*/
-int dm_io_get(unsigned int num_pages);
-void dm_io_put(unsigned int num_pages);
+struct dm_io_client;
+struct dm_io_request {
+ int bi_rw; /* READ|WRITE - not READA */
+ struct dm_io_memory mem; /* Memory to use for io */
+ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */
+ struct dm_io_client *client; /* Client memory handler */
+};
/*
- * Synchronous IO.
+ * For async io calls, users can alternatively use the dm_io() function below
+ * and dm_io_client_create() to create private mempools for the client.
*
- * Please ensure that the rw flag in the next two functions is
- * either READ or WRITE, ie. we don't take READA. Any
- * regions with a zero count field will be ignored.
+ * Create/destroy may block.
*/
-int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- unsigned long *error_bits);
-
-int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, unsigned long *error_bits);
-
-int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, unsigned long *error_bits);
+struct dm_io_client *dm_io_client_create(unsigned num_pages);
+int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client);
+void dm_io_client_destroy(struct dm_io_client *client);
/*
- * Aynchronous IO.
- *
- * The 'where' array may be safely allocated on the stack since
- * the function takes a copy.
+ * IO interface using private per-client pools.
+ * Each bit in the optional 'sync_error_bits' bitset indicates whether an
+ * error occurred doing io to the corresponding region.
*/
-int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
- struct page_list *pl, unsigned int offset,
- io_notify_fn fn, void *context);
-
-int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
- struct bio_vec *bvec, io_notify_fn fn, void *context);
-
-int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
- void *data, io_notify_fn fn, void *context);
+int dm_io(struct dm_io_request *io_req, unsigned num_regions,
+ struct io_region *region, unsigned long *sync_error_bits);
#endif
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6a92613..a66428d 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -149,9 +149,12 @@
FORCESYNC, /* Force a sync to happen */
} sync;
+ struct dm_io_request io_req;
+
/*
* Disk log fields
*/
+ int log_dev_failed;
struct dm_dev *log_dev;
struct log_header header;
@@ -199,13 +202,20 @@
core->nr_regions = le64_to_cpu(disk->nr_regions);
}
+static int rw_header(struct log_c *lc, int rw)
+{
+ lc->io_req.bi_rw = rw;
+ lc->io_req.mem.ptr.vma = lc->disk_header;
+ lc->io_req.notify.fn = NULL;
+
+ return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
+}
+
static int read_header(struct log_c *log)
{
int r;
- unsigned long ebits;
- r = dm_io_sync_vm(1, &log->header_location, READ,
- log->disk_header, &ebits);
+ r = rw_header(log, READ);
if (r)
return r;
@@ -233,11 +243,8 @@
static inline int write_header(struct log_c *log)
{
- unsigned long ebits;
-
header_to_disk(&log->header, log->disk_header);
- return dm_io_sync_vm(1, &log->header_location, WRITE,
- log->disk_header, &ebits);
+ return rw_header(log, WRITE);
}
/*----------------------------------------------------------------
@@ -256,6 +263,7 @@
uint32_t region_size;
unsigned int region_count;
size_t bitset_size, buf_size;
+ int r;
if (argc < 1 || argc > 2) {
DMWARN("wrong number of arguments to mirror log");
@@ -315,6 +323,7 @@
lc->disk_header = NULL;
} else {
lc->log_dev = dev;
+ lc->log_dev_failed = 0;
lc->header_location.bdev = lc->log_dev->bdev;
lc->header_location.sector = 0;
@@ -324,6 +333,15 @@
buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
bitset_size, ti->limits.hardsect_size);
lc->header_location.count = buf_size >> SECTOR_SHIFT;
+ lc->io_req.mem.type = DM_IO_VMA;
+ lc->io_req.client = dm_io_client_create(dm_div_up(buf_size,
+ PAGE_SIZE));
+ if (IS_ERR(lc->io_req.client)) {
+ r = PTR_ERR(lc->io_req.client);
+ DMWARN("couldn't allocate disk io client");
+ kfree(lc);
+ return -ENOMEM;
+ }
lc->disk_header = vmalloc(buf_size);
if (!lc->disk_header) {
@@ -424,6 +442,7 @@
dm_put_device(lc->ti, lc->log_dev);
vfree(lc->disk_header);
+ dm_io_client_destroy(lc->io_req.client);
destroy_log_context(lc);
}
@@ -437,6 +456,15 @@
return count;
}
+static void fail_log_device(struct log_c *lc)
+{
+ if (lc->log_dev_failed)
+ return;
+
+ lc->log_dev_failed = 1;
+ dm_table_event(lc->ti->table);
+}
+
static int disk_resume(struct dirty_log *log)
{
int r;
@@ -446,8 +474,19 @@
/* read the disk header */
r = read_header(lc);
- if (r)
- return r;
+ if (r) {
+ DMWARN("%s: Failed to read header on mirror log device",
+ lc->log_dev->name);
+ fail_log_device(lc);
+ /*
+ * If the log device cannot be read, we must assume
+ * all regions are out-of-sync. If we simply return
+ * here, the state will be uninitialized and could
+ * lead us to return 'in-sync' status for regions
+ * that are actually 'out-of-sync'.
+ */
+ lc->header.nr_regions = 0;
+ }
/* set or clear any new bits -- device has grown */
if (lc->sync == NOSYNC)
@@ -472,7 +511,14 @@
lc->header.nr_regions = lc->region_count;
/* write the new header */
- return write_header(lc);
+ r = write_header(lc);
+ if (r) {
+ DMWARN("%s: Failed to write header on mirror log device",
+ lc->log_dev->name);
+ fail_log_device(lc);
+ }
+
+ return r;
}
static uint32_t core_get_region_size(struct dirty_log *log)
@@ -516,7 +562,9 @@
return 0;
r = write_header(lc);
- if (!r)
+ if (r)
+ fail_log_device(lc);
+ else
lc->touched = 0;
return r;
@@ -591,6 +639,7 @@
switch(status) {
case STATUSTYPE_INFO:
+ DMEMIT("1 %s", log->type->name);
break;
case STATUSTYPE_TABLE:
@@ -606,17 +655,17 @@
char *result, unsigned int maxlen)
{
int sz = 0;
- char buffer[16];
struct log_c *lc = log->context;
switch(status) {
case STATUSTYPE_INFO:
+ DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
+ lc->log_dev_failed ? 'D' : 'A');
break;
case STATUSTYPE_TABLE:
- format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
DMEMIT("%s %u %s %u ", log->type->name,
- lc->sync == DEFAULTSYNC ? 2 : 3, buffer,
+ lc->sync == DEFAULTSYNC ? 2 : 3, lc->log_dev->name,
lc->region_size);
DMEMIT_SYNC;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 3aa0135..de54b39 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -668,6 +668,9 @@
return -EINVAL;
}
+ m->hw_handler.md = dm_table_get_md(ti->table);
+ dm_put(m->hw_handler.md);
+
r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
if (r) {
dm_put_hw_handler(hwht);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 23a6426..ef124b7 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -21,16 +21,12 @@
#include <linux/workqueue.h>
#define DM_MSG_PREFIX "raid1"
+#define DM_IO_PAGES 64
-static struct workqueue_struct *_kmirrord_wq;
-static struct work_struct _kmirrord_work;
+#define DM_RAID1_HANDLE_ERRORS 0x01
+
static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
-static inline void wake(void)
-{
- queue_work(_kmirrord_wq, &_kmirrord_work);
-}
-
/*-----------------------------------------------------------------
* Region hash
*
@@ -125,17 +121,23 @@
struct list_head list;
struct region_hash rh;
struct kcopyd_client *kcopyd_client;
+ uint64_t features;
spinlock_t lock; /* protects the next two lists */
struct bio_list reads;
struct bio_list writes;
+ struct dm_io_client *io_client;
+
/* recovery */
region_t nr_regions;
int in_sync;
struct mirror *default_mirror; /* Default mirror */
+ struct workqueue_struct *kmirrord_wq;
+ struct work_struct kmirrord_work;
+
unsigned int nr_mirrors;
struct mirror mirror[0];
};
@@ -153,6 +155,11 @@
return region << rh->region_shift;
}
+static void wake(struct mirror_set *ms)
+{
+ queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
+}
+
/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
@@ -398,8 +405,7 @@
mempool_free(reg, rh->region_pool);
}
- if (!list_empty(&recovered))
- rh->log->type->flush(rh->log);
+ rh->log->type->flush(rh->log);
list_for_each_entry_safe (reg, next, &clean, list)
mempool_free(reg, rh->region_pool);
@@ -471,7 +477,7 @@
spin_unlock_irqrestore(&rh->region_lock, flags);
if (should_wake)
- wake();
+ wake(rh->ms);
}
/*
@@ -558,7 +564,7 @@
list_add(®->list, ®->rh->recovered_regions);
spin_unlock_irq(&rh->region_lock);
- wake();
+ wake(rh->ms);
}
static void rh_flush(struct region_hash *rh)
@@ -592,7 +598,7 @@
for (i = 0; i < MAX_RECOVERY; i++)
up(&rh->recovery_count);
- wake();
+ wake(rh->ms);
}
/*
@@ -735,7 +741,7 @@
/*
* We can only read balance if the region is in sync.
*/
- if (rh_in_sync(&ms->rh, region, 0))
+ if (rh_in_sync(&ms->rh, region, 1))
m = choose_mirror(ms, bio->bi_sector);
else
m = ms->default_mirror;
@@ -792,6 +798,14 @@
unsigned int i;
struct io_region io[KCOPYD_MAX_REGIONS+1];
struct mirror *m;
+ struct dm_io_request io_req = {
+ .bi_rw = WRITE,
+ .mem.type = DM_IO_BVEC,
+ .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
+ .notify.fn = write_callback,
+ .notify.context = bio,
+ .client = ms->io_client,
+ };
for (i = 0; i < ms->nr_mirrors; i++) {
m = ms->mirror + i;
@@ -802,9 +816,8 @@
}
bio_set_ms(bio, ms);
- dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
- bio->bi_io_vec + bio->bi_idx,
- write_callback, bio);
+
+ (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
}
static void do_writes(struct mirror_set *ms, struct bio_list *writes)
@@ -870,11 +883,10 @@
/*-----------------------------------------------------------------
* kmirrord
*---------------------------------------------------------------*/
-static LIST_HEAD(_mirror_sets);
-static DECLARE_RWSEM(_mirror_sets_lock);
-
-static void do_mirror(struct mirror_set *ms)
+static void do_mirror(struct work_struct *work)
{
+ struct mirror_set *ms =container_of(work, struct mirror_set,
+ kmirrord_work);
struct bio_list reads, writes;
spin_lock(&ms->lock);
@@ -890,16 +902,6 @@
do_writes(ms, &writes);
}
-static void do_work(struct work_struct *ignored)
-{
- struct mirror_set *ms;
-
- down_read(&_mirror_sets_lock);
- list_for_each_entry (ms, &_mirror_sets, list)
- do_mirror(ms);
- up_read(&_mirror_sets_lock);
-}
-
/*-----------------------------------------------------------------
* Target functions
*---------------------------------------------------------------*/
@@ -931,6 +933,13 @@
ms->in_sync = 0;
ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+ ms->io_client = dm_io_client_create(DM_IO_PAGES);
+ if (IS_ERR(ms->io_client)) {
+ ti->error = "Error creating dm_io client";
+ kfree(ms);
+ return NULL;
+ }
+
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
ti->error = "Error creating dirty region hash";
kfree(ms);
@@ -946,6 +955,7 @@
while (m--)
dm_put_device(ti, ms->mirror[m].dev);
+ dm_io_client_destroy(ms->io_client);
rh_exit(&ms->rh);
kfree(ms);
}
@@ -978,23 +988,6 @@
return 0;
}
-static int add_mirror_set(struct mirror_set *ms)
-{
- down_write(&_mirror_sets_lock);
- list_add_tail(&ms->list, &_mirror_sets);
- up_write(&_mirror_sets_lock);
- wake();
-
- return 0;
-}
-
-static void del_mirror_set(struct mirror_set *ms)
-{
- down_write(&_mirror_sets_lock);
- list_del(&ms->list);
- up_write(&_mirror_sets_lock);
-}
-
/*
* Create dirty log: log_type #log_params <log_params>
*/
@@ -1037,16 +1030,55 @@
return dl;
}
+static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
+ unsigned *args_used)
+{
+ unsigned num_features;
+ struct dm_target *ti = ms->ti;
+
+ *args_used = 0;
+
+ if (!argc)
+ return 0;
+
+ if (sscanf(argv[0], "%u", &num_features) != 1) {
+ ti->error = "Invalid number of features";
+ return -EINVAL;
+ }
+
+ argc--;
+ argv++;
+ (*args_used)++;
+
+ if (num_features > argc) {
+ ti->error = "Not enough arguments to support feature count";
+ return -EINVAL;
+ }
+
+ if (!strcmp("handle_errors", argv[0]))
+ ms->features |= DM_RAID1_HANDLE_ERRORS;
+ else {
+ ti->error = "Unrecognised feature requested";
+ return -EINVAL;
+ }
+
+ (*args_used)++;
+
+ return 0;
+}
+
/*
* Construct a mirror mapping:
*
* log_type #log_params <log_params>
* #mirrors [mirror_path offset]{2,}
+ * [#features <features>]
*
* log_type is "core" or "disk"
* #log_params is between 1 and 3
+ *
+ * If present, features must be "handle_errors".
*/
-#define DM_IO_PAGES 64
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
int r;
@@ -1070,8 +1102,8 @@
argv++, argc--;
- if (argc != nr_mirrors * 2) {
- ti->error = "Wrong number of mirror arguments";
+ if (argc < nr_mirrors * 2) {
+ ti->error = "Too few mirror arguments";
dm_destroy_dirty_log(dl);
return -EINVAL;
}
@@ -1096,13 +1128,37 @@
ti->private = ms;
ti->split_io = ms->rh.region_size;
- r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
+ ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
+ if (!ms->kmirrord_wq) {
+ DMERR("couldn't start kmirrord");
+ free_context(ms, ti, m);
+ return -ENOMEM;
+ }
+ INIT_WORK(&ms->kmirrord_work, do_mirror);
+
+ r = parse_features(ms, argc, argv, &args_used);
if (r) {
free_context(ms, ti, ms->nr_mirrors);
return r;
}
- add_mirror_set(ms);
+ argv += args_used;
+ argc -= args_used;
+
+ if (argc) {
+ ti->error = "Too many mirror arguments";
+ free_context(ms, ti, ms->nr_mirrors);
+ return -EINVAL;
+ }
+
+ r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
+ if (r) {
+ destroy_workqueue(ms->kmirrord_wq);
+ free_context(ms, ti, ms->nr_mirrors);
+ return r;
+ }
+
+ wake(ms);
return 0;
}
@@ -1110,8 +1166,9 @@
{
struct mirror_set *ms = (struct mirror_set *) ti->private;
- del_mirror_set(ms);
+ flush_workqueue(ms->kmirrord_wq);
kcopyd_client_destroy(ms->kcopyd_client);
+ destroy_workqueue(ms->kmirrord_wq);
free_context(ms, ti, ms->nr_mirrors);
}
@@ -1127,7 +1184,7 @@
spin_unlock(&ms->lock);
if (should_wake)
- wake();
+ wake(ms);
}
/*
@@ -1222,11 +1279,9 @@
static int mirror_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
- unsigned int m, sz;
+ unsigned int m, sz = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private;
- sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
-
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%d ", ms->nr_mirrors);
@@ -1237,13 +1292,21 @@
(unsigned long long)ms->rh.log->type->
get_sync_count(ms->rh.log),
(unsigned long long)ms->nr_regions);
+
+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
+
break;
case STATUSTYPE_TABLE:
+ sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
+
DMEMIT("%d", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++)
DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset);
+
+ if (ms->features & DM_RAID1_HANDLE_ERRORS)
+ DMEMIT(" 1 handle_errors");
}
return 0;
@@ -1251,7 +1314,7 @@
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 0, 2},
+ .version = {1, 0, 3},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
@@ -1270,20 +1333,11 @@
if (r)
return r;
- _kmirrord_wq = create_singlethread_workqueue("kmirrord");
- if (!_kmirrord_wq) {
- DMERR("couldn't start kmirrord");
- dm_dirty_log_exit();
- return r;
- }
- INIT_WORK(&_kmirrord_work, do_work);
-
r = dm_register_target(&mirror_target);
if (r < 0) {
DMERR("%s: Failed to register mirror target",
mirror_target.name);
dm_dirty_log_exit();
- destroy_workqueue(_kmirrord_wq);
}
return r;
@@ -1297,7 +1351,6 @@
if (r < 0)
DMERR("%s: unregister failed %d", mirror_target.name, r);
- destroy_workqueue(_kmirrord_wq);
dm_dirty_log_exit();
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 05befa9..2fc199b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -425,13 +425,15 @@
}
/*
- * If possible (ie. blk_size[major] is set), this checks an area
- * of a destination device is valid.
+ * If possible, this checks an area of a destination device is valid.
*/
static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
{
- sector_t dev_size;
- dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
+ sector_t dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
+
+ if (!dev_size)
+ return 1;
+
return ((start < dev_size) && (len <= (dev_size - start)));
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 11a98df..2717a35 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1236,6 +1236,7 @@
free_dev(md);
}
}
+EXPORT_SYMBOL_GPL(dm_put);
/*
* Process the deferred bios
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index b46f6c5..dbc234e 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
*
* This file is released under the GPL.
*
@@ -45,6 +46,8 @@
unsigned int nr_pages;
unsigned int nr_free_pages;
+ struct dm_io_client *io_client;
+
wait_queue_head_t destroyq;
atomic_t nr_jobs;
};
@@ -342,16 +345,20 @@
static int run_io_job(struct kcopyd_job *job)
{
int r;
+ struct dm_io_request io_req = {
+ .bi_rw = job->rw,
+ .mem.type = DM_IO_PAGE_LIST,
+ .mem.ptr.pl = job->pages,
+ .mem.offset = job->offset,
+ .notify.fn = complete_io,
+ .notify.context = job,
+ .client = job->kc->io_client,
+ };
if (job->rw == READ)
- r = dm_io_async(1, &job->source, job->rw,
- job->pages,
- job->offset, complete_io, job);
-
+ r = dm_io(&io_req, 1, &job->source, NULL);
else
- r = dm_io_async(job->num_dests, job->dests, job->rw,
- job->pages,
- job->offset, complete_io, job);
+ r = dm_io(&io_req, job->num_dests, job->dests, NULL);
return r;
}
@@ -670,8 +677,9 @@
return r;
}
- r = dm_io_get(nr_pages);
- if (r) {
+ kc->io_client = dm_io_client_create(nr_pages);
+ if (IS_ERR(kc->io_client)) {
+ r = PTR_ERR(kc->io_client);
client_free_pages(kc);
kfree(kc);
kcopyd_exit();
@@ -691,7 +699,7 @@
/* Wait for completion of all jobs submitted by this client. */
wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
- dm_io_put(kc->nr_pages);
+ dm_io_client_destroy(kc->io_client);
client_free_pages(kc);
client_del(kc);
kfree(kc);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2b4315d..2901d0c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,6 +33,7 @@
*/
#include <linux/module.h>
+#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
@@ -273,6 +274,7 @@
atomic_set(&new->active, 1);
spin_lock_init(&new->write_lock);
init_waitqueue_head(&new->sb_wait);
+ new->reshape_position = MaxSector;
new->queue = blk_alloc_queue(GFP_KERNEL);
if (!new->queue) {
@@ -589,14 +591,41 @@
return ret;
}
+
+static u32 md_csum_fold(u32 csum)
+{
+ csum = (csum & 0xffff) + (csum >> 16);
+ return (csum & 0xffff) + (csum >> 16);
+}
+
static unsigned int calc_sb_csum(mdp_super_t * sb)
{
+ u64 newcsum = 0;
+ u32 *sb32 = (u32*)sb;
+ int i;
unsigned int disk_csum, csum;
disk_csum = sb->sb_csum;
sb->sb_csum = 0;
- csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+
+ for (i = 0; i < MD_SB_BYTES/4 ; i++)
+ newcsum += sb32[i];
+ csum = (newcsum & 0xffffffff) + (newcsum>>32);
+
+
+#ifdef CONFIG_ALPHA
+ /* This used to use csum_partial, which was wrong for several
+ * reasons including that different results are returned on
+ * different architectures. It isn't critical that we get exactly
+ * the same return value as before (we always csum_fold before
+ * testing, and that removes any differences). However as we
+ * know that csum_partial always returned a 16bit value on
+ * alphas, do a fold to maximise conformity to previous behaviour.
+ */
+ sb->sb_csum = md_csum_fold(disk_csum);
+#else
sb->sb_csum = disk_csum;
+#endif
return csum;
}
@@ -684,7 +713,7 @@
if (sb->raid_disks <= 0)
goto abort;
- if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
+ if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
b);
goto abort;
@@ -694,6 +723,17 @@
rdev->data_offset = 0;
rdev->sb_size = MD_SB_BYTES;
+ if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ if (sb->level != 1 && sb->level != 4
+ && sb->level != 5 && sb->level != 6
+ && sb->level != 10) {
+ /* FIXME use a better test */
+ printk(KERN_WARNING
+ "md: bitmaps not supported for this level.\n");
+ goto abort;
+ }
+ }
+
if (sb->level == LEVEL_MULTIPATH)
rdev->desc_nr = -1;
else
@@ -792,16 +832,8 @@
mddev->max_disks = MD_SB_DISKS;
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
- mddev->bitmap_file == NULL) {
- if (mddev->level != 1 && mddev->level != 4
- && mddev->level != 5 && mddev->level != 6
- && mddev->level != 10) {
- /* FIXME use a better test */
- printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
- return -EINVAL;
- }
+ mddev->bitmap_file == NULL)
mddev->bitmap_offset = mddev->default_bitmap_offset;
- }
} else if (mddev->pers == NULL) {
/* Insist on good event counter while assembling */
@@ -1058,6 +1090,18 @@
bdevname(rdev->bdev,b));
return -EINVAL;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
+ if (sb->level != cpu_to_le32(1) &&
+ sb->level != cpu_to_le32(4) &&
+ sb->level != cpu_to_le32(5) &&
+ sb->level != cpu_to_le32(6) &&
+ sb->level != cpu_to_le32(10)) {
+ printk(KERN_WARNING
+ "md: bitmaps not supported for this level.\n");
+ return -EINVAL;
+ }
+ }
+
rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset);
atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
@@ -1141,14 +1185,9 @@
mddev->max_disks = (4096-256)/2;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
- mddev->bitmap_file == NULL ) {
- if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
- && mddev->level != 10) {
- printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
- return -EINVAL;
- }
+ mddev->bitmap_file == NULL )
mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
- }
+
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
mddev->reshape_position = le64_to_cpu(sb->reshape_position);
mddev->delta_disks = le32_to_cpu(sb->delta_disks);
@@ -2204,6 +2243,10 @@
layout_show(mddev_t *mddev, char *page)
{
/* just a number, not meaningful for all levels */
+ if (mddev->reshape_position != MaxSector &&
+ mddev->layout != mddev->new_layout)
+ return sprintf(page, "%d (%d)\n",
+ mddev->new_layout, mddev->layout);
return sprintf(page, "%d\n", mddev->layout);
}
@@ -2212,13 +2255,16 @@
{
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
- mddev->layout = n;
+ if (mddev->pers)
+ return -EBUSY;
+ if (mddev->reshape_position != MaxSector)
+ mddev->new_layout = n;
+ else
+ mddev->layout = n;
return len;
}
static struct md_sysfs_entry md_layout =
@@ -2230,6 +2276,10 @@
{
if (mddev->raid_disks == 0)
return 0;
+ if (mddev->reshape_position != MaxSector &&
+ mddev->delta_disks != 0)
+ return sprintf(page, "%d (%d)\n", mddev->raid_disks,
+ mddev->raid_disks - mddev->delta_disks);
return sprintf(page, "%d\n", mddev->raid_disks);
}
@@ -2247,7 +2297,11 @@
if (mddev->pers)
rv = update_raid_disks(mddev, n);
- else
+ else if (mddev->reshape_position != MaxSector) {
+ int olddisks = mddev->raid_disks - mddev->delta_disks;
+ mddev->delta_disks = n - olddisks;
+ mddev->raid_disks = n;
+ } else
mddev->raid_disks = n;
return rv ? rv : len;
}
@@ -2257,6 +2311,10 @@
static ssize_t
chunk_size_show(mddev_t *mddev, char *page)
{
+ if (mddev->reshape_position != MaxSector &&
+ mddev->chunk_size != mddev->new_chunk)
+ return sprintf(page, "%d (%d)\n", mddev->new_chunk,
+ mddev->chunk_size);
return sprintf(page, "%d\n", mddev->chunk_size);
}
@@ -2267,12 +2325,15 @@
char *e;
unsigned long n = simple_strtoul(buf, &e, 10);
- if (mddev->pers)
- return -EBUSY;
if (!*buf || (*e && *e != '\n'))
return -EINVAL;
- mddev->chunk_size = n;
+ if (mddev->pers)
+ return -EBUSY;
+ else if (mddev->reshape_position != MaxSector)
+ mddev->new_chunk = n;
+ else
+ mddev->chunk_size = n;
return len;
}
static struct md_sysfs_entry md_chunk_size =
@@ -2637,8 +2698,7 @@
minor = simple_strtoul(buf, &e, 10);
if (e==buf || (*e && *e != '\n') )
return -EINVAL;
- if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
- super_types[major].name == NULL)
+ if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
return -ENOENT;
mddev->major_version = major;
mddev->minor_version = minor;
@@ -2859,6 +2919,37 @@
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
+static ssize_t
+reshape_position_show(mddev_t *mddev, char *page)
+{
+ if (mddev->reshape_position != MaxSector)
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->reshape_position);
+ strcpy(page, "none\n");
+ return 5;
+}
+
+static ssize_t
+reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+ if (mddev->pers)
+ return -EBUSY;
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ mddev->reshape_position = new;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ return len;
+}
+
+static struct md_sysfs_entry md_reshape_position =
+__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
+ reshape_position_store);
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
@@ -2871,6 +2962,7 @@
&md_new_device.attr,
&md_safe_delay.attr,
&md_array_state.attr,
+ &md_reshape_position.attr,
NULL,
};
@@ -3012,6 +3104,7 @@
struct gendisk *disk;
struct mdk_personality *pers;
char b[BDEVNAME_SIZE];
+ struct block_device *bdev;
if (list_empty(&mddev->disks))
/* cannot run an array with no devices.. */
@@ -3239,7 +3332,13 @@
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
- mddev->changed = 1;
+ bdev = bdget_disk(mddev->gendisk, 0);
+ if (bdev) {
+ bd_set_size(bdev, mddev->array_size << 1);
+ blkdev_ioctl(bdev->bd_inode, NULL, BLKRRPART, 0);
+ bdput(bdev);
+ }
+
md_new_event(mddev);
kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
return 0;
@@ -3361,7 +3460,6 @@
mddev->pers = NULL;
set_capacity(disk, 0);
- mddev->changed = 1;
if (mddev->ro)
mddev->ro = 0;
@@ -3409,6 +3507,7 @@
mddev->size = 0;
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
+ mddev->reshape_position = MaxSector;
} else if (mddev->pers)
printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -4019,7 +4118,7 @@
if (info->raid_disks == 0) {
/* just setting version number for superblock loading */
if (info->major_version < 0 ||
- info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ info->major_version >= ARRAY_SIZE(super_types) ||
super_types[info->major_version].name == NULL) {
/* maybe try to auto-load a module? */
printk(KERN_INFO
@@ -4500,20 +4599,6 @@
return 0;
}
-static int md_media_changed(struct gendisk *disk)
-{
- mddev_t *mddev = disk->private_data;
-
- return mddev->changed;
-}
-
-static int md_revalidate(struct gendisk *disk)
-{
- mddev_t *mddev = disk->private_data;
-
- mddev->changed = 0;
- return 0;
-}
static struct block_device_operations md_fops =
{
.owner = THIS_MODULE,
@@ -4521,8 +4606,6 @@
.release = md_release,
.ioctl = md_ioctl,
.getgeo = md_getgeo,
- .media_changed = md_media_changed,
- .revalidate_disk= md_revalidate,
};
static int md_thread(void * arg)
@@ -4941,15 +5024,6 @@
return error;
}
-static int md_seq_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = file->private_data;
- struct mdstat_info *mi = m->private;
- m->private = NULL;
- kfree(mi);
- return seq_release(inode, file);
-}
-
static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
{
struct seq_file *m = filp->private_data;
@@ -4971,7 +5045,7 @@
.open = md_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = md_seq_release,
+ .release = seq_release_private,
.poll = mdstat_poll,
};
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870..1b7130c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2063,7 +2063,6 @@
*/
mddev->array_size = sectors>>1;
set_capacity(mddev->gendisk, mddev->array_size << 1);
- mddev->changed = 1;
if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8d59914..a72e70a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -353,8 +353,8 @@
struct kmem_cache *sc;
int devs = conf->raid_disks;
- sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev));
- sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev));
+ sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
+ sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name],
sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -3864,7 +3864,6 @@
sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
set_capacity(mddev->gendisk, mddev->array_size << 1);
- mddev->changed = 1;
if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3999,7 +3998,6 @@
conf->mddev->array_size = conf->mddev->size *
(conf->raid_disks - conf->max_degraded);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
- conf->mddev->changed = 1;
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index b6c1670..7385acf 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -501,9 +501,9 @@
{
#ifdef CONFIG_MMC_DEBUG
unsigned long flags;
- spin_lock_irqsave(host->lock, flags);
+ spin_lock_irqsave(&host->lock, flags);
BUG_ON(host->removed);
- spin_unlock_irqrestore(host->lock, flags);
+ spin_unlock_irqrestore(&host->lock, flags);
#endif
mmc_schedule_delayed_work(&host->detect, delay);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 3a03a74..637ae8f 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1214,7 +1214,7 @@
int i;
#endif
- flush_scheduled_work();
+ cancel_work_sync(&adapter->reset_task);
e1000_release_manageability(adapter);
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index eed433d..f71dab3 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -662,10 +662,10 @@
phy_error(phydev);
/*
- * Finish any pending work; we might have been scheduled
- * to be called from keventd ourselves, though.
+ * Finish any pending work; we might have been scheduled to be called
+ * from keventd ourselves, but cancel_work_sync() handles that.
*/
- run_scheduled_work(&phydev->phy_queue);
+ cancel_work_sync(&phydev->phy_queue);
free_irq(phydev->irq, phydev);
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index e5e901e..923b9c7 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3716,10 +3716,8 @@
unsigned int restart_timer;
tg3_full_lock(tp, 0);
- tp->tg3_flags |= TG3_FLAG_IN_RESET_TASK;
if (!netif_running(tp->dev)) {
- tp->tg3_flags &= ~TG3_FLAG_IN_RESET_TASK;
tg3_full_unlock(tp);
return;
}
@@ -3750,8 +3748,6 @@
mod_timer(&tp->timer, jiffies + 1);
out:
- tp->tg3_flags &= ~TG3_FLAG_IN_RESET_TASK;
-
tg3_full_unlock(tp);
}
@@ -7390,12 +7386,7 @@
{
struct tg3 *tp = netdev_priv(dev);
- /* Calling flush_scheduled_work() may deadlock because
- * linkwatch_event() may be on the workqueue and it will try to get
- * the rtnl_lock which we are holding.
- */
- while (tp->tg3_flags & TG3_FLAG_IN_RESET_TASK)
- msleep(1);
+ cancel_work_sync(&tp->reset_task);
netif_stop_queue(dev);
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 4d334cf..bd9f4f4 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2228,7 +2228,7 @@
#define TG3_FLAG_JUMBO_RING_ENABLE 0x00800000
#define TG3_FLAG_10_100_ONLY 0x01000000
#define TG3_FLAG_PAUSE_AUTONEG 0x02000000
-#define TG3_FLAG_IN_RESET_TASK 0x04000000
+
#define TG3_FLAG_40BIT_DMA_BUG 0x08000000
#define TG3_FLAG_BROKEN_CHECKSUMS 0x10000000
#define TG3_FLAG_SUPPORT_MSI 0x20000000
diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c
index 66e7bc9..1d8a2f6 100644
--- a/drivers/spi/atmel_spi.c
+++ b/drivers/spi/atmel_spi.c
@@ -22,10 +22,7 @@
#include <asm/io.h>
#include <asm/arch/board.h>
#include <asm/arch/gpio.h>
-
-#ifdef CONFIG_ARCH_AT91
#include <asm/arch/cpu.h>
-#endif
#include "atmel_spi.h"
@@ -552,10 +549,8 @@
goto out_free_buffer;
as->irq = irq;
as->clk = clk;
-#ifdef CONFIG_ARCH_AT91
if (!cpu_is_at91rm9200())
as->new_1 = 1;
-#endif
ret = request_irq(irq, atmel_spi_interrupt, 0,
pdev->dev.bus_id, master);
diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c
index b082d95..11e9b15 100644
--- a/drivers/usb/atm/usbatm.c
+++ b/drivers/usb/atm/usbatm.c
@@ -1033,7 +1033,7 @@
static int usbatm_heavy_init(struct usbatm_data *instance)
{
- int ret = kernel_thread(usbatm_do_heavy_init, instance, CLONE_KERNEL);
+ int ret = kernel_thread(usbatm_do_heavy_init, instance, CLONE_FS | CLONE_FILES);
if (ret < 0) {
usb_err(instance, "%s: failed to create kernel_thread (%d)!\n", __func__, ret);
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index 1132ba5..9a256d2 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -1348,6 +1348,20 @@
Please read the <file:Documentation/fb/README-sstfb.txt> for supported
options and other important info support.
+config FB_VT8623
+ tristate "VIA VT8623 support"
+ depends on FB && PCI
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ select FB_TILEBLITTING
+ select FB_SVGALIB
+ select VGASTATE
+ select FONT_8x16 if FRAMEBUFFER_CONSOLE
+ ---help---
+ Driver for CastleRock integrated graphics core in the
+ VIA VT8623 [Apollo CLE266] chipset.
+
config FB_CYBLA
tristate "Cyberblade/i1 support"
depends on FB && PCI && X86_32 && !64BIT
@@ -1401,6 +1415,20 @@
This will compile the Trident frame buffer device with
acceleration functions.
+config FB_ARK
+ tristate "ARK 2000PV support"
+ depends on FB && PCI
+ select FB_CFB_FILLRECT
+ select FB_CFB_COPYAREA
+ select FB_CFB_IMAGEBLIT
+ select FB_TILEBLITTING
+ select FB_SVGALIB
+ select VGASTATE
+ select FONT_8x16 if FRAMEBUFFER_CONSOLE
+ ---help---
+ Driver for PCI graphics boards with ARK 2000PV chip
+ and ICS 5342 RAMDAC.
+
config FB_PM3
tristate "Permedia3 support"
depends on FB && PCI && BROKEN
diff --git a/drivers/video/Makefile b/drivers/video/Makefile
index a916c20..0b70567 100644
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -54,10 +54,12 @@
obj-$(CONFIG_FB_CT65550) += chipsfb.o
obj-$(CONFIG_FB_IMSTT) += imsttfb.o
obj-$(CONFIG_FB_FM2) += fm2fb.o
+obj-$(CONFIG_FB_VT8623) += vt8623fb.o
obj-$(CONFIG_FB_CYBLA) += cyblafb.o
obj-$(CONFIG_FB_TRIDENT) += tridentfb.o
obj-$(CONFIG_FB_LE80578) += vermilion/
obj-$(CONFIG_FB_S3) += s3fb.o
+obj-$(CONFIG_FB_ARK) += arkfb.o
obj-$(CONFIG_FB_STI) += stifb.o
obj-$(CONFIG_FB_FFB) += ffb.o sbuslib.o
obj-$(CONFIG_FB_CG6) += cg6.o sbuslib.o
diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c
new file mode 100644
index 0000000..ba6fede
--- /dev/null
+++ b/drivers/video/arkfb.c
@@ -0,0 +1,1200 @@
+/*
+ * linux/drivers/video/arkfb.c -- Frame buffer device driver for ARK 2000PV
+ * with ICS 5342 dac (it is easy to add support for different dacs).
+ *
+ * Copyright (c) 2007 Ondrej Zajicek <santiago@crfreenet.org>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive for
+ * more details.
+ *
+ * Code is based on s3fb
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/fb.h>
+#include <linux/svga.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */
+#include <video/vga.h>
+
+#ifdef CONFIG_MTRR
+#include <asm/mtrr.h>
+#endif
+
+struct arkfb_info {
+ int mclk_freq;
+ int mtrr_reg;
+
+ struct dac_info *dac;
+ struct vgastate state;
+ struct mutex open_lock;
+ unsigned int ref_count;
+ u32 pseudo_palette[16];
+};
+
+
+/* ------------------------------------------------------------------------- */
+
+
+static const struct svga_fb_format arkfb_formats[] = {
+ { 0, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
+ FB_TYPE_TEXT, FB_AUX_TEXT_SVGA_STEP4, FB_VISUAL_PSEUDOCOLOR, 8, 8},
+ { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 8, 16},
+ { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 1,
+ FB_TYPE_INTERLEAVED_PLANES, 1, FB_VISUAL_PSEUDOCOLOR, 8, 16},
+ { 8, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 8, 8},
+ {16, {10, 5, 0}, {5, 5, 0}, {0, 5, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4},
+ {16, {11, 5, 0}, {5, 6, 0}, {0, 5, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4},
+ {24, {16, 8, 0}, {8, 8, 0}, {0, 8, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 8, 8},
+ {32, {16, 8, 0}, {8, 8, 0}, {0, 8, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 2, 2},
+ SVGA_FORMAT_END
+};
+
+
+/* CRT timing register sets */
+
+static const struct vga_regset ark_h_total_regs[] = {{0x00, 0, 7}, {0x41, 7, 7}, VGA_REGSET_END};
+static const struct vga_regset ark_h_display_regs[] = {{0x01, 0, 7}, {0x41, 6, 6}, VGA_REGSET_END};
+static const struct vga_regset ark_h_blank_start_regs[] = {{0x02, 0, 7}, {0x41, 5, 5}, VGA_REGSET_END};
+static const struct vga_regset ark_h_blank_end_regs[] = {{0x03, 0, 4}, {0x05, 7, 7 }, VGA_REGSET_END};
+static const struct vga_regset ark_h_sync_start_regs[] = {{0x04, 0, 7}, {0x41, 4, 4}, VGA_REGSET_END};
+static const struct vga_regset ark_h_sync_end_regs[] = {{0x05, 0, 4}, VGA_REGSET_END};
+
+static const struct vga_regset ark_v_total_regs[] = {{0x06, 0, 7}, {0x07, 0, 0}, {0x07, 5, 5}, {0x40, 7, 7}, VGA_REGSET_END};
+static const struct vga_regset ark_v_display_regs[] = {{0x12, 0, 7}, {0x07, 1, 1}, {0x07, 6, 6}, {0x40, 6, 6}, VGA_REGSET_END};
+static const struct vga_regset ark_v_blank_start_regs[] = {{0x15, 0, 7}, {0x07, 3, 3}, {0x09, 5, 5}, {0x40, 5, 5}, VGA_REGSET_END};
+// const struct vga_regset ark_v_blank_end_regs[] = {{0x16, 0, 6}, VGA_REGSET_END};
+static const struct vga_regset ark_v_blank_end_regs[] = {{0x16, 0, 7}, VGA_REGSET_END};
+static const struct vga_regset ark_v_sync_start_regs[] = {{0x10, 0, 7}, {0x07, 2, 2}, {0x07, 7, 7}, {0x40, 4, 4}, VGA_REGSET_END};
+static const struct vga_regset ark_v_sync_end_regs[] = {{0x11, 0, 3}, VGA_REGSET_END};
+
+static const struct vga_regset ark_line_compare_regs[] = {{0x18, 0, 7}, {0x07, 4, 4}, {0x09, 6, 6}, VGA_REGSET_END};
+static const struct vga_regset ark_start_address_regs[] = {{0x0d, 0, 7}, {0x0c, 0, 7}, {0x40, 0, 2}, VGA_REGSET_END};
+static const struct vga_regset ark_offset_regs[] = {{0x13, 0, 7}, {0x41, 3, 3}, VGA_REGSET_END};
+
+static const struct svga_timing_regs ark_timing_regs = {
+ ark_h_total_regs, ark_h_display_regs, ark_h_blank_start_regs,
+ ark_h_blank_end_regs, ark_h_sync_start_regs, ark_h_sync_end_regs,
+ ark_v_total_regs, ark_v_display_regs, ark_v_blank_start_regs,
+ ark_v_blank_end_regs, ark_v_sync_start_regs, ark_v_sync_end_regs,
+};
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* Module parameters */
+
+static char *mode = "640x480-8@60";
+
+#ifdef CONFIG_MTRR
+static int mtrr = 1;
+#endif
+
+MODULE_AUTHOR("(c) 2007 Ondrej Zajicek <santiago@crfreenet.org>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("fbdev driver for ARK 2000PV");
+
+module_param(mode, charp, 0444);
+MODULE_PARM_DESC(mode, "Default video mode ('640x480-8@60', etc)");
+
+#ifdef CONFIG_MTRR
+module_param(mtrr, int, 0444);
+MODULE_PARM_DESC(mtrr, "Enable write-combining with MTRR (1=enable, 0=disable, default=1)");
+#endif
+
+static int threshold = 4;
+
+module_param(threshold, int, 0644);
+MODULE_PARM_DESC(threshold, "FIFO threshold");
+
+
+/* ------------------------------------------------------------------------- */
+
+
+static void arkfb_settile(struct fb_info *info, struct fb_tilemap *map)
+{
+ const u8 *font = map->data;
+ u8 __iomem *fb = (u8 __iomem *)info->screen_base;
+ int i, c;
+
+ if ((map->width != 8) || (map->height != 16) ||
+ (map->depth != 1) || (map->length != 256)) {
+ printk(KERN_ERR "fb%d: unsupported font parameters: width %d, "
+ "height %d, depth %d, length %d\n", info->node,
+ map->width, map->height, map->depth, map->length);
+ return;
+ }
+
+ fb += 2;
+ for (c = 0; c < map->length; c++) {
+ for (i = 0; i < map->height; i++) {
+ fb_writeb(font[i], &fb[i * 4]);
+ fb_writeb(font[i], &fb[i * 4 + (128 * 8)]);
+ }
+ fb += 128;
+
+ if ((c % 8) == 7)
+ fb += 128*8;
+
+ font += map->height;
+ }
+}
+
+static struct fb_tile_ops arkfb_tile_ops = {
+ .fb_settile = arkfb_settile,
+ .fb_tilecopy = svga_tilecopy,
+ .fb_tilefill = svga_tilefill,
+ .fb_tileblit = svga_tileblit,
+ .fb_tilecursor = svga_tilecursor,
+ .fb_get_tilemax = svga_get_tilemax,
+};
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* image data is MSB-first, fb structure is MSB-first too */
+static inline u32 expand_color(u32 c)
+{
+ return ((c & 1) | ((c & 2) << 7) | ((c & 4) << 14) | ((c & 8) << 21)) * 0xFF;
+}
+
+/* arkfb_iplan_imageblit silently assumes that almost everything is 8-pixel aligned */
+static void arkfb_iplan_imageblit(struct fb_info *info, const struct fb_image *image)
+{
+ u32 fg = expand_color(image->fg_color);
+ u32 bg = expand_color(image->bg_color);
+ const u8 *src1, *src;
+ u8 __iomem *dst1;
+ u32 __iomem *dst;
+ u32 val;
+ int x, y;
+
+ src1 = image->data;
+ dst1 = info->screen_base + (image->dy * info->fix.line_length)
+ + ((image->dx / 8) * 4);
+
+ for (y = 0; y < image->height; y++) {
+ src = src1;
+ dst = (u32 __iomem *) dst1;
+ for (x = 0; x < image->width; x += 8) {
+ val = *(src++) * 0x01010101;
+ val = (val & fg) | (~val & bg);
+ fb_writel(val, dst++);
+ }
+ src1 += image->width / 8;
+ dst1 += info->fix.line_length;
+ }
+
+}
+
+/* arkfb_iplan_fillrect silently assumes that almost everything is 8-pixel aligned */
+static void arkfb_iplan_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
+{
+ u32 fg = expand_color(rect->color);
+ u8 __iomem *dst1;
+ u32 __iomem *dst;
+ int x, y;
+
+ dst1 = info->screen_base + (rect->dy * info->fix.line_length)
+ + ((rect->dx / 8) * 4);
+
+ for (y = 0; y < rect->height; y++) {
+ dst = (u32 __iomem *) dst1;
+ for (x = 0; x < rect->width; x += 8) {
+ fb_writel(fg, dst++);
+ }
+ dst1 += info->fix.line_length;
+ }
+
+}
+
+
+/* image data is MSB-first, fb structure is high-nibble-in-low-byte-first */
+static inline u32 expand_pixel(u32 c)
+{
+ return (((c & 1) << 24) | ((c & 2) << 27) | ((c & 4) << 14) | ((c & 8) << 17) |
+ ((c & 16) << 4) | ((c & 32) << 7) | ((c & 64) >> 6) | ((c & 128) >> 3)) * 0xF;
+}
+
+/* arkfb_cfb4_imageblit silently assumes that almost everything is 8-pixel aligned */
+static void arkfb_cfb4_imageblit(struct fb_info *info, const struct fb_image *image)
+{
+ u32 fg = image->fg_color * 0x11111111;
+ u32 bg = image->bg_color * 0x11111111;
+ const u8 *src1, *src;
+ u8 __iomem *dst1;
+ u32 __iomem *dst;
+ u32 val;
+ int x, y;
+
+ src1 = image->data;
+ dst1 = info->screen_base + (image->dy * info->fix.line_length)
+ + ((image->dx / 8) * 4);
+
+ for (y = 0; y < image->height; y++) {
+ src = src1;
+ dst = (u32 __iomem *) dst1;
+ for (x = 0; x < image->width; x += 8) {
+ val = expand_pixel(*(src++));
+ val = (val & fg) | (~val & bg);
+ fb_writel(val, dst++);
+ }
+ src1 += image->width / 8;
+ dst1 += info->fix.line_length;
+ }
+
+}
+
+static void arkfb_imageblit(struct fb_info *info, const struct fb_image *image)
+{
+ if ((info->var.bits_per_pixel == 4) && (image->depth == 1)
+ && ((image->width % 8) == 0) && ((image->dx % 8) == 0)) {
+ if (info->fix.type == FB_TYPE_INTERLEAVED_PLANES)
+ arkfb_iplan_imageblit(info, image);
+ else
+ arkfb_cfb4_imageblit(info, image);
+ } else
+ cfb_imageblit(info, image);
+}
+
+static void arkfb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
+{
+ if ((info->var.bits_per_pixel == 4)
+ && ((rect->width % 8) == 0) && ((rect->dx % 8) == 0)
+ && (info->fix.type == FB_TYPE_INTERLEAVED_PLANES))
+ arkfb_iplan_fillrect(info, rect);
+ else
+ cfb_fillrect(info, rect);
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+
+enum
+{
+ DAC_PSEUDO8_8,
+ DAC_RGB1555_8,
+ DAC_RGB0565_8,
+ DAC_RGB0888_8,
+ DAC_RGB8888_8,
+ DAC_PSEUDO8_16,
+ DAC_RGB1555_16,
+ DAC_RGB0565_16,
+ DAC_RGB0888_16,
+ DAC_RGB8888_16,
+ DAC_MAX
+};
+
+struct dac_ops {
+ int (*dac_get_mode)(struct dac_info *info);
+ int (*dac_set_mode)(struct dac_info *info, int mode);
+ int (*dac_get_freq)(struct dac_info *info, int channel);
+ int (*dac_set_freq)(struct dac_info *info, int channel, u32 freq);
+ void (*dac_release)(struct dac_info *info);
+};
+
+typedef void (*dac_read_regs_t)(void *data, u8 *code, int count);
+typedef void (*dac_write_regs_t)(void *data, u8 *code, int count);
+
+struct dac_info
+{
+ struct dac_ops *dacops;
+ dac_read_regs_t dac_read_regs;
+ dac_write_regs_t dac_write_regs;
+ void *data;
+};
+
+
+static inline u8 dac_read_reg(struct dac_info *info, u8 reg)
+{
+ u8 code[2] = {reg, 0};
+ info->dac_read_regs(info->data, code, 1);
+ return code[1];
+}
+
+static inline void dac_read_regs(struct dac_info *info, u8 *code, int count)
+{
+ info->dac_read_regs(info->data, code, count);
+}
+
+static inline void dac_write_reg(struct dac_info *info, u8 reg, u8 val)
+{
+ u8 code[2] = {reg, val};
+ info->dac_write_regs(info->data, code, 1);
+}
+
+static inline void dac_write_regs(struct dac_info *info, u8 *code, int count)
+{
+ info->dac_write_regs(info->data, code, count);
+}
+
+static inline int dac_set_mode(struct dac_info *info, int mode)
+{
+ return info->dacops->dac_set_mode(info, mode);
+}
+
+static inline int dac_set_freq(struct dac_info *info, int channel, u32 freq)
+{
+ return info->dacops->dac_set_freq(info, channel, freq);
+}
+
+static inline void dac_release(struct dac_info *info)
+{
+ info->dacops->dac_release(info);
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* ICS5342 DAC */
+
+struct ics5342_info
+{
+ struct dac_info dac;
+ u8 mode;
+};
+
+#define DAC_PAR(info) ((struct ics5342_info *) info)
+
+/* LSB is set to distinguish unused slots */
+static const u8 ics5342_mode_table[DAC_MAX] = {
+ [DAC_PSEUDO8_8] = 0x01, [DAC_RGB1555_8] = 0x21, [DAC_RGB0565_8] = 0x61,
+ [DAC_RGB0888_8] = 0x41, [DAC_PSEUDO8_16] = 0x11, [DAC_RGB1555_16] = 0x31,
+ [DAC_RGB0565_16] = 0x51, [DAC_RGB0888_16] = 0x91, [DAC_RGB8888_16] = 0x71
+};
+
+static int ics5342_set_mode(struct dac_info *info, int mode)
+{
+ u8 code;
+
+ if (mode >= DAC_MAX)
+ return -EINVAL;
+
+ code = ics5342_mode_table[mode];
+
+ if (! code)
+ return -EINVAL;
+
+ dac_write_reg(info, 6, code & 0xF0);
+ DAC_PAR(info)->mode = mode;
+
+ return 0;
+}
+
+static const struct svga_pll ics5342_pll = {3, 129, 3, 33, 0, 3,
+ 60000, 250000, 14318};
+
+/* pd4 - allow only posdivider 4 (r=2) */
+static const struct svga_pll ics5342_pll_pd4 = {3, 129, 3, 33, 2, 2,
+ 60000, 335000, 14318};
+
+/* 270 MHz should be upper bound for VCO clock according to specs,
+ but that is too restrictive in pd4 case */
+
+static int ics5342_set_freq(struct dac_info *info, int channel, u32 freq)
+{
+ u16 m, n, r;
+
+ /* only postdivider 4 (r=2) is valid in mode DAC_PSEUDO8_16 */
+ int rv = svga_compute_pll((DAC_PAR(info)->mode == DAC_PSEUDO8_16)
+ ? &ics5342_pll_pd4 : &ics5342_pll,
+ freq, &m, &n, &r, 0);
+
+ if (rv < 0) {
+ return -EINVAL;
+ } else {
+ u8 code[6] = {4, 3, 5, m-2, 5, (n-2) | (r << 5)};
+ dac_write_regs(info, code, 3);
+ return 0;
+ }
+}
+
+static void ics5342_release(struct dac_info *info)
+{
+ ics5342_set_mode(info, DAC_PSEUDO8_8);
+ kfree(info);
+}
+
+static struct dac_ops ics5342_ops = {
+ .dac_set_mode = ics5342_set_mode,
+ .dac_set_freq = ics5342_set_freq,
+ .dac_release = ics5342_release
+};
+
+
+static struct dac_info * ics5342_init(dac_read_regs_t drr, dac_write_regs_t dwr, void *data)
+{
+ struct dac_info *info = kzalloc(sizeof(struct ics5342_info), GFP_KERNEL);
+
+ if (! info)
+ return NULL;
+
+ info->dacops = &ics5342_ops;
+ info->dac_read_regs = drr;
+ info->dac_write_regs = dwr;
+ info->data = data;
+ DAC_PAR(info)->mode = DAC_PSEUDO8_8; /* estimation */
+ return info;
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+
+static unsigned short dac_regs[4] = {0x3c8, 0x3c9, 0x3c6, 0x3c7};
+
+static void ark_dac_read_regs(void *data, u8 *code, int count)
+{
+ u8 regval = vga_rseq(NULL, 0x1C);
+
+ while (count != 0)
+ {
+ vga_wseq(NULL, 0x1C, regval | (code[0] & 4) ? 0x80 : 0);
+ code[1] = vga_r(NULL, dac_regs[code[0] & 3]);
+ count--;
+ code += 2;
+ }
+
+ vga_wseq(NULL, 0x1C, regval);
+}
+
+static void ark_dac_write_regs(void *data, u8 *code, int count)
+{
+ u8 regval = vga_rseq(NULL, 0x1C);
+
+ while (count != 0)
+ {
+ vga_wseq(NULL, 0x1C, regval | (code[0] & 4) ? 0x80 : 0);
+ vga_w(NULL, dac_regs[code[0] & 3], code[1]);
+ count--;
+ code += 2;
+ }
+
+ vga_wseq(NULL, 0x1C, regval);
+}
+
+
+static void ark_set_pixclock(struct fb_info *info, u32 pixclock)
+{
+ struct arkfb_info *par = info->par;
+ u8 regval;
+
+ int rv = dac_set_freq(par->dac, 0, 1000000000 / pixclock);
+ if (rv < 0) {
+ printk(KERN_ERR "fb%d: cannot set requested pixclock, keeping old value\n", info->node);
+ return;
+ }
+
+ /* Set VGA misc register */
+ regval = vga_r(NULL, VGA_MIS_R);
+ vga_w(NULL, VGA_MIS_W, regval | VGA_MIS_ENB_PLL_LOAD);
+}
+
+
+/* Open framebuffer */
+
+static int arkfb_open(struct fb_info *info, int user)
+{
+ struct arkfb_info *par = info->par;
+
+ mutex_lock(&(par->open_lock));
+ if (par->ref_count == 0) {
+ memset(&(par->state), 0, sizeof(struct vgastate));
+ par->state.flags = VGA_SAVE_MODE | VGA_SAVE_FONTS | VGA_SAVE_CMAP;
+ par->state.num_crtc = 0x60;
+ par->state.num_seq = 0x30;
+ save_vga(&(par->state));
+ }
+
+ par->ref_count++;
+ mutex_unlock(&(par->open_lock));
+
+ return 0;
+}
+
+/* Close framebuffer */
+
+static int arkfb_release(struct fb_info *info, int user)
+{
+ struct arkfb_info *par = info->par;
+
+ mutex_lock(&(par->open_lock));
+ if (par->ref_count == 0) {
+ mutex_unlock(&(par->open_lock));
+ return -EINVAL;
+ }
+
+ if (par->ref_count == 1) {
+ restore_vga(&(par->state));
+ dac_set_mode(par->dac, DAC_PSEUDO8_8);
+ }
+
+ par->ref_count--;
+ mutex_unlock(&(par->open_lock));
+
+ return 0;
+}
+
+/* Validate passed in var */
+
+static int arkfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ int rv, mem, step;
+
+ /* Find appropriate format */
+ rv = svga_match_format (arkfb_formats, var, NULL);
+ if (rv < 0)
+ {
+ printk(KERN_ERR "fb%d: unsupported mode requested\n", info->node);
+ return rv;
+ }
+
+ /* Do not allow to have real resoulution larger than virtual */
+ if (var->xres > var->xres_virtual)
+ var->xres_virtual = var->xres;
+
+ if (var->yres > var->yres_virtual)
+ var->yres_virtual = var->yres;
+
+ /* Round up xres_virtual to have proper alignment of lines */
+ step = arkfb_formats[rv].xresstep - 1;
+ var->xres_virtual = (var->xres_virtual+step) & ~step;
+
+
+ /* Check whether have enough memory */
+ mem = ((var->bits_per_pixel * var->xres_virtual) >> 3) * var->yres_virtual;
+ if (mem > info->screen_size)
+ {
+ printk(KERN_ERR "fb%d: not enough framebuffer memory (%d kB requested , %d kB available)\n", info->node, mem >> 10, (unsigned int) (info->screen_size >> 10));
+ return -EINVAL;
+ }
+
+ rv = svga_check_timings (&ark_timing_regs, var, info->node);
+ if (rv < 0)
+ {
+ printk(KERN_ERR "fb%d: invalid timings requested\n", info->node);
+ return rv;
+ }
+
+ /* Interlaced mode is broken */
+ if (var->vmode & FB_VMODE_INTERLACED)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Set video mode from par */
+
+static int arkfb_set_par(struct fb_info *info)
+{
+ struct arkfb_info *par = info->par;
+ u32 value, mode, hmul, hdiv, offset_value, screen_size;
+ u32 bpp = info->var.bits_per_pixel;
+ u8 regval;
+
+ if (bpp != 0) {
+ info->fix.ypanstep = 1;
+ info->fix.line_length = (info->var.xres_virtual * bpp) / 8;
+
+ info->flags &= ~FBINFO_MISC_TILEBLITTING;
+ info->tileops = NULL;
+
+ /* in 4bpp supports 8p wide tiles only, any tiles otherwise */
+ info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0);
+ info->pixmap.blit_y = ~(u32)0;
+
+ offset_value = (info->var.xres_virtual * bpp) / 64;
+ screen_size = info->var.yres_virtual * info->fix.line_length;
+ } else {
+ info->fix.ypanstep = 16;
+ info->fix.line_length = 0;
+
+ info->flags |= FBINFO_MISC_TILEBLITTING;
+ info->tileops = &arkfb_tile_ops;
+
+ /* supports 8x16 tiles only */
+ info->pixmap.blit_x = 1 << (8 - 1);
+ info->pixmap.blit_y = 1 << (16 - 1);
+
+ offset_value = info->var.xres_virtual / 16;
+ screen_size = (info->var.xres_virtual * info->var.yres_virtual) / 64;
+ }
+
+ info->var.xoffset = 0;
+ info->var.yoffset = 0;
+ info->var.activate = FB_ACTIVATE_NOW;
+
+ /* Unlock registers */
+ svga_wcrt_mask(0x11, 0x00, 0x80);
+
+ /* Blank screen and turn off sync */
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ svga_wcrt_mask(0x17, 0x00, 0x80);
+
+ /* Set default values */
+ svga_set_default_gfx_regs();
+ svga_set_default_atc_regs();
+ svga_set_default_seq_regs();
+ svga_set_default_crt_regs();
+ svga_wcrt_multi(ark_line_compare_regs, 0xFFFFFFFF);
+ svga_wcrt_multi(ark_start_address_regs, 0);
+
+ /* ARK specific initialization */
+ svga_wseq_mask(0x10, 0x1F, 0x1F); /* enable linear framebuffer and full memory access */
+ svga_wseq_mask(0x12, 0x03, 0x03); /* 4 MB linear framebuffer size */
+
+ vga_wseq(NULL, 0x13, info->fix.smem_start >> 16);
+ vga_wseq(NULL, 0x14, info->fix.smem_start >> 24);
+ vga_wseq(NULL, 0x15, 0);
+ vga_wseq(NULL, 0x16, 0);
+
+ /* Set the FIFO threshold register */
+ /* It is fascinating way to store 5-bit value in 8-bit register */
+ regval = 0x10 | ((threshold & 0x0E) >> 1) | (threshold & 0x01) << 7 | (threshold & 0x10) << 1;
+ vga_wseq(NULL, 0x18, regval);
+
+ /* Set the offset register */
+ pr_debug("fb%d: offset register : %d\n", info->node, offset_value);
+ svga_wcrt_multi(ark_offset_regs, offset_value);
+
+ /* fix for hi-res textmode */
+ svga_wcrt_mask(0x40, 0x08, 0x08);
+
+ if (info->var.vmode & FB_VMODE_DOUBLE)
+ svga_wcrt_mask(0x09, 0x80, 0x80);
+ else
+ svga_wcrt_mask(0x09, 0x00, 0x80);
+
+ if (info->var.vmode & FB_VMODE_INTERLACED)
+ svga_wcrt_mask(0x44, 0x04, 0x04);
+ else
+ svga_wcrt_mask(0x44, 0x00, 0x04);
+
+ hmul = 1;
+ hdiv = 1;
+ mode = svga_match_format(arkfb_formats, &(info->var), &(info->fix));
+
+ /* Set mode-specific register values */
+ switch (mode) {
+ case 0:
+ pr_debug("fb%d: text mode\n", info->node);
+ svga_set_textmode_vga_regs();
+
+ vga_wseq(NULL, 0x11, 0x10); /* basic VGA mode */
+ svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
+ dac_set_mode(par->dac, DAC_PSEUDO8_8);
+
+ break;
+ case 1:
+ pr_debug("fb%d: 4 bit pseudocolor\n", info->node);
+ vga_wgfx(NULL, VGA_GFX_MODE, 0x40);
+
+ vga_wseq(NULL, 0x11, 0x10); /* basic VGA mode */
+ svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
+ dac_set_mode(par->dac, DAC_PSEUDO8_8);
+ break;
+ case 2:
+ pr_debug("fb%d: 4 bit pseudocolor, planar\n", info->node);
+
+ vga_wseq(NULL, 0x11, 0x10); /* basic VGA mode */
+ svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
+ dac_set_mode(par->dac, DAC_PSEUDO8_8);
+ break;
+ case 3:
+ pr_debug("fb%d: 8 bit pseudocolor\n", info->node);
+
+ vga_wseq(NULL, 0x11, 0x16); /* 8bpp accel mode */
+
+ if (info->var.pixclock > 20000) {
+ pr_debug("fb%d: not using multiplex\n", info->node);
+ svga_wcrt_mask(0x46, 0x00, 0x04); /* 8bit pixel path */
+ dac_set_mode(par->dac, DAC_PSEUDO8_8);
+ } else {
+ pr_debug("fb%d: using multiplex\n", info->node);
+ svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
+ dac_set_mode(par->dac, DAC_PSEUDO8_16);
+ hdiv = 2;
+ }
+ break;
+ case 4:
+ pr_debug("fb%d: 5/5/5 truecolor\n", info->node);
+
+ vga_wseq(NULL, 0x11, 0x1A); /* 16bpp accel mode */
+ svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
+ dac_set_mode(par->dac, DAC_RGB1555_16);
+ break;
+ case 5:
+ pr_debug("fb%d: 5/6/5 truecolor\n", info->node);
+
+ vga_wseq(NULL, 0x11, 0x1A); /* 16bpp accel mode */
+ svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
+ dac_set_mode(par->dac, DAC_RGB0565_16);
+ break;
+ case 6:
+ pr_debug("fb%d: 8/8/8 truecolor\n", info->node);
+
+ vga_wseq(NULL, 0x11, 0x16); /* 8bpp accel mode ??? */
+ svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
+ dac_set_mode(par->dac, DAC_RGB0888_16);
+ hmul = 3;
+ hdiv = 2;
+ break;
+ case 7:
+ pr_debug("fb%d: 8/8/8/8 truecolor\n", info->node);
+
+ vga_wseq(NULL, 0x11, 0x1E); /* 32bpp accel mode */
+ svga_wcrt_mask(0x46, 0x04, 0x04); /* 16bit pixel path */
+ dac_set_mode(par->dac, DAC_RGB8888_16);
+ hmul = 2;
+ break;
+ default:
+ printk(KERN_ERR "fb%d: unsupported mode - bug\n", info->node);
+ return -EINVAL;
+ }
+
+ ark_set_pixclock(info, (hdiv * info->var.pixclock) / hmul);
+ svga_set_timings(&ark_timing_regs, &(info->var), hmul, hdiv,
+ (info->var.vmode & FB_VMODE_DOUBLE) ? 2 : 1,
+ (info->var.vmode & FB_VMODE_INTERLACED) ? 2 : 1,
+ hmul, info->node);
+
+ /* Set interlaced mode start/end register */
+ value = info->var.xres + info->var.left_margin + info->var.right_margin + info->var.hsync_len;
+ value = ((value * hmul / hdiv) / 8) - 5;
+ vga_wcrt(NULL, 0x42, (value + 1) / 2);
+
+ memset_io(info->screen_base, 0x00, screen_size);
+ /* Device and screen back on */
+ svga_wcrt_mask(0x17, 0x80, 0x80);
+ svga_wseq_mask(0x01, 0x00, 0x20);
+
+ return 0;
+}
+
+/* Set a colour register */
+
+static int arkfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+ u_int transp, struct fb_info *fb)
+{
+ switch (fb->var.bits_per_pixel) {
+ case 0:
+ case 4:
+ if (regno >= 16)
+ return -EINVAL;
+
+ if ((fb->var.bits_per_pixel == 4) &&
+ (fb->var.nonstd == 0)) {
+ outb(0xF0, VGA_PEL_MSK);
+ outb(regno*16, VGA_PEL_IW);
+ } else {
+ outb(0x0F, VGA_PEL_MSK);
+ outb(regno, VGA_PEL_IW);
+ }
+ outb(red >> 10, VGA_PEL_D);
+ outb(green >> 10, VGA_PEL_D);
+ outb(blue >> 10, VGA_PEL_D);
+ break;
+ case 8:
+ if (regno >= 256)
+ return -EINVAL;
+
+ outb(0xFF, VGA_PEL_MSK);
+ outb(regno, VGA_PEL_IW);
+ outb(red >> 10, VGA_PEL_D);
+ outb(green >> 10, VGA_PEL_D);
+ outb(blue >> 10, VGA_PEL_D);
+ break;
+ case 16:
+ if (regno >= 16)
+ return 0;
+
+ if (fb->var.green.length == 5)
+ ((u32*)fb->pseudo_palette)[regno] = ((red & 0xF800) >> 1) |
+ ((green & 0xF800) >> 6) | ((blue & 0xF800) >> 11);
+ else if (fb->var.green.length == 6)
+ ((u32*)fb->pseudo_palette)[regno] = (red & 0xF800) |
+ ((green & 0xFC00) >> 5) | ((blue & 0xF800) >> 11);
+ else
+ return -EINVAL;
+ break;
+ case 24:
+ case 32:
+ if (regno >= 16)
+ return 0;
+
+ ((u32*)fb->pseudo_palette)[regno] = ((red & 0xFF00) << 8) |
+ (green & 0xFF00) | ((blue & 0xFF00) >> 8);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Set the display blanking state */
+
+static int arkfb_blank(int blank_mode, struct fb_info *info)
+{
+ switch (blank_mode) {
+ case FB_BLANK_UNBLANK:
+ pr_debug("fb%d: unblank\n", info->node);
+ svga_wseq_mask(0x01, 0x00, 0x20);
+ svga_wcrt_mask(0x17, 0x80, 0x80);
+ break;
+ case FB_BLANK_NORMAL:
+ pr_debug("fb%d: blank\n", info->node);
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ svga_wcrt_mask(0x17, 0x80, 0x80);
+ break;
+ case FB_BLANK_POWERDOWN:
+ case FB_BLANK_HSYNC_SUSPEND:
+ case FB_BLANK_VSYNC_SUSPEND:
+ pr_debug("fb%d: sync down\n", info->node);
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ svga_wcrt_mask(0x17, 0x00, 0x80);
+ break;
+ }
+ return 0;
+}
+
+
+/* Pan the display */
+
+static int arkfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ unsigned int offset;
+
+ /* Calculate the offset */
+ if (var->bits_per_pixel == 0) {
+ offset = (var->yoffset / 16) * (var->xres_virtual / 2) + (var->xoffset / 2);
+ offset = offset >> 2;
+ } else {
+ offset = (var->yoffset * info->fix.line_length) +
+ (var->xoffset * var->bits_per_pixel / 8);
+ offset = offset >> ((var->bits_per_pixel == 4) ? 2 : 3);
+ }
+
+ /* Set the offset */
+ svga_wcrt_multi(ark_start_address_regs, offset);
+
+ return 0;
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* Frame buffer operations */
+
+static struct fb_ops arkfb_ops = {
+ .owner = THIS_MODULE,
+ .fb_open = arkfb_open,
+ .fb_release = arkfb_release,
+ .fb_check_var = arkfb_check_var,
+ .fb_set_par = arkfb_set_par,
+ .fb_setcolreg = arkfb_setcolreg,
+ .fb_blank = arkfb_blank,
+ .fb_pan_display = arkfb_pan_display,
+ .fb_fillrect = arkfb_fillrect,
+ .fb_copyarea = cfb_copyarea,
+ .fb_imageblit = arkfb_imageblit,
+ .fb_get_caps = svga_get_caps,
+};
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* PCI probe */
+static int __devinit ark_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+ struct fb_info *info;
+ struct arkfb_info *par;
+ int rc;
+ u8 regval;
+
+ /* Ignore secondary VGA device because there is no VGA arbitration */
+ if (! svga_primary_device(dev)) {
+ dev_info(&(dev->dev), "ignoring secondary device\n");
+ return -ENODEV;
+ }
+
+ /* Allocate and fill driver data structure */
+ info = framebuffer_alloc(sizeof(struct arkfb_info), NULL);
+ if (! info) {
+ dev_err(&(dev->dev), "cannot allocate memory\n");
+ return -ENOMEM;
+ }
+
+ par = info->par;
+ mutex_init(&par->open_lock);
+
+ info->flags = FBINFO_PARTIAL_PAN_OK | FBINFO_HWACCEL_YPAN;
+ info->fbops = &arkfb_ops;
+
+ /* Prepare PCI device */
+ rc = pci_enable_device(dev);
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot enable PCI device\n");
+ goto err_enable_device;
+ }
+
+ rc = pci_request_regions(dev, "arkfb");
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot reserve framebuffer region\n");
+ goto err_request_regions;
+ }
+
+ par->dac = ics5342_init(ark_dac_read_regs, ark_dac_write_regs, info);
+ if (! par->dac) {
+ rc = -ENOMEM;
+ dev_err(&(dev->dev), "RAMDAC initialization failed\n");
+ goto err_dac;
+ }
+
+ info->fix.smem_start = pci_resource_start(dev, 0);
+ info->fix.smem_len = pci_resource_len(dev, 0);
+
+ /* Map physical IO memory address into kernel space */
+ info->screen_base = pci_iomap(dev, 0, 0);
+ if (! info->screen_base) {
+ rc = -ENOMEM;
+ dev_err(&(dev->dev), "iomap for framebuffer failed\n");
+ goto err_iomap;
+ }
+
+ /* FIXME get memsize */
+ regval = vga_rseq(NULL, 0x10);
+ info->screen_size = (1 << (regval >> 6)) << 20;
+ info->fix.smem_len = info->screen_size;
+
+ strcpy(info->fix.id, "ARK 2000PV");
+ info->fix.mmio_start = 0;
+ info->fix.mmio_len = 0;
+ info->fix.type = FB_TYPE_PACKED_PIXELS;
+ info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
+ info->fix.ypanstep = 0;
+ info->fix.accel = FB_ACCEL_NONE;
+ info->pseudo_palette = (void*) (par->pseudo_palette);
+
+ /* Prepare startup mode */
+ rc = fb_find_mode(&(info->var), info, mode, NULL, 0, NULL, 8);
+ if (! ((rc == 1) || (rc == 2))) {
+ rc = -EINVAL;
+ dev_err(&(dev->dev), "mode %s not found\n", mode);
+ goto err_find_mode;
+ }
+
+ rc = fb_alloc_cmap(&info->cmap, 256, 0);
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot allocate colormap\n");
+ goto err_alloc_cmap;
+ }
+
+ rc = register_framebuffer(info);
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot register framebugger\n");
+ goto err_reg_fb;
+ }
+
+ printk(KERN_INFO "fb%d: %s on %s, %d MB RAM\n", info->node, info->fix.id,
+ pci_name(dev), info->fix.smem_len >> 20);
+
+ /* Record a reference to the driver data */
+ pci_set_drvdata(dev, info);
+
+#ifdef CONFIG_MTRR
+ if (mtrr) {
+ par->mtrr_reg = -1;
+ par->mtrr_reg = mtrr_add(info->fix.smem_start, info->fix.smem_len, MTRR_TYPE_WRCOMB, 1);
+ }
+#endif
+
+ return 0;
+
+ /* Error handling */
+err_reg_fb:
+ fb_dealloc_cmap(&info->cmap);
+err_alloc_cmap:
+err_find_mode:
+ pci_iounmap(dev, info->screen_base);
+err_iomap:
+ dac_release(par->dac);
+err_dac:
+ pci_release_regions(dev);
+err_request_regions:
+/* pci_disable_device(dev); */
+err_enable_device:
+ framebuffer_release(info);
+ return rc;
+}
+
+/* PCI remove */
+
+static void __devexit ark_pci_remove(struct pci_dev *dev)
+{
+ struct fb_info *info = pci_get_drvdata(dev);
+ struct arkfb_info *par = info->par;
+
+ if (info) {
+#ifdef CONFIG_MTRR
+ if (par->mtrr_reg >= 0) {
+ mtrr_del(par->mtrr_reg, 0, 0);
+ par->mtrr_reg = -1;
+ }
+#endif
+
+ dac_release(par->dac);
+ unregister_framebuffer(info);
+ fb_dealloc_cmap(&info->cmap);
+
+ pci_iounmap(dev, info->screen_base);
+ pci_release_regions(dev);
+/* pci_disable_device(dev); */
+
+ pci_set_drvdata(dev, NULL);
+ framebuffer_release(info);
+ }
+}
+
+
+#ifdef CONFIG_PM
+/* PCI suspend */
+
+static int ark_pci_suspend (struct pci_dev* dev, pm_message_t state)
+{
+ struct fb_info *info = pci_get_drvdata(dev);
+ struct arkfb_info *par = info->par;
+
+ dev_info(&(dev->dev), "suspend\n");
+
+ acquire_console_sem();
+ mutex_lock(&(par->open_lock));
+
+ if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
+ mutex_unlock(&(par->open_lock));
+ release_console_sem();
+ return 0;
+ }
+
+ fb_set_suspend(info, 1);
+
+ pci_save_state(dev);
+ pci_disable_device(dev);
+ pci_set_power_state(dev, pci_choose_state(dev, state));
+
+ mutex_unlock(&(par->open_lock));
+ release_console_sem();
+
+ return 0;
+}
+
+
+/* PCI resume */
+
+static int ark_pci_resume (struct pci_dev* dev)
+{
+ struct fb_info *info = pci_get_drvdata(dev);
+ struct arkfb_info *par = info->par;
+
+ dev_info(&(dev->dev), "resume\n");
+
+ acquire_console_sem();
+ mutex_lock(&(par->open_lock));
+
+ if (par->ref_count == 0) {
+ mutex_unlock(&(par->open_lock));
+ release_console_sem();
+ return 0;
+ }
+
+ pci_set_power_state(dev, PCI_D0);
+ pci_restore_state(dev);
+
+ if (pci_enable_device(dev))
+ goto fail;
+
+ pci_set_master(dev);
+
+ arkfb_set_par(info);
+ fb_set_suspend(info, 0);
+
+ mutex_unlock(&(par->open_lock));
+fail:
+ release_console_sem();
+ return 0;
+}
+#else
+#define ark_pci_suspend NULL
+#define ark_pci_resume NULL
+#endif /* CONFIG_PM */
+
+/* List of boards that we are trying to support */
+
+static struct pci_device_id ark_devices[] __devinitdata = {
+ {PCI_DEVICE(0xEDD8, 0xA099)},
+ {0, 0, 0, 0, 0, 0, 0}
+};
+
+
+MODULE_DEVICE_TABLE(pci, ark_devices);
+
+static struct pci_driver arkfb_pci_driver = {
+ .name = "arkfb",
+ .id_table = ark_devices,
+ .probe = ark_pci_probe,
+ .remove = __devexit_p(ark_pci_remove),
+ .suspend = ark_pci_suspend,
+ .resume = ark_pci_resume,
+};
+
+/* Cleanup */
+
+static void __exit arkfb_cleanup(void)
+{
+ pr_debug("arkfb: cleaning up\n");
+ pci_unregister_driver(&arkfb_pci_driver);
+}
+
+/* Driver Initialisation */
+
+static int __init arkfb_init(void)
+{
+
+#ifndef MODULE
+ char *option = NULL;
+
+ if (fb_get_options("arkfb", &option))
+ return -ENODEV;
+
+ if (option && *option)
+ mode = option;
+#endif
+
+ pr_debug("arkfb: initializing\n");
+ return pci_register_driver(&arkfb_pci_driver);
+}
+
+module_init(arkfb_init);
+module_exit(arkfb_cleanup);
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 08d4e11..38c2e25 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1236,6 +1236,10 @@
pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
#elif defined(__arm__) || defined(__sh__) || defined(__m32r__)
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+#elif defined(__avr32__)
+ vma->vm_page_prot = __pgprot((pgprot_val(vma->vm_page_prot)
+ & ~_PAGE_CACHABLE)
+ | (_PAGE_BUFFER | _PAGE_DIRTY));
#elif defined(__ia64__)
if (efi_range_is_wc(vma->vm_start, vma->vm_end - vma->vm_start))
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
diff --git a/drivers/video/nvidia/nv_hw.c b/drivers/video/nvidia/nv_hw.c
index f297c7b..c627955 100644
--- a/drivers/video/nvidia/nv_hw.c
+++ b/drivers/video/nvidia/nv_hw.c
@@ -149,8 +149,7 @@
pll = NV_RD32(par->PMC, 0x4024);
M = pll & 0xFF;
N = (pll >> 8) & 0xFF;
- if (((par->Chipset & 0xfff0) == 0x0290) ||
- ((par->Chipset & 0xfff0) == 0x0390)) {
+ if (((par->Chipset & 0xfff0) == 0x0290) || ((par->Chipset & 0xfff0) == 0x0390) || ((par->Chipset & 0xfff0) == 0x02E0)) {
MB = 1;
NB = 1;
} else {
@@ -963,6 +962,7 @@
if (((par->Chipset & 0xfff0) == 0x0090) ||
((par->Chipset & 0xfff0) == 0x01D0) ||
+ ((par->Chipset & 0xfff0) == 0x02E0) ||
((par->Chipset & 0xfff0) == 0x0290))
regions = 15;
for(i = 0; i < regions; i++) {
@@ -1275,6 +1275,7 @@
0x00100000);
break;
case 0x0090:
+ case 0x02E0:
case 0x0290:
NV_WR32(par->PRAMDAC, 0x0608,
NV_RD32(par->PRAMDAC, 0x0608) |
@@ -1352,6 +1353,7 @@
} else {
if (((par->Chipset & 0xfff0) == 0x0090) ||
((par->Chipset & 0xfff0) == 0x01D0) ||
+ ((par->Chipset & 0xfff0) == 0x02E0) ||
((par->Chipset & 0xfff0) == 0x0290)) {
for (i = 0; i < 60; i++) {
NV_WR32(par->PGRAPH,
@@ -1403,6 +1405,7 @@
} else {
if ((par->Chipset & 0xfff0) == 0x0090 ||
(par->Chipset & 0xfff0) == 0x01D0 ||
+ (par->Chipset & 0xfff0) == 0x02E0 ||
(par->Chipset & 0xfff0) == 0x0290) {
NV_WR32(par->PGRAPH, 0x0DF0,
NV_RD32(par->PFB, 0x0200));
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index 7c36b5f..f85edf0 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -1243,6 +1243,7 @@
case 0x0140: /* GeForce 6600 */
case 0x0160: /* GeForce 6200 */
case 0x01D0: /* GeForce 7200, 7300, 7400 */
+ case 0x02E0: /* GeForce 7300 GT */
case 0x0090: /* GeForce 7800 */
case 0x0210: /* GeForce 6800 */
case 0x0220: /* GeForce 6200 */
diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c
index 756fafb..d117358 100644
--- a/drivers/video/s3fb.c
+++ b/drivers/video/s3fb.c
@@ -796,23 +796,6 @@
return 0;
}
-/* Get capabilities of accelerator based on the mode */
-
-static void s3fb_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
- struct fb_var_screeninfo *var)
-{
- if (var->bits_per_pixel == 0) {
- /* can only support 256 8x16 bitmap */
- caps->x = 1 << (8 - 1);
- caps->y = 1 << (16 - 1);
- caps->len = 256;
- } else {
- caps->x = ~(u32)0;
- caps->y = ~(u32)0;
- caps->len = ~(u32)0;
- }
-}
-
/* ------------------------------------------------------------------------- */
/* Frame buffer operations */
@@ -829,7 +812,7 @@
.fb_fillrect = s3fb_fillrect,
.fb_copyarea = cfb_copyarea,
.fb_imageblit = s3fb_imageblit,
- .fb_get_caps = s3fb_get_caps,
+ .fb_get_caps = svga_get_caps,
};
/* ------------------------------------------------------------------------- */
diff --git a/drivers/video/svgalib.c b/drivers/video/svgalib.c
index 079cdc9..25df928 100644
--- a/drivers/video/svgalib.c
+++ b/drivers/video/svgalib.c
@@ -347,6 +347,23 @@
return 256;
}
+/* Get capabilities of accelerator based on the mode */
+
+void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
+ struct fb_var_screeninfo *var)
+{
+ if (var->bits_per_pixel == 0) {
+ /* can only support 256 8x16 bitmap */
+ caps->x = 1 << (8 - 1);
+ caps->y = 1 << (16 - 1);
+ caps->len = 256;
+ } else {
+ caps->x = (var->bits_per_pixel == 4) ? 1 << (8 - 1) : ~(u32)0;
+ caps->y = ~(u32)0;
+ caps->len = ~(u32)0;
+ }
+}
+EXPORT_SYMBOL(svga_get_caps);
/* ------------------------------------------------------------------------- */
diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c
new file mode 100644
index 0000000..5e9755e
--- /dev/null
+++ b/drivers/video/vt8623fb.c
@@ -0,0 +1,927 @@
+/*
+ * linux/drivers/video/vt8623fb.c - fbdev driver for
+ * integrated graphic core in VIA VT8623 [CLE266] chipset
+ *
+ * Copyright (c) 2006-2007 Ondrej Zajicek <santiago@crfreenet.org>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive for
+ * more details.
+ *
+ * Code is based on s3fb, some parts are from David Boucher's viafb
+ * (http://davesdomain.org.uk/viafb/)
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/fb.h>
+#include <linux/svga.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */
+#include <video/vga.h>
+
+#ifdef CONFIG_MTRR
+#include <asm/mtrr.h>
+#endif
+
+struct vt8623fb_info {
+ char __iomem *mmio_base;
+ int mtrr_reg;
+ struct vgastate state;
+ struct mutex open_lock;
+ unsigned int ref_count;
+ u32 pseudo_palette[16];
+};
+
+
+
+/* ------------------------------------------------------------------------- */
+
+static const struct svga_fb_format vt8623fb_formats[] = {
+ { 0, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
+ FB_TYPE_TEXT, FB_AUX_TEXT_SVGA_STEP8, FB_VISUAL_PSEUDOCOLOR, 16, 16},
+ { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 16, 16},
+ { 4, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 1,
+ FB_TYPE_INTERLEAVED_PLANES, 1, FB_VISUAL_PSEUDOCOLOR, 16, 16},
+ { 8, {0, 6, 0}, {0, 6, 0}, {0, 6, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_PSEUDOCOLOR, 8, 8},
+/* {16, {10, 5, 0}, {5, 5, 0}, {0, 5, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4}, */
+ {16, {11, 5, 0}, {5, 6, 0}, {0, 5, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 4, 4},
+ {32, {16, 8, 0}, {8, 8, 0}, {0, 8, 0}, {0, 0, 0}, 0,
+ FB_TYPE_PACKED_PIXELS, 0, FB_VISUAL_TRUECOLOR, 2, 2},
+ SVGA_FORMAT_END
+};
+
+static const struct svga_pll vt8623_pll = {2, 127, 2, 7, 0, 3,
+ 60000, 300000, 14318};
+
+/* CRT timing register sets */
+
+struct vga_regset vt8623_h_total_regs[] = {{0x00, 0, 7}, {0x36, 3, 3}, VGA_REGSET_END};
+struct vga_regset vt8623_h_display_regs[] = {{0x01, 0, 7}, VGA_REGSET_END};
+struct vga_regset vt8623_h_blank_start_regs[] = {{0x02, 0, 7}, VGA_REGSET_END};
+struct vga_regset vt8623_h_blank_end_regs[] = {{0x03, 0, 4}, {0x05, 7, 7}, {0x33, 5, 5}, VGA_REGSET_END};
+struct vga_regset vt8623_h_sync_start_regs[] = {{0x04, 0, 7}, {0x33, 4, 4}, VGA_REGSET_END};
+struct vga_regset vt8623_h_sync_end_regs[] = {{0x05, 0, 4}, VGA_REGSET_END};
+
+struct vga_regset vt8623_v_total_regs[] = {{0x06, 0, 7}, {0x07, 0, 0}, {0x07, 5, 5}, {0x35, 0, 0}, VGA_REGSET_END};
+struct vga_regset vt8623_v_display_regs[] = {{0x12, 0, 7}, {0x07, 1, 1}, {0x07, 6, 6}, {0x35, 2, 2}, VGA_REGSET_END};
+struct vga_regset vt8623_v_blank_start_regs[] = {{0x15, 0, 7}, {0x07, 3, 3}, {0x09, 5, 5}, {0x35, 3, 3}, VGA_REGSET_END};
+struct vga_regset vt8623_v_blank_end_regs[] = {{0x16, 0, 7}, VGA_REGSET_END};
+struct vga_regset vt8623_v_sync_start_regs[] = {{0x10, 0, 7}, {0x07, 2, 2}, {0x07, 7, 7}, {0x35, 1, 1}, VGA_REGSET_END};
+struct vga_regset vt8623_v_sync_end_regs[] = {{0x11, 0, 3}, VGA_REGSET_END};
+
+struct vga_regset vt8623_offset_regs[] = {{0x13, 0, 7}, {0x35, 5, 7}, VGA_REGSET_END};
+struct vga_regset vt8623_line_compare_regs[] = {{0x18, 0, 7}, {0x07, 4, 4}, {0x09, 6, 6}, {0x33, 0, 2}, {0x35, 4, 4}, VGA_REGSET_END};
+struct vga_regset vt8623_fetch_count_regs[] = {{0x1C, 0, 7}, {0x1D, 0, 1}, VGA_REGSET_END};
+struct vga_regset vt8623_start_address_regs[] = {{0x0d, 0, 7}, {0x0c, 0, 7}, {0x34, 0, 7}, {0x48, 0, 1}, VGA_REGSET_END};
+
+struct svga_timing_regs vt8623_timing_regs = {
+ vt8623_h_total_regs, vt8623_h_display_regs, vt8623_h_blank_start_regs,
+ vt8623_h_blank_end_regs, vt8623_h_sync_start_regs, vt8623_h_sync_end_regs,
+ vt8623_v_total_regs, vt8623_v_display_regs, vt8623_v_blank_start_regs,
+ vt8623_v_blank_end_regs, vt8623_v_sync_start_regs, vt8623_v_sync_end_regs,
+};
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* Module parameters */
+
+static char *mode = "640x480-8@60";
+
+#ifdef CONFIG_MTRR
+static int mtrr = 1;
+#endif
+
+MODULE_AUTHOR("(c) 2006 Ondrej Zajicek <santiago@crfreenet.org>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("fbdev driver for integrated graphics core in VIA VT8623 [CLE266]");
+
+module_param(mode, charp, 0644);
+MODULE_PARM_DESC(mode, "Default video mode ('640x480-8@60', etc)");
+
+#ifdef CONFIG_MTRR
+module_param(mtrr, int, 0444);
+MODULE_PARM_DESC(mtrr, "Enable write-combining with MTRR (1=enable, 0=disable, default=1)");
+#endif
+
+
+/* ------------------------------------------------------------------------- */
+
+
+static struct fb_tile_ops vt8623fb_tile_ops = {
+ .fb_settile = svga_settile,
+ .fb_tilecopy = svga_tilecopy,
+ .fb_tilefill = svga_tilefill,
+ .fb_tileblit = svga_tileblit,
+ .fb_tilecursor = svga_tilecursor,
+ .fb_get_tilemax = svga_get_tilemax,
+};
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* image data is MSB-first, fb structure is MSB-first too */
+static inline u32 expand_color(u32 c)
+{
+ return ((c & 1) | ((c & 2) << 7) | ((c & 4) << 14) | ((c & 8) << 21)) * 0xFF;
+}
+
+/* vt8623fb_iplan_imageblit silently assumes that almost everything is 8-pixel aligned */
+static void vt8623fb_iplan_imageblit(struct fb_info *info, const struct fb_image *image)
+{
+ u32 fg = expand_color(image->fg_color);
+ u32 bg = expand_color(image->bg_color);
+ const u8 *src1, *src;
+ u8 __iomem *dst1;
+ u32 __iomem *dst;
+ u32 val;
+ int x, y;
+
+ src1 = image->data;
+ dst1 = info->screen_base + (image->dy * info->fix.line_length)
+ + ((image->dx / 8) * 4);
+
+ for (y = 0; y < image->height; y++) {
+ src = src1;
+ dst = (u32 __iomem *) dst1;
+ for (x = 0; x < image->width; x += 8) {
+ val = *(src++) * 0x01010101;
+ val = (val & fg) | (~val & bg);
+ fb_writel(val, dst++);
+ }
+ src1 += image->width / 8;
+ dst1 += info->fix.line_length;
+ }
+}
+
+/* vt8623fb_iplan_fillrect silently assumes that almost everything is 8-pixel aligned */
+static void vt8623fb_iplan_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
+{
+ u32 fg = expand_color(rect->color);
+ u8 __iomem *dst1;
+ u32 __iomem *dst;
+ int x, y;
+
+ dst1 = info->screen_base + (rect->dy * info->fix.line_length)
+ + ((rect->dx / 8) * 4);
+
+ for (y = 0; y < rect->height; y++) {
+ dst = (u32 __iomem *) dst1;
+ for (x = 0; x < rect->width; x += 8) {
+ fb_writel(fg, dst++);
+ }
+ dst1 += info->fix.line_length;
+ }
+}
+
+
+/* image data is MSB-first, fb structure is high-nibble-in-low-byte-first */
+static inline u32 expand_pixel(u32 c)
+{
+ return (((c & 1) << 24) | ((c & 2) << 27) | ((c & 4) << 14) | ((c & 8) << 17) |
+ ((c & 16) << 4) | ((c & 32) << 7) | ((c & 64) >> 6) | ((c & 128) >> 3)) * 0xF;
+}
+
+/* vt8623fb_cfb4_imageblit silently assumes that almost everything is 8-pixel aligned */
+static void vt8623fb_cfb4_imageblit(struct fb_info *info, const struct fb_image *image)
+{
+ u32 fg = image->fg_color * 0x11111111;
+ u32 bg = image->bg_color * 0x11111111;
+ const u8 *src1, *src;
+ u8 __iomem *dst1;
+ u32 __iomem *dst;
+ u32 val;
+ int x, y;
+
+ src1 = image->data;
+ dst1 = info->screen_base + (image->dy * info->fix.line_length)
+ + ((image->dx / 8) * 4);
+
+ for (y = 0; y < image->height; y++) {
+ src = src1;
+ dst = (u32 __iomem *) dst1;
+ for (x = 0; x < image->width; x += 8) {
+ val = expand_pixel(*(src++));
+ val = (val & fg) | (~val & bg);
+ fb_writel(val, dst++);
+ }
+ src1 += image->width / 8;
+ dst1 += info->fix.line_length;
+ }
+}
+
+static void vt8623fb_imageblit(struct fb_info *info, const struct fb_image *image)
+{
+ if ((info->var.bits_per_pixel == 4) && (image->depth == 1)
+ && ((image->width % 8) == 0) && ((image->dx % 8) == 0)) {
+ if (info->fix.type == FB_TYPE_INTERLEAVED_PLANES)
+ vt8623fb_iplan_imageblit(info, image);
+ else
+ vt8623fb_cfb4_imageblit(info, image);
+ } else
+ cfb_imageblit(info, image);
+}
+
+static void vt8623fb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
+{
+ if ((info->var.bits_per_pixel == 4)
+ && ((rect->width % 8) == 0) && ((rect->dx % 8) == 0)
+ && (info->fix.type == FB_TYPE_INTERLEAVED_PLANES))
+ vt8623fb_iplan_fillrect(info, rect);
+ else
+ cfb_fillrect(info, rect);
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+
+static void vt8623_set_pixclock(struct fb_info *info, u32 pixclock)
+{
+ u16 m, n, r;
+ u8 regval;
+ int rv;
+
+ rv = svga_compute_pll(&vt8623_pll, 1000000000 / pixclock, &m, &n, &r, info->node);
+ if (rv < 0) {
+ printk(KERN_ERR "fb%d: cannot set requested pixclock, keeping old value\n", info->node);
+ return;
+ }
+
+ /* Set VGA misc register */
+ regval = vga_r(NULL, VGA_MIS_R);
+ vga_w(NULL, VGA_MIS_W, regval | VGA_MIS_ENB_PLL_LOAD);
+
+ /* Set clock registers */
+ vga_wseq(NULL, 0x46, (n | (r << 6)));
+ vga_wseq(NULL, 0x47, m);
+
+ udelay(1000);
+
+ /* PLL reset */
+ svga_wseq_mask(0x40, 0x02, 0x02);
+ svga_wseq_mask(0x40, 0x00, 0x02);
+}
+
+
+static int vt8623fb_open(struct fb_info *info, int user)
+{
+ struct vt8623fb_info *par = info->par;
+
+ mutex_lock(&(par->open_lock));
+ if (par->ref_count == 0) {
+ memset(&(par->state), 0, sizeof(struct vgastate));
+ par->state.flags = VGA_SAVE_MODE | VGA_SAVE_FONTS | VGA_SAVE_CMAP;
+ par->state.num_crtc = 0xA2;
+ par->state.num_seq = 0x50;
+ save_vga(&(par->state));
+ }
+
+ par->ref_count++;
+ mutex_unlock(&(par->open_lock));
+
+ return 0;
+}
+
+static int vt8623fb_release(struct fb_info *info, int user)
+{
+ struct vt8623fb_info *par = info->par;
+
+ mutex_lock(&(par->open_lock));
+ if (par->ref_count == 0) {
+ mutex_unlock(&(par->open_lock));
+ return -EINVAL;
+ }
+
+ if (par->ref_count == 1)
+ restore_vga(&(par->state));
+
+ par->ref_count--;
+ mutex_unlock(&(par->open_lock));
+
+ return 0;
+}
+
+static int vt8623fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ int rv, mem, step;
+
+ /* Find appropriate format */
+ rv = svga_match_format (vt8623fb_formats, var, NULL);
+ if (rv < 0)
+ {
+ printk(KERN_ERR "fb%d: unsupported mode requested\n", info->node);
+ return rv;
+ }
+
+ /* Do not allow to have real resoulution larger than virtual */
+ if (var->xres > var->xres_virtual)
+ var->xres_virtual = var->xres;
+
+ if (var->yres > var->yres_virtual)
+ var->yres_virtual = var->yres;
+
+ /* Round up xres_virtual to have proper alignment of lines */
+ step = vt8623fb_formats[rv].xresstep - 1;
+ var->xres_virtual = (var->xres_virtual+step) & ~step;
+
+ /* Check whether have enough memory */
+ mem = ((var->bits_per_pixel * var->xres_virtual) >> 3) * var->yres_virtual;
+ if (mem > info->screen_size)
+ {
+ printk(KERN_ERR "fb%d: not enough framebuffer memory (%d kB requested , %d kB available)\n", info->node, mem >> 10, (unsigned int) (info->screen_size >> 10));
+ return -EINVAL;
+ }
+
+ /* Text mode is limited to 256 kB of memory */
+ if ((var->bits_per_pixel == 0) && (mem > (256*1024)))
+ {
+ printk(KERN_ERR "fb%d: text framebuffer size too large (%d kB requested, 256 kB possible)\n", info->node, mem >> 10);
+ return -EINVAL;
+ }
+
+ rv = svga_check_timings (&vt8623_timing_regs, var, info->node);
+ if (rv < 0)
+ {
+ printk(KERN_ERR "fb%d: invalid timings requested\n", info->node);
+ return rv;
+ }
+
+ /* Interlaced mode not supported */
+ if (var->vmode & FB_VMODE_INTERLACED)
+ return -EINVAL;
+
+ return 0;
+}
+
+
+static int vt8623fb_set_par(struct fb_info *info)
+{
+ u32 mode, offset_value, fetch_value, screen_size;
+ u32 bpp = info->var.bits_per_pixel;
+
+ if (bpp != 0) {
+ info->fix.ypanstep = 1;
+ info->fix.line_length = (info->var.xres_virtual * bpp) / 8;
+
+ info->flags &= ~FBINFO_MISC_TILEBLITTING;
+ info->tileops = NULL;
+
+ /* in 4bpp supports 8p wide tiles only, any tiles otherwise */
+ info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0);
+ info->pixmap.blit_y = ~(u32)0;
+
+ offset_value = (info->var.xres_virtual * bpp) / 64;
+ fetch_value = ((info->var.xres * bpp) / 128) + 4;
+
+ if (bpp == 4)
+ fetch_value = (info->var.xres / 8) + 8; /* + 0 is OK */
+
+ screen_size = info->var.yres_virtual * info->fix.line_length;
+ } else {
+ info->fix.ypanstep = 16;
+ info->fix.line_length = 0;
+
+ info->flags |= FBINFO_MISC_TILEBLITTING;
+ info->tileops = &vt8623fb_tile_ops;
+
+ /* supports 8x16 tiles only */
+ info->pixmap.blit_x = 1 << (8 - 1);
+ info->pixmap.blit_y = 1 << (16 - 1);
+
+ offset_value = info->var.xres_virtual / 16;
+ fetch_value = (info->var.xres / 8) + 8;
+ screen_size = (info->var.xres_virtual * info->var.yres_virtual) / 64;
+ }
+
+ info->var.xoffset = 0;
+ info->var.yoffset = 0;
+ info->var.activate = FB_ACTIVATE_NOW;
+
+ /* Unlock registers */
+ svga_wseq_mask(0x10, 0x01, 0x01);
+ svga_wcrt_mask(0x11, 0x00, 0x80);
+ svga_wcrt_mask(0x47, 0x00, 0x01);
+
+ /* Device, screen and sync off */
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ svga_wcrt_mask(0x36, 0x30, 0x30);
+ svga_wcrt_mask(0x17, 0x00, 0x80);
+
+ /* Set default values */
+ svga_set_default_gfx_regs();
+ svga_set_default_atc_regs();
+ svga_set_default_seq_regs();
+ svga_set_default_crt_regs();
+ svga_wcrt_multi(vt8623_line_compare_regs, 0xFFFFFFFF);
+ svga_wcrt_multi(vt8623_start_address_regs, 0);
+
+ svga_wcrt_multi(vt8623_offset_regs, offset_value);
+ svga_wseq_multi(vt8623_fetch_count_regs, fetch_value);
+
+ if (info->var.vmode & FB_VMODE_DOUBLE)
+ svga_wcrt_mask(0x09, 0x80, 0x80);
+ else
+ svga_wcrt_mask(0x09, 0x00, 0x80);
+
+ svga_wseq_mask(0x1E, 0xF0, 0xF0); // DI/DVP bus
+ svga_wseq_mask(0x2A, 0x0F, 0x0F); // DI/DVP bus
+ svga_wseq_mask(0x16, 0x08, 0xBF); // FIFO read treshold
+ vga_wseq(NULL, 0x17, 0x1F); // FIFO depth
+ vga_wseq(NULL, 0x18, 0x4E);
+ svga_wseq_mask(0x1A, 0x08, 0x08); // enable MMIO ?
+
+ vga_wcrt(NULL, 0x32, 0x00);
+ vga_wcrt(NULL, 0x34, 0x00);
+ vga_wcrt(NULL, 0x6A, 0x80);
+ vga_wcrt(NULL, 0x6A, 0xC0);
+
+ vga_wgfx(NULL, 0x20, 0x00);
+ vga_wgfx(NULL, 0x21, 0x00);
+ vga_wgfx(NULL, 0x22, 0x00);
+
+ /* Set SR15 according to number of bits per pixel */
+ mode = svga_match_format(vt8623fb_formats, &(info->var), &(info->fix));
+ switch (mode) {
+ case 0:
+ pr_debug("fb%d: text mode\n", info->node);
+ svga_set_textmode_vga_regs();
+ svga_wseq_mask(0x15, 0x00, 0xFE);
+ svga_wcrt_mask(0x11, 0x60, 0x70);
+ break;
+ case 1:
+ pr_debug("fb%d: 4 bit pseudocolor\n", info->node);
+ vga_wgfx(NULL, VGA_GFX_MODE, 0x40);
+ svga_wseq_mask(0x15, 0x20, 0xFE);
+ svga_wcrt_mask(0x11, 0x00, 0x70);
+ break;
+ case 2:
+ pr_debug("fb%d: 4 bit pseudocolor, planar\n", info->node);
+ svga_wseq_mask(0x15, 0x00, 0xFE);
+ svga_wcrt_mask(0x11, 0x00, 0x70);
+ break;
+ case 3:
+ pr_debug("fb%d: 8 bit pseudocolor\n", info->node);
+ svga_wseq_mask(0x15, 0x22, 0xFE);
+ break;
+ case 4:
+ pr_debug("fb%d: 5/6/5 truecolor\n", info->node);
+ svga_wseq_mask(0x15, 0xB6, 0xFE);
+ break;
+ case 5:
+ pr_debug("fb%d: 8/8/8 truecolor\n", info->node);
+ svga_wseq_mask(0x15, 0xAE, 0xFE);
+ break;
+ default:
+ printk(KERN_ERR "vt8623fb: unsupported mode - bug\n");
+ return (-EINVAL);
+ }
+
+ vt8623_set_pixclock(info, info->var.pixclock);
+ svga_set_timings(&vt8623_timing_regs, &(info->var), 1, 1,
+ (info->var.vmode & FB_VMODE_DOUBLE) ? 2 : 1, 1,
+ 1, info->node);
+
+ memset_io(info->screen_base, 0x00, screen_size);
+
+ /* Device and screen back on */
+ svga_wcrt_mask(0x17, 0x80, 0x80);
+ svga_wcrt_mask(0x36, 0x00, 0x30);
+ svga_wseq_mask(0x01, 0x00, 0x20);
+
+ return 0;
+}
+
+
+static int vt8623fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
+ u_int transp, struct fb_info *fb)
+{
+ switch (fb->var.bits_per_pixel) {
+ case 0:
+ case 4:
+ if (regno >= 16)
+ return -EINVAL;
+
+ outb(0x0F, VGA_PEL_MSK);
+ outb(regno, VGA_PEL_IW);
+ outb(red >> 10, VGA_PEL_D);
+ outb(green >> 10, VGA_PEL_D);
+ outb(blue >> 10, VGA_PEL_D);
+ break;
+ case 8:
+ if (regno >= 256)
+ return -EINVAL;
+
+ outb(0xFF, VGA_PEL_MSK);
+ outb(regno, VGA_PEL_IW);
+ outb(red >> 10, VGA_PEL_D);
+ outb(green >> 10, VGA_PEL_D);
+ outb(blue >> 10, VGA_PEL_D);
+ break;
+ case 16:
+ if (regno >= 16)
+ return 0;
+
+ if (fb->var.green.length == 5)
+ ((u32*)fb->pseudo_palette)[regno] = ((red & 0xF800) >> 1) |
+ ((green & 0xF800) >> 6) | ((blue & 0xF800) >> 11);
+ else if (fb->var.green.length == 6)
+ ((u32*)fb->pseudo_palette)[regno] = (red & 0xF800) |
+ ((green & 0xFC00) >> 5) | ((blue & 0xF800) >> 11);
+ else
+ return -EINVAL;
+ break;
+ case 24:
+ case 32:
+ if (regno >= 16)
+ return 0;
+
+ /* ((transp & 0xFF00) << 16) */
+ ((u32*)fb->pseudo_palette)[regno] = ((red & 0xFF00) << 8) |
+ (green & 0xFF00) | ((blue & 0xFF00) >> 8);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+static int vt8623fb_blank(int blank_mode, struct fb_info *info)
+{
+ switch (blank_mode) {
+ case FB_BLANK_UNBLANK:
+ pr_debug("fb%d: unblank\n", info->node);
+ svga_wcrt_mask(0x36, 0x00, 0x30);
+ svga_wseq_mask(0x01, 0x00, 0x20);
+ break;
+ case FB_BLANK_NORMAL:
+ pr_debug("fb%d: blank\n", info->node);
+ svga_wcrt_mask(0x36, 0x00, 0x30);
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ break;
+ case FB_BLANK_HSYNC_SUSPEND:
+ pr_debug("fb%d: DPMS standby (hsync off)\n", info->node);
+ svga_wcrt_mask(0x36, 0x10, 0x30);
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ break;
+ case FB_BLANK_VSYNC_SUSPEND:
+ pr_debug("fb%d: DPMS suspend (vsync off)\n", info->node);
+ svga_wcrt_mask(0x36, 0x20, 0x30);
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ break;
+ case FB_BLANK_POWERDOWN:
+ pr_debug("fb%d: DPMS off (no sync)\n", info->node);
+ svga_wcrt_mask(0x36, 0x30, 0x30);
+ svga_wseq_mask(0x01, 0x20, 0x20);
+ break;
+ }
+
+ return 0;
+}
+
+
+static int vt8623fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+ unsigned int offset;
+
+ /* Calculate the offset */
+ if (var->bits_per_pixel == 0) {
+ offset = (var->yoffset / 16) * var->xres_virtual + var->xoffset;
+ offset = offset >> 3;
+ } else {
+ offset = (var->yoffset * info->fix.line_length) +
+ (var->xoffset * var->bits_per_pixel / 8);
+ offset = offset >> ((var->bits_per_pixel == 4) ? 2 : 1);
+ }
+
+ /* Set the offset */
+ svga_wcrt_multi(vt8623_start_address_regs, offset);
+
+ return 0;
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+
+/* Frame buffer operations */
+
+static struct fb_ops vt8623fb_ops = {
+ .owner = THIS_MODULE,
+ .fb_open = vt8623fb_open,
+ .fb_release = vt8623fb_release,
+ .fb_check_var = vt8623fb_check_var,
+ .fb_set_par = vt8623fb_set_par,
+ .fb_setcolreg = vt8623fb_setcolreg,
+ .fb_blank = vt8623fb_blank,
+ .fb_pan_display = vt8623fb_pan_display,
+ .fb_fillrect = vt8623fb_fillrect,
+ .fb_copyarea = cfb_copyarea,
+ .fb_imageblit = vt8623fb_imageblit,
+ .fb_get_caps = svga_get_caps,
+};
+
+
+/* PCI probe */
+
+static int __devinit vt8623_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+ struct fb_info *info;
+ struct vt8623fb_info *par;
+ unsigned int memsize1, memsize2;
+ int rc;
+
+ /* Ignore secondary VGA device because there is no VGA arbitration */
+ if (! svga_primary_device(dev)) {
+ dev_info(&(dev->dev), "ignoring secondary device\n");
+ return -ENODEV;
+ }
+
+ /* Allocate and fill driver data structure */
+ info = framebuffer_alloc(sizeof(struct vt8623fb_info), NULL);
+ if (! info) {
+ dev_err(&(dev->dev), "cannot allocate memory\n");
+ return -ENOMEM;
+ }
+
+ par = info->par;
+ mutex_init(&par->open_lock);
+
+ info->flags = FBINFO_PARTIAL_PAN_OK | FBINFO_HWACCEL_YPAN;
+ info->fbops = &vt8623fb_ops;
+
+ /* Prepare PCI device */
+
+ rc = pci_enable_device(dev);
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot enable PCI device\n");
+ goto err_enable_device;
+ }
+
+ rc = pci_request_regions(dev, "vt8623fb");
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot reserve framebuffer region\n");
+ goto err_request_regions;
+ }
+
+ info->fix.smem_start = pci_resource_start(dev, 0);
+ info->fix.smem_len = pci_resource_len(dev, 0);
+ info->fix.mmio_start = pci_resource_start(dev, 1);
+ info->fix.mmio_len = pci_resource_len(dev, 1);
+
+ /* Map physical IO memory address into kernel space */
+ info->screen_base = pci_iomap(dev, 0, 0);
+ if (! info->screen_base) {
+ rc = -ENOMEM;
+ dev_err(&(dev->dev), "iomap for framebuffer failed\n");
+ goto err_iomap_1;
+ }
+
+ par->mmio_base = pci_iomap(dev, 1, 0);
+ if (! par->mmio_base) {
+ rc = -ENOMEM;
+ dev_err(&(dev->dev), "iomap for MMIO failed\n");
+ goto err_iomap_2;
+ }
+
+ /* Find how many physical memory there is on card */
+ memsize1 = (vga_rseq(NULL, 0x34) + 1) >> 1;
+ memsize2 = vga_rseq(NULL, 0x39) << 2;
+
+ if ((16 <= memsize1) && (memsize1 <= 64) && (memsize1 == memsize2))
+ info->screen_size = memsize1 << 20;
+ else {
+ dev_err(&(dev->dev), "memory size detection failed (%x %x), suppose 16 MB\n", memsize1, memsize2);
+ info->screen_size = 16 << 20;
+ }
+
+ info->fix.smem_len = info->screen_size;
+ strcpy(info->fix.id, "VIA VT8623");
+ info->fix.type = FB_TYPE_PACKED_PIXELS;
+ info->fix.visual = FB_VISUAL_PSEUDOCOLOR;
+ info->fix.ypanstep = 0;
+ info->fix.accel = FB_ACCEL_NONE;
+ info->pseudo_palette = (void*)par->pseudo_palette;
+
+ /* Prepare startup mode */
+
+ rc = fb_find_mode(&(info->var), info, mode, NULL, 0, NULL, 8);
+ if (! ((rc == 1) || (rc == 2))) {
+ rc = -EINVAL;
+ dev_err(&(dev->dev), "mode %s not found\n", mode);
+ goto err_find_mode;
+ }
+
+ rc = fb_alloc_cmap(&info->cmap, 256, 0);
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot allocate colormap\n");
+ goto err_alloc_cmap;
+ }
+
+ rc = register_framebuffer(info);
+ if (rc < 0) {
+ dev_err(&(dev->dev), "cannot register framebugger\n");
+ goto err_reg_fb;
+ }
+
+ printk(KERN_INFO "fb%d: %s on %s, %d MB RAM\n", info->node, info->fix.id,
+ pci_name(dev), info->fix.smem_len >> 20);
+
+ /* Record a reference to the driver data */
+ pci_set_drvdata(dev, info);
+
+#ifdef CONFIG_MTRR
+ if (mtrr) {
+ par->mtrr_reg = -1;
+ par->mtrr_reg = mtrr_add(info->fix.smem_start, info->fix.smem_len, MTRR_TYPE_WRCOMB, 1);
+ }
+#endif
+
+ return 0;
+
+ /* Error handling */
+err_reg_fb:
+ fb_dealloc_cmap(&info->cmap);
+err_alloc_cmap:
+err_find_mode:
+ pci_iounmap(dev, par->mmio_base);
+err_iomap_2:
+ pci_iounmap(dev, info->screen_base);
+err_iomap_1:
+ pci_release_regions(dev);
+err_request_regions:
+/* pci_disable_device(dev); */
+err_enable_device:
+ framebuffer_release(info);
+ return rc;
+}
+
+/* PCI remove */
+
+static void __devexit vt8623_pci_remove(struct pci_dev *dev)
+{
+ struct fb_info *info = pci_get_drvdata(dev);
+ struct vt8623fb_info *par = info->par;
+
+ if (info) {
+#ifdef CONFIG_MTRR
+ if (par->mtrr_reg >= 0) {
+ mtrr_del(par->mtrr_reg, 0, 0);
+ par->mtrr_reg = -1;
+ }
+#endif
+
+ unregister_framebuffer(info);
+ fb_dealloc_cmap(&info->cmap);
+
+ pci_iounmap(dev, info->screen_base);
+ pci_iounmap(dev, par->mmio_base);
+ pci_release_regions(dev);
+/* pci_disable_device(dev); */
+
+ pci_set_drvdata(dev, NULL);
+ framebuffer_release(info);
+ }
+}
+
+
+#ifdef CONFIG_PM
+/* PCI suspend */
+
+static int vt8623_pci_suspend(struct pci_dev* dev, pm_message_t state)
+{
+ struct fb_info *info = pci_get_drvdata(dev);
+ struct vt8623fb_info *par = info->par;
+
+ dev_info(&(dev->dev), "suspend\n");
+
+ acquire_console_sem();
+ mutex_lock(&(par->open_lock));
+
+ if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
+ mutex_unlock(&(par->open_lock));
+ release_console_sem();
+ return 0;
+ }
+
+ fb_set_suspend(info, 1);
+
+ pci_save_state(dev);
+ pci_disable_device(dev);
+ pci_set_power_state(dev, pci_choose_state(dev, state));
+
+ mutex_unlock(&(par->open_lock));
+ release_console_sem();
+
+ return 0;
+}
+
+
+/* PCI resume */
+
+static int vt8623_pci_resume(struct pci_dev* dev)
+{
+ struct fb_info *info = pci_get_drvdata(dev);
+ struct vt8623fb_info *par = info->par;
+
+ dev_info(&(dev->dev), "resume\n");
+
+ acquire_console_sem();
+ mutex_lock(&(par->open_lock));
+
+ if (par->ref_count == 0) {
+ mutex_unlock(&(par->open_lock));
+ release_console_sem();
+ return 0;
+ }
+
+ pci_set_power_state(dev, PCI_D0);
+ pci_restore_state(dev);
+
+ if (pci_enable_device(dev))
+ goto fail;
+
+ pci_set_master(dev);
+
+ vt8623fb_set_par(info);
+ fb_set_suspend(info, 0);
+
+ mutex_unlock(&(par->open_lock));
+fail:
+ release_console_sem();
+
+ return 0;
+}
+#else
+#define vt8623_pci_suspend NULL
+#define vt8623_pci_resume NULL
+#endif /* CONFIG_PM */
+
+/* List of boards that we are trying to support */
+
+static struct pci_device_id vt8623_devices[] __devinitdata = {
+ {PCI_DEVICE(PCI_VENDOR_ID_VIA, 0x3122)},
+ {0, 0, 0, 0, 0, 0, 0}
+};
+
+MODULE_DEVICE_TABLE(pci, vt8623_devices);
+
+static struct pci_driver vt8623fb_pci_driver = {
+ .name = "vt8623fb",
+ .id_table = vt8623_devices,
+ .probe = vt8623_pci_probe,
+ .remove = __devexit_p(vt8623_pci_remove),
+ .suspend = vt8623_pci_suspend,
+ .resume = vt8623_pci_resume,
+};
+
+/* Cleanup */
+
+static void __exit vt8623fb_cleanup(void)
+{
+ pr_debug("vt8623fb: cleaning up\n");
+ pci_unregister_driver(&vt8623fb_pci_driver);
+}
+
+/* Driver Initialisation */
+
+int __init vt8623fb_init(void)
+{
+
+#ifndef MODULE
+ char *option = NULL;
+
+ if (fb_get_options("vt8623fb", &option))
+ return -ENODEV;
+
+ if (option && *option)
+ mode = option;
+#endif
+
+ pr_debug("vt8623fb: initializing\n");
+ return pci_register_driver(&vt8623fb_pci_driver);
+}
+
+/* ------------------------------------------------------------------------- */
+
+/* Modularization */
+
+module_init(vt8623fb_init);
+module_exit(vt8623fb_cleanup);
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 4aa8079..c879690 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,11 +628,7 @@
return err;
}
if (to < PAGE_CACHE_SIZE) {
- char *kaddr = kmap_atomic(page, KM_USER0);
-
- memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, to, PAGE_CACHE_SIZE - to, KM_USER0);
if (size > offset + to) {
if (size < offset + PAGE_CACHE_SIZE)
tmp = size & ~PAGE_CACHE_MASK;
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index cf83e5d..73ce561 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -22,6 +22,7 @@
vlclient.o \
vlocation.o \
vnode.o \
- volume.o
+ volume.o \
+ write.o
obj-$(CONFIG_AFS_FS) := kafs.o
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index 89e0d16..2198006 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -18,6 +18,8 @@
enum AFS_FS_Operations {
FSFETCHDATA = 130, /* AFS Fetch file data */
FSFETCHSTATUS = 132, /* AFS Fetch file status */
+ FSSTOREDATA = 133, /* AFS Store file data */
+ FSSTORESTATUS = 135, /* AFS Store file status */
FSREMOVEFILE = 136, /* AFS Remove a file */
FSCREATEFILE = 137, /* AFS Create a file */
FSRENAME = 138, /* AFS Rename or move a file or directory */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9bdbf36..f64e40f 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -44,7 +44,7 @@
while (!RB_EMPTY_ROOT(&server->cb_promises)) {
vnode = rb_entry(server->cb_promises.rb_node,
struct afs_vnode, cb_promise);
- _debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+ _debug("UNPROMISE { vid=%x:%u uq=%u}",
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
rb_erase(&vnode->cb_promise, &server->cb_promises);
vnode->cb_promised = false;
@@ -84,11 +84,8 @@
/* if the vnode's data version number changed then its contents
* are different */
- if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
- _debug("zap data {%x:%u}",
- vnode->fid.vid, vnode->fid.vnode);
- invalidate_remote_inode(&vnode->vfs_inode);
- }
+ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+ afs_zap_data(vnode);
}
out:
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0c1e902..2fb3127 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -55,7 +55,8 @@
.rmdir = afs_rmdir,
.rename = afs_rename,
.permission = afs_permission,
- .getattr = afs_inode_getattr,
+ .getattr = afs_getattr,
+ .setattr = afs_setattr,
};
static struct dentry_operations afs_fs_dentry_operations = {
@@ -491,7 +492,7 @@
vnode = AFS_FS_I(dir);
- _enter("{%x:%d},%p{%s},",
+ _enter("{%x:%u},%p{%s},",
vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
ASSERTCMP(dentry->d_inode, ==, NULL);
@@ -731,7 +732,7 @@
dvnode = AFS_FS_I(dir);
- _enter("{%x:%d},{%s},%o",
+ _enter("{%x:%u},{%s},%o",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
ret = -ENAMETOOLONG;
@@ -796,7 +797,7 @@
dvnode = AFS_FS_I(dir);
- _enter("{%x:%d},{%s}",
+ _enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
ret = -ENAMETOOLONG;
@@ -842,7 +843,7 @@
dvnode = AFS_FS_I(dir);
- _enter("{%x:%d},{%s}",
+ _enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
ret = -ENAMETOOLONG;
@@ -916,7 +917,7 @@
dvnode = AFS_FS_I(dir);
- _enter("{%x:%d},{%s},%o,",
+ _enter("{%x:%u},{%s},%o,",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
ret = -ENAMETOOLONG;
@@ -983,7 +984,7 @@
vnode = AFS_FS_I(from->d_inode);
dvnode = AFS_FS_I(dir);
- _enter("{%x:%d},{%x:%d},{%s}",
+ _enter("{%x:%u},{%x:%u},{%s}",
vnode->fid.vid, vnode->fid.vnode,
dvnode->fid.vid, dvnode->fid.vnode,
dentry->d_name.name);
@@ -1032,7 +1033,7 @@
dvnode = AFS_FS_I(dir);
- _enter("{%x:%d},{%s},%s",
+ _enter("{%x:%u},{%s},%s",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
content);
@@ -1104,7 +1105,7 @@
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
- _enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
+ _enter("{%x:%u},{%x:%u},{%x:%u},{%s}",
orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
vnode->fid.vid, vnode->fid.vnode,
new_dvnode->fid.vid, new_dvnode->fid.vnode,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index ae25649..3e25795 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -15,32 +15,43 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
+#include <linux/writeback.h>
#include "internal.h"
-static int afs_file_readpage(struct file *file, struct page *page);
-static void afs_file_invalidatepage(struct page *page, unsigned long offset);
-static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
+static int afs_readpage(struct file *file, struct page *page);
+static void afs_invalidatepage(struct page *page, unsigned long offset);
+static int afs_releasepage(struct page *page, gfp_t gfp_flags);
+static int afs_launder_page(struct page *page);
const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
.read = do_sync_read,
+ .write = do_sync_write,
.aio_read = generic_file_aio_read,
+ .aio_write = afs_file_write,
.mmap = generic_file_readonly_mmap,
.sendfile = generic_file_sendfile,
+ .fsync = afs_fsync,
};
const struct inode_operations afs_file_inode_operations = {
- .getattr = afs_inode_getattr,
+ .getattr = afs_getattr,
+ .setattr = afs_setattr,
.permission = afs_permission,
};
const struct address_space_operations afs_fs_aops = {
- .readpage = afs_file_readpage,
- .set_page_dirty = __set_page_dirty_nobuffers,
- .releasepage = afs_file_releasepage,
- .invalidatepage = afs_file_invalidatepage,
+ .readpage = afs_readpage,
+ .set_page_dirty = afs_set_page_dirty,
+ .launder_page = afs_launder_page,
+ .releasepage = afs_releasepage,
+ .invalidatepage = afs_invalidatepage,
+ .prepare_write = afs_prepare_write,
+ .commit_write = afs_commit_write,
+ .writepage = afs_writepage,
+ .writepages = afs_writepages,
};
/*
@@ -52,7 +63,7 @@
struct key *key;
int ret;
- _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key)) {
@@ -78,7 +89,7 @@
{
struct afs_vnode *vnode = AFS_FS_I(inode);
- _enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+ _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
key_put(file->private_data);
_leave(" = 0");
@@ -89,10 +100,10 @@
* deal with notification that a page was read from the cache
*/
#ifdef AFS_CACHING_SUPPORT
-static void afs_file_readpage_read_complete(void *cookie_data,
- struct page *page,
- void *data,
- int error)
+static void afs_readpage_read_complete(void *cookie_data,
+ struct page *page,
+ void *data,
+ int error)
{
_enter("%p,%p,%p,%d", cookie_data, page, data, error);
@@ -109,10 +120,10 @@
* deal with notification that a page was written to the cache
*/
#ifdef AFS_CACHING_SUPPORT
-static void afs_file_readpage_write_complete(void *cookie_data,
- struct page *page,
- void *data,
- int error)
+static void afs_readpage_write_complete(void *cookie_data,
+ struct page *page,
+ void *data,
+ int error)
{
_enter("%p,%p,%p,%d", cookie_data, page, data, error);
@@ -121,9 +132,9 @@
#endif
/*
- * AFS read page from file (or symlink)
+ * AFS read page from file, directory or symlink
*/
-static int afs_file_readpage(struct file *file, struct page *page)
+static int afs_readpage(struct file *file, struct page *page)
{
struct afs_vnode *vnode;
struct inode *inode;
@@ -219,39 +230,17 @@
}
/*
- * get a page cookie for the specified page
- */
-#ifdef AFS_CACHING_SUPPORT
-int afs_cache_get_page_cookie(struct page *page,
- struct cachefs_page **_page_cookie)
-{
- int ret;
-
- _enter("");
- ret = cachefs_page_get_private(page,_page_cookie, GFP_NOIO);
-
- _leave(" = %d", ret);
- return ret;
-}
-#endif
-
-/*
* invalidate part or all of a page
*/
-static void afs_file_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned long offset)
{
int ret = 1;
- _enter("{%lu},%lu", page->index, offset);
+ kenter("{%lu},%lu", page->index, offset);
BUG_ON(!PageLocked(page));
if (PagePrivate(page)) {
-#ifdef AFS_CACHING_SUPPORT
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- cachefs_uncache_page(vnode->cache,page);
-#endif
-
/* We release buffers only if the entire page is being
* invalidated.
* The get_block cached value has been unconditionally
@@ -272,25 +261,33 @@
}
/*
+ * write back a dirty page
+ */
+static int afs_launder_page(struct page *page)
+{
+ _enter("{%lu}", page->index);
+
+ return 0;
+}
+
+/*
* release a page and cleanup its private data
*/
-static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
+static int afs_releasepage(struct page *page, gfp_t gfp_flags)
{
- struct cachefs_page *pageio;
+ struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_writeback *wb;
- _enter("{%lu},%x", page->index, gfp_flags);
+ _enter("{{%x:%u}[%lu],%lx},%x",
+ vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
+ gfp_flags);
if (PagePrivate(page)) {
-#ifdef AFS_CACHING_SUPPORT
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- cachefs_uncache_page(vnode->cache, page);
-#endif
-
- pageio = (struct cachefs_page *) page_private(page);
+ wb = (struct afs_writeback *) page_private(page);
+ ASSERT(wb != NULL);
set_page_private(page, 0);
ClearPagePrivate(page);
-
- kfree(pageio);
+ afs_put_writeback(wb);
}
_leave(" = 0");
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index e54e6c2..025b190 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -33,8 +33,10 @@
*/
static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
struct afs_file_status *status,
- struct afs_vnode *vnode)
+ struct afs_vnode *vnode,
+ afs_dataversion_t *store_version)
{
+ afs_dataversion_t expected_version;
const __be32 *bp = *_bp;
umode_t mode;
u64 data_version, size;
@@ -101,7 +103,11 @@
vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime;
}
- if (status->data_version != data_version) {
+ expected_version = status->data_version;
+ if (store_version)
+ expected_version = *store_version;
+
+ if (expected_version != data_version) {
status->data_version = data_version;
if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
_debug("vnode modified %llx on {%x:%u}",
@@ -110,6 +116,8 @@
set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
+ } else if (store_version) {
+ status->data_version = data_version;
}
}
@@ -156,6 +164,44 @@
}
/*
+ * encode the requested attributes into an AFSStoreStatus block
+ */
+static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
+{
+ __be32 *bp = *_bp;
+ u32 mask = 0, mtime = 0, owner = 0, group = 0, mode = 0;
+
+ mask = 0;
+ if (attr->ia_valid & ATTR_MTIME) {
+ mask |= AFS_SET_MTIME;
+ mtime = attr->ia_mtime.tv_sec;
+ }
+
+ if (attr->ia_valid & ATTR_UID) {
+ mask |= AFS_SET_OWNER;
+ owner = attr->ia_uid;
+ }
+
+ if (attr->ia_valid & ATTR_GID) {
+ mask |= AFS_SET_GROUP;
+ group = attr->ia_gid;
+ }
+
+ if (attr->ia_valid & ATTR_MODE) {
+ mask |= AFS_SET_MODE;
+ mode = attr->ia_mode & S_IALLUGO;
+ }
+
+ *bp++ = htonl(mask);
+ *bp++ = htonl(mtime);
+ *bp++ = htonl(owner);
+ *bp++ = htonl(group);
+ *bp++ = htonl(mode);
+ *bp++ = 0; /* segment size */
+ *_bp = bp;
+}
+
+/*
* deliver reply data to an FS.FetchStatus
*/
static int afs_deliver_fs_fetch_status(struct afs_call *call,
@@ -175,7 +221,7 @@
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
xdr_decode_AFSCallBack(&bp, vnode);
if (call->reply2)
xdr_decode_AFSVolSync(&bp, call->reply2);
@@ -206,7 +252,7 @@
struct afs_call *call;
__be32 *bp;
- _enter(",%x,{%x:%d},,",
+ _enter(",%x,{%x:%u},,",
key_serial(key), vnode->fid.vid, vnode->fid.vnode);
call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
@@ -265,25 +311,20 @@
call->offset = 0;
call->unmarshall++;
- if (call->count < PAGE_SIZE) {
- page = call->reply3;
- buffer = kmap_atomic(page, KM_USER0);
- memset(buffer + PAGE_SIZE - call->count, 0,
- call->count);
- kunmap_atomic(buffer, KM_USER0);
- }
-
/* extract the returned data */
case 2:
_debug("extract data");
- page = call->reply3;
- buffer = kmap_atomic(page, KM_USER0);
- ret = afs_extract_data(call, skb, last, buffer, call->count);
- kunmap_atomic(buffer, KM_USER0);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
+ if (call->count > 0) {
+ page = call->reply3;
+ buffer = kmap_atomic(page, KM_USER0);
+ ret = afs_extract_data(call, skb, last, buffer,
+ call->count);
+ kunmap_atomic(buffer, KM_USER0);
+ switch (ret) {
+ case 0: break;
+ case -EAGAIN: return 0;
+ default: return ret;
+ }
}
call->offset = 0;
@@ -300,7 +341,7 @@
}
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
xdr_decode_AFSCallBack(&bp, vnode);
if (call->reply2)
xdr_decode_AFSVolSync(&bp, call->reply2);
@@ -318,6 +359,14 @@
if (!last)
return 0;
+ if (call->count < PAGE_SIZE) {
+ _debug("clear");
+ page = call->reply3;
+ buffer = kmap_atomic(page, KM_USER0);
+ memset(buffer + call->count, 0, PAGE_SIZE - call->count);
+ kunmap_atomic(buffer, KM_USER0);
+ }
+
_leave(" = 0 [done]");
return 0;
}
@@ -476,8 +525,8 @@
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply2);
- xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
xdr_decode_AFSCallBack_raw(&bp, call->reply4);
/* xdr_decode_AFSVolSync(&bp, call->replyX); */
@@ -574,7 +623,7 @@
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
/* xdr_decode_AFSVolSync(&bp, call->replyX); */
_leave(" = 0 [done]");
@@ -657,8 +706,8 @@
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
- xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
+ xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
/* xdr_decode_AFSVolSync(&bp, call->replyX); */
_leave(" = 0 [done]");
@@ -746,8 +795,8 @@
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->reply2);
- xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
- xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+ xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
/* xdr_decode_AFSVolSync(&bp, call->replyX); */
_leave(" = 0 [done]");
@@ -852,9 +901,10 @@
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
+ xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL);
if (new_dvnode != orig_dvnode)
- xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
+ xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
+ NULL);
/* xdr_decode_AFSVolSync(&bp, call->replyX); */
_leave(" = 0 [done]");
@@ -936,3 +986,262 @@
return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
}
+
+/*
+ * deliver reply data to an FS.StoreData
+ */
+static int afs_deliver_fs_store_data(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
+
+ _enter(",,%u", last);
+
+ afs_transfer_reply(call, skb);
+ if (!last) {
+ _leave(" = 0 [more]");
+ return 0;
+ }
+
+ if (call->reply_size != call->reply_max) {
+ _leave(" = -EBADMSG [%u != %u]",
+ call->reply_size, call->reply_max);
+ return -EBADMSG;
+ }
+
+ /* unmarshall the reply once we've received all of it */
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
+ &call->store_version);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ afs_pages_written_back(vnode, call);
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.StoreData operation type
+ */
+static const struct afs_call_type afs_RXFSStoreData = {
+ .name = "FS.StoreData",
+ .deliver = afs_deliver_fs_store_data,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * store a set of pages
+ */
+int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
+ pgoff_t first, pgoff_t last,
+ unsigned offset, unsigned to,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_vnode *vnode = wb->vnode;
+ struct afs_call *call;
+ loff_t size, pos, i_size;
+ __be32 *bp;
+
+ _enter(",%x,{%x:%u},,",
+ key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+
+ size = to - offset;
+ if (first != last)
+ size += (loff_t)(last - first) << PAGE_SHIFT;
+ pos = (loff_t)first << PAGE_SHIFT;
+ pos += offset;
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (pos + size > i_size)
+ i_size = size + pos;
+
+ _debug("size %llx, at %llx, i_size %llx",
+ (unsigned long long) size, (unsigned long long) pos,
+ (unsigned long long) i_size);
+
+ BUG_ON(i_size > 0xffffffff); // TODO: use 64-bit store
+
+ call = afs_alloc_flat_call(&afs_RXFSStoreData,
+ (4 + 6 + 3) * 4,
+ (21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
+
+ call->wb = wb;
+ call->key = wb->key;
+ call->reply = vnode;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+ call->mapping = vnode->vfs_inode.i_mapping;
+ call->first = first;
+ call->last = last;
+ call->first_offset = offset;
+ call->last_to = to;
+ call->send_pages = true;
+ call->store_version = vnode->status.data_version + 1;
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(FSSTOREDATA);
+ *bp++ = htonl(vnode->fid.vid);
+ *bp++ = htonl(vnode->fid.vnode);
+ *bp++ = htonl(vnode->fid.unique);
+
+ *bp++ = 0; /* mask */
+ *bp++ = 0; /* mtime */
+ *bp++ = 0; /* owner */
+ *bp++ = 0; /* group */
+ *bp++ = 0; /* unix mode */
+ *bp++ = 0; /* segment size */
+
+ *bp++ = htonl(pos);
+ *bp++ = htonl(size);
+ *bp++ = htonl(i_size);
+
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.StoreStatus
+ */
+static int afs_deliver_fs_store_status(struct afs_call *call,
+ struct sk_buff *skb, bool last)
+{
+ afs_dataversion_t *store_version;
+ struct afs_vnode *vnode = call->reply;
+ const __be32 *bp;
+
+ _enter(",,%u", last);
+
+ afs_transfer_reply(call, skb);
+ if (!last) {
+ _leave(" = 0 [more]");
+ return 0;
+ }
+
+ if (call->reply_size != call->reply_max) {
+ _leave(" = -EBADMSG [%u != %u]",
+ call->reply_size, call->reply_max);
+ return -EBADMSG;
+ }
+
+ /* unmarshall the reply once we've received all of it */
+ store_version = NULL;
+ if (call->operation_ID == FSSTOREDATA)
+ store_version = &call->store_version;
+
+ bp = call->buffer;
+ xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
+ /* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+ _leave(" = 0 [done]");
+ return 0;
+}
+
+/*
+ * FS.StoreStatus operation type
+ */
+static const struct afs_call_type afs_RXFSStoreStatus = {
+ .name = "FS.StoreStatus",
+ .deliver = afs_deliver_fs_store_status,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSStoreData_as_Status = {
+ .name = "FS.StoreData",
+ .deliver = afs_deliver_fs_store_status,
+ .abort_to_error = afs_abort_to_error,
+ .destructor = afs_flat_call_destructor,
+};
+
+/*
+ * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
+ * so as to alter the file size also
+ */
+static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
+ struct afs_vnode *vnode, struct iattr *attr,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_call *call;
+ __be32 *bp;
+
+ _enter(",%x,{%x:%u},,",
+ key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+ ASSERT(attr->ia_valid & ATTR_SIZE);
+ ASSERTCMP(attr->ia_size, <=, 0xffffffff); // TODO: use 64-bit store
+
+ call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
+ (4 + 6 + 3) * 4,
+ (21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = key;
+ call->reply = vnode;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+ call->store_version = vnode->status.data_version + 1;
+ call->operation_ID = FSSTOREDATA;
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(FSSTOREDATA);
+ *bp++ = htonl(vnode->fid.vid);
+ *bp++ = htonl(vnode->fid.vnode);
+ *bp++ = htonl(vnode->fid.unique);
+
+ xdr_encode_AFS_StoreStatus(&bp, attr);
+
+ *bp++ = 0; /* position of start of write */
+ *bp++ = 0; /* size of write */
+ *bp++ = htonl(attr->ia_size); /* new file length */
+
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * set the attributes on a file, using FS.StoreData if there's a change in file
+ * size, and FS.StoreStatus otherwise
+ */
+int afs_fs_setattr(struct afs_server *server, struct key *key,
+ struct afs_vnode *vnode, struct iattr *attr,
+ const struct afs_wait_mode *wait_mode)
+{
+ struct afs_call *call;
+ __be32 *bp;
+
+ if (attr->ia_valid & ATTR_SIZE)
+ return afs_fs_setattr_size(server, key, vnode, attr,
+ wait_mode);
+
+ _enter(",%x,{%x:%u},,",
+ key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+
+ call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
+ (4 + 6) * 4,
+ (21 + 6) * 4);
+ if (!call)
+ return -ENOMEM;
+
+ call->key = key;
+ call->reply = vnode;
+ call->service_id = FS_SERVICE;
+ call->port = htons(AFS_FS_PORT);
+ call->operation_ID = FSSTORESTATUS;
+
+ /* marshall the parameters */
+ bp = call->request;
+ *bp++ = htonl(FSSTORESTATUS);
+ *bp++ = htonl(vnode->fid.vid);
+ *bp++ = htonl(vnode->fid.vnode);
+ *bp++ = htonl(vnode->fid.unique);
+
+ xdr_encode_AFS_StoreStatus(&bp, attr);
+
+ return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c184a4e..515a5d1 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -125,7 +125,7 @@
struct inode *inode;
int ret;
- _enter(",{%u,%u,%u},,", fid->vid, fid->vnode, fid->unique);
+ _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique);
as = sb->s_fs_info;
data.volume = as->volume;
@@ -204,6 +204,19 @@
}
/*
+ * mark the data attached to an inode as obsolete due to a write on the server
+ * - might also want to ditch all the outstanding writes and dirty pages
+ */
+void afs_zap_data(struct afs_vnode *vnode)
+{
+ _enter("zap data {%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
+ /* nuke all the non-dirty pages that aren't locked, mapped or being
+ * written back */
+ invalidate_remote_inode(&vnode->vfs_inode);
+}
+
+/*
* validate a vnode/inode
* - there are several things we need to check
* - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
@@ -258,10 +271,8 @@
/* if the vnode's data version number changed then its contents are
* different */
- if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
- _debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
- invalidate_remote_inode(&vnode->vfs_inode);
- }
+ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
+ afs_zap_data(vnode);
clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
mutex_unlock(&vnode->validate_lock);
@@ -278,7 +289,7 @@
/*
* read the attributes of an inode
*/
-int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
+int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode;
@@ -301,7 +312,7 @@
vnode = AFS_FS_I(inode);
- _enter("{%x:%d.%d} v=%u x=%u t=%u }",
+ _enter("{%x:%u.%d} v=%u x=%u t=%u }",
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
@@ -323,6 +334,7 @@
vnode->server = NULL;
}
+ ASSERT(list_empty(&vnode->writebacks));
ASSERT(!vnode->cb_promised);
#ifdef AFS_CACHING_SUPPORT
@@ -339,3 +351,47 @@
_leave("");
}
+
+/*
+ * set the attributes of an inode
+ */
+int afs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ struct key *key;
+ int ret;
+
+ _enter("{%x:%u},{n=%s},%x",
+ vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+ attr->ia_valid);
+
+ if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID |
+ ATTR_MTIME))) {
+ _leave(" = 0 [unsupported]");
+ return 0;
+ }
+
+ /* flush any dirty data outstanding on a regular file */
+ if (S_ISREG(vnode->vfs_inode.i_mode)) {
+ filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ afs_writeback_all(vnode);
+ }
+
+ if (attr->ia_valid & ATTR_FILE) {
+ key = attr->ia_file->private_data;
+ } else {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error;
+ }
+ }
+
+ ret = afs_vnode_setattr(vnode, key, attr);
+ if (!(attr->ia_valid & ATTR_FILE))
+ key_put(key);
+
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index d90c158..a30d4fa 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
#define AFS_CELL_MAX_ADDRS 15
+struct pagevec;
struct afs_call;
typedef enum {
@@ -75,12 +76,15 @@
struct key *key; /* security for this call */
struct afs_server *server; /* server affected by incoming CM call */
void *request; /* request data (first part) */
- void *request2; /* request data (second part) */
+ struct address_space *mapping; /* page set */
+ struct afs_writeback *wb; /* writeback being performed */
void *buffer; /* reply receive buffer */
void *reply; /* reply buffer (first part) */
void *reply2; /* reply buffer (second part) */
void *reply3; /* reply buffer (third part) */
void *reply4; /* reply buffer (fourth part) */
+ pgoff_t first; /* first page in mapping to deal with */
+ pgoff_t last; /* last page in mapping to deal with */
enum { /* call state */
AFS_CALL_REQUESTING, /* request is being sent for outgoing call */
AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */
@@ -97,14 +101,18 @@
unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */
unsigned reply_size; /* current size of reply */
+ unsigned first_offset; /* offset into mapping[first] */
+ unsigned last_to; /* amount of mapping[last] */
unsigned short offset; /* offset into received data store */
unsigned char unmarshall; /* unmarshalling phase */
bool incoming; /* T if incoming call */
+ bool send_pages; /* T if data from mapping should be sent */
u16 service_id; /* RxRPC service ID to call */
__be16 port; /* target UDP port */
__be32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
__be32 tmp; /* place to extract temporary data */
+ afs_dataversion_t store_version; /* updated version expected from store */
};
struct afs_call_type {
@@ -124,6 +132,32 @@
};
/*
+ * record of an outstanding writeback on a vnode
+ */
+struct afs_writeback {
+ struct list_head link; /* link in vnode->writebacks */
+ struct work_struct writer; /* work item to perform the writeback */
+ struct afs_vnode *vnode; /* vnode to which this write applies */
+ struct key *key; /* owner of this write */
+ wait_queue_head_t waitq; /* completion and ready wait queue */
+ pgoff_t first; /* first page in batch */
+ pgoff_t point; /* last page in current store op */
+ pgoff_t last; /* last page in batch (inclusive) */
+ unsigned offset_first; /* offset into first page of start of write */
+ unsigned to_last; /* offset into last page of end of write */
+ int num_conflicts; /* count of conflicting writes in list */
+ int usage;
+ bool conflicts; /* T if has dependent conflicts */
+ enum {
+ AFS_WBACK_SYNCING, /* synchronisation being performed */
+ AFS_WBACK_PENDING, /* write pending */
+ AFS_WBACK_CONFLICTING, /* conflicting writes posted */
+ AFS_WBACK_WRITING, /* writing back */
+ AFS_WBACK_COMPLETE /* the writeback record has been unlinked */
+ } state __attribute__((packed));
+};
+
+/*
* AFS superblock private data
* - there's one superblock per volume
*/
@@ -305,6 +339,7 @@
wait_queue_head_t update_waitq; /* status fetch waitqueue */
int update_cnt; /* number of outstanding ops that will update the
* status */
+ spinlock_t writeback_lock; /* lock for writebacks */
spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags;
#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */
@@ -316,6 +351,8 @@
long acl_order; /* ACL check count (callback break count) */
+ struct list_head writebacks; /* alterations in pagecache that need writing */
+
/* outstanding callback notification on this file */
struct rb_node server_rb; /* link in server->fs_vnodes */
struct rb_node cb_promise; /* link in server->cb_promises */
@@ -433,10 +470,6 @@
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
-#ifdef AFS_CACHING_SUPPORT
-extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
-#endif
-
/*
* fsclient.c
*/
@@ -467,6 +500,12 @@
struct afs_vnode *, const char *,
struct afs_vnode *, const char *,
const struct afs_wait_mode *);
+extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
+ pgoff_t, pgoff_t, unsigned, unsigned,
+ const struct afs_wait_mode *);
+extern int afs_fs_setattr(struct afs_server *, struct key *,
+ struct afs_vnode *, struct iattr *,
+ const struct afs_wait_mode *);
/*
* inode.c
@@ -474,10 +513,10 @@
extern struct inode *afs_iget(struct super_block *, struct key *,
struct afs_fid *, struct afs_file_status *,
struct afs_callback *);
+extern void afs_zap_data(struct afs_vnode *);
extern int afs_validate(struct afs_vnode *, struct key *);
-extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
- struct kstat *);
-extern void afs_zap_permits(struct rcu_head *);
+extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int afs_setattr(struct dentry *, struct iattr *);
extern void afs_clear_inode(struct inode *);
/*
@@ -533,6 +572,7 @@
*/
extern void afs_clear_permits(struct afs_vnode *);
extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern void afs_zap_permits(struct rcu_head *);
extern struct key *afs_request_key(struct afs_cell *);
extern int afs_permission(struct inode *, int, struct nameidata *);
@@ -629,6 +669,9 @@
struct afs_file_status *, struct afs_server **);
extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
struct key *, const char *, const char *);
+extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
+ unsigned, unsigned);
+extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
/*
* volume.c
@@ -645,6 +688,23 @@
extern int afs_volume_release_fileserver(struct afs_vnode *,
struct afs_server *, int);
+/*
+ * write.c
+ */
+extern int afs_set_page_dirty(struct page *);
+extern void afs_put_writeback(struct afs_writeback *);
+extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_writepage(struct page *, struct writeback_control *);
+extern int afs_writepages(struct address_space *, struct writeback_control *);
+extern int afs_write_inode(struct inode *, int);
+extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
+extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+extern int afs_writeback_all(struct afs_vnode *);
+extern int afs_fsync(struct file *, struct dentry *, int);
+
+
/*****************************************************************************/
/*
* debug tracing
@@ -726,6 +786,21 @@
} \
} while(0)
+#define ASSERTRANGE(L, OP1, N, OP2, H) \
+do { \
+ if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \
+ printk(KERN_ERR "\n"); \
+ printk(KERN_ERR "AFS: Assertion failed\n"); \
+ printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \
+ (unsigned long)(L), (unsigned long)(N), \
+ (unsigned long)(H)); \
+ printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \
+ (unsigned long)(L), (unsigned long)(N), \
+ (unsigned long)(H)); \
+ BUG(); \
+ } \
+} while(0)
+
#define ASSERTIF(C, X) \
do { \
if (unlikely((C) && !(X))) { \
@@ -758,6 +833,10 @@
do { \
} while(0)
+#define ASSERTRANGE(L, OP1, N, OP2, H) \
+do { \
+} while(0)
+
#define ASSERTIF(C, X) \
do { \
} while(0)
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 80ec6fd..f1f71ff 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -149,6 +149,7 @@
afs_vlocation_purge();
afs_cell_purge();
afs_proc_cleanup();
+ rcu_barrier();
printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
return ret;
}
@@ -176,6 +177,7 @@
cachefs_unregister_netfs(&afs_cache_netfs);
#endif
afs_proc_cleanup();
+ rcu_barrier();
}
module_exit(afs_exit);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index cdb9792..d1a889c 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -22,6 +22,7 @@
{
switch (abort_code) {
case 13: return -EACCES;
+ case 27: return -EFBIG;
case 30: return -EROFS;
case VSALVAGE: return -EIO;
case VNOVNODE: return -ENOENT;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 034fcfd..a3684dc 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -36,7 +36,7 @@
.lookup = afs_mntpt_lookup,
.follow_link = afs_mntpt_follow_link,
.readlink = page_readlink,
- .getattr = afs_inode_getattr,
+ .getattr = afs_getattr,
};
static LIST_HEAD(afs_vfsmounts);
@@ -58,7 +58,8 @@
char *buf;
int ret;
- _enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
+ _enter("{%x:%u,%u}",
+ vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
/* read the contents of the symlink into the pagecache */
page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 222c1a3..04189c4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -237,6 +237,70 @@
}
/*
+ * attach the data from a bunch of pages on an inode to a call
+ */
+int afs_send_pages(struct afs_call *call, struct msghdr *msg, struct kvec *iov)
+{
+ struct page *pages[8];
+ unsigned count, n, loop, offset, to;
+ pgoff_t first = call->first, last = call->last;
+ int ret;
+
+ _enter("");
+
+ offset = call->first_offset;
+ call->first_offset = 0;
+
+ do {
+ _debug("attach %lx-%lx", first, last);
+
+ count = last - first + 1;
+ if (count > ARRAY_SIZE(pages))
+ count = ARRAY_SIZE(pages);
+ n = find_get_pages_contig(call->mapping, first, count, pages);
+ ASSERTCMP(n, ==, count);
+
+ loop = 0;
+ do {
+ msg->msg_flags = 0;
+ to = PAGE_SIZE;
+ if (first + loop >= last)
+ to = call->last_to;
+ else
+ msg->msg_flags = MSG_MORE;
+ iov->iov_base = kmap(pages[loop]) + offset;
+ iov->iov_len = to - offset;
+ offset = 0;
+
+ _debug("- range %u-%u%s",
+ offset, to, msg->msg_flags ? " [more]" : "");
+ msg->msg_iov = (struct iovec *) iov;
+ msg->msg_iovlen = 1;
+
+ /* have to change the state *before* sending the last
+ * packet as RxRPC might give us the reply before it
+ * returns from sending the request */
+ if (first + loop >= last)
+ call->state = AFS_CALL_AWAIT_REPLY;
+ ret = rxrpc_kernel_send_data(call->rxcall, msg,
+ to - offset);
+ kunmap(pages[loop]);
+ if (ret < 0)
+ break;
+ } while (++loop < count);
+ first += count;
+
+ for (loop = 0; loop < count; loop++)
+ put_page(pages[loop]);
+ if (ret < 0)
+ break;
+ } while (first < last);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
* initiate a call
*/
int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
@@ -253,8 +317,9 @@
ASSERT(call->type != NULL);
ASSERT(call->type->name != NULL);
- _debug("MAKE %p{%s} [%d]",
- call, call->type->name, atomic_read(&afs_outstanding_calls));
+ _debug("____MAKE %p{%s,%x} [%d]____",
+ call, call->type->name, key_serial(call->key),
+ atomic_read(&afs_outstanding_calls));
call->wait_mode = wait_mode;
INIT_WORK(&call->async_work, afs_process_async_call);
@@ -289,16 +354,23 @@
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_controllen = 0;
- msg.msg_flags = 0;
+ msg.msg_flags = (call->send_pages ? MSG_MORE : 0);
/* have to change the state *before* sending the last packet as RxRPC
* might give us the reply before it returns from sending the
* request */
- call->state = AFS_CALL_AWAIT_REPLY;
+ if (!call->send_pages)
+ call->state = AFS_CALL_AWAIT_REPLY;
ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
if (ret < 0)
goto error_do_abort;
+ if (call->send_pages) {
+ ret = afs_send_pages(call, &msg, iov);
+ if (ret < 0)
+ goto error_do_abort;
+ }
+
/* at this point, an async call may no longer exist as it may have
* already completed */
return wait_mode->wait(call);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index f9f424d..e0ea88b 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -109,7 +109,7 @@
{
struct afs_permits *permits;
- _enter("{%x}", vnode->fid.vnode);
+ _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
mutex_lock(&vnode->permits_lock);
permits = vnode->permits;
@@ -132,7 +132,8 @@
struct afs_vnode *auth_vnode;
int count, loop;
- _enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
+ _enter("{%x:%u},%x,%lx",
+ vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order);
auth_vnode = afs_get_auth_inode(vnode, key);
if (IS_ERR(auth_vnode)) {
@@ -220,7 +221,8 @@
bool valid;
int loop, ret;
- _enter("");
+ _enter("{%x:%u},%x",
+ vnode->fid.vid, vnode->fid.vnode, key_serial(key));
auth_vnode = afs_get_auth_inode(vnode, key);
if (IS_ERR(auth_vnode)) {
@@ -268,9 +270,9 @@
_leave(" = %d", ret);
return ret;
}
+ *_access = vnode->status.caller_access;
}
- *_access = vnode->status.caller_access;
iput(&auth_vnode->vfs_inode);
_leave(" = 0 [access %x]", *_access);
return 0;
@@ -288,7 +290,7 @@
struct key *key;
int ret;
- _enter("{{%x:%x},%lx},%x,",
+ _enter("{{%x:%u},%lx},%x,",
vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
key = afs_request_key(vnode->volume->cell);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 96bb23b..231ae41 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -252,6 +252,9 @@
{
_enter("%p", server);
+ ASSERTIF(server->cb_break_head != server->cb_break_tail,
+ delayed_work_pending(&server->cb_break_work));
+
ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7030d76..d24be33 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -50,6 +50,7 @@
.statfs = simple_statfs,
.alloc_inode = afs_alloc_inode,
.drop_inode = generic_delete_inode,
+ .write_inode = afs_write_inode,
.destroy_inode = afs_destroy_inode,
.clear_inode = afs_clear_inode,
.umount_begin = afs_umount_begin,
@@ -66,7 +67,7 @@
afs_opt_vol,
};
-static const match_table_t afs_options_list = {
+static match_table_t afs_options_list = {
{ afs_opt_cell, "cell=%s" },
{ afs_opt_rwpath, "rwpath" },
{ afs_opt_vol, "vol=%s" },
@@ -459,7 +460,9 @@
init_waitqueue_head(&vnode->update_waitq);
mutex_init(&vnode->permits_lock);
mutex_init(&vnode->validate_lock);
+ spin_lock_init(&vnode->writeback_lock);
spin_lock_init(&vnode->lock);
+ INIT_LIST_HEAD(&vnode->writebacks);
INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
}
}
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index a1904ab..ec81466 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -261,7 +261,7 @@
DECLARE_WAITQUEUE(myself, current);
- _enter("%s,{%u,%u,%u}",
+ _enter("%s,{%x:%u.%u}",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
@@ -389,7 +389,7 @@
struct afs_server *server;
int ret;
- _enter("%s{%u,%u,%u},%x,,,",
+ _enter("%s{%x:%u.%u},%x,,,",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -446,7 +446,7 @@
struct afs_server *server;
int ret;
- _enter("%s{%u,%u,%u},%x,%s,,",
+ _enter("%s{%x:%u.%u},%x,%s,,",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -502,7 +502,7 @@
struct afs_server *server;
int ret;
- _enter("%s{%u,%u,%u},%x,%s",
+ _enter("%s{%x:%u.%u},%x,%s",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -557,7 +557,7 @@
struct afs_server *server;
int ret;
- _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+ _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s",
dvnode->volume->vlocation->vldb.name,
dvnode->fid.vid,
dvnode->fid.vnode,
@@ -628,7 +628,7 @@
struct afs_server *server;
int ret;
- _enter("%s{%u,%u,%u},%x,%s,%s,,,",
+ _enter("%s{%x:%u.%u},%x,%s,%s,,,",
vnode->volume->vlocation->vldb.name,
vnode->fid.vid,
vnode->fid.vnode,
@@ -687,7 +687,7 @@
struct afs_server *server;
int ret;
- _enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+ _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s",
orig_dvnode->volume->vlocation->vldb.name,
orig_dvnode->fid.vid,
orig_dvnode->fid.vnode,
@@ -753,3 +753,110 @@
_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
return PTR_ERR(server);
}
+
+/*
+ * write to a file
+ */
+int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
+ unsigned offset, unsigned to)
+{
+ struct afs_server *server;
+ struct afs_vnode *vnode = wb->vnode;
+ int ret;
+
+ _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x",
+ vnode->volume->vlocation->vldb.name,
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ key_serial(wb->key),
+ first, last, offset, to);
+
+ /* this op will fetch the status */
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
+
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+ ret = afs_fs_store_data(server, wb, first, last, offset, to,
+ &afs_sync_call);
+
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ return PTR_ERR(server);
+}
+
+/*
+ * set the attributes on a file
+ */
+int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
+ struct iattr *attr)
+{
+ struct afs_server *server;
+ int ret;
+
+ _enter("%s{%x:%u.%u},%x",
+ vnode->volume->vlocation->vldb.name,
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ key_serial(key));
+
+ /* this op will fetch the status */
+ spin_lock(&vnode->lock);
+ vnode->update_cnt++;
+ spin_unlock(&vnode->lock);
+
+ do {
+ /* pick a server to query */
+ server = afs_volume_pick_fileserver(vnode);
+ if (IS_ERR(server))
+ goto no_server;
+
+ _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+ ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call);
+
+ } while (!afs_volume_release_fileserver(vnode, server, ret));
+
+ /* adjust the flags */
+ if (ret == 0) {
+ afs_vnode_finalise_status_update(vnode, server);
+ afs_put_server(server);
+ } else {
+ afs_vnode_status_update_failed(vnode, ret);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+
+no_server:
+ spin_lock(&vnode->lock);
+ vnode->update_cnt--;
+ ASSERTCMP(vnode->update_cnt, >=, 0);
+ spin_unlock(&vnode->lock);
+ return PTR_ERR(server);
+}
diff --git a/fs/afs/write.c b/fs/afs/write.c
new file mode 100644
index 0000000..83ff292
--- /dev/null
+++ b/fs/afs/write.c
@@ -0,0 +1,835 @@
+/* handling of writes to regular files and writing back to the server
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+static int afs_write_back_from_locked_page(struct afs_writeback *wb,
+ struct page *page);
+
+/*
+ * mark a page as having been made dirty and thus needing writeback
+ */
+int afs_set_page_dirty(struct page *page)
+{
+ _enter("");
+ return __set_page_dirty_nobuffers(page);
+}
+
+/*
+ * unlink a writeback record because its usage has reached zero
+ * - must be called with the wb->vnode->writeback_lock held
+ */
+static void afs_unlink_writeback(struct afs_writeback *wb)
+{
+ struct afs_writeback *front;
+ struct afs_vnode *vnode = wb->vnode;
+
+ list_del_init(&wb->link);
+ if (!list_empty(&vnode->writebacks)) {
+ /* if an fsync rises to the front of the queue then wake it
+ * up */
+ front = list_entry(vnode->writebacks.next,
+ struct afs_writeback, link);
+ if (front->state == AFS_WBACK_SYNCING) {
+ _debug("wake up sync");
+ front->state = AFS_WBACK_COMPLETE;
+ wake_up(&front->waitq);
+ }
+ }
+}
+
+/*
+ * free a writeback record
+ */
+static void afs_free_writeback(struct afs_writeback *wb)
+{
+ _enter("");
+ key_put(wb->key);
+ kfree(wb);
+}
+
+/*
+ * dispose of a reference to a writeback record
+ */
+void afs_put_writeback(struct afs_writeback *wb)
+{
+ struct afs_vnode *vnode = wb->vnode;
+
+ _enter("{%d}", wb->usage);
+
+ spin_lock(&vnode->writeback_lock);
+ if (--wb->usage == 0)
+ afs_unlink_writeback(wb);
+ else
+ wb = NULL;
+ spin_unlock(&vnode->writeback_lock);
+ if (wb)
+ afs_free_writeback(wb);
+}
+
+/*
+ * partly or wholly fill a page that's under preparation for writing
+ */
+static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
+ unsigned start, unsigned len, struct page *page)
+{
+ int ret;
+
+ _enter(",,%u,%u", start, len);
+
+ ASSERTCMP(start + len, <=, PAGE_SIZE);
+
+ ret = afs_vnode_fetch_data(vnode, key, start, len, page);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ _debug("got NOENT from server"
+ " - marking file deleted and stale");
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ ret = -ESTALE;
+ }
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * prepare a page for being written to
+ */
+static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
+ struct key *key, unsigned offset, unsigned to)
+{
+ unsigned eof, tail, start, stop, len;
+ loff_t i_size, pos;
+ void *p;
+ int ret;
+
+ _enter("");
+
+ if (offset == 0 && to == PAGE_SIZE)
+ return 0;
+
+ p = kmap(page);
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ pos = (loff_t) page->index << PAGE_SHIFT;
+ if (pos >= i_size) {
+ /* partial write, page beyond EOF */
+ _debug("beyond");
+ if (offset > 0)
+ memset(p, 0, offset);
+ if (to < PAGE_SIZE)
+ memset(p + to, 0, PAGE_SIZE - to);
+ kunmap(page);
+ return 0;
+ }
+
+ if (i_size - pos >= PAGE_SIZE) {
+ /* partial write, page entirely before EOF */
+ _debug("before");
+ tail = eof = PAGE_SIZE;
+ } else {
+ /* partial write, page overlaps EOF */
+ eof = i_size - pos;
+ _debug("overlap %u", eof);
+ tail = max(eof, to);
+ if (tail < PAGE_SIZE)
+ memset(p + tail, 0, PAGE_SIZE - tail);
+ if (offset > eof)
+ memset(p + eof, 0, PAGE_SIZE - eof);
+ }
+
+ kunmap(p);
+
+ ret = 0;
+ if (offset > 0 || eof > to) {
+ /* need to fill one or two bits that aren't going to be written
+ * (cover both fillers in one read if there are two) */
+ start = (offset > 0) ? 0 : to;
+ stop = (eof > to) ? eof : offset;
+ len = stop - start;
+ _debug("wr=%u-%u av=0-%u rd=%u@%u",
+ offset, to, eof, start, len);
+ ret = afs_fill_page(vnode, key, start, len, page);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * prepare to perform part of a write to a page
+ * - the caller holds the page locked, preventing it from being written out or
+ * modified by anyone else
+ */
+int afs_prepare_write(struct file *file, struct page *page,
+ unsigned offset, unsigned to)
+{
+ struct afs_writeback *candidate, *wb;
+ struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ struct key *key = file->private_data;
+ pgoff_t index;
+ int ret;
+
+ _enter("{%x:%u},{%lx},%u,%u",
+ vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+
+ candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
+ if (!candidate)
+ return -ENOMEM;
+ candidate->vnode = vnode;
+ candidate->first = candidate->last = page->index;
+ candidate->offset_first = offset;
+ candidate->to_last = to;
+ candidate->usage = 1;
+ candidate->state = AFS_WBACK_PENDING;
+ init_waitqueue_head(&candidate->waitq);
+
+ if (!PageUptodate(page)) {
+ _debug("not up to date");
+ ret = afs_prepare_page(vnode, page, key, offset, to);
+ if (ret < 0) {
+ kfree(candidate);
+ _leave(" = %d [prep]", ret);
+ return ret;
+ }
+ SetPageUptodate(page);
+ }
+
+try_again:
+ index = page->index;
+ spin_lock(&vnode->writeback_lock);
+
+ /* see if this page is already pending a writeback under a suitable key
+ * - if so we can just join onto that one */
+ wb = (struct afs_writeback *) page_private(page);
+ if (wb) {
+ if (wb->key == key && wb->state == AFS_WBACK_PENDING)
+ goto subsume_in_current_wb;
+ goto flush_conflicting_wb;
+ }
+
+ if (index > 0) {
+ /* see if we can find an already pending writeback that we can
+ * append this page to */
+ list_for_each_entry(wb, &vnode->writebacks, link) {
+ if (wb->last == index - 1 && wb->key == key &&
+ wb->state == AFS_WBACK_PENDING)
+ goto append_to_previous_wb;
+ }
+ }
+
+ list_add_tail(&candidate->link, &vnode->writebacks);
+ candidate->key = key_get(key);
+ spin_unlock(&vnode->writeback_lock);
+ SetPagePrivate(page);
+ set_page_private(page, (unsigned long) candidate);
+ _leave(" = 0 [new]");
+ return 0;
+
+subsume_in_current_wb:
+ _debug("subsume");
+ ASSERTRANGE(wb->first, <=, index, <=, wb->last);
+ if (index == wb->first && offset < wb->offset_first)
+ wb->offset_first = offset;
+ if (index == wb->last && to > wb->to_last)
+ wb->to_last = to;
+ spin_unlock(&vnode->writeback_lock);
+ kfree(candidate);
+ _leave(" = 0 [sub]");
+ return 0;
+
+append_to_previous_wb:
+ _debug("append into %lx-%lx", wb->first, wb->last);
+ wb->usage++;
+ wb->last++;
+ wb->to_last = to;
+ spin_unlock(&vnode->writeback_lock);
+ SetPagePrivate(page);
+ set_page_private(page, (unsigned long) wb);
+ kfree(candidate);
+ _leave(" = 0 [app]");
+ return 0;
+
+ /* the page is currently bound to another context, so if it's dirty we
+ * need to flush it before we can use the new context */
+flush_conflicting_wb:
+ _debug("flush conflict");
+ if (wb->state == AFS_WBACK_PENDING)
+ wb->state = AFS_WBACK_CONFLICTING;
+ spin_unlock(&vnode->writeback_lock);
+ if (PageDirty(page)) {
+ ret = afs_write_back_from_locked_page(wb, page);
+ if (ret < 0) {
+ afs_put_writeback(candidate);
+ _leave(" = %d", ret);
+ return ret;
+ }
+ }
+
+ /* the page holds a ref on the writeback record */
+ afs_put_writeback(wb);
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ goto try_again;
+}
+
+/*
+ * finalise part of a write to a page
+ */
+int afs_commit_write(struct file *file, struct page *page,
+ unsigned offset, unsigned to)
+{
+ struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+ loff_t i_size, maybe_i_size;
+
+ _enter("{%x:%u},{%lx},%u,%u",
+ vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+
+ maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
+ maybe_i_size += to;
+
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (maybe_i_size > i_size) {
+ spin_lock(&vnode->writeback_lock);
+ i_size = i_size_read(&vnode->vfs_inode);
+ if (maybe_i_size > i_size)
+ i_size_write(&vnode->vfs_inode, maybe_i_size);
+ spin_unlock(&vnode->writeback_lock);
+ }
+
+ set_page_dirty(page);
+
+ if (PageDirty(page))
+ _debug("dirtied");
+
+ return 0;
+}
+
+/*
+ * kill all the pages in the given range
+ */
+static void afs_kill_pages(struct afs_vnode *vnode, bool error,
+ pgoff_t first, pgoff_t last)
+{
+ struct pagevec pv;
+ unsigned count, loop;
+
+ _enter("{%x:%u},%lx-%lx",
+ vnode->fid.vid, vnode->fid.vnode, first, last);
+
+ pagevec_init(&pv, 0);
+
+ do {
+ _debug("kill %lx-%lx", first, last);
+
+ count = last - first + 1;
+ if (count > PAGEVEC_SIZE)
+ count = PAGEVEC_SIZE;
+ pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+ first, count, pv.pages);
+ ASSERTCMP(pv.nr, ==, count);
+
+ for (loop = 0; loop < count; loop++) {
+ ClearPageUptodate(pv.pages[loop]);
+ if (error)
+ SetPageError(pv.pages[loop]);
+ end_page_writeback(pv.pages[loop]);
+ }
+
+ __pagevec_release(&pv);
+ } while (first < last);
+
+ _leave("");
+}
+
+/*
+ * synchronously write back the locked page and any subsequent non-locked dirty
+ * pages also covered by the same writeback record
+ */
+static int afs_write_back_from_locked_page(struct afs_writeback *wb,
+ struct page *primary_page)
+{
+ struct page *pages[8], *page;
+ unsigned long count;
+ unsigned n, offset, to;
+ pgoff_t start, first, last;
+ int loop, ret;
+
+ _enter(",%lx", primary_page->index);
+
+ count = 1;
+ if (!clear_page_dirty_for_io(primary_page))
+ BUG();
+ if (test_set_page_writeback(primary_page))
+ BUG();
+
+ /* find all consecutive lockable dirty pages, stopping when we find a
+ * page that is not immediately lockable, is not dirty or is missing,
+ * or we reach the end of the range */
+ start = primary_page->index;
+ if (start >= wb->last)
+ goto no_more;
+ start++;
+ do {
+ _debug("more %lx [%lx]", start, count);
+ n = wb->last - start + 1;
+ if (n > ARRAY_SIZE(pages))
+ n = ARRAY_SIZE(pages);
+ n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping,
+ start, n, pages);
+ _debug("fgpc %u", n);
+ if (n == 0)
+ goto no_more;
+ if (pages[0]->index != start) {
+ for (n--; n >= 0; n--)
+ put_page(pages[n]);
+ goto no_more;
+ }
+
+ for (loop = 0; loop < n; loop++) {
+ page = pages[loop];
+ if (page->index > wb->last)
+ break;
+ if (TestSetPageLocked(page))
+ break;
+ if (!PageDirty(page) ||
+ page_private(page) != (unsigned long) wb) {
+ unlock_page(page);
+ break;
+ }
+ if (!clear_page_dirty_for_io(page))
+ BUG();
+ if (test_set_page_writeback(page))
+ BUG();
+ unlock_page(page);
+ put_page(page);
+ }
+ count += loop;
+ if (loop < n) {
+ for (; loop < n; loop++)
+ put_page(pages[loop]);
+ goto no_more;
+ }
+
+ start += loop;
+ } while (start <= wb->last && count < 65536);
+
+no_more:
+ /* we now have a contiguous set of dirty pages, each with writeback set
+ * and the dirty mark cleared; the first page is locked and must remain
+ * so, all the rest are unlocked */
+ first = primary_page->index;
+ last = first + count - 1;
+
+ offset = (first == wb->first) ? wb->offset_first : 0;
+ to = (last == wb->last) ? wb->to_last : PAGE_SIZE;
+
+ _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
+
+ ret = afs_vnode_store_data(wb, first, last, offset, to);
+ if (ret < 0) {
+ switch (ret) {
+ case -EDQUOT:
+ case -ENOSPC:
+ set_bit(AS_ENOSPC,
+ &wb->vnode->vfs_inode.i_mapping->flags);
+ break;
+ case -EROFS:
+ case -EIO:
+ case -EREMOTEIO:
+ case -EFBIG:
+ case -ENOENT:
+ case -ENOMEDIUM:
+ case -ENXIO:
+ afs_kill_pages(wb->vnode, true, first, last);
+ set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags);
+ break;
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ afs_kill_pages(wb->vnode, false, first, last);
+ break;
+ default:
+ break;
+ }
+ } else {
+ ret = count;
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * write a page back to the server
+ * - the caller locked the page for us
+ */
+int afs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+ struct afs_writeback *wb;
+ int ret;
+
+ _enter("{%lx},", page->index);
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) || !PageDirty(page)) {
+ unlock_page(page);
+ return 0;
+ }
+
+ wb = (struct afs_writeback *) page_private(page);
+ ASSERT(wb != NULL);
+
+ ret = afs_write_back_from_locked_page(wb, page);
+ unlock_page(page);
+ if (ret < 0) {
+ _leave(" = %d", ret);
+ return 0;
+ }
+
+ wbc->nr_to_write -= ret;
+ if (wbc->nonblocking && bdi_write_congested(bdi))
+ wbc->encountered_congestion = 1;
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * write a region of pages back to the server
+ */
+int afs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ pgoff_t index, pgoff_t end, pgoff_t *_next)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct afs_writeback *wb;
+ struct page *page;
+ int ret, n;
+
+ _enter(",,%lx,%lx,", index, end);
+
+ do {
+ n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
+ 1, &page);
+ if (!n)
+ break;
+
+ _debug("wback %lx", page->index);
+
+ if (page->index > end) {
+ *_next = index;
+ page_cache_release(page);
+ _leave(" = 0 [%lx]", *_next);
+ return 0;
+ }
+
+ /* at this point we hold neither mapping->tree_lock nor lock on
+ * the page itself: the page may be truncated or invalidated
+ * (changing page->mapping to NULL), or even swizzled back from
+ * swapper_space to tmpfs file mapping
+ */
+ lock_page(page);
+
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ continue;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) || !PageDirty(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ wb = (struct afs_writeback *) page_private(page);
+ ASSERT(wb != NULL);
+
+ spin_lock(&wb->vnode->writeback_lock);
+ wb->state = AFS_WBACK_WRITING;
+ spin_unlock(&wb->vnode->writeback_lock);
+
+ ret = afs_write_back_from_locked_page(wb, page);
+ unlock_page(page);
+ page_cache_release(page);
+ if (ret < 0) {
+ _leave(" = %d", ret);
+ return ret;
+ }
+
+ wbc->nr_to_write -= ret;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ break;
+ }
+
+ cond_resched();
+ } while (index < end && wbc->nr_to_write > 0);
+
+ *_next = index;
+ _leave(" = 0 [%lx]", *_next);
+ return 0;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int afs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ pgoff_t start, end, next;
+ int ret;
+
+ _enter("");
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ _leave(" = 0 [congest]");
+ return 0;
+ }
+
+ if (wbc->range_cyclic) {
+ start = mapping->writeback_index;
+ end = -1;
+ ret = afs_writepages_region(mapping, wbc, start, end, &next);
+ if (start > 0 && wbc->nr_to_write > 0 && ret == 0 &&
+ !(wbc->nonblocking && wbc->encountered_congestion))
+ ret = afs_writepages_region(mapping, wbc, 0, start,
+ &next);
+ mapping->writeback_index = next;
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
+ ret = afs_writepages_region(mapping, wbc, 0, end, &next);
+ if (wbc->nr_to_write > 0)
+ mapping->writeback_index = next;
+ } else {
+ start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ ret = afs_writepages_region(mapping, wbc, start, end, &next);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * write an inode back
+ */
+int afs_write_inode(struct inode *inode, int sync)
+{
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ int ret;
+
+ _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
+
+ ret = 0;
+ if (sync) {
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret < 0)
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ }
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * completion of write to server
+ */
+void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
+{
+ struct afs_writeback *wb = call->wb;
+ struct pagevec pv;
+ unsigned count, loop;
+ pgoff_t first = call->first, last = call->last;
+ bool free_wb;
+
+ _enter("{%x:%u},{%lx-%lx}",
+ vnode->fid.vid, vnode->fid.vnode, first, last);
+
+ ASSERT(wb != NULL);
+
+ pagevec_init(&pv, 0);
+
+ do {
+ _debug("attach %lx-%lx", first, last);
+
+ count = last - first + 1;
+ if (count > PAGEVEC_SIZE)
+ count = PAGEVEC_SIZE;
+ pv.nr = find_get_pages_contig(call->mapping, first, count,
+ pv.pages);
+ ASSERTCMP(pv.nr, ==, count);
+
+ spin_lock(&vnode->writeback_lock);
+ for (loop = 0; loop < count; loop++) {
+ struct page *page = pv.pages[loop];
+ end_page_writeback(page);
+ if (page_private(page) == (unsigned long) wb) {
+ set_page_private(page, 0);
+ ClearPagePrivate(page);
+ wb->usage--;
+ }
+ }
+ free_wb = false;
+ if (wb->usage == 0) {
+ afs_unlink_writeback(wb);
+ free_wb = true;
+ }
+ spin_unlock(&vnode->writeback_lock);
+ first += count;
+ if (free_wb) {
+ afs_free_writeback(wb);
+ wb = NULL;
+ }
+
+ __pagevec_release(&pv);
+ } while (first < last);
+
+ _leave("");
+}
+
+/*
+ * write to an AFS file
+ */
+ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct dentry *dentry = iocb->ki_filp->f_path.dentry;
+ struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ ssize_t result;
+ size_t count = iov_length(iov, nr_segs);
+ int ret;
+
+ _enter("{%x.%u},{%zu},%lu,",
+ vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
+
+ if (IS_SWAPFILE(&vnode->vfs_inode)) {
+ printk(KERN_INFO
+ "AFS: Attempt to write to active swap file!\n");
+ return -EBUSY;
+ }
+
+ if (!count)
+ return 0;
+
+ result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ if (IS_ERR_VALUE(result)) {
+ _leave(" = %zd", result);
+ return result;
+ }
+
+ /* return error values for O_SYNC and IS_SYNC() */
+ if (IS_SYNC(&vnode->vfs_inode) || iocb->ki_filp->f_flags & O_SYNC) {
+ ret = afs_fsync(iocb->ki_filp, dentry, 1);
+ if (ret < 0)
+ result = ret;
+ }
+
+ _leave(" = %zd", result);
+ return result;
+}
+
+/*
+ * flush the vnode to the fileserver
+ */
+int afs_writeback_all(struct afs_vnode *vnode)
+{
+ struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct writeback_control wbc = {
+ .bdi = mapping->backing_dev_info,
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_writepages = 1,
+ .range_cyclic = 1,
+ };
+ int ret;
+
+ _enter("");
+
+ ret = mapping->a_ops->writepages(mapping, &wbc);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/*
+ * flush any dirty pages for this process, and check for write errors.
+ * - the return status from this call provides a reliable indication of
+ * whether any write errors occurred for this process.
+ */
+int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct afs_writeback *wb, *xwb;
+ struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ int ret;
+
+ _enter("{%x:%u},{n=%s},%d",
+ vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+ datasync);
+
+ /* use a writeback record as a marker in the queue - when this reaches
+ * the front of the queue, all the outstanding writes are either
+ * completed or rejected */
+ wb = kzalloc(sizeof(*wb), GFP_KERNEL);
+ if (!wb)
+ return -ENOMEM;
+ wb->vnode = vnode;
+ wb->first = 0;
+ wb->last = -1;
+ wb->offset_first = 0;
+ wb->to_last = PAGE_SIZE;
+ wb->usage = 1;
+ wb->state = AFS_WBACK_SYNCING;
+ init_waitqueue_head(&wb->waitq);
+
+ spin_lock(&vnode->writeback_lock);
+ list_for_each_entry(xwb, &vnode->writebacks, link) {
+ if (xwb->state == AFS_WBACK_PENDING)
+ xwb->state = AFS_WBACK_CONFLICTING;
+ }
+ list_add_tail(&wb->link, &vnode->writebacks);
+ spin_unlock(&vnode->writeback_lock);
+
+ /* push all the outstanding writebacks to the server */
+ ret = afs_writeback_all(vnode);
+ if (ret < 0) {
+ afs_put_writeback(wb);
+ _leave(" = %d [wb]", ret);
+ return ret;
+ }
+
+ /* wait for the preceding writes to actually complete */
+ ret = wait_event_interruptible(wb->waitq,
+ wb->state == AFS_WBACK_COMPLETE ||
+ vnode->writebacks.next == &wb->link);
+ afs_put_writeback(wb);
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/aio.c b/fs/aio.c
index b97ab80..ac1c158 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -346,10 +346,9 @@
wait_for_all_aios(ctx);
/*
- * this is an overkill, but ensures we don't leave
- * the ctx on the aio_wq
+ * Ensure we don't leave the ctx on the aio_wq
*/
- flush_workqueue(aio_wq);
+ cancel_work_sync(&ctx->wq.work);
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
@@ -372,7 +371,7 @@
BUG_ON(ctx->reqs_active);
cancel_delayed_work(&ctx->wq);
- flush_workqueue(aio_wq);
+ cancel_work_sync(&ctx->wq.work);
aio_free_ring(ctx);
mmdrop(ctx->mm);
ctx->mm = NULL;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 18657f0..72d0b41 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -675,19 +675,8 @@
bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
char *s = enabled ? "enabled" : "disabled";
- int len = strlen(s);
- loff_t pos = *ppos;
- if (pos < 0)
- return -EINVAL;
- if (pos >= len)
- return 0;
- if (len < pos + nbytes)
- nbytes = len - pos;
- if (copy_to_user(buf, s + pos, nbytes))
- return -EFAULT;
- *ppos = pos + nbytes;
- return nbytes;
+ return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
static ssize_t bm_status_write(struct file * file, const char __user * buffer,
diff --git a/fs/buffer.c b/fs/buffer.c
index eb820b8..aecd057 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1846,13 +1846,8 @@
if (block_start >= to)
break;
if (buffer_new(bh)) {
- void *kaddr;
-
clear_buffer_new(bh);
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr+block_start, 0, bh->b_size);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, block_start, bh->b_size, KM_USER0);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
@@ -1940,10 +1935,8 @@
SetPageError(page);
}
if (!buffer_mapped(bh)) {
- void *kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + i * blocksize, 0, blocksize);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, i * blocksize, blocksize,
+ KM_USER0);
if (!err)
set_buffer_uptodate(bh);
continue;
@@ -2086,7 +2079,6 @@
long status;
unsigned zerofrom;
unsigned blocksize = 1 << inode->i_blkbits;
- void *kaddr;
while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
status = -ENOMEM;
@@ -2108,10 +2100,8 @@
PAGE_CACHE_SIZE, get_block);
if (status)
goto out_unmap;
- kaddr = kmap_atomic(new_page, KM_USER0);
- memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
- flush_dcache_page(new_page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
+ KM_USER0);
generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
unlock_page(new_page);
page_cache_release(new_page);
@@ -2138,10 +2128,7 @@
if (status)
goto out1;
if (zerofrom < offset) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr+zerofrom, 0, offset-zerofrom);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
__block_commit_write(inode, page, zerofrom, offset);
}
return 0;
@@ -2340,10 +2327,7 @@
* Error recovery is pretty slack. Clear the page and mark it dirty
* so we'll later zero out any blocks which _were_ allocated.
*/
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr, 0, PAGE_CACHE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
SetPageUptodate(page);
set_page_dirty(page);
return ret;
@@ -2382,7 +2366,6 @@
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
- void *kaddr;
int ret;
/* Is the page fully inside i_size? */
@@ -2413,10 +2396,7 @@
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
out:
ret = mpage_writepage(page, get_block, wbc);
if (ret == -EAGAIN)
@@ -2437,7 +2417,6 @@
unsigned to;
struct page *page;
const struct address_space_operations *a_ops = mapping->a_ops;
- char *kaddr;
int ret = 0;
if ((offset & (blocksize - 1)) == 0)
@@ -2451,10 +2430,8 @@
to = (offset + blocksize) & ~(blocksize - 1);
ret = a_ops->prepare_write(NULL, page, offset, to);
if (ret == 0) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
+ KM_USER0);
/*
* It would be more correct to call aops->commit_write()
* here, but this is more efficient.
@@ -2480,7 +2457,6 @@
struct inode *inode = mapping->host;
struct page *page;
struct buffer_head *bh;
- void *kaddr;
int err;
blocksize = 1 << inode->i_blkbits;
@@ -2534,11 +2510,7 @@
goto unlock;
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
-
+ zero_user_page(page, offset, length, KM_USER0);
mark_buffer_dirty(bh);
err = 0;
@@ -2559,7 +2531,6 @@
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
- void *kaddr;
/* Is the page fully inside i_size? */
if (page->index < end_index)
@@ -2585,10 +2556,7 @@
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
return __block_write_full_page(inode, page, get_block, wbc);
}
@@ -2978,7 +2946,7 @@
static int buffer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
- if (action == CPU_DEAD)
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
buffer_exit_cpu((unsigned long)hcpu);
return NOTIFY_OK;
}
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index d98be5e..3527c7c 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -77,36 +77,6 @@
return ret;
}
-
-/**
- * flush_read_buffer - push buffer to userspace.
- * @buffer: data buffer for file.
- * @userbuf: user-passed buffer.
- * @count: number of bytes requested.
- * @ppos: file position.
- *
- * Copy the buffer we filled in fill_read_buffer() to userspace.
- * This is done at the reader's leisure, copying and advancing
- * the amount they specify each time.
- * This may be called continuously until the buffer is empty.
- */
-static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
- size_t count, loff_t * ppos)
-{
- int error;
-
- if (*ppos > buffer->count)
- return 0;
-
- if (count > (buffer->count - *ppos))
- count = buffer->count - *ppos;
-
- error = copy_to_user(buf,buffer->page + *ppos,count);
- if (!error)
- *ppos += count;
- return error ? -EFAULT : count;
-}
-
/**
* configfs_read_file - read an attribute.
* @file: file pointer.
@@ -139,7 +109,8 @@
}
pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
__FUNCTION__, count, *ppos, buffer->page);
- retval = flush_read_buffer(buffer,buf,count,ppos);
+ retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
+ buffer->count);
out:
up(&buffer->sem);
return retval;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1e88d8d..8593f3d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -867,7 +867,6 @@
do_holes:
/* Handle holes */
if (!buffer_mapped(map_bh)) {
- char *kaddr;
loff_t i_size_aligned;
/* AKPM: eargh, -ENOTBLK is a hack */
@@ -888,11 +887,8 @@
page_cache_release(page);
goto out;
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + (block_in_page << blkbits),
- 0, 1 << blkbits);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, block_in_page << blkbits,
+ 1 << blkbits, KM_USER0);
dio->block_in_file++;
block_in_page++;
goto next_block;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index e1bb031..a6cb617 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1767,7 +1767,6 @@
struct inode *inode = mapping->host;
struct buffer_head *bh;
int err = 0;
- void *kaddr;
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
@@ -1779,10 +1778,7 @@
*/
if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
ext3_should_writeback_data(inode) && PageUptodate(page)) {
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, length, KM_USER0);
set_page_dirty(page);
goto unlock;
}
@@ -1835,11 +1831,7 @@
goto unlock;
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
-
+ zero_user_page(page, offset, length, KM_USER0);
BUFFER_TRACE(bh, "zeroed end of block");
err = 0;
diff --git a/fs/mpage.c b/fs/mpage.c
index fa2441f..0fb914f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -284,11 +284,9 @@
}
if (first_hole != blocks_per_page) {
- char *kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + (first_hole << blkbits), 0,
- PAGE_CACHE_SIZE - (first_hole << blkbits));
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, first_hole << blkbits,
+ PAGE_CACHE_SIZE - (first_hole << blkbits),
+ KM_USER0);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -576,14 +574,11 @@
* written out to the file."
*/
unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
- char *kaddr;
if (page->index > end_index || !offset)
goto confused;
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
+ KM_USER0);
}
/*
diff --git a/fs/namei.c b/fs/namei.c
index 856b2f5..b3780e3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1152,14 +1152,12 @@
fput_light(file, fput_needed);
}
- current->total_link_count = 0;
- retval = link_path_walk(name, nd);
+
+ retval = path_walk(name, nd);
out:
- if (likely(retval == 0)) {
- if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
+ if (unlikely(!retval && !audit_dummy_context() && nd->dentry &&
nd->dentry->d_inode))
audit_inode(name, nd->dentry->d_inode);
- }
out_fail:
return retval;
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index ce341dc..9b118ee 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -11,4 +11,3 @@
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
-nfsd-objs := $(nfsd-y)
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 6f24768..79bd03b 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -469,6 +469,13 @@
nd.dentry = NULL;
exp.ex_path = NULL;
+ /* fs locations */
+ exp.ex_fslocs.locations = NULL;
+ exp.ex_fslocs.locations_count = 0;
+ exp.ex_fslocs.migrated = 0;
+
+ exp.ex_uuid = NULL;
+
if (mesg[mlen-1] != '\n')
return -EINVAL;
mesg[mlen-1] = 0;
@@ -509,13 +516,6 @@
if (exp.h.expiry_time == 0)
goto out;
- /* fs locations */
- exp.ex_fslocs.locations = NULL;
- exp.ex_fslocs.locations_count = 0;
- exp.ex_fslocs.migrated = 0;
-
- exp.ex_uuid = NULL;
-
/* flags */
err = get_int(&mesg, &an_int);
if (err == -ENOENT)
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7f5bad0..eac8283 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -177,7 +177,7 @@
if (max_blocksize < resp->count)
resp->count = max_blocksize;
- svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+ svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
nfserr = nfsd_read(rqstp, &resp->fh, NULL,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7e4bb0a..10f6e7d 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -239,7 +239,7 @@
encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
- if (dentry && dentry->d_inode != NULL) {
+ if (dentry && dentry->d_inode) {
int err;
struct kstat stat;
@@ -300,9 +300,9 @@
nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_sattrargs *args)
{
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_sattr3(p, &args->attrs)))
+ if (!(p = decode_fh(p, &args->fh)))
return 0;
+ p = decode_sattr3(p, &args->attrs);
if ((args->check_guard = ntohl(*p++)) != 0) {
struct timespec time;
@@ -343,9 +343,9 @@
int v,pn;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh))
- || !(p = xdr_decode_hyper(p, &args->offset)))
+ if (!(p = decode_fh(p, &args->fh)))
return 0;
+ p = xdr_decode_hyper(p, &args->offset);
len = args->count = ntohl(*p++);
@@ -369,28 +369,44 @@
nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_writeargs *args)
{
- unsigned int len, v, hdr;
+ unsigned int len, v, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh))
- || !(p = xdr_decode_hyper(p, &args->offset)))
+ if (!(p = decode_fh(p, &args->fh)))
return 0;
+ p = xdr_decode_hyper(p, &args->offset);
args->count = ntohl(*p++);
args->stable = ntohl(*p++);
len = args->len = ntohl(*p++);
-
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- if (rqstp->rq_arg.len < hdr ||
- rqstp->rq_arg.len - hdr < len)
+ /*
+ * The count must equal the amount of data passed.
+ */
+ if (args->count != args->len)
return 0;
+ /*
+ * Check to make sure that we got the right number of
+ * bytes.
+ */
+ hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
+ dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
+ - hdr;
+ /*
+ * Round the length of the data which was specified up to
+ * the next multiple of XDR units and then compare that
+ * against the length which was actually received.
+ */
+ if (dlen != XDR_QUADLEN(len)*4)
+ return 0;
+
+ if (args->count > max_blocksize) {
+ args->count = max_blocksize;
+ len = args->len = max_blocksize;
+ }
rqstp->rq_vec[0].iov_base = (void*)p;
rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
-
- if (len > max_blocksize)
- len = max_blocksize;
- v= 0;
+ v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
v++;
@@ -398,9 +414,8 @@
rqstp->rq_vec[v].iov_len = PAGE_SIZE;
}
rqstp->rq_vec[v].iov_len = len;
- args->vlen = v+1;
-
- return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
+ args->vlen = v + 1;
+ return 1;
}
int
@@ -414,8 +429,7 @@
switch (args->createmode = ntohl(*p++)) {
case NFS3_CREATE_UNCHECKED:
case NFS3_CREATE_GUARDED:
- if (!(p = decode_sattr3(p, &args->attrs)))
- return 0;
+ p = decode_sattr3(p, &args->attrs);
break;
case NFS3_CREATE_EXCLUSIVE:
args->verf = p;
@@ -431,10 +445,10 @@
nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_createargs *args)
{
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len))
- || !(p = decode_sattr3(p, &args->attrs)))
+ if (!(p = decode_fh(p, &args->fh)) ||
+ !(p = decode_filename(p, &args->name, &args->len)))
return 0;
+ p = decode_sattr3(p, &args->attrs);
return xdr_argsize_check(rqstp, p);
}
@@ -448,11 +462,12 @@
char *old, *new;
struct kvec *vec;
- if (!(p = decode_fh(p, &args->ffh))
- || !(p = decode_filename(p, &args->fname, &args->flen))
- || !(p = decode_sattr3(p, &args->attrs))
+ if (!(p = decode_fh(p, &args->ffh)) ||
+ !(p = decode_filename(p, &args->fname, &args->flen))
)
return 0;
+ p = decode_sattr3(p, &args->attrs);
+
/* now decode the pathname, which might be larger than the first page.
* As we have to check for nul's anyway, we copy it into a new page
* This page appears in the rq_res.pages list, but as pages_len is always
@@ -502,10 +517,8 @@
args->ftype = ntohl(*p++);
if (args->ftype == NF3BLK || args->ftype == NF3CHR
- || args->ftype == NF3SOCK || args->ftype == NF3FIFO) {
- if (!(p = decode_sattr3(p, &args->attrs)))
- return 0;
- }
+ || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
+ p = decode_sattr3(p, &args->attrs);
if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
args->major = ntohl(*p++);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 673a53c..cc3b7ba 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -137,7 +137,6 @@
static short ace2type(struct nfs4_ace *);
static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
unsigned int);
-void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
struct nfs4_acl *
nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
@@ -785,21 +784,6 @@
return acl;
}
-void
-nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
- int whotype, uid_t who)
-{
- struct nfs4_ace *ace = acl->aces + acl->naces;
-
- ace->type = type;
- ace->flag = flag;
- ace->access_mask = access_mask;
- ace->whotype = whotype;
- ace->who = who;
-
- acl->naces++;
-}
-
static struct {
char *string;
int stringlen;
@@ -851,6 +835,5 @@
}
EXPORT_SYMBOL(nfs4_acl_new);
-EXPORT_SYMBOL(nfs4_acl_add_ace);
EXPORT_SYMBOL(nfs4_acl_get_whotype);
EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 678f3be..3cc8ce4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1326,8 +1326,6 @@
{
struct nfs4_delegation *dp = __dp;
- daemonize("nfsv4-recall");
-
nfsd4_cb_recall(dp);
return 0;
}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 739dd3c..6ca2d24 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -323,7 +323,7 @@
*
*/
- u8 version = 1;
+ u8 version;
u8 fsid_type = 0;
struct inode * inode = dentry->d_inode;
struct dentry *parent = dentry->d_parent;
@@ -341,15 +341,59 @@
* the reference filehandle (if it is in the same export)
* or the export options.
*/
+ retry:
+ version = 1;
if (ref_fh && ref_fh->fh_export == exp) {
version = ref_fh->fh_handle.fh_version;
- if (version == 0xca)
+ fsid_type = ref_fh->fh_handle.fh_fsid_type;
+
+ if (ref_fh == fhp)
+ fh_put(ref_fh);
+ ref_fh = NULL;
+
+ switch (version) {
+ case 0xca:
fsid_type = FSID_DEV;
- else
- fsid_type = ref_fh->fh_handle.fh_fsid_type;
- /* We know this version/type works for this export
- * so there is no need for further checks.
+ break;
+ case 1:
+ break;
+ default:
+ goto retry;
+ }
+
+ /* Need to check that this type works for this
+ * export point. As the fsid -> filesystem mapping
+ * was guided by user-space, there is no guarantee
+ * that the filesystem actually supports that fsid
+ * type. If it doesn't we loop around again without
+ * ref_fh set.
*/
+ switch(fsid_type) {
+ case FSID_DEV:
+ if (!old_valid_dev(ex_dev))
+ goto retry;
+ /* FALL THROUGH */
+ case FSID_MAJOR_MINOR:
+ case FSID_ENCODE_DEV:
+ if (!(exp->ex_dentry->d_inode->i_sb->s_type->fs_flags
+ & FS_REQUIRES_DEV))
+ goto retry;
+ break;
+ case FSID_NUM:
+ if (! (exp->ex_flags & NFSEXP_FSID))
+ goto retry;
+ break;
+ case FSID_UUID8:
+ case FSID_UUID16:
+ if (!root_export)
+ goto retry;
+ /* fall through */
+ case FSID_UUID4_INUM:
+ case FSID_UUID16_INUM:
+ if (exp->ex_uuid == NULL)
+ goto retry;
+ break;
+ }
} else if (exp->ex_uuid) {
if (fhp->fh_maxsize >= 64) {
if (root_export)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cc2eec..b2c7147 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -155,7 +155,7 @@
argp->count);
argp->count = NFSSVC_MAXBLKSIZE_V2;
}
- svc_reserve(rqstp, (19<<2) + argp->count + 4);
+ svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
resp->count = argp->count;
nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 0c24b9e..cb3e7fa 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -231,9 +231,10 @@
nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_sattrargs *args)
{
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_sattr(p, &args->attrs)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
+ p = decode_sattr(p, &args->attrs);
return xdr_argsize_check(rqstp, p);
}
@@ -284,8 +285,9 @@
nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_writeargs *args)
{
- unsigned int len;
+ unsigned int len, hdr, dlen;
int v;
+
if (!(p = decode_fh(p, &args->fh)))
return 0;
@@ -293,11 +295,30 @@
args->offset = ntohl(*p++); /* offset */
p++; /* totalcount */
len = args->len = ntohl(*p++);
- rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
- (((void*)p) - rqstp->rq_arg.head[0].iov_base);
+ /*
+ * The protocol specifies a maximum of 8192 bytes.
+ */
if (len > NFSSVC_MAXBLKSIZE_V2)
- len = NFSSVC_MAXBLKSIZE_V2;
+ return 0;
+
+ /*
+ * Check to make sure that we got the right number of
+ * bytes.
+ */
+ hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
+ dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
+ - hdr;
+
+ /*
+ * Round the length of the data which was specified up to
+ * the next multiple of XDR units and then compare that
+ * against the length which was actually received.
+ */
+ if (dlen != XDR_QUADLEN(len)*4)
+ return 0;
+
+ rqstp->rq_vec[0].iov_base = (void*)p;
+ rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
@@ -306,18 +327,18 @@
rqstp->rq_vec[v].iov_len = PAGE_SIZE;
}
rqstp->rq_vec[v].iov_len = len;
- args->vlen = v+1;
- return rqstp->rq_vec[0].iov_len > 0;
+ args->vlen = v + 1;
+ return 1;
}
int
nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_createargs *args)
{
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len))
- || !(p = decode_sattr(p, &args->attrs)))
+ if ( !(p = decode_fh(p, &args->fh))
+ || !(p = decode_filename(p, &args->name, &args->len)))
return 0;
+ p = decode_sattr(p, &args->attrs);
return xdr_argsize_check(rqstp, p);
}
@@ -361,11 +382,11 @@
nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_symlinkargs *args)
{
- if (!(p = decode_fh(p, &args->ffh))
- || !(p = decode_filename(p, &args->fname, &args->flen))
- || !(p = decode_pathname(p, &args->tname, &args->tlen))
- || !(p = decode_sattr(p, &args->attrs)))
+ if ( !(p = decode_fh(p, &args->ffh))
+ || !(p = decode_filename(p, &args->fname, &args->flen))
+ || !(p = decode_pathname(p, &args->tname, &args->tlen)))
return 0;
+ p = decode_sattr(p, &args->attrs);
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ab45db5..9e451a6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1059,20 +1059,12 @@
maping blocks, since there is none, so we just zero out remaining
parts of first and last pages in write area (if needed) */
if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
- if (from != 0) { /* First page needs to be partially zeroed */
- char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
- memset(kaddr, 0, from);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(prepared_pages[0]);
- }
- if (to != PAGE_CACHE_SIZE) { /* Last page needs to be partially zeroed */
- char *kaddr =
- kmap_atomic(prepared_pages[num_pages - 1],
- KM_USER0);
- memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(prepared_pages[num_pages - 1]);
- }
+ if (from != 0) /* First page needs to be partially zeroed */
+ zero_user_page(prepared_pages[0], 0, from, KM_USER0);
+
+ if (to != PAGE_CACHE_SIZE) /* Last page needs to be partially zeroed */
+ zero_user_page(prepared_pages[num_pages-1], to,
+ PAGE_CACHE_SIZE - to, KM_USER0);
/* Since all blocks are new - use already calculated value */
return blocks;
@@ -1199,13 +1191,9 @@
ll_rw_block(READ, 1, &bh);
*wait_bh++ = bh;
} else { /* Not mapped, zero it */
- char *kaddr =
- kmap_atomic(prepared_pages[0],
- KM_USER0);
- memset(kaddr + block_start, 0,
- from - block_start);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(prepared_pages[0]);
+ zero_user_page(prepared_pages[0],
+ block_start,
+ from - block_start, KM_USER0);
set_buffer_uptodate(bh);
}
}
@@ -1237,13 +1225,8 @@
ll_rw_block(READ, 1, &bh);
*wait_bh++ = bh;
} else { /* Not mapped, zero it */
- char *kaddr =
- kmap_atomic(prepared_pages
- [num_pages - 1],
- KM_USER0);
- memset(kaddr + to, 0, block_end - to);
- kunmap_atomic(kaddr, KM_USER0);
- flush_dcache_page(prepared_pages[num_pages - 1]);
+ zero_user_page(prepared_pages[num_pages-1],
+ to, block_end - to, KM_USER0);
set_buffer_uptodate(bh);
}
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9fcbfe3..1272d11 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2148,13 +2148,8 @@
length = offset & (blocksize - 1);
/* if we are not on a block boundary */
if (length) {
- char *kaddr;
-
length = blocksize - length;
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, length, KM_USER0);
if (buffer_mapped(bh) && bh->b_blocknr != 0) {
mark_buffer_dirty(bh);
}
@@ -2370,7 +2365,6 @@
** last byte in the file
*/
if (page->index >= end_index) {
- char *kaddr;
unsigned last_offset;
last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
@@ -2379,10 +2373,7 @@
unlock_page(page);
return 0;
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0);
}
bh = head;
block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 0e637ad..b502c71 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -111,36 +111,6 @@
return ret;
}
-
-/**
- * flush_read_buffer - push buffer to userspace.
- * @buffer: data buffer for file.
- * @buf: user-passed buffer.
- * @count: number of bytes requested.
- * @ppos: file position.
- *
- * Copy the buffer we filled in fill_read_buffer() to userspace.
- * This is done at the reader's leisure, copying and advancing
- * the amount they specify each time.
- * This may be called continuously until the buffer is empty.
- */
-static int flush_read_buffer(struct sysfs_buffer * buffer, char __user * buf,
- size_t count, loff_t * ppos)
-{
- int error;
-
- if (*ppos > buffer->count)
- return 0;
-
- if (count > (buffer->count - *ppos))
- count = buffer->count - *ppos;
-
- error = copy_to_user(buf,buffer->page + *ppos,count);
- if (!error)
- *ppos += count;
- return error ? -EFAULT : count;
-}
-
/**
* sysfs_read_file - read an attribute.
* @file: file pointer.
@@ -177,7 +147,8 @@
}
pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
__FUNCTION__, count, *ppos, buffer->page);
- retval = flush_read_buffer(buffer,buf,count,ppos);
+ retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
+ buffer->count);
out:
up(&buffer->sem);
return retval;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f5aa3ef..a96bde6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1734,11 +1734,13 @@
per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
/* Easy Case - initialize the area and locks, and
* then rebalance when online does everything else for us. */
memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
xfs_icsb_lock(mp);
xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0);
xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0);
@@ -1746,6 +1748,7 @@
xfs_icsb_unlock(mp);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/* Disable all the counters, then fold the dead cpu's
* count into the total on the global superblock and
* re-enable the counters. */
diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index a1a1eca..286e1d8 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -51,6 +51,7 @@
#else /* CONFIG_SMP */
+#define hard_smp_processor_id() 0
#define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; })
#endif /* CONFIG_SMP */
diff --git a/include/asm-alpha/thread_info.h b/include/asm-alpha/thread_info.h
index eeb3bef..f4defc2 100644
--- a/include/asm-alpha/thread_info.h
+++ b/include/asm-alpha/thread_info.h
@@ -97,7 +97,7 @@
1 << TIF_UAC_SIGBUS)
#define SET_UNALIGN_CTL(task,value) ({ \
- (task)->thread_info->flags = (((task)->thread_info->flags & \
+ task_thread_info(task)->flags = ((task_thread_info(task)->flags & \
~ALPHA_UAC_MASK) \
| (((value) << ALPHA_UAC_SHIFT) & (1<<TIF_UAC_NOPRINT))\
| (((value) << (ALPHA_UAC_SHIFT + 1)) & (1<<TIF_UAC_SIGBUS)) \
@@ -105,11 +105,11 @@
0; })
#define GET_UNALIGN_CTL(task,value) ({ \
- put_user(((task)->thread_info->flags & (1 << TIF_UAC_NOPRINT)) \
+ put_user((task_thread_info(task)->flags & (1 << TIF_UAC_NOPRINT))\
>> ALPHA_UAC_SHIFT \
- | ((task)->thread_info->flags & (1 << TIF_UAC_SIGBUS)) \
+ | (task_thread_info(task)->flags & (1 << TIF_UAC_SIGBUS))\
>> (ALPHA_UAC_SHIFT + 1) \
- | ((task)->thread_info->flags & (1 << TIF_UAC_NOFIX)) \
+ | (task_thread_info(task)->flags & (1 << TIF_UAC_NOFIX))\
>> (ALPHA_UAC_SHIFT - 1), \
(int __user *)(value)); \
})
diff --git a/include/asm-arm/arch-at91/cpu.h b/include/asm-arm/arch-at91/cpu.h
index d464ca5..7ef4eeb 100644
--- a/include/asm-arm/arch-at91/cpu.h
+++ b/include/asm-arm/arch-at91/cpu.h
@@ -68,4 +68,10 @@
#define cpu_is_at91sam9263() (0)
#endif
+/*
+ * Since this is ARM, we will never run on any AVR32 CPU. But these
+ * definitions may reduce clutter in common drivers.
+ */
+#define cpu_is_at32ap7000() (0)
+
#endif
diff --git a/include/asm-avr32/arch-at32ap/cpu.h b/include/asm-avr32/arch-at32ap/cpu.h
new file mode 100644
index 0000000..2bdc5bd
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/cpu.h
@@ -0,0 +1,33 @@
+/*
+ * AVR32 and (fake) AT91 CPU identification
+ *
+ * Copyright (C) 2007 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_ARCH_CPU_H
+#define __ASM_ARCH_CPU_H
+
+/*
+ * Only AT32AP7000 is defined for now. We can identify the specific
+ * chip at runtime, but I'm not sure if it's really worth it.
+ */
+#ifdef CONFIG_CPU_AT32AP7000
+# define cpu_is_at32ap7000() (1)
+#else
+# define cpu_is_at32ap7000() (0)
+#endif
+
+/*
+ * Since this is AVR32, we will never run on any AT91 CPU. But these
+ * definitions may reduce clutter in common drivers.
+ */
+#define cpu_is_at91rm9200() (0)
+#define cpu_is_at91sam9xe() (0)
+#define cpu_is_at91sam9260() (0)
+#define cpu_is_at91sam9261() (0)
+#define cpu_is_at91sam9263() (0)
+
+#endif /* __ASM_ARCH_CPU_H */
diff --git a/include/asm-avr32/setup.h b/include/asm-avr32/setup.h
index 1ff1a21..b0828d4 100644
--- a/include/asm-avr32/setup.h
+++ b/include/asm-avr32/setup.h
@@ -110,7 +110,7 @@
int (*parse)(struct tag *);
};
-#define __tag __attribute_used__ __attribute__((__section__(".taglist")))
+#define __tag __attribute_used__ __attribute__((__section__(".taglist.init")))
#define __tagtable(tag, fn) \
static struct tagtable __tagtable_##fn __tag = { tag, fn }
diff --git a/include/asm-avr32/unistd.h b/include/asm-avr32/unistd.h
index 8f51204..2418cce 100644
--- a/include/asm-avr32/unistd.h
+++ b/include/asm-avr32/unistd.h
@@ -295,8 +295,10 @@
#define __NR_shmdt 276
#define __NR_shmctl 277
+#define __NR_utimensat 278
+
#ifdef __KERNEL__
-#define NR_syscalls 278
+#define NR_syscalls 279
#define __ARCH_WANT_IPC_PARSE_VERSION
diff --git a/include/asm-blackfin/processor.h b/include/asm-blackfin/processor.h
index 997465c..0336ff1 100644
--- a/include/asm-blackfin/processor.h
+++ b/include/asm-blackfin/processor.h
@@ -58,10 +58,10 @@
(_regs)->pc = (_pc); \
if (current->mm) \
(_regs)->p5 = current->mm->start_data; \
- current->thread_info->l1_task_info.stack_start \
+ task_thread_info(current)->l1_task_info.stack_start \
= (void *)current->mm->context.stack_start; \
- current->thread_info->l1_task_info.lowest_sp = (void *)(_usp); \
- memcpy(L1_SCRATCH_TASK_INFO, ¤t->thread_info->l1_task_info, \
+ task_thread_info(current)->l1_task_info.lowest_sp = (void *)(_usp); \
+ memcpy(L1_SCRATCH_TASK_INFO, &task_thread_info(current)->l1_task_info, \
sizeof(*L1_SCRATCH_TASK_INFO)); \
wrusp(_usp); \
} while(0)
diff --git a/include/asm-blackfin/system.h b/include/asm-blackfin/system.h
index b5bf6e7..5e5f1a0 100644
--- a/include/asm-blackfin/system.h
+++ b/include/asm-blackfin/system.h
@@ -239,9 +239,9 @@
#define switch_to(prev,next,last) \
do { \
- memcpy (&prev->thread_info->l1_task_info, L1_SCRATCH_TASK_INFO, \
+ memcpy (&task_thread_info(prev)->l1_task_info, L1_SCRATCH_TASK_INFO, \
sizeof *L1_SCRATCH_TASK_INFO); \
- memcpy (L1_SCRATCH_TASK_INFO, &next->thread_info->l1_task_info, \
+ memcpy (L1_SCRATCH_TASK_INFO, &task_thread_info(next)->l1_task_info, \
sizeof *L1_SCRATCH_TASK_INFO); \
(last) = resume (prev, next); \
} while (0)
diff --git a/include/asm-frv/tlb.h b/include/asm-frv/tlb.h
index f94fe5c..cd458eb 100644
--- a/include/asm-frv/tlb.h
+++ b/include/asm-frv/tlb.h
@@ -3,7 +3,11 @@
#include <asm/tlbflush.h>
+#ifdef CONFIG_MMU
+extern void check_pgt_cache(void);
+#else
#define check_pgt_cache() do {} while(0)
+#endif
/*
* we don't need any special per-pte or per-vma handling...
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 3503ad6..118e981 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -122,21 +122,21 @@
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
#define alloc_bootmem_node(pgdat, x) \
({ \
- struct pglist_data __attribute__ ((unused)) \
+ struct pglist_data __maybe_unused \
*__alloc_bootmem_node__pgdat = (pgdat); \
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
__pa(MAX_DMA_ADDRESS)); \
})
#define alloc_bootmem_pages_node(pgdat, x) \
({ \
- struct pglist_data __attribute__ ((unused)) \
+ struct pglist_data __maybe_unused \
*__alloc_bootmem_node__pgdat = (pgdat); \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
__pa(MAX_DMA_ADDRESS)) \
})
#define alloc_bootmem_low_pages_node(pgdat, x) \
({ \
- struct pglist_data __attribute__ ((unused)) \
+ struct pglist_data __maybe_unused \
*__alloc_bootmem_node__pgdat = (pgdat); \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
})
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index 26861df..df21ea0 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -86,62 +86,50 @@
#define rdmsr(msr,val1,val2) \
do { \
- unsigned long long __val = native_read_msr(msr); \
- val1 = __val; \
- val2 = __val >> 32; \
+ u64 __val = native_read_msr(msr); \
+ (val1) = (u32)__val; \
+ (val2) = (u32)(__val >> 32); \
} while(0)
-#define wrmsr(msr,val1,val2) \
- native_write_msr(msr, ((unsigned long long)val2 << 32) | val1)
-
-#define rdmsrl(msr,val) \
- do { \
- (val) = native_read_msr(msr); \
- } while(0)
-
-static inline void wrmsrl (unsigned long msr, unsigned long long val)
+static inline void wrmsr(u32 __msr, u32 __low, u32 __high)
{
- unsigned long lo, hi;
- lo = (unsigned long) val;
- hi = val >> 32;
- wrmsr (msr, lo, hi);
+ native_write_msr(__msr, ((u64)__high << 32) | __low);
}
+#define rdmsrl(msr,val) \
+ ((val) = native_read_msr(msr))
+
+#define wrmsrl(msr,val) native_write_msr(msr, val)
+
/* wrmsr with exception handling */
-#define wrmsr_safe(msr,val1,val2) \
- (native_write_msr_safe(msr, ((unsigned long long)val2 << 32) | val1))
+static inline int wrmsr_safe(u32 __msr, u32 __low, u32 __high)
+{
+ return native_write_msr_safe(__msr, ((u64)__high << 32) | __low);
+}
/* rdmsr with exception handling */
#define rdmsr_safe(msr,p1,p2) \
({ \
int __err; \
- unsigned long long __val = native_read_msr_safe(msr, &__err);\
- (*p1) = __val; \
- (*p2) = __val >> 32; \
+ u64 __val = native_read_msr_safe(msr, &__err); \
+ (*p1) = (u32)__val; \
+ (*p2) = (u32)(__val >> 32); \
__err; \
})
-#define rdtsc(low,high) \
- do { \
- u64 _l = native_read_tsc(); \
- (low) = (u32)_l; \
- (high) = _l >> 32; \
- } while(0)
-
#define rdtscl(low) \
- do { \
- (low) = native_read_tsc(); \
- } while(0)
+ ((low) = (u32)native_read_tsc())
-#define rdtscll(val) ((val) = native_read_tsc())
+#define rdtscll(val) \
+ ((val) = native_read_tsc())
#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
#define rdpmc(counter,low,high) \
do { \
u64 _l = native_read_pmc(); \
- low = (u32)_l; \
- high = _l >> 32; \
+ (low) = (u32)_l; \
+ (high) = (u32)(_l >> 32); \
} while(0)
#endif /* !CONFIG_PARAVIRT */
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index e2e7f98..bc5c12c 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -560,11 +560,6 @@
{
return PVOP_CALL0(u64, read_tsc);
}
-#define rdtsc(low,high) do { \
- u64 _l = paravirt_read_tsc(); \
- low = (u32)_l; \
- high = _l >> 32; \
-} while(0)
#define rdtscl(low) do { \
u64 _l = paravirt_read_tsc(); \
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 090abc1..0c71327 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -124,20 +124,6 @@
return cpus_weight(cpu_callout_map);
}
-#ifdef CONFIG_X86_LOCAL_APIC
-
-#ifdef APIC_DEFINITION
-extern int hard_smp_processor_id(void);
-#else
-#include <mach_apicdef.h>
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
-}
-#endif
-#endif
-
extern int safe_smp_processor_id(void);
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
@@ -152,10 +138,31 @@
#define NO_PROC_ID 0xFF /* No processor magic marker */
-#endif
+#endif /* CONFIG_SMP */
#ifndef __ASSEMBLY__
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#ifdef APIC_DEFINITION
+extern int hard_smp_processor_id(void);
+#else
+#include <mach_apicdef.h>
+static inline int hard_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
+}
+#endif /* APIC_DEFINITION */
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+#ifndef CONFIG_SMP
+#define hard_smp_processor_id() 0
+#endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
extern u8 apicid_2_node[];
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index bf01d4b..4cb0f91 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -172,7 +172,7 @@
#define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */
#define TS_POLLING 0x0002 /* True if in idle loop and not sleeping */
-#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)
+#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
#endif /* __KERNEL__ */
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index 60fd4ae..c600249 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -38,6 +38,8 @@
return lid.f.id << 8 | lid.f.eid;
}
+#define hard_smp_processor_id() ia64_get_lid()
+
#ifdef CONFIG_SMP
#define XTP_OFFSET 0x1e0008
@@ -110,8 +112,6 @@
writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */
}
-#define hard_smp_processor_id() ia64_get_lid()
-
/* Upping and downing of CPUs */
extern int __cpu_disable (void);
extern void __cpu_die (unsigned int cpu);
@@ -128,7 +128,7 @@
extern void identify_siblings (struct cpuinfo_ia64 *);
extern int is_multithreading_enabled(void);
-#else
+#else /* CONFIG_SMP */
#define cpu_logical_id(i) 0
#define cpu_physical_id(i) ia64_get_lid()
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index 9169859..d281475 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -110,6 +110,6 @@
#define TS_POLLING 1 /* true if in idle loop and not sleeping */
-#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)
+#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
#endif /* _ASM_IA64_THREAD_INFO_H */
diff --git a/include/asm-m32r/smp.h b/include/asm-m32r/smp.h
index abd937a..078e1a5 100644
--- a/include/asm-m32r/smp.h
+++ b/include/asm-m32r/smp.h
@@ -108,6 +108,10 @@
#define IPI_SHIFT (0)
#define NR_IPIS (8)
-#endif /* CONFIG_SMP */
+#else /* CONFIG_SMP */
+
+#define hard_smp_processor_id() 0
+
+#endif /* CONFIG_SMP */
#endif /* _ASM_M32R_SMP_H */
diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h
index c4d622a..d635a37 100644
--- a/include/asm-m68k/thread_info.h
+++ b/include/asm-m68k/thread_info.h
@@ -37,17 +37,17 @@
#define init_stack (init_thread_union.stack)
#define task_thread_info(tsk) (&(tsk)->thread.info)
-#define task_stack_page(tsk) ((void *)(tsk)->thread_info)
+#define task_stack_page(tsk) ((tsk)->stack)
#define current_thread_info() task_thread_info(current)
#define __HAVE_THREAD_FUNCTIONS
#define setup_thread_stack(p, org) ({ \
- *(struct task_struct **)(p)->thread_info = (p); \
+ *(struct task_struct **)(p)->stack = (p); \
task_thread_info(p)->task = (p); \
})
-#define end_of_stack(p) ((unsigned long *)(p)->thread_info + 1)
+#define end_of_stack(p) ((unsigned long *)(p)->stack + 1)
/* entry.S relies on these definitions!
* bits 0-7 are tested at every exception exit
diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h
index 30f23a2..3713d25 100644
--- a/include/asm-mips/system.h
+++ b/include/asm-mips/system.h
@@ -55,7 +55,7 @@
if (cpu_has_dsp) \
__save_dsp(prev); \
next->thread.emulated_fp = 0; \
- (last) = resume(prev, next, next->thread_info); \
+ (last) = resume(prev, next, task_thread_info(next)); \
if (cpu_has_dsp) \
__restore_dsp(current); \
} while(0)
diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h
index fe85790..11f4222 100644
--- a/include/asm-parisc/compat.h
+++ b/include/asm-parisc/compat.h
@@ -152,7 +152,7 @@
static inline int __is_compat_task(struct task_struct *t)
{
- return test_ti_thread_flag(t->thread_info, TIF_32BIT);
+ return test_ti_thread_flag(task_thread_info(t), TIF_32BIT);
}
static inline int is_compat_task(void)
diff --git a/include/asm-powerpc/smp.h b/include/asm-powerpc/smp.h
index 01717f2..d037f50 100644
--- a/include/asm-powerpc/smp.h
+++ b/include/asm-powerpc/smp.h
@@ -83,6 +83,7 @@
#else
/* for UP */
+#define hard_smp_processor_id() 0
#define smp_setup_cpu_maps()
#endif /* CONFIG_SMP */
diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h
index 0a28e6d..76e424f 100644
--- a/include/asm-s390/smp.h
+++ b/include/asm-s390/smp.h
@@ -110,6 +110,7 @@
__load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK);
}
+#define hard_smp_processor_id() 0
#define smp_cpu_not_running(cpu) 1
#define smp_setup_cpu_possible_map() do { } while (0)
#endif
diff --git a/include/asm-sh/cpu-sh3/dma.h b/include/asm-sh/cpu-sh3/dma.h
index 954801b..3a66dc4 100644
--- a/include/asm-sh/cpu-sh3/dma.h
+++ b/include/asm-sh/cpu-sh3/dma.h
@@ -26,7 +26,7 @@
XMIT_SZ_128BIT,
};
-static unsigned int ts_shift[] __attribute__ ((used)) = {
+static unsigned int ts_shift[] __maybe_unused = {
[XMIT_SZ_8BIT] = 0,
[XMIT_SZ_16BIT] = 1,
[XMIT_SZ_32BIT] = 2,
diff --git a/include/asm-sh/cpu-sh4/dma-sh7780.h b/include/asm-sh/cpu-sh4/dma-sh7780.h
index 6c90d28..71b426a 100644
--- a/include/asm-sh/cpu-sh4/dma-sh7780.h
+++ b/include/asm-sh/cpu-sh4/dma-sh7780.h
@@ -28,7 +28,7 @@
/*
* The DMA count is defined as the number of bytes to transfer.
*/
-static unsigned int __attribute__ ((used)) ts_shift[] = {
+static unsigned int ts_shift[] __maybe_unused = {
[XMIT_SZ_8BIT] = 0,
[XMIT_SZ_16BIT] = 1,
[XMIT_SZ_32BIT] = 2,
diff --git a/include/asm-sh/cpu-sh4/dma.h b/include/asm-sh/cpu-sh4/dma.h
index c135e9c..36e26a9 100644
--- a/include/asm-sh/cpu-sh4/dma.h
+++ b/include/asm-sh/cpu-sh4/dma.h
@@ -53,7 +53,7 @@
/*
* The DMA count is defined as the number of bytes to transfer.
*/
-static unsigned int ts_shift[] __attribute__ ((used)) = {
+static unsigned int ts_shift[] __maybe_unused = {
[XMIT_SZ_64BIT] = 3,
[XMIT_SZ_8BIT] = 0,
[XMIT_SZ_16BIT] = 1,
diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h
index b9da9a6..b3f4922 100644
--- a/include/asm-sparc/smp.h
+++ b/include/asm-sparc/smp.h
@@ -165,6 +165,7 @@
#else /* SMP */
+#define hard_smp_processor_id() 0
#define smp_setup_cpu_possible_map() do { } while (0)
#endif /* !(SMP) */
diff --git a/include/asm-sparc64/smp.h b/include/asm-sparc64/smp.h
index cca5480..869d16f 100644
--- a/include/asm-sparc64/smp.h
+++ b/include/asm-sparc64/smp.h
@@ -48,6 +48,7 @@
#else
+#define hard_smp_processor_id() 0
#define smp_setup_cpu_possible_map() do { } while (0)
#define boot_cpu_id (0)
diff --git a/include/asm-um/required-features.h b/include/asm-um/required-features.h
new file mode 100644
index 0000000..dfb967b
--- /dev/null
+++ b/include/asm-um/required-features.h
@@ -0,0 +1,9 @@
+#ifndef __UM_REQUIRED_FEATURES_H
+#define __UM_REQUIRED_FEATURES_H
+
+/*
+ * Nothing to see, just need something for the i386 and x86_64 asm
+ * headers to include.
+ */
+
+#endif
diff --git a/include/asm-um/smp.h b/include/asm-um/smp.h
index ca55226..84f8cf2 100644
--- a/include/asm-um/smp.h
+++ b/include/asm-um/smp.h
@@ -24,6 +24,10 @@
extern struct task_struct *idle_threads[NR_CPUS];
+#else
+
+#define hard_smp_processor_id() 0
+
#endif
#endif
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index d570442..3f303d2 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -57,12 +57,6 @@
#define raw_smp_processor_id() read_pda(cpunumber)
-static inline int hard_smp_processor_id(void)
-{
- /* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
-}
-
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
extern void prefill_possible_map(void);
@@ -71,7 +65,13 @@
#define NO_PROC_ID 0xFF /* No processor magic marker */
-#endif
+#endif /* CONFIG_SMP */
+
+static inline int hard_smp_processor_id(void)
+{
+ /* we don't want to mark this access volatile - bad code generation */
+ return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
+}
/*
* Some lowlevel functions might want to know about
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index b7b8021..ead9f9a 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -39,7 +39,7 @@
[threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
[ti_flags] "i" (offsetof(struct thread_info, flags)),\
[tif_fork] "i" (TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
+ [thread_info] "i" (offsetof(struct task_struct, stack)), \
[pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
: "memory", "cc" __EXTRA_CLOBBER)
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index 74a6c74..10bb5a8 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -162,7 +162,7 @@
#define TS_COMPAT 0x0002 /* 32bit syscall active */
#define TS_POLLING 0x0004 /* true if in idle loop and not sleeping */
-#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)
+#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
#endif /* __KERNEL__ */
diff --git a/include/linux/aio.h b/include/linux/aio.h
index a30ef13..43dc2eb 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -226,7 +226,8 @@
__put_ioctx(kioctx); \
} while (0)
-#define in_aio() !is_sync_wait(current->io_wait)
+#define in_aio() (unlikely(!is_sync_wait(current->io_wait)))
+
/* may be used for debugging */
#define warn_if_async() \
do { \
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a686eab..db5b00a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -854,7 +854,7 @@
struct work_struct;
int kblockd_schedule_work(struct work_struct *work);
-void kblockd_flush(void);
+void kblockd_flush_work(struct work_struct *work);
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 2665ca0..bf297b0 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -49,6 +49,7 @@
* @shift: cycle to nanosecond divisor (power of two)
* @flags: flags describing special properties
* @vread: vsyscall based read
+ * @resume: resume function for the clocksource, if necessary
* @cycle_interval: Used internally by timekeeping core, please ignore.
* @xtime_interval: Used internally by timekeeping core, please ignore.
*/
@@ -65,6 +66,7 @@
u32 shift;
unsigned long flags;
cycle_t (*vread)(void);
+ void (*resume)(void);
/* timekeeping specific data, ignore */
cycle_t cycle_interval;
@@ -209,6 +211,7 @@
extern int clocksource_register(struct clocksource*);
extern struct clocksource* clocksource_get_next(void);
extern void clocksource_change_rating(struct clocksource *cs, int rating);
+extern void clocksource_resume(void);
#ifdef CONFIG_GENERIC_TIME_VSYSCALL
extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ccd863d..70a157a 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -253,5 +253,8 @@
const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize);
+asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename,
+ struct compat_timespec __user *t, int flags);
+
#endif /* CONFIG_COMPAT */
#endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index a9f7947..03ec231 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -40,3 +40,4 @@
#define noinline __attribute__((noinline))
#define __attribute_pure__ __attribute__((pure))
#define __attribute_const__ __attribute__((__const__))
+#define __maybe_unused __attribute__((unused))
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index ecd621f..a9e2863 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -4,9 +4,11 @@
#include <linux/compiler-gcc.h>
#if __GNUC_MINOR__ >= 3
-# define __attribute_used__ __attribute__((__used__))
+# define __used __attribute__((__used__))
+# define __attribute_used__ __used /* deprecated */
#else
-# define __attribute_used__ __attribute__((__unused__))
+# define __used __attribute__((__unused__))
+# define __attribute_used__ __used /* deprecated */
#endif
#if __GNUC_MINOR__ >= 4
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index fd0cc7c..a03e9398 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -12,7 +12,8 @@
# define __inline __inline __attribute__((always_inline))
#endif
-#define __attribute_used__ __attribute__((__used__))
+#define __used __attribute__((__used__))
+#define __attribute_used__ __used /* deprecated */
#define __must_check __attribute__((warn_unused_result))
#define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
#define __always_inline inline __attribute__((always_inline))
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 3b6949b..498c359 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -108,15 +108,30 @@
* Allow us to avoid 'defined but not used' warnings on functions and data,
* as well as force them to be emitted to the assembly file.
*
- * As of gcc 3.3, static functions that are not marked with attribute((used))
- * may be elided from the assembly file. As of gcc 3.3, static data not so
+ * As of gcc 3.4, static functions that are not marked with attribute((used))
+ * may be elided from the assembly file. As of gcc 3.4, static data not so
* marked will not be elided, but this may change in a future gcc version.
*
+ * NOTE: Because distributions shipped with a backported unit-at-a-time
+ * compiler in gcc 3.3, we must define __used to be __attribute__((used))
+ * for gcc >=3.3 instead of 3.4.
+ *
* In prior versions of gcc, such functions and data would be emitted, but
* would be warned about except with attribute((unused)).
+ *
+ * Mark functions that are referenced only in inline assembly as __used so
+ * the code is emitted even though it appears to be unreferenced.
*/
#ifndef __attribute_used__
-# define __attribute_used__ /* unimplemented */
+# define __attribute_used__ /* deprecated */
+#endif
+
+#ifndef __used
+# define __used /* unimplemented */
+#endif
+
+#ifndef __maybe_unused
+# define __maybe_unused /* unimplemented */
#endif
/*
diff --git a/include/linux/fb.h b/include/linux/fb.h
index dff7a728..c654d0e 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -868,7 +868,7 @@
#define fb_writeq sbus_writeq
#define fb_memset sbus_memset_io
-#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__)
+#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) || defined(__avr32__)
#define fb_readb __raw_readb
#define fb_readw __raw_readw
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 820125c..899fc7f 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -3,6 +3,8 @@
#include <linux/sched.h>
+union ktime;
+
/* Second argument to futex syscall */
@@ -15,6 +17,19 @@
#define FUTEX_LOCK_PI 6
#define FUTEX_UNLOCK_PI 7
#define FUTEX_TRYLOCK_PI 8
+#define FUTEX_CMP_REQUEUE_PI 9
+
+#define FUTEX_PRIVATE_FLAG 128
+#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG
+
+#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
+#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
/*
* Support for robust futexes: the kernel cleans up held futexes at
@@ -83,9 +98,14 @@
#define FUTEX_OWNER_DIED 0x40000000
/*
+ * Some processes have been requeued on this PI-futex
+ */
+#define FUTEX_WAITER_REQUEUED 0x20000000
+
+/*
* The rest of the robust-futex field is for the TID:
*/
-#define FUTEX_TID_MASK 0x3fffffff
+#define FUTEX_TID_MASK 0x0fffffff
/*
* This limit protects against a deliberately circular list.
@@ -94,7 +114,7 @@
#define ROBUST_LIST_LIMIT 2048
#ifdef __KERNEL__
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
extern int
@@ -106,9 +126,20 @@
* Don't rearrange members without looking at hash_futex().
*
* offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
+ * We use the two low order bits of offset to tell what is the kind of key :
+ * 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
+ * (no reference on an inode or mm)
+ * 01 : Shared futex (PTHREAD_PROCESS_SHARED)
+ * mapped on a file (reference on the underlying inode)
+ * 10 : Shared futex (PTHREAD_PROCESS_SHARED)
+ * (but private mapping on an mm, and reference taken on it)
+*/
+
+#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
+#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+
union futex_key {
+ u32 __user *uaddr;
struct {
unsigned long pgoff;
struct inode *inode;
@@ -125,7 +156,8 @@
int offset;
} both;
};
-int get_futex_key(u32 __user *uaddr, union futex_key *key);
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared,
+ union futex_key *key);
void get_futex_key_refs(union futex_key *key);
void drop_futex_key_refs(union futex_key *key);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2c65da7..f589559 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -413,6 +413,7 @@
extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
extern void add_partition(struct gendisk *, int, sector_t, sector_t, int);
extern void delete_partition(struct gendisk *, int);
+extern void printk_all_partitions(void);
extern struct gendisk *alloc_disk_node(int minors, int node_id);
extern struct gendisk *alloc_disk(int minors);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a36c3..0d2ef0b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@
#define free_page(addr) free_pages((addr),0)
void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index a515eb0..98e2cce 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -94,17 +94,26 @@
/*
* Same but also flushes aliased cache contents to RAM.
+ *
+ * This must be a macro because KM_USER0 and friends aren't defined if
+ * !CONFIG_HIGHMEM
*/
-static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size)
+#define zero_user_page(page, offset, size, km_type) \
+ do { \
+ void *kaddr; \
+ \
+ BUG_ON((offset) + (size) > PAGE_SIZE); \
+ \
+ kaddr = kmap_atomic(page, km_type); \
+ memset((char *)kaddr + (offset), 0, (size)); \
+ flush_dcache_page(page); \
+ kunmap_atomic(kaddr, (km_type)); \
+ } while (0)
+
+static inline void __deprecated memclear_highpage_flush(struct page *page,
+ unsigned int offset, unsigned int size)
{
- void *kaddr;
-
- BUG_ON(offset + size > PAGE_SIZE);
-
- kaddr = kmap_atomic(page, KM_USER0);
- memset((char *)kaddr + offset, 0, size);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, offset, size, KM_USER0);
}
#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7951023..45170b2 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -95,7 +95,7 @@
#define INIT_TASK(tsk) \
{ \
.state = 0, \
- .thread_info = &init_thread_info, \
+ .stack = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = 0, \
.lock_depth = -1, \
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 1c65e7a..00dd957 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -30,4 +30,7 @@
int kthread_stop(struct task_struct *k);
int kthread_should_stop(void);
+int kthreadd(void *unused);
+extern struct task_struct *kthreadd_task;
+
#endif /* _LINUX_KTHREAD_H */
diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 81bb9c7..c762954 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -43,7 +43,7 @@
* plain scalar nanosecond based representation can be selected by the
* config switch CONFIG_KTIME_SCALAR.
*/
-typedef union {
+union ktime {
s64 tv64;
#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
struct {
@@ -54,7 +54,9 @@
# endif
} tv;
#endif
-} ktime_t;
+};
+
+typedef union ktime ktime_t; /* Kill this */
#define KTIME_MAX ((s64)~((u64)1 << 63))
#if (BITS_PER_LONG == 64)
diff --git a/include/linux/mca.h b/include/linux/mca.h
index 5cff292..3797270 100644
--- a/include/linux/mca.h
+++ b/include/linux/mca.h
@@ -94,6 +94,7 @@
struct mca_driver {
const short *id_table;
void *driver_data;
+ int integrated_id;
struct device_driver driver;
};
#define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver)
@@ -125,6 +126,7 @@
extern struct bus_type mca_bus_type;
extern int mca_register_driver(struct mca_driver *drv);
+extern int mca_register_driver_integrated(struct mca_driver *, int);
extern void mca_unregister_driver(struct mca_driver *drv);
/* WARNING: only called by the boot time device setup */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f1544e..d09b134 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@
struct per_cpu_pageset {
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+#ifdef CONFIG_NUMA
+ s8 expire;
+#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/include/linux/module.h b/include/linux/module.h
index 6d3dc9c..792d483 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -356,6 +356,9 @@
keeping pointers to this stuff */
char *args;
};
+#ifndef MODULE_ARCH_INIT
+#define MODULE_ARCH_INIT {}
+#endif
/* FIXME: It'd be nice to isolate modules during init, too, so they
aren't used before they (may) fail. But presently too much code
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index b81bc2a..0d50ea3 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -121,11 +121,12 @@
* Also see Documentation/mutex-design.txt.
*/
extern void fastcall mutex_lock(struct mutex *lock);
-extern int fastcall mutex_lock_interruptible(struct mutex *lock);
+extern int __must_check fastcall mutex_lock_interruptible(struct mutex *lock);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
-extern int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass);
+extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass);
#else
# define mutex_lock_nested(lock, subclass) mutex_lock(lock)
# define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
diff --git a/include/linux/nfs4_acl.h b/include/linux/nfs4_acl.h
index 409b6e0..c9c05a7 100644
--- a/include/linux/nfs4_acl.h
+++ b/include/linux/nfs4_acl.h
@@ -44,7 +44,6 @@
#define NFS4_ACL_MAX 170
struct nfs4_acl *nfs4_acl_new(int);
-void nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
int nfs4_acl_get_whotype(char *, u32);
int nfs4_acl_write_who(int who, char *p);
int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 10a43ed..9431101 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -112,32 +112,40 @@
#ifdef __KERNEL__
-extern int atomic_notifier_chain_register(struct atomic_notifier_head *,
- struct notifier_block *);
-extern int blocking_notifier_chain_register(struct blocking_notifier_head *,
- struct notifier_block *);
-extern int raw_notifier_chain_register(struct raw_notifier_head *,
- struct notifier_block *);
-extern int srcu_notifier_chain_register(struct srcu_notifier_head *,
- struct notifier_block *);
+extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+ struct notifier_block *nb);
+extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *nb);
+extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
+ struct notifier_block *nb);
+extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
+ struct notifier_block *nb);
-extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *,
- struct notifier_block *);
-extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *,
- struct notifier_block *);
-extern int raw_notifier_chain_unregister(struct raw_notifier_head *,
- struct notifier_block *);
-extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *,
- struct notifier_block *);
+extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+ struct notifier_block *nb);
+extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+ struct notifier_block *nb);
+extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+ struct notifier_block *nb);
+extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
+ struct notifier_block *nb);
-extern int atomic_notifier_call_chain(struct atomic_notifier_head *,
+extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v);
-extern int blocking_notifier_call_chain(struct blocking_notifier_head *,
+extern int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
unsigned long val, void *v);
-extern int raw_notifier_call_chain(struct raw_notifier_head *,
+extern int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v);
-extern int srcu_notifier_call_chain(struct srcu_notifier_head *,
+extern int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
+extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
unsigned long val, void *v);
+extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v, int nr_to_call, int *nr_calls);
#define NOTIFY_DONE 0x0000 /* Don't care */
#define NOTIFY_OK 0x0001 /* Suits me */
@@ -186,6 +194,20 @@
#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */
#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
+#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */
+#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */
+
+/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
+ * operation in progress
+ */
+#define CPU_TASKS_FROZEN 0x0010
+
+#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)
+#define CPU_UP_PREPARE_FROZEN (CPU_UP_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_UP_CANCELED_FROZEN (CPU_UP_CANCELED | CPU_TASKS_FROZEN)
+#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
+#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
+#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
#endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 6e8fa30..87545e0 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -107,26 +107,11 @@
#define PM_SUSPEND_ON ((__force suspend_state_t) 0)
#define PM_SUSPEND_STANDBY ((__force suspend_state_t) 1)
#define PM_SUSPEND_MEM ((__force suspend_state_t) 3)
-#define PM_SUSPEND_DISK ((__force suspend_state_t) 4)
-#define PM_SUSPEND_MAX ((__force suspend_state_t) 5)
-
-typedef int __bitwise suspend_disk_method_t;
-
-/* invalid must be 0 so struct pm_ops initialisers can leave it out */
-#define PM_DISK_INVALID ((__force suspend_disk_method_t) 0)
-#define PM_DISK_PLATFORM ((__force suspend_disk_method_t) 1)
-#define PM_DISK_SHUTDOWN ((__force suspend_disk_method_t) 2)
-#define PM_DISK_REBOOT ((__force suspend_disk_method_t) 3)
-#define PM_DISK_TEST ((__force suspend_disk_method_t) 4)
-#define PM_DISK_TESTPROC ((__force suspend_disk_method_t) 5)
-#define PM_DISK_MAX ((__force suspend_disk_method_t) 6)
+#define PM_SUSPEND_MAX ((__force suspend_state_t) 4)
/**
* struct pm_ops - Callbacks for managing platform dependent suspend states.
* @valid: Callback to determine whether the given state can be entered.
- * If %CONFIG_SOFTWARE_SUSPEND is set then %PM_SUSPEND_DISK is
- * always valid and never passed to this call. If not assigned,
- * no suspend states are valid.
* Valid states are advertised in /sys/power/state but can still
* be rejected by prepare or enter if the conditions aren't right.
* There is a %pm_valid_only_mem function available that can be assigned
@@ -140,24 +125,12 @@
*
* @finish: Called when the system has left the given state and all devices
* are resumed. The return value is ignored.
- *
- * @pm_disk_mode: The generic code always allows one of the shutdown methods
- * %PM_DISK_SHUTDOWN, %PM_DISK_REBOOT, %PM_DISK_TEST and
- * %PM_DISK_TESTPROC. If this variable is set, the mode it is set
- * to is allowed in addition to those modes and is also made default.
- * When this mode is sent selected, the @prepare call will be called
- * before suspending to disk (if present), the @enter call should be
- * present and will be called after all state has been saved and the
- * machine is ready to be powered off; the @finish callback is called
- * after state has been restored. All these calls are called with
- * %PM_SUSPEND_DISK as the state.
*/
struct pm_ops {
int (*valid)(suspend_state_t state);
int (*prepare)(suspend_state_t state);
int (*enter)(suspend_state_t state);
int (*finish)(suspend_state_t state);
- suspend_disk_method_t pm_disk_mode;
};
/**
@@ -276,8 +249,6 @@
extern void device_resume(void);
#ifdef CONFIG_PM
-extern suspend_disk_method_t pm_disk_mode;
-
extern int device_suspend(pm_message_t state);
extern int device_prepare_suspend(pm_message_t state);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index de72c49..a121f36 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -201,7 +201,6 @@
struct mutex reconfig_mutex;
atomic_t active;
- int changed; /* true if we might need to reread partition info */
int degraded; /* whether md should consider
* adding a spare
*/
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 759a0f9..6cd8c44 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/fs.h>
@@ -38,7 +39,7 @@
size_t subbufs_consumed; /* count of sub-buffers consumed */
struct rchan *chan; /* associated channel */
wait_queue_head_t read_wait; /* reader wait queue */
- struct delayed_work wake_readers; /* reader wake-up work struct */
+ struct timer_list timer; /* reader wake-up timer */
struct dentry *dentry; /* channel file dentry */
struct kref kref; /* channel buffer refcount */
struct page **page_array; /* array of current buffer pages */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d95c48..17b72d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -817,7 +817,7 @@
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
- struct thread_info *thread_info;
+ void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
@@ -1317,6 +1317,7 @@
extern void proc_caches_init(void);
extern void flush_signals(struct task_struct *);
+extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
@@ -1512,8 +1513,8 @@
#ifndef __HAVE_THREAD_FUNCTIONS
-#define task_thread_info(task) (task)->thread_info
-#define task_stack_page(task) ((void*)((task)->thread_info))
+#define task_thread_info(task) ((struct thread_info *)(task)->stack)
+#define task_stack_page(task) ((task)->stack)
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
@@ -1523,7 +1524,7 @@
static inline unsigned long *end_of_stack(struct task_struct *p)
{
- return (unsigned long *)(p->thread_info + 1);
+ return (unsigned long *)(task_thread_info(p) + 1);
}
#endif
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 1474905..3fa0fab 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -243,6 +243,131 @@
extern struct kmem_cache *sighand_cachep;
+/*
+ * In POSIX a signal is sent either to a specific thread (Linux task)
+ * or to the process as a whole (Linux thread group). How the signal
+ * is sent determines whether it's to one thread or the whole group,
+ * which determines which signal mask(s) are involved in blocking it
+ * from being delivered until later. When the signal is delivered,
+ * either it's caught or ignored by a user handler or it has a default
+ * effect that applies to the whole thread group (POSIX process).
+ *
+ * The possible effects an unblocked signal set to SIG_DFL can have are:
+ * ignore - Nothing Happens
+ * terminate - kill the process, i.e. all threads in the group,
+ * similar to exit_group. The group leader (only) reports
+ * WIFSIGNALED status to its parent.
+ * coredump - write a core dump file describing all threads using
+ * the same mm and then kill all those threads
+ * stop - stop all the threads in the group, i.e. TASK_STOPPED state
+ *
+ * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
+ * Other signals when not blocked and set to SIG_DFL behaves as follows.
+ * The job control signals also have other special effects.
+ *
+ * +--------------------+------------------+
+ * | POSIX signal | default action |
+ * +--------------------+------------------+
+ * | SIGHUP | terminate |
+ * | SIGINT | terminate |
+ * | SIGQUIT | coredump |
+ * | SIGILL | coredump |
+ * | SIGTRAP | coredump |
+ * | SIGABRT/SIGIOT | coredump |
+ * | SIGBUS | coredump |
+ * | SIGFPE | coredump |
+ * | SIGKILL | terminate(+) |
+ * | SIGUSR1 | terminate |
+ * | SIGSEGV | coredump |
+ * | SIGUSR2 | terminate |
+ * | SIGPIPE | terminate |
+ * | SIGALRM | terminate |
+ * | SIGTERM | terminate |
+ * | SIGCHLD | ignore |
+ * | SIGCONT | ignore(*) |
+ * | SIGSTOP | stop(*)(+) |
+ * | SIGTSTP | stop(*) |
+ * | SIGTTIN | stop(*) |
+ * | SIGTTOU | stop(*) |
+ * | SIGURG | ignore |
+ * | SIGXCPU | coredump |
+ * | SIGXFSZ | coredump |
+ * | SIGVTALRM | terminate |
+ * | SIGPROF | terminate |
+ * | SIGPOLL/SIGIO | terminate |
+ * | SIGSYS/SIGUNUSED | coredump |
+ * | SIGSTKFLT | terminate |
+ * | SIGWINCH | ignore |
+ * | SIGPWR | terminate |
+ * | SIGRTMIN-SIGRTMAX | terminate |
+ * +--------------------+------------------+
+ * | non-POSIX signal | default action |
+ * +--------------------+------------------+
+ * | SIGEMT | coredump |
+ * +--------------------+------------------+
+ *
+ * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
+ * (*) Special job control effects:
+ * When SIGCONT is sent, it resumes the process (all threads in the group)
+ * from TASK_STOPPED state and also clears any pending/queued stop signals
+ * (any of those marked with "stop(*)"). This happens regardless of blocking,
+ * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
+ * any pending/queued SIGCONT signals; this happens regardless of blocking,
+ * catching, or ignored the stop signal, though (except for SIGSTOP) the
+ * default action of stopping the process may happen later or never.
+ */
+
+#ifdef SIGEMT
+#define SIGEMT_MASK rt_sigmask(SIGEMT)
+#else
+#define SIGEMT_MASK 0
+#endif
+
+#if SIGRTMIN > BITS_PER_LONG
+#define rt_sigmask(sig) (1ULL << ((sig)-1))
+#else
+#define rt_sigmask(sig) sigmask(sig)
+#endif
+#define siginmask(sig, mask) (rt_sigmask(sig) & (mask))
+
+#define SIG_KERNEL_ONLY_MASK (\
+ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP))
+
+#define SIG_KERNEL_STOP_MASK (\
+ rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \
+ rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) )
+
+#define SIG_KERNEL_COREDUMP_MASK (\
+ rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \
+ rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \
+ rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \
+ rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \
+ rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \
+ SIGEMT_MASK )
+
+#define SIG_KERNEL_IGNORE_MASK (\
+ rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \
+ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) )
+
+#define sig_kernel_only(sig) \
+ (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_ONLY_MASK))
+#define sig_kernel_coredump(sig) \
+ (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))
+#define sig_kernel_ignore(sig) \
+ (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_IGNORE_MASK))
+#define sig_kernel_stop(sig) \
+ (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_STOP_MASK))
+
+#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
+
+#define sig_user_defined(t, signr) \
+ (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
+ ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
+
+#define sig_fatal(t, signr) \
+ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
+ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SIGNAL_H */
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7ba23ec..3f70149 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -83,7 +83,6 @@
* These macros fold the SMP functionality into a single CPU system
*/
#define raw_smp_processor_id() 0
-#define hard_smp_processor_id() 0
static inline int up_smp_call_function(void)
{
return 0;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 35fa4d5..4a7ae8a 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -396,4 +396,23 @@
#define RPC_MAX_ADDRBUFLEN (63U)
+/*
+ * When we want to reduce the size of the reserved space in the response
+ * buffer, we need to take into account the size of any checksum data that
+ * may be at the end of the packet. This is difficult to determine exactly
+ * for all cases without actually generating the checksum, so we just use a
+ * static value.
+ */
+static inline void
+svc_reserve_auth(struct svc_rqst *rqstp, int space)
+{
+ int added_space = 0;
+
+ switch(rqstp->rq_authop->flavour) {
+ case RPC_AUTH_GSS:
+ added_space = RPC_MAX_AUTH_SIZE;
+ }
+ return svc_reserve(rqstp, space + added_space);
+}
+
#endif /* SUNRPC_SVC_H */
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index 7909687..e21dd93 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -37,7 +37,8 @@
atomic_t sk_reserved; /* space on outq that is reserved */
- spinlock_t sk_defer_lock; /* protects sk_deferred */
+ spinlock_t sk_lock; /* protects sk_deferred and
+ * sk_info_authunix */
struct list_head sk_deferred; /* deferred requests that need to
* be revisted */
struct mutex sk_mutex; /* to serialize sending data */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 9d2aa1a..d74da91 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -32,6 +32,24 @@
static inline void pm_restore_console(void) {}
#endif
+/**
+ * struct hibernation_ops - hibernation platform support
+ *
+ * The methods in this structure allow a platform to override the default
+ * mechanism of shutting down the machine during a hibernation transition.
+ *
+ * All three methods must be assigned.
+ *
+ * @prepare: prepare system for hibernation
+ * @enter: shut down system after state has been saved to disk
+ * @finish: finish/clean up after state has been reloaded
+ */
+struct hibernation_ops {
+ int (*prepare)(void);
+ int (*enter)(void);
+ void (*finish)(void);
+};
+
#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND)
/* kernel/power/snapshot.c */
extern void __init register_nosave_region(unsigned long, unsigned long);
@@ -39,11 +57,17 @@
extern void swsusp_set_page_free(struct page *);
extern void swsusp_unset_page_free(struct page *);
extern unsigned long get_safe_page(gfp_t gfp_mask);
+
+extern void hibernation_set_ops(struct hibernation_ops *ops);
+extern int hibernate(void);
#else
static inline void register_nosave_region(unsigned long b, unsigned long e) {}
static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
static inline void swsusp_set_page_free(struct page *p) {}
static inline void swsusp_unset_page_free(struct page *p) {}
+
+static inline void hibernation_set_ops(struct hibernation_ops *ops) {}
+static inline int hibernate(void) { return -ENOSYS; }
#endif /* defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) */
void save_processor_state(void);
diff --git a/include/linux/svga.h b/include/linux/svga.h
index e1cc552..13ad0b8 100644
--- a/include/linux/svga.h
+++ b/include/linux/svga.h
@@ -113,6 +113,8 @@
void svga_tileblit(struct fb_info *info, struct fb_tileblit *blit);
void svga_tilecursor(struct fb_info *info, struct fb_tilecursor *cursor);
int svga_get_tilemax(struct fb_info *info);
+void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps,
+ struct fb_var_screeninfo *var);
int svga_compute_pll(const struct svga_pll *pll, u32 f_wanted, u16 *m, u16 *n, u16 *r, int node);
int svga_check_timings(const struct svga_timing_regs *tm, struct fb_var_screeninfo *var, int node);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6c..3139f44 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -576,6 +576,8 @@
struct stat64 __user *statbuf, int flag);
asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
int bufsiz);
+asmlinkage long sys_utimensat(int dfd, char __user *filename,
+ struct timespec __user *utimes, int flags);
asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename,
struct compat_timeval __user *t);
asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index acb1f10..d9325cf 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -212,8 +212,6 @@
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
void refresh_cpu_vm_stats(int);
-void refresh_vm_stats(void);
-
#else /* CONFIG_SMP */
/*
@@ -260,7 +258,6 @@
#define mod_zone_page_state __mod_zone_page_state
static inline void refresh_cpu_vm_stats(int cpu) { }
-static inline void refresh_vm_stats(void) { }
#endif
#endif /* _LINUX_VMSTAT_H */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f16ba1e..d555f31 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -24,15 +24,13 @@
struct work_struct {
atomic_long_t data;
#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
-#define WORK_STRUCT_NOAUTOREL 1 /* F if work item automatically released on exec */
#define WORK_STRUCT_FLAG_MASK (3UL)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
struct list_head entry;
work_func_t func;
};
-#define WORK_DATA_INIT(autorelease) \
- ATOMIC_LONG_INIT((autorelease) << WORK_STRUCT_NOAUTOREL)
+#define WORK_DATA_INIT() ATOMIC_LONG_INIT(0)
struct delayed_work {
struct work_struct work;
@@ -44,14 +42,8 @@
};
#define __WORK_INITIALIZER(n, f) { \
- .data = WORK_DATA_INIT(0), \
- .entry = { &(n).entry, &(n).entry }, \
- .func = (f), \
- }
-
-#define __WORK_INITIALIZER_NAR(n, f) { \
- .data = WORK_DATA_INIT(1), \
- .entry = { &(n).entry, &(n).entry }, \
+ .data = WORK_DATA_INIT(), \
+ .entry = { &(n).entry, &(n).entry }, \
.func = (f), \
}
@@ -60,23 +52,12 @@
.timer = TIMER_INITIALIZER(NULL, 0, 0), \
}
-#define __DELAYED_WORK_INITIALIZER_NAR(n, f) { \
- .work = __WORK_INITIALIZER_NAR((n).work, (f)), \
- .timer = TIMER_INITIALIZER(NULL, 0, 0), \
- }
-
#define DECLARE_WORK(n, f) \
struct work_struct n = __WORK_INITIALIZER(n, f)
-#define DECLARE_WORK_NAR(n, f) \
- struct work_struct n = __WORK_INITIALIZER_NAR(n, f)
-
#define DECLARE_DELAYED_WORK(n, f) \
struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
-#define DECLARE_DELAYED_WORK_NAR(n, f) \
- struct dwork_struct n = __DELAYED_WORK_INITIALIZER_NAR(n, f)
-
/*
* initialize a work item's function pointer
*/
@@ -95,16 +76,9 @@
* assignment of the work data initializer allows the compiler
* to generate better code.
*/
-#define INIT_WORK(_work, _func) \
- do { \
- (_work)->data = (atomic_long_t) WORK_DATA_INIT(0); \
- INIT_LIST_HEAD(&(_work)->entry); \
- PREPARE_WORK((_work), (_func)); \
- } while (0)
-
-#define INIT_WORK_NAR(_work, _func) \
+#define INIT_WORK(_work, _func) \
do { \
- (_work)->data = (atomic_long_t) WORK_DATA_INIT(1); \
+ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
INIT_LIST_HEAD(&(_work)->entry); \
PREPARE_WORK((_work), (_func)); \
} while (0)
@@ -115,12 +89,6 @@
init_timer(&(_work)->timer); \
} while (0)
-#define INIT_DELAYED_WORK_NAR(_work, _func) \
- do { \
- INIT_WORK_NAR(&(_work)->work, (_func)); \
- init_timer(&(_work)->timer); \
- } while (0)
-
#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \
do { \
INIT_WORK(&(_work)->work, (_func)); \
@@ -143,24 +111,10 @@
work_pending(&(w)->work)
/**
- * work_release - Release a work item under execution
- * @work: The work item to release
- *
- * This is used to release a work item that has been initialised with automatic
- * release mode disabled (WORK_STRUCT_NOAUTOREL is set). This gives the work
- * function the opportunity to grab auxiliary data from the container of the
- * work_struct before clearing the pending bit as the work_struct may be
- * subject to deallocation the moment the pending bit is cleared.
- *
- * In such a case, this should be called in the work function after it has
- * fetched any data it may require from the containter of the work_struct.
- * After this function has been called, the work_struct may be scheduled for
- * further execution or it may be deallocated unless other precautions are
- * taken.
- *
- * This should also be used to release a delayed work item.
+ * work_clear_pending - for internal use only, mark a work item as not pending
+ * @work: The work item in question
*/
-#define work_release(work) \
+#define work_clear_pending(work) \
clear_bit(WORK_STRUCT_PENDING, work_data_bits(work))
@@ -174,27 +128,28 @@
extern void destroy_workqueue(struct workqueue_struct *wq);
extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work));
-extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay));
+extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *work, unsigned long delay));
extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct delayed_work *work, unsigned long delay);
+ struct delayed_work *work, unsigned long delay);
+
extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq));
+extern void flush_scheduled_work(void);
extern int FASTCALL(schedule_work(struct work_struct *work));
-extern int FASTCALL(run_scheduled_work(struct work_struct *work));
-extern int FASTCALL(schedule_delayed_work(struct delayed_work *work, unsigned long delay));
-
-extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, unsigned long delay);
+extern int FASTCALL(schedule_delayed_work(struct delayed_work *work,
+ unsigned long delay));
+extern int schedule_delayed_work_on(int cpu, struct delayed_work *work,
+ unsigned long delay);
extern int schedule_on_each_cpu(work_func_t func);
-extern void flush_scheduled_work(void);
extern int current_is_keventd(void);
extern int keventd_up(void);
extern void init_workqueues(void);
-void cancel_rearming_delayed_work(struct delayed_work *work);
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
- struct delayed_work *);
int execute_in_process_context(work_func_t fn, struct execute_work *);
+extern void cancel_work_sync(struct work_struct *work);
+
/*
* Kill off a pending schedule_delayed_work(). Note that the work callback
* function may still be running on return from cancel_delayed_work(), unless
@@ -207,8 +162,18 @@
ret = del_timer(&work->timer);
if (ret)
- work_release(&work->work);
+ work_clear_pending(&work->work);
return ret;
}
+extern void cancel_rearming_delayed_work(struct delayed_work *work);
+
+/* Obsolete. use cancel_rearming_delayed_work() */
+static inline
+void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
+ struct delayed_work *work)
+{
+ cancel_rearming_delayed_work(work);
+}
+
#endif
diff --git a/init/Kconfig b/init/Kconfig
index a7e4879..e63a017 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -502,6 +502,15 @@
on EMBEDDED systems. /proc/vmstat will only show page counts
if VM event counters are disabled.
+config SLUB_DEBUG
+ default y
+ bool "Enable SLUB debugging support" if EMBEDDED
+ help
+ SLUB has extensive debug support features. Disabling these can
+ result in significant savings in code size. This also disables
+ SLUB sysfs support. /sys/slab will not exist and there will be
+ no support for cache validation etc.
+
choice
prompt "Choose SLAB allocator"
default SLAB
@@ -512,9 +521,9 @@
bool "SLAB"
help
The regular slab allocator that is established and known to work
- well in all environments. It organizes chache hot objects in
+ well in all environments. It organizes cache hot objects in
per cpu and per node queues. SLAB is the default choice for
- slab allocator.
+ a slab allocator.
config SLUB
depends on EXPERIMENTAL && !ARCH_USES_SLAB_PAGE_STRUCT
@@ -524,21 +533,20 @@
instead of managing queues of cached objects (SLAB approach).
Per cpu caching is realized using slabs of objects instead
of queues of objects. SLUB can use memory efficiently
- way and has enhanced diagnostics.
+ and has enhanced diagnostics.
config SLOB
#
-# SLOB cannot support SMP because SLAB_DESTROY_BY_RCU does not work
-# properly.
+# SLOB does not support SMP because SLAB_DESTROY_BY_RCU is unsupported
#
depends on EMBEDDED && !SMP && !SPARSEMEM
bool "SLOB (Simple Allocator)"
help
SLOB replaces the SLAB allocator with a drastically simpler
allocator. SLOB is more space efficient that SLAB but does not
- scale well (single lock for all operations) and is more susceptible
- to fragmentation. SLOB it is a great choice to reduce
- memory usage and code size for embedded systems.
+ scale well (single lock for all operations) and is also highly
+ susceptible to fragmentation. SLUB can accomplish a higher object
+ density. It is usually better to use SLUB instead of SLOB.
endchoice
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 3f57ed4..46fe407 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -7,6 +7,7 @@
#include <linux/root_dev.h>
#include <linux/security.h>
#include <linux/delay.h>
+#include <linux/genhd.h>
#include <linux/mount.h>
#include <linux/device.h>
#include <linux/init.h>
@@ -308,17 +309,21 @@
/*
* Allow the user to distinguish between failed sys_open
* and bad superblock on root device.
+ * and give them a list of the available devices
*/
#ifdef CONFIG_BLOCK
__bdevname(ROOT_DEV, b);
#endif
printk("VFS: Cannot open root device \"%s\" or %s\n",
root_device_name, b);
- printk("Please append a correct \"root=\" boot option\n");
+ printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
+ printk_all_partitions();
panic("VFS: Unable to mount root fs on %s", b);
}
+ printk("List of all partitions:\n");
+ printk_all_partitions();
printk("No filesystem could mount root, tried: ");
for (p = fs_names; *p; p += strlen(p)+1)
printk(" %s", p);
diff --git a/init/main.c b/init/main.c
index c1537e0..e8d080c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -54,6 +54,7 @@
#include <linux/lockdep.h>
#include <linux/pid_namespace.h>
#include <linux/device.h>
+#include <linux/kthread.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -425,8 +426,12 @@
static void noinline rest_init(void)
__releases(kernel_lock)
{
+ int pid;
+
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
numa_default_policy();
+ pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
+ kthreadd_task = find_task_by_pid(pid);
unlock_kernel();
/*
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb2..e84d3f9 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
- loff_t pos = *offset;
- ssize_t count;
-
- if (pos >= kernel_config_data_size)
- return 0;
-
- count = min(len, (size_t)(kernel_config_data_size - pos));
- if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
- return -EFAULT;
-
- *offset += count;
- return count;
+ return simple_read_from_buffer(buf, len, offset,
+ kernel_config_data + MAGIC_SIZE,
+ kernel_config_data_size);
}
static const struct file_operations ikconfig_file_ops = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e7084..208cf34 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
- (state = %ld, flags = %lx) \n",
+ (state = %ld, flags = %x) \n",
p->comm, p->pid, cpu, p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@
}
/* Requires cpu_add_remove_lock to be held */
-static int _cpu_down(unsigned int cpu)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
{
- int err;
+ int err, nr_calls = 0;
struct task_struct *p;
cpumask_t old_allowed, tmp;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (num_online_cpus() == 1)
return -EBUSY;
@@ -132,12 +134,16 @@
if (!cpu_online(cpu))
return -EINVAL;
- err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
- (void *)(long)cpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+ err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
+ hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
+ __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
__FUNCTION__, cpu);
- return -EINVAL;
+ err = -EINVAL;
+ goto out_release;
}
/* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@
if (IS_ERR(p) || cpu_online(cpu)) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
- (void *)(long)cpu) == NOTIFY_BAD)
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
+ hcpu) == NOTIFY_BAD)
BUG();
if (IS_ERR(p)) {
@@ -170,13 +176,9 @@
/* This actually kills the CPU. */
__cpu_die(cpu);
- /* Move it here so it can run. */
- kthread_bind(p, get_cpu());
- put_cpu();
-
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
- (void *)(long)cpu) == NOTIFY_BAD)
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
+ hcpu) == NOTIFY_BAD)
BUG();
check_for_tasks(cpu);
@@ -185,6 +187,8 @@
err = kthread_stop(p);
out_allowed:
set_cpus_allowed(current, old_allowed);
+out_release:
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
return err;
}
@@ -196,7 +200,7 @@
if (cpu_hotplug_disabled)
err = -EBUSY;
else
- err = _cpu_down(cpu);
+ err = _cpu_down(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
return err;
@@ -204,15 +208,18 @@
#endif /*CONFIG_HOTPLUG_CPU*/
/* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu)
+static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
{
- int ret;
+ int ret, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
- ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+ ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
+ -1, &nr_calls);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
@@ -229,12 +236,13 @@
BUG_ON(!cpu_online(cpu));
/* Now call notifier in preparation. */
- raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
out_notify:
if (ret != 0)
- raw_notifier_call_chain(&cpu_chain,
- CPU_UP_CANCELED, hcpu);
+ __raw_notifier_call_chain(&cpu_chain,
+ CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
return ret;
}
@@ -247,19 +255,13 @@
if (cpu_hotplug_disabled)
err = -EBUSY;
else
- err = _cpu_up(cpu);
+ err = _cpu_up(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
return err;
}
#ifdef CONFIG_SUSPEND_SMP
-/* Needed to prevent the microcode driver from requesting firmware in its CPU
- * hotplug notifier during the suspend/resume.
- */
-int suspend_cpu_hotplug;
-EXPORT_SYMBOL(suspend_cpu_hotplug);
-
static cpumask_t frozen_cpus;
int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@
int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
- suspend_cpu_hotplug = 1;
first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
- error = _cpu_down(cpu);
+ error = _cpu_down(cpu, 1);
if (!error) {
cpu_set(cpu, frozen_cpus);
printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
- suspend_cpu_hotplug = 0;
mutex_unlock(&cpu_add_remove_lock);
return error;
}
@@ -309,10 +309,9 @@
if (cpus_empty(frozen_cpus))
goto out;
- suspend_cpu_hotplug = 1;
printk("Enabling non-boot CPUs ...\n");
for_each_cpu_mask(cpu, frozen_cpus) {
- error = _cpu_up(cpu);
+ error = _cpu_up(cpu, 1);
if (!error) {
printk("CPU%d is up\n", cpu);
continue;
@@ -320,7 +319,6 @@
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
}
cpus_clear(frozen_cpus);
- suspend_cpu_hotplug = 0;
out:
mutex_unlock(&cpu_add_remove_lock);
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 88b416d..f57854b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1772,12 +1772,7 @@
{
struct ctr_struct *ctr = file->private_data;
- if (*ppos + nbytes > ctr->bufsz)
- nbytes = ctr->bufsz - *ppos;
- if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
- return -EFAULT;
- *ppos += nbytes;
- return nbytes;
+ return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
}
static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
diff --git a/kernel/exit.c b/kernel/exit.c
index f5a7abb..b0c6f0c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
+#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
@@ -254,26 +255,25 @@
}
/**
- * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to.
+ * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
*
* If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to init so that
- * it is correctly cleaned up on exit.
+ * it ever exits, it should generally reparent itself to kthreadd so it
+ * isn't in the way of other processes and is correctly cleaned up on exit.
*
* The various task state such as scheduling policy and priority may have
* been inherited from a user process, so we reset them to sane values here.
*
- * NOTE that reparent_to_init() gives the caller full capabilities.
+ * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
*/
-static void reparent_to_init(void)
+static void reparent_to_kthreadd(void)
{
write_lock_irq(&tasklist_lock);
ptrace_unlink(current);
/* Reparent to init */
remove_parent(current);
- current->parent = child_reaper(current);
- current->real_parent = child_reaper(current);
+ current->real_parent = current->parent = kthreadd_task;
add_parent(current);
/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -347,7 +347,7 @@
return -EINVAL;
spin_lock_irq(¤t->sighand->siglock);
- sigaddset(¤t->blocked, sig);
+ current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
return 0;
@@ -400,7 +400,7 @@
current->files = init_task.files;
atomic_inc(¤t->files->count);
- reparent_to_init();
+ reparent_to_kthreadd();
}
EXPORT_SYMBOL(daemonize);
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d..5dd3979 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@
void free_task(struct task_struct *tsk)
{
- free_thread_info(tsk->thread_info);
+ free_thread_info(tsk->stack);
rt_mutex_debug_task_free(tsk);
free_task_struct(tsk);
}
@@ -175,7 +175,7 @@
}
*tsk = *orig;
- tsk->thread_info = ti;
+ tsk->stack = ti;
setup_thread_stack(tsk, orig);
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d..b7ce15c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
#include "rtmutex_common.h"
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -81,12 +90,12 @@
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiters, then make the second condition true.
*/
struct futex_q {
- struct list_head list;
+ struct plist_node list;
wait_queue_head_t waiters;
/* Which hash list lock to use: */
@@ -102,14 +111,20 @@
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
+
+ /*
+ * This waiter is used in case of requeue from a
+ * normal futex to a PI-futex
+ */
+ struct rt_mutex_waiter waiter;
};
/*
* Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
- spinlock_t lock;
- struct list_head chain;
+ spinlock_t lock;
+ struct plist_head chain;
};
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@
&& key1->both.offset == key2->both.offset);
}
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ * ¤t->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
*
* For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to ¤t->mm->mmap_sem and
+ * caller must have taken the reader lock. but NOT any spinlocks.
*/
-int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+ union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((key->both.offset % sizeof(u32)) != 0))
+ if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
address -= key->both.offset;
/*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ return -EFAULT;
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+ /*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
@@ -180,6 +216,9 @@
if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+ /* Save the user address in the ley */
+ key->uaddr = uaddr;
+
/*
* Private mappings are handled in a simple way.
*
@@ -190,6 +229,7 @@
* mappings of _writable_ handles.
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
key->private.mm = mm;
key->private.address = address;
return 0;
@@ -199,7 +239,7 @@
* Linear file mappings are also simple.
*/
key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
@@ -227,16 +267,18 @@
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
inline void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
atomic_inc(&key->shared.inode->i_count);
- else
+ break;
+ case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
+ break;
}
}
EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -247,11 +289,15 @@
*/
void drop_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
iput(key->shared.inode);
- else
+ break;
+ case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
+ break;
}
}
EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@
}
/*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if fshared is non NULL, current->mm->mmap_sem is already held
*/
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+ struct rw_semaphore *fshared, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
+ int ret = -EFAULT;
- if (attempt > 2 || !(vma = find_vma(mm, address)) ||
- vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
- return -EFAULT;
+ if (attempt > 2)
+ return ret;
- switch (handle_mm_fault(mm, vma, address, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
- return -EFAULT;
+ if (!fshared)
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (vma && address >= vma->vm_start &&
+ (vma->vm_flags & VM_WRITE)) {
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ ret = 0;
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ ret = 0;
+ current->maj_flt++;
+ break;
+ }
}
- return 0;
+ if (!fshared)
+ up_read(&mm->mmap_sem);
+ return ret;
}
/*
@@ -439,18 +495,19 @@
}
static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
struct task_struct *p;
pid_t pid;
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
- if (match_futex(&this->key, &me->key)) {
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex(&this->key, key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
@@ -465,7 +522,7 @@
WARN_ON(!atomic_read(&pi_state->refcount));
atomic_inc(&pi_state->refcount);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -492,7 +549,7 @@
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
/* Store the key for possible exit cleanups: */
- pi_state->key = me->key;
+ pi_state->key = *key;
spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@
put_task_struct(p);
- me->pi_state = pi_state;
+ *ps = pi_state;
return 0;
}
@@ -513,12 +570,12 @@
*/
static void wake_futex(struct futex_q *q)
{
- list_del_init(&q->list);
+ plist_del(&q->list, &q->list.plist);
if (q->filp)
send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
- * list_del_init() and also before assigning to q->lock_ptr.
+ * plist_del() and also before assigning to q->lock_ptr.
*/
wake_up_all(&q->waiters);
/*
@@ -562,6 +619,8 @@
*/
if (!(uval & FUTEX_OWNER_DIED)) {
newval = FUTEX_WAITERS | new_owner->pid;
+ /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -629,17 +688,19 @@
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int nr_wake)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret;
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -647,7 +708,7 @@
spin_lock(&hb->lock);
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
if (this->pi_state) {
ret = -EINVAL;
@@ -661,7 +722,261 @@
spin_unlock(&hb->lock);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
+ return ret;
+}
+
+/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exist
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **pi_state)
+{
+ u32 curval, uval, newval;
+
+retry:
+ /*
+ * We can't handle a fault cleanly because we can't
+ * release the locks here. Simply return the fault.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
+ return -EFAULT;
+
+ /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+ if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+ != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+ /*
+ * No waiters yet, we prepare the futex to have some waiters.
+ */
+
+ uval = curval;
+ newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ pagefault_enable();
+
+ if (unlikely(curval == -EFAULT))
+ return -EFAULT;
+ if (unlikely(curval != uval))
+ goto retry;
+ }
+
+ if (!(curval & FUTEX_TID_MASK)
+ || lookup_pi_state(curval, hb, key, pi_state)) {
+ /* the futex has no owner (yet) or the lookup failed:
+ allocate one pi_state without owner */
+
+ *pi_state = alloc_pi_state();
+
+ /* Already stores the key: */
+ (*pi_state)->key = *key;
+
+ /* init the mutex without owner */
+ __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+ }
+
+ return 0;
+}
+
+/*
+ * Keep the first nr_wake waiter from futex1, wake up one,
+ * and requeue the next nr_requeue waiters following hashed on
+ * one physical page to another physical page (PI-futex uaddr2)
+ */
+static int futex_requeue_pi(u32 __user *uaddr1,
+ struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
+ int nr_wake, int nr_requeue, u32 *cmpval)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *hb1, *hb2;
+ struct plist_head *head1;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state2 = NULL;
+ struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+ struct rt_mutex *lock2 = NULL;
+ int ret, drop_count = 0;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+retry:
+ /*
+ * First take all the futex related locks:
+ */
+ if (fshared)
+ down_read(fshared);
+
+ ret = get_futex_key(uaddr1, fshared, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, fshared, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ hb1 = hash_futex(&key1);
+ hb2 = hash_futex(&key2);
+
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = get_futex_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /*
+ * If we would have faulted, release mmap_sem, fault
+ * it in and start all over again.
+ */
+ if (fshared)
+ up_read(fshared);
+
+ ret = get_user(curval, uaddr1);
+
+ if (!ret)
+ goto retry;
+
+ return ret;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ head1 = &hb1->chain;
+ plist_for_each_entry_safe(this, next, head1, list) {
+ if (!match_futex (&this->key, &key1))
+ continue;
+ if (++ret <= nr_wake) {
+ wake_futex(this);
+ } else {
+ /*
+ * FIRST: get and set the pi_state
+ */
+ if (!pi_state2) {
+ int s;
+ /* do this only the first time we requeue someone */
+ s = lookup_pi_state_for_requeue(uaddr2, hb2,
+ &key2, &pi_state2);
+ if (s) {
+ ret = s;
+ goto out_unlock;
+ }
+
+ lock2 = &pi_state2->pi_mutex;
+ spin_lock(&lock2->wait_lock);
+
+ /* Save the top waiter of the wait_list */
+ if (rt_mutex_has_waiters(lock2))
+ top_waiter = rt_mutex_top_waiter(lock2);
+ } else
+ atomic_inc(&pi_state2->refcount);
+
+
+ this->pi_state = pi_state2;
+
+ /*
+ * SECOND: requeue futex_q to the correct hashbucket
+ */
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
+ this->key = key2;
+ get_futex_key_refs(&key2);
+ drop_count++;
+
+
+ /*
+ * THIRD: queue it to lock2
+ */
+ spin_lock_irq(&this->task->pi_lock);
+ waiter = &this->waiter;
+ waiter->task = this->task;
+ waiter->lock = lock2;
+ plist_node_init(&waiter->list_entry, this->task->prio);
+ plist_node_init(&waiter->pi_list_entry, this->task->prio);
+ plist_add(&waiter->list_entry, &lock2->wait_list);
+ this->task->pi_blocked_on = waiter;
+ spin_unlock_irq(&this->task->pi_lock);
+
+ if (ret - nr_wake >= nr_requeue)
+ break;
+ }
+ }
+
+ /* If we've requeued some tasks and the top_waiter of the rt_mutex
+ has changed, we must adjust the priority of the owner, if any */
+ if (drop_count) {
+ struct task_struct *owner = rt_mutex_owner(lock2);
+ if (owner &&
+ (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+ int chain_walk = 0;
+
+ spin_lock_irq(&owner->pi_lock);
+ if (top_waiter)
+ plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+ else
+ /*
+ * There was no waiters before the requeue,
+ * the flag must be updated
+ */
+ mark_rt_mutex_waiters(lock2);
+
+ plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+ __rt_mutex_adjust_prio(owner);
+ if (owner->pi_blocked_on) {
+ chain_walk = 1;
+ get_task_struct(owner);
+ }
+
+ spin_unlock_irq(&owner->pi_lock);
+ spin_unlock(&lock2->wait_lock);
+
+ if (chain_walk)
+ rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+ current);
+ } else {
+ /* No owner or the top_waiter does not change */
+ mark_rt_mutex_waiters(lock2);
+ spin_unlock(&lock2->wait_lock);
+ }
+ }
+
+out_unlock:
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
+ while (--drop_count >= 0)
+ drop_futex_key_refs(&key1);
+
+out:
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -670,22 +985,24 @@
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head;
+ struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
retryfull:
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
@@ -725,11 +1042,10 @@
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr2,
- attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr2,
+ fshared, attempt);
+ if (ret)
goto out;
- }
goto retry;
}
@@ -737,7 +1053,8 @@
* If we would have faulted, release mmap_sem,
* fault it in and start all over again.
*/
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(dummy, uaddr2);
if (ret)
@@ -748,7 +1065,7 @@
head = &hb1->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
wake_futex(this);
if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@
head = &hb2->chain;
op_ret = 0;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key2)) {
wake_futex(this);
if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -782,22 +1100,24 @@
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
struct futex_hash_bucket *hb1, *hb2;
- struct list_head *head1;
+ struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
@@ -820,7 +1140,8 @@
* If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(curval, uaddr1);
@@ -836,7 +1157,7 @@
}
head1 = &hb1->chain;
- list_for_each_entry_safe(this, next, head1, list) {
+ plist_for_each_entry_safe(this, next, head1, list) {
if (!match_futex (&this->key, &key1))
continue;
if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@
* requeue.
*/
if (likely(head1 != &hb2->chain)) {
- list_move_tail(&this->list, &hb2->chain);
+ plist_del(&this->list, &hb1->chain);
+ plist_add(&this->list, &hb2->chain);
this->lock_ptr = &hb2->lock;
- }
+#ifdef CONFIG_DEBUG_PI_LIST
+ this->list.plist.lock = &hb2->lock;
+#endif
+ }
this->key = key2;
get_futex_key_refs(&key2);
drop_count++;
@@ -869,7 +1194,8 @@
drop_futex_key_refs(&key1);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
@@ -894,7 +1220,23 @@
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
- list_add_tail(&q->list, &hb->chain);
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+ q->list.plist.lock = &hb->lock;
+#endif
+ plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
}
@@ -949,8 +1291,8 @@
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(q->pi_state);
@@ -964,39 +1306,104 @@
/*
* PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
*/
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
{
- WARN_ON(list_empty(&q->list));
- list_del(&q->list);
+ WARN_ON(plist_node_empty(&q->list));
+ plist_del(&q->list, &q->list.plist);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
q->pi_state = NULL;
- spin_unlock(&hb->lock);
+ spin_unlock(q->lock_ptr);
drop_futex_key_refs(&q->key);
}
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The cur->mm semaphore must be held, it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+ struct futex_q *q,
+ struct futex_hash_bucket *hb,
+ struct task_struct *curr)
+{
+ u32 newtid = curr->pid | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+ u32 uval, curval, newval;
+ int ret;
+
+ /* Owner died? */
+ if (pi_state->owner != NULL) {
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
+ } else
+ newtid |= FUTEX_OWNER_DIED;
+
+ pi_state->owner = curr;
+
+ spin_lock_irq(&curr->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &curr->pi_state_list);
+ spin_unlock_irq(&curr->pi_lock);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(q);
+ if (fshared)
+ up_read(fshared);
+ /*
+ * We own it, so we have to replace the pending owner
+ * TID. This must be atomic as we have preserve the
+ * owner died bit here.
+ */
+ ret = get_user(uval, uaddr);
+ while (!ret) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ newval |= (uval & FUTEX_WAITER_REQUEUED);
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval, newval);
+ if (curval == -EFAULT)
+ ret = -EFAULT;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+ return ret;
+}
+
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED 1
+
static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait_abstime(u32 __user *uaddr, u32 val,
- int timed, unsigned long abs_time)
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ u32 val, ktime_t *abs_time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
- unsigned long time_left = 0;
u32 uval;
int ret;
+ struct hrtimer_sleeper t, *to = NULL;
+ int rem = 0;
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
@@ -1019,8 +1426,8 @@
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
*
- * We hold the mmap semaphore, so the mapping cannot have changed
- * since we looked it up in get_futex_key.
+ * for shared futexes, we hold the mmap semaphore, so the mapping
+ * cannot have changed since we looked it up in get_futex_key.
*/
ret = get_futex_value_locked(&uval, uaddr);
@@ -1031,7 +1438,8 @@
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
@@ -1043,6 +1451,14 @@
if (uval != val)
goto out_unlock_release_sem;
+ /*
+ * This rt_mutex_waiter structure is prepared here and will
+ * be used only if this task is requeued from a normal futex to
+ * a PI-futex with futex_requeue_pi.
+ */
+ debug_rt_mutex_init_waiter(&q.waiter);
+ q.waiter.task = NULL;
+
/* Only actually queue if *uaddr contained val. */
__queue_me(&q, hb);
@@ -1050,7 +1466,8 @@
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
/*
* There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
/*
- * !list_empty() is safe here without any lock.
+ * !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- time_left = 0;
- if (likely(!list_empty(&q.list))) {
- unsigned long rel_time;
+ if (likely(!plist_node_empty(&q.list))) {
+ if (!abs_time)
+ schedule();
+ else {
+ to = &t;
+ hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(&t, current);
+ t.timer.expires = *abs_time;
- if (timed) {
- unsigned long now = jiffies;
- if (time_after(now, abs_time))
- rel_time = 0;
- else
- rel_time = abs_time - now;
- } else
- rel_time = MAX_SCHEDULE_TIMEOUT;
+ hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
- time_left = schedule_timeout(rel_time);
+ /*
+ * the timer could have already expired, in which
+ * case current would be flagged for rescheduling.
+ * Don't bother calling schedule.
+ */
+ if (likely(t.task))
+ schedule();
+
+ hrtimer_cancel(&t.timer);
+
+ /* Flag if a timeout occured */
+ rem = (t.task == NULL);
+ }
}
__set_current_state(TASK_RUNNING);
@@ -1090,17 +1517,80 @@
* we are the only user of it.
*/
+ if (q.pi_state) {
+ /*
+ * We were woken but have been requeued on a PI-futex.
+ * We have to complete the lock acquisition by taking
+ * the rtmutex.
+ */
+
+ struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ if (unlikely(q.waiter.task)) {
+ remove_waiter(lock, &q.waiter);
+ }
+ spin_unlock(&lock->wait_lock);
+
+ if (rem)
+ ret = -ETIMEDOUT;
+ else
+ ret = rt_mutex_timed_lock(lock, to, 1);
+
+ if (fshared)
+ down_read(fshared);
+ spin_lock(q.lock_ptr);
+
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case.
+ */
+ if (!ret && q.pi_state->owner != curr) {
+ /*
+ * We MUST play with the futex we were requeued on,
+ * NOT the current futex.
+ * We can retrieve it from the key of the pi_state
+ */
+ uaddr = q.pi_state->key.uaddr;
+
+ /* mmap_sem and hash_bucket lock are unlocked at
+ return of this function */
+ ret = fixup_pi_state_owner(uaddr, fshared,
+ &q, hb, curr);
+ } else {
+ /*
+ * Catch the rare case, where the lock was released
+ * when we were on the way back before we locked
+ * the hash bucket.
+ */
+ if (ret && q.pi_state->owner == curr) {
+ if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+ ret = 0;
+ }
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+ if (fshared)
+ up_read(fshared);
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
+ return ret;
+ }
+
+ debug_rt_mutex_free_waiter(&q.waiter);
+
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
- if (time_left == 0)
+ if (rem)
return -ETIMEDOUT;
/*
* We expect signal_pending(current), but another thread may
* have handled it for us already.
*/
- if (time_left == MAX_SCHEDULE_TIMEOUT)
+ if (!abs_time)
return -ERESTARTSYS;
else {
struct restart_block *restart;
@@ -1108,8 +1598,10 @@
restart->fn = futex_wait_restart;
restart->arg0 = (unsigned long)uaddr;
restart->arg1 = (unsigned long)val;
- restart->arg2 = (unsigned long)timed;
- restart->arg3 = abs_time;
+ restart->arg2 = (unsigned long)abs_time;
+ restart->arg3 = 0;
+ if (fshared)
+ restart->arg3 |= ARG3_SHARED;
return -ERESTART_RESTARTBLOCK;
}
@@ -1117,65 +1609,111 @@
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
-{
- int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
- return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
-}
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->arg0;
u32 val = (u32)restart->arg1;
- int timed = (int)restart->arg2;
- unsigned long abs_time = restart->arg3;
+ ktime_t *abs_time = (ktime_t *)restart->arg2;
+ struct rw_semaphore *fshared = NULL;
restart->fn = do_no_restart_syscall;
- return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
+ if (restart->arg3 & ARG3_SHARED)
+ fshared = ¤t->mm->mmap_sem;
+ return (long)futex_wait(uaddr, fshared, val, abs_time);
}
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+ union futex_key *key, struct task_struct *p)
+{
+ struct plist_head *head;
+ struct futex_q *this, *next;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex *lock;
+
+ /* Search a waiter that should already exists */
+
+ head = &hb->chain;
+
+ plist_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, key)) {
+ pi_state = this->pi_state;
+ break;
+ }
+ }
+
+ BUG_ON(!pi_state);
+
+ /* set p as pi_state's owner */
+ lock = &pi_state->pi_mutex;
+
+ spin_lock(&lock->wait_lock);
+ spin_lock_irq(&p->pi_lock);
+
+ list_add(&pi_state->list, &p->pi_state_list);
+ pi_state->owner = p;
+
+
+ /* set p as pi_mutex's owner */
+ debug_rt_mutex_proxy_lock(lock, p);
+ WARN_ON(rt_mutex_owner(lock));
+ rt_mutex_set_owner(lock, p, 0);
+ rt_mutex_deadlock_account_lock(lock, p);
+
+ plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+ &p->pi_waiters);
+ __rt_mutex_adjust_prio(p);
+
+ spin_unlock_irq(&p->pi_lock);
+ spin_unlock(&lock->wait_lock);
+}
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
- long nsec, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
- int ret, attempt = 0;
+ int ret, lock_held, attempt = 0;
if (refill_pi_state_cache())
return -ENOMEM;
- if (sec != MAX_SCHEDULE_TIMEOUT) {
+ if (time) {
to = &timeout;
hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
- to->timer.expires = ktime_set(sec, nsec);
+ to->timer.expires = *time;
}
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
hb = queue_lock(&q, -1, NULL);
retry_locked:
+ lock_held = 0;
+
/*
* To avoid races, we attempt to take the lock here again
* (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@
if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
if (!detect && 0)
force_sig(SIGKILL, current);
- ret = -EDEADLK;
+ /*
+ * Normally, this check is done in user space.
+ * In case of requeue, the owner may attempt to lock this futex,
+ * even if the ownership has already been given by the previous
+ * waker.
+ * In the usual case, this is a case of deadlock, but not in case
+ * of REQUEUE_PI.
+ */
+ if (!(curval & FUTEX_WAITER_REQUEUED))
+ ret = -EDEADLK;
goto out_unlock_release_sem;
}
@@ -1206,7 +1753,18 @@
goto out_unlock_release_sem;
uval = curval;
- newval = uval | FUTEX_WAITERS;
+ /*
+ * In case of a requeue, check if there already is an owner
+ * If not, just take the futex.
+ */
+ if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+ /* set current as futex owner */
+ newval = curval | current->pid;
+ lock_held = 1;
+ } else
+ /* Set the WAITERS flag, so the owner will know it has someone
+ to wake at next unlock */
+ newval = curval | FUTEX_WAITERS;
pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1217,11 +1775,16 @@
if (unlikely(curval != uval))
goto retry_locked;
+ if (lock_held) {
+ set_pi_futex_owner(hb, &q.key, curr);
+ goto out_unlock_release_sem;
+ }
+
/*
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, &q);
+ ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
if (unlikely(ret)) {
/*
@@ -1263,7 +1826,8 @@
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
WARN_ON(!q.pi_state);
/*
@@ -1277,52 +1841,18 @@
ret = ret ? 0 : -EWOULDBLOCK;
}
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
spin_lock(q.lock_ptr);
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case.
*/
- if (!ret && q.pi_state->owner != curr) {
- u32 newtid = current->pid | FUTEX_WAITERS;
-
- /* Owner died? */
- if (q.pi_state->owner != NULL) {
- spin_lock_irq(&q.pi_state->owner->pi_lock);
- WARN_ON(list_empty(&q.pi_state->list));
- list_del_init(&q.pi_state->list);
- spin_unlock_irq(&q.pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
-
- q.pi_state->owner = current;
-
- spin_lock_irq(¤t->pi_lock);
- WARN_ON(!list_empty(&q.pi_state->list));
- list_add(&q.pi_state->list, ¤t->pi_state_list);
- spin_unlock_irq(¤t->pi_lock);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
- /*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
- */
- ret = get_user(uval, uaddr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- } else {
+ if (!ret && q.pi_state->owner != curr)
+ /* mmap_sem is unlocked at return of this function */
+ ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
+ else {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
@@ -1333,8 +1863,9 @@
ret = 0;
}
/* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ unqueue_me_pi(&q);
+ if (fshared)
+ up_read(fshared);
}
if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
uaddr_faulted:
@@ -1357,15 +1889,16 @@
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock_release_sem;
- }
goto retry_locked;
}
queue_unlock(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
- struct list_head *head;
+ struct plist_head *head;
union futex_key key;
int ret, attempt = 0;
@@ -1399,9 +1932,10 @@
/*
* First take all the futex related locks:
*/
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -1435,7 +1969,7 @@
*/
head = &hb->chain;
- list_for_each_entry_safe(this, next, head, list) {
+ plist_for_each_entry_safe(this, next, head, list) {
if (!match_futex (&this->key, &key))
continue;
ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@
out_unlock:
spin_unlock(&hb->lock);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
@@ -1472,15 +2007,16 @@
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock;
- }
goto retry_locked;
}
spin_unlock(&hb->lock);
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@
poll_wait(filp, &q->waiters, wait);
/*
- * list_empty() is safe here without any lock.
+ * plist_node_empty() is safe here without any lock.
* q->lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (list_empty(&q->list))
+ if (plist_node_empty(&q->list))
ret = POLLIN | POLLRDNORM;
return ret;
@@ -1532,6 +2068,7 @@
struct futex_q *q;
struct file *filp;
int ret, err;
+ struct rw_semaphore *fshared;
static unsigned long printk_interval;
if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@
}
q->pi_state = NULL;
- down_read(¤t->mm->mmap_sem);
- err = get_futex_key(uaddr, &q->key);
+ fshared = ¤t->mm->mmap_sem;
+ down_read(fshared);
+ err = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(err != 0)) {
- up_read(¤t->mm->mmap_sem);
+ up_read(fshared);
kfree(q);
goto error;
}
@@ -1589,7 +2127,7 @@
filp->private_data = q;
queue_me(q, ret, filp);
- up_read(¤t->mm->mmap_sem);
+ up_read(fshared);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
@@ -1702,6 +2240,8 @@
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+ mval |= (uval & FUTEX_WAITER_REQUEUED);
nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@
*/
if (!pi) {
if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ futex_wake(uaddr, &curr->mm->mmap_sem, 1);
}
}
return 0;
@@ -1772,7 +2312,8 @@
return;
if (pending)
- handle_futex_death((void __user *)pending + futex_offset, curr, pip);
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
while (entry != &head->list) {
/*
@@ -1798,39 +2339,47 @@
}
}
-long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
+ int cmd = op & FUTEX_CMD_MASK;
+ struct rw_semaphore *fshared = NULL;
- switch (op) {
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ fshared = ¤t->mm->mmap_sem;
+
+ switch (cmd) {
case FUTEX_WAIT:
- ret = futex_wait(uaddr, val, timeout);
+ ret = futex_wait(uaddr, fshared, val, timeout);
break;
case FUTEX_WAKE:
- ret = futex_wake(uaddr, val);
+ ret = futex_wake(uaddr, fshared, val);
break;
case FUTEX_FD:
/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+ ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
- ret = futex_unlock_pi(uaddr);
+ ret = futex_unlock_pi(uaddr, fshared);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+ ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ break;
+ case FUTEX_CMP_REQUEUE_PI:
+ ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
@@ -1843,29 +2392,30 @@
struct timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
u32 val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
- if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
- if (copy_from_user(&t, utime, sizeof(t)) != 0)
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+ if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
- if (!timespec_valid(&t))
+ if (!timespec_valid(&ts))
return -EINVAL;
- if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add(ktime_get(), t);
+ tp = &t;
}
/*
- * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+ || cmd == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@
}
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- INIT_LIST_HEAD(&futex_queues[i].chain);
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
spin_lock_init(&futex_queues[i].lock);
}
return 0;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24ee..338a9b4 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@
struct compat_timespec __user *utime, u32 __user *uaddr2,
u32 val3)
{
- struct timespec t;
- unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
int val2 = 0;
if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
- if (get_compat_timespec(&t, utime))
+ if (get_compat_timespec(&ts, utime))
return -EFAULT;
- if (!timespec_valid(&t))
+ if (!timespec_valid(&ts))
return -EINVAL;
+
+ t = timespec_to_ktime(ts);
if (op == FUTEX_WAIT)
- timeout = timespec_to_jiffies(&t) + 1;
- else {
- timeout = t.tv_sec;
- val2 = t.tv_nsec;
- }
+ t = ktime_add(ktime_get(), t);
+ tp = &t;
}
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+ if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+ || op == FUTEX_CMP_REQUEUE_PI)
val2 = (int) (unsigned long) utime;
- return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f04..23c03f4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
init_hrtimers_cpu(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
migrate_hrtimers(cpu);
break;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 32e1ab1..e391cbb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
* handle_bad_irq - handle spurious and unhandled irqs
* @irq: the interrupt number
* @desc: description of the interrupt
- * @regs: pointer to a register structure
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 49cc4b9..4d32eb0 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -135,7 +135,6 @@
/* Unblock all signals and set the session keyring. */
new_session = key_get(sub_info->ring);
- flush_signals(current);
spin_lock_irq(¤t->sighand->siglock);
old_session = __install_session_keyring(current, new_session);
flush_signal_handlers(current, 1);
@@ -186,14 +185,9 @@
{
struct subprocess_info *sub_info = data;
pid_t pid;
- struct k_sigaction sa;
/* Install a handler: if SIGCLD isn't handled sys_wait4 won't
* populate the status, but will return -ECHILD. */
- sa.sa.sa_handler = SIG_IGN;
- sa.sa.sa_flags = 0;
- siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, NULL);
allow_signal(SIGCHLD);
pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50cc..df8a8e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
- * Creation is done via keventd, so that we get a clean environment
+ * Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
@@ -15,24 +15,22 @@
#include <linux/mutex.h>
#include <asm/semaphore.h>
-/*
- * We dont want to execute off keventd since it might
- * hold a semaphore our callers hold too:
- */
-static struct workqueue_struct *helper_wq;
+static DEFINE_SPINLOCK(kthread_create_lock);
+static LIST_HEAD(kthread_create_list);
+struct task_struct *kthreadd_task;
struct kthread_create_info
{
- /* Information passed to kthread() from keventd. */
+ /* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
struct completion started;
- /* Result passed back to kthread_create() from keventd. */
+ /* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
struct completion done;
- struct work_struct work;
+ struct list_head list;
};
struct kthread_stop_info
@@ -60,42 +58,17 @@
}
EXPORT_SYMBOL(kthread_should_stop);
-static void kthread_exit_files(void)
-{
- struct fs_struct *fs;
- struct task_struct *tsk = current;
-
- exit_fs(tsk); /* current->fs->count--; */
- fs = init_task.fs;
- tsk->fs = fs;
- atomic_inc(&fs->count);
- exit_files(tsk);
- current->files = init_task.files;
- atomic_inc(&tsk->files->count);
-}
-
static int kthread(void *_create)
{
struct kthread_create_info *create = _create;
int (*threadfn)(void *data);
void *data;
- sigset_t blocked;
int ret = -EINTR;
- kthread_exit_files();
-
- /* Copy data: it's on keventd's stack */
+ /* Copy data: it's on kthread's stack */
threadfn = create->threadfn;
data = create->data;
- /* Block and flush all signals (in case we're not from keventd). */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* By default we can run anywhere, unlike keventd. */
- set_cpus_allowed(current, CPU_MASK_ALL);
-
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_INTERRUPTIBLE);
complete(&create->started);
@@ -112,11 +85,8 @@
return 0;
}
-/* We are keventd: create a thread. */
-static void keventd_create_kthread(struct work_struct *work)
+static void create_kthread(struct kthread_create_info *create)
{
- struct kthread_create_info *create =
- container_of(work, struct kthread_create_info, work);
int pid;
/* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@
create.data = data;
init_completion(&create.started);
init_completion(&create.done);
- INIT_WORK(&create.work, keventd_create_kthread);
- /*
- * The workqueue needs to start up first:
- */
- if (!helper_wq)
- create.work.func(&create.work);
- else {
- queue_work(helper_wq, &create.work);
- wait_for_completion(&create.done);
- }
+ spin_lock(&kthread_create_lock);
+ list_add_tail(&create.list, &kthread_create_list);
+ wake_up_process(kthreadd_task);
+ spin_unlock(&kthread_create_lock);
+
+ wait_for_completion(&create.done);
+
if (!IS_ERR(create.result)) {
va_list args;
va_start(args, namefmt);
@@ -180,7 +147,6 @@
namefmt, args);
va_end(args);
}
-
return create.result;
}
EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@
}
EXPORT_SYMBOL(kthread_stop);
-static __init int helper_init(void)
+
+static __init void kthreadd_setup(void)
{
- helper_wq = create_singlethread_workqueue("kthread");
- BUG_ON(!helper_wq);
+ struct task_struct *tsk = current;
+
+ set_task_comm(tsk, "kthreadd");
+
+ ignore_signals(tsk);
+
+ set_user_nice(tsk, -5);
+ set_cpus_allowed(tsk, CPU_MASK_ALL);
+}
+
+int kthreadd(void *unused)
+{
+ /* Setup a clean context for our children to inherit. */
+ kthreadd_setup();
+
+ current->flags |= PF_NOFREEZE;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&kthread_create_list))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ spin_lock(&kthread_create_lock);
+ while (!list_empty(&kthread_create_list)) {
+ struct kthread_create_info *create;
+
+ create = list_entry(kthread_create_list.next,
+ struct kthread_create_info, list);
+ list_del_init(&create->list);
+ spin_unlock(&kthread_create_lock);
+
+ create_kthread(create);
+
+ spin_lock(&kthread_create_lock);
+ }
+ spin_unlock(&kthread_create_lock);
+ }
return 0;
}
-
-core_initcall(helper_init);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb8..303eab1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@
debug_mutex_lock_common(lock, &waiter);
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- debug_mutex_add_waiter(lock, &waiter, task->thread_info);
+ debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@
*/
if (unlikely(state == TASK_INTERRUPTIBLE &&
signal_pending(task))) {
- mutex_remove_waiter(lock, &waiter, task->thread_info);
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
mutex_release(&lock->dep_map, 1, _RET_IP_);
spin_unlock_mutex(&lock->wait_lock, flags);
@@ -175,8 +175,8 @@
}
/* got the lock - rejoice! */
- mutex_remove_waiter(lock, &waiter, task->thread_info);
- debug_mutex_set_owner(lock, task->thread_info);
+ mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_set_owner(lock, task_thread_info(task));
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0633137..b5f0543 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
+enum {
+ HIBERNATION_INVALID,
+ HIBERNATION_PLATFORM,
+ HIBERNATION_TEST,
+ HIBERNATION_TESTPROC,
+ HIBERNATION_SHUTDOWN,
+ HIBERNATION_REBOOT,
+ /* keep last */
+ __HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+struct hibernation_ops *hibernation_ops;
+
+/**
+ * hibernation_set_ops - set the global hibernate operations
+ * @ops: the hibernation operations to use in subsequent hibernation transitions
+ */
+
+void hibernation_set_ops(struct hibernation_ops *ops)
+{
+ if (ops && !(ops->prepare && ops->enter && ops->finish)) {
+ WARN_ON(1);
+ return;
+ }
+ mutex_lock(&pm_mutex);
+ hibernation_ops = ops;
+ if (ops)
+ hibernation_mode = HIBERNATION_PLATFORM;
+ else if (hibernation_mode == HIBERNATION_PLATFORM)
+ hibernation_mode = HIBERNATION_SHUTDOWN;
+
+ mutex_unlock(&pm_mutex);
+}
+
+
/**
* platform_prepare - prepare the machine for hibernation using the
* platform driver if so configured and return an error code if it fails
*/
-static inline int platform_prepare(void)
+static int platform_prepare(void)
{
- int error = 0;
-
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- break;
- default:
- if (pm_ops && pm_ops->prepare)
- error = pm_ops->prepare(PM_SUSPEND_DISK);
- }
- return error;
+ return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
+ hibernation_ops->prepare() : 0;
}
/**
- * power_down - Shut machine down for hibernate.
+ * platform_finish - switch the machine to the normal mode of operation
+ * using the platform driver (must be called after platform_prepare())
+ */
+
+static void platform_finish(void)
+{
+ if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
+ hibernation_ops->finish();
+}
+
+/**
+ * power_down - Shut the machine down for hibernation.
*
* Use the platform driver, if configured so; otherwise try
* to power off or reboot.
@@ -61,20 +100,20 @@
static void power_down(void)
{
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
+ switch (hibernation_mode) {
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
break;
- case PM_DISK_SHUTDOWN:
+ case HIBERNATION_SHUTDOWN:
kernel_power_off();
break;
- case PM_DISK_REBOOT:
+ case HIBERNATION_REBOOT:
kernel_restart(NULL);
break;
- default:
- if (pm_ops && pm_ops->enter) {
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- pm_ops->enter(PM_SUSPEND_DISK);
+ hibernation_ops->enter();
break;
}
}
@@ -87,20 +126,6 @@
while(1);
}
-static inline void platform_finish(void)
-{
- switch (pm_disk_mode) {
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- break;
- default:
- if (pm_ops && pm_ops->finish)
- pm_ops->finish(PM_SUSPEND_DISK);
- }
-}
-
static void unprepare_processes(void)
{
thaw_processes();
@@ -120,13 +145,10 @@
}
/**
- * pm_suspend_disk - The granpappy of hibernation power management.
- *
- * If not, then call swsusp to do its thing, then figure out how
- * to power down the system.
+ * hibernate - The granpappy of the built-in hibernation management
*/
-int pm_suspend_disk(void)
+int hibernate(void)
{
int error;
@@ -143,7 +165,8 @@
if (error)
goto Finish;
- if (pm_disk_mode == PM_DISK_TESTPROC) {
+ mutex_lock(&pm_mutex);
+ if (hibernation_mode == HIBERNATION_TESTPROC) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Thaw;
@@ -168,7 +191,7 @@
if (error)
goto Enable_cpus;
- if (pm_disk_mode == PM_DISK_TEST) {
+ if (hibernation_mode == HIBERNATION_TEST) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Enable_cpus;
@@ -205,6 +228,7 @@
device_resume();
resume_console();
Thaw:
+ mutex_unlock(&pm_mutex);
unprepare_processes();
Finish:
free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@
* Called as a late_initcall (so all devices are discovered and
* initialized), we call swsusp to see if we have a saved image or not.
* If so, we quiesce devices, the restore the saved image. We will
- * return above (in pm_suspend_disk() ) if everything goes well.
+ * return above (in hibernate() ) if everything goes well.
* Otherwise, we fail gracefully and return to the normally
* scheduled program.
*
@@ -315,25 +339,26 @@
late_initcall(software_resume);
-static const char * const pm_disk_modes[] = {
- [PM_DISK_PLATFORM] = "platform",
- [PM_DISK_SHUTDOWN] = "shutdown",
- [PM_DISK_REBOOT] = "reboot",
- [PM_DISK_TEST] = "test",
- [PM_DISK_TESTPROC] = "testproc",
+static const char * const hibernation_modes[] = {
+ [HIBERNATION_PLATFORM] = "platform",
+ [HIBERNATION_SHUTDOWN] = "shutdown",
+ [HIBERNATION_REBOOT] = "reboot",
+ [HIBERNATION_TEST] = "test",
+ [HIBERNATION_TESTPROC] = "testproc",
};
/**
- * disk - Control suspend-to-disk mode
+ * disk - Control hibernation mode
*
* Suspend-to-disk can be handled in several ways. We have a few options
* for putting the system to sleep - using the platform driver (e.g. ACPI
- * or other pm_ops), powering off the system or rebooting the system
- * (for testing) as well as the two test modes.
+ * or other hibernation_ops), powering off the system or rebooting the
+ * system (for testing) as well as the two test modes.
*
* The system can support 'platform', and that is known a priori (and
- * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot'
- * as alternatives, as well as the test modes 'test' and 'testproc'.
+ * encoded by the presence of hibernation_ops). However, the user may
+ * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
+ * test modes, 'test' or 'testproc'.
*
* show() will display what the mode is currently set to.
* store() will accept one of
@@ -345,7 +370,7 @@
* 'testproc'
*
* It will only change to 'platform' if the system
- * supports it (as determined from pm_ops->pm_disk_mode).
+ * supports it (as determined by having hibernation_ops).
*/
static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@
int i;
char *start = buf;
- for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
- if (!pm_disk_modes[i])
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!hibernation_modes[i])
continue;
switch (i) {
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
break;
- default:
- if (pm_ops && pm_ops->enter &&
- (i == pm_ops->pm_disk_mode))
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
break;
/* not a valid mode, continue with loop */
continue;
}
- if (i == pm_disk_mode)
- buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+ if (i == hibernation_mode)
+ buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
else
- buf += sprintf(buf, "%s", pm_disk_modes[i]);
- if (i+1 != PM_DISK_MAX)
- buf += sprintf(buf, " ");
+ buf += sprintf(buf, "%s ", hibernation_modes[i]);
}
buf += sprintf(buf, "\n");
return buf-start;
@@ -387,39 +409,38 @@
int i;
int len;
char *p;
- suspend_disk_method_t mode = 0;
+ int mode = HIBERNATION_INVALID;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
- for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
- if (!strncmp(buf, pm_disk_modes[i], len)) {
+ for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
+ if (!strncmp(buf, hibernation_modes[i], len)) {
mode = i;
break;
}
}
- if (mode) {
+ if (mode != HIBERNATION_INVALID) {
switch (mode) {
- case PM_DISK_SHUTDOWN:
- case PM_DISK_REBOOT:
- case PM_DISK_TEST:
- case PM_DISK_TESTPROC:
- pm_disk_mode = mode;
+ case HIBERNATION_SHUTDOWN:
+ case HIBERNATION_REBOOT:
+ case HIBERNATION_TEST:
+ case HIBERNATION_TESTPROC:
+ hibernation_mode = mode;
break;
- default:
- if (pm_ops && pm_ops->enter &&
- (mode == pm_ops->pm_disk_mode))
- pm_disk_mode = mode;
+ case HIBERNATION_PLATFORM:
+ if (hibernation_ops)
+ hibernation_mode = mode;
else
error = -EINVAL;
}
- } else {
+ } else
error = -EINVAL;
- }
- pr_debug("PM: suspend-to-disk mode set to '%s'\n",
- pm_disk_modes[mode]);
+ if (!error)
+ pr_debug("PM: suspend-to-disk mode set to '%s'\n",
+ hibernation_modes[mode]);
mutex_unlock(&pm_mutex);
return error ? error : n;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda68..40d56a3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
DEFINE_MUTEX(pm_mutex);
struct pm_ops *pm_ops;
-suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
/**
* pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@
{
mutex_lock(&pm_mutex);
pm_ops = ops;
- if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
- pm_disk_mode = ops->pm_disk_mode;
- } else
- pm_disk_mode = PM_DISK_SHUTDOWN;
mutex_unlock(&pm_mutex);
}
@@ -184,24 +179,12 @@
static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
- [PM_SUSPEND_DISK] = "disk",
};
static inline int valid_state(suspend_state_t state)
{
- /* Suspend-to-disk does not really need low-level support.
- * It can work with shutdown/reboot if needed. If it isn't
- * configured, then it cannot be supported.
- */
- if (state == PM_SUSPEND_DISK)
-#ifdef CONFIG_SOFTWARE_SUSPEND
- return 1;
-#else
- return 0;
-#endif
-
- /* all other states need lowlevel support and need to be
- * valid to the lowlevel implementation, no valid callback
+ /* All states need lowlevel support and need to be valid
+ * to the lowlevel implementation, no valid callback
* implies that none are valid. */
if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
return 0;
@@ -229,11 +212,6 @@
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_DISK) {
- error = pm_suspend_disk();
- goto Unlock;
- }
-
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
if ((error = suspend_prepare(state)))
goto Unlock;
@@ -251,7 +229,7 @@
/**
* pm_suspend - Externally visible function for suspending system.
- * @state: Enumarted value of state to enter.
+ * @state: Enumerated value of state to enter.
*
* Determine whether or not value is within range, get state
* structure, and enter (above).
@@ -289,7 +267,13 @@
if (pm_states[i] && valid_state(i))
s += sprintf(s,"%s ", pm_states[i]);
}
- s += sprintf(s,"\n");
+#ifdef CONFIG_SOFTWARE_SUSPEND
+ s += sprintf(s, "%s\n", "disk");
+#else
+ if (s != buf)
+ /* convert the last space to a newline */
+ *(s-1) = '\n';
+#endif
return (s - buf);
}
@@ -304,6 +288,12 @@
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
+ /* First, check if we are requested to hibernate */
+ if (!strncmp(buf, "disk", len)) {
+ error = hibernate();
+ return error ? error : n;
+ }
+
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
if (*s && !strncmp(buf, *s, len))
break;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b4354..5138148 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@
*/
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
-extern int pm_suspend_disk(void);
-#else
-static inline int pm_suspend_disk(void)
-{
- return -EPERM;
-}
+extern struct hibernation_ops *hibernation_ops;
#endif
extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b703977..48383ea 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1227,7 +1227,7 @@
nr_copy_pages = nr_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
+ printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
return 0;
}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d..24d7d78 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@
{
int error = 0;
- if (pm_ops && pm_ops->prepare)
- error = pm_ops->prepare(PM_SUSPEND_DISK);
+ if (hibernation_ops)
+ error = hibernation_ops->prepare();
return error;
}
static inline void platform_finish(void)
{
- if (pm_ops && pm_ops->finish)
- pm_ops->finish(PM_SUSPEND_DISK);
+ if (hibernation_ops)
+ hibernation_ops->finish();
}
static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@
switch (arg) {
case PMOPS_PREPARE:
- if (pm_ops && pm_ops->enter) {
+ if (hibernation_ops) {
data->platform_suspend = 1;
error = 0;
} else {
@@ -395,8 +395,7 @@
case PMOPS_ENTER:
if (data->platform_suspend) {
kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- error = pm_ops->enter(PM_SUSPEND_DISK);
- error = 0;
+ error = hibernation_ops->enter();
}
break;
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb2..cc91b9b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
node = cpu_to_node(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@
__free_page(page);
return NOTIFY_BAD;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
cpu_set(cpu, prof_cpu_mask);
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
cpu_clear(cpu, prof_cpu_mask);
if (per_cpu(cpu_profile_hits, cpu)[0]) {
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76..2c2dd84 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
diff --git a/kernel/relay.c b/kernel/relay.c
index d24395e..4311101 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@
/**
* wakeup_readers - wake up readers waiting on a channel
- * @work: work struct that contains the channel buffer
+ * @data: contains the channel buffer
*
- * This is the work function used to defer reader waking. The
- * reason waking is deferred is that calling directly from write
- * causes problems if you're writing from say the scheduler.
+ * This is the timer function used to defer reader waking.
*/
-static void wakeup_readers(struct work_struct *work)
+static void wakeup_readers(unsigned long data)
{
- struct rchan_buf *buf =
- container_of(work, struct rchan_buf, wake_readers.work);
+ struct rchan_buf *buf = (struct rchan_buf *)data;
wake_up_interruptible(&buf->read_wait);
}
@@ -337,11 +334,9 @@
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- INIT_DELAYED_WORK(&buf->wake_readers, NULL);
- } else {
- cancel_delayed_work(&buf->wake_readers);
- flush_scheduled_work();
- }
+ setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
+ } else
+ del_timer_sync(&buf->timer);
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
- cancel_delayed_work(&buf->wake_readers);
- flush_scheduled_work();
+ del_timer_sync(&buf->timer);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -490,6 +484,7 @@
switch(action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@
mutex_unlock(&relay_channels_mutex);
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/* No need to flush the cpu : will be flushed upon
* final relay_flush() call. */
break;
@@ -608,11 +604,14 @@
buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
- if (waitqueue_active(&buf->read_wait)) {
- PREPARE_DELAYED_WORK(&buf->wake_readers,
- wakeup_readers);
- schedule_delayed_work(&buf->wake_readers, 1);
- }
+ if (waitqueue_active(&buf->read_wait))
+ /*
+ * Calling wake_up_interruptible() from here
+ * will deadlock if we happen to be logging
+ * from the scheduler (trying to re-grab
+ * rq->lock), so defer it.
+ */
+ __mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978c..12879f6 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
* state.
*/
-static void
+void
rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
unsigned long mask)
{
@@ -81,29 +81,6 @@
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
-
- do {
- owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
- lock->owner = (struct task_struct *)
- ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
-
-/*
* Calculate task priority from the waiter list priority
*
* Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@
*
* This can be both boosting and unboosting. task->pi_lock must be held.
*/
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
{
int prio = rt_mutex_getprio(task);
@@ -159,11 +136,11 @@
* Decreases task's usage by one - may thus free the task.
* Returns 0 or -EDEADLK.
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
- struct rt_mutex *orig_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@
*
* Must be called with lock->wait_lock held
*/
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856..242ec7e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@
}
/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ do {
+ owner = *p;
+ } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n) (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+ lock->owner = (struct task_struct *)
+ ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+ unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+ int deadlock_detect,
+ struct rt_mutex *orig_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 66bd7ff..799d23b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -305,6 +305,7 @@
};
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline int cpu_of(struct rq *rq)
{
@@ -4520,13 +4521,13 @@
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return -ESRCH;
}
@@ -4553,7 +4554,7 @@
out_unlock:
put_task_struct(p);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return retval;
}
@@ -4610,7 +4611,7 @@
struct task_struct *p;
int retval;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
read_lock(&tasklist_lock);
retval = -ESRCH;
@@ -4626,7 +4627,7 @@
out_unlock:
read_unlock(&tasklist_lock);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
if (retval)
return retval;
@@ -5388,7 +5389,12 @@
struct rq *rq;
switch (action) {
+ case CPU_LOCK_ACQUIRE:
+ mutex_lock(&sched_hotcpu_mutex);
+ break;
+
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
@@ -5402,12 +5408,14 @@
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
/* Strictly unneccessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!cpu_rq(cpu)->migration_thread)
break;
/* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5418,6 +5426,7 @@
break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
kthread_stop(rq->migration_thread);
@@ -5433,7 +5442,7 @@
BUG_ON(rq->nr_running != 0);
/* No need to migrate the tasks: it was best-effort if
- * they didn't do lock_cpu_hotplug(). Just wake up
+ * they didn't take sched_hotcpu_mutex. Just wake up
* the requestors. */
spin_lock_irq(&rq->lock);
while (!list_empty(&rq->migration_queue)) {
@@ -5447,6 +5456,9 @@
spin_unlock_irq(&rq->lock);
break;
#endif
+ case CPU_LOCK_RELEASE:
+ mutex_unlock(&sched_hotcpu_mutex);
+ break;
}
return NOTIFY_OK;
}
@@ -6822,10 +6834,10 @@
{
int err;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
detach_destroy_domains(&cpu_online_map);
err = arch_init_sched_domains(&cpu_online_map);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
return err;
}
@@ -6904,14 +6916,20 @@
{
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/*
* Fall through and re-initialise the domains.
*/
@@ -6930,12 +6948,12 @@
{
cpumask_t non_isolated_cpus;
- lock_cpu_hotplug();
+ mutex_lock(&sched_hotcpu_mutex);
arch_init_sched_domains(&cpu_online_map);
cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
if (cpus_empty(non_isolated_cpus))
cpu_set(smp_processor_id(), non_isolated_cpus);
- unlock_cpu_hotplug();
+ mutex_unlock(&sched_hotcpu_mutex);
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67..2ac3a66 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
static struct kmem_cache *sigqueue_cachep;
-/*
- * In POSIX a signal is sent either to a specific thread (Linux task)
- * or to the process as a whole (Linux thread group). How the signal
- * is sent determines whether it's to one thread or the whole group,
- * which determines which signal mask(s) are involved in blocking it
- * from being delivered until later. When the signal is delivered,
- * either it's caught or ignored by a user handler or it has a default
- * effect that applies to the whole thread group (POSIX process).
- *
- * The possible effects an unblocked signal set to SIG_DFL can have are:
- * ignore - Nothing Happens
- * terminate - kill the process, i.e. all threads in the group,
- * similar to exit_group. The group leader (only) reports
- * WIFSIGNALED status to its parent.
- * coredump - write a core dump file describing all threads using
- * the same mm and then kill all those threads
- * stop - stop all the threads in the group, i.e. TASK_STOPPED state
- *
- * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
- * Other signals when not blocked and set to SIG_DFL behaves as follows.
- * The job control signals also have other special effects.
- *
- * +--------------------+------------------+
- * | POSIX signal | default action |
- * +--------------------+------------------+
- * | SIGHUP | terminate |
- * | SIGINT | terminate |
- * | SIGQUIT | coredump |
- * | SIGILL | coredump |
- * | SIGTRAP | coredump |
- * | SIGABRT/SIGIOT | coredump |
- * | SIGBUS | coredump |
- * | SIGFPE | coredump |
- * | SIGKILL | terminate(+) |
- * | SIGUSR1 | terminate |
- * | SIGSEGV | coredump |
- * | SIGUSR2 | terminate |
- * | SIGPIPE | terminate |
- * | SIGALRM | terminate |
- * | SIGTERM | terminate |
- * | SIGCHLD | ignore |
- * | SIGCONT | ignore(*) |
- * | SIGSTOP | stop(*)(+) |
- * | SIGTSTP | stop(*) |
- * | SIGTTIN | stop(*) |
- * | SIGTTOU | stop(*) |
- * | SIGURG | ignore |
- * | SIGXCPU | coredump |
- * | SIGXFSZ | coredump |
- * | SIGVTALRM | terminate |
- * | SIGPROF | terminate |
- * | SIGPOLL/SIGIO | terminate |
- * | SIGSYS/SIGUNUSED | coredump |
- * | SIGSTKFLT | terminate |
- * | SIGWINCH | ignore |
- * | SIGPWR | terminate |
- * | SIGRTMIN-SIGRTMAX | terminate |
- * +--------------------+------------------+
- * | non-POSIX signal | default action |
- * +--------------------+------------------+
- * | SIGEMT | coredump |
- * +--------------------+------------------+
- *
- * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
- * (*) Special job control effects:
- * When SIGCONT is sent, it resumes the process (all threads in the group)
- * from TASK_STOPPED state and also clears any pending/queued stop signals
- * (any of those marked with "stop(*)"). This happens regardless of blocking,
- * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
- * any pending/queued SIGCONT signals; this happens regardless of blocking,
- * catching, or ignored the stop signal, though (except for SIGSTOP) the
- * default action of stopping the process may happen later or never.
- */
-
-#ifdef SIGEMT
-#define M_SIGEMT M(SIGEMT)
-#else
-#define M_SIGEMT 0
-#endif
-
-#if SIGRTMIN > BITS_PER_LONG
-#define M(sig) (1ULL << ((sig)-1))
-#else
-#define M(sig) (1UL << ((sig)-1))
-#endif
-#define T(sig, mask) (M(sig) & (mask))
-
-#define SIG_KERNEL_ONLY_MASK (\
- M(SIGKILL) | M(SIGSTOP) )
-
-#define SIG_KERNEL_STOP_MASK (\
- M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
-
-#define SIG_KERNEL_COREDUMP_MASK (\
- M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
- M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
- M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
-
-#define SIG_KERNEL_IGNORE_MASK (\
- M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
-
-#define sig_kernel_only(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
-#define sig_kernel_coredump(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
-#define sig_kernel_ignore(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
-#define sig_kernel_stop(sig) \
- (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
-
-#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
-
-#define sig_user_defined(t, signr) \
- (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
- ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
-
-#define sig_fatal(t, signr) \
- (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
- (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
static int sig_ignored(struct task_struct *t, int sig)
{
@@ -328,6 +209,16 @@
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
+void ignore_signals(struct task_struct *t)
+{
+ int i;
+
+ for (i = 0; i < _NSIG; ++i)
+ t->sighand->action[i].sa.sa_handler = SIG_IGN;
+
+ flush_signals(t);
+}
+
/*
* Flush all handlers for a task.
*/
@@ -1032,17 +923,6 @@
if (t->exit_state)
continue;
- /*
- * We don't want to notify the parent, since we are
- * killed as part of a thread group due to another
- * thread doing an execve() or similar. So set the
- * exit signal to -1 to allow immediate reaping of
- * the process. But don't detach the thread group
- * leader.
- */
- if (t != p->group_leader)
- t->exit_signal = -1;
-
/* SIGKILL will be handled before any pending SIGSTOP */
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008..0b9886a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@
per_cpu(ksoftirqd, hotcpu) = p;
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
wake_up_process(per_cpu(ksoftirqd, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(ksoftirqd, hotcpu))
break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(ksoftirqd, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;
kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040..0131e29 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
BUG_ON(per_cpu(watchdog_task, hotcpu));
p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
if (IS_ERR(p)) {
@@ -157,16 +158,19 @@
kthread_bind(p, hotcpu);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
p = per_cpu(watchdog_task, hotcpu);
per_cpu(watchdog_task, hotcpu) = NULL;
kthread_stop(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0742c93..cdb7e94 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@
return -ENOENT;
}
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: Number of notifier functions to be called. Don't care
+ * value of this parameter is -1.
+ * @nr_calls: Records the number of notifications sent. Don't care
+ * value of this field is NULL.
+ * @returns: notifier_call_chain returns the value returned by the
+ * last notifier function called.
+ */
+
static int __kprobes notifier_call_chain(struct notifier_block **nl,
- unsigned long val, void *v)
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
nb = rcu_dereference(*nl);
- while (nb) {
+
+ while (nb && nr_to_call) {
next_nb = rcu_dereference(nb->next);
ret = nb->notifier_call(nb, val, v);
+
+ if (nr_calls)
+ (*nr_calls)++;
+
if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
break;
nb = next_nb;
+ nr_to_call--;
}
return ret;
}
@@ -205,10 +225,12 @@
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
- * atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See the comment for notifier_call_chain.
+ * @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@
* of the last notifier function called.
*/
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
rcu_read_lock();
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
@@ -304,10 +334,12 @@
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
/**
- * blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@
* of the last notifier function called.
*/
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v)
+int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
@@ -332,12 +365,19 @@
*/
if (rcu_dereference(nh->head)) {
down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
+ nr_calls);
up_read(&nh->rwsem);
}
return ret;
}
+EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
+}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
@@ -383,10 +423,12 @@
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
/**
- * raw_notifier_call_chain - Call functions in a raw notifier chain
+ * __raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
@@ -400,10 +442,19 @@
* of the last notifier function called.
*/
+int __raw_notifier_call_chain(struct raw_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
+{
+ return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+}
+
+EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
+
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
- return notifier_call_chain(&nh->head, val, v);
+ return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
- * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
+ * @nr_to_call: See comment for notifier_call_chain.
+ * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@
* of the last notifier function called.
*/
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v)
+int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
- ret = notifier_call_chain(&nh->head, val, v);
+ ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
+EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
+{
+ return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
+}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
@@ -881,7 +941,7 @@
#ifdef CONFIG_SOFTWARE_SUSPEND
case LINUX_REBOOT_CMD_SW_SUSPEND:
{
- int ret = pm_suspend(PM_SUSPEND_DISK);
+ int ret = hibernate();
unlock_kernel();
return ret;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0664bd..4073353 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,7 @@
extern int percpu_pagelist_fraction;
extern int compat_log;
extern int maps_protect;
+extern int sysctl_stat_interval;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -857,6 +858,17 @@
.extra2 = &one_hundred,
},
#endif
+#ifdef CONFIG_SMP
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = &sysctl_jiffies,
+ },
+#endif
#if defined(CONFIG_X86_32) || \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
{
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db..3db5c3c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@
static struct timer_list watchdog_timer;
static DEFINE_SPINLOCK(watchdog_lock);
static cycle_t watchdog_last;
+static int watchdog_resumed;
+
/*
- * Interval: 0.5sec Treshold: 0.0625s
+ * Interval: 0.5sec Threshold: 0.0625s
*/
#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
{
- if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD)
+ if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
return;
printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@
struct clocksource *cs, *tmp;
cycle_t csnow, wdnow;
int64_t wd_nsec, cs_nsec;
+ int resumed;
spin_lock(&watchdog_lock);
+ resumed = watchdog_resumed;
+ if (unlikely(resumed))
+ watchdog_resumed = 0;
+
wdnow = watchdog->read();
wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
watchdog_last = wdnow;
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
csnow = cs->read();
+
+ if (unlikely(resumed)) {
+ cs->wd_last = csnow;
+ continue;
+ }
+
/* Initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@
}
spin_unlock(&watchdog_lock);
}
+static void clocksource_resume_watchdog(void)
+{
+ spin_lock(&watchdog_lock);
+ watchdog_resumed = 1;
+ spin_unlock(&watchdog_lock);
+}
+
static void clocksource_check_watchdog(struct clocksource *cs)
{
struct clocksource *cse;
@@ -182,9 +202,34 @@
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+
+static inline void clocksource_resume_watchdog(void) { }
#endif
/**
+ * clocksource_resume - resume the clocksource(s)
+ */
+void clocksource_resume(void)
+{
+ struct list_head *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&clocksource_lock, flags);
+
+ list_for_each(tmp, &clocksource_list) {
+ struct clocksource *cs;
+
+ cs = list_entry(tmp, struct clocksource, list);
+ if (cs->resume)
+ cs->resume();
+ }
+
+ clocksource_resume_watchdog();
+
+ spin_unlock_irqrestore(&clocksource_lock, flags);
+}
+
+/**
* clocksource_get_next - Returns the selected clocksource
*
*/
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b734ca4..8bbcfb7 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,7 +65,7 @@
SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
#endif
SEQ_printf(m, "\n");
- SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n",
+ SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
(unsigned long long)ktime_to_ns(timer->expires),
(unsigned long long)(ktime_to_ns(timer->expires) - now));
}
@@ -111,14 +111,14 @@
{
SEQ_printf(m, " .index: %d\n",
base->index);
- SEQ_printf(m, " .resolution: %Ld nsecs\n",
+ SEQ_printf(m, " .resolution: %Lu nsecs\n",
(unsigned long long)ktime_to_ns(base->resolution));
SEQ_printf(m, " .get_time: ");
print_name_offset(m, base->get_time);
SEQ_printf(m, "\n");
#ifdef CONFIG_HIGH_RES_TIMERS
- SEQ_printf(m, " .offset: %Ld nsecs\n",
- ktime_to_ns(base->offset));
+ SEQ_printf(m, " .offset: %Lu nsecs\n",
+ (unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
print_active_timers(m, base, now);
@@ -135,10 +135,11 @@
print_base(m, cpu_base->clock_base + i, now);
}
#define P(x) \
- SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x))
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(cpu_base->x))
#define P_ns(x) \
- SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
- (u64)(ktime_to_ns(cpu_base->x)))
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(cpu_base->x)))
#ifdef CONFIG_HIGH_RES_TIMERS
P_ns(expires_next);
@@ -150,10 +151,11 @@
#ifdef CONFIG_TICK_ONESHOT
# define P(x) \
- SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x))
+ SEQ_printf(m, " .%-15s: %Lu\n", #x, \
+ (unsigned long long)(ts->x))
# define P_ns(x) \
- SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \
- (u64)(ktime_to_ns(ts->x)))
+ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
+ (unsigned long long)(ktime_to_ns(ts->x)))
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
P(nohz_mode);
@@ -167,7 +169,8 @@
P(last_jiffies);
P(next_jiffies);
P_ns(idle_expires);
- SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies);
+ SEQ_printf(m, "jiffies: %Lu\n",
+ (unsigned long long)jiffies);
}
#endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 7a64483..59a28b1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,24 @@
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
{
- return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+ return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
}
static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
{
- return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+ return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
}
static inline void timer_set_deferrable(struct timer_list *timer)
{
- timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
- TBASE_DEFERRABLE_FLAG));
+ timer->base = (tvec_base_t *)((unsigned long)timer->base |
+ TBASE_DEFERRABLE_FLAG);
}
static inline void
timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
{
- timer->base = (tvec_base_t *)((unsigned long)(new_base) |
+ timer->base = (tvec_base_t *)((unsigned long)new_base |
tbase_get_deferrable(timer->base));
}
@@ -1293,11 +1293,13 @@
long cpu = (long)hcpu;
switch(action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
if (init_timers_cpu(cpu) < 0)
return NOTIFY_BAD;
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
migrate_timers(cpu);
break;
#endif
@@ -1497,6 +1499,8 @@
prev = &curr->next;
}
+ clocksource_resume();
+
write_seqlock_irqsave(&xtime_lock, flags);
if (ti == time_interpolator) {
/* we lost the best time-interpolator: */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e6..fb56fed 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
/*
* The per-CPU workqueue (if single thread, we always use the first
* possible cpu).
- *
- * The sequence counters are for flush_scheduled_work(). It wants to wait
- * until all currently-scheduled works are completed, but it doesn't
- * want to be livelocked by new, incoming ones. So it waits until
- * remove_sequence is >= the insert_sequence which pertained when
- * flush_scheduled_work() was called.
*/
struct cpu_workqueue_struct {
spinlock_t lock;
- long remove_sequence; /* Least-recently added (next to run) */
- long insert_sequence; /* Next to add */
-
struct list_head worklist;
wait_queue_head_t more_work;
- wait_queue_head_t work_done;
+ struct work_struct *current_work;
struct workqueue_struct *wq;
struct task_struct *thread;
+ int should_stop;
int run_depth; /* Detect run_workqueue() recursion depth */
-
- int freezeable; /* Freeze the thread during suspend */
} ____cacheline_aligned;
/*
@@ -68,8 +58,10 @@
*/
struct workqueue_struct {
struct cpu_workqueue_struct *cpu_wq;
+ struct list_head list;
const char *name;
- struct list_head list; /* Empty if single thread */
+ int singlethread;
+ int freezeable; /* Freeze threads during suspend */
};
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,107 +69,69 @@
static DEFINE_MUTEX(workqueue_mutex);
static LIST_HEAD(workqueues);
-static int singlethread_cpu;
+static int singlethread_cpu __read_mostly;
+static cpumask_t cpu_singlethread_map __read_mostly;
+/* optimization, we could use cpu_possible_map */
+static cpumask_t cpu_populated_map __read_mostly;
/* If it's single threaded, it isn't in the list of workqueues. */
static inline int is_single_threaded(struct workqueue_struct *wq)
{
- return list_empty(&wq->list);
+ return wq->singlethread;
+}
+
+static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
+{
+ return is_single_threaded(wq)
+ ? &cpu_singlethread_map : &cpu_populated_map;
+}
+
+static
+struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+{
+ if (unlikely(is_single_threaded(wq)))
+ cpu = singlethread_cpu;
+ return per_cpu_ptr(wq->cpu_wq, cpu);
}
/*
* Set the workqueue on which a work item is to be run
* - Must *only* be called if the pending flag is set
*/
-static inline void set_wq_data(struct work_struct *work, void *wq)
+static inline void set_wq_data(struct work_struct *work,
+ struct cpu_workqueue_struct *cwq)
{
unsigned long new;
BUG_ON(!work_pending(work));
- new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
+ new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
atomic_long_set(&work->data, new);
}
-static inline void *get_wq_data(struct work_struct *work)
+static inline
+struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
{
return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
}
-static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
+static void insert_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work, int tail)
{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&cwq->lock, flags);
+ set_wq_data(work, cwq);
/*
- * We need to re-validate the work info after we've gotten
- * the cpu_workqueue lock. We can run the work now iff:
- *
- * - the wq_data still matches the cpu_workqueue_struct
- * - AND the work is still marked pending
- * - AND the work is still on a list (which will be this
- * workqueue_struct list)
- *
- * All these conditions are important, because we
- * need to protect against the work being run right
- * now on another CPU (all but the last one might be
- * true if it's currently running and has not been
- * released yet, for example).
+ * Ensure that we get the right work->data if we see the
+ * result of list_add() below, see try_to_grab_pending().
*/
- if (get_wq_data(work) == cwq
- && work_pending(work)
- && !list_empty(&work->entry)) {
- work_func_t f = work->func;
- list_del_init(&work->entry);
- spin_unlock_irqrestore(&cwq->lock, flags);
-
- if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
- work_release(work);
- f(work);
-
- spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
- ret = 1;
- }
- spin_unlock_irqrestore(&cwq->lock, flags);
- return ret;
+ smp_wmb();
+ if (tail)
+ list_add_tail(&work->entry, &cwq->worklist);
+ else
+ list_add(&work->entry, &cwq->worklist);
+ wake_up(&cwq->more_work);
}
-/**
- * run_scheduled_work - run scheduled work synchronously
- * @work: work to run
- *
- * This checks if the work was pending, and runs it
- * synchronously if so. It returns a boolean to indicate
- * whether it had any scheduled work to run or not.
- *
- * NOTE! This _only_ works for normal work_structs. You
- * CANNOT use this for delayed work, because the wq data
- * for delayed work will not point properly to the per-
- * CPU workqueue struct, but will change!
- */
-int fastcall run_scheduled_work(struct work_struct *work)
-{
- for (;;) {
- struct cpu_workqueue_struct *cwq;
-
- if (!work_pending(work))
- return 0;
- if (list_empty(&work->entry))
- return 0;
- /* NOTE! This depends intimately on __queue_work! */
- cwq = get_wq_data(work);
- if (!cwq)
- return 0;
- if (__run_work(cwq, work))
- return 1;
- }
-}
-EXPORT_SYMBOL(run_scheduled_work);
-
/* Preempt must be disabled. */
static void __queue_work(struct cpu_workqueue_struct *cwq,
struct work_struct *work)
@@ -185,10 +139,7 @@
unsigned long flags;
spin_lock_irqsave(&cwq->lock, flags);
- set_wq_data(work, cwq);
- list_add_tail(&work->entry, &cwq->worklist);
- cwq->insert_sequence++;
- wake_up(&cwq->more_work);
+ insert_work(cwq, work, 1);
spin_unlock_irqrestore(&cwq->lock, flags);
}
@@ -204,16 +155,14 @@
*/
int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
- int ret = 0, cpu = get_cpu();
+ int ret = 0;
if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- if (unlikely(is_single_threaded(wq)))
- cpu = singlethread_cpu;
BUG_ON(!list_empty(&work->entry));
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ __queue_work(wq_per_cpu(wq, get_cpu()), work);
+ put_cpu();
ret = 1;
}
- put_cpu();
return ret;
}
EXPORT_SYMBOL_GPL(queue_work);
@@ -221,13 +170,10 @@
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct workqueue_struct *wq = get_wq_data(&dwork->work);
- int cpu = smp_processor_id();
+ struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
+ struct workqueue_struct *wq = cwq->wq;
- if (unlikely(is_single_threaded(wq)))
- cpu = singlethread_cpu;
-
- __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
+ __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
}
/**
@@ -241,27 +187,11 @@
int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &dwork->timer;
- struct work_struct *work = &dwork->work;
-
- timer_stats_timer_set_start_info(timer);
+ timer_stats_timer_set_start_info(&dwork->timer);
if (delay == 0)
- return queue_work(wq, work);
+ return queue_work(wq, &dwork->work);
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
-
- /* This stores wq for the moment, for the timer_fn */
- set_wq_data(work, wq);
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)dwork;
- timer->function = delayed_work_timer_fn;
- add_timer(timer);
- ret = 1;
- }
- return ret;
+ return queue_delayed_work_on(-1, wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(queue_delayed_work);
@@ -285,12 +215,16 @@
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
- /* This stores wq for the moment, for the timer_fn */
- set_wq_data(work, wq);
+ /* This stores cwq for the moment, for the timer_fn */
+ set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
timer->expires = jiffies + delay;
timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
- add_timer_on(timer, cpu);
+
+ if (unlikely(cpu >= 0))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
ret = 1;
}
return ret;
@@ -299,13 +233,7 @@
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
- unsigned long flags;
-
- /*
- * Keep taking off work from the queue until
- * done.
- */
- spin_lock_irqsave(&cwq->lock, flags);
+ spin_lock_irq(&cwq->lock);
cwq->run_depth++;
if (cwq->run_depth > 3) {
/* morton gets to eat his hat */
@@ -318,12 +246,12 @@
struct work_struct, entry);
work_func_t f = work->func;
+ cwq->current_work = work;
list_del_init(cwq->worklist.next);
- spin_unlock_irqrestore(&cwq->lock, flags);
+ spin_unlock_irq(&cwq->lock);
BUG_ON(get_wq_data(work) != cwq);
- if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
- work_release(work);
+ work_clear_pending(work);
f(work);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@
dump_stack();
}
- spin_lock_irqsave(&cwq->lock, flags);
- cwq->remove_sequence++;
- wake_up(&cwq->work_done);
+ spin_lock_irq(&cwq->lock);
+ cwq->current_work = NULL;
}
cwq->run_depth--;
- spin_unlock_irqrestore(&cwq->lock, flags);
+ spin_unlock_irq(&cwq->lock);
+}
+
+/*
+ * NOTE: the caller must not touch *cwq if this func returns true
+ */
+static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
+{
+ int should_stop = cwq->should_stop;
+
+ if (unlikely(should_stop)) {
+ spin_lock_irq(&cwq->lock);
+ should_stop = cwq->should_stop && list_empty(&cwq->worklist);
+ if (should_stop)
+ cwq->thread = NULL;
+ spin_unlock_irq(&cwq->lock);
+ }
+
+ return should_stop;
}
static int worker_thread(void *__cwq)
{
struct cpu_workqueue_struct *cwq = __cwq;
- DECLARE_WAITQUEUE(wait, current);
- struct k_sigaction sa;
- sigset_t blocked;
+ DEFINE_WAIT(wait);
- if (!cwq->freezeable)
+ if (!cwq->wq->freezeable)
current->flags |= PF_NOFREEZE;
set_user_nice(current, -5);
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /*
- * We inherited MPOL_INTERLEAVE from the booting kernel.
- * Set MPOL_DEFAULT to insure node local allocations.
- */
- numa_default_policy();
-
- /* SIG_IGN makes children autoreap: see do_notify_parent(). */
- sa.sa.sa_handler = SIG_IGN;
- sa.sa.sa_flags = 0;
- siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
- do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- if (cwq->freezeable)
- try_to_freeze();
-
- add_wait_queue(&cwq->more_work, &wait);
- if (list_empty(&cwq->worklist))
+ for (;;) {
+ prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
+ if (!freezing(current) && !cwq->should_stop
+ && list_empty(&cwq->worklist))
schedule();
- else
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&cwq->more_work, &wait);
+ finish_wait(&cwq->more_work, &wait);
- if (!list_empty(&cwq->worklist))
- run_workqueue(cwq);
- set_current_state(TASK_INTERRUPTIBLE);
+ try_to_freeze();
+
+ if (cwq_should_stop(cwq))
+ break;
+
+ run_workqueue(cwq);
}
- __set_current_state(TASK_RUNNING);
+
return 0;
}
+struct wq_barrier {
+ struct work_struct work;
+ struct completion done;
+};
+
+static void wq_barrier_func(struct work_struct *work)
+{
+ struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+ complete(&barr->done);
+}
+
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+ struct wq_barrier *barr, int tail)
+{
+ INIT_WORK(&barr->work, wq_barrier_func);
+ __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
+
+ init_completion(&barr->done);
+
+ insert_work(cwq, &barr->work, tail);
+}
+
static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
{
if (cwq->thread == current) {
@@ -403,21 +349,18 @@
*/
run_workqueue(cwq);
} else {
- DEFINE_WAIT(wait);
- long sequence_needed;
+ struct wq_barrier barr;
+ int active = 0;
spin_lock_irq(&cwq->lock);
- sequence_needed = cwq->insert_sequence;
-
- while (sequence_needed - cwq->remove_sequence > 0) {
- prepare_to_wait(&cwq->work_done, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&cwq->lock);
- schedule();
- spin_lock_irq(&cwq->lock);
+ if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ active = 1;
}
- finish_wait(&cwq->work_done, &wait);
spin_unlock_irq(&cwq->lock);
+
+ if (active)
+ wait_for_completion(&barr.done);
}
}
@@ -428,151 +371,145 @@
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
*
- * This function will sample each workqueue's current insert_sequence number and
- * will sleep until the head sequence is greater than or equal to that. This
- * means that we sleep until all works which were queued on entry have been
- * handled, but we are not livelocked by new incoming ones.
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
*
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
void fastcall flush_workqueue(struct workqueue_struct *wq)
{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ int cpu;
+
might_sleep();
-
- if (is_single_threaded(wq)) {
- /* Always use first cpu's area. */
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));
- } else {
- int cpu;
-
- mutex_lock(&workqueue_mutex);
- for_each_online_cpu(cpu)
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
- mutex_unlock(&workqueue_mutex);
- }
+ for_each_cpu_mask(cpu, *cpu_map)
+ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}
EXPORT_SYMBOL_GPL(flush_workqueue);
-static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
- int cpu, int freezeable)
-{
- struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct task_struct *p;
-
- spin_lock_init(&cwq->lock);
- cwq->wq = wq;
- cwq->thread = NULL;
- cwq->insert_sequence = 0;
- cwq->remove_sequence = 0;
- cwq->freezeable = freezeable;
- INIT_LIST_HEAD(&cwq->worklist);
- init_waitqueue_head(&cwq->more_work);
- init_waitqueue_head(&cwq->work_done);
-
- if (is_single_threaded(wq))
- p = kthread_create(worker_thread, cwq, "%s", wq->name);
- else
- p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
- if (IS_ERR(p))
- return NULL;
- cwq->thread = p;
- return p;
-}
-
-struct workqueue_struct *__create_workqueue(const char *name,
- int singlethread, int freezeable)
-{
- int cpu, destroy = 0;
- struct workqueue_struct *wq;
- struct task_struct *p;
-
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq)
- return NULL;
-
- wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
- if (!wq->cpu_wq) {
- kfree(wq);
- return NULL;
- }
-
- wq->name = name;
- mutex_lock(&workqueue_mutex);
- if (singlethread) {
- INIT_LIST_HEAD(&wq->list);
- p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
- if (!p)
- destroy = 1;
- else
- wake_up_process(p);
- } else {
- list_add(&wq->list, &workqueues);
- for_each_online_cpu(cpu) {
- p = create_workqueue_thread(wq, cpu, freezeable);
- if (p) {
- kthread_bind(p, cpu);
- wake_up_process(p);
- } else
- destroy = 1;
- }
- }
- mutex_unlock(&workqueue_mutex);
-
- /*
- * Was there any error during startup? If yes then clean up:
- */
- if (destroy) {
- destroy_workqueue(wq);
- wq = NULL;
- }
- return wq;
-}
-EXPORT_SYMBOL_GPL(__create_workqueue);
-
-static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
+/*
+ * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ */
+static int try_to_grab_pending(struct work_struct *work)
{
struct cpu_workqueue_struct *cwq;
- unsigned long flags;
- struct task_struct *p;
+ int ret = 0;
- cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- spin_lock_irqsave(&cwq->lock, flags);
- p = cwq->thread;
- cwq->thread = NULL;
- spin_unlock_irqrestore(&cwq->lock, flags);
- if (p)
- kthread_stop(p);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ return 1;
+
+ /*
+ * The queueing is in progress, or it is already queued. Try to
+ * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+ */
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return ret;
+
+ spin_lock_irq(&cwq->lock);
+ if (!list_empty(&work->entry)) {
+ /*
+ * This work is queued, but perhaps we locked the wrong cwq.
+ * In that case we must see the new value after rmb(), see
+ * insert_work()->wmb().
+ */
+ smp_rmb();
+ if (cwq == get_wq_data(work)) {
+ list_del_init(&work->entry);
+ ret = 1;
+ }
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ return ret;
+}
+
+static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
+ struct work_struct *work)
+{
+ struct wq_barrier barr;
+ int running = 0;
+
+ spin_lock_irq(&cwq->lock);
+ if (unlikely(cwq->current_work == work)) {
+ insert_wq_barrier(cwq, &barr, 0);
+ running = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ if (unlikely(running))
+ wait_for_completion(&barr.done);
+}
+
+static void wait_on_work(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq;
+ struct workqueue_struct *wq;
+ const cpumask_t *cpu_map;
+ int cpu;
+
+ might_sleep();
+
+ cwq = get_wq_data(work);
+ if (!cwq)
+ return;
+
+ wq = cwq->wq;
+ cpu_map = wq_cpu_map(wq);
+
+ for_each_cpu_mask(cpu, *cpu_map)
+ wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
}
/**
- * destroy_workqueue - safely terminate a workqueue
- * @wq: target workqueue
+ * cancel_work_sync - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
*
- * Safely destroy a workqueue. All work currently pending will be done first.
+ * cancel_work_sync() will cancel the work if it is queued. If the work's
+ * callback appears to be running, cancel_work_sync() will block until it
+ * has completed.
+ *
+ * It is possible to use this function if the work re-queues itself. It can
+ * cancel the work even if it migrates to another workqueue, however in that
+ * case it only guarantees that work->func() has completed on the last queued
+ * workqueue.
+ *
+ * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
+ * pending, otherwise it goes into a busy-wait loop until the timer expires.
+ *
+ * The caller must ensure that workqueue_struct on which this work was last
+ * queued can't be destroyed before this function returns.
*/
-void destroy_workqueue(struct workqueue_struct *wq)
+void cancel_work_sync(struct work_struct *work)
{
- int cpu;
-
- flush_workqueue(wq);
-
- /* We don't need the distraction of CPUs appearing and vanishing. */
- mutex_lock(&workqueue_mutex);
- if (is_single_threaded(wq))
- cleanup_workqueue_thread(wq, singlethread_cpu);
- else {
- for_each_online_cpu(cpu)
- cleanup_workqueue_thread(wq, cpu);
- list_del(&wq->list);
- }
- mutex_unlock(&workqueue_mutex);
- free_percpu(wq->cpu_wq);
- kfree(wq);
+ while (!try_to_grab_pending(work))
+ cpu_relax();
+ wait_on_work(work);
+ work_clear_pending(work);
}
-EXPORT_SYMBOL_GPL(destroy_workqueue);
+EXPORT_SYMBOL_GPL(cancel_work_sync);
-static struct workqueue_struct *keventd_wq;
+/**
+ * cancel_rearming_delayed_work - reliably kill off a delayed work.
+ * @dwork: the delayed work struct
+ *
+ * It is possible to use this function if @dwork rearms itself via queue_work()
+ * or queue_delayed_work(). See also the comment for cancel_work_sync().
+ */
+void cancel_rearming_delayed_work(struct delayed_work *dwork)
+{
+ while (!del_timer(&dwork->timer) &&
+ !try_to_grab_pending(&dwork->work))
+ cpu_relax();
+ wait_on_work(&dwork->work);
+ work_clear_pending(&dwork->work);
+}
+EXPORT_SYMBOL(cancel_rearming_delayed_work);
+
+static struct workqueue_struct *keventd_wq __read_mostly;
/**
* schedule_work - put work task in global workqueue
@@ -638,7 +575,7 @@
if (!works)
return -ENOMEM;
- mutex_lock(&workqueue_mutex);
+ preempt_disable(); /* CPU hotplug */
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@@ -646,7 +583,7 @@
set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
- mutex_unlock(&workqueue_mutex);
+ preempt_enable();
flush_workqueue(keventd_wq);
free_percpu(works);
return 0;
@@ -659,29 +596,6 @@
EXPORT_SYMBOL(flush_scheduled_work);
/**
- * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
- * @wq: the controlling workqueue structure
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
- struct delayed_work *dwork)
-{
- while (!cancel_delayed_work(dwork))
- flush_workqueue(wq);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
-
-/**
- * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
- * @dwork: the delayed work struct
- */
-void cancel_rearming_delayed_work(struct delayed_work *dwork)
-{
- cancel_rearming_delayed_workqueue(keventd_wq, dwork);
-}
-EXPORT_SYMBOL(cancel_rearming_delayed_work);
-
-/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@
}
-/* Take the work from this (downed) CPU. */
-static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
+static struct cpu_workqueue_struct *
+init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
{
struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- struct list_head list;
- struct work_struct *work;
- spin_lock_irq(&cwq->lock);
- list_replace_init(&cwq->worklist, &list);
+ cwq->wq = wq;
+ spin_lock_init(&cwq->lock);
+ INIT_LIST_HEAD(&cwq->worklist);
+ init_waitqueue_head(&cwq->more_work);
- while (!list_empty(&list)) {
- printk("Taking work for %s\n", wq->name);
- work = list_entry(list.next,struct work_struct,entry);
- list_del(&work->entry);
- __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
- }
- spin_unlock_irq(&cwq->lock);
+ return cwq;
}
-/* We're holding the cpucontrol mutex here */
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
- unsigned int hotcpu = (unsigned long)hcpu;
+ struct workqueue_struct *wq = cwq->wq;
+ const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
+ struct task_struct *p;
+
+ p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
+ /*
+ * Nobody can add the work_struct to this cwq,
+ * if (caller is __create_workqueue)
+ * nobody should see this wq
+ * else // caller is CPU_UP_PREPARE
+ * cpu is not on cpu_online_map
+ * so we can abort safely.
+ */
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ cwq->thread = p;
+ cwq->should_stop = 0;
+
+ return 0;
+}
+
+static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct task_struct *p = cwq->thread;
+
+ if (p != NULL) {
+ if (cpu >= 0)
+ kthread_bind(p, cpu);
+ wake_up_process(p);
+ }
+}
+
+struct workqueue_struct *__create_workqueue(const char *name,
+ int singlethread, int freezeable)
+{
+ struct workqueue_struct *wq;
+ struct cpu_workqueue_struct *cwq;
+ int err = 0, cpu;
+
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return NULL;
+
+ wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
+ if (!wq->cpu_wq) {
+ kfree(wq);
+ return NULL;
+ }
+
+ wq->name = name;
+ wq->singlethread = singlethread;
+ wq->freezeable = freezeable;
+ INIT_LIST_HEAD(&wq->list);
+
+ if (singlethread) {
+ cwq = init_cpu_workqueue(wq, singlethread_cpu);
+ err = create_workqueue_thread(cwq, singlethread_cpu);
+ start_workqueue_thread(cwq, -1);
+ } else {
+ mutex_lock(&workqueue_mutex);
+ list_add(&wq->list, &workqueues);
+
+ for_each_possible_cpu(cpu) {
+ cwq = init_cpu_workqueue(wq, cpu);
+ if (err || !cpu_online(cpu))
+ continue;
+ err = create_workqueue_thread(cwq, cpu);
+ start_workqueue_thread(cwq, cpu);
+ }
+ mutex_unlock(&workqueue_mutex);
+ }
+
+ if (err) {
+ destroy_workqueue(wq);
+ wq = NULL;
+ }
+ return wq;
+}
+EXPORT_SYMBOL_GPL(__create_workqueue);
+
+static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+{
+ struct wq_barrier barr;
+ int alive = 0;
+
+ spin_lock_irq(&cwq->lock);
+ if (cwq->thread != NULL) {
+ insert_wq_barrier(cwq, &barr, 1);
+ cwq->should_stop = 1;
+ alive = 1;
+ }
+ spin_unlock_irq(&cwq->lock);
+
+ if (alive) {
+ wait_for_completion(&barr.done);
+
+ while (unlikely(cwq->thread != NULL))
+ cpu_relax();
+ /*
+ * Wait until cwq->thread unlocks cwq->lock,
+ * it won't touch *cwq after that.
+ */
+ smp_rmb();
+ spin_unlock_wait(&cwq->lock);
+ }
+}
+
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
+void destroy_workqueue(struct workqueue_struct *wq)
+{
+ const cpumask_t *cpu_map = wq_cpu_map(wq);
+ struct cpu_workqueue_struct *cwq;
+ int cpu;
+
+ mutex_lock(&workqueue_mutex);
+ list_del(&wq->list);
+ mutex_unlock(&workqueue_mutex);
+
+ for_each_cpu_mask(cpu, *cpu_map) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+ cleanup_workqueue_thread(cwq, cpu);
+ }
+
+ free_percpu(wq->cpu_wq);
+ kfree(wq);
+}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct cpu_workqueue_struct *cwq;
struct workqueue_struct *wq;
+ action &= ~CPU_TASKS_FROZEN;
+
switch (action) {
+ case CPU_LOCK_ACQUIRE:
+ mutex_lock(&workqueue_mutex);
+ return NOTIFY_OK;
+
+ case CPU_LOCK_RELEASE:
+ mutex_unlock(&workqueue_mutex);
+ return NOTIFY_OK;
+
case CPU_UP_PREPARE:
- mutex_lock(&workqueue_mutex);
- /* Create a new workqueue thread for it. */
- list_for_each_entry(wq, &workqueues, list) {
- if (!create_workqueue_thread(wq, hotcpu, 0)) {
- printk("workqueue for %i failed\n", hotcpu);
- return NOTIFY_BAD;
- }
+ cpu_set(cpu, cpu_populated_map);
+ }
+
+ list_for_each_entry(wq, &workqueues, list) {
+ cwq = per_cpu_ptr(wq->cpu_wq, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!create_workqueue_thread(cwq, cpu))
+ break;
+ printk(KERN_ERR "workqueue for %i failed\n", cpu);
+ return NOTIFY_BAD;
+
+ case CPU_ONLINE:
+ start_workqueue_thread(cwq, cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ start_workqueue_thread(cwq, -1);
+ case CPU_DEAD:
+ cleanup_workqueue_thread(cwq, cpu);
+ break;
}
- break;
-
- case CPU_ONLINE:
- /* Kick off worker threads. */
- list_for_each_entry(wq, &workqueues, list) {
- struct cpu_workqueue_struct *cwq;
-
- cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
- kthread_bind(cwq->thread, hotcpu);
- wake_up_process(cwq->thread);
- }
- mutex_unlock(&workqueue_mutex);
- break;
-
- case CPU_UP_CANCELED:
- list_for_each_entry(wq, &workqueues, list) {
- if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
- continue;
- /* Unbind so it can run. */
- kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
- any_online_cpu(cpu_online_map));
- cleanup_workqueue_thread(wq, hotcpu);
- }
- mutex_unlock(&workqueue_mutex);
- break;
-
- case CPU_DOWN_PREPARE:
- mutex_lock(&workqueue_mutex);
- break;
-
- case CPU_DOWN_FAILED:
- mutex_unlock(&workqueue_mutex);
- break;
-
- case CPU_DEAD:
- list_for_each_entry(wq, &workqueues, list)
- cleanup_workqueue_thread(wq, hotcpu);
- list_for_each_entry(wq, &workqueues, list)
- take_over_work(wq, hotcpu);
- mutex_unlock(&workqueue_mutex);
- break;
}
return NOTIFY_OK;
}
-void init_workqueues(void)
+void __init init_workqueues(void)
{
+ cpu_populated_map = cpu_online_map;
singlethread_cpu = first_cpu(cpu_possible_map);
+ cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
hotcpu_notifier(workqueue_cpu_callback, 0);
keventd_wq = create_workqueue("events");
BUG_ON(!keventd_wq);
}
-
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d69ddbe..402eb4e 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1004,7 +1004,7 @@
struct radix_tree_preload *rtp;
/* Free per-cpu pool of perloaded nodes */
- if (action == CPU_DEAD) {
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
rtp = &per_cpu(radix_tree_preloads, cpu);
while (rtp->nr) {
kmem_cache_free(radix_tree_node_cachep,
diff --git a/mm/filemap.c b/mm/filemap.c
index 9cbf4fe..9e56fd1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -750,6 +750,7 @@
read_unlock_irq(&mapping->tree_lock);
return i;
}
+EXPORT_SYMBOL(find_get_pages_contig);
/**
* find_get_pages_tag - find and return pages that match @tag
@@ -778,6 +779,7 @@
read_unlock_irq(&mapping->tree_lock);
return ret;
}
+EXPORT_SYMBOL(find_get_pages_tag);
/**
* grab_cache_page_nowait - returns locked page at given index in given cache
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb3358..1b49dab 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -434,7 +434,6 @@
unsigned blocksize;
unsigned length;
struct page *page;
- void *kaddr;
BUG_ON(!mapping->a_ops->get_xip_page);
@@ -458,11 +457,7 @@
else
return PTR_ERR(page);
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- kunmap_atomic(kaddr, KM_USER0);
-
- flush_dcache_page(page);
+ zero_user_page(page, offset, length, KM_USER0);
return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 36db012..eb7180d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -140,6 +140,8 @@
return page;
fail:
+ if (vma->vm_flags & VM_MAYSHARE)
+ resv_huge_pages++;
spin_unlock(&hugetlb_lock);
return NULL;
}
@@ -172,6 +174,17 @@
}
__setup("hugepages=", hugetlb_setup);
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+ int node;
+ unsigned int nr = 0;
+
+ for_each_node_mask(node, cpuset_current_mems_allowed)
+ nr += array[node];
+
+ return nr;
+}
+
#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
@@ -817,6 +830,26 @@
chg = region_chg(&inode->i_mapping->private_list, from, to);
if (chg < 0)
return chg;
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. Application can still potentially OOM'ed by kernel
+ * with lack of free htlb page in cpuset that the task is in.
+ * Attempt to enforce strict accounting with cpuset is almost
+ * impossible (or too ugly) because cpuset is too fluid that
+ * task or memory node can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to check against current free page availability as
+ * a best attempt and hopefully to minimize the impact of changing
+ * semantics that cpuset has.
+ */
+ if (chg > cpuset_mems_nr(free_huge_pages_node))
+ return -ENOMEM;
+
ret = hugetlb_acct_memory(chg);
if (ret < 0)
return ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6fd0b74..f9b5d6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@
#ifdef CONFIG_NUMA
/*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
* Note that this function must be called with the thread pinned to
* a single processor.
*/
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- int i;
- enum zone_type z;
unsigned long flags;
+ int to_drain;
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
- struct per_cpu_pageset *pset;
-
- if (!populated_zone(zone))
- continue;
-
- pset = zone_pcp(zone, smp_processor_id());
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
-
- pcp = &pset->pcp[i];
- if (pcp->count) {
- int to_drain;
-
- local_irq_save(flags);
- if (pcp->count >= pcp->batch)
- to_drain = pcp->batch;
- else
- to_drain = pcp->count;
- free_pages_bulk(zone, to_drain, &pcp->list, 0);
- pcp->count -= to_drain;
- local_irq_restore(flags);
- }
- }
- }
+ local_irq_save(flags);
+ if (pcp->count >= pcp->batch)
+ to_drain = pcp->batch;
+ else
+ to_drain = pcp->count;
+ free_pages_bulk(zone, to_drain, &pcp->list, 0);
+ pcp->count -= to_drain;
+ local_irq_restore(flags);
}
#endif
@@ -2148,11 +2131,14 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
if (process_zones(cpu))
ret = NOTIFY_BAD;
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
free_zone_pagesets(cpu);
break;
default:
@@ -3012,7 +2998,7 @@
{
int cpu = (unsigned long)hcpu;
- if (action == CPU_DEAD) {
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
local_irq_disable();
__drain_pages(cpu);
vm_events_fold_cpu(cpu);
diff --git a/mm/slab.c b/mm/slab.c
index acda7e2..944b205 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@
{
int node = __get_cpu_var(reap_node);
- /*
- * Also drain per cpu pages on remote zones
- */
- if (node != numa_node_id())
- drain_node_pages(node);
-
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
@@ -1186,8 +1180,11 @@
int memsize = sizeof(struct kmem_list3);
switch (action) {
- case CPU_UP_PREPARE:
+ case CPU_LOCK_ACQUIRE:
mutex_lock(&cache_chain_mutex);
+ break;
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
/*
* We need to do this right in the beginning since
* alloc_arraycache's are going to use this list.
@@ -1274,17 +1271,28 @@
}
break;
case CPU_ONLINE:
- mutex_unlock(&cache_chain_mutex);
+ case CPU_ONLINE_FROZEN:
start_cpu_timer(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- mutex_lock(&cache_chain_mutex);
- break;
- case CPU_DOWN_FAILED:
- mutex_unlock(&cache_chain_mutex);
- break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ /*
+ * Shutdown cache reaper. Note that the cache_chain_mutex is
+ * held so that if cache_reap() is invoked it cannot do
+ * anything expensive but will only modify reap_work
+ * and reschedule the timer.
+ */
+ cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
+ /* Now the cache_reaper is guaranteed to be not running. */
+ per_cpu(reap_work, cpu).work.func = NULL;
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ start_cpu_timer(cpu);
+ break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/*
* Even if all the cpus of a node are down, we don't free the
* kmem_list3 of any cache. This to avoid a race between
@@ -1296,6 +1304,7 @@
/* fall thru */
#endif
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
@@ -1354,6 +1363,8 @@
continue;
drain_freelist(cachep, l3, l3->free_objects);
}
+ break;
+ case CPU_LOCK_RELEASE:
mutex_unlock(&cache_chain_mutex);
break;
}
@@ -3742,7 +3753,6 @@
/**
* krealloc - reallocate memory. The contents will remain unchanged.
- *
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
@@ -4140,7 +4150,6 @@
check_irq_on();
mutex_unlock(&cache_chain_mutex);
next_reap_node();
- refresh_cpu_vm_stats(smp_processor_id());
out:
/* Set up the next iteration */
schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
diff --git a/mm/slub.c b/mm/slub.c
index 5db3da5..bd2efae 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -66,11 +66,11 @@
* SLUB assigns one slab for allocation to each processor.
* Allocations only occur from these slabs called cpu slabs.
*
- * Slabs with free elements are kept on a partial list.
- * There is no list for full slabs. If an object in a full slab is
+ * Slabs with free elements are kept on a partial list and during regular
+ * operations no list for full slabs is used. If an object in a full slab is
* freed then the slab will show up again on the partial lists.
- * Otherwise there is no need to track full slabs unless we have to
- * track full slabs for debugging purposes.
+ * We track full slabs for debugging purposes though because otherwise we
+ * cannot scan all objects.
*
* Slabs are freed when they become empty. Teardown and setup is
* minimal so we rely on the page allocators per cpu caches for
@@ -87,13 +87,36 @@
* the fast path.
*/
+static inline int SlabDebug(struct page *page)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ return PageError(page);
+#else
+ return 0;
+#endif
+}
+
+static inline void SetSlabDebug(struct page *page)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ SetPageError(page);
+#endif
+}
+
+static inline void ClearSlabDebug(struct page *page)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ ClearPageError(page);
+#endif
+}
+
/*
* Issues still to be resolved:
*
* - The per cpu array is updated for each new slab and and is a remote
* cacheline for most nodes. This could become a bouncing cacheline given
- * enough frequent updates. There are 16 pointers in a cacheline.so at
- * max 16 cpus could compete. Likely okay.
+ * enough frequent updates. There are 16 pointers in a cacheline, so at
+ * max 16 cpus could compete for the cacheline which may be okay.
*
* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
*
@@ -137,6 +160,7 @@
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
+
/*
* Set of flags that will prevent slab merging
*/
@@ -157,6 +181,11 @@
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
+/* Not all arches define cache_line_size */
+#ifndef cache_line_size
+#define cache_line_size() L1_CACHE_BYTES
+#endif
+
static int kmem_size = sizeof(struct kmem_cache);
#ifdef CONFIG_SMP
@@ -166,7 +195,7 @@
static enum {
DOWN, /* No slab functionality available */
PARTIAL, /* kmem_cache_open() works but kmalloc does not */
- UP, /* Everything works */
+ UP, /* Everything works but does not show up in sysfs */
SYSFS /* Sysfs up */
} slab_state = DOWN;
@@ -174,7 +203,19 @@
static DECLARE_RWSEM(slub_lock);
LIST_HEAD(slab_caches);
-#ifdef CONFIG_SYSFS
+/*
+ * Tracking user of a slab.
+ */
+struct track {
+ void *addr; /* Called from address */
+ int cpu; /* Was running on cpu */
+ int pid; /* Pid context */
+ unsigned long when; /* When did the operation occur */
+};
+
+enum track_item { TRACK_ALLOC, TRACK_FREE };
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
@@ -202,6 +243,63 @@
#endif
}
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, const void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ if (object < base || object >= base + s->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Slow version of get and set free pointer.
+ *
+ * This version requires touching the cache lines of kmem_cache which
+ * we avoid to do in the fast alloc free paths. There we obtain the offset
+ * from the page struct.
+ */
+static inline void *get_freepointer(struct kmem_cache *s, void *object)
+{
+ return *(void **)(object + s->offset);
+}
+
+static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
+{
+ *(void **)(object + s->offset) = fp;
+}
+
+/* Loop over all objects in a slab */
+#define for_each_object(__p, __s, __addr) \
+ for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
+ __p += (__s)->size)
+
+/* Scan freelist */
+#define for_each_free_object(__p, __s, __free) \
+ for (__p = (__free); __p; __p = get_freepointer((__s), __p))
+
+/* Determine object index from a given position */
+static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
+{
+ return (p - addr) / s->size;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+/*
+ * Debug settings:
+ */
+static int slub_debug;
+
+static char *slub_debug_slabs;
+
/*
* Object debugging
*/
@@ -237,35 +335,6 @@
}
}
-/*
- * Slow version of get and set free pointer.
- *
- * This requires touching the cache lines of kmem_cache.
- * The offset can also be obtained from the page. In that
- * case it is in the cacheline that we already need to touch.
- */
-static void *get_freepointer(struct kmem_cache *s, void *object)
-{
- return *(void **)(object + s->offset);
-}
-
-static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
-{
- *(void **)(object + s->offset) = fp;
-}
-
-/*
- * Tracking user of a slab.
- */
-struct track {
- void *addr; /* Called from address */
- int cpu; /* Was running on cpu */
- int pid; /* Pid context */
- unsigned long when; /* When did the operation occur */
-};
-
-enum track_item { TRACK_ALLOC, TRACK_FREE };
-
static struct track *get_track(struct kmem_cache *s, void *object,
enum track_item alloc)
{
@@ -400,24 +469,6 @@
return 1;
}
-
-static int check_valid_pointer(struct kmem_cache *s, struct page *page,
- void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + s->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
/*
* Object layout:
*
@@ -425,26 +476,34 @@
* Bytes of the object to be managed.
* If the freepointer may overlay the object then the free
* pointer is the first word of the object.
+ *
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
*
* object + s->objsize
* Padding to reach word boundary. This is also used for Redzoning.
- * Padding is extended to word size if Redzoning is enabled
- * and objsize == inuse.
+ * Padding is extended by another word if Redzoning is enabled and
+ * objsize == inuse.
+ *
* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
* 0xcc (RED_ACTIVE) for objects in use.
*
* object + s->inuse
+ * Meta data starts here.
+ *
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary
- * Padding is done using 0x5a (POISON_INUSE)
+ * C. Padding to reach required alignment boundary or at mininum
+ * one word if debuggin is on to be able to detect writes
+ * before the word boundary.
+ *
+ * Padding is done using 0x5a (POISON_INUSE)
*
* object + s->size
+ * Nothing is used beyond s->size.
*
- * If slabcaches are merged then the objsize and inuse boundaries are to
- * be ignored. And therefore no slab options that rely on these boundaries
+ * If slabcaches are merged then the objsize and inuse boundaries are mostly
+ * ignored. And therefore no slab options that rely on these boundaries
* may be used with merged slabcaches.
*/
@@ -570,8 +629,7 @@
/*
* No choice but to zap it and thus loose the remainder
* of the free objects in this slab. May cause
- * another error because the object count maybe
- * wrong now.
+ * another error because the object count is now wrong.
*/
set_freepointer(s, p, NULL);
return 0;
@@ -611,9 +669,8 @@
}
/*
- * Determine if a certain object on a page is on the freelist and
- * therefore free. Must hold the slab lock for cpu slabs to
- * guarantee that the chains are consistent.
+ * Determine if a certain object on a page is on the freelist. Must hold the
+ * slab lock to guarantee that the chains are in a consistent state.
*/
static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
{
@@ -659,7 +716,7 @@
}
/*
- * Tracking of fully allocated slabs for debugging
+ * Tracking of fully allocated slabs for debugging purposes.
*/
static void add_full(struct kmem_cache_node *n, struct page *page)
{
@@ -710,7 +767,7 @@
/*
* If this is a slab page then lets do the best we can
* to avoid issues in the future. Marking all objects
- * as used avoids touching the remainder.
+ * as used avoids touching the remaining objects.
*/
printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
s->name, page);
@@ -764,6 +821,113 @@
return 0;
}
+static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc)
+{
+ if (s->flags & SLAB_TRACE) {
+ printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+ s->name,
+ alloc ? "alloc" : "free",
+ object, page->inuse,
+ page->freelist);
+
+ if (!alloc)
+ print_section("Object", (void *)object, s->objsize);
+
+ dump_stack();
+ }
+}
+
+static int __init setup_slub_debug(char *str)
+{
+ if (!str || *str != '=')
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ else {
+ str++;
+ if (*str == 0 || *str == ',')
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ else
+ for( ;*str && *str != ','; str++)
+ switch (*str) {
+ case 'f' : case 'F' :
+ slub_debug |= SLAB_DEBUG_FREE;
+ break;
+ case 'z' : case 'Z' :
+ slub_debug |= SLAB_RED_ZONE;
+ break;
+ case 'p' : case 'P' :
+ slub_debug |= SLAB_POISON;
+ break;
+ case 'u' : case 'U' :
+ slub_debug |= SLAB_STORE_USER;
+ break;
+ case 't' : case 'T' :
+ slub_debug |= SLAB_TRACE;
+ break;
+ default:
+ printk(KERN_ERR "slub_debug option '%c' "
+ "unknown. skipped\n",*str);
+ }
+ }
+
+ if (*str == ',')
+ slub_debug_slabs = str + 1;
+ return 1;
+}
+
+__setup("slub_debug", setup_slub_debug);
+
+static void kmem_cache_open_debug_check(struct kmem_cache *s)
+{
+ /*
+ * The page->offset field is only 16 bit wide. This is an offset
+ * in units of words from the beginning of an object. If the slab
+ * size is bigger then we cannot move the free pointer behind the
+ * object anymore.
+ *
+ * On 32 bit platforms the limit is 256k. On 64bit platforms
+ * the limit is 512k.
+ *
+ * Debugging or ctor/dtors may create a need to move the free
+ * pointer. Fail if this happens.
+ */
+ if (s->size >= 65535 * sizeof(void *)) {
+ BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
+ SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
+ BUG_ON(s->ctor || s->dtor);
+ }
+ else
+ /*
+ * Enable debugging if selected on the kernel commandline.
+ */
+ if (slub_debug && (!slub_debug_slabs ||
+ strncmp(slub_debug_slabs, s->name,
+ strlen(slub_debug_slabs)) == 0))
+ s->flags |= slub_debug;
+}
+#else
+
+static inline int alloc_object_checks(struct kmem_cache *s,
+ struct page *page, void *object) { return 0; }
+
+static inline int free_object_checks(struct kmem_cache *s,
+ struct page *page, void *object) { return 0; }
+
+static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct page *page) {}
+static inline void trace(struct kmem_cache *s, struct page *page,
+ void *object, int alloc) {}
+static inline void init_object(struct kmem_cache *s,
+ void *object, int active) {}
+static inline void init_tracking(struct kmem_cache *s, void *object) {}
+static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+ { return 1; }
+static inline int check_object(struct kmem_cache *s, struct page *page,
+ void *object, int active) { return 1; }
+static inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, void *addr) {}
+static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
+#define slub_debug 0
+#endif
/*
* Slab allocation and freeing
*/
@@ -797,7 +961,7 @@
static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
- if (PageError(page)) {
+ if (SlabDebug(page)) {
init_object(s, object, 0);
init_tracking(s, object);
}
@@ -832,7 +996,7 @@
page->flags |= 1 << PG_slab;
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
SLAB_STORE_USER | SLAB_TRACE))
- page->flags |= 1 << PG_error;
+ SetSlabDebug(page);
start = page_address(page);
end = start + s->objects * s->size;
@@ -841,7 +1005,7 @@
memset(start, POISON_INUSE, PAGE_SIZE << s->order);
last = start;
- for (p = start + s->size; p < end; p += s->size) {
+ for_each_object(p, s, start) {
setup_object(s, page, last);
set_freepointer(s, last, p);
last = p;
@@ -861,13 +1025,11 @@
{
int pages = 1 << s->order;
- if (unlikely(PageError(page) || s->dtor)) {
- void *start = page_address(page);
- void *end = start + (pages << PAGE_SHIFT);
+ if (unlikely(SlabDebug(page) || s->dtor)) {
void *p;
slab_pad_check(s, page);
- for (p = start; p <= end - s->size; p += s->size) {
+ for_each_object(p, s, page_address(page)) {
if (s->dtor)
s->dtor(p, s, 0);
check_object(s, page, p, 0);
@@ -910,7 +1072,8 @@
atomic_long_dec(&n->nr_slabs);
reset_page_mapcount(page);
- page->flags &= ~(1 << PG_slab | 1 << PG_error);
+ ClearSlabDebug(page);
+ __ClearPageSlab(page);
free_slab(s, page);
}
@@ -966,9 +1129,9 @@
}
/*
- * Lock page and remove it from the partial list
+ * Lock slab and remove from the partial list.
*
- * Must hold list_lock
+ * Must hold list_lock.
*/
static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
{
@@ -981,7 +1144,7 @@
}
/*
- * Try to get a partial slab from a specific node
+ * Try to allocate a partial slab from a specific node.
*/
static struct page *get_partial_node(struct kmem_cache_node *n)
{
@@ -990,7 +1153,8 @@
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab then get_partials() will return NULL.
+ * partial slab and there is none available then get_partials()
+ * will return NULL.
*/
if (!n || !n->nr_partial)
return NULL;
@@ -1006,8 +1170,7 @@
}
/*
- * Get a page from somewhere. Search in increasing NUMA
- * distances.
+ * Get a page from somewhere. Search in increasing NUMA distances.
*/
static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
{
@@ -1017,24 +1180,22 @@
struct page *page;
/*
- * The defrag ratio allows to configure the tradeoffs between
- * inter node defragmentation and node local allocations.
- * A lower defrag_ratio increases the tendency to do local
- * allocations instead of scanning throught the partial
- * lists on other nodes.
+ * The defrag ratio allows a configuration of the tradeoffs between
+ * inter node defragmentation and node local allocations. A lower
+ * defrag_ratio increases the tendency to do local allocations
+ * instead of attempting to obtain partial slabs from other nodes.
*
- * If defrag_ratio is set to 0 then kmalloc() always
- * returns node local objects. If its higher then kmalloc()
- * may return off node objects in order to avoid fragmentation.
- *
- * A higher ratio means slabs may be taken from other nodes
- * thus reducing the number of partial slabs on those nodes.
+ * If the defrag_ratio is set to 0 then kmalloc() always
+ * returns node local objects. If the ratio is higher then kmalloc()
+ * may return off node objects because partial slabs are obtained
+ * from other nodes and filled up.
*
* If /sys/slab/xx/defrag_ratio is set to 100 (which makes
- * defrag_ratio = 1000) then every (well almost) allocation
- * will first attempt to defrag slab caches on other nodes. This
- * means scanning over all nodes to look for partial slabs which
- * may be a bit expensive to do on every slab allocation.
+ * defrag_ratio = 1000) then every (well almost) allocation will
+ * first attempt to defrag slab caches on other nodes. This means
+ * scanning over all nodes to look for partial slabs which may be
+ * expensive if we do it every time we are trying to find a slab
+ * with available objects.
*/
if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
return NULL;
@@ -1087,18 +1248,19 @@
if (page->freelist)
add_partial(n, page);
- else if (PageError(page) && (s->flags & SLAB_STORE_USER))
+ else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
add_full(n, page);
slab_unlock(page);
} else {
if (n->nr_partial < MIN_PARTIAL) {
/*
- * Adding an empty page to the partial slabs in order
- * to avoid page allocator overhead. This page needs to
- * come after all the others that are not fully empty
- * in order to make sure that we do maximum
- * defragmentation.
+ * Adding an empty slab to the partial slabs in order
+ * to avoid page allocator overhead. This slab needs
+ * to come after the other slabs with objects in
+ * order to fill them up. That way the size of the
+ * partial list stays small. kmem_cache_shrink can
+ * reclaim empty slabs from the partial list.
*/
add_partial_tail(n, page);
slab_unlock(page);
@@ -1166,11 +1328,11 @@
* 1. The page struct
* 2. The first cacheline of the object to be allocated.
*
- * The only cache lines that are read (apart from code) is the
+ * The only other cache lines that are read (apart from code) is the
* per cpu array in the kmem_cache struct.
*
* Fastpath is not possible if we need to get a new slab or have
- * debugging enabled (which means all slabs are marked with PageError)
+ * debugging enabled (which means all slabs are marked with SlabDebug)
*/
static void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, int node, void *addr)
@@ -1193,7 +1355,7 @@
object = page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(PageError(page)))
+ if (unlikely(SlabDebug(page)))
goto debug;
have_object:
@@ -1220,9 +1382,11 @@
cpu = smp_processor_id();
if (s->cpu_slab[cpu]) {
/*
- * Someone else populated the cpu_slab while we enabled
- * interrupts, or we have got scheduled on another cpu.
- * The page may not be on the requested node.
+ * Someone else populated the cpu_slab while we
+ * enabled interrupts, or we have gotten scheduled
+ * on another cpu. The page may not be on the
+ * requested node even if __GFP_THISNODE was
+ * specified. So we need to recheck.
*/
if (node == -1 ||
page_to_nid(s->cpu_slab[cpu]) == node) {
@@ -1235,7 +1399,7 @@
slab_lock(page);
goto redo;
}
- /* Dump the current slab */
+ /* New slab does not fit our expectations */
flush_slab(s, s->cpu_slab[cpu], cpu);
}
slab_lock(page);
@@ -1248,12 +1412,7 @@
goto another_slab;
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_ALLOC, addr);
- if (s->flags & SLAB_TRACE) {
- printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
- s->name, object, page->inuse,
- page->freelist);
- dump_stack();
- }
+ trace(s, page, object, 1);
init_object(s, object, 1);
goto have_object;
}
@@ -1276,7 +1435,8 @@
* The fastpath only writes the cacheline of the page struct and the first
* cacheline of the object.
*
- * No special cachelines need to be read
+ * We read the cpu_slab cacheline to check if the slab is the per cpu
+ * slab for this processor.
*/
static void slab_free(struct kmem_cache *s, struct page *page,
void *x, void *addr)
@@ -1288,7 +1448,7 @@
local_irq_save(flags);
slab_lock(page);
- if (unlikely(PageError(page)))
+ if (unlikely(SlabDebug(page)))
goto debug;
checks_ok:
prior = object[page->offset] = page->freelist;
@@ -1321,7 +1481,7 @@
slab_empty:
if (prior)
/*
- * Slab on the partial list.
+ * Slab still on the partial list.
*/
remove_partial(s, page);
@@ -1337,13 +1497,7 @@
remove_full(s, page);
if (s->flags & SLAB_STORE_USER)
set_track(s, x, TRACK_FREE, addr);
- if (s->flags & SLAB_TRACE) {
- printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
- s->name, object, page->inuse,
- page->freelist);
- print_section("Object", (void *)object, s->objsize);
- dump_stack();
- }
+ trace(s, page, object, 0);
init_object(s, object, 0);
goto checks_ok;
}
@@ -1370,22 +1524,16 @@
}
/*
- * kmem_cache_open produces objects aligned at "size" and the first object
- * is placed at offset 0 in the slab (We have no metainformation on the
- * slab, all slabs are in essence "off slab").
- *
- * In order to get the desired alignment one just needs to align the
- * size.
+ * Object placement in a slab is made very easy because we always start at
+ * offset 0. If we tune the size of the object to the alignment then we can
+ * get the required alignment by putting one properly sized object after
+ * another.
*
* Notice that the allocation order determines the sizes of the per cpu
* caches. Each processor has always one slab available for allocations.
* Increasing the allocation order reduces the number of times that slabs
- * must be moved on and off the partial lists and therefore may influence
+ * must be moved on and off the partial lists and is therefore a factor in
* locking overhead.
- *
- * The offset is used to relocate the free list link in each object. It is
- * therefore possible to move the free list link behind the object. This
- * is necessary for RCU to work properly and also useful for debugging.
*/
/*
@@ -1396,76 +1544,110 @@
*/
static int slub_min_order;
static int slub_max_order = DEFAULT_MAX_ORDER;
-
-/*
- * Minimum number of objects per slab. This is necessary in order to
- * reduce locking overhead. Similar to the queue size in SLAB.
- */
static int slub_min_objects = DEFAULT_MIN_OBJECTS;
/*
* Merge control. If this is set then no merging of slab caches will occur.
+ * (Could be removed. This was introduced to pacify the merge skeptics.)
*/
static int slub_nomerge;
/*
- * Debug settings:
- */
-static int slub_debug;
-
-static char *slub_debug_slabs;
-
-/*
* Calculate the order of allocation given an slab object size.
*
- * The order of allocation has significant impact on other elements
- * of the system. Generally order 0 allocations should be preferred
- * since they do not cause fragmentation in the page allocator. Larger
- * objects may have problems with order 0 because there may be too much
- * space left unused in a slab. We go to a higher order if more than 1/8th
- * of the slab would be wasted.
+ * The order of allocation has significant impact on performance and other
+ * system components. Generally order 0 allocations should be preferred since
+ * order 0 does not cause fragmentation in the page allocator. Larger objects
+ * be problematic to put into order 0 slabs because there may be too much
+ * unused space left. We go to a higher order if more than 1/8th of the slab
+ * would be wasted.
*
- * In order to reach satisfactory performance we must ensure that
- * a minimum number of objects is in one slab. Otherwise we may
- * generate too much activity on the partial lists. This is less a
- * concern for large slabs though. slub_max_order specifies the order
- * where we begin to stop considering the number of objects in a slab.
+ * In order to reach satisfactory performance we must ensure that a minimum
+ * number of objects is in one slab. Otherwise we may generate too much
+ * activity on the partial lists which requires taking the list_lock. This is
+ * less a concern for large slabs though which are rarely used.
*
- * Higher order allocations also allow the placement of more objects
- * in a slab and thereby reduce object handling overhead. If the user
- * has requested a higher mininum order then we start with that one
- * instead of zero.
+ * slub_max_order specifies the order where we begin to stop considering the
+ * number of objects in a slab as critical. If we reach slub_max_order then
+ * we try to keep the page order as low as possible. So we accept more waste
+ * of space in favor of a small page order.
+ *
+ * Higher order allocations also allow the placement of more objects in a
+ * slab and thereby reduce object handling overhead. If the user has
+ * requested a higher mininum order then we start with that one instead of
+ * the smallest order which will fit the object.
*/
-static int calculate_order(int size)
+static inline int slab_order(int size, int min_objects,
+ int max_order, int fract_leftover)
{
int order;
int rem;
- for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
- order < MAX_ORDER; order++) {
+ for (order = max(slub_min_order,
+ fls(min_objects * size - 1) - PAGE_SHIFT);
+ order <= max_order; order++) {
+
unsigned long slab_size = PAGE_SIZE << order;
- if (slub_max_order > order &&
- slab_size < slub_min_objects * size)
- continue;
-
- if (slab_size < size)
+ if (slab_size < min_objects * size)
continue;
rem = slab_size % size;
- if (rem <= (PAGE_SIZE << order) / 8)
+ if (rem <= slab_size / fract_leftover)
break;
}
- if (order >= MAX_ORDER)
- return -E2BIG;
+
return order;
}
+static inline int calculate_order(int size)
+{
+ int order;
+ int min_objects;
+ int fraction;
+
+ /*
+ * Attempt to find best configuration for a slab. This
+ * works by first attempting to generate a layout with
+ * the best configuration and backing off gradually.
+ *
+ * First we reduce the acceptable waste in a slab. Then
+ * we reduce the minimum objects required in a slab.
+ */
+ min_objects = slub_min_objects;
+ while (min_objects > 1) {
+ fraction = 8;
+ while (fraction >= 4) {
+ order = slab_order(size, min_objects,
+ slub_max_order, fraction);
+ if (order <= slub_max_order)
+ return order;
+ fraction /= 2;
+ }
+ min_objects /= 2;
+ }
+
+ /*
+ * We were unable to place multiple objects in a slab. Now
+ * lets see if we can place a single object there.
+ */
+ order = slab_order(size, 1, slub_max_order, 1);
+ if (order <= slub_max_order)
+ return order;
+
+ /*
+ * Doh this slab cannot be placed using slub_max_order.
+ */
+ order = slab_order(size, 1, MAX_ORDER, 1);
+ if (order <= MAX_ORDER)
+ return order;
+ return -ENOSYS;
+}
+
/*
- * Function to figure out which alignment to use from the
- * various ways of specifying it.
+ * Figure out what the alignment of the objects will be.
*/
static unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size)
@@ -1480,8 +1662,8 @@
* then use it.
*/
if ((flags & SLAB_HWCACHE_ALIGN) &&
- size > L1_CACHE_BYTES / 2)
- return max_t(unsigned long, align, L1_CACHE_BYTES);
+ size > cache_line_size() / 2)
+ return max_t(unsigned long, align, cache_line_size());
if (align < ARCH_SLAB_MINALIGN)
return ARCH_SLAB_MINALIGN;
@@ -1619,22 +1801,23 @@
*/
size = ALIGN(size, sizeof(void *));
+#ifdef CONFIG_SLUB_DEBUG
/*
- * If we are redzoning then check if there is some space between the
+ * If we are Redzoning then check if there is some space between the
* end of the object and the free pointer. If not then add an
- * additional word, so that we can establish a redzone between
- * the object and the freepointer to be able to check for overwrites.
+ * additional word to have some bytes to store Redzone information.
*/
if ((flags & SLAB_RED_ZONE) && size == s->objsize)
size += sizeof(void *);
+#endif
/*
- * With that we have determined how much of the slab is in actual
- * use by the object. This is the potential offset to the free
- * pointer.
+ * With that we have determined the number of bytes in actual use
+ * by the object. This is the potential offset to the free pointer.
*/
s->inuse = size;
+#ifdef CONFIG_SLUB_DEBUG
if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
s->ctor || s->dtor)) {
/*
@@ -1656,7 +1839,7 @@
*/
size += 2 * sizeof(struct track);
- if (flags & DEBUG_DEFAULT_FLAGS)
+ if (flags & SLAB_RED_ZONE)
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
@@ -1665,10 +1848,12 @@
* of the object.
*/
size += sizeof(void *);
+#endif
+
/*
* Determine the alignment based on various parameters that the
- * user specified (this is unecessarily complex due to the attempt
- * to be compatible with SLAB. Should be cleaned up some day).
+ * user specified and the dynamic determination of cache line size
+ * on bootup.
*/
align = calculate_alignment(flags, align, s->objsize);
@@ -1700,23 +1885,6 @@
}
-static int __init finish_bootstrap(void)
-{
- struct list_head *h;
- int err;
-
- slab_state = SYSFS;
-
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
-
- err = sysfs_slab_add(s);
- BUG_ON(err);
- }
- return 0;
-}
-
static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
const char *name, size_t size,
size_t align, unsigned long flags,
@@ -1730,32 +1898,7 @@
s->objsize = size;
s->flags = flags;
s->align = align;
-
- /*
- * The page->offset field is only 16 bit wide. This is an offset
- * in units of words from the beginning of an object. If the slab
- * size is bigger then we cannot move the free pointer behind the
- * object anymore.
- *
- * On 32 bit platforms the limit is 256k. On 64bit platforms
- * the limit is 512k.
- *
- * Debugging or ctor/dtors may create a need to move the free
- * pointer. Fail if this happens.
- */
- if (s->size >= 65535 * sizeof(void *)) {
- BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
- SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
- BUG_ON(ctor || dtor);
- }
- else
- /*
- * Enable debugging if selected on the kernel commandline.
- */
- if (slub_debug && (!slub_debug_slabs ||
- strncmp(slub_debug_slabs, name,
- strlen(slub_debug_slabs)) == 0))
- s->flags |= slub_debug;
+ kmem_cache_open_debug_check(s);
if (!calculate_sizes(s))
goto error;
@@ -1783,7 +1926,6 @@
int kmem_ptr_validate(struct kmem_cache *s, const void *object)
{
struct page * page;
- void *addr;
page = get_object_page(object);
@@ -1791,13 +1933,7 @@
/* No slab or wrong slab */
return 0;
- addr = page_address(page);
- if (object < addr || object >= addr + s->objects * s->size)
- /* Out of bounds */
- return 0;
-
- if ((object - addr) % s->size)
- /* Improperly aligned */
+ if (!check_valid_pointer(s, page, object))
return 0;
/*
@@ -1826,7 +1962,8 @@
EXPORT_SYMBOL(kmem_cache_name);
/*
- * Attempt to free all slabs on a node
+ * Attempt to free all slabs on a node. Return the number of slabs we
+ * were unable to free.
*/
static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
struct list_head *list)
@@ -1847,7 +1984,7 @@
}
/*
- * Release all resources used by slab cache
+ * Release all resources used by a slab cache.
*/
static int kmem_cache_close(struct kmem_cache *s)
{
@@ -1932,45 +2069,6 @@
__setup("slub_nomerge", setup_slub_nomerge);
-static int __init setup_slub_debug(char *str)
-{
- if (!str || *str != '=')
- slub_debug = DEBUG_DEFAULT_FLAGS;
- else {
- str++;
- if (*str == 0 || *str == ',')
- slub_debug = DEBUG_DEFAULT_FLAGS;
- else
- for( ;*str && *str != ','; str++)
- switch (*str) {
- case 'f' : case 'F' :
- slub_debug |= SLAB_DEBUG_FREE;
- break;
- case 'z' : case 'Z' :
- slub_debug |= SLAB_RED_ZONE;
- break;
- case 'p' : case 'P' :
- slub_debug |= SLAB_POISON;
- break;
- case 'u' : case 'U' :
- slub_debug |= SLAB_STORE_USER;
- break;
- case 't' : case 'T' :
- slub_debug |= SLAB_TRACE;
- break;
- default:
- printk(KERN_ERR "slub_debug option '%c' "
- "unknown. skipped\n",*str);
- }
- }
-
- if (*str == ',')
- slub_debug_slabs = str + 1;
- return 1;
-}
-
-__setup("slub_debug", setup_slub_debug);
-
static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
const char *name, int size, gfp_t gfp_flags)
{
@@ -2108,13 +2206,14 @@
EXPORT_SYMBOL(kfree);
/*
- * kmem_cache_shrink removes empty slabs from the partial lists
- * and then sorts the partially allocated slabs by the number
- * of items in use. The slabs with the most items in use
- * come first. New allocations will remove these from the
- * partial list because they are full. The slabs with the
- * least items are placed last. If it happens that the objects
- * are freed then the page can be returned to the page allocator.
+ * kmem_cache_shrink removes empty slabs from the partial lists and sorts
+ * the remaining slabs by the number of items in use. The slabs with the
+ * most items in use come first. New allocations will then fill those up
+ * and thus they can be removed from the partial lists.
+ *
+ * The slabs with the least items are placed last. This results in them
+ * being allocated from last increasing the chance that the last objects
+ * are freed in them.
*/
int kmem_cache_shrink(struct kmem_cache *s)
{
@@ -2143,12 +2242,10 @@
spin_lock_irqsave(&n->list_lock, flags);
/*
- * Build lists indexed by the items in use in
- * each slab or free slabs if empty.
+ * Build lists indexed by the items in use in each slab.
*
- * Note that concurrent frees may occur while
- * we hold the list_lock. page->inuse here is
- * the upper limit.
+ * Note that concurrent frees may occur while we hold the
+ * list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
if (!page->inuse && slab_trylock(page)) {
@@ -2172,8 +2269,8 @@
goto out;
/*
- * Rebuild the partial list with the slabs filled up
- * most first and the least used slabs at the end.
+ * Rebuild the partial list with the slabs filled up most
+ * first and the least used slabs at the end.
*/
for (i = s->objects - 1; i >= 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
@@ -2189,7 +2286,6 @@
/**
* krealloc - reallocate memory. The contents will remain unchanged.
- *
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
@@ -2201,9 +2297,8 @@
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)
{
- struct kmem_cache *new_cache;
void *ret;
- struct page *page;
+ size_t ks;
if (unlikely(!p))
return kmalloc(new_size, flags);
@@ -2213,19 +2308,13 @@
return NULL;
}
- page = virt_to_head_page(p);
-
- new_cache = get_slab(new_size, flags);
-
- /*
- * If new size fits in the current cache, bail out.
- */
- if (likely(page->slab == new_cache))
+ ks = ksize(p);
+ if (ks >= new_size)
return (void *)p;
ret = kmalloc(new_size, flags);
if (ret) {
- memcpy(ret, p, min(new_size, ksize(p)));
+ memcpy(ret, p, min(new_size, ks));
kfree(p);
}
return ret;
@@ -2243,7 +2332,7 @@
#ifdef CONFIG_NUMA
/*
* Must first have the slab cache available for the allocations of the
- * struct kmalloc_cache_node's. There is special bootstrap code in
+ * struct kmem_cache_node's. There is special bootstrap code in
* kmem_cache_open for slab_state == DOWN.
*/
create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
@@ -2280,7 +2369,7 @@
printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
" Processors=%d, Nodes=%d\n",
- KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES,
+ KMALLOC_SHIFT_HIGH, cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
}
@@ -2415,8 +2504,8 @@
}
/*
- * Use the cpu notifier to insure that the slab are flushed
- * when necessary.
+ * Use the cpu notifier to insure that the cpu slabs are flushed when
+ * necessary.
*/
static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
@@ -2425,7 +2514,9 @@
switch (action) {
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
for_all_slabs(__flush_cpu_slab, cpu);
break;
default:
@@ -2439,94 +2530,122 @@
#endif
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
{
- int node;
+ struct kmem_cache *s = get_slab(size, gfpflags);
- node = next_node(cpu_to_node(cpu), node_online_map);
- if (node == MAX_NUMNODES)
- node = first_node(node_online_map);
+ if (!s)
+ return NULL;
- __get_cpu_var(reap_node) = node;
+ return slab_alloc(s, gfpflags, -1, caller);
}
-static void next_reap_node(void)
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+ int node, void *caller)
{
- int node = __get_cpu_var(reap_node);
+ struct kmem_cache *s = get_slab(size, gfpflags);
- /*
- * Also drain per cpu pages on remote zones
- */
- if (node != numa_node_id())
- drain_node_pages(node);
+ if (!s)
+ return NULL;
- node = next_node(node, node_online_map);
- if (unlikely(node >= MAX_NUMNODES))
- node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-#define REAPTIMEOUT_CPUC (2*HZ)
-
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-
-static void cache_reap(struct work_struct *unused)
-{
- next_reap_node();
- refresh_cpu_vm_stats(smp_processor_id());
- schedule_delayed_work(&__get_cpu_var(reap_work),
- REAPTIMEOUT_CPUC);
+ return slab_alloc(s, gfpflags, node, caller);
}
-static void __devinit start_cpu_timer(int cpu)
+#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
+static int validate_slab(struct kmem_cache *s, struct page *page)
{
- struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+ void *p;
+ void *addr = page_address(page);
+ DECLARE_BITMAP(map, s->objects);
- /*
- * When this gets called from do_initcalls via cpucache_init(),
- * init_workqueues() has already run, so keventd will be setup
- * at that time.
- */
- if (keventd_up() && reap_work->work.func == NULL) {
- init_reap_node(cpu);
- INIT_DELAYED_WORK(reap_work, cache_reap);
- schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+ if (!check_slab(s, page) ||
+ !on_freelist(s, page, NULL))
+ return 0;
+
+ /* Now we know that a valid freelist exists */
+ bitmap_zero(map, s->objects);
+
+ for_each_free_object(p, s, page->freelist) {
+ set_bit(slab_index(p, s, addr), map);
+ if (!check_object(s, page, p, 0))
+ return 0;
+ }
+
+ for_each_object(p, s, addr)
+ if (!test_bit(slab_index(p, s, addr), map))
+ if (!check_object(s, page, p, 1))
+ return 0;
+ return 1;
+}
+
+static void validate_slab_slab(struct kmem_cache *s, struct page *page)
+{
+ if (slab_trylock(page)) {
+ validate_slab(s, page);
+ slab_unlock(page);
+ } else
+ printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
+ s->name, page);
+
+ if (s->flags & DEBUG_DEFAULT_FLAGS) {
+ if (!SlabDebug(page))
+ printk(KERN_ERR "SLUB %s: SlabDebug not set "
+ "on slab 0x%p\n", s->name, page);
+ } else {
+ if (SlabDebug(page))
+ printk(KERN_ERR "SLUB %s: SlabDebug set on "
+ "slab 0x%p\n", s->name, page);
}
}
-static int __init cpucache_init(void)
+static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
{
- int cpu;
+ unsigned long count = 0;
+ struct page *page;
+ unsigned long flags;
- /*
- * Register the timers that drain pcp pages and update vm statistics
- */
- for_each_online_cpu(cpu)
- start_cpu_timer(cpu);
- return 0;
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ list_for_each_entry(page, &n->partial, lru) {
+ validate_slab_slab(s, page);
+ count++;
+ }
+ if (count != n->nr_partial)
+ printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
+ "counter=%ld\n", s->name, count, n->nr_partial);
+
+ if (!(s->flags & SLAB_STORE_USER))
+ goto out;
+
+ list_for_each_entry(page, &n->full, lru) {
+ validate_slab_slab(s, page);
+ count++;
+ }
+ if (count != atomic_long_read(&n->nr_slabs))
+ printk(KERN_ERR "SLUB: %s %ld slabs counted but "
+ "counter=%ld\n", s->name, count,
+ atomic_long_read(&n->nr_slabs));
+
+out:
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return count;
}
-__initcall(cpucache_init);
-#endif
+
+static unsigned long validate_slab_cache(struct kmem_cache *s)
+{
+ int node;
+ unsigned long count = 0;
+
+ flush_all(s);
+ for_each_online_node(node) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ count += validate_slab_node(s, n);
+ }
+ return count;
+}
#ifdef SLUB_RESILIENCY_TEST
-static unsigned long validate_slab_cache(struct kmem_cache *s);
-
static void resiliency_test(void)
{
u8 *p;
@@ -2582,134 +2701,20 @@
#endif
/*
- * These are not as efficient as kmalloc for the non debug case.
- * We do not have the page struct available so we have to touch one
- * cacheline in struct kmem_cache to check slab flags.
- */
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
-{
- struct kmem_cache *s = get_slab(size, gfpflags);
-
- if (!s)
- return NULL;
-
- return slab_alloc(s, gfpflags, -1, caller);
-}
-
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, void *caller)
-{
- struct kmem_cache *s = get_slab(size, gfpflags);
-
- if (!s)
- return NULL;
-
- return slab_alloc(s, gfpflags, node, caller);
-}
-
-#ifdef CONFIG_SYSFS
-
-static int validate_slab(struct kmem_cache *s, struct page *page)
-{
- void *p;
- void *addr = page_address(page);
- unsigned long map[BITS_TO_LONGS(s->objects)];
-
- if (!check_slab(s, page) ||
- !on_freelist(s, page, NULL))
- return 0;
-
- /* Now we know that a valid freelist exists */
- bitmap_zero(map, s->objects);
-
- for(p = page->freelist; p; p = get_freepointer(s, p)) {
- set_bit((p - addr) / s->size, map);
- if (!check_object(s, page, p, 0))
- return 0;
- }
-
- for(p = addr; p < addr + s->objects * s->size; p += s->size)
- if (!test_bit((p - addr) / s->size, map))
- if (!check_object(s, page, p, 1))
- return 0;
- return 1;
-}
-
-static void validate_slab_slab(struct kmem_cache *s, struct page *page)
-{
- if (slab_trylock(page)) {
- validate_slab(s, page);
- slab_unlock(page);
- } else
- printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
- s->name, page);
-
- if (s->flags & DEBUG_DEFAULT_FLAGS) {
- if (!PageError(page))
- printk(KERN_ERR "SLUB %s: PageError not set "
- "on slab 0x%p\n", s->name, page);
- } else {
- if (PageError(page))
- printk(KERN_ERR "SLUB %s: PageError set on "
- "slab 0x%p\n", s->name, page);
- }
-}
-
-static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
-{
- unsigned long count = 0;
- struct page *page;
- unsigned long flags;
-
- spin_lock_irqsave(&n->list_lock, flags);
-
- list_for_each_entry(page, &n->partial, lru) {
- validate_slab_slab(s, page);
- count++;
- }
- if (count != n->nr_partial)
- printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
- "counter=%ld\n", s->name, count, n->nr_partial);
-
- if (!(s->flags & SLAB_STORE_USER))
- goto out;
-
- list_for_each_entry(page, &n->full, lru) {
- validate_slab_slab(s, page);
- count++;
- }
- if (count != atomic_long_read(&n->nr_slabs))
- printk(KERN_ERR "SLUB: %s %ld slabs counted but "
- "counter=%ld\n", s->name, count,
- atomic_long_read(&n->nr_slabs));
-
-out:
- spin_unlock_irqrestore(&n->list_lock, flags);
- return count;
-}
-
-static unsigned long validate_slab_cache(struct kmem_cache *s)
-{
- int node;
- unsigned long count = 0;
-
- flush_all(s);
- for_each_online_node(node) {
- struct kmem_cache_node *n = get_node(s, node);
-
- count += validate_slab_node(s, n);
- }
- return count;
-}
-
-/*
- * Generate lists of locations where slabcache objects are allocated
+ * Generate lists of code addresses where slabcache objects are allocated
* and freed.
*/
struct location {
unsigned long count;
void *addr;
+ long long sum_time;
+ long min_time;
+ long max_time;
+ long min_pid;
+ long max_pid;
+ cpumask_t cpus;
+ nodemask_t nodes;
};
struct loc_track {
@@ -2750,11 +2755,12 @@
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
- void *addr)
+ const struct track *track)
{
long start, end, pos;
struct location *l;
void *caddr;
+ unsigned long age = jiffies - track->when;
start = -1;
end = t->count;
@@ -2770,19 +2776,36 @@
break;
caddr = t->loc[pos].addr;
- if (addr == caddr) {
- t->loc[pos].count++;
+ if (track->addr == caddr) {
+
+ l = &t->loc[pos];
+ l->count++;
+ if (track->when) {
+ l->sum_time += age;
+ if (age < l->min_time)
+ l->min_time = age;
+ if (age > l->max_time)
+ l->max_time = age;
+
+ if (track->pid < l->min_pid)
+ l->min_pid = track->pid;
+ if (track->pid > l->max_pid)
+ l->max_pid = track->pid;
+
+ cpu_set(track->cpu, l->cpus);
+ }
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
return 1;
}
- if (addr < caddr)
+ if (track->addr < caddr)
end = pos;
else
start = pos;
}
/*
- * Not found. Insert new tracking element
+ * Not found. Insert new tracking element.
*/
if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
return 0;
@@ -2793,7 +2816,16 @@
(t->count - pos) * sizeof(struct location));
t->count++;
l->count = 1;
- l->addr = addr;
+ l->addr = track->addr;
+ l->sum_time = age;
+ l->min_time = age;
+ l->max_time = age;
+ l->min_pid = track->pid;
+ l->max_pid = track->pid;
+ cpus_clear(l->cpus);
+ cpu_set(track->cpu, l->cpus);
+ nodes_clear(l->nodes);
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
return 1;
}
@@ -2801,19 +2833,16 @@
struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
- unsigned long map[BITS_TO_LONGS(s->objects)];
+ DECLARE_BITMAP(map, s->objects);
void *p;
bitmap_zero(map, s->objects);
- for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit((p - addr) / s->size, map);
+ for_each_free_object(p, s, page->freelist)
+ set_bit(slab_index(p, s, addr), map);
- for (p = addr; p < addr + s->objects * s->size; p += s->size)
- if (!test_bit((p - addr) / s->size, map)) {
- void *addr = get_track(s, p, alloc)->addr;
-
- add_location(t, s, addr);
- }
+ for_each_object(p, s, addr)
+ if (!test_bit(slab_index(p, s, addr), map))
+ add_location(t, s, get_track(s, p, alloc));
}
static int list_locations(struct kmem_cache *s, char *buf,
@@ -2847,15 +2876,47 @@
}
for (i = 0; i < t.count; i++) {
- void *addr = t.loc[i].addr;
+ struct location *l = &t.loc[i];
if (n > PAGE_SIZE - 100)
break;
- n += sprintf(buf + n, "%7ld ", t.loc[i].count);
- if (addr)
- n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
+ n += sprintf(buf + n, "%7ld ", l->count);
+
+ if (l->addr)
+ n += sprint_symbol(buf + n, (unsigned long)l->addr);
else
n += sprintf(buf + n, "<not-available>");
+
+ if (l->sum_time != l->min_time) {
+ unsigned long remainder;
+
+ n += sprintf(buf + n, " age=%ld/%ld/%ld",
+ l->min_time,
+ div_long_long_rem(l->sum_time, l->count, &remainder),
+ l->max_time);
+ } else
+ n += sprintf(buf + n, " age=%ld",
+ l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ n += sprintf(buf + n, " pid=%ld-%ld",
+ l->min_pid, l->max_pid);
+ else
+ n += sprintf(buf + n, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) {
+ n += sprintf(buf + n, " cpus=");
+ n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
+ l->cpus);
+ }
+
+ if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) {
+ n += sprintf(buf + n, " nodes=");
+ n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
+ l->nodes);
+ }
+
n += sprintf(buf + n, "\n");
}
@@ -3491,6 +3552,7 @@
static int __init slab_sysfs_init(void)
{
+ struct list_head *h;
int err;
err = subsystem_register(&slab_subsys);
@@ -3499,7 +3561,15 @@
return -ENOSYS;
}
- finish_bootstrap();
+ slab_state = SYSFS;
+
+ list_for_each(h, &slab_caches) {
+ struct kmem_cache *s =
+ container_of(h, struct kmem_cache, list);
+
+ err = sysfs_slab_add(s);
+ BUG_ON(err);
+ }
while (alias_list) {
struct saved_alias *al = alias_list;
@@ -3515,6 +3585,4 @@
}
__initcall(slab_sysfs_init);
-#else
-__initcall(finish_bootstrap);
#endif
diff --git a/mm/swap.c b/mm/swap.c
index 218c52a..d3cb966 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -488,7 +488,7 @@
long *committed;
committed = &per_cpu(committed_space, (long)hcpu);
- if (action == CPU_DEAD) {
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
atomic_add(*committed, &vm_committed_space);
*committed = 0;
__lru_add_drain((long)hcpu);
diff --git a/mm/truncate.c b/mm/truncate.c
index 0f4b6d1..4fbe1a2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,6 +12,7 @@
#include <linux/swap.h>
#include <linux/module.h>
#include <linux/pagemap.h>
+#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
@@ -46,7 +47,7 @@
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
- memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+ zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0);
if (PagePrivate(page))
do_invalidatepage(page, partial);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1c8e75a1..1be5a63 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1528,7 +1528,7 @@
pg_data_t *pgdat;
cpumask_t mask;
- if (action == CPU_ONLINE) {
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
for_each_online_pgdat(pgdat) {
mask = node_to_cpumask(pgdat->node_id);
if (any_online_cpu(mask) != NR_CPUS)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6c488d6..9832d9a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@
/*
* Update the zone counters for one cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
*/
void refresh_cpu_vm_stats(int cpu)
{
@@ -289,21 +300,54 @@
unsigned long flags;
for_each_zone(zone) {
- struct per_cpu_pageset *pcp;
+ struct per_cpu_pageset *p;
if (!populated_zone(zone))
continue;
- pcp = zone_pcp(zone, cpu);
+ p = zone_pcp(zone, cpu);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (pcp->vm_stat_diff[i]) {
+ if (p->vm_stat_diff[i]) {
local_irq_save(flags);
- zone_page_state_add(pcp->vm_stat_diff[i],
+ zone_page_state_add(p->vm_stat_diff[i],
zone, i);
- pcp->vm_stat_diff[i] = 0;
+ p->vm_stat_diff[i] = 0;
+#ifdef CONFIG_NUMA
+ /* 3 seconds idle till flush */
+ p->expire = 3;
+#endif
local_irq_restore(flags);
}
+#ifdef CONFIG_NUMA
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+ continue;
+
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ p->expire = 0;
+ continue;
+ }
+
+ p->expire--;
+ if (p->expire)
+ continue;
+
+ if (p->pcp[0].count)
+ drain_zone_pages(zone, p->pcp + 0);
+
+ if (p->pcp[1].count)
+ drain_zone_pages(zone, p->pcp + 1);
+#endif
}
}
@@ -640,6 +684,24 @@
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
+
+static void vmstat_update(struct work_struct *w)
+{
+ refresh_cpu_vm_stats(smp_processor_id());
+ schedule_delayed_work(&__get_cpu_var(vmstat_work),
+ sysctl_stat_interval);
+}
+
+static void __devinit start_cpu_timer(int cpu)
+{
+ struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
+
+ INIT_DELAYED_WORK(vmstat_work, vmstat_update);
+ schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
+}
+
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
@@ -648,10 +710,24 @@
unsigned long action,
void *hcpu)
{
+ long cpu = (long)hcpu;
+
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_CANCELED:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ start_cpu_timer(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+ per_cpu(vmstat_work, cpu).work.func = NULL;
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ start_cpu_timer(cpu);
+ break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
refresh_zone_stat_thresholds();
break;
default:
@@ -665,8 +741,13 @@
int __init setup_vmstat(void)
{
+ int cpu;
+
refresh_zone_stat_thresholds();
register_cpu_notifier(&vmstat_notifier);
+
+ for_each_online_cpu(cpu)
+ start_cpu_timer(cpu);
return 0;
}
module_init(setup_vmstat)
diff --git a/net/core/dev.c b/net/core/dev.c
index 4317c1b..8301e2a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3450,7 +3450,7 @@
unsigned int cpu, oldcpu = (unsigned long)ocpu;
struct softnet_data *sd, *oldsd;
- if (action != CPU_DEAD)
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
local_irq_disable();
diff --git a/net/core/flow.c b/net/core/flow.c
index 5d25697..0514305 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -338,7 +338,7 @@
unsigned long action,
void *hcpu)
{
- if (action == CPU_DEAD)
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
__flow_cache_shrink((unsigned long)hcpu, 0);
return NOTIFY_OK;
}
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index b3050a6..68fe1d4 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -2387,6 +2387,7 @@
EnterFunction(2);
ip_vs_trash_cleanup();
cancel_rearming_delayed_work(&defense_work);
+ cancel_work_sync(&defense_work.work);
ip_vs_kill_estimator(&ip_vs_stats);
unregister_sysctl_table(sysctl_header);
proc_net_remove("ip_vs_stats");
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index fb3faf7..b733306 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -556,6 +556,7 @@
switch (action) {
case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
if (!percpu_populate(iucv_irq_data,
sizeof(struct iucv_irq_data),
GFP_KERNEL|GFP_DMA, cpu))
@@ -567,15 +568,20 @@
}
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
percpu_depopulate(iucv_param, cpu);
percpu_depopulate(iucv_irq_data, cpu);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu);
break;
case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
cpumask = iucv_buffer_cpumask;
cpu_clear(cpu, cpumask);
if (cpus_empty(cpumask))
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index db298b5..099a983 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -924,6 +924,7 @@
gss_write_init_verf(struct svc_rqst *rqstp, struct rsi *rsip)
{
struct rsc *rsci;
+ int rc;
if (rsip->major_status != GSS_S_COMPLETE)
return gss_write_null_verf(rqstp);
@@ -932,7 +933,9 @@
rsip->major_status = GSS_S_NO_CONTEXT;
return gss_write_null_verf(rqstp);
}
- return gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN);
+ rc = gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN);
+ cache_put(&rsci->h, &rsc_cache);
+ return rc;
}
/*
@@ -1089,6 +1092,8 @@
}
goto complete;
case RPC_GSS_PROC_DESTROY:
+ if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+ goto auth_err;
set_bit(CACHE_NEGATIVE, &rsci->h.flags);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
@@ -1196,13 +1201,7 @@
if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
integ_len))
BUG();
- if (resbuf->page_len == 0
- && resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE
- < PAGE_SIZE) {
- BUG_ON(resbuf->tail[0].iov_len);
- /* Use head for everything */
- resv = &resbuf->head[0];
- } else if (resbuf->tail[0].iov_base == NULL) {
+ if (resbuf->tail[0].iov_base == NULL) {
if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE)
goto out_err;
resbuf->tail[0].iov_base = resbuf->head[0].iov_base
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index ad39b47..a2f1893 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -845,6 +845,8 @@
int register_rpc_pipefs(void)
{
+ int err;
+
rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
sizeof(struct rpc_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -852,7 +854,12 @@
init_once, NULL);
if (!rpc_inode_cachep)
return -ENOMEM;
- register_filesystem(&rpc_pipe_fs_type);
+ err = register_filesystem(&rpc_pipe_fs_type);
+ if (err) {
+ kmem_cache_destroy(rpc_inode_cachep);
+ return err;
+ }
+
return 0;
}
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 43ecf62..0d35bc7 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -146,9 +146,11 @@
int err = register_rpc_pipefs();
if (err)
goto out;
- err = rpc_init_mempool() != 0;
- if (err)
+ err = rpc_init_mempool();
+ if (err) {
+ unregister_rpc_pipefs();
goto out;
+ }
#ifdef RPC_DEBUG
rpc_register_sysctl();
#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b7503c1..e673ef9 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -907,7 +907,7 @@
* better idea of reply size
*/
if (procp->pc_xdrressize)
- svc_reserve(rqstp, procp->pc_xdrressize<<2);
+ svc_reserve_auth(rqstp, procp->pc_xdrressize<<2);
/* Call the function that processes the request. */
if (!versp->vs_dispatch) {
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2bd23ea..07dcd20 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -385,7 +385,7 @@
{
struct ip_map *ipm;
struct svc_sock *svsk = rqstp->rq_sock;
- spin_lock_bh(&svsk->sk_defer_lock);
+ spin_lock(&svsk->sk_lock);
ipm = svsk->sk_info_authunix;
if (ipm != NULL) {
if (!cache_valid(&ipm->h)) {
@@ -395,13 +395,13 @@
* same IP address.
*/
svsk->sk_info_authunix = NULL;
- spin_unlock_bh(&svsk->sk_defer_lock);
+ spin_unlock(&svsk->sk_lock);
cache_put(&ipm->h, &ip_map_cache);
return NULL;
}
cache_get(&ipm->h);
}
- spin_unlock_bh(&svsk->sk_defer_lock);
+ spin_unlock(&svsk->sk_lock);
return ipm;
}
@@ -410,14 +410,14 @@
{
struct svc_sock *svsk = rqstp->rq_sock;
- spin_lock_bh(&svsk->sk_defer_lock);
+ spin_lock(&svsk->sk_lock);
if (svsk->sk_sock->type == SOCK_STREAM &&
svsk->sk_info_authunix == NULL) {
/* newly cached, keep the reference */
svsk->sk_info_authunix = ipm;
ipm = NULL;
}
- spin_unlock_bh(&svsk->sk_defer_lock);
+ spin_unlock(&svsk->sk_lock);
if (ipm)
cache_put(&ipm->h, &ip_map_cache);
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 22f61ae..5baf48d 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -53,7 +53,8 @@
* svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
* when both need to be taken (rare), svc_serv->sv_lock is first.
* BKL protects svc_serv->sv_nrthread.
- * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list
+ * svc_sock->sk_lock protects the svc_sock->sk_deferred list
+ * and the ->sk_info_authunix cache.
* svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
*
* Some flags can be set to certain values at any time
@@ -787,15 +788,20 @@
}
clear_bit(SK_DATA, &svsk->sk_flags);
- while ((err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
- 0, 0, MSG_PEEK | MSG_DONTWAIT)) < 0 ||
- (skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) {
- if (err == -EAGAIN) {
- svc_sock_received(svsk);
- return err;
+ skb = NULL;
+ err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
+ 0, 0, MSG_PEEK | MSG_DONTWAIT);
+ if (err >= 0)
+ skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+
+ if (skb == NULL) {
+ if (err != -EAGAIN) {
+ /* possibly an icmp error */
+ dprintk("svc: recvfrom returned error %d\n", -err);
+ set_bit(SK_DATA, &svsk->sk_flags);
}
- /* possibly an icmp error */
- dprintk("svc: recvfrom returned error %d\n", -err);
+ svc_sock_received(svsk);
+ return -EAGAIN;
}
rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
if (skb->tstamp.tv64 == 0) {
@@ -1633,7 +1639,7 @@
svsk->sk_server = serv;
atomic_set(&svsk->sk_inuse, 1);
svsk->sk_lastrecv = get_seconds();
- spin_lock_init(&svsk->sk_defer_lock);
+ spin_lock_init(&svsk->sk_lock);
INIT_LIST_HEAD(&svsk->sk_deferred);
INIT_LIST_HEAD(&svsk->sk_ready);
mutex_init(&svsk->sk_mutex);
@@ -1857,9 +1863,9 @@
dprintk("revisit queued\n");
svsk = dr->svsk;
dr->svsk = NULL;
- spin_lock_bh(&svsk->sk_defer_lock);
+ spin_lock(&svsk->sk_lock);
list_add(&dr->handle.recent, &svsk->sk_deferred);
- spin_unlock_bh(&svsk->sk_defer_lock);
+ spin_unlock(&svsk->sk_lock);
set_bit(SK_DEFERRED, &svsk->sk_flags);
svc_sock_enqueue(svsk);
svc_sock_put(svsk);
@@ -1925,7 +1931,7 @@
if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
return NULL;
- spin_lock_bh(&svsk->sk_defer_lock);
+ spin_lock(&svsk->sk_lock);
clear_bit(SK_DEFERRED, &svsk->sk_flags);
if (!list_empty(&svsk->sk_deferred)) {
dr = list_entry(svsk->sk_deferred.next,
@@ -1934,6 +1940,6 @@
list_del_init(&dr->handle.recent);
set_bit(SK_DEFERRED, &svsk->sk_flags);
}
- spin_unlock_bh(&svsk->sk_defer_lock);
+ spin_unlock(&svsk->sk_lock);
return dr;
}
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index a325a0c..e5bf649 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -337,6 +337,7 @@
}
return $version;
}
+my $kernelversion = get_kernel_version();
# generate a sequence of code that will splice in highlighting information
# using the s// operator.
@@ -610,7 +611,7 @@
print "<refmeta>\n";
print " <refentrytitle><phrase>".$args{'function'}."</phrase></refentrytitle>\n";
print " <manvolnum>9</manvolnum>\n";
- print " <refmiscinfo class=\"version\">" . get_kernel_version() . "</refmiscinfo>\n";
+ print " <refmiscinfo class=\"version\">" . $kernelversion . "</refmiscinfo>\n";
print "</refmeta>\n";
print "<refnamediv>\n";
print " <refname>".$args{'function'}."</refname>\n";
@@ -687,7 +688,7 @@
print "<refmeta>\n";
print " <refentrytitle><phrase>".$args{'type'}." ".$args{'struct'}."</phrase></refentrytitle>\n";
print " <manvolnum>9</manvolnum>\n";
- print " <refmiscinfo class=\"version\">" . get_kernel_version() . "</refmiscinfo>\n";
+ print " <refmiscinfo class=\"version\">" . $kernelversion . "</refmiscinfo>\n";
print "</refmeta>\n";
print "<refnamediv>\n";
print " <refname>".$args{'type'}." ".$args{'struct'}."</refname>\n";
@@ -772,7 +773,7 @@
print "<refmeta>\n";
print " <refentrytitle><phrase>enum ".$args{'enum'}."</phrase></refentrytitle>\n";
print " <manvolnum>9</manvolnum>\n";
- print " <refmiscinfo class=\"version\">" . get_kernel_version() . "</refmiscinfo>\n";
+ print " <refmiscinfo class=\"version\">" . $kernelversion . "</refmiscinfo>\n";
print "</refmeta>\n";
print "<refnamediv>\n";
print " <refname>enum ".$args{'enum'}."</refname>\n";
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 480e18b..113dc77 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1343,6 +1343,7 @@
buf_printf(b, "#ifdef CONFIG_MODULE_UNLOAD\n"
" .exit = cleanup_module,\n"
"#endif\n");
+ buf_printf(b, " .arch = MODULE_ARCH_INIT,\n");
buf_printf(b, "};\n");
}