Merge master.kernel.org:/pub/scm/linux/kernel/git/mchehab/v4l-dvb

* master.kernel.org:/pub/scm/linux/kernel/git/mchehab/v4l-dvb: (26 commits)
  V4L/DVB (4263): Fix warning when compiling on 64 bit machines
  V4L/DVB (4261): Included required header for in-kernel compilation
  V4L/DVB (4260): Stradis.c: make 2 functions static
  V4L/DVB (4259): Pass an explicit log prefix to cx2341x_log_status
  V4L/DVB (4257): Fix 64-bit compile warnings.
  V4L/DVB (4255): Tda9887 default TOP value is 0x10
  V4L/DVB (4254): Remove obsoleted tuner_debug option.
  V4L/DVB (4253): IVTV VBI format description too long.
  V4L/DVB (4252): Remove duplicate 'tda9887' in info messages.
  V4L/DVB (4245): Reduce the amount of pvrusb2-sourced noise going into the system log
  V4L/DVB (4244): Implement use of cx2341x module in pvrusb2 driver
  V4L/DVB (4243): Exploit new V4L control features in pvrusb2
  V4L/DVB (4242): Don't suspend encoder when changing its attributes (in pvrusb2)
  V4L/DVB (4241): Fix faulty encoder error recovery in pvrusb2
  V4L/DVB (4240): Various V4L control enhancements in pvrusb2
  V4L/DVB (4239): Handle boolean controls in pvrusb2
  V4L/DVB (4238): Make sure flags field is initialized when quering a control in pvrusb2
  V4L/DVB (4237): Move LOG_STATUS bracketing to a different part of the pvrusb2 driver
  V4L/DVB (4236): Rearrange things in pvrusb2 driver in preparation for using cx2341x module
  V4L/DVB (4235): Increase the maximum number of controls that pvrusb2-sysfs.c can handle.
  ...
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index e4c3815..a494859 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -7,7 +7,7 @@
 implementations.  It creates an rcutorture kernel module that can
 be loaded to run a torture test.  The test periodically outputs
 status messages via printk(), which can be examined via the dmesg
-command (perhaps grepping for "rcutorture").  The test is started
+command (perhaps grepping for "torture").  The test is started
 when the module is loaded, and stops when the module is unloaded.
 
 However, actually setting this config option to "y" results in the system
@@ -35,6 +35,19 @@
 		be printed -only- when the module is unloaded, and this
 		is the default.
 
+shuffle_interval
+		The number of seconds to keep the test threads affinitied
+		to a particular subset of the CPUs.  Used in conjunction
+		with test_no_idle_hz.
+
+test_no_idle_hz	Whether or not to test the ability of RCU to operate in
+		a kernel that disables the scheduling-clock interrupt to
+		idle CPUs.  Boolean parameter, "1" to test, "0" otherwise.
+
+torture_type	The type of RCU to test: "rcu" for the rcu_read_lock()
+		API, "rcu_bh" for the rcu_read_lock_bh() API, and "srcu"
+		for the "srcu_read_lock()" API.
+
 verbose		Enable debug printk()s.  Default is disabled.
 
 
@@ -42,14 +55,14 @@
 
 The statistics output is as follows:
 
-	rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
-	rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
-	rcutorture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
-	rcutorture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
-	rcutorture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
-	rcutorture: --- End of test
+	rcu-torture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
+	rcu-torture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
+	rcu-torture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
+	rcu-torture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
+	rcu-torture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
+	rcu-torture: --- End of test
 
-The command "dmesg | grep rcutorture:" will extract this information on
+The command "dmesg | grep torture:" will extract this information on
 most systems.  On more esoteric configurations, it may be necessary to
 use other commands to access the output of the printk()s used by
 the RCU torture test.  The printk()s use KERN_ALERT, so they should
@@ -115,8 +128,9 @@
 	modprobe rcutorture
 	sleep 100
 	rmmod rcutorture
-	dmesg | grep rcutorture:
+	dmesg | grep torture:
 
 The output can be manually inspected for the error flag of "!!!".
 One could of course create a more elaborate script that automatically
-checked for such errors.
+checked for such errors.  The "rmmod" command forces a "SUCCESS" or
+"FAILURE" indication to be printk()ed.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2e352a6..0d189c9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1669,6 +1669,10 @@
 	usbhid.mousepoll=
 			[USBHID] The interval which mice are to be polled at.
 
+	vdso=		[IA-32]
+			vdso=1: enable VDSO (default)
+			vdso=0: disable VDSO mapping
+
 	video=		[FB] Frame buffer configuration
 			See Documentation/fb/modedb.txt.
 
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
new file mode 100644
index 0000000..5d61dac
--- /dev/null
+++ b/Documentation/pi-futex.txt
@@ -0,0 +1,121 @@
+Lightweight PI-futexes
+----------------------
+
+We are calling them lightweight for 3 reasons:
+
+ - in the user-space fastpath a PI-enabled futex involves no kernel work
+   (or any other PI complexity) at all. No registration, no extra kernel
+   calls - just pure fast atomic ops in userspace.
+
+ - even in the slowpath, the system call and scheduling pattern is very
+   similar to normal futexes.
+
+ - the in-kernel PI implementation is streamlined around the mutex
+   abstraction, with strict rules that keep the implementation
+   relatively simple: only a single owner may own a lock (i.e. no
+   read-write lock support), only the owner may unlock a lock, no
+   recursive locking, etc.
+
+Priority Inheritance - why?
+---------------------------
+
+The short reply: user-space PI helps achieve/improve determinism for
+user-space applications. In the best-case, it can help achieve
+determinism and well-bound latencies. Even in the worst-case, PI will
+improve the statistical distribution of locking related application
+delays.
+
+The longer reply:
+-----------------
+
+Firstly, sharing locks between multiple tasks is a common programming
+technique that often cannot be replaced with lockless algorithms. As we
+can see in the kernel [which is a quite complex program in itself],
+lockless structures are the exception rather than the norm - the current
+ratio of lockless vs. lock-based code for shared data structures is
+somewhere between 1:10 and 1:100. Lockless is hard, and the complexity of
+lockless algorithms often endangers the ability to do robust reviews of
+said code. That is, critical RT apps often choose lock structures to
+protect critical data structures, instead of lockless algorithms.
+Furthermore, there are cases (like shared hardware, or other resource
+limits) where lockless access is mathematically impossible.
+
+Media players (such as Jack) are an example of reasonable application
+design with multiple tasks (with multiple priority levels) sharing
+short-held locks: for example, a highprio audio playback thread is
+combined with medium-prio construct-audio-data threads and low-prio
+display-colory-stuff threads. Add video and decoding to the mix and
+we've got even more priority levels.
+
+So once we accept that synchronization objects (locks) are an
+unavoidable fact of life, and once we accept that multi-task userspace
+apps have a very fair expectation of being able to use locks, we've got
+to think about how to offer the option of a deterministic locking
+implementation to user-space.
+
+Most of the technical counter-arguments against doing priority
+inheritance only apply to kernel-space locks. But user-space locks are
+different, there we cannot disable interrupts or make the task
+non-preemptible in a critical section, so the 'use spinlocks' argument
+does not apply (user-space spinlocks have the same priority inversion
+problems as other user-space locking constructs). Fact is, pretty much
+the only technique that currently enables good determinism for userspace
+locks (such as futex-based pthread mutexes) is priority inheritance:
+
+Currently (without PI), if a high-prio and a low-prio task share a lock
+[this is a quite common scenario for most non-trivial RT applications],
+even if all critical sections are coded carefully to be deterministic
+(i.e. all critical sections are short in duration and only execute a
+limited number of instructions), the kernel cannot guarantee any
+deterministic execution of the high-prio task: any medium-priority task
+could preempt the low-prio task while it holds the shared lock and
+executes the critical section, and could delay it indefinitely.
+
+Implementation:
+---------------
+
+As mentioned before, the userspace fastpath of PI-enabled pthread
+mutexes involves no kernel work at all - they behave quite similarly to
+normal futex-based locks: a 0 value means unlocked, and a value==TID
+means locked. (This is the same method as used by list-based robust
+futexes.) Userspace uses atomic ops to lock/unlock these mutexes without
+entering the kernel.
+
+To handle the slowpath, we have added two new futex ops:
+
+  FUTEX_LOCK_PI
+  FUTEX_UNLOCK_PI
+
+If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to
+TID fails], then FUTEX_LOCK_PI is called. The kernel does all the
+remaining work: if there is no futex-queue attached to the futex address
+yet then the code looks up the task that owns the futex [it has put its
+own TID into the futex value], and attaches a 'PI state' structure to
+the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware,
+kernel-based synchronization object. The 'other' task is made the owner
+of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the
+futex value. Then this task tries to lock the rt-mutex, on which it
+blocks. Once it returns, it has the mutex acquired, and it sets the
+futex value to its own TID and returns. Userspace has no other work to
+perform - it now owns the lock, and the futex value contains
+FUTEX_WAITERS|TID.
+
+If the unlock side fastpath succeeds, [i.e. userspace manages to do a
+TID -> 0 atomic transition of the futex value], then no kernel work is
+triggered.
+
+If the unlock fastpath fails (because the FUTEX_WAITERS bit is set),
+then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on
+behalf of userspace - and it also unlocks the attached
+pi_state->rt_mutex and thus wakes up any potential waiters.
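+
+Below is a hypothetical user-space sketch of the fast and slow paths
+described above. Error handling, EINTR restarts and the robust-futex
+interaction are omitted, the helper names are made up, and the gcc
+__sync builtin stands in for the architecture's atomic cmpxchg; only
+FUTEX_LOCK_PI/FUTEX_UNLOCK_PI and the futex() arguments follow what this
+document describes:
+
+  #include <linux/futex.h>
+  #include <sys/syscall.h>
+  #include <unistd.h>
+  #include <stdint.h>
+
+  static void pi_mutex_lock(uint32_t *futex)
+  {
+          uint32_t tid = syscall(SYS_gettid);
+
+          /* fastpath: 0 -> TID transition, we own an uncontended lock */
+          if (__sync_val_compare_and_swap(futex, 0, tid) == 0)
+                  return;
+
+          /* slowpath: the kernel queues us and boosts the current owner */
+          syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
+  }
+
+  static void pi_mutex_unlock(uint32_t *futex)
+  {
+          uint32_t tid = syscall(SYS_gettid);
+
+          /* fastpath: TID -> 0 succeeds only if FUTEX_WAITERS is not set */
+          if (__sync_val_compare_and_swap(futex, tid, 0) == tid)
+                  return;
+
+          /* slowpath: the kernel hands the lock to the top waiter */
+          syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
+  }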
+
+Note that under this approach, contrary to previous PI-futex approaches,
+there is no prior 'registration' of a PI-futex. [which is not quite
+possible anyway, due to existing ABI properties of pthread mutexes.]
+
+Also, under this scheme, 'robustness' and 'PI' are two orthogonal
+properties of futexes, and all four combinations are possible: futex,
+robust-futex, PI-futex, robust+PI-futex.
+
+More details about priority inheritance can be found in
+Documentation/rtmutex.txt.
diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt
index df82d75..76e8064 100644
--- a/Documentation/robust-futexes.txt
+++ b/Documentation/robust-futexes.txt
@@ -95,7 +95,7 @@
 is empty. If the thread/process crashed or terminated in some incorrect
 way then the list might be non-empty: in this case the kernel carefully
 walks the list [not trusting it], and marks all locks that are owned by
-this thread with the FUTEX_OWNER_DEAD bit, and wakes up one waiter (if
+this thread with the FUTEX_OWNER_DIED bit, and wakes up one waiter (if
 any).
 
 The list is guaranteed to be private and per-thread at do_exit() time,
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt
new file mode 100644
index 0000000..c472ffa
--- /dev/null
+++ b/Documentation/rt-mutex-design.txt
@@ -0,0 +1,781 @@
+#
+# Copyright (c) 2006 Steven Rostedt
+# Licensed under the GNU Free Documentation License, Version 1.2
+#
+
+RT-mutex implementation design
+------------------------------
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/rt-mutex.txt.  Although this document does explain problems
+that can happen without this code, it does so only to aid in understanding
+what the code actually does.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run.  This happens for several reasons, and
+most of the time it can't be helped.  Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource.  This is a priority inversion.  What we want to prevent
+is something called unbounded priority inversion.  That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait, letting C run so it can release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock.  This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem.
+
+   grab lock L1 (owned by C)
+     |
+A ---+
+        C preempted by B
+          |
+C    +----+
+
+B         +-------->
+                B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document.  Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process.  To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A.  So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A.  As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain - The PI chain is an ordered series of locks and processes that cause
+           processes to inherit priorities from a previous process that is
+           blocked on one of its locks.  This is described in more detail
+           later in this document.
+
+mutex    - In this document, to differentiate the locks that implement
+           PI from the spin locks that are used in the PI code, from now
+           on the PI locks will be called mutexes.
+
+lock     - In this document from now on, I will use the term lock when
+           referring to spin locks that are used to protect parts of the PI
+           algorithm.  These locks disable preemption on UP (when
+           CONFIG_PREEMPT is enabled) and on SMP prevent multiple CPUs from
+           entering critical sections simultaneously.
+
+spin lock - Same as lock above.
+
+waiter   - A waiter is a struct that is stored on the stack of a blocked
+           process.  Since the scope of the waiter is within the code for
+           a process being blocked on the mutex, it is fine to allocate
+           the waiter on the process's stack (local variable).  This
+           structure holds a pointer to the task, as well as the mutex that
+           the task is blocked on.  It also has the plist node structures to
+           place the task in the wait_list of a mutex as well as the
+           pi_list of a mutex owner task (described below).
+
+           waiter is sometimes used in reference to the task that is waiting
+           on a mutex. This is the same as waiter->task.
+
+waiters  - A list of processes that are blocked on a mutex.
+
+top waiter - The highest priority process waiting on a specific mutex.
+
+top pi waiter - The highest priority process waiting on one of the mutexes
+                that a specific process owns.
+
+Note:  task and process are used interchangeably in this document, mostly to
+       differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place.  Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example:
+
+   Process:  A, B, C, D, E
+   Mutexes:  L1, L2, L3, L4
+
+   A owns: L1
+           B blocked on L1
+           B owns L2
+                  C blocked on L2
+                  C owns L3
+                         D blocked on L3
+                         D owns L4
+                                E blocked on L4
+
+The chain would be:
+
+   E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be:
+
+   F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains:
+
+   E->L4->D->L3->C->L2-+
+                       |
+                       +->B->L1->A
+                       |
+                 F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes.  If we add another process G that is
+blocked on mutex L2:
+
+  G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again.
+
+   E->L4->D->L3->C-+
+                   +->L2-+
+                   |     |
+                 G-+     +->B->L1->A
+                         |
+                   F->L5-+
+
+
+Plist
+-----
+
+Before I go further and talk about how the PI chain is stored through lists
+on both mutexes and processes, I'll explain the plist.  This is similar to
+the struct list_head functionality that is already in the kernel.
+The implementation of plist is out of scope for this document, but it is
+very important to understand what it does.
+
+There are a few differences between plist and list, the most important one
+being that plist is a priority sorted linked list.  This means that the
+priorities of the plist are sorted, such that it takes O(1) to retrieve the
+highest priority item in the list.  Obviously this is useful to store processes
+based on their priorities.
+
+Another difference, which is important for implementation, is that, unlike
+list, the head of the list is a different element than the nodes of a list.
+So the head of the list is declared as struct plist_head and nodes that will
+be added to the list are declared as struct plist_node.
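+
+A minimal sketch of how the plist API from include/linux/plist.h is used
+(the exact init signature may differ between kernel versions; the version
+assumed here takes the protecting spinlock for list debugging):
+
+#include <linux/plist.h>
+#include <linux/spinlock.h>
+
+static DEFINE_SPINLOCK(example_lock);
+static struct plist_head example_head;
+static struct plist_node waiter_a, waiter_b;
+
+static void plist_example(void)
+{
+	struct plist_node *top;
+
+	plist_head_init(&example_head, &example_lock);
+
+	plist_node_init(&waiter_a, 10);	/* prio 10 */
+	plist_node_init(&waiter_b, 5);	/* prio 5: higher priority (lower number) */
+
+	plist_add(&waiter_a, &example_head);
+	plist_add(&waiter_b, &example_head);
+
+	/* O(1): the first node is always the highest priority one */
+	top = plist_first(&example_head);	/* top == &waiter_b */
+}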
+
+
+Mutex Waiter List
+-----------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The mutex
+has a plist to store these waiters by priority.  This list is protected by
+a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.  Since the modification of the waiter list is never done in
+interrupt context, the wait_lock can be taken without disabling interrupts.
+
+
+Task PI List
+------------
+
+To keep track of the PI chains, each process has its own PI list.  This is
+a list of all top waiters of the mutexes that are owned by the process.
+Note that this list only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI list is always the highest priority task that
+is waiting on a mutex that is owned by the task.  So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this list.
+
+This list is stored in the task structure of a process as a plist called
+pi_list.  This list is protected by a spin lock also in the task structure,
+called pi_lock.  This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
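+
+A small sketch of the resulting locking rules (illustrative only, not a
+quote of rtmutex.c): the wait_lock is never taken from interrupt context,
+so a plain spin_lock is enough, while the pi_lock must disable interrupts:
+
+static void lock_both_example(struct rt_mutex *lock, struct task_struct *task)
+{
+	unsigned long flags;
+
+	spin_lock(&lock->wait_lock);			/* never used in irq context */
+	spin_lock_irqsave(&task->pi_lock, flags);	/* may be used in irq context */
+
+	/* ... manipulate the mutex's wait_list and the task's pi_list ... */
+
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+	spin_unlock(&lock->wait_lock);
+}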
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined.  But it is very complex to figure out, since it depends on all
+the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way.
+
+void func1(void)
+{
+	mutex_lock(L1);
+
+	/* do anything */
+
+	mutex_unlock(L1);
+}
+
+void func2(void)
+{
+	mutex_lock(L1);
+	mutex_lock(L2);
+
+	/* do something */
+
+	mutex_unlock(L2);
+	mutex_unlock(L1);
+}
+
+void func3(void)
+{
+	mutex_lock(L2);
+	mutex_lock(L3);
+
+	/* do something else */
+
+	mutex_unlock(L3);
+	mutex_unlock(L2);
+}
+
+void func4(void)
+{
+	mutex_lock(L3);
+
+	/* do something again */
+
+	mutex_unlock(L3);
+}
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last.  With D being preempted
+in func4 in the "do something again" area, we have a locking order as follows:
+
+D owns L3
+       C blocked on L3
+       C owns L2
+              B blocked on L2
+              B owns L1
+                     A blocked on L1
+
+And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two.  So, although the locking depth is defined at compile time,
+it is still very difficult to determine all the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DoS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data.  So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain.  More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex.  If the
+mutex is not owned, this owner is set to NULL.  Since all architectures
+have the task structure on at least a four byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the two
+least significant bits to be used as flags.  This part is also described
+in Documentation/rt-mutex.txt, but will also be briefly described here.
+
+Bit 0 is used as the "Pending Owner" flag.  This is described later.
+Bit 1 is used as the "Has Waiters" flag.  This is also described later
+  in more detail, but is set whenever there are waiters on a mutex.
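+
+A sketch of how those two bits are encoded in the owner pointer (the
+macro names here are illustrative; the layout is the one described above,
+bit 0 = "Pending Owner", bit 1 = "Has Waiters"):
+
+#define RT_MUTEX_OWNER_PENDING	1UL
+#define RT_MUTEX_HAS_WAITERS	2UL
+#define RT_MUTEX_OWNER_MASKALL	3UL
+
+static inline struct task_struct *example_rt_mutex_owner(struct rt_mutex *lock)
+{
+	/* strip the flag bits to get the real task pointer (or NULL) */
+	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline int example_rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+	return ((unsigned long)lock->owner & RT_MUTEX_HAS_WAITERS) != 0;
+}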
+
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically:
+
+unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+{
+        unsigned long T = *A;
+        if (*A == *B) {
+                *A = *C;
+        }
+        return T;
+}
+#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be.  You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time.  But if CMPXCHG is supported, then this greatly
+helps keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field helps optimize
+the system for architectures that support it.  This will also be explained
+later in this document.
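+
+To make the fastpath concrete, here is an illustrative version of the
+macro and how it is used (not a verbatim copy of rtmutex.c): acquiring
+swaps NULL -> current, releasing swaps current -> NULL, and any failure
+falls back to the slow paths described later in this document:
+
+#define example_rt_mutex_cmpxchg(lock, old, new) \
+	(cmpxchg(&(lock)->owner, (old), (new)) == (old))
+
+static int rt_mutex_fast_trylock_example(struct rt_mutex *lock)
+{
+	/* NULL -> current: fast acquire of an uncontended mutex */
+	return example_rt_mutex_cmpxchg(lock, NULL, current);
+}
+
+static int rt_mutex_fast_unlock_example(struct rt_mutex *lock)
+{
+	/* current -> NULL: fast release when there are no waiters */
+	return example_rt_mutex_cmpxchg(lock, current, NULL);
+}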
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places where a
+process must adjust its priority.  With the help of the pi_list of a
+process, it is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio,
+__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock
+to already be taken), rt_mutex_getprio, and rt_mutex_setprio.
+
+rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio.
+
+rt_mutex_getprio returns the priority that the task should have.  Either the
+task's own normal priority, or if a process of a higher priority is waiting on
+a mutex owned by the task, then that higher priority should be returned.
+Since the pi_list of a task holds a priority-ordered list of all the top
+waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs
+to compare the top pi waiter to its own normal priority, and return the higher
+priority back.
+
+(Note:  if looking at the code, you will notice that the lower number of
+        prio is returned.  This is because the prio field in the task structure
+        is an inverse order of the actual priority.  So a "prio" of 5 is
+        of higher priority than a "prio" of 10.)
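+
+A sketch of that comparison (not necessarily the exact code in rtmutex.c;
+pi_waiters is the task_struct member backing the pi_list described above):
+
+int rt_mutex_getprio_example(struct task_struct *task)
+{
+	if (plist_head_empty(&task->pi_waiters))
+		return task->normal_prio;
+
+	/* lower number == higher priority, so return the smaller value */
+	return min(plist_first(&task->pi_waiters)->prio, task->normal_prio);
+}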
+
+__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the
+result does not equal the task's current priority, then rt_mutex_setprio
+is called to adjust the priority of the task to the new priority.
+Note that rt_mutex_setprio is defined in kernel/sched.c to implement the
+actual change in priority.
+
+It is interesting to note that __rt_mutex_adjust_prio can either increase
+or decrease the priority of the task.  In the case that a higher priority
+process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio
+would increase/boost the task's priority.  But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task.  That is because the pi_list
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best.  It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, and a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting).
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held.  That also means
+that the state of the owner and lock can change after entering this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it.  This means that the task is set to the priority that it
+should be at, but the plist nodes of the task's waiter have not been updated
+with the new priorities, and that this task may not be in the proper locations
+in the pi_lists and wait_lists that the task is blocked on.  This function
+solves all that.
+
+A loop is entered, where task is the owner to be checked for PI changes, as
+passed in by parameter (for the first iteration).  The pi_lock of this task is
+taken to prevent any more changes to the pi_list of the task.  This also
+prevents new tasks from completing the blocking on a mutex that is owned by this
+task.
+
+If the task is not blocked on a mutex then the loop is exited.  We are at
+the top of the PI chain.
+
+A check is now done to see if the original waiter (the process that is blocked
+on the current mutex) is the top pi waiter of the task.  That is, is this
+waiter on the top of the task's pi_list?  If it is not, it either means that
+there is another process higher in priority that is blocked on one of the
+mutexes that the task owns, or that the waiter has just woken up via a signal
+or timeout and has left the PI chain.  In either case, the loop is exited, since
+we don't need to do any more changes to the priority of the current task, or any
+task that owns a mutex that this current task is waiting on.  A priority chain
+walk is only needed when a new top pi waiter is made to a task.
+
+The next check sees if the task's waiter plist node has a priority equal to
+the priority the task is set at.  If they are equal, then we are done with
+the loop.  Remember that the function started with the priority of the
+task adjusted, but the plist nodes that hold the task in other processes'
+pi_lists have not been adjusted.
+
+Next, we look at the mutex that the task is blocked on. The mutex's wait_lock
+is taken.  This is done by a spin_trylock, because the locking order of the
+pi_lock and wait_lock goes in the opposite direction. If we fail to grab the
+lock, the pi_lock is released, and we restart the loop.
+
+Now that we have both the pi_lock of the task as well as the wait_lock of
+the mutex the task is blocked on, we update the task's waiter's plist node
+that is located on the mutex's wait_list.
+
+Now we release the pi_lock of the task.
+
+Next the owner of the mutex has its pi_lock taken, so we can update the
+task's entry in the owner's pi_list.  If the task is the highest priority
+process on the mutex's wait_list, then we remove the previous top waiter
+from the owner's pi_list, and replace it with the task.
+
+Note: It is possible that the task was the current top waiter on the mutex,
+      in which case the task is not yet on the pi_list of the waiter.  This
+      is OK, since plist_del does nothing if the plist node is not on any
+      list.
+
+If the task was not the top waiter of the mutex, but it was before we
+did the priority updates, that means we are deboosting/lowering the
+task.  In this case, the task is removed from the pi_list of the owner,
+and the new top waiter is added.
+
+Lastly, we unlock both the pi_lock of the task, as well as the mutex's
+wait_lock, and continue the loop again.  On the next iteration of the
+loop, the previous owner of the mutex will be the task that will be
+processed.
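+
+The following is a simplified skeleton of that loop (field and helper
+names approximate the real rtmutex.c structures; deadlock detection, the
+early-exit checks described above and the owner's pi_list bookkeeping are
+elided to comments):
+
+static void adjust_prio_chain_example(struct task_struct *task)
+{
+	struct rt_mutex_waiter *waiter;
+	struct rt_mutex *lock;
+	struct task_struct *owner;
+	unsigned long flags;
+
+	for (;;) {
+		spin_lock_irqsave(&task->pi_lock, flags);
+
+		waiter = task->pi_blocked_on;
+		if (!waiter) {			/* top of the PI chain */
+			spin_unlock_irqrestore(&task->pi_lock, flags);
+			break;
+		}
+		lock = waiter->lock;
+
+		/* wait_lock nests the other way around: trylock or retry */
+		if (!spin_trylock(&lock->wait_lock)) {
+			spin_unlock_irqrestore(&task->pi_lock, flags);
+			continue;
+		}
+
+		/* requeue the waiter at the task's (possibly new) priority */
+		plist_del(&waiter->list_entry, &lock->wait_list);
+		waiter->list_entry.prio = task->prio;
+		plist_add(&waiter->list_entry, &lock->wait_list);
+
+		spin_unlock_irqrestore(&task->pi_lock, flags);
+
+		/* update the owner's pi_list and priority, then walk up */
+		owner = rt_mutex_owner(lock);
+		spin_lock_irqsave(&owner->pi_lock, flags);
+		/* ... replace the top waiter in owner's pi_list, adjust prio ... */
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+
+		spin_unlock(&lock->wait_lock);
+		task = owner;
+	}
+}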
+
+Note: One might think that the owner of this mutex might have changed
+      since we just grabbed the mutex's wait_lock. And one could be right.
+      The important thing to remember is that the owner could not have
+      become the task that is being processed in the PI chain, since
+      we have taken that task's pi_lock at the beginning of the loop.
+      So as long as there is an owner of this mutex that is not the same
+      process as the task being worked on, we are OK.
+
+      Looking closely at the code, one might be confused.  The check for the
+      end of the PI chain is when the task isn't blocked on anything or the
+      task's waiter structure "task" element is NULL.  This check is
+      protected only by the task's pi_lock.  But the code to unlock the mutex
+      sets the task's waiter structure "task" element to NULL with only
+      the protection of the mutex's wait_lock, which was not taken yet.
+      Isn't this a race condition if the task becomes the new owner?
+
+      The answer is No!  The trick is the spin_trylock of the mutex's
+      wait_lock.  If we fail that lock, we release the pi_lock of the
+      task and continue the loop, doing the end of PI chain check again.
+
+      In the code to release the lock, the wait_lock of the mutex is held
+      the entire time, and it is not let go when we grab the pi_lock of the
+      new owner of the mutex.  So if the switch of a new owner were to happen
+      after the check for end of the PI chain and the grabbing of the
+      wait_lock, the unlocking code would spin on the new owner's pi_lock
+      but never give up the wait_lock.  So the PI chain loop is guaranteed to
+      fail the spin_trylock on the wait_lock, release the pi_lock, and
+      try again.
+
+      If you don't quite understand the above, that's OK. You don't have to,
+      unless you really want to make a proof out of it ;)
+
+
+Pending Owners and Lock stealing
+--------------------------------
+
+One of the flags in the owner field of the mutex structure is "Pending Owner".
+What this means is that an owner was chosen by the process releasing the
+mutex, but that owner has yet to wake up and actually take the mutex.
+
+Why is this important?  Why can't we just give the mutex to another process
+and be done with it?
+
+The PI code is to help with real-time processes, and to let the highest
+priority process run as long as possible with little latencies and delays.
+If a high priority process owns a mutex that a lower priority process is
+blocked on, when the mutex is released it would be given to the lower priority
+process.  What if the higher priority process wants to take that mutex again?
+The high priority process would fail to take the mutex that it just gave up,
+and it would need to boost the lower priority process and wait out the full
+latency of that critical section (since the low priority process just entered
+it).
+
+There's no reason a high priority process that gives up a mutex should be
+penalized if it tries to take that mutex again.  If the new owner of the
+mutex has not woken up yet, there's no reason that the higher priority process
+could not take that mutex away.
+
+To solve this, we introduced Pending Ownership and Lock Stealing.  When a
+new process is given a mutex that it was blocked on, it is only given
+pending ownership.  This means that it's the new owner, unless a higher
+priority process comes in and tries to grab that mutex.  If a higher priority
+process does come along and wants that mutex, we let the higher priority
+process "steal" the mutex from the pending owner (only if it is still pending)
+and continue with the mutex.
+
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex.  This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails).  Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, whether it is owned or has only a pending
+owner, we take the slow path (rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack.  This is because the waiter structure is only needed for the
+scope of this function.  The waiter structure holds the nodes to store
+the task on the wait_list of the mutex, and if need be, the pi_list of
+the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex.  This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path.  The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field.  Yes, this could really
+be false, because if the mutex has no owner, there are no waiters and
+the current task also won't have any waiters.  But we don't have the lock
+yet, so we assume we are going to be a waiter.  The reason for this is to
+play nice for those architectures that do have CMPXCHG.  By setting this flag
+now, the owner of the mutex can't release the mutex without going into the
+slow unlock path, and it would then need to grab the wait_lock, which this
+code currently holds.  So setting the "Has Waiters" flag forces the owner
+to synchronize with this code.
+
+Now that we know that we can't have any races with the owner releasing the
+mutex, we check to see if we can take the ownership.  This is done if the
+mutex doesn't have an owner, or if we can steal the mutex from a pending
+owner.  Let's look at the situations we have here.
+
+  1) Has owner that is pending
+  ----------------------------
+
+  The mutex has an owner, but it hasn't woken up and the mutex flag
+  "Pending Owner" is set.  The first check is to see if the owner isn't the
+  current task.  This is because this function is also used for the pending
+  owner to grab the mutex.  When a pending owner wakes up, it checks to see
+  if it can take the mutex, and this is done if the owner is already set to
+  itself.  If so, we succeed and leave the function, clearing the "Pending
+  Owner" bit.
+
+  If the pending owner is not current, we check to see if the current task's
+  priority is higher than the pending owner's.  If not, we fail the function
+  and return.
+
+  There's also something special about a pending owner: a pending owner
+  is never blocked on a mutex.  So there is no PI chain to worry about.  It also
+  means that if the mutex doesn't have any waiters, there's no accounting needed
+  to update the pending owner's pi_list, since we only worry about processes
+  blocked on the current mutex.
+
+  If there are waiters on this mutex, and we just stole the ownership, we need
+  to take the top waiter, remove it from the pi_list of the pending owner, and
+  add it to the current pi_list.  Note that at this moment, the pending owner
+  is no longer on the list of waiters.  This is fine, since the pending owner
+  would add itself back when it realizes that it had the ownership stolen
+  from itself.  When the pending owner tries to grab the mutex, it will fail
+  in try_to_take_rt_mutex if the owner field points to another process.
+
+  2) No owner
+  -----------
+
+  If there is no owner (or we successfully stole the lock), we set the owner
+  of the mutex to current, and set the flag of "Has Waiters" if the current
+  mutex actually has waiters, or we clear the flag if it doesn't.  See, it was
+  OK that we set that flag early, since now it is cleared.
+
+  3) Failed to grab ownership
+  ---------------------------
+
+  The most interesting case is when we fail to take ownership. This means that
+  there exists an owner, or there's a pending owner with equal or higher
+  priority than the current task.
+
+We'll continue on the failed case.
+
+If the mutex has a timeout, we set up a timer to go off to break us out
+of this mutex if we failed to get it after a specified amount of time.
+
+Now we enter a loop that will continue to try to take ownership of the mutex, or
+fail from a timeout or signal.
+
+Once again we try to take the mutex.  This will usually fail the first time
+in the loop, since it had just failed to get the mutex.  But the second time
+in the loop, this would likely succeed, since the task would likely be
+the pending owner.
+
+If the task's state is TASK_INTERRUPTIBLE, a check for signals and timeout is
+done here.
+
+The waiter structure has a "task" field that points to the task that is blocked
+on the mutex.  This field can be NULL the first time it goes through the loop
+or if the task is a pending owner and had its mutex stolen.  If the "task"
+field is NULL then we need to set up the accounting for it.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process.  The "task" field is set to the process, and the "lock" field
+to the mutex.  The plist nodes are initialized to the process's current
+priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the wait_list.  If the current process is the highest
+priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_list of the owner,
+and add the current process to that list.  Since the pi_list of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_list changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up from schedule() for a few reasons:
+  1) we were given pending ownership of the mutex.
+  2) we received a signal and were in the TASK_INTERRUPTIBLE state
+  3) we had a timeout and were in the TASK_INTERRUPTIBLE state
+
+In any of these cases, we continue the loop and once again try to grab the
+ownership of the mutex.  If we succeed, we exit the loop.  Otherwise, on a
+signal or timeout we exit the loop, and if we had the mutex stolen we simply
+add ourselves back on the lists and go back to sleep.
+
+Note: For various reasons, because of timeout and signals, the steal mutex
+      algorithm needs to be careful. This is because the current process is
+      still on the wait_list. And because of dynamic changing of priorities,
+      especially on SCHED_OTHER tasks, the current process can be the
+      highest priority task on the wait_list.
+
+Failed to get mutex on Timeout or Signal
+----------------------------------------
+
+If a timeout or signal occurred, the waiter's "task" field would not be
+NULL, and the task needs to be taken off the wait_list of the mutex and perhaps the
+pi_list of the owner.  If this process was a high priority process, then
+the rt_mutex_adjust_prio_chain needs to be executed again on the owner,
+but this time it will be lowering the priorities.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG.  Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex.  If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex.  This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not.  On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not.  On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too.  If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up and give that waiter
+pending ownership.
+
+In the wake up code, the pi_lock of the current owner is taken.  The top
+waiter of the lock is found and removed from the wait_list of the mutex
+as well as the pi_list of the current owner.  The task field of the new
+pending owner's waiter structure is set to NULL, and the owner field of the
+mutex is set to the new owner with the "Pending Owner" bit set, as well
+as the "Has Waiters" bit if there still are other processes blocked on the
+mutex.
+
+The pi_lock of the previous owner is released, and the new pending owner's
+pi_lock is taken.  Remember that this is the trick to prevent the race
+condition in rt_mutex_adjust_prio_chain from adding itself as a waiter
+on the mutex.
+
+We now clear the "pi_blocked_on" field of the new pending owner, and if
+the mutex still has waiters pending, we add the new top waiter to the pi_list
+of the pending owner.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author:  Steven Rostedt <rostedt@goodmis.org>
+
+Reviewers:  Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt
new file mode 100644
index 0000000..243393d
--- /dev/null
+++ b/Documentation/rt-mutex.txt
@@ -0,0 +1,79 @@
+RT-mutex subsystem with PI support
+----------------------------------
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the time high-prio tasks block on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+a high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter list is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters list. This list too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. [The
+priority enqueueing is handled by "plists", see include/linux/plist.h
+for more details.]
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+rt_mutex->owner holds the task_struct pointer of the owner. Bits 0 and 1
+are used to keep track of the "owner is pending" and "rtmutex has
+waiters" state.
+
+ owner		bit1	bit0
+ NULL		0	0	mutex is free (fast acquire possible)
+ NULL		0	1	invalid state
+ NULL		1	0	Transitional state*
+ NULL		1	1	invalid state
+ taskpointer	0	0	mutex is held (fast release possible)
+ taskpointer	0	1	task is pending owner
+ taskpointer	1	0	mutex is held and has waiters
+ taskpointer	1	1	task is pending owner and mutex has waiters
+
+Pending-ownership handling is a performance optimization:
+pending-ownership is assigned to the first (highest priority) waiter of
+the mutex, when the mutex is released. The thread is woken up and once
+it starts executing it can acquire the mutex. Until the mutex is taken
+by it (bit 0 is cleared) a competing higher priority thread can "steal"
+the mutex which puts the woken up thread back on the waiters list.
+
+The pending-ownership optimization is especially important for the
+uninterrupted workflow of high-prio tasks which repeatedly
+take/release locks that have lower-prio waiters. Without this
+optimization the higher-prio thread would ping-pong to the lower-prio
+task [because at unlock time we always assign a new owner].
+
+(*) The "mutex has waiters" bit gets set to take the lock. If the lock
+doesn't already have an owner, this bit is quickly cleared if there are
+no waiters.  So this is a transitional state to synchronize with looking
+at the owner field of the mutex and the mutex owner releasing the lock.
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 558b833..254c507 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -481,7 +481,7 @@
 		struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
 		if (!p)
 			return -ENOMEM;
-		register_cpu(p, i, NULL);
+		register_cpu(p, i);
 	}
 	return 0;
 }
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 9fc9af8..093ccba 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -808,7 +808,7 @@
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu, NULL);
+		register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu);
 
 	return 0;
 }
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 47c08bc..3bb221d 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -233,7 +233,7 @@
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
-	depends on SMP
+	depends on X86_HT
 	help
 	  SMT scheduler support improves the CPU scheduler's decision making
 	  when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -242,7 +242,7 @@
 
 config SCHED_MC
 	bool "Multi-core scheduler support"
-	depends on SMP
+	depends on X86_HT
 	default y
 	help
 	  Multi-core scheduler support improves the CPU scheduler's decision
@@ -780,6 +780,17 @@
 	  enable suspend on SMP systems. CPUs can be controlled through
 	  /sys/devices/system/cpu.
 
+config COMPAT_VDSO
+	bool "Compat VDSO support"
+	default y
+	help
+	  Map the VDSO to the predictable old-style address too.
+
+	  Say N here if you are running a sufficiently recent glibc
+	  version (2.3.3 or later), to remove the high-mapped
+	  VDSO mapping and to exclusively use the randomized VDSO.
+
+	  If unsure, say Y.
 
 endmenu
 
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1c3a809..c80271f 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -14,6 +14,7 @@
 #include <asm/fixmap.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
+#include <asm/elf.h>
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -54,6 +55,7 @@
 	OFFSET(TI_preempt_count, thread_info, preempt_count);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
 	BLANK();
 
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
@@ -69,7 +71,7 @@
 		 sizeof(struct tss_struct));
 
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-	DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+	DEFINE(VDSO_PRELINK, VDSO_PRELINK);
 
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 }
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index fd0457c..e6a2d6b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -235,10 +235,10 @@
 			while ((1 << bits) < c->x86_max_cores)
 				bits++;
 		}
-		cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1);
-		phys_proc_id[cpu] >>= bits;
+		c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
+		c->phys_proc_id >>= bits;
 		printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
-		       cpu, c->x86_max_cores, cpu_core_id[cpu]);
+		       cpu, c->x86_max_cores, c->cpu_core_id);
 	}
 #endif
 
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 44f2c5f..70c87de 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -294,7 +294,7 @@
 			if (c->x86 >= 0x6)
 				c->x86_model += ((tfms >> 16) & 0xF) << 4;
 			c->x86_mask = tfms & 15;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
 			c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 #else
 			c->apicid = (ebx >> 24) & 0xFF;
@@ -319,7 +319,7 @@
 	early_intel_workaround(c);
 
 #ifdef CONFIG_X86_HT
-	phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
 }
 
@@ -477,11 +477,9 @@
 {
 	u32 	eax, ebx, ecx, edx;
 	int 	index_msb, core_bits;
-	int 	cpu = smp_processor_id();
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
 
@@ -492,16 +490,17 @@
 	} else if (smp_num_siblings > 1 ) {
 
 		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+			printk(KERN_WARNING "CPU: Unsupported number of the "
+					"siblings %d", smp_num_siblings);
 			smp_num_siblings = 1;
 			return;
 		}
 
 		index_msb = get_count_order(smp_num_siblings);
-		phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+		c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
 
 		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       phys_proc_id[cpu]);
+		       c->phys_proc_id);
 
 		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
@@ -509,12 +508,12 @@
 
 		core_bits = get_count_order(c->x86_max_cores);
 
-		cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+		c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
 					       ((1 << core_bits) - 1);
 
 		if (c->x86_max_cores > 1)
 			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-			       cpu_core_id[cpu]);
+			       c->cpu_core_id);
 	}
 }
 #endif
@@ -613,6 +612,12 @@
 		set_in_cr4(X86_CR4_TSD);
 	}
 
+	/* The CPU hotplug case */
+	if (cpu_gdt_descr->address) {
+		gdt = (struct desc_struct *)cpu_gdt_descr->address;
+		memset(gdt, 0, PAGE_SIZE);
+		goto old_gdt;
+	}
 	/*
 	 * This is a horrible hack to allocate the GDT.  The problem
 	 * is that cpu_init() is called really early for the boot CPU
@@ -631,7 +636,7 @@
 				local_irq_enable();
 		}
 	}
-
+old_gdt:
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 6c37b4f..e9f0b92 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -159,13 +159,13 @@
 	unsigned val;
 };
 
-static unsigned short assocs[] = {
+static const unsigned short assocs[] = {
 	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
 	[8] = 16,
 	[0xf] = 0xffff // ??
 	};
-static unsigned char levels[] = { 1, 1, 2 };
-static unsigned char types[] = { 1, 2, 3 };
+static const unsigned char levels[] = { 1, 1, 2 };
+static const unsigned char types[] = { 1, 2, 3 };
 
 static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 		       union _cpuid4_leaf_ebx *ebx,
@@ -261,7 +261,7 @@
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
 	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
 	unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
 #endif
 
@@ -383,14 +383,14 @@
 
 	if (new_l2) {
 		l2 = new_l2;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
 		cpu_llc_id[cpu] = l2_id;
 #endif
 	}
 
 	if (new_l3) {
 		l3 = new_l3;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
 		cpu_llc_id[cpu] = l3_id;
 #endif
 	}
@@ -729,7 +729,7 @@
 	return;
 }
 
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -747,7 +747,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cacheinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
 {
     .notifier_call = cacheinfo_cpu_callback,
 };
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index a19fcb2..f54a152 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -18,7 +18,7 @@
 	 * applications want to get the raw CPUID data, they should access
 	 * /dev/cpu/<cpu_nr>/cpuid instead.
 	 */
-	static char *x86_cap_flags[] = {
+	static const char * const x86_cap_flags[] = {
 		/* Intel-defined */
 	        "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
 	        "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
@@ -62,7 +62,7 @@
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 	};
-	static char *x86_power_flags[] = {
+	static const char * const x86_power_flags[] = {
 		"ts",	/* temperature sensor */
 		"fid",  /* frequency id control */
 		"vid",  /* voltage id control */
@@ -109,9 +109,9 @@
 		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
 #ifdef CONFIG_X86_HT
 	if (c->x86_max_cores * smp_num_siblings > 1) {
-		seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]);
+		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
 		seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n]));
-		seq_printf(m, "core id\t\t: %d\n", cpu_core_id[n]);
+		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	}
 #endif
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 1d9a4ab..f6dfa9f 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -183,7 +183,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cpuid_class_cpu_notifier =
+static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index e6e4506..fbdb933 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -83,6 +83,12 @@
 #define resume_kernel		restore_nocheck
 #endif
 
+#ifdef CONFIG_VM86
+#define resume_userspace_sig	check_userspace
+#else
+#define resume_userspace_sig	resume_userspace
+#endif
+
 #define SAVE_ALL \
 	cld; \
 	pushl %es; \
@@ -211,6 +217,7 @@
 	preempt_stop
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
+check_userspace:
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
 	testl $(VM_MASK | 3), %eax
@@ -263,7 +270,12 @@
 	pushl $(__USER_CS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET cs, 0*/
-	pushl $SYSENTER_RETURN
+	/*
+	 * Push current_thread_info()->sysenter_return to the stack.
+	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
+	 */
+	pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET eip, 0
 
@@ -415,7 +427,7 @@
 					# vm86-space
 	xorl %edx, %edx
 	call do_notify_resume
-	jmp resume_userspace
+	jmp resume_userspace_sig
 
 	ALIGN
 work_notifysig_v86:
@@ -428,7 +440,7 @@
 	movl %eax, %esp
 	xorl %edx, %edx
 	call do_notify_resume
-	jmp resume_userspace
+	jmp resume_userspace_sig
 #endif
 
 	# perform syscall exit tracing
@@ -515,7 +527,7 @@
  .if vector
 	CFI_ADJUST_CFA_OFFSET -4
  .endif
-1:	pushl $vector-256
+1:	pushl $~(vector)
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp common_interrupt
 .data
@@ -535,7 +547,7 @@
 #define BUILD_INTERRUPT(name, nr)	\
 ENTRY(name)				\
 	RING0_INT_FRAME;		\
-	pushl $nr-256;			\
+	pushl $~(nr);			\
 	CFI_ADJUST_CFA_OFFSET 4;	\
 	SAVE_ALL;			\
 	movl %esp,%eax;			\
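
The displacement in the sysenter hunk above, "pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)", can be sanity-checked by hand.  Writing "top" for the top of the THREAD_SIZE-aligned kernel stack, thread_info sits at top - THREAD_SIZE, copy_thread leaves esp0 at top - 8 (the "+8" in the comment), and the four words pushed just before have moved %esp down by 4*4:

	%esp                  = (top - 8) - 4*4
	&ti->sysenter_return  = (top - THREAD_SIZE) + TI_sysenter_return
	difference            = TI_sysenter_return - THREAD_SIZE + 8 + 4*4

which is exactly the operand used in the pushl.
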
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 061533e..c703bc7 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -53,8 +53,8 @@
  */
 fastcall unsigned int do_IRQ(struct pt_regs *regs)
 {	
-	/* high bits used in ret_from_ code */
-	int irq = regs->orig_eax & 0xff;
+	/* high bit used in ret_from_ code */
+	int irq = ~regs->orig_eax;
 #ifdef CONFIG_4KSTACKS
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
@@ -100,8 +100,8 @@
 		 * softirq checks work in the hardirq context.
 		 */
 		irqctx->tinfo.preempt_count =
-			irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK |
-			curctx->tinfo.preempt_count & SOFTIRQ_MASK;
+			(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+			(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
 
 		asm volatile(
 			"       xchgl   %%ebx,%%esp      \n"
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 7a32823..d022cb8 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -266,7 +266,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block msr_class_cpu_notifier =
+static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
 {
 	.notifier_call = msr_class_cpu_callback,
 };
diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c
index 321f5fd..9bf590c 100644
--- a/arch/i386/kernel/scx200.c
+++ b/arch/i386/kernel/scx200.c
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 
 #include <linux/scx200.h>
@@ -45,11 +46,19 @@
 	.probe = scx200_probe,
 };
 
-static DEFINE_SPINLOCK(scx200_gpio_config_lock);
+static DEFINE_MUTEX(scx200_gpio_config_lock);
+
+static void __devinit scx200_init_shadow(void)
+{
+	int bank;
+
+	/* read the current values driven on the GPIO signals */
+	for (bank = 0; bank < 2; ++bank)
+		scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+}
 
 static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
-	int bank;
 	unsigned base;
 
 	if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
@@ -63,10 +72,7 @@
 		}
 
 		scx200_gpio_base = base;
-
-		/* read the current values driven on the GPIO signals */
-		for (bank = 0; bank < 2; ++bank)
-			scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+		scx200_init_shadow();
 
 	} else {
 		/* find the base of the Configuration Block */
@@ -87,12 +93,11 @@
 	return 0;
 }
 
-u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
 {
 	u32 config, new_config;
-	unsigned long flags;
 
-	spin_lock_irqsave(&scx200_gpio_config_lock, flags);
+	mutex_lock(&scx200_gpio_config_lock);
 
 	outl(index, scx200_gpio_base + 0x20);
 	config = inl(scx200_gpio_base + 0x24);
@@ -100,45 +105,11 @@
 	new_config = (config & mask) | bits;
 	outl(new_config, scx200_gpio_base + 0x24);
 
-	spin_unlock_irqrestore(&scx200_gpio_config_lock, flags);
+	mutex_unlock(&scx200_gpio_config_lock);
 
 	return config;
 }
 
-#if 0
-void scx200_gpio_dump(unsigned index)
-{
-	u32 config = scx200_gpio_configure(index, ~0, 0);
-	printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config);
-	
-	if (config & 1) 
-		printk(" OE"); /* output enabled */
-	else
-		printk(" TS"); /* tristate */
-	if (config & 2) 
-		printk(" PP"); /* push pull */
-	else
-		printk(" OD"); /* open drain */
-	if (config & 4) 
-		printk(" PUE"); /* pull up enabled */
-	else
-		printk(" PUD"); /* pull up disabled */
-	if (config & 8) 
-		printk(" LOCKED"); /* locked */
-	if (config & 16) 
-		printk(" LEVEL"); /* level input */
-	else
-		printk(" EDGE"); /* edge input */
-	if (config & 32) 
-		printk(" HI"); /* trigger on rising edge */
-	else
-		printk(" LO"); /* trigger on falling edge */
-	if (config & 64) 
-		printk(" DEBOUNCE"); /* debounce */
-	printk("\n");
-}
-#endif  /*  0  */
-
 static int __init scx200_init(void)
 {
 	printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
@@ -159,10 +130,3 @@
 EXPORT_SYMBOL(scx200_gpio_shadow);
 EXPORT_SYMBOL(scx200_gpio_configure);
 EXPORT_SYMBOL(scx200_cb_base);
-
-/*
-    Local variables:
-        compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 5c352c3..43002cf 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -351,7 +351,7 @@
 			goto give_sigsegv;
 	}
 
-	restorer = &__kernel_sigreturn;
+	restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ka->sa.sa_restorer;
 
@@ -447,7 +447,7 @@
 		goto give_sigsegv;
 
 	/* Set up to return from userspace.  */
-	restorer = &__kernel_rt_sigreturn;
+	restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ka->sa.sa_restorer;
 	err |= __put_user(restorer, &frame->pretcode);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index bce5470..89e7315 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -67,12 +67,6 @@
 EXPORT_SYMBOL(smp_num_siblings);
 #endif
 
-/* Package ID of each logical CPU */
-int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
-/* Core ID of each logical CPU */
-int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
 /* Last level cache ID of each logical CPU */
 int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
 
@@ -454,10 +448,12 @@
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 	/*
 	 * For perf, we return last level cache shared map.
-	 * TBD: when power saving sched policy is added, we will return
-	 *      cpu_core_map when power saving policy is enabled
+	 * And for power savings, we return cpu_core_map
 	 */
-	return c->llc_shared_map;
+	if (sched_mc_power_savings || sched_smt_power_savings)
+		return cpu_core_map[cpu];
+	else
+		return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
@@ -473,8 +469,8 @@
 
 	if (smp_num_siblings > 1) {
 		for_each_cpu_mask(i, cpu_sibling_setup_map) {
-			if (phys_proc_id[cpu] == phys_proc_id[i] &&
-			    cpu_core_id[cpu] == cpu_core_id[i]) {
+			if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+			    c[cpu].cpu_core_id == c[i].cpu_core_id) {
 				cpu_set(i, cpu_sibling_map[cpu]);
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
@@ -501,7 +497,7 @@
 			cpu_set(i, c[cpu].llc_shared_map);
 			cpu_set(cpu, c[i].llc_shared_map);
 		}
-		if (phys_proc_id[cpu] == phys_proc_id[i]) {
+		if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
 			/*
@@ -1056,6 +1052,7 @@
 	struct warm_boot_cpu_info info;
 	struct work_struct task;
 	int	apicid, ret;
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
 	apicid = x86_cpu_to_apicid[cpu];
 	if (apicid == BAD_APICID) {
@@ -1063,6 +1060,18 @@
 		goto exit;
 	}
 
+	/*
+	 * This CPU was not initialized at boot time, so allocate its GDT
+	 * table here; cpu_init() will initialize it.
+	 */
+	if (!cpu_gdt_descr->address)
+		cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+	if (!cpu_gdt_descr->address) {
+		printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+		ret = -ENOMEM;
+		goto exit;
+	}
+
 	info.complete = &done;
 	info.apicid = apicid;
 	info.cpu = cpu;
@@ -1340,8 +1349,8 @@
 		cpu_clear(cpu, cpu_sibling_map[sibling]);
 	cpus_clear(cpu_sibling_map[cpu]);
 	cpus_clear(cpu_core_map[cpu]);
-	phys_proc_id[cpu] = BAD_APICID;
-	cpu_core_id[cpu] = BAD_APICID;
+	c[cpu].phys_proc_id = 0;
+	c[cpu].cpu_core_id = 0;
 	cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 0bada18..c60419de 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -2,6 +2,8 @@
  * linux/arch/i386/kernel/sysenter.c
  *
  * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
  *
  * This file contains the needed initializations to support sysenter.
  */
@@ -13,12 +15,31 @@
 #include <linux/gfp.h>
 #include <linux/string.h>
 #include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
 #include <asm/pgtable.h>
 #include <asm/unistd.h>
 
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = 1;
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+
+static int __init vdso_setup(char *s)
+{
+	vdso_enabled = simple_strtoul(s, NULL, 0);
+
+	return 1;
+}
+
+__setup("vdso=", vdso_setup);
+
 extern asmlinkage void sysenter_entry(void);
 
 void enable_sep_cpu(void)
@@ -45,23 +66,122 @@
  */
 extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static void *syscall_page;
 
 int __init sysenter_setup(void)
 {
-	void *page = (void *)get_zeroed_page(GFP_ATOMIC);
+	syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
 
-	__set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+#ifdef CONFIG_COMPAT_VDSO
+	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
+	printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+#else
+	/*
+	 * In the non-compat case the ELF coredumping code needs the fixmap:
+	 */
+	__set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO);
+#endif
 
 	if (!boot_cpu_has(X86_FEATURE_SEP)) {
-		memcpy(page,
+		memcpy(syscall_page,
 		       &vsyscall_int80_start,
 		       &vsyscall_int80_end - &vsyscall_int80_start);
 		return 0;
 	}
 
-	memcpy(page,
+	memcpy(syscall_page,
 	       &vsyscall_sysenter_start,
 	       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
 
 	return 0;
 }
+
+static struct page *syscall_nopage(struct vm_area_struct *vma,
+				unsigned long adr, int *type)
+{
+	struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
+	get_page(p);
+	return p;
+}
+
+/* Prevent VMA merging */
+static void syscall_vma_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct syscall_vm_ops = {
+	.close = syscall_vma_close,
+	.nopage = syscall_nopage,
+};
+
+/* Defined in vsyscall-sysenter.S */
+extern void SYSENTER_RETURN;
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr;
+	int ret;
+
+	down_write(&mm->mmap_sem);
+	addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+	if (IS_ERR_VALUE(addr)) {
+		ret = addr;
+		goto up_fail;
+	}
+
+	vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		ret = -ENOMEM;
+		goto up_fail;
+	}
+
+	vma->vm_start = addr;
+	vma->vm_end = addr + PAGE_SIZE;
+	/* MAYWRITE to allow gdb to COW and set breakpoints */
+	vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+	vma->vm_flags |= mm->def_flags;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+	vma->vm_ops = &syscall_vm_ops;
+	vma->vm_mm = mm;
+
+	ret = insert_vm_struct(mm, vma);
+	if (ret)
+		goto free_vma;
+
+	current->mm->context.vdso = (void *)addr;
+	current_thread_info()->sysenter_return =
+				    (void *)VDSO_SYM(&SYSENTER_RETURN);
+	mm->total_vm++;
+up_fail:
+	up_write(&mm->mmap_sem);
+	return ret;
+
+free_vma:
+	kmem_cache_free(vm_area_cachep, vma);
+	return ret;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+		return "[vdso]";
+	return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+	return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+	return 0;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+	return 0;
+}
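
Because arch_setup_additional_pages() installs the vsyscall page as an ordinary VMA and arch_vma_name() reports it as "[vdso]", the mapping becomes visible in /proc/<pid>/maps.  A quick userspace check, sketched here and assuming the vdso is mapped (vdso_enabled defaults to 1 above), could be:

	/* print the [vdso] line of our own address space */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[512];
		FILE *maps = fopen("/proc/self/maps", "r");

		if (!maps)
			return 1;
		while (fgets(line, sizeof(line), maps))
			if (strstr(line, "[vdso]"))
				fputs(line, stdout);
		fclose(maps);
		return 0;
	}
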
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 2963552..e2e281d 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -32,15 +32,8 @@
 
 static struct i386_cpu cpu_devices[NR_CPUS];
 
-int arch_register_cpu(int num){
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	if (node_online(node))
-		parent = &node_devices[node].node;
-#endif /* CONFIG_NUMA */
-
+int arch_register_cpu(int num)
+{
 	/*
 	 * CPU0 cannot be offlined due to several
 	 * restrictions and assumptions in kernel. This basically
@@ -50,21 +43,13 @@
 	if (!num)
 		cpu_devices[num].cpu.no_control = 1;
 
-	return register_cpu(&cpu_devices[num].cpu, num, parent);
+	return register_cpu(&cpu_devices[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 void arch_unregister_cpu(int num) {
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	if (node_online(node))
-		parent = &node_devices[node].node;
-#endif /* CONFIG_NUMA */
-
-	return unregister_cpu(&cpu_devices[num].cpu, parent);
+	return unregister_cpu(&cpu_devices[num].cpu);
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
@@ -74,16 +59,13 @@
 
 #ifdef CONFIG_NUMA
 #include <linux/mmzone.h>
-#include <asm/node.h>
-
-struct i386_node node_devices[MAX_NUMNODES];
 
 static int __init topology_init(void)
 {
 	int i;
 
 	for_each_online_node(i)
-		arch_register_node(i);
+		register_one_node(i);
 
 	for_each_present_cpu(i)
 		arch_register_cpu(i);
diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S
index 3b62baa..1a36d26 100644
--- a/arch/i386/kernel/vsyscall-sysenter.S
+++ b/arch/i386/kernel/vsyscall-sysenter.S
@@ -42,10 +42,10 @@
 	/* 7: align return point with nop's to make disassembly easier */
 	.space 7,0x90
 
-	/* 14: System call restart point is here! (SYSENTER_RETURN - 2) */
+	/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
 	jmp .Lenter_kernel
 	/* 16: System call normal return point is here! */
-	.globl SYSENTER_RETURN	/* Symbol used by entry.S.  */
+	.globl SYSENTER_RETURN	/* Symbol used by sysenter.c  */
 SYSENTER_RETURN:
 	pop %ebp
 .Lpop_ebp:
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index 98699ca..e26975f 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,7 +7,7 @@
 
 SECTIONS
 {
-  . = VSYSCALL_BASE + SIZEOF_HEADERS;
+  . = VDSO_PRELINK + SIZEOF_HEADERS;
 
   .hash           : { *(.hash) }		:text
   .dynsym         : { *(.dynsym) }
@@ -20,7 +20,7 @@
      For the layouts to match, we need to skip more than enough
      space for the dynamic symbol table et al.  If this amount
      is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VSYSCALL_BASE + 0x400;
+  . = VDSO_PRELINK + 0x400;
 
   .text           : { *(.text) }		:text =0x90909090
   .note		  : { *(.note.*) }		:text :note
diff --git a/arch/i386/mach-voyager/setup.c b/arch/i386/mach-voyager/setup.c
index 0e22505..defc6eb 100644
--- a/arch/i386/mach-voyager/setup.c
+++ b/arch/i386/mach-voyager/setup.c
@@ -5,10 +5,10 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
-#include <asm/acpi.h>
 #include <asm/arch_hooks.h>
 #include <asm/voyager.h>
 #include <asm/e820.h>
+#include <asm/io.h>
 #include <asm/setup.h>
 
 void __init pre_intr_init_hook(void)
@@ -27,8 +27,7 @@
 	smp_intr_init();
 #endif
 
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
+	setup_irq(2, &irq2);
 }
 
 void __init pre_setup_arch_hook(void)
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index bf19513..f84b16e 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -654,7 +655,7 @@
  */
 #ifdef CONFIG_MEMORY_HOTPLUG
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata = &contig_page_data;
 	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
@@ -753,7 +754,7 @@
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 0887b34..353a836 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -229,8 +229,8 @@
 	if (PageHighMem(page))
 		return;
 	if (!enable)
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 numpages * PAGE_SIZE);
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
 
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 1831874..a56df7b 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -374,6 +374,10 @@
 	def_bool y
 	depends on NEED_MULTIPLE_NODES
 
+config HAVE_ARCH_NODEDATA_EXTENSION
+	def_bool y
+	depends on NUMA
+
 config IA32_SUPPORT
 	bool "Support for Linux/x86 binaries"
 	help
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
index 859fb37..303a9af 100644
--- a/arch/ia64/kernel/palinfo.c
+++ b/arch/ia64/kernel/palinfo.c
@@ -959,7 +959,7 @@
 	}
 }
 
-static int palinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit palinfo_cpu_callback(struct notifier_block *nfb,
 								unsigned long action,
 								void *hcpu)
 {
@@ -978,7 +978,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block palinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata palinfo_cpu_notifier =
 {
 	.notifier_call = palinfo_cpu_callback,
 	.priority = 0,
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index 663a186..9065f0f 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -572,7 +572,7 @@
 };
 
 #ifdef	CONFIG_HOTPLUG_CPU
-static int
+static int __devinit
 salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
 	unsigned int i, cpu = (unsigned long)hcpu;
@@ -673,9 +673,7 @@
 	salinfo_timer.function = &salinfo_timeout;
 	add_timer(&salinfo_timer);
 
-#ifdef	CONFIG_HOTPLUG_CPU
-	register_cpu_notifier(&salinfo_cpu_notifier);
-#endif
+	register_hotcpu_notifier(&salinfo_cpu_notifier);
 
 	return 0;
 }
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 879edb5..5511d9c 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -26,19 +26,10 @@
 #include <asm/numa.h>
 #include <asm/cpu.h>
 
-#ifdef CONFIG_NUMA
-static struct node *sysfs_nodes;
-#endif
 static struct ia64_cpu *sysfs_cpus;
 
 int arch_register_cpu(int num)
 {
-	struct node *parent = NULL;
-	
-#ifdef CONFIG_NUMA
-	parent = &sysfs_nodes[cpu_to_node(num)];
-#endif /* CONFIG_NUMA */
-
 #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
 	/*
 	 * If CPEI cannot be re-targetted, and this is
@@ -48,21 +39,14 @@
 		sysfs_cpus[num].cpu.no_control = 1;
 #endif
 
-	return register_cpu(&sysfs_cpus[num].cpu, num, parent);
+	return register_cpu(&sysfs_cpus[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 void arch_unregister_cpu(int num)
 {
-	struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-	int node = cpu_to_node(num);
-	parent = &sysfs_nodes[node];
-#endif /* CONFIG_NUMA */
-
-	return unregister_cpu(&sysfs_cpus[num].cpu, parent);
+	return unregister_cpu(&sysfs_cpus[num].cpu);
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
@@ -74,17 +58,11 @@
 	int i, err = 0;
 
 #ifdef CONFIG_NUMA
-	sysfs_nodes = kzalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL);
-	if (!sysfs_nodes) {
-		err = -ENOMEM;
-		goto out;
-	}
-
 	/*
 	 * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
 	 */
 	for_each_online_node(i) {
-		if ((err = register_node(&sysfs_nodes[i], i, 0)))
+		if ((err = register_one_node(i)))
 			goto out;
 	}
 #endif
@@ -426,7 +404,7 @@
  * When a cpu is hot-plugged, do a check and initiate
  * cache kobject if necessary
  */
-static int cache_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -444,7 +422,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cache_cpu_notifier =
+static struct notifier_block __cpuinitdata cache_cpu_notifier =
 {
 	.notifier_call = cache_cpu_callback
 };
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index b6bcc9f..525b082 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -33,7 +33,6 @@
  */
 struct early_node_data {
 	struct ia64_node_data *node_data;
-	pg_data_t *pgdat;
 	unsigned long pernode_addr;
 	unsigned long pernode_size;
 	struct bootmem_data bootmem_data;
@@ -46,6 +45,8 @@
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
 static nodemask_t memory_less_mask __initdata;
 
+static pg_data_t *pgdat_list[MAX_NUMNODES];
+
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
  * start at addresses that are strided by node number.
@@ -99,7 +100,7 @@
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
  * called yet.  Note that node 0 will also count all non-existent cpus.
  */
-static int __init early_nr_cpus_node(int node)
+static int __meminit early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
@@ -114,7 +115,7 @@
  * compute_pernodesize - compute size of pernode data
  * @node: the node id.
  */
-static unsigned long __init compute_pernodesize(int node)
+static unsigned long __meminit compute_pernodesize(int node)
 {
 	unsigned long pernodesize = 0, cpus;
 
@@ -175,13 +176,13 @@
 	pernode += PERCPU_PAGE_SIZE * cpus;
 	pernode += node * L1_CACHE_BYTES;
 
-	mem_data[node].pgdat = __va(pernode);
+	pgdat_list[node] = __va(pernode);
 	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
 
 	mem_data[node].node_data = __va(pernode);
 	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
 
-	mem_data[node].pgdat->bdata = bdp;
+	pgdat_list[node]->bdata = bdp;
 	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
 
 	cpu_data = per_cpu_node_setup(cpu_data, node);
@@ -268,7 +269,7 @@
 static int __init free_node_bootmem(unsigned long start, unsigned long len,
 				    int node)
 {
-	free_bootmem_node(mem_data[node].pgdat, start, len);
+	free_bootmem_node(pgdat_list[node], start, len);
 
 	return 0;
 }
@@ -287,7 +288,7 @@
 	int node;
 
 	for_each_online_node(node) {
-		pg_data_t *pdp = mem_data[node].pgdat;
+		pg_data_t *pdp = pgdat_list[node];
 
 		if (node_isset(node, memory_less_mask))
 			continue;
@@ -307,6 +308,17 @@
 	}
 }
 
+static void __meminit scatter_node_data(void)
+{
+	pg_data_t **dst;
+	int node;
+
+	for_each_online_node(node) {
+		dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
+		memcpy(dst, pgdat_list, sizeof(pgdat_list));
+	}
+}
+
 /**
  * initialize_pernode_data - fixup per-cpu & per-node pointers
  *
@@ -317,17 +329,10 @@
  */
 static void __init initialize_pernode_data(void)
 {
-	pg_data_t *pgdat_list[MAX_NUMNODES];
 	int cpu, node;
 
-	for_each_online_node(node)
-		pgdat_list[node] = mem_data[node].pgdat;
+	scatter_node_data();
 
-	/* Copy the pg_data_t list to each node and init the node field */
-	for_each_online_node(node) {
-		memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
-		       sizeof(pgdat_list));
-	}
 #ifdef CONFIG_SMP
 	/* Set the node_data pointer for each per-cpu struct */
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
@@ -372,7 +377,7 @@
 	if (bestnode == -1)
 		bestnode = anynode;
 
-	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, pernodesize,
+	ptr = __alloc_bootmem_node(pgdat_list[bestnode], pernodesize,
 		PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 
 	return ptr;
@@ -476,7 +481,7 @@
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
 
-		init_bootmem_node(mem_data[node].pgdat,
+		init_bootmem_node(pgdat_list[node],
 				  map>>PAGE_SHIFT,
 				  bdp->node_boot_start>>PAGE_SHIFT,
 				  bdp->node_low_pfn);
@@ -786,3 +791,21 @@
 
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
+
+pg_data_t *arch_alloc_nodedata(int nid)
+{
+	unsigned long size = compute_pernodesize(nid);
+
+	return kzalloc(size, GFP_KERNEL);
+}
+
+void arch_free_nodedata(pg_data_t *pgdat)
+{
+	kfree(pgdat);
+}
+
+void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
+{
+	pgdat_list[update_node] = update_pgdat;
+	scatter_node_data();
+}
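
These three hooks back the HAVE_ARCH_NODEDATA_EXTENSION option added in the ia64 Kconfig hunk earlier: generic memory hot-add code can ask the architecture for a node's pg_data_t, and arch_refresh_nodedata() republishes the pointer to every online node through scatter_node_data().  A rough sketch of a caller (example_node_hotadd() is hypothetical; only the three arch_* hooks come from this diff, and the usual mm headers are assumed):

	static int example_node_hotadd(int nid)
	{
		pg_data_t *pgdat = arch_alloc_nodedata(nid);

		if (!pgdat)
			return -ENOMEM;
		/* generic code would initialize the fresh pg_data_t here */
		arch_refresh_nodedata(nid, pgdat);
		return 0;
	}
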
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 11f0800..38306e9 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -652,7 +652,7 @@
 	num_physpages++;
 }
 
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat;
 	struct zone *zone;
@@ -660,7 +660,7 @@
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	pgdat = NODE_DATA(0);
+	pgdat = NODE_DATA(nid);
 
 	zone = pgdat->node_zones + ZONE_NORMAL;
 	ret = __add_pages(zone, start_pfn, nr_pages);
@@ -671,7 +671,6 @@
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index dc8e2b6..677c6c0 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -27,7 +27,7 @@
 int sn_force_interrupt_flag = 1;
 extern int sn_ioif_inited;
 struct list_head **sn_irq_lh;
-static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */
+static DEFINE_SPINLOCK(sn_irq_info_lock); /* non-IRQ lock */
 
 u64 sn_intr_alloc(nasid_t local_nasid, int local_widget,
 				     struct sn_irq_info *sn_irq_info,
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index 3cd3c29..1ff483c 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -275,7 +275,7 @@
 	int i;
 
 	for_each_present_cpu(i)
-		register_cpu(&cpu_devices[i], i, NULL);
+		register_cpu(&cpu_devices[i], i);
 
 	return 0;
 }
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 298f82f..9096a5e 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -446,7 +446,7 @@
 	int ret;
 
 	for_each_present_cpu(cpu) {
-		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL);
+		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu);
 		if (ret)
 			printk(KERN_WARNING "topology_init: register_cpu %d "
 			       "failed (%d)\n", cpu, ret);
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index 2e8e52c..70cf09a 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -367,7 +367,7 @@
 	dvpe();
 	dmt();
 
-	freeIPIq.lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&freeIPIq.lock);
 
 	/*
 	 * We probably don't have as many VPEs as we do SMP "CPUs",
@@ -375,7 +375,7 @@
 	 */
 	for (i=0; i<NR_CPUS; i++) {
 		IPIQ[i].head = IPIQ[i].tail = NULL;
-		IPIQ[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&IPIQ[i].lock);
 		IPIQ[i].depth = 0;
 		ipi_timer_latch[i] = 0;
 	}
diff --git a/arch/parisc/kernel/topology.c b/arch/parisc/kernel/topology.c
index 3ba0400..068b20d 100644
--- a/arch/parisc/kernel/topology.c
+++ b/arch/parisc/kernel/topology.c
@@ -26,11 +26,10 @@
 
 static int __init topology_init(void)
 {
-	struct node *parent = NULL;
 	int num;
 
 	for_each_present_cpu(num) {
-		register_cpu(&cpu_devices[num], num, parent);
+		register_cpu(&cpu_devices[num], num);
 	}
 	return 0;
 }
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index e5a4481..0932a62 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -215,7 +215,7 @@
 
 	/* register CPU devices */
 	for_each_possible_cpu(i)
-		register_cpu(&cpu_devices[i], i, NULL);
+		register_cpu(&cpu_devices[i], i);
 
 	/* call platform init */
 	if (ppc_md.init != NULL) {
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 5bc2585..4662b58 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -279,7 +279,7 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int sysfs_cpu_notify(struct notifier_block *self,
+static int __devinit sysfs_cpu_notify(struct notifier_block *self,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
@@ -297,30 +297,19 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block sysfs_cpu_nb = {
+static struct notifier_block __devinitdata sysfs_cpu_nb = {
 	.notifier_call	= sysfs_cpu_notify,
 };
 
 /* NUMA stuff */
 
 #ifdef CONFIG_NUMA
-static struct node node_devices[MAX_NUMNODES];
-
 static void register_nodes(void)
 {
 	int i;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (node_online(i)) {
-			int p_node = parent_node(i);
-			struct node *parent = NULL;
-
-			if (p_node != i)
-				parent = &node_devices[p_node];
-
-			register_node(&node_devices[i], i, parent);
-		}
-	}
+	for (i = 0; i < MAX_NUMNODES; i++)
+		register_one_node(i);
 }
 
 int sysfs_add_device_to_node(struct sys_device *dev, int nid)
@@ -359,23 +348,13 @@
 static int __init topology_init(void)
 {
 	int cpu;
-	struct node *parent = NULL;
 
 	register_nodes();
-
 	register_cpu_notifier(&sysfs_cpu_nb);
 
 	for_each_possible_cpu(cpu) {
 		struct cpu *c = &per_cpu(cpu_devices, cpu);
 
-#ifdef CONFIG_NUMA
-		/* The node to which a cpu belongs can't be known
-		 * until the cpu is made present.
-		 */
-		parent = NULL;
-		if (cpu_present(cpu))
-			parent = &node_devices[cpu_to_node(cpu)];
-#endif
 		/*
 		 * For now, we just see if the system supports making
 		 * the RTAS calls for CPU hotplug.  But, there may be a
@@ -387,7 +366,7 @@
 			c->no_control = 1;
 
 		if (cpu_online(cpu) || (c->no_control == 0)) {
-			register_cpu(c, cpu, parent);
+			register_cpu(c, cpu);
 
 			sysdev_create_file(&c->sysdev, &attr_physical_id);
 		}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 9e30f96..d454caa 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -41,6 +41,7 @@
 #include <linux/idr.h>
 #include <linux/nodemask.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -90,7 +91,7 @@
 
 	addr = (unsigned long)__init_begin;
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		free_page(addr);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 69f3b9a..089d939 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -114,15 +114,20 @@
 	num_physpages++;
 }
 
-int __devinit add_memory(u64 start, u64 size)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return hot_add_scn_to_nid(start);
+}
+#endif
+
+int __devinit arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
-	int nid;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	nid = hot_add_scn_to_nid(start);
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index aa98cb3..fbe2393 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -334,7 +334,7 @@
 	return nid;
 }
 
-static int cpu_numa_callback(struct notifier_block *nfb,
+static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 			     unsigned long action,
 			     void *hcpu)
 {
@@ -609,14 +609,15 @@
 	return (void *)ret;
 }
 
+static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+	.notifier_call = cpu_numa_callback,
+	.priority = 1 /* Must run before sched domains notifier. */
+};
+
 void __init do_init_bootmem(void)
 {
 	int nid;
 	unsigned int i;
-	static struct notifier_block ppc64_numa_nb = {
-		.notifier_call = cpu_numa_callback,
-		.priority = 1 /* Must run before sched domains notifier. */
-	};
 
 	min_low_pfn = 0;
 	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 3068b42..a656d81 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -2203,7 +2203,7 @@
 
 	memset(lscsa, 0, sizeof(struct spu_lscsa));
 	csa->lscsa = lscsa;
-	csa->register_lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&csa->register_lock);
 
 	/* Set LS pages reserved to allow for user-space mapping. */
 	for (p = lscsa->ls; p < lscsa->ls + LS_SIZE; p += PAGE_SIZE)
diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c
index 047f954..93e7505 100644
--- a/arch/powerpc/platforms/powermac/pfunc_core.c
+++ b/arch/powerpc/platforms/powermac/pfunc_core.c
@@ -546,7 +546,7 @@
 };
 
 static LIST_HEAD(pmf_devices);
-static spinlock_t pmf_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pmf_lock);
 static DEFINE_MUTEX(pmf_irq_mutex);
 
 static void pmf_release_device(struct kref *kref)
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index 8f2d129..45ccc68 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -35,7 +35,7 @@
  */
 
 /* EEH event workqueue setup. */
-static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
 LIST_HEAD(eeh_eventlist);
 static void eeh_thread_launcher(void *);
 DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
diff --git a/arch/powerpc/sysdev/mmio_nvram.c b/arch/powerpc/sysdev/mmio_nvram.c
index 74e0d31..615350d 100644
--- a/arch/powerpc/sysdev/mmio_nvram.c
+++ b/arch/powerpc/sysdev/mmio_nvram.c
@@ -32,7 +32,7 @@
 
 static void __iomem *mmio_nvram_start;
 static long mmio_nvram_len;
-static spinlock_t mmio_nvram_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(mmio_nvram_lock);
 
 static ssize_t mmio_nvram_read(char *buf, size_t count, loff_t *index)
 {
diff --git a/arch/ppc/kernel/setup.c b/arch/ppc/kernel/setup.c
index 1f79e84..4b4607d 100644
--- a/arch/ppc/kernel/setup.c
+++ b/arch/ppc/kernel/setup.c
@@ -475,7 +475,7 @@
 
 	/* register CPU devices */
 	for_each_possible_cpu(i)
-		register_cpu(&cpu_devices[i], i, NULL);
+		register_cpu(&cpu_devices[i], i);
 
 	/* call platform init */
 	if (ppc_md.init != NULL) {
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index 9a22434..54d35c1 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -652,7 +652,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block appldata_nb = {
+static struct notifier_block __devinitdata appldata_nb = {
 	.notifier_call = appldata_cpu_notify,
 };
 
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 343120c..8e03219 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -869,7 +869,7 @@
 	int ret;
 
 	for_each_possible_cpu(cpu) {
-		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL);
+		ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu);
 		if (ret)
 			printk(KERN_WARNING "topology_init: register_cpu %d "
 			       "failed (%d)\n", cpu, ret);
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index bb229ef..9af22116 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -402,7 +402,7 @@
 	int cpu_id;
 
 	for_each_possible_cpu(cpu_id)
-		register_cpu(&cpu[cpu_id], cpu_id, NULL);
+		register_cpu(&cpu[cpu_id], cpu_id);
 
 	return 0;
 }
diff --git a/arch/sh64/kernel/setup.c b/arch/sh64/kernel/setup.c
index d2711c9..da98d8d 100644
--- a/arch/sh64/kernel/setup.c
+++ b/arch/sh64/kernel/setup.c
@@ -309,7 +309,7 @@
 
 static int __init topology_init(void)
 {
-	return register_cpu(cpu, 0, NULL);
+	return register_cpu(cpu, 0);
 }
 
 subsys_initcall(topology_init);
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index a6a7d81..116d963 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -537,7 +537,7 @@
 	for_each_possible_cpu(i) {
 		struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
 		if (p) {
-			register_cpu(p, i, NULL);
+			register_cpu(p, i);
 			err = 0;
 		}
 	}
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 5c2bcf3..cb75a27 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -18,6 +18,7 @@
 #include <linux/initrd.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/kprobes.h>
@@ -1520,7 +1521,7 @@
 		page = (addr +
 			((unsigned long) __va(kern_base)) -
 			((unsigned long) KERNBASE));
-		memset((void *)addr, 0xcc, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		p = virt_to_page(page);
 
 		ClearPageReserved(p);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 7290e72..22cac44 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -588,7 +588,7 @@
  */		
 	.macro apicinterrupt num,func
 	INTR_FRAME
-	pushq $\num-256
+	pushq $~(\num)
 	CFI_ADJUST_CFA_OFFSET 8
 	interrupt \func
 	jmp ret_from_intr
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 59518d4..3be0a7e 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -115,8 +115,8 @@
  */
 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 {	
-	/* high bits used in ret_from_ code  */
-	unsigned irq = regs->orig_rax & 0xff;
+	/* high bit used in ret_from_ code  */
+	unsigned irq = ~regs->orig_rax;
 
 	exit_idle();
 	irq_enter();
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index acd5816..8884567 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -629,7 +629,7 @@
 #endif
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
+static __cpuinit int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -647,7 +647,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block mce_cpu_notifier = {
+static struct notifier_block __cpuinitdata mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
 
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index acee4bc..5a1c0a3 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -135,10 +135,10 @@
 
 	cpu = smp_processor_id();
 	/*
-	 * orig_rax contains the interrupt vector - 256.
+	 * orig_rax contains the negated interrupt vector.
 	 * Use that to determine where the sender put the data.
 	 */
-	sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+	sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
 	f = &per_cpu(flush_state, sender);
 
 	if (!cpu_isset(cpu, f->flush_cpumask))
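
Both stub macros now push the bitwise complement of the vector ($~(vector) on i386, $~(\num) on x86-64) instead of vector-256, and the consumers undo it with another complement (~regs->orig_eax in do_IRQ, ~regs->orig_rax in the flush-IPI handler above).  A standalone sketch of why this round-trips, assuming nothing beyond standard C:

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int vector;

		for (vector = 0; vector < 256; vector++) {
			int orig_eax = ~vector;		/* what the stub pushes */
			unsigned int irq = ~orig_eax;	/* what the handler recovers */

			assert(irq == vector);
			assert(orig_eax < 0);	/* high bit stays set, per the do_IRQ comment */
		}
		printf("all 256 vectors survive the round trip\n");
		return 0;
	}
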
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 4e97551..540c0cc 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -455,10 +455,12 @@
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 	/*
 	 * For perf, we return last level cache shared map.
-	 * TBD: when power saving sched policy is added, we will return
-	 *      cpu_core_map when power saving policy is enabled
+	 * And for power savings, we return cpu_core_map
 	 */
-	return c->llc_shared_map;
+	if (sched_mc_power_savings || sched_smt_power_savings)
+		return cpu_core_map[cpu];
+	else
+		return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 02add1d..95bd232 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/memory_hotplug.h>
@@ -506,8 +507,6 @@
 /*
  * Memory hotplug specific functions
  */
-#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
-
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
@@ -517,7 +516,52 @@
 	num_physpages++;
 }
 
-#ifndef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * XXX: memory_add_physaddr_to_nid() finds the node id for a physical address
+ *	added through the sysfs probe interface.  When ACPI reports a hot-add
+ *	event the node id can be found by searching the DSDT, but the probe
+ *	interface carries no node id, so return node 0 for now.
+ */
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int ret;
+
+	ret = __add_pages(zone, start_pfn, nr_pages);
+	if (ret)
+		goto error;
+
+	init_memory_mapping(start, (start + size -1));
+
+	return ret;
+error:
+	printk("%s: Problem encountered in __add_pages!\n", __func__);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(arch_add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#else /* CONFIG_MEMORY_HOTPLUG */
 /*
  * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
  * just online the pages.
@@ -543,40 +587,7 @@
 	}
 	return err;
 }
-#endif
-
-/*
- * Memory is added always to NORMAL zone. This means you will never get
- * additional DMA/DMA32 memory.
- */
-int add_memory(u64 start, u64 size)
-{
-	struct pglist_data *pgdat = NODE_DATA(0);
-	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-	int ret;
-
-	ret = __add_pages(zone, start_pfn, nr_pages);
-	if (ret)
-		goto error;
-
-	init_memory_mapping(start, (start + size -1));
-
-	return ret;
-error:
-	printk("%s: Problem encountered in __add_pages!\n", __func__);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(add_memory);
-
-int remove_memory(u64 start, u64 size)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(remove_memory);
-
-#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
@@ -650,7 +661,8 @@
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
+		memset((void *)(addr & ~(PAGE_SIZE-1)),
+			POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -658,7 +670,8 @@
 
 void free_initmem(void)
 {
-	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
+	memset(__initdata_begin, POISON_FREE_INITDATA,
+		__initdata_end - __initdata_begin);
 	free_init_pages("unused kernel memory",
 			(unsigned long)(&__init_begin),
 			(unsigned long)(&__init_end));
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 937d81f..fe14909 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -29,7 +29,7 @@
 
 extern volatile unsigned long wall_jiffies;
 
-spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
 
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 225d64d..27e4090 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -461,7 +461,7 @@
 	}
 }
 
-spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(die_lock);
 
 void die(const char * str, struct pt_regs * regs, long err)
 {
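
The spinlock changes scattered through the hunks above (sn/irq.c, smtc.c, spufs switch.c, pfunc_core.c, eeh_event.c, mmio_nvram.c and the two xtensa files) all drop the old SPIN_LOCK_UNLOCKED initializer in favour of DEFINE_SPINLOCK() or spin_lock_init().  The two replacement forms, sketched generically here rather than copied from any of those files (example_lock and struct example are made-up names):

	#include <linux/spinlock.h>

	/* file-scope lock: use the static initializer macro */
	static DEFINE_SPINLOCK(example_lock);

	/* lock embedded in a runtime-allocated object: initialize it explicitly */
	struct example {
		spinlock_t lock;
	};

	static void example_init(struct example *e)
	{
		spin_lock_init(&e->lock);
	}
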
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c04422a..eee03a3 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3403,7 +3403,7 @@
 }
 
 
-static struct notifier_block blk_cpu_notifier = {
+static struct notifier_block __devinitdata blk_cpu_notifier = {
 	.notifier_call	= blk_cpu_notify,
 };
 
@@ -3541,9 +3541,7 @@
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
-#ifdef CONFIG_HOTPLUG_CPU
-	register_cpu_notifier(&blk_cpu_notifier);
-#endif
+	register_hotcpu_notifier(&blk_cpu_notifier);
 
 	blk_max_low_pfn = max_low_pfn;
 	blk_max_pfn = max_pfn;
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 94b8d82..610d2cc 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -328,7 +328,7 @@
 config ACPI_HOTPLUG_MEMORY
 	tristate "Memory Hotplug"
 	depends on ACPI
-	depends on MEMORY_HOTPLUG || X86_64
+	depends on MEMORY_HOTPLUG
 	default n
 	help
 	  This driver adds supports for ACPI Memory Hotplug.  This driver
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index e0a95ba..1012284 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -57,6 +57,7 @@
 
 static int acpi_memory_device_add(struct acpi_device *device);
 static int acpi_memory_device_remove(struct acpi_device *device, int type);
+static int acpi_memory_device_start(struct acpi_device *device);
 
 static struct acpi_driver acpi_memory_device_driver = {
 	.name = ACPI_MEMORY_DEVICE_DRIVER_NAME,
@@ -65,48 +66,79 @@
 	.ops = {
 		.add = acpi_memory_device_add,
 		.remove = acpi_memory_device_remove,
+		.start = acpi_memory_device_start,
 		},
 };
 
+struct acpi_memory_info {
+	struct list_head list;
+	u64 start_addr;		/* Memory Range start physical addr */
+	u64 length;		/* Memory Range length */
+	unsigned short caching;	/* memory cache attribute */
+	unsigned short write_protect;	/* memory read/write attribute */
+	unsigned int enabled:1;
+};
+
 struct acpi_memory_device {
 	acpi_handle handle;
 	unsigned int state;	/* State of the memory device */
-	unsigned short caching;	/* memory cache attribute */
-	unsigned short write_protect;	/* memory read/write attribute */
-	u64 start_addr;		/* Memory Range start physical addr */
-	u64 length;		/* Memory Range length */
+	struct list_head res_list;
 };
 
+static acpi_status
+acpi_memory_get_resource(struct acpi_resource *resource, void *context)
+{
+	struct acpi_memory_device *mem_device = context;
+	struct acpi_resource_address64 address64;
+	struct acpi_memory_info *info, *new;
+	acpi_status status;
+
+	status = acpi_resource_to_address64(resource, &address64);
+	if (ACPI_FAILURE(status) ||
+	    (address64.resource_type != ACPI_MEMORY_RANGE))
+		return AE_OK;
+
+	list_for_each_entry(info, &mem_device->res_list, list) {
+		/* Can we combine the resource range information? */
+		if ((info->caching == address64.info.mem.caching) &&
+		    (info->write_protect == address64.info.mem.write_protect) &&
+		    (info->start_addr + info->length == address64.minimum)) {
+			info->length += address64.address_length;
+			return AE_OK;
+		}
+	}
+
+	new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
+	if (!new)
+		return AE_ERROR;
+
+	INIT_LIST_HEAD(&new->list);
+	new->caching = address64.info.mem.caching;
+	new->write_protect = address64.info.mem.write_protect;
+	new->start_addr = address64.minimum;
+	new->length = address64.address_length;
+	list_add_tail(&new->list, &mem_device->res_list);
+
+	return AE_OK;
+}
+
 static int
 acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
 {
 	acpi_status status;
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	struct acpi_resource *resource = NULL;
-	struct acpi_resource_address64 address64;
+	struct acpi_memory_info *info, *n;
 
 	ACPI_FUNCTION_TRACE("acpi_memory_get_device_resources");
 
-	/* Get the range from the _CRS */
-	status = acpi_get_current_resources(mem_device->handle, &buffer);
-	if (ACPI_FAILURE(status))
-		return_VALUE(-EINVAL);
-
-	resource = (struct acpi_resource *)buffer.pointer;
-	status = acpi_resource_to_address64(resource, &address64);
-	if (ACPI_SUCCESS(status)) {
-		if (address64.resource_type == ACPI_MEMORY_RANGE) {
-			/* Populate the structure */
-			mem_device->caching = address64.info.mem.caching;
-			mem_device->write_protect =
-			    address64.info.mem.write_protect;
-			mem_device->start_addr = address64.minimum;
-			mem_device->length = address64.address_length;
-		}
+	status = acpi_walk_resources(mem_device->handle, METHOD_NAME__CRS,
+				     acpi_memory_get_resource, mem_device);
+	if (ACPI_FAILURE(status)) {
+		list_for_each_entry_safe(info, n, &mem_device->res_list, list)
+			kfree(info);
+		return -EINVAL;
 	}
 
-	acpi_os_free(buffer.pointer);
-	return_VALUE(0);
+	return 0;
 }
 
 static int
@@ -181,7 +213,9 @@
 
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
-	int result;
+	int result, num_enabled = 0;
+	struct acpi_memory_info *info;
+	int node;
 
 	ACPI_FUNCTION_TRACE("acpi_memory_enable_device");
 
@@ -194,15 +228,35 @@
 		return result;
 	}
 
+	node = acpi_get_node(mem_device->handle);
 	/*
 	 * Tell the VM there is more memory here...
 	 * Note: Assume that this function returns zero on success
+	 * We don't have a memory-hot-add rollback function yet
+	 * (i.e. a memory-hot-remove function).
 	 */
-	result = add_memory(mem_device->start_addr, mem_device->length);
-	if (result) {
+	list_for_each_entry(info, &mem_device->res_list, list) {
+		u64 start_pfn, end_pfn;
+
+		start_pfn = info->start_addr >> PAGE_SHIFT;
+		end_pfn = (info->start_addr + info->length - 1) >> PAGE_SHIFT;
+
+		if (pfn_valid(start_pfn) || pfn_valid(end_pfn)) {
+			/* already enabled. try next area */
+			num_enabled++;
+			continue;
+		}
+
+		result = add_memory(node, info->start_addr, info->length);
+		if (result)
+			continue;
+		info->enabled = 1;
+		num_enabled++;
+	}
+	if (!num_enabled) {
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
 		mem_device->state = MEMORY_INVALID_STATE;
-		return result;
+		return -EINVAL;
 	}
 
 	return result;
@@ -246,8 +300,7 @@
 static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
 {
 	int result;
-	u64 start = mem_device->start_addr;
-	u64 len = mem_device->length;
+	struct acpi_memory_info *info, *n;
 
 	ACPI_FUNCTION_TRACE("acpi_memory_disable_device");
 
@@ -255,10 +308,13 @@
 	 * Ask the VM to offline this memory range.
 	 * Note: Assume that this function returns zero on success
 	 */
-	result = remove_memory(start, len);
-	if (result) {
-		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
-		return_VALUE(result);
+	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
+		if (info->enabled) {
+			result = remove_memory(info->start_addr, info->length);
+			if (result)
+				return result;
+		}
+		kfree(info);
 	}
 
 	/* Power-off and eject the device */
@@ -356,6 +412,7 @@
 		return_VALUE(-ENOMEM);
 	memset(mem_device, 0, sizeof(struct acpi_memory_device));
 
+	INIT_LIST_HEAD(&mem_device->res_list);
 	mem_device->handle = device->handle;
 	sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
 	sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
@@ -391,6 +448,25 @@
 	return_VALUE(0);
 }
 
+static int acpi_memory_device_start (struct acpi_device *device)
+{
+	struct acpi_memory_device *mem_device;
+	int result = 0;
+
+	ACPI_FUNCTION_TRACE("acpi_memory_device_start");
+
+	mem_device = acpi_driver_data(device);
+
+	if (!acpi_memory_check_device(mem_device)) {
+		/* call add_memory func */
+		result = acpi_memory_enable_device(mem_device);
+		if (result)
+			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+				"Error in acpi_memory_enable_device\n"));
+	}
+	return_VALUE(result);
+}
+
 /*
  * Helper function to check for memory device
  */
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index e2c1a16..13d6d5bd 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -254,5 +254,18 @@
 	} while (ACPI_SUCCESS(status));
 	return -1;
 }
-
 EXPORT_SYMBOL(acpi_get_pxm);
+
+int acpi_get_node(acpi_handle *handle)
+{
+	int pxm, node = -1;
+
+	ACPI_FUNCTION_TRACE("acpi_get_node");
+
+	pxm = acpi_get_pxm(handle);
+	if (pxm >= 0)
+		node = acpi_map_pxm_to_node(pxm);
+
+	return_VALUE(node);
+}
+EXPORT_SYMBOL(acpi_get_node);
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index f2eeaf9..1bca86e 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/errno.h>
 #include <linux/atm.h>
 #include <linux/atmdev.h>
@@ -754,7 +755,7 @@
 			fs_kfree_skb (skb);
 
 			fs_dprintk (FS_DEBUG_ALLOC, "Free trans-d: %p\n", td); 
-			memset (td, 0x12, sizeof (struct FS_BPENTRY));
+			memset (td, ATM_POISON_FREE, sizeof(struct FS_BPENTRY));
 			kfree (td);
 			break;
 		default:
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index dd712b2..4bef76a 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -8,6 +8,7 @@
 #include <linux/cpu.h>
 #include <linux/topology.h>
 #include <linux/device.h>
+#include <linux/node.h>
 
 #include "base.h"
 
@@ -57,13 +58,12 @@
 {
 	sysdev_create_file(&cpu->sysdev, &attr_online);
 }
-void unregister_cpu(struct cpu *cpu, struct node *root)
+void unregister_cpu(struct cpu *cpu)
 {
 	int logical_cpu = cpu->sysdev.id;
 
-	if (root)
-		sysfs_remove_link(&root->sysdev.kobj,
-				  kobject_name(&cpu->sysdev.kobj));
+	unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu));
+
 	sysdev_remove_file(&cpu->sysdev, &attr_online);
 
 	sysdev_unregister(&cpu->sysdev);
@@ -109,23 +109,21 @@
  *
  * Initialize and register the CPU device.
  */
-int __devinit register_cpu(struct cpu *cpu, int num, struct node *root)
+int __devinit register_cpu(struct cpu *cpu, int num)
 {
 	int error;
-
 	cpu->node_id = cpu_to_node(num);
 	cpu->sysdev.id = num;
 	cpu->sysdev.cls = &cpu_sysdev_class;
 
 	error = sysdev_register(&cpu->sysdev);
-	if (!error && root)
-		error = sysfs_create_link(&root->sysdev.kobj,
-					  &cpu->sysdev.kobj,
-					  kobject_name(&cpu->sysdev.kobj));
+
 	if (!error && !cpu->no_control)
 		register_cpu_control(cpu);
 	if (!error)
 		cpu_sys_devices[num] = &cpu->sysdev;
+	if (!error)
+		register_cpu_under_node(num, cpu_to_node(num));
 
 #ifdef CONFIG_KEXEC
 	if (!error)
@@ -145,5 +143,13 @@
 
 int __init cpu_dev_init(void)
 {
-	return sysdev_class_register(&cpu_sysdev_class);
+	int err;
+
+	err = sysdev_class_register(&cpu_sysdev_class);
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (!err)
+		err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+#endif
+
+	return err;
 }
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
index e2f64f9..33c5cce 100644
--- a/drivers/base/dmapool.c
+++ b/drivers/base/dmapool.c
@@ -7,6 +7,7 @@
 #include <linux/dmapool.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 /*
  * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
@@ -35,8 +36,6 @@
 };
 
 #define	POOL_TIMEOUT_JIFFIES	((100 /* msec */ * HZ) / 1000)
-#define	POOL_POISON_FREED	0xa7	/* !inuse */
-#define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
 
 static DECLARE_MUTEX (pools_lock);
 
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index dd547af..c6b7d9c 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -306,11 +306,13 @@
 memory_probe_store(struct class *class, const char *buf, size_t count)
 {
 	u64 phys_addr;
+	int nid;
 	int ret;
 
 	phys_addr = simple_strtoull(buf, NULL, 0);
 
-	ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+	nid = memory_add_physaddr_to_nid(phys_addr);
+	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
 
 	if (ret)
 		count = ret;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c80c3ae..eae2bdc 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -11,6 +11,7 @@
 #include <linux/cpumask.h>
 #include <linux/topology.h>
 #include <linux/nodemask.h>
+#include <linux/cpu.h>
 
 static struct sysdev_class node_class = {
 	set_kset_name("node"),
@@ -190,6 +191,66 @@
 	sysdev_unregister(&node->sysdev);
 }
 
+struct node node_devices[MAX_NUMNODES];
+
+/*
+ * register cpu under node
+ */
+int register_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	if (node_online(nid)) {
+		struct sys_device *obj = get_cpu_sysdev(cpu);
+		if (!obj)
+			return 0;
+		return sysfs_create_link(&node_devices[nid].sysdev.kobj,
+					 &obj->kobj,
+					 kobject_name(&obj->kobj));
+	 }
+
+	return 0;
+}
+
+int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	if (node_online(nid)) {
+		struct sys_device *obj = get_cpu_sysdev(cpu);
+		if (obj)
+			sysfs_remove_link(&node_devices[nid].sysdev.kobj,
+					 kobject_name(&obj->kobj));
+	}
+	return 0;
+}
+
+int register_one_node(int nid)
+{
+	int error = 0;
+	int cpu;
+
+	if (node_online(nid)) {
+		int p_node = parent_node(nid);
+		struct node *parent = NULL;
+
+		if (p_node != nid)
+			parent = &node_devices[p_node];
+
+		error = register_node(&node_devices[nid], nid, parent);
+
+		/* link cpu under this node */
+		for_each_present_cpu(cpu) {
+			if (cpu_to_node(cpu) == nid)
+				register_cpu_under_node(cpu, nid);
+		}
+	}
+
+	return error;
+
+}
+
+void unregister_one_node(int nid)
+{
+	unregister_node(&node_devices[nid]);
+}
+
 static int __init register_node_type(void)
 {
 	return sysdev_class_register(&node_class);
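The register_one_node()/unregister_one_node() helpers added above create the node sysdev and symlink each present CPU that belongs to the node beneath it. A minimal sketch of how architecture NUMA setup might drive them, assuming the declarations this series adds to <linux/node.h>; the function name is hypothetical and not part of this patch:

#include <linux/init.h>
#include <linux/node.h>
#include <linux/nodemask.h>

static int __init example_register_nodes(void)
{
	int nid;

	/* publish each online node as /sys/devices/system/node/nodeN,
	 * with symlinks to the CPUs that belong to it */
	for_each_online_node(nid)
		register_one_node(nid);

	return 0;
}
subsys_initcall(example_register_nodes);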
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 8c52421..c2d6216 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -107,7 +107,7 @@
 	return 0;
 }
 
-static int topology_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
@@ -125,7 +125,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block topology_cpu_notifier =
+static struct notifier_block __cpuinitdata topology_cpu_notifier =
 {
 	.notifier_call = topology_cpu_callback,
 };
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 3610c57..410d70c 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -939,12 +939,35 @@
 config SCx200_GPIO
 	tristate "NatSemi SCx200 GPIO Support"
 	depends on SCx200
+	select NSC_GPIO
 	help
 	  Give userspace access to the GPIO pins on the National
 	  Semiconductor SCx200 processors.
 
 	  If compiled as a module, it will be called scx200_gpio.
 
+config PC8736x_GPIO
+	tristate "NatSemi PC8736x GPIO Support"
+	depends on X86
+	default SCx200_GPIO	# mostly N
+	select NSC_GPIO		# needed for support routines
+	help
+	  Give userspace access to the GPIO pins on the National
+	  Semiconductor PC-8736x (x=[03456]) SuperIO chip.  The chip
+	  has multiple functional units, inc several managed by
+	  hwmon/pc87360 driver.  Tested with PC-87366
+
+	  If compiled as a module, it will be called pc8736x_gpio.
+
+config NSC_GPIO
+	tristate "NatSemi Base GPIO Support"
+	# selected by SCx200_GPIO and PC8736x_GPIO
+	# what about 2 selectors differing: m != y
+	help
+	  Common support used (and needed) by the scx200_gpio and
+	  pc8736x_gpio drivers.  If those drivers are built as
+	  modules, this one will be too; it is named nsc_gpio.
+
 config CS5535_GPIO
 	tristate "AMD CS5535/CS5536 GPIO (Geode Companion Device)"
 	depends on X86_32
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 5241055..6e0f446 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -82,6 +82,8 @@
 obj-$(CONFIG_NWBUTTON)		+= nwbutton.o
 obj-$(CONFIG_NWFLASH)		+= nwflash.o
 obj-$(CONFIG_SCx200_GPIO)	+= scx200_gpio.o
+obj-$(CONFIG_PC8736x_GPIO)	+= pc8736x_gpio.o
+obj-$(CONFIG_NSC_GPIO)		+= nsc_gpio.o
 obj-$(CONFIG_CS5535_GPIO)	+= cs5535_gpio.o
 obj-$(CONFIG_GPIO_VR41XX)	+= vr41xx_giu.o
 obj-$(CONFIG_TANBAC_TB0219)	+= tb0219.o
diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c
index cfa7922..d73be4c 100644
--- a/drivers/char/agp/sgi-agp.c
+++ b/drivers/char/agp/sgi-agp.c
@@ -329,9 +329,8 @@
 
 static void __devexit agp_sgi_cleanup(void)
 {
-	if (sgi_tioca_agp_bridges)
-		kfree(sgi_tioca_agp_bridges);
-	sgi_tioca_agp_bridges=NULL;
+	kfree(sgi_tioca_agp_bridges);
+	sgi_tioca_agp_bridges = NULL;
 }
 
 module_init(agp_sgi_init);
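Several hunks in this series (sgi-agp here, and hvcs, mptfc, mptsas, fs_enet, ipw2200 below) rely on kfree(NULL) being a no-op, so the "if (ptr) kfree(ptr)" guard is redundant. A minimal sketch of the idiom with a hypothetical structure:

#include <linux/slab.h>

struct example_ctx {
	char *buf;
};

static void example_release(struct example_ctx *ctx)
{
	/* kfree() silently ignores a NULL pointer, so no guard is needed */
	kfree(ctx->buf);
	ctx->buf = NULL;
}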
diff --git a/drivers/char/drm/drm_memory_debug.h b/drivers/char/drm/drm_memory_debug.h
index 6543b9a..d117cc9 100644
--- a/drivers/char/drm/drm_memory_debug.h
+++ b/drivers/char/drm/drm_memory_debug.h
@@ -43,7 +43,7 @@
 	unsigned long bytes_freed;
 } drm_mem_stats_t;
 
-static spinlock_t drm_mem_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(drm_mem_lock);
 static unsigned long drm_ram_available = 0;	/* In pages */
 static unsigned long drm_ram_used = 0;
 static drm_mem_stats_t drm_mem_stats[] =
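The drm_memory_debug.h change above, like many hunks below (via_dmablit, epca, moxa, specialix, and others), replaces the SPIN_LOCK_UNLOCKED static initializer with DEFINE_SPINLOCK() for file-scope locks or spin_lock_init() for locks embedded in runtime structures. A minimal sketch with hypothetical names:

#include <linux/spinlock.h>

/* file-scope lock: fully initialized at compile time */
static DEFINE_SPINLOCK(example_lock);

struct example_dev {
	spinlock_t lock;
};

static void example_dev_setup(struct example_dev *dev)
{
	/* lock embedded in an allocated structure: initialize at runtime */
	spin_lock_init(&dev->lock);
}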
diff --git a/drivers/char/drm/via_dmablit.c b/drivers/char/drm/via_dmablit.c
index b7f1745..78a81a4 100644
--- a/drivers/char/drm/via_dmablit.c
+++ b/drivers/char/drm/via_dmablit.c
@@ -557,7 +557,7 @@
 		blitq->num_outstanding = 0;
 		blitq->is_active = 0;
 		blitq->aborting = 0;
-		blitq->blit_lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&blitq->blit_lock);
 		for (j=0; j<VIA_NUM_BLIT_SLOTS; ++j) {
 			DRM_INIT_WAITQUEUE(blitq->blit_queue + j);
 		}
diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 9cad850..dc0602a 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -80,7 +80,7 @@
 /* The ISA boards do window flipping into the same spaces so its only sane
    with a single lock. It's still pretty efficient */
 
-static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(epca_lock);
 
 /* -----------------------------------------------------------------------
 	MAXBOARDS is typically 12, but ISA and EISA cards are restricted to 
diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c
index 8d97b39..afa26b6 100644
--- a/drivers/char/hvcs.c
+++ b/drivers/char/hvcs.c
@@ -1320,11 +1320,12 @@
 static int hvcs_alloc_index_list(int n)
 {
 	int i;
+
 	hvcs_index_list = kmalloc(n * sizeof(hvcs_index_count),GFP_KERNEL);
 	if (!hvcs_index_list)
 		return -ENOMEM;
 	hvcs_index_count = n;
-	for(i = 0; i < hvcs_index_count; i++)
+	for (i = 0; i < hvcs_index_count; i++)
 		hvcs_index_list[i] = -1;
 	return 0;
 }
@@ -1332,11 +1333,9 @@
 static void hvcs_free_index_list(void)
 {
 	/* Paranoia check to be thorough. */
-	if (hvcs_index_list) {
-		kfree(hvcs_index_list);
-		hvcs_index_list = NULL;
-		hvcs_index_count = 0;
-	}
+	kfree(hvcs_index_list);
+	hvcs_index_list = NULL;
+	hvcs_index_count = 0;
 }
 
 static int __init hvcs_module_init(void)
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index b03ddab..83ed6ae 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -57,8 +57,7 @@
 static int initialized = 0;
 
 #ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_ipmi_root = NULL;
-EXPORT_SYMBOL(proc_ipmi_root);
+static struct proc_dir_entry *proc_ipmi_root = NULL;
 #endif /* CONFIG_PROC_FS */
 
 #define MAX_EVENTS_IN_QUEUE	25
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 02a7dd7..101c14b 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -809,7 +809,7 @@
 			/* do nothing */
 		}
 		else if (smi_result == SI_SM_CALL_WITH_DELAY)
-			udelay(1);
+			schedule();
 		else
 			schedule_timeout_interruptible(1);
 	}
diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index f43c2e0..01247cc 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -301,7 +301,7 @@
 	.tiocmset = moxa_tiocmset,
 };
 
-static spinlock_t moxa_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(moxa_lock);
 
 #ifdef CONFIG_PCI
 static int moxa_get_PCI_conf(struct pci_dev *p, int board_type, moxa_board_conf * board)
diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c
new file mode 100644
index 0000000..5b91e4e
--- /dev/null
+++ b/drivers/char/nsc_gpio.c
@@ -0,0 +1,142 @@
+/* linux/drivers/char/nsc_gpio.c
+
+   National Semiconductor common GPIO device-file/VFS methods.
+   Allows a user space process to control the GPIO pins.
+
+   Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
+   Copyright (c) 2005      Jim Cromie <jim.cromie@gmail.com>
+*/
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/nsc_gpio.h>
+#include <linux/platform_device.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#define NAME "nsc_gpio"
+
+void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index)
+{
+	/* retrieve current config w/o changing it */
+	u32 config = amp->gpio_config(index, ~0, 0);
+
+	/* user requested via 'v' command, so it's INFO */
+	dev_info(amp->dev, "io%02u: 0x%04x %s %s %s %s %s %s %s\tio:%d/%d\n",
+		 index, config,
+		 (config & 1) ? "OE" : "TS",      /* output-enabled/tristate */
+		 (config & 2) ? "PP" : "OD",      /* push pull / open drain */
+		 (config & 4) ? "PUE" : "PUD",    /* pull up enabled/disabled */
+		 (config & 8) ? "LOCKED" : "",    /* locked / unlocked */
+		 (config & 16) ? "LEVEL" : "EDGE",/* level/edge input */
+		 (config & 32) ? "HI" : "LO",     /* trigger on rise/fall edge */
+		 (config & 64) ? "DEBOUNCE" : "", /* debounce */
+
+		 amp->gpio_get(index), amp->gpio_current(index));
+}
+
+ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+		       size_t len, loff_t *ppos)
+{
+	unsigned m = iminor(file->f_dentry->d_inode);
+	struct nsc_gpio_ops *amp = file->private_data;
+	struct device *dev = amp->dev;
+	size_t i;
+	int err = 0;
+
+	for (i = 0; i < len; ++i) {
+		char c;
+		if (get_user(c, data + i))
+			return -EFAULT;
+		switch (c) {
+		case '0':
+			amp->gpio_set(m, 0);
+			break;
+		case '1':
+			amp->gpio_set(m, 1);
+			break;
+		case 'O':
+			dev_dbg(dev, "GPIO%d output enabled\n", m);
+			amp->gpio_config(m, ~1, 1);
+			break;
+		case 'o':
+			dev_dbg(dev, "GPIO%d output disabled\n", m);
+			amp->gpio_config(m, ~1, 0);
+			break;
+		case 'T':
+			dev_dbg(dev, "GPIO%d output is push pull\n",
+			       m);
+			amp->gpio_config(m, ~2, 2);
+			break;
+		case 't':
+			dev_dbg(dev, "GPIO%d output is open drain\n",
+			       m);
+			amp->gpio_config(m, ~2, 0);
+			break;
+		case 'P':
+			dev_dbg(dev, "GPIO%d pull up enabled\n", m);
+			amp->gpio_config(m, ~4, 4);
+			break;
+		case 'p':
+			dev_dbg(dev, "GPIO%d pull up disabled\n", m);
+			amp->gpio_config(m, ~4, 0);
+			break;
+		case 'v':
+			/* View Current pin settings */
+			amp->gpio_dump(amp, m);
+			break;
+		case '\n':
+			/* end of settings string, do nothing */
+			break;
+		default:
+			dev_err(dev, "io%2d bad setting: chr<0x%2x>\n",
+				m, (int)c);
+			err++;
+		}
+	}
+	if (err)
+		return -EINVAL;	/* full string handled, report error */
+
+	return len;
+}
+
+ssize_t nsc_gpio_read(struct file *file, char __user * buf,
+		      size_t len, loff_t * ppos)
+{
+	unsigned m = iminor(file->f_dentry->d_inode);
+	int value;
+	struct nsc_gpio_ops *amp = file->private_data;
+
+	value = amp->gpio_get(m);
+	if (put_user(value ? '1' : '0', buf))
+		return -EFAULT;
+
+	return 1;
+}
+
+/* common file-ops routines for both scx200_gpio and pc8736x_gpio */
+EXPORT_SYMBOL(nsc_gpio_write);
+EXPORT_SYMBOL(nsc_gpio_read);
+EXPORT_SYMBOL(nsc_gpio_dump);
+
+static int __init nsc_gpio_init(void)
+{
+	printk(KERN_DEBUG NAME " initializing\n");
+	return 0;
+}
+
+static void __exit nsc_gpio_cleanup(void)
+{
+	printk(KERN_DEBUG NAME " cleanup\n");
+}
+
+module_init(nsc_gpio_init);
+module_exit(nsc_gpio_cleanup);
+
+MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>");
+MODULE_DESCRIPTION("NatSemi GPIO Common Methods");
+MODULE_LICENSE("GPL");
diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c
new file mode 100644
index 0000000..1c706cc
--- /dev/null
+++ b/drivers/char/pc8736x_gpio.c
@@ -0,0 +1,340 @@
+/* linux/drivers/char/pc8736x_gpio.c
+
+   National Semiconductor PC8736x GPIO driver.  Allows a user space
+   process to play with the GPIO pins.
+
+   Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com>
+
+   adapted from linux/drivers/char/scx200_gpio.c
+   Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>,
+*/
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/mutex.h>
+#include <linux/nsc_gpio.h>
+#include <linux/platform_device.h>
+#include <asm/uaccess.h>
+
+#define DEVNAME "pc8736x_gpio"
+
+MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>");
+MODULE_DESCRIPTION("NatSemi PC-8736x GPIO Pin Driver");
+MODULE_LICENSE("GPL");
+
+static int major;		/* default to dynamic major */
+module_param(major, int, 0);
+MODULE_PARM_DESC(major, "Major device number");
+
+static DEFINE_MUTEX(pc8736x_gpio_config_lock);
+static unsigned pc8736x_gpio_base;
+static u8 pc8736x_gpio_shadow[4];
+
+#define SIO_BASE1       0x2E	/* 1st command-reg to check */
+#define SIO_BASE2       0x4E	/* alt command-reg to check */
+#define SIO_BASE_OFFSET 0x20
+
+#define SIO_SID		0x20	/* SuperI/O ID Register */
+#define SIO_SID_VALUE	0xe9	/* Expected value in SuperI/O ID Register */
+
+#define SIO_CF1		0x21	/* chip config, bit0 is chip enable */
+
+#define PC8736X_GPIO_SIZE	16
+
+#define SIO_UNIT_SEL	0x7	/* unit select reg */
+#define SIO_UNIT_ACT	0x30	/* unit enable */
+#define SIO_GPIO_UNIT	0x7	/* unit number of GPIO */
+#define SIO_VLM_UNIT	0x0D
+#define SIO_TMS_UNIT	0x0E
+
+/* config-space addrs to read/write each unit's runtime addr */
+#define SIO_BASE_HADDR		0x60
+#define SIO_BASE_LADDR		0x61
+
+/* GPIO config-space pin-control addresses */
+#define SIO_GPIO_PIN_SELECT	0xF0
+#define SIO_GPIO_PIN_CONFIG     0xF1
+#define SIO_GPIO_PIN_EVENT      0xF2
+
+static unsigned char superio_cmd = 0;
+static unsigned char selected_device = 0xFF;	/* bogus start val */
+
+/* GPIO port runtime access, functionality */
+static int port_offset[] = { 0, 4, 8, 10 };	/* non-uniform offsets ! */
+/* static int event_capable[] = { 1, 1, 0, 0 };   ports 2,3 are hobbled */
+
+#define PORT_OUT	0
+#define PORT_IN		1
+#define PORT_EVT_EN	2
+#define PORT_EVT_STST	3
+
+static struct platform_device *pdev;  /* use in dev_*() */
+
+static inline void superio_outb(int addr, int val)
+{
+	outb_p(addr, superio_cmd);
+	outb_p(val, superio_cmd + 1);
+}
+
+static inline int superio_inb(int addr)
+{
+	outb_p(addr, superio_cmd);
+	return inb_p(superio_cmd + 1);
+}
+
+static int pc8736x_superio_present(void)
+{
+	/* try the 2 possible values, read a hardware reg to verify */
+	superio_cmd = SIO_BASE1;
+	if (superio_inb(SIO_SID) == SIO_SID_VALUE)
+		return superio_cmd;
+
+	superio_cmd = SIO_BASE2;
+	if (superio_inb(SIO_SID) == SIO_SID_VALUE)
+		return superio_cmd;
+
+	return 0;
+}
+
+static void device_select(unsigned devldn)
+{
+	superio_outb(SIO_UNIT_SEL, devldn);
+	selected_device = devldn;
+}
+
+static void select_pin(unsigned iminor)
+{
+	/* select GPIO port/pin from device minor number */
+	device_select(SIO_GPIO_UNIT);
+	superio_outb(SIO_GPIO_PIN_SELECT,
+		     ((iminor << 1) & 0xF0) | (iminor & 0x7));
+}
+
+static inline u32 pc8736x_gpio_configure_fn(unsigned index, u32 mask, u32 bits,
+					    u32 func_slct)
+{
+	u32 config, new_config;
+
+	mutex_lock(&pc8736x_gpio_config_lock);
+
+	device_select(SIO_GPIO_UNIT);
+	select_pin(index);
+
+	/* read current config value */
+	config = superio_inb(func_slct);
+
+	/* set new config */
+	new_config = (config & mask) | bits;
+	superio_outb(func_slct, new_config);
+
+	mutex_unlock(&pc8736x_gpio_config_lock);
+
+	return config;
+}
+
+static u32 pc8736x_gpio_configure(unsigned index, u32 mask, u32 bits)
+{
+	return pc8736x_gpio_configure_fn(index, mask, bits,
+					 SIO_GPIO_PIN_CONFIG);
+}
+
+static int pc8736x_gpio_get(unsigned minor)
+{
+	int port, bit, val;
+
+	port = minor >> 3;
+	bit = minor & 7;
+	val = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_IN);
+	val >>= bit;
+	val &= 1;
+
+	dev_dbg(&pdev->dev, "_gpio_get(%d from %x bit %d) == val %d\n",
+		minor, pc8736x_gpio_base + port_offset[port] + PORT_IN, bit,
+		val);
+
+	return val;
+}
+
+static void pc8736x_gpio_set(unsigned minor, int val)
+{
+	int port, bit, curval;
+
+	minor &= 0x1f;
+	port = minor >> 3;
+	bit = minor & 7;
+	curval = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_OUT);
+
+	dev_dbg(&pdev->dev, "addr:%x cur:%x bit-pos:%d cur-bit:%x + new:%d -> bit-new:%d\n",
+		pc8736x_gpio_base + port_offset[port] + PORT_OUT,
+		curval, bit, (curval & ~(1 << bit)), val, (val << bit));
+
+	val = (curval & ~(1 << bit)) | (val << bit);
+
+	dev_dbg(&pdev->dev, "gpio_set(minor:%d port:%d bit:%d)"
+		" %2x -> %2x\n", minor, port, bit, curval, val);
+
+	outb_p(val, pc8736x_gpio_base + port_offset[port] + PORT_OUT);
+
+	curval = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_OUT);
+	val = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_IN);
+
+	dev_dbg(&pdev->dev, "wrote %x, read: %x\n", curval, val);
+	pc8736x_gpio_shadow[port] = val;
+}
+
+static void pc8736x_gpio_set_high(unsigned index)
+{
+	pc8736x_gpio_set(index, 1);
+}
+
+static void pc8736x_gpio_set_low(unsigned index)
+{
+	pc8736x_gpio_set(index, 0);
+}
+
+static int pc8736x_gpio_current(unsigned minor)
+{
+	int port, bit;
+	minor &= 0x1f;
+	port = minor >> 3;
+	bit = minor & 7;
+	return ((pc8736x_gpio_shadow[port] >> bit) & 0x01);
+}
+
+static void pc8736x_gpio_change(unsigned index)
+{
+	pc8736x_gpio_set(index, !pc8736x_gpio_current(index));
+}
+
+static struct nsc_gpio_ops pc8736x_access = {
+	.owner		= THIS_MODULE,
+	.gpio_config	= pc8736x_gpio_configure,
+	.gpio_dump	= nsc_gpio_dump,
+	.gpio_get	= pc8736x_gpio_get,
+	.gpio_set	= pc8736x_gpio_set,
+	.gpio_set_high	= pc8736x_gpio_set_high,
+	.gpio_set_low	= pc8736x_gpio_set_low,
+	.gpio_change	= pc8736x_gpio_change,
+	.gpio_current	= pc8736x_gpio_current
+};
+
+static int pc8736x_gpio_open(struct inode *inode, struct file *file)
+{
+	unsigned m = iminor(inode);
+	file->private_data = &pc8736x_access;
+
+	dev_dbg(&pdev->dev, "open %d\n", m);
+
+	if (m > 63)
+		return -EINVAL;
+	return nonseekable_open(inode, file);
+}
+
+static struct file_operations pc8736x_gpio_fops = {
+	.owner	= THIS_MODULE,
+	.open	= pc8736x_gpio_open,
+	.write	= nsc_gpio_write,
+	.read	= nsc_gpio_read,
+};
+
+static void __init pc8736x_init_shadow(void)
+{
+	int port;
+
+	/* read the current values driven on the GPIO signals */
+	for (port = 0; port < 4; ++port)
+		pc8736x_gpio_shadow[port]
+		    = inb_p(pc8736x_gpio_base + port_offset[port]
+			    + PORT_OUT);
+
+}
+
+static int __init pc8736x_gpio_init(void)
+{
+	int rc = 0;
+
+	pdev = platform_device_alloc(DEVNAME, 0);
+	if (!pdev)
+		return -ENOMEM;
+
+	rc = platform_device_add(pdev);
+	if (rc) {
+		rc = -ENODEV;
+		goto undo_platform_dev_alloc;
+	}
+	dev_info(&pdev->dev, "NatSemi pc8736x GPIO Driver Initializing\n");
+
+	if (!pc8736x_superio_present()) {
+		rc = -ENODEV;
+		dev_err(&pdev->dev, "no device found\n");
+		goto undo_platform_dev_add;
+	}
+	pc8736x_access.dev = &pdev->dev;
+
+	/* Verify that the chip and its GPIO unit are both enabled.
+	   My BIOS does this, so I take only minimal action here.
+	 */
+	rc = superio_inb(SIO_CF1);
+	if (!(rc & 0x01)) {
+		rc = -ENODEV;
+		dev_err(&pdev->dev, "device not enabled\n");
+		goto undo_platform_dev_add;
+	}
+	device_select(SIO_GPIO_UNIT);
+	if (!superio_inb(SIO_UNIT_ACT)) {
+		rc = -ENODEV;
+		dev_err(&pdev->dev, "GPIO unit not enabled\n");
+		goto undo_platform_dev_add;
+	}
+
+	/* read the GPIO unit base addr that chip responds to */
+	pc8736x_gpio_base = (superio_inb(SIO_BASE_HADDR) << 8
+			     | superio_inb(SIO_BASE_LADDR));
+
+	if (!request_region(pc8736x_gpio_base, 16, DEVNAME)) {
+		rc = -ENODEV;
+		dev_err(&pdev->dev, "GPIO ioport %x busy\n",
+			pc8736x_gpio_base);
+		goto undo_platform_dev_add;
+	}
+	dev_info(&pdev->dev, "GPIO ioport %x reserved\n", pc8736x_gpio_base);
+
+	rc = register_chrdev(major, DEVNAME, &pc8736x_gpio_fops);
+	if (rc < 0) {
+		dev_err(&pdev->dev, "register-chrdev failed: %d\n", rc);
+		goto undo_platform_dev_add;
+	}
+	if (!major) {
+		major = rc;
+		dev_dbg(&pdev->dev, "got dynamic major %d\n", major);
+	}
+
+	pc8736x_init_shadow();
+	return 0;
+
+undo_platform_dev_add:
+	platform_device_put(pdev);
+undo_platform_dev_alloc:
+	kfree(pdev);
+	return rc;
+}
+
+static void __exit pc8736x_gpio_cleanup(void)
+{
+	dev_dbg(&pdev->dev, " cleanup\n");
+
+	release_region(pc8736x_gpio_base, 16);
+
+	unregister_chrdev(major, DEVNAME);
+}
+
+EXPORT_SYMBOL(pc8736x_access);
+
+module_init(pc8736x_gpio_init);
+module_exit(pc8736x_gpio_cleanup);
diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c
index 664a6e9..5a280a3 100644
--- a/drivers/char/scx200_gpio.c
+++ b/drivers/char/scx200_gpio.c
@@ -1,4 +1,4 @@
-/* linux/drivers/char/scx200_gpio.c 
+/* linux/drivers/char/scx200_gpio.c
 
    National Semiconductor SCx200 GPIO driver.  Allows a user space
    process to play with the GPIO pins.
@@ -6,17 +6,26 @@
    Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> */
 
 #include <linux/config.h>
+#include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/platform_device.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
+#include <linux/types.h>
+#include <linux/cdev.h>
+
 #include <linux/scx200_gpio.h>
+#include <linux/nsc_gpio.h>
 
 #define NAME "scx200_gpio"
+#define DEVNAME NAME
+
+static struct platform_device *pdev;
 
 MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
 MODULE_DESCRIPTION("NatSemi SCx200 GPIO Pin Driver");
@@ -26,70 +35,23 @@
 module_param(major, int, 0);
 MODULE_PARM_DESC(major, "Major device number");
 
-static ssize_t scx200_gpio_write(struct file *file, const char __user *data, 
-				 size_t len, loff_t *ppos)
-{
-	unsigned m = iminor(file->f_dentry->d_inode);
-	size_t i;
-
-	for (i = 0; i < len; ++i) {
-		char c;
-		if (get_user(c, data+i))
-			return -EFAULT;
-		switch (c)
-		{
-		case '0': 
-			scx200_gpio_set(m, 0); 
-			break;
-		case '1': 
-			scx200_gpio_set(m, 1); 
-			break;
-		case 'O':
-			printk(KERN_INFO NAME ": GPIO%d output enabled\n", m);
-			scx200_gpio_configure(m, ~1, 1);
-			break;
-		case 'o':
-			printk(KERN_INFO NAME ": GPIO%d output disabled\n", m);
-			scx200_gpio_configure(m, ~1, 0);
-			break;
-		case 'T':
-			printk(KERN_INFO NAME ": GPIO%d output is push pull\n", m);
-			scx200_gpio_configure(m, ~2, 2);
-			break;
-		case 't':
-			printk(KERN_INFO NAME ": GPIO%d output is open drain\n", m);
-			scx200_gpio_configure(m, ~2, 0);
-			break;
-		case 'P':
-			printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m);
-			scx200_gpio_configure(m, ~4, 4);
-			break;
-		case 'p':
-			printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m);
-			scx200_gpio_configure(m, ~4, 0);
-			break;
-		}
-	}
-
-	return len;
-}
-
-static ssize_t scx200_gpio_read(struct file *file, char __user *buf,
-				size_t len, loff_t *ppos)
-{
-	unsigned m = iminor(file->f_dentry->d_inode);
-	int value;
-
-	value = scx200_gpio_get(m);
-	if (put_user(value ? '1' : '0', buf))
-		return -EFAULT;
-	
-	return 1;
-}
+struct nsc_gpio_ops scx200_access = {
+	.owner		= THIS_MODULE,
+	.gpio_config	= scx200_gpio_configure,
+	.gpio_dump	= nsc_gpio_dump,
+	.gpio_get	= scx200_gpio_get,
+	.gpio_set	= scx200_gpio_set,
+	.gpio_set_high	= scx200_gpio_set_high,
+	.gpio_set_low	= scx200_gpio_set_low,
+	.gpio_change	= scx200_gpio_change,
+	.gpio_current	= scx200_gpio_current
+};
 
 static int scx200_gpio_open(struct inode *inode, struct file *file)
 {
 	unsigned m = iminor(inode);
+	file->private_data = &scx200_access;
+
 	if (m > 63)
 		return -EINVAL;
 	return nonseekable_open(inode, file);
@@ -103,47 +65,81 @@
 
 static struct file_operations scx200_gpio_fops = {
 	.owner   = THIS_MODULE,
-	.write   = scx200_gpio_write,
-	.read    = scx200_gpio_read,
+	.write   = nsc_gpio_write,
+	.read    = nsc_gpio_read,
 	.open    = scx200_gpio_open,
 	.release = scx200_gpio_release,
 };
 
+struct cdev *scx200_devices;
+static int num_pins = 32;
+
 static int __init scx200_gpio_init(void)
 {
-	int r;
-
-	printk(KERN_DEBUG NAME ": NatSemi SCx200 GPIO Driver\n");
+	int rc, i;
+	dev_t dev = MKDEV(major, 0);
 
 	if (!scx200_gpio_present()) {
-		printk(KERN_ERR NAME ": no SCx200 gpio pins available\n");
+		printk(KERN_ERR NAME ": no SCx200 gpio present\n");
 		return -ENODEV;
 	}
 
-	r = register_chrdev(major, NAME, &scx200_gpio_fops);
-	if (r < 0) {
-		printk(KERN_ERR NAME ": unable to register character device\n");
-		return r;
+	/* support dev_dbg() with pdev->dev */
+	pdev = platform_device_alloc(DEVNAME, 0);
+	if (!pdev)
+		return -ENOMEM;
+
+	rc = platform_device_add(pdev);
+	if (rc)
+		goto undo_malloc;
+
+	/* nsc_gpio uses dev_dbg(), so needs this */
+	scx200_access.dev = &pdev->dev;
+
+	if (major)
+		rc = register_chrdev_region(dev, num_pins, "scx200_gpio");
+	else {
+		rc = alloc_chrdev_region(&dev, 0, num_pins, "scx200_gpio");
+		major = MAJOR(dev);
 	}
-	if (!major) {
-		major = r;
-		printk(KERN_DEBUG NAME ": got dynamic major %d\n", major);
+	if (rc < 0) {
+		dev_err(&pdev->dev, "SCx200 chrdev_region err: %d\n", rc);
+		goto undo_platform_device_add;
+	}
+	scx200_devices = kzalloc(num_pins * sizeof(struct cdev), GFP_KERNEL);
+	if (!scx200_devices) {
+		rc = -ENOMEM;
+		goto undo_chrdev_region;
+	}
+	for (i = 0; i < num_pins; i++) {
+		struct cdev *cdev = &scx200_devices[i];
+		cdev_init(cdev, &scx200_gpio_fops);
+		cdev->owner = THIS_MODULE;
+		rc = cdev_add(cdev, MKDEV(major, i), 1);
+		/* tolerate 'minor' errors */
+		if (rc)
+			dev_err(&pdev->dev, "Error %d on minor %d", rc, i);
 	}
 
-	return 0;
+	return 0; /* succeed */
+
+undo_chrdev_region:
+	unregister_chrdev_region(dev, num_pins);
+undo_platform_device_add:
+	platform_device_put(pdev);
+undo_malloc:
+	kfree(pdev);
+	return rc;
 }
 
 static void __exit scx200_gpio_cleanup(void)
 {
-	unregister_chrdev(major, NAME);
+	kfree(scx200_devices);
+	unregister_chrdev_region(MKDEV(major, 0), num_pins);
+	platform_device_put(pdev);
+	platform_device_unregister(pdev);
+	/* kfree(pdev); */
 }
 
 module_init(scx200_gpio_init);
 module_exit(scx200_gpio_cleanup);
-
-/*
-    Local variables:
-        compile-command: "make -k -C ../.. SUBDIRS=drivers/char modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c
index 1b53302..d2d6b01 100644
--- a/drivers/char/specialix.c
+++ b/drivers/char/specialix.c
@@ -2477,7 +2477,7 @@
 #endif
 
 	for (i = 0; i < SX_NBOARD; i++)
-		sx_board[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&sx_board[i].lock);
 
 	if (sx_init_drivers()) {
 		func_exit();
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index a9c5a72..bf361a5 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -141,15 +141,6 @@
 static struct tty_driver	*stl_serial;
 
 /*
- *	We will need to allocate a temporary write buffer for chars that
- *	come direct from user space. The problem is that a copy from user
- *	space might cause a page fault (typically on a system that is
- *	swapping!). All ports will share one buffer - since if the system
- *	is already swapping a shared buffer won't make things any worse.
- */
-static char			*stl_tmpwritebuf;
-
-/*
  *	Define a local default termios struct. All ports will be created
  *	with this termios initially. Basically all it defines is a raw port
  *	at 9600, 8 data bits, 1 stop bit.
@@ -363,6 +354,14 @@
 };
 
 /*
+ *	Lock ordering is that you may not take stallion_lock holding
+ *	brd_lock.
+ */
+
+static spinlock_t brd_lock; 		/* Guard the board mapping */
+static spinlock_t stallion_lock;	/* Guard the tty driver */
+
+/*
  *	Set up enable and disable macros for the ECH boards. They require
  *	the secondary io address space to be activated and deactivated.
  *	This way all ECH boards can share their secondary io region.
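The ordering comment added above means that when both locks are required, stallion_lock is the outer lock and brd_lock is taken inside it. A hedged sketch of the allowed nesting; the function is hypothetical and not part of the driver:

static void example_locked_path(void)
{
	unsigned long flags;

	spin_lock_irqsave(&stallion_lock, flags);	/* outer: tty driver state */
	spin_lock(&brd_lock);				/* inner: board mapping */
	/* ... touch board registers and tty bookkeeping here ... */
	spin_unlock(&brd_lock);
	spin_unlock_irqrestore(&stallion_lock, flags);
}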
@@ -725,17 +724,7 @@
 
 static int __init stallion_module_init(void)
 {
-	unsigned long	flags;
-
-#ifdef DEBUG
-	printk("init_module()\n");
-#endif
-
-	save_flags(flags);
-	cli();
 	stl_init();
-	restore_flags(flags);
-
 	return 0;
 }
 
@@ -746,7 +735,6 @@
 	stlbrd_t	*brdp;
 	stlpanel_t	*panelp;
 	stlport_t	*portp;
-	unsigned long	flags;
 	int		i, j, k;
 
 #ifdef DEBUG
@@ -756,9 +744,6 @@
 	printk(KERN_INFO "Unloading %s: version %s\n", stl_drvtitle,
 		stl_drvversion);
 
-	save_flags(flags);
-	cli();
-
 /*
  *	Free up all allocated resources used by the ports. This includes
  *	memory and interrupts. As part of this process we will also do
@@ -770,7 +755,6 @@
 	if (i) {
 		printk("STALLION: failed to un-register tty driver, "
 			"errno=%d\n", -i);
-		restore_flags(flags);
 		return;
 	}
 	for (i = 0; i < 4; i++) {
@@ -783,8 +767,6 @@
 			"errno=%d\n", -i);
 	class_destroy(stallion_class);
 
-	kfree(stl_tmpwritebuf);
-
 	for (i = 0; (i < stl_nrbrds); i++) {
 		if ((brdp = stl_brds[i]) == (stlbrd_t *) NULL)
 			continue;
@@ -814,8 +796,6 @@
 		kfree(brdp);
 		stl_brds[i] = (stlbrd_t *) NULL;
 	}
-
-	restore_flags(flags);
 }
 
 module_init(stallion_module_init);
@@ -948,7 +928,7 @@
 
 	brdp = kzalloc(sizeof(stlbrd_t), GFP_KERNEL);
 	if (!brdp) {
-		printk("STALLION: failed to allocate memory (size=%d)\n",
+		printk("STALLION: failed to allocate memory (size=%Zd)\n",
 			sizeof(stlbrd_t));
 		return NULL;
 	}
@@ -1066,16 +1046,17 @@
 	rc = 0;
 	doclocal = 0;
 
+	spin_lock_irqsave(&stallion_lock, flags);
+
 	if (portp->tty->termios->c_cflag & CLOCAL)
 		doclocal++;
 
-	save_flags(flags);
-	cli();
 	portp->openwaitcnt++;
 	if (! tty_hung_up_p(filp))
 		portp->refcount--;
 
 	for (;;) {
+		/* Takes brd_lock internally */
 		stl_setsignals(portp, 1, 1);
 		if (tty_hung_up_p(filp) ||
 		    ((portp->flags & ASYNC_INITIALIZED) == 0)) {
@@ -1093,13 +1074,14 @@
 			rc = -ERESTARTSYS;
 			break;
 		}
+		/* FIXME */
 		interruptible_sleep_on(&portp->open_wait);
 	}
 
 	if (! tty_hung_up_p(filp))
 		portp->refcount++;
 	portp->openwaitcnt--;
-	restore_flags(flags);
+	spin_unlock_irqrestore(&stallion_lock, flags);
 
 	return rc;
 }
@@ -1119,16 +1101,15 @@
 	if (portp == (stlport_t *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&stallion_lock, flags);
 	if (tty_hung_up_p(filp)) {
-		restore_flags(flags);
+		spin_unlock_irqrestore(&stallion_lock, flags);
 		return;
 	}
 	if ((tty->count == 1) && (portp->refcount != 1))
 		portp->refcount = 1;
 	if (portp->refcount-- > 1) {
-		restore_flags(flags);
+		spin_unlock_irqrestore(&stallion_lock, flags);
 		return;
 	}
 
@@ -1142,11 +1123,18 @@
  *	(The sc26198 has no "end-of-data" interrupt only empty FIFO)
  */
 	tty->closing = 1;
+
+	spin_unlock_irqrestore(&stallion_lock, flags);
+
 	if (portp->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, portp->closing_wait);
 	stl_waituntilsent(tty, (HZ / 2));
 
+
+	spin_lock_irqsave(&stallion_lock, flags);
 	portp->flags &= ~ASYNC_INITIALIZED;
+	spin_unlock_irqrestore(&stallion_lock, flags);
+
 	stl_disableintrs(portp);
 	if (tty->termios->c_cflag & HUPCL)
 		stl_setsignals(portp, 0, 0);
@@ -1173,7 +1161,6 @@
 
 	portp->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
 	wake_up_interruptible(&portp->close_wait);
-	restore_flags(flags);
 }
 
 /*****************************************************************************/
@@ -1195,9 +1182,6 @@
 		(int) tty, (int) buf, count);
 #endif
 
-	if ((tty == (struct tty_struct *) NULL) ||
-	    (stl_tmpwritebuf == (char *) NULL))
-		return 0;
 	portp = tty->driver_data;
 	if (portp == (stlport_t *) NULL)
 		return 0;
@@ -1302,11 +1286,6 @@
 	if (portp->tx.buf == (char *) NULL)
 		return;
 
-#if 0
-	if (tty->stopped || tty->hw_stopped ||
-	    (portp->tx.head == portp->tx.tail))
-		return;
-#endif
 	stl_startrxtx(portp, -1, 1);
 }
 
@@ -1977,12 +1956,14 @@
 	unsigned int	iobase;
 	int		handled = 0;
 
+	spin_lock(&brd_lock);
 	panelp = brdp->panels[0];
 	iobase = panelp->iobase;
 	while (inb(brdp->iostatus) & EIO_INTRPEND) {
 		handled = 1;
 		(* panelp->isr)(panelp, iobase);
 	}
+	spin_unlock(&brd_lock);
 	return handled;
 }
 
@@ -2168,7 +2149,7 @@
 		portp = kzalloc(sizeof(stlport_t), GFP_KERNEL);
 		if (!portp) {
 			printk("STALLION: failed to allocate memory "
-				"(size=%d)\n", sizeof(stlport_t));
+				"(size=%Zd)\n", sizeof(stlport_t));
 			break;
 		}
 
@@ -2304,7 +2285,7 @@
 	panelp = kzalloc(sizeof(stlpanel_t), GFP_KERNEL);
 	if (!panelp) {
 		printk(KERN_WARNING "STALLION: failed to allocate memory "
-			"(size=%d)\n", sizeof(stlpanel_t));
+			"(size=%Zd)\n", sizeof(stlpanel_t));
 		return -ENOMEM;
 	}
 
@@ -2478,7 +2459,7 @@
 		panelp = kzalloc(sizeof(stlpanel_t), GFP_KERNEL);
 		if (!panelp) {
 			printk("STALLION: failed to allocate memory "
-				"(size=%d)\n", sizeof(stlpanel_t));
+				"(size=%Zd)\n", sizeof(stlpanel_t));
 			break;
 		}
 		panelp->magic = STL_PANELMAGIC;
@@ -2879,8 +2860,7 @@
 	portp->stats.lflags = 0;
 	portp->stats.rxbuffered = 0;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&stallion_lock, flags);
 	if (portp->tty != (struct tty_struct *) NULL) {
 		if (portp->tty->driver_data == portp) {
 			portp->stats.ttystate = portp->tty->flags;
@@ -2894,7 +2874,7 @@
 			}
 		}
 	}
-	restore_flags(flags);
+	spin_unlock_irqrestore(&stallion_lock, flags);
 
 	head = portp->tx.head;
 	tail = portp->tx.tail;
@@ -3056,14 +3036,6 @@
 		return -1;
 
 /*
- *	Allocate a temporary write buffer.
- */
-	stl_tmpwritebuf = kmalloc(STL_TXBUFSIZE, GFP_KERNEL);
-	if (!stl_tmpwritebuf)
-		printk("STALLION: failed to allocate memory (size=%d)\n",
-			STL_TXBUFSIZE);
-
-/*
  *	Set up a character driver for per board stuff. This is mainly used
  *	to do stats ioctls on the ports.
  */
@@ -3147,11 +3119,13 @@
 	unsigned int	gfrcr;
 	int		chipmask, i, j;
 	int		nrchips, uartaddr, ioaddr;
+	unsigned long   flags;
 
 #ifdef DEBUG
 	printk("stl_panelinit(brdp=%x,panelp=%x)\n", (int) brdp, (int) panelp);
 #endif
 
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(panelp->brdnr, panelp->pagenr);
 
 /*
@@ -3189,6 +3163,7 @@
 	}
 
 	BRDDISABLE(panelp->brdnr);
+	spin_unlock_irqrestore(&brd_lock, flags);
 	return chipmask;
 }
 
@@ -3200,6 +3175,7 @@
 
 static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *portp)
 {
+	unsigned long flags;
 #ifdef DEBUG
 	printk("stl_cd1400portinit(brdp=%x,panelp=%x,portp=%x)\n",
 		(int) brdp, (int) panelp, (int) portp);
@@ -3209,6 +3185,7 @@
 	    (portp == (stlport_t *) NULL))
 		return;
 
+	spin_lock_irqsave(&brd_lock, flags);
 	portp->ioaddr = panelp->iobase + (((brdp->brdtype == BRD_ECHPCI) ||
 		(portp->portnr < 8)) ? 0 : EREG_BANKSIZE);
 	portp->uartaddr = (portp->portnr & 0x04) << 5;
@@ -3219,6 +3196,7 @@
 	stl_cd1400setreg(portp, LIVR, (portp->portnr << 3));
 	portp->hwid = stl_cd1400getreg(portp, GFRCR);
 	BRDDISABLE(portp->brdnr);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3428,8 +3406,7 @@
 		tiosp->c_cc[VSTART], tiosp->c_cc[VSTOP]);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x3));
 	srer = stl_cd1400getreg(portp, SRER);
@@ -3466,7 +3443,7 @@
 		portp->sigs &= ~TIOCM_CD;
 	stl_cd1400setreg(portp, SRER, ((srer & ~sreroff) | sreron));
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3492,8 +3469,7 @@
 	if (rts > 0)
 		msvr2 = MSVR2_RTS;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	if (rts >= 0)
@@ -3501,7 +3477,7 @@
 	if (dtr >= 0)
 		stl_cd1400setreg(portp, MSVR1, msvr1);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3520,14 +3496,13 @@
 	printk("stl_cd1400getsignals(portp=%x)\n", (int) portp);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	msvr1 = stl_cd1400getreg(portp, MSVR1);
 	msvr2 = stl_cd1400getreg(portp, MSVR2);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 
 	sigs = 0;
 	sigs |= (msvr1 & MSVR1_DCD) ? TIOCM_CD : 0;
@@ -3569,15 +3544,14 @@
 	else if (rx > 0)
 		ccr |= CCR_RXENABLE;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	stl_cd1400ccrwait(portp);
 	stl_cd1400setreg(portp, CCR, ccr);
 	stl_cd1400ccrwait(portp);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3609,8 +3583,7 @@
 	else if (rx > 0)
 		sreron |= SRER_RXDATA;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	stl_cd1400setreg(portp, SRER,
@@ -3618,7 +3591,7 @@
 	BRDDISABLE(portp->brdnr);
 	if (tx > 0)
 		set_bit(ASYI_TXBUSY, &portp->istate);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3634,13 +3607,12 @@
 #ifdef DEBUG
 	printk("stl_cd1400disableintrs(portp=%x)\n", (int) portp);
 #endif
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	stl_cd1400setreg(portp, SRER, 0);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3653,8 +3625,7 @@
 	printk("stl_cd1400sendbreak(portp=%x,len=%d)\n", (int) portp, len);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	stl_cd1400setreg(portp, SRER,
@@ -3664,7 +3635,7 @@
 	portp->brklen = len;
 	if (len == 1)
 		portp->stats.txbreaks++;
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3688,8 +3659,7 @@
 	if (tty == (struct tty_struct *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 
@@ -3729,7 +3699,7 @@
 	}
 
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3753,8 +3723,7 @@
 	if (tty == (struct tty_struct *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	if (state) {
@@ -3769,7 +3738,7 @@
 		stl_cd1400ccrwait(portp);
 	}
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3785,8 +3754,7 @@
 	if (portp == (stlport_t *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 	stl_cd1400ccrwait(portp);
@@ -3794,7 +3762,7 @@
 	stl_cd1400ccrwait(portp);
 	portp->tx.tail = portp->tx.head;
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3833,6 +3801,7 @@
 		(int) panelp, iobase);
 #endif
 
+	spin_lock(&brd_lock);
 	outb(SVRR, iobase);
 	svrtype = inb(iobase + EREG_DATA);
 	if (panelp->nrports > 4) {
@@ -3846,6 +3815,8 @@
 		stl_cd1400txisr(panelp, iobase);
 	else if (svrtype & SVRR_MDM)
 		stl_cd1400mdmisr(panelp, iobase);
+
+	spin_unlock(&brd_lock);
 }
 
 /*****************************************************************************/
@@ -4433,8 +4404,7 @@
 		tiosp->c_cc[VSTART], tiosp->c_cc[VSTOP]);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_sc26198setreg(portp, IMR, 0);
 	stl_sc26198updatereg(portp, MR0, mr0);
@@ -4461,7 +4431,7 @@
 	portp->imr = (portp->imr & ~imroff) | imron;
 	stl_sc26198setreg(portp, IMR, portp->imr);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4491,13 +4461,12 @@
 	else if (rts > 0)
 		iopioron |= IPR_RTS;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_sc26198setreg(portp, IOPIOR,
 		((stl_sc26198getreg(portp, IOPIOR) & ~iopioroff) | iopioron));
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4516,12 +4485,11 @@
 	printk("stl_sc26198getsignals(portp=%x)\n", (int) portp);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	ipr = stl_sc26198getreg(portp, IPR);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 
 	sigs = 0;
 	sigs |= (ipr & IPR_DCD) ? 0 : TIOCM_CD;
@@ -4558,13 +4526,12 @@
 	else if (rx > 0)
 		ccr |= CR_RXENABLE;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_sc26198setreg(portp, SCCR, ccr);
 	BRDDISABLE(portp->brdnr);
 	portp->crenable = ccr;
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4593,15 +4560,14 @@
 	else if (rx > 0)
 		imr |= IR_RXRDY | IR_RXBREAK | IR_RXWATCHDOG;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_sc26198setreg(portp, IMR, imr);
 	BRDDISABLE(portp->brdnr);
 	portp->imr = imr;
 	if (tx > 0)
 		set_bit(ASYI_TXBUSY, &portp->istate);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4618,13 +4584,12 @@
 	printk("stl_sc26198disableintrs(portp=%x)\n", (int) portp);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	portp->imr = 0;
 	stl_sc26198setreg(portp, IMR, 0);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4637,8 +4602,7 @@
 	printk("stl_sc26198sendbreak(portp=%x,len=%d)\n", (int) portp, len);
 #endif
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	if (len == 1) {
 		stl_sc26198setreg(portp, SCCR, CR_TXSTARTBREAK);
@@ -4647,7 +4611,7 @@
 		stl_sc26198setreg(portp, SCCR, CR_TXSTOPBREAK);
 	}
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4672,8 +4636,7 @@
 	if (tty == (struct tty_struct *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 
 	if (state) {
@@ -4719,7 +4682,7 @@
 	}
 
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4744,8 +4707,7 @@
 	if (tty == (struct tty_struct *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	if (state) {
 		mr0 = stl_sc26198getreg(portp, MR0);
@@ -4765,7 +4727,7 @@
 		stl_sc26198setreg(portp, MR0, mr0);
 	}
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4781,14 +4743,13 @@
 	if (portp == (stlport_t *) NULL)
 		return;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	stl_sc26198setreg(portp, SCCR, CR_TXRESET);
 	stl_sc26198setreg(portp, SCCR, portp->crenable);
 	BRDDISABLE(portp->brdnr);
 	portp->tx.tail = portp->tx.head;
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4815,12 +4776,11 @@
 	if (test_bit(ASYI_TXBUSY, &portp->istate))
 		return 1;
 
-	save_flags(flags);
-	cli();
+	spin_lock_irqsave(&brd_lock, flags);
 	BRDENABLE(portp->brdnr, portp->pagenr);
 	sr = stl_sc26198getreg(portp, SR);
 	BRDDISABLE(portp->brdnr);
-	restore_flags(flags);
+	spin_unlock_irqrestore(&brd_lock, flags);
 
 	return (sr & SR_TXEMPTY) ? 0 : 1;
 }
@@ -4878,6 +4838,8 @@
 	stlport_t	*portp;
 	unsigned int	iack;
 
+	spin_lock(&brd_lock);
+
 /* 
  *	Work around bug in sc26198 chip... Cannot have A6 address
  *	line of UART high, else iack will be returned as 0.
@@ -4893,6 +4855,8 @@
 		stl_sc26198txisr(portp);
 	else
 		stl_sc26198otherisr(portp, iack);
+
+	spin_unlock(&brd_lock);
 }
 
 /*****************************************************************************/
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 3b47472..76b9107 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -2320,7 +2320,7 @@
 #ifdef NEW_WRITE_LOCKING
 			port->gs.port_write_mutex = MUTEX;
 #endif
-			port->gs.driver_lock = SPIN_LOCK_UNLOCKED;
+			spin_lock_init(&port->gs.driver_lock);
 			/*
 			 * Initializing wait queue
 			 */
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 8b2a599..bd74e82 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2621,10 +2621,9 @@
 			tty->driver->break_ctl(tty, 0);
 			return 0;
 		case TCSBRK:   /* SVID version: non-zero arg --> no break */
-			/*
-			 * XXX is the above comment correct, or the
-			 * code below correct?  Is this ioctl used at
-			 * all by anyone?
+			/* non-zero arg means wait for all output data
+			 * to be sent (performed above) but don't send break.
+			 * This is used by the tcdrain() termios function.
 			 */
 			if (!arg)
 				return send_break(tty, 250);
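The rewritten TCSBRK comment above refers to the userspace side: on Linux, tcdrain() is typically implemented as ioctl(fd, TCSBRK, 1), i.e. wait for queued output without sending a break. A minimal userspace illustration; the device path is hypothetical:

#include <fcntl.h>
#include <termios.h>
#include <unistd.h>

int send_and_drain(const char *path)
{
	int fd = open(path, O_WRONLY | O_NOCTTY);	/* e.g. "/dev/ttyS0" */

	if (fd < 0)
		return -1;
	write(fd, "AT\r", 3);
	tcdrain(fd);	/* returns once the driver reports all output sent */
	close(fd);
	return 0;
}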
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44d1eca..35e0b9c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1497,6 +1497,7 @@
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
+#ifdef CONFIG_HOTPLUG_CPU
 static int cpufreq_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
@@ -1532,10 +1533,11 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cpufreq_cpu_notifier =
+static struct notifier_block __cpuinitdata cpufreq_cpu_notifier =
 {
     .notifier_call = cpufreq_cpu_callback,
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
@@ -1596,7 +1598,7 @@
 	}
 
 	if (!ret) {
-		register_cpu_notifier(&cpufreq_cpu_notifier);
+		register_hotcpu_notifier(&cpufreq_cpu_notifier);
 		dprintk("driver %s up and running\n", driver_data->name);
 		cpufreq_debug_enable_ratelimit();
 	}
@@ -1628,7 +1630,7 @@
 	dprintk("unregistering driver %s\n", driver->name);
 
 	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
-	unregister_cpu_notifier(&cpufreq_cpu_notifier);
+	unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
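A minimal sketch, with hypothetical names, of the CPU-hotplug notifier shape the two cpufreq hunks above convert to; it assumes a kernel of this vintage, where __cpuinit/__cpuinitdata allow the callback and notifier block to be discarded when CPU hotplug support is not configured:

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int __cpuinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		pr_debug("example: cpu %u came online\n", cpu);
		break;
	case CPU_DEAD:
		pr_debug("example: cpu %u went offline\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata example_cpu_notifier = {
	.notifier_call = example_cpu_callback,
};

static int __init example_init(void)
{
	register_hotcpu_notifier(&example_cpu_notifier);
	return 0;
}
module_init(example_init);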
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index c576c0b..145061b 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -350,7 +350,7 @@
 		return ret;
 	}
 
-	register_cpu_notifier(&cpufreq_stat_cpu_notifier);
+	register_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
 	lock_cpu_hotplug();
 	for_each_online_cpu(cpu) {
 		cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE,
@@ -368,7 +368,7 @@
 			CPUFREQ_POLICY_NOTIFIER);
 	cpufreq_unregister_notifier(&notifier_trans_block,
 			CPUFREQ_TRANSITION_NOTIFIER);
-	unregister_cpu_notifier(&cpufreq_stat_cpu_notifier);
+	unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
 	lock_cpu_hotplug();
 	for_each_online_cpu(cpu) {
 		cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD,
diff --git a/drivers/input/input.c b/drivers/input/input.c
index de2e754..a90486f 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -998,12 +998,13 @@
 	sysfs_remove_group(&dev->cdev.kobj, &input_dev_caps_attr_group);
 	sysfs_remove_group(&dev->cdev.kobj, &input_dev_id_attr_group);
 	sysfs_remove_group(&dev->cdev.kobj, &input_dev_attr_group);
-	class_device_unregister(&dev->cdev);
 
 	mutex_lock(&dev->mutex);
 	dev->name = dev->phys = dev->uniq = NULL;
 	mutex_unlock(&dev->mutex);
 
+	class_device_unregister(&dev->cdev);
+
 	input_wakeup_procfs_readers();
 }
 EXPORT_SYMBOL(input_unregister_device);
diff --git a/drivers/isdn/gigaset/common.c b/drivers/isdn/gigaset/common.c
index acb7e26..2a56bf3 100644
--- a/drivers/isdn/gigaset/common.c
+++ b/drivers/isdn/gigaset/common.c
@@ -981,7 +981,7 @@
 EXPORT_SYMBOL_GPL(gigaset_stop);
 
 static LIST_HEAD(drivers);
-static spinlock_t driver_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(driver_lock);
 
 struct cardstate *gigaset_get_cs_by_id(int id)
 {
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index 2ac9024..433389d 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -82,7 +82,7 @@
 						int l = skb->len;
 						unsigned char *dp = skb->data;
 						while (--l) {
-							if (*skb->data == DLE)
+							if (*dp == DLE)
 								tty_insert_flip_char(tty, DLE, 0);
 							tty_insert_flip_char(tty, *dp++, 0);
 						}
diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c
index fe65413..9b015f9 100644
--- a/drivers/leds/led-core.c
+++ b/drivers/leds/led-core.c
@@ -18,7 +18,7 @@
 #include <linux/leds.h>
 #include "leds.h"
 
-rwlock_t leds_list_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(leds_list_lock);
 LIST_HEAD(leds_list);
 
 EXPORT_SYMBOL_GPL(leds_list);
diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index 5e2cd8b..1b1ce65 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -26,7 +26,7 @@
 /*
  * Nests outside led_cdev->trigger_lock
  */
-static rwlock_t triggers_list_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(triggers_list_lock);
 static LIST_HEAD(trigger_list);
 
 ssize_t led_trigger_store(struct class_device *dev, const char *buf,
diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c
index 74714e5..3ff8378 100644
--- a/drivers/message/fusion/mptfc.c
+++ b/drivers/message/fusion/mptfc.c
@@ -305,10 +305,8 @@
 	}
 
  out:
-	if (pp0_array)
-		kfree(pp0_array);
-	if (p0_array)
-		kfree(p0_array);
+	kfree(pp0_array);
+	kfree(p0_array);
 	return rc;
 }
 
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index af6ec55..85689ab 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1378,8 +1378,7 @@
 	return 0;
 
  out_free_port_info:
-	if (hba)
-		kfree(hba);
+	kfree(hba);
  out:
 	return error;
 }
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index febbdd4..c74e546 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -1239,7 +1239,6 @@
 EXPORT_SYMBOL(i2o_cntxt_list_get_ptr);
 #endif
 EXPORT_SYMBOL(i2o_msg_get_wait);
-EXPORT_SYMBOL(i2o_msg_nop);
 EXPORT_SYMBOL(i2o_find_iop);
 EXPORT_SYMBOL(i2o_iop_find_device);
 EXPORT_SYMBOL(i2o_event_register);
diff --git a/drivers/misc/ibmasm/module.c b/drivers/misc/ibmasm/module.c
index 1fdf03f..9706cc1 100644
--- a/drivers/misc/ibmasm/module.c
+++ b/drivers/misc/ibmasm/module.c
@@ -85,7 +85,7 @@
 	}
 	memset(sp, 0, sizeof(struct service_processor));
 
-	sp->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&sp->lock);
 	INIT_LIST_HEAD(&sp->command_queue);
 
 	pci_set_drvdata(pdev, (void *)sp);
diff --git a/drivers/net/fs_enet/fs_enet-mii.c b/drivers/net/fs_enet/fs_enet-mii.c
index c677037..0cd0715 100644
--- a/drivers/net/fs_enet/fs_enet-mii.c
+++ b/drivers/net/fs_enet/fs_enet-mii.c
@@ -431,8 +431,7 @@
 	return bus;
 
 err:
-	if (bus)
-		kfree(bus);
+	kfree(bus);
 	return ERR_PTR(ret);
 }
 
diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 081a899..a8a8f97 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -1229,12 +1229,6 @@
 	return error;
 }
 
-static void ipw_free_error_log(struct ipw_fw_error *error)
-{
-	if (error)
-		kfree(error);
-}
-
 static ssize_t show_event_log(struct device *d,
 			      struct device_attribute *attr, char *buf)
 {
@@ -1296,10 +1290,9 @@
 			   const char *buf, size_t count)
 {
 	struct ipw_priv *priv = dev_get_drvdata(d);
-	if (priv->error) {
-		ipw_free_error_log(priv->error);
-		priv->error = NULL;
-	}
+
+	kfree(priv->error);
+	priv->error = NULL;
 	return count;
 }
 
@@ -1970,8 +1963,7 @@
 				struct ipw_fw_error *error =
 				    ipw_alloc_error_log(priv);
 				ipw_dump_error_log(priv, error);
-				if (error)
-					ipw_free_error_log(error);
+				kfree(error);
 			}
 #endif
 		} else {
@@ -11693,10 +11685,8 @@
 		}
 	}
 
-	if (priv->error) {
-		ipw_free_error_log(priv->error);
-		priv->error = NULL;
-	}
+	kfree(priv->error);
+	priv->error = NULL;
 
 #ifdef CONFIG_IPW2200_PROMISCUOUS
 	ipw_prom_free(priv);
diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 0e07d95..d0f68ab 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -157,7 +157,7 @@
 
 static int pcmcia_schlvl = PCMCIA_SCHLVL;
 
-static spinlock_t events_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(events_lock);
 
 
 #define PCMCIA_SOCKET_KEY_5V 1
@@ -644,7 +644,7 @@
 };
 
 static u32 pending_events[PCMCIA_SOCKETS_NO];
-static spinlock_t pending_event_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pending_event_lock);
 
 static irqreturn_t m8xx_interrupt(int irq, void *dev, struct pt_regs *regs)
 {
diff --git a/drivers/rapidio/rio-access.c b/drivers/rapidio/rio-access.c
index b9fab2a..8b56bbd 100644
--- a/drivers/rapidio/rio-access.c
+++ b/drivers/rapidio/rio-access.c
@@ -17,8 +17,8 @@
  * These interrupt-safe spinlocks protect all accesses to RIO
  * configuration space and doorbell access.
  */
-static spinlock_t rio_config_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t rio_doorbell_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rio_config_lock);
+static DEFINE_SPINLOCK(rio_doorbell_lock);
 
 /*
  *  Wrappers for all RIO configuration access functions.  They just check
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 5396bee..1cb61a7 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -94,7 +94,9 @@
 	kfree(rtc);
 
 exit_idr:
+	mutex_lock(&idr_lock);
 	idr_remove(&rtc_idr, id);
+	mutex_unlock(&idr_lock);
 
 exit:
 	dev_err(dev, "rtc core: unable to register %s, err = %d\n",
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index ecafbad..762521a 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -226,7 +226,7 @@
 	struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
 
 	if (pdata->irq < 0)
-		return -ENOIOCTLCMD;
+		return -ENOIOCTLCMD; /* fall back into rtc-dev's emulation */
 	switch (cmd) {
 	case RTC_AIE_OFF:
 		pdata->irqen &= ~RTC_AF;
diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c
index ab486fb..9cd1cb3 100644
--- a/drivers/rtc/rtc-sa1100.c
+++ b/drivers/rtc/rtc-sa1100.c
@@ -45,7 +45,7 @@
 
 static unsigned long rtc_freq = 1024;
 static struct rtc_time rtc_alarm;
-static spinlock_t sa1100_rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sa1100_rtc_lock);
 
 static int rtc_update_alarm(struct rtc_time *alrm)
 {
diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c
index 33e0292..4b9291d 100644
--- a/drivers/rtc/rtc-vr41xx.c
+++ b/drivers/rtc/rtc-vr41xx.c
@@ -93,7 +93,7 @@
 
 static unsigned long epoch = 1970;	/* Jan 1 1970 00:00:00 */
 
-static spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rtc_lock);
 static char rtc_name[] = "RTC";
 static unsigned long periodic_frequency;
 static unsigned long periodic_count;
diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c
index 2d946b6..2d8af70 100644
--- a/drivers/s390/block/dasd_eer.c
+++ b/drivers/s390/block/dasd_eer.c
@@ -89,7 +89,7 @@
 };
 
 static LIST_HEAD(bufferlist);
-static spinlock_t bufferlock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bufferlock);
 static DECLARE_WAIT_QUEUE_HEAD(dasd_eer_read_wait_queue);
 
 /*
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 6c66877..855ce9a 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -5733,7 +5733,7 @@
 module_exit(ata_exit);
 
 static unsigned long ratelimit_time;
-static spinlock_t ata_ratelimit_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ata_ratelimit_lock);
 
 int ata_ratelimit(void)
 {
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 93d18a7..2915bca 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -222,9 +222,7 @@
 	 && copy_to_user(arg + sizeof(args), argbuf, argsize))
 		rc = -EFAULT;
 error:
-	if (argbuf)
-		kfree(argbuf);
-
+	kfree(argbuf);
 	return rc;
 }
 
diff --git a/drivers/sn/ioc3.c b/drivers/sn/ioc3.c
index 501316b..ed94631 100644
--- a/drivers/sn/ioc3.c
+++ b/drivers/sn/ioc3.c
@@ -26,7 +26,7 @@
 
 static struct ioc3_submodule *ioc3_submodules[IOC3_MAX_SUBMODULES];
 static struct ioc3_submodule *ioc3_ethernet;
-static rwlock_t ioc3_submodules_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(ioc3_submodules_lock);
 
 /* NIC probing code */
 
diff --git a/drivers/video/au1100fb.c b/drivers/video/au1100fb.c
index d63c3f4..9ef68cd 100644
--- a/drivers/video/au1100fb.c
+++ b/drivers/video/au1100fb.c
@@ -743,8 +743,7 @@
 {
 	driver_unregister(&au1100fb_driver);
 
-	if (drv_info.opt_mode)
-		kfree(drv_info.opt_mode);
+	kfree(drv_info.opt_mode);
 }
 
 module_init(au1100fb_init);
diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c
index a71e984..ffc72ae 100644
--- a/drivers/video/backlight/hp680_bl.c
+++ b/drivers/video/backlight/hp680_bl.c
@@ -27,7 +27,7 @@
 
 static int hp680bl_suspended;
 static int current_intensity = 0;
-static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bl_lock);
 static struct backlight_device *hp680_backlight_device;
 
 static void hp680bl_send_intensity(struct backlight_device *bd)
diff --git a/fs/buffer.c b/fs/buffer.c
index 373bb62..f23bb64 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -564,7 +564,7 @@
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	char b[BDEVNAME_SIZE];
 	unsigned long flags;
@@ -3166,7 +3166,6 @@
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5c..8c9b28d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
 #include <linux/suspend.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
+#include <linux/poison.h>
 #include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@
 {
 #ifdef CONFIG_JBD_DEBUG
 	atomic_dec(&nr_journal_heads);
-	memset(jh, 0x5b, sizeof(*jh));
+	memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
 	kmem_cache_free(journal_head_cache, jh);
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 402005c..8ca9707 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -909,7 +909,7 @@
  * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
  *
  */
-void __exit nfs_destroy_directcache(void)
+void nfs_destroy_directcache(void)
 {
 	if (kmem_cache_destroy(nfs_direct_cachep))
 		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 51bc88b..c5b9166 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1132,7 +1132,7 @@
 	return 0;
 }
 
-static void __exit nfs_destroy_inodecache(void)
+static void nfs_destroy_inodecache(void)
 {
 	if (kmem_cache_destroy(nfs_inode_cachep))
 		printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bd2815e..4fe51c1 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -31,15 +31,15 @@
 
 /* pagelist.c */
 extern int __init nfs_init_nfspagecache(void);
-extern void __exit nfs_destroy_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
 extern int __init nfs_init_readpagecache(void);
-extern void __exit nfs_destroy_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
 extern int __init nfs_init_writepagecache(void);
-extern void __exit nfs_destroy_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
 
 #ifdef CONFIG_NFS_DIRECTIO
 extern int __init nfs_init_directcache(void);
-extern void __exit nfs_destroy_directcache(void);
+extern void nfs_destroy_directcache(void);
 #else
 #define nfs_init_directcache() (0)
 #define nfs_destroy_directcache() do {} while(0)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ef94296..d89f6fb 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -390,7 +390,7 @@
 	return 0;
 }
 
-void __exit nfs_destroy_nfspagecache(void)
+void nfs_destroy_nfspagecache(void)
 {
 	if (kmem_cache_destroy(nfs_page_cachep))
 		printk(KERN_INFO "nfs_page: not all structures were freed\n");
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 41c2ffe..32cf377 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -711,7 +711,7 @@
 	return 0;
 }
 
-void __exit nfs_destroy_readpagecache(void)
+void nfs_destroy_readpagecache(void)
 {
 	mempool_destroy(nfs_rdata_mempool);
 	if (kmem_cache_destroy(nfs_rdata_cachep))
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b383fdd..8fccb9c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1551,7 +1551,7 @@
 	return 0;
 }
 
-void __exit nfs_destroy_writepagecache(void)
+void nfs_destroy_writepagecache(void)
 {
 	mempool_destroy(nfs_commit_mempool);
 	mempool_destroy(nfs_wdata_mempool);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1630b56..7c7d016 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -123,7 +123,7 @@
  */
 
 /* recall_lock protects the del_recall_lru */
-static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
 static void
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21f38ac..1d26cfc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -54,7 +54,7 @@
  * multiple hb threads are watching multiple regions.  A node is live
  * whenever any of the threads sees activity from the node in its region.
  */
-static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(o2hb_live_lock);
 static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
 static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f60cc0..1591eb3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@
 	    ##args);							\
 } while (0)
 
-static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(o2net_handler_lock);
 static struct rb_root o2net_handler_tree = RB_ROOT;
 
 static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index ba27c5c..b8c23f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -88,7 +88,7 @@
  *
  */
 
-spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dlm_domain_lock);
 LIST_HEAD(dlm_domains);
 static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d6f8957..5ca57ec 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,7 +53,7 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
-static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 
 static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index da39901..29b2845 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -98,8 +98,8 @@
 
 static u64 dlm_get_next_mig_cookie(void);
 
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 
 static u64 dlm_get_next_mig_cookie(void)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd528..4acd372 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -242,7 +242,7 @@
 	mlog_exit_void();
 }
 
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
 				       struct ocfs2_dlm_debug *dlm_debug)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 3fe8781..910a601 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -49,7 +49,7 @@
 
 #include "buffer_head_io.h"
 
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index ee42765..cf70fe2 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -988,9 +988,7 @@
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
@@ -1021,9 +1019,7 @@
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0137ec4..0a163a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,6 +122,11 @@
 	unsigned long private_dirty;
 };
 
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
@@ -158,22 +163,23 @@
 		pad_len_spaces(m, len);
 		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
 	} else {
-		if (mm) {
-			if (vma->vm_start <= mm->start_brk &&
+		const char *name = arch_vma_name(vma);
+		if (!name) {
+			if (mm) {
+				if (vma->vm_start <= mm->start_brk &&
 						vma->vm_end >= mm->brk) {
-				pad_len_spaces(m, len);
-				seq_puts(m, "[heap]");
-			} else {
-				if (vma->vm_start <= mm->start_stack &&
-					vma->vm_end >= mm->start_stack) {
-
-					pad_len_spaces(m, len);
-					seq_puts(m, "[stack]");
+					name = "[heap]";
+				} else if (vma->vm_start <= mm->start_stack &&
+					   vma->vm_end >= mm->start_stack) {
+					name = "[stack]";
 				}
+			} else {
+				name = "[vdso]";
 			}
-		} else {
+		}
+		if (name) {
 			pad_len_spaces(m, len);
-			seq_puts(m, "[vdso]");
+			seq_puts(m, name);
 		}
 	}
 	seq_putc(m, '\n');
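
The weak arch_vma_name() default added above lets an architecture provide names for special mappings such as the vDSO, which ties in with the i386 VDSO changes later in this merge. A hedged sketch of what an arch-side override might look like, assuming the arch stores the vDSO base in mm->context.vdso as the i386 mmu.h change here does:

	const char *arch_vma_name(struct vm_area_struct *vma)
	{
		if (vma->vm_mm &&
		    vma->vm_start == (unsigned long)vma->vm_mm->context.vdso)
			return "[vdso]";
		return NULL;
	}
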
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index f2dbdf5..259bd19 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -605,39 +605,12 @@
 				   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
 }
 
-void ufs_read_inode (struct inode * inode)
+static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
-	struct ufs_inode * ufs_inode;	
-	struct ufs2_inode *ufs2_inode;
-	struct buffer_head * bh;
+	struct super_block *sb = inode->i_sb;
 	mode_t mode;
 	unsigned i;
-	unsigned flags;
-	
-	UFSD("ENTER, ino %lu\n", inode->i_ino);
-	
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-	flags = UFS_SB(sb)->s_flags;
-
-	if (inode->i_ino < UFS_ROOTINO || 
-	    inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
-		ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
-		goto bad_inode;
-	}
-	
-	bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
-	if (!bh) {
-		ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
-		goto bad_inode;
-	}
-	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-		goto ufs2_inode;
-
-	ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino));
 
 	/*
 	 * Copy data to the in-core inode.
@@ -661,14 +634,11 @@
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
-	inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size (for stat) */
-	inode->i_version++;
 	ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
 	ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen);
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
-	ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
-	ufsi->i_dir_start_lookup = 0;
+
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
@@ -677,24 +647,16 @@
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
 	}
-	ufsi->i_osync = 0;
+}
 
-	ufs_set_inode_ops(inode);
+static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	mode_t mode;
+	unsigned i;
 
-	brelse (bh);
-
-	UFSD("EXIT\n");
-	return;
-
-bad_inode:
-	make_bad_inode(inode);
-	return;
-
-ufs2_inode :
 	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
-
-	ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
-
 	/*
 	 * Copy data to the in-core inode.
 	 */
@@ -717,26 +679,64 @@
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
-	inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/
-
-	inode->i_version++;
 	ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
 	ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen);
 	/*
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
 	*/
-	ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift;
 
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
 			ufsi->i_u1.u2_i_data[i] =
 				ufs2_inode->ui_u2.ui_addr.ui_db[i];
-	}
-	else {
+	} else {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
 	}
+}
+
+void ufs_read_inode(struct inode * inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct buffer_head * bh;
+
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
+
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+
+	if (inode->i_ino < UFS_ROOTINO ||
+	    inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
+		ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
+			    inode->i_ino);
+		goto bad_inode;
+	}
+
+	bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
+	if (!bh) {
+		ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
+			    inode->i_ino);
+		goto bad_inode;
+	}
+	if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
+
+		ufs2_read_inode(inode,
+				ufs2_inode + ufs_inotofsbo(inode->i_ino));
+	} else {
+		struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
+
+		ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
+	}
+
+	inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/
+	inode->i_version++;
+	ufsi->i_lastfrag =
+		(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	ufsi->i_dir_start_lookup = 0;
 	ufsi->i_osync = 0;
 
 	ufs_set_inode_ops(inode);
@@ -745,6 +745,9 @@
 
 	UFSD("EXIT\n");
 	return;
+
+bad_inode:
+	make_bad_inode(inode);
 }
 
 static int ufs_update_inode(struct inode * inode, int do_sync)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10dbf20..ed7579b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1721,15 +1721,14 @@
  * is present to prevent thrashing).
  */
 
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * hot-plug CPU notifier support.
  *
- * We cannot use the hotcpu_register() function because it does
- * not allow notifier instances. We need a notifier per filesystem
- * as we need to be able to identify the filesystem to balance
- * the counters out. This is achieved by having a notifier block
- * embedded in the xfs_mount_t and doing pointer magic to get the
- * mount pointer from the notifier block address.
+ * We need a notifier per filesystem as we need to be able to identify
+ * the filesystem to balance the counters out. This is achieved by
+ * having a notifier block embedded in the xfs_mount_t and doing pointer
+ * magic to get the mount pointer from the notifier block address.
  */
 STATIC int
 xfs_icsb_cpu_notify(
@@ -1779,6 +1778,7 @@
 
 	return NOTIFY_OK;
 }
+#endif /* CONFIG_HOTPLUG_CPU */
 
 int
 xfs_icsb_init_counters(
@@ -1791,9 +1791,11 @@
 	if (mp->m_sb_cnts == NULL)
 		return -ENOMEM;
 
+#ifdef CONFIG_HOTPLUG_CPU
 	mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
 	mp->m_icsb_notifier.priority = 0;
-	register_cpu_notifier(&mp->m_icsb_notifier);
+	register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
 
 	for_each_online_cpu(i) {
 		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
@@ -1812,7 +1814,7 @@
 	xfs_mount_t	*mp)
 {
 	if (mp->m_sb_cnts) {
-		unregister_cpu_notifier(&mp->m_icsb_notifier);
+		unregister_hotcpu_notifier(&mp->m_icsb_notifier);
 		free_percpu(mp->m_sb_cnts);
 	}
 }
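
The "pointer magic" mentioned in the comment above is just container_of() on the embedded notifier block, and register_hotcpu_notifier()/unregister_hotcpu_notifier() (added to linux/cpu.h later in this merge) compile away when CPU hotplug is disabled. A minimal sketch under those assumptions, with illustrative my_* names:

	struct my_mount {
		struct notifier_block	cpu_notifier;
		/* per-cpu counters ... */
	};

	static int my_cpu_notify(struct notifier_block *nb,
				 unsigned long action, void *hcpu)
	{
		struct my_mount *mp =
			container_of(nb, struct my_mount, cpu_notifier);

		/* rebalance mp's per-cpu counters for the CPU in hcpu ... */
		return NOTIFY_OK;
	}

	static void my_mount_init_counters(struct my_mount *mp)
	{
		mp->cpu_notifier.notifier_call = my_cpu_notify;
		mp->cpu_notifier.priority = 0;
		register_hotcpu_notifier(&mp->cpu_notifier);
	}
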
diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h
index dba70c6..457c34b 100644
--- a/include/asm-alpha/core_t2.h
+++ b/include/asm-alpha/core_t2.h
@@ -435,7 +435,7 @@
 	set_hae(msb); \
 }
 
-static spinlock_t t2_hae_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(t2_hae_lock);
 
 __EXTERN_INLINE u8 t2_readb(const volatile void __iomem *xaddr)
 {
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 845cb67..8ceab7b 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -51,4 +51,10 @@
 	__ret;						\
 })
 
+#ifdef CONFIG_SMP
+# define WARN_ON_SMP(x)			WARN_ON(x)
+#else
+# define WARN_ON_SMP(x)			do { } while (0)
+#endif
+
 #endif
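
WARN_ON_SMP() is intended for assertions that are only meaningful on SMP builds, typically lock-held checks that would always trigger on UP where spinlocks compile away to nothing. For example (the lock name is illustrative):

	WARN_ON_SMP(!spin_is_locked(&my_lock));
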
diff --git a/include/asm-i386/cpu.h b/include/asm-i386/cpu.h
index e7252c2..b1bc7b1 100644
--- a/include/asm-i386/cpu.h
+++ b/include/asm-i386/cpu.h
@@ -7,8 +7,6 @@
 #include <linux/nodemask.h>
 #include <linux/percpu.h>
 
-#include <asm/node.h>
-
 struct i386_cpu {
 	struct cpu cpu;
 };
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 4153d80..1eac92c 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -10,6 +10,7 @@
 #include <asm/processor.h>
 #include <asm/system.h>		/* for savesegment */
 #include <asm/auxvec.h>
+#include <asm/desc.h>
 
 #include <linux/utsname.h>
 
@@ -129,15 +130,41 @@
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
 
-#define VSYSCALL_BASE	(__fix_to_virt(FIX_VSYSCALL))
-#define VSYSCALL_EHDR	((const struct elfhdr *) VSYSCALL_BASE)
-#define VSYSCALL_ENTRY	((unsigned long) &__kernel_vsyscall)
+#define VDSO_HIGH_BASE		(__fix_to_virt(FIX_VDSO))
+#define VDSO_BASE		((unsigned long)current->mm->context.vdso)
+
+#ifdef CONFIG_COMPAT_VDSO
+# define VDSO_COMPAT_BASE	VDSO_HIGH_BASE
+# define VDSO_PRELINK		VDSO_HIGH_BASE
+#else
+# define VDSO_COMPAT_BASE	VDSO_BASE
+# define VDSO_PRELINK		0
+#endif
+
+#define VDSO_COMPAT_SYM(x) \
+		(VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_SYM(x) \
+		(VDSO_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_HIGH_EHDR		((const struct elfhdr *) VDSO_HIGH_BASE)
+#define VDSO_EHDR		((const struct elfhdr *) VDSO_COMPAT_BASE)
+
 extern void __kernel_vsyscall;
 
+#define VDSO_ENTRY		VDSO_SYM(&__kernel_vsyscall)
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                       int executable_stack);
+
+extern unsigned int vdso_enabled;
+
 #define ARCH_DLINFO						\
-do {								\
-		NEW_AUX_ENT(AT_SYSINFO,	VSYSCALL_ENTRY);	\
-		NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE);	\
+do if (vdso_enabled) {						\
+		NEW_AUX_ENT(AT_SYSINFO,	VDSO_ENTRY);		\
+		NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE);	\
 } while (0)
 
 /*
@@ -148,15 +175,15 @@
  * Dumping its extra ELF program headers includes all the other information
  * a debugger needs to easily find how the vsyscall DSO was being used.
  */
-#define ELF_CORE_EXTRA_PHDRS		(VSYSCALL_EHDR->e_phnum)
+#define ELF_CORE_EXTRA_PHDRS		(VDSO_HIGH_EHDR->e_phnum)
 #define ELF_CORE_WRITE_EXTRA_PHDRS					      \
 do {									      \
 	const struct elf_phdr *const vsyscall_phdrs =			      \
-		(const struct elf_phdr *) (VSYSCALL_BASE		      \
-					   + VSYSCALL_EHDR->e_phoff);	      \
+		(const struct elf_phdr *) (VDSO_HIGH_BASE		      \
+					   + VDSO_HIGH_EHDR->e_phoff);    \
 	int i;								      \
 	Elf32_Off ofs = 0;						      \
-	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {			      \
+	for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {		      \
 		struct elf_phdr phdr = vsyscall_phdrs[i];		      \
 		if (phdr.p_type == PT_LOAD) {				      \
 			BUG_ON(ofs != 0);				      \
@@ -174,10 +201,10 @@
 #define ELF_CORE_WRITE_EXTRA_DATA					      \
 do {									      \
 	const struct elf_phdr *const vsyscall_phdrs =			      \
-		(const struct elf_phdr *) (VSYSCALL_BASE		      \
-					   + VSYSCALL_EHDR->e_phoff);	      \
+		(const struct elf_phdr *) (VDSO_HIGH_BASE		      \
+					   + VDSO_HIGH_EHDR->e_phoff);    \
 	int i;								      \
-	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {			      \
+	for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {		      \
 		if (vsyscall_phdrs[i].p_type == PT_LOAD)		      \
 			DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr,	      \
 				   PAGE_ALIGN(vsyscall_phdrs[i].p_memsz));    \
diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
index f7e068f..a48cc3f 100644
--- a/include/asm-i386/fixmap.h
+++ b/include/asm-i386/fixmap.h
@@ -51,7 +51,7 @@
  */
 enum fixed_addresses {
 	FIX_HOLE,
-	FIX_VSYSCALL,
+	FIX_VDSO,
 #ifdef CONFIG_X86_LOCAL_APIC
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 #endif
@@ -115,14 +115,6 @@
 #define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
 #define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
 
-/*
- * This is the range that is readable by user mode, and things
- * acting like user mode such as get_user_pages.
- */
-#define FIXADDR_USER_START	(__fix_to_virt(FIX_VSYSCALL))
-#define FIXADDR_USER_END	(FIXADDR_USER_START + PAGE_SIZE)
-
-
 extern void __this_fixmap_does_not_exist(void);
 
 /*
diff --git a/include/asm-i386/mmu.h b/include/asm-i386/mmu.h
index f431a0b..8358dd3 100644
--- a/include/asm-i386/mmu.h
+++ b/include/asm-i386/mmu.h
@@ -12,6 +12,7 @@
 	int size;
 	struct semaphore sem;
 	void *ldt;
+	void *vdso;
 } mm_context_t;
 
 #endif
diff --git a/include/asm-i386/node.h b/include/asm-i386/node.h
deleted file mode 100644
index e13c6ff..0000000
--- a/include/asm-i386/node.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_I386_NODE_H_
-#define _ASM_I386_NODE_H_
-
-#include <linux/device.h>
-#include <linux/mmzone.h>
-#include <linux/node.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-struct i386_node {
-	struct node node;
-};
-extern struct i386_node node_devices[MAX_NUMNODES];
-
-static inline int arch_register_node(int num){
-	int p_node;
-	struct node *parent = NULL;
-
-	if (!node_online(num))
-		return 0;
-	p_node = parent_node(num);
-
-	if (p_node != num)
-		parent = &node_devices[p_node].node;
-
-	return register_node(&node_devices[num].node, num, parent);
-}
-
-#endif /* _ASM_I386_NODE_H_ */
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index e3a552f..f5bf544 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -96,6 +96,8 @@
 
 #ifndef __ASSEMBLY__
 
+struct vm_area_struct;
+
 /*
  * This much address space is reserved for vmalloc() and iomap()
  * as well as fixmap mappings.
@@ -139,6 +141,7 @@
 #include <asm-generic/memory_model.h>
 #include <asm-generic/page.h>
 
+#define __HAVE_ARCH_GATE_AREA 1
 #endif /* __KERNEL__ */
 
 #endif /* _I386_PAGE_H */
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 55ea992..b32346d 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -71,8 +71,12 @@
 	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
 #endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
-	unsigned char booted_cores;	/* number of cores as seen by OS */
 	unsigned char apicid;
+#ifdef CONFIG_SMP
+	unsigned char booted_cores;	/* number of cores as seen by OS */
+	__u8 phys_proc_id; 		/* Physical processor id. */
+	__u8 cpu_core_id;  		/* Core id */
+#endif
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL 0
@@ -104,8 +108,6 @@
 #define current_cpu_data boot_cpu_data
 #endif
 
-extern	int phys_proc_id[NR_CPUS];
-extern	int cpu_core_id[NR_CPUS];
 extern	int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index fdbc7f4..2833fa2 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -37,6 +37,7 @@
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
+	void			*sysenter_return;
 	struct restart_block    restart_block;
 
 	unsigned long           previous_esp;   /* ESP of the previous stack in case
@@ -83,17 +84,15 @@
 #define init_stack		(init_thread_union.stack)
 
 
+/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __attribute_used__;
+
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
 {
-	struct thread_info *ti;
-	__asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
-	return ti;
+	return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
 }
 
-/* how to get the current stack pointer from C */
-register unsigned long current_stack_pointer asm("esp") __attribute_used__;
-
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
 #define alloc_thread_info(tsk)					\
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index b94e5ee..6adbd9b 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -28,10 +28,8 @@
 #define _ASM_I386_TOPOLOGY_H
 
 #ifdef CONFIG_X86_HT
-#define topology_physical_package_id(cpu)				\
-	(phys_proc_id[cpu] == BAD_APICID ? -1 : phys_proc_id[cpu])
-#define topology_core_id(cpu)						\
-	(cpu_core_id[cpu] == BAD_APICID ? 0 : cpu_core_id[cpu])
+#define topology_physical_package_id(cpu)	(cpu_data[cpu].phys_proc_id)
+#define topology_core_id(cpu)			(cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(cpu_sibling_map[cpu])
 #endif
@@ -114,4 +112,9 @@
 
 extern cpumask_t cpu_coregroup_map(int cpu);
 
+#ifdef CONFIG_SMP
+#define mc_capable()	(boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()	(smp_num_siblings > 1)
+#endif
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index d480f2e..69f0f1d 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -78,8 +78,8 @@
 	return user_mode_vm(&info->regs);
 #else
 	return info->regs.eip < PAGE_OFFSET
-	       || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL)
-	            && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE)
+	       || (info->regs.eip >= __fix_to_virt(FIX_VDSO)
+	            && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
 	       || info->regs.esp < PAGE_OFFSET;
 #endif
 }
diff --git a/include/asm-ia64/nodedata.h b/include/asm-ia64/nodedata.h
index a140310..2fb337b 100644
--- a/include/asm-ia64/nodedata.h
+++ b/include/asm-ia64/nodedata.h
@@ -46,6 +46,18 @@
  */
 #define NODE_DATA(nid)		(local_node_data->pg_data_ptrs[nid])
 
+/*
+ * LOCAL_DATA_ADDR - Calculates the address of another node's
+ *		     "local_node_data" during the hot-plug phase. The
+ *		     local_node_data is pointed to by per_cpu_page and is
+ *		     normally used only by the executing CPU. However, when a
+ *		     new node is hot-added, the local data addresses of the
+ *		     other nodes are needed so that all of them can be updated.
+ */
+#define LOCAL_DATA_ADDR(pgdat)  			\
+	((struct ia64_node_data *)((u64)(pgdat) + 	\
+				   L1_CACHE_ALIGN(sizeof(struct pglist_data))))
+
 #endif /* CONFIG_NUMA */
 
 #endif /* _ASM_IA64_NODEDATA_H */
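
A hedged illustration of how LOCAL_DATA_ADDR() is meant to be used during node hot-add: given another node's pgdat, it yields that node's ia64_node_data so its pg_data_ptrs[] entry for the new node can be updated (the nid/new_nid/new_pgdat names are illustrative):

	/* point node 'nid's local data at the newly added node's pgdat */
	struct ia64_node_data *ndata = LOCAL_DATA_ADDR(NODE_DATA(nid));

	ndata->pg_data_ptrs[new_nid] = new_pgdat;
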
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 616b5ed..937c2125 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -112,6 +112,7 @@
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(cpu_sibling_map[cpu])
+#define smt_capable() 				(smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h
index 92f3e55..bbc3844 100644
--- a/include/asm-powerpc/topology.h
+++ b/include/asm-powerpc/topology.h
@@ -93,5 +93,10 @@
 
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_SMP
+#include <asm/cputable.h>
+#define smt_capable() 		(cpu_has_feature(CPU_FTR_SMT))
+#endif
+
 #endif /* __KERNEL__ */
 #endif	/* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/include/asm-sparc64/topology.h b/include/asm-sparc64/topology.h
index 0e234e2..98a6c61 100644
--- a/include/asm-sparc64/topology.h
+++ b/include/asm-sparc64/topology.h
@@ -1,6 +1,9 @@
 #ifndef _ASM_SPARC64_TOPOLOGY_H
 #define _ASM_SPARC64_TOPOLOGY_H
 
+#include <asm/spitfire.h>
+#define smt_capable()	(tlb_type == hypervisor)
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 1b2ac55..9318774 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -124,7 +124,7 @@
 __asm__( \
 "\n.p2align\n" \
 "IRQ" #nr "_interrupt:\n\t" \
-	"push $" #nr "-256 ; " \
+	"push $~(" #nr ") ; " \
 	"jmp common_interrupt");
 
 #if defined(CONFIG_X86_IO_APIC)
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c4e46e7..6e7a2e9 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -59,6 +59,8 @@
 #define topology_core_id(cpu)			(cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(cpu_sibling_map[cpu])
+#define mc_capable()			(boot_cpu_data.x86_max_cores > 1)
+#define smt_capable() 			(smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 90d6df1..88b5dfd 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -528,12 +528,18 @@
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_get_pxm(acpi_handle handle);
+int acpi_get_node(acpi_handle *handle);
 #else
 static inline int acpi_get_pxm(acpi_handle handle)
 {
 	return 0;
 }
+static inline int acpi_get_node(acpi_handle *handle)
+{
+	return 0;
+}
 #endif
+extern int acpi_paddr_to_node(u64 start_addr, u64 size);
 
 extern int pnpacpi_disabled;
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index fb7e9b7..737e407 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -149,7 +149,6 @@
 			unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -214,6 +213,7 @@
 int nobh_writepage(struct page *page, get_block_t *get_block,
                         struct writeback_control *wbc);
 
+void buffer_init(void);
 
 /*
  * inline definitions
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 08d50c5..a3caf68 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -31,17 +31,23 @@
 	struct sys_device sysdev;
 };
 
-extern int register_cpu(struct cpu *, int, struct node *);
+extern int register_cpu(struct cpu *cpu, int num);
 extern struct sys_device *get_cpu_sysdev(unsigned cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-extern void unregister_cpu(struct cpu *, struct node *);
+extern void unregister_cpu(struct cpu *cpu);
 #endif
 struct notifier_block;
 
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 extern int register_cpu_notifier(struct notifier_block *nb);
+#ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu_notifier(struct notifier_block *nb);
+#else
+static inline void unregister_cpu_notifier(struct notifier_block *nb)
+{
+}
+#endif
 extern int current_in_cpu_hotplug(void);
 
 int cpu_up(unsigned int cpu);
@@ -73,6 +79,8 @@
 		{ .notifier_call = fn, .priority = pri };	\
 	register_cpu_notifier(&fn##_nb);			\
 }
+#define register_hotcpu_notifier(nb)	register_cpu_notifier(nb)
+#define unregister_hotcpu_notifier(nb)	unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 #else
@@ -80,6 +88,8 @@
 #define unlock_cpu_hotplug()	do { } while (0)
 #define lock_cpu_hotplug_interruptible() 0
 #define hotcpu_notifier(fn, pri)
+#define register_hotcpu_notifier(nb)
+#define unregister_hotcpu_notifier(nb)
 
 /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
 static inline int cpu_is_offline(int cpu) { return 0; }
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 78b236c..272010a6 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -20,7 +20,7 @@
  */
 #ifndef DMAENGINE_H
 #define DMAENGINE_H
-#include <linux/config.h>
+
 #ifdef CONFIG_DMA_ENGINE
 
 #include <linux/device.h>
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 966a5b3..34c3a21 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -12,6 +12,9 @@
 #define FUTEX_REQUEUE		3
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
+#define FUTEX_LOCK_PI		6
+#define FUTEX_UNLOCK_PI		7
+#define FUTEX_TRYLOCK_PI	8
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -90,18 +93,21 @@
  */
 #define ROBUST_LIST_LIMIT	2048
 
-long do_futex(unsigned long uaddr, int op, int val,
-		unsigned long timeout, unsigned long uaddr2, int val2,
-		int val3);
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+	      u32 __user *uaddr2, u32 val2, u32 val3);
 
 extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
+extern void exit_pi_state_list(struct task_struct *curr);
 #else
 static inline void exit_robust_list(struct task_struct *curr)
 {
 }
+static inline void exit_pi_state_list(struct task_struct *curr)
+{
+}
 #endif
 
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e127ef7..3a25695 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -87,6 +87,7 @@
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
@@ -122,6 +123,8 @@
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
+	.pi_lock	= SPIN_LOCK_UNLOCKED,				\
+	INIT_RT_MUTEXES(tsk)						\
 }
 
 
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index cd6bd00..edfc733 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -105,6 +105,9 @@
 int adjust_resource(struct resource *res, unsigned long start,
 		    unsigned long size);
 
+/* get registered SYSTEM_RAM resources in specified area */
+extern int find_next_system_ram(struct resource *res);
+
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name))
 #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name))
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 5653b2f..d09fbea 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -210,11 +210,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/device.h>
-
-#ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
-extern struct proc_dir_entry *proc_ipmi_root;
-#endif /* CONFIG_PROC_FS */
 
 /* Opaque type for a IPMI message user.  One of these is needed to
    send and receive messages. */
diff --git a/include/linux/list.h b/include/linux/list.h
index 37ca31b..6b74adf 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -4,18 +4,11 @@
 #ifdef __KERNEL__
 
 #include <linux/stddef.h>
+#include <linux/poison.h>
 #include <linux/prefetch.h>
 #include <asm/system.h>
 
 /*
- * These are non-NULL pointers that will result in page faults
- * under normal circumstances, used to verify that nobody uses
- * non-initialized list entries.
- */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
-
-/*
  * Simple doubly linked list implementation.
  *
  * Some of the internal functions ("__xxx") are useful when
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 9112063..218501c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -63,6 +63,76 @@
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages);
+
+#ifdef CONFIG_NUMA
+extern int memory_add_physaddr_to_nid(u64 start);
+#else
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
+/*
+ * To support node hot-add, we have to allocate a new pgdat.
+ *
+ * If an arch has a generic-style NODE_DATA(),
+ * node_data[nid] = kzalloc() works well, but this depends on the architecture.
+ *
+ * In general, generic_alloc_nodedata() is used.
+ * For now, arch_free_nodedata() is only defined for the error path of node hot-add.
+ *
+ */
+extern pg_data_t *arch_alloc_nodedata(int nid);
+extern void arch_free_nodedata(pg_data_t *pgdat);
+extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
+
+#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
+#define arch_alloc_nodedata(nid)	generic_alloc_nodedata(nid)
+#define arch_free_nodedata(pgdat)	generic_free_nodedata(pgdat)
+
+#ifdef CONFIG_NUMA
+/*
+ * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat.
+ * XXX: kmalloc_node() cannot be used to obtain the new node's memory here,
+ *	because the pgdat for the new node is not itself allocated/initialized
+ *	yet. Using the new node's own memory will need further consideration.
+ */
+#define generic_alloc_nodedata(nid)				\
+({								\
+	kzalloc(sizeof(pg_data_t), GFP_KERNEL);			\
+})
+/*
+ * This definition is just for error path in node hotadd.
+ * For node hotremove, we have to replace this.
+ */
+#define generic_free_nodedata(pgdat)	kfree(pgdat)
+
+extern pg_data_t *node_data[];
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+	node_data[nid] = pgdat;
+}
+
+#else /* !CONFIG_NUMA */
+
+/* never called */
+static inline pg_data_t *generic_alloc_nodedata(int nid)
+{
+	BUG();
+	return NULL;
+}
+static inline void generic_free_nodedata(pg_data_t *pgdat)
+{
+}
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -99,7 +169,8 @@
 	return -ENOSYS;
 }
 
-extern int add_memory(u64 start, u64 size);
+extern int add_memory(int nid, u64 start, u64 size);
+extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int remove_memory(u64 start, u64 size);
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
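
A hedged sketch of the node hot-add flow these helpers support: allocate a pgdat for the new node, publish it with arch_refresh_nodedata(), and use arch_free_nodedata() only on the error path, as the comment above notes (the surrounding error handling is simplified and nid/start/size are illustrative):

	pg_data_t *pgdat = arch_alloc_nodedata(nid);

	if (!pgdat)
		return -ENOMEM;
	arch_refresh_nodedata(nid, pgdat);

	if (arch_add_memory(nid, start, size)) {
		/* the error path arch_free_nodedata() exists for */
		arch_refresh_nodedata(nid, NULL);
		arch_free_nodedata(pgdat);
		return -ENOMEM;
	}
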
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a929ea1..c41a129 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1030,13 +1030,20 @@
 }
 #endif /* CONFIG_PROC_FS */
 
+static inline void
+debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+	mutex_debug_check_no_locks_freed(from, len);
+	rt_mutex_debug_check_no_locks_freed(from, len);
+}
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	if (!PageHighMem(page) && !enable)
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 numpages * PAGE_SIZE);
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
 }
 #endif
 
@@ -1065,5 +1072,7 @@
 extern int randomize_va_space;
 #endif
 
+const char *arch_vma_name(struct vm_area_struct *vma);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/node.h b/include/linux/node.h
index 254dc3d..81dcec8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -26,8 +26,25 @@
 	struct sys_device	sysdev;
 };
 
+extern struct node node_devices[];
+
 extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
+extern int register_one_node(int nid);
+extern void unregister_one_node(int nid);
+#ifdef CONFIG_NUMA
+extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
+extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
+#else
+static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	return 0;
+}
+static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+	return 0;
+}
+#endif
 
 #define to_node(sys_device) container_of(sys_device, struct node, sysdev)
 
diff --git a/include/linux/nsc_gpio.h b/include/linux/nsc_gpio.h
new file mode 100644
index 0000000..135742c
--- /dev/null
+++ b/include/linux/nsc_gpio.h
@@ -0,0 +1,42 @@
+/**
+   nsc_gpio.c
+
+   National Semiconductor GPIO common access methods.
+
+   struct nsc_gpio_ops abstracts the low-level access
+   operations for the GPIO units on 2 NSC chip families; the GEODE
+   integrated CPU, and the PC-8736[03456] integrated PC-peripheral
+   chips.
+
+   The GPIO units on these chips have the same pin architecture, but
+   the access methods differ.  Thus, scx200_gpio and pc8736x_gpio
+   implement their own versions of these routines, and use the common
+   file-operations routines implemented in the nsc_gpio module.
+
+   Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com>
+
+   NB: this work was tested on the Geode SC-1100 and PC-87366 chips.
+   NSC sold the GEODE line to AMD, and the PC-8736x line to Winbond.
+*/
+
+struct nsc_gpio_ops {
+	struct module*	owner;
+	u32	(*gpio_config)	(unsigned iminor, u32 mask, u32 bits);
+	void	(*gpio_dump)	(struct nsc_gpio_ops *amp, unsigned iminor);
+	int	(*gpio_get)	(unsigned iminor);
+	void	(*gpio_set)	(unsigned iminor, int state);
+	void	(*gpio_set_high)(unsigned iminor);
+	void	(*gpio_set_low)	(unsigned iminor);
+	void	(*gpio_change)	(unsigned iminor);
+	int	(*gpio_current)	(unsigned iminor);
+	struct device*	dev;	/* for dev_dbg() support, set in init  */
+};
+
+extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+			      size_t len, loff_t *ppos);
+
+extern ssize_t nsc_gpio_read(struct file *file, char __user *buf,
+			     size_t len, loff_t *ppos);
+
+extern void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index);
+
diff --git a/include/linux/plist.h b/include/linux/plist.h
new file mode 100644
index 0000000..3404fae
--- /dev/null
+++ b/include/linux/plist.h
@@ -0,0 +1,247 @@
+/*
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov <oleg@tv-sign.ru>
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This is a priority-sorted list of nodes; each node has a
+ * priority from INT_MIN (highest) to INT_MAX (lowest).
+ *
+ * Addition is O(K), removal is O(1), change of priority of a node is
+ * O(K) and K is the number of RT priority levels used in the system.
+ * (1 <= K <= 99)
+ *
+ * This list is really a list of lists:
+ *
+ *  - The tier 1 list is the prio_list, different priority nodes.
+ *
+ *  - The tier 2 list is the node_list, serialized nodes.
+ *
+ * Simple ASCII art explanation:
+ *
+ * |HEAD          |
+ * |              |
+ * |prio_list.prev|<------------------------------------|
+ * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-|
+ * |10            |   |10|   |21|   |21|   |21|   |40|   (prio)
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
+ * |node_list.prev|<------------------------------------|
+ *
+ * The nodes on the prio_list list are sorted by priority to simplify
+ * the insertion of new nodes. There are no nodes with duplicate
+ * priorities on the list.
+ *
+ * The nodes on the node_list are ordered by priority and can contain
+ * entries which have the same priority. Those entries are ordered
+ * FIFO.
+ *
+ * Addition means: look for the prio_list node in the prio_list
+ * for the priority of the node and insert it before the node_list
+ * entry of the next prio_list node. If it is the first node of
+ * that priority, add it to the prio_list in the right position and
+ * insert it into the serialized node_list list.
+ *
+ * Removal means remove it from the node_list and remove it from
+ * the prio_list if the node_list list_head is non-empty. In case
+ * of removal from the prio_list it must be checked whether other
+ * entries of the same priority are on the list or not. If there
+ * is another entry of the same priority then this entry has to
+ * replace the removed entry on the prio_list. If the entry which
+ * is removed is the only entry of this priority then a simple
+ * remove from both lists is sufficient.
+ *
+ * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
+ * is lowest priority.
+ *
+ * No locking is done, up to the caller.
+ *
+ */
+#ifndef _LINUX_PLIST_H_
+#define _LINUX_PLIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+struct plist_head {
+	struct list_head prio_list;
+	struct list_head node_list;
+#ifdef CONFIG_DEBUG_PI_LIST
+	spinlock_t *lock;
+#endif
+};
+
+struct plist_node {
+	int			prio;
+	struct plist_head	plist;
+};
+
+#ifdef CONFIG_DEBUG_PI_LIST
+# define PLIST_HEAD_LOCK_INIT(_lock)	.lock = _lock
+#else
+# define PLIST_HEAD_LOCK_INIT(_lock)
+#endif
+
+/**
+ * PLIST_HEAD_INIT - static struct plist_head initializer
+ *
+ * @head:	struct plist_head variable name
+ */
+#define PLIST_HEAD_INIT(head, _lock)			\
+{							\
+	.prio_list = LIST_HEAD_INIT((head).prio_list),	\
+	.node_list = LIST_HEAD_INIT((head).node_list),	\
+	PLIST_HEAD_LOCK_INIT(&(_lock))			\
+}
+
+/**
+ * PLIST_NODE_INIT - static struct plist_node initializer
+ *
+ * @node:	struct plist_node variable name
+ * @__prio:	initial node priority
+ */
+#define PLIST_NODE_INIT(node, __prio)			\
+{							\
+	.prio  = (__prio),				\
+	.plist = PLIST_HEAD_INIT((node).plist, NULL),	\
+}
+
+/**
+ * plist_head_init - dynamic struct plist_head initializer
+ *
+ * @head:	&struct plist_head pointer
+ */
+static inline void
+plist_head_init(struct plist_head *head, spinlock_t *lock)
+{
+	INIT_LIST_HEAD(&head->prio_list);
+	INIT_LIST_HEAD(&head->node_list);
+#ifdef CONFIG_DEBUG_PI_LIST
+	head->lock = lock;
+#endif
+}
+
+/**
+ * plist_node_init - Dynamic struct plist_node initializer
+ *
+ * @node:	&struct plist_node pointer
+ * @prio:	initial node priority
+ */
+static inline void plist_node_init(struct plist_node *node, int prio)
+{
+	node->prio = prio;
+	plist_head_init(&node->plist, NULL);
+}
+
+extern void plist_add(struct plist_node *node, struct plist_head *head);
+extern void plist_del(struct plist_node *node, struct plist_head *head);
+
+/**
+ * plist_for_each - iterate over the plist
+ *
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ */
+#define plist_for_each(pos, head)	\
+	 list_for_each_entry(pos, &(head)->node_list, plist.node_list)
+
+/**
+ * plist_for_each_safe - iterate over a plist safe against removal of
+ * list entry
+ *
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define plist_for_each_safe(pos, n, head)	\
+	 list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list)
+
+/**
+ * plist_for_each_entry	- iterate over list of given type
+ *
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @mem:	the name of the list_struct within the struct.
+ */
+#define plist_for_each_entry(pos, head, mem)	\
+	 list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list)
+
+/**
+ * plist_for_each_entry_safe - iterate over list of given type safe against
+ * removal of list entry
+ *
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @m:		the name of the list_struct within the struct.
+ */
+#define plist_for_each_entry_safe(pos, n, head, m)	\
+	list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list)
+
+/**
+ * plist_head_empty - return !0 if a plist_head is empty
+ *
+ * @head:	&struct plist_head pointer
+ */
+static inline int plist_head_empty(const struct plist_head *head)
+{
+	return list_empty(&head->node_list);
+}
+
+/**
+ * plist_node_empty - return !0 if plist_node is not on a list
+ *
+ * @node:	&struct plist_node pointer
+ */
+static inline int plist_node_empty(const struct plist_node *node)
+{
+	return plist_head_empty(&node->plist);
+}
+
+/* All functions below assume the plist_head is not empty. */
+
+/**
+ * plist_first_entry - get the struct for the first entry
+ *
+ * @ptr:	the &struct plist_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#ifdef CONFIG_DEBUG_PI_LIST
+# define plist_first_entry(head, type, member)	\
+({ \
+	WARN_ON(plist_head_empty(head)); \
+	container_of(plist_first(head), type, member); \
+})
+#else
+# define plist_first_entry(head, type, member)	\
+	container_of(plist_first(head), type, member)
+#endif
+
+/**
+ * plist_first - return the first node (and thus, highest priority)
+ *
+ * @head:	the &struct plist_head pointer
+ *
+ * Assumes the plist is _not_ empty.
+ */
+static inline struct plist_node* plist_first(const struct plist_head *head)
+{
+	return list_entry(head->node_list.next,
+			  struct plist_node, plist.node_list);
+}
+
+#endif
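
A hedged usage sketch of the API above, queueing waiters in priority order; the waiter structure, lock and function names are illustrative, only the plist_* and PLIST_* calls come from this header:

	static DEFINE_SPINLOCK(waiters_lock);
	static struct plist_head waiters =
		PLIST_HEAD_INIT(waiters, waiters_lock);

	struct waiter {
		struct plist_node	node;
		struct task_struct	*task;
	};

	static void add_waiter(struct waiter *w, int prio)
	{
		plist_node_init(&w->node, prio);  /* lower value = higher priority */
		spin_lock(&waiters_lock);
		plist_add(&w->node, &waiters);
		spin_unlock(&waiters_lock);
	}

	static struct waiter *top_waiter(void)
	{
		if (plist_head_empty(&waiters))
			return NULL;
		return plist_first_entry(&waiters, struct waiter, node);
	}
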
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644
index 0000000..a5347c0
--- /dev/null
+++ b/include/linux/poison.h
@@ -0,0 +1,58 @@
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
+#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
+
+/* ...and for poisoning */
+#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
+#define POISON_FREE	0x6b	/* for use-after-free poisoning */
+#define	POISON_END	0xa5	/* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM	0xcc
+
+/********** arch/x86_64/mm/init.c **********/
+#define	POISON_FREE_INITDATA	0xba
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE	0x5b
+
+/********** drivers/base/dmapool.c **********/
+#define	POOL_POISON_FREED	0xa7	/* !inuse */
+#define	POOL_POISON_ALLOCATED	0xa9	/* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE		0x12
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT	0x11
+#define MUTEX_DEBUG_FREE	0x22
+
+/********** security/ **********/
+#define KEY_DESTROY		0xbd
+
+/********** sound/oss/ **********/
+#define OSS_POISON_FREE		0xAB
+
+#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6312758..48dfe00 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -258,6 +258,7 @@
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
 
 /* Exported interfaces */
 extern void FASTCALL(call_rcu(struct rcu_head *head, 
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
new file mode 100644
index 0000000..fa4a3b8
--- /dev/null
+++ b/include/linux/rtmutex.h
@@ -0,0 +1,117 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef __LINUX_RT_MUTEX_H
+#define __LINUX_RT_MUTEX_H
+
+#include <linux/linkage.h>
+#include <linux/plist.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * The rt_mutex structure
+ *
+ * @wait_lock:	spinlock to protect the structure
+ * @wait_list:	plist head to enqueue waiters in priority order
+ * @owner:	the mutex owner
+ */
+struct rt_mutex {
+	spinlock_t		wait_lock;
+	struct plist_head	wait_list;
+	struct task_struct	*owner;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	int			save_state;
+	struct list_head	held_list_entry;
+	unsigned long		acquire_ip;
+	const char 		*name, *file;
+	int			line;
+	void			*magic;
+#endif
+};
+
+struct rt_mutex_waiter;
+struct hrtimer_sleeper;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ extern int rt_mutex_debug_check_no_locks_freed(const void *from,
+						unsigned long len);
+ extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
+#else
+ static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
+						       unsigned long len)
+ {
+	return 0;
+ }
+# define rt_mutex_debug_check_no_locks_held(task)	do { } while (0)
+#endif
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
+	, .name = #mutexname, .file = __FILE__, .line = __LINE__
+# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, __FUNCTION__)
+ extern void rt_mutex_debug_task_free(struct task_struct *tsk);
+#else
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
+# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL)
+# define rt_mutex_debug_task_free(t)			do { } while (0)
+#endif
+
+#define __RT_MUTEX_INITIALIZER(mutexname) \
+	{ .wait_lock = SPIN_LOCK_UNLOCKED \
+	, .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \
+	, .owner = NULL \
+	__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
+
+#define DEFINE_RT_MUTEX(mutexname) \
+	struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
+
+/**
+ * rt_mutex_is_locked - is the mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline int rt_mutex_is_locked(struct rt_mutex *lock)
+{
+	return lock->owner != NULL;
+}
+
+extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void rt_mutex_destroy(struct rt_mutex *lock);
+
+extern void rt_mutex_lock(struct rt_mutex *lock);
+extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
+						int detect_deadlock);
+extern int rt_mutex_timed_lock(struct rt_mutex *lock,
+					struct hrtimer_sleeper *timeout,
+					int detect_deadlock);
+
+extern int rt_mutex_trylock(struct rt_mutex *lock);
+
+extern void rt_mutex_unlock(struct rt_mutex *lock);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define INIT_RT_MUTEX_DEBUG(tsk)					\
+	.held_list_head	= LIST_HEAD_INIT(tsk.held_list_head),		\
+	.held_list_lock	= SPIN_LOCK_UNLOCKED
+#else
+# define INIT_RT_MUTEX_DEBUG(tsk)
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+# define INIT_RT_MUTEXES(tsk)						\
+	.pi_waiters	= PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock),	\
+	INIT_RT_MUTEX_DEBUG(tsk)
+#else
+# define INIT_RT_MUTEXES(tsk)
+#endif
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 122a25c..821f048 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -73,6 +73,7 @@
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/futex.h>
+#include <linux/rtmutex.h>
 
 #include <linux/time.h>
 #include <linux/param.h>
@@ -83,6 +84,7 @@
 #include <asm/processor.h>
 
 struct exec_domain;
+struct futex_pi_state;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -123,6 +125,7 @@
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern unsigned long weighted_cpuload(const int cpu);
 
 
 /*
@@ -494,8 +497,11 @@
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -558,9 +564,9 @@
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
-#ifdef CONFIG_SMP
 #define SCHED_LOAD_SCALE	128UL	/* increase resolution of load */
 
+#ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		4	/* Balance on exec */
@@ -569,6 +575,11 @@
 #define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
 #define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
+
+#define BALANCE_FOR_POWER	((sched_mc_power_savings || sched_smt_power_savings) \
+				 ? SD_POWERSAVINGS_BALANCE : 0)
+
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -638,7 +649,7 @@
 #endif
 };
 
-extern void partition_sched_domains(cpumask_t *partition1,
+extern int partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
 
 /*
@@ -713,10 +724,13 @@
 
 	int lock_depth;		/* BKL lock depth */
 
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#ifdef CONFIG_SMP
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
 #endif
-	int prio, static_prio;
+#endif
+	int load_weight;	/* for niceness load balancing purposes */
+	int prio, static_prio, normal_prio;
 	struct list_head run_list;
 	prio_array_t *array;
 
@@ -843,6 +857,20 @@
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
 	spinlock_t alloc_lock;
 
+	/* Protection of the PI data structures: */
+	spinlock_t pi_lock;
+
+#ifdef CONFIG_RT_MUTEXES
+	/* PI waiters blocked on a rt_mutex held by this task */
+	struct plist_head pi_waiters;
+	/* Deadlock detection and priority inheritance handling */
+	struct rt_mutex_waiter *pi_blocked_on;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+	spinlock_t held_list_lock;
+	struct list_head held_list_head;
+# endif
+#endif
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	/* mutex deadlock detection */
 	struct mutex_waiter *blocked_on;
@@ -888,6 +916,8 @@
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
+	struct list_head pi_state_list;
+	struct futex_pi_state *pi_state_cache;
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
@@ -955,6 +985,7 @@
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
+#define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1009,6 +1040,19 @@
 #endif
 
 extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+extern void rt_mutex_adjust_pi(task_t *p);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+	return p->normal_prio;
+}
+# define rt_mutex_adjust_pi(p)		do { } while (0)
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
@@ -1408,6 +1452,11 @@
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
+#include <linux/sysdev.h>
+extern int sched_mc_power_savings, sched_smt_power_savings;
+extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings;
+extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+
 extern void normalize_rt_tasks(void);
 
 #ifdef CONFIG_PM
diff --git a/include/linux/scx200.h b/include/linux/scx200.h
index a22f9e1..693c055 100644
--- a/include/linux/scx200.h
+++ b/include/linux/scx200.h
@@ -49,10 +49,3 @@
 #define SCx200_REV 0x3d		/* Revision Register */
 #define SCx200_CBA 0x3e		/* Configuration Base Address Register */
 #define SCx200_CBA_SCRATCH 0x64	/* Configuration Base Address Scratchpad */
-
-/*
-    Local variables:
-        compile-command: "make -C ../.. bzImage modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/include/linux/scx200_gpio.h b/include/linux/scx200_gpio.h
index 30cdd64..90dd069 100644
--- a/include/linux/scx200_gpio.h
+++ b/include/linux/scx200_gpio.h
@@ -1,6 +1,6 @@
 #include <linux/spinlock.h>
 
-u32 scx200_gpio_configure(int index, u32 set, u32 clear);
+u32 scx200_gpio_configure(unsigned index, u32 set, u32 clear);
 
 extern unsigned scx200_gpio_base;
 extern long scx200_gpio_shadow[2];
@@ -17,7 +17,7 @@
 
 /* returns the value of the GPIO pin */
 
-static inline int scx200_gpio_get(int index) {
+static inline int scx200_gpio_get(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR + 0x04;
 	__SCx200_GPIO_INDEX;
@@ -29,7 +29,7 @@
    driven if the GPIO is configured as an output, it might not be the
    state of the GPIO right now if the GPIO is configured as an input) */
 
-static inline int scx200_gpio_current(int index) {
+static inline int scx200_gpio_current(unsigned index) {
         __SCx200_GPIO_BANK;
 	__SCx200_GPIO_INDEX;
 		
@@ -38,7 +38,7 @@
 
 /* drive the GPIO signal high */
 
-static inline void scx200_gpio_set_high(int index) {
+static inline void scx200_gpio_set_high(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -49,7 +49,7 @@
 
 /* drive the GPIO signal low */
 
-static inline void scx200_gpio_set_low(int index) {
+static inline void scx200_gpio_set_low(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -60,7 +60,7 @@
 
 /* drive the GPIO signal to state */
 
-static inline void scx200_gpio_set(int index, int state) {
+static inline void scx200_gpio_set(unsigned index, int state) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -73,7 +73,7 @@
 }
 
 /* toggle the GPIO signal */
-static inline void scx200_gpio_change(int index) {
+static inline void scx200_gpio_change(unsigned index) {
 	__SCx200_GPIO_BANK;
 	__SCx200_GPIO_IOADDR;
 	__SCx200_GPIO_SHADOW;
@@ -87,10 +87,3 @@
 #undef __SCx200_GPIO_SHADOW
 #undef __SCx200_GPIO_INDEX
 #undef __SCx200_GPIO_OUT
-
-/*
-    Local variables:
-        compile-command: "make -C ../.. bzImage modules"
-        c-basic-offset: 8
-    End:
-*/
diff --git a/include/linux/swap.h b/include/linux/swap.h
index dc3f3aa..c41e2d6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -199,6 +199,8 @@
 }
 #endif
 
+extern int kswapd_run(int nid);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33785b7..008f04c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -174,9 +174,9 @@
 			   int options, struct rusage __user *ru);
 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
 asmlinkage long sys_set_tid_address(int __user *tidptr);
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			struct timespec __user *utime, u32 __user *uaddr2,
-			int val3);
+			u32 val3);
 
 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
 				const char __user *uargs);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 349ef90..46e4d8f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -149,6 +149,7 @@
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
 	KERN_COMPAT_LOG=73,	/* int: print compat layer  messages */
+	KERN_MAX_LOCK_DEPTH=74,
 };
 
 
@@ -189,6 +190,7 @@
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
+	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 };
 
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index a305ae2..ec1eca8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -134,7 +134,8 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_AFFINE,	\
+				| SD_WAKE_AFFINE	\
+				| BALANCE_FOR_POWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/init/Kconfig b/init/Kconfig
index df55b36..f70f2fd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -339,9 +339,14 @@
 	  kernel data structures. This saves memory on small machines,
 	  but may reduce performance.
 
+config RT_MUTEXES
+	boolean
+	select PLIST
+
 config FUTEX
 	bool "Enable futex support" if EMBEDDED
 	default y
+	select RT_MUTEXES
 	help
 	  Disabling this option will cause the kernel to be built without
 	  support for "fast userspace mutexes".  The resulting kernel may not
diff --git a/init/main.c b/init/main.c
index 80af1a5..0d57f6c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -48,6 +48,7 @@
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 #include <linux/unwind.h>
+#include <linux/buffer_head.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -80,7 +81,6 @@
 extern void sbus_init(void);
 extern void sysctl_init(void);
 extern void signals_init(void);
-extern void buffer_init(void);
 extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void prio_tree_init(void);
diff --git a/kernel/Makefile b/kernel/Makefile
index 752bd7d..82fb182 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -16,6 +16,9 @@
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 368c4f0..126ca43 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -521,6 +521,7 @@
 
 /**
  * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
  */
 void acct_init_pacct(struct pacct_struct *pacct)
 {
@@ -576,7 +577,7 @@
  *
  * handles process accounting for an exiting task
  */
-void acct_process()
+void acct_process(void)
 {
 	struct file *file = NULL;
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac70..82443fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -818,7 +818,7 @@
  */
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96f..dc5e3f0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -658,8 +658,7 @@
 	return;
 
 error_path:
-	if (ctx)
-		kfree(ctx);
+	kfree(ctx);
 	audit_panic("error in audit_log_task_context");
 	return;
 }
@@ -1367,7 +1366,7 @@
  * @mqdes: MQ descriptor
  * @msg_len: Message length
  * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_abs_timeout: Message timeout in absolute time
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
@@ -1409,8 +1408,8 @@
  * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
  * @mqdes: MQ descriptor
  * @msg_len: Message length
- * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
@@ -1558,7 +1557,6 @@
  * @uid: msgq user id
  * @gid: msgq group id
  * @mode: msgq mode (permissions)
- * @ipcp: in-kernel IPC permissions
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 03dcd98..70fbf2e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DEFINE_MUTEX(cpucontrol);
 
-static BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -69,10 +69,13 @@
 #endif	/* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&cpu_chain, nb);
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
diff --git a/kernel/exit.c b/kernel/exit.c
index 304ef63..ab06b9f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,9 +926,18 @@
 	tsk->mempolicy = NULL;
 #endif
 	/*
+	 * This must happen late, after the PID is not
+	 * hashed anymore:
+	 */
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
+	if (unlikely(current->pi_state_cache))
+		kfree(current->pi_state_cache);
+	/*
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
 	mutex_debug_check_no_locks_held(tsk);
+	rt_mutex_debug_check_no_locks_held(tsk);
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b4e54e..628198a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@
 void free_task(struct task_struct *tsk)
 {
 	free_thread_info(tsk->thread_info);
+	rt_mutex_debug_task_free(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -913,6 +914,19 @@
 	return current->pid;
 }
 
+static inline void rt_mutex_init_task(struct task_struct *p)
+{
+#ifdef CONFIG_RT_MUTEXES
+	spin_lock_init(&p->pi_lock);
+	plist_head_init(&p->pi_waiters, &p->pi_lock);
+	p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+	spin_lock_init(&p->held_list_lock);
+	INIT_LIST_HEAD(&p->held_list_head);
+# endif
+#endif
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1034,6 +1048,8 @@
 	mpol_fix_fork_child_flag(p);
 #endif
 
+	rt_mutex_init_task(p);
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
@@ -1076,6 +1092,9 @@
 #ifdef CONFIG_COMPAT
 	p->compat_robust_list = NULL;
 #endif
+	INIT_LIST_HEAD(&p->pi_state_list);
+	p->pi_state_cache = NULL;
+
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c..6c91f93 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
+ *  PI-futex support started by Ingo Molnar and Thomas Gleixner
+ *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
 #include <linux/signal.h>
 #include <asm/futex.h>
 
+#include "rtmutex_common.h"
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -63,7 +69,7 @@
 		int offset;
 	} shared;
 	struct {
-		unsigned long uaddr;
+		unsigned long address;
 		struct mm_struct *mm;
 		int offset;
 	} private;
@@ -75,6 +81,27 @@
 };
 
 /*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+	/*
+	 * list of 'owned' pi_state instances - these have to be
+	 * cleaned up in do_exit() if the task exits prematurely:
+	 */
+	struct list_head list;
+
+	/*
+	 * The PI object:
+	 */
+	struct rt_mutex pi_mutex;
+
+	struct task_struct *owner;
+	atomic_t refcount;
+
+	union futex_key key;
+};
+
+/*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
  *
@@ -87,15 +114,19 @@
 	struct list_head list;
 	wait_queue_head_t waiters;
 
-	/* Which hash list lock to use. */
+	/* Which hash list lock to use: */
 	spinlock_t *lock_ptr;
 
-	/* Key which the futex is hashed on. */
+	/* Key which the futex is hashed on: */
 	union futex_key key;
 
-	/* For fd, sigio sent using these. */
+	/* For fd, sigio sent using these: */
 	int fd;
 	struct file *filp;
+
+	/* Optional priority inheritance state: */
+	struct futex_pi_state *pi_state;
+	struct task_struct *task;
 };
 
 /*
@@ -144,8 +175,9 @@
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 {
+	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct page *page;
@@ -154,16 +186,16 @@
 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
-	key->both.offset = uaddr % PAGE_SIZE;
+	key->both.offset = address % PAGE_SIZE;
 	if (unlikely((key->both.offset % sizeof(u32)) != 0))
 		return -EINVAL;
-	uaddr -= key->both.offset;
+	address -= key->both.offset;
 
 	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping.  So check vma first.
 	 */
-	vma = find_extend_vma(mm, uaddr);
+	vma = find_extend_vma(mm, address);
 	if (unlikely(!vma))
 		return -EFAULT;
 
@@ -184,7 +216,7 @@
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 		key->private.mm = mm;
-		key->private.uaddr = uaddr;
+		key->private.address = address;
 		return 0;
 	}
 
@@ -194,7 +226,7 @@
 	key->shared.inode = vma->vm_file->f_dentry->d_inode;
 	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
 		return 0;
 	}
@@ -205,7 +237,7 @@
 	 * from swap.  But that's a lot of code to duplicate here
 	 * for a rare case, so we simply fetch the page.
 	 */
-	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
 	if (err >= 0) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,244 @@
 	}
 }
 
-static inline int get_futex_value_locked(int *dest, int __user *from)
+static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
 	inc_preempt_count();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 	dec_preempt_count();
 
 	return ret ? -EFAULT : 0;
 }
 
 /*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+
+	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+		return -EFAULT;
+
+	switch (handle_mm_fault(mm, vma, address, 1)) {
+	case VM_FAULT_MINOR:
+		current->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		current->maj_flt++;
+		break;
+	default:
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	memset(pi_state, 0, sizeof(*pi_state));
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	atomic_set(&pi_state->refcount, 1);
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!atomic_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already
+	 */
+	if (pi_state->owner) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+	}
+
+	if (current->pi_state_cache)
+		kfree(pi_state);
+	else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		atomic_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We don't trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+	if ((current->euid != p->euid) && (current->euid != p->uid)) {
+		p = NULL;
+		goto out_unlock;
+	}
+	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+		p = NULL;
+		goto out_unlock;
+	}
+	get_task_struct(p);
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+	struct futex_hash_bucket *hb;
+	struct list_head *next, *head = &curr->pi_state_list;
+	struct futex_pi_state *pi_state;
+	union futex_key key;
+
+	/*
+	 * We are a ZOMBIE and nobody can enqueue itself on
+	 * pi_state_list anymore, but we have to be careful
+	 * about waiters unqueueing themselves.
+	 */
+	spin_lock_irq(&curr->pi_lock);
+	while (!list_empty(head)) {
+
+		next = head->next;
+		pi_state = list_entry(next, struct futex_pi_state, list);
+		key = pi_state->key;
+		spin_unlock_irq(&curr->pi_lock);
+
+		hb = hash_futex(&key);
+		spin_lock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+		if (head->next != next) {
+			spin_unlock(&hb->lock);
+			continue;
+		}
+
+		list_del_init(&pi_state->list);
+
+		WARN_ON(pi_state->owner != curr);
+
+		pi_state->owner = NULL;
+		spin_unlock_irq(&curr->pi_lock);
+
+		rt_mutex_unlock(&pi_state->pi_mutex);
+
+		spin_unlock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+	}
+	spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_q *this, *next;
+	struct list_head *head;
+	struct task_struct *p;
+	pid_t pid;
+
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &me->key)) {
+			/*
+			 * Another waiter already exists - bump up
+			 * the refcount and return its pi_state:
+			 */
+			pi_state = this->pi_state;
+			atomic_inc(&pi_state->refcount);
+			me->pi_state = pi_state;
+
+			return 0;
+		}
+	}
+
+	/*
+	 * We are the first waiter - try to look up the real owner and
+	 * attach the new pi_state to it:
+	 */
+	pid = uval & FUTEX_TID_MASK;
+	p = futex_find_get_task(pid);
+	if (!p)
+		return -ESRCH;
+
+	pi_state = alloc_pi_state();
+
+	/*
+	 * Initialize the pi_mutex in locked state and make 'p'
+	 * the owner of it:
+	 */
+	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+	/* Store the key for possible exit cleanups: */
+	pi_state->key = me->key;
+
+	spin_lock_irq(&p->pi_lock);
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+	spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	me->pi_state = pi_state;
+
+	return 0;
+}
+
+/*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
  */
@@ -284,16 +542,80 @@
 	q->lock_ptr = NULL;
 }
 
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+	struct task_struct *new_owner;
+	struct futex_pi_state *pi_state = this->pi_state;
+	u32 curval, newval;
+
+	if (!pi_state)
+		return -EINVAL;
+
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+	/*
+	 * This happens when we have stolen the lock and the original
+	 * pending owner did not enqueue itself back on the rt_mutex.
+	 * That's not a tragedy; it just tells us that a lock waiter
+	 * is in flight. We make the futex_q waiter the pending owner.
+	 */
+	if (!new_owner)
+		new_owner = this->task;
+
+	/*
+	 * We pass it to the next owner. (The WAITERS bit is always
+	 * kept enabled while there is PI state around. We must also
+	 * preserve the owner died bit.)
+	 */
+	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (curval == -EFAULT)
+		return -EFAULT;
+	if (curval != uval)
+		return -EINVAL;
+
+	list_del_init(&pi_state->owner->pi_state_list);
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+	u32 oldval;
+
+	/*
+	 * There is no waiter, so we unlock the futex. The owner died
+	 * bit does not need to be preserved here. We are the owner:
+	 */
+	inc_preempt_count();
+	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+	dec_preempt_count();
+
+	if (oldval == -EFAULT)
+		return oldval;
+	if (oldval != uval)
+		return -EAGAIN;
+
+	return 0;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, int nr_wake)
 {
-	union futex_key key;
-	struct futex_hash_bucket *bh;
-	struct list_head *head;
+	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
+	struct list_head *head;
+	union futex_key key;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
@@ -302,19 +624,21 @@
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh = hash_futex(&key);
-	spin_lock(&bh->lock);
-	head = &bh->chain;
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+	head = &hb->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
+			if (this->pi_state)
+				return -EINVAL;
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
 
-	spin_unlock(&bh->lock);
+	spin_unlock(&hb->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -324,10 +648,12 @@
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+static int
+futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *bh1, *bh2;
+	struct futex_hash_bucket *hb1, *hb2;
 	struct list_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret, attempt = 0;
@@ -342,27 +668,29 @@
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
 
 retry:
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
+	if (hb1 < hb2)
+		spin_lock(&hb1->lock);
+	spin_lock(&hb2->lock);
+	if (hb1 > hb2)
+		spin_lock(&hb1->lock);
 
-	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
-		int dummy;
+		u32 dummy;
 
-		spin_unlock(&bh1->lock);
-		if (bh1 != bh2)
-			spin_unlock(&bh2->lock);
+		spin_unlock(&hb1->lock);
+		if (hb1 != hb2)
+			spin_unlock(&hb2->lock);
 
 #ifndef CONFIG_MMU
-		/* we don't get EFAULT from MMU faults if we don't have an MMU,
-		 * but we might get them from range checking */
+		/*
+		 * we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking
+		 */
 		ret = op_ret;
 		goto out;
 #endif
@@ -372,47 +700,34 @@
 			goto out;
 		}
 
-		/* futex_atomic_op_inuser needs to both read and write
+		/*
+		 * futex_atomic_op_inuser needs to both read and write
 		 * *(int __user *)uaddr2, but we can't modify it
 		 * non-atomically.  Therefore, if get_user below is not
 		 * enough, we need to handle the fault ourselves, while
-		 * still holding the mmap_sem.  */
+		 * still holding the mmap_sem.
+		 */
 		if (attempt++) {
-			struct vm_area_struct * vma;
-			struct mm_struct *mm = current->mm;
-
-			ret = -EFAULT;
-			if (attempt >= 2 ||
-			    !(vma = find_vma(mm, uaddr2)) ||
-			    vma->vm_start > uaddr2 ||
-			    !(vma->vm_flags & VM_WRITE))
+			if (futex_handle_fault((unsigned long)uaddr2,
+					       attempt))
 				goto out;
-
-			switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
-			case VM_FAULT_MINOR:
-				current->min_flt++;
-				break;
-			case VM_FAULT_MAJOR:
-				current->maj_flt++;
-				break;
-			default:
-				goto out;
-			}
 			goto retry;
 		}
 
-		/* If we would have faulted, release mmap_sem,
-		 * fault it in and start all over again.  */
+		/*
+		 * If we would have faulted, release mmap_sem,
+		 * fault it in and start all over again.
+		 */
 		up_read(&current->mm->mmap_sem);
 
-		ret = get_user(dummy, (int __user *)uaddr2);
+		ret = get_user(dummy, uaddr2);
 		if (ret)
 			return ret;
 
 		goto retryfull;
 	}
 
-	head = &bh1->chain;
+	head = &hb1->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
@@ -423,7 +738,7 @@
 	}
 
 	if (op_ret > 0) {
-		head = &bh2->chain;
+		head = &hb2->chain;
 
 		op_ret = 0;
 		list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +751,9 @@
 		ret += op_ret;
 	}
 
-	spin_unlock(&bh1->lock);
-	if (bh1 != bh2)
-		spin_unlock(&bh2->lock);
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -448,11 +763,11 @@
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-			 int nr_wake, int nr_requeue, int *valp)
+static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *bh1, *bh2;
+	struct futex_hash_bucket *hb1, *hb2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
@@ -467,68 +782,72 @@
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
 
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
+	if (hb1 < hb2)
+		spin_lock(&hb1->lock);
+	spin_lock(&hb2->lock);
+	if (hb1 > hb2)
+		spin_lock(&hb1->lock);
 
-	if (likely(valp != NULL)) {
-		int curval;
+	if (likely(cmpval != NULL)) {
+		u32 curval;
 
-		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+		ret = get_futex_value_locked(&curval, uaddr1);
 
 		if (unlikely(ret)) {
-			spin_unlock(&bh1->lock);
-			if (bh1 != bh2)
-				spin_unlock(&bh2->lock);
+			spin_unlock(&hb1->lock);
+			if (hb1 != hb2)
+				spin_unlock(&hb2->lock);
 
-			/* If we would have faulted, release mmap_sem, fault
+			/*
+			 * If we would have faulted, release mmap_sem, fault
 			 * it in and start all over again.
 			 */
 			up_read(&current->mm->mmap_sem);
 
-			ret = get_user(curval, (int __user *)uaddr1);
+			ret = get_user(curval, uaddr1);
 
 			if (!ret)
 				goto retry;
 
 			return ret;
 		}
-		if (curval != *valp) {
+		if (curval != *cmpval) {
 			ret = -EAGAIN;
 			goto out_unlock;
 		}
 	}
 
-	head1 = &bh1->chain;
+	head1 = &hb1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
 			continue;
 		if (++ret <= nr_wake) {
 			wake_futex(this);
 		} else {
-			list_move_tail(&this->list, &bh2->chain);
-			this->lock_ptr = &bh2->lock;
+			/*
+			 * If key1 and key2 hash to the same bucket, no need to
+			 * requeue.
+			 */
+			if (likely(head1 != &hb2->chain)) {
+				list_move_tail(&this->list, &hb2->chain);
+				this->lock_ptr = &hb2->lock;
+			}
 			this->key = key2;
 			get_key_refs(&key2);
 			drop_count++;
 
 			if (ret - nr_wake >= nr_requeue)
 				break;
-			/* Make sure to stop if key1 == key2 */
-			if (head1 == &bh2->chain && head1 != &next->list)
-				head1 = &this->list;
 		}
 	}
 
 out_unlock:
-	spin_unlock(&bh1->lock);
-	if (bh1 != bh2)
-		spin_unlock(&bh2->lock);
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 
 	/* drop_key_refs() must be called outside the spinlocks. */
 	while (--drop_count >= 0)
@@ -543,7 +862,7 @@
 static inline struct futex_hash_bucket *
 queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh;
+	struct futex_hash_bucket *hb;
 
 	q->fd = fd;
 	q->filp = filp;
@@ -551,23 +870,24 @@
 	init_waitqueue_head(&q->waiters);
 
 	get_key_refs(&q->key);
-	bh = hash_futex(&q->key);
-	q->lock_ptr = &bh->lock;
+	hb = hash_futex(&q->key);
+	q->lock_ptr = &hb->lock;
 
-	spin_lock(&bh->lock);
-	return bh;
+	spin_lock(&hb->lock);
+	return hb;
 }
 
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	list_add_tail(&q->list, &bh->chain);
-	spin_unlock(&bh->lock);
+	list_add_tail(&q->list, &hb->chain);
+	q->task = current;
+	spin_unlock(&hb->lock);
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	spin_unlock(&bh->lock);
+	spin_unlock(&hb->lock);
 	drop_key_refs(&q->key);
 }
 
@@ -579,16 +899,17 @@
 /* The key must be already stored in q->key. */
 static void queue_me(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh;
-	bh = queue_lock(q, fd, filp);
-	__queue_me(q, bh);
+	struct futex_hash_bucket *hb;
+
+	hb = queue_lock(q, fd, filp);
+	__queue_me(q, hb);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
-	int ret = 0;
 	spinlock_t *lock_ptr;
+	int ret = 0;
 
 	/* In the common case we don't take the spinlock, which is nice. */
  retry:
@@ -614,6 +935,9 @@
 		}
 		WARN_ON(list_empty(&q->list));
 		list_del(&q->list);
+
+		BUG_ON(q->pi_state);
+
 		spin_unlock(lock_ptr);
 		ret = 1;
 	}
@@ -622,21 +946,42 @@
 	return ret;
 }
 
-static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+/*
+ * PI futexes cannot be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	DECLARE_WAITQUEUE(wait, current);
-	int ret, curval;
-	struct futex_q q;
-	struct futex_hash_bucket *bh;
+	WARN_ON(list_empty(&q->list));
+	list_del(&q->list);
 
+	BUG_ON(!q->pi_state);
+	free_pi_state(q->pi_state);
+	q->pi_state = NULL;
+
+	spin_unlock(&hb->lock);
+
+	drop_key_refs(&q->key);
+}
+
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+{
+	struct task_struct *curr = current;
+	DECLARE_WAITQUEUE(wait, curr);
+	struct futex_hash_bucket *hb;
+	struct futex_q q;
+	u32 uval;
+	int ret;
+
+	q.pi_state = NULL;
  retry:
-	down_read(&current->mm->mmap_sem);
+	down_read(&curr->mm->mmap_sem);
 
 	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	bh = queue_lock(&q, -1, NULL);
+	hb = queue_lock(&q, -1, NULL);
 
 	/*
 	 * Access the page AFTER the futex is queued.
@@ -658,37 +1003,35 @@
 	 * We hold the mmap semaphore, so the mapping cannot have changed
 	 * since we looked it up in get_futex_key.
 	 */
-
-	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (unlikely(ret)) {
-		queue_unlock(&q, bh);
+		queue_unlock(&q, hb);
 
-		/* If we would have faulted, release mmap_sem, fault it in and
+		/*
+		 * If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		up_read(&curr->mm->mmap_sem);
 
-		ret = get_user(curval, (int __user *)uaddr);
+		ret = get_user(uval, uaddr);
 
 		if (!ret)
 			goto retry;
 		return ret;
 	}
-	if (curval != val) {
-		ret = -EWOULDBLOCK;
-		queue_unlock(&q, bh);
-		goto out_release_sem;
-	}
+	ret = -EWOULDBLOCK;
+	if (uval != val)
+		goto out_unlock_release_sem;
 
 	/* Only actually queue if *uaddr contained val.  */
-	__queue_me(&q, bh);
+	__queue_me(&q, hb);
 
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
-	 */	
-	up_read(&current->mm->mmap_sem);
+	 */
+	up_read(&curr->mm->mmap_sem);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1063,421 @@
 		return 0;
 	if (time == 0)
 		return -ETIMEDOUT;
-	/* We expect signal_pending(current), but another thread may
-	 * have handled it for us already. */
+	/*
+	 * We expect signal_pending(current), but another thread may
+	 * have handled it for us already.
+	 */
 	return -EINTR;
 
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
  out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+			    struct hrtimer_sleeper *to)
+{
+	struct task_struct *curr = current;
+	struct futex_hash_bucket *hb;
+	u32 uval, newval, curval;
+	struct futex_q q;
+	int ret, attempt = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+	q.pi_state = NULL;
+ retry:
+	down_read(&curr->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &q.key);
+	if (unlikely(ret != 0))
+		goto out_release_sem;
+
+	hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = current->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+
+	/* We own the lock already */
+	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+		if (!detect && 0)
+			force_sig(SIGKILL, current);
+		ret = -EDEADLK;
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Surprise - we got the lock. Just return
+	 * to userspace:
+	 */
+	if (unlikely(!curval))
+		goto out_unlock_release_sem;
+
+	uval = curval;
+	newval = uval | FUTEX_WAITERS;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+	if (unlikely(curval != uval))
+		goto retry_locked;
+
+	/*
+	 * We don't have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, &q);
+
+	if (unlikely(ret)) {
+		/*
+		 * There were no waiters and the owner task lookup
+		 * failed. When the OWNER_DIED bit is set, then we
+		 * know that this is a robust futex and we actually
+		 * take the lock. This is safe as we are protected by
+		 * the hash bucket lock. We also set the waiters bit
+		 * unconditionally here, to simplify glibc handling of
+		 * multiple tasks racing to acquire the lock and
+		 * clean up the problems which were left by the dead
+		 * owner.
+		 */
+		if (curval & FUTEX_OWNER_DIED) {
+			uval = newval;
+			newval = current->pid |
+				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+			inc_preempt_count();
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			dec_preempt_count();
+
+			if (unlikely(curval == -EFAULT))
+				goto uaddr_faulted;
+			if (unlikely(curval != uval))
+				goto retry_locked;
+			ret = 0;
+		}
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Only actually queue now that the atomic ops are done:
+	 */
+	__queue_me(&q, hb);
+
+	/*
+	 * Now the futex is queued and we have checked the data, we
+	 * don't want to hold mmap_sem while we sleep.
+	 */
+	up_read(&curr->mm->mmap_sem);
+
+	WARN_ON(!q.pi_state);
+	/*
+	 * Block on the PI mutex:
+	 */
+	if (!trylock)
+		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+	else {
+		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+		/* Fixup the trylock return value: */
+		ret = ret ? 0 : -EWOULDBLOCK;
+	}
+
+	down_read(&curr->mm->mmap_sem);
+	hb = queue_lock(&q, -1, NULL);
+
+	/*
+	 * Got the lock. We might not be the anticipated owner if we
+	 * did a lock-steal - fix up the PI-state in that case.
+	 */
+	if (!ret && q.pi_state->owner != curr) {
+		u32 newtid = current->pid | FUTEX_WAITERS;
+
+		/* Owner died? */
+		if (q.pi_state->owner != NULL) {
+			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			list_del_init(&q.pi_state->list);
+			spin_unlock_irq(&q.pi_state->owner->pi_lock);
+		} else
+			newtid |= FUTEX_OWNER_DIED;
+
+		q.pi_state->owner = current;
+
+		spin_lock_irq(&current->pi_lock);
+		list_add(&q.pi_state->list, &current->pi_state_list);
+		spin_unlock_irq(&current->pi_lock);
+
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+		/*
+		 * We own it, so we have to replace the pending owner
+		 * TID. This must be atomic, as we have to preserve the
+		 * owner died bit here.
+		 */
+		ret = get_user(uval, uaddr);
+		while (!ret) {
+			newval = (uval & FUTEX_OWNER_DIED) | newtid;
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			if (curval == -EFAULT)
+				ret = -EFAULT;
+			if (curval == uval)
+				break;
+			uval = curval;
+		}
+	} else {
+		/*
+		 * Catch the rare case, where the lock was released
+		 * when we were on the way back before we locked
+		 * the hash bucket.
+		 */
+		if (ret && q.pi_state->owner == curr) {
+			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+				ret = 0;
+		}
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+	}
+
+	if (!detect && ret == -EDEADLK && 0)
+		force_sig(SIGKILL, current);
+
+	return ret;
+
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
+ out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+
+ uaddr_faulted:
+	/*
+	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
+	 * non-atomically.  Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock_release_sem;
+
+		goto retry_locked;
+	}
+
+	queue_unlock(&q, hb);
+	up_read(&curr->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	int ret;
+
+	restart->fn = do_no_restart_syscall;
+
+	if (restart->arg2 || restart->arg3) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
+			(u64) restart->arg0;
+	}
+
+	pr_debug("lock_pi restart: %p, %d (%d)\n",
+		 (u32 __user *)restart->arg0, current->pid);
+
+	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+			       0, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	restart->fn = futex_lock_pi_restart;
+
+	/* The other values are filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+			 long nsec, int trylock)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct restart_block *restart;
+	int ret;
+
+	if (sec != MAX_SCHEDULE_TIMEOUT) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires = ktime_set(sec, nsec);
+	}
+
+	ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = futex_lock_pi_restart;
+	restart->arg0 = (unsigned long) uaddr;
+	restart->arg1 = detect;
+	if (to) {
+		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+		restart->arg3 = to->timer.expires.tv64 >> 32;
+	} else
+		restart->arg2 = restart->arg3 = 0;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+	struct futex_hash_bucket *hb;
+	struct futex_q *this, *next;
+	u32 uval;
+	struct list_head *head;
+	union futex_key key;
+	int ret, attempt = 0;
+
+retry:
+	if (get_user(uval, uaddr))
+		return -EFAULT;
+	/*
+	 * We release only a lock we actually own:
+	 */
+	if ((uval & FUTEX_TID_MASK) != current->pid)
+		return -EPERM;
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+
+retry_locked:
+	/*
+	 * To avoid races, try to do the TID -> 0 atomic transition
+	 * again. If it succeeds then we can return without waking
+	 * anyone else up:
+	 */
+	inc_preempt_count();
+	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+	dec_preempt_count();
+
+	if (unlikely(uval == -EFAULT))
+		goto pi_faulted;
+	/*
+	 * Rare case: we managed to release the lock atomically,
+	 * no need to wake anyone else up:
+	 */
+	if (unlikely(uval == current->pid))
+		goto out_unlock;
+
+	/*
+	 * Ok, other tasks may need to be woken up - check waiters
+	 * and do the wakeup if necessary:
+	 */
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (!match_futex (&this->key, &key))
+			continue;
+		ret = wake_futex_pi(uaddr, uval, this);
+		/*
+		 * The atomic access to the futex value
+		 * generated a pagefault, so retry the
+		 * user-access and the wakeup:
+		 */
+		if (ret == -EFAULT)
+			goto pi_faulted;
+		goto out_unlock;
+	}
+	/*
+	 * No waiters - kernel unlocks the futex:
+	 */
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;
+
+out_unlock:
+	spin_unlock(&hb->lock);
+out:
 	up_read(&current->mm->mmap_sem);
+
+	return ret;
+
+pi_faulted:
+	/*
+	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
+	 * non-atomically.  Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock;
+
+		goto retry_locked;
+	}
+
+	spin_unlock(&hb->lock);
+	up_read(&current->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
 	return ret;
 }
 
@@ -735,6 +1487,7 @@
 
 	unqueue_me(q);
 	kfree(q);
+
 	return 0;
 }
 
@@ -766,7 +1519,7 @@
  * Signal allows caller to avoid the race which would occur if they
  * set the sigio stuff up afterwards.
  */
-static int futex_fd(unsigned long uaddr, int signal)
+static int futex_fd(u32 __user *uaddr, int signal)
 {
 	struct futex_q *q;
 	struct file *filp;
@@ -803,6 +1556,7 @@
 		err = -ENOMEM;
 		goto error;
 	}
+	q->pi_state = NULL;
 
 	down_read(&current->mm->mmap_sem);
 	err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1594,7 @@
  * Implementation: user-space maintains a per-thread list of locks it
  * is holding. Upon do_exit(), the kernel carefully walks this list,
  * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  * always manipulated with the lock held, so the list is private and
  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  * field, to allow the kernel to clean up if the thread dies after
@@ -915,7 +1669,7 @@
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	u32 uval;
+	u32 uval, nval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -932,12 +1686,16 @@
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
-					 uval | FUTEX_OWNER_DIED) != uval)
+		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+						     uval | FUTEX_OWNER_DIED);
+		if (nval == -EFAULT)
+			return -1;
+
+		if (nval != uval)
 			goto retry;
 
 		if (uval & FUTEX_WAITERS)
-			futex_wake((unsigned long)uaddr, 1);
+			futex_wake(uaddr, 1);
 	}
 	return 0;
 }
@@ -978,7 +1736,7 @@
 	while (entry != &head->list) {
 		/*
 		 * A pending lock might already be on the list, so
-		 * dont process it twice:
+		 * don't process it twice:
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1757,8 @@
 	}
 }
 
-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2, int val3)
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+		u32 __user *uaddr2, u32 val2, u32 val3)
 {
 	int ret;
 
@@ -1024,6 +1782,15 @@
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
+	case FUTEX_LOCK_PI:
+		ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+		break;
+	case FUTEX_UNLOCK_PI:
+		ret = futex_unlock_pi(uaddr);
+		break;
+	case FUTEX_TRYLOCK_PI:
+		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1031,29 +1798,33 @@
 }
 
 
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 			  struct timespec __user *utime, u32 __user *uaddr2,
-			  int val3)
+			  u32 val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-	int val2 = 0;
+	u32 val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op >= FUTEX_REQUEUE)
-		val2 = (int) (unsigned long) utime;
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+		val2 = (u32) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
 
 static int futexfs_get_sb(struct file_system_type *fs_type,
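
With FUTEX_LOCK_PI, FUTEX_TRYLOCK_PI and FUTEX_UNLOCK_PI now dispatched by do_futex(), the expected userspace fast path is worth sketching: the lock word holds 0 when free and the owner's TID when held, and the kernel is entered only on contention, at which point it sets FUTEX_WAITERS and applies priority inheritance. The sketch below is illustrative only; the helper names are made up, the timeout argument is omitted, and only the opcodes and the TID-in-lock-word convention come from the PI-futex design.

/* Sketch (illustration only): the userspace side of a PI futex.  The lock
 * word is 0 when free and holds the owner TID when held; the kernel is
 * entered only on contention.  Error handling is skipped. */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int pi_lockword;		/* shared futex word */

static long futex_op(unsigned int *uaddr, int op)
{
	return syscall(SYS_futex, uaddr, op, 0, NULL, NULL, 0);
}

static void pi_lock(void)
{
	unsigned int tid = (unsigned int)syscall(SYS_gettid);

	/* Fast path: 0 -> TID, no kernel entry. */
	if (__sync_val_compare_and_swap(&pi_lockword, 0, tid) != 0)
		futex_op(&pi_lockword, FUTEX_LOCK_PI);	/* contended */
}

static void pi_unlock(void)
{
	unsigned int tid = (unsigned int)syscall(SYS_gettid);

	/* Fast path: TID -> 0.  If FUTEX_WAITERS got set, let the kernel
	 * hand the lock to the top waiter with PI handling. */
	if (__sync_val_compare_and_swap(&pi_lockword, tid, 0) != tid)
		futex_op(&pi_lockword, FUTEX_UNLOCK_PI);
}
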
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0e..d1d92b4 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (get_compat_timespec(&t, utime))
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 55601b3..8d3dc29 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -833,7 +833,7 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int hrtimer_cpu_notify(struct notifier_block *self,
+static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -857,7 +857,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block hrtimers_nb = {
+static struct notifier_block __devinitdata hrtimers_nb = {
 	.notifier_call = hrtimer_cpu_notify,
 };
 
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 036b628..e38e4ba 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
@@ -381,7 +382,7 @@
 
 void debug_mutex_init_waiter(struct mutex_waiter *waiter)
 {
-	memset(waiter, 0x11, sizeof(*waiter));
+	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
 	waiter->magic = waiter;
 	INIT_LIST_HEAD(&waiter->list);
 }
@@ -397,7 +398,7 @@
 void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 {
 	DEBUG_WARN_ON(!list_empty(&waiter->list));
-	memset(waiter, 0x22, sizeof(*waiter));
+	memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
 }
 
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index fc311a4..857b4fa 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -38,13 +38,22 @@
 
 config PM_TRACE
 	bool "Suspend/resume event tracing"
-	depends on PM && PM_DEBUG && X86_32
-	default y
+	depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
+	default n
 	---help---
 	This enables some cheesy code to save the last PM event point in the
 	RTC across reboots, so that you can debug a machine that just hangs
 	during suspend (or more commonly, during resume).
 
+	To use this debugging feature you should attempt to suspend the machine,
+	then reboot it, then run
+
+		dmesg -s 1000000 | grep 'hash matches'
+
+	CAUTION: this option will cause your machine's real-time clock to be
+	set to an invalid time after a resume.
+
+
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
 	depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe12..5a730fd 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -299,7 +299,7 @@
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int profile_cpu_callback(struct notifier_block *info,
+static int __devinit profile_cpu_callback(struct notifier_block *info,
 					unsigned long action, void *__cpu)
 {
 	int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710..f464f5a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@
 	return rcu_ctrlblk.completed;
 }
 
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_ctrlblk.completed;
+}
+
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@
 	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
 }
 
-static int rcu_cpu_notify(struct notifier_block *self,
+static int __devinit rcu_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -556,7 +565,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block rcu_nb = {
+static struct notifier_block __devinitdata rcu_nb = {
 	.notifier_call	= rcu_cpu_notify,
 };
 
@@ -619,6 +628,7 @@
 module_param(rsinterval, int, 0);
 #endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 EXPORT_SYMBOL_GPL(call_rcu);
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e75..4d1c3d2 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
 /*
- * Read-Copy Update /proc-based torture test facility
+ * Read-Copy Update module-based torture test facility
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@
 static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static char *torture_type = "rcu"; /* What to torture. */
 
 module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-#define TORTURE_FLAG "rcutorture: "
+module_param(torture_type, charp, 0);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
+
+#define TORTURE_FLAG "-torture:"
 #define PRINTK_STRING(s) \
-	do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+	do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
 #define VERBOSE_PRINTK_STRING(s) \
-	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+	do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
 #define VERBOSE_PRINTK_ERRSTRING(s) \
-	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+	do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
 
 static char printk_buf[4096];
 
@@ -139,28 +143,6 @@
 	spin_unlock_bh(&rcu_torture_lock);
 }
 
-static void
-rcu_torture_cb(struct rcu_head *p)
-{
-	int i;
-	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
-
-	if (fullstop) {
-		/* Test is ending, just drop callbacks on the floor. */
-		/* The next initialization will pick up the pieces. */
-		return;
-	}
-	i = rp->rtort_pipe_count;
-	if (i > RCU_TORTURE_PIPE_LEN)
-		i = RCU_TORTURE_PIPE_LEN;
-	atomic_inc(&rcu_torture_wcount[i]);
-	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
-		rp->rtort_mbtest = 0;
-		rcu_torture_free(rp);
-	} else
-		call_rcu(p, rcu_torture_cb);
-}
-
 struct rcu_random_state {
 	unsigned long rrs_state;
 	unsigned long rrs_count;
@@ -191,6 +173,119 @@
 }
 
 /*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+	void (*init)(void);
+	void (*cleanup)(void);
+	int (*readlock)(void);
+	void (*readunlock)(int idx);
+	int (*completed)(void);
+	void (*deferredfree)(struct rcu_torture *p);
+	int (*stats)(char *page);
+	char *name;
+};
+static struct rcu_torture_ops *cur_ops = NULL;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void)
+{
+	rcu_read_lock();
+	return 0;
+}
+
+static void rcu_torture_read_unlock(int idx)
+{
+	rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+	return rcu_batches_completed();
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+	int i;
+	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+	if (fullstop) {
+		/* Test is ending, just drop callbacks on the floor. */
+		/* The next initialization will pick up the pieces. */
+		return;
+	}
+	i = rp->rtort_pipe_count;
+	if (i > RCU_TORTURE_PIPE_LEN)
+		i = RCU_TORTURE_PIPE_LEN;
+	atomic_inc(&rcu_torture_wcount[i]);
+	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+		rp->rtort_mbtest = 0;
+		rcu_torture_free(rp);
+	} else
+		cur_ops->deferredfree(rp);
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+	call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+	.init = NULL,
+	.cleanup = NULL,
+	.readlock = rcu_torture_read_lock,
+	.readunlock = rcu_torture_read_unlock,
+	.completed = rcu_torture_completed,
+	.deferredfree = rcu_torture_deferred_free,
+	.stats = NULL,
+	.name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void)
+{
+	rcu_read_lock_bh();
+	return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx)
+{
+	rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+	return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+	call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+	.init = NULL,
+	.cleanup = NULL,
+	.readlock = rcu_bh_torture_read_lock,
+	.readunlock = rcu_bh_torture_read_unlock,
+	.completed = rcu_bh_torture_completed,
+	.deferredfree = rcu_bh_torture_deferred_free,
+	.stats = NULL,
+	.name = "rcu_bh"
+};
+
+static struct rcu_torture_ops *torture_ops[] =
+	{ &rcu_ops, &rcu_bh_ops, NULL };
+
+/*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
  * for that pointed to by rcu_torture_current, freeing the old structure
  * after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@
 
 	do {
 		schedule_timeout_uninterruptible(1);
-		if (rcu_batches_completed() == oldbatch)
-			continue;
 		if ((rp = rcu_torture_alloc()) == NULL)
 			continue;
 		rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@
 				i = RCU_TORTURE_PIPE_LEN;
 			atomic_inc(&rcu_torture_wcount[i]);
 			old_rp->rtort_pipe_count++;
-			call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+			cur_ops->deferredfree(old_rp);
 		}
 		rcu_torture_current_version++;
-		oldbatch = rcu_batches_completed();
+		oldbatch = cur_ops->completed();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
 	while (!kthread_should_stop())
@@ -246,6 +339,7 @@
 rcu_torture_reader(void *arg)
 {
 	int completed;
+	int idx;
 	DEFINE_RCU_RANDOM(rand);
 	struct rcu_torture *p;
 	int pipe_count;
@@ -254,12 +348,12 @@
 	set_user_nice(current, 19);
 
 	do {
-		rcu_read_lock();
-		completed = rcu_batches_completed();
+		idx = cur_ops->readlock();
+		completed = cur_ops->completed();
 		p = rcu_dereference(rcu_torture_current);
 		if (p == NULL) {
 			/* Wait for rcu_torture_writer to get underway */
-			rcu_read_unlock();
+			cur_ops->readunlock(idx);
 			schedule_timeout_interruptible(HZ);
 			continue;
 		}
@@ -273,14 +367,14 @@
 			pipe_count = RCU_TORTURE_PIPE_LEN;
 		}
 		++__get_cpu_var(rcu_torture_count)[pipe_count];
-		completed = rcu_batches_completed() - completed;
+		completed = cur_ops->completed() - completed;
 		if (completed > RCU_TORTURE_PIPE_LEN) {
 			/* Should not happen, but... */
 			completed = RCU_TORTURE_PIPE_LEN;
 		}
 		++__get_cpu_var(rcu_torture_batch)[completed];
 		preempt_enable();
-		rcu_read_unlock();
+		cur_ops->readunlock(idx);
 		schedule();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@
 		if (pipesummary[i] != 0)
 			break;
 	}
-	cnt += sprintf(&page[cnt], "rcutorture: ");
+	cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
 	cnt += sprintf(&page[cnt],
 		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
 		       "rtmbe: %d",
@@ -324,7 +418,7 @@
 		       atomic_read(&n_rcu_torture_mberror));
 	if (atomic_read(&n_rcu_torture_mberror) != 0)
 		cnt += sprintf(&page[cnt], " !!!");
-	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
 	if (i > 1) {
 		cnt += sprintf(&page[cnt], "!!! ");
 		atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@
 	cnt += sprintf(&page[cnt], "Reader Pipe: ");
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
-	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
 	cnt += sprintf(&page[cnt], "Reader Batch: ");
-	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
-	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
 	cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 		cnt += sprintf(&page[cnt], " %d",
 			       atomic_read(&rcu_torture_wcount[i]));
 	}
 	cnt += sprintf(&page[cnt], "\n");
+	if (cur_ops->stats != NULL)
+		cnt += cur_ops->stats(&page[cnt]);
 	return cnt;
 }
 
@@ -444,11 +540,11 @@
 static inline void
 rcu_torture_print_module_parms(char *tag)
 {
-	printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
+	printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
 		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
 		"shuffle_interval = %d\n",
-		tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
-		shuffle_interval);
+		torture_type, tag, nrealreaders, stat_interval, verbose,
+		test_no_idle_hz, shuffle_interval);
 }
 
 static void
@@ -493,6 +589,9 @@
 	rcu_barrier();
 
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
+
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
 	if (atomic_read(&n_rcu_torture_error))
 		rcu_torture_print_module_parms("End of test: FAILURE");
 	else
@@ -508,6 +607,20 @@
 
 	/* Process args and tell the world that the torturer is on the job. */
 
+	for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
+		cur_ops = torture_ops[i];
+		if (strcmp(torture_type, cur_ops->name) == 0) {
+			break;
+		}
+	}
+	if (cur_ops == NULL) {
+		printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+		       torture_type);
+		return (-EINVAL);
+	}
+	if (cur_ops->init != NULL)
+		cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
 	if (nreaders >= 0)
 		nrealreaders = nreaders;
 	else
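
The rcu_torture_ops vector introduced above turns rcutorture into a small plug-in framework: each RCU flavor provides its read-side, grace-period-counting and deferred-free callbacks, and the torture_type module parameter picks one entry by name at load time. As a sketch of how a further flavor would be wired in (the callbacks below are placeholders that merely mirror the shape of rcu_ops and rcu_bh_ops, not a real RCU flavor):

/* Sketch (illustration only): shape of an additional rcu_torture_ops
 * entry.  A real flavor would call its own read-side and callback
 * primitives in place of these placeholders. */
static int example_torture_read_lock(void)
{
	/* enter the flavor's read-side critical section */
	return 0;			/* index handed back to readunlock */
}

static void example_torture_read_unlock(int idx)
{
	/* leave the read-side critical section entered above */
}

static int example_torture_completed(void)
{
	/* return the flavor's count of completed grace periods */
	return 0;
}

static void example_torture_deferred_free(struct rcu_torture *p)
{
	/* queue p to be freed after one of the flavor's grace periods */
}

static struct rcu_torture_ops example_ops = {
	.init		= NULL,
	.cleanup	= NULL,
	.readlock	= example_torture_read_lock,
	.readunlock	= example_torture_read_unlock,
	.completed	= example_torture_completed,
	.deferredfree	= example_torture_deferred_free,
	.stats		= NULL,
	.name		= "example"
};

/* The entry would then be added to torture_ops[] so that
 * "modprobe rcutorture torture_type=example" selects it. */
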
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fc..2404f9b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -232,6 +232,44 @@
 
 EXPORT_SYMBOL(release_resource);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Finds the lowest memory resource that exists within [res->start, res->end).
+ * The caller must specify res->start, res->end and res->flags.
+ * If found, returns 0 and overwrites *res; if not found, returns -1.
+ */
+int find_next_system_ram(struct resource *res)
+{
+	resource_size_t start, end;
+	struct resource *p;
+
+	BUG_ON(!res);
+
+	start = res->start;
+	end = res->end;
+
+	read_lock(&resource_lock);
+	for (p = iomem_resource.child; p ; p = p->sibling) {
+		/* system ram is just marked as IORESOURCE_MEM */
+		if (p->flags != res->flags)
+			continue;
+		if (p->start > end) {
+			p = NULL;
+			break;
+		}
+		if (p->start >= start)
+			break;
+	}
+	read_unlock(&resource_lock);
+	if (!p)
+		return -1;
+	/* copy data */
+	res->start = p->start;
+	res->end = p->end;
+	return 0;
+}
+#endif
+
 /*
  * Find empty slot in the resource tree given range and alignment.
  */
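
find_next_system_ram() is a building block for memory hotplug: callers describe a search window in *res and repeatedly advance it past each range the helper reports. A sketch of such a walk follows; the callback name is made up, and clamping the reported range to the window is left out for brevity.

/* Sketch (illustration only): walk every "System RAM" range overlapping
 * [start, end] by repeatedly advancing the search window past each hit.
 * handle_range() is a hypothetical callback. */
static void walk_system_ram_sketch(resource_size_t start, resource_size_t end,
				   void (*handle_range)(resource_size_t s,
							resource_size_t e))
{
	struct resource res;

	res.start = start;
	res.end = end;
	res.flags = IORESOURCE_MEM;	/* System RAM is plain IORESOURCE_MEM */

	while (res.start <= end && find_next_system_ram(&res) == 0) {
		handle_range(res.start, res.end);
		if (res.end >= end)
			break;
		res.start = res.end + 1;	/* continue after this range */
		res.end = end;			/* restore the upper bound */
	}
}
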
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 0000000..4aa8a2c9
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ *  Copyright (C) 2004  LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Copyright (C) 2006  Esben Nielsen
+ *  Copyright (C) 2006  Kihon Technologies Inc.,
+ *			Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+# define TRACE_WARN_ON(x)			WARN_ON(x)
+# define TRACE_BUG_ON(x)			BUG_ON(x)
+
+# define TRACE_OFF()						\
+do {								\
+	if (rt_trace_on) {					\
+		rt_trace_on = 0;				\
+		console_verbose();				\
+		if (spin_is_locked(&current->pi_lock))		\
+			spin_unlock(&current->pi_lock);		\
+		if (spin_is_locked(&current->held_list_lock))	\
+			spin_unlock(&current->held_list_lock);	\
+	}							\
+} while (0)
+
+# define TRACE_OFF_NOLOCK()					\
+do {								\
+	if (rt_trace_on) {					\
+		rt_trace_on = 0;				\
+		console_verbose();				\
+	}							\
+} while (0)
+
+# define TRACE_BUG_LOCKED()			\
+do {						\
+	TRACE_OFF();				\
+	BUG();					\
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c)		\
+do {						\
+	if (unlikely(c)) {			\
+		TRACE_OFF();			\
+		WARN_ON(1);			\
+	}					\
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c)			\
+do {						\
+	if (unlikely(c))			\
+		TRACE_BUG_LOCKED();		\
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int rt_trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+	rt_trace_on = 0;
+}
+
+static void printk_task(task_t *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_task_short(task_t *p)
+{
+	if (p)
+		printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+	if (lock->name)
+		printk(" [%p] {%s}\n",
+			lock, lock->name);
+	else
+		printk(" [%p] {%s:%d}\n",
+			lock, lock->file, lock->line);
+
+	if (print_owner && rt_mutex_owner(lock)) {
+		printk(".. ->owner: %p\n", lock->owner);
+		printk(".. held by:  ");
+		printk_task(rt_mutex_owner(lock));
+		printk("\n");
+	}
+	if (rt_mutex_owner(lock)) {
+		printk("... acquired at:               ");
+		print_symbol("%s\n", lock->acquire_ip);
+	}
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+	printk("-------------------------\n");
+	printk("| waiter struct %p:\n", w);
+	printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+	       w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
+	       w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
+	       w->list_entry.prio);
+	printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+	       w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
+	       w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
+	       w->pi_list_entry.prio);
+	printk("\n| lock:\n");
+	printk_lock(w->lock, 1);
+	printk("| w->ti->task:\n");
+	printk_task(w->task);
+	printk("| blocked at:  ");
+	print_symbol("%s\n", w->ip);
+	printk("-------------------------\n");
+}
+
+static void show_task_locks(task_t *p)
+{
+	switch (p->state) {
+	case TASK_RUNNING:		printk("R"); break;
+	case TASK_INTERRUPTIBLE:	printk("S"); break;
+	case TASK_UNINTERRUPTIBLE:	printk("D"); break;
+	case TASK_STOPPED:		printk("T"); break;
+	case EXIT_ZOMBIE:		printk("Z"); break;
+	case EXIT_DEAD:			printk("X"); break;
+	default:			printk("?"); break;
+	}
+	printk_task(p);
+	if (p->pi_blocked_on) {
+		struct rt_mutex *lock = p->pi_blocked_on->lock;
+
+		printk(" blocked on:");
+		printk_lock(lock, 1);
+	} else
+		printk(" (not blocked)\n");
+}
+
+void rt_mutex_show_held_locks(task_t *task, int verbose)
+{
+	struct list_head *curr, *cursor = NULL;
+	struct rt_mutex *lock;
+	task_t *t;
+	unsigned long flags;
+	int count = 0;
+
+	if (!rt_trace_on)
+		return;
+
+	if (verbose) {
+		printk("------------------------------\n");
+		printk("| showing all locks held by: |  (");
+		printk_task_short(task);
+		printk("):\n");
+		printk("------------------------------\n");
+	}
+
+next:
+	spin_lock_irqsave(&task->held_list_lock, flags);
+	list_for_each(curr, &task->held_list_head) {
+		if (cursor && curr != cursor)
+			continue;
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+		t = rt_mutex_owner(lock);
+		WARN_ON(t != task);
+		count++;
+		cursor = curr->next;
+		spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+		printk("\n#%03d:            ", count);
+		printk_lock(lock, 0);
+		goto next;
+	}
+	spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+	printk("\n");
+}
+
+void rt_mutex_show_all_locks(void)
+{
+	task_t *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	printk("\n");
+	printk("----------------------\n");
+	printk("| showing all tasks: |\n");
+	printk("----------------------\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		show_task_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+
+	printk("-----------------------------------------\n");
+	printk("| showing all locks held in the system: |\n");
+	printk("-----------------------------------------\n");
+
+	do_each_thread(g, p) {
+		rt_mutex_show_held_locks(p, 0);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+void rt_mutex_debug_check_no_locks_held(task_t *task)
+{
+	struct rt_mutex_waiter *w;
+	struct list_head *curr;
+	struct rt_mutex *lock;
+
+	if (!rt_trace_on)
+		return;
+	if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
+		printk("BUG: PI priority boost leaked!\n");
+		printk_task(task);
+		printk("\n");
+	}
+	if (list_empty(&task->held_list_head))
+		return;
+
+	spin_lock(&task->pi_lock);
+	plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
+		TRACE_OFF();
+
+		printk("hm, PI interest held at exit time? Task:\n");
+		printk_task(task);
+		printk_waiter(w);
+		return;
+	}
+	spin_unlock(&task->pi_lock);
+
+	list_for_each(curr, &task->held_list_head) {
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+
+		printk("BUG: %s/%d, lock held at task exit time!\n",
+		       task->comm, task->pid);
+		printk_lock(lock, 1);
+		if (rt_mutex_owner(lock) != task)
+			printk("exiting task is not even the owner??\n");
+	}
+}
+
+int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+	const void *to = from + len;
+	struct list_head *curr;
+	struct rt_mutex *lock;
+	unsigned long flags;
+	void *lock_addr;
+
+	if (!rt_trace_on)
+		return 0;
+
+	spin_lock_irqsave(&current->held_list_lock, flags);
+	list_for_each(curr, &current->held_list_head) {
+		lock = list_entry(curr, struct rt_mutex, held_list_entry);
+		lock_addr = lock;
+		if (lock_addr < from || lock_addr >= to)
+			continue;
+		TRACE_OFF();
+
+		printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+			current->comm, current->pid, lock, from, to);
+		dump_stack();
+		printk_lock(lock, 1);
+		if (rt_mutex_owner(lock) != current)
+			printk("freeing task is not even the owner??\n");
+		return 1;
+	}
+	spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+	return 0;
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+	WARN_ON(!plist_head_empty(&task->pi_waiters));
+	WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+			     struct rt_mutex *lock)
+{
+	struct task_struct *task;
+
+	if (!rt_trace_on || detect || !act_waiter)
+		return;
+
+	task = rt_mutex_owner(act_waiter->lock);
+	if (task && task != current) {
+		act_waiter->deadlock_task_pid = task->pid;
+		act_waiter->deadlock_lock = lock;
+	}
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+	struct task_struct *task;
+
+	if (!waiter->deadlock_lock || !rt_trace_on)
+		return;
+
+	task = find_task_by_pid(waiter->deadlock_task_pid);
+	if (!task)
+		return;
+
+	TRACE_OFF_NOLOCK();
+
+	printk("\n============================================\n");
+	printk(  "[ BUG: circular locking deadlock detected! ]\n");
+	printk(  "--------------------------------------------\n");
+	printk("%s/%d is deadlocking current task %s/%d\n\n",
+	       task->comm, task->pid, current->comm, current->pid);
+
+	printk("\n1) %s/%d is trying to acquire this lock:\n",
+	       current->comm, current->pid);
+	printk_lock(waiter->lock, 1);
+
+	printk("... trying at:                 ");
+	print_symbol("%s\n", waiter->ip);
+
+	printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
+	printk_lock(waiter->deadlock_lock, 1);
+
+	rt_mutex_show_held_locks(current, 1);
+	rt_mutex_show_held_locks(task, 1);
+
+	printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+	show_stack(task, NULL);
+	printk("\n%s/%d's [current] stackdump:\n\n",
+	       current->comm, current->pid);
+	dump_stack();
+	rt_mutex_show_all_locks();
+	printk("[ turning off deadlock detection. "
+	       "Please report this trace. ]\n\n");
+	local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&current->held_list_lock, flags);
+		list_add_tail(&lock->held_list_entry, &current->held_list_head);
+		spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+		TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&current->held_list_lock, flags);
+		list_del_init(&lock->held_list_entry);
+		spin_unlock_irqrestore(&current->held_list_lock, flags);
+	}
+}
+
+void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+			       struct task_struct *powner __IP_DECL__)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&powner->held_list_lock, flags);
+		list_add_tail(&lock->held_list_entry, &powner->held_list_head);
+		spin_unlock_irqrestore(&powner->held_list_lock, flags);
+
+		lock->acquire_ip = ip;
+	}
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+	unsigned long flags;
+
+	if (rt_trace_on) {
+		struct task_struct *owner = rt_mutex_owner(lock);
+
+		TRACE_WARN_ON_LOCKED(!owner);
+		TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+		spin_lock_irqsave(&owner->held_list_lock, flags);
+		list_del_init(&lock->held_list_entry);
+		spin_unlock_irqrestore(&owner->held_list_lock, flags);
+	}
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+	memset(waiter, 0x11, sizeof(*waiter));
+	plist_node_init(&waiter->list_entry, MAX_PRIO);
+	plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+	TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+	TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+	TRACE_WARN_ON(waiter->task);
+	memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+	void *addr = lock;
+
+	if (rt_trace_on) {
+		rt_mutex_debug_check_no_locks_freed(addr,
+						    sizeof(struct rt_mutex));
+		INIT_LIST_HEAD(&lock->held_list_entry);
+		lock->name = name;
+	}
+}
+
+void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 0000000..7612fbc
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+#define __IP_DECL__		, unsigned long ip
+#define __IP__			, ip
+#define __RET_IP__		, (unsigned long)__builtin_return_address(0)
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+				      struct task_struct *powner __IP_DECL__);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+				    struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w)			\
+	do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+						 int detect)
+{
+	return (waiter != NULL);
+}
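
The __IP_DECL__ / __IP__ / __RET_IP__ trio threads the caller's return address into the debug functions without disturbing the non-debug API (the non-debug rtmutex.h variant defines them empty). A self-contained sketch of the pattern, with hypothetical lock names:

/* Sketch (illustration only) of the caller-IP threading pattern above.
 * my_lock()/my_lock_slowpath() are made up; in a non-debug build the
 * three macros would expand to nothing. */
#define __IP_DECL__	, unsigned long ip
#define __IP__		, ip
#define __RET_IP__	, (unsigned long)__builtin_return_address(0)

struct my_lock {
	unsigned long acquire_ip;	/* where the lock was last taken */
};

void my_lock_slowpath(struct my_lock *lock __IP_DECL__);

#define my_lock(l)	my_lock_slowpath(l __RET_IP__)

void my_lock_slowpath(struct my_lock *lock __IP_DECL__)
{
	/* ... acquire ...; record the call site for later diagnostics: */
	lock->acquire_ip = ip;
}
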
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 0000000..e82c2f8
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,440 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/sysdev.h>
+#include <linux/timer.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS	8
+#define MAX_RT_TEST_MUTEXES	8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+	int			opcode;
+	int			opdata;
+	int			mutexes[MAX_RT_TEST_MUTEXES];
+	int			bkl;
+	int			event;
+	struct sys_device	sysdev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static task_t *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+	RTTEST_NOP = 0,
+	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */
+	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */
+	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */
+	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */
+	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
+	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
+	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
+	RTTEST_LOCKBKL,		/* 9 Lock BKL */
+	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
+	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
+	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
+	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+	int i, id, ret = -EINVAL;
+
+	switch(td->opcode) {
+
+	case RTTEST_NOP:
+		return 0;
+
+	case RTTEST_LOCKCONT:
+		td->mutexes[td->opdata] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		return 0;
+
+	case RTTEST_RESET:
+		for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+			if (td->mutexes[i] == 4) {
+				rt_mutex_unlock(&mutexes[i]);
+				td->mutexes[i] = 0;
+			}
+		}
+
+		if (!lockwakeup && td->bkl == 4) {
+			unlock_kernel();
+			td->bkl = 0;
+		}
+		return 0;
+
+	case RTTEST_RESETEVENT:
+		atomic_set(&rttest_event, 0);
+		return 0;
+
+	default:
+		if (lockwakeup)
+			return ret;
+	}
+
+	switch(td->opcode) {
+
+	case RTTEST_LOCK:
+	case RTTEST_LOCKNOWAIT:
+		id = td->opdata;
+		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+			return ret;
+
+		td->mutexes[id] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		rt_mutex_lock(&mutexes[id]);
+		td->event = atomic_add_return(1, &rttest_event);
+		td->mutexes[id] = 4;
+		return 0;
+
+	case RTTEST_LOCKINT:
+	case RTTEST_LOCKINTNOWAIT:
+		id = td->opdata;
+		if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+			return ret;
+
+		td->mutexes[id] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+		td->event = atomic_add_return(1, &rttest_event);
+		td->mutexes[id] = ret ? 0 : 4;
+		return ret ? -EINTR : 0;
+
+	case RTTEST_UNLOCK:
+		id = td->opdata;
+		if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+			return ret;
+
+		td->event = atomic_add_return(1, &rttest_event);
+		rt_mutex_unlock(&mutexes[id]);
+		td->event = atomic_add_return(1, &rttest_event);
+		td->mutexes[id] = 0;
+		return 0;
+
+	case RTTEST_LOCKBKL:
+		if (td->bkl)
+			return 0;
+		td->bkl = 1;
+		lock_kernel();
+		td->bkl = 4;
+		return 0;
+
+	case RTTEST_UNLOCKBKL:
+		if (td->bkl != 4)
+			break;
+		unlock_kernel();
+		td->bkl = 0;
+		return 0;
+
+	default:
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have fine-grained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+	int tid, op, dat;
+	struct test_thread_data *td;
+
+	/* We have to lookup the task */
+	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+		if (threads[tid] == current)
+			break;
+	}
+
+	BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+	td = &thread_data[tid];
+
+	op = td->opcode;
+	dat = td->opdata;
+
+	switch (op) {
+	case RTTEST_LOCK:
+	case RTTEST_LOCKINT:
+	case RTTEST_LOCKNOWAIT:
+	case RTTEST_LOCKINTNOWAIT:
+		if (mutex != &mutexes[dat])
+			break;
+
+		if (td->mutexes[dat] != 1)
+			break;
+
+		td->mutexes[dat] = 2;
+		td->event = atomic_add_return(1, &rttest_event);
+		break;
+
+	case RTTEST_LOCKBKL:
+	default:
+		break;
+	}
+
+	schedule();
+
+
+	switch (op) {
+	case RTTEST_LOCK:
+	case RTTEST_LOCKINT:
+		if (mutex != &mutexes[dat])
+			return;
+
+		if (td->mutexes[dat] != 2)
+			return;
+
+		td->mutexes[dat] = 3;
+		td->event = atomic_add_return(1, &rttest_event);
+		break;
+
+	case RTTEST_LOCKNOWAIT:
+	case RTTEST_LOCKINTNOWAIT:
+		if (mutex != &mutexes[dat])
+			return;
+
+		if (td->mutexes[dat] != 2)
+			return;
+
+		td->mutexes[dat] = 1;
+		td->event = atomic_add_return(1, &rttest_event);
+		return;
+
+	case RTTEST_LOCKBKL:
+		return;
+	default:
+		return;
+	}
+
+	td->opcode = 0;
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (td->opcode > 0) {
+			int ret;
+
+			set_current_state(TASK_RUNNING);
+			ret = handle_op(td, 1);
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (td->opcode == RTTEST_LOCKCONT)
+				break;
+			td->opcode = ret;
+		}
+
+		/* Wait for the next command to be executed */
+		schedule();
+	}
+
+	/* Restore previous command and data */
+	td->opcode = op;
+	td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+	struct test_thread_data *td = data;
+	int ret;
+
+	current->flags |= PF_MUTEX_TESTER;
+	allow_signal(SIGHUP);
+
+	for(;;) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (td->opcode > 0) {
+			set_current_state(TASK_RUNNING);
+			ret = handle_op(td, 0);
+			set_current_state(TASK_INTERRUPTIBLE);
+			td->opcode = ret;
+		}
+
+		/* Wait for the next command to be executed */
+		schedule();
+
+		if (signal_pending(current))
+			flush_signals(current);
+
+		if(kthread_should_stop())
+			break;
+	}
+	return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev:	thread reference
+ * @buf:	command for actual step
+ * @count:	length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+				  size_t count)
+{
+	struct sched_param schedpar;
+	struct test_thread_data *td;
+	char cmdbuf[32];
+	int op, dat, tid, ret;
+
+	td = container_of(dev, struct test_thread_data, sysdev);
+	tid = td->sysdev.id;
+
+	/* strings from sysfs write are not 0 terminated! */
+	if (count >= sizeof(cmdbuf))
+		return -EINVAL;
+
+	/* strip off the trailing \n: */
+	if (buf[count-1] == '\n')
+		count--;
+	if (count < 1)
+		return -EINVAL;
+
+	memcpy(cmdbuf, buf, count);
+	cmdbuf[count] = 0;
+
+	if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+		return -EINVAL;
+
+	switch (op) {
+	case RTTEST_SCHEDOT:
+		schedpar.sched_priority = 0;
+		ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
+		if (ret)
+			return ret;
+		set_user_nice(current, 0);
+		break;
+
+	case RTTEST_SCHEDRT:
+		schedpar.sched_priority = dat;
+		ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
+		if (ret)
+			return ret;
+		break;
+
+	case RTTEST_SIGNAL:
+		send_sig(SIGHUP, threads[tid], 0);
+		break;
+
+	default:
+		if (td->opcode > 0)
+			return -EBUSY;
+		td->opdata = dat;
+		td->opcode = op;
+		wake_up_process(threads[tid]);
+	}
+
+	return count;
+}
+
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev:	thread to query
+ * @buf:	char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+{
+	struct test_thread_data *td;
+	char *curr = buf;
+	task_t *tsk;
+	int i;
+
+	td = container_of(dev, struct test_thread_data, sysdev);
+	tsk = threads[td->sysdev.id];
+
+	spin_lock(&rttest_lock);
+
+	curr += sprintf(curr,
+		"O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+		td->opcode, td->event, tsk->state,
+			(MAX_RT_PRIO - 1) - tsk->prio,
+			(MAX_RT_PRIO - 1) - tsk->normal_prio,
+		tsk->pi_blocked_on, td->bkl);
+
+	for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+		curr += sprintf(curr, "%d", td->mutexes[i]);
+
+	spin_unlock(&rttest_lock);
+
+	curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+			mutexes[td->sysdev.id].owner);
+
+	return curr - buf;
+}
+
+static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
+static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+
+static struct sysdev_class rttest_sysclass = {
+	set_kset_name("rttest"),
+};
+
+static int init_test_thread(int id)
+{
+	thread_data[id].sysdev.cls = &rttest_sysclass;
+	thread_data[id].sysdev.id = id;
+
+	threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+	if (IS_ERR(threads[id]))
+		return PTR_ERR(threads[id]);
+
+	return sysdev_register(&thread_data[id].sysdev);
+}
+
+static int init_rttest(void)
+{
+	int ret, i;
+
+	spin_lock_init(&rttest_lock);
+
+	for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+		rt_mutex_init(&mutexes[i]);
+
+	ret = sysdev_class_register(&rttest_sysclass);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+		ret = init_test_thread(i);
+		if (ret)
+			break;
+		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+		if (ret)
+			break;
+		ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+		if (ret)
+			break;
+	}
+
+	printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+	return ret;
+}
+
+device_initcall(init_rttest);
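
The tester is driven purely through its per-thread sysfs attributes using the "opcode:data" syntax parsed by sysfs_test_command(), and progress is read back from the status attribute. A sketch of a userspace driver follows; the path assumes the usual sysdev layout (/sys/devices/system/rttest/rttest<N>/), which may differ on other trees, and error handling is minimal.

/* Sketch (illustration only): drive tester thread 0 through sysfs.
 * Opcode 3 is RTTEST_LOCK, data 0 is the mutex index. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char status[256];
	int fd, n;

	fd = open("/sys/devices/system/rttest/rttest0/command", O_WRONLY);
	if (fd < 0)
		return 1;
	write(fd, "3:0", 3);			/* "opcode:data" */
	close(fd);

	fd = open("/sys/devices/system/rttest/rttest0/status", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, status, sizeof(status) - 1);
	if (n > 0) {
		status[n] = '\0';
		fputs(status, stdout);		/* O:, E:, S:, P:, N:, ... */
	}
	close(fd);
	return 0;
}
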
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 0000000..45d6101
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ *  Copyright (C) 2006 Esben Nielsen
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner	bit1	bit0
+ * NULL		0	0	lock is free (fast acquire possible)
+ * NULL		0	1	invalid state
+ * NULL		1	0	Transitional State*
+ * NULL		1	1	invalid state
+ * taskpointer	0	0	lock is held (fast release possible)
+ * taskpointer	0	1	task is pending owner
+ * taskpointer	1	0	lock is held and has waiters
+ * taskpointer	1	1	task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock, when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ *
+ * (*) There is a brief window in which the owner can be NULL and the
+ * "lock has waiters" bit is set.  This can happen when grabbing the lock.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to set this
+ * bit before looking at the lock, hence the reason this is a transitional
+ * state.
+ */
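
As a reading aid for the state table above: the owner task pointer and the two status bits share one word, so decoding is a matter of masking. The sketch below mirrors the table (bit 0 = pending owner, bit 1 = has waiters); the real mask definitions and helpers such as rt_mutex_owner() live in rtmutex_common.h, so the values here are assumptions made for illustration.

/* Sketch (illustration only): decoding the owner word per the table above.
 * Bit 0 = "owner is pending", bit 1 = "lock has waiters". */
#define EX_OWNER_PENDING	1UL	/* bit 0 (assumed) */
#define EX_HAS_WAITERS		2UL	/* bit 1 (assumed) */
#define EX_OWNER_MASKALL	3UL

static inline struct task_struct *ex_owner(struct rt_mutex *lock)
{
	return (struct task_struct *)
		((unsigned long)lock->owner & ~EX_OWNER_MASKALL);
}

static inline int ex_owner_is_pending(struct rt_mutex *lock)
{
	return ((unsigned long)lock->owner & EX_OWNER_PENDING) != 0;
}

static inline int ex_has_waiters(struct rt_mutex *lock)
{
	return ((unsigned long)lock->owner & EX_HAS_WAITERS) != 0;
}
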
+
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+		   unsigned long mask)
+{
+	unsigned long val = (unsigned long)owner | mask;
+
+	if (rt_mutex_has_waiters(lock))
+		val |= RT_MUTEX_HAS_WAITERS;
+
+	lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	if (!rt_mutex_has_waiters(lock))
+		clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	do {
+		owner = *p;
+	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n)	(0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+	if (likely(!task_has_pi_waiters(task)))
+		return task->normal_prio;
+
+	return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+		   task->normal_prio);
+}
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+	int prio = rt_mutex_getprio(task);
+
+	if (task->prio != prio)
+		rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&task->pi_lock, flags);
+	__rt_mutex_adjust_prio(task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ * Returns 0 or -EDEADLK.
+ */
+static int rt_mutex_adjust_prio_chain(task_t *task,
+				      int deadlock_detect,
+				      struct rt_mutex *orig_lock,
+				      struct rt_mutex_waiter *orig_waiter,
+				      struct task_struct *top_task
+				      __IP_DECL__)
+{
+	struct rt_mutex *lock;
+	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+	int detect_deadlock, ret = 0, depth = 0;
+	unsigned long flags;
+
+	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
+							 deadlock_detect);
+
+	/*
+	 * The (de)boosting is a step by step approach with a lot of
+	 * pitfalls. We want this to be preemptible and we want to hold a
+	 * maximum of two locks per step. So we have to check
+	 * carefully whether things change under us.
+	 */
+ again:
+	if (++depth > max_lock_depth) {
+		static int prev_max;
+
+		/*
+		 * Print this only once. If the admin changes the limit,
+		 * print a new message when reaching the limit again.
+		 */
+		if (prev_max != max_lock_depth) {
+			prev_max = max_lock_depth;
+			printk(KERN_WARNING "Maximum lock depth %d reached "
+			       "task: %s (%d)\n", max_lock_depth,
+			       top_task->comm, top_task->pid);
+		}
+		put_task_struct(task);
+
+		return deadlock_detect ? -EDEADLK : 0;
+	}
+ retry:
+	/*
+	 * The task cannot go away, as we did a get_task() before!
+	 */
+	spin_lock_irqsave(&task->pi_lock, flags);
+
+	waiter = task->pi_blocked_on;
+	/*
+	 * Check whether the end of the boosting chain has been
+	 * reached or the state of the chain has changed while we
+	 * dropped the locks.
+	 */
+	if (!waiter || !waiter->task)
+		goto out_unlock_pi;
+
+	if (top_waiter && (!task_has_pi_waiters(task) ||
+			   top_waiter != task_top_pi_waiter(task)))
+		goto out_unlock_pi;
+
+	/*
+	 * When deadlock detection is off, we check whether further
+	 * priority adjustment is necessary.
+	 */
+	if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+		goto out_unlock_pi;
+
+	lock = waiter->lock;
+	if (!spin_trylock(&lock->wait_lock)) {
+		spin_unlock_irqrestore(&task->pi_lock, flags);
+		cpu_relax();
+		goto retry;
+	}
+
+	/* Deadlock detection */
+	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+		spin_unlock(&lock->wait_lock);
+		ret = deadlock_detect ? -EDEADLK : 0;
+		goto out_unlock_pi;
+	}
+
+	top_waiter = rt_mutex_top_waiter(lock);
+
+	/* Requeue the waiter */
+	plist_del(&waiter->list_entry, &lock->wait_list);
+	waiter->list_entry.prio = task->prio;
+	plist_add(&waiter->list_entry, &lock->wait_list);
+
+	/* Release the task */
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+	put_task_struct(task);
+
+	/* Grab the next task */
+	task = rt_mutex_owner(lock);
+	spin_lock_irqsave(&task->pi_lock, flags);
+
+	if (waiter == rt_mutex_top_waiter(lock)) {
+		/* Boost the owner */
+		plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
+		waiter->pi_list_entry.prio = waiter->list_entry.prio;
+		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+		__rt_mutex_adjust_prio(task);
+
+	} else if (top_waiter == waiter) {
+		/* Deboost the owner */
+		plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+		waiter = rt_mutex_top_waiter(lock);
+		waiter->pi_list_entry.prio = waiter->list_entry.prio;
+		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+		__rt_mutex_adjust_prio(task);
+	}
+
+	get_task_struct(task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	top_waiter = rt_mutex_top_waiter(lock);
+	spin_unlock(&lock->wait_lock);
+
+	if (!detect_deadlock && waiter != top_waiter)
+		goto out_put_task;
+
+	goto again;
+
+ out_unlock_pi:
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+	put_task_struct(task);
+	return ret;
+}
+
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+	struct task_struct *pendowner = rt_mutex_owner(lock);
+	struct rt_mutex_waiter *next;
+	unsigned long flags;
+
+	if (!rt_mutex_owner_pending(lock))
+		return 0;
+
+	if (pendowner == current)
+		return 1;
+
+	spin_lock_irqsave(&pendowner->pi_lock, flags);
+	if (current->prio >= pendowner->prio) {
+		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+		return 0;
+	}
+
+	/*
+	 * Check if a waiter is enqueued on the pending owner's
+	 * pi_waiters list. Remove it and readjust the pending owner's
+	 * priority.
+	 */
+	if (likely(!rt_mutex_has_waiters(lock))) {
+		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+		return 1;
+	}
+
+	/* No chain handling, pending owner is not blocked on anything: */
+	next = rt_mutex_top_waiter(lock);
+	plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+	__rt_mutex_adjust_prio(pendowner);
+	spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+	/*
+	 * We are going to steal the lock and a waiter was
+	 * enqueued on the pending owner's pi_waiters queue. So
+	 * we have to enqueue this waiter into the
+	 * current->pi_waiters list. This covers the case
+	 * where current is boosted because it holds another
+	 * lock and gets unboosted because the booster is
+	 * interrupted, so we would delay a waiter with higher
+	 * priority than current->normal_prio.
+	 *
+	 * Note: in the rare case of a SCHED_OTHER task changing
+	 * its priority and thus stealing the lock, next->task
+	 * might be current:
+	 */
+	if (likely(next->task != current)) {
+		spin_lock_irqsave(&current->pi_lock, flags);
+		plist_add(&next->pi_list_entry, &current->pi_waiters);
+		__rt_mutex_adjust_prio(current);
+		spin_unlock_irqrestore(&current->pi_lock, flags);
+	}
+	return 1;
+}
+
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
+{
+	/*
+	 * We have to be careful here if the atomic speedups are
+	 * enabled, such that, when
+	 *  - no other waiter is on the lock
+	 *  - the lock has been released since we did the cmpxchg
+	 * the lock can be released or taken while we are doing the
+	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+	 *
+	 * The atomic acquire/release aware variant of
+	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+	 * the WAITERS bit, the atomic release / acquire can not
+	 * happen anymore and lock->wait_lock protects us from the
+	 * non-atomic case.
+	 *
+	 * Note that this might set lock->owner =
+	 * RT_MUTEX_HAS_WAITERS in case the lock is not contended
+	 * any more. This is fixed up when we take ownership.
+	 * This is the transitional state explained at the top of this file.
+	 */
+	mark_rt_mutex_waiters(lock);
+
+	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+		return 0;
+
+	/* We got the lock. */
+	debug_rt_mutex_lock(lock __IP__);
+
+	rt_mutex_set_owner(lock, current, 0);
+
+	rt_mutex_deadlock_account_lock(lock, current);
+
+	return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+				   struct rt_mutex_waiter *waiter,
+				   int detect_deadlock
+				   __IP_DECL__)
+{
+	struct rt_mutex_waiter *top_waiter = waiter;
+	task_t *owner = rt_mutex_owner(lock);
+	int boost = 0, res;
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+	__rt_mutex_adjust_prio(current);
+	waiter->task = current;
+	waiter->lock = lock;
+	plist_node_init(&waiter->list_entry, current->prio);
+	plist_node_init(&waiter->pi_list_entry, current->prio);
+
+	/* Get the top priority waiter on the lock */
+	if (rt_mutex_has_waiters(lock))
+		top_waiter = rt_mutex_top_waiter(lock);
+	plist_add(&waiter->list_entry, &lock->wait_list);
+
+	current->pi_blocked_on = waiter;
+
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+
+	if (waiter == rt_mutex_top_waiter(lock)) {
+		spin_lock_irqsave(&owner->pi_lock, flags);
+		plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+
+		__rt_mutex_adjust_prio(owner);
+		if (owner->pi_blocked_on) {
+			boost = 1;
+			/* gets dropped in rt_mutex_adjust_prio_chain()! */
+			get_task_struct(owner);
+		}
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+	}
+	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+		spin_lock_irqsave(&owner->pi_lock, flags);
+		if (owner->pi_blocked_on) {
+			boost = 1;
+			/* gets dropped in rt_mutex_adjust_prio_chain()! */
+			get_task_struct(owner);
+		}
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+	}
+	if (!boost)
+		return 0;
+
+	spin_unlock(&lock->wait_lock);
+
+	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+					 current __IP__);
+
+	spin_lock(&lock->wait_lock);
+
+	return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current task's waiter list and from
+ * the lock waiter list. Set it as pending owner. Then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+	struct rt_mutex_waiter *waiter;
+	struct task_struct *pendowner;
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+
+	waiter = rt_mutex_top_waiter(lock);
+	plist_del(&waiter->list_entry, &lock->wait_list);
+
+	/*
+	 * Remove it from current->pi_waiters. We do not adjust a
+	 * possible priority boost right now. We execute wakeup in the
+	 * boosted mode and go back to normal after releasing
+	 * lock->wait_lock.
+	 */
+	plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+	pendowner = waiter->task;
+	waiter->task = NULL;
+
+	rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+
+	/*
+	 * Clear the pi_blocked_on variable and enqueue a possible
+	 * waiter into the pi_waiters list of the pending owner. This
+	 * ensures that, should the pending owner get unboosted, no
+	 * waiter with higher priority than pending-owner->normal_prio
+	 * stays blocked on the unboosted (pending) owner.
+	 */
+	spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+	WARN_ON(!pendowner->pi_blocked_on);
+	WARN_ON(pendowner->pi_blocked_on != waiter);
+	WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+	pendowner->pi_blocked_on = NULL;
+
+	if (rt_mutex_has_waiters(lock)) {
+		struct rt_mutex_waiter *next;
+
+		next = rt_mutex_top_waiter(lock);
+		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+	}
+	spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+	wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held
+ */
+static void remove_waiter(struct rt_mutex *lock,
+			  struct rt_mutex_waiter *waiter  __IP_DECL__)
+{
+	int first = (waiter == rt_mutex_top_waiter(lock));
+	int boost = 0;
+	task_t *owner = rt_mutex_owner(lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&current->pi_lock, flags);
+	plist_del(&waiter->list_entry, &lock->wait_list);
+	waiter->task = NULL;
+	current->pi_blocked_on = NULL;
+	spin_unlock_irqrestore(&current->pi_lock, flags);
+
+	if (first && owner != current) {
+
+		spin_lock_irqsave(&owner->pi_lock, flags);
+
+		plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+
+		if (rt_mutex_has_waiters(lock)) {
+			struct rt_mutex_waiter *next;
+
+			next = rt_mutex_top_waiter(lock);
+			plist_add(&next->pi_list_entry, &owner->pi_waiters);
+		}
+		__rt_mutex_adjust_prio(owner);
+
+		if (owner->pi_blocked_on) {
+			boost = 1;
+			/* gets dropped in rt_mutex_adjust_prio_chain()! */
+			get_task_struct(owner);
+		}
+		spin_unlock_irqrestore(&owner->pi_lock, flags);
+	}
+
+	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+	if (!boost)
+		return;
+
+	spin_unlock(&lock->wait_lock);
+
+	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
+
+	spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+	struct rt_mutex_waiter *waiter;
+	unsigned long flags;
+
+	spin_lock_irqsave(&task->pi_lock, flags);
+
+	waiter = task->pi_blocked_on;
+	if (!waiter || waiter->list_entry.prio == task->prio) {
+		spin_unlock_irqrestore(&task->pi_lock, flags);
+		return;
+	}
+
+	/* gets dropped in rt_mutex_adjust_prio_chain()! */
+	get_task_struct(task);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
+
+	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		  struct hrtimer_sleeper *timeout,
+		  int detect_deadlock __IP_DECL__)
+{
+	struct rt_mutex_waiter waiter;
+	int ret = 0;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+
+	spin_lock(&lock->wait_lock);
+
+	/* Try to acquire the lock again: */
+	if (try_to_take_rt_mutex(lock __IP__)) {
+		spin_unlock(&lock->wait_lock);
+		return 0;
+	}
+
+	set_current_state(state);
+
+	/* Setup the timer, when timeout != NULL */
+	if (unlikely(timeout))
+		hrtimer_start(&timeout->timer, timeout->timer.expires,
+			      HRTIMER_ABS);
+
+	for (;;) {
+		/* Try to acquire the lock: */
+		if (try_to_take_rt_mutex(lock __IP__))
+			break;
+
+		/*
+		 * TASK_INTERRUPTIBLE checks for signals and
+		 * timeout. Ignored otherwise.
+		 */
+		if (unlikely(state == TASK_INTERRUPTIBLE)) {
+			/* Signal pending? */
+			if (signal_pending(current))
+				ret = -EINTR;
+			if (timeout && !timeout->task)
+				ret = -ETIMEDOUT;
+			if (ret)
+				break;
+		}
+
+		/*
+		 * waiter.task is NULL the first time we come here and
+		 * when we have been woken up by the previous owner
+		 * but the lock got stolen by a higher prio task.
+		 */
+		if (!waiter.task) {
+			ret = task_blocks_on_rt_mutex(lock, &waiter,
+						      detect_deadlock __IP__);
+			/*
+			 * If we got woken up by the owner then start loop
+			 * all over without going into schedule to try
+			 * to get the lock now:
+			 */
+			if (unlikely(!waiter.task))
+				continue;
+
+			if (unlikely(ret))
+				break;
+		}
+
+		spin_unlock(&lock->wait_lock);
+
+		debug_rt_mutex_print_deadlock(&waiter);
+
+		if (waiter.task)
+			schedule_rt_mutex(lock);
+
+		spin_lock(&lock->wait_lock);
+		set_current_state(state);
+	}
+
+	set_current_state(TASK_RUNNING);
+
+	if (unlikely(waiter.task))
+		remove_waiter(lock, &waiter __IP__);
+
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit
+	 * unconditionally. We might have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/* Remove pending timer: */
+	if (unlikely(timeout))
+		hrtimer_cancel(&timeout->timer);
+
+	/*
+	 * Readjust priority, when we did not get the lock. We might
+	 * have been the pending owner and boosted. Since we did not
+	 * take the lock, the PI boost has to go.
+	 */
+	if (unlikely(ret))
+		rt_mutex_adjust_prio(current);
+
+	debug_rt_mutex_free_waiter(&waiter);
+
+	return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
+{
+	int ret = 0;
+
+	spin_lock(&lock->wait_lock);
+
+	if (likely(rt_mutex_owner(lock) != current)) {
+
+		ret = try_to_take_rt_mutex(lock __IP__);
+		/*
+		 * try_to_take_rt_mutex() sets the lock waiters
+		 * bit unconditionally. Clean this up.
+		 */
+		fixup_rt_mutex_waiters(lock);
+	}
+
+	spin_unlock(&lock->wait_lock);
+
+	return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+	spin_lock(&lock->wait_lock);
+
+	debug_rt_mutex_unlock(lock);
+
+	rt_mutex_deadlock_account_unlock(current);
+
+	if (!rt_mutex_has_waiters(lock)) {
+		lock->owner = NULL;
+		spin_unlock(&lock->wait_lock);
+		return;
+	}
+
+	wakeup_next_waiter(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/* Undo pi boosting if necessary: */
+	rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+		  int detect_deadlock,
+		  int (*slowfn)(struct rt_mutex *lock, int state,
+				struct hrtimer_sleeper *timeout,
+				int detect_deadlock __IP_DECL__))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+			struct hrtimer_sleeper *timeout, int detect_deadlock,
+			int (*slowfn)(struct rt_mutex *lock, int state,
+				      struct hrtimer_sleeper *timeout,
+				      int detect_deadlock __IP_DECL__))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+		     int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
+{
+	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 1;
+	}
+	return slowfn(lock __RET_IP__);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+		    void (*slowfn)(struct rt_mutex *lock))
+{
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+		rt_mutex_deadlock_account_unlock(current);
+	else
+		slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+	might_sleep();
+
+	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+						 int detect_deadlock)
+{
+	might_sleep();
+
+	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+				 detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ *				       the timeout structure is provided
+ *				       by the caller
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @timeout:		timeout structure or NULL (no timeout)
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -ETIMEDOUT	when the timeout expired
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+		    int detect_deadlock)
+{
+	might_sleep();
+
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+				       detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock:	the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+	rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/***
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+	WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing a locked rt lock is not allowed.
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+	lock->owner = NULL;
+	spin_lock_init(&lock->wait_lock);
+	plist_head_init(&lock->wait_list, &lock->wait_lock);
+
+	debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ *				proxy owner
+ *
+ * @lock: 	the rt_mutex to be locked
+ * @proxy_owner:	the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				struct task_struct *proxy_owner)
+{
+	__rt_mutex_init(lock, NULL);
+	debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
+	rt_mutex_set_owner(lock, proxy_owner, 0);
+	rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: 	the rt_mutex to be unlocked
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+			   struct task_struct *proxy_owner)
+{
+	debug_rt_mutex_proxy_unlock(lock);
+	rt_mutex_set_owner(lock, NULL, 0);
+	rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock to query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+	if (!rt_mutex_has_waiters(lock))
+		return NULL;
+
+	return rt_mutex_top_waiter(lock)->task;
+}
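The functions above make up the public rt-mutex API of this file: __rt_mutex_init() sets a lock up, rt_mutex_lock() and rt_mutex_lock_interruptible() block until the lock is acquired (the latter also returning -EINTR on a signal), rt_mutex_trylock() returns 1 on success and 0 on contention, and rt_mutex_unlock() releases the lock and undoes any PI boost. A minimal usage sketch only, not part of the patch; the structure, field names and -EBUSY handling below are invented for illustration:

	#include <linux/rtmutex.h>

	/* Hypothetical per-device state protected by an rt_mutex. */
	struct my_dev {
		struct rt_mutex	lock;
		int		count;
	};

	static void my_dev_init(struct my_dev *dev)
	{
		__rt_mutex_init(&dev->lock, "my_dev_lock");
		dev->count = 0;
	}

	static void my_dev_update(struct my_dev *dev)
	{
		/* Sleeps until the lock is free; may PI-boost the current owner. */
		rt_mutex_lock(&dev->lock);
		dev->count++;
		rt_mutex_unlock(&dev->lock);
	}

	static int my_dev_try_update(struct my_dev *dev)
	{
		/* rt_mutex_trylock() returns 1 on success, 0 on contention. */
		if (!rt_mutex_trylock(&dev->lock))
			return -EBUSY;
		dev->count++;
		rt_mutex_unlock(&dev->lock);
		return 0;
	}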
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 0000000..1e0fca1
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define __IP_DECL__
+#define __IP__
+#define __RET_IP__
+#define rt_mutex_deadlock_check(l)			(0)
+#define rt_mutex_deadlock_account_lock(m, t)		do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l)		do { } while (0)
+#define debug_rt_mutex_init_waiter(w)			do { } while (0)
+#define debug_rt_mutex_free_waiter(w)			do { } while (0)
+#define debug_rt_mutex_lock(l)				do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p)			do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l)			do { } while (0)
+#define debug_rt_mutex_unlock(l)			do { } while (0)
+#define debug_rt_mutex_init(m, n)			do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
+#define debug_rt_mutex_print_deadlock(w)		do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d)		(d)
+#define debug_rt_mutex_reset_waiter(w)			do { } while (0)
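Since __IP_DECL__, __IP__ and __RET_IP__ expand to nothing in this non-debug header, the debug-aware prototypes and call sites in rtmutex.c lose their extra call-site argument once the preprocessor has run; the debug variant of this header (added elsewhere in the series) is presumably expected to expand them so the caller's instruction pointer gets passed through instead. For example, with the definitions above:

	/* As written in rtmutex.c (debug-aware form): */
	static inline int rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__);

	/* What the compiler actually sees in the non-debug build: */
	static inline int rt_mutex_slowtrylock(struct rt_mutex *lock);

and the call slowfn(lock __RET_IP__) in rt_mutex_fasttrylock() reduces to a plain slowfn(lock).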
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 0000000..9c75856
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * The in-kernel rtmutex tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock)				\
+  do {								\
+	if (!(current->flags & PF_MUTEX_TESTER))		\
+		schedule();					\
+	else							\
+		schedule_rt_mutex_test(_lock);			\
+  } while (0)
+
+#else
+# define schedule_rt_mutex(_lock)			schedule()
+#endif
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack of the blocked task.
+ *
+ * @list_entry:		pi node to enqueue into the mutex waiters list
+ * @pi_list_entry:	pi node to enqueue into the mutex owner waiters list
+ * @task:		task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+	struct plist_node	list_entry;
+	struct plist_node	pi_list_entry;
+	struct task_struct	*task;
+	struct rt_mutex		*lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	unsigned long		ip;
+	pid_t			deadlock_task_pid;
+	struct rt_mutex		*deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+	return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+	struct rt_mutex_waiter *w;
+
+	w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+			       list_entry);
+	BUG_ON(w->lock != lock);
+
+	return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+	return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+	return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+				  pi_list_entry);
+}
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_OWNER_PENDING	1UL
+#define RT_MUTEX_HAS_WAITERS	2UL
+#define RT_MUTEX_OWNER_MASKALL	3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
+{
+ 	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
+{
+	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
+}
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				       struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+				  struct task_struct *proxy_owner);
+#endif
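The owner helpers above rely on task_struct pointers being word aligned, which leaves bit 0 (RT_MUTEX_OWNER_PENDING) and bit 1 (RT_MUTEX_HAS_WAITERS) of lock->owner free to carry state. A small illustration, not part of the patch; real code sets the field via rt_mutex_set_owner() rather than writing it directly:

	static void owner_bits_example(struct rt_mutex *lock, struct task_struct *t)
	{
		/* Owner t, with waiters queued and the ownership still pending: */
		lock->owner = (struct task_struct *)
			((unsigned long)t | RT_MUTEX_HAS_WAITERS | RT_MUTEX_OWNER_PENDING);

		/* rt_mutex_owner() masks off both flag bits, yielding t: */
		BUG_ON(rt_mutex_owner(lock) != t);

		/*
		 * rt_mutex_real_owner() masks off only RT_MUTEX_HAS_WAITERS,
		 * so while the pending bit is set its result does not compare
		 * equal to t:
		 */
		BUG_ON(rt_mutex_real_owner(lock) == t);

		/* Non-zero exactly while the pending bit is set: */
		BUG_ON(!rt_mutex_owner_pending(lock));
	}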
diff --git a/kernel/sched.c b/kernel/sched.c
index a856040..2629c17 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
  */
 
 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 
-static unsigned int task_timeslice(task_t *p)
+static unsigned int static_prio_timeslice(int static_prio)
 {
-	if (p->static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+	if (static_prio < NICE_TO_PRIO(0))
+		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 	else
-		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+	return static_prio_timeslice(p->static_prio);
+}
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
@@ -184,13 +190,11 @@
  * These are the runqueue data structures:
  */
 
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
 typedef struct runqueue runqueue_t;
 
 struct prio_array {
 	unsigned int nr_active;
-	unsigned long bitmap[BITMAP_SIZE];
+	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
 	struct list_head queue[MAX_PRIO];
 };
 
@@ -209,6 +213,7 @@
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned long nr_running;
+	unsigned long raw_weighted_load;
 #ifdef CONFIG_SMP
 	unsigned long cpu_load[3];
 #endif
@@ -239,7 +244,6 @@
 
 	task_t *migration_thread;
 	struct list_head migration_queue;
-	int cpu;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -351,11 +355,30 @@
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called with interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+	__acquires(rq->lock)
+{
+	struct runqueue *rq;
+
+repeat_lock_task:
+	rq = task_rq(p);
+	spin_lock(&rq->lock);
+	if (unlikely(rq != task_rq(p))) {
+		spin_unlock(&rq->lock);
+		goto repeat_lock_task;
+	}
+	return rq;
+}
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
 	__acquires(rq->lock)
 {
 	struct runqueue *rq;
@@ -371,6 +394,12 @@
 	return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+	__releases(rq->lock)
+{
+	spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 	__releases(rq->lock)
 {
@@ -634,7 +663,7 @@
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -665,6 +692,106 @@
 }
 
 /*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+	LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+
+static void set_load_weight(task_t *p)
+{
+	if (has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing.
+			 * Giving its load any weight will skew balancing
+			 * adversely.
+			 */
+			p->load_weight = 0;
+		else
+#endif
+			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+	} else
+		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+	rq->raw_weighted_load -= p->load_weight;
+}
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_raw_weighted_load(rq, p);
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+	int prio;
+
+	if (has_rt_policy(p))
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+	return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
  * __activate_task - move a task to the runqueue.
  */
 static void __activate_task(task_t *p, runqueue_t *rq)
@@ -674,7 +801,7 @@
 	if (batch_task(p))
 		target = rq->expired;
 	enqueue_task(p, target);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
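A short worked example of the weighting introduced in the hunk above, derived directly from the macros themselves (illustrative only): because static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE == TIME_SLICE_NICE_ZERO, a nice-0 SCHED_NORMAL task contributes exactly SCHED_LOAD_SCALE to rq->raw_weighted_load:

	/*
	 * PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0))
	 *	== LOAD_WEIGHT(static_prio_timeslice(NICE_TO_PRIO(0)))
	 *	== LOAD_WEIGHT(DEF_TIMESLICE)
	 *	== DEF_TIMESLICE * SCHED_LOAD_SCALE / TIME_SLICE_NICE_ZERO
	 *	== SCHED_LOAD_SCALE
	 *
	 * More negative nice values get larger timeslices and therefore
	 * proportionally larger weights, positive nice values smaller ones.
	 * RTPRIO_TO_LOAD_WEIGHT() adds LOAD_WEIGHT(rt_priority) on top of
	 * PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO), the heaviest nice weight, so a
	 * realtime task weighs at least as much as any nice level (with the
	 * migration thread as the special case set_load_weight() zeroes).
	 */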
@@ -683,39 +810,45 @@
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
-	unsigned long long __sleep_time = now - p->timestamp;
-	unsigned long sleep_time;
+	unsigned long sleep_time = now - p->timestamp;
 
 	if (batch_task(p))
 		sleep_time = 0;
-	else {
-		if (__sleep_time > NS_MAX_SLEEP_AVG)
-			sleep_time = NS_MAX_SLEEP_AVG;
-		else
-			sleep_time = (unsigned long)__sleep_time;
-	}
 
 	if (likely(sleep_time > 0)) {
 		/*
-		 * User tasks that sleep a long time are categorised as
-		 * idle. They will only have their sleep_avg increased to a
-		 * level that makes them just interactive priority to stay
-		 * active yet prevent them suddenly becoming cpu hogs and
-		 * starving other processes.
+		 * This ceiling is set to the lowest priority that would allow
+		 * a task to be reinserted into the active array on timeslice
+		 * completion.
 		 */
-		if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
-				unsigned long ceiling;
+		unsigned long ceiling = INTERACTIVE_SLEEP(p);
 
-				ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-					DEF_TIMESLICE);
-				if (p->sleep_avg < ceiling)
-					p->sleep_avg = ceiling;
+		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+			/*
+			 * Prevents user tasks from achieving best priority
+			 * with a single large enough sleep.
+			 */
+			p->sleep_avg = ceiling;
+			/*
+			 * Using INTERACTIVE_SLEEP() as a ceiling places a
+			 * nice(0) task 1ms sleep away from promotion, and
+			 * gives it 700ms to round-robin with no chance of
+			 * being demoted.  This is more than generous, so
+			 * mark this sleep as non-interactive to prevent the
+			 * on-runqueue bonus logic from intervening should
+			 * this task not receive cpu immediately.
+			 */
+			p->sleep_type = SLEEP_NONINTERACTIVE;
 		} else {
 			/*
 			 * Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@
 			 * are likely to be waiting on I/O
 			 */
 			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
-				if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
+				if (p->sleep_avg >= ceiling)
 					sleep_time = 0;
 				else if (p->sleep_avg + sleep_time >=
-						INTERACTIVE_SLEEP(p)) {
-					p->sleep_avg = INTERACTIVE_SLEEP(p);
-					sleep_time = 0;
+					 ceiling) {
+						p->sleep_avg = ceiling;
+						sleep_time = 0;
 				}
 			}
 
@@ -742,9 +875,9 @@
 			 */
 			p->sleep_avg += sleep_time;
 
-			if (p->sleep_avg > NS_MAX_SLEEP_AVG)
-				p->sleep_avg = NS_MAX_SLEEP_AVG;
 		}
+		if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+			p->sleep_avg = NS_MAX_SLEEP_AVG;
 	}
 
 	return effective_prio(p);
@@ -805,7 +938,7 @@
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -860,6 +993,12 @@
 	return cpu_curr(task_cpu(p)) == p;
 }
 
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->raw_weighted_load;
+}
+
 #ifdef CONFIG_SMP
 typedef struct {
 	struct list_head list;
@@ -949,7 +1088,8 @@
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
@@ -957,24 +1097,36 @@
 static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-	if (type == 0)
-		return load_now;
 
-	return min(rq->cpu_load[type-1], load_now);
+	if (type == 0)
+		return rq->raw_weighted_load;
+
+	return min(rq->cpu_load[type-1], rq->raw_weighted_load);
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
  */
 static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-	if (type == 0)
-		return load_now;
 
-	return max(rq->cpu_load[type-1], load_now);
+	if (type == 0)
+		return rq->raw_weighted_load;
+
+	return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long n = rq->nr_running;
+
+	return n ?  rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
 }
 
 /*
@@ -1047,7 +1199,7 @@
 	cpus_and(tmp, group->cpumask, p->cpus_allowed);
 
 	for_each_cpu_mask(i, tmp) {
-		load = source_load(i, 0);
+		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
 			min_load = load;
@@ -1074,9 +1226,15 @@
 	struct task_struct *t = current;
 	struct sched_domain *tmp, *sd = NULL;
 
-	for_each_domain(cpu, tmp)
+	for_each_domain(cpu, tmp) {
+ 		/*
+ 	 	 * If power savings logic is enabled for a domain, stop there.
+ 	 	 */
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+			break;
 		if (tmp->flags & flag)
 			sd = tmp;
+	}
 
 	while (sd) {
 		cpumask_t span;
@@ -1226,17 +1384,19 @@
 
 		if (this_sd->flags & SD_WAKE_AFFINE) {
 			unsigned long tl = this_load;
+			unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
 			/*
 			 * If sync wakeup then subtract the (maximum possible)
 			 * effect of the currently running task from the load
 			 * of the current CPU:
 			 */
 			if (sync)
-				tl -= SCHED_LOAD_SCALE;
+				tl -= current->load_weight;
 
 			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
-				100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+				100*(tl + p->load_weight) <= imbalance*load) {
 				/*
 				 * This domain has SD_WAKE_AFFINE and
 				 * p is cache cold in this domain, and
@@ -1353,6 +1513,12 @@
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child:
+	 */
+	p->prio = current->normal_prio;
+
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1432,10 +1598,11 @@
 				__activate_task(p, rq);
 			else {
 				p->prio = current->prio;
+				p->normal_prio = current->normal_prio;
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				rq->nr_running++;
+				inc_nr_running(p, rq);
 			}
 			set_need_resched();
 		} else
@@ -1653,7 +1820,8 @@
 
 unsigned long long nr_context_switches(void)
 {
-	unsigned long long i, sum = 0;
+	int i;
+	unsigned long long sum = 0;
 
 	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_switches;
@@ -1691,9 +1859,6 @@
 /*
  * double_rq_lock - safely lock two runqueues
  *
- * We must take them in cpu order to match code in
- * dependent_sleeper and wake_dependent_sleeper.
- *
  * Note this does not disable interrupts like task_rq_lock,
  * you need to do so manually before calling.
  */
@@ -1705,7 +1870,7 @@
 		spin_lock(&rq1->lock);
 		__acquire(rq2->lock);	/* Fake it out ;) */
 	} else {
-		if (rq1->cpu < rq2->cpu) {
+		if (rq1 < rq2) {
 			spin_lock(&rq1->lock);
 			spin_lock(&rq2->lock);
 		} else {
@@ -1741,7 +1906,7 @@
 	__acquires(this_rq->lock)
 {
 	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest->cpu < this_rq->cpu) {
+		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
 			spin_lock(&this_rq->lock);
@@ -1804,9 +1969,9 @@
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1853,26 +2018,42 @@
 	return 1;
 }
 
+#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
  *
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-		      unsigned long max_nr_move, struct sched_domain *sd,
-		      enum idle_type idle, int *all_pinned)
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum idle_type idle,
+		      int *all_pinned)
 {
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
-	int idx, pulled = 0, pinned = 0;
+	int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
+	int busiest_best_prio_seen;
+	int skip_for_load; /* skip the task based on weighted load issues */
+	long rem_load_move;
 	task_t *tmp;
 
-	if (max_nr_move == 0)
+	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
+	rem_load_move = max_load_move;
 	pinned = 1;
+	this_best_prio = rq_best_prio(this_rq);
+	busiest_best_prio = rq_best_prio(busiest);
+	/*
+	 * Enable handling of the case where there is more than one task
+	 * with the best priority. If the currently running task is one
+	 * of those with prio==busiest_best_prio we know it won't be moved
+	 * and therefore it's safe to override the skip (based on load) of
+	 * any task we find with that prio.
+	 */
+	busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
 
 	/*
 	 * We first consider expired tasks. Those will likely not be
@@ -1912,7 +2093,17 @@
 
 	curr = curr->prev;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+	/*
+	 * To help distribute high priority tasks across CPUs we don't
+	 * skip a task if it will be the highest priority task (i.e. smallest
+	 * prio value) on its new queue regardless of its load weight
+	 */
+	skip_for_load = tmp->load_weight > rem_load_move;
+	if (skip_for_load && idx < this_best_prio)
+		skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
+	if (skip_for_load ||
+	    !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+		busiest_best_prio_seen |= idx == busiest_best_prio;
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1926,9 +2117,15 @@
 
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;
+	rem_load_move -= tmp->load_weight;
 
-	/* We only want to steal up to the prescribed number of tasks. */
-	if (pulled < max_nr_move) {
+	/*
+	 * We only want to steal up to the prescribed number of tasks
+	 * and the prescribed amount of weighted load.
+	 */
+	if (pulled < max_nr_move && rem_load_move > 0) {
+		if (idx < this_best_prio)
+			this_best_prio = idx;
 		if (curr != head)
 			goto skip_queue;
 		idx++;
@@ -1949,7 +2146,7 @@
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of weighted load which should be
  * moved to restore balance via the imbalance parameter.
  */
 static struct sched_group *
@@ -1959,9 +2156,19 @@
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
+	unsigned long busiest_load_per_task, busiest_nr_running;
+	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance = 1;
+	unsigned long leader_nr_running = 0, min_load_per_task = 0;
+	unsigned long min_nr_running = ULONG_MAX;
+	struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
 	max_load = this_load = total_load = total_pwr = 0;
+	busiest_load_per_task = busiest_nr_running = 0;
+	this_load_per_task = this_nr_running = 0;
 	if (idle == NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == NEWLY_IDLE)
@@ -1970,16 +2177,19 @@
 		load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load;
+		unsigned long load, group_capacity;
 		int local_group;
 		int i;
+		unsigned long sum_nr_running, sum_weighted_load;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
+		sum_weighted_load = sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
+			runqueue_t *rq = cpu_rq(i);
+
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
 
@@ -1990,6 +2200,8 @@
 				load = source_load(i, load_idx);
 
 			avg_load += load;
+			sum_nr_running += rq->nr_running;
+			sum_weighted_load += rq->raw_weighted_load;
 		}
 
 		total_load += avg_load;
@@ -1998,17 +2210,80 @@
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
-		} else if (avg_load > max_load) {
+			this_nr_running = sum_nr_running;
+			this_load_per_task = sum_weighted_load;
+		} else if (avg_load > max_load &&
+			   sum_nr_running > group_capacity) {
 			max_load = avg_load;
 			busiest = group;
+			busiest_nr_running = sum_nr_running;
+			busiest_load_per_task = sum_weighted_load;
 		}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+		/*
+		 * Busy processors will not participate in power savings
+		 * balance.
+		 */
+ 		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ 			goto group_next;
+
+		/*
+		 * If the local group is idle or completely loaded
+		 * no need to do power savings balance at this domain
+		 */
+		if (local_group && (this_nr_running >= group_capacity ||
+				    !this_nr_running))
+			power_savings_balance = 0;
+
+ 		/*
+		 * If a group is already running at full capacity or idle,
+		 * don't include that group in power savings calculations
+ 		 */
+ 		if (!power_savings_balance || sum_nr_running >= group_capacity
+		    || !sum_nr_running)
+ 			goto group_next;
+
+ 		/*
+		 * Calculate the group which has the least non-idle load.
+ 		 * This is the group from where we need to pick up the load
+ 		 * for saving power
+ 		 */
+ 		if ((sum_nr_running < min_nr_running) ||
+ 		    (sum_nr_running == min_nr_running &&
+		     first_cpu(group->cpumask) <
+		     first_cpu(group_min->cpumask))) {
+ 			group_min = group;
+ 			min_nr_running = sum_nr_running;
+			min_load_per_task = sum_weighted_load /
+						sum_nr_running;
+ 		}
+
+ 		/*
+		 * Calculate the group which is nearly at its capacity
+ 		 * but still has some space to pick up some load from
+ 		 * another group and save more power
+ 		 */
+ 		if (sum_nr_running <= group_capacity - 1)
+ 			if (sum_nr_running > leader_nr_running ||
+ 			    (sum_nr_running == leader_nr_running &&
+ 			     first_cpu(group->cpumask) >
+ 			      first_cpu(group_leader->cpumask))) {
+ 				group_leader = group;
+ 				leader_nr_running = sum_nr_running;
+ 			}
+
+group_next:
+#endif
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 		goto out_balanced;
 
 	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2017,6 +2292,7 @@
 			100*max_load <= sd->imbalance_pct*this_load)
 		goto out_balanced;
 
+	busiest_load_per_task /= busiest_nr_running;
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
@@ -2028,21 +2304,50 @@
 	 * by pulling tasks to us.  Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
+	if (max_load <= busiest_load_per_task)
+		goto out_balanced;
+
+	/*
+	 * In the presence of smp nice balancing, certain scenarios can have
+	 * max load less than avg load (as we skip the groups at or below
+	 * their cpu_power while calculating max_load).
+	 */
+	if (max_load < avg_load) {
+		*imbalance = 0;
+		goto small_imbalance;
+	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * busiest->cpu_power,
 				(avg_load - this_load) * this->cpu_power)
 			/ SCHED_LOAD_SCALE;
 
-	if (*imbalance < SCHED_LOAD_SCALE) {
-		unsigned long pwr_now = 0, pwr_move = 0;
+	/*
+	 * If *imbalance is less than the average load per runnable task
+	 * there is no guarantee that any tasks will be moved, so we'll
+	 * have a think about bumping its value to force at least one
+	 * task to be moved.
+	 */
+	if (*imbalance < busiest_load_per_task) {
+		unsigned long pwr_now, pwr_move;
 		unsigned long tmp;
+		unsigned int imbn;
 
-		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-			*imbalance = 1;
+small_imbalance:
+		pwr_move = pwr_now = 0;
+		imbn = 2;
+		if (this_nr_running) {
+			this_load_per_task /= this_nr_running;
+			if (busiest_load_per_task > this_load_per_task)
+				imbn = 1;
+		} else
+			this_load_per_task = SCHED_LOAD_SCALE;
+
+		if (max_load - this_load >= busiest_load_per_task * imbn) {
+			*imbalance = busiest_load_per_task;
 			return busiest;
 		}
 
@@ -2052,39 +2357,47 @@
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now += busiest->cpu_power *
+			min(busiest_load_per_task, max_load);
+		pwr_now += this->cpu_power *
+			min(this_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-							max_load - tmp);
+			pwr_move += busiest->cpu_power *
+				min(busiest_load_per_task, max_load - tmp);
 
 		/* Amount of load we'd add */
 		if (max_load*busiest->cpu_power <
-				SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+				busiest_load_per_task*SCHED_LOAD_SCALE)
 			tmp = max_load*busiest->cpu_power/this->cpu_power;
 		else
-			tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+			tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
 		if (pwr_move <= pwr_now)
 			goto out_balanced;
 
-		*imbalance = 1;
-		return busiest;
+		*imbalance = busiest_load_per_task;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = *imbalance / SCHED_LOAD_SCALE;
 	return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		goto ret;
 
+	if (this == group_leader && group_leader != group_min) {
+		*imbalance = min_load_per_task;
+		return group_min;
+	}
+ret:
+#endif
 	*imbalance = 0;
 	return NULL;
 }
@@ -2093,18 +2406,21 @@
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static runqueue_t *find_busiest_queue(struct sched_group *group,
-	enum idle_type idle)
+	enum idle_type idle, unsigned long imbalance)
 {
-	unsigned long load, max_load = 0;
-	runqueue_t *busiest = NULL;
+	unsigned long max_load = 0;
+	runqueue_t *busiest = NULL, *rqi;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		rqi = cpu_rq(i);
 
-		if (load > max_load) {
-			max_load = load;
-			busiest = cpu_rq(i);
+		if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
+			continue;
+
+		if (rqi->raw_weighted_load > max_load) {
+			max_load = rqi->raw_weighted_load;
+			busiest = rqi;
 		}
 	}
 
@@ -2117,6 +2433,7 @@
  */
 #define MAX_PINNED_INTERVAL	512
 
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2133,7 +2450,8 @@
 	int active_balance = 0;
 	int sd_idle = 0;
 
-	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2144,7 +2462,7 @@
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle);
+	busiest = find_busiest_queue(group, idle, imbalance);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2164,6 +2482,7 @@
 		 */
 		double_rq_lock(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, idle, &all_pinned);
 		double_rq_unlock(this_rq, busiest);
 
@@ -2221,7 +2540,8 @@
 			sd->balance_interval *= 2;
 	}
 
-	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		return -1;
 	return nr_moved;
 
@@ -2236,7 +2556,7 @@
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	return 0;
 }
@@ -2257,7 +2577,7 @@
 	int nr_moved = 0;
 	int sd_idle = 0;
 
-	if (sd->flags & SD_SHARE_CPUPOWER)
+	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2267,7 +2587,7 @@
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE);
+	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2282,6 +2602,7 @@
 		/* Attempt to move tasks */
 		double_lock_balance(this_rq, busiest);
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
+					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
 	}
@@ -2297,7 +2618,7 @@
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	sd->nr_balance_failed = 0;
 	return 0;
@@ -2352,17 +2673,19 @@
 	double_lock_balance(busiest_rq, target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
-	for_each_domain(target_cpu, sd)
+	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 			cpu_isset(busiest_cpu, sd->span))
 				break;
+	}
 
 	if (unlikely(sd == NULL))
 		goto out;
 
 	schedstat_inc(sd, alb_cnt);
 
-	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+	if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+			RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
 		schedstat_inc(sd, alb_pushed);
 	else
 		schedstat_inc(sd, alb_failed);
@@ -2390,7 +2713,7 @@
 	struct sched_domain *sd;
 	int i;
 
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+	this_load = this_rq->raw_weighted_load;
 	/* Update our load */
 	for (i = 0; i < 3; i++) {
 		unsigned long new_load = this_load;
@@ -2691,48 +3014,35 @@
 		resched_task(rq->idle);
 }
 
-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupts disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
 {
 	struct sched_domain *tmp, *sd = NULL;
-	cpumask_t sibling_map;
 	int i;
 
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_SHARE_CPUPOWER)
+	for_each_domain(this_cpu, tmp) {
+		if (tmp->flags & SD_SHARE_CPUPOWER) {
 			sd = tmp;
+			break;
+		}
+	}
 
 	if (!sd)
 		return;
 
-	/*
-	 * Unlock the current runqueue because we have to lock in
-	 * CPU order to avoid deadlocks. Caller knows that we might
-	 * unlock. We keep IRQs disabled.
-	 */
-	spin_unlock(&this_rq->lock);
-
-	sibling_map = sd->span;
-
-	for_each_cpu_mask(i, sibling_map)
-		spin_lock(&cpu_rq(i)->lock);
-	/*
-	 * We clear this CPU from the mask. This both simplifies the
-	 * inner loop and keps this_rq locked when we exit:
-	 */
-	cpu_clear(this_cpu, sibling_map);
-
-	for_each_cpu_mask(i, sibling_map) {
+	for_each_cpu_mask(i, sd->span) {
 		runqueue_t *smt_rq = cpu_rq(i);
 
-		wakeup_busy_runqueue(smt_rq);
-	}
+		if (i == this_cpu)
+			continue;
+		if (unlikely(!spin_trylock(&smt_rq->lock)))
+			continue;
 
-	for_each_cpu_mask(i, sibling_map)
-		spin_unlock(&cpu_rq(i)->lock);
-	/*
-	 * We exit with this_cpu's rq still held and IRQs
-	 * still disabled:
-	 */
+		wakeup_busy_runqueue(smt_rq);
+		spin_unlock(&smt_rq->lock);
+	}
 }
 
 /*
@@ -2745,52 +3055,46 @@
 	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runqueue lock we
+ * only trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock, the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
 	struct sched_domain *tmp, *sd = NULL;
-	cpumask_t sibling_map;
-	prio_array_t *array;
 	int ret = 0, i;
-	task_t *p;
 
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_SHARE_CPUPOWER)
+	/* kernel/rt threads do not participate in dependent sleeping */
+	if (!p->mm || rt_task(p))
+		return 0;
+
+	for_each_domain(this_cpu, tmp) {
+		if (tmp->flags & SD_SHARE_CPUPOWER) {
 			sd = tmp;
+			break;
+		}
+	}
 
 	if (!sd)
 		return 0;
 
-	/*
-	 * The same locking rules and details apply as for
-	 * wake_sleeping_dependent():
-	 */
-	spin_unlock(&this_rq->lock);
-	sibling_map = sd->span;
-	for_each_cpu_mask(i, sibling_map)
-		spin_lock(&cpu_rq(i)->lock);
-	cpu_clear(this_cpu, sibling_map);
+	for_each_cpu_mask(i, sd->span) {
+		runqueue_t *smt_rq;
+		task_t *smt_curr;
 
-	/*
-	 * Establish next task to be run - it might have gone away because
-	 * we released the runqueue lock above:
-	 */
-	if (!this_rq->nr_running)
-		goto out_unlock;
-	array = this_rq->active;
-	if (!array->nr_active)
-		array = this_rq->expired;
-	BUG_ON(!array->nr_active);
+		if (i == this_cpu)
+			continue;
 
-	p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
-		task_t, run_list);
+		smt_rq = cpu_rq(i);
+		if (unlikely(!spin_trylock(&smt_rq->lock)))
+			continue;
 
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq = cpu_rq(i);
-		task_t *smt_curr = smt_rq->curr;
+		smt_curr = smt_rq->curr;
 
-		/* Kernel threads do not participate in dependent sleeping */
-		if (!p->mm || !smt_curr->mm || rt_task(p))
-			goto check_smt_task;
+		if (!smt_curr->mm)
+			goto unlock;
 
 		/*
 		 * If a user task with lower static priority than the
@@ -2808,49 +3112,24 @@
 			if ((jiffies % DEF_TIMESLICE) >
 				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
 					ret = 1;
-		} else
+		} else {
 			if (smt_curr->static_prio < p->static_prio &&
 				!TASK_PREEMPTS_CURR(p, smt_rq) &&
 				smt_slice(smt_curr, sd) > task_timeslice(p))
 					ret = 1;
-
-check_smt_task:
-		if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
-			rt_task(smt_curr))
-				continue;
-		if (!p->mm) {
-			wakeup_busy_runqueue(smt_rq);
-			continue;
 		}
-
-		/*
-		 * Reschedule a lower priority task on the SMT sibling for
-		 * it to be put to sleep, or wake it up if it has been put to
-		 * sleep for priority reasons to see if it should run now.
-		 */
-		if (rt_task(p)) {
-			if ((jiffies % DEF_TIMESLICE) >
-				(sd->per_cpu_gain * DEF_TIMESLICE / 100))
-					resched_task(smt_curr);
-		} else {
-			if (TASK_PREEMPTS_CURR(p, smt_rq) &&
-				smt_slice(p, sd) > task_timeslice(smt_curr))
-					resched_task(smt_curr);
-			else
-				wakeup_busy_runqueue(smt_rq);
-		}
+unlock:
+		spin_unlock(&smt_rq->lock);
 	}
-out_unlock:
-	for_each_cpu_mask(i, sibling_map)
-		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
 {
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
+					task_t *p)
 {
 	return 0;
 }
@@ -2972,32 +3251,13 @@
 
 	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
-go_idle:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
 			rq->expired_timestamp = 0;
-			wake_sleeping_dependent(cpu, rq);
-			/*
-			 * wake_sleeping_dependent() might have released
-			 * the runqueue, so break out if we got new
-			 * tasks meanwhile:
-			 */
-			if (!rq->nr_running)
-				goto switch_tasks;
-		}
-	} else {
-		if (dependent_sleeper(cpu, rq)) {
-			next = rq->idle;
+			wake_sleeping_dependent(cpu);
 			goto switch_tasks;
 		}
-		/*
-		 * dependent_sleeper() releases and reacquires the runqueue
-		 * lock, hence go into the idle loop if the rq went
-		 * empty meanwhile:
-		 */
-		if (unlikely(!rq->nr_running))
-			goto go_idle;
 	}
 
 	array = rq->active;
@@ -3035,6 +3295,8 @@
 		}
 	}
 	next->sleep_type = SLEEP_NORMAL;
+	if (dependent_sleeper(cpu, rq, next))
+		next = rq->idle;
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);
@@ -3478,12 +3740,65 @@
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
 	prio_array_t *array;
 	runqueue_t *rq;
-	int old_prio, new_prio, delta;
+	int old_prio, delta;
 
 	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
 		return;
@@ -3498,22 +3813,25 @@
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (rt_task(p)) {
+	if (has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array)
+	if (array) {
 		dequeue_task(p, array);
+		dec_raw_weighted_load(rq, p);
+	}
 
-	old_prio = p->prio;
-	new_prio = NICE_TO_PRIO(nice);
-	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
-	p->prio += delta;
+	set_load_weight(p);
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
 
 	if (array) {
 		enqueue_task(p, array);
+		inc_raw_weighted_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3524,7 +3842,6 @@
 out_unlock:
 	task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3639,16 +3956,15 @@
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	} else {
-		p->prio = p->static_prio;
-		/*
-		 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-		 */
-		if (policy == SCHED_BATCH)
-			p->sleep_avg = 0;
-	}
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	/*
+	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+	 */
+	if (policy == SCHED_BATCH)
+		p->sleep_avg = 0;
+	set_load_weight(p);
 }
 
 /**
@@ -3667,6 +3983,8 @@
 	unsigned long flags;
 	runqueue_t *rq;
 
+	/* may grab non-irq protected spin_locks */
+	BUG_ON(in_interrupt());
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
@@ -3715,14 +4033,20 @@
 	if (retval)
 		return retval;
 	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 */
+	spin_lock_irqsave(&p->pi_lock, flags);
+	/*
 	 * To be able to change p->policy safely, the appropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
 	array = p->array;
@@ -3743,7 +4067,11 @@
 		} else if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
-	task_rq_unlock(rq, &flags);
+	__task_rq_unlock(rq);
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	rt_mutex_adjust_pi(p);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
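
The sched_setscheduler() path above establishes a fixed ordering: take p->pi_lock first (so PI waiters can neither arrive nor leave), then the runqueue lock via __task_rq_lock(), and only after both are dropped call rt_mutex_adjust_pi(). A minimal sketch of that pattern, with the actual priority update elided:

/* Illustrative only; mirrors the locking order used above. */
static void adjust_task_prio(task_t *p)
{
	unsigned long flags;
	runqueue_t *rq;

	spin_lock_irqsave(&p->pi_lock, flags);	/* keep PI waiters stable */
	rq = __task_rq_lock(p);			/* rq lock nests inside pi_lock */

	/* ... dequeue, update p->prio / p->policy, requeue, resched ... */

	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);

	rt_mutex_adjust_pi(p);			/* PI chain walk, both locks dropped */
}
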
@@ -3765,8 +4093,10 @@
 		read_unlock_irq(&tasklist_lock);
 		return -ESRCH;
 	}
-	retval = sched_setscheduler(p, policy, &lparam);
+	get_task_struct(p);
 	read_unlock_irq(&tasklist_lock);
+	retval = sched_setscheduler(p, policy, &lparam);
+	put_task_struct(p);
 	return retval;
 }
 
@@ -4378,7 +4708,7 @@
 	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -4474,13 +4804,16 @@
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
  */
-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	runqueue_t *rq_dest, *rq_src;
+	int ret = 0;
 
 	if (unlikely(cpu_is_offline(dest_cpu)))
-		return;
+		return ret;
 
 	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
@@ -4508,9 +4841,10 @@
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
 			resched_task(rq_dest->curr);
 	}
-
+	ret = 1;
 out:
 	double_rq_unlock(rq_src, rq_dest);
+	return ret;
 }
 
 /*
@@ -4580,9 +4914,12 @@
 /* Figure out where task on dead CPU should go, use force if necessary. */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
 {
+	runqueue_t *rq;
+	unsigned long flags;
 	int dest_cpu;
 	cpumask_t mask;
 
+restart:
 	/* On same node? */
 	mask = node_to_cpumask(cpu_to_node(dead_cpu));
 	cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4594,8 +4931,10 @@
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu == NR_CPUS) {
+		rq = task_rq_lock(tsk, &flags);
 		cpus_setall(tsk->cpus_allowed);
 		dest_cpu = any_online_cpu(tsk->cpus_allowed);
+		task_rq_unlock(rq, &flags);
 
 		/*
 		 * Don't tell them about moving exiting tasks or
@@ -4607,7 +4946,8 @@
 			       "longer affine to cpu%d\n",
 			       tsk->pid, tsk->comm, dead_cpu);
 	}
-	__migrate_task(tsk, dead_cpu, dest_cpu);
+	if (!__migrate_task(tsk, dead_cpu, dest_cpu))
+		goto restart;
 }
 
 /*
@@ -4734,8 +5074,9 @@
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
-static int migration_call(struct notifier_block *nfb, unsigned long action,
-			  void *hcpu)
+static int __cpuinit migration_call(struct notifier_block *nfb,
+			unsigned long action,
+			void *hcpu)
 {
 	int cpu = (long)hcpu;
 	struct task_struct *p;
@@ -4805,7 +5146,7 @@
 /* Register at highest priority so that task migration (migrate_all_tasks)
  * happens before everything else.
  */
-static struct notifier_block migration_notifier = {
+static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
 	.priority = 10
 };
@@ -5606,6 +5947,7 @@
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -5621,7 +5963,7 @@
 
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static struct sched_group *sched_group_core_bycpu[NR_CPUS];
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5637,7 +5979,7 @@
 #endif
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
 #if defined(CONFIG_SCHED_MC)
@@ -5694,13 +6036,74 @@
 }
 #endif
 
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+	int cpu;
+#ifdef CONFIG_NUMA
+	int i;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+	for_each_cpu_mask(cpu, *cpu_map) {
+		if (sched_group_phys_bycpu[cpu]) {
+			kfree(sched_group_phys_bycpu[cpu]);
+			sched_group_phys_bycpu[cpu] = NULL;
+		}
+#ifdef CONFIG_SCHED_MC
+		if (sched_group_core_bycpu[cpu]) {
+			kfree(sched_group_core_bycpu[cpu]);
+			sched_group_core_bycpu[cpu] = NULL;
+		}
+#endif
+	}
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+	struct sched_group *sched_group_phys = NULL;
+#ifdef CONFIG_SCHED_MC
+	struct sched_group *sched_group_core = NULL;
+#endif
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -5708,11 +6111,11 @@
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-					   GFP_ATOMIC);
+	sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return;
+		return -ENOMEM;
 	}
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
@@ -5738,7 +6141,7 @@
 				if (!sched_group_allnodes) {
 					printk(KERN_WARNING
 					"Can not alloc allnodes sched group\n");
-					break;
+					goto error;
 				}
 				sched_group_allnodes_bycpu[i]
 						= sched_group_allnodes;
@@ -5759,6 +6162,18 @@
 		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
+		if (!sched_group_phys) {
+			sched_group_phys
+				= kmalloc(sizeof(struct sched_group) * NR_CPUS,
+					  GFP_KERNEL);
+			if (!sched_group_phys) {
+				printk(KERN_WARNING
+				       "Can not alloc phys sched group\n");
+				goto error;
+			}
+			sched_group_phys_bycpu[i] = sched_group_phys;
+		}
+
 		p = sd;
 		sd = &per_cpu(phys_domains, i);
 		group = cpu_to_phys_group(i);
@@ -5768,6 +6183,18 @@
 		sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
+		if (!sched_group_core) {
+			sched_group_core
+				= kmalloc(sizeof(struct sched_group) * NR_CPUS,
+					  GFP_KERNEL);
+			if (!sched_group_core) {
+				printk(KERN_WARNING
+				       "Can not alloc core sched group\n");
+				goto error;
+			}
+			sched_group_core_bycpu[i] = sched_group_core;
+		}
+
 		p = sd;
 		sd = &per_cpu(core_domains, i);
 		group = cpu_to_core_group(i);
@@ -5851,24 +6278,21 @@
 		domainspan = sched_domain_node_span(i);
 		cpus_and(domainspan, domainspan, *cpu_map);
 
-		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+		if (!sg) {
+			printk(KERN_WARNING "Can not alloc domain group for "
+				"node %d\n", i);
+			goto error;
+		}
 		sched_group_nodes[i] = sg;
 		for_each_cpu_mask(j, nodemask) {
 			struct sched_domain *sd;
 			sd = &per_cpu(node_domains, j);
 			sd->groups = sg;
-			if (sd->groups == NULL) {
-				/* Turn off balancing if we have no groups */
-				sd->flags = 0;
-			}
-		}
-		if (!sg) {
-			printk(KERN_WARNING
-			"Can not alloc domain group for node %d\n", i);
-			continue;
 		}
 		sg->cpu_power = 0;
 		sg->cpumask = nodemask;
+		sg->next = sg;
 		cpus_or(covered, covered, nodemask);
 		prev = sg;
 
@@ -5887,54 +6311,90 @@
 			if (cpus_empty(tmp))
 				continue;
 
-			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			sg = kmalloc_node(sizeof(struct sched_group),
+					  GFP_KERNEL, i);
 			if (!sg) {
 				printk(KERN_WARNING
 				"Can not alloc domain group for node %d\n", j);
-				break;
+				goto error;
 			}
 			sg->cpu_power = 0;
 			sg->cpumask = tmp;
+			sg->next = prev->next;
 			cpus_or(covered, covered, tmp);
 			prev->next = sg;
 			prev = sg;
 		}
-		prev->next = sched_group_nodes[i];
 	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+		sd = &per_cpu(cpu_domains, i);
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+	}
+#endif
+#ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
 		int power;
 		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
-#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(core_domains, i);
-		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
 					    * SCHED_LOAD_SCALE / 10;
 		sd->groups->cpu_power = power;
+	}
+#endif
 
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
+		if (i != first_cpu(sd->groups->cpumask))
+			continue;
 
- 		/*
- 		 * This has to be < 2 * SCHED_LOAD_SCALE
- 		 * Lets keep it SCHED_LOAD_SCALE, so that
- 		 * while calculating NUMA group's cpu_power
- 		 * we can simply do
- 		 *  numa_group->cpu_power += phys_group->cpu_power;
- 		 *
- 		 * See "only add power once for each physical pkg"
- 		 * comment below
- 		 */
- 		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = 0;
+		if (sched_mc_power_savings || sched_smt_power_savings) {
+			int j;
+
+ 			for_each_cpu_mask(j, sd->groups->cpumask) {
+				struct sched_domain *sd1;
+ 				sd1 = &per_cpu(core_domains, j);
+ 				/*
+ 				 * For each core, add its power only once
+ 				 * to the group in the physical domain.
+ 				 */
+ 				if (j != first_cpu(sd1->groups->cpumask))
+ 					continue;
+
+ 				if (sched_smt_power_savings)
+ 					sd->groups->cpu_power += sd1->groups->cpu_power;
+ 				else
+ 					sd->groups->cpu_power += SCHED_LOAD_SCALE;
+ 			}
+ 		} else
+ 			/*
+ 			 * This has to be < 2 * SCHED_LOAD_SCALE
+ 			 * Lets keep it SCHED_LOAD_SCALE, so that
+ 			 * while calculating NUMA group's cpu_power
+ 			 * we can simply do
+ 			 *  numa_group->cpu_power += phys_group->cpu_power;
+ 			 *
+ 			 * See "only add power once for each physical pkg"
+ 			 * comment below
+ 			 */
+ 			sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+		int power;
 		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
 	}
@@ -5962,13 +6422,20 @@
 	 * Tune cache-hot values:
 	 */
 	calibrate_migration_costs(cpu_map);
+
+	return 0;
+
+error:
+	free_sched_groups(cpu_map);
+	return -ENOMEM;
 }
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
+	int err;
 
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
@@ -5977,51 +6444,14 @@
 	 */
 	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
 
-	build_sched_domains(&cpu_default_map);
+	err = build_sched_domains(&cpu_default_map);
+
+	return err;
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-#ifdef CONFIG_NUMA
-	int i;
-	int cpu;
-
-	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < MAX_NUMNODES; i++) {
-			cpumask_t nodemask = node_to_cpumask(i);
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpus_and(nodemask, nodemask, *cpu_map);
-			if (cpus_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
-		}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
-	}
-#endif
+	free_sched_groups(cpu_map);
 }
 
 /*
@@ -6046,9 +6476,10 @@
  * correct sched domains
  * Call with hotplug lock held
  */
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 {
 	cpumask_t change_map;
+	int err = 0;
 
 	cpus_and(*partition1, *partition1, cpu_online_map);
 	cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6057,11 +6488,87 @@
 	/* Detach sched domains from all of the affected cpus */
 	detach_destroy_domains(&change_map);
 	if (!cpus_empty(*partition1))
-		build_sched_domains(partition1);
-	if (!cpus_empty(*partition2))
-		build_sched_domains(partition2);
+		err = build_sched_domains(partition1);
+	if (!err && !cpus_empty(*partition2))
+		err = build_sched_domains(partition2);
+
+	return err;
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+	int err;
+
+	lock_cpu_hotplug();
+	detach_destroy_domains(&cpu_online_map);
+	err = arch_init_sched_domains(&cpu_online_map);
+	unlock_cpu_hotplug();
+
+	return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	int ret;
+
+	if (buf[0] != '0' && buf[0] != '1')
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = (buf[0] == '1');
+	else
+		sched_mc_power_savings = (buf[0] == '1');
+
+	ret = arch_reinit_sched_domains();
+
+	return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+	int err = 0;
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_mc_power_savings.attr);
+#endif
+	return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+	    sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+	    sched_smt_power_savings_store);
+#endif
+
+
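
Assuming the power-savings entries get registered on the existing CPU sysdev class (which is what the smt_capable()/mc_capable() checks suggest), the two knobs would surface as /sys/devices/system/cpu/sched_mc_power_savings and .../sched_smt_power_savings, and writing "1" to either rebuilds the domains through arch_reinit_sched_domains(). A hedged sketch of one possible registration site; the initcall placement is an assumption, not taken from this diff:

/* Illustrative registration site only. */
#include <linux/sysdev.h>
#include <linux/init.h>

extern struct sysdev_class cpu_sysdev_class;	/* drivers/base/cpu.c */

static int __init sched_power_savings_sysfs_init(void)
{
	/* Creates the sched_*_power_savings files when the CPUs support them. */
	return sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
}
late_initcall(sched_power_savings_sysfs_init);
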
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
@@ -6143,7 +6650,6 @@
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
-		rq->cpu = i;
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
@@ -6158,6 +6664,7 @@
 		}
 	}
 
+	set_load_weight(&init_task);
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
@@ -6204,11 +6711,12 @@
 	runqueue_t *rq;
 
 	read_lock_irq(&tasklist_lock);
-	for_each_process (p) {
+	for_each_process(p) {
 		if (!rt_task(p))
 			continue;
 
-		rq = task_rq_lock(p, &flags);
+		spin_lock_irqsave(&p->pi_lock, flags);
+		rq = __task_rq_lock(p);
 
 		array = p->array;
 		if (array)
@@ -6219,7 +6727,8 @@
 			resched_task(rq->curr);
 		}
 
-		task_rq_unlock(rq, &flags);
+		__task_rq_unlock(rq);
+		spin_unlock_irqrestore(&p->pi_lock, flags);
 	}
 	read_unlock_irq(&tasklist_lock);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9e2f1c6..8f03e3b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
@@ -486,7 +486,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __devinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b5c3b94..6b76caa 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@
 /*
  * Create/destroy watchdog threads as CPUs come and go:
  */
-static int
+static int __devinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __devinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f1a4eb1..93a2c53 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -133,6 +133,10 @@
 extern int no_unaligned_warning;
 #endif
 
+#ifdef CONFIG_RT_MUTEXES
+extern int max_lock_depth;
+#endif
+
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -688,6 +692,17 @@
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_RT_MUTEXES
+	{
+		.ctl_name	= KERN_MAX_LOCK_DEPTH,
+		.procname	= "max_lock_depth",
+		.data		= &max_lock_depth,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+
 	{ .ctl_name = 0 }
 };
 
@@ -928,6 +943,18 @@
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+#ifdef CONFIG_X86_32
+	{
+		.ctl_name	= VM_VDSO_ENABLED,
+		.procname	= "vdso_enabled",
+		.data		= &vdso_enabled,
+		.maxlen		= sizeof(vdso_enabled),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 5bb6b79..5a89602 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1652,7 +1652,7 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int timer_cpu_notify(struct notifier_block *self,
+static int __devinit timer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1672,7 +1672,7 @@
 	return NOTIFY_OK;
 }
 
-static struct notifier_block timers_nb = {
+static struct notifier_block __devinitdata timers_nb = {
 	.notifier_call	= timer_cpu_notify,
 };
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 565cf7a..59f0b42 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -559,7 +559,7 @@
 }
 
 /* We're holding the cpucontrol mutex here */
-static int workqueue_cpu_callback(struct notifier_block *nfb,
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
diff --git a/lib/Kconfig b/lib/Kconfig
index 3de9335..f629934 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -86,4 +86,10 @@
 config TEXTSEARCH_FSM
 	tristate
 
+#
+# plist support is selected (via Kconfig "select") when needed
+#
+config PLIST
+	boolean
+
 endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8bab010..5330911 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -107,6 +107,24 @@
 	 This allows mutex semantics violations and mutex related deadlocks
 	 (lockups) to be detected and reported automatically.
 
+config DEBUG_RT_MUTEXES
+	bool "RT Mutex debugging, deadlock detection"
+	depends on DEBUG_KERNEL && RT_MUTEXES
+	help
+	 This allows rt mutex semantics violations and rt mutex related
+	 deadlocks (lockups) to be detected and reported automatically.
+
+config DEBUG_PI_LIST
+	bool
+	default y
+	depends on DEBUG_RT_MUTEXES
+
+config RT_MUTEX_TESTER
+	bool "Built-in scriptable tester for rt-mutexes"
+	depends on DEBUG_KERNEL && RT_MUTEXES
+	help
+	  This option enables an rt-mutex tester.
+
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
 	depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 79358ad..10c13c9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,6 +25,7 @@
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
+obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 
 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
diff --git a/lib/plist.c b/lib/plist.c
new file mode 100644
index 0000000..3074a02
--- /dev/null
+++ b/lib/plist.c
@@ -0,0 +1,118 @@
+/*
+ * lib/plist.c
+ *
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov <oleg@tv-sign.ru>
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This file contains the add / del functions which are considered to
+ * be too large to inline. See include/linux/plist.h for further
+ * information.
+ */
+
+#include <linux/plist.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_DEBUG_PI_LIST
+
+static void plist_check_prev_next(struct list_head *t, struct list_head *p,
+				  struct list_head *n)
+{
+	if (n->prev != p || p->next != n) {
+		printk("top: %p, n: %p, p: %p\n", t, t->next, t->prev);
+		printk("prev: %p, n: %p, p: %p\n", p, p->next, p->prev);
+		printk("next: %p, n: %p, p: %p\n", n, n->next, n->prev);
+		WARN_ON(1);
+	}
+}
+
+static void plist_check_list(struct list_head *top)
+{
+	struct list_head *prev = top, *next = top->next;
+
+	plist_check_prev_next(top, prev, next);
+	while (next != top) {
+		prev = next;
+		next = prev->next;
+		plist_check_prev_next(top, prev, next);
+	}
+}
+
+static void plist_check_head(struct plist_head *head)
+{
+	WARN_ON(!head->lock);
+	if (head->lock)
+		WARN_ON_SMP(!spin_is_locked(head->lock));
+	plist_check_list(&head->prio_list);
+	plist_check_list(&head->node_list);
+}
+
+#else
+# define plist_check_head(h)	do { } while (0)
+#endif
+
+/**
+ * plist_add - add @node to @head
+ *
+ * @node:	&struct plist_node pointer
+ * @head:	&struct plist_head pointer
+ */
+void plist_add(struct plist_node *node, struct plist_head *head)
+{
+	struct plist_node *iter;
+
+	plist_check_head(head);
+	WARN_ON(!plist_node_empty(node));
+
+	list_for_each_entry(iter, &head->prio_list, plist.prio_list) {
+		if (node->prio < iter->prio)
+			goto lt_prio;
+		else if (node->prio == iter->prio) {
+			iter = list_entry(iter->plist.prio_list.next,
+					struct plist_node, plist.prio_list);
+			goto eq_prio;
+		}
+	}
+
+lt_prio:
+	list_add_tail(&node->plist.prio_list, &iter->plist.prio_list);
+eq_prio:
+	list_add_tail(&node->plist.node_list, &iter->plist.node_list);
+
+	plist_check_head(head);
+}
+
+/**
+ * plist_del - Remove a @node from plist.
+ *
+ * @node:	&struct plist_node pointer - entry to be removed
+ * @head:	&struct plist_head pointer - list head
+ */
+void plist_del(struct plist_node *node, struct plist_head *head)
+{
+	plist_check_head(head);
+
+	if (!list_empty(&node->plist.prio_list)) {
+		struct plist_node *next = plist_first(&node->plist);
+
+		list_move_tail(&next->plist.prio_list, &node->plist.prio_list);
+		list_del_init(&node->plist.prio_list);
+	}
+
+	list_del_init(&node->plist.node_list);
+
+	plist_check_head(head);
+}
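
A minimal usage sketch for the new plist API. plist_head_init(), plist_node_init() and plist_first() are assumed from include/linux/plist.h (not shown in this diff); the waiter structure and lock below are made up for illustration:

#include <linux/plist.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(waiter_lock);
static struct plist_head waiter_list;

struct my_waiter {
	struct plist_node list_entry;
	/* ... per-waiter payload ... */
};

static void plist_example(struct my_waiter *w, int prio)
{
	/* one-time initialization shown inline for brevity */
	plist_head_init(&waiter_list, &waiter_lock);
	plist_node_init(&w->list_entry, prio);	/* lower value = higher priority */

	spin_lock(&waiter_lock);
	plist_add(&w->list_entry, &waiter_list);	/* sorted insert */
	/* the highest-priority entry is always plist_first(&waiter_list) */
	plist_del(&w->list_entry, &waiter_list);
	spin_unlock(&waiter_lock);
}
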
diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index 02a16ea..d84560c 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -63,10 +63,10 @@
       bytes, which is the maximum length that can be coded.  inflate_fast()
       requires strm->avail_out >= 258 for each loop to avoid checking for
       output space.
+
+    - @start:	inflate()'s starting value for strm->avail_out
  */
-void inflate_fast(strm, start)
-z_streamp strm;
-unsigned start;         /* inflate()'s starting value for strm->avail_out */
+void inflate_fast(z_streamp strm, unsigned start)
 {
     struct inflate_state *state;
     unsigned char *in;      /* local strm->next_in */
diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c
index da665fbb..3fe6ce5 100644
--- a/lib/zlib_inflate/inftrees.c
+++ b/lib/zlib_inflate/inftrees.c
@@ -20,13 +20,8 @@
    table index bits.  It will differ if the request is greater than the
    longest code or if it is less than the shortest code.
  */
-int zlib_inflate_table(type, lens, codes, table, bits, work)
-codetype type;
-unsigned short *lens;
-unsigned codes;
-code **table;
-unsigned *bits;
-unsigned short *work;
+int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes,
+			code **table, unsigned *bits, unsigned short *work)
 {
     unsigned len;               /* a code's length in bits */
     unsigned sym;               /* index of code symbols */
diff --git a/mm/Kconfig b/mm/Kconfig
index 66e65ab..e76c023 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -116,6 +116,7 @@
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+	depends on (IA64 || X86 || PPC64)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
 	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
diff --git a/mm/filemap.c b/mm/filemap.c
index 9c7334b..d504d6e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2095,14 +2095,21 @@
 	do {
 		unsigned long index;
 		unsigned long offset;
-		unsigned long maxlen;
 		size_t copied;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
+
+		/* Limit the size of the copy to the caller's write size */
+		bytes = min(bytes, count);
+
+		/*
+		 * Limit the size of the copy to that of the current segment,
+		 * because fault_in_pages_readable() doesn't know how to walk
+		 * segments.
+		 */
+		bytes = min(bytes, cur_iov->iov_len - iov_base);
 
 		/*
 		 * Bring in the user page that we will copy from _first_.
@@ -2110,10 +2117,7 @@
 		 * same page as we're writing to, without it being marked
 		 * up-to-date.
 		 */
-		maxlen = cur_iov->iov_len - iov_base;
-		if (maxlen > bytes)
-			maxlen = bytes;
-		fault_in_pages_readable(buf, maxlen);
+		fault_in_pages_readable(buf, bytes);
 
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
 		if (!page) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 841a077..ea40388 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,6 +21,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
+#include <linux/ioport.h>
 
 #include <asm/tlbflush.h>
 
@@ -126,6 +127,9 @@
 	unsigned long i;
 	unsigned long flags;
 	unsigned long onlined_pages = 0;
+	struct resource res;
+	u64 section_end;
+	unsigned long start_pfn;
 	struct zone *zone;
 	int need_zonelists_rebuild = 0;
 
@@ -148,10 +152,27 @@
 	if (!populated_zone(zone))
 		need_zonelists_rebuild = 1;
 
-	for (i = 0; i < nr_pages; i++) {
-		struct page *page = pfn_to_page(pfn + i);
-		online_page(page);
-		onlined_pages++;
+	res.start = (u64)pfn << PAGE_SHIFT;
+	res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
+	res.flags = IORESOURCE_MEM; /* we just need system ram */
+	section_end = res.end;
+
+	while (find_next_system_ram(&res) >= 0) {
+		start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
+		nr_pages = (unsigned long)
+                           ((res.end + 1 - res.start) >> PAGE_SHIFT);
+
+		if (PageReserved(pfn_to_page(start_pfn))) {
+			/* pages in this region are not onlined yet */
+			for (i = 0; i < nr_pages; i++) {
+				struct page *page = pfn_to_page(start_pfn + i);
+				online_page(page);
+				onlined_pages++;
+			}
+		}
+
+		res.start = res.end + 1;
+		res.end = section_end;
 	}
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
@@ -163,3 +184,100 @@
 	vm_total_pages = nr_free_pagecache_pages();
 	return 0;
 }
+
+static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+{
+	struct pglist_data *pgdat;
+	unsigned long zones_size[MAX_NR_ZONES] = {0};
+	unsigned long zholes_size[MAX_NR_ZONES] = {0};
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+
+	pgdat = arch_alloc_nodedata(nid);
+	if (!pgdat)
+		return NULL;
+
+	arch_refresh_nodedata(nid, pgdat);
+
+	/* we can use NODE_DATA(nid) from here */
+
+	/* init node's zones as empty zones, we don't have any present pages.*/
+	free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
+
+	return pgdat;
+}
+
+static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+{
+	arch_refresh_nodedata(nid, NULL);
+	arch_free_nodedata(pgdat);
+	return;
+}
+
+/* add this memory to iomem resource */
+static void register_memory_resource(u64 start, u64 size)
+{
+	struct resource *res;
+
+	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+	BUG_ON(!res);
+
+	res->name = "System RAM";
+	res->start = start;
+	res->end = start + size - 1;
+	res->flags = IORESOURCE_MEM;
+	if (request_resource(&iomem_resource, res) < 0) {
+		printk(KERN_ERR "System RAM resource %llx - %llx cannot be added\n",
+		       (unsigned long long)res->start, (unsigned long long)res->end);
+		kfree(res);
+	}
+}
+
+
+{
+	pg_data_t *pgdat = NULL;
+	int new_pgdat = 0;
+	int ret;
+
+	if (!node_online(nid)) {
+		pgdat = hotadd_new_pgdat(nid, start);
+		if (!pgdat)
+			return -ENOMEM;
+		new_pgdat = 1;
+		ret = kswapd_run(nid);
+		if (ret)
+			goto error;
+	}
+
+	/* call arch's memory hotadd */
+	ret = arch_add_memory(nid, start, size);
+
+	if (ret < 0)
+		goto error;
+
+	/* we online node here. we can't roll back from here. */
+	node_set_online(nid);
+
+	if (new_pgdat) {
+		ret = register_one_node(nid);
+		/*
+		 * If the sysfs file for the new node can't be created,
+		 * CPUs on that node can't be hot-added.  There is no way
+		 * to roll back at this point, so catch it with BUG_ON().
+		 */
+		BUG_ON(ret);
+	}
+
+	/* register this memory as resource */
+	register_memory_resource(start, size);
+
+	return ret;
+error:
+	/* rollback pgdat allocation and others */
+	if (new_pgdat)
+		rollback_node_hotadd(nid, pgdat);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
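
A hedged sketch of how an architecture or memory-hotplug driver might call the new add_memory() interface; the nid/start/size values and the function name are hypothetical and would normally come from a firmware notification:

/* Illustrative caller of add_memory(); values and naming are made up. */
static int example_memory_hotadd(int nid, u64 start, u64 size)
{
	int ret;

	ret = add_memory(nid, start, size);	/* creates pgdat/kswapd if needed */
	if (ret)
		return ret;

	/*
	 * The range is now registered as "System RAM" but its pages are
	 * still offline; they become usable once the matching memory
	 * sections are onlined (which ends up in online_pages() above).
	 */
	return 0;
}
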
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8ccf6f1b..4ec7026 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -516,14 +516,14 @@
 		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }
 
-static int
+static int __cpuinit
 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 {
 	set_ratelimit();
 	return 0;
 }
 
-static struct notifier_block ratelimit_nb = {
+static struct notifier_block __cpuinitdata ratelimit_nb = {
 	.notifier_call	= ratelimit_handler,
 	.next		= NULL,
 };
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9f86191..084a2de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -446,8 +446,8 @@
 
 	arch_free_page(page, order);
 	if (!PageHighMem(page))
-		mutex_debug_check_no_locks_freed(page_address(page),
-						 PAGE_SIZE<<order);
+		debug_check_no_locks_freed(page_address(page),
+					   PAGE_SIZE<<order);
 
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
@@ -2009,7 +2009,7 @@
 	}
 }
 
-static int pageset_cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action,
 		void *hcpu)
 {
@@ -2031,7 +2031,7 @@
 	return ret;
 }
 
-static struct notifier_block pageset_notifier =
+static struct notifier_block __cpuinitdata pageset_notifier =
 	{ &pageset_cpuup_callback, NULL, 0 };
 
 void __init setup_per_cpu_pageset(void)
diff --git a/mm/slab.c b/mm/slab.c
index 98ac20b..233e39d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -89,6 +89,7 @@
 #include	<linux/config.h>
 #include	<linux/slab.h>
 #include	<linux/mm.h>
+#include	<linux/poison.h>
 #include	<linux/swap.h>
 #include	<linux/cache.h>
 #include	<linux/interrupt.h>
@@ -106,6 +107,7 @@
 #include	<linux/nodemask.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
+#include	<linux/rtmutex.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -492,17 +494,6 @@
 #endif
 
 #if DEBUG
-/*
- * Magic nums for obj red zoning.
- * Placed in the first word before and the first word after an obj.
- */
-#define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
-#define	RED_ACTIVE	0x170FC2A5UL	/* when obj is active */
-
-/* ...and for poisoning */
-#define	POISON_INUSE	0x5a	/* for use-uninitialised poisoning */
-#define POISON_FREE	0x6b	/* for use-after-free poisoning */
-#define	POISON_END	0xa5	/* end-byte of poisoning */
 
 /*
  * memory layout of objects:
@@ -1083,7 +1074,7 @@
 
 #endif
 
-static int cpuup_callback(struct notifier_block *nfb,
+static int __devinit cpuup_callback(struct notifier_block *nfb,
 				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1265,7 +1256,9 @@
 	return NOTIFY_BAD;
 }
 
-static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+static struct notifier_block __cpuinitdata cpucache_notifier = {
+	&cpuup_callback, NULL, 0
+};
 
 /*
  * swap the static kmem_list3 with kmalloced memory
@@ -3405,7 +3398,7 @@
 	local_irq_save(flags);
 	kfree_debugcheck(objp);
 	c = virt_to_cache(objp);
-	mutex_debug_check_no_locks_freed(objp, obj_size(c));
+	debug_check_no_locks_freed(objp, obj_size(c));
 	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index e0a3fe4..c7a2b3a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -45,7 +45,7 @@
 
 static int sparse_index_init(unsigned long section_nr, int nid)
 {
-	static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(index_init_lock);
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 	int ret = 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72babac..eeacb0d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1223,7 +1224,6 @@
 	};
 	cpumask_t cpumask;
 
-	daemonize("kswapd%d", pgdat->node_id);
 	cpumask = node_to_cpumask(pgdat->node_id);
 	if (!cpus_empty(cpumask))
 		set_cpus_allowed(tsk, cpumask);
@@ -1450,7 +1450,7 @@
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
@@ -1468,20 +1468,35 @@
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function is called by init and by node hot-add.
+ * On node hot-add, kswapd will be moved to the proper CPUs if CPUs are
+ * hot-added later.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		printk(KERN_ERR "Failed to start kswapd on node %d\n", nid);
+		ret = -1;
+	}
+	return ret;
+}
+
 static int __init kswapd_init(void)
 {
-	pg_data_t *pgdat;
+	int nid;
 
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
+	for_each_online_node(nid)
+		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8a77793..e728980 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -349,7 +349,7 @@
 	    (strict & RT6_SELECT_F_REACHABLE) &&
 	    last && last != rt0) {
 		/* no entries matched; do round-robin */
-		static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+		static DEFINE_SPINLOCK(lock);
 		spin_lock(&lock);
 		*head = rt0->u.next;
 		rt0->u.next = last->u.next;
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index f433112..2f31216 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,7 +70,7 @@
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
-spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(krb5_seq_lock);
 
 u32
 gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 5412804..1bb7570 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -117,7 +117,7 @@
 static struct bcbearer *bcbearer = NULL;
 static struct bclink *bclink = NULL;
 static struct link *bcl = NULL;
-static spinlock_t bc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bc_lock);
 
 char tipc_bclink_name[] = "multicast-link";
 
@@ -796,7 +796,7 @@
 	memset(bclink, 0, sizeof(struct bclink));
 	INIT_LIST_HEAD(&bcl->waiting_ports);
 	bcl->next_out_no = 1;
-	bclink->node.lock =  SPIN_LOCK_UNLOCKED;        
+	spin_lock_init(&bclink->node.lock);
 	bcl->owner = &bclink->node;
         bcl->max_pkt = MAX_PKT_DEFAULT_MCAST;
 	tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 4fa24b5..7ef17a4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -566,7 +566,7 @@
 		b_ptr->link_req = tipc_disc_init_link_req(b_ptr, &m_ptr->bcast_addr,
 							  bcast_scope, 2);
 	}
-	b_ptr->publ.lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&b_ptr->publ.lock);
 	write_unlock_bh(&tipc_net_lock);
 	info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
 	     name, addr_string_fill(addr_string, bcast_scope), priority);
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 3ec502fa..285e1bc 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -63,7 +63,7 @@
 
 static struct manager mng = { 0};
 
-static spinlock_t config_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(config_lock);
 
 static const void *req_tlv_area;	/* request message TLV area */
 static int req_tlv_space;		/* request message TLV area size */
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 26ef95d..5513065 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -41,7 +41,7 @@
 #define MAX_STRING 512
 
 static char print_string[MAX_STRING];
-static spinlock_t print_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(print_lock);
 
 static struct print_buf cons_buf = { NULL, 0, NULL, NULL };
 struct print_buf *TIPC_CONS = &cons_buf;
diff --git a/net/tipc/handler.c b/net/tipc/handler.c
index 966f70a1..ae6ddf0 100644
--- a/net/tipc/handler.c
+++ b/net/tipc/handler.c
@@ -44,7 +44,7 @@
 
 static kmem_cache_t *tipc_queue_item_cache;
 static struct list_head signal_queue_head;
-static spinlock_t qitem_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qitem_lock);
 static int handler_enabled = 0;
 
 static void process_signal_queue(unsigned long dummy);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 38571306..a6926ff 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -101,7 +101,7 @@
 
 static struct name_table table = { NULL } ;
 static atomic_t rsv_publ_ok = ATOMIC_INIT(0);
-rwlock_t tipc_nametbl_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(tipc_nametbl_lock);
 
 
 static int hash(int x)
@@ -172,7 +172,7 @@
 	}
 
 	memset(nseq, 0, sizeof(*nseq));
-	nseq->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&nseq->lock);
 	nseq->type = type;
 	nseq->sseqs = sseq;
 	dbg("tipc_nameseq_create(): nseq = %p, type %u, ssseqs %p, ff: %u\n",
diff --git a/net/tipc/net.c b/net/tipc/net.c
index f7c8223..e5a359a 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -115,7 +115,7 @@
  *     - A local spin_lock protecting the queue of subscriber events.
 */
 
-rwlock_t tipc_net_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(tipc_net_lock);
 struct network tipc_net = { NULL };
 
 struct node *tipc_net_select_remote_node(u32 addr, u32 ref) 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index ce9678e..861322b 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -77,7 +77,7 @@
 		
 	memset(n_ptr, 0, sizeof(*n_ptr));
 	n_ptr->addr = addr;
-	n_ptr->lock =  SPIN_LOCK_UNLOCKED;	
+	spin_lock_init(&n_ptr->lock);
 	INIT_LIST_HEAD(&n_ptr->nsub);
 	n_ptr->owner = c_ptr;
 	tipc_cltr_attach_node(c_ptr, n_ptr);
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 47d9740..3251c8d 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -57,8 +57,8 @@
 static struct sk_buff *msg_queue_head = NULL;
 static struct sk_buff *msg_queue_tail = NULL;
 
-spinlock_t tipc_port_list_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(tipc_port_list_lock);
+static DEFINE_SPINLOCK(queue_lock);
 
 static LIST_HEAD(ports);
 static void port_handle_node_down(unsigned long ref);
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index d2f0cce..596d3c8 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -63,7 +63,7 @@
 
 struct ref_table tipc_ref_table = { NULL };
 
-static rwlock_t ref_table_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(ref_table_lock);
 
 /**
  * tipc_ref_table_init - create reference table for objects
@@ -87,7 +87,7 @@
 	index_mask = sz - 1;
 	for (i = sz - 1; i >= 0; i--) {
 		table[i].object = NULL;
-		table[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&table[i].lock);
 		table[i].data.next_plus_upper = (start & ~index_mask) + i - 1;
 	}
 	tipc_ref_table.entries = table;
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index fc17187..e19b4bc 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -457,7 +457,7 @@
 	int res = -1;
 
 	memset(&topsrv, 0, sizeof (topsrv));
-	topsrv.lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&topsrv.lock);
 	INIT_LIST_HEAD(&topsrv.subscriber_list);
 
 	spin_lock_bh(&topsrv.lock);
diff --git a/net/tipc/user_reg.c b/net/tipc/user_reg.c
index 3f3f933..1e3ae57 100644
--- a/net/tipc/user_reg.c
+++ b/net/tipc/user_reg.c
@@ -67,7 +67,7 @@
 
 static struct tipc_user *users = NULL;
 static u32 next_free_user = MAX_USERID + 1;
-static spinlock_t reg_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(reg_lock);
 
 /**
  * reg_init - create TIPC user registry (but don't activate it)
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index ac5f275..b0d067b 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -13,11 +13,6 @@
 depfile = $(subst $(comma),_,$(@D)/.$(@F).d)
 
 ###
-# basetarget equals the filename of the target with no extension.
-# So 'foo/bar.o' becomes 'bar'
-basetarget = $(basename $(notdir $@))
-
-###
 # Escape single quote for use in echo statements
 escsq = $(subst $(squote),'\$(squote)',$1)
 
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 3cb445c..02a7eea 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -117,7 +117,7 @@
 $(obj-m)              : quiet_modtag := [M]
 
 # Default for not multi-part modules
-modname = $(basetarget)
+modname = $(*F)
 
 $(multi-objs-m)         : modname = $(modname-multi)
 $(multi-objs-m:.o=.i)   : modname = $(modname-multi)
diff --git a/scripts/Makefile.host b/scripts/Makefile.host
index 18ecd4d..2b066d1 100644
--- a/scripts/Makefile.host
+++ b/scripts/Makefile.host
@@ -80,10 +80,8 @@
 #####
 # Handle options to gcc. Support building with separate output directory
 
-_hostc_flags   = $(HOSTCFLAGS)   $(HOST_EXTRACFLAGS)   \
-                 $(HOSTCFLAGS_$(basetarget).o)
-_hostcxx_flags = $(HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) \
-                 $(HOSTCXXFLAGS_$(basetarget).o)
+_hostc_flags   = $(HOSTCFLAGS)   $(HOST_EXTRACFLAGS)   $(HOSTCFLAGS_$(*F).o)
+_hostcxx_flags = $(HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) $(HOSTCXXFLAGS_$(*F).o)
 
 ifeq ($(KBUILD_SRC),)
 __hostc_flags	= $(_hostc_flags)
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index fc498fe..2cb4935 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -82,12 +82,12 @@
 #       than one module. In that case KBUILD_MODNAME will be set to foo_bar,
 #       where foo and bar are the name of the modules.
 name-fix = $(subst $(comma),_,$(subst -,_,$1))
-basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
+basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(*F)))"
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
-_c_flags       = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(basetarget).o)
-_a_flags       = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(basetarget).o)
+_c_flags       = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(*F).o)
+_a_flags       = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(*F).o)
 _cpp_flags     = $(CPPFLAGS) $(EXTRA_CPPFLAGS) $(CPPFLAGS_$(@F))
 
 # If building the kernel in a separate objtree expand all occurrences
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index e83613e..576cce5e 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -72,7 +72,7 @@
 # Step 5), compile all *.mod.c files
 
 # modname is set to make c_flags define KBUILD_MODNAME
-modname = $(basetarget)
+modname = $(*F)
 
 quiet_cmd_cc_o_c = CC      $@
       cmd_cc_o_c = $(CC) $(c_flags) $(CFLAGS_MODULE)	\
diff --git a/scripts/rt-tester/check-all.sh b/scripts/rt-tester/check-all.sh
new file mode 100644
index 0000000..43098af
--- /dev/null
+++ b/scripts/rt-tester/check-all.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+function testit ()
+{
+ printf "%-30s: " $1
+ ./rt-tester.py $1 | grep Pass
+}
+
+testit t2-l1-2rt-sameprio.tst
+testit t2-l1-pi.tst
+testit t2-l1-signal.tst
+#testit t2-l2-2rt-deadlock.tst
+testit t3-l1-pi-1rt.tst
+testit t3-l1-pi-2rt.tst
+testit t3-l1-pi-3rt.tst
+testit t3-l1-pi-signal.tst
+testit t3-l1-pi-steal.tst
+testit t3-l2-pi.tst
+testit t4-l2-pi-deboost.tst
+testit t5-l4-pi-boost-deboost.tst
+testit t5-l4-pi-boost-deboost-setsched.tst
+
diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py
new file mode 100644
index 0000000..4c79660
--- /dev/null
+++ b/scripts/rt-tester/rt-tester.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python
+#
+# rt-mutex tester
+#
+# (C) 2006 Thomas Gleixner <tglx@linutronix.de>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+import os
+import sys
+import getopt
+import shutil
+import string
+
+# Globals
+quiet = 0
+test = 0
+comments = 0
+
+sysfsprefix = "/sys/devices/system/rttest/rttest"
+statusfile = "/status"
+commandfile = "/command"
+
+# Command opcodes
+cmd_opcodes = {
+    "schedother"    : "1",
+    "schedfifo"     : "2",
+    "lock"          : "3",
+    "locknowait"    : "4",
+    "lockint"       : "5",
+    "lockintnowait" : "6",
+    "lockcont"      : "7",
+    "unlock"        : "8",
+    "lockbkl"       : "9",
+    "unlockbkl"     : "10",
+    "signal"        : "11",
+    "resetevent"    : "98",
+    "reset"         : "99",
+    }
+
+test_opcodes = {
+    "prioeq"        : ["P" , "eq" , None],
+    "priolt"        : ["P" , "lt" , None],
+    "priogt"        : ["P" , "gt" , None],
+    "nprioeq"       : ["N" , "eq" , None],
+    "npriolt"       : ["N" , "lt" , None],
+    "npriogt"       : ["N" , "gt" , None],
+    "unlocked"      : ["M" , "eq" , 0],
+    "trylock"       : ["M" , "eq" , 1],
+    "blocked"       : ["M" , "eq" , 2],
+    "blockedwake"   : ["M" , "eq" , 3],
+    "locked"        : ["M" , "eq" , 4],
+    "opcodeeq"      : ["O" , "eq" , None],
+    "opcodelt"      : ["O" , "lt" , None],
+    "opcodegt"      : ["O" , "gt" , None],
+    "eventeq"       : ["E" , "eq" , None],
+    "eventlt"       : ["E" , "lt" , None],
+    "eventgt"       : ["E" , "gt" , None],
+    }
+
+# Print usage information
+def usage():
+    print "rt-tester.py <-c -h -q -t> <testfile>"
+    print " -c    display comments after first command"
+    print " -h    help"
+    print " -q    quiet mode"
+    print " -t    test mode (syntax check)"
+    print " testfile: read test specification from testfile"
+    print " otherwise from stdin"
+    return
+
+# Print progress when not in quiet mode
+def progress(str):
+    if not quiet:
+        print str
+
+# Analyse a status value
+def analyse(val, top, arg):
+
+    intval = int(val)
+
+    if top[0] == "M":
+        intval = intval / (10 ** int(arg))
+        intval = intval % 10
+        argval = top[2]
+    elif top[0] == "O":
+        argval = int(cmd_opcodes.get(arg, arg))
+    else:
+        argval = int(arg)
+
+    # progress("%d %s %d" %(intval, top[1], argval))
+
+    if top[1] == "eq" and intval == argval:
+        return 1
+    if top[1] == "lt" and intval < argval:
+        return 1
+    if top[1] == "gt" and intval > argval:
+        return 1
+    return 0
+
+# Parse the commandline
+try:
+    (options, arguments) = getopt.getopt(sys.argv[1:],'chqt')
+except getopt.GetoptError, ex:
+    usage()
+    sys.exit(1)
+
+# Parse commandline options
+for option, value in options:
+    if option == "-c":
+        comments = 1
+    elif option == "-q":
+        quiet = 1
+    elif option == "-t":
+        test = 1
+    elif option == '-h':
+        usage()
+        sys.exit(0)
+
+# Select the input source
+if arguments:
+    try:
+        fd = open(arguments[0])
+    except Exception,ex:
+        sys.stderr.write("File not found %s\n" %(arguments[0]))
+        sys.exit(1)
+else:
+    fd = sys.stdin
+
+linenr = 0
+
+# Read the test patterns
+while 1:
+
+    linenr = linenr + 1
+    line = fd.readline()
+    if not len(line):
+        break
+
+    line = line.strip()
+    parts = line.split(":")
+
+    if not parts or len(parts) < 1:
+        continue
+
+    if len(parts[0]) == 0:
+        continue
+
+    if parts[0].startswith("#"):
+        if comments > 1:
+            progress(line)
+        continue
+
+    if comments == 1:
+        comments = 2
+
+    progress(line)
+
+    cmd = parts[0].strip().lower()
+    opc = parts[1].strip().lower()
+    tid = parts[2].strip()
+    dat = parts[3].strip()
+
+    try:
+        # Test or wait for a status value
+        if cmd == "t" or cmd == "w":
+            testop = test_opcodes[opc]
+
+            fname = "%s%s%s" %(sysfsprefix, tid, statusfile)
+            if test:
+                print fname
+                continue
+
+            while 1:
+                query = 1
+                fsta = open(fname, 'r')
+                status = fsta.readline().strip()
+                fsta.close()
+                stat = status.split(",")
+                for s in stat:
+                    s = s.strip()
+                    if s.startswith(testop[0]):
+                        # Separate out the status value
+                        val = s[2:].strip()
+                        query = analyse(val, testop, dat)
+                        break
+                if query or cmd == "t":
+                    break
+
+            progress("   " + status)
+
+            if not query:
+                sys.stderr.write("Test failed in line %d\n" %(linenr))
+                sys.exit(1)
+
+        # Issue a command to the tester
+        elif cmd == "c":
+            cmdnr = cmd_opcodes[opc]
+            # Build command string and sys filename
+            cmdstr = "%s:%s" %(cmdnr, dat)
+            fname = "%s%s%s" %(sysfsprefix, tid, commandfile)
+            if test:
+                print fname
+                continue
+            fcmd = open(fname, 'w')
+            fcmd.write(cmdstr)
+            fcmd.close()
+
+    except Exception,ex:
+        sys.stderr.write(str(ex))
+        sys.stderr.write("\nSyntax error in line %d\n" %(linenr))
+        if not test:
+            fd.close()
+            sys.exit(1)
+
+# Normal exit pass
+print "Pass"
+sys.exit(0)
+
+
diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
new file mode 100644
index 0000000..8821f27
--- /dev/null
+++ b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
@@ -0,0 +1,99 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 1 lock
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedfifo:		0: 	80
+C: schedfifo:		1: 	80
+
+# T0 lock L0
+C: locknowait:		0: 	0
+C: locknowait:		1:	0
+W: locked:		0: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	80
+
+# T0 unlock L0
+C: unlock:		0: 	0
+W: locked:		1: 	0
+
+# Verify T0
+W: unlocked:		0: 	0
+T: prioeq:		0: 	80
+
+# Unlock
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
+
+# T1,T0 lock L0
+C: locknowait:		1: 	0
+C: locknowait:		0:	0
+W: locked:		1: 	0
+W: blocked:		0: 	0
+T: prioeq:		1: 	80
+
+# T1 unlock L0
+C: unlock:		1: 	0
+W: locked:		0: 	0
+
+# Verify T1
+W: unlocked:		1: 	0
+T: prioeq:		1: 	80
+
+# Unlock and exit
+C: unlock:		0: 	0
+W: unlocked:		0: 	0
+
diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst
new file mode 100644
index 0000000..cde1f18
--- /dev/null
+++ b/scripts/rt-tester/t2-l1-pi.tst
@@ -0,0 +1,82 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 1 lock with priority inversion
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedfifo:		1: 	80
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	80
+
+# T0 unlock L0
+C: unlock:		0: 	0
+W: locked:		1: 	0
+
+# Verify T1
+W: unlocked:		0: 	0
+T: priolt:		0: 	1
+
+# Unlock and exit
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
+
diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst
new file mode 100644
index 0000000..3ab0bfc
--- /dev/null
+++ b/scripts/rt-tester/t2-l1-signal.tst
@@ -0,0 +1,77 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 1 lock, blocked waiter interrupted by a signal
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedother:		1: 	0
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+
+# Interrupt T1
+C: signal:		1:	0
+W: unlocked:		1: 	0
+T: opcodeeq:		1:	-4
+
+# Unlock and exit
+C: unlock:		0: 	0
+W: unlocked:		0: 	0
diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
new file mode 100644
index 0000000..f4b5d5d
--- /dev/null
+++ b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
@@ -0,0 +1,89 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	0
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 2 threads 2 lock
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedfifo:		0: 	80
+C: schedfifo:		1: 	80
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1:	1
+W: locked:		1: 	1
+
+# T0 lock L1
+C: lockintnowait:	0: 	1
+W: blocked:		0: 	1
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+
+# Make deadlock go away
+C: signal:		1:	0
+W: unlocked:		1:	0
+C: signal:		0:	0
+W: unlocked:		0:	1
+
+# Unlock and exit
+C: unlock:		0: 	0
+W: unlocked:		0: 	0
+C: unlock:		1: 	1
+W: unlocked:		1: 	1
+
diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst
new file mode 100644
index 0000000..63440ca
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-1rt.tst
@@ -0,0 +1,92 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedother:		1: 	0
+C: schedfifo:		2: 	82
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: priolt:		0: 	1
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 has acquired the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: priolt:		0:	1
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: unlocked:		2: 	0
+W: locked:		1: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst
new file mode 100644
index 0000000..e5816fe
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-2rt.tst
@@ -0,0 +1,93 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedfifo:		1: 	81
+C: schedfifo:		2: 	82
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+T: prioeq:		1:	81
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 has acquired the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: priolt:		0:	1
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: unlocked:		2: 	0
+W: locked:		1: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst
new file mode 100644
index 0000000..718b82b
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-3rt.tst
@@ -0,0 +1,92 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedfifo:		0: 	80
+C: schedfifo:		1: 	81
+C: schedfifo:		2: 	82
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 has acquired the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: prioeq:		0:	80
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: locked:		1: 	0
+W: unlocked:		2: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst
new file mode 100644
index 0000000..c6e2135
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-signal.tst
@@ -0,0 +1,98 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+# Reset event counter
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set priorities
+C: schedother:		0: 	0
+C: schedfifo:		1: 	80
+C: schedfifo:		2: 	81
+
+# T0 lock L0
+C: lock:		0:	0
+W: locked:		0: 	0
+
+# T1 lock L0, no wait in the wakeup path
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0:	80
+T: prioeq:		1:	80
+
+# T2 lock L0 interruptible, no wait in the wakeup path
+C: lockintnowait:	2:	0
+W: blocked:		2: 	0
+T: prioeq:		0:	81
+T: prioeq:		1:	80
+
+# Interrupt T2
+C: signal:		2:	2
+W: unlocked:		2:	0
+T: prioeq:		1:	80
+T: prioeq:		0:	80
+
+T: locked:		0:	0
+T: blocked:		1:	0
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T1 has locked L0 and exit
+W: locked:		1:	0
+W: unlocked:		0: 	0
+T: priolt:		0:	1
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
+
+
+
diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst
new file mode 100644
index 0000000..f53749d
--- /dev/null
+++ b/scripts/rt-tester/t3-l1-pi-steal.tst
@@ -0,0 +1,96 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 1 lock PI steal pending ownership
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedfifo:		1: 	80
+C: schedfifo:		2: 	81
+
+# T0 lock L0
+C: lock:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: lock:		1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	80
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T1 is in the wakeup loop
+W: blockedwake:		1: 	0
+T: priolt:		0: 	1
+
+# T2 lock L0
+C: lock:		2: 	0
+# Let T1 leave the wakeup loop
+C: lockcont:		1: 	0
+
+# T2 must have the lock and T1 must be blocked
+W: locked:		2: 	0
+W: blocked:		1: 	0
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+# Wait until T1 is in the wakeup loop and let it run
+W: blockedwake:		1: 	0
+C: lockcont:		1: 	0
+W: locked:		1: 	0
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst
new file mode 100644
index 0000000..cdc3e4f
--- /dev/null
+++ b/scripts/rt-tester/t3-l2-pi.tst
@@ -0,0 +1,92 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 3 threads 2 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedother:		1: 	0
+C: schedfifo:		2: 	82
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L0
+C: locknowait:		1: 	0
+W: blocked:		1: 	0
+T: priolt:		0: 	1
+
+# T2 lock L0
+C: locknowait:		2: 	0
+W: blocked:		2: 	0
+T: prioeq:		0: 	82
+
+# T0 unlock L0
+C: unlock:		0: 	0
+
+# Wait until T2 has acquired the lock
+W: locked:		2: 	0
+W: unlocked:		0:	0
+T: priolt:		0:	1
+
+# T2 unlock L0
+C: unlock:		2: 	0
+
+W: unlocked:		2: 	0
+W: locked:		1: 	0
+
+C: unlock:		1: 	0
+W: unlocked:		1: 	0
diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst
new file mode 100644
index 0000000..baa1413
--- /dev/null
+++ b/scripts/rt-tester/t4-l2-pi-deboost.tst
@@ -0,0 +1,123 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 4 threads 2 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedother:		1: 	0
+C: schedfifo:		2: 	82
+C: schedfifo:		3: 	83
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1: 	1
+W: locked:		1: 	1
+
+# T3 lock L0
+C: lockintnowait:	3: 	0
+W: blocked:		3: 	0
+T: prioeq:		0: 	83
+
+# T0 lock L1
+C: lock:		0: 	1
+W: blocked:		0: 	1
+T: prioeq:		1: 	83
+
+# T1 unlock L1
+C: unlock:		1:	1
+
+# Wait until T0 is in the wakeup code
+W: blockedwake:		0:	1
+
+# Verify that T1 is unboosted
+W: unlocked:		1: 	1
+T: priolt:		1: 	1
+
+# T2 lock L1 (T0 is boosted and pending owner!)
+C: locknowait:		2:	1
+W: blocked:		2: 	1
+T: prioeq:		0: 	83
+
+# Interrupt T3 and wait until T3 returned
+C: signal:		3:	0
+W: unlocked:		3:	0
+
+# Verify prio of T0 (still pending owner,
+# but T2 is enqueued due to the previous boost by T3)
+T: prioeq:		0:	82
+
+# Let T0 continue
+C: lockcont:		0:	1
+W: locked:		0:	1
+
+# Unlock L1 and let T2 get L1
+C: unlock:		0:	1
+W: locked:		2:	1
+
+# Verify that T0 is unboosted
+W: unlocked:		0:	1
+T: priolt:		0:	1
+
+# Unlock everything and exit
+C: unlock:		2:	1
+W: unlocked:		2:	1
+
+C: unlock:		0:	0
+W: unlocked:		0:	0
+
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
new file mode 100644
index 0000000..e6ec0c8
--- /dev/null
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
@@ -0,0 +1,183 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 5 threads 4 lock PI - modify priority of blocked threads
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedfifo:		1: 	81
+C: schedfifo:		2: 	82
+C: schedfifo:		3: 	83
+C: schedfifo:		4: 	84
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1: 	1
+W: locked:		1: 	1
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L2
+C: locknowait:		2: 	2
+W: locked:		2: 	2
+
+# T2 lock L1
+C: lockintnowait:	2: 	1
+W: blocked:		2: 	1
+T: prioeq:		0: 	82
+T: prioeq:		1:	82
+
+# T3 lock L3
+C: locknowait:		3: 	3
+W: locked:		3: 	3
+
+# T3 lock L2
+C: lockintnowait:	3: 	2
+W: blocked:		3: 	2
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+
+# T4 lock L3
+C: lockintnowait:	4:	3
+W: blocked:		4: 	3
+T: prioeq:		0: 	84
+T: prioeq:		1:	84
+T: prioeq:		2:	84
+T: prioeq:		3:	84
+
+# Reduce prio of T4
+C: schedfifo:		4: 	80
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+T: prioeq:		3:	83
+T: prioeq:		4:	80
+
+# Increase prio of T4
+C: schedfifo:		4: 	84
+T: prioeq:		0: 	84
+T: prioeq:		1:	84
+T: prioeq:		2:	84
+T: prioeq:		3:	84
+T: prioeq:		4:	84
+
+# Reduce prio of T3
+C: schedfifo:		3: 	80
+T: prioeq:		0: 	84
+T: prioeq:		1:	84
+T: prioeq:		2:	84
+T: prioeq:		3:	84
+T: prioeq:		4:	84
+
+# Increase prio of T3
+C: schedfifo:		3: 	85
+T: prioeq:		0: 	85
+T: prioeq:		1:	85
+T: prioeq:		2:	85
+T: prioeq:		3:	85
+T: prioeq:		4:	84
+
+# Reduce prio of T3
+C: schedfifo:		3: 	83
+T: prioeq:		0: 	84
+T: prioeq:		1:	84
+T: prioeq:		2:	84
+T: prioeq:		3:	84
+T: prioeq:		4:	84
+
+# Signal T4
+C: signal:		4: 	0
+W: unlocked:		4: 	3
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+T: prioeq:		3:	83
+
+# Signal T3
+C: signal:		3: 	0
+W: unlocked:		3: 	2
+T: prioeq:		0: 	82
+T: prioeq:		1:	82
+T: prioeq:		2:	82
+
+# Signal T2
+C: signal:		2: 	0
+W: unlocked:		2: 	1
+T: prioeq:		0: 	81
+T: prioeq:		1:	81
+
+# Signal T1
+C: signal:		1: 	0
+W: unlocked:		1: 	0
+T: priolt:		0: 	1
+
+# Unlock and exit
+C: unlock:		3:	3
+C: unlock:		2:	2
+C: unlock:		1:	1
+C: unlock:		0:	0
+
+W: unlocked:		3:	3
+W: unlocked:		2:	2
+W: unlocked:		1:	1
+W: unlocked:		0:	0
+
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
new file mode 100644
index 0000000..ca64f8b
--- /dev/null
+++ b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
@@ -0,0 +1,143 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode	opcode argument
+# schedother	nice value
+# schedfifo	priority
+# lock		lock nr (0-7)
+# locknowait	lock nr (0-7)
+# lockint	lock nr (0-7)
+# lockintnowait	lock nr (0-7)
+# lockcont	lock nr (0-7)
+# unlock	lock nr (0-7)
+# lockbkl	lock nr (0-7)
+# unlockbkl	lock nr (0-7)
+# signal	thread to signal (0-7)
+# reset		0
+# resetevent	0
+#
+# Tests / Wait
+#
+# opcode	opcode argument
+#
+# prioeq	priority
+# priolt	priority
+# priogt	priority
+# nprioeq	normal priority
+# npriolt	normal priority
+# npriogt	normal priority
+# locked	lock nr (0-7)
+# blocked	lock nr (0-7)
+# blockedwake	lock nr (0-7)
+# unlocked	lock nr (0-7)
+# lockedbkl	don't care
+# blockedbkl	don't care
+# unlockedbkl	don't care
+# opcodeeq	command opcode or number
+# opcodelt	number
+# opcodegt	number
+# eventeq	number
+# eventgt	number
+# eventlt	number
+
+#
+# 5 threads 4 lock PI
+#
+C: resetevent:		0: 	0
+W: opcodeeq:		0: 	0
+
+# Set schedulers
+C: schedother:		0: 	0
+C: schedfifo:		1: 	81
+C: schedfifo:		2: 	82
+C: schedfifo:		3: 	83
+C: schedfifo:		4: 	84
+
+# T0 lock L0
+C: locknowait:		0: 	0
+W: locked:		0: 	0
+
+# T1 lock L1
+C: locknowait:		1: 	1
+W: locked:		1: 	1
+
+# T1 lock L0
+C: lockintnowait:	1: 	0
+W: blocked:		1: 	0
+T: prioeq:		0: 	81
+
+# T2 lock L2
+C: locknowait:		2: 	2
+W: locked:		2: 	2
+
+# T2 lock L1
+C: lockintnowait:	2: 	1
+W: blocked:		2: 	1
+T: prioeq:		0: 	82
+T: prioeq:		1:	82
+
+# T3 lock L3
+C: locknowait:		3: 	3
+W: locked:		3: 	3
+
+# T3 lock L2
+C: lockintnowait:	3: 	2
+W: blocked:		3: 	2
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+
+# T4 lock L3
+C: lockintnowait:	4:	3
+W: blocked:		4: 	3
+T: prioeq:		0: 	84
+T: prioeq:		1:	84
+T: prioeq:		2:	84
+T: prioeq:		3:	84
+
+# Signal T4
+C: signal:		4: 	0
+W: unlocked:		4: 	3
+T: prioeq:		0: 	83
+T: prioeq:		1:	83
+T: prioeq:		2:	83
+T: prioeq:		3:	83
+
+# Signal T3
+C: signal:		3: 	0
+W: unlocked:		3: 	2
+T: prioeq:		0: 	82
+T: prioeq:		1:	82
+T: prioeq:		2:	82
+
+# Signal T2
+C: signal:		2: 	0
+W: unlocked:		2: 	1
+T: prioeq:		0: 	81
+T: prioeq:		1:	81
+
+# Signal T1
+C: signal:		1: 	0
+W: unlocked:		1: 	0
+T: priolt:		0: 	1
+
+# Unlock and exit
+C: unlock:		3:	3
+C: unlock:		2:	2
+C: unlock:		1:	1
+C: unlock:		0:	0
+
+W: unlocked:		3:	3
+W: unlocked:		2:	2
+W: unlocked:		1:	1
+W: unlocked:		0:	0
+
diff --git a/security/keys/key.c b/security/keys/key.c
index 43295ca..80de8c3 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/poison.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
@@ -988,7 +989,7 @@
 		if (key->type == ktype) {
 			if (ktype->destroy)
 				ktype->destroy(key);
-			memset(&key->payload, 0xbd, sizeof(key->payload));
+			memset(&key->payload, KEY_DESTROY, sizeof(key->payload));
 		}
 	}
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index ac7f2b2..28832e6 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1532,8 +1532,9 @@
 	/* Default to the current task SID. */
 	bsec->sid = tsec->sid;
 
-	/* Reset create and sockcreate SID on execve. */
+	/* Reset fs, key, and sock SIDs on execve. */
 	tsec->create_sid = 0;
+	tsec->keycreate_sid = 0;
 	tsec->sockcreate_sid = 0;
 
 	if (tsec->exec_sid) {
@@ -2586,9 +2587,10 @@
 	tsec2->osid = tsec1->osid;
 	tsec2->sid = tsec1->sid;
 
-	/* Retain the exec, create, and sock SIDs across fork */
+	/* Retain the exec, fs, key, and sock SIDs across fork */
 	tsec2->exec_sid = tsec1->exec_sid;
 	tsec2->create_sid = tsec1->create_sid;
+	tsec2->keycreate_sid = tsec1->keycreate_sid;
 	tsec2->sockcreate_sid = tsec1->sockcreate_sid;
 
 	/* Retain ptracer SID across fork, if any.
diff --git a/sound/oss/via82cxxx_audio.c b/sound/oss/via82cxxx_audio.c
index 1a921ee7..2343ded 100644
--- a/sound/oss/via82cxxx_audio.c
+++ b/sound/oss/via82cxxx_audio.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
@@ -3522,7 +3523,7 @@
 
 err_out_kfree:
 #ifndef VIA_NDEBUG
-	memset (card, 0xAB, sizeof (*card)); /* poison memory */
+	memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
 	kfree (card);
 
@@ -3559,7 +3560,7 @@
 	via_ac97_cleanup (card);
 
 #ifndef VIA_NDEBUG
-	memset (card, 0xAB, sizeof (*card)); /* poison memory */
+	memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
 	kfree (card);