Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6:
  crypto: aesni-intel - fixed problem with packets that are not multiple of 64bytes
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
index 4f29e5f1..f5bb0a3 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -59,3 +59,15 @@
 Contact:	iss_storagedev@hp.com
 Description:	Displays the usage count (number of opens) of logical drive Y
 		of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/resettable
+Date:		February 2011
+Kernel Version:	2.6.38
+Contact:	iss_storagedev@hp.com
+Description:	Value of 1 indicates the controller can honor the reset_devices
+		kernel parameter.  Value of 0 indicates reset_devices cannot be
+		honored.  This is to allow, for example, kexec tools to be able
+		to warn the user if they designate an unresettable device as
+		a dump device, as kdump requires resetting the device in order
+		to work reliably.
+
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 2deb069..8436b01 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -55,7 +55,6 @@
 build_images = mkdir -p $(objtree)/Documentation/DocBook/media/ && \
 	       cp $(srctree)/Documentation/DocBook/dvb/*.png \
 	          $(srctree)/Documentation/DocBook/v4l/*.gif \
-	          $(srctree)/Documentation/DocBook/v4l/*.png \
 		  $(objtree)/Documentation/DocBook/media/
 
 xmldoclinks:
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl
index 54eb26b..5047936 100644
--- a/Documentation/DocBook/rapidio.tmpl
+++ b/Documentation/DocBook/rapidio.tmpl
@@ -133,7 +133,6 @@
 !Idrivers/rapidio/rio-sysfs.c
      </sect1>
      <sect1 id="PPC32_support"><title>PPC32 support</title>
-!Earch/powerpc/sysdev/fsl_rio.c
 !Iarch/powerpc/sysdev/fsl_rio.c
      </sect1>
   </chapter>
diff --git a/Documentation/development-process/1.Intro b/Documentation/development-process/1.Intro
index 8cc2cba..9b61448 100644
--- a/Documentation/development-process/1.Intro
+++ b/Documentation/development-process/1.Intro
@@ -56,13 +56,13 @@
 
 1.2: WHAT THIS DOCUMENT IS ABOUT
 
-The Linux kernel, at over 6 million lines of code and well over 1000 active
-contributors, is one of the largest and most active free software projects
-in existence.  Since its humble beginning in 1991, this kernel has evolved
-into a best-of-breed operating system component which runs on pocket-sized
-digital music players, desktop PCs, the largest supercomputers in
-existence, and all types of systems in between.  It is a robust, efficient,
-and scalable solution for almost any situation.
+The Linux kernel, at over 8 million lines of code and well over 1000
+contributors to each release, is one of the largest and most active free
+software projects in existence.  Since its humble beginning in 1991, this
+kernel has evolved into a best-of-breed operating system component which
+runs on pocket-sized digital music players, desktop PCs, the largest
+supercomputers in existence, and all types of systems in between.  It is a
+robust, efficient, and scalable solution for almost any situation.
 
 With the growth of Linux has come an increase in the number of developers
 (and companies) wishing to participate in its development.  Hardware
@@ -115,7 +115,7 @@
 improved by comments from Johannes Berg, James Berry, Alex Chiang, Roland
 Dreier, Randy Dunlap, Jake Edge, Jiri Kosina, Matt Mackall, Arthur Marsh,
 Amanda McPherson, Andrew Morton, Andrew Price, Tsugikazu Shibata, and
-Jochen Voß. 
+Jochen Voß.
 
 This work was supported by the Linux Foundation; thanks especially to
 Amanda McPherson, who saw the value of this effort and made it all happen.
@@ -221,7 +221,7 @@
 - Everything that was said above about code review applies doubly to
   closed-source code.  Since this code is not available at all, it cannot
   have been reviewed by the community and will, beyond doubt, have serious
-  problems. 
+  problems.
 
 Makers of embedded systems, in particular, may be tempted to disregard much
 of what has been said in this section in the belief that they are shipping
diff --git a/Documentation/development-process/2.Process b/Documentation/development-process/2.Process
index 911a451..4823577 100644
--- a/Documentation/development-process/2.Process
+++ b/Documentation/development-process/2.Process
@@ -14,16 +14,15 @@
 major kernel release happening every two or three months.  The recent
 release history looks like this:
 
-	2.6.26	July 13, 2008
-	2.6.25	April 16, 2008
-	2.6.24	January 24, 2008
-	2.6.23	October 9, 2007
-	2.6.22	July 8, 2007
-	2.6.21	April 25, 2007
-	2.6.20	February 4, 2007
+	2.6.38	March 14, 2011
+	2.6.37	January 4, 2011
+	2.6.36	October 20, 2010
+	2.6.35	August 1, 2010
+	2.6.34	May 15, 2010
+	2.6.33	February 24, 2010
 
 Every 2.6.x release is a major kernel release with new features, internal
-API changes, and more.  A typical 2.6 release can contain over 10,000
+API changes, and more.  A typical 2.6 release can contain nearly 10,000
 changesets with changes to several hundred thousand lines of code.  2.6 is
 thus the leading edge of Linux kernel development; the kernel uses a
 rolling development model which is continually integrating major changes.
@@ -42,13 +41,13 @@
 and staged ahead of time.  How that process works will be described in
 detail later on).
 
-The merge window lasts for two weeks.  At the end of this time, Linus
-Torvalds will declare that the window is closed and release the first of
-the "rc" kernels.  For the kernel which is destined to be 2.6.26, for
-example, the release which happens at the end of the merge window will be
-called 2.6.26-rc1.  The -rc1 release is the signal that the time to merge
-new features has passed, and that the time to stabilize the next kernel has
-begun.
+The merge window lasts for approximately two weeks.  At the end of this
+time, Linus Torvalds will declare that the window is closed and release the
+first of the "rc" kernels.  For the kernel which is destined to be 2.6.40,
+for example, the release which happens at the end of the merge window will
+be called 2.6.40-rc1.  The -rc1 release is the signal that the time to
+merge new features has passed, and that the time to stabilize the next
+kernel has begun.
 
 Over the next six to ten weeks, only patches which fix problems should be
 submitted to the mainline.  On occasion a more significant change will be
@@ -66,20 +65,19 @@
 considered to be sufficiently stable and the final 2.6.x release is made.
 At that point the whole process starts over again.
 
-As an example, here is how the 2.6.25 development cycle went (all dates in
-2008): 
+As an example, here is how the 2.6.38 development cycle went (all dates in
+2011):
 
-	January 24	2.6.24 stable release
-	February 10	2.6.25-rc1, merge window closes
-	February 15	2.6.25-rc2
-	February 24	2.6.25-rc3
-	March 4	 	2.6.25-rc4
-	March 9		2.6.25-rc5
-	March 16	2.6.25-rc6
-	March 25	2.6.25-rc7
-	April 1		2.6.25-rc8
-	April 11	2.6.25-rc9
-	April 16	2.6.25 stable release
+	January 4	2.6.37 stable release
+	January 18	2.6.38-rc1, merge window closes
+	January 21	2.6.38-rc2
+	February 1	2.6.38-rc3
+	February 7	2.6.38-rc4
+	February 15	2.6.38-rc5
+	February 21	2.6.38-rc6
+	March 1		2.6.38-rc7
+	March 7		2.6.38-rc8
+	March 14	2.6.38 stable release
 
 How do the developers decide when to close the development cycle and create
 the stable release?  The most significant metric used is the list of
@@ -87,7 +85,7 @@
 break systems which worked in the past are considered to be especially
 serious.  For this reason, patches which cause regressions are looked upon
 unfavorably and are quite likely to be reverted during the stabilization
-period. 
+period.
 
 The developers' goal is to fix all known regressions before the stable
 release is made.  In the real world, this kind of perfection is hard to
@@ -99,26 +97,34 @@
 of them are serious.
 
 Once a stable release is made, its ongoing maintenance is passed off to the
-"stable team," currently comprised of Greg Kroah-Hartman and Chris Wright.
-The stable team will release occasional updates to the stable release using
-the 2.6.x.y numbering scheme.  To be considered for an update release, a
-patch must (1) fix a significant bug, and (2) already be merged into the
-mainline for the next development kernel.  Continuing our 2.6.25 example,
-the history (as of this writing) is:
+"stable team," currently consisting of Greg Kroah-Hartman.  The stable team
+will release occasional updates to the stable release using the 2.6.x.y
+numbering scheme.  To be considered for an update release, a patch must (1)
+fix a significant bug, and (2) already be merged into the mainline for the
+next development kernel.  Kernels will typically receive stable updates for
+a little more than one development cycle past their initial release.  So,
+for example, the 2.6.36 kernel's history looked like:
 
-	May 1		2.6.25.1
-	May 6		2.6.25.2 
-	May 9		2.6.25.3 
-	May 15		2.6.25.4
-	June 7		2.6.25.5
-	June 9		2.6.25.6
-	June 16		2.6.25.7
-	June 21		2.6.25.8
-	June 24		2.6.25.9
+	October 10	2.6.36 stable release
+	November 22	2.6.36.1
+	December 9	2.6.36.2
+	January 7	2.6.36.3
+	February 17	2.6.36.4
 
-Stable updates for a given kernel are made for approximately six months;
-after that, the maintenance of stable releases is solely the responsibility
-of the distributors which have shipped that particular kernel.
+2.6.36.4 was the final stable update for the 2.6.36 release.
+
+Some kernels are designated "long term" kernels; they will receive support
+for a longer period.  As of this writing, the current long term kernels
+and their maintainers are:
+
+	2.6.27	Willy Tarreau		(Deep-frozen stable kernel)
+	2.6.32	Greg Kroah-Hartman
+	2.6.35	Andi Kleen		(Embedded flag kernel)
+
+The selection of a kernel for long-term support is purely a matter of a
+maintainer having the need and the time to maintain that release.  There
+are no known plans for long-term support for any specific upcoming
+release.
 
 
 2.2: THE LIFECYCLE OF A PATCH
@@ -130,7 +136,7 @@
 This process can happen quickly for minor fixes, or, in the case of large
 and controversial changes, go on for years.  Much developer frustration
 comes from a lack of understanding of this process or from attempts to
-circumvent it.  
+circumvent it.
 
 In the hopes of reducing that frustration, this document will describe how
 a patch gets into the kernel.  What follows below is an introduction which
@@ -193,8 +199,8 @@
 2.3: HOW PATCHES GET INTO THE KERNEL
 
 There is exactly one person who can merge patches into the mainline kernel
-repository: Linus Torvalds.  But, of the over 12,000 patches which went
-into the 2.6.25 kernel, only 250 (around 2%) were directly chosen by Linus
+repository: Linus Torvalds.  But, of the over 9,500 patches which went
+into the 2.6.38 kernel, only 112 (around 1.3%) were directly chosen by Linus
 himself.  The kernel project has long since grown to a size where no single
 developer could possibly inspect and select every patch unassisted.  The
 way the kernel developers have addressed this growth is through the use of
@@ -229,7 +235,7 @@
 etc.  This chain of repositories can be arbitrarily long, though it rarely
 exceeds two or three links.  Since each maintainer in the chain trusts
 those managing lower-level trees, this process is known as the "chain of
-trust." 
+trust."
 
 Clearly, in a system like this, getting patches into the kernel depends on
 finding the right maintainer.  Sending patches directly to Linus is not
@@ -254,7 +260,7 @@
 collected for testing and review.  The older of these trees, maintained by
 Andrew Morton, is called "-mm" (for memory management, which is how it got
 started).  The -mm tree integrates patches from a long list of subsystem
-trees; it also has some patches aimed at helping with debugging.  
+trees; it also has some patches aimed at helping with debugging.
 
 Beyond that, -mm contains a significant collection of patches which have
 been selected by Andrew directly.  These patches may have been posted on a
@@ -264,8 +270,8 @@
 patch into the mainline, it is likely to end up in -mm.  Miscellaneous
 patches which accumulate in -mm will eventually either be forwarded on to
 an appropriate subsystem tree or be sent directly to Linus.  In a typical
-development cycle, approximately 10% of the patches going into the mainline
-get there via -mm.
+development cycle, approximately 5-10% of the patches going into the
+mainline get there via -mm.
 
 The current -mm patch is available in the "mmotm" (-mm of the moment)
 directory at:
@@ -275,7 +281,7 @@
 Use of the MMOTM tree is likely to be a frustrating experience, though;
 there is a definite chance that it will not even compile.
 
-The other -next tree, started more recently, is linux-next, maintained by
+The primary tree for next-cycle patch merging is linux-next, maintained by
 Stephen Rothwell.  The linux-next tree is, by design, a snapshot of what
 the mainline is expected to look like after the next merge window closes.
 Linux-next trees are announced on the linux-kernel and linux-next mailing
@@ -287,25 +293,14 @@
 
 	http://linux.f-seidel.de/linux-next/pmwiki/
 
-How the linux-next tree will fit into the development process is still
-changing.  As of this writing, the first full development cycle involving
-linux-next (2.6.26) is coming to an end; thus far, it has proved to be a
-valuable resource for finding and fixing integration problems before the
-beginning of the merge window.  See http://lwn.net/Articles/287155/ for
-more information on how linux-next has worked to set up the 2.6.27 merge
-window.
+Linux-next has become an integral part of the kernel development process;
+all patches merged during a given merge window should really have found
+their way into linux-next some time before the merge window opens.
 
-Some developers have begun to suggest that linux-next should be used as the
-target for future development as well.  The linux-next tree does tend to be
-far ahead of the mainline and is more representative of the tree into which
-any new work will be merged.  The downside to this idea is that the
-volatility of linux-next tends to make it a difficult development target.
-See http://lwn.net/Articles/289013/ for more information on this topic, and
-stay tuned; much is still in flux where linux-next is involved.
 
 2.4.1: STAGING TREES
 
-The kernel source tree now contains the drivers/staging/ directory, where
+The kernel source tree contains the drivers/staging/ directory, where
 many sub-directories for drivers or filesystems that are on their way to
 being added to the kernel tree live.  They remain in drivers/staging while
 they still need more work; once complete, they can be moved into the
@@ -313,15 +308,23 @@
 up to Linux kernel coding or quality standards, but people may want to use
 them and track development.
 
-Greg Kroah-Hartman currently (as of 2.6.36) maintains the staging tree.
-Drivers that still need work are sent to him, with each driver having
-its own subdirectory in drivers/staging/.  Along with the driver source
-files, a TODO file should be present in the directory as well.  The TODO
-file lists the pending work that the driver needs for acceptance into
-the kernel proper, as well as a list of people that should be Cc'd for any
-patches to the driver.  Staging drivers that don't currently build should
-have their config entries depend upon CONFIG_BROKEN.  Once they can
-be successfully built without outside patches, CONFIG_BROKEN can be removed.
+Greg Kroah-Hartman currently maintains the staging tree.  Drivers that
+still need work are sent to him, with each driver having its own
+subdirectory in drivers/staging/.  Along with the driver source files, a
+TODO file should be present in the directory as well.  The TODO file lists
+the pending work that the driver needs for acceptance into the kernel
+proper, as well as a list of people that should be Cc'd for any patches to
+the driver.  Current rules require that drivers contributed to staging
+must, at a minimum, compile properly.
+
+Staging can be a relatively easy way to get new drivers into the mainline
+where, with luck, they will come to the attention of other developers and
+improve quickly.  Entry into staging is not the end of the story, though;
+code in staging which is not seeing regular progress will eventually be
+removed.  Distributors also tend to be relatively reluctant to enable
+staging drivers.  So staging is, at best, a stop on the way toward becoming
+a proper mainline driver.
+
 
 2.5: TOOLS
 
@@ -347,11 +350,7 @@
 
 	http://git-scm.com/
 
-That page has pointers to documentation and tutorials.  One should be
-aware, in particular, of the Kernel Hacker's Guide to git, which has
-information specific to kernel development:
-
-	http://linux.yyz.us/git-howto.html
+That page has pointers to documentation and tutorials.
 
 Among the kernel developers who do not use git, the most popular choice is
 almost certainly Mercurial:
@@ -408,7 +407,7 @@
   important to filter on both the topic of interest (though note that
   long-running conversations can drift away from the original subject
   without changing the email subject line) and the people who are
-  participating.  
+  participating.
 
 - Do not feed the trolls.  If somebody is trying to stir up an angry
   response, ignore them.
diff --git a/Documentation/development-process/3.Early-stage b/Documentation/development-process/3.Early-stage
index 307a159..f87ba7b 100644
--- a/Documentation/development-process/3.Early-stage
+++ b/Documentation/development-process/3.Early-stage
@@ -110,8 +110,8 @@
 
  - The AppArmor security module made use of internal virtual filesystem
    data structures in ways which were considered to be unsafe and
-   unreliable.  This code has since been significantly reworked, but
-   remains outside of the mainline.
+   unreliable.  This concern (among others) kept AppArmor out of the
+   mainline for years.
 
 In each of these cases, a great deal of pain and extra work could have been
 avoided with some early discussion with the kernel developers.
@@ -138,6 +138,19 @@
 patches.  Those are the people who will be best placed to help with a new
 development project.
 
+The task of finding the right maintainer is sometimes challenging enough
+that the kernel developers have added a script to ease the process:
+
+	.../scripts/get_maintainer.pl
+
+This script will return the current maintainer(s) for a given file or
+directory when given the "-f" option.  If passed a patch on the
+command line, it will list the maintainers who should probably receive
+copies of the patch.  There are a number of options regulating how hard
+get_maintainer.pl will search for maintainers; please be careful about
+using the more aggressive options as you may end up including developers
+who have no real interest in the code you are modifying.
+
 If all else fails, talking to Andrew Morton can be an effective way to
 track down a maintainer for a specific piece of code.
 
@@ -155,11 +168,15 @@
 matter is (1) kernel developers tend to be busy, (2) there is no shortage
 of people with grand plans and little code (or even prospect of code) to
 back them up, and (3) nobody is obligated to review or comment on ideas
-posted by others.  If a request-for-comments posting yields little in the
-way of comments, do not assume that it means there is no interest in the
-project.  Unfortunately, you also cannot assume that there are no problems
-with your idea.  The best thing to do in this situation is to proceed,
-keeping the community informed as you go.
+posted by others.  Beyond that, high-level designs often hide problems
+which are only reviewed when somebody actually tries to implement those
+designs; for that reason, kernel developers would rather see the code.
+
+If a request-for-comments posting yields little in the way of comments, do
+not assume that it means there is no interest in the project.
+Unfortunately, you also cannot assume that there are no problems with your
+idea.  The best thing to do in this situation is to proceed, keeping the
+community informed as you go.
 
 
 3.5: GETTING OFFICIAL BUY-IN
diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding
index 2278693..f3f1a46 100644
--- a/Documentation/development-process/4.Coding
+++ b/Documentation/development-process/4.Coding
@@ -131,6 +131,11 @@
 often does not apply to contemporary hardware.  Space *is* time, in that a
 larger program will run slower than one which is more compact.
 
+More recent compilers take an increasingly active role in deciding whether
+a given function should actually be inlined or not.  So the liberal
+placement of "inline" keywords may not just be excessive; it could also be
+irrelevant.
+
 
 * Locking
 
@@ -285,6 +290,13 @@
 distributor does not package it); it can then be run on the code by adding
 "C=1" to your make command.
 
+The "Coccinelle" tool (http://coccinelle.lip6.fr/) is able to find a wide
+variety of potential coding problems; it can also propose fixes for those
+problems.  Quite a few "semantic patches" for the kernel have been packaged
+under the scripts/coccinelle directory; running "make coccicheck" will run
+through those semantic patches and report on any problems found.  See
+Documentation/coccinelle.txt for more information.
+
 Other kinds of portability errors are best found by compiling your code for
 other architectures.  If you do not happen to have an S/390 system or a
 Blackfin development board handy, you can still perform the compilation
@@ -308,7 +320,9 @@
 changelog.  Log entries should describe the problem being solved, the form
 of the solution, the people who worked on the patch, any relevant
 effects on performance, and anything else that might be needed to
-understand the patch.
+understand the patch.  Be sure that the changelog says *why* the patch is
+worth applying; a surprising number of developers fail to provide that
+information.
 
 Any code which adds a new user-space interface - including new sysfs or
 /proc files - should include documentation of that interface which enables
@@ -321,7 +335,7 @@
 appropriate entries to this file.
 
 Any new configuration options must be accompanied by help text which
-clearly explains the options and when the user might want to select them. 
+clearly explains the options and when the user might want to select them.
 
 Internal API information for many subsystems is documented by way of
 specially-formatted comments; these comments can be extracted and formatted
@@ -372,7 +386,8 @@
 lead to literally hundreds or thousands of changes - many of which are
 likely to conflict with work being done by other developers.  Needless to
 say, this can be a large job, so it is best to be sure that the
-justification is solid.
+justification is solid.  Note that the Coccinelle tool can help with
+wide-ranging API changes.
 
 When making an incompatible API change, one should, whenever possible,
 ensure that code which has not been updated is caught by the compiler.
diff --git a/Documentation/development-process/5.Posting b/Documentation/development-process/5.Posting
index f622c1e..903a254 100644
--- a/Documentation/development-process/5.Posting
+++ b/Documentation/development-process/5.Posting
@@ -60,12 +60,15 @@
 
 Patches must be prepared against a specific version of the kernel.  As a
 general rule, a patch should be based on the current mainline as found in
-Linus's git tree.  It may become necessary to make versions against -mm,
-linux-next, or a subsystem tree, though, to facilitate wider testing and
-review.  Depending on the area of your patch and what is going on
-elsewhere, basing a patch against these other trees can require a
-significant amount of work resolving conflicts and dealing with API
-changes.
+Linus's git tree.  When basing on mainline, start with a well-known release
+point - a stable or -rc release - rather than branching off the mainline at
+an arbitrary spot.
+
+It may become necessary to make versions against -mm, linux-next, or a
+subsystem tree, though, to facilitate wider testing and review.  Depending
+on the area of your patch and what is going on elsewhere, basing a patch
+against these other trees can require a significant amount of work
+resolving conflicts and dealing with API changes.
 
 Only the most simple changes should be formatted as a single patch;
 everything else should be made as a logical series of changes.  Splitting
@@ -100,11 +103,11 @@
    result is a broken kernel, you will make life harder for developers and
    users who are engaging in the noble work of tracking down problems.
 
- - Do not overdo it, though.  One developer recently posted a set of edits
+ - Do not overdo it, though.  One developer once posted a set of edits
    to a single file as 500 separate patches - an act which did not make him
    the most popular person on the kernel mailing list.  A single patch can
    be reasonably large as long as it still contains a single *logical*
-   change. 
+   change.
 
  - It can be tempting to add a whole new infrastructure with a series of
    patches, but to leave that infrastructure unused until the final patch
@@ -162,7 +165,8 @@
 for the change as well as possible given the one-line constraint.  The
 detailed description can then amplify on those topics and provide any
 needed additional information.  If the patch fixes a bug, cite the commit
-which introduced the bug if possible.  If a problem is associated with
+which introduced the bug if possible (and please provide both the commit ID
+and the title when citing commits).  If a problem is associated with
 specific log or compiler output, include that output to help others
 searching for a solution to the same problem.  If the change is meant to
 support other changes coming in later patch, say so.  If internal APIs are
@@ -230,7 +234,7 @@
    which have had gratuitous white-space changes or line wrapping performed
    by the mail client will not apply at the other end, and often will not
    be examined in any detail.  If there is any doubt at all, mail the patch
-   to yourself and convince yourself that it shows up intact.  
+   to yourself and convince yourself that it shows up intact.
 
    Documentation/email-clients.txt has some helpful hints on making
    specific mail clients work for sending patches.
@@ -287,7 +291,7 @@
 
 where "nn" is the ordinal number of the patch, "mm" is the total number of
 patches in the series, and "subsys" is the name of the affected subsystem.
-Clearly, nn/mm can be omitted for a single, standalone patch.  
+Clearly, nn/mm can be omitted for a single, standalone patch.
 
 If you have a significant series of patches, it is customary to send an
 introductory description as part zero.  This convention is not universally
@@ -299,5 +303,5 @@
 sent as a reply to the first part so that they all thread together at the
 receiving end.  Tools like git and quilt have commands to mail out a set of
 patches with the proper threading.  If you have a long series, though, and
-are using git, please provide the --no-chain-reply-to option to avoid
+are using git, please stay away from the --chain-reply-to option to avoid
 creating exceptionally deep nesting.
diff --git a/Documentation/development-process/6.Followthrough b/Documentation/development-process/6.Followthrough
index a8fba3d8..41d324a 100644
--- a/Documentation/development-process/6.Followthrough
+++ b/Documentation/development-process/6.Followthrough
@@ -66,6 +66,11 @@
 that you don't realize that something is fundamentally wrong or, perhaps,
 you're not even solving the right problem.
 
+Andrew Morton has suggested that every review comment which does not result
+in a code change should result in an additional code comment instead; that
+can help future reviewers avoid the questions which came up the first time
+around.
+
 One fatal mistake is to ignore review comments in the hope that they will
 go away.  They will not go away.  If you repost code without having
 responded to the comments you got the time before, you're likely to find
@@ -100,7 +105,7 @@
 subsystem to the next; each maintainer has his or her own way of doing
 things.  In particular, there may be more than one tree - one, perhaps,
 dedicated to patches planned for the next merge window, and another for
-longer-term work.  
+longer-term work.
 
 For patches applying to areas for which there is no obvious subsystem tree
 (memory management patches, for example), the default tree often ends up
@@ -109,11 +114,10 @@
 
 Inclusion into a subsystem tree can bring a higher level of visibility to a
 patch.  Now other developers working with that tree will get the patch by
-default.  Subsystem trees typically feed into -mm and linux-next as well,
-making their contents visible to the development community as a whole.  At
-this point, there's a good chance that you will get more comments from a
-new set of reviewers; these comments need to be answered as in the previous
-round.
+default.  Subsystem trees typically feed linux-next as well, making their
+contents visible to the development community as a whole.  At this point,
+there's a good chance that you will get more comments from a new set of
+reviewers; these comments need to be answered as in the previous round.
 
 What may also happen at this point, depending on the nature of your patch,
 is that conflicts with work being done by others turn up.  In the worst
diff --git a/Documentation/development-process/7.AdvancedTopics b/Documentation/development-process/7.AdvancedTopics
index 8371794..26dc3fa 100644
--- a/Documentation/development-process/7.AdvancedTopics
+++ b/Documentation/development-process/7.AdvancedTopics
@@ -119,7 +119,7 @@
 	to trust things *without* then having to go and check every
 	individual change by hand.
 
-(http://lwn.net/Articles/224135/).  
+(http://lwn.net/Articles/224135/).
 
 To avoid this kind of situation, ensure that all patches within a given
 branch stick closely to the associated topic; a "driver fixes" branch
@@ -138,7 +138,7 @@
 your tree is, what branch to pull, and what changes will result from the
 pull.  The git request-pull command can be helpful in this regard; it will
 format the request as other developers expect, and will also check to be
-sure that you have remembered to push those changes to the public server. 
+sure that you have remembered to push those changes to the public server.
 
 
 7.2: REVIEWING PATCHES
diff --git a/Documentation/dynamic-debug-howto.txt b/Documentation/dynamic-debug-howto.txt
index e6c4b75..f959909 100644
--- a/Documentation/dynamic-debug-howto.txt
+++ b/Documentation/dynamic-debug-howto.txt
@@ -6,7 +6,7 @@
 
 Dynamic debug is designed to allow you to dynamically enable/disable kernel
 code to obtain additional kernel information. Currently, if
-CONFIG_DYNAMIC_DEBUG is set, then all pr_debug()/dev_debug() calls can be
+CONFIG_DYNAMIC_DEBUG is set, then all pr_debug()/dev_dbg() calls can be
 dynamically enabled per-callsite.
 
 Dynamic debug has even more useful features:
@@ -26,7 +26,7 @@
 Controlling dynamic debug Behaviour
 ===================================
 
-The behaviour of pr_debug()/dev_debug()s are controlled via writing to a
+The behaviour of pr_debug()/dev_dbg()s are controlled via writing to a
 control file in the 'debugfs' filesystem. Thus, you must first mount the debugfs
 filesystem, in order to make use of this feature. Subsequently, we refer to the
 control file as: <debugfs>/dynamic_debug/control. For example, if you want to
diff --git a/Documentation/hwmon/f71882fg b/Documentation/hwmon/f71882fg
index 4d0bc70..df02245 100644
--- a/Documentation/hwmon/f71882fg
+++ b/Documentation/hwmon/f71882fg
@@ -2,6 +2,10 @@
 ======================
 
 Supported chips:
+  * Fintek F71808E
+    Prefix: 'f71808e'
+    Addresses scanned: none, address read from Super I/O config space
+    Datasheet: Not public
   * Fintek F71858FG
     Prefix: 'f71858fg'
     Addresses scanned: none, address read from Super I/O config space
@@ -26,10 +30,25 @@
     Prefix: 'f71889ed'
     Addresses scanned: none, address read from Super I/O config space
     Datasheet: Should become available on the Fintek website soon
+  * Fintek F71889A
+    Prefix: 'f71889a'
+    Addresses scanned: none, address read from Super I/O config space
+    Datasheet: Should become available on the Fintek website soon
   * Fintek F8000
     Prefix: 'f8000'
     Addresses scanned: none, address read from Super I/O config space
     Datasheet: Not public
+  * Fintek F81801U
+    Prefix: 'f71889fg'
+    Addresses scanned: none, address read from Super I/O config space
+    Datasheet: Not public
+    Note: This is the 64-pin variant of the F71889FG, they have the
+	  same device ID and are fully compatible as far as hardware
+	  monitoring is concerned.
+  * Fintek F81865F
+    Prefix: 'f81865f'
+    Addresses scanned: none, address read from Super I/O config space
+    Datasheet: Available from the Fintek website
 
 Author: Hans de Goede <hdegoede@redhat.com>
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 4fb9017..8aa1cac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -548,10 +548,8 @@
 F:	sound/aoa/
 
 APM DRIVER
-M:	Stephen Rothwell <sfr@canb.auug.org.au>
 L:	linux-laptop@vger.kernel.org
-W:	http://www.canb.auug.org.au/~sfr/
-S:	Supported
+S:	Orphan
 F:	arch/x86/kernel/apm_32.c
 F:	include/linux/apm_bios.h
 
@@ -6633,6 +6631,7 @@
 
 USER-MODE LINUX (UML)
 M:	Jeff Dike <jdike@addtoit.com>
+M:	Richard Weinberger <richard@nod.at>
 L:	user-mode-linux-devel@lists.sourceforge.net
 L:	user-mode-linux-user@lists.sourceforge.net
 W:	http://user-mode-linux.sourceforge.net
diff --git a/arch/arm/mach-omap2/board-omap4panda.c b/arch/arm/mach-omap2/board-omap4panda.c
index c936c6d..f3a7b10 100644
--- a/arch/arm/mach-omap2/board-omap4panda.c
+++ b/arch/arm/mach-omap2/board-omap4panda.c
@@ -285,19 +285,6 @@
 	return 0;
 }
 
-static struct regulator_init_data omap4_panda_vaux1 = {
-	.constraints = {
-		.min_uV			= 1000000,
-		.max_uV			= 3000000,
-		.apply_uV		= true,
-		.valid_modes_mask	= REGULATOR_MODE_NORMAL
-					| REGULATOR_MODE_STANDBY,
-		.valid_ops_mask	 = REGULATOR_CHANGE_VOLTAGE
-					| REGULATOR_CHANGE_MODE
-					| REGULATOR_CHANGE_STATUS,
-	},
-};
-
 static struct regulator_init_data omap4_panda_vaux2 = {
 	.constraints = {
 		.min_uV			= 1200000,
@@ -353,19 +340,6 @@
 	},
 };
 
-static struct regulator_init_data omap4_panda_vusim = {
-	.constraints = {
-		.min_uV			= 1200000,
-		.max_uV			= 2900000,
-		.apply_uV		= true,
-		.valid_modes_mask	= REGULATOR_MODE_NORMAL
-					| REGULATOR_MODE_STANDBY,
-		.valid_ops_mask	 = REGULATOR_CHANGE_VOLTAGE
-					| REGULATOR_CHANGE_MODE
-					| REGULATOR_CHANGE_STATUS,
-	},
-};
-
 static struct regulator_init_data omap4_panda_vana = {
 	.constraints = {
 		.min_uV			= 2100000,
@@ -424,12 +398,10 @@
 	/* Regulators */
 	.vmmc		= &omap4_panda_vmmc,
 	.vpp		= &omap4_panda_vpp,
-	.vusim		= &omap4_panda_vusim,
 	.vana		= &omap4_panda_vana,
 	.vcxio		= &omap4_panda_vcxio,
 	.vdac		= &omap4_panda_vdac,
 	.vusb		= &omap4_panda_vusb,
-	.vaux1		= &omap4_panda_vaux1,
 	.vaux2		= &omap4_panda_vaux2,
 	.vaux3		= &omap4_panda_vaux3,
 	.clk32kg	= &omap4_panda_clk32kg,
diff --git a/arch/arm/mach-omap2/devices.c b/arch/arm/mach-omap2/devices.c
index e978514..84d1b73 100644
--- a/arch/arm/mach-omap2/devices.c
+++ b/arch/arm/mach-omap2/devices.c
@@ -66,7 +66,7 @@
 
 	WARN(IS_ERR(od), "could not build omap_device for %s\n", oh_name);
 
-	return PTR_ERR(od);
+	return IS_ERR(od) ? PTR_ERR(od) : 0;
 }
 postcore_initcall(omap3_l3_init);
 
diff --git a/arch/arm/mach-omap2/gpmc.c b/arch/arm/mach-omap2/gpmc.c
index 6741743..493505c 100644
--- a/arch/arm/mach-omap2/gpmc.c
+++ b/arch/arm/mach-omap2/gpmc.c
@@ -693,6 +693,7 @@
 {
 	u32 l, irq;
 	int cs, ret = -EINVAL;
+	int gpmc_irq;
 	char *ck = NULL;
 
 	if (cpu_is_omap24xx()) {
@@ -701,12 +702,15 @@
 			l = OMAP2420_GPMC_BASE;
 		else
 			l = OMAP34XX_GPMC_BASE;
+		gpmc_irq = INT_34XX_GPMC_IRQ;
 	} else if (cpu_is_omap34xx()) {
 		ck = "gpmc_fck";
 		l = OMAP34XX_GPMC_BASE;
+		gpmc_irq = INT_34XX_GPMC_IRQ;
 	} else if (cpu_is_omap44xx()) {
 		ck = "gpmc_ck";
 		l = OMAP44XX_GPMC_BASE;
+		gpmc_irq = OMAP44XX_IRQ_GPMC;
 	}
 
 	if (WARN_ON(!ck))
@@ -739,16 +743,17 @@
 	/* initalize the irq_chained */
 	irq = OMAP_GPMC_IRQ_BASE;
 	for (cs = 0; cs < GPMC_CS_NUM; cs++) {
-		set_irq_handler(irq, handle_simple_irq);
+		set_irq_chip_and_handler(irq, &dummy_irq_chip,
+						handle_simple_irq);
 		set_irq_flags(irq, IRQF_VALID);
 		irq++;
 	}
 
-	ret = request_irq(INT_34XX_GPMC_IRQ,
+	ret = request_irq(gpmc_irq,
 			gpmc_handle_irq, IRQF_SHARED, "gpmc", gpmc_base);
 	if (ret)
 		pr_err("gpmc: irq-%d could not claim: err %d\n",
-						INT_34XX_GPMC_IRQ, ret);
+						gpmc_irq, ret);
 	return ret;
 }
 postcore_initcall(gpmc_init);
@@ -757,8 +762,6 @@
 {
 	u8 cs;
 
-	if (irq != INT_34XX_GPMC_IRQ)
-		return IRQ_HANDLED;
 	/* check cs to invoke the irq */
 	cs = ((gpmc_read_reg(GPMC_PREFETCH_CONFIG1)) >> CS_NUM_SHIFT) & 0x7;
 	if (OMAP_GPMC_IRQ_BASE+cs <= OMAP_GPMC_IRQ_END)
diff --git a/arch/arm/mach-omap2/omap_l3_smx.c b/arch/arm/mach-omap2/omap_l3_smx.c
index 265bff3..5f2da756 100644
--- a/arch/arm/mach-omap2/omap_l3_smx.c
+++ b/arch/arm/mach-omap2/omap_l3_smx.c
@@ -226,7 +226,6 @@
 	struct omap3_l3         *l3;
 	struct resource         *res;
 	int                     ret;
-	int                     irq;
 
 	l3 = kzalloc(sizeof(*l3), GFP_KERNEL);
 	if (!l3) {
@@ -249,18 +248,17 @@
 		goto err2;
 	}
 
-	irq = platform_get_irq(pdev, 0);
-	ret = request_irq(irq, omap3_l3_app_irq,
+	l3->debug_irq = platform_get_irq(pdev, 0);
+	ret = request_irq(l3->debug_irq, omap3_l3_app_irq,
 		IRQF_DISABLED | IRQF_TRIGGER_RISING,
 		"l3-debug-irq", l3);
 	if (ret) {
 		dev_err(&pdev->dev, "couldn't request debug irq\n");
 		goto err3;
 	}
-	l3->debug_irq = irq;
 
-	irq = platform_get_irq(pdev, 1);
-	ret = request_irq(irq, omap3_l3_app_irq,
+	l3->app_irq = platform_get_irq(pdev, 1);
+	ret = request_irq(l3->app_irq, omap3_l3_app_irq,
 		IRQF_DISABLED | IRQF_TRIGGER_RISING,
 		"l3-app-irq", l3);
 
@@ -269,7 +267,6 @@
 		goto err4;
 	}
 
-	l3->app_irq = irq;
 	goto err0;
 
 err4:
diff --git a/arch/arm/mach-ux500/board-mop500-regulators.h b/arch/arm/mach-ux500/board-mop500-regulators.h
index 2675fae..f979b89 100644
--- a/arch/arm/mach-ux500/board-mop500-regulators.h
+++ b/arch/arm/mach-ux500/board-mop500-regulators.h
@@ -14,6 +14,8 @@
 #include <linux/regulator/machine.h>
 #include <linux/regulator/ab8500.h>
 
+extern struct ab8500_regulator_reg_init
+ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS];
 extern struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS];
 
 #endif
diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c
index 8790d984..d007645 100644
--- a/arch/arm/mach-ux500/board-mop500.c
+++ b/arch/arm/mach-ux500/board-mop500.c
@@ -20,6 +20,7 @@
 #include <linux/amba/serial.h>
 #include <linux/spi/spi.h>
 #include <linux/mfd/ab8500.h>
+#include <linux/regulator/ab8500.h>
 #include <linux/mfd/tc3589x.h>
 #include <linux/leds-lp5521.h>
 #include <linux/input.h>
diff --git a/arch/arm/plat-omap/include/plat/irqs.h b/arch/arm/plat-omap/include/plat/irqs.h
index d779283..5a25098 100644
--- a/arch/arm/plat-omap/include/plat/irqs.h
+++ b/arch/arm/plat-omap/include/plat/irqs.h
@@ -416,7 +416,7 @@
 
 /* GPMC related */
 #define OMAP_GPMC_IRQ_BASE	(TWL_IRQ_END)
-#define OMAP_GPMC_NR_IRQS	7
+#define OMAP_GPMC_NR_IRQS	8
 #define OMAP_GPMC_IRQ_END	(OMAP_GPMC_IRQ_BASE + OMAP_GPMC_NR_IRQS)
 
 
diff --git a/arch/arm/plat-omap/include/plat/onenand.h b/arch/arm/plat-omap/include/plat/onenand.h
index cbe897c..2858667 100644
--- a/arch/arm/plat-omap/include/plat/onenand.h
+++ b/arch/arm/plat-omap/include/plat/onenand.h
@@ -32,6 +32,7 @@
 	int			dma_channel;
 	u8			flags;
 	u8			regulator_can_sleep;
+	u8			skip_initial_unlocking;
 };
 
 #define ONENAND_MAX_PARTITIONS 8
diff --git a/arch/arm/plat-pxa/include/plat/pxa3xx_nand.h b/arch/arm/plat-pxa/include/plat/pxa3xx_nand.h
index 01a8448..442301f 100644
--- a/arch/arm/plat-pxa/include/plat/pxa3xx_nand.h
+++ b/arch/arm/plat-pxa/include/plat/pxa3xx_nand.h
@@ -30,6 +30,7 @@
 };
 
 struct pxa3xx_nand_flash {
+	char		*name;
 	uint32_t	chip_id;
 	unsigned int	page_per_block; /* Pages per block (PG_PER_BLK) */
 	unsigned int	page_size;	/* Page size in bytes (PAGE_SZ) */
@@ -37,7 +38,6 @@
 	unsigned int	dfc_width;	/* Width of flash controller(DWIDTH_C) */
 	unsigned int	num_blocks;	/* Number of physical blocks in Flash */
 
-	struct pxa3xx_nand_cmdset *cmdset;	/* NAND command set */
 	struct pxa3xx_nand_timing *timing;	/* NAND Flash timing */
 };
 
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 4db5b46..04a7fc5 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -276,7 +276,6 @@
 	select MTD_CHAR
 	select MTD_BLOCK
 	select MTD_PARTITIONS
-	select MTD_CONCAT
 	select MTD_COMPLEX_MAPPINGS
 	help
 	  This option enables MTD mapping of flash devices.  Needed to use
diff --git a/arch/cris/arch-v10/drivers/axisflashmap.c b/arch/cris/arch-v10/drivers/axisflashmap.c
index b207970..ed708e1 100644
--- a/arch/cris/arch-v10/drivers/axisflashmap.c
+++ b/arch/cris/arch-v10/drivers/axisflashmap.c
@@ -234,7 +234,6 @@
 	}
 
 	if (mtd_cse0 && mtd_cse1) {
-#ifdef CONFIG_MTD_CONCAT
 		struct mtd_info *mtds[] = { mtd_cse0, mtd_cse1 };
 
 		/* Since the concatenation layer adds a small overhead we
@@ -246,11 +245,6 @@
 		 */
 		mtd_cse = mtd_concat_create(mtds, ARRAY_SIZE(mtds),
 					    "cse0+cse1");
-#else
-		printk(KERN_ERR "%s and %s: Cannot concatenate due to kernel "
-		       "(mis)configuration!\n", map_cse0.name, map_cse1.name);
-		mtd_cse = NULL;
-#endif
 		if (!mtd_cse) {
 			printk(KERN_ERR "%s and %s: Concatenation failed!\n",
 			       map_cse0.name, map_cse1.name);
diff --git a/arch/cris/arch-v32/drivers/Kconfig b/arch/cris/arch-v32/drivers/Kconfig
index a2dd740..1633b12 100644
--- a/arch/cris/arch-v32/drivers/Kconfig
+++ b/arch/cris/arch-v32/drivers/Kconfig
@@ -406,7 +406,6 @@
 	select MTD_CHAR
 	select MTD_BLOCK
 	select MTD_PARTITIONS
-	select MTD_CONCAT
 	select MTD_COMPLEX_MAPPINGS
 	help
 	  This option enables MTD mapping of flash devices.  Needed to use
diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c
index 51e1e85..3d75125 100644
--- a/arch/cris/arch-v32/drivers/axisflashmap.c
+++ b/arch/cris/arch-v32/drivers/axisflashmap.c
@@ -275,7 +275,6 @@
 	}
 
 	if (count > 1) {
-#ifdef CONFIG_MTD_CONCAT
 		/* Since the concatenation layer adds a small overhead we
 		 * could try to figure out if the chips in cse0 and cse1 are
 		 * identical and reprobe the whole cse0+cse1 window. But since
@@ -284,11 +283,6 @@
 		 * complicating the probing procedure.
 		 */
 		mtd_total = mtd_concat_create(mtds, count, "cse0+cse1");
-#else
-		printk(KERN_ERR "%s and %s: Cannot concatenate due to kernel "
-		       "(mis)configuration!\n", map_cse0.name, map_cse1.name);
-		mtd_toal = NULL;
-#endif
 		if (!mtd_total) {
 			printk(KERN_ERR "%s and %s: Concatenation failed!\n",
 				map_cse0.name, map_cse1.name);
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 27b2295..4278bbc 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -3,6 +3,8 @@
 config SCORE
        def_bool y
        select HAVE_GENERIC_HARDIRQS
+       select GENERIC_HARDIRQS_NO_DEPRECATED
+       select GENERIC_IRQ_SHOW
 
 choice
 	prompt "System type"
diff --git a/arch/score/include/asm/irqflags.h b/arch/score/include/asm/irqflags.h
index 5c75638..37c6ac9 100644
--- a/arch/score/include/asm/irqflags.h
+++ b/arch/score/include/asm/irqflags.h
@@ -29,7 +29,7 @@
 
 static inline unsigned long arch_local_irq_save(void)
 {
-	unsigned long flags
+	unsigned long flags;
 
 	asm volatile(
 		"	mfcr	r8, cr0		\n"
diff --git a/arch/score/kernel/irq.c b/arch/score/kernel/irq.c
index 47647dd..d419673 100644
--- a/arch/score/kernel/irq.c
+++ b/arch/score/kernel/irq.c
@@ -52,9 +52,9 @@
 	irq_exit();
 }
 
-static void score_mask(unsigned int irq_nr)
+static void score_mask(struct irq_data *d)
 {
-	unsigned int irq_source = 63 - irq_nr;
+	unsigned int irq_source = 63 - d->irq;
 
 	if (irq_source < 32)
 		__raw_writel((__raw_readl(SCORE_PIC + INT_MASKL) | \
@@ -64,9 +64,9 @@
 			(1 << (irq_source - 32))), SCORE_PIC + INT_MASKH);
 }
 
-static void score_unmask(unsigned int irq_nr)
+static void score_unmask(struct irq_data *d)
 {
-	unsigned int irq_source = 63 - irq_nr;
+	unsigned int irq_source = 63 - d->irq;
 
 	if (irq_source < 32)
 		__raw_writel((__raw_readl(SCORE_PIC + INT_MASKL) & \
@@ -78,9 +78,9 @@
 
 struct irq_chip score_irq_chip = {
 	.name		= "Score7-level",
-	.mask		= score_mask,
-	.mask_ack	= score_mask,
-	.unmask		= score_unmask,
+	.irq_mask	= score_mask,
+	.irq_mask_ack	= score_mask,
+	.irq_unmask	= score_unmask,
 };
 
 /*
@@ -92,7 +92,7 @@
 	unsigned long target_addr;
 
 	for (index = 0; index < NR_IRQS; ++index)
-		set_irq_chip_and_handler(index, &score_irq_chip,
+		irq_set_chip_and_handler(index, &score_irq_chip,
 					 handle_level_irq);
 
 	for (target_addr = IRQ_VECTOR_BASE_ADDR;
@@ -109,40 +109,3 @@
 		: : "r" (EXCEPTION_VECTOR_BASE_ADDR | \
 			VECTOR_ADDRESS_OFFSET_MODE16));
 }
-
-/*
- * Generic, controller-independent functions:
- */
-int show_interrupts(struct seq_file *p, void *v)
-{
-	int i = *(loff_t *)v, cpu;
-	struct irqaction *action;
-	unsigned long flags;
-
-	if (i == 0) {
-		seq_puts(p, "           ");
-		for_each_online_cpu(cpu)
-			seq_printf(p, "CPU%d       ", cpu);
-		seq_putc(p, '\n');
-	}
-
-	if (i < NR_IRQS) {
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
-		if (!action)
-			goto unlock;
-
-		seq_printf(p, "%3d: ", i);
-		seq_printf(p, "%10u ", kstat_irqs(i));
-		seq_printf(p, " %8s", irq_desc[i].chip->name ? : "-");
-		seq_printf(p, "  %s", action->name);
-		for (action = action->next; action; action = action->next)
-			seq_printf(p, ", %s", action->name);
-
-		seq_putc(p, '\n');
-unlock:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	}
-
-	return 0;
-}
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index f3b7870..5e34a9f 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -12,6 +12,7 @@
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select GENERIC_HARDIRQS_NO_DEPRECATED
+	select GENERIC_IRQ_SHOW
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 0baa758..aa0134d 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -241,14 +241,14 @@
 	irq_flow_handler_t handle = handle_level_irq;
 	if (tile_irq_type == TILE_IRQ_PERCPU)
 		handle = handle_percpu_irq;
-	set_irq_chip_and_handler(irq, &tile_irq_chip, handle);
+	irq_set_chip_and_handler(irq, &tile_irq_chip, handle);
 
 	/*
 	 * Flag interrupts that are hardware-cleared so that ack()
 	 * won't clear them.
 	 */
 	if (tile_irq_type == TILE_IRQ_HW_CLEAR)
-		set_irq_chip_data(irq, (void *)IS_HW_CLEARED);
+		irq_set_chip_data(irq, (void *)IS_HW_CLEARED);
 }
 EXPORT_SYMBOL(tile_irq_activate);
 
@@ -262,47 +262,6 @@
  * Generic, controller-independent functions:
  */
 
-int show_interrupts(struct seq_file *p, void *v)
-{
-	int i = *(loff_t *) v, j;
-	struct irqaction *action;
-	unsigned long flags;
-
-	if (i == 0) {
-		seq_printf(p, "           ");
-		for (j = 0; j < NR_CPUS; j++)
-			if (cpu_online(j))
-				seq_printf(p, "CPU%-8d", j);
-		seq_putc(p, '\n');
-	}
-
-	if (i < NR_IRQS) {
-		struct irq_desc *desc = irq_to_desc(i);
-
-		raw_spin_lock_irqsave(&desc->lock, flags);
-		action = desc->action;
-		if (!action)
-			goto skip;
-		seq_printf(p, "%3d: ", i);
-#ifndef CONFIG_SMP
-		seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
-		seq_printf(p, " %14s", get_irq_desc_chip(desc)->name);
-		seq_printf(p, "  %s", action->name);
-
-		for (action = action->next; action; action = action->next)
-			seq_printf(p, ", %s", action->name);
-
-		seq_putc(p, '\n');
-skip:
-		raw_spin_unlock_irqrestore(&desc->lock, flags);
-	}
-	return 0;
-}
-
 #if CHIP_HAS_IPI()
 int create_irq(void)
 {
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a09e1f0..d475b43 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,7 +45,7 @@
 #include <linux/stringify.h>
 
 #ifdef CONFIG_SMP
-#define __percpu_arg(x)		"%%"__stringify(__percpu_seg)":%P" #x
+#define __percpu_prefix		"%%"__stringify(__percpu_seg)":"
 #define __my_cpu_offset		percpu_read(this_cpu_off)
 
 /*
@@ -62,9 +62,11 @@
 	(typeof(*(ptr)) __kernel __force *)tcp_ptr__;	\
 })
 #else
-#define __percpu_arg(x)		"%P" #x
+#define __percpu_prefix		""
 #endif
 
+#define __percpu_arg(x)		__percpu_prefix "%P" #x
+
 /*
  * Initialized pointers to per-cpu variables needed for the boot
  * processor need to use these macros to get the proper address
@@ -516,11 +518,11 @@
 	typeof(o2) __n2 = n2;						\
 	typeof(o2) __dummy;						\
 	alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4,	\
-		       "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t",	\
+		       "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t",	\
 		       X86_FEATURE_CX16,				\
 		       ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),		\
 		       "S" (&pcp1), "b"(__n1), "c"(__n2),		\
-		       "a"(__o1), "d"(__o2));				\
+		       "a"(__o1), "d"(__o2) : "memory");		\
 	__ret;								\
 })
 
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 3e8b08a..1e572c5 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -10,6 +10,12 @@
 #include <asm/frame.h>
 #include <asm/dwarf2.h>
 
+#ifdef CONFIG_SMP
+#define SEG_PREFIX %gs:
+#else
+#define SEG_PREFIX
+#endif
+
 .text
 
 /*
@@ -37,13 +43,13 @@
 	pushf
 	cli
 
-	cmpq %gs:(%rsi), %rax
+	cmpq SEG_PREFIX(%rsi), %rax
 	jne not_same
-	cmpq %gs:8(%rsi), %rdx
+	cmpq SEG_PREFIX 8(%rsi), %rdx
 	jne not_same
 
-	movq %rbx, %gs:(%rsi)
-	movq %rcx, %gs:8(%rsi)
+	movq %rbx, SEG_PREFIX(%rsi)
+	movq %rcx, SEG_PREFIX 8(%rsi)
 
 	popf
 	mov $1, %al
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index 9951364..ab81fb2 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -72,9 +72,9 @@
 		dev_err(&pdev->dev, "can't fetch device resource info\n");
 		return -EIO;
 	}
-	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
+	if (strcmp(pdev->name, "cs5535-pms") == 0)
 		pms_base = res->start;
-	else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
 		acpi_base = res->start;
 
 	/* If we have both addresses, we can override the poweroff hook */
@@ -90,9 +90,9 @@
 {
 	mfd_cell_disable(pdev);
 
-	if (strcmp(pdev->name, "olpc-xo1-pms") == 0)
+	if (strcmp(pdev->name, "cs5535-pms") == 0)
 		pms_base = 0;
-	else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0)
+	else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
 		acpi_base = 0;
 
 	pm_power_off = NULL;
@@ -101,7 +101,7 @@
 
 static struct platform_driver cs5535_pms_drv = {
 	.driver = {
-		.name = "olpc-xo1-pms",
+		.name = "cs5535-pms",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -110,7 +110,7 @@
 
 static struct platform_driver cs5535_acpi_drv = {
 	.driver = {
-		.name = "olpc-xo1-acpi",
+		.name = "olpc-xo1-pm-acpi",
 		.owner = THIS_MODULE,
 	},
 	.probe = olpc_xo1_probe,
@@ -121,22 +121,21 @@
 {
 	int r;
 
-	r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms");
+	r = platform_driver_register(&cs5535_pms_drv);
 	if (r)
 		return r;
 
-	r = mfd_shared_platform_driver_register(&cs5535_acpi_drv,
-			"cs5535-acpi");
+	r = platform_driver_register(&cs5535_acpi_drv);
 	if (r)
-		mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
+		platform_driver_unregister(&cs5535_pms_drv);
 
 	return r;
 }
 
 static void __exit olpc_xo1_exit(void)
 {
-	mfd_shared_platform_driver_unregister(&cs5535_acpi_drv);
-	mfd_shared_platform_driver_unregister(&cs5535_pms_drv);
+	platform_driver_unregister(&cs5535_acpi_drv);
+	platform_driver_unregister(&cs5535_pms_drv);
 }
 
 MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index a18e497..31e9e10 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -824,11 +824,6 @@
 		device->backlight->props.brightness =
 				acpi_video_get_brightness(device->backlight);
 
-		result = sysfs_create_link(&device->backlight->dev.kobj,
-					   &device->dev->dev.kobj, "device");
-		if (result)
-			printk(KERN_ERR PREFIX "Create sysfs link\n");
-
 		device->cooling_dev = thermal_cooling_device_register("LCD",
 					device->dev, &video_cooling_ops);
 		if (IS_ERR(device->cooling_dev)) {
@@ -1381,7 +1376,6 @@
 		       "Cant remove video notify handler\n");
 	}
 	if (device->backlight) {
-		sysfs_remove_link(&device->backlight->dev.kobj, "device");
 		backlight_device_unregister(device->backlight);
 		device->backlight = NULL;
 	}
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 35658f4..9bf1398 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -193,7 +193,7 @@
 	u64 *cfg_offset);
 static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev,
 	unsigned long *memory_bar);
-
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
 
 /* performant mode helper functions */
 static void  calc_bucket_map(int *bucket, int num_buckets, int nsgs,
@@ -231,7 +231,7 @@
  */
 static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
 {
-	if (likely(h->transMethod == CFGTBL_Trans_Performant))
+	if (likely(h->transMethod & CFGTBL_Trans_Performant))
 		c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
 }
 
@@ -556,6 +556,44 @@
 #define to_hba(n) container_of(n, struct ctlr_info, dev)
 #define to_drv(n) container_of(n, drive_info_struct, dev)
 
+/* List of controllers which cannot be reset on kexec with reset_devices */
+static u32 unresettable_controller[] = {
+	0x324a103C, /* Smart Array P712m */
+	0x324b103C, /* SmartArray P711m */
+	0x3223103C, /* Smart Array P800 */
+	0x3234103C, /* Smart Array P400 */
+	0x3235103C, /* Smart Array P400i */
+	0x3211103C, /* Smart Array E200i */
+	0x3212103C, /* Smart Array E200 */
+	0x3213103C, /* Smart Array E200i */
+	0x3214103C, /* Smart Array E200i */
+	0x3215103C, /* Smart Array E200i */
+	0x3237103C, /* Smart Array E500 */
+	0x323D103C, /* Smart Array P700m */
+	0x409C0E11, /* Smart Array 6400 */
+	0x409D0E11, /* Smart Array 6400 EM */
+};
+
+static int ctlr_is_resettable(struct ctlr_info *h)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
+		if (unresettable_controller[i] == h->board_id)
+			return 0;
+	return 1;
+}
+
+static ssize_t host_show_resettable(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct ctlr_info *h = to_hba(dev);
+
+	return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h));
+}
+static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
+
 static ssize_t host_store_rescan(struct device *dev,
 				 struct device_attribute *attr,
 				 const char *buf, size_t count)
@@ -741,6 +779,7 @@
 
 static struct attribute *cciss_host_attrs[] = {
 	&dev_attr_rescan.attr,
+	&dev_attr_resettable.attr,
 	NULL
 };
 
@@ -973,8 +1012,8 @@
 	temp64.val32.upper = c->ErrDesc.Addr.upper;
 	pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
 			    c->err_info, (dma_addr_t) temp64.val);
-	pci_free_consistent(h->pdev, sizeof(CommandList_struct),
-			    c, (dma_addr_t) c->busaddr);
+	pci_free_consistent(h->pdev, sizeof(CommandList_struct), c,
+		(dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr));
 }
 
 static inline ctlr_info_t *get_host(struct gendisk *disk)
@@ -1490,8 +1529,7 @@
 		return -EINVAL;
 	if (!capable(CAP_SYS_RAWIO))
 		return -EPERM;
-	ioc = (BIG_IOCTL_Command_struct *)
-	    kmalloc(sizeof(*ioc), GFP_KERNEL);
+	ioc = kmalloc(sizeof(*ioc), GFP_KERNEL);
 	if (!ioc) {
 		status = -ENOMEM;
 		goto cleanup1;
@@ -2653,6 +2691,10 @@
 			c->Request.CDB[0]);
 		return_status = IO_NEEDS_RETRY;
 		break;
+	case CMD_UNABORTABLE:
+		dev_warn(&h->pdev->dev, "cmd unabortable\n");
+		return_status = IO_ERROR;
+		break;
 	default:
 		dev_warn(&h->pdev->dev, "cmd 0x%02x returned "
 		       "unknown status %x\n", c->Request.CDB[0],
@@ -3103,6 +3145,13 @@
 			(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
+	case CMD_UNABORTABLE:
+		dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
+		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+			cmd->err_info->CommandStatus, DRIVER_OK,
+			cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
+				DID_PASSTHROUGH : DID_ERROR);
+		break;
 	default:
 		dev_warn(&h->pdev->dev, "cmd %p returned "
 		       "unknown status %x\n", cmd,
@@ -3136,10 +3185,13 @@
 	return tag >> DIRECT_LOOKUP_SHIFT;
 }
 
-static inline u32 cciss_tag_discard_error_bits(u32 tag)
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag)
 {
-#define CCISS_ERROR_BITS 0x03
-	return tag & ~CCISS_ERROR_BITS;
+#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1)
+#define CCISS_SIMPLE_ERROR_BITS 0x03
+	if (likely(h->transMethod & CFGTBL_Trans_Performant))
+		return tag & ~CCISS_PERF_ERROR_BITS;
+	return tag & ~CCISS_SIMPLE_ERROR_BITS;
 }
 
 static inline void cciss_mark_tag_indexed(u32 *tag)
@@ -3359,7 +3411,7 @@
 {
 	u32 a;
 
-	if (unlikely(h->transMethod != CFGTBL_Trans_Performant))
+	if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant)))
 		return h->access.command_completed(h);
 
 	if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
@@ -3394,14 +3446,12 @@
 /* process completion of a non-indexed command */
 static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
 {
-	u32 tag;
 	CommandList_struct *c = NULL;
 	__u32 busaddr_masked, tag_masked;
 
-	tag = cciss_tag_discard_error_bits(raw_tag);
+	tag_masked = cciss_tag_discard_error_bits(h, raw_tag);
 	list_for_each_entry(c, &h->cmpQ, list) {
-		busaddr_masked = cciss_tag_discard_error_bits(c->busaddr);
-		tag_masked = cciss_tag_discard_error_bits(tag);
+		busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr);
 		if (busaddr_masked == tag_masked) {
 			finish_cmd(h, c, raw_tag);
 			return next_command(h);
@@ -3753,7 +3803,8 @@
 	}
 }
 
-static __devinit void cciss_enter_performant_mode(ctlr_info_t *h)
+static __devinit void cciss_enter_performant_mode(ctlr_info_t *h,
+	u32 use_short_tags)
 {
 	/* This is a bit complicated.  There are 8 registers on
 	 * the controller which we write to to tell it 8 different
@@ -3808,7 +3859,7 @@
 	writel(0, &h->transtable->RepQCtrAddrHigh32);
 	writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
 	writel(0, &h->transtable->RepQAddr0High32);
-	writel(CFGTBL_Trans_Performant,
+	writel(CFGTBL_Trans_Performant | use_short_tags,
 			&(h->cfgtable->HostWrite.TransportRequest));
 
 	writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
@@ -3855,7 +3906,8 @@
 	if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
 		goto clean_up;
 
-	cciss_enter_performant_mode(h);
+	cciss_enter_performant_mode(h,
+		trans_support & CFGTBL_Trans_use_short_tags);
 
 	/* Change the access methods to the performant access methods */
 	h->access = SA5_performant_access;
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 579f749..554bbd9 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -222,6 +222,7 @@
 			h->ctlr, c->busaddr);
 #endif /* CCISS_DEBUG */
          writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
+	readl(h->vaddr + SA5_REQUEST_PORT_OFFSET);
 	 h->commands_outstanding++;
 	 if ( h->commands_outstanding > h->max_outstanding)
 		h->max_outstanding = h->commands_outstanding;
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index 35463d2..cd441be 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -56,6 +56,7 @@
 
 #define CFGTBL_Trans_Simple     0x00000002l
 #define CFGTBL_Trans_Performant 0x00000004l
+#define CFGTBL_Trans_use_short_tags 0x20000000l
 
 #define CFGTBL_BusType_Ultra2   0x00000001l
 #define CFGTBL_BusType_Ultra3   0x00000002l
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 727d022..df79380 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -824,13 +824,18 @@
 			break;
 			case CMD_UNSOLICITED_ABORT:
 				cmd->result = DID_ABORT << 16;
-				dev_warn(&h->pdev->dev, "%p aborted do to an "
+				dev_warn(&h->pdev->dev, "%p aborted due to an "
 					"unsolicited abort\n", c);
 			break;
 			case CMD_TIMEOUT:
 				cmd->result = DID_TIME_OUT << 16;
 				dev_warn(&h->pdev->dev, "%p timedout\n", c);
 			break;
+			case CMD_UNABORTABLE:
+				cmd->result = DID_ERROR << 16;
+				dev_warn(&h->pdev->dev, "c %p command "
+					"unabortable\n", c);
+			break;
 			default:
 				cmd->result = DID_ERROR << 16;
 				dev_warn(&h->pdev->dev,
@@ -1007,11 +1012,15 @@
 		break;
 		case CMD_UNSOLICITED_ABORT:
 			dev_warn(&h->pdev->dev,
-				"%p aborted do to an unsolicited abort\n", c);
+				"%p aborted due to an unsolicited abort\n", c);
 		break;
 		case CMD_TIMEOUT:
 			dev_warn(&h->pdev->dev, "%p timedout\n", c);
 		break;
+		case CMD_UNABORTABLE:
+			dev_warn(&h->pdev->dev,
+				"%p unabortable\n", c);
+		break;
 		default:
 			dev_warn(&h->pdev->dev,
 				"%p returned unknown status %x\n",
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index aca3024..2a1642b 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -92,7 +92,7 @@
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
-	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
 		submit_bio(rw, bio);
@@ -176,13 +176,17 @@
 	struct lc_element *al_ext;
 	struct lc_element *tmp;
 	unsigned long     al_flags = 0;
+	int wake;
 
 	spin_lock_irq(&mdev->al_lock);
 	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
 	if (unlikely(tmp != NULL)) {
 		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
 		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
 			spin_unlock_irq(&mdev->al_lock);
+			if (wake)
+				wake_up(&mdev->al_wait);
 			return NULL;
 		}
 	}
@@ -258,6 +262,33 @@
 	spin_unlock_irqrestore(&mdev->al_lock, flags);
 }
 
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* al extent number to bit */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
+{
+	return rs_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* al extent number to bit */
+		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
+}
+
 int
 w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
@@ -285,7 +316,7 @@
 	 * For now, we must not write the transaction,
 	 * if we cannot write out the bitmap of the evicted extent. */
 	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
-		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+		drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
 
 	/* The bitmap write may have failed, causing a state change. */
 	if (mdev->state.disk < D_INCONSISTENT) {
@@ -334,7 +365,7 @@
 		+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
 
 	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
-		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_chk_io_error(mdev, 1, true);
 
 	if (++mdev->al_tr_pos >
 	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
@@ -511,225 +542,6 @@
 	return 1;
 }
 
-static void atodb_endio(struct bio *bio, int error)
-{
-	struct drbd_atodb_wait *wc = bio->bi_private;
-	struct drbd_conf *mdev = wc->mdev;
-	struct page *page;
-	int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
-	/* strange behavior of some lower level drivers...
-	 * fail the request by clearing the uptodate flag,
-	 * but do not return any error?! */
-	if (!error && !uptodate)
-		error = -EIO;
-
-	drbd_chk_io_error(mdev, error, TRUE);
-	if (error && wc->error == 0)
-		wc->error = error;
-
-	if (atomic_dec_and_test(&wc->count))
-		complete(&wc->io_done);
-
-	page = bio->bi_io_vec[0].bv_page;
-	put_page(page);
-	bio_put(bio);
-	mdev->bm_writ_cnt++;
-	put_ldev(mdev);
-}
-
-/* sector to word */
-#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
-
-/* activity log to on disk bitmap -- prepare bio unless that sector
- * is already covered by previously prepared bios */
-static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
-					struct bio **bios,
-					unsigned int enr,
-					struct drbd_atodb_wait *wc) __must_hold(local)
-{
-	struct bio *bio;
-	struct page *page;
-	sector_t on_disk_sector;
-	unsigned int page_offset = PAGE_SIZE;
-	int offset;
-	int i = 0;
-	int err = -ENOMEM;
-
-	/* We always write aligned, full 4k blocks,
-	 * so we can ignore the logical_block_size (for now) */
-	enr &= ~7U;
-	on_disk_sector = enr + mdev->ldev->md.md_offset
-			     + mdev->ldev->md.bm_offset;
-
-	D_ASSERT(!(on_disk_sector & 7U));
-
-	/* Check if that enr is already covered by an already created bio.
-	 * Caution, bios[] is not NULL terminated,
-	 * but only initialized to all NULL.
-	 * For completely scattered activity log,
-	 * the last invocation iterates over all bios,
-	 * and finds the last NULL entry.
-	 */
-	while ((bio = bios[i])) {
-		if (bio->bi_sector == on_disk_sector)
-			return 0;
-		i++;
-	}
-	/* bios[i] == NULL, the next not yet used slot */
-
-	/* GFP_KERNEL, we are not in the write-out path */
-	bio = bio_alloc(GFP_KERNEL, 1);
-	if (bio == NULL)
-		return -ENOMEM;
-
-	if (i > 0) {
-		const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
-		page_offset = prev_bv->bv_offset + prev_bv->bv_len;
-		page = prev_bv->bv_page;
-	}
-	if (page_offset == PAGE_SIZE) {
-		page = alloc_page(__GFP_HIGHMEM);
-		if (page == NULL)
-			goto out_bio_put;
-		page_offset = 0;
-	} else {
-		get_page(page);
-	}
-
-	offset = S2W(enr);
-	drbd_bm_get_lel(mdev, offset,
-			min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
-			kmap(page) + page_offset);
-	kunmap(page);
-
-	bio->bi_private = wc;
-	bio->bi_end_io = atodb_endio;
-	bio->bi_bdev = mdev->ldev->md_bdev;
-	bio->bi_sector = on_disk_sector;
-
-	if (bio_add_page(bio, page, 4096, page_offset) != 4096)
-		goto out_put_page;
-
-	atomic_inc(&wc->count);
-	/* we already know that we may do this...
-	 * get_ldev_if_state(mdev,D_ATTACHING);
-	 * just get the extra reference, so that the local_cnt reflects
-	 * the number of pending IO requests DRBD at its backing device.
-	 */
-	atomic_inc(&mdev->local_cnt);
-
-	bios[i] = bio;
-
-	return 0;
-
-out_put_page:
-	err = -EINVAL;
-	put_page(page);
-out_bio_put:
-	bio_put(bio);
-	return err;
-}
-
-/**
- * drbd_al_to_on_disk_bm() -  * Writes bitmap parts covered by active AL extents
- * @mdev:	DRBD device.
- *
- * Called when we detach (unconfigure) local storage,
- * or when we go from R_PRIMARY to R_SECONDARY role.
- */
-void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
-{
-	int i, nr_elements;
-	unsigned int enr;
-	struct bio **bios;
-	struct drbd_atodb_wait wc;
-
-	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
-		return; /* sorry, I don't have any act_log etc... */
-
-	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
-	nr_elements = mdev->act_log->nr_elements;
-
-	/* GFP_KERNEL, we are not in anyone's write-out path */
-	bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
-	if (!bios)
-		goto submit_one_by_one;
-
-	atomic_set(&wc.count, 0);
-	init_completion(&wc.io_done);
-	wc.mdev = mdev;
-	wc.error = 0;
-
-	for (i = 0; i < nr_elements; i++) {
-		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-		if (enr == LC_FREE)
-			continue;
-		/* next statement also does atomic_inc wc.count and local_cnt */
-		if (atodb_prepare_unless_covered(mdev, bios,
-						enr/AL_EXT_PER_BM_SECT,
-						&wc))
-			goto free_bios_submit_one_by_one;
-	}
-
-	/* unnecessary optimization? */
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-
-	/* all prepared, submit them */
-	for (i = 0; i < nr_elements; i++) {
-		if (bios[i] == NULL)
-			break;
-		if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
-			bios[i]->bi_rw = WRITE;
-			bio_endio(bios[i], -EIO);
-		} else {
-			submit_bio(WRITE, bios[i]);
-		}
-	}
-
-	/* always (try to) flush bitmap to stable storage */
-	drbd_md_flush(mdev);
-
-	/* In case we did not submit a single IO do not wait for
-	 * them to complete. ( Because we would wait forever here. )
-	 *
-	 * In case we had IOs and they are already complete, there
-	 * is not point in waiting anyways.
-	 * Therefore this if () ... */
-	if (atomic_read(&wc.count))
-		wait_for_completion(&wc.io_done);
-
-	put_ldev(mdev);
-
-	kfree(bios);
-	return;
-
- free_bios_submit_one_by_one:
-	/* free everything by calling the endio callback directly. */
-	for (i = 0; i < nr_elements && bios[i]; i++)
-		bio_endio(bios[i], 0);
-
-	kfree(bios);
-
- submit_one_by_one:
-	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
-
-	for (i = 0; i < mdev->act_log->nr_elements; i++) {
-		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
-		if (enr == LC_FREE)
-			continue;
-		/* Really slow: if we have al-extents 16..19 active,
-		 * sector 4 will be written four times! Synchronous! */
-		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
-	}
-
-	lc_unlock(mdev->act_log);
-	wake_up(&mdev->al_wait);
-	put_ldev(mdev);
-}
-
 /**
  * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
  * @mdev:	DRBD device.
@@ -809,7 +621,7 @@
 		return 1;
 	}
 
-	drbd_bm_write_sect(mdev, udw->enr);
+	drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
 	put_ldev(mdev);
 
 	kfree(udw);
@@ -889,7 +701,6 @@
 				dev_warn(DEV, "Kicking resync_lru element enr=%u "
 				     "out with rs_failed=%d\n",
 				     ext->lce.lc_number, ext->rs_failed);
-				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 			}
 			ext->rs_left = rs_left;
 			ext->rs_failed = success ? 0 : count;
@@ -908,7 +719,6 @@
 				drbd_queue_work_front(&mdev->data.work, &udw->w);
 			} else {
 				dev_warn(DEV, "Could not kmalloc an udw\n");
-				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 			}
 		}
 	} else {
@@ -919,6 +729,22 @@
 	}
 }
 
+void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
+{
+	unsigned long now = jiffies;
+	unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
+	int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+		if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
+		    mdev->state.conn != C_PAUSED_SYNC_T &&
+		    mdev->state.conn != C_PAUSED_SYNC_S) {
+			mdev->rs_mark_time[next] = now;
+			mdev->rs_mark_left[next] = still_to_go;
+			mdev->rs_last_mark = next;
+		}
+	}
+}
+
 /* clear the bit corresponding to the piece of storage in question:
  * size byte of data starting from sector.  Only clear a bits of the affected
  * one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -936,7 +762,7 @@
 	int wake_up = 0;
 	unsigned long flags;
 
-	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
 		dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
 				(unsigned long long)sector, size);
 		return;
@@ -969,21 +795,9 @@
 	 */
 	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
 	if (count && get_ldev(mdev)) {
-		unsigned long now = jiffies;
-		unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
-		int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
-		if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
-			unsigned long tw = drbd_bm_total_weight(mdev);
-			if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
-			    mdev->state.conn != C_PAUSED_SYNC_T &&
-			    mdev->state.conn != C_PAUSED_SYNC_S) {
-				mdev->rs_mark_time[next] = now;
-				mdev->rs_mark_left[next] = tw;
-				mdev->rs_last_mark = next;
-			}
-		}
+		drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
 		spin_lock_irqsave(&mdev->al_lock, flags);
-		drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+		drbd_try_clear_on_disk_bm(mdev, sector, count, true);
 		spin_unlock_irqrestore(&mdev->al_lock, flags);
 
 		/* just wake_up unconditional now, various lc_chaged(),
@@ -998,27 +812,27 @@
 /*
  * this is intended to set one request worth of data out of sync.
  * affects at least 1 bit,
- * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
  *
  * called by tl_clear and drbd_send_dblock (==drbd_make_request).
  * so this can be _any_ process.
  */
-void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
 			    const char *file, const unsigned int line)
 {
 	unsigned long sbnr, ebnr, lbnr, flags;
 	sector_t esector, nr_sectors;
-	unsigned int enr, count;
+	unsigned int enr, count = 0;
 	struct lc_element *e;
 
-	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
 		dev_err(DEV, "sector: %llus, size: %d\n",
 			(unsigned long long)sector, size);
-		return;
+		return 0;
 	}
 
 	if (!get_ldev(mdev))
-		return; /* no disk, no metadata, no bitmap to set bits in */
+		return 0; /* no disk, no metadata, no bitmap to set bits in */
 
 	nr_sectors = drbd_get_capacity(mdev->this_bdev);
 	esector = sector + (size >> 9) - 1;
@@ -1048,6 +862,8 @@
 
 out:
 	put_ldev(mdev);
+
+	return count;
 }
 
 static
@@ -1128,7 +944,10 @@
 	unsigned int enr = BM_SECT_TO_EXT(sector);
 	struct bm_extent *bm_ext;
 	int i, sig;
+	int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
+			 200 times -> 20 seconds. */
 
+retry:
 	sig = wait_event_interruptible(mdev->al_wait,
 			(bm_ext = _bme_get(mdev, enr)));
 	if (sig)
@@ -1139,16 +958,25 @@
 
 	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
 		sig = wait_event_interruptible(mdev->al_wait,
-				!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
-		if (sig) {
+					       !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
+					       test_bit(BME_PRIORITY, &bm_ext->flags));
+
+		if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
 			spin_lock_irq(&mdev->al_lock);
 			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
-				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
 				mdev->resync_locked--;
 				wake_up(&mdev->al_wait);
 			}
 			spin_unlock_irq(&mdev->al_lock);
-			return -EINTR;
+			if (sig)
+				return -EINTR;
+			if (schedule_timeout_interruptible(HZ/10))
+				return -EINTR;
+			if (sa && --sa == 0)
+				dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec."
+					 "Resync stalled?\n");
+			goto retry;
 		}
 	}
 	set_bit(BME_LOCKED, &bm_ext->flags);
@@ -1291,8 +1119,7 @@
 	}
 
 	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
-		clear_bit(BME_LOCKED, &bm_ext->flags);
-		clear_bit(BME_NO_WRITES, &bm_ext->flags);
+		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
 		mdev->resync_locked--;
 		wake_up(&mdev->al_wait);
 	}
@@ -1383,7 +1210,7 @@
 	sector_t esector, nr_sectors;
 	int wake_up = 0;
 
-	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
 		dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
 				(unsigned long long)sector, size);
 		return;
@@ -1420,7 +1247,7 @@
 		mdev->rs_failed += count;
 
 		if (get_ldev(mdev)) {
-			drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
+			drbd_try_clear_on_disk_bm(mdev, sector, count, false);
 			put_ldev(mdev);
 		}
 
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 0645ca8..76210ba 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -28,18 +28,56 @@
 #include <linux/drbd.h>
 #include <linux/slab.h>
 #include <asm/kmap_types.h>
+
 #include "drbd_int.h"
 
+
 /* OPAQUE outside this file!
  * interface defined in drbd_int.h
 
  * convention:
  * function name drbd_bm_... => used elsewhere, "public".
  * function name      bm_... => internal to implementation, "private".
+ */
 
- * Note that since find_first_bit returns int, at the current granularity of
- * the bitmap (4KB per byte), this implementation "only" supports up to
- * 1<<(32+12) == 16 TB...
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50		bytes backend storage (1 PiB)
+ * 1 << (50 - 12)	bits needed
+ *	38 --> we need u64 to index and count bits
+ * 1 << (38 - 3)	bitmap bytes needed
+ *	35 --> we still need u64 to index and count bytes
+ *			(that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2)	32bit longs needed
+ *	33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3)	64bit longs needed
+ *	32 --> we could get away with a 32bit unsigned int to index and count
+ *	64bit long words, but I rather stay with unsigned long for now.
+ *	We probably should neither count nor point to bytes or long words
+ *	directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ *	22 --> we need that much 4KiB pages of bitmap.
+ *	1 << (22 + 3) --> on a 64bit arch,
+ *	we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
+
+ * bitmap storage and IO:
+ *	Bitmap is stored little endian on disk, and is kept little endian in
+ *	core memory. Currently we still hold the full bitmap in core as long
+ *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ *	seems excessive.
+ *
+ *	We plan to reduce the amount of in-core bitmap pages by pageing them in
+ *	and out against their on-disk location as necessary, but need to make
+ *	sure we don't cause too much meta data IO, and must not deadlock in
+ *	tight memory situations. This needs some more work.
  */
 
 /*
@@ -55,13 +93,9 @@
 struct drbd_bitmap {
 	struct page **bm_pages;
 	spinlock_t bm_lock;
-	/* WARNING unsigned long bm_*:
-	 * 32bit number of bit offset is just enough for 512 MB bitmap.
-	 * it will blow up if we make the bitmap bigger...
-	 * not that it makes much sense to have a bitmap that large,
-	 * rather change the granularity to 16k or 64k or something.
-	 * (that implies other problems, however...)
-	 */
+
+	/* see LIMITATIONS: above */
+
 	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
 	unsigned long bm_bits;
 	size_t   bm_words;
@@ -69,29 +103,18 @@
 	sector_t bm_dev_capacity;
 	struct mutex bm_change; /* serializes resize operations */
 
-	atomic_t bm_async_io;
-	wait_queue_head_t bm_io_wait;
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
 
-	unsigned long  bm_flags;
+	enum bm_flag bm_flags;
 
 	/* debugging aid, in case we are still racy somewhere */
 	char          *bm_why;
 	struct task_struct *bm_task;
 };
 
-/* definition of bits in bm_flags */
-#define BM_LOCKED       0
-#define BM_MD_IO_ERROR  1
-#define BM_P_VMALLOCED  2
-
 static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 			       unsigned long e, int val, const enum km_type km);
 
-static int bm_is_locked(struct drbd_bitmap *b)
-{
-	return test_bit(BM_LOCKED, &b->bm_flags);
-}
-
 #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
 static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
 {
@@ -108,7 +131,7 @@
 	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
 }
 
-void drbd_bm_lock(struct drbd_conf *mdev, char *why)
+void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	int trylock_failed;
@@ -131,8 +154,9 @@
 		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
 		mutex_lock(&b->bm_change);
 	}
-	if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
+	if (BM_LOCKED_MASK & b->bm_flags)
 		dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
+	b->bm_flags |= flags & BM_LOCKED_MASK;
 
 	b->bm_why  = why;
 	b->bm_task = current;
@@ -146,31 +170,137 @@
 		return;
 	}
 
-	if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
+	if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
 		dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
 
+	b->bm_flags &= ~BM_LOCKED_MASK;
 	b->bm_why  = NULL;
 	b->bm_task = NULL;
 	mutex_unlock(&b->bm_change);
 }
 
-/* word offset to long pointer */
-static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ *  1<<38 bits,
+ *  1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * we if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+
+/* store_page_idx uses non-atomic assingment. It is only used directly after
+ * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
-	struct page *page;
-	unsigned long page_nr;
+	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+	page_private(page) |= idx;
+}
 
+static unsigned long bm_page_to_idx(struct page *page)
+{
+	return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
+{
+	struct drbd_bitmap *b = mdev->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	clear_bit(BM_PAGE_IO_LOCK, addr);
+	smp_mb__after_clear_bit();
+	wake_up(&mdev->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+	/* use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+	volatile const unsigned long *addr = &page_private(page);
+	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
+
+static void bm_set_page_io_err(struct page *page)
+{
+	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
 	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
-	page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
 	BUG_ON(page_nr >= b->bm_number_of_pages);
-	page = b->bm_pages[page_nr];
+	return page_nr;
+}
 
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
+}
+
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
+{
+	struct page *page = b->bm_pages[idx];
 	return (unsigned long *) kmap_atomic(page, km);
 }
 
-static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 {
-	return __bm_map_paddr(b, offset, KM_IRQ1);
+	return __bm_map_pidx(b, idx, KM_IRQ1);
 }
 
 static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
@@ -202,6 +332,7 @@
  * to be able to report device specific.
  */
 
+
 static void bm_free_pages(struct page **pages, unsigned long number)
 {
 	unsigned long i;
@@ -269,6 +400,9 @@
 				bm_vk_free(new_pages, vmalloced);
 				return NULL;
 			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
 			new_pages[i] = page;
 		}
 	} else {
@@ -280,9 +414,9 @@
 	}
 
 	if (vmalloced)
-		set_bit(BM_P_VMALLOCED, &b->bm_flags);
+		b->bm_flags |= BM_P_VMALLOCED;
 	else
-		clear_bit(BM_P_VMALLOCED, &b->bm_flags);
+		b->bm_flags &= ~BM_P_VMALLOCED;
 
 	return new_pages;
 }
@@ -319,7 +453,7 @@
 {
 	ERR_IF (!mdev->bitmap) return;
 	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
-	bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
+	bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
 	kfree(mdev->bitmap);
 	mdev->bitmap = NULL;
 }
@@ -329,22 +463,39 @@
  * this masks out the remaining bits.
  * Returns the number of bits cleared.
  */
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
+#define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
 static int bm_clear_surplus(struct drbd_bitmap *b)
 {
-	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
-	size_t w = b->bm_bits >> LN2_BPL;
-	int cleared = 0;
+	unsigned long mask;
 	unsigned long *p_addr, *bm;
+	int tmp;
+	int cleared = 0;
 
-	p_addr = bm_map_paddr(b, w);
-	bm = p_addr + MLPP(w);
-	if (w < b->bm_words) {
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
 		cleared = hweight_long(*bm & ~mask);
 		*bm &= mask;
-		w++; bm++;
+		bm++;
 	}
 
-	if (w < b->bm_words) {
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to zero out
+		 * a padding long to align with a 64bit remote */
 		cleared += hweight_long(*bm);
 		*bm = 0;
 	}
@@ -354,66 +505,75 @@
 
 static void bm_set_surplus(struct drbd_bitmap *b)
 {
-	const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
-	size_t w = b->bm_bits >> LN2_BPL;
+	unsigned long mask;
 	unsigned long *p_addr, *bm;
+	int tmp;
 
-	p_addr = bm_map_paddr(b, w);
-	bm = p_addr + MLPP(w);
-	if (w < b->bm_words) {
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
 		*bm |= ~mask;
-		bm++; w++;
+		bm++;
 	}
 
-	if (w < b->bm_words) {
-		*bm = ~(0UL);
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to zero out
+		 * a padding long to align with a 64bit remote */
+		*bm = ~0UL;
 	}
 	bm_unmap(p_addr);
 }
 
-static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
-{
-	unsigned long *p_addr, *bm, offset = 0;
-	unsigned long bits = 0;
-	unsigned long i, do_now;
-
-	while (offset < b->bm_words) {
-		i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
-		p_addr = __bm_map_paddr(b, offset, KM_USER0);
-		bm = p_addr + MLPP(offset);
-		while (i--) {
-#ifndef __LITTLE_ENDIAN
-			if (swap_endian)
-				*bm = lel_to_cpu(*bm);
-#endif
-			bits += hweight_long(*bm++);
-		}
-		__bm_unmap(p_addr, KM_USER0);
-		offset += do_now;
-		cond_resched();
-	}
-
-	return bits;
-}
-
+/* you better not modify the bitmap while this is running,
+ * or its results will be stale */
 static unsigned long bm_count_bits(struct drbd_bitmap *b)
 {
-	return __bm_count_bits(b, 0);
-}
+	unsigned long *p_addr;
+	unsigned long bits = 0;
+	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
+	int idx, i, last_word;
 
-static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
-{
-	return __bm_count_bits(b, 1);
+	/* all but last page */
+	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+		p_addr = __bm_map_pidx(b, idx, KM_USER0);
+		for (i = 0; i < LWPP; i++)
+			bits += hweight_long(p_addr[i]);
+		__bm_unmap(p_addr, KM_USER0);
+		cond_resched();
+	}
+	/* last (or only) page */
+	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+	p_addr = __bm_map_pidx(b, idx, KM_USER0);
+	for (i = 0; i < last_word; i++)
+		bits += hweight_long(p_addr[i]);
+	p_addr[last_word] &= cpu_to_lel(mask);
+	bits += hweight_long(p_addr[last_word]);
+	/* 32bit arch, may have an unused padding long */
+	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+		p_addr[last_word+1] = 0;
+	__bm_unmap(p_addr, KM_USER0);
+	return bits;
 }
 
 /* offset and len in long words.*/
 static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 {
 	unsigned long *p_addr, *bm;
+	unsigned int idx;
 	size_t do_now, end;
 
-#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
-
 	end = offset + len;
 
 	if (end > b->bm_words) {
@@ -423,15 +583,16 @@
 
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		if (bm+do_now > p_addr + LWPP) {
 			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
 			       p_addr, bm, (int)do_now);
-			break; /* breaks to after catch_oob_access_end() only! */
-		}
-		memset(bm, c, do_now * sizeof(long));
+		} else
+			memset(bm, c, do_now * sizeof(long));
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 		offset += do_now;
 	}
 }
@@ -447,7 +608,7 @@
 int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long bits, words, owords, obits, *p_addr, *bm;
+	unsigned long bits, words, owords, obits;
 	unsigned long want, have, onpages; /* number of pages */
 	struct page **npages, **opages = NULL;
 	int err = 0, growing;
@@ -455,7 +616,7 @@
 
 	ERR_IF(!b) return -ENOMEM;
 
-	drbd_bm_lock(mdev, "resize");
+	drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
 
 	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
 			(unsigned long long)capacity);
@@ -463,7 +624,7 @@
 	if (capacity == b->bm_dev_capacity)
 		goto out;
 
-	opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
+	opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
 
 	if (capacity == 0) {
 		spin_lock_irq(&b->bm_lock);
@@ -491,18 +652,23 @@
 	words = ALIGN(bits, 64) >> LN2_BPL;
 
 	if (get_ldev(mdev)) {
-		D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
+		u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
 		put_ldev(mdev);
+		if (bits > bits_on_disk) {
+			dev_info(DEV, "bits = %lu\n", bits);
+			dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);
+			err = -ENOSPC;
+			goto out;
+		}
 	}
 
-	/* one extra long to catch off by one errors */
-	want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+	want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
 	have = b->bm_number_of_pages;
 	if (want == have) {
 		D_ASSERT(b->bm_pages != NULL);
 		npages = b->bm_pages;
 	} else {
-		if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
+		if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
 			npages = NULL;
 		else
 			npages = bm_realloc_pages(b, want);
@@ -542,11 +708,6 @@
 		bm_free_pages(opages + want, have - want);
 	}
 
-	p_addr = bm_map_paddr(b, words);
-	bm = p_addr + MLPP(words);
-	*bm = DRBD_MAGIC;
-	bm_unmap(p_addr);
-
 	(void)bm_clear_surplus(b);
 
 	spin_unlock_irq(&b->bm_lock);
@@ -554,7 +715,7 @@
 		bm_vk_free(opages, opages_vmalloced);
 	if (!growing)
 		b->bm_set = bm_count_bits(b);
-	dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
+	dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
 
  out:
 	drbd_bm_unlock(mdev);
@@ -624,6 +785,7 @@
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr, *bm;
 	unsigned long word, bits;
+	unsigned int idx;
 	size_t end, do_now;
 
 	end = offset + number;
@@ -638,16 +800,18 @@
 	spin_lock_irq(&b->bm_lock);
 	while (offset < end) {
 		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-		p_addr = bm_map_paddr(b, offset);
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
 		bm = p_addr + MLPP(offset);
 		offset += do_now;
 		while (do_now--) {
 			bits = hweight_long(*bm);
-			word = *bm | lel_to_cpu(*buffer++);
+			word = *bm | *buffer++;
 			*bm++ = word;
 			b->bm_set += hweight_long(word) - bits;
 		}
 		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
 	}
 	/* with 32bit <-> 64bit cross-platform connect
 	 * this is only correct for current usage,
@@ -656,7 +820,6 @@
 	 */
 	if (end == b->bm_words)
 		b->bm_set -= bm_clear_surplus(b);
-
 	spin_unlock_irq(&b->bm_lock);
 }
 
@@ -686,11 +849,11 @@
 	else {
 		while (offset < end) {
 			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
-			p_addr = bm_map_paddr(b, offset);
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
 			bm = p_addr + MLPP(offset);
 			offset += do_now;
 			while (do_now--)
-				*buffer++ = cpu_to_lel(*bm++);
+				*buffer++ = *bm++;
 			bm_unmap(p_addr);
 		}
 	}
@@ -724,9 +887,22 @@
 	spin_unlock_irq(&b->bm_lock);
 }
 
+struct bm_aio_ctx {
+	struct drbd_conf *mdev;
+	atomic_t in_flight;
+	struct completion done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+	int error;
+};
+
+/* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
-	struct drbd_bitmap *b = bio->bi_private;
+	struct bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 
 
@@ -737,38 +913,83 @@
 	if (!error && !uptodate)
 		error = -EIO;
 
+	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
+	    !bm_test_page_unchanged(b->bm_pages[idx]))
+		dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);
+
 	if (error) {
-		/* doh. what now?
-		 * for now, set all bits, and flag MD_IO_ERROR */
-		__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
+		/* ctx error will hold the completed-last non-zero error code,
+		 * in case error codes differ. */
+		ctx->error = error;
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* Not identical to on disk version of it.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
+					error, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
 	}
-	if (atomic_dec_and_test(&b->bm_async_io))
-		wake_up(&b->bm_io_wait);
+
+	bm_page_unlock_io(mdev, idx);
+
+	/* FIXME give back to page pool */
+	if (ctx->flags & BM_AIO_COPY_PAGES)
+		put_page(bio->bi_io_vec[0].bv_page);
 
 	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight))
+		complete(&ctx->done);
 }
 
-static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
+static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
 	/* we are process context. we always get a bio */
 	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct drbd_conf *mdev = ctx->mdev;
+	struct drbd_bitmap *b = mdev->bitmap;
+	struct page *page;
 	unsigned int len;
+
 	sector_t on_disk_sector =
 		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
 	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
 
 	/* this might happen with very small
-	 * flexible external meta data device */
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
 	len = min_t(unsigned int, PAGE_SIZE,
 		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
 
+	/* serialize IO on this page */
+	bm_page_lock_io(mdev, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {
+		/* FIXME alloc_page is good enough for now, but actually needs
+		 * to use pre-allocated page pool */
+		void *src, *dest;
+		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page, KM_USER0);
+		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		memcpy(dest, src, PAGE_SIZE);
+		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(dest, KM_USER0);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
-	bio_add_page(bio, b->bm_pages[page_nr], len, 0);
-	bio->bi_private = b;
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
 
-	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
 		bio->bi_rw |= rw;
 		bio_endio(bio, -EIO);
 	} else {
@@ -776,87 +997,84 @@
 	}
 }
 
-# if defined(__LITTLE_ENDIAN)
-	/* nothing to do, on disk == in memory */
-# define bm_cpu_to_lel(x) ((void)0)
-# else
-static void bm_cpu_to_lel(struct drbd_bitmap *b)
-{
-	/* need to cpu_to_lel all the pages ...
-	 * this may be optimized by using
-	 * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
-	 * the following is still not optimal, but better than nothing */
-	unsigned int i;
-	unsigned long *p_addr, *bm;
-	if (b->bm_set == 0) {
-		/* no page at all; avoid swap if all is 0 */
-		i = b->bm_number_of_pages;
-	} else if (b->bm_set == b->bm_bits) {
-		/* only the last page */
-		i = b->bm_number_of_pages - 1;
-	} else {
-		/* all pages */
-		i = 0;
-	}
-	for (; i < b->bm_number_of_pages; i++) {
-		p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
-		for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
-			*bm = cpu_to_lel(*bm);
-		kunmap_atomic(p_addr, KM_USER0);
-	}
-}
-# endif
-/* lel_to_cpu == cpu_to_lel */
-# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
-
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
+	struct bm_aio_ctx ctx = {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
+		.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
+	};
 	struct drbd_bitmap *b = mdev->bitmap;
-	/* sector_t sector; */
-	int bm_words, num_pages, i;
+	int num_pages, i, count = 0;
 	unsigned long now;
 	char ppb[10];
 	int err = 0;
 
-	WARN_ON(!bm_is_locked(b));
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+	if (!ctx.flags)
+		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
-	/* no spinlock here, the drbd_bm_lock should be enough! */
-
-	bm_words  = drbd_bm_words(mdev);
-	num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
-
-	/* on disk bitmap is little endian */
-	if (rw == WRITE)
-		bm_cpu_to_lel(b);
+	num_pages = b->bm_number_of_pages;
 
 	now = jiffies;
-	atomic_set(&b->bm_async_io, num_pages);
-	__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
 
 	/* let the layers below us try to merge these bios... */
-	for (i = 0; i < num_pages; i++)
-		bm_page_io_async(mdev, b, i, rw);
+	for (i = 0; i < num_pages; i++) {
+		/* ignore completely unchanged pages */
+		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+			break;
+		if (rw & WRITE) {
+			if (bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+		}
+		atomic_inc(&ctx.in_flight);
+		bm_page_io_async(&ctx, i, rw);
+		++count;
+		cond_resched();
+	}
 
-	wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
+	/*
+	 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
+	 * will not complete() early, and decrement / test it here.  If there
+	 * are still some bios in flight, we need to wait for them here.
+	 */
+	if (!atomic_dec_and_test(&ctx.in_flight))
+		wait_for_completion(&ctx.done);
+	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+			rw == WRITE ? "WRITE" : "READ",
+			count, jiffies - now);
 
-	if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
+	if (ctx.error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
-		drbd_chk_io_error(mdev, 1, TRUE);
-		err = -EIO;
+		drbd_chk_io_error(mdev, 1, true);
+		err = -EIO; /* ctx.error ? */
 	}
 
 	now = jiffies;
 	if (rw == WRITE) {
-		/* swap back endianness */
-		bm_lel_to_cpu(b);
-		/* flush bitmap to stable storage */
 		drbd_md_flush(mdev);
 	} else /* rw == READ */ {
-		/* just read, if necessary adjust endianness */
-		b->bm_set = bm_count_bits_swap_endian(b);
+		b->bm_set = bm_count_bits(b);
 		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
 		     jiffies - now);
 	}
@@ -874,112 +1092,128 @@
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ);
+	return bm_rw(mdev, READ, 0);
 }
 
 /**
  * drbd_bm_write() - Write the whole bitmap to its on disk location.
  * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE);
+	return bm_rw(mdev, WRITE, 0);
 }
 
 /**
- * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
+ * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
- * @enr:	Extent number in the resync lru (happens to be sector offset)
- *
- * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
- * by a single sector write. Therefore enr == sector offset from the
- * start of the bitmap.
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
  */
-int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
+int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
 {
-	sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
-				      + mdev->ldev->md.bm_offset;
-	int bm_words, num_words, offset;
-	int err = 0;
+	return bm_rw(mdev, WRITE, upper_idx);
+}
 
-	mutex_lock(&mdev->md_io_mutex);
-	bm_words  = drbd_bm_words(mdev);
-	offset    = S2W(enr);	/* word offset into bitmap */
-	num_words = min(S2W(1), bm_words - offset);
-	if (num_words < S2W(1))
-		memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
-	drbd_bm_get_lel(mdev, offset, num_words,
-			page_address(mdev->md_io_page));
-	if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
-		int i;
-		err = -EIO;
-		dev_err(DEV, "IO ERROR writing bitmap sector %lu "
-		    "(meta-disk sector %llus)\n",
-		    enr, (unsigned long long)on_disk_sector);
-		drbd_chk_io_error(mdev, 1, TRUE);
-		for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
-			drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
+
+/**
+ * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
+ * @mdev:	DRBD device.
+ * @idx:	bitmap page index
+ *
+ * We don't want to special case on logical_block_size of the backend device,
+ * so we submit PAGE_SIZE aligned pieces.
+ * Note that on "most" systems, PAGE_SIZE is 4k.
+ *
+ * In case this becomes an issue on systems with larger PAGE_SIZE,
+ * we may want to change this again to write 4k aligned 4k pieces.
+ */
+int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
+{
+	struct bm_aio_ctx ctx = {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
+		.flags = BM_AIO_COPY_PAGES,
+	};
+
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
 	}
+
+	bm_page_io_async(&ctx, idx, WRITE_SYNC);
+	wait_for_completion(&ctx.done);
+
+	if (ctx.error)
+		drbd_chk_io_error(mdev, 1, true);
+		/* that should force detach, so the in memory bitmap will be
+		 * gone in a moment as well. */
+
 	mdev->bm_writ_cnt++;
-	mutex_unlock(&mdev->md_io_mutex);
-	return err;
+	return ctx.error;
 }
 
 /* NOTE
  * find_first_bit returns int, we return unsigned long.
- * should not make much difference anyways, but ...
+ * For this to work on 32bit arch with bitnumbers > (1<<32),
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
  *
  * this returns a bit number, NOT a sector!
  */
-#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
 static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 	const int find_zero_bit, const enum km_type km)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long i = -1UL;
 	unsigned long *p_addr;
-	unsigned long bit_offset; /* bit offset of the mapped page. */
+	unsigned long bit_offset;
+	unsigned i;
+
 
 	if (bm_fo > b->bm_bits) {
 		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+		bm_fo = DRBD_END_OF_BITMAP;
 	} else {
 		while (bm_fo < b->bm_bits) {
-			unsigned long offset;
-			bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
-			offset = bit_offset >> LN2_BPL;    /* word offset of the page */
-			p_addr = __bm_map_paddr(b, offset, km);
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
 
 			if (find_zero_bit)
-				i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+				i = find_next_zero_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
 			else
-				i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
+				i = find_next_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
 
 			__bm_unmap(p_addr, km);
 			if (i < PAGE_SIZE*8) {
-				i = bit_offset + i;
-				if (i >= b->bm_bits)
+				bm_fo = bit_offset + i;
+				if (bm_fo >= b->bm_bits)
 					break;
 				goto found;
 			}
 			bm_fo = bit_offset + PAGE_SIZE*8;
 		}
-		i = -1UL;
+		bm_fo = DRBD_END_OF_BITMAP;
 	}
  found:
-	return i;
+	return bm_fo;
 }
 
 static unsigned long bm_find_next(struct drbd_conf *mdev,
 	unsigned long bm_fo, const int find_zero_bit)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long i = -1UL;
+	unsigned long i = DRBD_END_OF_BITMAP;
 
 	ERR_IF(!b) return i;
 	ERR_IF(!b->bm_pages) return i;
 
 	spin_lock_irq(&b->bm_lock);
-	if (bm_is_locked(b))
+	if (BM_DONT_TEST & b->bm_flags)
 		bm_print_lock_info(mdev);
 
 	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
@@ -1005,13 +1239,13 @@
  * you must take drbd_bm_lock() first */
 unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
 {
-	/* WARN_ON(!bm_is_locked(mdev)); */
+	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
 	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
 }
 
 unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
 {
-	/* WARN_ON(!bm_is_locked(mdev)); */
+	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
 	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
 }
 
@@ -1027,8 +1261,9 @@
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
-	unsigned long last_page_nr = -1UL;
+	unsigned int last_page_nr = -1U;
 	int c = 0;
+	int changed_total = 0;
 
 	if (e >= b->bm_bits) {
 		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
@@ -1036,23 +1271,33 @@
 		e = b->bm_bits ? b->bm_bits -1 : 0;
 	}
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
 		if (page_nr != last_page_nr) {
 			if (p_addr)
 				__bm_unmap(p_addr, km);
-			p_addr = __bm_map_paddr(b, offset, km);
+			if (c < 0)
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr, km);
 			last_page_nr = page_nr;
 		}
 		if (val)
-			c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
+			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
 		else
-			c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
+			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
 	}
 	if (p_addr)
 		__bm_unmap(p_addr, km);
-	b->bm_set += c;
-	return c;
+	if (c < 0)
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
 }
 
 /* returns number of bits actually changed.
@@ -1070,7 +1315,7 @@
 	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irqsave(&b->bm_lock, flags);
-	if (bm_is_locked(b))
+	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
 		bm_print_lock_info(mdev);
 
 	c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
@@ -1187,12 +1432,11 @@
 	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irqsave(&b->bm_lock, flags);
-	if (bm_is_locked(b))
+	if (BM_DONT_TEST & b->bm_flags)
 		bm_print_lock_info(mdev);
 	if (bitnr < b->bm_bits) {
-		unsigned long offset = bitnr>>LN2_BPL;
-		p_addr = bm_map_paddr(b, offset);
-		i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
 		bm_unmap(p_addr);
 	} else if (bitnr == b->bm_bits) {
 		i = -1;
@@ -1210,10 +1454,10 @@
 {
 	unsigned long flags;
 	struct drbd_bitmap *b = mdev->bitmap;
-	unsigned long *p_addr = NULL, page_nr = -1;
+	unsigned long *p_addr = NULL;
 	unsigned long bitnr;
+	unsigned int page_nr = -1U;
 	int c = 0;
-	size_t w;
 
 	/* If this is called without a bitmap, that is a bug.  But just to be
 	 * robust in case we screwed up elsewhere, in that case pretend there
@@ -1223,20 +1467,20 @@
 	ERR_IF(!b->bm_pages) return 1;
 
 	spin_lock_irqsave(&b->bm_lock, flags);
-	if (bm_is_locked(b))
+	if (BM_DONT_TEST & b->bm_flags)
 		bm_print_lock_info(mdev);
 	for (bitnr = s; bitnr <= e; bitnr++) {
-		w = bitnr >> LN2_BPL;
-		if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
-			page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {
+			page_nr = idx;
 			if (p_addr)
 				bm_unmap(p_addr);
-			p_addr = bm_map_paddr(b, w);
+			p_addr = bm_map_pidx(b, idx);
 		}
 		ERR_IF (bitnr >= b->bm_bits) {
 			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
 		} else {
-			c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+			c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
 		}
 	}
 	if (p_addr)
@@ -1271,7 +1515,7 @@
 	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irqsave(&b->bm_lock, flags);
-	if (bm_is_locked(b))
+	if (BM_DONT_TEST & b->bm_flags)
 		bm_print_lock_info(mdev);
 
 	s = S2W(enr);
@@ -1279,7 +1523,7 @@
 	count = 0;
 	if (s < b->bm_words) {
 		int n = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (n--)
 			count += hweight_long(*bm++);
@@ -1291,18 +1535,20 @@
 	return count;
 }
 
-/* set all bits covered by the AL-extent al_enr */
+/* Set all bits covered by the AL-extent al_enr.
+ * Returns number of bits changed. */
 unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr, *bm;
 	unsigned long weight;
-	int count, s, e, i, do_now;
+	unsigned long s, e;
+	int count, i, do_now;
 	ERR_IF(!b) return 0;
 	ERR_IF(!b->bm_pages) return 0;
 
 	spin_lock_irq(&b->bm_lock);
-	if (bm_is_locked(b))
+	if (BM_DONT_SET & b->bm_flags)
 		bm_print_lock_info(mdev);
 	weight = b->bm_set;
 
@@ -1314,7 +1560,7 @@
 	count = 0;
 	if (s < b->bm_words) {
 		i = do_now = e-s;
-		p_addr = bm_map_paddr(b, s);
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
 		bm = p_addr + MLPP(s);
 		while (i--) {
 			count += hweight_long(*bm);
@@ -1326,7 +1572,7 @@
 		if (e == b->bm_words)
 			b->bm_set -= bm_clear_surplus(b);
 	} else {
-		dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
+		dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
 	}
 	weight = b->bm_set - weight;
 	spin_unlock_irq(&b->bm_lock);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b0bd27d..81030d8 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -72,13 +72,6 @@
 extern char usermode_helper[];
 
 
-#ifndef TRUE
-#define TRUE 1
-#endif
-#ifndef FALSE
-#define FALSE 0
-#endif
-
 /* I don't remember why XCPU ...
  * This is used to wake the asender,
  * and to interrupt sending the sending task
@@ -104,6 +97,7 @@
 #define ID_SYNCER (-1ULL)
 #define ID_VACANT 0
 #define is_syncer_block_id(id) ((id) == ID_SYNCER)
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
 
 struct drbd_conf;
 
@@ -137,20 +131,19 @@
 	DRBD_FAULT_MAX,
 };
 
-#ifdef CONFIG_DRBD_FAULT_INJECTION
 extern unsigned int
 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
+
 static inline int
 drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
+#ifdef CONFIG_DRBD_FAULT_INJECTION
 	return fault_rate &&
 		(enable_faults & (1<<type)) &&
 		_drbd_insert_fault(mdev, type);
-}
-#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
-
 #else
-#define FAULT_ACTIVE(_m, _t) (0)
+	return 0;
 #endif
+}
 
 /* integer division, round _UP_ to the next integer */
 #define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
@@ -212,8 +205,10 @@
 	/* P_CKPT_FENCE_REQ      = 0x25, * currently reserved for protocol D */
 	/* P_CKPT_DISABLE_REQ    = 0x26, * currently reserved for protocol D */
 	P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
+	P_OUT_OF_SYNC         = 0x28, /* Mark as out of sync (Outrunning), data socket */
+	P_RS_CANCEL           = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
 
-	P_MAX_CMD	      = 0x28,
+	P_MAX_CMD	      = 0x2A,
 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAX_OPT_CMD	      = 0x101,
 
@@ -269,6 +264,7 @@
 		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
 		[P_COMPRESSED_BITMAP]   = "CBitmap",
 		[P_DELAY_PROBE]         = "DelayProbe",
+		[P_OUT_OF_SYNC]		= "OutOfSync",
 		[P_MAX_CMD]	        = NULL,
 	};
 
@@ -512,7 +508,7 @@
 	u64	    d_size;  /* size of disk */
 	u64	    u_size;  /* user requested size */
 	u64	    c_size;  /* current exported size */
-	u32	    max_segment_size;  /* Maximal size of a BIO */
+	u32	    max_bio_size;  /* Maximal size of a BIO */
 	u16	    queue_order_type;  /* not yet implemented in DRBD*/
 	u16	    dds_flags; /* use enum dds_flags here. */
 } __packed;
@@ -550,6 +546,13 @@
 	u32	    pad;
 } __packed;
 
+struct p_block_desc {
+	struct p_header80 head;
+	u64 sector;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
 /* Valid values for the encoding field.
  * Bump proto version when changing this. */
 enum drbd_bitmap_code {
@@ -647,6 +650,7 @@
         struct p_block_req       block_req;
 	struct p_delay_probe93   delay_probe93;
 	struct p_rs_uuid         rs_uuid;
+	struct p_block_desc      block_desc;
 } __packed;
 
 /**********************************************************************/
@@ -677,13 +681,6 @@
 	return thi->t_state;
 }
 
-
-/*
- * Having this as the first member of a struct provides sort of "inheritance".
- * "derived" structs can be "drbd_queue_work()"ed.
- * The callback should know and cast back to the descendant struct.
- * drbd_request and drbd_epoch_entry are descendants of drbd_work.
- */
 struct drbd_work;
 typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
 struct drbd_work {
@@ -712,9 +709,6 @@
 	 * starting a new epoch...
 	 */
 
-	/* up to here, the struct layout is identical to drbd_epoch_entry;
-	 * we might be able to use that to our advantage...  */
-
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
 	unsigned long rq_state; /* see comments above _req_mod() */
@@ -831,7 +825,7 @@
 	CRASHED_PRIMARY,	/* This node was a crashed primary.
 				 * Gets cleared when the state.conn
 				 * goes into C_CONNECTED state. */
-	WRITE_BM_AFTER_RESYNC,	/* A kmalloc() during resync failed */
+	NO_BARRIER_SUPP,	/* underlying block device doesn't implement barriers */
 	CONSIDER_RESYNC,
 
 	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
@@ -856,10 +850,37 @@
 	GOT_PING_ACK,		/* set when we receive a ping_ack packet, misc wait gets woken */
 	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
 	AL_SUSPENDED,		/* Activity logging is currently suspended. */
+	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
 
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+	/* do we need to kfree, or vfree bm_pages? */
+	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
+
+	/* currently locked for bulk operation */
+	BM_LOCKED_MASK = 0x7,
+
+	/* in detail, that is: */
+	BM_DONT_CLEAR = 0x1,
+	BM_DONT_SET   = 0x2,
+	BM_DONT_TEST  = 0x4,
+
+	/* (test bit, count bit) allowed (common case) */
+	BM_LOCKED_TEST_ALLOWED = 0x3,
+
+	/* testing bits, as well as setting new bits allowed, but clearing bits
+	 * would be unexpected.  Used during bitmap receive.  Setting new bits
+	 * requires sending of "out-of-sync" information, though. */
+	BM_LOCKED_SET_ALLOWED = 0x1,
+
+	/* clear is not expected while bitmap is locked for bulk operation */
+};
+
+
 /* TODO sort members for performance
  * MAYBE group them further */
 
@@ -925,6 +946,7 @@
 struct bm_io_work {
 	struct drbd_work w;
 	char *why;
+	enum bm_flag flags;
 	int (*io_fn)(struct drbd_conf *mdev);
 	void (*done)(struct drbd_conf *mdev, int rv);
 };
@@ -963,9 +985,12 @@
 	struct drbd_work  resync_work,
 			  unplug_work,
 			  go_diskless,
-			  md_sync_work;
+			  md_sync_work,
+			  start_resync_work;
 	struct timer_list resync_timer;
 	struct timer_list md_sync_timer;
+	struct timer_list start_resync_timer;
+	struct timer_list request_timer;
 #ifdef DRBD_DEBUG_MD_SYNC
 	struct {
 		unsigned int line;
@@ -1000,9 +1025,9 @@
 	struct hlist_head *tl_hash;
 	unsigned int tl_hash_s;
 
-	/* blocks to sync in this run [unit BM_BLOCK_SIZE] */
+	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
 	unsigned long rs_total;
-	/* number of sync IOs that failed in this run */
+	/* number of resync blocks that failed in this run */
 	unsigned long rs_failed;
 	/* Syncer's start time [unit jiffies] */
 	unsigned long rs_start;
@@ -1102,6 +1127,7 @@
 	struct fifo_buffer rs_plan_s; /* correction values of resync planer */
 	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
 	int rs_planed;    /* resync sectors already planed */
+	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
 };
 
 static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1163,14 +1189,19 @@
 };
 
 extern void drbd_init_set_defaults(struct drbd_conf *mdev);
-extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
-			union drbd_state mask, union drbd_state val);
+extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
+					    enum chg_state_flags f,
+					    union drbd_state mask,
+					    union drbd_state val);
 extern void drbd_force_state(struct drbd_conf *, union drbd_state,
 			union drbd_state);
-extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
-			union drbd_state, enum chg_state_flags);
-extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
-			    enum chg_state_flags, struct completion *done);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
+					      union drbd_state,
+					      union drbd_state,
+					      enum chg_state_flags);
+extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
+					   enum chg_state_flags,
+					   struct completion *done);
 extern void print_st_err(struct drbd_conf *, union drbd_state,
 			union drbd_state, int);
 extern int  drbd_thread_start(struct drbd_thread *thi);
@@ -1195,7 +1226,7 @@
 extern int drbd_send_protocol(struct drbd_conf *mdev);
 extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
-extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
+extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
 extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
 extern int _drbd_send_state(struct drbd_conf *mdev);
 extern int drbd_send_state(struct drbd_conf *mdev);
@@ -1220,11 +1251,10 @@
 			struct p_data *dp, int data_size);
 extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
 			    sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
 extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
 			   struct drbd_epoch_entry *e);
 extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
-extern int _drbd_send_barrier(struct drbd_conf *mdev,
-			struct drbd_tl_epoch *barrier);
 extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
 			      sector_t sector, int size, u64 block_id);
 extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
@@ -1235,14 +1265,13 @@
 
 extern int drbd_send_bitmap(struct drbd_conf *mdev);
 extern int _drbd_send_bitmap(struct drbd_conf *mdev);
-extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
+extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
 extern void drbd_free_bc(struct drbd_backing_dev *ldev);
 extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
+void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
 
-/* drbd_meta-data.c (still in drbd_main.c) */
 extern void drbd_md_sync(struct drbd_conf *mdev);
 extern int  drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
-/* maybe define them below as inline? */
 extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
 extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
 extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
@@ -1261,10 +1290,12 @@
 extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
 				 int (*io_fn)(struct drbd_conf *),
 				 void (*done)(struct drbd_conf *, int),
-				 char *why);
+				 char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_conf *mdev,
+		int (*io_fn)(struct drbd_conf *),
+		char *why, enum bm_flag flags);
 extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
 extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
-extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
 extern void drbd_go_diskless(struct drbd_conf *mdev);
 extern void drbd_ldev_destroy(struct drbd_conf *mdev);
 
@@ -1313,6 +1344,7 @@
 
 #define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
 #define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY   2  /* finish resync IO on this extent ASAP! App IO waiting! */
 
 /* drbd_bitmap.c */
 /*
@@ -1390,7 +1422,9 @@
  * you should use 64bit OS for that much storage, anyways. */
 #define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
 #else
-#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
+/* corresponds to (1UL << 38) bits right now. */
 #endif
 #endif
 
@@ -1398,7 +1432,7 @@
  * With a value of 8 all IO in one 128K block make it to the same slot of the
  * hash table. */
 #define HT_SHIFT 8
-#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
 
 #define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
 
@@ -1410,16 +1444,20 @@
 extern void drbd_bm_cleanup(struct drbd_conf *mdev);
 extern void drbd_bm_set_all(struct drbd_conf *mdev);
 extern void drbd_bm_clear_all(struct drbd_conf *mdev);
+/* set/clear/test only a few bits at a time */
 extern int  drbd_bm_set_bits(
 		struct drbd_conf *mdev, unsigned long s, unsigned long e);
 extern int  drbd_bm_clear_bits(
 		struct drbd_conf *mdev, unsigned long s, unsigned long e);
-/* bm_set_bits variant for use while holding drbd_bm_lock */
+extern int drbd_bm_count_bits(
+	struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
 extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
 		const unsigned long s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
-extern int  drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
+extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
@@ -1427,6 +1465,8 @@
 extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
 extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
 extern sector_t      drbd_bm_capacity(struct drbd_conf *mdev);
+
+#define DRBD_END_OF_BITMAP	(~(unsigned long)0)
 extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
 /* bm_find_next variants for use while you hold drbd_bm_lock() */
 extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
@@ -1437,14 +1477,12 @@
 /* for receive_bitmap */
 extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
-/* for _drbd_send_bitmap and drbd_bm_write_sect */
+/* for _drbd_send_bitmap */
 extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
 		size_t number, unsigned long *buffer);
 
-extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
+extern void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags);
 extern void drbd_bm_unlock(struct drbd_conf *mdev);
-
-extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
 /* drbd_main.c */
 
 extern struct kmem_cache *drbd_request_cache;
@@ -1467,7 +1505,7 @@
 extern int proc_details;
 
 /* drbd_req */
-extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
+extern int drbd_make_request(struct request_queue *q, struct bio *bio);
 extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
 extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
@@ -1482,8 +1520,9 @@
 extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_conf *);
 extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
-extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
-		int force);
+extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
+					enum drbd_role new_role,
+					int force);
 extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
 extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
 extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
@@ -1499,6 +1538,7 @@
 extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
 		struct drbd_backing_dev *bdev, sector_t sector, int rw);
 extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
+extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
 
 static inline void ov_oos_print(struct drbd_conf *mdev)
 {
@@ -1522,21 +1562,23 @@
 extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
 extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
-extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
 extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
 extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
 extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
 extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
+extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
+extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
 
 extern void resync_timer_fn(unsigned long data);
+extern void start_resync_timer_fn(unsigned long data);
 
 /* drbd_receiver.c */
-extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
 extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 		const unsigned rw, const int fault_type);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1619,16 +1661,16 @@
 extern void drbd_rs_failed_io(struct drbd_conf *mdev,
 		sector_t sector, int size);
 extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
+extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
 extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
 		int size, const char *file, const unsigned int line);
 #define drbd_set_in_sync(mdev, sector, size) \
 	__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
-extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
+extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
 		int size, const char *file, const unsigned int line);
 #define drbd_set_out_of_sync(mdev, sector, size) \
 	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
 extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
-extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
 extern void drbd_al_shrink(struct drbd_conf *mdev);
 
 
@@ -1747,11 +1789,11 @@
 	wake_up(&mdev->misc_wait);
 }
 
-static inline int _drbd_set_state(struct drbd_conf *mdev,
-				   union drbd_state ns, enum chg_state_flags flags,
-				   struct completion *done)
+static inline enum drbd_state_rv
+_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
+		enum chg_state_flags flags, struct completion *done)
 {
-	int rv;
+	enum drbd_state_rv rv;
 
 	read_lock(&global_state_lock);
 	rv = __drbd_set_state(mdev, ns, flags, done);
@@ -1982,17 +2024,17 @@
 
 static inline void drbd_thread_stop(struct drbd_thread *thi)
 {
-	_drbd_thread_stop(thi, FALSE, TRUE);
+	_drbd_thread_stop(thi, false, true);
 }
 
 static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
 {
-	_drbd_thread_stop(thi, FALSE, FALSE);
+	_drbd_thread_stop(thi, false, false);
 }
 
 static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
 {
-	_drbd_thread_stop(thi, TRUE, FALSE);
+	_drbd_thread_stop(thi, true, false);
 }
 
 /* counts how many answer packets packets we expect from our peer,
@@ -2146,17 +2188,18 @@
 static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
 		unsigned long *bits_left, unsigned int *per_mil_done)
 {
-	/*
-	 * this is to break it at compile time when we change that
-	 * (we may feel 4TB maximum storage per drbd is not enough)
-	 */
+	/* this is to break it at compile time when we change that, in case we
+	 * want to support more than (1<<32) bits on a 32bit arch. */
 	typecheck(unsigned long, mdev->rs_total);
 
 	/* note: both rs_total and rs_left are in bits, i.e. in
 	 * units of BM_BLOCK_SIZE.
 	 * for the percentage, we don't care. */
 
-	*bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+	if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+		*bits_left = mdev->ov_left;
+	else
+		*bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
 	/* >> 10 to prevent overflow,
 	 * +1 to prevent division by zero */
 	if (*bits_left > mdev->rs_total) {
@@ -2171,10 +2214,19 @@
 				*bits_left, mdev->rs_total, mdev->rs_failed);
 		*per_mil_done = 0;
 	} else {
-		/* make sure the calculation happens in long context */
-		unsigned long tmp = 1000UL -
-				(*bits_left >> 10)*1000UL
-				/ ((mdev->rs_total >> 10) + 1UL);
+		/* Make sure the division happens in long context.
+		 * We allow up to one petabyte storage right now,
+		 * at a granularity of 4k per bit that is 2**38 bits.
+		 * After shift right and multiplication by 1000,
+		 * this should still fit easily into a 32bit long,
+		 * so we don't need a 64bit division on 32bit arch.
+		 * Note: currently we don't support such large bitmaps on 32bit
+		 * arch anyways, but no harm done to be prepared for it here.
+		 */
+		unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
+		unsigned long left = *bits_left >> shift;
+		unsigned long total = 1UL + (mdev->rs_total >> shift);
+		unsigned long tmp = 1000UL - left * 1000UL/total;
 		*per_mil_done = tmp;
 	}
 }
@@ -2193,8 +2245,9 @@
 	return mxb;
 }
 
-static inline int drbd_state_is_stable(union drbd_state s)
+static inline int drbd_state_is_stable(struct drbd_conf *mdev)
 {
+	union drbd_state s = mdev->state;
 
 	/* DO NOT add a default clause, we want the compiler to warn us
 	 * for any newly introduced state we may have forgotten to add here */
@@ -2211,11 +2264,9 @@
 	case C_VERIFY_T:
 	case C_PAUSED_SYNC_S:
 	case C_PAUSED_SYNC_T:
-		/* maybe stable, look at the disk state */
-		break;
-
-	/* no new io accepted during tansitional states
-	 * like handshake or teardown */
+	case C_AHEAD:
+	case C_BEHIND:
+		/* transitional states, IO allowed */
 	case C_DISCONNECTING:
 	case C_UNCONNECTED:
 	case C_TIMEOUT:
@@ -2226,7 +2277,15 @@
 	case C_WF_REPORT_PARAMS:
 	case C_STARTING_SYNC_S:
 	case C_STARTING_SYNC_T:
+		break;
+
+		/* Allow IO in BM exchange states with new protocols */
 	case C_WF_BITMAP_S:
+		if (mdev->agreed_pro_version < 96)
+			return 0;
+		break;
+
+		/* no new io accepted in these states */
 	case C_WF_BITMAP_T:
 	case C_WF_SYNC_UUID:
 	case C_MASK:
@@ -2261,41 +2320,47 @@
 	return s.susp || s.susp_nod || s.susp_fen;
 }
 
-static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
+static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
 {
 	int mxb = drbd_get_max_buffers(mdev);
 
 	if (is_susp(mdev->state))
-		return 0;
+		return false;
 	if (test_bit(SUSPEND_IO, &mdev->flags))
-		return 0;
+		return false;
 
 	/* to avoid potential deadlock or bitmap corruption,
 	 * in various places, we only allow new application io
 	 * to start during "stable" states. */
 
 	/* no new io accepted when attaching or detaching the disk */
-	if (!drbd_state_is_stable(mdev->state))
-		return 0;
+	if (!drbd_state_is_stable(mdev))
+		return false;
 
 	/* since some older kernels don't have atomic_add_unless,
 	 * and we are within the spinlock anyways, we have this workaround.  */
 	if (atomic_read(&mdev->ap_bio_cnt) > mxb)
-		return 0;
+		return false;
 	if (test_bit(BITMAP_IO, &mdev->flags))
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
-/* I'd like to use wait_event_lock_irq,
- * but I'm not sure when it got introduced,
- * and not sure when it has 3 or 4 arguments */
+static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
+{
+	bool rv = false;
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = may_inc_ap_bio(mdev);
+	if (rv)
+		atomic_add(count, &mdev->ap_bio_cnt);
+	spin_unlock_irq(&mdev->req_lock);
+
+	return rv;
+}
+
 static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
 {
-	/* compare with after_state_ch,
-	 * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
-	DEFINE_WAIT(wait);
-
 	/* we wait here
 	 *    as long as the device is suspended
 	 *    until the bitmap is no longer on the fly during connection
@@ -2304,16 +2369,7 @@
 	 * to avoid races with the reconnect code,
 	 * we need to atomic_inc within the spinlock. */
 
-	spin_lock_irq(&mdev->req_lock);
-	while (!__inc_ap_bio_cond(mdev)) {
-		prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock_irq(&mdev->req_lock);
-		schedule();
-		finish_wait(&mdev->misc_wait, &wait);
-		spin_lock_irq(&mdev->req_lock);
-	}
-	atomic_add(count, &mdev->ap_bio_cnt);
-	spin_unlock_irq(&mdev->req_lock);
+	wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
 }
 
 static inline void dec_ap_bio(struct drbd_conf *mdev)
@@ -2333,9 +2389,11 @@
 	}
 }
 
-static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
+static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
 {
+	int changed = mdev->ed_uuid != val;
 	mdev->ed_uuid = val;
+	return changed;
 }
 
 static inline int seq_cmp(u32 a, u32 b)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8a43ce0..dfc85f3 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -85,7 +85,8 @@
 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
 MODULE_VERSION(REL_VERSION);
 MODULE_LICENSE("GPL");
-MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
+MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
+		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
 
 #include <linux/moduleparam.h>
@@ -115,7 +116,7 @@
 #endif
 
 /* module parameter, defined */
-unsigned int minor_count = 32;
+unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
 int disable_sendpage;
 int allow_oos;
 unsigned int cn_idx = CN_IDX_DRBD;
@@ -335,6 +336,7 @@
 	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 }
 
+
 /**
  * _tl_restart() - Walks the transfer log, and applies an action to all requests
  * @mdev:	DRBD device.
@@ -456,7 +458,7 @@
 }
 
 /**
- * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
  * @mdev:	DRBD device.
  * @os:		old (current) state.
  * @ns:		new (wanted) state.
@@ -473,12 +475,13 @@
 		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
 }
 
-int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
-		      union drbd_state mask, union drbd_state val)
+enum drbd_state_rv
+drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+		  union drbd_state mask, union drbd_state val)
 {
 	unsigned long flags;
 	union drbd_state os, ns;
-	int rv;
+	enum drbd_state_rv rv;
 
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	os = mdev->state;
@@ -502,20 +505,22 @@
 	drbd_change_state(mdev, CS_HARD, mask, val);
 }
 
-static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
-static int is_valid_state_transition(struct drbd_conf *,
-				     union drbd_state, union drbd_state);
+static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
+static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
+						    union drbd_state,
+						    union drbd_state);
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
 				       union drbd_state ns, const char **warn_sync_abort);
 int drbd_send_state_req(struct drbd_conf *,
 			union drbd_state, union drbd_state);
 
-static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
-				    union drbd_state mask, union drbd_state val)
+static enum drbd_state_rv
+_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
+	     union drbd_state val)
 {
 	union drbd_state os, ns;
 	unsigned long flags;
-	int rv;
+	enum drbd_state_rv rv;
 
 	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
 		return SS_CW_SUCCESS;
@@ -536,7 +541,7 @@
 		if (rv == SS_SUCCESS) {
 			rv = is_valid_state_transition(mdev, ns, os);
 			if (rv == SS_SUCCESS)
-				rv = 0; /* cont waiting, otherwise fail. */
+				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
 		}
 	}
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
@@ -554,14 +559,14 @@
  * Should not be called directly, use drbd_request_state() or
  * _drbd_request_state().
  */
-static int drbd_req_state(struct drbd_conf *mdev,
-			  union drbd_state mask, union drbd_state val,
-			  enum chg_state_flags f)
+static enum drbd_state_rv
+drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
+	       union drbd_state val, enum chg_state_flags f)
 {
 	struct completion done;
 	unsigned long flags;
 	union drbd_state os, ns;
-	int rv;
+	enum drbd_state_rv rv;
 
 	init_completion(&done);
 
@@ -636,10 +641,11 @@
  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
  * flag, or when logging of failed state change requests is not desired.
  */
-int _drbd_request_state(struct drbd_conf *mdev,	union drbd_state mask,
-			union drbd_state val,	enum chg_state_flags f)
+enum drbd_state_rv
+_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
+		    union drbd_state val, enum chg_state_flags f)
 {
-	int rv;
+	enum drbd_state_rv rv;
 
 	wait_event(mdev->state_wait,
 		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
@@ -663,8 +669,8 @@
 	    );
 }
 
-void print_st_err(struct drbd_conf *mdev,
-	union drbd_state os, union drbd_state ns, int err)
+void print_st_err(struct drbd_conf *mdev, union drbd_state os,
+	          union drbd_state ns, enum drbd_state_rv err)
 {
 	if (err == SS_IN_TRANSIENT_STATE)
 		return;
@@ -674,32 +680,18 @@
 }
 
 
-#define drbd_peer_str drbd_role_str
-#define drbd_pdsk_str drbd_disk_str
-
-#define drbd_susp_str(A)     ((A) ? "1" : "0")
-#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
-#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
-#define drbd_user_isp_str(A) ((A) ? "1" : "0")
-
-#define PSC(A) \
-	({ if (ns.A != os.A) { \
-		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
-			      drbd_##A##_str(os.A), \
-			      drbd_##A##_str(ns.A)); \
-	} })
-
 /**
  * is_valid_state() - Returns an SS_ error code if ns is not valid
  * @mdev:	DRBD device.
  * @ns:		State to consider.
  */
-static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
+static enum drbd_state_rv
+is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
 {
 	/* See drbd_state_sw_errors in drbd_strings.c */
 
 	enum drbd_fencing_p fp;
-	int rv = SS_SUCCESS;
+	enum drbd_state_rv rv = SS_SUCCESS;
 
 	fp = FP_DONT_CARE;
 	if (get_ldev(mdev)) {
@@ -762,10 +754,11 @@
  * @ns:		new state.
  * @os:		old state.
  */
-static int is_valid_state_transition(struct drbd_conf *mdev,
-				     union drbd_state ns, union drbd_state os)
+static enum drbd_state_rv
+is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
+			  union drbd_state os)
 {
-	int rv = SS_SUCCESS;
+	enum drbd_state_rv rv = SS_SUCCESS;
 
 	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
 	    os.conn > C_CONNECTED)
@@ -800,6 +793,10 @@
 	    os.conn < C_CONNECTED)
 		rv = SS_NEED_CONNECTION;
 
+	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+	    && os.conn < C_WF_REPORT_PARAMS)
+		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
 	return rv;
 }
 
@@ -817,6 +814,7 @@
 				       union drbd_state ns, const char **warn_sync_abort)
 {
 	enum drbd_fencing_p fp;
+	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
 
 	fp = FP_DONT_CARE;
 	if (get_ldev(mdev)) {
@@ -869,56 +867,6 @@
 		ns.conn = C_CONNECTED;
 	}
 
-	if (ns.conn >= C_CONNECTED &&
-	    ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
-	     (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
-		switch (ns.conn) {
-		case C_WF_BITMAP_T:
-		case C_PAUSED_SYNC_T:
-			ns.disk = D_OUTDATED;
-			break;
-		case C_CONNECTED:
-		case C_WF_BITMAP_S:
-		case C_SYNC_SOURCE:
-		case C_PAUSED_SYNC_S:
-			ns.disk = D_UP_TO_DATE;
-			break;
-		case C_SYNC_TARGET:
-			ns.disk = D_INCONSISTENT;
-			dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
-			break;
-		}
-		if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
-			dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
-	}
-
-	if (ns.conn >= C_CONNECTED &&
-	    (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
-		switch (ns.conn) {
-		case C_CONNECTED:
-		case C_WF_BITMAP_T:
-		case C_PAUSED_SYNC_T:
-		case C_SYNC_TARGET:
-			ns.pdsk = D_UP_TO_DATE;
-			break;
-		case C_WF_BITMAP_S:
-		case C_PAUSED_SYNC_S:
-			/* remap any consistent state to D_OUTDATED,
-			 * but disallow "upgrade" of not even consistent states.
-			 */
-			ns.pdsk =
-				(D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
-				? os.pdsk : D_OUTDATED;
-			break;
-		case C_SYNC_SOURCE:
-			ns.pdsk = D_INCONSISTENT;
-			dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
-			break;
-		}
-		if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
-			dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
-	}
-
 	/* Connection breaks down before we finished "Negotiating" */
 	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
 	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -933,6 +881,94 @@
 		put_ldev(mdev);
 	}
 
+	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+			ns.disk = D_UP_TO_DATE;
+		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+			ns.pdsk = D_UP_TO_DATE;
+	}
+
+	/* Implications of the connection stat on the disk states */
+	disk_min = D_DISKLESS;
+	disk_max = D_UP_TO_DATE;
+	pdsk_min = D_INCONSISTENT;
+	pdsk_max = D_UNKNOWN;
+	switch ((enum drbd_conns)ns.conn) {
+	case C_WF_BITMAP_T:
+	case C_PAUSED_SYNC_T:
+	case C_STARTING_SYNC_T:
+	case C_WF_SYNC_UUID:
+	case C_BEHIND:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_OUTDATED;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_CONNECTED:
+		disk_min = D_DISKLESS;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_DISKLESS;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_WF_BITMAP_S:
+	case C_PAUSED_SYNC_S:
+	case C_STARTING_SYNC_S:
+	case C_AHEAD:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+		break;
+	case C_SYNC_TARGET:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_INCONSISTENT;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_SYNC_SOURCE:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_INCONSISTENT;
+		break;
+	case C_STANDALONE:
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_CONNECTION:
+	case C_WF_REPORT_PARAMS:
+	case C_MASK:
+		break;
+	}
+	if (ns.disk > disk_max)
+		ns.disk = disk_max;
+
+	if (ns.disk < disk_min) {
+		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
+			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
+		ns.disk = disk_min;
+	}
+	if (ns.pdsk > pdsk_max)
+		ns.pdsk = pdsk_max;
+
+	if (ns.pdsk < pdsk_min) {
+		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
+			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
+		ns.pdsk = pdsk_min;
+	}
+
 	if (fp == FP_STONITH &&
 	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
 	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
@@ -961,6 +997,10 @@
 /* helper for __drbd_set_state */
 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
 {
+	if (mdev->agreed_pro_version < 90)
+		mdev->ov_start_sector = 0;
+	mdev->rs_total = drbd_bm_bits(mdev);
+	mdev->ov_position = 0;
 	if (cs == C_VERIFY_T) {
 		/* starting online verify from an arbitrary position
 		 * does not fit well into the existing protocol.
@@ -970,11 +1010,15 @@
 		mdev->ov_start_sector = ~(sector_t)0;
 	} else {
 		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
-		if (bit >= mdev->rs_total)
+		if (bit >= mdev->rs_total) {
 			mdev->ov_start_sector =
 				BM_BIT_TO_SECT(mdev->rs_total - 1);
+			mdev->rs_total = 1;
+		} else
+			mdev->rs_total -= bit;
 		mdev->ov_position = mdev->ov_start_sector;
 	}
+	mdev->ov_left = mdev->rs_total;
 }
 
 static void drbd_resume_al(struct drbd_conf *mdev)
@@ -992,12 +1036,12 @@
  *
  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
  */
-int __drbd_set_state(struct drbd_conf *mdev,
-		    union drbd_state ns, enum chg_state_flags flags,
-		    struct completion *done)
+enum drbd_state_rv
+__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
+	         enum chg_state_flags flags, struct completion *done)
 {
 	union drbd_state os;
-	int rv = SS_SUCCESS;
+	enum drbd_state_rv rv = SS_SUCCESS;
 	const char *warn_sync_abort = NULL;
 	struct after_state_chg_work *ascw;
 
@@ -1033,22 +1077,46 @@
 		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
 
 	{
-		char *pbp, pb[300];
-		pbp = pb;
-		*pbp = 0;
-		PSC(role);
-		PSC(peer);
-		PSC(conn);
-		PSC(disk);
-		PSC(pdsk);
-		if (is_susp(ns) != is_susp(os))
-			pbp += sprintf(pbp, "susp( %s -> %s ) ",
-				       drbd_susp_str(is_susp(os)),
-				       drbd_susp_str(is_susp(ns)));
-		PSC(aftr_isp);
-		PSC(peer_isp);
-		PSC(user_isp);
-		dev_info(DEV, "%s\n", pb);
+	char *pbp, pb[300];
+	pbp = pb;
+	*pbp = 0;
+	if (ns.role != os.role)
+		pbp += sprintf(pbp, "role( %s -> %s ) ",
+			       drbd_role_str(os.role),
+			       drbd_role_str(ns.role));
+	if (ns.peer != os.peer)
+		pbp += sprintf(pbp, "peer( %s -> %s ) ",
+			       drbd_role_str(os.peer),
+			       drbd_role_str(ns.peer));
+	if (ns.conn != os.conn)
+		pbp += sprintf(pbp, "conn( %s -> %s ) ",
+			       drbd_conn_str(os.conn),
+			       drbd_conn_str(ns.conn));
+	if (ns.disk != os.disk)
+		pbp += sprintf(pbp, "disk( %s -> %s ) ",
+			       drbd_disk_str(os.disk),
+			       drbd_disk_str(ns.disk));
+	if (ns.pdsk != os.pdsk)
+		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+			       drbd_disk_str(os.pdsk),
+			       drbd_disk_str(ns.pdsk));
+	if (is_susp(ns) != is_susp(os))
+		pbp += sprintf(pbp, "susp( %d -> %d ) ",
+			       is_susp(os),
+			       is_susp(ns));
+	if (ns.aftr_isp != os.aftr_isp)
+		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+			       os.aftr_isp,
+			       ns.aftr_isp);
+	if (ns.peer_isp != os.peer_isp)
+		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+			       os.peer_isp,
+			       ns.peer_isp);
+	if (ns.user_isp != os.user_isp)
+		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+			       os.user_isp,
+			       ns.user_isp);
+	dev_info(DEV, "%s\n", pb);
 	}
 
 	/* solve the race between becoming unconfigured,
@@ -1074,6 +1142,10 @@
 		atomic_inc(&mdev->local_cnt);
 
 	mdev->state = ns;
+
+	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+		drbd_print_uuids(mdev, "attached to UUIDs");
+
 	wake_up(&mdev->misc_wait);
 	wake_up(&mdev->state_wait);
 
@@ -1081,7 +1153,7 @@
 	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
 	    ns.conn < C_CONNECTED) {
 		mdev->ov_start_sector =
-			BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
+			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
 		dev_info(DEV, "Online Verify reached sector %llu\n",
 			(unsigned long long)mdev->ov_start_sector);
 	}
@@ -1106,14 +1178,7 @@
 		unsigned long now = jiffies;
 		int i;
 
-		mdev->ov_position = 0;
-		mdev->rs_total = drbd_bm_bits(mdev);
-		if (mdev->agreed_pro_version >= 90)
-			set_ov_position(mdev, ns.conn);
-		else
-			mdev->ov_start_sector = 0;
-		mdev->ov_left = mdev->rs_total
-			      - BM_SECT_TO_BIT(mdev->ov_position);
+		set_ov_position(mdev, ns.conn);
 		mdev->rs_start = now;
 		mdev->rs_last_events = 0;
 		mdev->rs_last_sect_ev = 0;
@@ -1121,10 +1186,12 @@
 		mdev->ov_last_oos_start = 0;
 
 		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
-			mdev->rs_mark_left[i] = mdev->rs_total;
+			mdev->rs_mark_left[i] = mdev->ov_left;
 			mdev->rs_mark_time[i] = now;
 		}
 
+		drbd_rs_controller_reset(mdev);
+
 		if (ns.conn == C_VERIFY_S) {
 			dev_info(DEV, "Starting Online Verify from sector %llu\n",
 					(unsigned long long)mdev->ov_position);
@@ -1228,6 +1295,26 @@
 	}
 }
 
+int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+		int (*io_fn)(struct drbd_conf *),
+		char *why, enum bm_flag flags)
+{
+	int rv;
+
+	D_ASSERT(current == mdev->worker.task);
+
+	/* open coded non-blocking drbd_suspend_io(mdev); */
+	set_bit(SUSPEND_IO, &mdev->flags);
+
+	drbd_bm_lock(mdev, why, flags);
+	rv = io_fn(mdev);
+	drbd_bm_unlock(mdev);
+
+	drbd_resume_io(mdev);
+
+	return rv;
+}
+
 /**
  * after_state_ch() - Perform after state change actions that may sleep
  * @mdev:	DRBD device.
@@ -1266,16 +1353,14 @@
 
 	nsm.i = -1;
 	if (ns.susp_nod) {
-		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
-			if (ns.conn == C_CONNECTED)
-				what = resend, nsm.susp_nod = 0;
-			else /* ns.conn > C_CONNECTED */
-				dev_err(DEV, "Unexpected Resynd going on!\n");
-		}
+		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+			what = resend;
 
 		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
-			what = restart_frozen_disk_io, nsm.susp_nod = 0;
+			what = restart_frozen_disk_io;
 
+		if (what != nothing)
+			nsm.susp_nod = 0;
 	}
 
 	if (ns.susp_fen) {
@@ -1306,13 +1391,30 @@
 		spin_unlock_irq(&mdev->req_lock);
 	}
 
+	/* Became sync source.  With protocol >= 96, we still need to send out
+	 * the sync uuid now. Need to do that before any drbd_send_state, or
+	 * the other side may go "paused sync" before receiving the sync uuids,
+	 * which is unexpected. */
+	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
+		drbd_gen_and_send_sync_uuid(mdev);
+		put_ldev(mdev);
+	}
+
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
 		drbd_send_uuids(mdev);
 		drbd_send_state(mdev);
 	}
-	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
-		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
+	/* No point in queuing send_bitmap if we don't have a connection
+	 * anymore, so check also the _current_ state, not only the new state
+	 * at the time this work was queued. */
+	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+	    mdev->state.conn == C_WF_BITMAP_S)
+		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
+				"send_bitmap (WFBitMapS)",
+				BM_LOCKED_TEST_ALLOWED);
 
 	/* Lost contact to peer's copy of the data */
 	if ((os.pdsk >= D_INCONSISTENT &&
@@ -1343,7 +1445,23 @@
 
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
-			drbd_al_to_on_disk_bm(mdev);
+			/* We may still be Primary ourselves.
+			 * No harm done if the bitmap still changes,
+			 * redirtied pages will follow later. */
+			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
+		put_ldev(mdev);
+	}
+
+	/* Write out all changed bits on demote.
+	 * Though, no need to da that just yet
+	 * if there is a resync going on still */
+	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+		mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
+		/* No changes to the bitmap expected this time, so assert that,
+		 * even though no harm was done if it did change. */
+		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+				"demote", BM_LOCKED_TEST_ALLOWED);
 		put_ldev(mdev);
 	}
 
@@ -1371,15 +1489,23 @@
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
 		drbd_send_state(mdev);
 
+	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+		drbd_send_state(mdev);
+
 	/* We are in the progress to start a full sync... */
 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
 	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
-		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
+		/* no other bitmap changes expected during this phase */
+		drbd_queue_bitmap_io(mdev,
+			&drbd_bmio_set_n_write, &abw_start_sync,
+			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
 
 	/* We are invalidating our self... */
 	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
 	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
-		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
+		/* other bitmap operation expected during this phase */
+		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
+			"set_n_write from invalidate", BM_LOCKED_MASK);
 
 	/* first half of local IO error, failure to attach,
 	 * or administrative detach */
@@ -1434,8 +1560,6 @@
 
 		if (drbd_send_state(mdev))
 			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
-		else
-			dev_err(DEV, "Sending state for being diskless failed\n");
 		/* corresponding get_ldev in __drbd_set_state
 		 * this may finaly trigger drbd_ldev_destroy. */
 		put_ldev(mdev);
@@ -1459,6 +1583,19 @@
 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
 		drbd_send_state(mdev);
 
+	/* This triggers bitmap writeout of potentially still unwritten pages
+	 * if the resync finished cleanly, or aborted because of peer disk
+	 * failure, or because of connection loss.
+	 * For resync aborted because of local disk failure, we cannot do
+	 * any bitmap writeout anymore.
+	 * No harm done if some bits change during this phase.
+	 */
+	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
+			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
+		put_ldev(mdev);
+	}
+
 	/* free tl_hash if we Got thawed and are C_STANDALONE */
 	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
 		drbd_free_tl_hash(mdev);
@@ -1559,7 +1696,7 @@
 		if (!try_module_get(THIS_MODULE)) {
 			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
 			spin_unlock_irqrestore(&thi->t_lock, flags);
-			return FALSE;
+			return false;
 		}
 
 		init_completion(&thi->stop);
@@ -1576,7 +1713,7 @@
 			dev_err(DEV, "Couldn't start thread\n");
 
 			module_put(THIS_MODULE);
-			return FALSE;
+			return false;
 		}
 		spin_lock_irqsave(&thi->t_lock, flags);
 		thi->task = nt;
@@ -1596,7 +1733,7 @@
 		break;
 	}
 
-	return TRUE;
+	return true;
 }
 
 
@@ -1694,8 +1831,8 @@
 {
 	int sent, ok;
 
-	ERR_IF(!h) return FALSE;
-	ERR_IF(!size) return FALSE;
+	ERR_IF(!h) return false;
+	ERR_IF(!size) return false;
 
 	h->magic   = BE_DRBD_MAGIC;
 	h->command = cpu_to_be16(cmd);
@@ -1704,8 +1841,8 @@
 	sent = drbd_send(mdev, sock, h, size, msg_flags);
 
 	ok = (sent == size);
-	if (!ok)
-		dev_err(DEV, "short sent %s size=%d sent=%d\n",
+	if (!ok && !signal_pending(current))
+		dev_warn(DEV, "short sent %s size=%d sent=%d\n",
 		    cmdname(cmd), (int)size, sent);
 	return ok;
 }
@@ -1840,7 +1977,7 @@
 		else {
 			dev_err(DEV, "--dry-run is not supported by peer");
 			kfree(p);
-			return 0;
+			return -1;
 		}
 	}
 	p->conn_flags    = cpu_to_be32(cf);
@@ -1888,12 +2025,36 @@
 	return _drbd_send_uuids(mdev, 8);
 }
 
+void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
+{
+	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		u64 *uuid = mdev->ldev->md.uuid;
+		dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
+		     text,
+		     (unsigned long long)uuid[UI_CURRENT],
+		     (unsigned long long)uuid[UI_BITMAP],
+		     (unsigned long long)uuid[UI_HISTORY_START],
+		     (unsigned long long)uuid[UI_HISTORY_END]);
+		put_ldev(mdev);
+	} else {
+		dev_info(DEV, "%s effective data uuid: %016llX\n",
+				text,
+				(unsigned long long)mdev->ed_uuid);
+	}
+}
 
-int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
+int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
 {
 	struct p_rs_uuid p;
+	u64 uuid;
 
-	p.uuid = cpu_to_be64(val);
+	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
+
+	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
+	drbd_uuid_set(mdev, UI_BITMAP, uuid);
+	drbd_print_uuids(mdev, "updated sync UUID");
+	drbd_md_sync(mdev);
+	p.uuid = cpu_to_be64(uuid);
 
 	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
 			     (struct p_header80 *)&p, sizeof(p));
@@ -1921,7 +2082,7 @@
 	p.d_size = cpu_to_be64(d_size);
 	p.u_size = cpu_to_be64(u_size);
 	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
-	p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
+	p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
 	p.queue_order_type = cpu_to_be16(q_order_type);
 	p.dds_flags = cpu_to_be16(flags);
 
@@ -1972,7 +2133,7 @@
 			     (struct p_header80 *)&p, sizeof(p));
 }
 
-int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
+int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
 {
 	struct p_req_state_reply p;
 
@@ -2076,9 +2237,15 @@
 	return len;
 }
 
-enum { OK, FAILED, DONE }
+/**
+ * send_bitmap_rle_or_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
-	struct p_header80 *h, struct bm_xfer_ctx *c)
+			 struct p_header80 *h, struct bm_xfer_ctx *c)
 {
 	struct p_compressed_bm *p = (void*)h;
 	unsigned long num_words;
@@ -2088,7 +2255,7 @@
 	len = fill_bitmap_rle_bits(mdev, p, c);
 
 	if (len < 0)
-		return FAILED;
+		return -EIO;
 
 	if (len) {
 		DCBP_set_code(p, RLE_VLI_Bits);
@@ -2118,11 +2285,14 @@
 		if (c->bit_offset > c->bm_bits)
 			c->bit_offset = c->bm_bits;
 	}
-	ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
-
-	if (ok == DONE)
-		INFO_bm_xfer_stats(mdev, "send", c);
-	return ok;
+	if (ok) {
+		if (len == 0) {
+			INFO_bm_xfer_stats(mdev, "send", c);
+			return 0;
+		} else
+			return 1;
+	}
+	return -EIO;
 }
 
 /* See the comment at receive_bitmap() */
@@ -2130,16 +2300,16 @@
 {
 	struct bm_xfer_ctx c;
 	struct p_header80 *p;
-	int ret;
+	int err;
 
-	ERR_IF(!mdev->bitmap) return FALSE;
+	ERR_IF(!mdev->bitmap) return false;
 
 	/* maybe we should use some per thread scratch page,
 	 * and allocate that during initial device creation? */
 	p = (struct p_header80 *) __get_free_page(GFP_NOIO);
 	if (!p) {
 		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
-		return FALSE;
+		return false;
 	}
 
 	if (get_ldev(mdev)) {
@@ -2165,11 +2335,11 @@
 	};
 
 	do {
-		ret = send_bitmap_rle_or_plain(mdev, p, &c);
-	} while (ret == OK);
+		err = send_bitmap_rle_or_plain(mdev, p, &c);
+	} while (err > 0);
 
 	free_page((unsigned long) p);
-	return (ret == DONE);
+	return err == 0;
 }
 
 int drbd_send_bitmap(struct drbd_conf *mdev)
@@ -2192,7 +2362,7 @@
 	p.set_size = cpu_to_be32(set_size);
 
 	if (mdev->state.conn < C_CONNECTED)
-		return FALSE;
+		return false;
 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
 			(struct p_header80 *)&p, sizeof(p));
 	return ok;
@@ -2220,7 +2390,7 @@
 	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
 
 	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
-		return FALSE;
+		return false;
 	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
 				(struct p_header80 *)&p, sizeof(p));
 	return ok;
@@ -2326,8 +2496,8 @@
 }
 
 /* called on sndtimeo
- * returns FALSE if we should retry,
- * TRUE if we think connection is dead
+ * returns false if we should retry,
+ * true if we think connection is dead
  */
 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
 {
@@ -2340,7 +2510,7 @@
 		|| mdev->state.conn < C_CONNECTED;
 
 	if (drop_it)
-		return TRUE;
+		return true;
 
 	drop_it = !--mdev->ko_count;
 	if (!drop_it) {
@@ -2531,13 +2701,39 @@
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
 		drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
-		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
+		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
 	}
 	if (ok) {
-		if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+		/* For protocol A, we have to memcpy the payload into
+		 * socket buffers, as we may complete right away
+		 * as soon as we handed it over to tcp, at which point the data
+		 * pages may become invalid.
+		 *
+		 * For data-integrity enabled, we copy it as well, so we can be
+		 * sure that even if the bio pages may still be modified, it
+		 * won't change the data on the wire, thus if the digest checks
+		 * out ok after sending on this side, but does not fit on the
+		 * receiving side, we sure have detected corruption elsewhere.
+		 */
+		if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
 			ok = _drbd_send_bio(mdev, req->master_bio);
 		else
 			ok = _drbd_send_zc_bio(mdev, req->master_bio);
+
+		/* double check digest, sometimes buffers have been modified in flight. */
+		if (dgs > 0 && dgs <= 64) {
+			/* 64 byte, 512 bit, is the larges digest size
+			 * currently supported in kernel crypto. */
+			unsigned char digest[64];
+			drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
+			if (memcmp(mdev->int_dig_out, digest, dgs)) {
+				dev_warn(DEV,
+					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+					(unsigned long long)req->sector, req->size);
+			}
+		} /* else if (dgs > 64) {
+		     ... Be noisy about digest too large ...
+		} */
 	}
 
 	drbd_put_data_sock(mdev);
@@ -2587,7 +2783,7 @@
 	if (ok && dgs) {
 		dgb = mdev->int_dig_out;
 		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
-		ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
+		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
 	}
 	if (ok)
 		ok = _drbd_send_zc_ee(mdev, e);
@@ -2597,6 +2793,16 @@
 	return ok;
 }
 
+int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
+{
+	struct p_block_desc p;
+
+	p.sector  = cpu_to_be64(req->sector);
+	p.blksize = cpu_to_be32(req->size);
+
+	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
+}
+
 /*
   drbd_send distinguishes two cases:
 
@@ -2770,6 +2976,7 @@
 	atomic_set(&mdev->pp_in_use_by_net, 0);
 	atomic_set(&mdev->rs_sect_in, 0);
 	atomic_set(&mdev->rs_sect_ev, 0);
+	atomic_set(&mdev->ap_in_flight, 0);
 
 	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
@@ -2798,19 +3005,27 @@
 	INIT_LIST_HEAD(&mdev->unplug_work.list);
 	INIT_LIST_HEAD(&mdev->go_diskless.list);
 	INIT_LIST_HEAD(&mdev->md_sync_work.list);
+	INIT_LIST_HEAD(&mdev->start_resync_work.list);
 	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
 
-	mdev->resync_work.cb  = w_resync_inactive;
+	mdev->resync_work.cb  = w_resync_timer;
 	mdev->unplug_work.cb  = w_send_write_hint;
 	mdev->go_diskless.cb  = w_go_diskless;
 	mdev->md_sync_work.cb = w_md_sync;
 	mdev->bm_io_work.w.cb = w_bitmap_io;
+	mdev->start_resync_work.cb = w_start_resync;
 	init_timer(&mdev->resync_timer);
 	init_timer(&mdev->md_sync_timer);
+	init_timer(&mdev->start_resync_timer);
+	init_timer(&mdev->request_timer);
 	mdev->resync_timer.function = resync_timer_fn;
 	mdev->resync_timer.data = (unsigned long) mdev;
 	mdev->md_sync_timer.function = md_sync_timer_fn;
 	mdev->md_sync_timer.data = (unsigned long) mdev;
+	mdev->start_resync_timer.function = start_resync_timer_fn;
+	mdev->start_resync_timer.data = (unsigned long) mdev;
+	mdev->request_timer.function = request_timer_fn;
+	mdev->request_timer.data = (unsigned long) mdev;
 
 	init_waitqueue_head(&mdev->misc_wait);
 	init_waitqueue_head(&mdev->state_wait);
@@ -2881,6 +3096,8 @@
 	D_ASSERT(list_empty(&mdev->resync_work.list));
 	D_ASSERT(list_empty(&mdev->unplug_work.list));
 	D_ASSERT(list_empty(&mdev->go_diskless.list));
+
+	drbd_set_defaults(mdev);
 }
 
 
@@ -2923,7 +3140,7 @@
 static int drbd_create_mempools(void)
 {
 	struct page *page;
-	const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
+	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
 	int i;
 
 	/* prepare our caches and mempools */
@@ -3087,11 +3304,20 @@
 
 	unregister_reboot_notifier(&drbd_notifier);
 
+	/* first remove proc,
+	 * drbdsetup uses it's presence to detect
+	 * whether DRBD is loaded.
+	 * If we would get stuck in proc removal,
+	 * but have netlink already deregistered,
+	 * some drbdsetup commands may wait forever
+	 * for an answer.
+	 */
+	if (drbd_proc)
+		remove_proc_entry("drbd", NULL);
+
 	drbd_nl_cleanup();
 
 	if (minor_table) {
-		if (drbd_proc)
-			remove_proc_entry("drbd", NULL);
 		i = minor_count;
 		while (i--)
 			drbd_delete_device(i);
@@ -3119,7 +3345,7 @@
 	char reason = '-';
 	int r = 0;
 
-	if (!__inc_ap_bio_cond(mdev)) {
+	if (!may_inc_ap_bio(mdev)) {
 		/* DRBD has frozen IO */
 		r = bdi_bits;
 		reason = 'd';
@@ -3172,7 +3398,7 @@
 		goto out_no_disk;
 	mdev->vdisk = disk;
 
-	set_disk_ro(disk, TRUE);
+	set_disk_ro(disk, true);
 
 	disk->queue = q;
 	disk->major = DRBD_MAJOR;
@@ -3188,8 +3414,8 @@
 	q->backing_dev_info.congested_fn = drbd_congested;
 	q->backing_dev_info.congested_data = mdev;
 
-	blk_queue_make_request(q, drbd_make_request_26);
-	blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
+	blk_queue_make_request(q, drbd_make_request);
+	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(q, drbd_merge_bvec);
 	q->queue_lock = &mdev->req_lock;
@@ -3251,6 +3477,7 @@
 	put_disk(mdev->vdisk);
 	blk_cleanup_queue(mdev->rq_queue);
 	free_cpumask_var(mdev->cpu_mask);
+	drbd_free_tl_hash(mdev);
 	kfree(mdev);
 }
 
@@ -3266,7 +3493,7 @@
 		return -EINVAL;
 	}
 
-	if (1 > minor_count || minor_count > 255) {
+	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
 		printk(KERN_ERR
 			"drbd: invalid minor_count (%d)\n", minor_count);
 #ifdef MODULE
@@ -3448,7 +3675,7 @@
 	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
 		/* this was a try anyways ... */
 		dev_err(DEV, "meta data update failed!\n");
-		drbd_chk_io_error(mdev, 1, TRUE);
+		drbd_chk_io_error(mdev, 1, true);
 	}
 
 	/* Update mdev->ldev->md.la_size_sect,
@@ -3464,7 +3691,7 @@
  * @mdev:	DRBD device.
  * @bdev:	Device from which the meta data should be read in.
  *
- * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
+ * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
  */
 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
@@ -3534,28 +3761,6 @@
 	return rv;
 }
 
-static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
-{
-	static char *uuid_str[UI_EXTENDED_SIZE] = {
-		[UI_CURRENT] = "CURRENT",
-		[UI_BITMAP] = "BITMAP",
-		[UI_HISTORY_START] = "HISTORY_START",
-		[UI_HISTORY_END] = "HISTORY_END",
-		[UI_SIZE] = "SIZE",
-		[UI_FLAGS] = "FLAGS",
-	};
-
-	if (index >= UI_EXTENDED_SIZE) {
-		dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
-		return;
-	}
-
-	dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
-		 uuid_str[index],
-		 (unsigned long long)mdev->ldev->md.uuid[index]);
-}
-
-
 /**
  * drbd_md_mark_dirty() - Mark meta data super block as dirty
  * @mdev:	DRBD device.
@@ -3585,10 +3790,8 @@
 {
 	int i;
 
-	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
+	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
 		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
-		debug_drbd_uuid(mdev, i+1);
-	}
 }
 
 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
@@ -3603,7 +3806,6 @@
 	}
 
 	mdev->ldev->md.uuid[idx] = val;
-	debug_drbd_uuid(mdev, idx);
 	drbd_md_mark_dirty(mdev);
 }
 
@@ -3613,7 +3815,6 @@
 	if (mdev->ldev->md.uuid[idx]) {
 		drbd_uuid_move_history(mdev);
 		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
-		debug_drbd_uuid(mdev, UI_HISTORY_START);
 	}
 	_drbd_uuid_set(mdev, idx, val);
 }
@@ -3628,14 +3829,16 @@
 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
 {
 	u64 val;
+	unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
 
-	dev_info(DEV, "Creating new current UUID\n");
-	D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
+	if (bm_uuid)
+		dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
+
 	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
-	debug_drbd_uuid(mdev, UI_BITMAP);
 
 	get_random_bytes(&val, sizeof(u64));
 	_drbd_uuid_set(mdev, UI_CURRENT, val);
+	drbd_print_uuids(mdev, "new current UUID");
 	/* get it to stable storage _now_ */
 	drbd_md_sync(mdev);
 }
@@ -3649,16 +3852,12 @@
 		drbd_uuid_move_history(mdev);
 		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
 		mdev->ldev->md.uuid[UI_BITMAP] = 0;
-		debug_drbd_uuid(mdev, UI_HISTORY_START);
-		debug_drbd_uuid(mdev, UI_BITMAP);
 	} else {
-		if (mdev->ldev->md.uuid[UI_BITMAP])
-			dev_warn(DEV, "bm UUID already set");
+		unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
+		if (bm_uuid)
+			dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
 
-		mdev->ldev->md.uuid[UI_BITMAP] = val;
-		mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
-
-		debug_drbd_uuid(mdev, UI_BITMAP);
+		mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
 	}
 	drbd_md_mark_dirty(mdev);
 }
@@ -3714,15 +3913,19 @@
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 {
 	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
-	int rv;
+	int rv = -EIO;
 
 	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
 
-	drbd_bm_lock(mdev, work->why);
-	rv = work->io_fn(mdev);
-	drbd_bm_unlock(mdev);
+	if (get_ldev(mdev)) {
+		drbd_bm_lock(mdev, work->why, work->flags);
+		rv = work->io_fn(mdev);
+		drbd_bm_unlock(mdev);
+		put_ldev(mdev);
+	}
 
 	clear_bit(BITMAP_IO, &mdev->flags);
+	smp_mb__after_clear_bit();
 	wake_up(&mdev->misc_wait);
 
 	if (work->done)
@@ -3730,6 +3933,7 @@
 
 	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
 	work->why = NULL;
+	work->flags = 0;
 
 	return 1;
 }
@@ -3784,7 +3988,7 @@
 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
 			  int (*io_fn)(struct drbd_conf *),
 			  void (*done)(struct drbd_conf *, int),
-			  char *why)
+			  char *why, enum bm_flag flags)
 {
 	D_ASSERT(current == mdev->worker.task);
 
@@ -3798,15 +4002,15 @@
 	mdev->bm_io_work.io_fn = io_fn;
 	mdev->bm_io_work.done = done;
 	mdev->bm_io_work.why = why;
+	mdev->bm_io_work.flags = flags;
 
+	spin_lock_irq(&mdev->req_lock);
 	set_bit(BITMAP_IO, &mdev->flags);
 	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
-		if (list_empty(&mdev->bm_io_work.w.list)) {
-			set_bit(BITMAP_IO_QUEUED, &mdev->flags);
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
 			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
-		} else
-			dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
 	}
+	spin_unlock_irq(&mdev->req_lock);
 }
 
 /**
@@ -3818,19 +4022,22 @@
  * freezes application IO while that the actual IO operations runs. This
  * functions MAY NOT be called from worker context.
  */
-int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
+int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
+		char *why, enum bm_flag flags)
 {
 	int rv;
 
 	D_ASSERT(current != mdev->worker.task);
 
-	drbd_suspend_io(mdev);
+	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+		drbd_suspend_io(mdev);
 
-	drbd_bm_lock(mdev, why);
+	drbd_bm_lock(mdev, why, flags);
 	rv = io_fn(mdev);
 	drbd_bm_unlock(mdev);
 
-	drbd_resume_io(mdev);
+	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+		drbd_resume_io(mdev);
 
 	return rv;
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fe81c85..03b29f7 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -288,10 +288,11 @@
 		dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
 }
 
-int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
+enum drbd_state_rv
+drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 {
 	const int max_tries = 4;
-	int r = 0;
+	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
 	int try = 0;
 	int forced = 0;
 	union drbd_state mask, val;
@@ -306,17 +307,17 @@
 	val.i  = 0; val.role  = new_role;
 
 	while (try++ < max_tries) {
-		r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
+		rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
 
 		/* in case we first succeeded to outdate,
 		 * but now suddenly could establish a connection */
-		if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
 			val.pdsk = 0;
 			mask.pdsk = 0;
 			continue;
 		}
 
-		if (r == SS_NO_UP_TO_DATE_DISK && force &&
+		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
 		    (mdev->state.disk < D_UP_TO_DATE &&
 		     mdev->state.disk >= D_INCONSISTENT)) {
 			mask.disk = D_MASK;
@@ -325,7 +326,7 @@
 			continue;
 		}
 
-		if (r == SS_NO_UP_TO_DATE_DISK &&
+		if (rv == SS_NO_UP_TO_DATE_DISK &&
 		    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
 			D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
 			nps = drbd_try_outdate_peer(mdev);
@@ -341,9 +342,9 @@
 			continue;
 		}
 
-		if (r == SS_NOTHING_TO_DO)
+		if (rv == SS_NOTHING_TO_DO)
 			goto fail;
-		if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
+		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
 			nps = drbd_try_outdate_peer(mdev);
 
 			if (force && nps > D_OUTDATED) {
@@ -356,25 +357,24 @@
 
 			continue;
 		}
-		if (r == SS_TWO_PRIMARIES) {
+		if (rv == SS_TWO_PRIMARIES) {
 			/* Maybe the peer is detected as dead very soon...
 			   retry at most once more in this case. */
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
+			schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
 			if (try < max_tries)
 				try = max_tries - 1;
 			continue;
 		}
-		if (r < SS_SUCCESS) {
-			r = _drbd_request_state(mdev, mask, val,
+		if (rv < SS_SUCCESS) {
+			rv = _drbd_request_state(mdev, mask, val,
 						CS_VERBOSE + CS_WAIT_COMPLETE);
-			if (r < SS_SUCCESS)
+			if (rv < SS_SUCCESS)
 				goto fail;
 		}
 		break;
 	}
 
-	if (r < SS_SUCCESS)
+	if (rv < SS_SUCCESS)
 		goto fail;
 
 	if (forced)
@@ -384,7 +384,7 @@
 	wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
 
 	if (new_role == R_SECONDARY) {
-		set_disk_ro(mdev->vdisk, TRUE);
+		set_disk_ro(mdev->vdisk, true);
 		if (get_ldev(mdev)) {
 			mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
 			put_ldev(mdev);
@@ -394,7 +394,7 @@
 			mdev->net_conf->want_lose = 0;
 			put_net_conf(mdev);
 		}
-		set_disk_ro(mdev->vdisk, FALSE);
+		set_disk_ro(mdev->vdisk, false);
 		if (get_ldev(mdev)) {
 			if (((mdev->state.conn < C_CONNECTED ||
 			       mdev->state.pdsk <= D_FAILED)
@@ -406,10 +406,8 @@
 		}
 	}
 
-	if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
-		drbd_al_to_on_disk_bm(mdev);
-		put_ldev(mdev);
-	}
+	/* writeout of activity log covered areas of the bitmap
+	 * to stable storage done in after state change already */
 
 	if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
 		/* if this was forced, we should consider sync */
@@ -423,7 +421,7 @@
 	kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
  fail:
 	mutex_unlock(&mdev->state_mutex);
-	return r;
+	return rv;
 }
 
 static struct drbd_conf *ensure_mdev(int minor, int create)
@@ -528,17 +526,19 @@
 	}
 }
 
+/* input size is expected to be in KB */
 char *ppsize(char *buf, unsigned long long size)
 {
-	/* Needs 9 bytes at max. */
+	/* Needs 9 bytes at max including trailing NUL:
+	 * -1ULL ==> "16384 EB" */
 	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
 	int base = 0;
-	while (size >= 10000) {
+	while (size >= 10000 && base < sizeof(units)-1) {
 		/* shift + round */
 		size = (size >> 10) + !!(size & (1<<9));
 		base++;
 	}
-	sprintf(buf, "%lu %cB", (long)size, units[base]);
+	sprintf(buf, "%u %cB", (unsigned)size, units[base]);
 
 	return buf;
 }
@@ -642,11 +642,19 @@
 		|| prev_size	   != mdev->ldev->md.md_size_sect;
 
 	if (la_size_changed || md_moved) {
+		int err;
+
 		drbd_al_shrink(mdev); /* All extents inactive. */
 		dev_info(DEV, "Writing the whole bitmap, %s\n",
 			 la_size_changed && md_moved ? "size changed and md moved" :
 			 la_size_changed ? "size changed" : "md moved");
-		rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
+		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+		err = drbd_bitmap_io(mdev, &drbd_bm_write,
+				"size changed", BM_LOCKED_MASK);
+		if (err) {
+			rv = dev_size_error;
+			goto out;
+		}
 		drbd_md_mark_dirty(mdev);
 	}
 
@@ -765,22 +773,21 @@
 	return 0;
 }
 
-void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
+void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local)
 {
 	struct request_queue * const q = mdev->rq_queue;
 	struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
 	int max_segments = mdev->ldev->dc.max_bio_bvecs;
+	int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
 
-	max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
-
-	blk_queue_max_hw_sectors(q, max_seg_s >> 9);
-	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
-	blk_queue_max_segment_size(q, max_seg_s);
 	blk_queue_logical_block_size(q, 512);
-	blk_queue_segment_boundary(q, PAGE_SIZE-1);
-	blk_stack_limits(&q->limits, &b->limits, 0);
+	blk_queue_max_hw_sectors(q, max_hw_sectors);
+	/* This is the workaround for "bio would need to, but cannot, be split" */
+	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+	blk_queue_stack_limits(q, b);
 
-	dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
+	dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9);
 
 	if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
 		dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
@@ -850,7 +857,7 @@
 static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 			     struct drbd_nl_cfg_reply *reply)
 {
-	enum drbd_ret_codes retcode;
+	enum drbd_ret_code retcode;
 	enum determine_dev_size dd;
 	sector_t max_possible_sectors;
 	sector_t min_md_device_sectors;
@@ -858,8 +865,8 @@
 	struct block_device *bdev;
 	struct lru_cache *resync_lru = NULL;
 	union drbd_state ns, os;
-	unsigned int max_seg_s;
-	int rv;
+	unsigned int max_bio_size;
+	enum drbd_state_rv rv;
 	int cp_discovered = 0;
 	int logical_block_size;
 
@@ -1005,9 +1012,10 @@
 	/* and for any other previously queued work */
 	drbd_flush_workqueue(mdev);
 
-	retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
+	rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
+	retcode = rv;  /* FIXME: Type mismatch. */
 	drbd_resume_io(mdev);
-	if (retcode < SS_SUCCESS)
+	if (rv < SS_SUCCESS)
 		goto fail;
 
 	if (!get_ldev_if_state(mdev, D_ATTACHING))
@@ -1109,20 +1117,20 @@
 	mdev->read_cnt = 0;
 	mdev->writ_cnt = 0;
 
-	max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+	max_bio_size = DRBD_MAX_BIO_SIZE;
 	if (mdev->state.conn == C_CONNECTED) {
 		/* We are Primary, Connected, and now attach a new local
 		 * backing store. We must not increase the user visible maximum
 		 * bio size on this device to something the peer may not be
 		 * able to handle. */
 		if (mdev->agreed_pro_version < 94)
-			max_seg_s = queue_max_segment_size(mdev->rq_queue);
+			max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
 		else if (mdev->agreed_pro_version == 94)
-			max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
+			max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
 		/* else: drbd 8.3.9 and later, stay with default */
 	}
 
-	drbd_setup_queue_param(mdev, max_seg_s);
+	drbd_setup_queue_param(mdev, max_bio_size);
 
 	/* If I am currently not R_PRIMARY,
 	 * but meta data primary indicator is set,
@@ -1154,12 +1162,14 @@
 	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
 		dev_info(DEV, "Assuming that all blocks are out of sync "
 		     "(aka FullSync)\n");
-		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
+			"set_n_write from attaching", BM_LOCKED_MASK)) {
 			retcode = ERR_IO_MD_DISK;
 			goto force_diskless_dec;
 		}
 	} else {
-		if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
+		if (drbd_bitmap_io(mdev, &drbd_bm_read,
+			"read from attaching", BM_LOCKED_MASK) < 0) {
 			retcode = ERR_IO_MD_DISK;
 			goto force_diskless_dec;
 		}
@@ -1167,7 +1177,11 @@
 
 	if (cp_discovered) {
 		drbd_al_apply_to_bm(mdev);
-		drbd_al_to_on_disk_bm(mdev);
+		if (drbd_bitmap_io(mdev, &drbd_bm_write,
+			"crashed primary apply AL", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
 	}
 
 	if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
@@ -1279,7 +1293,7 @@
 			    struct drbd_nl_cfg_reply *reply)
 {
 	int i, ns;
-	enum drbd_ret_codes retcode;
+	enum drbd_ret_code retcode;
 	struct net_conf *new_conf = NULL;
 	struct crypto_hash *tfm = NULL;
 	struct crypto_hash *integrity_w_tfm = NULL;
@@ -1324,6 +1338,8 @@
 	new_conf->wire_protocol    = DRBD_PROT_C;
 	new_conf->ping_timeo	   = DRBD_PING_TIMEO_DEF;
 	new_conf->rr_conflict	   = DRBD_RR_CONFLICT_DEF;
+	new_conf->on_congestion    = DRBD_ON_CONGESTION_DEF;
+	new_conf->cong_extents     = DRBD_CONG_EXTENTS_DEF;
 
 	if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
 		retcode = ERR_MANDATORY_TAG;
@@ -1345,6 +1361,11 @@
 		}
 	}
 
+	if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
+		retcode = ERR_CONG_NOT_PROTO_A;
+		goto fail;
+	}
+
 	if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
 		retcode = ERR_DISCARD;
 		goto fail;
@@ -1525,6 +1546,21 @@
 			      struct drbd_nl_cfg_reply *reply)
 {
 	int retcode;
+	struct disconnect dc;
+
+	memset(&dc, 0, sizeof(struct disconnect));
+	if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
+		retcode = ERR_MANDATORY_TAG;
+		goto fail;
+	}
+
+	if (dc.force) {
+		spin_lock_irq(&mdev->req_lock);
+		if (mdev->state.conn >= C_WF_CONNECTION)
+			_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
+		spin_unlock_irq(&mdev->req_lock);
+		goto done;
+	}
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
 
@@ -1842,6 +1878,10 @@
 {
 	int retcode;
 
+	/* If there is still bitmap IO pending, probably because of a previous
+	 * resync just being finished, wait for it before requesting a new resync. */
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
 
 	if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
@@ -1877,6 +1917,10 @@
 {
 	int retcode;
 
+	/* If there is still bitmap IO pending, probably because of a previous
+	 * resync just being finished, wait for it before requesting a new resync. */
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
 
 	if (retcode < SS_SUCCESS) {
@@ -1885,9 +1929,9 @@
 			   into a full resync. */
 			retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
 			if (retcode >= SS_SUCCESS) {
-				/* open coded drbd_bitmap_io() */
 				if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
-						   "set_n_write from invalidate_peer"))
+					"set_n_write from invalidate_peer",
+					BM_LOCKED_SET_ALLOWED))
 					retcode = ERR_IO_MD_DISK;
 			}
 		} else
@@ -1914,9 +1958,17 @@
 			       struct drbd_nl_cfg_reply *reply)
 {
 	int retcode = NO_ERROR;
+	union drbd_state s;
 
-	if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
-		retcode = ERR_PAUSE_IS_CLEAR;
+	if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+		s = mdev->state;
+		if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+			retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
+				  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+		} else {
+			retcode = ERR_PAUSE_IS_CLEAR;
+		}
+	}
 
 	reply->ret_code = retcode;
 	return 0;
@@ -2054,6 +2106,11 @@
 		reply->ret_code = ERR_MANDATORY_TAG;
 		return 0;
 	}
+
+	/* If there is still bitmap IO pending, e.g. previous resync or verify
+	 * just being finished, wait for it before requesting a new resync. */
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
 	/* w_make_ov_request expects position to be aligned */
 	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
 	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
@@ -2097,7 +2154,8 @@
 	drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
 
 	if (args.clear_bm) {
-		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
+		err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
+			"clear_n_write from new_c_uuid", BM_LOCKED_MASK);
 		if (err) {
 			dev_err(DEV, "Writing bitmap failed with %d\n",err);
 			retcode = ERR_IO_MD_DISK;
@@ -2105,6 +2163,7 @@
 		if (skip_initial_sync) {
 			drbd_send_uuids_skip_initial_sync(mdev);
 			_drbd_uuid_set(mdev, UI_BITMAP, 0);
+			drbd_print_uuids(mdev, "cleared bitmap UUID");
 			spin_lock_irq(&mdev->req_lock);
 			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
 					CS_VERBOSE, NULL);
@@ -2189,7 +2248,8 @@
 		goto fail;
 	}
 
-	if (nlp->packet_type >= P_nl_after_last_packet) {
+	if (nlp->packet_type >= P_nl_after_last_packet ||
+	    nlp->packet_type == P_return_code_only) {
 		retcode = ERR_PACKET_NR;
 		goto fail;
 	}
@@ -2205,7 +2265,7 @@
 	reply_size += cm->reply_body_size;
 
 	/* allocation not in the IO path, cqueue thread context */
-	cn_reply = kmalloc(reply_size, GFP_KERNEL);
+	cn_reply = kzalloc(reply_size, GFP_KERNEL);
 	if (!cn_reply) {
 		retcode = ERR_NOMEM;
 		goto fail;
@@ -2213,7 +2273,7 @@
 	reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
 
 	reply->packet_type =
-		cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
+		cm->reply_body_size ? nlp->packet_type : P_return_code_only;
 	reply->minor = nlp->drbd_minor;
 	reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
 	/* reply->tag_list; might be modified by cm->function. */
@@ -2376,7 +2436,7 @@
 	/* receiver thread context, which is not in the writeout path (of this node),
 	 * but may be in the writeout path of the _other_ node.
 	 * GFP_NOIO to avoid potential "distributed deadlock". */
-	cn_reply = kmalloc(
+	cn_reply = kzalloc(
 		sizeof(struct cn_msg)+
 		sizeof(struct drbd_nl_cfg_reply)+
 		sizeof(struct dump_ee_tag_len_struct)+
@@ -2398,10 +2458,11 @@
 	tl = tl_add_int(tl, T_ee_sector, &e->sector);
 	tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
 
+	/* dump the first 32k */
+	len = min_t(unsigned, e->size, 32 << 10);
 	put_unaligned(T_ee_data, tl++);
-	put_unaligned(e->size, tl++);
+	put_unaligned(len, tl++);
 
-	len = e->size;
 	page = e->pages;
 	page_chain_for_each(page) {
 		void *d = kmap_atomic(page, KM_USER0);
@@ -2410,6 +2471,8 @@
 		kunmap_atomic(d, KM_USER0);
 		tl = (unsigned short*)((char*)tl + l);
 		len -= l;
+		if (len == 0)
+			break;
 	}
 	put_unaligned(TT_END, tl++); /* Close the tag list */
 
@@ -2508,6 +2571,7 @@
 		(struct drbd_nl_cfg_reply *)cn_reply->data;
 	int rr;
 
+	memset(buffer, 0, sizeof(buffer));
 	cn_reply->id = req->id;
 
 	cn_reply->seq = req->seq;
@@ -2515,6 +2579,7 @@
 	cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
 	cn_reply->flags = 0;
 
+	reply->packet_type = P_return_code_only;
 	reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
 	reply->ret_code = ret_code;
 
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 7e6ac30..2959cdf 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -34,6 +34,7 @@
 #include "drbd_int.h"
 
 static int drbd_proc_open(struct inode *inode, struct file *file);
+static int drbd_proc_release(struct inode *inode, struct file *file);
 
 
 struct proc_dir_entry *drbd_proc;
@@ -42,9 +43,22 @@
 	.open		= drbd_proc_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= drbd_proc_release,
 };
 
+void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+	/* v is in kB/sec. We don't expect TiByte/sec yet. */
+	if (unlikely(v >= 1000000)) {
+		/* cool: > GiByte/s */
+		seq_printf(seq, "%ld,", v / 1000000);
+		v /= 1000000;
+		seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
+	} else if (likely(v >= 1000))
+		seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
+	else
+		seq_printf(seq, "%ld", v);
+}
 
 /*lge
  * progress bars shamelessly adapted from driver/md/md.c
@@ -71,10 +85,15 @@
 		seq_printf(seq, ".");
 	seq_printf(seq, "] ");
 
-	seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
-	/* if more than 1 GB display in MB */
-	if (mdev->rs_total > 0x100000L)
-		seq_printf(seq, "(%lu/%lu)M\n\t",
+	if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+		seq_printf(seq, "verified:");
+	else
+		seq_printf(seq, "sync'ed:");
+	seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+	/* if more than a few GB, display in MB */
+	if (mdev->rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+		seq_printf(seq, "(%lu/%lu)M",
 			    (unsigned long) Bit2KB(rs_left >> 10),
 			    (unsigned long) Bit2KB(mdev->rs_total >> 10));
 	else
@@ -94,6 +113,7 @@
 	/* Rolling marks. last_mark+1 may just now be modified.  last_mark+2 is
 	 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
 	 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+	/* ------------------------ ~18s average ------------------------ */
 	i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
 	dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
 	if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
@@ -107,14 +127,24 @@
 	seq_printf(seq, "finish: %lu:%02lu:%02lu",
 		rt / 3600, (rt % 3600) / 60, rt % 60);
 
-	/* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
 	dbdt = Bit2KB(db/dt);
-	if (dbdt > 1000)
-		seq_printf(seq, " speed: %ld,%03ld",
-			dbdt/1000, dbdt % 1000);
-	else
-		seq_printf(seq, " speed: %ld", dbdt);
+	seq_printf(seq, " speed: ");
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_printf(seq, " (");
+	/* ------------------------- ~3s average ------------------------ */
+	if (proc_details >= 1) {
+		/* this is what drbd_rs_should_slow_down() uses */
+		i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+		dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = mdev->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+		seq_printf_with_thousands_grouping(seq, dbdt);
+		seq_printf(seq, " -- ");
+	}
 
+	/* --------------------- long term average ---------------------- */
 	/* mean speed since syncer started
 	 * we do account for PausedSync periods */
 	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
@@ -122,20 +152,34 @@
 		dt = 1;
 	db = mdev->rs_total - rs_left;
 	dbdt = Bit2KB(db/dt);
-	if (dbdt > 1000)
-		seq_printf(seq, " (%ld,%03ld)",
-			dbdt/1000, dbdt % 1000);
-	else
-		seq_printf(seq, " (%ld)", dbdt);
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_printf(seq, ")");
 
-	if (mdev->state.conn == C_SYNC_TARGET) {
-		if (mdev->c_sync_rate > 1000)
-			seq_printf(seq, " want: %d,%03d",
-				   mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
-		else
-			seq_printf(seq, " want: %d", mdev->c_sync_rate);
+	if (mdev->state.conn == C_SYNC_TARGET ||
+	    mdev->state.conn == C_VERIFY_S) {
+		seq_printf(seq, " want: ");
+		seq_printf_with_thousands_grouping(seq, mdev->c_sync_rate);
 	}
 	seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+	if (proc_details >= 1) {
+		/* 64 bit:
+		 * we convert to sectors in the display below. */
+		unsigned long bm_bits = drbd_bm_bits(mdev);
+		unsigned long bit_pos;
+		if (mdev->state.conn == C_VERIFY_S ||
+		    mdev->state.conn == C_VERIFY_T)
+			bit_pos = bm_bits - mdev->ov_left;
+		else
+			bit_pos = mdev->bm_resync_fo;
+		/* Total sectors may be slightly off for oddly
+		 * sized devices. So what. */
+		seq_printf(seq,
+			"\t%3d%% sector pos: %llu/%llu\n",
+			(int)(bit_pos / (bm_bits/100+1)),
+			(unsigned long long)bit_pos * BM_SECT_PER_BIT,
+			(unsigned long long)bm_bits * BM_SECT_PER_BIT);
+	}
 }
 
 static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
@@ -232,20 +276,16 @@
 			   mdev->epochs,
 			   write_ordering_chars[mdev->write_ordering]
 			);
-			seq_printf(seq, " oos:%lu\n",
-				   Bit2KB(drbd_bm_total_weight(mdev)));
+			seq_printf(seq, " oos:%llu\n",
+				   Bit2KB((unsigned long long)
+					   drbd_bm_total_weight(mdev)));
 		}
 		if (mdev->state.conn == C_SYNC_SOURCE ||
-		    mdev->state.conn == C_SYNC_TARGET)
+		    mdev->state.conn == C_SYNC_TARGET ||
+		    mdev->state.conn == C_VERIFY_S ||
+		    mdev->state.conn == C_VERIFY_T)
 			drbd_syncer_progress(mdev, seq);
 
-		if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
-			seq_printf(seq, "\t%3d%%      %lu/%lu\n",
-				   (int)((mdev->rs_total-mdev->ov_left) /
-					 (mdev->rs_total/100+1)),
-				   mdev->rs_total - mdev->ov_left,
-				   mdev->rs_total);
-
 		if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
 			lc_seq_printf_stats(seq, mdev->resync);
 			lc_seq_printf_stats(seq, mdev->act_log);
@@ -265,7 +305,15 @@
 
 static int drbd_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, drbd_seq_show, PDE(inode)->data);
+	if (try_module_get(THIS_MODULE))
+		return single_open(file, drbd_seq_show, PDE(inode)->data);
+	return -ENODEV;
+}
+
+static int drbd_proc_release(struct inode *inode, struct file *file)
+{
+	module_put(THIS_MODULE);
+	return single_release(inode, file);
 }
 
 /* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8e68be9..fe1564c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -277,7 +277,7 @@
 	atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
 	int i;
 
-	if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
+	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
 		i = page_chain_free(page);
 	else {
 		struct page *tmp;
@@ -319,7 +319,7 @@
 	struct page *page;
 	unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 
-	if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
+	if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
 		return NULL;
 
 	e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
@@ -725,16 +725,16 @@
 	char tb[4];
 
 	if (!*sock)
-		return FALSE;
+		return false;
 
 	rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 
 	if (rr > 0 || rr == -EAGAIN) {
-		return TRUE;
+		return true;
 	} else {
 		sock_release(*sock);
 		*sock = NULL;
-		return FALSE;
+		return false;
 	}
 }
 
@@ -768,8 +768,7 @@
 			if (s || ++try >= 3)
 				break;
 			/* give the other side time to call bind() & listen() */
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ / 10);
+			schedule_timeout_interruptible(HZ / 10);
 		}
 
 		if (s) {
@@ -788,8 +787,7 @@
 		}
 
 		if (sock && msock) {
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ / 10);
+			schedule_timeout_interruptible(HZ / 10);
 			ok = drbd_socket_okay(mdev, &sock);
 			ok = drbd_socket_okay(mdev, &msock) && ok;
 			if (ok)
@@ -906,7 +904,7 @@
 		put_ldev(mdev);
 	}
 
-	if (!drbd_send_protocol(mdev))
+	if (drbd_send_protocol(mdev) == -1)
 		return -1;
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
 	drbd_send_sizes(mdev, 0, 0);
@@ -914,6 +912,7 @@
 	drbd_send_state(mdev);
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 	clear_bit(RESIZE_PENDING, &mdev->flags);
+	mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
 
 	return 1;
 
@@ -932,8 +931,9 @@
 
 	r = drbd_recv(mdev, h, sizeof(*h));
 	if (unlikely(r != sizeof(*h))) {
-		dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
-		return FALSE;
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
+		return false;
 	}
 
 	if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
@@ -947,11 +947,11 @@
 		    be32_to_cpu(h->h80.magic),
 		    be16_to_cpu(h->h80.command),
 		    be16_to_cpu(h->h80.length));
-		return FALSE;
+		return false;
 	}
 	mdev->last_received = jiffies;
 
-	return TRUE;
+	return true;
 }
 
 static void drbd_flush(struct drbd_conf *mdev)
@@ -1074,6 +1074,16 @@
  * @mdev:	DRBD device.
  * @e:		epoch entry
  * @rw:		flag field, see bio->bi_rw
+ *
+ * May spread the pages to multiple bios,
+ * depending on bio_add_page restrictions.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ *  single page to an empty bio (which should never happen and likely indicates
+ *  that the lower level IO stack is in some way broken). This has been observed
+ *  on certain Xen deployments.
  */
 /* TODO allocate from our own bio_set. */
 int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
@@ -1086,6 +1096,7 @@
 	unsigned ds = e->size;
 	unsigned n_bios = 0;
 	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+	int err = -ENOMEM;
 
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
@@ -1111,8 +1122,17 @@
 	page_chain_for_each(page) {
 		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
 		if (!bio_add_page(bio, page, len, 0)) {
-			/* a single page must always be possible! */
-			BUG_ON(bio->bi_vcnt == 0);
+			/* A single page must always be possible!
+			 * But in case it fails anyways,
+			 * we deal with it, and complain (below). */
+			if (bio->bi_vcnt == 0) {
+				dev_err(DEV,
+					"bio_add_page failed for len=%u, "
+					"bi_vcnt=0 (bi_sector=%llu)\n",
+					len, (unsigned long long)bio->bi_sector);
+				err = -ENOSPC;
+				goto fail;
+			}
 			goto next_bio;
 		}
 		ds -= len;
@@ -1138,7 +1158,7 @@
 		bios = bios->bi_next;
 		bio_put(bio);
 	}
-	return -ENOMEM;
+	return err;
 }
 
 static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -1160,7 +1180,7 @@
 	switch (mdev->write_ordering) {
 	case WO_none:
 		if (rv == FE_RECYCLED)
-			return TRUE;
+			return true;
 
 		/* receiver context, in the writeout path of the other node.
 		 * avoid potential distributed deadlock */
@@ -1188,10 +1208,10 @@
 		D_ASSERT(atomic_read(&epoch->active) == 0);
 		D_ASSERT(epoch->flags == 0);
 
-		return TRUE;
+		return true;
 	default:
 		dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
-		return FALSE;
+		return false;
 	}
 
 	epoch->flags = 0;
@@ -1209,7 +1229,7 @@
 	}
 	spin_unlock(&mdev->epoch_lock);
 
-	return TRUE;
+	return true;
 }
 
 /* used from receive_RSDataReply (recv_resync_read)
@@ -1231,21 +1251,25 @@
 	if (dgs) {
 		rr = drbd_recv(mdev, dig_in, dgs);
 		if (rr != dgs) {
-			dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
-			     rr, dgs);
+			if (!signal_pending(current))
+				dev_warn(DEV,
+					"short read receiving data digest: read %d expected %d\n",
+					rr, dgs);
 			return NULL;
 		}
 	}
 
 	data_size -= dgs;
 
+	ERR_IF(data_size == 0) return NULL;
 	ERR_IF(data_size &  0x1ff) return NULL;
-	ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
+	ERR_IF(data_size >  DRBD_MAX_BIO_SIZE) return NULL;
 
 	/* even though we trust out peer,
 	 * we sometimes have to double check. */
 	if (sector + (data_size>>9) > capacity) {
-		dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
+		dev_err(DEV, "request from peer beyond end of local disk: "
+			"capacity: %llus < sector: %llus + size: %u\n",
 			(unsigned long long)capacity,
 			(unsigned long long)sector, data_size);
 		return NULL;
@@ -1264,15 +1288,16 @@
 		unsigned len = min_t(int, ds, PAGE_SIZE);
 		data = kmap(page);
 		rr = drbd_recv(mdev, data, len);
-		if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
+		if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
 			dev_err(DEV, "Fault injection: Corrupting data on receive\n");
 			data[0] = data[0] ^ (unsigned long)-1;
 		}
 		kunmap(page);
 		if (rr != len) {
 			drbd_free_ee(mdev, e);
-			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
-			     rr, len);
+			if (!signal_pending(current))
+				dev_warn(DEV, "short read receiving data: read %d expected %d\n",
+				rr, len);
 			return NULL;
 		}
 		ds -= rr;
@@ -1281,7 +1306,8 @@
 	if (dgs) {
 		drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
 		if (memcmp(dig_in, dig_vv, dgs)) {
-			dev_err(DEV, "Digest integrity check FAILED.\n");
+			dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
+				(unsigned long long)sector, data_size);
 			drbd_bcast_ee(mdev, "digest failed",
 					dgs, dig_in, dig_vv, e);
 			drbd_free_ee(mdev, e);
@@ -1302,7 +1328,7 @@
 	void *data;
 
 	if (!data_size)
-		return TRUE;
+		return true;
 
 	page = drbd_pp_alloc(mdev, 1, 1);
 
@@ -1311,8 +1337,10 @@
 		rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
 		if (rr != min_t(int, data_size, PAGE_SIZE)) {
 			rv = 0;
-			dev_warn(DEV, "short read receiving data: read %d expected %d\n",
-			     rr, min_t(int, data_size, PAGE_SIZE));
+			if (!signal_pending(current))
+				dev_warn(DEV,
+					"short read receiving data: read %d expected %d\n",
+					rr, min_t(int, data_size, PAGE_SIZE));
 			break;
 		}
 		data_size -= rr;
@@ -1337,8 +1365,10 @@
 	if (dgs) {
 		rr = drbd_recv(mdev, dig_in, dgs);
 		if (rr != dgs) {
-			dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
-			     rr, dgs);
+			if (!signal_pending(current))
+				dev_warn(DEV,
+					"short read receiving data reply digest: read %d expected %d\n",
+					rr, dgs);
 			return 0;
 		}
 	}
@@ -1359,9 +1389,10 @@
 			     expect);
 		kunmap(bvec->bv_page);
 		if (rr != expect) {
-			dev_warn(DEV, "short read receiving data reply: "
-			     "read %d expected %d\n",
-			     rr, expect);
+			if (!signal_pending(current))
+				dev_warn(DEV, "short read receiving data reply: "
+					"read %d expected %d\n",
+					rr, expect);
 			return 0;
 		}
 		data_size -= rr;
@@ -1425,11 +1456,10 @@
 
 	atomic_add(data_size >> 9, &mdev->rs_sect_ev);
 	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
-		return TRUE;
+		return true;
 
-	/* drbd_submit_ee currently fails for one reason only:
-	 * not being able to allocate enough bios.
-	 * Is dropping the connection going to help? */
+	/* don't care for the reason here */
+	dev_err(DEV, "submit failed, triggering re-connect\n");
 	spin_lock_irq(&mdev->req_lock);
 	list_del(&e->w.list);
 	spin_unlock_irq(&mdev->req_lock);
@@ -1437,7 +1467,7 @@
 	drbd_free_ee(mdev, e);
 fail:
 	put_ldev(mdev);
-	return FALSE;
+	return false;
 }
 
 static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -1454,7 +1484,7 @@
 	spin_unlock_irq(&mdev->req_lock);
 	if (unlikely(!req)) {
 		dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
-		return FALSE;
+		return false;
 	}
 
 	/* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
@@ -1611,15 +1641,15 @@
 	return ret;
 }
 
-static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+/* see also bio_flags_to_wire()
+ * DRBD_REQ_*, because we need to semantically map the flags to data packet
+ * flags and back. We may replicate to other kernel versions. */
+static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
 {
-	if (mdev->agreed_pro_version >= 95)
-		return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
-			(dpf & DP_FUA ? REQ_FUA : 0) |
-			(dpf & DP_FLUSH ? REQ_FUA : 0) |
-			(dpf & DP_DISCARD ? REQ_DISCARD : 0);
-	else
-		return dpf & DP_RW_SYNC ? REQ_SYNC : 0;
+	return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+		(dpf & DP_FUA ? REQ_FUA : 0) |
+		(dpf & DP_FLUSH ? REQ_FLUSH : 0) |
+		(dpf & DP_DISCARD ? REQ_DISCARD : 0);
 }
 
 /* mirrored write */
@@ -1632,9 +1662,6 @@
 	u32 dp_flags;
 
 	if (!get_ldev(mdev)) {
-		if (__ratelimit(&drbd_ratelimit_state))
-			dev_err(DEV, "Can not write mirrored data block "
-			    "to local disk.\n");
 		spin_lock(&mdev->peer_seq_lock);
 		if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
 			mdev->peer_seq++;
@@ -1654,23 +1681,23 @@
 	e = read_in_block(mdev, p->block_id, sector, data_size);
 	if (!e) {
 		put_ldev(mdev);
-		return FALSE;
+		return false;
 	}
 
 	e->w.cb = e_end_block;
 
+	dp_flags = be32_to_cpu(p->dp_flags);
+	rw |= wire_flags_to_bio(mdev, dp_flags);
+
+	if (dp_flags & DP_MAY_SET_IN_SYNC)
+		e->flags |= EE_MAY_SET_IN_SYNC;
+
 	spin_lock(&mdev->epoch_lock);
 	e->epoch = mdev->current_epoch;
 	atomic_inc(&e->epoch->epoch_size);
 	atomic_inc(&e->epoch->active);
 	spin_unlock(&mdev->epoch_lock);
 
-	dp_flags = be32_to_cpu(p->dp_flags);
-	rw |= write_flags_to_bio(mdev, dp_flags);
-
-	if (dp_flags & DP_MAY_SET_IN_SYNC)
-		e->flags |= EE_MAY_SET_IN_SYNC;
-
 	/* I'm the receiver, I do hold a net_cnt reference. */
 	if (!mdev->net_conf->two_primaries) {
 		spin_lock_irq(&mdev->req_lock);
@@ -1773,7 +1800,7 @@
 				put_ldev(mdev);
 				wake_asender(mdev);
 				finish_wait(&mdev->misc_wait, &wait);
-				return TRUE;
+				return true;
 			}
 
 			if (signal_pending(current)) {
@@ -1829,11 +1856,10 @@
 	}
 
 	if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
-		return TRUE;
+		return true;
 
-	/* drbd_submit_ee currently fails for one reason only:
-	 * not being able to allocate enough bios.
-	 * Is dropping the connection going to help? */
+	/* don't care for the reason here */
+	dev_err(DEV, "submit failed, triggering re-connect\n");
 	spin_lock_irq(&mdev->req_lock);
 	list_del(&e->w.list);
 	hlist_del_init(&e->colision);
@@ -1842,12 +1868,10 @@
 		drbd_al_complete_io(mdev, e->sector);
 
 out_interrupted:
-	/* yes, the epoch_size now is imbalanced.
-	 * but we drop the connection anyways, so we don't have a chance to
-	 * receive a barrier... atomic_inc(&mdev->epoch_size); */
+	drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
 	put_ldev(mdev);
 	drbd_free_ee(mdev, e);
-	return FALSE;
+	return false;
 }
 
 /* We may throttle resync, if the lower device seems to be busy,
@@ -1861,10 +1885,11 @@
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
 {
 	struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
 	unsigned long db, dt, dbdt;
+	struct lc_element *tmp;
 	int curr_events;
 	int throttle = 0;
 
@@ -1872,9 +1897,22 @@
 	if (mdev->sync_conf.c_min_rate == 0)
 		return 0;
 
+	spin_lock_irq(&mdev->al_lock);
+	tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
+	if (tmp) {
+		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
+			spin_unlock_irq(&mdev->al_lock);
+			return 0;
+		}
+		/* Do not slow down if app IO is already waiting for this extent */
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
 	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 		      (int)part_stat_read(&disk->part0, sectors[1]) -
 			atomic_read(&mdev->rs_sect_ev);
+
 	if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
@@ -1883,8 +1921,12 @@
 
 		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
 		 * approx. */
-		i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
-		rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+		i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+		if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
+			rs_left = mdev->ov_left;
+		else
+			rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
 
 		dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
 		if (!dt)
@@ -1912,15 +1954,15 @@
 	sector = be64_to_cpu(p->sector);
 	size   = be32_to_cpu(p->blksize);
 
-	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
 		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
 				(unsigned long long)sector, size);
-		return FALSE;
+		return false;
 	}
 	if (sector + (size>>9) > capacity) {
 		dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
 				(unsigned long long)sector, size);
-		return FALSE;
+		return false;
 	}
 
 	if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
@@ -1957,7 +1999,7 @@
 	e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
 	if (!e) {
 		put_ldev(mdev);
-		return FALSE;
+		return false;
 	}
 
 	switch (cmd) {
@@ -1970,6 +2012,8 @@
 	case P_RS_DATA_REQUEST:
 		e->w.cb = w_e_end_rsdata_req;
 		fault_type = DRBD_FAULT_RS_RD;
+		/* used in the sector offset progress display */
+		mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
 		break;
 
 	case P_OV_REPLY:
@@ -1991,7 +2035,11 @@
 		if (cmd == P_CSUM_RS_REQUEST) {
 			D_ASSERT(mdev->agreed_pro_version >= 89);
 			e->w.cb = w_e_end_csum_rs_req;
+			/* used in the sector offset progress display */
+			mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
 		} else if (cmd == P_OV_REPLY) {
+			/* track progress, we may need to throttle */
+			atomic_add(size >> 9, &mdev->rs_sect_in);
 			e->w.cb = w_e_end_ov_reply;
 			dec_rs_pending(mdev);
 			/* drbd_rs_begin_io done when we sent this request,
@@ -2003,9 +2051,16 @@
 	case P_OV_REQUEST:
 		if (mdev->ov_start_sector == ~(sector_t)0 &&
 		    mdev->agreed_pro_version >= 90) {
+			unsigned long now = jiffies;
+			int i;
 			mdev->ov_start_sector = sector;
 			mdev->ov_position = sector;
-			mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
+			mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
+			mdev->rs_total = mdev->ov_left;
+			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+				mdev->rs_mark_left[i] = mdev->ov_left;
+				mdev->rs_mark_time[i] = now;
+			}
 			dev_info(DEV, "Online Verify start sector: %llu\n",
 					(unsigned long long)sector);
 		}
@@ -2042,9 +2097,9 @@
 	 * we would also throttle its application reads.
 	 * In that case, throttling is done on the SyncTarget only.
 	 */
-	if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
-		msleep(100);
-	if (drbd_rs_begin_io(mdev, e->sector))
+	if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
+		schedule_timeout_uninterruptible(HZ/10);
+	if (drbd_rs_begin_io(mdev, sector))
 		goto out_free_e;
 
 submit_for_resync:
@@ -2057,11 +2112,10 @@
 	spin_unlock_irq(&mdev->req_lock);
 
 	if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
-		return TRUE;
+		return true;
 
-	/* drbd_submit_ee currently fails for one reason only:
-	 * not being able to allocate enough bios.
-	 * Is dropping the connection going to help? */
+	/* don't care for the reason here */
+	dev_err(DEV, "submit failed, triggering re-connect\n");
 	spin_lock_irq(&mdev->req_lock);
 	list_del(&e->w.list);
 	spin_unlock_irq(&mdev->req_lock);
@@ -2070,7 +2124,7 @@
 out_free_e:
 	put_ldev(mdev);
 	drbd_free_ee(mdev, e);
-	return FALSE;
+	return false;
 }
 
 static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
@@ -2147,10 +2201,7 @@
 
 static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
 {
-	int self, peer, hg, rv = -100;
-
-	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
-	peer = mdev->p_uuid[UI_BITMAP] & 1;
+	int hg, rv = -100;
 
 	switch (mdev->net_conf->after_sb_1p) {
 	case ASB_DISCARD_YOUNGER_PRI:
@@ -2177,12 +2228,14 @@
 	case ASB_CALL_HELPER:
 		hg = drbd_asb_recover_0p(mdev);
 		if (hg == -1 && mdev->state.role == R_PRIMARY) {
-			self = drbd_set_role(mdev, R_SECONDARY, 0);
+			enum drbd_state_rv rv2;
+
+			drbd_set_role(mdev, R_SECONDARY, 0);
 			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
 			  * we might be here in C_WF_REPORT_PARAMS which is transient.
 			  * we do not need to wait for the after state change work either. */
-			self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
-			if (self != SS_SUCCESS) {
+			rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (rv2 != SS_SUCCESS) {
 				drbd_khelper(mdev, "pri-lost-after-sb");
 			} else {
 				dev_warn(DEV, "Successfully gave up primary role.\n");
@@ -2197,10 +2250,7 @@
 
 static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
 {
-	int self, peer, hg, rv = -100;
-
-	self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
-	peer = mdev->p_uuid[UI_BITMAP] & 1;
+	int hg, rv = -100;
 
 	switch (mdev->net_conf->after_sb_2p) {
 	case ASB_DISCARD_YOUNGER_PRI:
@@ -2220,11 +2270,13 @@
 	case ASB_CALL_HELPER:
 		hg = drbd_asb_recover_0p(mdev);
 		if (hg == -1) {
+			enum drbd_state_rv rv2;
+
 			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
 			  * we might be here in C_WF_REPORT_PARAMS which is transient.
 			  * we do not need to wait for the after state change work either. */
-			self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
-			if (self != SS_SUCCESS) {
+			rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (rv2 != SS_SUCCESS) {
 				drbd_khelper(mdev, "pri-lost-after-sb");
 			} else {
 				dev_warn(DEV, "Successfully gave up primary role.\n");
@@ -2263,6 +2315,8 @@
    -2	C_SYNC_TARGET set BitMap
  -100	after split brain, disconnect
 -1000	unrelated data
+-1091   requires proto 91
+-1096   requires proto 96
  */
 static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
 {
@@ -2292,7 +2346,7 @@
 		if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
 
 			if (mdev->agreed_pro_version < 91)
-				return -1001;
+				return -1091;
 
 			if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
 			    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
@@ -2313,7 +2367,7 @@
 		if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
 
 			if (mdev->agreed_pro_version < 91)
-				return -1001;
+				return -1091;
 
 			if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
 			    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
@@ -2358,17 +2412,22 @@
 	*rule_nr = 51;
 	peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
-		peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
-		if (self == peer) {
+		if (mdev->agreed_pro_version < 96 ?
+		    (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+		    (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+		    peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of the peer's UUIDs. */
 
 			if (mdev->agreed_pro_version < 91)
-				return -1001;
+				return -1091;
 
 			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
 			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
+
+			dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
+			drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
+
 			return -1;
 		}
 	}
@@ -2390,20 +2449,20 @@
 	*rule_nr = 71;
 	self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
-		peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
-		if (self == peer) {
+		if (mdev->agreed_pro_version < 96 ?
+		    (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+		    (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+		    self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of our UUIDs. */
 
 			if (mdev->agreed_pro_version < 91)
-				return -1001;
+				return -1091;
 
 			_drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
 			_drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
 
-			dev_info(DEV, "Undid last start of resync:\n");
-
+			dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
 			drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
 				       mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
 
@@ -2466,8 +2525,8 @@
 		dev_alert(DEV, "Unrelated data, aborting!\n");
 		return C_MASK;
 	}
-	if (hg == -1001) {
-		dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
+	if (hg < -1000) {
+		dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
 		return C_MASK;
 	}
 
@@ -2566,7 +2625,8 @@
 
 	if (abs(hg) >= 2) {
 		dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
-		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
+		if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+					BM_LOCKED_SET_ALLOWED))
 			return C_MASK;
 	}
 
@@ -2660,7 +2720,7 @@
 		unsigned char *my_alg = mdev->net_conf->integrity_alg;
 
 		if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
-			return FALSE;
+			return false;
 
 		p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
 		if (strcmp(p_integrity_alg, my_alg)) {
@@ -2671,11 +2731,11 @@
 		     my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
 	}
 
-	return TRUE;
+	return true;
 
 disconnect:
 	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-	return FALSE;
+	return false;
 }
 
 /* helper function
@@ -2707,7 +2767,7 @@
 
 static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
 {
-	int ok = TRUE;
+	int ok = true;
 	struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
 	unsigned int header_size, data_size, exp_max_sz;
 	struct crypto_hash *verify_tfm = NULL;
@@ -2725,7 +2785,7 @@
 	if (packet_size > exp_max_sz) {
 		dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
 		    packet_size, exp_max_sz);
-		return FALSE;
+		return false;
 	}
 
 	if (apv <= 88) {
@@ -2745,7 +2805,7 @@
 	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
 
 	if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
-		return FALSE;
+		return false;
 
 	mdev->sync_conf.rate	  = be32_to_cpu(p->rate);
 
@@ -2755,11 +2815,11 @@
 				dev_err(DEV, "verify-alg too long, "
 				    "peer wants %u, accepting only %u byte\n",
 						data_size, SHARED_SECRET_MAX);
-				return FALSE;
+				return false;
 			}
 
 			if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
-				return FALSE;
+				return false;
 
 			/* we expect NUL terminated string */
 			/* but just in case someone tries to be evil */
@@ -2853,7 +2913,7 @@
 	/* but free the verify_tfm again, if csums_tfm did not work out */
 	crypto_free_hash(verify_tfm);
 	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-	return FALSE;
+	return false;
 }
 
 static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
@@ -2879,7 +2939,7 @@
 {
 	struct p_sizes *p = &mdev->data.rbuf.sizes;
 	enum determine_dev_size dd = unchanged;
-	unsigned int max_seg_s;
+	unsigned int max_bio_size;
 	sector_t p_size, p_usize, my_usize;
 	int ldsc = 0; /* local disk size changed */
 	enum dds_flags ddsf;
@@ -2890,7 +2950,7 @@
 	if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
 		dev_err(DEV, "some backing storage is needed\n");
 		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-		return FALSE;
+		return false;
 	}
 
 	/* just store the peer's disk size for now.
@@ -2927,18 +2987,17 @@
 			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 			mdev->ldev->dc.disk_size = my_usize;
 			put_ldev(mdev);
-			return FALSE;
+			return false;
 		}
 		put_ldev(mdev);
 	}
-#undef min_not_zero
 
 	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(mdev)) {
 		dd = drbd_determin_dev_size(mdev, ddsf);
 		put_ldev(mdev);
 		if (dd == dev_size_error)
-			return FALSE;
+			return false;
 		drbd_md_sync(mdev);
 	} else {
 		/* I am diskless, need to accept the peer's size. */
@@ -2952,14 +3011,14 @@
 		}
 
 		if (mdev->agreed_pro_version < 94)
-			max_seg_s = be32_to_cpu(p->max_segment_size);
+			max_bio_size = be32_to_cpu(p->max_bio_size);
 		else if (mdev->agreed_pro_version == 94)
-			max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
+			max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
 		else /* drbd 8.3.8 onwards */
-			max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+			max_bio_size = DRBD_MAX_BIO_SIZE;
 
-		if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
-			drbd_setup_queue_param(mdev, max_seg_s);
+		if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
+			drbd_setup_queue_param(mdev, max_bio_size);
 
 		drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
 		put_ldev(mdev);
@@ -2985,14 +3044,14 @@
 		}
 	}
 
-	return TRUE;
+	return true;
 }
 
 static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
 {
 	struct p_uuids *p = &mdev->data.rbuf.uuids;
 	u64 *p_uuid;
-	int i;
+	int i, updated_uuids = 0;
 
 	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
 
@@ -3009,7 +3068,7 @@
 		dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
 		    (unsigned long long)mdev->ed_uuid);
 		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-		return FALSE;
+		return false;
 	}
 
 	if (get_ldev(mdev)) {
@@ -3021,19 +3080,21 @@
 		if (skip_initial_sync) {
 			dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
 			drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
-					"clear_n_write from receive_uuids");
+					"clear_n_write from receive_uuids",
+					BM_LOCKED_TEST_ALLOWED);
 			_drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
 			_drbd_uuid_set(mdev, UI_BITMAP, 0);
 			_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
 					CS_VERBOSE, NULL);
 			drbd_md_sync(mdev);
+			updated_uuids = 1;
 		}
 		put_ldev(mdev);
 	} else if (mdev->state.disk < D_INCONSISTENT &&
 		   mdev->state.role == R_PRIMARY) {
 		/* I am a diskless primary, the peer just created a new current UUID
 		   for me. */
-		drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+		updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
 	}
 
 	/* Before we test for the disk state, we should wait until an eventually
@@ -3042,9 +3103,12 @@
 	   new disk state... */
 	wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
 	if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
-		drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
+		updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
 
-	return TRUE;
+	if (updated_uuids)
+		drbd_print_uuids(mdev, "receiver updated UUIDs to");
+
+	return true;
 }
 
 /**
@@ -3081,7 +3145,7 @@
 {
 	struct p_req_state *p = &mdev->data.rbuf.req_state;
 	union drbd_state mask, val;
-	int rv;
+	enum drbd_state_rv rv;
 
 	mask.i = be32_to_cpu(p->mask);
 	val.i = be32_to_cpu(p->val);
@@ -3089,7 +3153,7 @@
 	if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
 	    test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
 		drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
-		return TRUE;
+		return true;
 	}
 
 	mask = convert_state(mask);
@@ -3100,7 +3164,7 @@
 	drbd_send_sr_reply(mdev, rv);
 	drbd_md_sync(mdev);
 
-	return TRUE;
+	return true;
 }
 
 static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -3145,7 +3209,7 @@
 			 peer_state.conn == C_CONNECTED) {
 			if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
 				drbd_resync_finished(mdev);
-			return TRUE;
+			return true;
 		}
 	}
 
@@ -3161,6 +3225,9 @@
 	if (ns.conn == C_WF_REPORT_PARAMS)
 		ns.conn = C_CONNECTED;
 
+	if (peer_state.conn == C_AHEAD)
+		ns.conn = C_BEHIND;
+
 	if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
 	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
 		int cr; /* consider resync */
@@ -3195,10 +3262,10 @@
 				real_peer_disk = D_DISKLESS;
 			} else {
 				if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
-					return FALSE;
+					return false;
 				D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
 				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-				return FALSE;
+				return false;
 			}
 		}
 	}
@@ -3223,7 +3290,7 @@
 		drbd_uuid_new_current(mdev);
 		clear_bit(NEW_CUR_UUID, &mdev->flags);
 		drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
-		return FALSE;
+		return false;
 	}
 	rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
 	ns = mdev->state;
@@ -3231,7 +3298,7 @@
 
 	if (rv < SS_SUCCESS) {
 		drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-		return FALSE;
+		return false;
 	}
 
 	if (os.conn > C_WF_REPORT_PARAMS) {
@@ -3249,7 +3316,7 @@
 
 	drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
 
-	return TRUE;
+	return true;
 }
 
 static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
@@ -3258,6 +3325,7 @@
 
 	wait_event(mdev->misc_wait,
 		   mdev->state.conn == C_WF_SYNC_UUID ||
+		   mdev->state.conn == C_BEHIND ||
 		   mdev->state.conn < C_CONNECTED ||
 		   mdev->state.disk < D_NEGOTIATING);
 
@@ -3269,32 +3337,42 @@
 		_drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
 		_drbd_uuid_set(mdev, UI_BITMAP, 0UL);
 
+		drbd_print_uuids(mdev, "updated sync uuid");
 		drbd_start_resync(mdev, C_SYNC_TARGET);
 
 		put_ldev(mdev);
 	} else
 		dev_err(DEV, "Ignoring SyncUUID packet!\n");
 
-	return TRUE;
+	return true;
 }
 
-enum receive_bitmap_ret { OK, DONE, FAILED };
-
-static enum receive_bitmap_ret
+/**
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
 receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
 		     unsigned long *buffer, struct bm_xfer_ctx *c)
 {
 	unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
 	unsigned want = num_words * sizeof(long);
+	int err;
 
 	if (want != data_size) {
 		dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
-		return FAILED;
+		return -EIO;
 	}
 	if (want == 0)
-		return DONE;
-	if (drbd_recv(mdev, buffer, want) != want)
-		return FAILED;
+		return 0;
+	err = drbd_recv(mdev, buffer, want);
+	if (err != want) {
+		if (err >= 0)
+			err = -EIO;
+		return err;
+	}
 
 	drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
 
@@ -3303,10 +3381,16 @@
 	if (c->bit_offset > c->bm_bits)
 		c->bit_offset = c->bm_bits;
 
-	return OK;
+	return 1;
 }
 
-static enum receive_bitmap_ret
+/**
+ * recv_bm_rle_bits
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
 recv_bm_rle_bits(struct drbd_conf *mdev,
 		struct p_compressed_bm *p,
 		struct bm_xfer_ctx *c)
@@ -3326,18 +3410,18 @@
 
 	bits = bitstream_get_bits(&bs, &look_ahead, 64);
 	if (bits < 0)
-		return FAILED;
+		return -EIO;
 
 	for (have = bits; have > 0; s += rl, toggle = !toggle) {
 		bits = vli_decode_bits(&rl, look_ahead);
 		if (bits <= 0)
-			return FAILED;
+			return -EIO;
 
 		if (toggle) {
 			e = s + rl -1;
 			if (e >= c->bm_bits) {
 				dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
-				return FAILED;
+				return -EIO;
 			}
 			_drbd_bm_set_bits(mdev, s, e);
 		}
@@ -3347,14 +3431,14 @@
 				have, bits, look_ahead,
 				(unsigned int)(bs.cur.b - p->code),
 				(unsigned int)bs.buf_len);
-			return FAILED;
+			return -EIO;
 		}
 		look_ahead >>= bits;
 		have -= bits;
 
 		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
 		if (bits < 0)
-			return FAILED;
+			return -EIO;
 		look_ahead |= tmp << have;
 		have += bits;
 	}
@@ -3362,10 +3446,16 @@
 	c->bit_offset = s;
 	bm_xfer_ctx_bit_to_word_offset(c);
 
-	return (s == c->bm_bits) ? DONE : OK;
+	return (s != c->bm_bits);
 }
 
-static enum receive_bitmap_ret
+/**
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
 decode_bitmap_c(struct drbd_conf *mdev,
 		struct p_compressed_bm *p,
 		struct bm_xfer_ctx *c)
@@ -3379,7 +3469,7 @@
 
 	dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
 	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
-	return FAILED;
+	return -EIO;
 }
 
 void INFO_bm_xfer_stats(struct drbd_conf *mdev,
@@ -3428,13 +3518,13 @@
 {
 	struct bm_xfer_ctx c;
 	void *buffer;
-	enum receive_bitmap_ret ret;
-	int ok = FALSE;
+	int err;
+	int ok = false;
 	struct p_header80 *h = &mdev->data.rbuf.header.h80;
 
-	wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
-
-	drbd_bm_lock(mdev, "receive bitmap");
+	drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+	/* you are supposed to send additional out-of-sync information
+	 * if you actually set bits during this phase */
 
 	/* maybe we should use some per thread scratch page,
 	 * and allocate that during initial device creation? */
@@ -3449,9 +3539,9 @@
 		.bm_words = drbd_bm_words(mdev),
 	};
 
-	do {
+	for(;;) {
 		if (cmd == P_BITMAP) {
-			ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
+			err = receive_bitmap_plain(mdev, data_size, buffer, &c);
 		} else if (cmd == P_COMPRESSED_BITMAP) {
 			/* MAYBE: sanity check that we speak proto >= 90,
 			 * and the feature is enabled! */
@@ -3468,9 +3558,9 @@
 				goto out;
 			if (data_size <= (sizeof(*p) - sizeof(p->head))) {
 				dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
-				return FAILED;
+				goto out;
 			}
-			ret = decode_bitmap_c(mdev, p, &c);
+			err = decode_bitmap_c(mdev, p, &c);
 		} else {
 			dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
 			goto out;
@@ -3479,24 +3569,26 @@
 		c.packets[cmd == P_BITMAP]++;
 		c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
 
-		if (ret != OK)
+		if (err <= 0) {
+			if (err < 0)
+				goto out;
 			break;
-
+		}
 		if (!drbd_recv_header(mdev, &cmd, &data_size))
 			goto out;
-	} while (ret == OK);
-	if (ret == FAILED)
-		goto out;
+	}
 
 	INFO_bm_xfer_stats(mdev, "receive", &c);
 
 	if (mdev->state.conn == C_WF_BITMAP_T) {
+		enum drbd_state_rv rv;
+
 		ok = !drbd_send_bitmap(mdev);
 		if (!ok)
 			goto out;
 		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
-		ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
-		D_ASSERT(ok == SS_SUCCESS);
+		rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		D_ASSERT(rv == SS_SUCCESS);
 	} else if (mdev->state.conn != C_WF_BITMAP_S) {
 		/* admin may have requested C_DISCONNECTING,
 		 * other threads may have noticed network errors */
@@ -3504,7 +3596,7 @@
 		    drbd_conn_str(mdev->state.conn));
 	}
 
-	ok = TRUE;
+	ok = true;
  out:
 	drbd_bm_unlock(mdev);
 	if (ok && mdev->state.conn == C_WF_BITMAP_S)
@@ -3538,7 +3630,26 @@
 	 * with the data requests being unplugged */
 	drbd_tcp_quickack(mdev->data.socket);
 
-	return TRUE;
+	return true;
+}
+
+static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+{
+	struct p_block_desc *p = &mdev->data.rbuf.block_desc;
+
+	switch (mdev->state.conn) {
+	case C_WF_SYNC_UUID:
+	case C_WF_BITMAP_T:
+	case C_BEHIND:
+			break;
+	default:
+		dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+				drbd_conn_str(mdev->state.conn));
+	}
+
+	drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+	return true;
 }
 
 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
@@ -3571,6 +3682,7 @@
 	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
 	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
 	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
+	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
 	/* anything missing from this table is in
 	 * the asender_tbl, see get_asender_cmd */
 	[P_MAX_CMD]	    = { 0, 0, NULL },
@@ -3610,7 +3722,8 @@
 		if (shs) {
 			rv = drbd_recv(mdev, &header->h80.payload, shs);
 			if (unlikely(rv != shs)) {
-				dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
+				if (!signal_pending(current))
+					dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
 				goto err_out;
 			}
 		}
@@ -3682,9 +3795,6 @@
 
 	if (mdev->state.conn == C_STANDALONE)
 		return;
-	if (mdev->state.conn >= C_WF_CONNECTION)
-		dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
-				drbd_conn_str(mdev->state.conn));
 
 	/* asender does not clean up anything. it must not interfere, either */
 	drbd_thread_stop(&mdev->asender);
@@ -3713,6 +3823,8 @@
 	atomic_set(&mdev->rs_pending_cnt, 0);
 	wake_up(&mdev->misc_wait);
 
+	del_timer(&mdev->request_timer);
+
 	/* make sure syncer is stopped and w_resume_next_sg queued */
 	del_timer_sync(&mdev->resync_timer);
 	resync_timer_fn((unsigned long)mdev);
@@ -3758,13 +3870,6 @@
 	if (os.conn == C_DISCONNECTING) {
 		wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
 
-		if (!is_susp(mdev->state)) {
-			/* we must not free the tl_hash
-			 * while application io is still on the fly */
-			wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
-			drbd_free_tl_hash(mdev);
-		}
-
 		crypto_free_hash(mdev->cram_hmac_tfm);
 		mdev->cram_hmac_tfm = NULL;
 
@@ -3773,6 +3878,10 @@
 		drbd_request_state(mdev, NS(conn, C_STANDALONE));
 	}
 
+	/* serialize with bitmap writeout triggered by the state change,
+	 * if any. */
+	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+
 	/* tcp_close and release of sendpage pages can be deferred.  I don't
 	 * want to use SO_LINGER, because apparently it can be deferred for
 	 * more than 20 seconds (longest time I checked).
@@ -3873,7 +3982,8 @@
 	rv = drbd_recv(mdev, &p->head.payload, expect);
 
 	if (rv != expect) {
-		dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
 		return 0;
 	}
 
@@ -3975,7 +4085,8 @@
 	rv = drbd_recv(mdev, peers_ch, length);
 
 	if (rv != length) {
-		dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
 		rv = 0;
 		goto fail;
 	}
@@ -4022,7 +4133,8 @@
 	rv = drbd_recv(mdev, response , resp_size);
 
 	if (rv != resp_size) {
-		dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+		if (!signal_pending(current))
+			dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
 		rv = 0;
 		goto fail;
 	}
@@ -4074,8 +4186,7 @@
 		h = drbd_connect(mdev);
 		if (h == 0) {
 			drbd_disconnect(mdev);
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
+			schedule_timeout_interruptible(HZ);
 		}
 		if (h == -1) {
 			dev_warn(DEV, "Discarding network configuration.\n");
@@ -4113,7 +4224,7 @@
 	}
 	wake_up(&mdev->state_wait);
 
-	return TRUE;
+	return true;
 }
 
 static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4129,7 +4240,7 @@
 	if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
 		wake_up(&mdev->misc_wait);
 
-	return TRUE;
+	return true;
 }
 
 static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4152,7 +4263,7 @@
 	dec_rs_pending(mdev);
 	atomic_add(blksize >> 9, &mdev->rs_sect_in);
 
-	return TRUE;
+	return true;
 }
 
 /* when we receive the ACK for a write request,
@@ -4176,8 +4287,6 @@
 			return req;
 		}
 	}
-	dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
-		(void *)(unsigned long)id, (unsigned long long)sector);
 	return NULL;
 }
 
@@ -4195,15 +4304,17 @@
 	req = validator(mdev, id, sector);
 	if (unlikely(!req)) {
 		spin_unlock_irq(&mdev->req_lock);
-		dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
-		return FALSE;
+
+		dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
+			(void *)(unsigned long)id, (unsigned long long)sector);
+		return false;
 	}
 	__req_mod(req, what, &m);
 	spin_unlock_irq(&mdev->req_lock);
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
-	return TRUE;
+	return true;
 }
 
 static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4218,7 +4329,7 @@
 	if (is_syncer_block_id(p->block_id)) {
 		drbd_set_in_sync(mdev, sector, blksize);
 		dec_rs_pending(mdev);
-		return TRUE;
+		return true;
 	}
 	switch (be16_to_cpu(h->command)) {
 	case P_RS_WRITE_ACK:
@@ -4239,7 +4350,7 @@
 		break;
 	default:
 		D_ASSERT(0);
-		return FALSE;
+		return false;
 	}
 
 	return validate_req_change_req_state(mdev, p->block_id, sector,
@@ -4250,20 +4361,44 @@
 {
 	struct p_block_ack *p = (struct p_block_ack *)h;
 	sector_t sector = be64_to_cpu(p->sector);
-
-	if (__ratelimit(&drbd_ratelimit_state))
-		dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
+	int size = be32_to_cpu(p->blksize);
+	struct drbd_request *req;
+	struct bio_and_error m;
 
 	update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
 	if (is_syncer_block_id(p->block_id)) {
-		int size = be32_to_cpu(p->blksize);
 		dec_rs_pending(mdev);
 		drbd_rs_failed_io(mdev, sector, size);
-		return TRUE;
+		return true;
 	}
-	return validate_req_change_req_state(mdev, p->block_id, sector,
-		_ack_id_to_req, __func__ , neg_acked);
+
+	spin_lock_irq(&mdev->req_lock);
+	req = _ack_id_to_req(mdev, p->block_id, sector);
+	if (!req) {
+		spin_unlock_irq(&mdev->req_lock);
+		if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
+		    mdev->net_conf->wire_protocol == DRBD_PROT_B) {
+			/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+			   The master bio might already be completed, therefore the
+			   request is no longer in the collision hash.
+			   => Do not try to validate block_id as request. */
+			/* In Protocol B we might already have got a P_RECV_ACK
+			   but then get a P_NEG_ACK after wards. */
+			drbd_set_out_of_sync(mdev, sector, size);
+			return true;
+		} else {
+			dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
+				(void *)(unsigned long)p->block_id, (unsigned long long)sector);
+			return false;
+		}
+	}
+	__req_mod(req, neg_acked, &m);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (m.bio)
+		complete_master_bio(mdev, &m);
+	return true;
 }
 
 static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4294,11 +4429,20 @@
 
 	if (get_ldev_if_state(mdev, D_FAILED)) {
 		drbd_rs_complete_io(mdev, sector);
-		drbd_rs_failed_io(mdev, sector, size);
+		switch (be16_to_cpu(h->command)) {
+		case P_NEG_RS_DREPLY:
+			drbd_rs_failed_io(mdev, sector, size);
+		case P_RS_CANCEL:
+			break;
+		default:
+			D_ASSERT(0);
+			put_ldev(mdev);
+			return false;
+		}
 		put_ldev(mdev);
 	}
 
-	return TRUE;
+	return true;
 }
 
 static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4307,7 +4451,14 @@
 
 	tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
 
-	return TRUE;
+	if (mdev->state.conn == C_AHEAD &&
+	    atomic_read(&mdev->ap_in_flight) == 0 &&
+	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
+		mdev->start_resync_timer.expires = jiffies + HZ;
+		add_timer(&mdev->start_resync_timer);
+	}
+
+	return true;
 }
 
 static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
@@ -4328,12 +4479,18 @@
 		ov_oos_print(mdev);
 
 	if (!get_ldev(mdev))
-		return TRUE;
+		return true;
 
 	drbd_rs_complete_io(mdev, sector);
 	dec_rs_pending(mdev);
 
-	if (--mdev->ov_left == 0) {
+	--mdev->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((mdev->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(mdev, mdev->ov_left);
+
+	if (mdev->ov_left == 0) {
 		w = kmalloc(sizeof(*w), GFP_NOIO);
 		if (w) {
 			w->cb = w_ov_finished;
@@ -4345,12 +4502,12 @@
 		}
 	}
 	put_ldev(mdev);
-	return TRUE;
+	return true;
 }
 
 static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
 {
-	return TRUE;
+	return true;
 }
 
 struct asender_cmd {
@@ -4378,6 +4535,7 @@
 	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
 	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
 	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
+	[P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply},
 	[P_MAX_CMD]	    = { 0, NULL },
 	};
 	if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index ad3fc62..5c0c8be 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -140,9 +140,14 @@
 	struct hlist_node *n;
 	struct hlist_head *slot;
 
-	/* before we can signal completion to the upper layers,
-	 * we may need to close the current epoch */
+	/* Before we can signal completion to the upper layers,
+	 * we may need to close the current epoch.
+	 * We can skip this, if this request has not even been sent, because we
+	 * did not have a fully established connection yet/anymore, during
+	 * bitmap exchange, or while we are C_AHEAD due to congestion policy.
+	 */
 	if (mdev->state.conn >= C_CONNECTED &&
+	    (s & RQ_NET_SENT) != 0 &&
 	    req->epoch == mdev->newest_tle->br_number)
 		queue_barrier(mdev);
 
@@ -440,7 +445,7 @@
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
-		__drbd_chk_io_error(mdev, FALSE);
+		__drbd_chk_io_error(mdev, false);
 		_req_may_be_done_not_susp(req, m);
 		put_ldev(mdev);
 		break;
@@ -461,7 +466,7 @@
 
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 
-		__drbd_chk_io_error(mdev, FALSE);
+		__drbd_chk_io_error(mdev, false);
 		put_ldev(mdev);
 
 		/* no point in retrying if there is no good remote data,
@@ -545,6 +550,14 @@
 
 		break;
 
+	case queue_for_send_oos:
+		req->rq_state |= RQ_NET_QUEUED;
+		req->w.cb =  w_send_oos;
+		drbd_queue_work(&mdev->data.work, &req->w);
+		break;
+
+	case oos_handed_to_network:
+		/* actually the same */
 	case send_canceled:
 		/* treat it the same */
 	case send_failed:
@@ -558,6 +571,9 @@
 
 	case handed_over_to_network:
 		/* assert something? */
+		if (bio_data_dir(req->master_bio) == WRITE)
+			atomic_add(req->size>>9, &mdev->ap_in_flight);
+
 		if (bio_data_dir(req->master_bio) == WRITE &&
 		    mdev->net_conf->wire_protocol == DRBD_PROT_A) {
 			/* this is what is dangerous about protocol A:
@@ -591,6 +607,9 @@
 			dec_ap_pending(mdev);
 		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
 		req->rq_state |= RQ_NET_DONE;
+		if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
+			atomic_sub(req->size>>9, &mdev->ap_in_flight);
+
 		/* if it is still queued, we may not complete it here.
 		 * it will be canceled soon. */
 		if (!(req->rq_state & RQ_NET_QUEUED))
@@ -628,14 +647,17 @@
 		req->rq_state |= RQ_NET_OK;
 		D_ASSERT(req->rq_state & RQ_NET_PENDING);
 		dec_ap_pending(mdev);
+		atomic_sub(req->size>>9, &mdev->ap_in_flight);
 		req->rq_state &= ~RQ_NET_PENDING;
 		_req_may_be_done_not_susp(req, m);
 		break;
 
 	case neg_acked:
 		/* assert something? */
-		if (req->rq_state & RQ_NET_PENDING)
+		if (req->rq_state & RQ_NET_PENDING) {
 			dec_ap_pending(mdev);
+			atomic_sub(req->size>>9, &mdev->ap_in_flight);
+		}
 		req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
 
 		req->rq_state |= RQ_NET_DONE;
@@ -690,8 +712,11 @@
 			dev_err(DEV, "FIXME (barrier_acked but pending)\n");
 			list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
 		}
-		D_ASSERT(req->rq_state & RQ_NET_SENT);
-		req->rq_state |= RQ_NET_DONE;
+		if ((req->rq_state & RQ_NET_MASK) != 0) {
+			req->rq_state |= RQ_NET_DONE;
+			if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
+				atomic_sub(req->size>>9, &mdev->ap_in_flight);
+		}
 		_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
@@ -738,14 +763,14 @@
 	return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
 }
 
-static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
+static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
 {
 	const int rw = bio_rw(bio);
 	const int size = bio->bi_size;
 	const sector_t sector = bio->bi_sector;
 	struct drbd_tl_epoch *b = NULL;
 	struct drbd_request *req;
-	int local, remote;
+	int local, remote, send_oos = 0;
 	int err = -EIO;
 	int ret = 0;
 
@@ -759,6 +784,7 @@
 		bio_endio(bio, -ENOMEM);
 		return 0;
 	}
+	req->start_time = start_time;
 
 	local = get_ldev(mdev);
 	if (!local) {
@@ -808,9 +834,9 @@
 		drbd_al_begin_io(mdev, sector);
 	}
 
-	remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
-			    (mdev->state.pdsk == D_INCONSISTENT &&
-			     mdev->state.conn >= C_CONNECTED));
+	remote = remote && drbd_should_do_remote(mdev->state);
+	send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+	D_ASSERT(!(remote && send_oos));
 
 	if (!(local || remote) && !is_susp(mdev->state)) {
 		if (__ratelimit(&drbd_ratelimit_state))
@@ -824,7 +850,7 @@
 	 * but there is a race between testing the bit and pointer outside the
 	 * spinlock, and grabbing the spinlock.
 	 * if we lost that race, we retry.  */
-	if (rw == WRITE && remote &&
+	if (rw == WRITE && (remote || send_oos) &&
 	    mdev->unused_spare_tle == NULL &&
 	    test_bit(CREATE_BARRIER, &mdev->flags)) {
 allocate_barrier:
@@ -842,18 +868,19 @@
 	if (is_susp(mdev->state)) {
 		/* If we got suspended, use the retry mechanism of
 		   generic_make_request() to restart processing of this
-		   bio. In the next call to drbd_make_request_26
+		   bio. In the next call to drbd_make_request
 		   we sleep in inc_ap_bio() */
 		ret = 1;
 		spin_unlock_irq(&mdev->req_lock);
 		goto fail_free_complete;
 	}
 
-	if (remote) {
-		remote = (mdev->state.pdsk == D_UP_TO_DATE ||
-			    (mdev->state.pdsk == D_INCONSISTENT &&
-			     mdev->state.conn >= C_CONNECTED));
-		if (!remote)
+	if (remote || send_oos) {
+		remote = drbd_should_do_remote(mdev->state);
+		send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+		D_ASSERT(!(remote && send_oos));
+
+		if (!(remote || send_oos))
 			dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
 		if (!(local || remote)) {
 			dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
@@ -866,7 +893,7 @@
 		mdev->unused_spare_tle = b;
 		b = NULL;
 	}
-	if (rw == WRITE && remote &&
+	if (rw == WRITE && (remote || send_oos) &&
 	    mdev->unused_spare_tle == NULL &&
 	    test_bit(CREATE_BARRIER, &mdev->flags)) {
 		/* someone closed the current epoch
@@ -889,7 +916,7 @@
 	 * barrier packet.  To get the write ordering right, we only have to
 	 * make sure that, if this is a write request and it triggered a
 	 * barrier packet, this request is queued within the same spinlock. */
-	if (remote && mdev->unused_spare_tle &&
+	if ((remote || send_oos) && mdev->unused_spare_tle &&
 	    test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
 		_tl_add_barrier(mdev, mdev->unused_spare_tle);
 		mdev->unused_spare_tle = NULL;
@@ -937,6 +964,34 @@
 				? queue_for_net_write
 				: queue_for_net_read);
 	}
+	if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
+		_req_mod(req, queue_for_send_oos);
+
+	if (remote &&
+	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) {
+		int congested = 0;
+
+		if (mdev->net_conf->cong_fill &&
+		    atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+			dev_info(DEV, "Congestion-fill threshold reached\n");
+			congested = 1;
+		}
+
+		if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+			dev_info(DEV, "Congestion-extents threshold reached\n");
+			congested = 1;
+		}
+
+		if (congested) {
+			queue_barrier(mdev); /* last barrier, after mirrored writes */
+
+			if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+				_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
+			else  /*mdev->net_conf->on_congestion == OC_DISCONNECT */
+				_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
+		}
+	}
+
 	spin_unlock_irq(&mdev->req_lock);
 	kfree(b); /* if someone else has beaten us to it... */
 
@@ -949,9 +1004,9 @@
 		 * stable storage, and this is a WRITE, we may not even submit
 		 * this bio. */
 		if (get_ldev(mdev)) {
-			if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
-					     : rw == READ  ? DRBD_FAULT_DT_RD
-					     :               DRBD_FAULT_DT_RA))
+			if (drbd_insert_fault(mdev,   rw == WRITE ? DRBD_FAULT_DT_WR
+						    : rw == READ  ? DRBD_FAULT_DT_RD
+						    :               DRBD_FAULT_DT_RA))
 				bio_endio(req->private_bio, -EIO);
 			else
 				generic_make_request(req->private_bio);
@@ -1018,16 +1073,19 @@
 	return 0;
 }
 
-int drbd_make_request_26(struct request_queue *q, struct bio *bio)
+int drbd_make_request(struct request_queue *q, struct bio *bio)
 {
 	unsigned int s_enr, e_enr;
 	struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
+	unsigned long start_time;
 
 	if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
 		bio_endio(bio, -EPERM);
 		return 0;
 	}
 
+	start_time = jiffies;
+
 	/*
 	 * what we "blindly" assume:
 	 */
@@ -1042,12 +1100,12 @@
 
 	if (likely(s_enr == e_enr)) {
 		inc_ap_bio(mdev, 1);
-		return drbd_make_request_common(mdev, bio);
+		return drbd_make_request_common(mdev, bio, start_time);
 	}
 
 	/* can this bio be split generically?
 	 * Maybe add our own split-arbitrary-bios function. */
-	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
+	if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
 		/* rather error out here than BUG in bio_split */
 		dev_err(DEV, "bio would need to, but cannot, be split: "
 		    "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
@@ -1069,11 +1127,7 @@
 		const int sps = 1 << HT_SHIFT; /* sectors per slot */
 		const int mask = sps - 1;
 		const sector_t first_sectors = sps - (sect & mask);
-		bp = bio_split(bio,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-				bio_split_pool,
-#endif
-				first_sectors);
+		bp = bio_split(bio, first_sectors);
 
 		/* we need to get a "reference count" (ap_bio_cnt)
 		 * to avoid races with the disconnect/reconnect/suspend code.
@@ -1084,10 +1138,10 @@
 
 		D_ASSERT(e_enr == s_enr + 1);
 
-		while (drbd_make_request_common(mdev, &bp->bio1))
+		while (drbd_make_request_common(mdev, &bp->bio1, start_time))
 			inc_ap_bio(mdev, 1);
 
-		while (drbd_make_request_common(mdev, &bp->bio2))
+		while (drbd_make_request_common(mdev, &bp->bio2, start_time))
 			inc_ap_bio(mdev, 1);
 
 		dec_ap_bio(mdev);
@@ -1098,7 +1152,7 @@
 }
 
 /* This is called by bio_add_page().  With this function we reduce
- * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
+ * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
  * units (was AL_EXTENTs).
  *
  * we do the calculation within the lower 32bit of the byte offsets,
@@ -1108,7 +1162,7 @@
  * As long as the BIO is empty we have to allow at least one bvec,
  * regardless of size and offset.  so the resulting bio may still
  * cross extent boundaries.  those are dealt with (bio_split) in
- * drbd_make_request_26.
+ * drbd_make_request.
  */
 int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
 {
@@ -1118,8 +1172,8 @@
 	unsigned int bio_size = bvm->bi_size;
 	int limit, backing_limit;
 
-	limit = DRBD_MAX_SEGMENT_SIZE
-	      - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
+	limit = DRBD_MAX_BIO_SIZE
+	      - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
 	if (limit < 0)
 		limit = 0;
 	if (bio_size == 0) {
@@ -1136,3 +1190,42 @@
 	}
 	return limit;
 }
+
+void request_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+	struct drbd_request *req; /* oldest request */
+	struct list_head *le;
+	unsigned long et = 0; /* effective timeout = ko_count * timeout */
+
+	if (get_net_conf(mdev)) {
+		et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+		put_net_conf(mdev);
+	}
+	if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+		return; /* Recurring timer stopped */
+
+	spin_lock_irq(&mdev->req_lock);
+	le = &mdev->oldest_tle->requests;
+	if (list_empty(le)) {
+		spin_unlock_irq(&mdev->req_lock);
+		mod_timer(&mdev->request_timer, jiffies + et);
+		return;
+	}
+
+	le = le->prev;
+	req = list_entry(le, struct drbd_request, tl_requests);
+	if (time_is_before_eq_jiffies(req->start_time + et)) {
+		if (req->rq_state & RQ_NET_PENDING) {
+			dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+			_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
+		} else {
+			dev_warn(DEV, "Local backing block device frozen?\n");
+			mod_timer(&mdev->request_timer, jiffies + et);
+		}
+	} else {
+		mod_timer(&mdev->request_timer, req->start_time + et);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index ab2bd09..32e2c3e 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -82,14 +82,16 @@
 	to_be_submitted,
 
 	/* XXX yes, now I am inconsistent...
-	 * these two are not "events" but "actions"
+	 * these are not "events" but "actions"
 	 * oh, well... */
 	queue_for_net_write,
 	queue_for_net_read,
+	queue_for_send_oos,
 
 	send_canceled,
 	send_failed,
 	handed_over_to_network,
+	oos_handed_to_network,
 	connection_lost_while_pending,
 	read_retry_remote_canceled,
 	recv_acked_by_peer,
@@ -289,7 +291,6 @@
 		req->epoch       = 0;
 		req->sector      = bio_src->bi_sector;
 		req->size        = bio_src->bi_size;
-		req->start_time  = jiffies;
 		INIT_HLIST_NODE(&req->colision);
 		INIT_LIST_HEAD(&req->tl_requests);
 		INIT_LIST_HEAD(&req->w.list);
@@ -321,6 +322,7 @@
 		struct bio_and_error *m);
 extern void complete_master_bio(struct drbd_conf *mdev,
 		struct bio_and_error *m);
+extern void request_timer_fn(unsigned long data);
 
 /* use this if you don't want to deal with calling complete_master_bio()
  * outside the spinlock, e.g. when walking some list on cleanup. */
@@ -338,23 +340,43 @@
 	return rv;
 }
 
-/* completion of master bio is outside of spinlock.
- * If you need it irqsave, do it your self!
- * Which means: don't use from bio endio callback. */
+/* completion of master bio is outside of our spinlock.
+ * We still may or may not be inside some irqs disabled section
+ * of the lower level driver completion callback, so we need to
+ * spin_lock_irqsave here. */
 static inline int req_mod(struct drbd_request *req,
 		enum drbd_req_event what)
 {
+	unsigned long flags;
 	struct drbd_conf *mdev = req->mdev;
 	struct bio_and_error m;
 	int rv;
 
-	spin_lock_irq(&mdev->req_lock);
+	spin_lock_irqsave(&mdev->req_lock, flags);
 	rv = __req_mod(req, what, &m);
-	spin_unlock_irq(&mdev->req_lock);
+	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
 
 	return rv;
 }
+
+static inline bool drbd_should_do_remote(union drbd_state s)
+{
+	return s.pdsk == D_UP_TO_DATE ||
+		(s.pdsk >= D_INCONSISTENT &&
+		 s.conn >= C_WF_BITMAP_T &&
+		 s.conn < C_AHEAD);
+	/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+	   That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+	   states. */
+}
+static inline bool drbd_should_send_oos(union drbd_state s)
+{
+	return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+	/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+	   since we enter state C_AHEAD only if proto >= 96 */
+}
+
 #endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 85179e1..c44a2a6 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -48,6 +48,8 @@
 	[C_PAUSED_SYNC_T]    = "PausedSyncT",
 	[C_VERIFY_S]         = "VerifyS",
 	[C_VERIFY_T]         = "VerifyT",
+	[C_AHEAD]            = "Ahead",
+	[C_BEHIND]           = "Behind",
 };
 
 static const char *drbd_role_s_names[] = {
@@ -92,7 +94,7 @@
 const char *drbd_conn_str(enum drbd_conns s)
 {
 	/* enums are unsigned... */
-	return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
+	return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
 }
 
 const char *drbd_role_str(enum drbd_role s)
@@ -105,7 +107,7 @@
 	return s > D_UP_TO_DATE    ? "TOO_LARGE" : drbd_disk_s_names[s];
 }
 
-const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
 {
 	return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
 	       err > SS_TWO_PRIMARIES ? "TOO_LARGE"
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index e027446..f7e6c92 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -39,18 +39,17 @@
 #include "drbd_req.h"
 
 static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
+static int w_make_resync_request(struct drbd_conf *mdev,
+				 struct drbd_work *w, int cancel);
 
 
 
-/* defined here:
-   drbd_md_io_complete
-   drbd_endio_sec
-   drbd_endio_pri
-
- * more endio handlers:
-   atodb_endio in drbd_actlog.c
-   drbd_bm_async_io_complete in drbd_bitmap.c
-
+/* endio handlers:
+ *   drbd_md_io_complete (defined here)
+ *   drbd_endio_pri (defined here)
+ *   drbd_endio_sec (defined here)
+ *   bm_async_io_complete (defined in drbd_bitmap.c)
+ *
  * For all these callbacks, note the following:
  * The callbacks will be called in irq context by the IDE drivers,
  * and in Softirqs/Tasklets/BH context by the SCSI drivers.
@@ -94,7 +93,7 @@
 	if (list_empty(&mdev->read_ee))
 		wake_up(&mdev->ee_wait);
 	if (test_bit(__EE_WAS_ERROR, &e->flags))
-		__drbd_chk_io_error(mdev, FALSE);
+		__drbd_chk_io_error(mdev, false);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	drbd_queue_work(&mdev->data.work, &e->w);
@@ -137,7 +136,7 @@
 		: list_empty(&mdev->active_ee);
 
 	if (test_bit(__EE_WAS_ERROR, &e->flags))
-		__drbd_chk_io_error(mdev, FALSE);
+		__drbd_chk_io_error(mdev, false);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
 
 	if (is_syncer_req)
@@ -163,14 +162,15 @@
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 	int is_write = bio_data_dir(bio) == WRITE;
 
-	if (error)
+	if (error && __ratelimit(&drbd_ratelimit_state))
 		dev_warn(DEV, "%s: error=%d s=%llus\n",
 				is_write ? "write" : "read", error,
 				(unsigned long long)e->sector);
 	if (!error && !uptodate) {
-		dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
-				is_write ? "write" : "read",
-				(unsigned long long)e->sector);
+		if (__ratelimit(&drbd_ratelimit_state))
+			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
+					is_write ? "write" : "read",
+					(unsigned long long)e->sector);
 		/* strange behavior of some lower level drivers...
 		 * fail the request by clearing the uptodate flag,
 		 * but do not return any error?! */
@@ -250,13 +250,6 @@
 	return w_send_read_req(mdev, w, 0);
 }
 
-int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
-	ERR_IF(cancel) return 1;
-	dev_err(DEV, "resync inactive, but callback triggered??\n");
-	return 1; /* Simply ignore this! */
-}
-
 void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
 {
 	struct hash_desc desc;
@@ -355,7 +348,7 @@
 	if (!get_ldev(mdev))
 		return -EIO;
 
-	if (drbd_rs_should_slow_down(mdev))
+	if (drbd_rs_should_slow_down(mdev, sector))
 		goto defer;
 
 	/* GFP_TRY, because if there is no memory available right now, this may
@@ -373,9 +366,10 @@
 	if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
 		return 0;
 
-	/* drbd_submit_ee currently fails for one reason only:
-	 * not being able to allocate enough bios.
-	 * Is dropping the connection going to help? */
+	/* If it failed because of ENOMEM, retry should help.  If it failed
+	 * because bio_add_page failed (probably broken lower level driver),
+	 * retry may or may not help.
+	 * If it does not, you may need to force disconnect. */
 	spin_lock_irq(&mdev->req_lock);
 	list_del(&e->w.list);
 	spin_unlock_irq(&mdev->req_lock);
@@ -386,26 +380,25 @@
 	return -EAGAIN;
 }
 
+int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	switch (mdev->state.conn) {
+	case C_VERIFY_S:
+		w_make_ov_request(mdev, w, cancel);
+		break;
+	case C_SYNC_TARGET:
+		w_make_resync_request(mdev, w, cancel);
+		break;
+	}
+
+	return 1;
+}
+
 void resync_timer_fn(unsigned long data)
 {
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
-	int queue;
 
-	queue = 1;
-	switch (mdev->state.conn) {
-	case C_VERIFY_S:
-		mdev->resync_work.cb = w_make_ov_request;
-		break;
-	case C_SYNC_TARGET:
-		mdev->resync_work.cb = w_make_resync_request;
-		break;
-	default:
-		queue = 0;
-		mdev->resync_work.cb = w_resync_inactive;
-	}
-
-	/* harmless race: list_empty outside data.work.q_lock */
-	if (list_empty(&mdev->resync_work.list) && queue)
+	if (list_empty(&mdev->resync_work.list))
 		drbd_queue_work(&mdev->data.work, &mdev->resync_work);
 }
 
@@ -438,7 +431,7 @@
 		fb->values[i] += value;
 }
 
-int drbd_rs_controller(struct drbd_conf *mdev)
+static int drbd_rs_controller(struct drbd_conf *mdev)
 {
 	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
 	unsigned int want;     /* The number of sectors we want in the proxy */
@@ -492,29 +485,36 @@
 	return req_sect;
 }
 
-int w_make_resync_request(struct drbd_conf *mdev,
-		struct drbd_work *w, int cancel)
+static int drbd_rs_number_requests(struct drbd_conf *mdev)
+{
+	int number;
+	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
+		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+	} else {
+		mdev->c_sync_rate = mdev->sync_conf.rate;
+		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	}
+
+	/* ignore the amount of pending requests, the resync controller should
+	 * throttle down to incoming reply rate soon enough anyways. */
+	return number;
+}
+
+static int w_make_resync_request(struct drbd_conf *mdev,
+				 struct drbd_work *w, int cancel)
 {
 	unsigned long bit;
 	sector_t sector;
 	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
-	int max_segment_size;
-	int number, rollback_i, size, pe, mx;
+	int max_bio_size;
+	int number, rollback_i, size;
 	int align, queued, sndbuf;
 	int i = 0;
 
 	if (unlikely(cancel))
 		return 1;
 
-	if (unlikely(mdev->state.conn < C_CONNECTED)) {
-		dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
-		return 0;
-	}
-
-	if (mdev->state.conn != C_SYNC_TARGET)
-		dev_err(DEV, "%s in w_make_resync_request\n",
-			drbd_conn_str(mdev->state.conn));
-
 	if (mdev->rs_total == 0) {
 		/* empty resync? */
 		drbd_resync_finished(mdev);
@@ -527,49 +527,19 @@
 		   to continue resync with a broken disk makes no sense at
 		   all */
 		dev_err(DEV, "Disk broke down during resync!\n");
-		mdev->resync_work.cb = w_resync_inactive;
 		return 1;
 	}
 
 	/* starting with drbd 8.3.8, we can handle multi-bio EEs,
 	 * if it should be necessary */
-	max_segment_size =
-		mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
-		mdev->agreed_pro_version < 95 ?	DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
+	max_bio_size =
+		mdev->agreed_pro_version < 94 ? queue_max_hw_sectors(mdev->rq_queue) << 9 :
+		mdev->agreed_pro_version < 95 ?	DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_BIO_SIZE;
 
-	if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
-		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
-		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
-	} else {
-		mdev->c_sync_rate = mdev->sync_conf.rate;
-		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
-	}
-
-	/* Throttle resync on lower level disk activity, which may also be
-	 * caused by application IO on Primary/SyncTarget.
-	 * Keep this after the call to drbd_rs_controller, as that assumes
-	 * to be called as precisely as possible every SLEEP_TIME,
-	 * and would be confused otherwise. */
-	if (drbd_rs_should_slow_down(mdev))
+	number = drbd_rs_number_requests(mdev);
+	if (number == 0)
 		goto requeue;
 
-	mutex_lock(&mdev->data.mutex);
-	if (mdev->data.socket)
-		mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
-	else
-		mx = 1;
-	mutex_unlock(&mdev->data.mutex);
-
-	/* For resync rates >160MB/sec, allow more pending RS requests */
-	if (number > mx)
-		mx = number;
-
-	/* Limit the number of pending RS requests to no more than the peer's receive buffer */
-	pe = atomic_read(&mdev->rs_pending_cnt);
-	if ((pe + number) > mx) {
-		number = mx - pe;
-	}
-
 	for (i = 0; i < number; i++) {
 		/* Stop generating RS requests, when half of the send buffer is filled */
 		mutex_lock(&mdev->data.mutex);
@@ -588,16 +558,16 @@
 		size = BM_BLOCK_SIZE;
 		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
 
-		if (bit == -1UL) {
+		if (bit == DRBD_END_OF_BITMAP) {
 			mdev->bm_resync_fo = drbd_bm_bits(mdev);
-			mdev->resync_work.cb = w_resync_inactive;
 			put_ldev(mdev);
 			return 1;
 		}
 
 		sector = BM_BIT_TO_SECT(bit);
 
-		if (drbd_try_rs_begin_io(mdev, sector)) {
+		if (drbd_rs_should_slow_down(mdev, sector) ||
+		    drbd_try_rs_begin_io(mdev, sector)) {
 			mdev->bm_resync_fo = bit;
 			goto requeue;
 		}
@@ -608,7 +578,7 @@
 			goto next_sector;
 		}
 
-#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
 		/* try to find some adjacent bits.
 		 * we stop if we have already the maximum req size.
 		 *
@@ -618,7 +588,7 @@
 		align = 1;
 		rollback_i = i;
 		for (;;) {
-			if (size + BM_BLOCK_SIZE > max_segment_size)
+			if (size + BM_BLOCK_SIZE > max_bio_size)
 				break;
 
 			/* Be always aligned */
@@ -685,7 +655,6 @@
 		 * resync data block, and the last bit is cleared.
 		 * until then resync "work" is "inactive" ...
 		 */
-		mdev->resync_work.cb = w_resync_inactive;
 		put_ldev(mdev);
 		return 1;
 	}
@@ -706,27 +675,18 @@
 	if (unlikely(cancel))
 		return 1;
 
-	if (unlikely(mdev->state.conn < C_CONNECTED)) {
-		dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
-		return 0;
-	}
-
-	number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
-	if (atomic_read(&mdev->rs_pending_cnt) > number)
-		goto requeue;
-
-	number -= atomic_read(&mdev->rs_pending_cnt);
+	number = drbd_rs_number_requests(mdev);
 
 	sector = mdev->ov_position;
 	for (i = 0; i < number; i++) {
 		if (sector >= capacity) {
-			mdev->resync_work.cb = w_resync_inactive;
 			return 1;
 		}
 
 		size = BM_BLOCK_SIZE;
 
-		if (drbd_try_rs_begin_io(mdev, sector)) {
+		if (drbd_rs_should_slow_down(mdev, sector) ||
+		    drbd_try_rs_begin_io(mdev, sector)) {
 			mdev->ov_position = sector;
 			goto requeue;
 		}
@@ -744,11 +704,33 @@
 	mdev->ov_position = sector;
 
  requeue:
+	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
 	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
 	return 1;
 }
 
 
+void start_resync_timer_fn(unsigned long data)
+{
+	struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+	drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
+}
+
+int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
+		dev_warn(DEV, "w_start_resync later...\n");
+		mdev->start_resync_timer.expires = jiffies + HZ/10;
+		add_timer(&mdev->start_resync_timer);
+		return 1;
+	}
+
+	drbd_start_resync(mdev, C_SYNC_SOURCE);
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
+	return 1;
+}
+
 int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 {
 	kfree(w);
@@ -782,6 +764,7 @@
 	union drbd_state os, ns;
 	struct drbd_work *w;
 	char *khelper_cmd = NULL;
+	int verify_done = 0;
 
 	/* Remove all elements from the resync LRU. Since future actions
 	 * might set bits in the (main) bitmap, then the entries in the
@@ -792,8 +775,7 @@
 		 * queue (or even the read operations for those packets
 		 * is not finished by now).   Retry in 100ms. */
 
-		__set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ / 10);
+		schedule_timeout_interruptible(HZ / 10);
 		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
 		if (w) {
 			w->cb = w_resync_finished;
@@ -818,6 +800,8 @@
 	spin_lock_irq(&mdev->req_lock);
 	os = mdev->state;
 
+	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
 	/* This protects us against multiple calls (that can happen in the presence
 	   of application IO), and against connectivity loss just before we arrive here. */
 	if (os.conn <= C_CONNECTED)
@@ -827,8 +811,7 @@
 	ns.conn = C_CONNECTED;
 
 	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
-	     (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
-	     "Online verify " : "Resync",
+	     verify_done ? "Online verify " : "Resync",
 	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);
 
 	n_oos = drbd_bm_total_weight(mdev);
@@ -886,14 +869,18 @@
 			}
 		}
 
-		drbd_uuid_set_bm(mdev, 0UL);
-
-		if (mdev->p_uuid) {
-			/* Now the two UUID sets are equal, update what we
-			 * know of the peer. */
-			int i;
-			for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
-				mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
+		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+			/* for verify runs, we don't update uuids here,
+			 * so there would be nothing to report. */
+			drbd_uuid_set_bm(mdev, 0UL);
+			drbd_print_uuids(mdev, "updated UUIDs");
+			if (mdev->p_uuid) {
+				/* Now the two UUID sets are equal, update what we
+				 * know of the peer. */
+				int i;
+				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
+			}
 		}
 	}
 
@@ -905,15 +892,11 @@
 	mdev->rs_total  = 0;
 	mdev->rs_failed = 0;
 	mdev->rs_paused = 0;
-	mdev->ov_start_sector = 0;
+	if (verify_done)
+		mdev->ov_start_sector = 0;
 
 	drbd_md_sync(mdev);
 
-	if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
-		dev_info(DEV, "Writing the whole bitmap\n");
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
-	}
-
 	if (khelper_cmd)
 		drbd_khelper(mdev, khelper_cmd);
 
@@ -994,7 +977,9 @@
 		put_ldev(mdev);
 	}
 
-	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+	if (mdev->state.conn == C_AHEAD) {
+		ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
+	} else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
 		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
 			inc_rs_pending(mdev);
 			ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
@@ -1096,25 +1081,27 @@
 	if (unlikely(cancel))
 		goto out;
 
-	if (unlikely((e->flags & EE_WAS_ERROR) != 0))
-		goto out;
-
 	digest_size = crypto_hash_digestsize(mdev->verify_tfm);
-	/* FIXME if this allocation fails, online verify will not terminate! */
 	digest = kmalloc(digest_size, GFP_NOIO);
-	if (digest) {
-		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
-		inc_rs_pending(mdev);
-		ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
-					     digest, digest_size, P_OV_REPLY);
-		if (!ok)
-			dec_rs_pending(mdev);
-		kfree(digest);
+	if (!digest) {
+		ok = 0;	/* terminate the connection in case the allocation failed */
+		goto out;
 	}
 
+	if (likely(!(e->flags & EE_WAS_ERROR)))
+		drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+	else
+		memset(digest, 0, digest_size);
+
+	inc_rs_pending(mdev);
+	ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
+				     digest, digest_size, P_OV_REPLY);
+	if (!ok)
+		dec_rs_pending(mdev);
+	kfree(digest);
+
 out:
 	drbd_free_ee(mdev, e);
-
 	dec_unacked(mdev);
 
 	return ok;
@@ -1129,7 +1116,6 @@
 		mdev->ov_last_oos_size = size>>9;
 	}
 	drbd_set_out_of_sync(mdev, sector, size);
-	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
 }
 
 int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -1165,10 +1151,6 @@
 			eq = !memcmp(digest, di->digest, digest_size);
 			kfree(digest);
 		}
-	} else {
-		ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
-		if (__ratelimit(&drbd_ratelimit_state))
-			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
 	}
 
 	dec_unacked(mdev);
@@ -1182,7 +1164,13 @@
 
 	drbd_free_ee(mdev, e);
 
-	if (--mdev->ov_left == 0) {
+	--mdev->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((mdev->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(mdev, mdev->ov_left);
+
+	if (mdev->ov_left == 0) {
 		ov_oos_print(mdev);
 		drbd_resync_finished(mdev);
 	}
@@ -1235,6 +1223,22 @@
 	return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
 }
 
+int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	int ok;
+
+	if (unlikely(cancel)) {
+		req_mod(req, send_canceled);
+		return 1;
+	}
+
+	ok = drbd_send_oos(mdev, req);
+	req_mod(req, oos_handed_to_network);
+
+	return ok;
+}
+
 /**
  * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
  * @mdev:	DRBD device.
@@ -1430,6 +1434,17 @@
 	return retcode;
 }
 
+void drbd_rs_controller_reset(struct drbd_conf *mdev)
+{
+	atomic_set(&mdev->rs_sect_in, 0);
+	atomic_set(&mdev->rs_sect_ev, 0);
+	mdev->rs_in_flight = 0;
+	mdev->rs_planed = 0;
+	spin_lock(&mdev->peer_seq_lock);
+	fifo_set(&mdev->rs_plan_s, 0);
+	spin_unlock(&mdev->peer_seq_lock);
+}
+
 /**
  * drbd_start_resync() - Start the resync process
  * @mdev:	DRBD device.
@@ -1443,13 +1458,18 @@
 	union drbd_state ns;
 	int r;
 
-	if (mdev->state.conn >= C_SYNC_SOURCE) {
+	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
 		dev_err(DEV, "Resync already running!\n");
 		return;
 	}
 
-	/* In case a previous resync run was aborted by an IO error/detach on the peer. */
-	drbd_rs_cancel_all(mdev);
+	if (mdev->state.conn < C_AHEAD) {
+		/* In case a previous resync run was aborted by an IO error/detach on the peer. */
+		drbd_rs_cancel_all(mdev);
+		/* This should be done when we abort the resync. We definitely do not
+		   want to have this for connections going back and forth between
+		   Ahead/Behind and SyncSource/SyncTarget */
+	}
 
 	if (side == C_SYNC_TARGET) {
 		/* Since application IO was locked out during C_WF_BITMAP_T and
@@ -1463,6 +1483,20 @@
 			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
 			return;
 		}
+	} else /* C_SYNC_SOURCE */ {
+		r = drbd_khelper(mdev, "before-resync-source");
+		r = (r >> 8) & 0xff;
+		if (r > 0) {
+			if (r == 3) {
+				dev_info(DEV, "before-resync-source handler returned %d, "
+					 "ignoring. Old userland tools?", r);
+			} else {
+				dev_info(DEV, "before-resync-source handler returned %d, "
+					 "dropping connection.\n", r);
+				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+				return;
+			}
+		}
 	}
 
 	drbd_state_lock(mdev);
@@ -1472,18 +1506,6 @@
 		return;
 	}
 
-	if (side == C_SYNC_TARGET) {
-		mdev->bm_resync_fo = 0;
-	} else /* side == C_SYNC_SOURCE */ {
-		u64 uuid;
-
-		get_random_bytes(&uuid, sizeof(u64));
-		drbd_uuid_set(mdev, UI_BITMAP, uuid);
-		drbd_send_sync_uuid(mdev, uuid);
-
-		D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
-	}
-
 	write_lock_irq(&global_state_lock);
 	ns = mdev->state;
 
@@ -1521,13 +1543,24 @@
 		_drbd_pause_after(mdev);
 	}
 	write_unlock_irq(&global_state_lock);
-	put_ldev(mdev);
 
 	if (r == SS_SUCCESS) {
 		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
 		     drbd_conn_str(ns.conn),
 		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
 		     (unsigned long) mdev->rs_total);
+		if (side == C_SYNC_TARGET)
+			mdev->bm_resync_fo = 0;
+
+		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+		 * with w_send_oos, or the sync target will get confused as to
+		 * how much bits to resync.  We cannot do that always, because for an
+		 * empty resync and protocol < 95, we need to do it here, as we call
+		 * drbd_resync_finished from here in that case.
+		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+		 * and from after_state_ch otherwise. */
+		if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
+			drbd_gen_and_send_sync_uuid(mdev);
 
 		if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
 			/* This still has a race (about when exactly the peers
@@ -1547,13 +1580,7 @@
 			drbd_resync_finished(mdev);
 		}
 
-		atomic_set(&mdev->rs_sect_in, 0);
-		atomic_set(&mdev->rs_sect_ev, 0);
-		mdev->rs_in_flight = 0;
-		mdev->rs_planed = 0;
-		spin_lock(&mdev->peer_seq_lock);
-		fifo_set(&mdev->rs_plan_s, 0);
-		spin_unlock(&mdev->peer_seq_lock);
+		drbd_rs_controller_reset(mdev);
 		/* ns.conn may already be != mdev->state.conn,
 		 * we may have been paused in between, or become paused until
 		 * the timer triggers.
@@ -1563,6 +1590,7 @@
 
 		drbd_md_sync(mdev);
 	}
+	put_ldev(mdev);
 	drbd_state_unlock(mdev);
 }
 
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index 53586fa..151f1a3 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -39,7 +39,7 @@
 		return;
 	}
 
-	if (FAULT_ACTIVE(mdev, fault_type))
+	if (drbd_insert_fault(mdev, fault_type))
 		bio_endio(bio, -EIO);
 	else
 		generic_make_request(bio);
diff --git a/drivers/dma/ipu/ipu_irq.c b/drivers/dma/ipu/ipu_irq.c
index dd8ebc7..ab8a4ef 100644
--- a/drivers/dma/ipu/ipu_irq.c
+++ b/drivers/dma/ipu/ipu_irq.c
@@ -94,9 +94,9 @@
 	return NULL;
 }
 
-static void ipu_irq_unmask(unsigned int irq)
+static void ipu_irq_unmask(struct irq_data *d)
 {
-	struct ipu_irq_map *map = get_irq_chip_data(irq);
+	struct ipu_irq_map *map = irq_data_get_irq_chip_data(d);
 	struct ipu_irq_bank *bank;
 	uint32_t reg;
 	unsigned long lock_flags;
@@ -106,7 +106,7 @@
 	bank = map->bank;
 	if (!bank) {
 		spin_unlock_irqrestore(&bank_lock, lock_flags);
-		pr_err("IPU: %s(%u) - unmapped!\n", __func__, irq);
+		pr_err("IPU: %s(%u) - unmapped!\n", __func__, d->irq);
 		return;
 	}
 
@@ -117,9 +117,9 @@
 	spin_unlock_irqrestore(&bank_lock, lock_flags);
 }
 
-static void ipu_irq_mask(unsigned int irq)
+static void ipu_irq_mask(struct irq_data *d)
 {
-	struct ipu_irq_map *map = get_irq_chip_data(irq);
+	struct ipu_irq_map *map = irq_data_get_irq_chip_data(d);
 	struct ipu_irq_bank *bank;
 	uint32_t reg;
 	unsigned long lock_flags;
@@ -129,7 +129,7 @@
 	bank = map->bank;
 	if (!bank) {
 		spin_unlock_irqrestore(&bank_lock, lock_flags);
-		pr_err("IPU: %s(%u) - unmapped!\n", __func__, irq);
+		pr_err("IPU: %s(%u) - unmapped!\n", __func__, d->irq);
 		return;
 	}
 
@@ -140,9 +140,9 @@
 	spin_unlock_irqrestore(&bank_lock, lock_flags);
 }
 
-static void ipu_irq_ack(unsigned int irq)
+static void ipu_irq_ack(struct irq_data *d)
 {
-	struct ipu_irq_map *map = get_irq_chip_data(irq);
+	struct ipu_irq_map *map = irq_data_get_irq_chip_data(d);
 	struct ipu_irq_bank *bank;
 	unsigned long lock_flags;
 
@@ -151,7 +151,7 @@
 	bank = map->bank;
 	if (!bank) {
 		spin_unlock_irqrestore(&bank_lock, lock_flags);
-		pr_err("IPU: %s(%u) - unmapped!\n", __func__, irq);
+		pr_err("IPU: %s(%u) - unmapped!\n", __func__, d->irq);
 		return;
 	}
 
@@ -167,7 +167,7 @@
  */
 bool ipu_irq_status(unsigned int irq)
 {
-	struct ipu_irq_map *map = get_irq_chip_data(irq);
+	struct ipu_irq_map *map = irq_get_chip_data(irq);
 	struct ipu_irq_bank *bank;
 	unsigned long lock_flags;
 	bool ret;
@@ -269,7 +269,7 @@
 /* Chained IRQ handler for IPU error interrupt */
 static void ipu_irq_err(unsigned int irq, struct irq_desc *desc)
 {
-	struct ipu *ipu = get_irq_data(irq);
+	struct ipu *ipu = irq_get_handler_data(irq);
 	u32 status;
 	int i, line;
 
@@ -310,7 +310,7 @@
 /* Chained IRQ handler for IPU function interrupt */
 static void ipu_irq_fn(unsigned int irq, struct irq_desc *desc)
 {
-	struct ipu *ipu = get_irq_data(irq);
+	struct ipu *ipu = irq_desc_get_handler_data(desc);
 	u32 status;
 	int i, line;
 
@@ -345,10 +345,10 @@
 }
 
 static struct irq_chip ipu_irq_chip = {
-	.name	= "ipu_irq",
-	.ack	= ipu_irq_ack,
-	.mask	= ipu_irq_mask,
-	.unmask	= ipu_irq_unmask,
+	.name		= "ipu_irq",
+	.irq_ack	= ipu_irq_ack,
+	.irq_mask	= ipu_irq_mask,
+	.irq_unmask	= ipu_irq_unmask,
 };
 
 /* Install the IRQ handler */
@@ -366,26 +366,26 @@
 		int ret;
 
 		irq = irq_base + i;
-		ret = set_irq_chip(irq, &ipu_irq_chip);
+		ret = irq_set_chip(irq, &ipu_irq_chip);
 		if (ret < 0)
 			return ret;
-		ret = set_irq_chip_data(irq, irq_map + i);
+		ret = irq_set_chip_data(irq, irq_map + i);
 		if (ret < 0)
 			return ret;
 		irq_map[i].ipu = ipu;
 		irq_map[i].irq = irq;
 		irq_map[i].source = -EINVAL;
-		set_irq_handler(irq, handle_level_irq);
+		irq_set_handler(irq, handle_level_irq);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 #endif
 	}
 
-	set_irq_data(ipu->irq_fn, ipu);
-	set_irq_chained_handler(ipu->irq_fn, ipu_irq_fn);
+	irq_set_handler_data(ipu->irq_fn, ipu);
+	irq_set_chained_handler(ipu->irq_fn, ipu_irq_fn);
 
-	set_irq_data(ipu->irq_err, ipu);
-	set_irq_chained_handler(ipu->irq_err, ipu_irq_err);
+	irq_set_handler_data(ipu->irq_err, ipu);
+	irq_set_chained_handler(ipu->irq_err, ipu_irq_err);
 
 	return 0;
 }
@@ -397,17 +397,17 @@
 
 	irq_base = pdata->irq_base;
 
-	set_irq_chained_handler(ipu->irq_fn, NULL);
-	set_irq_data(ipu->irq_fn, NULL);
+	irq_set_chained_handler(ipu->irq_fn, NULL);
+	irq_set_handler_data(ipu->irq_fn, NULL);
 
-	set_irq_chained_handler(ipu->irq_err, NULL);
-	set_irq_data(ipu->irq_err, NULL);
+	irq_set_chained_handler(ipu->irq_err, NULL);
+	irq_set_handler_data(ipu->irq_err, NULL);
 
 	for (irq = irq_base; irq < irq_base + CONFIG_MX3_IPU_IRQS; irq++) {
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, 0);
 #endif
-		set_irq_chip(irq, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip(irq, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
 }
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 81131ed..060ef63 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -315,11 +315,22 @@
 	  will be called f71805f.
 
 config SENSORS_F71882FG
-	tristate "Fintek F71858FG, F71862FG, F71882FG, F71889FG and F8000"
+	tristate "Fintek F71882FG and compatibles"
 	help
 	  If you say yes here you get support for hardware monitoring
-	  features of the Fintek F71858FG, F71862FG/71863FG, F71882FG/F71883FG,
-	  F71889FG and F8000 Super-I/O chips.
+	  features of many Fintek Super-I/O (LPC) chips. The currently
+	  supported chips are:
+	    F71808E
+	    F71858FG
+	    F71862FG
+	    F71863FG
+	    F71869F/E
+	    F71882FG
+	    F71883FG
+	    F71889FG/ED/A
+	    F8000
+	    F81801U
+	    F81865F
 
 	  This driver can also be built as a module.  If so, the module
 	  will be called f71882fg.
diff --git a/drivers/hwmon/f71882fg.c b/drivers/hwmon/f71882fg.c
index a4d430e..ca07a32 100644
--- a/drivers/hwmon/f71882fg.c
+++ b/drivers/hwmon/f71882fg.c
@@ -54,7 +54,9 @@
 #define SIO_F71882_ID		0x0541	/* Chipset ID */
 #define SIO_F71889_ID		0x0723	/* Chipset ID */
 #define SIO_F71889E_ID		0x0909	/* Chipset ID */
+#define SIO_F71889A_ID		0x1005	/* Chipset ID */
 #define SIO_F8000_ID		0x0581	/* Chipset ID */
+#define SIO_F81865_ID		0x0704	/* Chipset ID */
 
 #define REGION_LENGTH		8
 #define ADDR_REG_OFFSET		5
@@ -106,7 +108,7 @@
 MODULE_PARM_DESC(force_id, "Override the detected device ID");
 
 enum chips { f71808e, f71858fg, f71862fg, f71869, f71882fg, f71889fg,
-	     f71889ed, f8000 };
+	     f71889ed, f71889a, f8000, f81865f };
 
 static const char *f71882fg_names[] = {
 	"f71808e",
@@ -114,42 +116,76 @@
 	"f71862fg",
 	"f71869", /* Both f71869f and f71869e, reg. compatible and same id */
 	"f71882fg",
-	"f71889fg",
+	"f71889fg", /* f81801u too, same id */
 	"f71889ed",
+	"f71889a",
 	"f8000",
+	"f81865f",
 };
 
-static const char f71882fg_has_in[8][F71882FG_MAX_INS] = {
-	{ 1, 1, 1, 1, 1, 1, 0, 1, 1 }, /* f71808e */
-	{ 1, 1, 1, 0, 0, 0, 0, 0, 0 }, /* f71858fg */
-	{ 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* f71862fg */
-	{ 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* f71869 */
-	{ 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* f71882fg */
-	{ 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* f71889fg */
-	{ 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* f71889ed */
-	{ 1, 1, 1, 0, 0, 0, 0, 0, 0 }, /* f8000 */
+static const char f71882fg_has_in[][F71882FG_MAX_INS] = {
+	[f71808e]	= { 1, 1, 1, 1, 1, 1, 0, 1, 1 },
+	[f71858fg]	= { 1, 1, 1, 0, 0, 0, 0, 0, 0 },
+	[f71862fg]	= { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+	[f71869]	= { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+	[f71882fg]	= { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+	[f71889fg]	= { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+	[f71889ed]	= { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+	[f71889a]	= { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+	[f8000]		= { 1, 1, 1, 0, 0, 0, 0, 0, 0 },
+	[f81865f]	= { 1, 1, 1, 1, 1, 1, 1, 0, 0 },
 };
 
-static const char f71882fg_has_in1_alarm[8] = {
-	0, /* f71808e */
-	0, /* f71858fg */
-	0, /* f71862fg */
-	0, /* f71869 */
-	1, /* f71882fg */
-	1, /* f71889fg */
-	1, /* f71889ed */
-	0, /* f8000 */
+static const char f71882fg_has_in1_alarm[] = {
+	[f71808e]	= 0,
+	[f71858fg]	= 0,
+	[f71862fg]	= 0,
+	[f71869]	= 0,
+	[f71882fg]	= 1,
+	[f71889fg]	= 1,
+	[f71889ed]	= 1,
+	[f71889a]	= 1,
+	[f8000]		= 0,
+	[f81865f]	= 1,
 };
 
-static const char f71882fg_has_beep[8] = {
-	0, /* f71808e */
-	0, /* f71858fg */
-	1, /* f71862fg */
-	1, /* f71869 */
-	1, /* f71882fg */
-	1, /* f71889fg */
-	1, /* f71889ed */
-	0, /* f8000 */
+static const char f71882fg_has_beep[] = {
+	[f71808e]	= 0,
+	[f71858fg]	= 0,
+	[f71862fg]	= 1,
+	[f71869]	= 1,
+	[f71882fg]	= 1,
+	[f71889fg]	= 1,
+	[f71889ed]	= 1,
+	[f71889a]	= 1,
+	[f8000]		= 0,
+	[f81865f]	= 1,
+};
+
+static const char f71882fg_nr_fans[] = {
+	[f71808e]	= 3,
+	[f71858fg]	= 3,
+	[f71862fg]	= 3,
+	[f71869]	= 3,
+	[f71882fg]	= 4,
+	[f71889fg]	= 3,
+	[f71889ed]	= 3,
+	[f71889a]	= 3,
+	[f8000]		= 3,
+	[f81865f]	= 2,
+};
+
+static const char f71882fg_nr_temps[] = {
+	[f71808e]	= 2,
+	[f71858fg]	= 3,
+	[f71862fg]	= 3,
+	[f71869]	= 3,
+	[f71882fg]	= 3,
+	[f71889fg]	= 3,
+	[f71889ed]	= 3,
+	[f71889a]	= 3,
+	[f8000]		= 3,
+	[f81865f]	= 2,
 };
 
 static struct platform_device *f71882fg_pdev;
@@ -1071,9 +1107,9 @@
 static struct f71882fg_data *f71882fg_update_device(struct device *dev)
 {
 	struct f71882fg_data *data = dev_get_drvdata(dev);
+	int nr_fans = f71882fg_nr_fans[data->type];
+	int nr_temps = f71882fg_nr_temps[data->type];
 	int nr, reg, point;
-	int nr_fans = (data->type == f71882fg) ? 4 : 3;
-	int nr_temps = (data->type == f71808e) ? 2 : 3;
 
 	mutex_lock(&data->update_lock);
 
@@ -2042,8 +2078,9 @@
 {
 	struct f71882fg_data *data;
 	struct f71882fg_sio_data *sio_data = pdev->dev.platform_data;
-	int err, i, nr_fans = (sio_data->type == f71882fg) ? 4 : 3;
-	int nr_temps = (sio_data->type == f71808e) ? 2 : 3;
+	int nr_fans = f71882fg_nr_fans[sio_data->type];
+	int nr_temps = f71882fg_nr_temps[sio_data->type];
+	int err, i;
 	u8 start_reg, reg;
 
 	data = kzalloc(sizeof(struct f71882fg_data), GFP_KERNEL);
@@ -2138,6 +2175,7 @@
 			/* Fall through to select correct fan/pwm reg bank! */
 		case f71889fg:
 		case f71889ed:
+		case f71889a:
 			reg = f71882fg_read8(data, F71882FG_REG_FAN_FAULT_T);
 			if (reg & F71882FG_FAN_NEG_TEMP_EN)
 				data->auto_point_temp_signed = 1;
@@ -2163,16 +2201,12 @@
 		case f71862fg:
 			err = (data->pwm_enable & 0x15) != 0x15;
 			break;
-		case f71808e:
-		case f71869:
-		case f71882fg:
-		case f71889fg:
-		case f71889ed:
-			err = 0;
-			break;
 		case f8000:
 			err = data->pwm_enable & 0x20;
 			break;
+		default:
+			err = 0;
+			break;
 		}
 		if (err) {
 			dev_err(&pdev->dev,
@@ -2199,6 +2233,7 @@
 		case f71869:
 		case f71889fg:
 		case f71889ed:
+		case f71889a:
 			for (i = 0; i < nr_fans; i++) {
 				data->pwm_auto_point_mapping[i] =
 					f71882fg_read8(data,
@@ -2276,8 +2311,9 @@
 static int f71882fg_remove(struct platform_device *pdev)
 {
 	struct f71882fg_data *data = platform_get_drvdata(pdev);
-	int i, nr_fans = (data->type == f71882fg) ? 4 : 3;
-	int nr_temps = (data->type == f71808e) ? 2 : 3;
+	int nr_fans = f71882fg_nr_fans[data->type];
+	int nr_temps = f71882fg_nr_temps[data->type];
+	int i;
 	u8 start_reg = f71882fg_read8(data, F71882FG_REG_START);
 
 	if (data->hwmon_dev)
@@ -2406,9 +2442,15 @@
 	case SIO_F71889E_ID:
 		sio_data->type = f71889ed;
 		break;
+	case SIO_F71889A_ID:
+		sio_data->type = f71889a;
+		break;
 	case SIO_F8000_ID:
 		sio_data->type = f8000;
 		break;
+	case SIO_F81865_ID:
+		sio_data->type = f81865f;
+		break;
 	default:
 		pr_info("Unsupported Fintek device: %04x\n",
 			(unsigned int)devid);
diff --git a/drivers/hwmon/pmbus_core.c b/drivers/hwmon/pmbus_core.c
index 6474512..edfb92e 100644
--- a/drivers/hwmon/pmbus_core.c
+++ b/drivers/hwmon/pmbus_core.c
@@ -752,7 +752,7 @@
 static void pmbus_add_sensor(struct pmbus_data *data,
 			     const char *name, const char *type, int seq,
 			     int page, int reg, enum pmbus_sensor_classes class,
-			     bool update)
+			     bool update, bool readonly)
 {
 	struct pmbus_sensor *sensor;
 
@@ -765,7 +765,7 @@
 	sensor->reg = reg;
 	sensor->class = class;
 	sensor->update = update;
-	if (update)
+	if (readonly)
 		PMBUS_ADD_GET_ATTR(data, sensor->name, sensor,
 				   data->num_sensors);
 	else
@@ -916,14 +916,14 @@
 
 		i0 = data->num_sensors;
 		pmbus_add_label(data, "in", in_index, "vin", 0);
-		pmbus_add_sensor(data, "in", "input", in_index,
-				 0, PMBUS_READ_VIN, PSC_VOLTAGE_IN, true);
+		pmbus_add_sensor(data, "in", "input", in_index, 0,
+				 PMBUS_READ_VIN, PSC_VOLTAGE_IN, true, true);
 		if (pmbus_check_word_register(client, 0,
 					      PMBUS_VIN_UV_WARN_LIMIT)) {
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "min", in_index,
 					 0, PMBUS_VIN_UV_WARN_LIMIT,
-					 PSC_VOLTAGE_IN, false);
+					 PSC_VOLTAGE_IN, false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT) {
 				pmbus_add_boolean_reg(data, "in", "min_alarm",
 						      in_index,
@@ -937,7 +937,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "lcrit", in_index,
 					 0, PMBUS_VIN_UV_FAULT_LIMIT,
-					 PSC_VOLTAGE_IN, false);
+					 PSC_VOLTAGE_IN, false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT) {
 				pmbus_add_boolean_reg(data, "in", "lcrit_alarm",
 						      in_index,
@@ -951,7 +951,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "max", in_index,
 					 0, PMBUS_VIN_OV_WARN_LIMIT,
-					 PSC_VOLTAGE_IN, false);
+					 PSC_VOLTAGE_IN, false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT) {
 				pmbus_add_boolean_reg(data, "in", "max_alarm",
 						      in_index,
@@ -965,7 +965,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "crit", in_index,
 					 0, PMBUS_VIN_OV_FAULT_LIMIT,
-					 PSC_VOLTAGE_IN, false);
+					 PSC_VOLTAGE_IN, false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT) {
 				pmbus_add_boolean_reg(data, "in", "crit_alarm",
 						      in_index,
@@ -988,7 +988,7 @@
 	if (info->func[0] & PMBUS_HAVE_VCAP) {
 		pmbus_add_label(data, "in", in_index, "vcap", 0);
 		pmbus_add_sensor(data, "in", "input", in_index, 0,
-				 PMBUS_READ_VCAP, PSC_VOLTAGE_IN, true);
+				 PMBUS_READ_VCAP, PSC_VOLTAGE_IN, true, true);
 		in_index++;
 	}
 
@@ -1004,13 +1004,13 @@
 		i0 = data->num_sensors;
 		pmbus_add_label(data, "in", in_index, "vout", page + 1);
 		pmbus_add_sensor(data, "in", "input", in_index, page,
-				 PMBUS_READ_VOUT, PSC_VOLTAGE_OUT, true);
+				 PMBUS_READ_VOUT, PSC_VOLTAGE_OUT, true, true);
 		if (pmbus_check_word_register(client, page,
 					      PMBUS_VOUT_UV_WARN_LIMIT)) {
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "min", in_index, page,
 					 PMBUS_VOUT_UV_WARN_LIMIT,
-					 PSC_VOLTAGE_OUT, false);
+					 PSC_VOLTAGE_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_VOUT) {
 				pmbus_add_boolean_reg(data, "in", "min_alarm",
 						      in_index,
@@ -1025,7 +1025,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "lcrit", in_index, page,
 					 PMBUS_VOUT_UV_FAULT_LIMIT,
-					 PSC_VOLTAGE_OUT, false);
+					 PSC_VOLTAGE_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_VOUT) {
 				pmbus_add_boolean_reg(data, "in", "lcrit_alarm",
 						      in_index,
@@ -1040,7 +1040,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "max", in_index, page,
 					 PMBUS_VOUT_OV_WARN_LIMIT,
-					 PSC_VOLTAGE_OUT, false);
+					 PSC_VOLTAGE_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_VOUT) {
 				pmbus_add_boolean_reg(data, "in", "max_alarm",
 						      in_index,
@@ -1055,7 +1055,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "in", "crit", in_index, page,
 					 PMBUS_VOUT_OV_FAULT_LIMIT,
-					 PSC_VOLTAGE_OUT, false);
+					 PSC_VOLTAGE_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_VOUT) {
 				pmbus_add_boolean_reg(data, "in", "crit_alarm",
 						      in_index,
@@ -1088,14 +1088,14 @@
 	if (info->func[0] & PMBUS_HAVE_IIN) {
 		i0 = data->num_sensors;
 		pmbus_add_label(data, "curr", in_index, "iin", 0);
-		pmbus_add_sensor(data, "curr", "input", in_index,
-				 0, PMBUS_READ_IIN, PSC_CURRENT_IN, true);
+		pmbus_add_sensor(data, "curr", "input", in_index, 0,
+				 PMBUS_READ_IIN, PSC_CURRENT_IN, true, true);
 		if (pmbus_check_word_register(client, 0,
 					      PMBUS_IIN_OC_WARN_LIMIT)) {
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "curr", "max", in_index,
 					 0, PMBUS_IIN_OC_WARN_LIMIT,
-					 PSC_CURRENT_IN, false);
+					 PSC_CURRENT_IN, false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT) {
 				pmbus_add_boolean_reg(data, "curr", "max_alarm",
 						      in_index,
@@ -1108,7 +1108,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "curr", "crit", in_index,
 					 0, PMBUS_IIN_OC_FAULT_LIMIT,
-					 PSC_CURRENT_IN, false);
+					 PSC_CURRENT_IN, false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT)
 				pmbus_add_boolean_reg(data, "curr",
 						      "crit_alarm",
@@ -1131,13 +1131,13 @@
 		i0 = data->num_sensors;
 		pmbus_add_label(data, "curr", in_index, "iout", page + 1);
 		pmbus_add_sensor(data, "curr", "input", in_index, page,
-				 PMBUS_READ_IOUT, PSC_CURRENT_OUT, true);
+				 PMBUS_READ_IOUT, PSC_CURRENT_OUT, true, true);
 		if (pmbus_check_word_register(client, page,
 					      PMBUS_IOUT_OC_WARN_LIMIT)) {
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "curr", "max", in_index, page,
 					 PMBUS_IOUT_OC_WARN_LIMIT,
-					 PSC_CURRENT_OUT, false);
+					 PSC_CURRENT_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_IOUT) {
 				pmbus_add_boolean_reg(data, "curr", "max_alarm",
 						      in_index,
@@ -1151,7 +1151,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "curr", "lcrit", in_index, page,
 					 PMBUS_IOUT_UC_FAULT_LIMIT,
-					 PSC_CURRENT_OUT, false);
+					 PSC_CURRENT_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_IOUT) {
 				pmbus_add_boolean_reg(data, "curr",
 						      "lcrit_alarm",
@@ -1166,7 +1166,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "curr", "crit", in_index, page,
 					 PMBUS_IOUT_OC_FAULT_LIMIT,
-					 PSC_CURRENT_OUT, false);
+					 PSC_CURRENT_OUT, false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_IOUT) {
 				pmbus_add_boolean_reg(data, "curr",
 						      "crit_alarm",
@@ -1199,13 +1199,13 @@
 		i0 = data->num_sensors;
 		pmbus_add_label(data, "power", in_index, "pin", 0);
 		pmbus_add_sensor(data, "power", "input", in_index,
-				 0, PMBUS_READ_PIN, PSC_POWER, true);
+				 0, PMBUS_READ_PIN, PSC_POWER, true, true);
 		if (pmbus_check_word_register(client, 0,
 					      PMBUS_PIN_OP_WARN_LIMIT)) {
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "power", "max", in_index,
 					 0, PMBUS_PIN_OP_WARN_LIMIT, PSC_POWER,
-					 false);
+					 false, false);
 			if (info->func[0] & PMBUS_HAVE_STATUS_INPUT)
 				pmbus_add_boolean_reg(data, "power",
 						      "alarm",
@@ -1228,7 +1228,7 @@
 		i0 = data->num_sensors;
 		pmbus_add_label(data, "power", in_index, "pout", page + 1);
 		pmbus_add_sensor(data, "power", "input", in_index, page,
-				 PMBUS_READ_POUT, PSC_POWER, true);
+				 PMBUS_READ_POUT, PSC_POWER, true, true);
 		/*
 		 * Per hwmon sysfs API, power_cap is to be used to limit output
 		 * power.
@@ -1241,7 +1241,8 @@
 		if (pmbus_check_word_register(client, page, PMBUS_POUT_MAX)) {
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "power", "cap", in_index, page,
-					 PMBUS_POUT_MAX, PSC_POWER, false);
+					 PMBUS_POUT_MAX, PSC_POWER,
+					 false, false);
 			need_alarm = true;
 		}
 		if (pmbus_check_word_register(client, page,
@@ -1249,7 +1250,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "power", "max", in_index, page,
 					 PMBUS_POUT_OP_WARN_LIMIT, PSC_POWER,
-					 false);
+					 false, false);
 			need_alarm = true;
 		}
 		if (need_alarm && (info->func[page] & PMBUS_HAVE_STATUS_IOUT))
@@ -1264,7 +1265,7 @@
 			i1 = data->num_sensors;
 			pmbus_add_sensor(data, "power", "crit", in_index, page,
 					 PMBUS_POUT_OP_FAULT_LIMIT, PSC_POWER,
-					 false);
+					 false, false);
 			if (info->func[page] & PMBUS_HAVE_STATUS_IOUT)
 				pmbus_add_boolean_reg(data, "power",
 						      "crit_alarm",
@@ -1302,7 +1303,7 @@
 			i0 = data->num_sensors;
 			pmbus_add_sensor(data, "temp", "input", in_index, page,
 					 pmbus_temp_registers[t],
-					 PSC_TEMPERATURE, true);
+					 PSC_TEMPERATURE, true, true);
 
 			/*
 			 * PMBus provides only one status register for TEMP1-3.
@@ -1323,7 +1324,7 @@
 				i1 = data->num_sensors;
 				pmbus_add_sensor(data, "temp", "min", in_index,
 						 page, PMBUS_UT_WARN_LIMIT,
-						 PSC_TEMPERATURE, true);
+						 PSC_TEMPERATURE, true, false);
 				if (info->func[page] & PMBUS_HAVE_STATUS_TEMP) {
 					pmbus_add_boolean_cmp(data, "temp",
 						"min_alarm", in_index, i1, i0,
@@ -1338,7 +1339,7 @@
 				pmbus_add_sensor(data, "temp", "lcrit",
 						 in_index, page,
 						 PMBUS_UT_FAULT_LIMIT,
-						 PSC_TEMPERATURE, true);
+						 PSC_TEMPERATURE, true, false);
 				if (info->func[page] & PMBUS_HAVE_STATUS_TEMP) {
 					pmbus_add_boolean_cmp(data, "temp",
 						"lcrit_alarm", in_index, i1, i0,
@@ -1352,7 +1353,7 @@
 				i1 = data->num_sensors;
 				pmbus_add_sensor(data, "temp", "max", in_index,
 						 page, PMBUS_OT_WARN_LIMIT,
-						 PSC_TEMPERATURE, true);
+						 PSC_TEMPERATURE, true, false);
 				if (info->func[page] & PMBUS_HAVE_STATUS_TEMP) {
 					pmbus_add_boolean_cmp(data, "temp",
 						"max_alarm", in_index, i0, i1,
@@ -1366,7 +1367,7 @@
 				i1 = data->num_sensors;
 				pmbus_add_sensor(data, "temp", "crit", in_index,
 						 page, PMBUS_OT_FAULT_LIMIT,
-						 PSC_TEMPERATURE, true);
+						 PSC_TEMPERATURE, true, false);
 				if (info->func[page] & PMBUS_HAVE_STATUS_TEMP) {
 					pmbus_add_boolean_cmp(data, "temp",
 						"crit_alarm", in_index, i0, i1,
@@ -1421,7 +1422,8 @@
 
 			i0 = data->num_sensors;
 			pmbus_add_sensor(data, "fan", "input", in_index, page,
-					 pmbus_fan_registers[f], PSC_FAN, true);
+					 pmbus_fan_registers[f], PSC_FAN, true,
+					 true);
 
 			/*
 			 * Each fan status register covers multiple fans,
diff --git a/drivers/hwspinlock/Kconfig b/drivers/hwspinlock/Kconfig
index eb4af28..1f29bab 100644
--- a/drivers/hwspinlock/Kconfig
+++ b/drivers/hwspinlock/Kconfig
@@ -4,6 +4,7 @@
 
 config HWSPINLOCK
 	tristate "Generic Hardware Spinlock framework"
+	depends on ARCH_OMAP4
 	help
 	  Say y here to support the generic hardware spinlock framework.
 	  You only need to enable this if you have hardware spinlock module
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index f407784..0e406d73 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -440,6 +440,7 @@
 	struct ide_host *host = hwif->host;
 	struct request	*rq = NULL;
 	ide_startstop_t	startstop;
+	unsigned long queue_run_ms = 3; /* old plug delay */
 
 	spin_unlock_irq(q->queue_lock);
 
@@ -459,6 +460,9 @@
 		prev_port = hwif->host->cur_port;
 		if (drive->dev_flags & IDE_DFLAG_SLEEPING &&
 		    time_after(drive->sleep, jiffies)) {
+			unsigned long left = jiffies - drive->sleep;
+
+			queue_run_ms = jiffies_to_msecs(left + 1);
 			ide_unlock_port(hwif);
 			goto plug_device;
 		}
@@ -547,8 +551,10 @@
 plug_device_2:
 	spin_lock_irq(q->queue_lock);
 
-	if (rq)
+	if (rq) {
 		blk_requeue_request(q, rq);
+		blk_delay_queue(q, queue_run_ms);
+	}
 }
 
 void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
@@ -562,6 +568,10 @@
 		blk_requeue_request(q, rq);
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/* Use 3ms as that was the old plug delay */
+	if (rq)
+		blk_delay_queue(q, 3);
 }
 
 static int drive_is_ready(ide_drive_t *drive)
diff --git a/drivers/mfd/88pm860x-core.c b/drivers/mfd/88pm860x-core.c
index 9c511c1..011cb6c 100644
--- a/drivers/mfd/88pm860x-core.c
+++ b/drivers/mfd/88pm860x-core.c
@@ -416,7 +416,6 @@
 				: chip->companion;
 	unsigned char status_buf[INT_STATUS_NUM];
 	unsigned long flags = IRQF_TRIGGER_FALLING | IRQF_ONESHOT;
-	struct irq_desc *desc;
 	int i, data, mask, ret = -EINVAL;
 	int __irq;
 
@@ -468,19 +467,17 @@
 	if (!chip->core_irq)
 		goto out;
 
-	desc = irq_to_desc(chip->core_irq);
-
 	/* register IRQ by genirq */
 	for (i = 0; i < ARRAY_SIZE(pm860x_irqs); i++) {
 		__irq = i + chip->irq_base;
-		set_irq_chip_data(__irq, chip);
-		set_irq_chip_and_handler(__irq, &pm860x_irq_chip,
+		irq_set_chip_data(__irq, chip);
+		irq_set_chip_and_handler(__irq, &pm860x_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(__irq, 1);
+		irq_set_nested_thread(__irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(__irq, IRQF_VALID);
 #else
-		set_irq_noprobe(__irq);
+		irq_set_noprobe(__irq);
 #endif
 	}
 
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index a9a1af4..9a46d64 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -133,6 +133,7 @@
 	tristate "TPS61050/61052 Boost Converters"
 	depends on I2C
 	select REGULATOR
+	select MFD_CORE
 	select REGULATOR_FIXED_VOLTAGE
 	help
 	  This option enables a driver for the TP61050/TPS61052
@@ -591,7 +592,7 @@
 config MFD_CS5535
 	tristate "Support for CS5535 and CS5536 southbridge core functions"
 	select MFD_CORE
-	depends on PCI
+	depends on PCI && X86
 	---help---
 	  This is the core driver for CS5535/CS5536 MFD functions.  This is
           necessary for using the board's GPIO and MFGPT functionality.
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 47f5709..ef489f2 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -63,7 +63,7 @@
 obj-$(CONFIG_PMIC_DA903X)	+= da903x.o
 max8925-objs			:= max8925-core.o max8925-i2c.o
 obj-$(CONFIG_MFD_MAX8925)	+= max8925.o
-obj-$(CONFIG_MFD_MAX8997)	+= max8997.o
+obj-$(CONFIG_MFD_MAX8997)	+= max8997.o max8997-irq.o
 obj-$(CONFIG_MFD_MAX8998)	+= max8998.o max8998-irq.o
 
 pcf50633-objs			:= pcf50633-core.o pcf50633-irq.o
diff --git a/drivers/mfd/ab3550-core.c b/drivers/mfd/ab3550-core.c
index c12d042..ff86acf 100644
--- a/drivers/mfd/ab3550-core.c
+++ b/drivers/mfd/ab3550-core.c
@@ -668,7 +668,7 @@
 	struct ab3550_platform_data *plf_data;
 	bool val;
 
-	ab = get_irq_chip_data(irq);
+	ab = irq_get_chip_data(irq);
 	plf_data = ab->i2c_client[0]->dev.platform_data;
 	irq -= plf_data->irq.base;
 	val = ((ab->startup_events[irq / 8] & BIT(irq % 8)) != 0);
@@ -1296,14 +1296,14 @@
 		unsigned int irq;
 
 		irq = ab3550_plf_data->irq.base + i;
-		set_irq_chip_data(irq, ab);
-		set_irq_chip_and_handler(irq, &ab3550_irq_chip,
-			handle_simple_irq);
-		set_irq_nested_thread(irq, 1);
+		irq_set_chip_data(irq, ab);
+		irq_set_chip_and_handler(irq, &ab3550_irq_chip,
+					 handle_simple_irq);
+		irq_set_nested_thread(irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID);
 #else
-		set_irq_noprobe(irq);
+		irq_set_noprobe(irq);
 #endif
 	}
 
diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index 6e185b2..62e33e2 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -334,14 +334,14 @@
 	int irq;
 
 	for (irq = base; irq < base + AB8500_NR_IRQS; irq++) {
-		set_irq_chip_data(irq, ab8500);
-		set_irq_chip_and_handler(irq, &ab8500_irq_chip,
+		irq_set_chip_data(irq, ab8500);
+		irq_set_chip_and_handler(irq, &ab8500_irq_chip,
 					 handle_simple_irq);
-		set_irq_nested_thread(irq, 1);
+		irq_set_nested_thread(irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID);
 #else
-		set_irq_noprobe(irq);
+		irq_set_noprobe(irq);
 #endif
 	}
 
@@ -357,8 +357,8 @@
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, 0);
 #endif
-		set_irq_chip_and_handler(irq, NULL, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip_and_handler(irq, NULL, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
 }
 
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c
index 0241f08..d4a851c 100644
--- a/drivers/mfd/asic3.c
+++ b/drivers/mfd/asic3.c
@@ -139,13 +139,12 @@
 
 static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc)
 {
+	struct asic3 *asic = irq_desc_get_handler_data(desc);
+	struct irq_data *data = irq_desc_get_irq_data(desc);
 	int iter, i;
 	unsigned long flags;
-	struct asic3 *asic;
 
-	desc->irq_data.chip->irq_ack(&desc->irq_data);
-
-	asic = get_irq_data(irq);
+	data->chip->irq_ack(irq_data);
 
 	for (iter = 0 ; iter < MAX_ASIC_ISR_LOOPS; iter++) {
 		u32 status;
@@ -188,8 +187,7 @@
 					irqnr = asic->irq_base +
 						(ASIC3_GPIOS_PER_BANK * bank)
 						+ i;
-					desc = irq_to_desc(irqnr);
-					desc->handle_irq(irqnr, desc);
+					generic_handle_irq(irqnr);
 					if (asic->irq_bothedge[bank] & bit)
 						asic3_irq_flip_edge(asic, base,
 								    bit);
@@ -200,11 +198,8 @@
 		/* Handle remaining IRQs in the status register */
 		for (i = ASIC3_NUM_GPIOS; i < ASIC3_NR_IRQS; i++) {
 			/* They start at bit 4 and go up */
-			if (status & (1 << (i - ASIC3_NUM_GPIOS + 4))) {
-				desc = irq_to_desc(asic->irq_base + i);
-				desc->handle_irq(asic->irq_base + i,
-						 desc);
-			}
+			if (status & (1 << (i - ASIC3_NUM_GPIOS + 4)))
+				generic_handle_irq(asic->irq_base + i);
 		}
 	}
 
@@ -393,21 +388,21 @@
 
 	for (irq = irq_base; irq < irq_base + ASIC3_NR_IRQS; irq++) {
 		if (irq < asic->irq_base + ASIC3_NUM_GPIOS)
-			set_irq_chip(irq, &asic3_gpio_irq_chip);
+			irq_set_chip(irq, &asic3_gpio_irq_chip);
 		else
-			set_irq_chip(irq, &asic3_irq_chip);
+			irq_set_chip(irq, &asic3_irq_chip);
 
-		set_irq_chip_data(irq, asic);
-		set_irq_handler(irq, handle_level_irq);
+		irq_set_chip_data(irq, asic);
+		irq_set_handler(irq, handle_level_irq);
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 	}
 
 	asic3_write_register(asic, ASIC3_OFFSET(INTR, INT_MASK),
 			     ASIC3_INTMASK_GINTMASK);
 
-	set_irq_chained_handler(asic->irq_nr, asic3_irq_demux);
-	set_irq_type(asic->irq_nr, IRQ_TYPE_EDGE_RISING);
-	set_irq_data(asic->irq_nr, asic);
+	irq_set_chained_handler(asic->irq_nr, asic3_irq_demux);
+	irq_set_irq_type(asic->irq_nr, IRQ_TYPE_EDGE_RISING);
+	irq_set_handler_data(asic->irq_nr, asic);
 
 	return 0;
 }
@@ -421,11 +416,10 @@
 
 	for (irq = irq_base; irq < irq_base + ASIC3_NR_IRQS; irq++) {
 		set_irq_flags(irq, 0);
-		set_irq_handler(irq, NULL);
-		set_irq_chip(irq, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip_and_handler(irq, NULL, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
-	set_irq_chained_handler(asic->irq_nr, NULL);
+	irq_set_chained_handler(asic->irq_nr, NULL);
 }
 
 /* GPIOs */
diff --git a/drivers/mfd/cs5535-mfd.c b/drivers/mfd/cs5535-mfd.c
index 886a068..155fa04 100644
--- a/drivers/mfd/cs5535-mfd.c
+++ b/drivers/mfd/cs5535-mfd.c
@@ -27,6 +27,7 @@
 #include <linux/mfd/core.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <asm/olpc.h>
 
 #define DRV_NAME "cs5535-mfd"
 
@@ -111,6 +112,20 @@
 	},
 };
 
+#ifdef CONFIG_OLPC
+static void __devinit cs5535_clone_olpc_cells(void)
+{
+	const char *acpi_clones[] = { "olpc-xo1-pm-acpi", "olpc-xo1-sci-acpi" };
+
+	if (!machine_is_olpc())
+		return;
+
+	mfd_clone_cell("cs5535-acpi", acpi_clones, ARRAY_SIZE(acpi_clones));
+}
+#else
+static void cs5535_clone_olpc_cells(void) { }
+#endif
+
 static int __devinit cs5535_mfd_probe(struct pci_dev *pdev,
 		const struct pci_device_id *id)
 {
@@ -139,6 +154,7 @@
 		dev_err(&pdev->dev, "MFD add devices failed: %d\n", err);
 		goto err_disable;
 	}
+	cs5535_clone_olpc_cells();
 
 	dev_info(&pdev->dev, "%zu devices registered.\n",
 			ARRAY_SIZE(cs5535_mfd_cells));
diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c
index 9e2d8dd..f2f4029 100644
--- a/drivers/mfd/ezx-pcap.c
+++ b/drivers/mfd/ezx-pcap.c
@@ -162,6 +162,7 @@
 
 static struct irq_chip pcap_irq_chip = {
 	.name		= "pcap",
+	.irq_disable	= pcap_mask_irq,
 	.irq_mask	= pcap_mask_irq,
 	.irq_unmask	= pcap_unmask_irq,
 };
@@ -196,17 +197,8 @@
 		local_irq_disable();
 		service = isr & ~msr;
 		for (irq = pcap->irq_base; service; service >>= 1, irq++) {
-			if (service & 1) {
-				struct irq_desc *desc = irq_to_desc(irq);
-
-				if (WARN(!desc, "Invalid PCAP IRQ %d\n", irq))
-					break;
-
-				if (desc->status & IRQ_DISABLED)
-					note_interrupt(irq, desc, IRQ_NONE);
-				else
-					desc->handle_irq(irq, desc);
-			}
+			if (service & 1)
+				generic_handle_irq(irq);
 		}
 		local_irq_enable();
 		ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr);
@@ -215,7 +207,7 @@
 
 static void pcap_irq_handler(unsigned int irq, struct irq_desc *desc)
 {
-	struct pcap_chip *pcap = get_irq_data(irq);
+	struct pcap_chip *pcap = irq_get_handler_data(irq);
 
 	desc->irq_data.chip->irq_ack(&desc->irq_data);
 	queue_work(pcap->workqueue, &pcap->isr_work);
@@ -419,7 +411,7 @@
 
 	/* cleanup irqchip */
 	for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++)
-		set_irq_chip_and_handler(i, NULL, NULL);
+		irq_set_chip_and_handler(i, NULL, NULL);
 
 	destroy_workqueue(pcap->workqueue);
 
@@ -476,12 +468,12 @@
 
 	/* setup irq chip */
 	for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) {
-		set_irq_chip_and_handler(i, &pcap_irq_chip, handle_simple_irq);
-		set_irq_chip_data(i, pcap);
+		irq_set_chip_and_handler(i, &pcap_irq_chip, handle_simple_irq);
+		irq_set_chip_data(i, pcap);
 #ifdef CONFIG_ARM
 		set_irq_flags(i, IRQF_VALID);
 #else
-		set_irq_noprobe(i);
+		irq_set_noprobe(i);
 #endif
 	}
 
@@ -490,10 +482,10 @@
 	ezx_pcap_write(pcap, PCAP_REG_ISR, PCAP_CLEAR_INTERRUPT_REGISTER);
 	pcap->msr = PCAP_MASK_ALL_INTERRUPT;
 
-	set_irq_type(spi->irq, IRQ_TYPE_EDGE_RISING);
-	set_irq_data(spi->irq, pcap);
-	set_irq_chained_handler(spi->irq, pcap_irq_handler);
-	set_irq_wake(spi->irq, 1);
+	irq_set_irq_type(spi->irq, IRQ_TYPE_EDGE_RISING);
+	irq_set_handler_data(spi->irq, pcap);
+	irq_set_chained_handler(spi->irq, pcap_irq_handler);
+	irq_set_irq_wake(spi->irq, 1);
 
 	/* ADC */
 	adc_irq = pcap_to_irq(pcap, (pdata->config & PCAP_SECOND_PORT) ?
@@ -522,7 +514,7 @@
 	free_irq(adc_irq, pcap);
 free_irqchip:
 	for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++)
-		set_irq_chip_and_handler(i, NULL, NULL);
+		irq_set_chip_and_handler(i, NULL, NULL);
 /* destroy_workqueue: */
 	destroy_workqueue(pcap->workqueue);
 free_pcap:
diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c
index d00b6d1..bbaec0c 100644
--- a/drivers/mfd/htc-egpio.c
+++ b/drivers/mfd/htc-egpio.c
@@ -100,7 +100,7 @@
 
 static void egpio_handler(unsigned int irq, struct irq_desc *desc)
 {
-	struct egpio_info *ei = get_irq_data(irq);
+	struct egpio_info *ei = irq_desc_get_handler_data(desc);
 	int irqpin;
 
 	/* Read current pins. */
@@ -113,9 +113,7 @@
 	for_each_set_bit(irqpin, &readval, ei->nirqs) {
 		/* Run irq handler */
 		pr_debug("got IRQ %d\n", irqpin);
-		irq = ei->irq_start + irqpin;
-		desc = irq_to_desc(irq);
-		desc->handle_irq(irq, desc);
+		generic_handle_irq(ei->irq_start + irqpin);
 	}
 }
 
@@ -346,14 +344,14 @@
 			ei->ack_write = 0;
 		irq_end = ei->irq_start + ei->nirqs;
 		for (irq = ei->irq_start; irq < irq_end; irq++) {
-			set_irq_chip(irq, &egpio_muxed_chip);
-			set_irq_chip_data(irq, ei);
-			set_irq_handler(irq, handle_simple_irq);
+			irq_set_chip_and_handler(irq, &egpio_muxed_chip,
+						 handle_simple_irq);
+			irq_set_chip_data(irq, ei);
 			set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 		}
-		set_irq_type(ei->chained_irq, IRQ_TYPE_EDGE_RISING);
-		set_irq_data(ei->chained_irq, ei);
-		set_irq_chained_handler(ei->chained_irq, egpio_handler);
+		irq_set_irq_type(ei->chained_irq, IRQ_TYPE_EDGE_RISING);
+		irq_set_handler_data(ei->chained_irq, ei);
+		irq_set_chained_handler(ei->chained_irq, egpio_handler);
 		ack_irqs(ei);
 
 		device_init_wakeup(&pdev->dev, 1);
@@ -375,11 +373,10 @@
 	if (ei->chained_irq) {
 		irq_end = ei->irq_start + ei->nirqs;
 		for (irq = ei->irq_start; irq < irq_end; irq++) {
-			set_irq_chip(irq, NULL);
-			set_irq_handler(irq, NULL);
+			irq_set_chip_and_handler(irq, NULL, NULL);
 			set_irq_flags(irq, 0);
 		}
-		set_irq_chained_handler(ei->chained_irq, NULL);
+		irq_set_chained_handler(ei->chained_irq, NULL);
 		device_init_wakeup(&pdev->dev, 0);
 	}
 	iounmap(ei->base_addr);
diff --git a/drivers/mfd/htc-i2cpld.c b/drivers/mfd/htc-i2cpld.c
index 296ad15..d55065c 100644
--- a/drivers/mfd/htc-i2cpld.c
+++ b/drivers/mfd/htc-i2cpld.c
@@ -58,6 +58,7 @@
 	uint                    irq_start;
 	int                     nirqs;
 
+	unsigned int		flow_type;
 	/*
 	 * Work structure to allow for setting values outside of any
 	 * possible interrupt context
@@ -97,12 +98,7 @@
 
 static int htcpld_set_type(struct irq_data *data, unsigned int flags)
 {
-	struct irq_desc *d = irq_to_desc(data->irq);
-
-	if (!d) {
-		pr_err("HTCPLD invalid IRQ: %d\n", data->irq);
-		return -EINVAL;
-	}
+	struct htcpld_chip *chip = irq_data_get_irq_chip_data(data);
 
 	if (flags & ~IRQ_TYPE_SENSE_MASK)
 		return -EINVAL;
@@ -111,9 +107,7 @@
 	if (flags & (IRQ_TYPE_LEVEL_LOW|IRQ_TYPE_LEVEL_HIGH))
 		return -EINVAL;
 
-	d->status &= ~IRQ_TYPE_SENSE_MASK;
-	d->status |= flags;
-
+	chip->flow_type = flags;
 	return 0;
 }
 
@@ -135,7 +129,6 @@
 	unsigned int i;
 	unsigned long flags;
 	int irqpin;
-	struct irq_desc *desc;
 
 	if (!htcpld) {
 		pr_debug("htcpld is null in ISR\n");
@@ -195,23 +188,19 @@
 		 * associated interrupts.
 		 */
 		for (irqpin = 0; irqpin < chip->nirqs; irqpin++) {
-			unsigned oldb, newb;
-			int flags;
+			unsigned oldb, newb, type = chip->flow_type;
 
 			irq = chip->irq_start + irqpin;
-			desc = irq_to_desc(irq);
-			flags = desc->status;
 
 			/* Run the IRQ handler, but only if the bit value
 			 * changed, and the proper flags are set */
 			oldb = (old_val >> irqpin) & 1;
 			newb = (uval >> irqpin) & 1;
 
-			if ((!oldb && newb && (flags & IRQ_TYPE_EDGE_RISING)) ||
-			    (oldb && !newb &&
-			     (flags & IRQ_TYPE_EDGE_FALLING))) {
+			if ((!oldb && newb && (type & IRQ_TYPE_EDGE_RISING)) ||
+			    (oldb && !newb && (type & IRQ_TYPE_EDGE_FALLING))) {
 				pr_debug("fire IRQ %d\n", irqpin);
-				desc->handle_irq(irq, desc);
+				generic_handle_irq(irq);
 			}
 		}
 	}
@@ -359,13 +348,13 @@
 	/* Setup irq handlers */
 	irq_end = chip->irq_start + chip->nirqs;
 	for (irq = chip->irq_start; irq < irq_end; irq++) {
-		set_irq_chip(irq, &htcpld_muxed_chip);
-		set_irq_chip_data(irq, chip);
-		set_irq_handler(irq, handle_simple_irq);
+		irq_set_chip_and_handler(irq, &htcpld_muxed_chip,
+					 handle_simple_irq);
+		irq_set_chip_data(irq, chip);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 #else
-		set_irq_probe(irq);
+		irq_set_probe(irq);
 #endif
 	}
 
diff --git a/drivers/mfd/jz4740-adc.c b/drivers/mfd/jz4740-adc.c
index aa518b9..a0bd0cf 100644
--- a/drivers/mfd/jz4740-adc.c
+++ b/drivers/mfd/jz4740-adc.c
@@ -112,7 +112,7 @@
 
 static void jz4740_adc_irq_demux(unsigned int irq, struct irq_desc *desc)
 {
-	struct jz4740_adc *adc = get_irq_desc_data(desc);
+	struct jz4740_adc *adc = irq_desc_get_handler_data(desc);
 	uint8_t status;
 	unsigned int i;
 
@@ -310,13 +310,13 @@
 	platform_set_drvdata(pdev, adc);
 
 	for (irq = adc->irq_base; irq < adc->irq_base + 5; ++irq) {
-		set_irq_chip_data(irq, adc);
-		set_irq_chip_and_handler(irq, &jz4740_adc_irq_chip,
-		    handle_level_irq);
+		irq_set_chip_data(irq, adc);
+		irq_set_chip_and_handler(irq, &jz4740_adc_irq_chip,
+					 handle_level_irq);
 	}
 
-	set_irq_data(adc->irq, adc);
-	set_irq_chained_handler(adc->irq, jz4740_adc_irq_demux);
+	irq_set_handler_data(adc->irq, adc);
+	irq_set_chained_handler(adc->irq, jz4740_adc_irq_demux);
 
 	writeb(0x00, adc->base + JZ_REG_ADC_ENABLE);
 	writeb(0xff, adc->base + JZ_REG_ADC_CTRL);
@@ -347,8 +347,8 @@
 
 	mfd_remove_devices(&pdev->dev);
 
-	set_irq_data(adc->irq, NULL);
-	set_irq_chained_handler(adc->irq, NULL);
+	irq_set_handler_data(adc->irq, NULL);
+	irq_set_chained_handler(adc->irq, NULL);
 
 	iounmap(adc->base);
 	release_mem_region(adc->mem->start, resource_size(adc->mem));
diff --git a/drivers/mfd/max8925-core.c b/drivers/mfd/max8925-core.c
index 0e998dc..58cc5fd 100644
--- a/drivers/mfd/max8925-core.c
+++ b/drivers/mfd/max8925-core.c
@@ -517,7 +517,6 @@
 			    struct max8925_platform_data *pdata)
 {
 	unsigned long flags = IRQF_TRIGGER_FALLING | IRQF_ONESHOT;
-	struct irq_desc *desc;
 	int i, ret;
 	int __irq;
 
@@ -544,19 +543,18 @@
 	mutex_init(&chip->irq_lock);
 	chip->core_irq = irq;
 	chip->irq_base = pdata->irq_base;
-	desc = irq_to_desc(chip->core_irq);
 
 	/* register with genirq */
 	for (i = 0; i < ARRAY_SIZE(max8925_irqs); i++) {
 		__irq = i + chip->irq_base;
-		set_irq_chip_data(__irq, chip);
-		set_irq_chip_and_handler(__irq, &max8925_irq_chip,
+		irq_set_chip_data(__irq, chip);
+		irq_set_chip_and_handler(__irq, &max8925_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(__irq, 1);
+		irq_set_nested_thread(__irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(__irq, IRQF_VALID);
 #else
-		set_irq_noprobe(__irq);
+		irq_set_noprobe(__irq);
 #endif
 	}
 	if (!irq) {
diff --git a/drivers/mfd/max8997-irq.c b/drivers/mfd/max8997-irq.c
new file mode 100644
index 0000000..638bf7e
--- /dev/null
+++ b/drivers/mfd/max8997-irq.c
@@ -0,0 +1,377 @@
+/*
+ * max8997-irq.c - Interrupt controller support for MAX8997
+ *
+ * Copyright (C) 2011 Samsung Electronics Co.Ltd
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max8998-irq.c
+ */
+
+#include <linux/err.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/max8997.h>
+#include <linux/mfd/max8997-private.h>
+
+static const u8 max8997_mask_reg[] = {
+	[PMIC_INT1] = MAX8997_REG_INT1MSK,
+	[PMIC_INT2] = MAX8997_REG_INT2MSK,
+	[PMIC_INT3] = MAX8997_REG_INT3MSK,
+	[PMIC_INT4] = MAX8997_REG_INT4MSK,
+	[FUEL_GAUGE] = MAX8997_REG_INVALID,
+	[MUIC_INT1] = MAX8997_MUIC_REG_INTMASK1,
+	[MUIC_INT2] = MAX8997_MUIC_REG_INTMASK2,
+	[MUIC_INT3] = MAX8997_MUIC_REG_INTMASK3,
+	[GPIO_LOW] = MAX8997_REG_INVALID,
+	[GPIO_HI] = MAX8997_REG_INVALID,
+	[FLASH_STATUS] = MAX8997_REG_INVALID,
+};
+
+static struct i2c_client *get_i2c(struct max8997_dev *max8997,
+				enum max8997_irq_source src)
+{
+	switch (src) {
+	case PMIC_INT1 ... PMIC_INT4:
+		return max8997->i2c;
+	case FUEL_GAUGE:
+		return NULL;
+	case MUIC_INT1 ... MUIC_INT3:
+		return max8997->muic;
+	case GPIO_LOW ... GPIO_HI:
+		return max8997->i2c;
+	case FLASH_STATUS:
+		return max8997->i2c;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+struct max8997_irq_data {
+	int mask;
+	enum max8997_irq_source group;
+};
+
+#define DECLARE_IRQ(idx, _group, _mask)		\
+	[(idx)] = { .group = (_group), .mask = (_mask) }
+static const struct max8997_irq_data max8997_irqs[] = {
+	DECLARE_IRQ(MAX8997_PMICIRQ_PWRONR,	PMIC_INT1, 1 << 0),
+	DECLARE_IRQ(MAX8997_PMICIRQ_PWRONF,	PMIC_INT1, 1 << 1),
+	DECLARE_IRQ(MAX8997_PMICIRQ_PWRON1SEC,	PMIC_INT1, 1 << 3),
+	DECLARE_IRQ(MAX8997_PMICIRQ_JIGONR,	PMIC_INT1, 1 << 4),
+	DECLARE_IRQ(MAX8997_PMICIRQ_JIGONF,	PMIC_INT1, 1 << 5),
+	DECLARE_IRQ(MAX8997_PMICIRQ_LOWBAT2,	PMIC_INT1, 1 << 6),
+	DECLARE_IRQ(MAX8997_PMICIRQ_LOWBAT1,	PMIC_INT1, 1 << 7),
+
+	DECLARE_IRQ(MAX8997_PMICIRQ_JIGR,	PMIC_INT2, 1 << 0),
+	DECLARE_IRQ(MAX8997_PMICIRQ_JIGF,	PMIC_INT2, 1 << 1),
+	DECLARE_IRQ(MAX8997_PMICIRQ_MR,		PMIC_INT2, 1 << 2),
+	DECLARE_IRQ(MAX8997_PMICIRQ_DVS1OK,	PMIC_INT2, 1 << 3),
+	DECLARE_IRQ(MAX8997_PMICIRQ_DVS2OK,	PMIC_INT2, 1 << 4),
+	DECLARE_IRQ(MAX8997_PMICIRQ_DVS3OK,	PMIC_INT2, 1 << 5),
+	DECLARE_IRQ(MAX8997_PMICIRQ_DVS4OK,	PMIC_INT2, 1 << 6),
+
+	DECLARE_IRQ(MAX8997_PMICIRQ_CHGINS,	PMIC_INT3, 1 << 0),
+	DECLARE_IRQ(MAX8997_PMICIRQ_CHGRM,	PMIC_INT3, 1 << 1),
+	DECLARE_IRQ(MAX8997_PMICIRQ_DCINOVP,	PMIC_INT3, 1 << 2),
+	DECLARE_IRQ(MAX8997_PMICIRQ_TOPOFFR,	PMIC_INT3, 1 << 3),
+	DECLARE_IRQ(MAX8997_PMICIRQ_CHGRSTF,	PMIC_INT3, 1 << 5),
+	DECLARE_IRQ(MAX8997_PMICIRQ_MBCHGTMEXPD,	PMIC_INT3, 1 << 7),
+
+	DECLARE_IRQ(MAX8997_PMICIRQ_RTC60S,	PMIC_INT4, 1 << 0),
+	DECLARE_IRQ(MAX8997_PMICIRQ_RTCA1,	PMIC_INT4, 1 << 1),
+	DECLARE_IRQ(MAX8997_PMICIRQ_RTCA2,	PMIC_INT4, 1 << 2),
+	DECLARE_IRQ(MAX8997_PMICIRQ_SMPL_INT,	PMIC_INT4, 1 << 3),
+	DECLARE_IRQ(MAX8997_PMICIRQ_RTC1S,	PMIC_INT4, 1 << 4),
+	DECLARE_IRQ(MAX8997_PMICIRQ_WTSR,	PMIC_INT4, 1 << 5),
+
+	DECLARE_IRQ(MAX8997_MUICIRQ_ADCError,	MUIC_INT1, 1 << 2),
+	DECLARE_IRQ(MAX8997_MUICIRQ_ADCLow,	MUIC_INT1, 1 << 1),
+	DECLARE_IRQ(MAX8997_MUICIRQ_ADC,	MUIC_INT1, 1 << 0),
+
+	DECLARE_IRQ(MAX8997_MUICIRQ_VBVolt,	MUIC_INT2, 1 << 4),
+	DECLARE_IRQ(MAX8997_MUICIRQ_DBChg,	MUIC_INT2, 1 << 3),
+	DECLARE_IRQ(MAX8997_MUICIRQ_DCDTmr,	MUIC_INT2, 1 << 2),
+	DECLARE_IRQ(MAX8997_MUICIRQ_ChgDetRun,	MUIC_INT2, 1 << 1),
+	DECLARE_IRQ(MAX8997_MUICIRQ_ChgTyp,	MUIC_INT2, 1 << 0),
+
+	DECLARE_IRQ(MAX8997_MUICIRQ_OVP,	MUIC_INT3, 1 << 2),
+};
+
+static void max8997_irq_lock(struct irq_data *data)
+{
+	struct max8997_dev *max8997 = irq_get_chip_data(data->irq);
+
+	mutex_lock(&max8997->irqlock);
+}
+
+static void max8997_irq_sync_unlock(struct irq_data *data)
+{
+	struct max8997_dev *max8997 = irq_get_chip_data(data->irq);
+	int i;
+
+	for (i = 0; i < MAX8997_IRQ_GROUP_NR; i++) {
+		u8 mask_reg = max8997_mask_reg[i];
+		struct i2c_client *i2c = get_i2c(max8997, i);
+
+		if (mask_reg == MAX8997_REG_INVALID ||
+				IS_ERR_OR_NULL(i2c))
+			continue;
+		max8997->irq_masks_cache[i] = max8997->irq_masks_cur[i];
+
+		max8997_write_reg(i2c, max8997_mask_reg[i],
+				max8997->irq_masks_cur[i]);
+	}
+
+	mutex_unlock(&max8997->irqlock);
+}
+
+static const inline struct max8997_irq_data *
+irq_to_max8997_irq(struct max8997_dev *max8997, int irq)
+{
+	return &max8997_irqs[irq - max8997->irq_base];
+}
+
+static void max8997_irq_mask(struct irq_data *data)
+{
+	struct max8997_dev *max8997 = irq_get_chip_data(data->irq);
+	const struct max8997_irq_data *irq_data = irq_to_max8997_irq(max8997,
+								data->irq);
+
+	max8997->irq_masks_cur[irq_data->group] |= irq_data->mask;
+}
+
+static void max8997_irq_unmask(struct irq_data *data)
+{
+	struct max8997_dev *max8997 = irq_get_chip_data(data->irq);
+	const struct max8997_irq_data *irq_data = irq_to_max8997_irq(max8997,
+								data->irq);
+
+	max8997->irq_masks_cur[irq_data->group] &= ~irq_data->mask;
+}
+
+static struct irq_chip max8997_irq_chip = {
+	.name			= "max8997",
+	.irq_bus_lock		= max8997_irq_lock,
+	.irq_bus_sync_unlock	= max8997_irq_sync_unlock,
+	.irq_mask		= max8997_irq_mask,
+	.irq_unmask		= max8997_irq_unmask,
+};
+
+#define MAX8997_IRQSRC_PMIC		(1 << 1)
+#define MAX8997_IRQSRC_FUELGAUGE	(1 << 2)
+#define MAX8997_IRQSRC_MUIC		(1 << 3)
+#define MAX8997_IRQSRC_GPIO		(1 << 4)
+#define MAX8997_IRQSRC_FLASH		(1 << 5)
+static irqreturn_t max8997_irq_thread(int irq, void *data)
+{
+	struct max8997_dev *max8997 = data;
+	u8 irq_reg[MAX8997_IRQ_GROUP_NR] = {};
+	u8 irq_src;
+	int ret;
+	int i;
+
+	ret = max8997_read_reg(max8997->i2c, MAX8997_REG_INTSRC, &irq_src);
+	if (ret < 0) {
+		dev_err(max8997->dev, "Failed to read interrupt source: %d\n",
+				ret);
+		return IRQ_NONE;
+	}
+
+	if (irq_src & MAX8997_IRQSRC_PMIC) {
+		/* PMIC INT1 ~ INT4 */
+		max8997_bulk_read(max8997->i2c, MAX8997_REG_INT1, 4,
+				&irq_reg[PMIC_INT1]);
+	}
+	if (irq_src & MAX8997_IRQSRC_FUELGAUGE) {
+		/*
+		 * TODO: FUEL GAUGE
+		 *
+		 * This is to be supported by Max17042 driver. When
+		 * an interrupt incurs here, it should be relayed to a
+		 * Max17042 device that is connected (probably by
+		 * platform-data). However, we do not have interrupt
+		 * handling in Max17042 driver currently. The Max17042 IRQ
+		 * driver should be ready to be used as a stand-alone device and
+		 * a Max8997-dependent device. Because it is not ready in
+		 * Max17042-side and it is not too critical in operating
+		 * Max8997, we do not implement this in initial releases.
+		 */
+		irq_reg[FUEL_GAUGE] = 0;
+	}
+	if (irq_src & MAX8997_IRQSRC_MUIC) {
+		/* MUIC INT1 ~ INT3 */
+		max8997_bulk_read(max8997->muic, MAX8997_MUIC_REG_INT1, 3,
+				&irq_reg[MUIC_INT1]);
+	}
+	if (irq_src & MAX8997_IRQSRC_GPIO) {
+		/* GPIO Interrupt */
+		u8 gpio_info[MAX8997_NUM_GPIO];
+
+		irq_reg[GPIO_LOW] = 0;
+		irq_reg[GPIO_HI] = 0;
+
+		max8997_bulk_read(max8997->i2c, MAX8997_REG_GPIOCNTL1,
+				MAX8997_NUM_GPIO, gpio_info);
+		for (i = 0; i < MAX8997_NUM_GPIO; i++) {
+			bool interrupt = false;
+
+			switch (gpio_info[i] & MAX8997_GPIO_INT_MASK) {
+			case MAX8997_GPIO_INT_BOTH:
+				if (max8997->gpio_status[i] != gpio_info[i])
+					interrupt = true;
+				break;
+			case MAX8997_GPIO_INT_RISE:
+				if ((max8997->gpio_status[i] != gpio_info[i]) &&
+				    (gpio_info[i] & MAX8997_GPIO_DATA_MASK))
+					interrupt = true;
+				break;
+			case MAX8997_GPIO_INT_FALL:
+				if ((max8997->gpio_status[i] != gpio_info[i]) &&
+				    !(gpio_info[i] & MAX8997_GPIO_DATA_MASK))
+					interrupt = true;
+				break;
+			default:
+				break;
+			}
+
+			if (interrupt) {
+				if (i < 8)
+					irq_reg[GPIO_LOW] |= (1 << i);
+				else
+					irq_reg[GPIO_HI] |= (1 << (i - 8));
+			}
+
+		}
+	}
+	if (irq_src & MAX8997_IRQSRC_FLASH) {
+		/* Flash Status Interrupt */
+		ret = max8997_read_reg(max8997->i2c, MAX8997_REG_FLASHSTATUS,
+				&irq_reg[FLASH_STATUS]);
+	}
+
+	/* Apply masking */
+	for (i = 0; i < MAX8997_IRQ_GROUP_NR; i++)
+		irq_reg[i] &= ~max8997->irq_masks_cur[i];
+
+	/* Report */
+	for (i = 0; i < MAX8997_IRQ_NR; i++) {
+		if (irq_reg[max8997_irqs[i].group] & max8997_irqs[i].mask)
+			handle_nested_irq(max8997->irq_base + i);
+	}
+
+	return IRQ_HANDLED;
+}
+
+int max8997_irq_resume(struct max8997_dev *max8997)
+{
+	if (max8997->irq && max8997->irq_base)
+		max8997_irq_thread(max8997->irq_base, max8997);
+	return 0;
+}
+
+int max8997_irq_init(struct max8997_dev *max8997)
+{
+	int i;
+	int cur_irq;
+	int ret;
+	u8 val;
+
+	if (!max8997->irq) {
+		dev_warn(max8997->dev, "No interrupt specified.\n");
+		max8997->irq_base = 0;
+		return 0;
+	}
+
+	if (!max8997->irq_base) {
+		dev_err(max8997->dev, "No interrupt base specified.\n");
+		return 0;
+	}
+
+	mutex_init(&max8997->irqlock);
+
+	/* Mask individual interrupt sources */
+	for (i = 0; i < MAX8997_IRQ_GROUP_NR; i++) {
+		struct i2c_client *i2c;
+
+		max8997->irq_masks_cur[i] = 0xff;
+		max8997->irq_masks_cache[i] = 0xff;
+		i2c = get_i2c(max8997, i);
+
+		if (IS_ERR_OR_NULL(i2c))
+			continue;
+		if (max8997_mask_reg[i] == MAX8997_REG_INVALID)
+			continue;
+
+		max8997_write_reg(i2c, max8997_mask_reg[i], 0xff);
+	}
+
+	for (i = 0; i < MAX8997_NUM_GPIO; i++) {
+		max8997->gpio_status[i] = (max8997_read_reg(max8997->i2c,
+						MAX8997_REG_GPIOCNTL1 + i,
+						&val)
+					& MAX8997_GPIO_DATA_MASK) ?
+					true : false;
+	}
+
+	/* Register with genirq */
+	for (i = 0; i < MAX8997_IRQ_NR; i++) {
+		cur_irq = i + max8997->irq_base;
+		irq_set_chip_data(cur_irq, max8997);
+		irq_set_chip_and_handler(cur_irq, &max8997_irq_chip,
+				handle_edge_irq);
+		irq_set_nested_thread(cur_irq, 1);
+#ifdef CONFIG_ARM
+		set_irq_flags(cur_irq, IRQF_VALID);
+#else
+		irq_set_noprobe(cur_irq);
+#endif
+	}
+
+	ret = request_threaded_irq(max8997->irq, NULL, max8997_irq_thread,
+			IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+			"max8997-irq", max8997);
+
+	if (ret) {
+		dev_err(max8997->dev, "Failed to request IRQ %d: %d\n",
+				max8997->irq, ret);
+		return ret;
+	}
+
+	if (!max8997->ono)
+		return 0;
+
+	ret = request_threaded_irq(max8997->ono, NULL, max8997_irq_thread,
+			IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING |
+			IRQF_ONESHOT, "max8997-ono", max8997);
+
+	if (ret)
+		dev_err(max8997->dev, "Failed to request ono-IRQ %d: %d\n",
+				max8997->ono, ret);
+
+	return 0;
+}
+
+void max8997_irq_exit(struct max8997_dev *max8997)
+{
+	if (max8997->ono)
+		free_irq(max8997->ono, max8997);
+
+	if (max8997->irq)
+		free_irq(max8997->irq, max8997);
+}
diff --git a/drivers/mfd/max8998-irq.c b/drivers/mfd/max8998-irq.c
index 3903e1f..5919710 100644
--- a/drivers/mfd/max8998-irq.c
+++ b/drivers/mfd/max8998-irq.c
@@ -224,14 +224,14 @@
 	/* register with genirq */
 	for (i = 0; i < MAX8998_IRQ_NR; i++) {
 		cur_irq = i + max8998->irq_base;
-		set_irq_chip_data(cur_irq, max8998);
-		set_irq_chip_and_handler(cur_irq, &max8998_irq_chip,
+		irq_set_chip_data(cur_irq, max8998);
+		irq_set_chip_and_handler(cur_irq, &max8998_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(cur_irq, 1);
+		irq_set_nested_thread(cur_irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(cur_irq, IRQF_VALID);
 #else
-		set_irq_noprobe(cur_irq);
+		irq_set_noprobe(cur_irq);
 #endif
 	}
 
diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c
index c002142..9ec7570 100644
--- a/drivers/mfd/max8998.c
+++ b/drivers/mfd/max8998.c
@@ -209,7 +209,7 @@
 	struct max8998_dev *max8998 = i2c_get_clientdata(i2c);
 
 	if (max8998->wakeup)
-		set_irq_wake(max8998->irq, 1);
+		irq_set_irq_wake(max8998->irq, 1);
 	return 0;
 }
 
@@ -219,7 +219,7 @@
 	struct max8998_dev *max8998 = i2c_get_clientdata(i2c);
 
 	if (max8998->wakeup)
-		set_irq_wake(max8998->irq, 0);
+		irq_set_irq_wake(max8998->irq, 0);
 	/*
 	 * In LP3974, if IRQ registers are not "read & clear"
 	 * when it's set during sleep, the interrupt becomes
diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c
index 79eda02..d01574d 100644
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -184,16 +184,12 @@
 }
 EXPORT_SYMBOL(mfd_remove_devices);
 
-static int add_shared_platform_device(const char *cell, const char *name)
+int mfd_clone_cell(const char *cell, const char **clones, size_t n_clones)
 {
 	struct mfd_cell cell_entry;
 	struct device *dev;
 	struct platform_device *pdev;
-	int err;
-
-	/* check if we've already registered a device (don't fail if we have) */
-	if (bus_find_device_by_name(&platform_bus_type, NULL, name))
-		return 0;
+	int i;
 
 	/* fetch the parent cell's device (should already be registered!) */
 	dev = bus_find_device_by_name(&platform_bus_type, NULL, cell);
@@ -206,44 +202,17 @@
 
 	WARN_ON(!cell_entry.enable);
 
-	cell_entry.name = name;
-	err = mfd_add_device(pdev->dev.parent, -1, &cell_entry, NULL, 0);
-	if (err)
-		dev_err(dev, "MFD add devices failed: %d\n", err);
-	return err;
+	for (i = 0; i < n_clones; i++) {
+		cell_entry.name = clones[i];
+		/* don't give up if a single call fails; just report error */
+		if (mfd_add_device(pdev->dev.parent, -1, &cell_entry, NULL, 0))
+			dev_err(dev, "failed to create platform device '%s'\n",
+					clones[i]);
+	}
+
+	return 0;
 }
-
-int mfd_shared_platform_driver_register(struct platform_driver *drv,
-		const char *cellname)
-{
-	int err;
-
-	err = add_shared_platform_device(cellname, drv->driver.name);
-	if (err)
-		printk(KERN_ERR "failed to add platform device %s\n",
-				drv->driver.name);
-
-	err = platform_driver_register(drv);
-	if (err)
-		printk(KERN_ERR "failed to add platform driver %s\n",
-				drv->driver.name);
-
-	return err;
-}
-EXPORT_SYMBOL(mfd_shared_platform_driver_register);
-
-void mfd_shared_platform_driver_unregister(struct platform_driver *drv)
-{
-	struct device *dev;
-
-	dev = bus_find_device_by_name(&platform_bus_type, NULL,
-			drv->driver.name);
-	if (dev)
-		platform_device_unregister(to_platform_device(dev));
-
-	platform_driver_unregister(drv);
-}
-EXPORT_SYMBOL(mfd_shared_platform_driver_unregister);
+EXPORT_SYMBOL(mfd_clone_cell);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Ian Molton, Dmitry Baryshkov");
diff --git a/drivers/mfd/pcf50633-core.c b/drivers/mfd/pcf50633-core.c
index c1306ed..c7687f6 100644
--- a/drivers/mfd/pcf50633-core.c
+++ b/drivers/mfd/pcf50633-core.c
@@ -356,7 +356,7 @@
 	return 0;
 }
 
-static struct i2c_device_id pcf50633_id_table[] = {
+static const struct i2c_device_id pcf50633_id_table[] = {
 	{"pcf50633", 0x73},
 	{/* end of list */}
 };
diff --git a/drivers/mfd/rdc321x-southbridge.c b/drivers/mfd/rdc321x-southbridge.c
index 193c940..10dbe63 100644
--- a/drivers/mfd/rdc321x-southbridge.c
+++ b/drivers/mfd/rdc321x-southbridge.c
@@ -97,6 +97,7 @@
 	{ PCI_DEVICE(PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030) },
 	{}
 };
+MODULE_DEVICE_TABLE(pci, rdc321x_sb_table);
 
 static struct pci_driver rdc321x_sb_driver = {
 	.name		= "RDC321x Southbridge",
diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c
index 3e5732b..7ab7746 100644
--- a/drivers/mfd/stmpe.c
+++ b/drivers/mfd/stmpe.c
@@ -762,14 +762,14 @@
 	int irq;
 
 	for (irq = base; irq < base + num_irqs; irq++) {
-		set_irq_chip_data(irq, stmpe);
-		set_irq_chip_and_handler(irq, &stmpe_irq_chip,
+		irq_set_chip_data(irq, stmpe);
+		irq_set_chip_and_handler(irq, &stmpe_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(irq, 1);
+		irq_set_nested_thread(irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID);
 #else
-		set_irq_noprobe(irq);
+		irq_set_noprobe(irq);
 #endif
 	}
 
@@ -786,8 +786,8 @@
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, 0);
 #endif
-		set_irq_chip_and_handler(irq, NULL, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip_and_handler(irq, NULL, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
 }
 
diff --git a/drivers/mfd/t7l66xb.c b/drivers/mfd/t7l66xb.c
index af57fc7..42830e6 100644
--- a/drivers/mfd/t7l66xb.c
+++ b/drivers/mfd/t7l66xb.c
@@ -186,7 +186,7 @@
 /* Handle the T7L66XB interrupt mux */
 static void t7l66xb_irq(unsigned int irq, struct irq_desc *desc)
 {
-	struct t7l66xb *t7l66xb = get_irq_data(irq);
+	struct t7l66xb *t7l66xb = irq_get_handler_data(irq);
 	unsigned int isr;
 	unsigned int i, irq_base;
 
@@ -243,17 +243,16 @@
 	irq_base = t7l66xb->irq_base;
 
 	for (irq = irq_base; irq < irq_base + T7L66XB_NR_IRQS; irq++) {
-		set_irq_chip(irq, &t7l66xb_chip);
-		set_irq_chip_data(irq, t7l66xb);
-		set_irq_handler(irq, handle_level_irq);
+		irq_set_chip_and_handler(irq, &t7l66xb_chip, handle_level_irq);
+		irq_set_chip_data(irq, t7l66xb);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 #endif
 	}
 
-	set_irq_type(t7l66xb->irq, IRQ_TYPE_EDGE_FALLING);
-	set_irq_data(t7l66xb->irq, t7l66xb);
-	set_irq_chained_handler(t7l66xb->irq, t7l66xb_irq);
+	irq_set_irq_type(t7l66xb->irq, IRQ_TYPE_EDGE_FALLING);
+	irq_set_handler_data(t7l66xb->irq, t7l66xb);
+	irq_set_chained_handler(t7l66xb->irq, t7l66xb_irq);
 }
 
 static void t7l66xb_detach_irq(struct platform_device *dev)
@@ -263,15 +262,15 @@
 
 	irq_base = t7l66xb->irq_base;
 
-	set_irq_chained_handler(t7l66xb->irq, NULL);
-	set_irq_data(t7l66xb->irq, NULL);
+	irq_set_chained_handler(t7l66xb->irq, NULL);
+	irq_set_handler_data(t7l66xb->irq, NULL);
 
 	for (irq = irq_base; irq < irq_base + T7L66XB_NR_IRQS; irq++) {
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, 0);
 #endif
-		set_irq_chip(irq, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip(irq, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
 }
 
diff --git a/drivers/mfd/tc3589x.c b/drivers/mfd/tc3589x.c
index 729dbee..c27e515 100644
--- a/drivers/mfd/tc3589x.c
+++ b/drivers/mfd/tc3589x.c
@@ -192,14 +192,14 @@
 	int irq;
 
 	for (irq = base; irq < base + TC3589x_NR_INTERNAL_IRQS; irq++) {
-		set_irq_chip_data(irq, tc3589x);
-		set_irq_chip_and_handler(irq, &dummy_irq_chip,
+		irq_set_chip_data(irq, tc3589x);
+		irq_set_chip_and_handler(irq, &dummy_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(irq, 1);
+		irq_set_nested_thread(irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, IRQF_VALID);
 #else
-		set_irq_noprobe(irq);
+		irq_set_noprobe(irq);
 #endif
 	}
 
@@ -215,8 +215,8 @@
 #ifdef CONFIG_ARM
 		set_irq_flags(irq, 0);
 #endif
-		set_irq_chip_and_handler(irq, NULL, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip_and_handler(irq, NULL, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
 }
 
diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c
index 3d62ded..fc53ce2 100644
--- a/drivers/mfd/tc6393xb.c
+++ b/drivers/mfd/tc6393xb.c
@@ -513,7 +513,7 @@
 static void
 tc6393xb_irq(unsigned int irq, struct irq_desc *desc)
 {
-	struct tc6393xb *tc6393xb = get_irq_data(irq);
+	struct tc6393xb *tc6393xb = irq_get_handler_data(irq);
 	unsigned int isr;
 	unsigned int i, irq_base;
 
@@ -572,15 +572,14 @@
 	irq_base = tc6393xb->irq_base;
 
 	for (irq = irq_base; irq < irq_base + TC6393XB_NR_IRQS; irq++) {
-		set_irq_chip(irq, &tc6393xb_chip);
-		set_irq_chip_data(irq, tc6393xb);
-		set_irq_handler(irq, handle_edge_irq);
+		irq_set_chip_and_handler(irq, &tc6393xb_chip, handle_edge_irq);
+		irq_set_chip_data(irq, tc6393xb);
 		set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
 	}
 
-	set_irq_type(tc6393xb->irq, IRQ_TYPE_EDGE_FALLING);
-	set_irq_data(tc6393xb->irq, tc6393xb);
-	set_irq_chained_handler(tc6393xb->irq, tc6393xb_irq);
+	irq_set_irq_type(tc6393xb->irq, IRQ_TYPE_EDGE_FALLING);
+	irq_set_handler_data(tc6393xb->irq, tc6393xb);
+	irq_set_chained_handler(tc6393xb->irq, tc6393xb_irq);
 }
 
 static void tc6393xb_detach_irq(struct platform_device *dev)
@@ -588,15 +587,15 @@
 	struct tc6393xb *tc6393xb = platform_get_drvdata(dev);
 	unsigned int irq, irq_base;
 
-	set_irq_chained_handler(tc6393xb->irq, NULL);
-	set_irq_data(tc6393xb->irq, NULL);
+	irq_set_chained_handler(tc6393xb->irq, NULL);
+	irq_set_handler_data(tc6393xb->irq, NULL);
 
 	irq_base = tc6393xb->irq_base;
 
 	for (irq = irq_base; irq < irq_base + TC6393XB_NR_IRQS; irq++) {
 		set_irq_flags(irq, 0);
-		set_irq_chip(irq, NULL);
-		set_irq_chip_data(irq, NULL);
+		irq_set_chip(irq, NULL);
+		irq_set_chip_data(irq, NULL);
 	}
 }
 
diff --git a/drivers/mfd/tps6586x.c b/drivers/mfd/tps6586x.c
index 0aa9186..b600808 100644
--- a/drivers/mfd/tps6586x.c
+++ b/drivers/mfd/tps6586x.c
@@ -422,10 +422,10 @@
 
 	for (i = 0; i < ARRAY_SIZE(tps6586x_irqs); i++) {
 		int __irq = i + tps6586x->irq_base;
-		set_irq_chip_data(__irq, tps6586x);
-		set_irq_chip_and_handler(__irq, &tps6586x->irq_chip,
+		irq_set_chip_data(__irq, tps6586x);
+		irq_set_chip_and_handler(__irq, &tps6586x->irq_chip,
 					 handle_simple_irq);
-		set_irq_nested_thread(__irq, 1);
+		irq_set_nested_thread(__irq, 1);
 #ifdef CONFIG_ARM
 		set_irq_flags(__irq, IRQF_VALID);
 #endif
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
index 63a30e8..8a7ee31 100644
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -320,24 +320,8 @@
 		for (module_irq = twl4030_irq_base;
 				pih_isr;
 				pih_isr >>= 1, module_irq++) {
-			if (pih_isr & 0x1) {
-				struct irq_desc *d = irq_to_desc(module_irq);
-
-				if (!d) {
-					pr_err("twl4030: Invalid SIH IRQ: %d\n",
-					       module_irq);
-					return -EINVAL;
-				}
-
-				/* These can't be masked ... always warn
-				 * if we get any surprises.
-				 */
-				if (d->status & IRQ_DISABLED)
-					note_interrupt(module_irq, d,
-							IRQ_NONE);
-				else
-					d->handle_irq(module_irq, d);
-			}
+			if (pih_isr & 0x1)
+				generic_handle_irq(module_irq);
 		}
 		local_irq_enable();
 
@@ -470,7 +454,7 @@
 	set_irq_flags(irq, IRQF_VALID);
 #else
 	/* same effect on other architectures */
-	set_irq_noprobe(irq);
+	irq_set_noprobe(irq);
 #endif
 }
 
@@ -560,24 +544,18 @@
 	/* Modify only the bits we know must change */
 	while (edge_change) {
 		int		i = fls(edge_change) - 1;
-		struct irq_desc	*d = irq_to_desc(i + agent->irq_base);
+		struct irq_data	*idata = irq_get_irq_data(i + agent->irq_base);
 		int		byte = 1 + (i >> 2);
 		int		off = (i & 0x3) * 2;
-
-		if (!d) {
-			pr_err("twl4030: Invalid IRQ: %d\n",
-			       i + agent->irq_base);
-			return;
-		}
+		unsigned int	type;
 
 		bytes[byte] &= ~(0x03 << off);
 
-		raw_spin_lock_irq(&d->lock);
-		if (d->status & IRQ_TYPE_EDGE_RISING)
+		type = irqd_get_trigger_type(idata);
+		if (type & IRQ_TYPE_EDGE_RISING)
 			bytes[byte] |= BIT(off + 1);
-		if (d->status & IRQ_TYPE_EDGE_FALLING)
+		if (type & IRQ_TYPE_EDGE_FALLING)
 			bytes[byte] |= BIT(off + 0);
-		raw_spin_unlock_irq(&d->lock);
 
 		edge_change &= ~BIT(i);
 	}
@@ -626,21 +604,13 @@
 static int twl4030_sih_set_type(struct irq_data *data, unsigned trigger)
 {
 	struct sih_agent *sih = irq_data_get_irq_chip_data(data);
-	struct irq_desc *desc = irq_to_desc(data->irq);
 	unsigned long flags;
 
-	if (!desc) {
-		pr_err("twl4030: Invalid IRQ: %d\n", data->irq);
-		return -EINVAL;
-	}
-
 	if (trigger & ~(IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING))
 		return -EINVAL;
 
 	spin_lock_irqsave(&sih_agent_lock, flags);
-	if ((desc->status & IRQ_TYPE_SENSE_MASK) != trigger) {
-		desc->status &= ~IRQ_TYPE_SENSE_MASK;
-		desc->status |= trigger;
+	if (irqd_get_trigger_type(data) != trigger) {
 		sih->edge_change |= BIT(data->irq - sih->irq_base);
 		queue_work(wq, &sih->edge_work);
 	}
@@ -680,7 +650,7 @@
  */
 static void handle_twl4030_sih(unsigned irq, struct irq_desc *desc)
 {
-	struct sih_agent *agent = get_irq_data(irq);
+	struct sih_agent *agent = irq_get_handler_data(irq);
 	const struct sih *sih = agent->sih;
 	int isr;
 
@@ -754,9 +724,9 @@
 	for (i = 0; i < sih->bits; i++) {
 		irq = irq_base + i;
 
-		set_irq_chip_and_handler(irq, &twl4030_sih_irq_chip,
-				handle_edge_irq);
-		set_irq_chip_data(irq, agent);
+		irq_set_chip_and_handler(irq, &twl4030_sih_irq_chip,
+					 handle_edge_irq);
+		irq_set_chip_data(irq, agent);
 		activate_irq(irq);
 	}
 
@@ -765,8 +735,8 @@
 
 	/* replace generic PIH handler (handle_simple_irq) */
 	irq = sih_mod + twl4030_irq_base;
-	set_irq_data(irq, agent);
-	set_irq_chained_handler(irq, handle_twl4030_sih);
+	irq_set_handler_data(irq, agent);
+	irq_set_chained_handler(irq, handle_twl4030_sih);
 
 	pr_info("twl4030: %s (irq %d) chaining IRQs %d..%d\n", sih->name,
 			irq, irq_base, twl4030_irq_next - 1);
@@ -815,8 +785,8 @@
 	twl4030_sih_irq_chip.irq_ack = dummy_irq_chip.irq_ack;
 
 	for (i = irq_base; i < irq_end; i++) {
-		set_irq_chip_and_handler(i, &twl4030_irq_chip,
-				handle_simple_irq);
+		irq_set_chip_and_handler(i, &twl4030_irq_chip,
+					 handle_simple_irq);
 		activate_irq(i);
 	}
 	twl4030_irq_next = i;
@@ -856,7 +826,7 @@
 	/* clean up twl4030_sih_setup */
 fail:
 	for (i = irq_base; i < irq_end; i++)
-		set_irq_chip_and_handler(i, NULL, NULL);
+		irq_set_chip_and_handler(i, NULL, NULL);
 	destroy_workqueue(wq);
 	wq = NULL;
 	return status;
diff --git a/drivers/mfd/twl6030-irq.c b/drivers/mfd/twl6030-irq.c
index 4082ed7..fa93705 100644
--- a/drivers/mfd/twl6030-irq.c
+++ b/drivers/mfd/twl6030-irq.c
@@ -140,22 +140,7 @@
 			if (sts.int_sts & 0x1) {
 				int module_irq = twl6030_irq_base +
 					twl6030_interrupt_mapping[i];
-				struct irq_desc *d = irq_to_desc(module_irq);
-
-				if (!d) {
-					pr_err("twl6030: Invalid SIH IRQ: %d\n",
-					       module_irq);
-					return -EINVAL;
-				}
-
-				/* These can't be masked ... always warn
-				 * if we get any surprises.
-				 */
-				if (d->status & IRQ_DISABLED)
-					note_interrupt(module_irq, d,
-							IRQ_NONE);
-				else
-					d->handle_irq(module_irq, d);
+				generic_handle_irq(module_irq);
 
 			}
 		local_irq_enable();
@@ -198,7 +183,7 @@
 	set_irq_flags(irq, IRQF_VALID);
 #else
 	/* same effect on other architectures */
-	set_irq_noprobe(irq);
+	irq_set_noprobe(irq);
 #endif
 }
 
@@ -335,8 +320,8 @@
 	twl6030_irq_chip.irq_set_type = NULL;
 
 	for (i = irq_base; i < irq_end; i++) {
-		set_irq_chip_and_handler(i, &twl6030_irq_chip,
-				handle_simple_irq);
+		irq_set_chip_and_handler(i, &twl6030_irq_chip,
+					 handle_simple_irq);
 		activate_irq(i);
 	}
 
@@ -365,7 +350,7 @@
 
 fail_kthread:
 	for (i = irq_base; i < irq_end; i++)
-		set_irq_chip_and_handler(i, NULL, NULL);
+		irq_set_chip_and_handler(i, NULL, NULL);
 	return status;
 }
 
diff --git a/drivers/mfd/wl1273-core.c b/drivers/mfd/wl1273-core.c
index f76f6c7..04914f2 100644
--- a/drivers/mfd/wl1273-core.c
+++ b/drivers/mfd/wl1273-core.c
@@ -25,7 +25,7 @@
 
 #define DRIVER_DESC "WL1273 FM Radio Core"
 
-static struct i2c_device_id wl1273_driver_id_table[] = {
+static const struct i2c_device_id wl1273_driver_id_table[] = {
 	{ WL1273_FM_DRIVER_NAME, 0 },
 	{ }
 };
diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
index a5cd17e..23e66af 100644
--- a/drivers/mfd/wm831x-irq.c
+++ b/drivers/mfd/wm831x-irq.c
@@ -553,17 +553,17 @@
 	for (cur_irq = wm831x->irq_base;
 	     cur_irq < ARRAY_SIZE(wm831x_irqs) + wm831x->irq_base;
 	     cur_irq++) {
-		set_irq_chip_data(cur_irq, wm831x);
-		set_irq_chip_and_handler(cur_irq, &wm831x_irq_chip,
+		irq_set_chip_data(cur_irq, wm831x);
+		irq_set_chip_and_handler(cur_irq, &wm831x_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(cur_irq, 1);
+		irq_set_nested_thread(cur_irq, 1);
 
 		/* ARM needs us to explicitly flag the IRQ as valid
 		 * and will set them noprobe when we do so. */
 #ifdef CONFIG_ARM
 		set_irq_flags(cur_irq, IRQF_VALID);
 #else
-		set_irq_noprobe(cur_irq);
+		irq_set_noprobe(cur_irq);
 #endif
 	}
 
diff --git a/drivers/mfd/wm8350-irq.c b/drivers/mfd/wm8350-irq.c
index 5839966..ed4b22a 100644
--- a/drivers/mfd/wm8350-irq.c
+++ b/drivers/mfd/wm8350-irq.c
@@ -518,17 +518,17 @@
 	for (cur_irq = wm8350->irq_base;
 	     cur_irq < ARRAY_SIZE(wm8350_irqs) + wm8350->irq_base;
 	     cur_irq++) {
-		set_irq_chip_data(cur_irq, wm8350);
-		set_irq_chip_and_handler(cur_irq, &wm8350_irq_chip,
+		irq_set_chip_data(cur_irq, wm8350);
+		irq_set_chip_and_handler(cur_irq, &wm8350_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(cur_irq, 1);
+		irq_set_nested_thread(cur_irq, 1);
 
 		/* ARM needs us to explicitly flag the IRQ as valid
 		 * and will set them noprobe when we do so. */
 #ifdef CONFIG_ARM
 		set_irq_flags(cur_irq, IRQF_VALID);
 #else
-		set_irq_noprobe(cur_irq);
+		irq_set_noprobe(cur_irq);
 #endif
 	}
 
diff --git a/drivers/mfd/wm8994-irq.c b/drivers/mfd/wm8994-irq.c
index 1e3bf4a..71c6e8f 100644
--- a/drivers/mfd/wm8994-irq.c
+++ b/drivers/mfd/wm8994-irq.c
@@ -278,17 +278,17 @@
 	for (cur_irq = wm8994->irq_base;
 	     cur_irq < ARRAY_SIZE(wm8994_irqs) + wm8994->irq_base;
 	     cur_irq++) {
-		set_irq_chip_data(cur_irq, wm8994);
-		set_irq_chip_and_handler(cur_irq, &wm8994_irq_chip,
+		irq_set_chip_data(cur_irq, wm8994);
+		irq_set_chip_and_handler(cur_irq, &wm8994_irq_chip,
 					 handle_edge_irq);
-		set_irq_nested_thread(cur_irq, 1);
+		irq_set_nested_thread(cur_irq, 1);
 
 		/* ARM needs us to explicitly flag the IRQ as valid
 		 * and will set them noprobe when we do so. */
 #ifdef CONFIG_ARM
 		set_irq_flags(cur_irq, IRQF_VALID);
 #else
-		set_irq_noprobe(cur_irq);
+		irq_set_noprobe(cur_irq);
 #endif
 	}
 
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index 7741470..b4567c3 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -33,14 +33,6 @@
 	  should normally be compiled as kernel modules. The modules perform
 	  various checks and verifications when loaded.
 
-config MTD_CONCAT
-	tristate "MTD concatenating support"
-	help
-	  Support for concatenating several MTD devices into a single
-	  (virtual) one. This allows you to have -for example- a JFFS(2)
-	  file system spanning multiple physical flash chips. If unsure,
-	  say 'Y'.
-
 config MTD_PARTITIONS
 	bool "MTD partitioning support"
 	help
@@ -333,6 +325,16 @@
 	  To use, add console=ttyMTDx to the kernel command line,
 	  where x is the MTD device number to use.
 
+config MTD_SWAP
+	tristate "Swap on MTD device support"
+	depends on MTD && SWAP
+	select MTD_BLKDEVS
+	help
+	  Provides volatile block device driver on top of mtd partition
+          suitable for swapping.  The mapping of written blocks is not saved.
+	  The driver provides wear leveling by storing erase counter into the
+	  OOB.
+
 source "drivers/mtd/chips/Kconfig"
 
 source "drivers/mtd/maps/Kconfig"
diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile
index d4e7f25..d578095 100644
--- a/drivers/mtd/Makefile
+++ b/drivers/mtd/Makefile
@@ -4,11 +4,10 @@
 
 # Core functionality.
 obj-$(CONFIG_MTD)		+= mtd.o
-mtd-y				:= mtdcore.o mtdsuper.o
+mtd-y				:= mtdcore.o mtdsuper.o mtdconcat.o
 mtd-$(CONFIG_MTD_PARTITIONS)	+= mtdpart.o
 mtd-$(CONFIG_MTD_OF_PARTS)	+= ofpart.o
 
-obj-$(CONFIG_MTD_CONCAT)	+= mtdconcat.o
 obj-$(CONFIG_MTD_REDBOOT_PARTS) += redboot.o
 obj-$(CONFIG_MTD_CMDLINE_PARTS) += cmdlinepart.o
 obj-$(CONFIG_MTD_AFS_PARTS)	+= afs.o
@@ -26,6 +25,7 @@
 obj-$(CONFIG_SSFDC)		+= ssfdc.o
 obj-$(CONFIG_SM_FTL)		+= sm_ftl.o
 obj-$(CONFIG_MTD_OOPS)		+= mtdoops.o
+obj-$(CONFIG_MTD_SWAP)		+= mtdswap.o
 
 nftl-objs		:= nftlcore.o nftlmount.o
 inftl-objs		:= inftlcore.o inftlmount.o
diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 4aaa88f..092aef1 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -455,7 +455,7 @@
 	mtd->flags   = MTD_CAP_NORFLASH;
 	mtd->name    = map->name;
 	mtd->writesize = 1;
-	mtd->writebufsize = 1 << cfi->cfiq->MaxBufWriteSize;
+	mtd->writebufsize = cfi_interleave(cfi) << cfi->cfiq->MaxBufWriteSize;
 
 	mtd->reboot_notifier.notifier_call = cfi_intelext_reboot;
 
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index f072fcf..f9a5331 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -349,6 +349,7 @@
 	{ CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri },
 #ifdef AMD_BOOTLOC_BUG
 	{ CFI_MFR_AMD, CFI_ID_ANY, fixup_amd_bootblock },
+	{ CFI_MFR_AMIC, CFI_ID_ANY, fixup_amd_bootblock },
 	{ CFI_MFR_MACRONIX, CFI_ID_ANY, fixup_amd_bootblock },
 #endif
 	{ CFI_MFR_AMD, 0x0050, fixup_use_secsi },
@@ -440,7 +441,7 @@
 	mtd->flags   = MTD_CAP_NORFLASH;
 	mtd->name    = map->name;
 	mtd->writesize = 1;
-	mtd->writebufsize = 1 << cfi->cfiq->MaxBufWriteSize;
+	mtd->writebufsize = cfi_interleave(cfi) << cfi->cfiq->MaxBufWriteSize;
 
 	DEBUG(MTD_DEBUG_LEVEL3, "MTD %s(): write buffer size %d\n",
 		__func__, mtd->writebufsize);
diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c
index c04b765..ed56ad3 100644
--- a/drivers/mtd/chips/cfi_cmdset_0020.c
+++ b/drivers/mtd/chips/cfi_cmdset_0020.c
@@ -238,7 +238,7 @@
 	mtd->resume = cfi_staa_resume;
 	mtd->flags = MTD_CAP_NORFLASH & ~MTD_BIT_WRITEABLE;
 	mtd->writesize = 8; /* FIXME: Should be 0 for STMicro flashes w/out ECC */
-	mtd->writebufsize = 1 << cfi->cfiq->MaxBufWriteSize;
+	mtd->writebufsize = cfi_interleave(cfi) << cfi->cfiq->MaxBufWriteSize;
 	map->fldrv = &cfi_staa_chipdrv;
 	__module_get(THIS_MODULE);
 	mtd->name = map->name;
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index e4eba6c..3fb981d 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -655,7 +655,8 @@
 	{ "at26df161a", INFO(0x1f4601, 0, 64 * 1024, 32, SECT_4K) },
 	{ "at26df321",  INFO(0x1f4700, 0, 64 * 1024, 64, SECT_4K) },
 
-	/* EON -- en25pxx */
+	/* EON -- en25xxx */
+	{ "en25f32", INFO(0x1c3116, 0, 64 * 1024,  64, SECT_4K) },
 	{ "en25p32", INFO(0x1c2016, 0, 64 * 1024,  64, 0) },
 	{ "en25p64", INFO(0x1c2017, 0, 64 * 1024, 128, 0) },
 
@@ -728,6 +729,8 @@
 	{ "m25pe80", INFO(0x208014,  0, 64 * 1024, 16,       0) },
 	{ "m25pe16", INFO(0x208015,  0, 64 * 1024, 32, SECT_4K) },
 
+	{ "m25px64", INFO(0x207117,  0, 64 * 1024, 128, 0) },
+
 	/* Winbond -- w25x "blocks" are 64K, "sectors" are 4KiB */
 	{ "w25x10", INFO(0xef3011, 0, 64 * 1024,  2,  SECT_4K) },
 	{ "w25x20", INFO(0xef3012, 0, 64 * 1024,  4,  SECT_4K) },
diff --git a/drivers/mtd/devices/mtdram.c b/drivers/mtd/devices/mtdram.c
index 26a6e80..1483e18 100644
--- a/drivers/mtd/devices/mtdram.c
+++ b/drivers/mtd/devices/mtdram.c
@@ -121,6 +121,7 @@
 	mtd->flags = MTD_CAP_RAM;
 	mtd->size = size;
 	mtd->writesize = 1;
+	mtd->writebufsize = 64; /* Mimic CFI NOR flashes */
 	mtd->erasesize = MTDRAM_ERASE_SIZE;
 	mtd->priv = mapped_address;
 
diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c
index 5239328..8d28fa0 100644
--- a/drivers/mtd/devices/phram.c
+++ b/drivers/mtd/devices/phram.c
@@ -117,6 +117,7 @@
 	list_for_each_entry_safe(this, safe, &phram_list, list) {
 		del_mtd_device(&this->mtd);
 		iounmap(this->mtd.priv);
+		kfree(this->mtd.name);
 		kfree(this);
 	}
 }
@@ -275,6 +276,8 @@
 	ret = register_device(name, start, len);
 	if (!ret)
 		pr_info("%s device: %#x at %#x\n", name, len, start);
+	else
+		kfree(name);
 
 	return ret;
 }
diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index 5d37d31..44b1f46 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -114,7 +114,7 @@
 
 config MTD_SC520CDP
 	tristate "CFI Flash device mapped on AMD SC520 CDP"
-	depends on X86 && MTD_CFI && MTD_CONCAT
+	depends on X86 && MTD_CFI
 	help
 	  The SC520 CDP board has two banks of CFI-compliant chips and one
 	  Dual-in-line JEDEC chip. This 'mapping' driver supports that
@@ -262,7 +262,7 @@
 
 config MTD_DILNETPC
 	tristate "CFI Flash device mapped on DIL/Net PC"
-	depends on X86 && MTD_CONCAT && MTD_PARTITIONS && MTD_CFI_INTELEXT && BROKEN
+	depends on X86 && MTD_PARTITIONS && MTD_CFI_INTELEXT && BROKEN
 	help
 	  MTD map driver for SSV DIL/Net PC Boards "DNP" and "ADNP".
 	  For details, see <http://www.ssv-embedded.de/ssv/pc104/p169.htm>
@@ -552,4 +552,13 @@
 
 	  When built as a module, it will be called pismo.ko
 
+config MTD_LATCH_ADDR
+        tristate "Latch-assisted Flash Chip Support"
+        depends on MTD_COMPLEX_MAPPINGS
+        help
+          Map driver which allows flashes to be partially physically addressed
+          and have the upper address lines set by a board specific code.
+
+          If compiled as a module, it will be called latch-addr-flash.
+
 endmenu
diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile
index c7869c7..08533bd 100644
--- a/drivers/mtd/maps/Makefile
+++ b/drivers/mtd/maps/Makefile
@@ -59,3 +59,4 @@
 obj-$(CONFIG_MTD_VMU)		+= vmu-flash.o
 obj-$(CONFIG_MTD_GPIO_ADDR)	+= gpio-addr-flash.o
 obj-$(CONFIG_MTD_BCM963XX)	+= bcm963xx-flash.o
+obj-$(CONFIG_MTD_LATCH_ADDR)	+= latch-addr-flash.o
diff --git a/drivers/mtd/maps/ceiva.c b/drivers/mtd/maps/ceiva.c
index c09f4f5..e5f645b 100644
--- a/drivers/mtd/maps/ceiva.c
+++ b/drivers/mtd/maps/ceiva.c
@@ -194,16 +194,10 @@
 			 * We detected multiple devices.  Concatenate
 			 * them together.
 			 */
-#ifdef CONFIG_MTD_CONCAT
 			*rmtd = mtd_concat_create(subdev, found,
 						  "clps flash");
 			if (*rmtd == NULL)
 				ret = -ENXIO;
-#else
-			printk(KERN_ERR "clps flash: multiple devices "
-			       "found but MTD concat support disabled.\n");
-			ret = -ENXIO;
-#endif
 		}
 	}
 
diff --git a/drivers/mtd/maps/integrator-flash.c b/drivers/mtd/maps/integrator-flash.c
index 2aac41b..e22ff5a 100644
--- a/drivers/mtd/maps/integrator-flash.c
+++ b/drivers/mtd/maps/integrator-flash.c
@@ -202,7 +202,6 @@
 	if (info->nr_subdev == 1)
 		info->mtd = info->subdev[0].mtd;
 	else if (info->nr_subdev > 1) {
-#ifdef CONFIG_MTD_CONCAT
 		struct mtd_info *cdev[info->nr_subdev];
 
 		/*
@@ -215,11 +214,6 @@
 					      dev_name(&dev->dev));
 		if (info->mtd == NULL)
 			err = -ENXIO;
-#else
-		printk(KERN_ERR "armflash: multiple devices found but "
-		       "MTD concat support disabled.\n");
-		err = -ENXIO;
-#endif
 	}
 
 	if (err < 0)
@@ -244,10 +238,8 @@
  cleanup:
 	if (info->mtd) {
 		del_mtd_partitions(info->mtd);
-#ifdef CONFIG_MTD_CONCAT
 		if (info->mtd != info->subdev[0].mtd)
 			mtd_concat_destroy(info->mtd);
-#endif
 	}
 	kfree(info->parts);
  subdev_err:
@@ -272,10 +264,8 @@
 	if (info) {
 		if (info->mtd) {
 			del_mtd_partitions(info->mtd);
-#ifdef CONFIG_MTD_CONCAT
 			if (info->mtd != info->subdev[0].mtd)
 				mtd_concat_destroy(info->mtd);
-#endif
 		}
 		kfree(info->parts);
 
diff --git a/drivers/mtd/maps/latch-addr-flash.c b/drivers/mtd/maps/latch-addr-flash.c
new file mode 100644
index 0000000..ee25480
--- /dev/null
+++ b/drivers/mtd/maps/latch-addr-flash.c
@@ -0,0 +1,272 @@
+/*
+ * Interface for NOR flash driver whose high address lines are latched
+ *
+ * Copyright © 2000 Nicolas Pitre <nico@cam.org>
+ * Copyright © 2005-2008 Analog Devices Inc.
+ * Copyright © 2008 MontaVista Software, Inc. <source@mvista.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/map.h>
+#include <linux/mtd/partitions.h>
+#include <linux/platform_device.h>
+#include <linux/mtd/latch-addr-flash.h>
+#include <linux/slab.h>
+
+#define DRIVER_NAME "latch-addr-flash"
+
+struct latch_addr_flash_info {
+	struct mtd_info		*mtd;
+	struct map_info		map;
+	struct resource		*res;
+
+	void			(*set_window)(unsigned long offset, void *data);
+	void			*data;
+
+	/* cache; could be found out of res */
+	unsigned long		win_mask;
+
+	int			nr_parts;
+	struct mtd_partition	*parts;
+
+	spinlock_t		lock;
+};
+
+static map_word lf_read(struct map_info *map, unsigned long ofs)
+{
+	struct latch_addr_flash_info *info;
+	map_word datum;
+
+	info = (struct latch_addr_flash_info *)map->map_priv_1;
+
+	spin_lock(&info->lock);
+
+	info->set_window(ofs, info->data);
+	datum = inline_map_read(map, info->win_mask & ofs);
+
+	spin_unlock(&info->lock);
+
+	return datum;
+}
+
+static void lf_write(struct map_info *map, map_word datum, unsigned long ofs)
+{
+	struct latch_addr_flash_info *info;
+
+	info = (struct latch_addr_flash_info *)map->map_priv_1;
+
+	spin_lock(&info->lock);
+
+	info->set_window(ofs, info->data);
+	inline_map_write(map, datum, info->win_mask & ofs);
+
+	spin_unlock(&info->lock);
+}
+
+static void lf_copy_from(struct map_info *map, void *to,
+		unsigned long from, ssize_t len)
+{
+	struct latch_addr_flash_info *info =
+		(struct latch_addr_flash_info *) map->map_priv_1;
+	unsigned n;
+
+	while (len > 0) {
+		n = info->win_mask + 1 - (from & info->win_mask);
+		if (n > len)
+			n = len;
+
+		spin_lock(&info->lock);
+
+		info->set_window(from, info->data);
+		memcpy_fromio(to, map->virt + (from & info->win_mask), n);
+
+		spin_unlock(&info->lock);
+
+		to += n;
+		from += n;
+		len -= n;
+	}
+}
+
+static char *rom_probe_types[] = { "cfi_probe", NULL };
+
+static char *part_probe_types[] = { "cmdlinepart", NULL };
+
+static int latch_addr_flash_remove(struct platform_device *dev)
+{
+	struct latch_addr_flash_info *info;
+	struct latch_addr_flash_data *latch_addr_data;
+
+	info = platform_get_drvdata(dev);
+	if (info == NULL)
+		return 0;
+	platform_set_drvdata(dev, NULL);
+
+	latch_addr_data = dev->dev.platform_data;
+
+	if (info->mtd != NULL) {
+		if (mtd_has_partitions()) {
+			if (info->nr_parts) {
+				del_mtd_partitions(info->mtd);
+				kfree(info->parts);
+			} else if (latch_addr_data->nr_parts) {
+				del_mtd_partitions(info->mtd);
+			} else {
+				del_mtd_device(info->mtd);
+			}
+		} else {
+			del_mtd_device(info->mtd);
+		}
+		map_destroy(info->mtd);
+	}
+
+	if (info->map.virt != NULL)
+		iounmap(info->map.virt);
+
+	if (info->res != NULL)
+		release_mem_region(info->res->start, resource_size(info->res));
+
+	kfree(info);
+
+	if (latch_addr_data->done)
+		latch_addr_data->done(latch_addr_data->data);
+
+	return 0;
+}
+
+static int __devinit latch_addr_flash_probe(struct platform_device *dev)
+{
+	struct latch_addr_flash_data *latch_addr_data;
+	struct latch_addr_flash_info *info;
+	resource_size_t win_base = dev->resource->start;
+	resource_size_t win_size = resource_size(dev->resource);
+	char **probe_type;
+	int chipsel;
+	int err;
+
+	latch_addr_data = dev->dev.platform_data;
+	if (latch_addr_data == NULL)
+		return -ENODEV;
+
+	pr_notice("latch-addr platform flash device: %#llx byte "
+		  "window at %#.8llx\n",
+		  (unsigned long long)win_size, (unsigned long long)win_base);
+
+	chipsel = dev->id;
+
+	if (latch_addr_data->init) {
+		err = latch_addr_data->init(latch_addr_data->data, chipsel);
+		if (err != 0)
+			return err;
+	}
+
+	info = kzalloc(sizeof(struct latch_addr_flash_info), GFP_KERNEL);
+	if (info == NULL) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	platform_set_drvdata(dev, info);
+
+	info->res = request_mem_region(win_base, win_size, DRIVER_NAME);
+	if (info->res == NULL) {
+		dev_err(&dev->dev, "Could not reserve memory region\n");
+		err = -EBUSY;
+		goto free_info;
+	}
+
+	info->map.name		= DRIVER_NAME;
+	info->map.size		= latch_addr_data->size;
+	info->map.bankwidth	= latch_addr_data->width;
+
+	info->map.phys		= NO_XIP;
+	info->map.virt		= ioremap(win_base, win_size);
+	if (!info->map.virt) {
+		err = -ENOMEM;
+		goto free_res;
+	}
+
+	info->map.map_priv_1	= (unsigned long)info;
+
+	info->map.read		= lf_read;
+	info->map.copy_from	= lf_copy_from;
+	info->map.write		= lf_write;
+	info->set_window	= latch_addr_data->set_window;
+	info->data		= latch_addr_data->data;
+	info->win_mask		= win_size - 1;
+
+	spin_lock_init(&info->lock);
+
+	for (probe_type = rom_probe_types; !info->mtd && *probe_type;
+		probe_type++)
+		info->mtd = do_map_probe(*probe_type, &info->map);
+
+	if (info->mtd == NULL) {
+		dev_err(&dev->dev, "map_probe failed\n");
+		err = -ENODEV;
+		goto iounmap;
+	}
+	info->mtd->owner = THIS_MODULE;
+
+	if (mtd_has_partitions()) {
+
+		err = parse_mtd_partitions(info->mtd,
+					   (const char **)part_probe_types,
+					   &info->parts, 0);
+		if (err > 0) {
+			add_mtd_partitions(info->mtd, info->parts, err);
+			return 0;
+		}
+		if (latch_addr_data->nr_parts) {
+			pr_notice("Using latch-addr-flash partition information\n");
+			add_mtd_partitions(info->mtd, latch_addr_data->parts,
+					latch_addr_data->nr_parts);
+			return 0;
+		}
+	}
+	add_mtd_device(info->mtd);
+	return 0;
+
+iounmap:
+	iounmap(info->map.virt);
+free_res:
+	release_mem_region(info->res->start, resource_size(info->res));
+free_info:
+	kfree(info);
+done:
+	if (latch_addr_data->done)
+		latch_addr_data->done(latch_addr_data->data);
+	return err;
+}
+
+static struct platform_driver latch_addr_flash_driver = {
+	.probe		= latch_addr_flash_probe,
+	.remove		= __devexit_p(latch_addr_flash_remove),
+	.driver		= {
+		.name	= DRIVER_NAME,
+	},
+};
+
+static int __init latch_addr_flash_init(void)
+{
+	return platform_driver_register(&latch_addr_flash_driver);
+}
+module_init(latch_addr_flash_init);
+
+static void __exit latch_addr_flash_exit(void)
+{
+	platform_driver_unregister(&latch_addr_flash_driver);
+}
+module_exit(latch_addr_flash_exit);
+
+MODULE_AUTHOR("David Griego <dgriego@mvista.com>");
+MODULE_DESCRIPTION("MTD map driver for flashes addressed physically with upper "
+		"address lines being set board specifically");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c
index 4c18b98..7522df4 100644
--- a/drivers/mtd/maps/physmap.c
+++ b/drivers/mtd/maps/physmap.c
@@ -59,10 +59,8 @@
 #else
 		del_mtd_device(info->cmtd);
 #endif
-#ifdef CONFIG_MTD_CONCAT
 		if (info->cmtd != info->mtd[0])
 			mtd_concat_destroy(info->cmtd);
-#endif
 	}
 
 	for (i = 0; i < MAX_RESOURCES; i++) {
@@ -159,15 +157,9 @@
 		/*
 		 * We detected multiple devices. Concatenate them together.
 		 */
-#ifdef CONFIG_MTD_CONCAT
 		info->cmtd = mtd_concat_create(info->mtd, devices_found, dev_name(&dev->dev));
 		if (info->cmtd == NULL)
 			err = -ENXIO;
-#else
-		printk(KERN_ERR "physmap-flash: multiple devices "
-		       "found but MTD concat support disabled.\n");
-		err = -ENXIO;
-#endif
 	}
 	if (err)
 		goto err_out;
diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c
index 3db0cb0..bd483f0 100644
--- a/drivers/mtd/maps/physmap_of.c
+++ b/drivers/mtd/maps/physmap_of.c
@@ -104,12 +104,10 @@
 		return 0;
 	dev_set_drvdata(&dev->dev, NULL);
 
-#ifdef CONFIG_MTD_CONCAT
 	if (info->cmtd != info->list[0].mtd) {
 		del_mtd_device(info->cmtd);
 		mtd_concat_destroy(info->cmtd);
 	}
-#endif
 
 	if (info->cmtd) {
 		if (OF_FLASH_PARTS(info)) {
@@ -337,16 +335,10 @@
 		/*
 		 * We detected multiple devices. Concatenate them together.
 		 */
-#ifdef CONFIG_MTD_CONCAT
 		info->cmtd = mtd_concat_create(mtd_list, info->list_size,
 					       dev_name(&dev->dev));
 		if (info->cmtd == NULL)
 			err = -ENXIO;
-#else
-		printk(KERN_ERR "physmap_of: multiple devices "
-		       "found but MTD concat support disabled.\n");
-		err = -ENXIO;
-#endif
 	}
 	if (err)
 		goto err_out;
diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c
index f3af87e..da875908 100644
--- a/drivers/mtd/maps/sa1100-flash.c
+++ b/drivers/mtd/maps/sa1100-flash.c
@@ -232,10 +232,8 @@
 		else
 			del_mtd_partitions(info->mtd);
 #endif
-#ifdef CONFIG_MTD_CONCAT
 		if (info->mtd != info->subdev[0].mtd)
 			mtd_concat_destroy(info->mtd);
-#endif
 	}
 
 	kfree(info->parts);
@@ -321,7 +319,6 @@
 		info->mtd = info->subdev[0].mtd;
 		ret = 0;
 	} else if (info->num_subdev > 1) {
-#ifdef CONFIG_MTD_CONCAT
 		struct mtd_info *cdev[nr];
 		/*
 		 * We detected multiple devices.  Concatenate them together.
@@ -333,11 +330,6 @@
 					      plat->name);
 		if (info->mtd == NULL)
 			ret = -ENXIO;
-#else
-		printk(KERN_ERR "SA1100 flash: multiple devices "
-		       "found but MTD concat support disabled.\n");
-		ret = -ENXIO;
-#endif
 	}
 
 	if (ret == 0)
diff --git a/drivers/mtd/maps/ts5500_flash.c b/drivers/mtd/maps/ts5500_flash.c
index e2147bf..e02dfa9 100644
--- a/drivers/mtd/maps/ts5500_flash.c
+++ b/drivers/mtd/maps/ts5500_flash.c
@@ -94,7 +94,6 @@
 	return 0;
 
 err1:
-	map_destroy(mymtd);
 	iounmap(ts5500_map.virt);
 err2:
 	return rc;
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index e0a2373..a534e1f 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -40,7 +40,7 @@
 static LIST_HEAD(blktrans_majors);
 static DEFINE_MUTEX(blktrans_ref_mutex);
 
-void blktrans_dev_release(struct kref *kref)
+static void blktrans_dev_release(struct kref *kref)
 {
 	struct mtd_blktrans_dev *dev =
 		container_of(kref, struct mtd_blktrans_dev, ref);
@@ -67,7 +67,7 @@
 	return dev;
 }
 
-void blktrans_dev_put(struct mtd_blktrans_dev *dev)
+static void blktrans_dev_put(struct mtd_blktrans_dev *dev)
 {
 	mutex_lock(&blktrans_ref_mutex);
 	kref_put(&dev->ref, blktrans_dev_release);
@@ -119,18 +119,43 @@
 	}
 }
 
+int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev)
+{
+	if (kthread_should_stop())
+		return 1;
+
+	return dev->bg_stop;
+}
+EXPORT_SYMBOL_GPL(mtd_blktrans_cease_background);
+
 static int mtd_blktrans_thread(void *arg)
 {
 	struct mtd_blktrans_dev *dev = arg;
+	struct mtd_blktrans_ops *tr = dev->tr;
 	struct request_queue *rq = dev->rq;
 	struct request *req = NULL;
+	int background_done = 0;
 
 	spin_lock_irq(rq->queue_lock);
 
 	while (!kthread_should_stop()) {
 		int res;
 
+		dev->bg_stop = false;
 		if (!req && !(req = blk_fetch_request(rq))) {
+			if (tr->background && !background_done) {
+				spin_unlock_irq(rq->queue_lock);
+				mutex_lock(&dev->lock);
+				tr->background(dev);
+				mutex_unlock(&dev->lock);
+				spin_lock_irq(rq->queue_lock);
+				/*
+				 * Do background processing just once per idle
+				 * period.
+				 */
+				background_done = !dev->bg_stop;
+				continue;
+			}
 			set_current_state(TASK_INTERRUPTIBLE);
 
 			if (kthread_should_stop())
@@ -152,6 +177,8 @@
 
 		if (!__blk_end_request_cur(req, res))
 			req = NULL;
+
+		background_done = 0;
 	}
 
 	if (req)
@@ -172,8 +199,10 @@
 	if (!dev)
 		while ((req = blk_fetch_request(rq)) != NULL)
 			__blk_end_request_all(req, -ENODEV);
-	else
+	else {
+		dev->bg_stop = true;
 		wake_up_process(dev->thread);
+	}
 }
 
 static int blktrans_open(struct block_device *bdev, fmode_t mode)
@@ -379,9 +408,10 @@
 	new->rq->queuedata = new;
 	blk_queue_logical_block_size(new->rq, tr->blksize);
 
-	if (tr->discard)
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
-					new->rq);
+	if (tr->discard) {
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, new->rq);
+		new->rq->limits.max_discard_sectors = UINT_MAX;
+	}
 
 	gd->queue = new->rq;
 
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 5f5777b..5060e60 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -750,6 +750,7 @@
 	struct mtd_concat *concat;
 	uint32_t max_erasesize, curr_erasesize;
 	int num_erase_region;
+	int max_writebufsize = 0;
 
 	printk(KERN_NOTICE "Concatenating MTD devices:\n");
 	for (i = 0; i < num_devs; i++)
@@ -776,7 +777,12 @@
 	concat->mtd.size = subdev[0]->size;
 	concat->mtd.erasesize = subdev[0]->erasesize;
 	concat->mtd.writesize = subdev[0]->writesize;
-	concat->mtd.writebufsize = subdev[0]->writebufsize;
+
+	for (i = 0; i < num_devs; i++)
+		if (max_writebufsize < subdev[i]->writebufsize)
+			max_writebufsize = subdev[i]->writebufsize;
+	concat->mtd.writebufsize = max_writebufsize;
+
 	concat->mtd.subpage_sft = subdev[0]->subpage_sft;
 	concat->mtd.oobsize = subdev[0]->oobsize;
 	concat->mtd.oobavail = subdev[0]->oobavail;
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 527cebf..da69bc8 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -43,7 +43,7 @@
  * backing device capabilities for non-mappable devices (such as NAND flash)
  * - permits private mappings, copies are taken of the data
  */
-struct backing_dev_info mtd_bdi_unmappable = {
+static struct backing_dev_info mtd_bdi_unmappable = {
 	.capabilities	= BDI_CAP_MAP_COPY,
 };
 
@@ -52,7 +52,7 @@
  * - permits private mappings, copies are taken of the data
  * - permits non-writable shared mappings
  */
-struct backing_dev_info mtd_bdi_ro_mappable = {
+static struct backing_dev_info mtd_bdi_ro_mappable = {
 	.capabilities	= (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
 			   BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP),
 };
@@ -62,7 +62,7 @@
  * - permits private mappings, copies are taken of the data
  * - permits non-writable shared mappings
  */
-struct backing_dev_info mtd_bdi_rw_mappable = {
+static struct backing_dev_info mtd_bdi_rw_mappable = {
 	.capabilities	= (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT |
 			   BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP |
 			   BDI_CAP_WRITE_MAP),
diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c
new file mode 100644
index 0000000..237913c
--- /dev/null
+++ b/drivers/mtd/mtdswap.c
@@ -0,0 +1,1587 @@
+/*
+ * Swap block device support for MTDs
+ * Turns an MTD device into a swap device with block wear leveling
+ *
+ * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
+ *
+ * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
+ *
+ * Based on Richard Purdie's earlier implementation in 2007. Background
+ * support and lock-less operation written by Adrian Hunter.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/blktrans.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/genhd.h>
+#include <linux/swap.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/math64.h>
+
+#define MTDSWAP_PREFIX "mtdswap"
+
+/*
+ * The number of free eraseblocks when GC should stop
+ */
+#define CLEAN_BLOCK_THRESHOLD	20
+
+/*
+ * Number of free eraseblocks below which GC can also collect low frag
+ * blocks.
+ */
+#define LOW_FRAG_GC_TRESHOLD	5
+
+/*
+ * Wear level cost amortization. We want to do wear leveling on the background
+ * without disturbing gc too much. This is made by defining max GC frequency.
+ * Frequency value 6 means 1/6 of the GC passes will pick an erase block based
+ * on the biggest wear difference rather than the biggest dirtiness.
+ *
+ * The lower freq2 should be chosen so that it makes sure the maximum erase
+ * difference will decrease even if a malicious application is deliberately
+ * trying to make erase differences large.
+ */
+#define MAX_ERASE_DIFF		4000
+#define COLLECT_NONDIRTY_BASE	MAX_ERASE_DIFF
+#define COLLECT_NONDIRTY_FREQ1	6
+#define COLLECT_NONDIRTY_FREQ2	4
+
+#define PAGE_UNDEF		UINT_MAX
+#define BLOCK_UNDEF		UINT_MAX
+#define BLOCK_ERROR		(UINT_MAX - 1)
+#define BLOCK_MAX		(UINT_MAX - 2)
+
+#define EBLOCK_BAD		(1 << 0)
+#define EBLOCK_NOMAGIC		(1 << 1)
+#define EBLOCK_BITFLIP		(1 << 2)
+#define EBLOCK_FAILED		(1 << 3)
+#define EBLOCK_READERR		(1 << 4)
+#define EBLOCK_IDX_SHIFT	5
+
+struct swap_eb {
+	struct rb_node rb;
+	struct rb_root *root;
+
+	unsigned int flags;
+	unsigned int active_count;
+	unsigned int erase_count;
+	unsigned int pad;		/* speeds up pointer decremtnt */
+};
+
+#define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
+				rb)->erase_count)
+#define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
+				rb)->erase_count)
+
+struct mtdswap_tree {
+	struct rb_root root;
+	unsigned int count;
+};
+
+enum {
+	MTDSWAP_CLEAN,
+	MTDSWAP_USED,
+	MTDSWAP_LOWFRAG,
+	MTDSWAP_HIFRAG,
+	MTDSWAP_DIRTY,
+	MTDSWAP_BITFLIP,
+	MTDSWAP_FAILING,
+	MTDSWAP_TREE_CNT,
+};
+
+struct mtdswap_dev {
+	struct mtd_blktrans_dev *mbd_dev;
+	struct mtd_info *mtd;
+	struct device *dev;
+
+	unsigned int *page_data;
+	unsigned int *revmap;
+
+	unsigned int eblks;
+	unsigned int spare_eblks;
+	unsigned int pages_per_eblk;
+	unsigned int max_erase_count;
+	struct swap_eb *eb_data;
+
+	struct mtdswap_tree trees[MTDSWAP_TREE_CNT];
+
+	unsigned long long sect_read_count;
+	unsigned long long sect_write_count;
+	unsigned long long mtd_write_count;
+	unsigned long long mtd_read_count;
+	unsigned long long discard_count;
+	unsigned long long discard_page_count;
+
+	unsigned int curr_write_pos;
+	struct swap_eb *curr_write;
+
+	char *page_buf;
+	char *oob_buf;
+
+	struct dentry *debugfs_root;
+};
+
+struct mtdswap_oobdata {
+	__le16 magic;
+	__le32 count;
+} __attribute__((packed));
+
+#define MTDSWAP_MAGIC_CLEAN	0x2095
+#define MTDSWAP_MAGIC_DIRTY	(MTDSWAP_MAGIC_CLEAN + 1)
+#define MTDSWAP_TYPE_CLEAN	0
+#define MTDSWAP_TYPE_DIRTY	1
+#define MTDSWAP_OOBSIZE		sizeof(struct mtdswap_oobdata)
+
+#define MTDSWAP_ERASE_RETRIES	3 /* Before marking erase block bad */
+#define MTDSWAP_IO_RETRIES	3
+
+enum {
+	MTDSWAP_SCANNED_CLEAN,
+	MTDSWAP_SCANNED_DIRTY,
+	MTDSWAP_SCANNED_BITFLIP,
+	MTDSWAP_SCANNED_BAD,
+};
+
+/*
+ * In the worst case mtdswap_writesect() has allocated the last clean
+ * page from the current block and is then pre-empted by the GC
+ * thread. The thread can consume a full erase block when moving a
+ * block.
+ */
+#define MIN_SPARE_EBLOCKS	2
+#define MIN_ERASE_BLOCKS	(MIN_SPARE_EBLOCKS + 1)
+
+#define TREE_ROOT(d, name) (&d->trees[MTDSWAP_ ## name].root)
+#define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
+#define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
+#define TREE_COUNT(d, name) (d->trees[MTDSWAP_ ## name].count)
+
+#define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)dev->priv)
+
+static char partitions[128] = "";
+module_param_string(partitions, partitions, sizeof(partitions), 0444);
+MODULE_PARM_DESC(partitions, "MTD partition numbers to use as swap "
+		"partitions=\"1,3,5\"");
+
+static unsigned int spare_eblocks = 10;
+module_param(spare_eblocks, uint, 0444);
+MODULE_PARM_DESC(spare_eblocks, "Percentage of spare erase blocks for "
+		"garbage collection (default 10%)");
+
+static bool header; /* false */
+module_param(header, bool, 0444);
+MODULE_PARM_DESC(header,
+		"Include builtin swap header (default 0, without header)");
+
+static int mtdswap_gc(struct mtdswap_dev *d, unsigned int background);
+
+static loff_t mtdswap_eb_offset(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	return (loff_t)(eb - d->eb_data) * d->mtd->erasesize;
+}
+
+static void mtdswap_eb_detach(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	unsigned int oldidx;
+	struct mtdswap_tree *tp;
+
+	if (eb->root) {
+		tp = container_of(eb->root, struct mtdswap_tree, root);
+		oldidx = tp - &d->trees[0];
+
+		d->trees[oldidx].count--;
+		rb_erase(&eb->rb, eb->root);
+	}
+}
+
+static void __mtdswap_rb_add(struct rb_root *root, struct swap_eb *eb)
+{
+	struct rb_node **p, *parent = NULL;
+	struct swap_eb *cur;
+
+	p = &root->rb_node;
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct swap_eb, rb);
+		if (eb->erase_count > cur->erase_count)
+			p = &(*p)->rb_right;
+		else
+			p = &(*p)->rb_left;
+	}
+
+	rb_link_node(&eb->rb, parent, p);
+	rb_insert_color(&eb->rb, root);
+}
+
+static void mtdswap_rb_add(struct mtdswap_dev *d, struct swap_eb *eb, int idx)
+{
+	struct rb_root *root;
+
+	if (eb->root == &d->trees[idx].root)
+		return;
+
+	mtdswap_eb_detach(d, eb);
+	root = &d->trees[idx].root;
+	__mtdswap_rb_add(root, eb);
+	eb->root = root;
+	d->trees[idx].count++;
+}
+
+static struct rb_node *mtdswap_rb_index(struct rb_root *root, unsigned int idx)
+{
+	struct rb_node *p;
+	unsigned int i;
+
+	p = rb_first(root);
+	i = 0;
+	while (i < idx && p) {
+		p = rb_next(p);
+		i++;
+	}
+
+	return p;
+}
+
+static int mtdswap_handle_badblock(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	int ret;
+	loff_t offset;
+
+	d->spare_eblks--;
+	eb->flags |= EBLOCK_BAD;
+	mtdswap_eb_detach(d, eb);
+	eb->root = NULL;
+
+	/* badblocks not supported */
+	if (!d->mtd->block_markbad)
+		return 1;
+
+	offset = mtdswap_eb_offset(d, eb);
+	dev_warn(d->dev, "Marking bad block at %08llx\n", offset);
+	ret = d->mtd->block_markbad(d->mtd, offset);
+
+	if (ret) {
+		dev_warn(d->dev, "Mark block bad failed for block at %08llx "
+			"error %d\n", offset, ret);
+		return ret;
+	}
+
+	return 1;
+
+}
+
+static int mtdswap_handle_write_error(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	unsigned int marked = eb->flags & EBLOCK_FAILED;
+	struct swap_eb *curr_write = d->curr_write;
+
+	eb->flags |= EBLOCK_FAILED;
+	if (curr_write == eb) {
+		d->curr_write = NULL;
+
+		if (!marked && d->curr_write_pos != 0) {
+			mtdswap_rb_add(d, eb, MTDSWAP_FAILING);
+			return 0;
+		}
+	}
+
+	return mtdswap_handle_badblock(d, eb);
+}
+
+static int mtdswap_read_oob(struct mtdswap_dev *d, loff_t from,
+			struct mtd_oob_ops *ops)
+{
+	int ret = d->mtd->read_oob(d->mtd, from, ops);
+
+	if (ret == -EUCLEAN)
+		return ret;
+
+	if (ret) {
+		dev_warn(d->dev, "Read OOB failed %d for block at %08llx\n",
+			ret, from);
+		return ret;
+	}
+
+	if (ops->oobretlen < ops->ooblen) {
+		dev_warn(d->dev, "Read OOB return short read (%zd bytes not "
+			"%zd) for block at %08llx\n",
+			ops->oobretlen, ops->ooblen, from);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int mtdswap_read_markers(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	struct mtdswap_oobdata *data, *data2;
+	int ret;
+	loff_t offset;
+	struct mtd_oob_ops ops;
+
+	offset = mtdswap_eb_offset(d, eb);
+
+	/* Check first if the block is bad. */
+	if (d->mtd->block_isbad && d->mtd->block_isbad(d->mtd, offset))
+		return MTDSWAP_SCANNED_BAD;
+
+	ops.ooblen = 2 * d->mtd->ecclayout->oobavail;
+	ops.oobbuf = d->oob_buf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_AUTO;
+
+	ret = mtdswap_read_oob(d, offset, &ops);
+
+	if (ret && ret != -EUCLEAN)
+		return ret;
+
+	data = (struct mtdswap_oobdata *)d->oob_buf;
+	data2 = (struct mtdswap_oobdata *)
+		(d->oob_buf + d->mtd->ecclayout->oobavail);
+
+	if (le16_to_cpu(data->magic) == MTDSWAP_MAGIC_CLEAN) {
+		eb->erase_count = le32_to_cpu(data->count);
+		if (ret == -EUCLEAN)
+			ret = MTDSWAP_SCANNED_BITFLIP;
+		else {
+			if (le16_to_cpu(data2->magic) == MTDSWAP_MAGIC_DIRTY)
+				ret = MTDSWAP_SCANNED_DIRTY;
+			else
+				ret = MTDSWAP_SCANNED_CLEAN;
+		}
+	} else {
+		eb->flags |= EBLOCK_NOMAGIC;
+		ret = MTDSWAP_SCANNED_DIRTY;
+	}
+
+	return ret;
+}
+
+static int mtdswap_write_marker(struct mtdswap_dev *d, struct swap_eb *eb,
+				u16 marker)
+{
+	struct mtdswap_oobdata n;
+	int ret;
+	loff_t offset;
+	struct mtd_oob_ops ops;
+
+	ops.ooboffs = 0;
+	ops.oobbuf = (uint8_t *)&n;
+	ops.mode = MTD_OOB_AUTO;
+	ops.datbuf = NULL;
+
+	if (marker == MTDSWAP_TYPE_CLEAN) {
+		n.magic = cpu_to_le16(MTDSWAP_MAGIC_CLEAN);
+		n.count = cpu_to_le32(eb->erase_count);
+		ops.ooblen = MTDSWAP_OOBSIZE;
+		offset = mtdswap_eb_offset(d, eb);
+	} else {
+		n.magic = cpu_to_le16(MTDSWAP_MAGIC_DIRTY);
+		ops.ooblen = sizeof(n.magic);
+		offset = mtdswap_eb_offset(d, eb) + d->mtd->writesize;
+	}
+
+	ret = d->mtd->write_oob(d->mtd, offset , &ops);
+
+	if (ret) {
+		dev_warn(d->dev, "Write OOB failed for block at %08llx "
+			"error %d\n", offset, ret);
+		if (ret == -EIO || ret == -EBADMSG)
+			mtdswap_handle_write_error(d, eb);
+		return ret;
+	}
+
+	if (ops.oobretlen != ops.ooblen) {
+		dev_warn(d->dev, "Short OOB write for block at %08llx: "
+			"%zd not %zd\n",
+			offset, ops.oobretlen, ops.ooblen);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Are there any erase blocks without MAGIC_CLEAN header, presumably
+ * because power was cut off after erase but before header write? We
+ * need to guestimate the erase count.
+ */
+static void mtdswap_check_counts(struct mtdswap_dev *d)
+{
+	struct rb_root hist_root = RB_ROOT;
+	struct rb_node *medrb;
+	struct swap_eb *eb;
+	unsigned int i, cnt, median;
+
+	cnt = 0;
+	for (i = 0; i < d->eblks; i++) {
+		eb = d->eb_data + i;
+
+		if (eb->flags & (EBLOCK_NOMAGIC | EBLOCK_BAD | EBLOCK_READERR))
+			continue;
+
+		__mtdswap_rb_add(&hist_root, eb);
+		cnt++;
+	}
+
+	if (cnt == 0)
+		return;
+
+	medrb = mtdswap_rb_index(&hist_root, cnt / 2);
+	median = rb_entry(medrb, struct swap_eb, rb)->erase_count;
+
+	d->max_erase_count = MTDSWAP_ECNT_MAX(&hist_root);
+
+	for (i = 0; i < d->eblks; i++) {
+		eb = d->eb_data + i;
+
+		if (eb->flags & (EBLOCK_NOMAGIC | EBLOCK_READERR))
+			eb->erase_count = median;
+
+		if (eb->flags & (EBLOCK_NOMAGIC | EBLOCK_BAD | EBLOCK_READERR))
+			continue;
+
+		rb_erase(&eb->rb, &hist_root);
+	}
+}
+
+static void mtdswap_scan_eblks(struct mtdswap_dev *d)
+{
+	int status;
+	unsigned int i, idx;
+	struct swap_eb *eb;
+
+	for (i = 0; i < d->eblks; i++) {
+		eb = d->eb_data + i;
+
+		status = mtdswap_read_markers(d, eb);
+		if (status < 0)
+			eb->flags |= EBLOCK_READERR;
+		else if (status == MTDSWAP_SCANNED_BAD) {
+			eb->flags |= EBLOCK_BAD;
+			continue;
+		}
+
+		switch (status) {
+		case MTDSWAP_SCANNED_CLEAN:
+			idx = MTDSWAP_CLEAN;
+			break;
+		case MTDSWAP_SCANNED_DIRTY:
+		case MTDSWAP_SCANNED_BITFLIP:
+			idx = MTDSWAP_DIRTY;
+			break;
+		default:
+			idx = MTDSWAP_FAILING;
+		}
+
+		eb->flags |= (idx << EBLOCK_IDX_SHIFT);
+	}
+
+	mtdswap_check_counts(d);
+
+	for (i = 0; i < d->eblks; i++) {
+		eb = d->eb_data + i;
+
+		if (eb->flags & EBLOCK_BAD)
+			continue;
+
+		idx = eb->flags >> EBLOCK_IDX_SHIFT;
+		mtdswap_rb_add(d, eb, idx);
+	}
+}
+
+/*
+ * Place eblk into a tree corresponding to its number of active blocks
+ * it contains.
+ */
+static void mtdswap_store_eb(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	unsigned int weight = eb->active_count;
+	unsigned int maxweight = d->pages_per_eblk;
+
+	if (eb == d->curr_write)
+		return;
+
+	if (eb->flags & EBLOCK_BITFLIP)
+		mtdswap_rb_add(d, eb, MTDSWAP_BITFLIP);
+	else if (eb->flags & (EBLOCK_READERR | EBLOCK_FAILED))
+		mtdswap_rb_add(d, eb, MTDSWAP_FAILING);
+	if (weight == maxweight)
+		mtdswap_rb_add(d, eb, MTDSWAP_USED);
+	else if (weight == 0)
+		mtdswap_rb_add(d, eb, MTDSWAP_DIRTY);
+	else if (weight > (maxweight/2))
+		mtdswap_rb_add(d, eb, MTDSWAP_LOWFRAG);
+	else
+		mtdswap_rb_add(d, eb, MTDSWAP_HIFRAG);
+}
+
+
+static void mtdswap_erase_callback(struct erase_info *done)
+{
+	wait_queue_head_t *wait_q = (wait_queue_head_t *)done->priv;
+	wake_up(wait_q);
+}
+
+static int mtdswap_erase_block(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	struct mtd_info *mtd = d->mtd;
+	struct erase_info erase;
+	wait_queue_head_t wq;
+	unsigned int retries = 0;
+	int ret;
+
+	eb->erase_count++;
+	if (eb->erase_count > d->max_erase_count)
+		d->max_erase_count = eb->erase_count;
+
+retry:
+	init_waitqueue_head(&wq);
+	memset(&erase, 0, sizeof(struct erase_info));
+
+	erase.mtd	= mtd;
+	erase.callback	= mtdswap_erase_callback;
+	erase.addr	= mtdswap_eb_offset(d, eb);
+	erase.len	= mtd->erasesize;
+	erase.priv	= (u_long)&wq;
+
+	ret = mtd->erase(mtd, &erase);
+	if (ret) {
+		if (retries++ < MTDSWAP_ERASE_RETRIES) {
+			dev_warn(d->dev,
+				"erase of erase block %#llx on %s failed",
+				erase.addr, mtd->name);
+			yield();
+			goto retry;
+		}
+
+		dev_err(d->dev, "Cannot erase erase block %#llx on %s\n",
+			erase.addr, mtd->name);
+
+		mtdswap_handle_badblock(d, eb);
+		return -EIO;
+	}
+
+	ret = wait_event_interruptible(wq, erase.state == MTD_ERASE_DONE ||
+					   erase.state == MTD_ERASE_FAILED);
+	if (ret) {
+		dev_err(d->dev, "Interrupted erase block %#llx erassure on %s",
+			erase.addr, mtd->name);
+		return -EINTR;
+	}
+
+	if (erase.state == MTD_ERASE_FAILED) {
+		if (retries++ < MTDSWAP_ERASE_RETRIES) {
+			dev_warn(d->dev,
+				"erase of erase block %#llx on %s failed",
+				erase.addr, mtd->name);
+			yield();
+			goto retry;
+		}
+
+		mtdswap_handle_badblock(d, eb);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int mtdswap_map_free_block(struct mtdswap_dev *d, unsigned int page,
+				unsigned int *block)
+{
+	int ret;
+	struct swap_eb *old_eb = d->curr_write;
+	struct rb_root *clean_root;
+	struct swap_eb *eb;
+
+	if (old_eb == NULL || d->curr_write_pos >= d->pages_per_eblk) {
+		do {
+			if (TREE_EMPTY(d, CLEAN))
+				return -ENOSPC;
+
+			clean_root = TREE_ROOT(d, CLEAN);
+			eb = rb_entry(rb_first(clean_root), struct swap_eb, rb);
+			rb_erase(&eb->rb, clean_root);
+			eb->root = NULL;
+			TREE_COUNT(d, CLEAN)--;
+
+			ret = mtdswap_write_marker(d, eb, MTDSWAP_TYPE_DIRTY);
+		} while (ret == -EIO || ret == -EBADMSG);
+
+		if (ret)
+			return ret;
+
+		d->curr_write_pos = 0;
+		d->curr_write = eb;
+		if (old_eb)
+			mtdswap_store_eb(d, old_eb);
+	}
+
+	*block = (d->curr_write - d->eb_data) * d->pages_per_eblk +
+		d->curr_write_pos;
+
+	d->curr_write->active_count++;
+	d->revmap[*block] = page;
+	d->curr_write_pos++;
+
+	return 0;
+}
+
+static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev *d)
+{
+	return TREE_COUNT(d, CLEAN) * d->pages_per_eblk +
+		d->pages_per_eblk - d->curr_write_pos;
+}
+
+static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev *d)
+{
+	return mtdswap_free_page_cnt(d) > d->pages_per_eblk;
+}
+
+static int mtdswap_write_block(struct mtdswap_dev *d, char *buf,
+			unsigned int page, unsigned int *bp, int gc_context)
+{
+	struct mtd_info *mtd = d->mtd;
+	struct swap_eb *eb;
+	size_t retlen;
+	loff_t writepos;
+	int ret;
+
+retry:
+	if (!gc_context)
+		while (!mtdswap_enough_free_pages(d))
+			if (mtdswap_gc(d, 0) > 0)
+				return -ENOSPC;
+
+	ret = mtdswap_map_free_block(d, page, bp);
+	eb = d->eb_data + (*bp / d->pages_per_eblk);
+
+	if (ret == -EIO || ret == -EBADMSG) {
+		d->curr_write = NULL;
+		eb->active_count--;
+		d->revmap[*bp] = PAGE_UNDEF;
+		goto retry;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	writepos = (loff_t)*bp << PAGE_SHIFT;
+	ret =  mtd->write(mtd, writepos, PAGE_SIZE, &retlen, buf);
+	if (ret == -EIO || ret == -EBADMSG) {
+		d->curr_write_pos--;
+		eb->active_count--;
+		d->revmap[*bp] = PAGE_UNDEF;
+		mtdswap_handle_write_error(d, eb);
+		goto retry;
+	}
+
+	if (ret < 0) {
+		dev_err(d->dev, "Write to MTD device failed: %d (%zd written)",
+			ret, retlen);
+		goto err;
+	}
+
+	if (retlen != PAGE_SIZE) {
+		dev_err(d->dev, "Short write to MTD device: %zd written",
+			retlen);
+		ret = -EIO;
+		goto err;
+	}
+
+	return ret;
+
+err:
+	d->curr_write_pos--;
+	eb->active_count--;
+	d->revmap[*bp] = PAGE_UNDEF;
+
+	return ret;
+}
+
+static int mtdswap_move_block(struct mtdswap_dev *d, unsigned int oldblock,
+		unsigned int *newblock)
+{
+	struct mtd_info *mtd = d->mtd;
+	struct swap_eb *eb, *oldeb;
+	int ret;
+	size_t retlen;
+	unsigned int page, retries;
+	loff_t readpos;
+
+	page = d->revmap[oldblock];
+	readpos = (loff_t) oldblock << PAGE_SHIFT;
+	retries = 0;
+
+retry:
+	ret = mtd->read(mtd, readpos, PAGE_SIZE, &retlen, d->page_buf);
+
+	if (ret < 0 && ret != -EUCLEAN) {
+		oldeb = d->eb_data + oldblock / d->pages_per_eblk;
+		oldeb->flags |= EBLOCK_READERR;
+
+		dev_err(d->dev, "Read Error: %d (block %u)\n", ret,
+			oldblock);
+		retries++;
+		if (retries < MTDSWAP_IO_RETRIES)
+			goto retry;
+
+		goto read_error;
+	}
+
+	if (retlen != PAGE_SIZE) {
+		dev_err(d->dev, "Short read: %zd (block %u)\n", retlen,
+		       oldblock);
+		ret = -EIO;
+		goto read_error;
+	}
+
+	ret = mtdswap_write_block(d, d->page_buf, page, newblock, 1);
+	if (ret < 0) {
+		d->page_data[page] = BLOCK_ERROR;
+		dev_err(d->dev, "Write error: %d\n", ret);
+		return ret;
+	}
+
+	eb = d->eb_data + *newblock / d->pages_per_eblk;
+	d->page_data[page] = *newblock;
+	d->revmap[oldblock] = PAGE_UNDEF;
+	eb = d->eb_data + oldblock / d->pages_per_eblk;
+	eb->active_count--;
+
+	return 0;
+
+read_error:
+	d->page_data[page] = BLOCK_ERROR;
+	d->revmap[oldblock] = PAGE_UNDEF;
+	return ret;
+}
+
+static int mtdswap_gc_eblock(struct mtdswap_dev *d, struct swap_eb *eb)
+{
+	unsigned int i, block, eblk_base, newblock;
+	int ret, errcode;
+
+	errcode = 0;
+	eblk_base = (eb - d->eb_data) * d->pages_per_eblk;
+
+	for (i = 0; i < d->pages_per_eblk; i++) {
+		if (d->spare_eblks < MIN_SPARE_EBLOCKS)
+			return -ENOSPC;
+
+		block = eblk_base + i;
+		if (d->revmap[block] == PAGE_UNDEF)
+			continue;
+
+		ret = mtdswap_move_block(d, block, &newblock);
+		if (ret < 0 && !errcode)
+			errcode = ret;
+	}
+
+	return errcode;
+}
+
+static int __mtdswap_choose_gc_tree(struct mtdswap_dev *d)
+{
+	int idx, stopat;
+
+	if (TREE_COUNT(d, CLEAN) < LOW_FRAG_GC_TRESHOLD)
+		stopat = MTDSWAP_LOWFRAG;
+	else
+		stopat = MTDSWAP_HIFRAG;
+
+	for (idx = MTDSWAP_BITFLIP; idx >= stopat; idx--)
+		if (d->trees[idx].root.rb_node != NULL)
+			return idx;
+
+	return -1;
+}
+
+static int mtdswap_wlfreq(unsigned int maxdiff)
+{
+	unsigned int h, x, y, dist, base;
+
+	/*
+	 * Calculate linear ramp down from f1 to f2 when maxdiff goes from
+	 * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE.  Similar
+	 * to triangle with height f1 - f1 and width COLLECT_NONDIRTY_BASE.
+	 */
+
+	dist = maxdiff - MAX_ERASE_DIFF;
+	if (dist > COLLECT_NONDIRTY_BASE)
+		dist = COLLECT_NONDIRTY_BASE;
+
+	/*
+	 * Modelling the slop as right angular triangle with base
+	 * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
+	 * equal to the ratio h/base.
+	 */
+	h = COLLECT_NONDIRTY_FREQ1 - COLLECT_NONDIRTY_FREQ2;
+	base = COLLECT_NONDIRTY_BASE;
+
+	x = dist - base;
+	y = (x * h + base / 2) / base;
+
+	return COLLECT_NONDIRTY_FREQ2 + y;
+}
+
+static int mtdswap_choose_wl_tree(struct mtdswap_dev *d)
+{
+	static unsigned int pick_cnt;
+	unsigned int i, idx = -1, wear, max;
+	struct rb_root *root;
+
+	max = 0;
+	for (i = 0; i <= MTDSWAP_DIRTY; i++) {
+		root = &d->trees[i].root;
+		if (root->rb_node == NULL)
+			continue;
+
+		wear = d->max_erase_count - MTDSWAP_ECNT_MIN(root);
+		if (wear > max) {
+			max = wear;
+			idx = i;
+		}
+	}
+
+	if (max > MAX_ERASE_DIFF && pick_cnt >= mtdswap_wlfreq(max) - 1) {
+		pick_cnt = 0;
+		return idx;
+	}
+
+	pick_cnt++;
+	return -1;
+}
+
+static int mtdswap_choose_gc_tree(struct mtdswap_dev *d,
+				unsigned int background)
+{
+	int idx;
+
+	if (TREE_NONEMPTY(d, FAILING) &&
+		(background || (TREE_EMPTY(d, CLEAN) && TREE_EMPTY(d, DIRTY))))
+		return MTDSWAP_FAILING;
+
+	idx = mtdswap_choose_wl_tree(d);
+	if (idx >= MTDSWAP_CLEAN)
+		return idx;
+
+	return __mtdswap_choose_gc_tree(d);
+}
+
+static struct swap_eb *mtdswap_pick_gc_eblk(struct mtdswap_dev *d,
+					unsigned int background)
+{
+	struct rb_root *rp = NULL;
+	struct swap_eb *eb = NULL;
+	int idx;
+
+	if (background && TREE_COUNT(d, CLEAN) > CLEAN_BLOCK_THRESHOLD &&
+		TREE_EMPTY(d, DIRTY) && TREE_EMPTY(d, FAILING))
+		return NULL;
+
+	idx = mtdswap_choose_gc_tree(d, background);
+	if (idx < 0)
+		return NULL;
+
+	rp = &d->trees[idx].root;
+	eb = rb_entry(rb_first(rp), struct swap_eb, rb);
+
+	rb_erase(&eb->rb, rp);
+	eb->root = NULL;
+	d->trees[idx].count--;
+	return eb;
+}
+
+static unsigned int mtdswap_test_patt(unsigned int i)
+{
+	return i % 2 ? 0x55555555 : 0xAAAAAAAA;
+}
+
+static unsigned int mtdswap_eblk_passes(struct mtdswap_dev *d,
+					struct swap_eb *eb)
+{
+	struct mtd_info *mtd = d->mtd;
+	unsigned int test, i, j, patt, mtd_pages;
+	loff_t base, pos;
+	unsigned int *p1 = (unsigned int *)d->page_buf;
+	unsigned char *p2 = (unsigned char *)d->oob_buf;
+	struct mtd_oob_ops ops;
+	int ret;
+
+	ops.mode = MTD_OOB_AUTO;
+	ops.len = mtd->writesize;
+	ops.ooblen = mtd->ecclayout->oobavail;
+	ops.ooboffs = 0;
+	ops.datbuf = d->page_buf;
+	ops.oobbuf = d->oob_buf;
+	base = mtdswap_eb_offset(d, eb);
+	mtd_pages = d->pages_per_eblk * PAGE_SIZE / mtd->writesize;
+
+	for (test = 0; test < 2; test++) {
+		pos = base;
+		for (i = 0; i < mtd_pages; i++) {
+			patt = mtdswap_test_patt(test + i);
+			memset(d->page_buf, patt, mtd->writesize);
+			memset(d->oob_buf, patt, mtd->ecclayout->oobavail);
+			ret = mtd->write_oob(mtd, pos, &ops);
+			if (ret)
+				goto error;
+
+			pos += mtd->writesize;
+		}
+
+		pos = base;
+		for (i = 0; i < mtd_pages; i++) {
+			ret = mtd->read_oob(mtd, pos, &ops);
+			if (ret)
+				goto error;
+
+			patt = mtdswap_test_patt(test + i);
+			for (j = 0; j < mtd->writesize/sizeof(int); j++)
+				if (p1[j] != patt)
+					goto error;
+
+			for (j = 0; j < mtd->ecclayout->oobavail; j++)
+				if (p2[j] != (unsigned char)patt)
+					goto error;
+
+			pos += mtd->writesize;
+		}
+
+		ret = mtdswap_erase_block(d, eb);
+		if (ret)
+			goto error;
+	}
+
+	eb->flags &= ~EBLOCK_READERR;
+	return 1;
+
+error:
+	mtdswap_handle_badblock(d, eb);
+	return 0;
+}
+
+static int mtdswap_gc(struct mtdswap_dev *d, unsigned int background)
+{
+	struct swap_eb *eb;
+	int ret;
+
+	if (d->spare_eblks < MIN_SPARE_EBLOCKS)
+		return 1;
+
+	eb = mtdswap_pick_gc_eblk(d, background);
+	if (!eb)
+		return 1;
+
+	ret = mtdswap_gc_eblock(d, eb);
+	if (ret == -ENOSPC)
+		return 1;
+
+	if (eb->flags & EBLOCK_FAILED) {
+		mtdswap_handle_badblock(d, eb);
+		return 0;
+	}
+
+	eb->flags &= ~EBLOCK_BITFLIP;
+	ret = mtdswap_erase_block(d, eb);
+	if ((eb->flags & EBLOCK_READERR) &&
+		(ret || !mtdswap_eblk_passes(d, eb)))
+		return 0;
+
+	if (ret == 0)
+		ret = mtdswap_write_marker(d, eb, MTDSWAP_TYPE_CLEAN);
+
+	if (ret == 0)
+		mtdswap_rb_add(d, eb, MTDSWAP_CLEAN);
+	else if (ret != -EIO && ret != -EBADMSG)
+		mtdswap_rb_add(d, eb, MTDSWAP_DIRTY);
+
+	return 0;
+}
+
+static void mtdswap_background(struct mtd_blktrans_dev *dev)
+{
+	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
+	int ret;
+
+	while (1) {
+		ret = mtdswap_gc(d, 1);
+		if (ret || mtd_blktrans_cease_background(dev))
+			return;
+	}
+}
+
+static void mtdswap_cleanup(struct mtdswap_dev *d)
+{
+	vfree(d->eb_data);
+	vfree(d->revmap);
+	vfree(d->page_data);
+	kfree(d->oob_buf);
+	kfree(d->page_buf);
+}
+
+static int mtdswap_flush(struct mtd_blktrans_dev *dev)
+{
+	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
+
+	if (d->mtd->sync)
+		d->mtd->sync(d->mtd);
+	return 0;
+}
+
+static unsigned int mtdswap_badblocks(struct mtd_info *mtd, uint64_t size)
+{
+	loff_t offset;
+	unsigned int badcnt;
+
+	badcnt = 0;
+
+	if (mtd->block_isbad)
+		for (offset = 0; offset < size; offset += mtd->erasesize)
+			if (mtd->block_isbad(mtd, offset))
+				badcnt++;
+
+	return badcnt;
+}
+
+static int mtdswap_writesect(struct mtd_blktrans_dev *dev,
+			unsigned long page, char *buf)
+{
+	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
+	unsigned int newblock, mapped;
+	struct swap_eb *eb;
+	int ret;
+
+	d->sect_write_count++;
+
+	if (d->spare_eblks < MIN_SPARE_EBLOCKS)
+		return -ENOSPC;
+
+	if (header) {
+		/* Ignore writes to the header page */
+		if (unlikely(page == 0))
+			return 0;
+
+		page--;
+	}
+
+	mapped = d->page_data[page];
+	if (mapped <= BLOCK_MAX) {
+		eb = d->eb_data + (mapped / d->pages_per_eblk);
+		eb->active_count--;
+		mtdswap_store_eb(d, eb);
+		d->page_data[page] = BLOCK_UNDEF;
+		d->revmap[mapped] = PAGE_UNDEF;
+	}
+
+	ret = mtdswap_write_block(d, buf, page, &newblock, 0);
+	d->mtd_write_count++;
+
+	if (ret < 0)
+		return ret;
+
+	eb = d->eb_data + (newblock / d->pages_per_eblk);
+	d->page_data[page] = newblock;
+
+	return 0;
+}
+
+/* Provide a dummy swap header for the kernel */
+static int mtdswap_auto_header(struct mtdswap_dev *d, char *buf)
+{
+	union swap_header *hd = (union swap_header *)(buf);
+
+	memset(buf, 0, PAGE_SIZE - 10);
+
+	hd->info.version = 1;
+	hd->info.last_page = d->mbd_dev->size - 1;
+	hd->info.nr_badpages = 0;
+
+	memcpy(buf + PAGE_SIZE - 10, "SWAPSPACE2", 10);
+
+	return 0;
+}
+
+static int mtdswap_readsect(struct mtd_blktrans_dev *dev,
+			unsigned long page, char *buf)
+{
+	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
+	struct mtd_info *mtd = d->mtd;
+	unsigned int realblock, retries;
+	loff_t readpos;
+	struct swap_eb *eb;
+	size_t retlen;
+	int ret;
+
+	d->sect_read_count++;
+
+	if (header) {
+		if (unlikely(page == 0))
+			return mtdswap_auto_header(d, buf);
+
+		page--;
+	}
+
+	realblock = d->page_data[page];
+	if (realblock > BLOCK_MAX) {
+		memset(buf, 0x0, PAGE_SIZE);
+		if (realblock == BLOCK_UNDEF)
+			return 0;
+		else
+			return -EIO;
+	}
+
+	eb = d->eb_data + (realblock / d->pages_per_eblk);
+	BUG_ON(d->revmap[realblock] == PAGE_UNDEF);
+
+	readpos = (loff_t)realblock << PAGE_SHIFT;
+	retries = 0;
+
+retry:
+	ret = mtd->read(mtd, readpos, PAGE_SIZE, &retlen, buf);
+
+	d->mtd_read_count++;
+	if (ret == -EUCLEAN) {
+		eb->flags |= EBLOCK_BITFLIP;
+		mtdswap_rb_add(d, eb, MTDSWAP_BITFLIP);
+		ret = 0;
+	}
+
+	if (ret < 0) {
+		dev_err(d->dev, "Read error %d\n", ret);
+		eb->flags |= EBLOCK_READERR;
+		mtdswap_rb_add(d, eb, MTDSWAP_FAILING);
+		retries++;
+		if (retries < MTDSWAP_IO_RETRIES)
+			goto retry;
+
+		return ret;
+	}
+
+	if (retlen != PAGE_SIZE) {
+		dev_err(d->dev, "Short read %zd\n", retlen);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int mtdswap_discard(struct mtd_blktrans_dev *dev, unsigned long first,
+			unsigned nr_pages)
+{
+	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
+	unsigned long page;
+	struct swap_eb *eb;
+	unsigned int mapped;
+
+	d->discard_count++;
+
+	for (page = first; page < first + nr_pages; page++) {
+		mapped = d->page_data[page];
+		if (mapped <= BLOCK_MAX) {
+			eb = d->eb_data + (mapped / d->pages_per_eblk);
+			eb->active_count--;
+			mtdswap_store_eb(d, eb);
+			d->page_data[page] = BLOCK_UNDEF;
+			d->revmap[mapped] = PAGE_UNDEF;
+			d->discard_page_count++;
+		} else if (mapped == BLOCK_ERROR) {
+			d->page_data[page] = BLOCK_UNDEF;
+			d->discard_page_count++;
+		}
+	}
+
+	return 0;
+}
+
+static int mtdswap_show(struct seq_file *s, void *data)
+{
+	struct mtdswap_dev *d = (struct mtdswap_dev *) s->private;
+	unsigned long sum;
+	unsigned int count[MTDSWAP_TREE_CNT];
+	unsigned int min[MTDSWAP_TREE_CNT];
+	unsigned int max[MTDSWAP_TREE_CNT];
+	unsigned int i, cw = 0, cwp = 0, cwecount = 0, bb_cnt, mapped, pages;
+	uint64_t use_size;
+	char *name[] = {"clean", "used", "low", "high", "dirty", "bitflip",
+			"failing"};
+
+	mutex_lock(&d->mbd_dev->lock);
+
+	for (i = 0; i < MTDSWAP_TREE_CNT; i++) {
+		struct rb_root *root = &d->trees[i].root;
+
+		if (root->rb_node) {
+			count[i] = d->trees[i].count;
+			min[i] = rb_entry(rb_first(root), struct swap_eb,
+					rb)->erase_count;
+			max[i] = rb_entry(rb_last(root), struct swap_eb,
+					rb)->erase_count;
+		} else
+			count[i] = 0;
+	}
+
+	if (d->curr_write) {
+		cw = 1;
+		cwp = d->curr_write_pos;
+		cwecount = d->curr_write->erase_count;
+	}
+
+	sum = 0;
+	for (i = 0; i < d->eblks; i++)
+		sum += d->eb_data[i].erase_count;
+
+	use_size = (uint64_t)d->eblks * d->mtd->erasesize;
+	bb_cnt = mtdswap_badblocks(d->mtd, use_size);
+
+	mapped = 0;
+	pages = d->mbd_dev->size;
+	for (i = 0; i < pages; i++)
+		if (d->page_data[i] != BLOCK_UNDEF)
+			mapped++;
+
+	mutex_unlock(&d->mbd_dev->lock);
+
+	for (i = 0; i < MTDSWAP_TREE_CNT; i++) {
+		if (!count[i])
+			continue;
+
+		if (min[i] != max[i])
+			seq_printf(s, "%s:\t%5d erase blocks, erased min %d, "
+				"max %d times\n",
+				name[i], count[i], min[i], max[i]);
+		else
+			seq_printf(s, "%s:\t%5d erase blocks, all erased %d "
+				"times\n", name[i], count[i], min[i]);
+	}
+
+	if (bb_cnt)
+		seq_printf(s, "bad:\t%5u erase blocks\n", bb_cnt);
+
+	if (cw)
+		seq_printf(s, "current erase block: %u pages used, %u free, "
+			"erased %u times\n",
+			cwp, d->pages_per_eblk - cwp, cwecount);
+
+	seq_printf(s, "total erasures: %lu\n", sum);
+
+	seq_printf(s, "\n");
+
+	seq_printf(s, "mtdswap_readsect count: %llu\n", d->sect_read_count);
+	seq_printf(s, "mtdswap_writesect count: %llu\n", d->sect_write_count);
+	seq_printf(s, "mtdswap_discard count: %llu\n", d->discard_count);
+	seq_printf(s, "mtd read count: %llu\n", d->mtd_read_count);
+	seq_printf(s, "mtd write count: %llu\n", d->mtd_write_count);
+	seq_printf(s, "discarded pages count: %llu\n", d->discard_page_count);
+
+	seq_printf(s, "\n");
+	seq_printf(s, "total pages: %u\n", pages);
+	seq_printf(s, "pages mapped: %u\n", mapped);
+
+	return 0;
+}
+
+static int mtdswap_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, mtdswap_show, inode->i_private);
+}
+
+static const struct file_operations mtdswap_fops = {
+	.open		= mtdswap_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int mtdswap_add_debugfs(struct mtdswap_dev *d)
+{
+	struct gendisk *gd = d->mbd_dev->disk;
+	struct device *dev = disk_to_dev(gd);
+
+	struct dentry *root;
+	struct dentry *dent;
+
+	root = debugfs_create_dir(gd->disk_name, NULL);
+	if (IS_ERR(root))
+		return 0;
+
+	if (!root) {
+		dev_err(dev, "failed to initialize debugfs\n");
+		return -1;
+	}
+
+	d->debugfs_root = root;
+
+	dent = debugfs_create_file("stats", S_IRUSR, root, d,
+				&mtdswap_fops);
+	if (!dent) {
+		dev_err(d->dev, "debugfs_create_file failed\n");
+		debugfs_remove_recursive(root);
+		d->debugfs_root = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
+static int mtdswap_init(struct mtdswap_dev *d, unsigned int eblocks,
+			unsigned int spare_cnt)
+{
+	struct mtd_info *mtd = d->mbd_dev->mtd;
+	unsigned int i, eblk_bytes, pages, blocks;
+	int ret = -ENOMEM;
+
+	d->mtd = mtd;
+	d->eblks = eblocks;
+	d->spare_eblks = spare_cnt;
+	d->pages_per_eblk = mtd->erasesize >> PAGE_SHIFT;
+
+	pages = d->mbd_dev->size;
+	blocks = eblocks * d->pages_per_eblk;
+
+	for (i = 0; i < MTDSWAP_TREE_CNT; i++)
+		d->trees[i].root = RB_ROOT;
+
+	d->page_data = vmalloc(sizeof(int)*pages);
+	if (!d->page_data)
+		goto page_data_fail;
+
+	d->revmap = vmalloc(sizeof(int)*blocks);
+	if (!d->revmap)
+		goto revmap_fail;
+
+	eblk_bytes = sizeof(struct swap_eb)*d->eblks;
+	d->eb_data = vmalloc(eblk_bytes);
+	if (!d->eb_data)
+		goto eb_data_fail;
+
+	memset(d->eb_data, 0, eblk_bytes);
+	for (i = 0; i < pages; i++)
+		d->page_data[i] = BLOCK_UNDEF;
+
+	for (i = 0; i < blocks; i++)
+		d->revmap[i] = PAGE_UNDEF;
+
+	d->page_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!d->page_buf)
+		goto page_buf_fail;
+
+	d->oob_buf = kmalloc(2 * mtd->ecclayout->oobavail, GFP_KERNEL);
+	if (!d->oob_buf)
+		goto oob_buf_fail;
+
+	mtdswap_scan_eblks(d);
+
+	return 0;
+
+oob_buf_fail:
+	kfree(d->page_buf);
+page_buf_fail:
+	vfree(d->eb_data);
+eb_data_fail:
+	vfree(d->revmap);
+revmap_fail:
+	vfree(d->page_data);
+page_data_fail:
+	printk(KERN_ERR "%s: init failed (%d)\n", MTDSWAP_PREFIX, ret);
+	return ret;
+}
+
+static void mtdswap_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
+{
+	struct mtdswap_dev *d;
+	struct mtd_blktrans_dev *mbd_dev;
+	char *parts;
+	char *this_opt;
+	unsigned long part;
+	unsigned int eblocks, eavailable, bad_blocks, spare_cnt;
+	uint64_t swap_size, use_size, size_limit;
+	struct nand_ecclayout *oinfo;
+	int ret;
+
+	parts = &partitions[0];
+	if (!*parts)
+		return;
+
+	while ((this_opt = strsep(&parts, ",")) != NULL) {
+		if (strict_strtoul(this_opt, 0, &part) < 0)
+			return;
+
+		if (mtd->index == part)
+			break;
+	}
+
+	if (mtd->index != part)
+		return;
+
+	if (mtd->erasesize < PAGE_SIZE || mtd->erasesize % PAGE_SIZE) {
+		printk(KERN_ERR "%s: Erase size %u not multiple of PAGE_SIZE "
+			"%lu\n", MTDSWAP_PREFIX, mtd->erasesize, PAGE_SIZE);
+		return;
+	}
+
+	if (PAGE_SIZE % mtd->writesize || mtd->writesize > PAGE_SIZE) {
+		printk(KERN_ERR "%s: PAGE_SIZE %lu not multiple of write size"
+			" %u\n", MTDSWAP_PREFIX, PAGE_SIZE, mtd->writesize);
+		return;
+	}
+
+	oinfo = mtd->ecclayout;
+	if (!mtd->oobsize || !oinfo || oinfo->oobavail < MTDSWAP_OOBSIZE) {
+		printk(KERN_ERR "%s: Not enough free bytes in OOB, "
+			"%d available, %lu needed.\n",
+			MTDSWAP_PREFIX, oinfo->oobavail, MTDSWAP_OOBSIZE);
+		return;
+	}
+
+	if (spare_eblocks > 100)
+		spare_eblocks = 100;
+
+	use_size = mtd->size;
+	size_limit = (uint64_t) BLOCK_MAX * PAGE_SIZE;
+
+	if (mtd->size > size_limit) {
+		printk(KERN_WARNING "%s: Device too large. Limiting size to "
+			"%llu bytes\n", MTDSWAP_PREFIX, size_limit);
+		use_size = size_limit;
+	}
+
+	eblocks = mtd_div_by_eb(use_size, mtd);
+	use_size = eblocks * mtd->erasesize;
+	bad_blocks = mtdswap_badblocks(mtd, use_size);
+	eavailable = eblocks - bad_blocks;
+
+	if (eavailable < MIN_ERASE_BLOCKS) {
+		printk(KERN_ERR "%s: Not enough erase blocks. %u available, "
+			"%d needed\n", MTDSWAP_PREFIX, eavailable,
+			MIN_ERASE_BLOCKS);
+		return;
+	}
+
+	spare_cnt = div_u64((uint64_t)eavailable * spare_eblocks, 100);
+
+	if (spare_cnt < MIN_SPARE_EBLOCKS)
+		spare_cnt = MIN_SPARE_EBLOCKS;
+
+	if (spare_cnt > eavailable - 1)
+		spare_cnt = eavailable - 1;
+
+	swap_size = (uint64_t)(eavailable - spare_cnt) * mtd->erasesize +
+		(header ? PAGE_SIZE : 0);
+
+	printk(KERN_INFO "%s: Enabling MTD swap on device %lu, size %llu KB, "
+		"%u spare, %u bad blocks\n",
+		MTDSWAP_PREFIX, part, swap_size / 1024, spare_cnt, bad_blocks);
+
+	d = kzalloc(sizeof(struct mtdswap_dev), GFP_KERNEL);
+	if (!d)
+		return;
+
+	mbd_dev = kzalloc(sizeof(struct mtd_blktrans_dev), GFP_KERNEL);
+	if (!mbd_dev) {
+		kfree(d);
+		return;
+	}
+
+	d->mbd_dev = mbd_dev;
+	mbd_dev->priv = d;
+
+	mbd_dev->mtd = mtd;
+	mbd_dev->devnum = mtd->index;
+	mbd_dev->size = swap_size >> PAGE_SHIFT;
+	mbd_dev->tr = tr;
+
+	if (!(mtd->flags & MTD_WRITEABLE))
+		mbd_dev->readonly = 1;
+
+	if (mtdswap_init(d, eblocks, spare_cnt) < 0)
+		goto init_failed;
+
+	if (add_mtd_blktrans_dev(mbd_dev) < 0)
+		goto cleanup;
+
+	d->dev = disk_to_dev(mbd_dev->disk);
+
+	ret = mtdswap_add_debugfs(d);
+	if (ret < 0)
+		goto debugfs_failed;
+
+	return;
+
+debugfs_failed:
+	del_mtd_blktrans_dev(mbd_dev);
+
+cleanup:
+	mtdswap_cleanup(d);
+
+init_failed:
+	kfree(mbd_dev);
+	kfree(d);
+}
+
+static void mtdswap_remove_dev(struct mtd_blktrans_dev *dev)
+{
+	struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev);
+
+	debugfs_remove_recursive(d->debugfs_root);
+	del_mtd_blktrans_dev(dev);
+	mtdswap_cleanup(d);
+	kfree(d);
+}
+
+static struct mtd_blktrans_ops mtdswap_ops = {
+	.name		= "mtdswap",
+	.major		= 0,
+	.part_bits	= 0,
+	.blksize	= PAGE_SIZE,
+	.flush		= mtdswap_flush,
+	.readsect	= mtdswap_readsect,
+	.writesect	= mtdswap_writesect,
+	.discard	= mtdswap_discard,
+	.background	= mtdswap_background,
+	.add_mtd	= mtdswap_add_mtd,
+	.remove_dev	= mtdswap_remove_dev,
+	.owner		= THIS_MODULE,
+};
+
+static int __init mtdswap_modinit(void)
+{
+	return register_mtd_blktrans(&mtdswap_ops);
+}
+
+static void __exit mtdswap_modexit(void)
+{
+	deregister_mtd_blktrans(&mtdswap_ops);
+}
+
+module_init(mtdswap_modinit);
+module_exit(mtdswap_modexit);
+
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
+MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "
+		"swap space");
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 4f6c06f..a92054e 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -31,6 +31,21 @@
 	  device thinks the write was successful, a bit could have been
 	  flipped accidentally due to device wear or something else.
 
+config MTD_NAND_BCH
+	tristate
+	select BCH
+	depends on MTD_NAND_ECC_BCH
+	default MTD_NAND
+
+config MTD_NAND_ECC_BCH
+	bool "Support software BCH ECC"
+	default n
+	help
+	  This enables support for software BCH error correction. Binary BCH
+	  codes are more powerful and cpu intensive than traditional Hamming
+	  ECC codes. They are used with NAND devices requiring more than 1 bit
+	  of error correction.
+
 config MTD_SM_COMMON
 	tristate
 	default n
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index 8ad6fae..5745d83 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_MTD_NAND)			+= nand.o
 obj-$(CONFIG_MTD_NAND_ECC)		+= nand_ecc.o
+obj-$(CONFIG_MTD_NAND_BCH)		+= nand_bch.o
 obj-$(CONFIG_MTD_NAND_IDS)		+= nand_ids.o
 obj-$(CONFIG_MTD_SM_COMMON) 		+= sm_common.o
 
diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c
index ccce0f0..6fae04b 100644
--- a/drivers/mtd/nand/atmel_nand.c
+++ b/drivers/mtd/nand/atmel_nand.c
@@ -48,6 +48,9 @@
 #define no_ecc		0
 #endif
 
+static int use_dma = 1;
+module_param(use_dma, int, 0);
+
 static int on_flash_bbt = 0;
 module_param(on_flash_bbt, int, 0);
 
@@ -89,11 +92,20 @@
 	struct nand_chip	nand_chip;
 	struct mtd_info		mtd;
 	void __iomem		*io_base;
+	dma_addr_t		io_phys;
 	struct atmel_nand_data	*board;
 	struct device		*dev;
 	void __iomem		*ecc;
+
+	struct completion	comp;
+	struct dma_chan		*dma_chan;
 };
 
+static int cpu_has_dma(void)
+{
+	return cpu_is_at91sam9rl() || cpu_is_at91sam9g45();
+}
+
 /*
  * Enable NAND.
  */
@@ -150,7 +162,7 @@
 /*
  * Minimal-overhead PIO for data access.
  */
-static void atmel_read_buf(struct mtd_info *mtd, u8 *buf, int len)
+static void atmel_read_buf8(struct mtd_info *mtd, u8 *buf, int len)
 {
 	struct nand_chip	*nand_chip = mtd->priv;
 
@@ -164,7 +176,7 @@
 	__raw_readsw(nand_chip->IO_ADDR_R, buf, len / 2);
 }
 
-static void atmel_write_buf(struct mtd_info *mtd, const u8 *buf, int len)
+static void atmel_write_buf8(struct mtd_info *mtd, const u8 *buf, int len)
 {
 	struct nand_chip	*nand_chip = mtd->priv;
 
@@ -178,6 +190,121 @@
 	__raw_writesw(nand_chip->IO_ADDR_W, buf, len / 2);
 }
 
+static void dma_complete_func(void *completion)
+{
+	complete(completion);
+}
+
+static int atmel_nand_dma_op(struct mtd_info *mtd, void *buf, int len,
+			       int is_read)
+{
+	struct dma_device *dma_dev;
+	enum dma_ctrl_flags flags;
+	dma_addr_t dma_src_addr, dma_dst_addr, phys_addr;
+	struct dma_async_tx_descriptor *tx = NULL;
+	dma_cookie_t cookie;
+	struct nand_chip *chip = mtd->priv;
+	struct atmel_nand_host *host = chip->priv;
+	void *p = buf;
+	int err = -EIO;
+	enum dma_data_direction dir = is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+	if (buf >= high_memory) {
+		struct page *pg;
+
+		if (((size_t)buf & PAGE_MASK) !=
+		    ((size_t)(buf + len - 1) & PAGE_MASK)) {
+			dev_warn(host->dev, "Buffer not fit in one page\n");
+			goto err_buf;
+		}
+
+		pg = vmalloc_to_page(buf);
+		if (pg == 0) {
+			dev_err(host->dev, "Failed to vmalloc_to_page\n");
+			goto err_buf;
+		}
+		p = page_address(pg) + ((size_t)buf & ~PAGE_MASK);
+	}
+
+	dma_dev = host->dma_chan->device;
+
+	flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP |
+		DMA_COMPL_SKIP_DEST_UNMAP;
+
+	phys_addr = dma_map_single(dma_dev->dev, p, len, dir);
+	if (dma_mapping_error(dma_dev->dev, phys_addr)) {
+		dev_err(host->dev, "Failed to dma_map_single\n");
+		goto err_buf;
+	}
+
+	if (is_read) {
+		dma_src_addr = host->io_phys;
+		dma_dst_addr = phys_addr;
+	} else {
+		dma_src_addr = phys_addr;
+		dma_dst_addr = host->io_phys;
+	}
+
+	tx = dma_dev->device_prep_dma_memcpy(host->dma_chan, dma_dst_addr,
+					     dma_src_addr, len, flags);
+	if (!tx) {
+		dev_err(host->dev, "Failed to prepare DMA memcpy\n");
+		goto err_dma;
+	}
+
+	init_completion(&host->comp);
+	tx->callback = dma_complete_func;
+	tx->callback_param = &host->comp;
+
+	cookie = tx->tx_submit(tx);
+	if (dma_submit_error(cookie)) {
+		dev_err(host->dev, "Failed to do DMA tx_submit\n");
+		goto err_dma;
+	}
+
+	dma_async_issue_pending(host->dma_chan);
+	wait_for_completion(&host->comp);
+
+	err = 0;
+
+err_dma:
+	dma_unmap_single(dma_dev->dev, phys_addr, len, dir);
+err_buf:
+	if (err != 0)
+		dev_warn(host->dev, "Fall back to CPU I/O\n");
+	return err;
+}
+
+static void atmel_read_buf(struct mtd_info *mtd, u8 *buf, int len)
+{
+	struct nand_chip *chip = mtd->priv;
+	struct atmel_nand_host *host = chip->priv;
+
+	if (use_dma && len >= mtd->oobsize)
+		if (atmel_nand_dma_op(mtd, buf, len, 1) == 0)
+			return;
+
+	if (host->board->bus_width_16)
+		atmel_read_buf16(mtd, buf, len);
+	else
+		atmel_read_buf8(mtd, buf, len);
+}
+
+static void atmel_write_buf(struct mtd_info *mtd, const u8 *buf, int len)
+{
+	struct nand_chip *chip = mtd->priv;
+	struct atmel_nand_host *host = chip->priv;
+
+	if (use_dma && len >= mtd->oobsize)
+		if (atmel_nand_dma_op(mtd, (void *)buf, len, 0) == 0)
+			return;
+
+	if (host->board->bus_width_16)
+		atmel_write_buf16(mtd, buf, len);
+	else
+		atmel_write_buf8(mtd, buf, len);
+}
+
 /*
  * Calculate HW ECC
  *
@@ -398,6 +525,8 @@
 		return -ENOMEM;
 	}
 
+	host->io_phys = (dma_addr_t)mem->start;
+
 	host->io_base = ioremap(mem->start, mem->end - mem->start + 1);
 	if (host->io_base == NULL) {
 		printk(KERN_ERR "atmel_nand: ioremap failed\n");
@@ -448,14 +577,11 @@
 
 	nand_chip->chip_delay = 20;		/* 20us command delay time */
 
-	if (host->board->bus_width_16) {	/* 16-bit bus width */
+	if (host->board->bus_width_16)	/* 16-bit bus width */
 		nand_chip->options |= NAND_BUSWIDTH_16;
-		nand_chip->read_buf = atmel_read_buf16;
-		nand_chip->write_buf = atmel_write_buf16;
-	} else {
-		nand_chip->read_buf = atmel_read_buf;
-		nand_chip->write_buf = atmel_write_buf;
-	}
+
+	nand_chip->read_buf = atmel_read_buf;
+	nand_chip->write_buf = atmel_write_buf;
 
 	platform_set_drvdata(pdev, host);
 	atmel_nand_enable(host);
@@ -473,6 +599,22 @@
 		nand_chip->options |= NAND_USE_FLASH_BBT;
 	}
 
+	if (cpu_has_dma() && use_dma) {
+		dma_cap_mask_t mask;
+
+		dma_cap_zero(mask);
+		dma_cap_set(DMA_MEMCPY, mask);
+		host->dma_chan = dma_request_channel(mask, 0, NULL);
+		if (!host->dma_chan) {
+			dev_err(host->dev, "Failed to request DMA channel\n");
+			use_dma = 0;
+		}
+	}
+	if (use_dma)
+		dev_info(host->dev, "Using DMA for NAND access.\n");
+	else
+		dev_info(host->dev, "No DMA support for NAND access.\n");
+
 	/* first scan to find the device and get the page size */
 	if (nand_scan_ident(mtd, 1, NULL)) {
 		res = -ENXIO;
@@ -555,6 +697,8 @@
 err_no_card:
 	atmel_nand_disable(host);
 	platform_set_drvdata(pdev, NULL);
+	if (host->dma_chan)
+		dma_release_channel(host->dma_chan);
 	if (host->ecc)
 		iounmap(host->ecc);
 err_ecc_ioremap:
@@ -578,6 +722,10 @@
 
 	if (host->ecc)
 		iounmap(host->ecc);
+
+	if (host->dma_chan)
+		dma_release_channel(host->dma_chan);
+
 	iounmap(host->io_base);
 	kfree(host);
 
diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c
index a90fde3..aff3468 100644
--- a/drivers/mtd/nand/davinci_nand.c
+++ b/drivers/mtd/nand/davinci_nand.c
@@ -37,9 +37,6 @@
 #include <mach/nand.h>
 #include <mach/aemif.h>
 
-#include <asm/mach-types.h>
-
-
 /*
  * This is a device driver for the NAND flash controller found on the
  * various DaVinci family chips.  It handles up to four SoC chipselects,
diff --git a/drivers/mtd/nand/mpc5121_nfc.c b/drivers/mtd/nand/mpc5121_nfc.c
index c2f9543..0b81b5b 100644
--- a/drivers/mtd/nand/mpc5121_nfc.c
+++ b/drivers/mtd/nand/mpc5121_nfc.c
@@ -29,6 +29,7 @@
 #include <linux/clk.h>
 #include <linux/gfp.h>
 #include <linux/delay.h>
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
@@ -757,9 +758,9 @@
 
 	/* Enable NFC clock */
 	prv->clk = clk_get(dev, "nfc_clk");
-	if (!prv->clk) {
+	if (IS_ERR(prv->clk)) {
 		dev_err(dev, "Unable to acquire NFC clock!\n");
-		retval = -ENODEV;
+		retval = PTR_ERR(prv->clk);
 		goto error;
 	}
 
diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c
index 5ae1d9e..42a95fb 100644
--- a/drivers/mtd/nand/mxc_nand.c
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -211,6 +211,31 @@
 	}
 };
 
+/* OOB description for 4096 byte pages with 128 byte OOB */
+static struct nand_ecclayout nandv2_hw_eccoob_4k = {
+	.eccbytes = 8 * 9,
+	.eccpos = {
+		7,  8,  9, 10, 11, 12, 13, 14, 15,
+		23, 24, 25, 26, 27, 28, 29, 30, 31,
+		39, 40, 41, 42, 43, 44, 45, 46, 47,
+		55, 56, 57, 58, 59, 60, 61, 62, 63,
+		71, 72, 73, 74, 75, 76, 77, 78, 79,
+		87, 88, 89, 90, 91, 92, 93, 94, 95,
+		103, 104, 105, 106, 107, 108, 109, 110, 111,
+		119, 120, 121, 122, 123, 124, 125, 126, 127,
+	},
+	.oobfree = {
+		{.offset = 2, .length = 4},
+		{.offset = 16, .length = 7},
+		{.offset = 32, .length = 7},
+		{.offset = 48, .length = 7},
+		{.offset = 64, .length = 7},
+		{.offset = 80, .length = 7},
+		{.offset = 96, .length = 7},
+		{.offset = 112, .length = 7},
+	}
+};
+
 #ifdef CONFIG_MTD_PARTITIONS
 static const char *part_probes[] = { "RedBoot", "cmdlinepart", NULL };
 #endif
@@ -641,9 +666,9 @@
 
 	n = min(n, len);
 
-	memcpy(buf, host->data_buf + col, len);
+	memcpy(buf, host->data_buf + col, n);
 
-	host->buf_start += len;
+	host->buf_start += n;
 }
 
 /* Used by the upper layer to verify the data in NAND Flash
@@ -1185,6 +1210,8 @@
 
 	if (mtd->writesize == 2048)
 		this->ecc.layout = oob_largepage;
+	if (nfc_is_v21() && mtd->writesize == 4096)
+		this->ecc.layout = &nandv2_hw_eccoob_4k;
 
 	/* second phase scan */
 	if (nand_scan_tail(mtd)) {
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index a9c6ce7..85cfc06 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -42,6 +42,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/nand_ecc.h>
+#include <linux/mtd/nand_bch.h>
 #include <linux/interrupt.h>
 #include <linux/bitops.h>
 #include <linux/leds.h>
@@ -2377,7 +2378,7 @@
 		return -EINVAL;
 	}
 
-	/* Do not allow reads past end of device */
+	/* Do not allow write past end of device */
 	if (unlikely(to >= mtd->size ||
 		     ops->ooboffs + ops->ooblen >
 			((mtd->size >> chip->page_shift) -
@@ -3248,7 +3249,7 @@
 	/*
 	 * If no default placement scheme is given, select an appropriate one
 	 */
-	if (!chip->ecc.layout) {
+	if (!chip->ecc.layout && (chip->ecc.mode != NAND_ECC_SOFT_BCH)) {
 		switch (mtd->oobsize) {
 		case 8:
 			chip->ecc.layout = &nand_oob_8;
@@ -3351,6 +3352,40 @@
 		chip->ecc.bytes = 3;
 		break;
 
+	case NAND_ECC_SOFT_BCH:
+		if (!mtd_nand_has_bch()) {
+			printk(KERN_WARNING "CONFIG_MTD_ECC_BCH not enabled\n");
+			BUG();
+		}
+		chip->ecc.calculate = nand_bch_calculate_ecc;
+		chip->ecc.correct = nand_bch_correct_data;
+		chip->ecc.read_page = nand_read_page_swecc;
+		chip->ecc.read_subpage = nand_read_subpage;
+		chip->ecc.write_page = nand_write_page_swecc;
+		chip->ecc.read_page_raw = nand_read_page_raw;
+		chip->ecc.write_page_raw = nand_write_page_raw;
+		chip->ecc.read_oob = nand_read_oob_std;
+		chip->ecc.write_oob = nand_write_oob_std;
+		/*
+		 * Board driver should supply ecc.size and ecc.bytes values to
+		 * select how many bits are correctable; see nand_bch_init()
+		 * for details.
+		 * Otherwise, default to 4 bits for large page devices
+		 */
+		if (!chip->ecc.size && (mtd->oobsize >= 64)) {
+			chip->ecc.size = 512;
+			chip->ecc.bytes = 7;
+		}
+		chip->ecc.priv = nand_bch_init(mtd,
+					       chip->ecc.size,
+					       chip->ecc.bytes,
+					       &chip->ecc.layout);
+		if (!chip->ecc.priv) {
+			printk(KERN_WARNING "BCH ECC initialization failed!\n");
+			BUG();
+		}
+		break;
+
 	case NAND_ECC_NONE:
 		printk(KERN_WARNING "NAND_ECC_NONE selected by board driver. "
 		       "This is not recommended !!\n");
@@ -3501,6 +3536,9 @@
 {
 	struct nand_chip *chip = mtd->priv;
 
+	if (chip->ecc.mode == NAND_ECC_SOFT_BCH)
+		nand_bch_free((struct nand_bch_control *)chip->ecc.priv);
+
 #ifdef CONFIG_MTD_PARTITIONS
 	/* Deregister partitions */
 	del_mtd_partitions(mtd);
diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index 6ebd869..a1e8b30 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -1101,12 +1101,16 @@
 static void verify_bbt_descr(struct mtd_info *mtd, struct nand_bbt_descr *bd)
 {
 	struct nand_chip *this = mtd->priv;
-	u32 pattern_len = bd->len;
-	u32 bits = bd->options & NAND_BBT_NRBITS_MSK;
+	u32 pattern_len;
+	u32 bits;
 	u32 table_size;
 
 	if (!bd)
 		return;
+
+	pattern_len = bd->len;
+	bits = bd->options & NAND_BBT_NRBITS_MSK;
+
 	BUG_ON((this->options & NAND_USE_FLASH_BBT_NO_OOB) &&
 			!(this->options & NAND_USE_FLASH_BBT));
 	BUG_ON(!bits);
diff --git a/drivers/mtd/nand/nand_bch.c b/drivers/mtd/nand/nand_bch.c
new file mode 100644
index 0000000..0f931e7
--- /dev/null
+++ b/drivers/mtd/nand/nand_bch.c
@@ -0,0 +1,243 @@
+/*
+ * This file provides ECC correction for more than 1 bit per block of data,
+ * using binary BCH codes. It relies on the generic BCH library lib/bch.c.
+ *
+ * Copyright © 2011 Ivan Djelic <ivan.djelic@parrot.com>
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this file; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/nand_bch.h>
+#include <linux/bch.h>
+
+/**
+ * struct nand_bch_control - private NAND BCH control structure
+ * @bch:       BCH control structure
+ * @ecclayout: private ecc layout for this BCH configuration
+ * @errloc:    error location array
+ * @eccmask:   XOR ecc mask, allows erased pages to be decoded as valid
+ */
+struct nand_bch_control {
+	struct bch_control   *bch;
+	struct nand_ecclayout ecclayout;
+	unsigned int         *errloc;
+	unsigned char        *eccmask;
+};
+
+/**
+ * nand_bch_calculate_ecc - [NAND Interface] Calculate ECC for data block
+ * @mtd:	MTD block structure
+ * @buf:	input buffer with raw data
+ * @code:	output buffer with ECC
+ */
+int nand_bch_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf,
+			   unsigned char *code)
+{
+	const struct nand_chip *chip = mtd->priv;
+	struct nand_bch_control *nbc = chip->ecc.priv;
+	unsigned int i;
+
+	memset(code, 0, chip->ecc.bytes);
+	encode_bch(nbc->bch, buf, chip->ecc.size, code);
+
+	/* apply mask so that an erased page is a valid codeword */
+	for (i = 0; i < chip->ecc.bytes; i++)
+		code[i] ^= nbc->eccmask[i];
+
+	return 0;
+}
+EXPORT_SYMBOL(nand_bch_calculate_ecc);
+
+/**
+ * nand_bch_correct_data - [NAND Interface] Detect and correct bit error(s)
+ * @mtd:	MTD block structure
+ * @buf:	raw data read from the chip
+ * @read_ecc:	ECC from the chip
+ * @calc_ecc:	the ECC calculated from raw data
+ *
+ * Detect and correct bit errors for a data byte block
+ */
+int nand_bch_correct_data(struct mtd_info *mtd, unsigned char *buf,
+			  unsigned char *read_ecc, unsigned char *calc_ecc)
+{
+	const struct nand_chip *chip = mtd->priv;
+	struct nand_bch_control *nbc = chip->ecc.priv;
+	unsigned int *errloc = nbc->errloc;
+	int i, count;
+
+	count = decode_bch(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc,
+			   NULL, errloc);
+	if (count > 0) {
+		for (i = 0; i < count; i++) {
+			if (errloc[i] < (chip->ecc.size*8))
+				/* error is located in data, correct it */
+				buf[errloc[i] >> 3] ^= (1 << (errloc[i] & 7));
+			/* else error in ecc, no action needed */
+
+			DEBUG(MTD_DEBUG_LEVEL0, "%s: corrected bitflip %u\n",
+			      __func__, errloc[i]);
+		}
+	} else if (count < 0) {
+		printk(KERN_ERR "ecc unrecoverable error\n");
+		count = -1;
+	}
+	return count;
+}
+EXPORT_SYMBOL(nand_bch_correct_data);
+
+/**
+ * nand_bch_init - [NAND Interface] Initialize NAND BCH error correction
+ * @mtd:	MTD block structure
+ * @eccsize:	ecc block size in bytes
+ * @eccbytes:	ecc length in bytes
+ * @ecclayout:	output default layout
+ *
+ * Returns:
+ *  a pointer to a new NAND BCH control structure, or NULL upon failure
+ *
+ * Initialize NAND BCH error correction. Parameters @eccsize and @eccbytes
+ * are used to compute BCH parameters m (Galois field order) and t (error
+ * correction capability). @eccbytes should be equal to the number of bytes
+ * required to store m*t bits, where m is such that 2^m-1 > @eccsize*8.
+ *
+ * Example: to configure 4 bit correction per 512 bytes, you should pass
+ * @eccsize = 512  (thus, m=13 is the smallest integer such that 2^m-1 > 512*8)
+ * @eccbytes = 7   (7 bytes are required to store m*t = 13*4 = 52 bits)
+ */
+struct nand_bch_control *
+nand_bch_init(struct mtd_info *mtd, unsigned int eccsize, unsigned int eccbytes,
+	      struct nand_ecclayout **ecclayout)
+{
+	unsigned int m, t, eccsteps, i;
+	struct nand_ecclayout *layout;
+	struct nand_bch_control *nbc = NULL;
+	unsigned char *erased_page;
+
+	if (!eccsize || !eccbytes) {
+		printk(KERN_WARNING "ecc parameters not supplied\n");
+		goto fail;
+	}
+
+	m = fls(1+8*eccsize);
+	t = (eccbytes*8)/m;
+
+	nbc = kzalloc(sizeof(*nbc), GFP_KERNEL);
+	if (!nbc)
+		goto fail;
+
+	nbc->bch = init_bch(m, t, 0);
+	if (!nbc->bch)
+		goto fail;
+
+	/* verify that eccbytes has the expected value */
+	if (nbc->bch->ecc_bytes != eccbytes) {
+		printk(KERN_WARNING "invalid eccbytes %u, should be %u\n",
+		       eccbytes, nbc->bch->ecc_bytes);
+		goto fail;
+	}
+
+	eccsteps = mtd->writesize/eccsize;
+
+	/* if no ecc placement scheme was provided, build one */
+	if (!*ecclayout) {
+
+		/* handle large page devices only */
+		if (mtd->oobsize < 64) {
+			printk(KERN_WARNING "must provide an oob scheme for "
+			       "oobsize %d\n", mtd->oobsize);
+			goto fail;
+		}
+
+		layout = &nbc->ecclayout;
+		layout->eccbytes = eccsteps*eccbytes;
+
+		/* reserve 2 bytes for bad block marker */
+		if (layout->eccbytes+2 > mtd->oobsize) {
+			printk(KERN_WARNING "no suitable oob scheme available "
+			       "for oobsize %d eccbytes %u\n", mtd->oobsize,
+			       eccbytes);
+			goto fail;
+		}
+		/* put ecc bytes at oob tail */
+		for (i = 0; i < layout->eccbytes; i++)
+			layout->eccpos[i] = mtd->oobsize-layout->eccbytes+i;
+
+		layout->oobfree[0].offset = 2;
+		layout->oobfree[0].length = mtd->oobsize-2-layout->eccbytes;
+
+		*ecclayout = layout;
+	}
+
+	/* sanity checks */
+	if (8*(eccsize+eccbytes) >= (1 << m)) {
+		printk(KERN_WARNING "eccsize %u is too large\n", eccsize);
+		goto fail;
+	}
+	if ((*ecclayout)->eccbytes != (eccsteps*eccbytes)) {
+		printk(KERN_WARNING "invalid ecc layout\n");
+		goto fail;
+	}
+
+	nbc->eccmask = kmalloc(eccbytes, GFP_KERNEL);
+	nbc->errloc = kmalloc(t*sizeof(*nbc->errloc), GFP_KERNEL);
+	if (!nbc->eccmask || !nbc->errloc)
+		goto fail;
+	/*
+	 * compute and store the inverted ecc of an erased ecc block
+	 */
+	erased_page = kmalloc(eccsize, GFP_KERNEL);
+	if (!erased_page)
+		goto fail;
+
+	memset(erased_page, 0xff, eccsize);
+	memset(nbc->eccmask, 0, eccbytes);
+	encode_bch(nbc->bch, erased_page, eccsize, nbc->eccmask);
+	kfree(erased_page);
+
+	for (i = 0; i < eccbytes; i++)
+		nbc->eccmask[i] ^= 0xff;
+
+	return nbc;
+fail:
+	nand_bch_free(nbc);
+	return NULL;
+}
+EXPORT_SYMBOL(nand_bch_init);
+
+/**
+ * nand_bch_free - [NAND Interface] Release NAND BCH ECC resources
+ * @nbc:	NAND BCH control structure
+ */
+void nand_bch_free(struct nand_bch_control *nbc)
+{
+	if (nbc) {
+		free_bch(nbc->bch);
+		kfree(nbc->errloc);
+		kfree(nbc->eccmask);
+		kfree(nbc);
+	}
+}
+EXPORT_SYMBOL(nand_bch_free);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ivan Djelic <ivan.djelic@parrot.com>");
+MODULE_DESCRIPTION("NAND software BCH ECC support");
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index a5aa99f..213181b 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -34,6 +34,7 @@
 #include <linux/string.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
+#include <linux/mtd/nand_bch.h>
 #include <linux/mtd/partitions.h>
 #include <linux/delay.h>
 #include <linux/list.h>
@@ -108,6 +109,7 @@
 static unsigned int overridesize = 0;
 static char *cache_file = NULL;
 static unsigned int bbt;
+static unsigned int bch;
 
 module_param(first_id_byte,  uint, 0400);
 module_param(second_id_byte, uint, 0400);
@@ -132,6 +134,7 @@
 module_param(overridesize,   uint, 0400);
 module_param(cache_file,     charp, 0400);
 module_param(bbt,	     uint, 0400);
+module_param(bch,	     uint, 0400);
 
 MODULE_PARM_DESC(first_id_byte,  "The first byte returned by NAND Flash 'read ID' command (manufacturer ID)");
 MODULE_PARM_DESC(second_id_byte, "The second byte returned by NAND Flash 'read ID' command (chip ID)");
@@ -165,6 +168,8 @@
 				 " e.g. 5 means a size of 32 erase blocks");
 MODULE_PARM_DESC(cache_file,     "File to use to cache nand pages instead of memory");
 MODULE_PARM_DESC(bbt,		 "0 OOB, 1 BBT with marker in OOB, 2 BBT with marker in data area");
+MODULE_PARM_DESC(bch,		 "Enable BCH ecc and set how many bits should "
+				 "be correctable in 512-byte blocks");
 
 /* The largest possible page size */
 #define NS_LARGEST_PAGE_SIZE	4096
@@ -2309,7 +2314,43 @@
 	if ((retval = parse_gravepages()) != 0)
 		goto error;
 
-	if ((retval = nand_scan(nsmtd, 1)) != 0) {
+	retval = nand_scan_ident(nsmtd, 1, NULL);
+	if (retval) {
+		NS_ERR("cannot scan NAND Simulator device\n");
+		if (retval > 0)
+			retval = -ENXIO;
+		goto error;
+	}
+
+	if (bch) {
+		unsigned int eccsteps, eccbytes;
+		if (!mtd_nand_has_bch()) {
+			NS_ERR("BCH ECC support is disabled\n");
+			retval = -EINVAL;
+			goto error;
+		}
+		/* use 512-byte ecc blocks */
+		eccsteps = nsmtd->writesize/512;
+		eccbytes = (bch*13+7)/8;
+		/* do not bother supporting small page devices */
+		if ((nsmtd->oobsize < 64) || !eccsteps) {
+			NS_ERR("bch not available on small page devices\n");
+			retval = -EINVAL;
+			goto error;
+		}
+		if ((eccbytes*eccsteps+2) > nsmtd->oobsize) {
+			NS_ERR("invalid bch value %u\n", bch);
+			retval = -EINVAL;
+			goto error;
+		}
+		chip->ecc.mode = NAND_ECC_SOFT_BCH;
+		chip->ecc.size = 512;
+		chip->ecc.bytes = eccbytes;
+		NS_INFO("using %u-bit/%u bytes BCH ECC\n", bch, chip->ecc.size);
+	}
+
+	retval = nand_scan_tail(nsmtd);
+	if (retval) {
 		NS_ERR("can't register NAND Simulator\n");
 		if (retval > 0)
 			retval = -ENXIO;
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 7b8f1ff..da9a351 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -668,6 +668,8 @@
  *
  * This function compares two ECC's and indicates if there is an error.
  * If the error can be corrected it will be corrected to the buffer.
+ * If there is no error, %0 is returned. If there is an error but it
+ * was corrected, %1 is returned. Otherwise, %-1 is returned.
  */
 static int omap_compare_ecc(u8 *ecc_data1,	/* read from NAND memory */
 			    u8 *ecc_data2,	/* read from register */
@@ -773,7 +775,7 @@
 
 		page_data[find_byte] ^= (1 << find_bit);
 
-		return 0;
+		return 1;
 	default:
 		if (isEccFF) {
 			if (ecc_data2[0] == 0 &&
@@ -794,8 +796,11 @@
  * @calc_ecc: ecc read from HW ECC registers
  *
  * Compares the ecc read from nand spare area with ECC registers values
- * and if ECC's mismached, it will call 'omap_compare_ecc' for error detection
- * and correction.
+ * and if ECC's mismatched, it will call 'omap_compare_ecc' for error
+ * detection and correction. If there are no errors, %0 is returned. If
+ * there were errors and all of the errors were corrected, the number of
+ * corrected errors is returned. If uncorrectable errors exist, %-1 is
+ * returned.
  */
 static int omap_correct_data(struct mtd_info *mtd, u_char *dat,
 				u_char *read_ecc, u_char *calc_ecc)
@@ -803,6 +808,7 @@
 	struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
 							mtd);
 	int blockCnt = 0, i = 0, ret = 0;
+	int stat = 0;
 
 	/* Ex NAND_ECC_HW12_2048 */
 	if ((info->nand.ecc.mode == NAND_ECC_HW) &&
@@ -816,12 +822,14 @@
 			ret = omap_compare_ecc(read_ecc, calc_ecc, dat);
 			if (ret < 0)
 				return ret;
+			/* keep track of the number of corrected errors */
+			stat += ret;
 		}
 		read_ecc += 3;
 		calc_ecc += 3;
 		dat      += 512;
 	}
-	return 0;
+	return stat;
 }
 
 /**
diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c
index ea2c288..ab7f4c3 100644
--- a/drivers/mtd/nand/pxa3xx_nand.c
+++ b/drivers/mtd/nand/pxa3xx_nand.c
@@ -27,6 +27,8 @@
 #include <plat/pxa3xx_nand.h>
 
 #define	CHIP_DELAY_TIMEOUT	(2 * HZ/10)
+#define NAND_STOP_DELAY		(2 * HZ/50)
+#define PAGE_CHUNK_SIZE		(2048)
 
 /* registers and bit definitions */
 #define NDCR		(0x00) /* Control register */
@@ -52,16 +54,18 @@
 #define NDCR_ND_MODE		(0x3 << 21)
 #define NDCR_NAND_MODE   	(0x0)
 #define NDCR_CLR_PG_CNT		(0x1 << 20)
-#define NDCR_CLR_ECC		(0x1 << 19)
+#define NDCR_STOP_ON_UNCOR	(0x1 << 19)
 #define NDCR_RD_ID_CNT_MASK	(0x7 << 16)
 #define NDCR_RD_ID_CNT(x)	(((x) << 16) & NDCR_RD_ID_CNT_MASK)
 
 #define NDCR_RA_START		(0x1 << 15)
 #define NDCR_PG_PER_BLK		(0x1 << 14)
 #define NDCR_ND_ARB_EN		(0x1 << 12)
+#define NDCR_INT_MASK           (0xFFF)
 
 #define NDSR_MASK		(0xfff)
-#define NDSR_RDY		(0x1 << 11)
+#define NDSR_RDY                (0x1 << 12)
+#define NDSR_FLASH_RDY          (0x1 << 11)
 #define NDSR_CS0_PAGED		(0x1 << 10)
 #define NDSR_CS1_PAGED		(0x1 << 9)
 #define NDSR_CS0_CMDD		(0x1 << 8)
@@ -74,6 +78,7 @@
 #define NDSR_RDDREQ		(0x1 << 1)
 #define NDSR_WRCMDREQ		(0x1)
 
+#define NDCB0_ST_ROW_EN         (0x1 << 26)
 #define NDCB0_AUTO_RS		(0x1 << 25)
 #define NDCB0_CSEL		(0x1 << 24)
 #define NDCB0_CMD_TYPE_MASK	(0x7 << 21)
@@ -104,18 +109,21 @@
 };
 
 enum {
-	STATE_READY	= 0,
+	STATE_IDLE = 0,
 	STATE_CMD_HANDLE,
 	STATE_DMA_READING,
 	STATE_DMA_WRITING,
 	STATE_DMA_DONE,
 	STATE_PIO_READING,
 	STATE_PIO_WRITING,
+	STATE_CMD_DONE,
+	STATE_READY,
 };
 
 struct pxa3xx_nand_info {
 	struct nand_chip	nand_chip;
 
+	struct nand_hw_control	controller;
 	struct platform_device	 *pdev;
 	struct pxa3xx_nand_cmdset *cmdset;
 
@@ -126,6 +134,7 @@
 	unsigned int 		buf_start;
 	unsigned int		buf_count;
 
+	struct mtd_info         *mtd;
 	/* DMA information */
 	int			drcmr_dat;
 	int			drcmr_cmd;
@@ -149,6 +158,7 @@
 
 	int			use_ecc;	/* use HW ECC ? */
 	int			use_dma;	/* use DMA ? */
+	int			is_ready;
 
 	unsigned int		page_size;	/* page size of attached chip */
 	unsigned int		data_size;	/* data size in FIFO */
@@ -201,20 +211,22 @@
 };
 
 static struct pxa3xx_nand_flash builtin_flash_types[] = {
-	{      0,   0, 2048,  8,  8,    0, &default_cmdset, &timing[0] },
-	{ 0x46ec,  32,  512, 16, 16, 4096, &default_cmdset, &timing[1] },
-	{ 0xdaec,  64, 2048,  8,  8, 2048, &default_cmdset, &timing[1] },
-	{ 0xd7ec, 128, 4096,  8,  8, 8192, &default_cmdset, &timing[1] },
-	{ 0xa12c,  64, 2048,  8,  8, 1024, &default_cmdset, &timing[2] },
-	{ 0xb12c,  64, 2048, 16, 16, 1024, &default_cmdset, &timing[2] },
-	{ 0xdc2c,  64, 2048,  8,  8, 4096, &default_cmdset, &timing[2] },
-	{ 0xcc2c,  64, 2048, 16, 16, 4096, &default_cmdset, &timing[2] },
-	{ 0xba20,  64, 2048, 16, 16, 2048, &default_cmdset, &timing[3] },
+{ "DEFAULT FLASH",      0,   0, 2048,  8,  8,    0, &timing[0] },
+{ "64MiB 16-bit",  0x46ec,  32,  512, 16, 16, 4096, &timing[1] },
+{ "256MiB 8-bit",  0xdaec,  64, 2048,  8,  8, 2048, &timing[1] },
+{ "4GiB 8-bit",    0xd7ec, 128, 4096,  8,  8, 8192, &timing[1] },
+{ "128MiB 8-bit",  0xa12c,  64, 2048,  8,  8, 1024, &timing[2] },
+{ "128MiB 16-bit", 0xb12c,  64, 2048, 16, 16, 1024, &timing[2] },
+{ "512MiB 8-bit",  0xdc2c,  64, 2048,  8,  8, 4096, &timing[2] },
+{ "512MiB 16-bit", 0xcc2c,  64, 2048, 16, 16, 4096, &timing[2] },
+{ "256MiB 16-bit", 0xba20,  64, 2048, 16, 16, 2048, &timing[3] },
 };
 
 /* Define a default flash type setting serve as flash detecting only */
 #define DEFAULT_FLASH_TYPE (&builtin_flash_types[0])
 
+const char *mtd_names[] = {"pxa3xx_nand-0", NULL};
+
 #define NDTR0_tCH(c)	(min((c), 7) << 19)
 #define NDTR0_tCS(c)	(min((c), 7) << 16)
 #define NDTR0_tWH(c)	(min((c), 7) << 11)
@@ -252,25 +264,6 @@
 	nand_writel(info, NDTR1CS0, ndtr1);
 }
 
-#define WAIT_EVENT_TIMEOUT	10
-
-static int wait_for_event(struct pxa3xx_nand_info *info, uint32_t event)
-{
-	int timeout = WAIT_EVENT_TIMEOUT;
-	uint32_t ndsr;
-
-	while (timeout--) {
-		ndsr = nand_readl(info, NDSR) & NDSR_MASK;
-		if (ndsr & event) {
-			nand_writel(info, NDSR, ndsr);
-			return 0;
-		}
-		udelay(10);
-	}
-
-	return -ETIMEDOUT;
-}
-
 static void pxa3xx_set_datasize(struct pxa3xx_nand_info *info)
 {
 	int oob_enable = info->reg_ndcr & NDCR_SPARE_EN;
@@ -291,69 +284,45 @@
 	}
 }
 
-static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info,
-		uint16_t cmd, int column, int page_addr)
+/**
+ * NOTE: it is a must to set ND_RUN firstly, then write
+ * command buffer, otherwise, it does not work.
+ * We enable all the interrupt at the same time, and
+ * let pxa3xx_nand_irq to handle all logic.
+ */
+static void pxa3xx_nand_start(struct pxa3xx_nand_info *info)
 {
-	const struct pxa3xx_nand_cmdset *cmdset = info->cmdset;
-	pxa3xx_set_datasize(info);
+	uint32_t ndcr;
 
-	/* generate values for NDCBx registers */
-	info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0);
-	info->ndcb1 = 0;
-	info->ndcb2 = 0;
-	info->ndcb0 |= NDCB0_ADDR_CYC(info->row_addr_cycles + info->col_addr_cycles);
+	ndcr = info->reg_ndcr;
+	ndcr |= info->use_ecc ? NDCR_ECC_EN : 0;
+	ndcr |= info->use_dma ? NDCR_DMA_EN : 0;
+	ndcr |= NDCR_ND_RUN;
 
-	if (info->col_addr_cycles == 2) {
-		/* large block, 2 cycles for column address
-		 * row address starts from 3rd cycle
-		 */
-		info->ndcb1 |= page_addr << 16;
-		if (info->row_addr_cycles == 3)
-			info->ndcb2 = (page_addr >> 16) & 0xff;
-	} else
-		/* small block, 1 cycles for column address
-		 * row address starts from 2nd cycle
-		 */
-		info->ndcb1 = page_addr << 8;
-
-	if (cmd == cmdset->program)
-		info->ndcb0 |= NDCB0_CMD_TYPE(1) | NDCB0_AUTO_RS;
-
-	return 0;
+	/* clear status bits and run */
+	nand_writel(info, NDCR, 0);
+	nand_writel(info, NDSR, NDSR_MASK);
+	nand_writel(info, NDCR, ndcr);
 }
 
-static int prepare_erase_cmd(struct pxa3xx_nand_info *info,
-			uint16_t cmd, int page_addr)
+static void pxa3xx_nand_stop(struct pxa3xx_nand_info *info)
 {
-	info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0);
-	info->ndcb0 |= NDCB0_CMD_TYPE(2) | NDCB0_AUTO_RS | NDCB0_ADDR_CYC(3);
-	info->ndcb1 = page_addr;
-	info->ndcb2 = 0;
-	return 0;
-}
+	uint32_t ndcr;
+	int timeout = NAND_STOP_DELAY;
 
-static int prepare_other_cmd(struct pxa3xx_nand_info *info, uint16_t cmd)
-{
-	const struct pxa3xx_nand_cmdset *cmdset = info->cmdset;
+	/* wait RUN bit in NDCR become 0 */
+	ndcr = nand_readl(info, NDCR);
+	while ((ndcr & NDCR_ND_RUN) && (timeout-- > 0)) {
+		ndcr = nand_readl(info, NDCR);
+		udelay(1);
+	}
 
-	info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0);
-	info->ndcb1 = 0;
-	info->ndcb2 = 0;
-
-	info->oob_size = 0;
-	if (cmd == cmdset->read_id) {
-		info->ndcb0 |= NDCB0_CMD_TYPE(3);
-		info->data_size = 8;
-	} else if (cmd == cmdset->read_status) {
-		info->ndcb0 |= NDCB0_CMD_TYPE(4);
-		info->data_size = 8;
-	} else if (cmd == cmdset->reset || cmd == cmdset->lock ||
-		   cmd == cmdset->unlock) {
-		info->ndcb0 |= NDCB0_CMD_TYPE(5);
-	} else
-		return -EINVAL;
-
-	return 0;
+	if (timeout <= 0) {
+		ndcr &= ~NDCR_ND_RUN;
+		nand_writel(info, NDCR, ndcr);
+	}
+	/* clear status bits */
+	nand_writel(info, NDSR, NDSR_MASK);
 }
 
 static void enable_int(struct pxa3xx_nand_info *info, uint32_t int_mask)
@@ -372,39 +341,8 @@
 	nand_writel(info, NDCR, ndcr | int_mask);
 }
 
-/* NOTE: it is a must to set ND_RUN firstly, then write command buffer
- * otherwise, it does not work
- */
-static int write_cmd(struct pxa3xx_nand_info *info)
+static void handle_data_pio(struct pxa3xx_nand_info *info)
 {
-	uint32_t ndcr;
-
-	/* clear status bits and run */
-	nand_writel(info, NDSR, NDSR_MASK);
-
-	ndcr = info->reg_ndcr;
-
-	ndcr |= info->use_ecc ? NDCR_ECC_EN : 0;
-	ndcr |= info->use_dma ? NDCR_DMA_EN : 0;
-	ndcr |= NDCR_ND_RUN;
-
-	nand_writel(info, NDCR, ndcr);
-
-	if (wait_for_event(info, NDSR_WRCMDREQ)) {
-		printk(KERN_ERR "timed out writing command\n");
-		return -ETIMEDOUT;
-	}
-
-	nand_writel(info, NDCB0, info->ndcb0);
-	nand_writel(info, NDCB0, info->ndcb1);
-	nand_writel(info, NDCB0, info->ndcb2);
-	return 0;
-}
-
-static int handle_data_pio(struct pxa3xx_nand_info *info)
-{
-	int ret, timeout = CHIP_DELAY_TIMEOUT;
-
 	switch (info->state) {
 	case STATE_PIO_WRITING:
 		__raw_writesl(info->mmio_base + NDDB, info->data_buff,
@@ -412,14 +350,6 @@
 		if (info->oob_size > 0)
 			__raw_writesl(info->mmio_base + NDDB, info->oob_buff,
 					DIV_ROUND_UP(info->oob_size, 4));
-
-		enable_int(info, NDSR_CS0_BBD | NDSR_CS0_CMDD);
-
-		ret = wait_for_completion_timeout(&info->cmd_complete, timeout);
-		if (!ret) {
-			printk(KERN_ERR "program command time out\n");
-			return -1;
-		}
 		break;
 	case STATE_PIO_READING:
 		__raw_readsl(info->mmio_base + NDDB, info->data_buff,
@@ -431,14 +361,11 @@
 	default:
 		printk(KERN_ERR "%s: invalid state %d\n", __func__,
 				info->state);
-		return -EINVAL;
+		BUG();
 	}
-
-	info->state = STATE_READY;
-	return 0;
 }
 
-static void start_data_dma(struct pxa3xx_nand_info *info, int dir_out)
+static void start_data_dma(struct pxa3xx_nand_info *info)
 {
 	struct pxa_dma_desc *desc = info->data_desc;
 	int dma_len = ALIGN(info->data_size + info->oob_size, 32);
@@ -446,14 +373,21 @@
 	desc->ddadr = DDADR_STOP;
 	desc->dcmd = DCMD_ENDIRQEN | DCMD_WIDTH4 | DCMD_BURST32 | dma_len;
 
-	if (dir_out) {
+	switch (info->state) {
+	case STATE_DMA_WRITING:
 		desc->dsadr = info->data_buff_phys;
 		desc->dtadr = info->mmio_phys + NDDB;
 		desc->dcmd |= DCMD_INCSRCADDR | DCMD_FLOWTRG;
-	} else {
+		break;
+	case STATE_DMA_READING:
 		desc->dtadr = info->data_buff_phys;
 		desc->dsadr = info->mmio_phys + NDDB;
 		desc->dcmd |= DCMD_INCTRGADDR | DCMD_FLOWSRC;
+		break;
+	default:
+		printk(KERN_ERR "%s: invalid state %d\n", __func__,
+				info->state);
+		BUG();
 	}
 
 	DRCMR(info->drcmr_dat) = DRCMR_MAPVLD | info->data_dma_ch;
@@ -471,95 +405,64 @@
 
 	if (dcsr & DCSR_BUSERR) {
 		info->retcode = ERR_DMABUSERR;
-		complete(&info->cmd_complete);
 	}
 
-	if (info->state == STATE_DMA_WRITING) {
-		info->state = STATE_DMA_DONE;
-		enable_int(info, NDSR_CS0_BBD | NDSR_CS0_CMDD);
-	} else {
-		info->state = STATE_READY;
-		complete(&info->cmd_complete);
-	}
+	info->state = STATE_DMA_DONE;
+	enable_int(info, NDCR_INT_MASK);
+	nand_writel(info, NDSR, NDSR_WRDREQ | NDSR_RDDREQ);
 }
 
 static irqreturn_t pxa3xx_nand_irq(int irq, void *devid)
 {
 	struct pxa3xx_nand_info *info = devid;
-	unsigned int status;
+	unsigned int status, is_completed = 0;
 
 	status = nand_readl(info, NDSR);
 
-	if (status & (NDSR_RDDREQ | NDSR_DBERR | NDSR_SBERR)) {
-		if (status & NDSR_DBERR)
-			info->retcode = ERR_DBERR;
-		else if (status & NDSR_SBERR)
-			info->retcode = ERR_SBERR;
-
-		disable_int(info, NDSR_RDDREQ | NDSR_DBERR | NDSR_SBERR);
-
+	if (status & NDSR_DBERR)
+		info->retcode = ERR_DBERR;
+	if (status & NDSR_SBERR)
+		info->retcode = ERR_SBERR;
+	if (status & (NDSR_RDDREQ | NDSR_WRDREQ)) {
+		/* whether use dma to transfer data */
 		if (info->use_dma) {
-			info->state = STATE_DMA_READING;
-			start_data_dma(info, 0);
+			disable_int(info, NDCR_INT_MASK);
+			info->state = (status & NDSR_RDDREQ) ?
+				      STATE_DMA_READING : STATE_DMA_WRITING;
+			start_data_dma(info);
+			goto NORMAL_IRQ_EXIT;
 		} else {
-			info->state = STATE_PIO_READING;
-			complete(&info->cmd_complete);
+			info->state = (status & NDSR_RDDREQ) ?
+				      STATE_PIO_READING : STATE_PIO_WRITING;
+			handle_data_pio(info);
 		}
-	} else if (status & NDSR_WRDREQ) {
-		disable_int(info, NDSR_WRDREQ);
-		if (info->use_dma) {
-			info->state = STATE_DMA_WRITING;
-			start_data_dma(info, 1);
-		} else {
-			info->state = STATE_PIO_WRITING;
-			complete(&info->cmd_complete);
-		}
-	} else if (status & (NDSR_CS0_BBD | NDSR_CS0_CMDD)) {
-		if (status & NDSR_CS0_BBD)
-			info->retcode = ERR_BBERR;
-
-		disable_int(info, NDSR_CS0_BBD | NDSR_CS0_CMDD);
+	}
+	if (status & NDSR_CS0_CMDD) {
+		info->state = STATE_CMD_DONE;
+		is_completed = 1;
+	}
+	if (status & NDSR_FLASH_RDY) {
+		info->is_ready = 1;
 		info->state = STATE_READY;
-		complete(&info->cmd_complete);
 	}
+
+	if (status & NDSR_WRCMDREQ) {
+		nand_writel(info, NDSR, NDSR_WRCMDREQ);
+		status &= ~NDSR_WRCMDREQ;
+		info->state = STATE_CMD_HANDLE;
+		nand_writel(info, NDCB0, info->ndcb0);
+		nand_writel(info, NDCB0, info->ndcb1);
+		nand_writel(info, NDCB0, info->ndcb2);
+	}
+
+	/* clear NDSR to let the controller exit the IRQ */
 	nand_writel(info, NDSR, status);
+	if (is_completed)
+		complete(&info->cmd_complete);
+NORMAL_IRQ_EXIT:
 	return IRQ_HANDLED;
 }
 
-static int pxa3xx_nand_do_cmd(struct pxa3xx_nand_info *info, uint32_t event)
-{
-	uint32_t ndcr;
-	int ret, timeout = CHIP_DELAY_TIMEOUT;
-
-	if (write_cmd(info)) {
-		info->retcode = ERR_SENDCMD;
-		goto fail_stop;
-	}
-
-	info->state = STATE_CMD_HANDLE;
-
-	enable_int(info, event);
-
-	ret = wait_for_completion_timeout(&info->cmd_complete, timeout);
-	if (!ret) {
-		printk(KERN_ERR "command execution timed out\n");
-		info->retcode = ERR_SENDCMD;
-		goto fail_stop;
-	}
-
-	if (info->use_dma == 0 && info->data_size > 0)
-		if (handle_data_pio(info))
-			goto fail_stop;
-
-	return 0;
-
-fail_stop:
-	ndcr = nand_readl(info, NDCR);
-	nand_writel(info, NDCR, ndcr & ~NDCR_ND_RUN);
-	udelay(10);
-	return -ETIMEDOUT;
-}
-
 static int pxa3xx_nand_dev_ready(struct mtd_info *mtd)
 {
 	struct pxa3xx_nand_info *info = mtd->priv;
@@ -574,125 +477,218 @@
 	return 1;
 }
 
+static int prepare_command_pool(struct pxa3xx_nand_info *info, int command,
+		uint16_t column, int page_addr)
+{
+	uint16_t cmd;
+	int addr_cycle, exec_cmd, ndcb0;
+	struct mtd_info *mtd = info->mtd;
+
+	ndcb0 = 0;
+	addr_cycle = 0;
+	exec_cmd = 1;
+
+	/* reset data and oob column point to handle data */
+	info->buf_start		= 0;
+	info->buf_count		= 0;
+	info->oob_size		= 0;
+	info->use_ecc		= 0;
+	info->is_ready		= 0;
+	info->retcode		= ERR_NONE;
+
+	switch (command) {
+	case NAND_CMD_READ0:
+	case NAND_CMD_PAGEPROG:
+		info->use_ecc = 1;
+	case NAND_CMD_READOOB:
+		pxa3xx_set_datasize(info);
+		break;
+	case NAND_CMD_SEQIN:
+		exec_cmd = 0;
+		break;
+	default:
+		info->ndcb1 = 0;
+		info->ndcb2 = 0;
+		break;
+	}
+
+	info->ndcb0 = ndcb0;
+	addr_cycle = NDCB0_ADDR_CYC(info->row_addr_cycles
+				    + info->col_addr_cycles);
+
+	switch (command) {
+	case NAND_CMD_READOOB:
+	case NAND_CMD_READ0:
+		cmd = info->cmdset->read1;
+		if (command == NAND_CMD_READOOB)
+			info->buf_start = mtd->writesize + column;
+		else
+			info->buf_start = column;
+
+		if (unlikely(info->page_size < PAGE_CHUNK_SIZE))
+			info->ndcb0 |= NDCB0_CMD_TYPE(0)
+					| addr_cycle
+					| (cmd & NDCB0_CMD1_MASK);
+		else
+			info->ndcb0 |= NDCB0_CMD_TYPE(0)
+					| NDCB0_DBC
+					| addr_cycle
+					| cmd;
+
+	case NAND_CMD_SEQIN:
+		/* small page addr setting */
+		if (unlikely(info->page_size < PAGE_CHUNK_SIZE)) {
+			info->ndcb1 = ((page_addr & 0xFFFFFF) << 8)
+					| (column & 0xFF);
+
+			info->ndcb2 = 0;
+		} else {
+			info->ndcb1 = ((page_addr & 0xFFFF) << 16)
+					| (column & 0xFFFF);
+
+			if (page_addr & 0xFF0000)
+				info->ndcb2 = (page_addr & 0xFF0000) >> 16;
+			else
+				info->ndcb2 = 0;
+		}
+
+		info->buf_count = mtd->writesize + mtd->oobsize;
+		memset(info->data_buff, 0xFF, info->buf_count);
+
+		break;
+
+	case NAND_CMD_PAGEPROG:
+		if (is_buf_blank(info->data_buff,
+					(mtd->writesize + mtd->oobsize))) {
+			exec_cmd = 0;
+			break;
+		}
+
+		cmd = info->cmdset->program;
+		info->ndcb0 |= NDCB0_CMD_TYPE(0x1)
+				| NDCB0_AUTO_RS
+				| NDCB0_ST_ROW_EN
+				| NDCB0_DBC
+				| cmd
+				| addr_cycle;
+		break;
+
+	case NAND_CMD_READID:
+		cmd = info->cmdset->read_id;
+		info->buf_count = info->read_id_bytes;
+		info->ndcb0 |= NDCB0_CMD_TYPE(3)
+				| NDCB0_ADDR_CYC(1)
+				| cmd;
+
+		info->data_size = 8;
+		break;
+	case NAND_CMD_STATUS:
+		cmd = info->cmdset->read_status;
+		info->buf_count = 1;
+		info->ndcb0 |= NDCB0_CMD_TYPE(4)
+				| NDCB0_ADDR_CYC(1)
+				| cmd;
+
+		info->data_size = 8;
+		break;
+
+	case NAND_CMD_ERASE1:
+		cmd = info->cmdset->erase;
+		info->ndcb0 |= NDCB0_CMD_TYPE(2)
+				| NDCB0_AUTO_RS
+				| NDCB0_ADDR_CYC(3)
+				| NDCB0_DBC
+				| cmd;
+		info->ndcb1 = page_addr;
+		info->ndcb2 = 0;
+
+		break;
+	case NAND_CMD_RESET:
+		cmd = info->cmdset->reset;
+		info->ndcb0 |= NDCB0_CMD_TYPE(5)
+				| cmd;
+
+		break;
+
+	case NAND_CMD_ERASE2:
+		exec_cmd = 0;
+		break;
+
+	default:
+		exec_cmd = 0;
+		printk(KERN_ERR "pxa3xx-nand: non-supported"
+			" command %x\n", command);
+		break;
+	}
+
+	return exec_cmd;
+}
+
 static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command,
 				int column, int page_addr)
 {
 	struct pxa3xx_nand_info *info = mtd->priv;
-	const struct pxa3xx_nand_cmdset *cmdset = info->cmdset;
-	int ret;
+	int ret, exec_cmd;
 
-	info->use_dma = (use_dma) ? 1 : 0;
-	info->use_ecc = 0;
-	info->data_size = 0;
-	info->state = STATE_READY;
+	/*
+	 * if this is a x16 device ,then convert the input
+	 * "byte" address into a "word" address appropriate
+	 * for indexing a word-oriented device
+	 */
+	if (info->reg_ndcr & NDCR_DWIDTH_M)
+		column /= 2;
 
-	init_completion(&info->cmd_complete);
+	exec_cmd = prepare_command_pool(info, command, column, page_addr);
+	if (exec_cmd) {
+		init_completion(&info->cmd_complete);
+		pxa3xx_nand_start(info);
 
-	switch (command) {
-	case NAND_CMD_READOOB:
-		/* disable HW ECC to get all the OOB data */
-		info->buf_count = mtd->writesize + mtd->oobsize;
-		info->buf_start = mtd->writesize + column;
-		memset(info->data_buff, 0xFF, info->buf_count);
-
-		if (prepare_read_prog_cmd(info, cmdset->read1, column, page_addr))
-			break;
-
-		pxa3xx_nand_do_cmd(info, NDSR_RDDREQ | NDSR_DBERR | NDSR_SBERR);
-
-		/* We only are OOB, so if the data has error, does not matter */
-		if (info->retcode == ERR_DBERR)
-			info->retcode = ERR_NONE;
-		break;
-
-	case NAND_CMD_READ0:
-		info->use_ecc = 1;
-		info->retcode = ERR_NONE;
-		info->buf_start = column;
-		info->buf_count = mtd->writesize + mtd->oobsize;
-		memset(info->data_buff, 0xFF, info->buf_count);
-
-		if (prepare_read_prog_cmd(info, cmdset->read1, column, page_addr))
-			break;
-
-		pxa3xx_nand_do_cmd(info, NDSR_RDDREQ | NDSR_DBERR | NDSR_SBERR);
-
-		if (info->retcode == ERR_DBERR) {
-			/* for blank page (all 0xff), HW will calculate its ECC as
-			 * 0, which is different from the ECC information within
-			 * OOB, ignore such double bit errors
-			 */
-			if (is_buf_blank(info->data_buff, mtd->writesize))
-				info->retcode = ERR_NONE;
+		ret = wait_for_completion_timeout(&info->cmd_complete,
+				CHIP_DELAY_TIMEOUT);
+		if (!ret) {
+			printk(KERN_ERR "Wait time out!!!\n");
+			/* Stop State Machine for next command cycle */
+			pxa3xx_nand_stop(info);
 		}
-		break;
-	case NAND_CMD_SEQIN:
-		info->buf_start = column;
-		info->buf_count = mtd->writesize + mtd->oobsize;
-		memset(info->data_buff, 0xff, info->buf_count);
+		info->state = STATE_IDLE;
+	}
+}
 
-		/* save column/page_addr for next CMD_PAGEPROG */
-		info->seqin_column = column;
-		info->seqin_page_addr = page_addr;
-		break;
-	case NAND_CMD_PAGEPROG:
-		info->use_ecc = (info->seqin_column >= mtd->writesize) ? 0 : 1;
+static void pxa3xx_nand_write_page_hwecc(struct mtd_info *mtd,
+		struct nand_chip *chip, const uint8_t *buf)
+{
+	chip->write_buf(mtd, buf, mtd->writesize);
+	chip->write_buf(mtd, chip->oob_poi, mtd->oobsize);
+}
 
-		if (prepare_read_prog_cmd(info, cmdset->program,
-				info->seqin_column, info->seqin_page_addr))
+static int pxa3xx_nand_read_page_hwecc(struct mtd_info *mtd,
+		struct nand_chip *chip, uint8_t *buf, int page)
+{
+	struct pxa3xx_nand_info *info = mtd->priv;
+
+	chip->read_buf(mtd, buf, mtd->writesize);
+	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
+
+	if (info->retcode == ERR_SBERR) {
+		switch (info->use_ecc) {
+		case 1:
+			mtd->ecc_stats.corrected++;
 			break;
-
-		pxa3xx_nand_do_cmd(info, NDSR_WRDREQ);
-		break;
-	case NAND_CMD_ERASE1:
-		if (prepare_erase_cmd(info, cmdset->erase, page_addr))
+		case 0:
+		default:
 			break;
-
-		pxa3xx_nand_do_cmd(info, NDSR_CS0_BBD | NDSR_CS0_CMDD);
-		break;
-	case NAND_CMD_ERASE2:
-		break;
-	case NAND_CMD_READID:
-	case NAND_CMD_STATUS:
-		info->use_dma = 0;	/* force PIO read */
-		info->buf_start = 0;
-		info->buf_count = (command == NAND_CMD_READID) ?
-				info->read_id_bytes : 1;
-
-		if (prepare_other_cmd(info, (command == NAND_CMD_READID) ?
-				cmdset->read_id : cmdset->read_status))
-			break;
-
-		pxa3xx_nand_do_cmd(info, NDSR_RDDREQ);
-		break;
-	case NAND_CMD_RESET:
-		if (prepare_other_cmd(info, cmdset->reset))
-			break;
-
-		ret = pxa3xx_nand_do_cmd(info, NDSR_CS0_CMDD);
-		if (ret == 0) {
-			int timeout = 2;
-			uint32_t ndcr;
-
-			while (timeout--) {
-				if (nand_readl(info, NDSR) & NDSR_RDY)
-					break;
-				msleep(10);
-			}
-
-			ndcr = nand_readl(info, NDCR);
-			nand_writel(info, NDCR, ndcr & ~NDCR_ND_RUN);
 		}
-		break;
-	default:
-		printk(KERN_ERR "non-supported command.\n");
-		break;
+	} else if (info->retcode == ERR_DBERR) {
+		/*
+		 * for blank page (all 0xff), HW will calculate its ECC as
+		 * 0, which is different from the ECC information within
+		 * OOB, ignore such double bit errors
+		 */
+		if (is_buf_blank(buf, mtd->writesize))
+			mtd->ecc_stats.failed++;
 	}
 
-	if (info->retcode == ERR_DBERR) {
-		printk(KERN_ERR "double bit error @ page %08x\n", page_addr);
-		info->retcode = ERR_NONE;
-	}
+	return 0;
 }
 
 static uint8_t pxa3xx_nand_read_byte(struct mtd_info *mtd)
@@ -769,73 +765,12 @@
 	return 0;
 }
 
-static void pxa3xx_nand_ecc_hwctl(struct mtd_info *mtd, int mode)
-{
-	return;
-}
-
-static int pxa3xx_nand_ecc_calculate(struct mtd_info *mtd,
-		const uint8_t *dat, uint8_t *ecc_code)
-{
-	return 0;
-}
-
-static int pxa3xx_nand_ecc_correct(struct mtd_info *mtd,
-		uint8_t *dat, uint8_t *read_ecc, uint8_t *calc_ecc)
-{
-	struct pxa3xx_nand_info *info = mtd->priv;
-	/*
-	 * Any error include ERR_SEND_CMD, ERR_DBERR, ERR_BUSERR, we
-	 * consider it as a ecc error which will tell the caller the
-	 * read fail We have distinguish all the errors, but the
-	 * nand_read_ecc only check this function return value
-	 *
-	 * Corrected (single-bit) errors must also be noted.
-	 */
-	if (info->retcode == ERR_SBERR)
-		return 1;
-	else if (info->retcode != ERR_NONE)
-		return -1;
-
-	return 0;
-}
-
-static int __readid(struct pxa3xx_nand_info *info, uint32_t *id)
-{
-	const struct pxa3xx_nand_cmdset *cmdset = info->cmdset;
-	uint32_t ndcr;
-	uint8_t  id_buff[8];
-
-	if (prepare_other_cmd(info, cmdset->read_id)) {
-		printk(KERN_ERR "failed to prepare command\n");
-		return -EINVAL;
-	}
-
-	/* Send command */
-	if (write_cmd(info))
-		goto fail_timeout;
-
-	/* Wait for CMDDM(command done successfully) */
-	if (wait_for_event(info, NDSR_RDDREQ))
-		goto fail_timeout;
-
-	__raw_readsl(info->mmio_base + NDDB, id_buff, 2);
-	*id = id_buff[0] | (id_buff[1] << 8);
-	return 0;
-
-fail_timeout:
-	ndcr = nand_readl(info, NDCR);
-	nand_writel(info, NDCR, ndcr & ~NDCR_ND_RUN);
-	udelay(10);
-	return -ETIMEDOUT;
-}
-
 static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info,
 				    const struct pxa3xx_nand_flash *f)
 {
 	struct platform_device *pdev = info->pdev;
 	struct pxa3xx_nand_platform_data *pdata = pdev->dev.platform_data;
-	uint32_t ndcr = 0x00000FFF; /* disable all interrupts */
+	uint32_t ndcr = 0x0; /* enable all interrupts */
 
 	if (f->page_size != 2048 && f->page_size != 512)
 		return -EINVAL;
@@ -844,9 +779,8 @@
 		return -EINVAL;
 
 	/* calculate flash information */
-	info->cmdset = f->cmdset;
+	info->cmdset = &default_cmdset;
 	info->page_size = f->page_size;
-	info->oob_buff = info->data_buff + f->page_size;
 	info->read_id_bytes = (f->page_size == 2048) ? 4 : 2;
 
 	/* calculate addressing information */
@@ -876,87 +810,18 @@
 static int pxa3xx_nand_detect_config(struct pxa3xx_nand_info *info)
 {
 	uint32_t ndcr = nand_readl(info, NDCR);
-	struct nand_flash_dev *type = NULL;
-	uint32_t id = -1, page_per_block, num_blocks;
-	int i;
-
-	page_per_block = ndcr & NDCR_PG_PER_BLK ? 64 : 32;
 	info->page_size = ndcr & NDCR_PAGE_SZ ? 2048 : 512;
-	/* set info fields needed to __readid */
+	/* set info fields needed to read id */
 	info->read_id_bytes = (info->page_size == 2048) ? 4 : 2;
 	info->reg_ndcr = ndcr;
 	info->cmdset = &default_cmdset;
 
-	if (__readid(info, &id))
-		return -ENODEV;
-
-	/* Lookup the flash id */
-	id = (id >> 8) & 0xff;		/* device id is byte 2 */
-	for (i = 0; nand_flash_ids[i].name != NULL; i++) {
-		if (id == nand_flash_ids[i].id) {
-			type =  &nand_flash_ids[i];
-			break;
-		}
-	}
-
-	if (!type)
-		return -ENODEV;
-
-	/* fill the missing flash information */
-	i = __ffs(page_per_block * info->page_size);
-	num_blocks = type->chipsize << (20 - i);
-
-	/* calculate addressing information */
-	info->col_addr_cycles = (info->page_size == 2048) ? 2 : 1;
-
-	if (num_blocks * page_per_block > 65536)
-		info->row_addr_cycles = 3;
-	else
-		info->row_addr_cycles = 2;
-
 	info->ndtr0cs0 = nand_readl(info, NDTR0CS0);
 	info->ndtr1cs0 = nand_readl(info, NDTR1CS0);
 
 	return 0;
 }
 
-static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info,
-				    const struct pxa3xx_nand_platform_data *pdata)
-{
-	const struct pxa3xx_nand_flash *f;
-	uint32_t id = -1;
-	int i;
-
-	if (pdata->keep_config)
-		if (pxa3xx_nand_detect_config(info) == 0)
-			return 0;
-
-	/* we use default timing to detect id */
-	f = DEFAULT_FLASH_TYPE;
-	pxa3xx_nand_config_flash(info, f);
-	if (__readid(info, &id))
-		goto fail_detect;
-
-	for (i=0; i<ARRAY_SIZE(builtin_flash_types) + pdata->num_flash - 1; i++) {
-		/* we first choose the flash definition from platfrom */
-		if (i < pdata->num_flash)
-			f = pdata->flash + i;
-		else
-			f = &builtin_flash_types[i - pdata->num_flash + 1];
-		if (f->chip_id == id) {
-			dev_info(&info->pdev->dev, "detect chip id: 0x%x\n", id);
-			pxa3xx_nand_config_flash(info, f);
-			return 0;
-		}
-	}
-
-	dev_warn(&info->pdev->dev,
-		 "failed to detect configured nand flash; found %04x instead of\n",
-		 id);
-fail_detect:
-	return -ENODEV;
-}
-
 /* the maximum possible buffer size for large page with OOB data
  * is: 2048 + 64 = 2112 bytes, allocate a page here for both the
  * data buffer and the DMA descriptor
@@ -998,82 +863,144 @@
 	return 0;
 }
 
-static struct nand_ecclayout hw_smallpage_ecclayout = {
-	.eccbytes = 6,
-	.eccpos = {8, 9, 10, 11, 12, 13 },
-	.oobfree = { {2, 6} }
-};
-
-static struct nand_ecclayout hw_largepage_ecclayout = {
-	.eccbytes = 24,
-	.eccpos = {
-		40, 41, 42, 43, 44, 45, 46, 47,
-		48, 49, 50, 51, 52, 53, 54, 55,
-		56, 57, 58, 59, 60, 61, 62, 63},
-	.oobfree = { {2, 38} }
-};
-
-static void pxa3xx_nand_init_mtd(struct mtd_info *mtd,
-				 struct pxa3xx_nand_info *info)
+static int pxa3xx_nand_sensing(struct pxa3xx_nand_info *info)
 {
-	struct nand_chip *this = &info->nand_chip;
+	struct mtd_info *mtd = info->mtd;
+	struct nand_chip *chip = mtd->priv;
 
-	this->options = (info->reg_ndcr & NDCR_DWIDTH_C) ? NAND_BUSWIDTH_16: 0;
-
-	this->waitfunc		= pxa3xx_nand_waitfunc;
-	this->select_chip	= pxa3xx_nand_select_chip;
-	this->dev_ready		= pxa3xx_nand_dev_ready;
-	this->cmdfunc		= pxa3xx_nand_cmdfunc;
-	this->read_word		= pxa3xx_nand_read_word;
-	this->read_byte		= pxa3xx_nand_read_byte;
-	this->read_buf		= pxa3xx_nand_read_buf;
-	this->write_buf		= pxa3xx_nand_write_buf;
-	this->verify_buf	= pxa3xx_nand_verify_buf;
-
-	this->ecc.mode		= NAND_ECC_HW;
-	this->ecc.hwctl		= pxa3xx_nand_ecc_hwctl;
-	this->ecc.calculate	= pxa3xx_nand_ecc_calculate;
-	this->ecc.correct	= pxa3xx_nand_ecc_correct;
-	this->ecc.size		= info->page_size;
-
-	if (info->page_size == 2048)
-		this->ecc.layout = &hw_largepage_ecclayout;
+	/* use the common timing to make a try */
+	pxa3xx_nand_config_flash(info, &builtin_flash_types[0]);
+	chip->cmdfunc(mtd, NAND_CMD_RESET, 0, 0);
+	if (info->is_ready)
+		return 1;
 	else
-		this->ecc.layout = &hw_smallpage_ecclayout;
-
-	this->chip_delay = 25;
+		return 0;
 }
 
-static int pxa3xx_nand_probe(struct platform_device *pdev)
+static int pxa3xx_nand_scan(struct mtd_info *mtd)
 {
-	struct pxa3xx_nand_platform_data *pdata;
+	struct pxa3xx_nand_info *info = mtd->priv;
+	struct platform_device *pdev = info->pdev;
+	struct pxa3xx_nand_platform_data *pdata = pdev->dev.platform_data;
+	struct nand_flash_dev pxa3xx_flash_ids[2] = { {NULL,}, {NULL,} };
+	const struct pxa3xx_nand_flash *f = NULL;
+	struct nand_chip *chip = mtd->priv;
+	uint32_t id = -1;
+	uint64_t chipsize;
+	int i, ret, num;
+
+	if (pdata->keep_config && !pxa3xx_nand_detect_config(info))
+		goto KEEP_CONFIG;
+
+	ret = pxa3xx_nand_sensing(info);
+	if (!ret) {
+		kfree(mtd);
+		info->mtd = NULL;
+		printk(KERN_INFO "There is no nand chip on cs 0!\n");
+
+		return -EINVAL;
+	}
+
+	chip->cmdfunc(mtd, NAND_CMD_READID, 0, 0);
+	id = *((uint16_t *)(info->data_buff));
+	if (id != 0)
+		printk(KERN_INFO "Detect a flash id %x\n", id);
+	else {
+		kfree(mtd);
+		info->mtd = NULL;
+		printk(KERN_WARNING "Read out ID 0, potential timing set wrong!!\n");
+
+		return -EINVAL;
+	}
+
+	num = ARRAY_SIZE(builtin_flash_types) + pdata->num_flash - 1;
+	for (i = 0; i < num; i++) {
+		if (i < pdata->num_flash)
+			f = pdata->flash + i;
+		else
+			f = &builtin_flash_types[i - pdata->num_flash + 1];
+
+		/* find the chip in default list */
+		if (f->chip_id == id)
+			break;
+	}
+
+	if (i >= (ARRAY_SIZE(builtin_flash_types) + pdata->num_flash - 1)) {
+		kfree(mtd);
+		info->mtd = NULL;
+		printk(KERN_ERR "ERROR!! flash not defined!!!\n");
+
+		return -EINVAL;
+	}
+
+	pxa3xx_nand_config_flash(info, f);
+	pxa3xx_flash_ids[0].name = f->name;
+	pxa3xx_flash_ids[0].id = (f->chip_id >> 8) & 0xffff;
+	pxa3xx_flash_ids[0].pagesize = f->page_size;
+	chipsize = (uint64_t)f->num_blocks * f->page_per_block * f->page_size;
+	pxa3xx_flash_ids[0].chipsize = chipsize >> 20;
+	pxa3xx_flash_ids[0].erasesize = f->page_size * f->page_per_block;
+	if (f->flash_width == 16)
+		pxa3xx_flash_ids[0].options = NAND_BUSWIDTH_16;
+KEEP_CONFIG:
+	if (nand_scan_ident(mtd, 1, pxa3xx_flash_ids))
+		return -ENODEV;
+	/* calculate addressing information */
+	info->col_addr_cycles = (mtd->writesize >= 2048) ? 2 : 1;
+	info->oob_buff = info->data_buff + mtd->writesize;
+	if ((mtd->size >> chip->page_shift) > 65536)
+		info->row_addr_cycles = 3;
+	else
+		info->row_addr_cycles = 2;
+	mtd->name = mtd_names[0];
+	chip->ecc.mode = NAND_ECC_HW;
+	chip->ecc.size = f->page_size;
+
+	chip->options = (f->flash_width == 16) ? NAND_BUSWIDTH_16 : 0;
+	chip->options |= NAND_NO_AUTOINCR;
+	chip->options |= NAND_NO_READRDY;
+
+	return nand_scan_tail(mtd);
+}
+
+static
+struct pxa3xx_nand_info *alloc_nand_resource(struct platform_device *pdev)
+{
 	struct pxa3xx_nand_info *info;
-	struct nand_chip *this;
+	struct nand_chip *chip;
 	struct mtd_info *mtd;
 	struct resource *r;
-	int ret = 0, irq;
-
-	pdata = pdev->dev.platform_data;
-
-	if (!pdata) {
-		dev_err(&pdev->dev, "no platform data defined\n");
-		return -ENODEV;
-	}
+	int ret, irq;
 
 	mtd = kzalloc(sizeof(struct mtd_info) + sizeof(struct pxa3xx_nand_info),
 			GFP_KERNEL);
 	if (!mtd) {
 		dev_err(&pdev->dev, "failed to allocate memory\n");
-		return -ENOMEM;
+		return NULL;
 	}
 
 	info = (struct pxa3xx_nand_info *)(&mtd[1]);
+	chip = (struct nand_chip *)(&mtd[1]);
 	info->pdev = pdev;
-
-	this = &info->nand_chip;
+	info->mtd = mtd;
 	mtd->priv = info;
 	mtd->owner = THIS_MODULE;
 
+	chip->ecc.read_page	= pxa3xx_nand_read_page_hwecc;
+	chip->ecc.write_page	= pxa3xx_nand_write_page_hwecc;
+	chip->controller        = &info->controller;
+	chip->waitfunc		= pxa3xx_nand_waitfunc;
+	chip->select_chip	= pxa3xx_nand_select_chip;
+	chip->dev_ready		= pxa3xx_nand_dev_ready;
+	chip->cmdfunc		= pxa3xx_nand_cmdfunc;
+	chip->read_word		= pxa3xx_nand_read_word;
+	chip->read_byte		= pxa3xx_nand_read_byte;
+	chip->read_buf		= pxa3xx_nand_read_buf;
+	chip->write_buf		= pxa3xx_nand_write_buf;
+	chip->verify_buf	= pxa3xx_nand_verify_buf;
+
+	spin_lock_init(&chip->controller->lock);
+	init_waitqueue_head(&chip->controller->wq);
 	info->clk = clk_get(&pdev->dev, NULL);
 	if (IS_ERR(info->clk)) {
 		dev_err(&pdev->dev, "failed to get nand clock\n");
@@ -1141,43 +1068,12 @@
 		goto fail_free_buf;
 	}
 
-	ret = pxa3xx_nand_detect_flash(info, pdata);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to detect flash\n");
-		ret = -ENODEV;
-		goto fail_free_irq;
-	}
+	platform_set_drvdata(pdev, info);
 
-	pxa3xx_nand_init_mtd(mtd, info);
+	return info;
 
-	platform_set_drvdata(pdev, mtd);
-
-	if (nand_scan(mtd, 1)) {
-		dev_err(&pdev->dev, "failed to scan nand\n");
-		ret = -ENXIO;
-		goto fail_free_irq;
-	}
-
-#ifdef CONFIG_MTD_PARTITIONS
-	if (mtd_has_cmdlinepart()) {
-		static const char *probes[] = { "cmdlinepart", NULL };
-		struct mtd_partition *parts;
-		int nr_parts;
-
-		nr_parts = parse_mtd_partitions(mtd, probes, &parts, 0);
-
-		if (nr_parts)
-			return add_mtd_partitions(mtd, parts, nr_parts);
-	}
-
-	return add_mtd_partitions(mtd, pdata->parts, pdata->nr_parts);
-#else
-	return 0;
-#endif
-
-fail_free_irq:
-	free_irq(irq, info);
 fail_free_buf:
+	free_irq(irq, info);
 	if (use_dma) {
 		pxa_free_dma(info->data_dma_ch);
 		dma_free_coherent(&pdev->dev, info->data_buff_size,
@@ -1193,22 +1089,18 @@
 	clk_put(info->clk);
 fail_free_mtd:
 	kfree(mtd);
-	return ret;
+	return NULL;
 }
 
 static int pxa3xx_nand_remove(struct platform_device *pdev)
 {
-	struct mtd_info *mtd = platform_get_drvdata(pdev);
-	struct pxa3xx_nand_info *info = mtd->priv;
+	struct pxa3xx_nand_info *info = platform_get_drvdata(pdev);
+	struct mtd_info *mtd = info->mtd;
 	struct resource *r;
 	int irq;
 
 	platform_set_drvdata(pdev, NULL);
 
-	del_mtd_device(mtd);
-#ifdef CONFIG_MTD_PARTITIONS
-	del_mtd_partitions(mtd);
-#endif
 	irq = platform_get_irq(pdev, 0);
 	if (irq >= 0)
 		free_irq(irq, info);
@@ -1226,17 +1118,62 @@
 	clk_disable(info->clk);
 	clk_put(info->clk);
 
-	kfree(mtd);
+	if (mtd) {
+		del_mtd_device(mtd);
+#ifdef CONFIG_MTD_PARTITIONS
+		del_mtd_partitions(mtd);
+#endif
+		kfree(mtd);
+	}
 	return 0;
 }
 
+static int pxa3xx_nand_probe(struct platform_device *pdev)
+{
+	struct pxa3xx_nand_platform_data *pdata;
+	struct pxa3xx_nand_info *info;
+
+	pdata = pdev->dev.platform_data;
+	if (!pdata) {
+		dev_err(&pdev->dev, "no platform data defined\n");
+		return -ENODEV;
+	}
+
+	info = alloc_nand_resource(pdev);
+	if (info == NULL)
+		return -ENOMEM;
+
+	if (pxa3xx_nand_scan(info->mtd)) {
+		dev_err(&pdev->dev, "failed to scan nand\n");
+		pxa3xx_nand_remove(pdev);
+		return -ENODEV;
+	}
+
+#ifdef CONFIG_MTD_PARTITIONS
+	if (mtd_has_cmdlinepart()) {
+		const char *probes[] = { "cmdlinepart", NULL };
+		struct mtd_partition *parts;
+		int nr_parts;
+
+		nr_parts = parse_mtd_partitions(info->mtd, probes, &parts, 0);
+
+		if (nr_parts)
+			return add_mtd_partitions(info->mtd, parts, nr_parts);
+	}
+
+	return add_mtd_partitions(info->mtd, pdata->parts, pdata->nr_parts);
+#else
+	return 0;
+#endif
+}
+
 #ifdef CONFIG_PM
 static int pxa3xx_nand_suspend(struct platform_device *pdev, pm_message_t state)
 {
-	struct mtd_info *mtd = (struct mtd_info *)platform_get_drvdata(pdev);
-	struct pxa3xx_nand_info *info = mtd->priv;
+	struct pxa3xx_nand_info *info = platform_get_drvdata(pdev);
+	struct mtd_info *mtd = info->mtd;
 
-	if (info->state != STATE_READY) {
+	if (info->state) {
 		dev_err(&pdev->dev, "driver busy, state = %d\n", info->state);
 		return -EAGAIN;
 	}
@@ -1246,8 +1183,8 @@
 
 static int pxa3xx_nand_resume(struct platform_device *pdev)
 {
-	struct mtd_info *mtd = (struct mtd_info *)platform_get_drvdata(pdev);
-	struct pxa3xx_nand_info *info = mtd->priv;
+	struct pxa3xx_nand_info *info = platform_get_drvdata(pdev);
+	struct mtd_info *mtd = info->mtd;
 
 	nand_writel(info, NDTR0CS0, info->ndtr0cs0);
 	nand_writel(info, NDTR1CS0, info->ndtr1cs0);
diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c
index 14a49ab..f591f61 100644
--- a/drivers/mtd/onenand/omap2.c
+++ b/drivers/mtd/onenand/omap2.c
@@ -629,6 +629,7 @@
 {
 	struct omap_onenand_platform_data *pdata;
 	struct omap2_onenand *c;
+	struct onenand_chip *this;
 	int r;
 
 	pdata = pdev->dev.platform_data;
@@ -726,9 +727,8 @@
 
 	c->mtd.dev.parent = &pdev->dev;
 
+	this = &c->onenand;
 	if (c->dma_channel >= 0) {
-		struct onenand_chip *this = &c->onenand;
-
 		this->wait = omap2_onenand_wait;
 		if (cpu_is_omap34xx()) {
 			this->read_bufferram = omap3_onenand_read_bufferram;
@@ -749,6 +749,9 @@
 		c->onenand.disable = omap2_onenand_disable;
 	}
 
+	if (pdata->skip_initial_unlocking)
+		this->options |= ONENAND_SKIP_INITIAL_UNLOCKING;
+
 	if ((r = onenand_scan(&c->mtd, 1)) < 0)
 		goto err_release_regulator;
 
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index bac41ca..56a8b20 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1132,6 +1132,8 @@
 			onenand_update_bufferram(mtd, from, !ret);
 			if (ret == -EBADMSG)
 				ret = 0;
+			if (ret)
+				break;
 		}
 
 		this->read_bufferram(mtd, ONENAND_DATARAM, buf, column, thislen);
@@ -1646,11 +1648,10 @@
 	int ret = 0;
 	int thislen, column;
 
+	column = addr & (this->writesize - 1);
+
 	while (len != 0) {
-		thislen = min_t(int, this->writesize, len);
-		column = addr & (this->writesize - 1);
-		if (column + thislen > this->writesize)
-			thislen = this->writesize - column;
+		thislen = min_t(int, this->writesize - column, len);
 
 		this->command(mtd, ONENAND_CMD_READ, addr, this->writesize);
 
@@ -1664,12 +1665,13 @@
 
 		this->read_bufferram(mtd, ONENAND_DATARAM, this->verify_buf, 0, mtd->writesize);
 
-		if (memcmp(buf, this->verify_buf, thislen))
+		if (memcmp(buf, this->verify_buf + column, thislen))
 			return -EBADMSG;
 
 		len -= thislen;
 		buf += thislen;
 		addr += thislen;
+		column = 0;
 	}
 
 	return 0;
@@ -4083,7 +4085,8 @@
 	mtd->writebufsize = mtd->writesize;
 
 	/* Unlock whole block */
-	this->unlock_all(mtd);
+	if (!(this->options & ONENAND_SKIP_INITIAL_UNLOCKING))
+		this->unlock_all(mtd);
 
 	ret = this->scan_bbt(mtd);
 	if ((!FLEXONENAND(this)) || ret)
diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c
index ac0d6a8..2b0daae 100644
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -64,12 +64,16 @@
 					SM_SMALL_PAGE - SM_CIS_VENDOR_OFFSET);
 
 	char *vendor = kmalloc(vendor_len, GFP_KERNEL);
+	if (!vendor)
+		goto error1;
 	memcpy(vendor, ftl->cis_buffer + SM_CIS_VENDOR_OFFSET, vendor_len);
 	vendor[vendor_len] = 0;
 
 	/* Initialize sysfs attributes */
 	vendor_attribute =
 		kzalloc(sizeof(struct sm_sysfs_attribute), GFP_KERNEL);
+	if (!vendor_attribute)
+		goto error2;
 
 	sysfs_attr_init(&vendor_attribute->dev_attr.attr);
 
@@ -83,12 +87,24 @@
 	/* Create array of pointers to the attributes */
 	attributes = kzalloc(sizeof(struct attribute *) * (NUM_ATTRIBUTES + 1),
 								GFP_KERNEL);
+	if (!attributes)
+		goto error3;
 	attributes[0] = &vendor_attribute->dev_attr.attr;
 
 	/* Finally create the attribute group */
 	attr_group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
+	if (!attr_group)
+		goto error4;
 	attr_group->attrs = attributes;
 	return attr_group;
+error4:
+	kfree(attributes);
+error3:
+	kfree(vendor_attribute);
+error2:
+	kfree(vendor);
+error1:
+	return NULL;
 }
 
 void sm_delete_sysfs_attributes(struct sm_ftl *ftl)
@@ -1178,6 +1194,8 @@
 	}
 
 	ftl->disk_attributes = sm_create_sysfs_attributes(ftl);
+	if (!ftl->disk_attributes)
+		goto error6;
 	trans->disk_attributes = ftl->disk_attributes;
 
 	sm_printk("Found %d MiB xD/SmartMedia FTL on mtd%d",
diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c
index 161feeb..627d4e2 100644
--- a/drivers/mtd/tests/mtd_speedtest.c
+++ b/drivers/mtd/tests/mtd_speedtest.c
@@ -16,7 +16,7 @@
  *
  * Test read and write speed of a MTD device.
  *
- * Author: Adrian Hunter <ext-adrian.hunter@nokia.com>
+ * Author: Adrian Hunter <adrian.hunter@nokia.com>
  */
 
 #include <linux/init.h>
@@ -33,6 +33,11 @@
 module_param(dev, int, S_IRUGO);
 MODULE_PARM_DESC(dev, "MTD device number to use");
 
+static int count;
+module_param(count, int, S_IRUGO);
+MODULE_PARM_DESC(count, "Maximum number of eraseblocks to use "
+			"(0 means use all)");
+
 static struct mtd_info *mtd;
 static unsigned char *iobuf;
 static unsigned char *bbt;
@@ -89,6 +94,33 @@
 	return 0;
 }
 
+static int multiblock_erase(int ebnum, int blocks)
+{
+	int err;
+	struct erase_info ei;
+	loff_t addr = ebnum * mtd->erasesize;
+
+	memset(&ei, 0, sizeof(struct erase_info));
+	ei.mtd  = mtd;
+	ei.addr = addr;
+	ei.len  = mtd->erasesize * blocks;
+
+	err = mtd->erase(mtd, &ei);
+	if (err) {
+		printk(PRINT_PREF "error %d while erasing EB %d, blocks %d\n",
+		       err, ebnum, blocks);
+		return err;
+	}
+
+	if (ei.state == MTD_ERASE_FAILED) {
+		printk(PRINT_PREF "some erase error occurred at EB %d,"
+		       "blocks %d\n", ebnum, blocks);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 static int erase_whole_device(void)
 {
 	int err;
@@ -282,13 +314,16 @@
 
 static long calc_speed(void)
 {
-	long ms, k, speed;
+	uint64_t k;
+	long ms;
 
 	ms = (finish.tv_sec - start.tv_sec) * 1000 +
 	     (finish.tv_usec - start.tv_usec) / 1000;
-	k = goodebcnt * mtd->erasesize / 1024;
-	speed = (k * 1000) / ms;
-	return speed;
+	if (ms == 0)
+		return 0;
+	k = goodebcnt * (mtd->erasesize / 1024) * 1000;
+	do_div(k, ms);
+	return k;
 }
 
 static int scan_for_bad_eraseblocks(void)
@@ -320,13 +355,16 @@
 
 static int __init mtd_speedtest_init(void)
 {
-	int err, i;
+	int err, i, blocks, j, k;
 	long speed;
 	uint64_t tmp;
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "=================================================\n");
-	printk(PRINT_PREF "MTD device: %d\n", dev);
+	if (count)
+		printk(PRINT_PREF "MTD device: %d    count: %d\n", dev, count);
+	else
+		printk(PRINT_PREF "MTD device: %d\n", dev);
 
 	mtd = get_mtd_device(NULL, dev);
 	if (IS_ERR(mtd)) {
@@ -353,6 +391,9 @@
 	       (unsigned long long)mtd->size, mtd->erasesize,
 	       pgsize, ebcnt, pgcnt, mtd->oobsize);
 
+	if (count > 0 && count < ebcnt)
+		ebcnt = count;
+
 	err = -ENOMEM;
 	iobuf = kmalloc(mtd->erasesize, GFP_KERNEL);
 	if (!iobuf) {
@@ -484,6 +525,31 @@
 	speed = calc_speed();
 	printk(PRINT_PREF "erase speed is %ld KiB/s\n", speed);
 
+	/* Multi-block erase all eraseblocks */
+	for (k = 1; k < 7; k++) {
+		blocks = 1 << k;
+		printk(PRINT_PREF "Testing %dx multi-block erase speed\n",
+		       blocks);
+		start_timing();
+		for (i = 0; i < ebcnt; ) {
+			for (j = 0; j < blocks && (i + j) < ebcnt; j++)
+				if (bbt[i + j])
+					break;
+			if (j < 1) {
+				i++;
+				continue;
+			}
+			err = multiblock_erase(i, j);
+			if (err)
+				goto out;
+			cond_resched();
+			i += j;
+		}
+		stop_timing();
+		speed = calc_speed();
+		printk(PRINT_PREF "%dx multi-block erase speed is %ld KiB/s\n",
+		       blocks, speed);
+	}
 	printk(PRINT_PREF "finished\n");
 out:
 	kfree(iobuf);
diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c
index 11204e8..334eae5 100644
--- a/drivers/mtd/tests/mtd_subpagetest.c
+++ b/drivers/mtd/tests/mtd_subpagetest.c
@@ -394,6 +394,11 @@
 	}
 
 	subpgsize = mtd->writesize >> mtd->subpage_sft;
+	tmp = mtd->size;
+	do_div(tmp, mtd->erasesize);
+	ebcnt = tmp;
+	pgcnt = mtd->erasesize / mtd->writesize;
+
 	printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, "
 	       "page size %u, subpage size %u, count of eraseblocks %u, "
 	       "pages per eraseblock %u, OOB size %u\n",
@@ -413,11 +418,6 @@
 		goto out;
 	}
 
-	tmp = mtd->size;
-	do_div(tmp, mtd->erasesize);
-	ebcnt = tmp;
-	pgcnt = mtd->erasesize / mtd->writesize;
-
 	err = scan_for_bad_eraseblocks();
 	if (err)
 		goto out;
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index de75f67..b9f29e0 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -126,7 +126,7 @@
 	  and S5PC1XX chips to control VCC_CORE and VCC_USIM voltages.
 
 config REGULATOR_TWL4030
-	bool "TI TWL4030/TWL5030/TWL6030/TPS695x0 PMIC"
+	bool "TI TWL4030/TWL5030/TWL6030/TPS659x0 PMIC"
 	depends on TWL4030_CORE
 	help
 	  This driver supports the voltage regulators provided by
diff --git a/drivers/regulator/ab3100.c b/drivers/regulator/ab3100.c
index 2dec589..b1d7794 100644
--- a/drivers/regulator/ab3100.c
+++ b/drivers/regulator/ab3100.c
@@ -206,29 +206,6 @@
 		return err;
 	}
 
-	/* Per-regulator power on delay from spec */
-	switch (abreg->regreg) {
-	case AB3100_LDO_A: /* Fallthrough */
-	case AB3100_LDO_C: /* Fallthrough */
-	case AB3100_LDO_D: /* Fallthrough */
-	case AB3100_LDO_E: /* Fallthrough */
-	case AB3100_LDO_H: /* Fallthrough */
-	case AB3100_LDO_K:
-		udelay(200);
-		break;
-	case AB3100_LDO_F:
-		udelay(600);
-		break;
-	case AB3100_LDO_G:
-		udelay(400);
-		break;
-	case AB3100_BUCK:
-		mdelay(1);
-		break;
-	default:
-		break;
-	}
-
 	return 0;
 }
 
@@ -450,11 +427,37 @@
 	return abreg->plfdata->external_voltage;
 }
 
+static int ab3100_enable_time_regulator(struct regulator_dev *reg)
+{
+	struct ab3100_regulator *abreg = reg->reg_data;
+
+	/* Per-regulator power on delay from spec */
+	switch (abreg->regreg) {
+	case AB3100_LDO_A: /* Fallthrough */
+	case AB3100_LDO_C: /* Fallthrough */
+	case AB3100_LDO_D: /* Fallthrough */
+	case AB3100_LDO_E: /* Fallthrough */
+	case AB3100_LDO_H: /* Fallthrough */
+	case AB3100_LDO_K:
+		return 200;
+	case AB3100_LDO_F:
+		return 600;
+	case AB3100_LDO_G:
+		return 400;
+	case AB3100_BUCK:
+		return 1000;
+	default:
+		break;
+	}
+	return 0;
+}
+
 static struct regulator_ops regulator_ops_fixed = {
 	.enable      = ab3100_enable_regulator,
 	.disable     = ab3100_disable_regulator,
 	.is_enabled  = ab3100_is_enabled_regulator,
 	.get_voltage = ab3100_get_voltage_regulator,
+	.enable_time = ab3100_enable_time_regulator,
 };
 
 static struct regulator_ops regulator_ops_variable = {
@@ -464,6 +467,7 @@
 	.get_voltage = ab3100_get_voltage_regulator,
 	.set_voltage = ab3100_set_voltage_regulator,
 	.list_voltage = ab3100_list_voltage_regulator,
+	.enable_time = ab3100_enable_time_regulator,
 };
 
 static struct regulator_ops regulator_ops_variable_sleepable = {
@@ -474,6 +478,7 @@
 	.set_voltage = ab3100_set_voltage_regulator,
 	.set_suspend_voltage = ab3100_set_suspend_voltage_regulator,
 	.list_voltage = ab3100_list_voltage_regulator,
+	.enable_time = ab3100_enable_time_regulator,
 };
 
 /*
diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c
index d9a052c..02f3c23 100644
--- a/drivers/regulator/ab8500.c
+++ b/drivers/regulator/ab8500.c
@@ -9,7 +9,7 @@
  * AB8500 peripheral regulators
  *
  * AB8500 supports the following regulators:
- *   VAUX1/2/3, VINTCORE, VTVOUT, VAUDIO, VAMIC1/2, VDMIC, VANA
+ *   VAUX1/2/3, VINTCORE, VTVOUT, VUSB, VAUDIO, VAMIC1/2, VDMIC, VANA
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -38,6 +38,7 @@
  * @voltage_mask: mask to control regulator voltage
  * @voltages: supported voltage table
  * @voltages_len: number of supported voltages for the regulator
+ * @delay: startup/set voltage delay in us
  */
 struct ab8500_regulator_info {
 	struct device		*dev;
@@ -55,6 +56,7 @@
 	u8 voltage_mask;
 	int const *voltages;
 	int voltages_len;
+	unsigned int delay;
 };
 
 /* voltage tables for the vauxn/vintcore supplies */
@@ -290,6 +292,29 @@
 	return ret;
 }
 
+static int ab8500_regulator_enable_time(struct regulator_dev *rdev)
+{
+	struct ab8500_regulator_info *info = rdev_get_drvdata(rdev);
+
+	return info->delay;
+}
+
+static int ab8500_regulator_set_voltage_time_sel(struct regulator_dev *rdev,
+					     unsigned int old_sel,
+					     unsigned int new_sel)
+{
+	struct ab8500_regulator_info *info = rdev_get_drvdata(rdev);
+	int ret;
+
+	/* If the regulator isn't on, it won't take time here */
+	ret = ab8500_regulator_is_enabled(rdev);
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return 0;
+	return info->delay;
+}
+
 static struct regulator_ops ab8500_regulator_ops = {
 	.enable		= ab8500_regulator_enable,
 	.disable	= ab8500_regulator_disable,
@@ -297,6 +322,8 @@
 	.get_voltage	= ab8500_regulator_get_voltage,
 	.set_voltage	= ab8500_regulator_set_voltage,
 	.list_voltage	= ab8500_list_voltage,
+	.enable_time	= ab8500_regulator_enable_time,
+	.set_voltage_time_sel = ab8500_regulator_set_voltage_time_sel,
 };
 
 static int ab8500_fixed_get_voltage(struct regulator_dev *rdev)
@@ -317,6 +344,8 @@
 	.is_enabled	= ab8500_regulator_is_enabled,
 	.get_voltage	= ab8500_fixed_get_voltage,
 	.list_voltage	= ab8500_list_voltage,
+	.enable_time	= ab8500_regulator_enable_time,
+	.set_voltage_time_sel = ab8500_regulator_set_voltage_time_sel,
 };
 
 static struct ab8500_regulator_info
@@ -426,12 +455,28 @@
 			.owner		= THIS_MODULE,
 			.n_voltages	= 1,
 		},
+		.delay			= 10000,
 		.fixed_uV		= 2000000,
 		.update_bank		= 0x03,
 		.update_reg		= 0x80,
 		.update_mask		= 0x82,
 		.update_val_enable	= 0x02,
 	},
+	[AB8500_LDO_USB] = {
+		.desc = {
+			.name           = "LDO-USB",
+			.ops            = &ab8500_regulator_fixed_ops,
+			.type           = REGULATOR_VOLTAGE,
+			.id             = AB8500_LDO_USB,
+			.owner          = THIS_MODULE,
+			.n_voltages     = 1,
+		},
+		.fixed_uV               = 3300000,
+		.update_bank            = 0x03,
+		.update_reg             = 0x82,
+		.update_mask            = 0x03,
+		.update_val_enable      = 0x01,
+	},
 	[AB8500_LDO_AUDIO] = {
 		.desc = {
 			.name		= "LDO-AUDIO",
@@ -511,6 +556,186 @@
 
 };
 
+struct ab8500_reg_init {
+	u8 bank;
+	u8 addr;
+	u8 mask;
+};
+
+#define REG_INIT(_id, _bank, _addr, _mask)	\
+	[_id] = {				\
+		.bank = _bank,			\
+		.addr = _addr,			\
+		.mask = _mask,			\
+	}
+
+static struct ab8500_reg_init ab8500_reg_init[] = {
+	/*
+	 * 0x30, VanaRequestCtrl
+	 * 0x0C, VpllRequestCtrl
+	 * 0xc0, VextSupply1RequestCtrl
+	 */
+	REG_INIT(AB8500_REGUREQUESTCTRL2,	0x03, 0x04, 0xfc),
+	/*
+	 * 0x03, VextSupply2RequestCtrl
+	 * 0x0c, VextSupply3RequestCtrl
+	 * 0x30, Vaux1RequestCtrl
+	 * 0xc0, Vaux2RequestCtrl
+	 */
+	REG_INIT(AB8500_REGUREQUESTCTRL3,	0x03, 0x05, 0xff),
+	/*
+	 * 0x03, Vaux3RequestCtrl
+	 * 0x04, SwHPReq
+	 */
+	REG_INIT(AB8500_REGUREQUESTCTRL4,	0x03, 0x06, 0x07),
+	/*
+	 * 0x08, VanaSysClkReq1HPValid
+	 * 0x20, Vaux1SysClkReq1HPValid
+	 * 0x40, Vaux2SysClkReq1HPValid
+	 * 0x80, Vaux3SysClkReq1HPValid
+	 */
+	REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1,	0x03, 0x07, 0xe8),
+	/*
+	 * 0x10, VextSupply1SysClkReq1HPValid
+	 * 0x20, VextSupply2SysClkReq1HPValid
+	 * 0x40, VextSupply3SysClkReq1HPValid
+	 */
+	REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2,	0x03, 0x08, 0x70),
+	/*
+	 * 0x08, VanaHwHPReq1Valid
+	 * 0x20, Vaux1HwHPReq1Valid
+	 * 0x40, Vaux2HwHPReq1Valid
+	 * 0x80, Vaux3HwHPReq1Valid
+	 */
+	REG_INIT(AB8500_REGUHWHPREQ1VALID1,	0x03, 0x09, 0xe8),
+	/*
+	 * 0x01, VextSupply1HwHPReq1Valid
+	 * 0x02, VextSupply2HwHPReq1Valid
+	 * 0x04, VextSupply3HwHPReq1Valid
+	 */
+	REG_INIT(AB8500_REGUHWHPREQ1VALID2,	0x03, 0x0a, 0x07),
+	/*
+	 * 0x08, VanaHwHPReq2Valid
+	 * 0x20, Vaux1HwHPReq2Valid
+	 * 0x40, Vaux2HwHPReq2Valid
+	 * 0x80, Vaux3HwHPReq2Valid
+	 */
+	REG_INIT(AB8500_REGUHWHPREQ2VALID1,	0x03, 0x0b, 0xe8),
+	/*
+	 * 0x01, VextSupply1HwHPReq2Valid
+	 * 0x02, VextSupply2HwHPReq2Valid
+	 * 0x04, VextSupply3HwHPReq2Valid
+	 */
+	REG_INIT(AB8500_REGUHWHPREQ2VALID2,	0x03, 0x0c, 0x07),
+	/*
+	 * 0x20, VanaSwHPReqValid
+	 * 0x80, Vaux1SwHPReqValid
+	 */
+	REG_INIT(AB8500_REGUSWHPREQVALID1,	0x03, 0x0d, 0xa0),
+	/*
+	 * 0x01, Vaux2SwHPReqValid
+	 * 0x02, Vaux3SwHPReqValid
+	 * 0x04, VextSupply1SwHPReqValid
+	 * 0x08, VextSupply2SwHPReqValid
+	 * 0x10, VextSupply3SwHPReqValid
+	 */
+	REG_INIT(AB8500_REGUSWHPREQVALID2,	0x03, 0x0e, 0x1f),
+	/*
+	 * 0x02, SysClkReq2Valid1
+	 * ...
+	 * 0x80, SysClkReq8Valid1
+	 */
+	REG_INIT(AB8500_REGUSYSCLKREQVALID1,	0x03, 0x0f, 0xfe),
+	/*
+	 * 0x02, SysClkReq2Valid2
+	 * ...
+	 * 0x80, SysClkReq8Valid2
+	 */
+	REG_INIT(AB8500_REGUSYSCLKREQVALID2,	0x03, 0x10, 0xfe),
+	/*
+	 * 0x02, VTVoutEna
+	 * 0x04, Vintcore12Ena
+	 * 0x38, Vintcore12Sel
+	 * 0x40, Vintcore12LP
+	 * 0x80, VTVoutLP
+	 */
+	REG_INIT(AB8500_REGUMISC1,		0x03, 0x80, 0xfe),
+	/*
+	 * 0x02, VaudioEna
+	 * 0x04, VdmicEna
+	 * 0x08, Vamic1Ena
+	 * 0x10, Vamic2Ena
+	 */
+	REG_INIT(AB8500_VAUDIOSUPPLY,		0x03, 0x83, 0x1e),
+	/*
+	 * 0x01, Vamic1_dzout
+	 * 0x02, Vamic2_dzout
+	 */
+	REG_INIT(AB8500_REGUCTRL1VAMIC,		0x03, 0x84, 0x03),
+	/*
+	 * 0x0c, VanaRegu
+	 * 0x03, VpllRegu
+	 */
+	REG_INIT(AB8500_VPLLVANAREGU,		0x04, 0x06, 0x0f),
+	/*
+	 * 0x01, VrefDDREna
+	 * 0x02, VrefDDRSleepMode
+	 */
+	REG_INIT(AB8500_VREFDDR,		0x04, 0x07, 0x03),
+	/*
+	 * 0x03, VextSupply1Regu
+	 * 0x0c, VextSupply2Regu
+	 * 0x30, VextSupply3Regu
+	 * 0x40, ExtSupply2Bypass
+	 * 0x80, ExtSupply3Bypass
+	 */
+	REG_INIT(AB8500_EXTSUPPLYREGU,		0x04, 0x08, 0xff),
+	/*
+	 * 0x03, Vaux1Regu
+	 * 0x0c, Vaux2Regu
+	 */
+	REG_INIT(AB8500_VAUX12REGU,		0x04, 0x09, 0x0f),
+	/*
+	 * 0x03, Vaux3Regu
+	 */
+	REG_INIT(AB8500_VRF1VAUX3REGU,		0x04, 0x0a, 0x03),
+	/*
+	 * 0x3f, Vsmps1Sel1
+	 */
+	REG_INIT(AB8500_VSMPS1SEL1,		0x04, 0x13, 0x3f),
+	/*
+	 * 0x0f, Vaux1Sel
+	 */
+	REG_INIT(AB8500_VAUX1SEL,		0x04, 0x1f, 0x0f),
+	/*
+	 * 0x0f, Vaux2Sel
+	 */
+	REG_INIT(AB8500_VAUX2SEL,		0x04, 0x20, 0x0f),
+	/*
+	 * 0x07, Vaux3Sel
+	 */
+	REG_INIT(AB8500_VRF1VAUX3SEL,		0x04, 0x21, 0x07),
+	/*
+	 * 0x01, VextSupply12LP
+	 */
+	REG_INIT(AB8500_REGUCTRL2SPARE,		0x04, 0x22, 0x01),
+	/*
+	 * 0x04, Vaux1Disch
+	 * 0x08, Vaux2Disch
+	 * 0x10, Vaux3Disch
+	 * 0x20, Vintcore12Disch
+	 * 0x40, VTVoutDisch
+	 * 0x80, VaudioDisch
+	 */
+	REG_INIT(AB8500_REGUCTRLDISCH,		0x04, 0x43, 0xfc),
+	/*
+	 * 0x02, VanaDisch
+	 * 0x04, VdmicPullDownEna
+	 * 0x10, VdmicDisch
+	 */
+	REG_INIT(AB8500_REGUCTRLDISCH2,		0x04, 0x44, 0x16),
+};
+
 static __devinit int ab8500_regulator_probe(struct platform_device *pdev)
 {
 	struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent);
@@ -529,10 +754,51 @@
 
 	/* make sure the platform data has the correct size */
 	if (pdata->num_regulator != ARRAY_SIZE(ab8500_regulator_info)) {
-		dev_err(&pdev->dev, "platform configuration error\n");
+		dev_err(&pdev->dev, "Configuration error: size mismatch.\n");
 		return -EINVAL;
 	}
 
+	/* initialize registers */
+	for (i = 0; i < pdata->num_regulator_reg_init; i++) {
+		int id;
+		u8 value;
+
+		id = pdata->regulator_reg_init[i].id;
+		value = pdata->regulator_reg_init[i].value;
+
+		/* check for configuration errors */
+		if (id >= AB8500_NUM_REGULATOR_REGISTERS) {
+			dev_err(&pdev->dev,
+				"Configuration error: id outside range.\n");
+			return -EINVAL;
+		}
+		if (value & ~ab8500_reg_init[id].mask) {
+			dev_err(&pdev->dev,
+				"Configuration error: value outside mask.\n");
+			return -EINVAL;
+		}
+
+		/* initialize register */
+		err = abx500_mask_and_set_register_interruptible(&pdev->dev,
+			ab8500_reg_init[id].bank,
+			ab8500_reg_init[id].addr,
+			ab8500_reg_init[id].mask,
+			value);
+		if (err < 0) {
+			dev_err(&pdev->dev,
+				"Failed to initialize 0x%02x, 0x%02x.\n",
+				ab8500_reg_init[id].bank,
+				ab8500_reg_init[id].addr);
+			return err;
+		}
+		dev_vdbg(&pdev->dev,
+			"  init: 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",
+			ab8500_reg_init[id].bank,
+			ab8500_reg_init[id].addr,
+			ab8500_reg_init[id].mask,
+			value);
+	}
+
 	/* register all regulators */
 	for (i = 0; i < ARRAY_SIZE(ab8500_regulator_info); i++) {
 		struct ab8500_regulator_info *info = NULL;
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 9fa2095..3ffc697 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1629,6 +1629,7 @@
 				     int min_uV, int max_uV)
 {
 	int ret;
+	int delay = 0;
 	unsigned int selector;
 
 	trace_regulator_set_voltage(rdev_get_name(rdev), min_uV, max_uV);
@@ -1662,6 +1663,22 @@
 			}
 		}
 
+		/*
+		 * If we can't obtain the old selector there is not enough
+		 * info to call set_voltage_time_sel().
+		 */
+		if (rdev->desc->ops->set_voltage_time_sel &&
+		    rdev->desc->ops->get_voltage_sel) {
+			unsigned int old_selector = 0;
+
+			ret = rdev->desc->ops->get_voltage_sel(rdev);
+			if (ret < 0)
+				return ret;
+			old_selector = ret;
+			delay = rdev->desc->ops->set_voltage_time_sel(rdev,
+						old_selector, selector);
+		}
+
 		if (best_val != INT_MAX) {
 			ret = rdev->desc->ops->set_voltage_sel(rdev, selector);
 			selector = best_val;
@@ -1672,6 +1689,14 @@
 		ret = -EINVAL;
 	}
 
+	/* Insert any necessary delays */
+	if (delay >= 1000) {
+		mdelay(delay / 1000);
+		udelay(delay % 1000);
+	} else if (delay) {
+		udelay(delay);
+	}
+
 	if (ret == 0)
 		_notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE,
 				     NULL);
@@ -1740,6 +1765,51 @@
 EXPORT_SYMBOL_GPL(regulator_set_voltage);
 
 /**
+ * regulator_set_voltage_time - get raise/fall time
+ * @regulator: regulator source
+ * @old_uV: starting voltage in microvolts
+ * @new_uV: target voltage in microvolts
+ *
+ * Provided with the starting and ending voltage, this function attempts to
+ * calculate the time in microseconds required to rise or fall to this new
+ * voltage.
+ */
+int regulator_set_voltage_time(struct regulator *regulator,
+			       int old_uV, int new_uV)
+{
+	struct regulator_dev	*rdev = regulator->rdev;
+	struct regulator_ops	*ops = rdev->desc->ops;
+	int old_sel = -1;
+	int new_sel = -1;
+	int voltage;
+	int i;
+
+	/* Currently requires operations to do this */
+	if (!ops->list_voltage || !ops->set_voltage_time_sel
+	    || !rdev->desc->n_voltages)
+		return -EINVAL;
+
+	for (i = 0; i < rdev->desc->n_voltages; i++) {
+		/* We only look for exact voltage matches here */
+		voltage = regulator_list_voltage(regulator, i);
+		if (voltage < 0)
+			return -EINVAL;
+		if (voltage == 0)
+			continue;
+		if (voltage == old_uV)
+			old_sel = i;
+		if (voltage == new_uV)
+			new_sel = i;
+	}
+
+	if (old_sel < 0 || new_sel < 0)
+		return -EINVAL;
+
+	return ops->set_voltage_time_sel(rdev, old_sel, new_sel);
+}
+EXPORT_SYMBOL_GPL(regulator_set_voltage_time);
+
+/**
  * regulator_sync_voltage - re-apply last regulator output voltage
  * @regulator: regulator source
  *
@@ -2565,8 +2635,11 @@
 			init_data->consumer_supplies[i].dev,
 			init_data->consumer_supplies[i].dev_name,
 			init_data->consumer_supplies[i].supply);
-		if (ret < 0)
+		if (ret < 0) {
+			dev_err(dev, "Failed to set supply %s\n",
+				init_data->consumer_supplies[i].supply);
 			goto unset_supplies;
+		}
 	}
 
 	list_add(&rdev->list, &regulator_list);
@@ -2653,6 +2726,47 @@
 EXPORT_SYMBOL_GPL(regulator_suspend_prepare);
 
 /**
+ * regulator_suspend_finish - resume regulators from system wide suspend
+ *
+ * Turn on regulators that might be turned off by regulator_suspend_prepare
+ * and that should be turned on according to the regulators properties.
+ */
+int regulator_suspend_finish(void)
+{
+	struct regulator_dev *rdev;
+	int ret = 0, error;
+
+	mutex_lock(&regulator_list_mutex);
+	list_for_each_entry(rdev, &regulator_list, list) {
+		struct regulator_ops *ops = rdev->desc->ops;
+
+		mutex_lock(&rdev->mutex);
+		if ((rdev->use_count > 0  || rdev->constraints->always_on) &&
+				ops->enable) {
+			error = ops->enable(rdev);
+			if (error)
+				ret = error;
+		} else {
+			if (!has_full_constraints)
+				goto unlock;
+			if (!ops->disable)
+				goto unlock;
+			if (ops->is_enabled && !ops->is_enabled(rdev))
+				goto unlock;
+
+			error = ops->disable(rdev);
+			if (error)
+				ret = error;
+		}
+unlock:
+		mutex_unlock(&rdev->mutex);
+	}
+	mutex_unlock(&regulator_list_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regulator_suspend_finish);
+
+/**
  * regulator_has_full_constraints - the system has fully specified constraints
  *
  * Calling this function will cause the regulator API to disable all
diff --git a/drivers/regulator/max8997.c b/drivers/regulator/max8997.c
index 01ef7e9..77e0cfb 100644
--- a/drivers/regulator/max8997.c
+++ b/drivers/regulator/max8997.c
@@ -1185,6 +1185,7 @@
 	{ "max8997-pmic", 0},
 	{ },
 };
+MODULE_DEVICE_TABLE(platform, max8997_pmic_id);
 
 static struct platform_driver max8997_pmic_driver = {
 	.driver = {
diff --git a/drivers/regulator/max8998.c b/drivers/regulator/max8998.c
index 0ec49ca..4341026 100644
--- a/drivers/regulator/max8998.c
+++ b/drivers/regulator/max8998.c
@@ -887,6 +887,7 @@
 	{ "lp3974-pmic", TYPE_LP3974 },
 	{ }
 };
+MODULE_DEVICE_TABLE(platform, max8998_pmic_id);
 
 static struct platform_driver max8998_pmic_driver = {
 	.driver = {
diff --git a/drivers/regulator/tps6524x-regulator.c b/drivers/regulator/tps6524x-regulator.c
index 176a6be..9166aa0 100644
--- a/drivers/regulator/tps6524x-regulator.c
+++ b/drivers/regulator/tps6524x-regulator.c
@@ -596,7 +596,7 @@
 	.get_current_limit	= get_current_limit,
 };
 
-static int __devexit pmic_remove(struct spi_device *spi)
+static int pmic_remove(struct spi_device *spi)
 {
 	struct tps6524x *hw = spi_get_drvdata(spi);
 	int i;
diff --git a/drivers/regulator/wm831x-dcdc.c b/drivers/regulator/wm831x-dcdc.c
index 06df898..e93453b 100644
--- a/drivers/regulator/wm831x-dcdc.c
+++ b/drivers/regulator/wm831x-dcdc.c
@@ -565,9 +565,8 @@
 	}
 
 	irq = platform_get_irq_byname(pdev, "UV");
-	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_uv_irq,
-				 IRQF_TRIGGER_RISING, dcdc->name,
-				 dcdc);
+	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
+				   IRQF_TRIGGER_RISING, dcdc->name, dcdc);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
 			irq, ret);
@@ -575,9 +574,8 @@
 	}
 
 	irq = platform_get_irq_byname(pdev, "HC");
-	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_oc_irq,
-				 IRQF_TRIGGER_RISING, dcdc->name,
-				 dcdc);
+	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_oc_irq,
+				   IRQF_TRIGGER_RISING, dcdc->name, dcdc);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request HC IRQ %d: %d\n",
 			irq, ret);
@@ -589,7 +587,7 @@
 	return 0;
 
 err_uv:
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
 err_regulator:
 	regulator_unregister(dcdc->regulator);
 err:
@@ -606,8 +604,8 @@
 
 	platform_set_drvdata(pdev, NULL);
 
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "HC"), dcdc);
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(platform_get_irq_byname(pdev, "HC"), dcdc);
+	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
 	regulator_unregister(dcdc->regulator);
 	if (dcdc->dvs_gpio)
 		gpio_free(dcdc->dvs_gpio);
@@ -756,9 +754,8 @@
 	}
 
 	irq = platform_get_irq_byname(pdev, "UV");
-	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_uv_irq,
-				 IRQF_TRIGGER_RISING, dcdc->name,
-				 dcdc);
+	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
+				   IRQF_TRIGGER_RISING,	dcdc->name, dcdc);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
 			irq, ret);
@@ -783,7 +780,7 @@
 
 	platform_set_drvdata(pdev, NULL);
 
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
 	regulator_unregister(dcdc->regulator);
 	kfree(dcdc);
 
@@ -885,9 +882,9 @@
 	}
 
 	irq = platform_get_irq_byname(pdev, "UV");
-	ret = wm831x_request_irq(wm831x, irq, wm831x_dcdc_uv_irq,
-				 IRQF_TRIGGER_RISING, dcdc->name,
-				 dcdc);
+	ret = request_threaded_irq(irq, NULL, wm831x_dcdc_uv_irq,
+				   IRQF_TRIGGER_RISING, dcdc->name,
+				   dcdc);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
 			irq, ret);
@@ -908,11 +905,10 @@
 static __devexit int wm831x_boostp_remove(struct platform_device *pdev)
 {
 	struct wm831x_dcdc *dcdc = platform_get_drvdata(pdev);
-	struct wm831x *wm831x = dcdc->wm831x;
 
 	platform_set_drvdata(pdev, NULL);
 
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), dcdc);
+	free_irq(platform_get_irq_byname(pdev, "UV"), dcdc);
 	regulator_unregister(dcdc->regulator);
 	kfree(dcdc);
 
diff --git a/drivers/regulator/wm831x-isink.c b/drivers/regulator/wm831x-isink.c
index 6c446cd..01f27c7 100644
--- a/drivers/regulator/wm831x-isink.c
+++ b/drivers/regulator/wm831x-isink.c
@@ -198,9 +198,8 @@
 	}
 
 	irq = platform_get_irq(pdev, 0);
-	ret = wm831x_request_irq(wm831x, irq, wm831x_isink_irq,
-				 IRQF_TRIGGER_RISING, isink->name,
-				 isink);
+	ret = request_threaded_irq(irq, NULL, wm831x_isink_irq,
+				   IRQF_TRIGGER_RISING, isink->name, isink);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request ISINK IRQ %d: %d\n",
 			irq, ret);
@@ -221,11 +220,10 @@
 static __devexit int wm831x_isink_remove(struct platform_device *pdev)
 {
 	struct wm831x_isink *isink = platform_get_drvdata(pdev);
-	struct wm831x *wm831x = isink->wm831x;
 
 	platform_set_drvdata(pdev, NULL);
 
-	wm831x_free_irq(wm831x, platform_get_irq(pdev, 0), isink);
+	free_irq(platform_get_irq(pdev, 0), isink);
 
 	regulator_unregister(isink->regulator);
 	kfree(isink);
diff --git a/drivers/regulator/wm831x-ldo.c b/drivers/regulator/wm831x-ldo.c
index c94fc5b..2220cf8 100644
--- a/drivers/regulator/wm831x-ldo.c
+++ b/drivers/regulator/wm831x-ldo.c
@@ -354,9 +354,9 @@
 	}
 
 	irq = platform_get_irq_byname(pdev, "UV");
-	ret = wm831x_request_irq(wm831x, irq, wm831x_ldo_uv_irq,
-				 IRQF_TRIGGER_RISING, ldo->name,
-				 ldo);
+	ret = request_threaded_irq(irq, NULL, wm831x_ldo_uv_irq,
+				   IRQF_TRIGGER_RISING, ldo->name,
+				   ldo);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
 			irq, ret);
@@ -377,11 +377,10 @@
 static __devexit int wm831x_gp_ldo_remove(struct platform_device *pdev)
 {
 	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
-	struct wm831x *wm831x = ldo->wm831x;
 
 	platform_set_drvdata(pdev, NULL);
 
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), ldo);
+	free_irq(platform_get_irq_byname(pdev, "UV"), ldo);
 	regulator_unregister(ldo->regulator);
 	kfree(ldo);
 
@@ -619,9 +618,8 @@
 	}
 
 	irq = platform_get_irq_byname(pdev, "UV");
-	ret = wm831x_request_irq(wm831x, irq, wm831x_ldo_uv_irq,
-				 IRQF_TRIGGER_RISING, ldo->name,
-				 ldo);
+	ret = request_threaded_irq(irq, NULL, wm831x_ldo_uv_irq,
+				   IRQF_TRIGGER_RISING, ldo->name, ldo);
 	if (ret != 0) {
 		dev_err(&pdev->dev, "Failed to request UV IRQ %d: %d\n",
 			irq, ret);
@@ -642,9 +640,8 @@
 static __devexit int wm831x_aldo_remove(struct platform_device *pdev)
 {
 	struct wm831x_ldo *ldo = platform_get_drvdata(pdev);
-	struct wm831x *wm831x = ldo->wm831x;
 
-	wm831x_free_irq(wm831x, platform_get_irq_byname(pdev, "UV"), ldo);
+	free_irq(platform_get_irq_byname(pdev, "UV"), ldo);
 	regulator_unregister(ldo->regulator);
 	kfree(ldo);
 
diff --git a/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c b/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
index 842cd92..289729d 100644
--- a/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
+++ b/drivers/staging/westbridge/astoria/block/cyasblkdev_block.c
@@ -1191,7 +1191,7 @@
 		bd->user_disk_1->first_minor = (devidx + 1) << CYASBLKDEV_SHIFT;
 		bd->user_disk_1->minors = 8;
 		bd->user_disk_1->fops = &cyasblkdev_bdops;
-		bd->user_disk_0->events = DISK_EVENT_MEDIA_CHANGE;
+		bd->user_disk_1->events = DISK_EVENT_MEDIA_CHANGE;
 		bd->user_disk_1->private_data = bd;
 		bd->user_disk_1->queue = bd->queue.queue;
 		bd->dbgprn_flags = DBGPRN_RD_RQ;
diff --git a/fs/inode.c b/fs/inode.c
index 05a1f75..5f4e11a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1167,7 +1167,7 @@
  * Note: I_NEW is not waited upon so you have to be very careful what you do
  * with the returned inode.  You probably should be using ilookup5() instead.
  *
- * Note: @test is called with the inode_hash_lock held, so can't sleep.
+ * Note2: @test is called with the inode_hash_lock held, so can't sleep.
  */
 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4f9cc04..3e93cdd 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
  *   is used to release xattr name/value pair and detach from c->xattrindex.
  * reclaim_xattr_datum(c)
  *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
- *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
  *   is hard coded as 32KiB.
  * do_verify_xattr_datum(c, xd)
  *   is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index ad92bf7..9166fcb 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -192,13 +192,15 @@
 	auth   = rpcauth_create(flavor, clone);
 	if (!auth) {
 		flavor = -EIO;
-		goto out;
+		goto out_shutdown;
 	}
 	err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode,
 						  &path->dentry->d_name,
 						  fh, fattr);
 	if (err < 0)
 		flavor = err;
+out_shutdown:
+	rpc_shutdown_client(clone);
 out:
 	return flavor;
 }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 87a593c..c80add6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -135,14 +135,14 @@
 		nfs_unlock_request(req);
 }
 
-/**
+/*
  * nfs_clear_request - Free up all resources allocated to the request
  * @req:
  *
  * Release page and open context resources associated with a read/write
  * request after it has completed.
  */
-void nfs_clear_request(struct nfs_page *req)
+static void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
 	struct nfs_open_context *ctx = req->wb_context;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 85d7525..af0c627 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -389,11 +389,8 @@
 	spin_lock(&inode->i_lock);
 	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
 	BUG_ON(error);
-	if (!nfsi->npages) {
-		igrab(inode);
-		if (nfs_have_delegation(inode, FMODE_WRITE))
-			nfsi->change_attr++;
-	}
+	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
+		nfsi->change_attr++;
 	set_bit(PG_MAPPED, &req->wb_flags);
 	SetPagePrivate(req->wb_page);
 	set_page_private(req->wb_page, (unsigned long)req);
@@ -423,11 +420,7 @@
 	clear_bit(PG_MAPPED, &req->wb_flags);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
 	nfsi->npages--;
-	if (!nfsi->npages) {
-		spin_unlock(&inode->i_lock);
-		iput(inode);
-	} else
-		spin_unlock(&inode->i_lock);
+	spin_unlock(&inode->i_lock);
 	nfs_release_request(req);
 }
 
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 90f2729..e913ad1 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -24,7 +24,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e4984e2..b27a0d8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -30,7 +30,6 @@
 #include <linux/swap.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -50,6 +49,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -886,8 +886,7 @@
 	struct ocfs2_extent_block *eb =
 		(struct ocfs2_extent_block *)bh->b_data;
 
-	mlog(0, "Validating extent block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -965,8 +964,6 @@
 	struct buffer_head *eb_bh = NULL;
 	u64 last_eb_blk = 0;
 
-	mlog_entry_void();
-
 	el = et->et_root_el;
 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
@@ -987,7 +984,7 @@
 bail:
 	brelse(eb_bh);
 
-	mlog_exit(retval);
+	trace_ocfs2_num_free_extents(retval);
 	return retval;
 }
 
@@ -1010,8 +1007,6 @@
 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
 	struct ocfs2_extent_block *eb;
 
-	mlog_entry_void();
-
 	count = 0;
 	while (count < wanted) {
 		status = ocfs2_claim_metadata(handle,
@@ -1074,8 +1069,8 @@
 			brelse(bhs[i]);
 			bhs[i] = NULL;
 		}
+		mlog_errno(status);
 	}
-	mlog_exit(status);
 	return status;
 }
 
@@ -1173,8 +1168,6 @@
 	struct ocfs2_extent_list  *el;
 	u32 new_cpos, root_end;
 
-	mlog_entry_void();
-
 	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
 	if (eb_bh) {
@@ -1200,8 +1193,11 @@
 	 * from new_cpos).
 	 */
 	if (root_end > new_cpos) {
-		mlog(0, "adjust the cluster end from %u to %u\n",
-		     root_end, new_cpos);
+		trace_ocfs2_adjust_rightmost_branch(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			root_end, new_cpos);
+
 		status = ocfs2_adjust_rightmost_branch(handle, et);
 		if (status) {
 			mlog_errno(status);
@@ -1332,7 +1328,6 @@
 		kfree(new_eb_bhs);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1353,8 +1348,6 @@
 	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
-	mlog_entry_void();
-
 	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
 					   &new_eb_bh);
 	if (status < 0) {
@@ -1415,7 +1408,6 @@
 bail:
 	brelse(new_eb_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1446,8 +1438,6 @@
 	struct buffer_head *bh = NULL;
 	struct buffer_head *lowest_bh = NULL;
 
-	mlog_entry_void();
-
 	*target_bh = NULL;
 
 	el = et->et_root_el;
@@ -1503,7 +1493,6 @@
 bail:
 	brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1540,7 +1529,10 @@
 	 * another tree level */
 	if (shift) {
 		BUG_ON(bh);
-		mlog(0, "need to shift tree depth (current = %d)\n", depth);
+		trace_ocfs2_grow_tree(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			depth);
 
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
@@ -1570,7 +1562,6 @@
 
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
-	mlog(0, "add branch. bh = %p\n", bh);
 	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
 			       meta_ac);
 	if (ret < 0) {
@@ -1645,8 +1636,9 @@
 	}
 	insert_index = i;
 
-	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
-	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+	trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
+				has_empty, next_free,
+				le16_to_cpu(el->l_count));
 
 	BUG_ON(insert_index < 0);
 	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
@@ -2059,7 +2051,7 @@
 	left_el = path_leaf_el(left_path);
 	right_el = path_leaf_el(right_path);
 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
-		mlog(0, "Adjust records at index %u\n", i);
+		trace_ocfs2_complete_edge_insert(i);
 
 		/*
 		 * One nice property of knowing that all of these
@@ -2389,7 +2381,9 @@
 		goto out;
 	}
 
-	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+	trace_ocfs2_rotate_tree_right(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		insert_cpos, cpos);
 
 	/*
 	 * What we want to do here is:
@@ -2418,8 +2412,10 @@
 	 * rotating subtrees.
 	 */
 	while (cpos && insert_cpos <= cpos) {
-		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
-		     insert_cpos, cpos);
+		trace_ocfs2_rotate_tree_right(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			insert_cpos, cpos);
 
 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
 		if (ret) {
@@ -2461,10 +2457,10 @@
 
 		start = ocfs2_find_subtree_root(et, left_path, right_path);
 
-		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
-		     start,
-		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
-		     right_path->p_tree_depth);
+		trace_ocfs2_rotate_subtree(start,
+			(unsigned long long)
+			right_path->p_node[start].bh->b_blocknr,
+			right_path->p_tree_depth);
 
 		ret = ocfs2_extend_rotate_transaction(handle, start,
 						      orig_credits, right_path);
@@ -2964,8 +2960,7 @@
 		subtree_root = ocfs2_find_subtree_root(et, left_path,
 						       right_path);
 
-		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
-		     subtree_root,
+		trace_ocfs2_rotate_subtree(subtree_root,
 		     (unsigned long long)
 		     right_path->p_node[subtree_root].bh->b_blocknr,
 		     right_path->p_tree_depth);
@@ -3989,9 +3984,11 @@
 			goto out;
 		}
 
-		mlog(0, "Append may need a left path update. cpos: %u, "
-		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
-		     left_cpos);
+		trace_ocfs2_append_rec_to_path(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			le32_to_cpu(insert_rec->e_cpos),
+			left_cpos);
 
 		/*
 		 * No need to worry if the append is already in the
@@ -4562,7 +4559,7 @@
 					      ocfs2_et_get_last_eb_blk(et),
 					      &bh);
 		if (ret) {
-			mlog_exit(ret);
+			mlog_errno(ret);
 			goto out;
 		}
 		eb = (struct ocfs2_extent_block *) bh->b_data;
@@ -4678,9 +4675,9 @@
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
-	mlog(0, "add %u clusters at position %u to owner %llu\n",
-	     new_clusters, cpos,
-	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+	trace_ocfs2_insert_extent_start(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, new_clusters);
 
 	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
@@ -4700,11 +4697,9 @@
 		goto bail;
 	}
 
-	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
-	     "Insert.contig_index: %d, Insert.free_records: %d, "
-	     "Insert.tree_depth: %d\n",
-	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
-	     free_records, insert.ins_tree_depth);
+	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+				  insert.ins_contig_index, free_records,
+				  insert.ins_tree_depth);
 
 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
 		status = ocfs2_grow_tree(handle, et,
@@ -4726,7 +4721,6 @@
 bail:
 	brelse(last_eb_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -4746,7 +4740,7 @@
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret)
 {
-	int status = 0;
+	int status = 0, err = 0;
 	int free_extents;
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
@@ -4773,14 +4767,14 @@
 	 * 2) we are so fragmented, we've needed to add metadata too
 	 *    many times. */
 	if (!free_extents && !meta_ac) {
-		mlog(0, "we haven't reserved any metadata!\n");
+		err = -1;
 		status = -EAGAIN;
 		reason = RESTART_META;
 		goto leave;
 	} else if ((!free_extents)
 		   && (ocfs2_alloc_context_bits_left(meta_ac)
 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
-		mlog(0, "filesystem is really fragmented...\n");
+		err = -2;
 		status = -EAGAIN;
 		reason = RESTART_META;
 		goto leave;
@@ -4805,9 +4799,9 @@
 	}
 
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
-	     num_bits, bit_off,
-	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+	trace_ocfs2_add_clusters_in_btree(
+	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+	     bit_off, num_bits);
 	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
 				     num_bits, flags, meta_ac);
 	if (status < 0) {
@@ -4821,16 +4815,15 @@
 	*logical_offset += num_bits;
 
 	if (clusters_to_add) {
-		mlog(0, "need to alloc once more, wanted = %u\n",
-		     clusters_to_add);
+		err = clusters_to_add;
 		status = -EAGAIN;
 		reason = RESTART_TRANS;
 	}
 
 leave:
-	mlog_exit(status);
 	if (reason_ret)
 		*reason_ret = reason;
+	trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
 	return status;
 }
 
@@ -5039,7 +5032,7 @@
 					      ocfs2_et_get_last_eb_blk(et),
 					      &last_eb_bh);
 		if (ret) {
-			mlog_exit(ret);
+			mlog_errno(ret);
 			goto out;
 		}
 
@@ -5056,9 +5049,9 @@
 
 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
 
-	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
-	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
-	     ctxt.c_split_covers_rec);
+	trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
+				 ctxt.c_has_empty_extent,
+				 ctxt.c_split_covers_rec);
 
 	if (ctxt.c_contig_type == CONTIG_NONE) {
 		if (ctxt.c_split_covers_rec)
@@ -5192,8 +5185,9 @@
 {
 	int ret;
 
-	mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
-	     inode->i_ino, cpos, len, phys);
+	trace_ocfs2_mark_extent_written(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		cpos, len, phys);
 
 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
@@ -5512,11 +5506,10 @@
 
 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
 
-	mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
-	     "(cpos %u, len %u)\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
-	     cpos, len, index,
-	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+	trace_ocfs2_remove_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, len, index, le32_to_cpu(rec->e_cpos),
+		ocfs2_rec_clusters(el, rec));
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
@@ -5795,9 +5788,6 @@
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	mlog_entry("start_blk = %llu, num_clusters = %u\n",
-		   (unsigned long long)start_blk, num_clusters);
-
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5834,10 +5824,9 @@
 		goto bail;
 	}
 
-	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
-	     "%llu (index = %d)\n", num_clusters, start_cluster,
-	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
-
+	trace_ocfs2_truncate_log_append(
+		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
+		start_cluster, num_clusters);
 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
 		/*
 		 * Move index back to the record we are coalescing with.
@@ -5846,9 +5835,10 @@
 		index--;
 
 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
-		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
-		     index, le32_to_cpu(tl->tl_recs[index].t_start),
-		     num_clusters);
+		trace_ocfs2_truncate_log_append(
+			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+			index, le32_to_cpu(tl->tl_recs[index].t_start),
+			num_clusters);
 	} else {
 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
 		tl->tl_used = cpu_to_le16(index + 1);
@@ -5859,7 +5849,6 @@
 
 	osb->truncated_clusters += num_clusters;
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -5878,8 +5867,6 @@
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
 
-	mlog_entry_void();
-
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
 	tl = &di->id2.i_dealloc;
 	i = le16_to_cpu(tl->tl_used) - 1;
@@ -5915,8 +5902,9 @@
 		/* if start_blk is not set, we ignore the record as
 		 * invalid. */
 		if (start_blk) {
-			mlog(0, "free record %d, start = %u, clusters = %u\n",
-			     i, le32_to_cpu(rec.t_start), num_clusters);
+			trace_ocfs2_replay_truncate_records(
+				(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+				i, le32_to_cpu(rec.t_start), num_clusters);
 
 			status = ocfs2_free_clusters(handle, data_alloc_inode,
 						     data_alloc_bh, start_blk,
@@ -5932,7 +5920,6 @@
 	osb->truncated_clusters = 0;
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -5949,8 +5936,6 @@
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	mlog_entry_void();
-
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -5962,8 +5947,9 @@
 
 	tl = &di->id2.i_dealloc;
 	num_to_flush = le16_to_cpu(tl->tl_used);
-	mlog(0, "Flush %u records from truncate log #%llu\n",
-	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
+	trace_ocfs2_flush_truncate_log(
+		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+		num_to_flush);
 	if (!num_to_flush) {
 		status = 0;
 		goto out;
@@ -6009,7 +5995,6 @@
 	iput(data_alloc_inode);
 
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -6032,15 +6017,11 @@
 		container_of(work, struct ocfs2_super,
 			     osb_truncate_log_wq.work);
 
-	mlog_entry_void();
-
 	status = ocfs2_flush_truncate_log(osb);
 	if (status < 0)
 		mlog_errno(status);
 	else
 		ocfs2_init_steal_slots(osb);
-
-	mlog_exit(status);
 }
 
 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
@@ -6086,7 +6067,6 @@
 	*tl_inode = inode;
 	*tl_bh    = bh;
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -6106,7 +6086,7 @@
 
 	*tl_copy = NULL;
 
-	mlog(0, "recover truncate log from slot %d\n", slot_num);
+	trace_ocfs2_begin_truncate_log_recovery(slot_num);
 
 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
 	if (status < 0) {
@@ -6123,8 +6103,7 @@
 
 	tl = &di->id2.i_dealloc;
 	if (le16_to_cpu(tl->tl_used)) {
-		mlog(0, "We'll have %u logs to recover\n",
-		     le16_to_cpu(tl->tl_used));
+		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
 
 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
 		if (!(*tl_copy)) {
@@ -6157,9 +6136,9 @@
 	if (status < 0 && (*tl_copy)) {
 		kfree(*tl_copy);
 		*tl_copy = NULL;
+		mlog_errno(status);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -6174,8 +6153,6 @@
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_truncate_log *tl;
 
-	mlog_entry_void();
-
 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
 		return -EINVAL;
@@ -6183,8 +6160,9 @@
 
 	tl = &tl_copy->id2.i_dealloc;
 	num_recs = le16_to_cpu(tl->tl_used);
-	mlog(0, "cleanup %u records from %llu\n", num_recs,
-	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
+	trace_ocfs2_complete_truncate_log_recovery(
+		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
+		num_recs);
 
 	mutex_lock(&tl_inode->i_mutex);
 	for(i = 0; i < num_recs; i++) {
@@ -6219,7 +6197,6 @@
 bail_up:
 	mutex_unlock(&tl_inode->i_mutex);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -6228,8 +6205,6 @@
 	int status;
 	struct inode *tl_inode = osb->osb_tl_inode;
 
-	mlog_entry_void();
-
 	if (tl_inode) {
 		cancel_delayed_work(&osb->osb_truncate_log_wq);
 		flush_workqueue(ocfs2_wq);
@@ -6241,8 +6216,6 @@
 		brelse(osb->osb_tl_bh);
 		iput(osb->osb_tl_inode);
 	}
-
-	mlog_exit_void();
 }
 
 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -6251,8 +6224,6 @@
 	struct inode *tl_inode = NULL;
 	struct buffer_head *tl_bh = NULL;
 
-	mlog_entry_void();
-
 	status = ocfs2_get_truncate_log_info(osb,
 					     osb->slot_num,
 					     &tl_inode,
@@ -6268,7 +6239,6 @@
 	osb->osb_tl_bh    = tl_bh;
 	osb->osb_tl_inode = tl_inode;
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -6350,8 +6320,8 @@
 		else
 			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
 							      head->free_bit);
-		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
-		     head->free_bit, (unsigned long long)head->free_blk);
+		trace_ocfs2_free_cached_blocks(
+		     (unsigned long long)head->free_blk, head->free_bit);
 
 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
 					       head->free_bit, bg_blkno, 1);
@@ -6404,8 +6374,7 @@
 		return ret;
 	}
 
-	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
-	     bit, (unsigned long long)blkno);
+	trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
 
 	item->free_blk = blkno;
 	item->free_bit = bit;
@@ -6480,8 +6449,8 @@
 		fl = ctxt->c_first_suballocator;
 
 		if (fl->f_first) {
-			mlog(0, "Free items: (type %u, slot %d)\n",
-			     fl->f_inode_type, fl->f_slot);
+			trace_ocfs2_run_deallocs(fl->f_inode_type,
+						 fl->f_slot);
 			ret2 = ocfs2_free_cached_blocks(osb,
 							fl->f_inode_type,
 							fl->f_slot,
@@ -6558,8 +6527,9 @@
 		goto out;
 	}
 
-	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
-	     type, slot, bit, (unsigned long long)blkno);
+	trace_ocfs2_cache_block_dealloc(type, slot,
+					(unsigned long long)suballoc,
+					(unsigned long long)blkno, bit);
 
 	item->free_bg = suballoc;
 	item->free_blk = blkno;
@@ -7005,8 +6975,6 @@
 	struct ocfs2_extent_tree et;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 
-	mlog_entry_void();
-
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
@@ -7041,8 +7009,11 @@
 		goto bail;
 	}
 
-	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
-	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
+	trace_ocfs2_commit_truncate(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		new_highest_cpos,
+		OCFS2_I(inode)->ip_clusters,
+		path->p_tree_depth);
 
 	/*
 	 * By now, el will point to the extent list on the bottom most
@@ -7136,7 +7107,6 @@
 
 	ocfs2_free_path(path);
 
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index daea035..ac97bca 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -45,6 +44,7 @@
 #include "super.h"
 #include "symlink.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -59,8 +59,9 @@
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	void *kaddr;
 
-	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
-		   (unsigned long long)iblock, bh_result, create);
+	trace_ocfs2_symlink_get_block(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)iblock, bh_result, create);
 
 	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
 
@@ -123,7 +124,6 @@
 bail:
 	brelse(bh);
 
-	mlog_exit(err);
 	return err;
 }
 
@@ -136,8 +136,8 @@
 	u64 p_blkno, count, past_eof;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
-		   (unsigned long long)iblock, bh_result, create);
+	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			      (unsigned long long)iblock, bh_result, create);
 
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -199,8 +199,9 @@
 	}
 
 	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-	     (unsigned long long)past_eof);
+
+	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				  (unsigned long long)past_eof);
 	if (create && (iblock >= past_eof))
 		set_buffer_new(bh_result);
 
@@ -208,7 +209,6 @@
 	if (err < 0)
 		err = -EIO;
 
-	mlog_exit(err);
 	return err;
 }
 
@@ -278,7 +278,8 @@
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 
-	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+			     (page ? page->index : 0));
 
 	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
 	if (ret != 0) {
@@ -323,7 +324,6 @@
 out:
 	if (unlock)
 		unlock_page(page);
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -396,15 +396,11 @@
  */
 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int ret;
+	trace_ocfs2_writepage(
+		(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+		page->index);
 
-	mlog_entry("(0x%p)\n", page);
-
-	ret = block_write_full_page(page, ocfs2_get_block, wbc);
-
-	mlog_exit(ret);
-
-	return ret;
+	return block_write_full_page(page, ocfs2_get_block, wbc);
 }
 
 /* Taken from ext3. We don't necessarily need the full blown
@@ -450,7 +446,8 @@
 	int err = 0;
 	struct inode *inode = mapping->host;
 
-	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			 (unsigned long long)block);
 
 	/* We don't need to lock journal system files, since they aren't
 	 * accessed concurrently from multiple nodes.
@@ -484,8 +481,6 @@
 bail:
 	status = err ? 0 : p_blkno;
 
-	mlog_exit((int)status);
-
 	return status;
 }
 
@@ -616,9 +611,6 @@
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
-	int ret;
-
-	mlog_entry_void();
 
 	/*
 	 * Fallback to buffered I/O if we see an inode without
@@ -631,13 +623,10 @@
 	if (i_size_read(inode) <= offset)
 		return 0;
 
-	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
-				   iov, offset, nr_segs,
-				   ocfs2_direct_IO_get_blocks,
-				   ocfs2_dio_end_io, NULL, 0);
-
-	mlog_exit(ret);
-	return ret;
+	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				    iov, offset, nr_segs,
+				    ocfs2_direct_IO_get_blocks,
+				    ocfs2_dio_end_io, NULL, 0);
 }
 
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -1026,6 +1015,12 @@
 	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
 					&cluster_start, &cluster_end);
 
+	/* treat the write as new if the a hole/lseek spanned across
+	 * the page boundary.
+	 */
+	new = new | ((i_size_read(inode) <= page_offset(page)) &&
+			(page_offset(page) <= user_pos));
+
 	if (page == wc->w_target_page) {
 		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
 		map_to = map_from + user_len;
@@ -1534,9 +1529,9 @@
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = NULL;
 
-	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
-	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
-	     oi->ip_dyn_features);
+	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+					     len, (unsigned long long)pos,
+					     oi->ip_dyn_features);
 
 	/*
 	 * Handle inodes which already have inline data 1st.
@@ -1739,6 +1734,13 @@
 
 	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
+	trace_ocfs2_write_begin_nolock(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(long long)i_size_read(inode),
+			le32_to_cpu(di->i_clusters),
+			pos, len, flags, mmap_page,
+			clusters_to_alloc, extents_to_split);
+
 	/*
 	 * We set w_target_from, w_target_to here so that
 	 * ocfs2_write_end() knows which range in the target page to
@@ -1751,12 +1753,6 @@
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
-		     " clusters_to_add = %u, extents_to_split = %u\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
-		     clusters_to_alloc, extents_to_split);
-
 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 					      wc->w_di_bh);
 		ret = ocfs2_lock_allocators(inode, &et,
@@ -1938,8 +1934,8 @@
 	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	mlog(0, "Data written to inode at offset %llu. "
-	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+	trace_ocfs2_write_end_inline(
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)pos, *copied,
 	     le16_to_cpu(di->id2.i_data.id_count),
 	     le16_to_cpu(di->i_dyn_features));
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f9d5d3f..5d18ad1 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -35,8 +35,8 @@
 #include "inode.h"
 #include "journal.h"
 #include "uptodate.h"
-
 #include "buffer_head_io.h"
+#include "ocfs2_trace.h"
 
 /*
  * Bits on bh->b_state used by ocfs2.
@@ -55,8 +55,7 @@
 {
 	int ret = 0;
 
-	mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
-		   (unsigned long long)bh->b_blocknr, ci);
+	trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
 
 	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
 	BUG_ON(buffer_jbd(bh));
@@ -66,6 +65,7 @@
 	 * can get modified during recovery even if read-only. */
 	if (ocfs2_is_hard_readonly(osb)) {
 		ret = -EROFS;
+		mlog_errno(ret);
 		goto out;
 	}
 
@@ -91,11 +91,11 @@
 		 * uptodate. */
 		ret = -EIO;
 		put_bh(bh);
+		mlog_errno(ret);
 	}
 
 	ocfs2_metadata_cache_io_unlock(ci);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -106,10 +106,10 @@
 	unsigned int i;
 	struct buffer_head *bh;
 
-	if (!nr) {
-		mlog(ML_BH_IO, "No buffers will be read!\n");
+	trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
+
+	if (!nr)
 		goto bail;
-	}
 
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
@@ -123,10 +123,8 @@
 		bh = bhs[i];
 
 		if (buffer_jbd(bh)) {
-			mlog(ML_BH_IO,
-			     "trying to sync read a jbd "
-			     "managed bh (blocknr = %llu), skipping\n",
-			     (unsigned long long)bh->b_blocknr);
+			trace_ocfs2_read_blocks_sync_jbd(
+					(unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
@@ -186,8 +184,7 @@
 	struct buffer_head *bh;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 
-	mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
-		   ci, (unsigned long long)block, nr, flags);
+	trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
 
 	BUG_ON(!ci);
 	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
@@ -207,7 +204,6 @@
 	}
 
 	if (nr == 0) {
-		mlog(ML_BH_IO, "No buffers will be read!\n");
 		status = 0;
 		goto bail;
 	}
@@ -251,8 +247,7 @@
 		 */
 
 		if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
-			mlog(ML_UPTODATE,
-			     "bh (%llu), owner %llu not uptodate\n",
+			trace_ocfs2_read_blocks_from_disk(
 			     (unsigned long long)bh->b_blocknr,
 			     (unsigned long long)ocfs2_metadata_cache_owner(ci));
 			/* We're using ignore_cache here to say
@@ -260,11 +255,10 @@
 			ignore_cache = 1;
 		}
 
+		trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
+			ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
+
 		if (buffer_jbd(bh)) {
-			if (ignore_cache)
-				mlog(ML_BH_IO, "trying to sync read a jbd "
-					       "managed bh (blocknr = %llu)\n",
-				     (unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
@@ -272,9 +266,6 @@
 			if (buffer_dirty(bh)) {
 				/* This should probably be a BUG, or
 				 * at least return an error. */
-				mlog(ML_BH_IO, "asking me to sync read a dirty "
-					       "buffer! (blocknr = %llu)\n",
-				     (unsigned long long)bh->b_blocknr);
 				continue;
 			}
 
@@ -367,14 +358,11 @@
 	}
 	ocfs2_metadata_cache_io_unlock(ci);
 
-	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
-	     (unsigned long long)block, nr,
-	     ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
-	     flags);
+	trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
+				    flags, ignore_cache);
 
 bail:
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -408,13 +396,12 @@
 	int ret = 0;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
-	mlog_entry_void();
-
 	BUG_ON(buffer_jbd(bh));
 	ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
 
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
 		ret = -EROFS;
+		mlog_errno(ret);
 		goto out;
 	}
 
@@ -434,9 +421,9 @@
 	if (!buffer_uptodate(bh)) {
 		ret = -EIO;
 		put_bh(bh);
+		mlog_errno(ret);
 	}
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1adab28..2461eb3 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1654,8 +1654,6 @@
 	struct o2hb_disk_slot *slot;
 	struct o2hb_disk_heartbeat_block *hb_block;
 
-	mlog_entry_void();
-
 	ret = o2hb_read_slots(reg, reg->hr_blocks);
 	if (ret) {
 		mlog_errno(ret);
@@ -1677,7 +1675,6 @@
 	}
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 6c61771..07ac24f 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
 
 struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
 EXPORT_SYMBOL_GPL(mlog_and_bits);
-struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
 EXPORT_SYMBOL_GPL(mlog_not_bits);
 
 static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -80,8 +80,6 @@
 }
 
 static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
-	define_mask(ENTRY),
-	define_mask(EXIT),
 	define_mask(TCP),
 	define_mask(MSG),
 	define_mask(SOCKET),
@@ -93,27 +91,11 @@
 	define_mask(DLM_THREAD),
 	define_mask(DLM_MASTER),
 	define_mask(DLM_RECOVERY),
-	define_mask(AIO),
-	define_mask(JOURNAL),
-	define_mask(DISK_ALLOC),
-	define_mask(SUPER),
-	define_mask(FILE_IO),
-	define_mask(EXTENT_MAP),
 	define_mask(DLM_GLUE),
-	define_mask(BH_IO),
-	define_mask(UPTODATE),
-	define_mask(NAMEI),
-	define_mask(INODE),
 	define_mask(VOTE),
-	define_mask(DCACHE),
 	define_mask(CONN),
 	define_mask(QUORUM),
-	define_mask(EXPORT),
-	define_mask(XATTR),
-	define_mask(QUOTA),
-	define_mask(REFCOUNT),
 	define_mask(BASTS),
-	define_mask(RESERVATIONS),
 	define_mask(CLUSTER),
 	define_mask(ERROR),
 	define_mask(NOTICE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 34d6544..baa2b9e 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -82,41 +82,23 @@
 
 /* bits that are frequently given and infrequently matched in the low word */
 /* NOTE: If you add a flag, you need to also update masklog.c! */
-#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
-#define ML_EXIT		0x0000000000000002ULL /* func call exit */
-#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
-#define ML_MSG		0x0000000000000008ULL /* net network messages */
-#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
-#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
-#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
-#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
-#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
-#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
-#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
-#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
-#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
-#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
-#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
-#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
-#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
-#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
-#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
-#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
-#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
-#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
-#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
-#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
-#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
-#define ML_CONN		0x0000000004000000ULL /* net connection management */
-#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
-#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
-#define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
-#define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
-#define ML_REFCOUNT	0x0000000080000000ULL /* refcount tree operations */
-#define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
-#define ML_RESERVATIONS	0x0000000200000000ULL /* ocfs2 alloc reservations */
-#define ML_CLUSTER	0x0000000400000000ULL /* cluster stack */
+#define ML_TCP		0x0000000000000001ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000002ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000004ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000008ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000010ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000020ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000040ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000080ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000100ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000200ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000000400ULL /* dlm master functions */
+#define ML_DLM_GLUE	0x0000000000000800ULL /* ocfs2 dlm glue layer */
+#define ML_VOTE		0x0000000000001000ULL /* ocfs2 node messaging  */
+#define ML_CONN		0x0000000000002000ULL /* net connection management */
+#define ML_QUORUM	0x0000000000004000ULL /* net connection quorum */
+#define ML_BASTS	0x0000000000008000ULL /* dlmglue asts and basts */
+#define ML_CLUSTER	0x0000000000010000ULL /* cluster stack */
 
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x1000000000000000ULL /* sent to KERN_ERR */
@@ -124,7 +106,6 @@
 #define ML_KTHREAD	0x4000000000000000ULL /* kernel thread activity */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
 #ifndef MLOG_MASK_PREFIX
 #define MLOG_MASK_PREFIX 0
 #endif
@@ -222,58 +203,6 @@
 		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
-#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
-#define mlog_entry(fmt, args...) do {					\
-	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
-} while (0)
-
-#define mlog_entry_void() do {						\
-	mlog(ML_ENTRY, "ENTRY:\n");					\
-} while (0)
-
-/*
- * We disable this for sparse.
- */
-#if !defined(__CHECKER__)
-#define mlog_exit(st) do {						     \
-	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
-		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
-	else if (__builtin_types_compatible_p(typeof(st), signed long))      \
-		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	     \
-	else if (__builtin_types_compatible_p(typeof(st), unsigned int)	     \
-		 || __builtin_types_compatible_p(typeof(st), unsigned short) \
-		 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
-		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	     \
-	else if (__builtin_types_compatible_p(typeof(st), signed int)	     \
-		 || __builtin_types_compatible_p(typeof(st), signed short)   \
-		 || __builtin_types_compatible_p(typeof(st), signed char))   \
-		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		     \
-	else if (__builtin_types_compatible_p(typeof(st), long long))	     \
-		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	     \
-	else								     \
-		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st));    \
-} while (0)
-#else
-#define mlog_exit(st) do {						     \
-	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		     \
-} while (0)
-#endif
-
-#define mlog_exit_ptr(ptr) do {						\
-	mlog(ML_EXIT, "EXIT: %p\n", ptr);				\
-} while (0)
-
-#define mlog_exit_void() do {						\
-	mlog(ML_EXIT, "EXIT\n");					\
-} while (0)
-#else
-#define mlog_entry(...)  do { } while (0)
-#define mlog_entry_void(...)  do { } while (0)
-#define mlog_exit(...)  do { } while (0)
-#define mlog_exit_ptr(...)  do { } while (0)
-#define mlog_exit_void(...)  do { } while (0)
-#endif  /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
-
 #define mlog_bug_on_msg(cond, fmt, args...) do {			\
 	if (cond) {							\
 		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 3b11cb1..ee04ff5 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -210,10 +210,6 @@
 	sc->sc_tv_func_stop = ktime_get();
 }
 
-static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
-{
-	return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
-}
 #else  /* CONFIG_DEBUG_FS */
 # define o2net_init_nst(a, b, c, d, e)
 # define o2net_set_nst_sock_time(a)
@@ -227,10 +223,14 @@
 # define o2net_set_advance_stop_time(a)
 # define o2net_set_func_start_time(a)
 # define o2net_set_func_stop_time(a)
-# define o2net_get_func_run_time(a)		(ktime_t)0
 #endif /* CONFIG_DEBUG_FS */
 
 #ifdef CONFIG_OCFS2_FS_STATS
+static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
+{
+	return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
+}
+
 static void o2net_update_send_stats(struct o2net_send_tracking *nst,
 				    struct o2net_sock_container *sc)
 {
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 7eb9040..e5ba348 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 
-#define MLOG_MASK_PREFIX ML_DCACHE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "file.h"
 #include "inode.h"
 #include "super.h"
+#include "ocfs2_trace.h"
 
 void ocfs2_dentry_attach_gen(struct dentry *dentry)
 {
@@ -62,8 +62,8 @@
 	inode = dentry->d_inode;
 	osb = OCFS2_SB(dentry->d_sb);
 
-	mlog_entry("(0x%p, '%.*s')\n", dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
+				      dentry->d_name.name);
 
 	/* For a negative dentry -
 	 * check the generation number of the parent and compare with the
@@ -73,9 +73,10 @@
 		unsigned long gen = (unsigned long) dentry->d_fsdata;
 		unsigned long pgen =
 			OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
-		mlog(0, "negative dentry: %.*s parent gen: %lu "
-			"dentry gen: %lu\n",
-			dentry->d_name.len, dentry->d_name.name, pgen, gen);
+
+		trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
+						       dentry->d_name.name,
+						       pgen, gen);
 		if (gen != pgen)
 			goto bail;
 		goto valid;
@@ -90,8 +91,8 @@
 	/* did we or someone else delete this inode? */
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		mlog(0, "inode (%llu) deleted, returning false\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		trace_ocfs2_dentry_revalidate_delete(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 		goto bail;
 	}
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -101,10 +102,9 @@
 	 * inode nlink hits zero, it never goes back.
 	 */
 	if (inode->i_nlink == 0) {
-		mlog(0, "Inode %llu orphaned, returning false "
-		     "dir = %d\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     S_ISDIR(inode->i_mode));
+		trace_ocfs2_dentry_revalidate_orphaned(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			S_ISDIR(inode->i_mode));
 		goto bail;
 	}
 
@@ -113,9 +113,8 @@
 	 * redo it.
 	 */
 	if (!dentry->d_fsdata) {
-		mlog(0, "Inode %llu doesn't have dentry lock, "
-		     "returning false\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		trace_ocfs2_dentry_revalidate_nofsdata(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 		goto bail;
 	}
 
@@ -123,8 +122,7 @@
 	ret = 1;
 
 bail:
-	mlog_exit(ret);
-
+	trace_ocfs2_dentry_revalidate_ret(ret);
 	return ret;
 }
 
@@ -181,8 +179,8 @@
 
 		spin_lock(&dentry->d_lock);
 		if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
-			mlog(0, "dentry found: %.*s\n",
-			     dentry->d_name.len, dentry->d_name.name);
+			trace_ocfs2_find_local_alias(dentry->d_name.len,
+						     dentry->d_name.name);
 
 			dget_dlock(dentry);
 			spin_unlock(&dentry->d_lock);
@@ -240,9 +238,8 @@
 	struct dentry *alias;
 	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
 
-	mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n",
-	     dentry->d_name.len, dentry->d_name.name,
-	     (unsigned long long)parent_blkno, dl);
+	trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
+				       (unsigned long long)parent_blkno, dl);
 
 	/*
 	 * Negative dentry. We ignore these for now.
@@ -292,7 +289,9 @@
 				(unsigned long long)parent_blkno,
 				(unsigned long long)dl->dl_parent_blkno);
 
-		mlog(0, "Found: %s\n", dl->dl_lockres.l_name);
+		trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
+				(unsigned long long)parent_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 		goto out_attach;
 	}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f97b6f1..9fe5b8f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -43,7 +43,6 @@
 #include <linux/quotaops.h>
 #include <linux/sort.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -61,6 +60,7 @@
 #include "super.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -322,21 +322,23 @@
 	const char *error_msg = NULL;
 	const int rlen = le16_to_cpu(de->rec_len);
 
-	if (rlen < OCFS2_DIR_REC_LEN(1))
+	if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
 		error_msg = "rec_len is smaller than minimal";
-	else if (rlen % 4 != 0)
+	else if (unlikely(rlen % 4 != 0))
 		error_msg = "rec_len % 4 != 0";
-	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+	else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
 		error_msg = "rec_len is too small for name_len";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+	else if (unlikely(
+		 ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
 		error_msg = "directory entry across blocks";
 
-	if (error_msg != NULL)
+	if (unlikely(error_msg != NULL))
 		mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
 		     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
 		     (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
 		     offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
 		     de->name_len);
+
 	return error_msg == NULL ? 1 : 0;
 }
 
@@ -367,8 +369,6 @@
 	int de_len;
 	int ret = 0;
 
-	mlog_entry_void();
-
 	de_buf = first_de;
 	dlimit = de_buf + bytes;
 
@@ -402,7 +402,7 @@
 	}
 
 bail:
-	mlog_exit(ret);
+	trace_ocfs2_search_dirblock(ret);
 	return ret;
 }
 
@@ -447,8 +447,7 @@
 	 * We don't validate dirents here, that's handled
 	 * in-place when the code walks them.
 	 */
-	mlog(0, "Validating dirblock %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -706,8 +705,6 @@
 	int num = 0;
 	int nblocks, i, err;
 
-	mlog_entry_void();
-
 	sb = dir->i_sb;
 
 	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -788,7 +785,7 @@
 	for (; ra_ptr < ra_max; ra_ptr++)
 		brelse(bh_use[ra_ptr]);
 
-	mlog_exit_ptr(ret);
+	trace_ocfs2_find_entry_el(ret);
 	return ret;
 }
 
@@ -950,11 +947,9 @@
 		goto out;
 	}
 
-	mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
-	     "returns: %llu\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-	     namelen, name, hinfo->major_hash, hinfo->minor_hash,
-	     (unsigned long long)phys);
+	trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				  namelen, name, hinfo->major_hash,
+				  hinfo->minor_hash, (unsigned long long)phys);
 
 	ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
 	if (ret) {
@@ -964,9 +959,9 @@
 
 	dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
 
-	mlog(0, "leaf info: num_used: %d, count: %d\n",
-	     le16_to_cpu(dx_leaf->dl_list.de_num_used),
-	     le16_to_cpu(dx_leaf->dl_list.de_count));
+	trace_ocfs2_dx_dir_search_leaf_info(
+			le16_to_cpu(dx_leaf->dl_list.de_num_used),
+			le16_to_cpu(dx_leaf->dl_list.de_count));
 
 	entry_list = &dx_leaf->dl_list;
 
@@ -1166,8 +1161,6 @@
 	int i, status = -ENOENT;
 	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
-	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
-
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		access = ocfs2_journal_access_di;
 
@@ -1202,7 +1195,6 @@
 		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
 	}
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1348,8 +1340,8 @@
 		}
 	}
 
-	mlog(0, "Dir %llu: delete entry at index: %d\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+	trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				    index);
 
 	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
 				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
@@ -1632,8 +1624,6 @@
 	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
-	mlog_entry_void();
-
 	if (!namelen)
 		return -EINVAL;
 
@@ -1765,8 +1755,9 @@
 	 * from ever getting here. */
 	retval = -ENOSPC;
 bail:
+	if (retval)
+		mlog_errno(retval);
 
-	mlog_exit(retval);
 	return retval;
 }
 
@@ -2028,8 +2019,7 @@
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int lock_level = 0;
 
-	mlog_entry("dirino=%llu\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
 	if (lock_level && error >= 0) {
@@ -2051,9 +2041,10 @@
 				      dirent, filldir, NULL);
 
 	ocfs2_inode_unlock(inode, lock_level);
+	if (error)
+		mlog_errno(error);
 
 bail_nolock:
-	mlog_exit(error);
 
 	return error;
 }
@@ -2069,8 +2060,8 @@
 {
 	int status = -ENOENT;
 
-	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_find_files_on_disk(namelen, name, blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_find_entry(name, namelen, inode, lookup);
 	if (status)
@@ -2114,8 +2105,8 @@
 	int ret;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog_entry("dir %llu, name '%.*s'\n",
-		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+	trace_ocfs2_check_dir_for_entry(
+		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
 	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2125,7 +2116,8 @@
 bail:
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -2324,8 +2316,6 @@
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_dir_entry *de;
 
-	mlog_entry_void();
-
 	if (ocfs2_new_dir_wants_trailer(inode))
 		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
 
@@ -2380,7 +2370,6 @@
 bail:
 	brelse(new_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -2409,9 +2398,9 @@
 		goto out;
 	}
 
-	mlog(0, "Dir %llu, attach new index block: %llu\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-	     (unsigned long long)dr_blkno);
+	trace_ocfs2_dx_dir_attach_index(
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)dr_blkno);
 
 	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
 	if (dx_root_bh == NULL) {
@@ -2511,11 +2500,10 @@
 		dx_leaf->dl_list.de_count =
 			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
 
-		mlog(0,
-		     "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
-		     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-		     (unsigned long long)bh->b_blocknr,
-		     le16_to_cpu(dx_leaf->dl_list.de_count));
+		trace_ocfs2_dx_dir_format_cluster(
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)bh->b_blocknr,
+				le16_to_cpu(dx_leaf->dl_list.de_count));
 
 		ocfs2_journal_dirty(handle, bh);
 	}
@@ -2759,12 +2747,11 @@
 
 		ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
 
-		mlog(0,
-		     "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
-		     (unsigned long long)dir->i_ino, hinfo.major_hash,
-		     hinfo.minor_hash,
-		     le16_to_cpu(dx_root->dr_entries.de_num_used),
-		     de->name_len, de->name);
+		trace_ocfs2_dx_dir_index_root_block(
+				(unsigned long long)dir->i_ino,
+				hinfo.major_hash, hinfo.minor_hash,
+				de->name_len, de->name,
+				le16_to_cpu(dx_root->dr_entries.de_num_used));
 
 		ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
 					   dirent_blk);
@@ -3235,7 +3222,6 @@
 bail:
 	if (did_quota && status < 0)
 		dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
-	mlog_exit(status);
 	return status;
 }
 
@@ -3270,8 +3256,6 @@
 	struct ocfs2_extent_tree et;
 	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
 
-	mlog_entry_void();
-
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		/*
 		 * This would be a code error as an inline directory should
@@ -3320,8 +3304,8 @@
 	down_write(&OCFS2_I(dir)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 	dir_i_size = i_size_read(dir);
-	mlog(0, "extending dir %llu (i_size = %lld)\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+	trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
+			       dir_i_size);
 
 	/* dir->i_size is always block aligned. */
 	spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -3436,7 +3420,6 @@
 
 	brelse(new_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3583,8 +3566,9 @@
 	status = 0;
 bail:
 	brelse(bh);
+	if (status)
+		mlog_errno(status);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3815,9 +3799,9 @@
 	struct ocfs2_dx_root_block *dx_root;
 	struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
 
-	mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-	     (unsigned long long)leaf_blkno, insert_hash);
+	trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				     (unsigned long long)leaf_blkno,
+				     insert_hash);
 
 	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
 
@@ -3897,8 +3881,7 @@
 		goto  out_commit;
 	}
 
-	mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
-	     leaf_cpos, split_hash, insert_hash);
+	trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
 
 	/*
 	 * We have to carefully order operations here. There are items
@@ -4355,8 +4338,8 @@
 	unsigned int blocks_wanted = 1;
 	struct buffer_head *bh = NULL;
 
-	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
-	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+	trace_ocfs2_prepare_dir_for_insert(
+		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
 
 	if (!namelen) {
 		ret = -EINVAL;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 9f30491..29a886d 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -128,8 +128,8 @@
 
 	assert_spin_locked(&res->spinlock);
 
-	mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
-		   lock->ml.type, lock->ml.convert_type, type);
+	mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
+	     lock->ml.type, lock->ml.convert_type, type);
 
 	spin_lock(&lock->spinlock);
 
@@ -353,7 +353,7 @@
 	struct kvec vec[2];
 	size_t veclen = 1;
 
-	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
 
 	memset(&convert, 0, sizeof(struct dlm_convert_lock));
 	convert.node_idx = dlm->node_num;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7e38a07..7540a49 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -188,7 +188,7 @@
 	struct hlist_head *bucket;
 	struct hlist_node *list;
 
-	mlog_entry("%.*s\n", len, name);
+	mlog(0, "%.*s\n", len, name);
 
 	assert_spin_locked(&dlm->spinlock);
 
@@ -222,7 +222,7 @@
 {
 	struct dlm_lock_resource *res = NULL;
 
-	mlog_entry("%.*s\n", len, name);
+	mlog(0, "%.*s\n", len, name);
 
 	assert_spin_locked(&dlm->spinlock);
 
@@ -531,7 +531,7 @@
 	unsigned int node;
 	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
 
-	mlog_entry("%p %u %p", msg, len, data);
+	mlog(0, "%p %u %p", msg, len, data);
 
 	if (!dlm_grab(dlm))
 		return 0;
@@ -926,9 +926,10 @@
 }
 
 static int dlm_match_regions(struct dlm_ctxt *dlm,
-			     struct dlm_query_region *qr)
+			     struct dlm_query_region *qr,
+			     char *local, int locallen)
 {
-	char *local = NULL, *remote = qr->qr_regions;
+	char *remote = qr->qr_regions;
 	char *l, *r;
 	int localnr, i, j, foundit;
 	int status = 0;
@@ -957,13 +958,8 @@
 		r += O2HB_MAX_REGION_NAME_LEN;
 	}
 
-	local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
-	if (!local) {
-		status = -ENOMEM;
-		goto bail;
-	}
-
-	localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
+	localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
+	localnr = o2hb_get_all_regions(local, (u8)localnr);
 
 	/* compare local regions with remote */
 	l = local;
@@ -1012,8 +1008,6 @@
 	}
 
 bail:
-	kfree(local);
-
 	return status;
 }
 
@@ -1075,6 +1069,7 @@
 {
 	struct dlm_query_region *qr;
 	struct dlm_ctxt *dlm = NULL;
+	char *local = NULL;
 	int status = 0;
 	int locked = 0;
 
@@ -1083,6 +1078,13 @@
 	mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
 	     qr->qr_domain);
 
+	/* buffer used in dlm_mast_regions() */
+	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
+	if (!local) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
 	status = -EINVAL;
 
 	spin_lock(&dlm_domain_lock);
@@ -1112,13 +1114,15 @@
 		goto bail;
 	}
 
-	status = dlm_match_regions(dlm, qr);
+	status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
 
 bail:
 	if (locked)
 		spin_unlock(&dlm->spinlock);
 	spin_unlock(&dlm_domain_lock);
 
+	kfree(local);
+
 	return status;
 }
 
@@ -1553,7 +1557,7 @@
 	struct domain_join_ctxt *ctxt;
 	enum dlm_query_join_response_code response = JOIN_DISALLOW;
 
-	mlog_entry("%p", dlm);
+	mlog(0, "%p", dlm);
 
 	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
 	if (!ctxt) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 7009292..8d39e0f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -128,7 +128,7 @@
 	int call_ast = 0, kick_thread = 0;
 	enum dlm_status status = DLM_NORMAL;
 
-	mlog_entry("type=%d\n", lock->ml.type);
+	mlog(0, "type=%d\n", lock->ml.type);
 
 	spin_lock(&res->spinlock);
 	/* if called from dlm_create_lock_handler, need to
@@ -227,8 +227,8 @@
 	enum dlm_status status = DLM_DENIED;
 	int lockres_changed = 1;
 
-	mlog_entry("type=%d\n", lock->ml.type);
-	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+	mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
+	     lock->ml.type, res->lockname.len,
 	     res->lockname.name, flags);
 
 	spin_lock(&res->spinlock);
@@ -308,8 +308,6 @@
 	int tmpret, status = 0;
 	enum dlm_status ret;
 
-	mlog_entry_void();
-
 	memset(&create, 0, sizeof(create));
 	create.node_idx = dlm->node_num;
 	create.requested_type = lock->ml.type;
@@ -477,8 +475,6 @@
 
 	BUG_ON(!dlm);
 
-	mlog_entry_void();
-
 	if (!dlm_grab(dlm))
 		return DLM_REJECTED;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 59f0f6b..9d67610 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -426,8 +426,6 @@
 	struct dlm_master_list_entry *mle;
 	struct dlm_ctxt *dlm;
 
-	mlog_entry_void();
-
 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 	dlm = mle->dlm;
 
@@ -3120,8 +3118,6 @@
 
 	*oldmle = NULL;
 
-	mlog_entry_void();
-
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
 
@@ -3261,7 +3257,7 @@
 	struct hlist_node *list;
 	unsigned int i;
 
-	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+	mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
 top:
 	assert_spin_locked(&dlm->spinlock);
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index aaaffbc..f1beb6f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -727,7 +727,6 @@
 	if (destroy)
 		dlm_destroy_recovery_area(dlm, dead_node);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1496,9 +1495,9 @@
 			kfree(buf);
 		if (item)
 			kfree(item);
+		mlog_errno(ret);
 	}
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1567,7 +1566,6 @@
 		dlm_lockres_put(res);
 	}
 	kfree(data);
-	mlog_exit(ret);
 }
 
 
@@ -1986,7 +1984,6 @@
 			dlm_lock_put(newlock);
 	}
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2083,8 +2080,6 @@
 	struct hlist_head *bucket;
 	struct dlm_lock_resource *res, *next;
 
-	mlog_entry_void();
-
 	assert_spin_locked(&dlm->spinlock);
 
 	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
@@ -2607,8 +2602,6 @@
 	int nodenum;
 	int status;
 
-	mlog_entry("%u\n", dead_node);
-
 	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
 
 	spin_lock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 817287c..850aa7e 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -317,7 +317,7 @@
 	struct kvec vec[2];
 	size_t veclen = 1;
 
-	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
 
 	if (owner == dlm->node_num) {
 		/* ended up trying to contact ourself.  this means
@@ -588,8 +588,6 @@
 	struct dlm_lock *lock = NULL;
 	int call_ast, is_master;
 
-	mlog_entry_void();
-
 	if (!lksb) {
 		dlm_error(DLM_BADARGS);
 		return DLM_BADARGS;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e8d94d7..7642d7c 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -64,7 +64,7 @@
 	unsigned long		mw_mask;
 	unsigned long		mw_goal;
 #ifdef CONFIG_OCFS2_FS_STATS
-	unsigned long long 	mw_lock_start;
+	ktime_t			mw_lock_start;
 #endif
 };
 
@@ -397,8 +397,6 @@
 {
 	int len;
 
-	mlog_entry_void();
-
 	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
 
 	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -408,8 +406,6 @@
 	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
 
 	mlog(0, "built lock resource with name: %s\n", name);
-
-	mlog_exit_void();
 }
 
 static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -435,44 +431,41 @@
 #ifdef CONFIG_OCFS2_FS_STATS
 static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
 {
-	res->l_lock_num_prmode = 0;
-	res->l_lock_num_prmode_failed = 0;
-	res->l_lock_total_prmode = 0;
-	res->l_lock_max_prmode = 0;
-	res->l_lock_num_exmode = 0;
-	res->l_lock_num_exmode_failed = 0;
-	res->l_lock_total_exmode = 0;
-	res->l_lock_max_exmode = 0;
 	res->l_lock_refresh = 0;
+	memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
+	memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
 }
 
 static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
 				    struct ocfs2_mask_waiter *mw, int ret)
 {
-	unsigned long long *num, *sum;
-	unsigned int *max, *failed;
-	struct timespec ts = current_kernel_time();
-	unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
+	u32 usec;
+	ktime_t kt;
+	struct ocfs2_lock_stats *stats;
 
-	if (level == LKM_PRMODE) {
-		num = &res->l_lock_num_prmode;
-		sum = &res->l_lock_total_prmode;
-		max = &res->l_lock_max_prmode;
-		failed = &res->l_lock_num_prmode_failed;
-	} else if (level == LKM_EXMODE) {
-		num = &res->l_lock_num_exmode;
-		sum = &res->l_lock_total_exmode;
-		max = &res->l_lock_max_exmode;
-		failed = &res->l_lock_num_exmode_failed;
-	} else
+	if (level == LKM_PRMODE)
+		stats = &res->l_lock_prmode;
+	else if (level == LKM_EXMODE)
+		stats = &res->l_lock_exmode;
+	else
 		return;
 
-	(*num)++;
-	(*sum) += time;
-	if (time > *max)
-		*max = time;
+	kt = ktime_sub(ktime_get(), mw->mw_lock_start);
+	usec = ktime_to_us(kt);
+
+	stats->ls_gets++;
+	stats->ls_total += ktime_to_ns(kt);
+	/* overflow */
+	if (unlikely(stats->ls_gets) == 0) {
+		stats->ls_gets++;
+		stats->ls_total = ktime_to_ns(kt);
+	}
+
+	if (stats->ls_max < usec)
+		stats->ls_max = usec;
+
 	if (ret)
-		(*failed)++;
+		stats->ls_fail++;
 }
 
 static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -482,8 +475,7 @@
 
 static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
 {
-	struct timespec ts = current_kernel_time();
-	mw->mw_lock_start = timespec_to_ns(&ts);
+	mw->mw_lock_start = ktime_get();
 }
 #else
 static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
@@ -729,8 +721,6 @@
 
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
-	mlog_entry_void();
-
 	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
 		return;
 
@@ -756,14 +746,11 @@
 	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
 
 	res->l_flags = 0UL;
-	mlog_exit_void();
 }
 
 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
 				     int level)
 {
-	mlog_entry_void();
-
 	BUG_ON(!lockres);
 
 	switch(level) {
@@ -776,15 +763,11 @@
 	default:
 		BUG();
 	}
-
-	mlog_exit_void();
 }
 
 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
 				     int level)
 {
-	mlog_entry_void();
-
 	BUG_ON(!lockres);
 
 	switch(level) {
@@ -799,7 +782,6 @@
 	default:
 		BUG();
 	}
-	mlog_exit_void();
 }
 
 /* WARNING: This function lives in a world where the only three lock
@@ -846,8 +828,6 @@
 
 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
@@ -860,14 +840,10 @@
 		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
 	}
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
-	mlog_exit_void();
 }
 
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
 
@@ -889,14 +865,10 @@
 	lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
-	mlog_exit_void();
 }
 
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
 
@@ -908,15 +880,12 @@
 	lockres->l_level = lockres->l_requested;
 	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 				     int level)
 {
 	int needs_downconvert = 0;
-	mlog_entry_void();
 
 	assert_spin_locked(&lockres->l_lock);
 
@@ -938,8 +907,7 @@
 
 	if (needs_downconvert)
 		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-
-	mlog_exit(needs_downconvert);
+	mlog(0, "needs_downconvert = %d\n", needs_downconvert);
 	return needs_downconvert;
 }
 
@@ -1151,8 +1119,6 @@
 	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	unsigned long flags;
 
-	mlog_entry_void();
-
 	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
 	     lockres->l_name, lockres->l_unlock_action);
 
@@ -1162,7 +1128,6 @@
 		     "unlock_action %d\n", error, lockres->l_name,
 		     lockres->l_unlock_action);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
-		mlog_exit_void();
 		return;
 	}
 
@@ -1186,8 +1151,6 @@
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
 	wake_up(&lockres->l_event);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
-	mlog_exit_void();
 }
 
 /*
@@ -1233,7 +1196,6 @@
 {
 	unsigned long flags;
 
-	mlog_entry_void();
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
@@ -1244,7 +1206,6 @@
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
-	mlog_exit_void();
 }
 
 /* Note: If we detect another process working on the lock (i.e.,
@@ -1260,8 +1221,6 @@
 	unsigned long flags;
 	unsigned int gen;
 
-	mlog_entry_void();
-
 	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
 	     dlm_flags);
 
@@ -1293,7 +1252,6 @@
 	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
 
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1416,8 +1374,6 @@
 	unsigned int gen;
 	int noqueue_attempted = 0;
 
-	mlog_entry_void();
-
 	ocfs2_init_mask_waiter(&mw);
 
 	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -1583,7 +1539,6 @@
 				caller_ip);
 	}
 #endif
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1605,7 +1560,6 @@
 {
 	unsigned long flags;
 
-	mlog_entry_void();
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	ocfs2_dec_holders(lockres, level);
 	ocfs2_downconvert_on_unlock(osb, lockres);
@@ -1614,7 +1568,6 @@
 	if (lockres->l_lockdep_map.key != NULL)
 		rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
 #endif
-	mlog_exit_void();
 }
 
 static int ocfs2_create_new_lock(struct ocfs2_super *osb,
@@ -1648,8 +1601,6 @@
 	BUG_ON(!inode);
 	BUG_ON(!ocfs2_inode_is_new(inode));
 
-	mlog_entry_void();
-
 	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	/* NOTE: That we don't increment any of the holder counts, nor
@@ -1683,7 +1634,6 @@
 	}
 
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1695,16 +1645,12 @@
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu take %s RW lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
-	if (ocfs2_mount_local(osb)) {
-		mlog_exit(0);
+	if (ocfs2_mount_local(osb))
 		return 0;
-	}
 
 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
 
@@ -1715,7 +1661,6 @@
 	if (status < 0)
 		mlog_errno(status);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1725,16 +1670,12 @@
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu drop %s RW lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
 	if (!ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
-	mlog_exit_void();
 }
 
 /*
@@ -1748,8 +1689,6 @@
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu take PRMODE open lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -1764,7 +1703,6 @@
 		mlog_errno(status);
 
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1776,8 +1714,6 @@
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu try to take %s open lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
@@ -1799,7 +1735,6 @@
 				    level, DLM_LKF_NOQUEUE, 0);
 
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1811,8 +1746,6 @@
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu drop open lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -1827,7 +1760,7 @@
 				     DLM_LOCK_EX);
 
 out:
-	mlog_exit_void();
+	return;
 }
 
 static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
@@ -2043,8 +1976,6 @@
 {
 	int kick = 0;
 
-	mlog_entry_void();
-
 	/* If we know that another node is waiting on our lock, kick
 	 * the downconvert thread * pre-emptively when we reach a release
 	 * condition. */
@@ -2065,8 +1996,6 @@
 
 	if (kick)
 		ocfs2_wake_downconvert_thread(osb);
-
-	mlog_exit_void();
 }
 
 #define OCFS2_SEC_BITS   34
@@ -2095,8 +2024,6 @@
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
 
-	mlog_entry_void();
-
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/*
@@ -2128,8 +2055,6 @@
 
 out:
 	mlog_meta_lvb(0, lockres);
-
-	mlog_exit_void();
 }
 
 static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -2145,8 +2070,6 @@
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
 
-	mlog_entry_void();
-
 	mlog_meta_lvb(0, lockres);
 
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2177,8 +2100,6 @@
 	ocfs2_unpack_timespec(&inode->i_ctime,
 			      be64_to_cpu(lvb->lvb_ictime_packed));
 	spin_unlock(&oi->ip_lock);
-
-	mlog_exit_void();
 }
 
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2205,8 +2126,6 @@
 	unsigned long flags;
 	int status = 0;
 
-	mlog_entry_void();
-
 refresh_check:
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -2227,7 +2146,7 @@
 
 	status = 1;
 bail:
-	mlog_exit(status);
+	mlog(0, "status %d\n", status);
 	return status;
 }
 
@@ -2237,7 +2156,6 @@
 						   int status)
 {
 	unsigned long flags;
-	mlog_entry_void();
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -2246,8 +2164,6 @@
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
-
-	mlog_exit_void();
 }
 
 /* may or may not return a bh if it went to disk. */
@@ -2260,8 +2176,6 @@
 	struct ocfs2_dinode *fe;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	if (ocfs2_mount_local(osb))
 		goto bail;
 
@@ -2330,7 +2244,6 @@
 bail_refresh:
 	ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -2374,8 +2287,6 @@
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu, take %s META lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     ex ? "EXMODE" : "PRMODE");
@@ -2467,7 +2378,6 @@
 	if (local_bh)
 		brelse(local_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -2517,7 +2427,6 @@
 {
 	int ret;
 
-	mlog_entry_void();
 	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2545,7 +2454,6 @@
 	} else
 		*level = 0;
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2556,8 +2464,6 @@
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu drop %s META lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     ex ? "EXMODE" : "PRMODE");
@@ -2565,8 +2471,6 @@
 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
 	    !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
-	mlog_exit_void();
 }
 
 int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2617,8 +2521,6 @@
 	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
@@ -2650,7 +2552,6 @@
 		ocfs2_track_lock_refresh(lockres);
 	}
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -2869,8 +2770,15 @@
 	return iter;
 }
 
-/* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 2
+/*
+ * Version is used by debugfs.ocfs2 to determine the format being used
+ *
+ * New in version 2
+ *	- Lock stats printed
+ * New in version 3
+ *	- Max time in lock stats is in usecs (instead of nsecs)
+ */
+#define OCFS2_DLM_DEBUG_STR_VERSION 3
 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
 {
 	int i;
@@ -2912,18 +2820,18 @@
 		seq_printf(m, "0x%x\t", lvb[i]);
 
 #ifdef CONFIG_OCFS2_FS_STATS
-# define lock_num_prmode(_l)		(_l)->l_lock_num_prmode
-# define lock_num_exmode(_l)		(_l)->l_lock_num_exmode
-# define lock_num_prmode_failed(_l)	(_l)->l_lock_num_prmode_failed
-# define lock_num_exmode_failed(_l)	(_l)->l_lock_num_exmode_failed
-# define lock_total_prmode(_l)		(_l)->l_lock_total_prmode
-# define lock_total_exmode(_l)		(_l)->l_lock_total_exmode
-# define lock_max_prmode(_l)		(_l)->l_lock_max_prmode
-# define lock_max_exmode(_l)		(_l)->l_lock_max_exmode
-# define lock_refresh(_l)		(_l)->l_lock_refresh
+# define lock_num_prmode(_l)		((_l)->l_lock_prmode.ls_gets)
+# define lock_num_exmode(_l)		((_l)->l_lock_exmode.ls_gets)
+# define lock_num_prmode_failed(_l)	((_l)->l_lock_prmode.ls_fail)
+# define lock_num_exmode_failed(_l)	((_l)->l_lock_exmode.ls_fail)
+# define lock_total_prmode(_l)		((_l)->l_lock_prmode.ls_total)
+# define lock_total_exmode(_l)		((_l)->l_lock_exmode.ls_total)
+# define lock_max_prmode(_l)		((_l)->l_lock_prmode.ls_max)
+# define lock_max_exmode(_l)		((_l)->l_lock_exmode.ls_max)
+# define lock_refresh(_l)		((_l)->l_lock_refresh)
 #else
-# define lock_num_prmode(_l)		(0ULL)
-# define lock_num_exmode(_l)		(0ULL)
+# define lock_num_prmode(_l)		(0)
+# define lock_num_exmode(_l)		(0)
 # define lock_num_prmode_failed(_l)	(0)
 # define lock_num_exmode_failed(_l)	(0)
 # define lock_total_prmode(_l)		(0ULL)
@@ -2933,8 +2841,8 @@
 # define lock_refresh(_l)		(0)
 #endif
 	/* The following seq_print was added in version 2 of this output */
-	seq_printf(m, "%llu\t"
-		   "%llu\t"
+	seq_printf(m, "%u\t"
+		   "%u\t"
 		   "%u\t"
 		   "%u\t"
 		   "%llu\t"
@@ -3054,8 +2962,6 @@
 	int status = 0;
 	struct ocfs2_cluster_connection *conn = NULL;
 
-	mlog_entry_void();
-
 	if (ocfs2_mount_local(osb)) {
 		osb->node_num = 0;
 		goto local;
@@ -3112,15 +3018,12 @@
 			kthread_stop(osb->dc_task);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
 void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 			int hangup_pending)
 {
-	mlog_entry_void();
-
 	ocfs2_drop_osb_locks(osb);
 
 	/*
@@ -3143,8 +3046,6 @@
 	osb->cconn = NULL;
 
 	ocfs2_dlm_shutdown_debug(osb);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3226,7 +3127,6 @@
 
 	ocfs2_wait_on_busy_lock(lockres);
 out:
-	mlog_exit(0);
 	return 0;
 }
 
@@ -3284,8 +3184,6 @@
 {
 	int status, err;
 
-	mlog_entry_void();
-
 	/* No need to call ocfs2_mark_lockres_freeing here -
 	 * ocfs2_clear_inode has done it for us. */
 
@@ -3310,7 +3208,6 @@
 	if (err < 0 && !status)
 		status = err;
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3352,8 +3249,6 @@
 	int ret;
 	u32 dlm_flags = DLM_LKF_CONVERT;
 
-	mlog_entry_void();
-
 	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
 	     lockres->l_level, new_level);
 
@@ -3375,7 +3270,6 @@
 
 	ret = 0;
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -3385,8 +3279,6 @@
 {
 	assert_spin_locked(&lockres->l_lock);
 
-	mlog_entry_void();
-
 	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
 		/* If we're already trying to cancel a lock conversion
 		 * then just drop the spinlock and allow the caller to
@@ -3416,8 +3308,6 @@
 {
 	int ret;
 
-	mlog_entry_void();
-
 	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
 			       DLM_LKF_CANCEL);
 	if (ret) {
@@ -3427,7 +3317,6 @@
 
 	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -3443,8 +3332,6 @@
 	int set_lvb = 0;
 	unsigned int gen;
 
-	mlog_entry_void();
-
 	spin_lock_irqsave(&lockres->l_lock, flags);
 
 recheck:
@@ -3619,14 +3506,14 @@
 				     gen);
 
 leave:
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 
 leave_requeue:
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 	ctl->requeue = 1;
 
-	mlog_exit(0);
 	return 0;
 }
 
@@ -3859,8 +3746,6 @@
 	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
 					    oinfo->dqi_gi.dqi_type);
 
-	mlog_entry_void();
-
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
 	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
@@ -3869,8 +3754,6 @@
 	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
 	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
 	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
-
-	mlog_exit_void();
 }
 
 void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
@@ -3879,10 +3762,8 @@
 	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
 	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 
-	mlog_entry_void();
 	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(osb, lockres, level);
-	mlog_exit_void();
 }
 
 static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
@@ -3937,8 +3818,6 @@
 	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	int status = 0;
 
-	mlog_entry_void();
-
 	/* On RO devices, locking really isn't needed... */
 	if (ocfs2_is_hard_readonly(osb)) {
 		if (ex)
@@ -3961,7 +3840,6 @@
 		ocfs2_qinfo_unlock(oinfo, ex);
 	ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -4007,8 +3885,6 @@
 	 * considered valid until we remove the OCFS2_LOCK_QUEUED
 	 * flag. */
 
-	mlog_entry_void();
-
 	BUG_ON(!lockres);
 	BUG_ON(!lockres->l_ops);
 
@@ -4042,15 +3918,11 @@
 	if (ctl.unblock_action != UNBLOCK_CONTINUE
 	    && lockres->l_ops->post_unlock)
 		lockres->l_ops->post_unlock(osb, lockres);
-
-	mlog_exit_void();
 }
 
 static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	assert_spin_locked(&lockres->l_lock);
 
 	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
@@ -4071,8 +3943,6 @@
 		osb->blocked_lock_count++;
 	}
 	spin_unlock(&osb->dc_task_lock);
-
-	mlog_exit_void();
 }
 
 static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
@@ -4080,8 +3950,6 @@
 	unsigned long processed;
 	struct ocfs2_lock_res *lockres;
 
-	mlog_entry_void();
-
 	spin_lock(&osb->dc_task_lock);
 	/* grab this early so we know to try again if a state change and
 	 * wake happens part-way through our work  */
@@ -4105,8 +3973,6 @@
 		spin_lock(&osb->dc_task_lock);
 	}
 	spin_unlock(&osb->dc_task_lock);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 254652a..745db42 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 
-#define MLOG_MASK_PREFIX ML_EXPORT
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -40,6 +39,7 @@
 
 #include "buffer_head_io.h"
 #include "suballoc.h"
+#include "ocfs2_trace.h"
 
 struct ocfs2_inode_handle
 {
@@ -56,10 +56,9 @@
 	int status, set;
 	struct dentry *result;
 
-	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+	trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
 
 	if (blkno == 0) {
-		mlog(0, "nfs wants inode with blkno: 0\n");
 		result = ERR_PTR(-ESTALE);
 		goto bail;
 	}
@@ -83,6 +82,7 @@
 	}
 
 	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	trace_ocfs2_get_dentry_test_bit(status, set);
 	if (status < 0) {
 		if (status == -EINVAL) {
 			/*
@@ -90,18 +90,14 @@
 			 * as an inode, we return -ESTALE to be
 			 * nice
 			 */
-			mlog(0, "test inode bit failed %d\n", status);
 			status = -ESTALE;
-		} else {
+		} else
 			mlog(ML_ERROR, "test inode bit failed %d\n", status);
-		}
 		goto unlock_nfs_sync;
 	}
 
 	/* If the inode allocator bit is clear, this inode must be stale */
 	if (!set) {
-		mlog(0, "inode %llu suballoc bit is clear\n",
-		     (unsigned long long)blkno);
 		status = -ESTALE;
 		goto unlock_nfs_sync;
 	}
@@ -114,8 +110,8 @@
 check_err:
 	if (status < 0) {
 		if (status == -ESTALE) {
-			mlog(0, "stale inode ino: %llu generation: %u\n",
-			     (unsigned long long)blkno, handle->ih_generation);
+			trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
+						     handle->ih_generation);
 		}
 		result = ERR_PTR(status);
 		goto bail;
@@ -130,8 +126,9 @@
 check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		mlog(0, "stale inode ino: %llu generation: %u\n",
-		     (unsigned long long)blkno, handle->ih_generation);
+		trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
+						  handle->ih_generation,
+						  inode->i_generation);
 		result = ERR_PTR(-ESTALE);
 		goto bail;
 	}
@@ -141,7 +138,7 @@
 		mlog_errno(PTR_ERR(result));
 
 bail:
-	mlog_exit_ptr(result);
+	trace_ocfs2_get_dentry_end(result);
 	return result;
 }
 
@@ -152,11 +149,8 @@
 	struct dentry *parent;
 	struct inode *dir = child->d_inode;
 
-	mlog_entry("(0x%p, '%.*s')\n", child,
-		   child->d_name.len, child->d_name.name);
-
-	mlog(0, "find parent of directory %llu\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno);
+	trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
+			       (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
 	status = ocfs2_inode_lock(dir, NULL, 0);
 	if (status < 0) {
@@ -178,7 +172,7 @@
 	ocfs2_inode_unlock(dir, 0);
 
 bail:
-	mlog_exit_ptr(parent);
+	trace_ocfs2_get_parent_end(parent);
 
 	return parent;
 }
@@ -193,9 +187,9 @@
 	u32 generation;
 	__le32 *fh = (__force __le32 *) fh_in;
 
-	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
-		   dentry->d_name.len, dentry->d_name.name,
-		   fh, len, connectable);
+	trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
+				    dentry->d_name.name,
+				    fh, len, connectable);
 
 	if (connectable && (len < 6)) {
 		*max_len = 6;
@@ -210,8 +204,7 @@
 	blkno = OCFS2_I(inode)->ip_blkno;
 	generation = inode->i_generation;
 
-	mlog(0, "Encoding fh: blkno: %llu, generation: %u\n",
-	     (unsigned long long)blkno, generation);
+	trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
 
 	len = 3;
 	fh[0] = cpu_to_le32((u32)(blkno >> 32));
@@ -236,14 +229,14 @@
 		len = 6;
 		type = 2;
 
-		mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
-		     (unsigned long long)blkno, generation);
+		trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
+					     generation);
 	}
 
 	*max_len = len;
 
 bail:
-	mlog_exit(type);
+	trace_ocfs2_encode_fh_type(type);
 	return type;
 }
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 09e3fdf..23457b4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
-#define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "inode.h"
 #include "super.h"
 #include "symlink.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -841,10 +841,9 @@
 	u64 p_block, p_count;
 	int i, count, done = 0;
 
-	mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
-		   "flags = %x, validate = %p)\n",
-		   inode, (unsigned long long)v_block, nr, bhs, flags,
-		   validate);
+	trace_ocfs2_read_virt_blocks(
+	     inode, (unsigned long long)v_block, nr, bhs, flags,
+	     validate);
 
 	if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
 	    i_size_read(inode)) {
@@ -897,7 +896,6 @@
 	}
 
 out:
-	mlog_exit(rc);
 	return rc;
 }
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a665195..41565ae 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -38,7 +38,6 @@
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -61,6 +60,7 @@
 #include "acl.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -99,8 +99,10 @@
 	int mode = file->f_flags;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-		   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+			      (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			      file->f_path.dentry->d_name.len,
+			      file->f_path.dentry->d_name.name, mode);
 
 	if (file->f_mode & FMODE_WRITE)
 		dquot_initialize(inode);
@@ -135,7 +137,6 @@
 	}
 
 leave:
-	mlog_exit(status);
 	return status;
 }
 
@@ -143,19 +144,19 @@
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-		       file->f_path.dentry->d_name.len,
-		       file->f_path.dentry->d_name.name);
-
 	spin_lock(&oi->ip_lock);
 	if (!--oi->ip_open_count)
 		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+				 oi->ip_blkno,
+				 file->f_path.dentry->d_name.len,
+				 file->f_path.dentry->d_name.name,
+				 oi->ip_open_count);
 	spin_unlock(&oi->ip_lock);
 
 	ocfs2_free_file_private(inode, file);
 
-	mlog_exit(0);
-
 	return 0;
 }
 
@@ -177,9 +178,11 @@
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
-		   file->f_path.dentry, file->f_path.dentry->d_name.len,
-		   file->f_path.dentry->d_name.name);
+	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+			      OCFS2_I(inode)->ip_blkno,
+			      file->f_path.dentry->d_name.len,
+			      file->f_path.dentry->d_name.name,
+			      (unsigned long long)datasync);
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 		/*
@@ -195,7 +198,8 @@
 	err = jbd2_journal_force_commit(journal);
 
 bail:
-	mlog_exit(err);
+	if (err)
+		mlog_errno(err);
 
 	return (err < 0) ? -EIO : 0;
 }
@@ -251,8 +255,6 @@
 	handle_t *handle;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 
-	mlog_entry_void();
-
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -280,7 +282,6 @@
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -291,7 +292,6 @@
 {
 	int status;
 
-	mlog_entry_void();
 	i_size_write(inode, new_i_size);
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -303,7 +303,6 @@
 	}
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -375,8 +374,6 @@
 	struct ocfs2_dinode *di;
 	u64 cluster_bytes;
 
-	mlog_entry_void();
-
 	/*
 	 * We need to CoW the cluster contains the offset if it is reflinked
 	 * since we will call ocfs2_zero_range_for_truncate later which will
@@ -429,8 +426,6 @@
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
-
-	mlog_exit(status);
 	return status;
 }
 
@@ -442,14 +437,14 @@
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(inode = %llu, new_i_size = %llu\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		   (unsigned long long)new_i_size);
-
 	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
 	 * already validated it */
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 
+	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				  (unsigned long long)le64_to_cpu(fe->i_size),
+				  (unsigned long long)new_i_size);
+
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %llu, inode i_size = %lld != di "
 			"i_size = %llu, i_flags = 0x%x\n",
@@ -459,19 +454,14 @@
 			le32_to_cpu(fe->i_flags));
 
 	if (new_i_size > le64_to_cpu(fe->i_size)) {
-		mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
-		     (unsigned long long)le64_to_cpu(fe->i_size),
-		     (unsigned long long)new_i_size);
+		trace_ocfs2_truncate_file_error(
+			(unsigned long long)le64_to_cpu(fe->i_size),
+			(unsigned long long)new_i_size);
 		status = -EINVAL;
 		mlog_errno(status);
 		goto bail;
 	}
 
-	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
-	     (unsigned long long)le64_to_cpu(fe->i_blkno),
-	     (unsigned long long)le64_to_cpu(fe->i_size),
-	     (unsigned long long)new_i_size);
-
 	/* lets handle the simple truncate cases before doing any more
 	 * cluster locking. */
 	if (new_i_size == le64_to_cpu(fe->i_size))
@@ -525,7 +515,6 @@
 	if (!status && OCFS2_I(inode)->ip_clusters == 0)
 		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -578,8 +567,6 @@
 	struct ocfs2_extent_tree et;
 	int did_quota = 0;
 
-	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
-
 	/*
 	 * This function only exists for file systems which don't
 	 * support holes.
@@ -596,11 +583,6 @@
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
-	     clusters_to_add);
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 				       &data_ac, &meta_ac);
@@ -620,6 +602,12 @@
 	}
 
 restarted_transaction:
+	trace_ocfs2_extend_allocation(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)i_size_read(inode),
+		le32_to_cpu(fe->i_clusters), clusters_to_add,
+		why, restart_func);
+
 	status = dquot_alloc_space_nodirty(inode,
 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	if (status)
@@ -666,13 +654,11 @@
 
 	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
-			mlog(0, "restarting function.\n");
 			restart_func = 1;
 			status = 0;
 		} else {
 			BUG_ON(why != RESTART_TRANS);
 
-			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
 							    &fe->id2.i_list,
@@ -689,11 +675,11 @@
 		}
 	}
 
-	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
+	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
 	     le32_to_cpu(fe->i_clusters),
-	     (unsigned long long)le64_to_cpu(fe->i_size));
-	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
+	     (unsigned long long)le64_to_cpu(fe->i_size),
+	     OCFS2_I(inode)->ip_clusters,
+	     (unsigned long long)i_size_read(inode));
 
 leave:
 	if (status < 0 && did_quota)
@@ -718,7 +704,6 @@
 	brelse(bh);
 	bh = NULL;
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -785,10 +770,11 @@
 	if (!zero_to)
 		zero_to = PAGE_CACHE_SIZE;
 
-	mlog(0,
-	     "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
-	     (unsigned long long)abs_from, (unsigned long long)abs_to,
-	     index, zero_from, zero_to);
+	trace_ocfs2_write_zero_page(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)abs_from,
+			(unsigned long long)abs_to,
+			index, zero_from, zero_to);
 
 	/* We know that zero_from is block aligned */
 	for (block_start = zero_from; block_start < zero_to;
@@ -928,9 +914,10 @@
 	u64 next_pos;
 	u64 zero_pos = range_start;
 
-	mlog(0, "range_start = %llu, range_end = %llu\n",
-	     (unsigned long long)range_start,
-	     (unsigned long long)range_end);
+	trace_ocfs2_zero_extend_range(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)range_start,
+			(unsigned long long)range_end);
 	BUG_ON(range_start >= range_end);
 
 	while (zero_pos < range_end) {
@@ -962,9 +949,9 @@
 	struct super_block *sb = inode->i_sb;
 
 	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
-	mlog(0, "zero_start %llu for i_size %llu\n",
-	     (unsigned long long)zero_start,
-	     (unsigned long long)i_size_read(inode));
+	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)zero_start,
+				(unsigned long long)i_size_read(inode));
 	while (zero_start < zero_to_size) {
 		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
 						  zero_to_size,
@@ -1113,30 +1100,20 @@
 	struct dquot *transfer_to[MAXQUOTAS] = { };
 	int qtype;
 
-	mlog_entry("(0x%p, '%.*s')\n", dentry,
-	           dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_setattr(inode, dentry,
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    dentry->d_name.len, dentry->d_name.name,
+			    attr->ia_valid, attr->ia_mode,
+			    attr->ia_uid, attr->ia_gid);
 
 	/* ensuring we don't even attempt to truncate a symlink */
 	if (S_ISLNK(inode->i_mode))
 		attr->ia_valid &= ~ATTR_SIZE;
 
-	if (attr->ia_valid & ATTR_MODE)
-		mlog(0, "mode change: %d\n", attr->ia_mode);
-	if (attr->ia_valid & ATTR_UID)
-		mlog(0, "uid change: %d\n", attr->ia_uid);
-	if (attr->ia_valid & ATTR_GID)
-		mlog(0, "gid change: %d\n", attr->ia_gid);
-	if (attr->ia_valid & ATTR_SIZE)
-		mlog(0, "size change...\n");
-	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
-		mlog(0, "time change...\n");
-
 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
 			   | ATTR_GID | ATTR_UID | ATTR_MODE)
-	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
-		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
 		return 0;
-	}
 
 	status = inode_change_ok(inode, attr);
 	if (status)
@@ -1274,7 +1251,6 @@
 			mlog_errno(status);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1287,8 +1263,6 @@
 	struct ocfs2_super *osb = sb->s_fs_info;
 	int err;
 
-	mlog_entry_void();
-
 	err = ocfs2_inode_revalidate(dentry);
 	if (err) {
 		if (err != -ENOENT)
@@ -1302,8 +1276,6 @@
 	stat->blksize = osb->s_clustersize;
 
 bail:
-	mlog_exit(err);
-
 	return err;
 }
 
@@ -1314,8 +1286,6 @@
 	if (flags & IPERM_FLAG_RCU)
 		return -ECHILD;
 
-	mlog_entry_void();
-
 	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret) {
 		if (ret != -ENOENT)
@@ -1327,7 +1297,6 @@
 
 	ocfs2_inode_unlock(inode, 0);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1339,8 +1308,9 @@
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di;
 
-	mlog_entry("(Inode %llu, mode 0%o)\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+	trace_ocfs2_write_remove_suid(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -1368,7 +1338,6 @@
 out_trans:
 	ocfs2_commit_trans(osb, handle);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1547,8 +1516,9 @@
 	 * partial clusters here. There's no need to worry about
 	 * physical allocation - the zeroing code knows to skip holes.
 	 */
-	mlog(0, "byte start: %llu, end: %llu\n",
-	     (unsigned long long)start, (unsigned long long)end);
+	trace_ocfs2_zero_partial_clusters(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)start, (unsigned long long)end);
 
 	/*
 	 * If both edges are on a cluster boundary then there's no
@@ -1572,8 +1542,8 @@
 	if (tmpend > end)
 		tmpend = end;
 
-	mlog(0, "1st range: start: %llu, tmpend: %llu\n",
-	     (unsigned long long)start, (unsigned long long)tmpend);
+	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+						 (unsigned long long)tmpend);
 
 	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
 	if (ret)
@@ -1587,8 +1557,8 @@
 		 */
 		start = end & ~(osb->s_clustersize - 1);
 
-		mlog(0, "2nd range: start: %llu, end: %llu\n",
-		     (unsigned long long)start, (unsigned long long)end);
+		trace_ocfs2_zero_partial_clusters_range2(
+			(unsigned long long)start, (unsigned long long)end);
 
 		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
 		if (ret)
@@ -1688,6 +1658,11 @@
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
+	trace_ocfs2_remove_inode_range(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)byte_start,
+			(unsigned long long)byte_len);
+
 	if (byte_len == 0)
 		return 0;
 
@@ -1734,11 +1709,6 @@
 	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
 	cluster_in_el = trunc_end;
 
-	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     (unsigned long long)byte_start,
-	     (unsigned long long)byte_len, trunc_start, trunc_end);
-
 	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
 	if (ret) {
 		mlog_errno(ret);
@@ -2093,7 +2063,7 @@
 	int ret = 0, meta_level = 0;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	loff_t saved_pos, end;
+	loff_t saved_pos = 0, end;
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
@@ -2132,12 +2102,10 @@
 
 		/* work on a copy of ppos until we're sure that we won't have
 		 * to recalculate it due to relocking. */
-		if (appending) {
+		if (appending)
 			saved_pos = i_size_read(inode);
-			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
-		} else {
+		else
 			saved_pos = *ppos;
-		}
 
 		end = saved_pos + count;
 
@@ -2208,6 +2176,10 @@
 		*ppos = saved_pos;
 
 out_unlock:
+	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+					    saved_pos, appending, count,
+					    direct_io, has_refcount);
+
 	if (meta_level >= 0)
 		ocfs2_inode_unlock(inode, meta_level);
 
@@ -2233,10 +2205,11 @@
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
 
-	mlog_entry("(0x%p, %u, '%.*s')\n", file,
-		   (unsigned int)nr_segs,
-		   file->f_path.dentry->d_name.len,
-		   file->f_path.dentry->d_name.name);
+	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		file->f_path.dentry->d_name.len,
+		file->f_path.dentry->d_name.name,
+		(unsigned int)nr_segs);
 
 	if (iocb->ki_left == 0)
 		return 0;
@@ -2402,7 +2375,6 @@
 
 	if (written)
 		ret = written;
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2438,10 +2410,11 @@
 		.u.file = out,
 	};
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
-		   (unsigned int)len,
-		   out->f_path.dentry->d_name.len,
-		   out->f_path.dentry->d_name.name);
+
+	trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			out->f_path.dentry->d_name.len,
+			out->f_path.dentry->d_name.name, len);
 
 	if (pipe->inode)
 		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2485,7 +2458,6 @@
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2498,10 +2470,10 @@
 	int ret = 0, lock_level = 0;
 	struct inode *inode = in->f_path.dentry->d_inode;
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
-		   (unsigned int)len,
-		   in->f_path.dentry->d_name.len,
-		   in->f_path.dentry->d_name.name);
+	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			in->f_path.dentry->d_name.len,
+			in->f_path.dentry->d_name.name, len);
 
 	/*
 	 * See the comment in ocfs2_file_aio_read()
@@ -2516,7 +2488,6 @@
 	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
 
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2529,10 +2500,11 @@
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
-	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
-		   (unsigned int)nr_segs,
-		   filp->f_path.dentry->d_name.len,
-		   filp->f_path.dentry->d_name.name);
+	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			filp->f_path.dentry->d_name.len,
+			filp->f_path.dentry->d_name.name, nr_segs);
+
 
 	if (!inode) {
 		ret = -EINVAL;
@@ -2578,8 +2550,7 @@
 	ocfs2_inode_unlock(inode, lock_level);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
-	if (ret == -EINVAL)
-		mlog(0, "generic_file_aio_read returned -EINVAL\n");
+	trace_generic_file_aio_read_ret(ret);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -2597,7 +2568,6 @@
 	}
 	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);
-	mlog_exit(ret);
 
 	return ret;
 }
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 1aa863d..d8208b2 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -37,6 +36,7 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -66,7 +66,7 @@
 
 	BUG_ON(osb->node_num == node_num);
 
-	mlog(0, "ocfs2: node down event for %d\n", node_num);
+	trace_ocfs2_do_node_down(node_num);
 
 	if (!osb->cconn) {
 		/*
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4068c6c..177d3a6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -31,7 +31,6 @@
 
 #include <asm/byteorder.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -53,6 +52,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -131,7 +131,8 @@
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
 
-	mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno);
+	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
+			       sysfile_type);
 
 	/* Ok. By now we've either got the offsets passed to us by the
 	 * caller, or we just pulled them off the bh. Lets do some
@@ -152,16 +153,16 @@
 	/* inode was *not* in the inode cache. 2.6.x requires
 	 * us to do our own read_inode call and unlock it
 	 * afterwards. */
-	if (inode && inode->i_state & I_NEW) {
-		mlog(0, "Inode was not in inode cache, reading it.\n");
-		ocfs2_read_locked_inode(inode, &args);
-		unlock_new_inode(inode);
-	}
 	if (inode == NULL) {
 		inode = ERR_PTR(-ENOMEM);
 		mlog_errno(PTR_ERR(inode));
 		goto bail;
 	}
+	trace_ocfs2_iget5_locked(inode->i_state);
+	if (inode->i_state & I_NEW) {
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
 	if (is_bad_inode(inode)) {
 		iput(inode);
 		inode = ERR_PTR(-ESTALE);
@@ -170,9 +171,8 @@
 
 bail:
 	if (!IS_ERR(inode)) {
-		mlog(0, "returning inode with number %llu\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		mlog_exit_ptr(inode);
+		trace_ocfs2_iget_end(inode, 
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
 	}
 
 	return inode;
@@ -192,18 +192,17 @@
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	int ret = 0;
 
-	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
-
 	args = opaque;
 
 	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
 
+	trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
+
 	if (oi->ip_blkno != args->fi_blkno)
 		goto bail;
 
 	ret = 1;
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -218,8 +217,6 @@
 	static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
 				     ocfs2_file_ip_alloc_sem_key;
 
-	mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
-
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
 	if (args->fi_sysfile_type != 0)
@@ -235,7 +232,6 @@
 		lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
 				  &ocfs2_file_ip_alloc_sem_key);
 
-	mlog_exit(0);
 	return 0;
 }
 
@@ -246,9 +242,6 @@
 	struct ocfs2_super *osb;
 	int use_plocks = 1;
 
-	mlog_entry("(0x%p, size:%llu)\n", inode,
-		   (unsigned long long)le64_to_cpu(fe->i_size));
-
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
 
@@ -300,20 +293,20 @@
 
 	inode->i_nlink = ocfs2_read_links_count(fe);
 
+	trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
+				   le32_to_cpu(fe->i_flags));
 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
 		inode->i_flags |= S_NOQUOTA;
 	}
-
+  
 	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
-		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
 		inode->i_flags |= S_NOQUOTA;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
-		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
 		/* we can't actually hit this as read_inode can't
 		 * handle superblocks today ;-) */
 		BUG();
@@ -381,7 +374,6 @@
 	if (S_ISDIR(inode->i_mode))
 		ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
 				    OCFS2_RESV_FLAG_DIR);
-	mlog_exit_void();
 }
 
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -394,8 +386,6 @@
 	int status, can_lock;
 	u32 generation = 0;
 
-	mlog_entry("(0x%p, 0x%p)\n", inode, args);
-
 	status = -EINVAL;
 	if (inode == NULL || inode->i_sb == NULL) {
 		mlog(ML_ERROR, "bad inode\n");
@@ -443,6 +433,9 @@
 		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
 		&& !ocfs2_mount_local(osb);
 
+	trace_ocfs2_read_locked_inode(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
+
 	/*
 	 * To maintain backwards compatibility with older versions of
 	 * ocfs2-tools, we still store the generation value for system
@@ -534,7 +527,6 @@
 	if (args && bh)
 		brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -551,8 +543,6 @@
 	struct ocfs2_dinode *fe;
 	handle_t *handle = NULL;
 
-	mlog_entry_void();
-
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	/*
@@ -600,7 +590,6 @@
 out:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
-	mlog_exit(status);
 	return status;
 }
 
@@ -696,8 +685,6 @@
 
 	spin_lock(&osb->osb_lock);
 	if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
-		mlog(0, "Recovery is happening on orphan dir %d, will skip "
-		     "this inode\n", slot);
 		ret = -EDEADLK;
 		goto out;
 	}
@@ -706,6 +693,7 @@
 	osb->osb_orphan_wipes[slot]++;
 out:
 	spin_unlock(&osb->osb_lock);
+	trace_ocfs2_check_orphan_recovery_state(slot, ret);
 	return ret;
 }
 
@@ -816,6 +804,10 @@
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
+	trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
+					     (unsigned long long)oi->ip_blkno,
+					     oi->ip_flags);
+
 	/* We shouldn't be getting here for the root directory
 	 * inode.. */
 	if (inode == osb->root_inode) {
@@ -828,11 +820,8 @@
 	 * have to skip deleting this guy. That's OK though because
 	 * the node who's doing the actual deleting should handle it
 	 * anyway. */
-	if (current == osb->dc_task) {
-		mlog(0, "Skipping delete of %lu because we're currently "
-		     "in downconvert\n", inode->i_ino);
+	if (current == osb->dc_task)
 		goto bail;
-	}
 
 	spin_lock(&oi->ip_lock);
 	/* OCFS2 *never* deletes system files. This should technically
@@ -847,11 +836,8 @@
 	/* If we have allowd wipe of this inode for another node, it
 	 * will be marked here so we can safely skip it. Recovery will
 	 * cleanup any inodes we might inadvertantly skip here. */
-	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
-		mlog(0, "Skipping delete of %lu because another node "
-		     "has done this for us.\n", inode->i_ino);
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
 		goto bail_unlock;
-	}
 
 	ret = 1;
 bail_unlock:
@@ -868,28 +854,27 @@
 				  struct buffer_head *di_bh,
 				  int *wipe)
 {
-	int status = 0;
+	int status = 0, reason = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di;
 
 	*wipe = 0;
 
+	trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
+					   inode->i_nlink);
+
 	/* While we were waiting for the cluster lock in
 	 * ocfs2_delete_inode, another node might have asked to delete
 	 * the inode. Recheck our flags to catch this. */
 	if (!ocfs2_inode_is_valid_to_delete(inode)) {
-		mlog(0, "Skipping delete of %llu because flags changed\n",
-		     (unsigned long long)oi->ip_blkno);
+		reason = 1;
 		goto bail;
 	}
 
 	/* Now that we have an up to date inode, we can double check
 	 * the link count. */
-	if (inode->i_nlink) {
-		mlog(0, "Skipping delete of %llu because nlink = %u\n",
-		     (unsigned long long)oi->ip_blkno, inode->i_nlink);
+	if (inode->i_nlink)
 		goto bail;
-	}
 
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
@@ -904,9 +889,7 @@
 		 * ORPHANED_FL not.
 		 */
 		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
-			mlog(0, "Reflinked inode %llu is no longer orphaned.  "
-			     "it shouldn't be deleted\n",
-			     (unsigned long long)oi->ip_blkno);
+			reason = 2;
 			goto bail;
 		}
 
@@ -943,8 +926,7 @@
 	status = ocfs2_try_open_lock(inode, 1);
 	if (status == -EAGAIN) {
 		status = 0;
-		mlog(0, "Skipping delete of %llu because it is in use on "
-		     "other nodes\n", (unsigned long long)oi->ip_blkno);
+		reason = 3;
 		goto bail;
 	}
 	if (status < 0) {
@@ -953,11 +935,10 @@
 	}
 
 	*wipe = 1;
-	mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
-	     (unsigned long long)oi->ip_blkno,
-	     le16_to_cpu(di->i_orphaned_slot));
+	trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
 
 bail:
+	trace_ocfs2_query_inode_wipe_end(status, reason);
 	return status;
 }
 
@@ -967,8 +948,8 @@
 static void ocfs2_cleanup_delete_inode(struct inode *inode,
 				       int sync_data)
 {
-	mlog(0, "Cleanup inode %llu, sync = %d\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
+	trace_ocfs2_cleanup_delete_inode(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
 	if (sync_data)
 		write_inode_now(inode, 1);
 	truncate_inode_pages(&inode->i_data, 0);
@@ -980,15 +961,15 @@
 	sigset_t oldset;
 	struct buffer_head *di_bh = NULL;
 
-	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+	trace_ocfs2_delete_inode(inode->i_ino,
+				 (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				 is_bad_inode(inode));
 
 	/* When we fail in read_inode() we mark inode as bad. The second test
 	 * catches the case when inode allocation fails before allocating
 	 * a block for inode. */
-	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
-		mlog(0, "Skipping delete of bad inode\n");
+	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
 		goto bail;
-	}
 
 	dquot_initialize(inode);
 
@@ -1080,7 +1061,7 @@
 bail_unblock:
 	ocfs2_unblock_signals(&oldset);
 bail:
-	mlog_exit_void();
+	return;
 }
 
 static void ocfs2_clear_inode(struct inode *inode)
@@ -1088,11 +1069,9 @@
 	int status;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry_void();
-
 	end_writeback(inode);
-	mlog(0, "Clearing inode: %llu, nlink = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
+				inode->i_nlink);
 
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
 			"Inode=%lu\n", inode->i_ino);
@@ -1181,8 +1160,6 @@
 	 */
 	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
 				       &oi->ip_jinode);
-
-	mlog_exit_void();
 }
 
 void ocfs2_evict_inode(struct inode *inode)
@@ -1204,17 +1181,14 @@
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	int res;
 
-	mlog_entry_void();
-
-	mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
-	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+	trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
+				inode->i_nlink, oi->ip_flags);
 
 	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
 		res = 1;
 	else
 		res = generic_drop_inode(inode);
 
-	mlog_exit_void();
 	return res;
 }
 
@@ -1226,11 +1200,11 @@
 	struct inode *inode = dentry->d_inode;
 	int status = 0;
 
-	mlog_entry("(inode = 0x%p, ino = %llu)\n", inode,
-		   inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
+	trace_ocfs2_inode_revalidate(inode,
+		inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
+		inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
 
 	if (!inode) {
-		mlog(0, "eep, no inode!\n");
 		status = -ENOENT;
 		goto bail;
 	}
@@ -1238,7 +1212,6 @@
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		mlog(0, "inode deleted!\n");
 		status = -ENOENT;
 		goto bail;
 	}
@@ -1254,8 +1227,6 @@
 	}
 	ocfs2_inode_unlock(inode, 0);
 bail:
-	mlog_exit(status);
-
 	return status;
 }
 
@@ -1271,8 +1242,7 @@
 	int status;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 
-	mlog_entry("(inode %llu)\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1302,7 +1272,6 @@
 
 	ocfs2_journal_dirty(handle, bh);
 leave:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1345,8 +1314,7 @@
 	int rc;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
-	mlog(0, "Validating dinode %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 09de77c..8f13c59 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -9,7 +9,6 @@
 #include <linux/mount.h>
 #include <linux/compat.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -46,6 +45,22 @@
 #define o2info_set_request_error(a, b) \
 		__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
 
+static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
+{
+	req->ir_flags |= OCFS2_INFO_FL_FILLED;
+}
+
+#define o2info_set_request_filled(a) \
+		__o2info_set_request_filled((struct ocfs2_info_request *)&(a))
+
+static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
+{
+	req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
+}
+
+#define o2info_clear_request_filled(a) \
+		__o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
+
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
 	int status;
@@ -59,7 +74,6 @@
 	*flags = OCFS2_I(inode)->ip_attr;
 	ocfs2_inode_unlock(inode, 0);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -125,7 +139,6 @@
 
 	brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -139,7 +152,8 @@
 		goto bail;
 
 	oib.ib_blocksize = inode->i_sb->s_blocksize;
-	oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	o2info_set_request_filled(oib);
 
 	if (o2info_to_user(oib, req))
 		goto bail;
@@ -163,7 +177,8 @@
 		goto bail;
 
 	oic.ic_clustersize = osb->s_clustersize;
-	oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	o2info_set_request_filled(oic);
 
 	if (o2info_to_user(oic, req))
 		goto bail;
@@ -187,7 +202,8 @@
 		goto bail;
 
 	oim.im_max_slots = osb->max_slots;
-	oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	o2info_set_request_filled(oim);
 
 	if (o2info_to_user(oim, req))
 		goto bail;
@@ -211,7 +227,8 @@
 		goto bail;
 
 	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
-	oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	o2info_set_request_filled(oil);
 
 	if (o2info_to_user(oil, req))
 		goto bail;
@@ -235,7 +252,8 @@
 		goto bail;
 
 	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
-	oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	o2info_set_request_filled(oiu);
 
 	if (o2info_to_user(oiu, req))
 		goto bail;
@@ -261,7 +279,8 @@
 	oif.if_compat_features = osb->s_feature_compat;
 	oif.if_incompat_features = osb->s_feature_incompat;
 	oif.if_ro_compat_features = osb->s_feature_ro_compat;
-	oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	o2info_set_request_filled(oif);
 
 	if (o2info_to_user(oif, req))
 		goto bail;
@@ -286,7 +305,7 @@
 
 	oij.ij_journal_size = osb->journal->j_inode->i_size;
 
-	oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+	o2info_set_request_filled(oij);
 
 	if (o2info_to_user(oij, req))
 		goto bail;
@@ -308,7 +327,7 @@
 	if (o2info_from_user(oir, req))
 		goto bail;
 
-	oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+	o2info_clear_request_filled(oir);
 
 	if (o2info_to_user(oir, req))
 		goto bail;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index faa2303..dcc2d93 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -31,7 +31,6 @@
 #include <linux/time.h>
 #include <linux/random.h>
 
-#define MLOG_MASK_PREFIX ML_JOURNAL
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -52,6 +51,7 @@
 #include "quota.h"
 
 #include "buffer_head_io.h"
+#include "ocfs2_trace.h"
 
 DEFINE_SPINLOCK(trans_inc_lock);
 
@@ -303,16 +303,15 @@
 	unsigned int flushed;
 	struct ocfs2_journal *journal = NULL;
 
-	mlog_entry_void();
-
 	journal = osb->journal;
 
 	/* Flush all pending commits and checkpoint the journal. */
 	down_write(&journal->j_trans_barrier);
 
-	if (atomic_read(&journal->j_num_trans) == 0) {
+	flushed = atomic_read(&journal->j_num_trans);
+	trace_ocfs2_commit_cache_begin(flushed);
+	if (flushed == 0) {
 		up_write(&journal->j_trans_barrier);
-		mlog(0, "No transactions for me to flush!\n");
 		goto finally;
 	}
 
@@ -331,13 +330,11 @@
 	atomic_set(&journal->j_num_trans, 0);
 	up_write(&journal->j_trans_barrier);
 
-	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
-	     journal->j_trans_id, flushed);
+	trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
 
 	ocfs2_wake_downconvert_thread(osb);
 	wake_up(&journal->j_checkpointed);
 finally:
-	mlog_exit(status);
 	return status;
 }
 
@@ -425,9 +422,8 @@
 		return 0;
 
 	old_nblocks = handle->h_buffer_credits;
-	mlog_entry_void();
 
-	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+	trace_ocfs2_extend_trans(old_nblocks, nblocks);
 
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	status = 1;
@@ -440,9 +436,7 @@
 #endif
 
 	if (status > 0) {
-		mlog(0,
-		     "jbd2_journal_extend failed, trying "
-		     "jbd2_journal_restart\n");
+		trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
 		status = jbd2_journal_restart(handle,
 					      old_nblocks + nblocks);
 		if (status < 0) {
@@ -453,8 +447,6 @@
 
 	status = 0;
 bail:
-
-	mlog_exit(status);
 	return status;
 }
 
@@ -622,12 +614,9 @@
 	BUG_ON(!handle);
 	BUG_ON(!bh);
 
-	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
-		   (unsigned long long)bh->b_blocknr, type,
-		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
-		   "OCFS2_JOURNAL_ACCESS_CREATE" :
-		   "OCFS2_JOURNAL_ACCESS_WRITE",
-		   bh->b_size);
+	trace_ocfs2_journal_access(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)bh->b_blocknr, type, bh->b_size);
 
 	/* we can safely remove this assertion after testing. */
 	if (!buffer_uptodate(bh)) {
@@ -668,7 +657,6 @@
 		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
 		     status, type);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -737,13 +725,10 @@
 {
 	int status;
 
-	mlog_entry("(bh->b_blocknr=%llu)\n",
-		   (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
 	BUG_ON(status);
-
-	mlog_exit_void();
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -775,8 +760,6 @@
 	struct ocfs2_super *osb;
 	int inode_lock = 0;
 
-	mlog_entry_void();
-
 	BUG_ON(!journal);
 
 	osb = journal->j_osb;
@@ -820,10 +803,9 @@
 		goto done;
 	}
 
-	mlog(0, "inode->i_size = %lld\n", inode->i_size);
-	mlog(0, "inode->i_blocks = %llu\n",
-			(unsigned long long)inode->i_blocks);
-	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+	trace_ocfs2_journal_init(inode->i_size,
+				 (unsigned long long)inode->i_blocks,
+				 OCFS2_I(inode)->ip_clusters);
 
 	/* call the kernels journal init function now */
 	j_journal = jbd2_journal_init_inode(inode);
@@ -833,8 +815,7 @@
 		goto done;
 	}
 
-	mlog(0, "Returned from jbd2_journal_init_inode\n");
-	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+	trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
 
 	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
 		  OCFS2_JOURNAL_DIRTY_FL);
@@ -859,7 +840,6 @@
 		}
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -882,8 +862,6 @@
 	struct buffer_head *bh = journal->j_bh;
 	struct ocfs2_dinode *fe;
 
-	mlog_entry_void();
-
 	fe = (struct ocfs2_dinode *)bh->b_data;
 
 	/* The journal bh on the osb always comes from ocfs2_journal_init()
@@ -906,7 +884,6 @@
 	if (status < 0)
 		mlog_errno(status);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -921,8 +898,6 @@
 	struct inode *inode = NULL;
 	int num_running_trans = 0;
 
-	mlog_entry_void();
-
 	BUG_ON(!osb);
 
 	journal = osb->journal;
@@ -939,10 +914,7 @@
 		BUG();
 
 	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
-	if (num_running_trans > 0)
-		mlog(0, "Shutting down journal: must wait on %d "
-		     "running transactions!\n",
-		     num_running_trans);
+	trace_ocfs2_journal_shutdown(num_running_trans);
 
 	/* Do a commit_cache here. It will flush our journal, *and*
 	 * release any locks that are still held.
@@ -955,7 +927,7 @@
 	 * completely destroy the journal. */
 	if (osb->commit_task) {
 		/* Wait for the commit thread */
-		mlog(0, "Waiting for ocfs2commit to exit....\n");
+		trace_ocfs2_journal_shutdown_wait(osb->commit_task);
 		kthread_stop(osb->commit_task);
 		osb->commit_task = NULL;
 	}
@@ -998,7 +970,6 @@
 done:
 	if (inode)
 		iput(inode);
-	mlog_exit_void();
 }
 
 static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1024,8 +995,6 @@
 	int status = 0;
 	struct ocfs2_super *osb;
 
-	mlog_entry_void();
-
 	BUG_ON(!journal);
 
 	osb = journal->j_osb;
@@ -1059,7 +1028,6 @@
 		osb->commit_task = NULL;
 
 done:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1070,8 +1038,6 @@
 {
 	int status;
 
-	mlog_entry_void();
-
 	BUG_ON(!journal);
 
 	status = jbd2_journal_wipe(journal->j_journal, full);
@@ -1085,7 +1051,6 @@
 		mlog_errno(status);
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1124,8 +1089,6 @@
 #define CONCURRENT_JOURNAL_FILL 32ULL
 	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
 
-	mlog_entry_void();
-
 	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
 
 	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
@@ -1161,7 +1124,6 @@
 bail:
 	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
 		brelse(bhs[i]);
-	mlog_exit(status);
 	return status;
 }
 
@@ -1185,7 +1147,7 @@
  */
 void ocfs2_complete_recovery(struct work_struct *work)
 {
-	int ret;
+	int ret = 0;
 	struct ocfs2_journal *journal =
 		container_of(work, struct ocfs2_journal, j_recovery_work);
 	struct ocfs2_super *osb = journal->j_osb;
@@ -1194,9 +1156,8 @@
 	struct ocfs2_quota_recovery *qrec;
 	LIST_HEAD(tmp_la_list);
 
-	mlog_entry_void();
-
-	mlog(0, "completing recovery from keventd\n");
+	trace_ocfs2_complete_recovery(
+		(unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
 
 	spin_lock(&journal->j_lock);
 	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
@@ -1205,15 +1166,18 @@
 	list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
 		list_del_init(&item->lri_list);
 
-		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
-
 		ocfs2_wait_on_quotas(osb);
 
 		la_dinode = item->lri_la_dinode;
-		if (la_dinode) {
-			mlog(0, "Clean up local alloc %llu\n",
-			     (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
+		tl_dinode = item->lri_tl_dinode;
+		qrec = item->lri_qrec;
 
+		trace_ocfs2_complete_recovery_slot(item->lri_slot,
+			la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
+			tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
+			qrec);
+
+		if (la_dinode) {
 			ret = ocfs2_complete_local_alloc_recovery(osb,
 								  la_dinode);
 			if (ret < 0)
@@ -1222,11 +1186,7 @@
 			kfree(la_dinode);
 		}
 
-		tl_dinode = item->lri_tl_dinode;
 		if (tl_dinode) {
-			mlog(0, "Clean up truncate log %llu\n",
-			     (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
-
 			ret = ocfs2_complete_truncate_log_recovery(osb,
 								   tl_dinode);
 			if (ret < 0)
@@ -1239,9 +1199,7 @@
 		if (ret < 0)
 			mlog_errno(ret);
 
-		qrec = item->lri_qrec;
 		if (qrec) {
-			mlog(0, "Recovering quota files");
 			ret = ocfs2_finish_quota_recovery(osb, qrec,
 							  item->lri_slot);
 			if (ret < 0)
@@ -1252,8 +1210,7 @@
 		kfree(item);
 	}
 
-	mlog(0, "Recovery completion\n");
-	mlog_exit_void();
+	trace_ocfs2_complete_recovery_end(ret);
 }
 
 /* NOTE: This function always eats your references to la_dinode and
@@ -1339,8 +1296,6 @@
 	int rm_quota_used = 0, i;
 	struct ocfs2_quota_recovery *qrec;
 
-	mlog_entry_void();
-
 	status = ocfs2_wait_on_mount(osb);
 	if (status < 0) {
 		goto bail;
@@ -1372,15 +1327,12 @@
 		 * clear it until ocfs2_recover_node() has succeeded. */
 		node_num = rm->rm_entries[0];
 		spin_unlock(&osb->osb_lock);
-		mlog(0, "checking node %d\n", node_num);
 		slot_num = ocfs2_node_num_to_slot(osb, node_num);
+		trace_ocfs2_recovery_thread_node(node_num, slot_num);
 		if (slot_num == -ENOENT) {
 			status = 0;
-			mlog(0, "no slot for this node, so no recovery"
-			     "required.\n");
 			goto skip_recovery;
 		}
-		mlog(0, "node %d was using slot %d\n", node_num, slot_num);
 
 		/* It is a bit subtle with quota recovery. We cannot do it
 		 * immediately because we have to obtain cluster locks from
@@ -1407,7 +1359,7 @@
 		spin_lock(&osb->osb_lock);
 	}
 	spin_unlock(&osb->osb_lock);
-	mlog(0, "All nodes recovered\n");
+	trace_ocfs2_recovery_thread_end(status);
 
 	/* Refresh all journal recovery generations from disk */
 	status = ocfs2_check_journals_nolocks(osb);
@@ -1451,7 +1403,6 @@
 	if (rm_quota)
 		kfree(rm_quota);
 
-	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
 	 * complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1461,20 +1412,16 @@
 
 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 {
-	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
-		   node_num, osb->node_num);
-
 	mutex_lock(&osb->recovery_lock);
+
+	trace_ocfs2_recovery_thread(node_num, osb->node_num,
+		osb->disable_recovery, osb->recovery_thread_task,
+		osb->disable_recovery ?
+		-1 : ocfs2_recovery_map_set(osb, node_num));
+
 	if (osb->disable_recovery)
 		goto out;
 
-	/* People waiting on recovery will wait on
-	 * the recovery map to empty. */
-	if (ocfs2_recovery_map_set(osb, node_num))
-		mlog(0, "node %d already in recovery map.\n", node_num);
-
-	mlog(0, "starting recovery thread...\n");
-
 	if (osb->recovery_thread_task)
 		goto out;
 
@@ -1488,8 +1435,6 @@
 out:
 	mutex_unlock(&osb->recovery_lock);
 	wake_up(&osb->recovery_event);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
@@ -1563,7 +1508,7 @@
 	 * If not, it needs recovery.
 	 */
 	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
-		mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+		trace_ocfs2_replay_journal_recovered(slot_num,
 		     osb->slot_recovery_generations[slot_num], slot_reco_gen);
 		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
 		status = -EBUSY;
@@ -1574,7 +1519,7 @@
 
 	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
-		mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
+		trace_ocfs2_replay_journal_lock_err(status);
 		if (status != -ERESTARTSYS)
 			mlog(ML_ERROR, "Could not lock journal!\n");
 		goto done;
@@ -1587,7 +1532,7 @@
 	slot_reco_gen = ocfs2_get_recovery_generation(fe);
 
 	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
-		mlog(0, "No recovery required for node %d\n", node_num);
+		trace_ocfs2_replay_journal_skip(node_num);
 		/* Refresh recovery generation for the slot */
 		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
 		goto done;
@@ -1608,7 +1553,6 @@
 		goto done;
 	}
 
-	mlog(0, "calling journal_init_inode\n");
 	journal = jbd2_journal_init_inode(inode);
 	if (journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
@@ -1628,7 +1572,6 @@
 	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
 
 	/* wipe the journal */
-	mlog(0, "flushing the journal.\n");
 	jbd2_journal_lock_updates(journal);
 	status = jbd2_journal_flush(journal);
 	jbd2_journal_unlock_updates(journal);
@@ -1665,7 +1608,6 @@
 
 	brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1688,8 +1630,7 @@
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
-	mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
-		   node_num, slot_num, osb->node_num);
+	trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
@@ -1698,9 +1639,7 @@
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
 		if (status == -EBUSY) {
-			mlog(0, "Skipping recovery for slot %u (node %u) "
-			     "as another node has recovered it\n", slot_num,
-			     node_num);
+			trace_ocfs2_recover_node_skip(slot_num, node_num);
 			status = 0;
 			goto done;
 		}
@@ -1735,7 +1674,6 @@
 	status = 0;
 done:
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1808,8 +1746,8 @@
 		spin_lock(&osb->osb_lock);
 		osb->slot_recovery_generations[i] = gen;
 
-		mlog(0, "Slot %u recovery generation is %u\n", i,
-		     osb->slot_recovery_generations[i]);
+		trace_ocfs2_mark_dead_nodes(i,
+					    osb->slot_recovery_generations[i]);
 
 		if (i == osb->slot_num) {
 			spin_unlock(&osb->osb_lock);
@@ -1845,7 +1783,6 @@
 
 	status = 0;
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1884,11 +1821,12 @@
 
 	os = &osb->osb_orphan_scan;
 
-	mlog(0, "Begin orphan scan\n");
-
 	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
 		goto out;
 
+	trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
+					    atomic_read(&os->os_state));
+
 	status = ocfs2_orphan_scan_lock(osb, &seqno);
 	if (status < 0) {
 		if (status != -EAGAIN)
@@ -1918,7 +1856,8 @@
 unlock:
 	ocfs2_orphan_scan_unlock(osb, seqno);
 out:
-	mlog(0, "Orphan scan completed\n");
+	trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
+					  atomic_read(&os->os_state));
 	return;
 }
 
@@ -2002,8 +1941,7 @@
 	if (IS_ERR(iter))
 		return 0;
 
-	mlog(0, "queue orphan %llu\n",
-	     (unsigned long long)OCFS2_I(iter)->ip_blkno);
+	trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
 	/* No locking is required for the next_orphan queue as there
 	 * is only ever a single process doing orphan recovery. */
 	OCFS2_I(iter)->ip_next_orphan = p->head;
@@ -2119,7 +2057,7 @@
 	struct inode *iter;
 	struct ocfs2_inode_info *oi;
 
-	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+	trace_ocfs2_recover_orphans(slot);
 
 	ocfs2_mark_recovering_orphan_dir(osb, slot);
 	ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -2132,7 +2070,8 @@
 
 	while (inode) {
 		oi = OCFS2_I(inode);
-		mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
+		trace_ocfs2_recover_orphans_iput(
+					(unsigned long long)oi->ip_blkno);
 
 		iter = oi->ip_next_orphan;
 
@@ -2170,6 +2109,7 @@
 	 * MOUNTED flag, but this is set right before
 	 * dismount_volume() so we can trust it. */
 	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+		trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
 		mlog(0, "mount error, exiting!\n");
 		return -EBUSY;
 	}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec6adbf..210c352 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,7 +29,6 @@
 #include <linux/highmem.h>
 #include <linux/bitops.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -43,6 +42,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "sysfile.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -201,8 +201,7 @@
 	la_max_mb = ocfs2_clusters_to_megabytes(sb,
 						ocfs2_local_alloc_size(sb) * 8);
 
-	mlog(0, "requested: %dM, max: %uM, default: %uM\n",
-	     requested_mb, la_max_mb, la_default_mb);
+	trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
 
 	if (requested_mb == -1) {
 		/* No user request - use defaults */
@@ -276,8 +275,8 @@
 
 	ret = 1;
 bail:
-	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
-	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+	trace_ocfs2_alloc_should_use_local(
+	     (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
 	spin_unlock(&osb->osb_lock);
 	return ret;
 }
@@ -291,8 +290,6 @@
 	struct inode *inode = NULL;
 	struct ocfs2_local_alloc *la;
 
-	mlog_entry_void();
-
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -364,9 +361,10 @@
 	if (inode)
 		iput(inode);
 
-	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
+	trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -388,8 +386,6 @@
 	struct ocfs2_dinode *alloc_copy = NULL;
 	struct ocfs2_dinode *alloc = NULL;
 
-	mlog_entry_void();
-
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
@@ -482,8 +478,6 @@
 
 	if (alloc_copy)
 		kfree(alloc_copy);
-
-	mlog_exit_void();
 }
 
 /*
@@ -502,7 +496,7 @@
 	struct inode *inode = NULL;
 	struct ocfs2_dinode *alloc;
 
-	mlog_entry("(slot_num = %d)\n", slot_num);
+	trace_ocfs2_begin_local_alloc_recovery(slot_num);
 
 	*alloc_copy = NULL;
 
@@ -552,7 +546,8 @@
 		iput(inode);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -570,8 +565,6 @@
 	struct buffer_head *main_bm_bh = NULL;
 	struct inode *main_bm_inode;
 
-	mlog_entry_void();
-
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -620,7 +613,8 @@
 out:
 	if (!status)
 		ocfs2_init_steal_slots(osb);
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -640,8 +634,6 @@
 	struct inode *local_alloc_inode;
 	unsigned int free_bits;
 
-	mlog_entry_void();
-
 	BUG_ON(!ac);
 
 	local_alloc_inode =
@@ -712,10 +704,6 @@
 			goto bail;
 	}
 
-	if (ac->ac_max_block)
-		mlog(0, "Calling in_range for max block %llu\n",
-		     (unsigned long long)ac->ac_max_block);
-
 	ac->ac_inode = local_alloc_inode;
 	/* We should never use localalloc from another slot */
 	ac->ac_alloc_slot = osb->slot_num;
@@ -729,10 +717,12 @@
 		iput(local_alloc_inode);
 	}
 
-	mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
-	     status);
+	trace_ocfs2_reserve_local_alloc_bits(
+		(unsigned long long)ac->ac_max_block,
+		bits_wanted, osb->slot_num, status);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -749,7 +739,6 @@
 	struct ocfs2_dinode *alloc;
 	struct ocfs2_local_alloc *la;
 
-	mlog_entry_void();
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
 
 	local_alloc_inode = ac->ac_inode;
@@ -788,7 +777,8 @@
 	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -799,13 +789,11 @@
 	u32 count = 0;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	mlog_entry_void();
-
 	buffer = la->la_bitmap;
 	for (i = 0; i < le16_to_cpu(la->la_size); i++)
 		count += hweight8(buffer[i]);
 
-	mlog_exit(count);
+	trace_ocfs2_local_alloc_count_bits(count);
 	return count;
 }
 
@@ -820,10 +808,7 @@
 	void *bitmap = NULL;
 	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-	mlog_entry("(numbits wanted = %u)\n", *numbits);
-
 	if (!alloc->id1.bitmap1.i_total) {
-		mlog(0, "No bits in my window!\n");
 		bitoff = -1;
 		goto bail;
 	}
@@ -883,8 +868,7 @@
 		}
 	}
 
-	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
-	     numfound);
+	trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
 
 	if (numfound == *numbits)
 		bitoff = startoff - numfound;
@@ -895,7 +879,10 @@
 	if (local_resv)
 		ocfs2_resv_discard(resmap, resv);
 
-	mlog_exit(bitoff);
+	trace_ocfs2_local_alloc_find_clear_bits(*numbits,
+		le32_to_cpu(alloc->id1.bitmap1.i_total),
+		bitoff, numfound);
+
 	return bitoff;
 }
 
@@ -903,15 +890,12 @@
 {
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 	int i;
-	mlog_entry_void();
 
 	alloc->id1.bitmap1.i_total = 0;
 	alloc->id1.bitmap1.i_used = 0;
 	la->la_bm_off = 0;
 	for(i = 0; i < le16_to_cpu(la->la_size); i++)
 		la->la_bitmap[i] = 0;
-
-	mlog_exit_void();
 }
 
 #if 0
@@ -952,18 +936,16 @@
 	void *bitmap;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	mlog_entry("total = %u, used = %u\n",
-		   le32_to_cpu(alloc->id1.bitmap1.i_total),
-		   le32_to_cpu(alloc->id1.bitmap1.i_used));
+	trace_ocfs2_sync_local_to_main(
+	     le32_to_cpu(alloc->id1.bitmap1.i_total),
+	     le32_to_cpu(alloc->id1.bitmap1.i_used));
 
 	if (!alloc->id1.bitmap1.i_total) {
-		mlog(0, "nothing to sync!\n");
 		goto bail;
 	}
 
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
 	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
-		mlog(0, "all bits were taken!\n");
 		goto bail;
 	}
 
@@ -985,8 +967,7 @@
 				ocfs2_clusters_to_blocks(osb->sb,
 							 start - count);
 
-			mlog(0, "freeing %u bits starting at local alloc bit "
-			     "%u (la_start_blk = %llu, blkno = %llu)\n",
+			trace_ocfs2_sync_local_to_main_free(
 			     count, start - count,
 			     (unsigned long long)la_start_blk,
 			     (unsigned long long)blkno);
@@ -1007,7 +988,8 @@
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1132,7 +1114,8 @@
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1148,17 +1131,12 @@
 	struct ocfs2_dinode *alloc = NULL;
 	struct ocfs2_local_alloc *la;
 
-	mlog_entry_void();
-
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
 
-	if (alloc->id1.bitmap1.i_total)
-		mlog(0, "asking me to alloc a new window over a non-empty "
-		     "one\n");
-
-	mlog(0, "Allocating %u clusters for a new window.\n",
-	     osb->local_alloc_bits);
+	trace_ocfs2_local_alloc_new_window(
+		le32_to_cpu(alloc->id1.bitmap1.i_total),
+		osb->local_alloc_bits);
 
 	/* Instruct the allocation code to try the most recently used
 	 * cluster group. We'll re-record the group used this pass
@@ -1220,13 +1198,13 @@
 	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
 			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
 
-	mlog(0, "New window allocated:\n");
-	mlog(0, "window la_bm_off = %u\n",
-	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
-	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+	trace_ocfs2_local_alloc_new_window_result(
+		OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+		le32_to_cpu(alloc->id1.bitmap1.i_total));
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1243,8 +1221,6 @@
 	struct ocfs2_dinode *alloc_copy = NULL;
 	struct ocfs2_alloc_context *ac = NULL;
 
-	mlog_entry_void();
-
 	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
 
 	/* This will lock the main bitmap for us. */
@@ -1324,7 +1300,8 @@
 	if (ac)
 		ocfs2_free_alloc_context(ac);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index b5cb3ed..e57c804 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/fcntl.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7e32db9..3e9393c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -31,7 +31,6 @@
 #include <linux/signal.h>
 #include <linux/rbtree.h>
 
-#define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -42,6 +41,7 @@
 #include "inode.h"
 #include "mmap.h"
 #include "super.h"
+#include "ocfs2_trace.h"
 
 
 static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
@@ -49,13 +49,12 @@
 	sigset_t oldset;
 	int ret;
 
-	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
-
 	ocfs2_block_signals(&oldset);
 	ret = filemap_fault(area, vmf);
 	ocfs2_unblock_signals(&oldset);
 
-	mlog_exit_ptr(vmf->page);
+	trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
+			  area, vmf->page, vmf->pgoff);
 	return ret;
 }
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d6c25d7..28f2cc1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -42,7 +42,6 @@
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -63,6 +62,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "acl.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -106,17 +106,15 @@
 	struct dentry *ret;
 	struct ocfs2_inode_info *oi;
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
+			   dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
 
 	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
 		ret = ERR_PTR(-ENAMETOOLONG);
 		goto bail;
 	}
 
-	mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
-	     dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-
 	status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
 	if (status < 0) {
 		if (status != -ENOENT)
@@ -182,7 +180,7 @@
 
 bail:
 
-	mlog_exit_ptr(ret);
+	trace_ocfs2_lookup_ret(ret);
 
 	return ret;
 }
@@ -235,9 +233,9 @@
 	sigset_t oldset;
 	int did_block_signals = 0;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
-		   (unsigned long)dev, dentry->d_name.len,
-		   dentry->d_name.name);
+	trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			  (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			  (unsigned long)dev, mode);
 
 	dquot_initialize(dir);
 
@@ -354,10 +352,6 @@
 		goto leave;
 	did_quota_inode = 1;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
-		   dentry->d_name.name);
-
 	/* do the real work now. */
 	status = ocfs2_mknod_locked(osb, dir, inode, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
@@ -436,9 +430,6 @@
 	if (did_block_signals)
 		ocfs2_unblock_signals(&oldset);
 
-	if (status == -ENOSPC)
-		mlog(0, "Disk is full\n");
-
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
 	kfree(si.name);
@@ -466,7 +457,8 @@
 		iput(inode);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -577,7 +569,8 @@
 		}
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -615,10 +608,11 @@
 {
 	int ret;
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			  OCFS2_I(dir)->ip_blkno, mode);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 
 	return ret;
 }
@@ -630,10 +624,11 @@
 {
 	int ret;
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 
 	return ret;
 }
@@ -652,9 +647,9 @@
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
 
-	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
-		   old_dentry->d_name.len, old_dentry->d_name.name,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			 old_dentry->d_name.len, old_dentry->d_name.name,
+			 dentry->d_name.len, dentry->d_name.name);
 
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
@@ -757,7 +752,8 @@
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(err);
+	if (err)
+		mlog_errno(err);
 
 	return err;
 }
@@ -809,19 +805,17 @@
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
+			   dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	dquot_initialize(dir);
 
 	BUG_ON(dentry->d_parent->d_inode != dir);
 
-	mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
-	if (inode == osb->root_inode) {
-		mlog(0, "Cannot delete the root directory\n");
+	if (inode == osb->root_inode)
 		return -EPERM;
-	}
 
 	status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
 					 OI_LS_PARENT);
@@ -843,9 +837,10 @@
 	if (OCFS2_I(inode)->ip_blkno != blkno) {
 		status = -ENOENT;
 
-		mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
+		trace_ocfs2_unlink_noent(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)blkno,
+				OCFS2_I(inode)->ip_flags);
 		goto leave;
 	}
 
@@ -954,7 +949,8 @@
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -975,9 +971,8 @@
 	struct buffer_head **tmpbh;
 	struct inode *tmpinode;
 
-	mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
-		   (unsigned long long)oi1->ip_blkno,
-		   (unsigned long long)oi2->ip_blkno);
+	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+				(unsigned long long)oi2->ip_blkno);
 
 	if (*bh1)
 		*bh1 = NULL;
@@ -988,7 +983,6 @@
 	if (oi1->ip_blkno != oi2->ip_blkno) {
 		if (oi1->ip_blkno < oi2->ip_blkno) {
 			/* switch id1 and id2 around */
-			mlog(0, "switching them around...\n");
 			tmpbh = bh2;
 			bh2 = bh1;
 			bh1 = tmpbh;
@@ -1024,8 +1018,13 @@
 			mlog_errno(status);
 	}
 
+	trace_ocfs2_double_lock_end(
+			(unsigned long long)OCFS2_I(inode1)->ip_blkno,
+			(unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1067,10 +1066,9 @@
 	/* At some point it might be nice to break this function up a
 	 * bit. */
 
-	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
-		   old_dir, old_dentry, new_dir, new_dentry,
-		   old_dentry->d_name.len, old_dentry->d_name.name,
-		   new_dentry->d_name.len, new_dentry->d_name.name);
+	trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
+			   old_dentry->d_name.len, old_dentry->d_name.name,
+			   new_dentry->d_name.len, new_dentry->d_name.name);
 
 	dquot_initialize(old_dir);
 	dquot_initialize(new_dir);
@@ -1227,16 +1225,15 @@
 		if (!new_inode) {
 			status = -EACCES;
 
-			mlog(0, "We found an inode for name %.*s but VFS "
-			     "didn't give us one.\n", new_dentry->d_name.len,
-			     new_dentry->d_name.name);
+			trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
+						new_dentry->d_name.name);
 			goto bail;
 		}
 
 		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
 			status = -EACCES;
 
-			mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
+			trace_ocfs2_rename_disagree(
 			     (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
 			     (unsigned long long)newfe_blkno,
 			     OCFS2_I(new_inode)->ip_flags);
@@ -1259,8 +1256,7 @@
 
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-		mlog(0, "aha rename over existing... new_blkno=%llu "
-		     "newfebh=%p bhblocknr=%llu\n",
+		trace_ocfs2_rename_over_existing(
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1476,7 +1472,8 @@
 	brelse(old_dir_bh);
 	brelse(new_dir_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1501,9 +1498,8 @@
 	 * write i_size + 1 bytes. */
 	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 
-	mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
-			(unsigned long long)inode->i_blocks,
-			i_size_read(inode), blocks);
+	trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
+					i_size_read(inode), blocks);
 
 	/* Sanity check -- make sure we're going to fit. */
 	if (bytes_left >
@@ -1579,7 +1575,8 @@
 		kfree(bhs);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1610,8 +1607,8 @@
 	sigset_t oldset;
 	int did_block_signals = 0;
 
-	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
-		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_symlink_begin(dir, dentry, symname,
+				  dentry->d_name.len, dentry->d_name.name);
 
 	dquot_initialize(dir);
 
@@ -1713,9 +1710,10 @@
 		goto bail;
 	did_quota_inode = 1;
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
-		   inode->i_mode, dentry->d_name.len,
-		   dentry->d_name.name);
+	trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
+				   dentry->d_name.name,
+				   (unsigned long long)OCFS2_I(dir)->ip_blkno,
+				   inode->i_mode);
 
 	status = ocfs2_mknod_locked(osb, dir, inode,
 				    0, &new_fe_bh, parent_fe_bh, handle,
@@ -1835,7 +1833,8 @@
 		iput(inode);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1844,8 +1843,6 @@
 {
 	int status, namelen;
 
-	mlog_entry_void();
-
 	namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
 			   (long long)blkno);
 	if (namelen <= 0) {
@@ -1862,12 +1859,12 @@
 		goto bail;
 	}
 
-	mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
-	     namelen);
+	trace_ocfs2_blkno_stringify(blkno, name, namelen);
 
 	status = 0;
 bail:
-	mlog_exit(status);
+	if (status < 0)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1980,7 +1977,8 @@
 		iput(orphan_dir_inode);
 	}
 
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -1997,7 +1995,8 @@
 	struct ocfs2_dinode *orphan_fe;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+	trace_ocfs2_orphan_add_begin(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
 	if (status < 0) {
@@ -2056,13 +2055,14 @@
 
 	ocfs2_journal_dirty(handle, fe_bh);
 
-	mlog(0, "Inode %llu orphaned in slot %d\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
+	trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				   osb->slot_num);
 
 leave:
 	brelse(orphan_dir_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2078,17 +2078,15 @@
 	int status = 0;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog_entry_void();
-
 	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
-	     name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
-	     OCFS2_ORPHAN_NAMELEN);
+	trace_ocfs2_orphan_del(
+	     (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
+	     name, OCFS2_ORPHAN_NAMELEN);
 
 	/* find it's spot in the orphan directory */
 	status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
@@ -2124,7 +2122,8 @@
 leave:
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2321,9 +2320,6 @@
 		iput(orphan_dir);
 	}
 
-	if (status == -ENOSPC)
-		mlog(0, "Disk is full\n");
-
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
@@ -2358,8 +2354,10 @@
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
+				dentry->d_name.len, dentry->d_name.name,
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
 	if (status < 0) {
@@ -2476,7 +2474,8 @@
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1a97ba1..4092858 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -147,6 +147,17 @@
 
 typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
 
+#ifdef CONFIG_OCFS2_FS_STATS
+struct ocfs2_lock_stats {
+	u64		ls_total;	/* Total wait in NSEC */
+	u32		ls_gets;	/* Num acquires */
+	u32		ls_fail;	/* Num failed acquires */
+
+	/* Storing max wait in usecs saves 24 bytes per inode */
+	u32		ls_max;		/* Max wait in USEC */
+};
+#endif
+
 struct ocfs2_lock_res {
 	void                    *l_priv;
 	struct ocfs2_lock_res_ops *l_ops;
@@ -182,15 +193,9 @@
 	struct list_head         l_debug_list;
 
 #ifdef CONFIG_OCFS2_FS_STATS
-	unsigned long long	 l_lock_num_prmode; 	   /* PR acquires */
-	unsigned long long 	 l_lock_num_exmode; 	   /* EX acquires */
-	unsigned int		 l_lock_num_prmode_failed; /* Failed PR gets */
-	unsigned int		 l_lock_num_exmode_failed; /* Failed EX gets */
-	unsigned long long	 l_lock_total_prmode; 	   /* Tot wait for PR */
-	unsigned long long	 l_lock_total_exmode; 	   /* Tot wait for EX */
-	unsigned int		 l_lock_max_prmode; 	   /* Max wait for PR */
-	unsigned int		 l_lock_max_exmode; 	   /* Max wait for EX */
-	unsigned int		 l_lock_refresh;	   /* Disk refreshes */
+	struct ocfs2_lock_stats  l_lock_prmode;		/* PR mode stats */
+	u32                      l_lock_refresh;	/* Disk refreshes */
+	struct ocfs2_lock_stats  l_lock_exmode;		/* EX mode stats */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	 l_lockdep_map;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 0000000..a1dae5b
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2739 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocfs2
+
+#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCFS2_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(ocfs2__int,
+	TP_PROTO(int num),
+	TP_ARGS(num),
+	TP_STRUCT__entry(
+		__field(int, num)
+	),
+	TP_fast_assign(
+		__entry->num = num;
+	),
+	TP_printk("%d", __entry->num)
+);
+
+#define DEFINE_OCFS2_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__int, name,	\
+	TP_PROTO(int num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__uint,
+	TP_PROTO(unsigned int num),
+	TP_ARGS(num),
+	TP_STRUCT__entry(
+		__field(	unsigned int,	num		)
+	),
+	TP_fast_assign(
+		__entry->num	= 	num;
+	),
+	TP_printk("%u", __entry->num)
+);
+
+#define DEFINE_OCFS2_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint, name,	\
+	TP_PROTO(unsigned int num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__ull,
+	TP_PROTO(unsigned long long blkno),
+	TP_ARGS(blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu", __entry->blkno)
+);
+
+#define DEFINE_OCFS2_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull, name,	\
+	TP_PROTO(unsigned long long num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__pointer,
+	TP_PROTO(void *pointer),
+	TP_ARGS(pointer),
+	TP_STRUCT__entry(
+		__field(void *, pointer)
+	),
+	TP_fast_assign(
+		__entry->pointer = pointer;
+	),
+	TP_printk("%p", __entry->pointer)
+);
+
+#define DEFINE_OCFS2_POINTER_EVENT(name)	\
+DEFINE_EVENT(ocfs2__pointer, name,	\
+	TP_PROTO(void *pointer),	\
+	TP_ARGS(pointer))
+
+DECLARE_EVENT_CLASS(ocfs2__string,
+	TP_PROTO(const char *name),
+	TP_ARGS(name),
+	TP_STRUCT__entry(
+		__string(name,name)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+	),
+	TP_printk("%s", __get_str(name))
+);
+
+#define DEFINE_OCFS2_STRING_EVENT(name)	\
+DEFINE_EVENT(ocfs2__string, name,	\
+	TP_PROTO(const char *name),	\
+	TP_ARGS(name))
+
+DECLARE_EVENT_CLASS(ocfs2__int_int,
+	TP_PROTO(int value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(int, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%d %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_INT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__int_int, name,	\
+	TP_PROTO(int val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_int,
+	TP_PROTO(unsigned int value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned int, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%u %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_int, name,	\
+	TP_PROTO(unsigned int val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint,
+	TP_PROTO(unsigned int value1, unsigned int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%u %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_uint, name,	\
+	TP_PROTO(unsigned int val1, unsigned int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint,
+	TP_PROTO(unsigned long long value1, unsigned int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%llu %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint, name,	\
+	TP_PROTO(unsigned long long val1, unsigned int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int,
+	TP_PROTO(unsigned long long value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%llu %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_int, name,	\
+	TP_PROTO(unsigned long long val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull,
+	TP_PROTO(unsigned long long value1, unsigned long long value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%llu %llu", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull, name,	\
+	TP_PROTO(unsigned long long val1, unsigned long long val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
+	TP_PROTO(unsigned long long value1,
+		 unsigned long long value2, unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+	),
+	TP_printk("%llu %llu %u",
+		  __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_uint, name,	\
+	TP_PROTO(unsigned long long val1,	\
+		 unsigned long long val2, unsigned int val3),	\
+	TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
+	TP_PROTO(unsigned long long value1,
+		 unsigned int value2, unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned int, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3	= value3;
+	),
+	TP_printk("%llu %u %u", __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint_uint, name,	\
+	TP_PROTO(unsigned long long val1,	\
+		 unsigned int val2, unsigned int val3),	\
+	TP_ARGS(val1, val2, val3))
+
+DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
+	TP_PROTO(unsigned int value1, unsigned int value2,
+		 unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(	unsigned int,	value1		)
+		__field(	unsigned int,	value2		)
+		__field(	unsigned int,	value3		)
+	),
+	TP_fast_assign(
+		__entry->value1	= 	value1;
+		__entry->value2	= 	value2;
+		__entry->value3	= 	value3;
+	),
+	TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_uint_uint, name,	\
+	TP_PROTO(unsigned int value1, unsigned int value2,	\
+		 unsigned int value3),	\
+	TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
+	TP_PROTO(unsigned long long value1,
+		 unsigned long long value2, unsigned long long value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned long long, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+	),
+	TP_printk("%llu %llu %llu",
+		  __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_ull, name,	\
+	TP_PROTO(unsigned long long value1, unsigned long long value2,	\
+		 unsigned long long value3),	\
+	TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
+	TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
+	TP_ARGS(ull, value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	ull	)
+		__field(	int,	value1			)
+		__field(	int,	value2			)
+		__field(	int,	value3			)
+	),
+	TP_fast_assign(
+		__entry->ull		= ull;
+		__entry->value1		= value1;
+		__entry->value2		= value2;
+		__entry->value3		= value3;
+	),
+	TP_printk("%llu %d %d %d",
+		  __entry->ull, __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_int_int_int, name,	\
+	TP_PROTO(unsigned long long ull, int value1,	\
+		 int value2, int value3),	\
+	TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
+	TP_PROTO(unsigned long long ull, unsigned int value1,
+		 unsigned int value2, unsigned int value3),
+	TP_ARGS(ull, value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ull)
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->ull = ull;
+		__entry->value1 = value1;
+		__entry->value2	= value2;
+		__entry->value3	= value3;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->ull, __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name,	\
+	TP_PROTO(unsigned long long ull, unsigned int value1,	\
+		 unsigned int value2, unsigned int value3),	\
+	TP_ARGS(ull, value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
+	TP_PROTO(unsigned long long value1, unsigned long long value2,
+		 unsigned int value3, unsigned int value4),
+	TP_ARGS(value1, value2, value3, value4),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned int, value3)
+		__field(unsigned int, value4)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+		__entry->value4 = value4;
+	),
+	TP_printk("%llu %llu %u %u",
+		  __entry->value1, __entry->value2,
+		  __entry->value3, __entry->value4)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name,	\
+	TP_PROTO(unsigned long long ull, unsigned long long ull1,	\
+		 unsigned int value2, unsigned int value3),	\
+	TP_ARGS(ull, ull1, value2, value3))
+
+/* Trace events for fs/ocfs2/alloc.c. */
+DECLARE_EVENT_CLASS(ocfs2__btree_ops,
+	TP_PROTO(unsigned long long owner,\
+		 unsigned int value1, unsigned int value2),
+	TP_ARGS(owner, value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->value1 = value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%llu %u %u",
+		  __entry->owner, __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_BTREE_EVENT(name)	\
+DEFINE_EVENT(ocfs2__btree_ops, name,	\
+	TP_PROTO(unsigned long long owner,	\
+		 unsigned int value1, unsigned int value2),	\
+	TP_ARGS(owner, value1, value2))
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
+
+TRACE_EVENT(ocfs2_grow_tree,
+	TP_PROTO(unsigned long long owner, int depth),
+	TP_ARGS(owner, depth),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(int, depth)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->depth = depth;
+	),
+	TP_printk("%llu %d", __entry->owner, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_rotate_subtree,
+	TP_PROTO(int subtree_root, unsigned long long blkno,
+		 int depth),
+	TP_ARGS(subtree_root, blkno, depth),
+	TP_STRUCT__entry(
+		__field(int, subtree_root)
+		__field(unsigned long long, blkno)
+		__field(int, depth)
+	),
+	TP_fast_assign(
+		__entry->subtree_root = subtree_root;
+		__entry->blkno = blkno;
+		__entry->depth = depth;
+	),
+	TP_printk("%d %llu %d", __entry->subtree_root,
+		  __entry->blkno, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_insert_extent,
+	TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
+		 int ins_contig_index, int free_records, int ins_tree_depth),
+	TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
+		ins_tree_depth),
+	TP_STRUCT__entry(
+		__field(unsigned int, ins_appending)
+		__field(unsigned int, ins_contig)
+		__field(int, ins_contig_index)
+		__field(int, free_records)
+		__field(int, ins_tree_depth)
+	),
+	TP_fast_assign(
+		__entry->ins_appending = ins_appending;
+		__entry->ins_contig = ins_contig;
+		__entry->ins_contig_index = ins_contig_index;
+		__entry->free_records = free_records;
+		__entry->ins_tree_depth = ins_tree_depth;
+	),
+	TP_printk("%u %u %d %d %d",
+		  __entry->ins_appending, __entry->ins_contig,
+		  __entry->ins_contig_index, __entry->free_records,
+		  __entry->ins_tree_depth)
+);
+
+TRACE_EVENT(ocfs2_split_extent,
+	TP_PROTO(int split_index, unsigned int c_contig_type,
+		 unsigned int c_has_empty_extent,
+		 unsigned int c_split_covers_rec),
+	TP_ARGS(split_index, c_contig_type,
+		c_has_empty_extent, c_split_covers_rec),
+	TP_STRUCT__entry(
+		__field(int, split_index)
+		__field(unsigned int, c_contig_type)
+		__field(unsigned int, c_has_empty_extent)
+		__field(unsigned int, c_split_covers_rec)
+	),
+	TP_fast_assign(
+		__entry->split_index = split_index;
+		__entry->c_contig_type = c_contig_type;
+		__entry->c_has_empty_extent = c_has_empty_extent;
+		__entry->c_split_covers_rec = c_split_covers_rec;
+	),
+	TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
+		  __entry->c_has_empty_extent, __entry->c_split_covers_rec)
+);
+
+TRACE_EVENT(ocfs2_remove_extent,
+	TP_PROTO(unsigned long long owner, unsigned int cpos,
+		 unsigned int len, int index,
+		 unsigned int e_cpos, unsigned int clusters),
+	TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(int, index)
+		__field(unsigned int, e_cpos)
+		__field(unsigned int, clusters)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->index = index;
+		__entry->e_cpos = e_cpos;
+		__entry->clusters = clusters;
+	),
+	TP_printk("%llu %u %u %d %u %u",
+		  __entry->owner, __entry->cpos, __entry->len, __entry->index,
+		  __entry->e_cpos, __entry->clusters)
+);
+
+TRACE_EVENT(ocfs2_commit_truncate,
+	TP_PROTO(unsigned long long ino, unsigned int new_cpos,
+		 unsigned int clusters, unsigned int depth),
+	TP_ARGS(ino, new_cpos, clusters, depth),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, new_cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, depth)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->new_cpos = new_cpos;
+		__entry->clusters = clusters;
+		__entry->depth = depth;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->ino, __entry->new_cpos,
+		  __entry->clusters, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_validate_extent_block,
+	TP_PROTO(unsigned long long blkno),
+	TP_ARGS(blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu ", __entry->blkno)
+);
+
+TRACE_EVENT(ocfs2_rotate_leaf,
+	TP_PROTO(unsigned int insert_cpos, int insert_index,
+		 int has_empty, int next_free,
+		 unsigned int l_count),
+	TP_ARGS(insert_cpos, insert_index, has_empty,
+		next_free, l_count),
+	TP_STRUCT__entry(
+		__field(unsigned int, insert_cpos)
+		__field(int, insert_index)
+		__field(int, has_empty)
+		__field(int, next_free)
+		__field(unsigned int, l_count)
+	),
+	TP_fast_assign(
+		__entry->insert_cpos = insert_cpos;
+		__entry->insert_index = insert_index;
+		__entry->has_empty = has_empty;
+		__entry->next_free = next_free;
+		__entry->l_count = l_count;
+	),
+	TP_printk("%u %d %d %d %u", __entry->insert_cpos,
+		  __entry->insert_index, __entry->has_empty,
+		  __entry->next_free, __entry->l_count)
+);
+
+TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
+	TP_PROTO(int status, int reason, int err),
+	TP_ARGS(status, reason, err),
+	TP_STRUCT__entry(
+		__field(int, status)
+		__field(int, reason)
+		__field(int, err)
+	),
+	TP_fast_assign(
+		__entry->status = status;
+		__entry->reason = reason;
+		__entry->err = err;
+	),
+	TP_printk("%d %d %d", __entry->status,
+		  __entry->reason, __entry->err)
+);
+
+TRACE_EVENT(ocfs2_mark_extent_written,
+	TP_PROTO(unsigned long long owner, unsigned int cpos,
+		 unsigned int len, unsigned int phys),
+	TP_ARGS(owner, cpos, len, phys),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(unsigned int, phys)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->phys = phys;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->owner, __entry->cpos,
+		  __entry->len, __entry->phys)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
+	TP_PROTO(unsigned long long blkno, int index,
+		 unsigned int start, unsigned int num),
+	TP_ARGS(blkno, index, start, num),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__field(int, index)
+		__field(unsigned int, start)
+		__field(unsigned int, num)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__entry->index = index;
+		__entry->start = start;
+		__entry->num = num;
+	),
+	TP_printk("%llu %d %u %u",
+		  __entry->blkno, __entry->index,
+		  __entry->start, __entry->num)
+);
+
+#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name)	\
+DEFINE_EVENT(ocfs2__truncate_log_ops, name,	\
+	TP_PROTO(unsigned long long blkno, int index,	\
+		 unsigned int start, unsigned int num),	\
+	TP_ARGS(blkno, index, start, num))
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
+
+TRACE_EVENT(ocfs2_cache_block_dealloc,
+	TP_PROTO(int type, int slot, unsigned long long suballoc,
+		 unsigned long long blkno, unsigned int bit),
+	TP_ARGS(type, slot, suballoc, blkno, bit),
+	TP_STRUCT__entry(
+		__field(int, type)
+		__field(int, slot)
+		__field(unsigned long long, suballoc)
+		__field(unsigned long long, blkno)
+		__field(unsigned int, bit)
+	),
+	TP_fast_assign(
+		__entry->type = type;
+		__entry->slot = slot;
+		__entry->suballoc = suballoc;
+		__entry->blkno = blkno;
+		__entry->bit = bit;
+	),
+	TP_printk("%d %d %llu %llu %u",
+		  __entry->type, __entry->slot, __entry->suballoc,
+		  __entry->blkno, __entry->bit)
+);
+
+/* End of trace events for fs/ocfs2/alloc.c. */
+
+/* Trace events for fs/ocfs2/localalloc.c. */
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
+
+TRACE_EVENT(ocfs2_sync_local_to_main_free,
+	TP_PROTO(int count, int bit, unsigned long long start_blk,
+		 unsigned long long blkno),
+	TP_ARGS(count, bit, start_blk, blkno),
+	TP_STRUCT__entry(
+		__field(int, count)
+		__field(int, bit)
+		__field(unsigned long long, start_blk)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->count = count;
+		__entry->bit = bit;
+		__entry->start_blk = start_blk;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%d %d %llu %llu",
+		  __entry->count, __entry->bit, __entry->start_blk,
+		  __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
+
+/* End of trace events for fs/ocfs2/localalloc.c. */
+
+/* Trace events for fs/ocfs2/resize.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);
+
+/* End of trace events for fs/ocfs2/resize.c. */
+
+/* Trace events for fs/ocfs2/suballoc.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
+
+TRACE_EVENT(ocfs2_relink_block_group,
+	TP_PROTO(unsigned long long i_blkno, unsigned int chain,
+		 unsigned long long bg_blkno,
+		 unsigned long long prev_blkno),
+	TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, i_blkno)
+		__field(unsigned int, chain)
+		__field(unsigned long long, bg_blkno)
+		__field(unsigned long long, prev_blkno)
+	),
+	TP_fast_assign(
+		__entry->i_blkno = i_blkno;
+		__entry->chain = chain;
+		__entry->bg_blkno = bg_blkno;
+		__entry->prev_blkno = prev_blkno;
+	),
+	TP_printk("%llu %u %llu %llu",
+		  __entry->i_blkno, __entry->chain, __entry->bg_blkno,
+		  __entry->prev_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
+
+TRACE_EVENT(ocfs2_free_suballoc_bits,
+	TP_PROTO(unsigned long long inode, unsigned long long group,
+		 unsigned int start_bit, unsigned int count),
+	TP_ARGS(inode, group, start_bit, count),
+	TP_STRUCT__entry(
+		__field(unsigned long long, inode)
+		__field(unsigned long long, group)
+		__field(unsigned int, start_bit)
+		__field(unsigned int, count)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->group = group;
+		__entry->start_bit = start_bit;
+		__entry->count = count;
+	),
+	TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
+		  __entry->start_bit, __entry->count)
+);
+
+TRACE_EVENT(ocfs2_free_clusters,
+	TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
+		 unsigned int start_bit, unsigned int count),
+	TP_ARGS(bg_blkno, start_blk, start_bit, count),
+	TP_STRUCT__entry(
+		__field(unsigned long long, bg_blkno)
+		__field(unsigned long long, start_blk)
+		__field(unsigned int, start_bit)
+		__field(unsigned int, count)
+	),
+	TP_fast_assign(
+		__entry->bg_blkno = bg_blkno;
+		__entry->start_blk = start_blk;
+		__entry->start_bit = start_bit;
+		__entry->count = count;
+	),
+	TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
+		  __entry->start_bit, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
+
+/* End of trace events for fs/ocfs2/suballoc.c. */
+
+/* Trace events for fs/ocfs2/refcounttree.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
+
+DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
+	TP_PROTO(unsigned long long blkno, int index,
+		 unsigned long long cpos,
+		 unsigned int clusters, unsigned int refcount),
+	TP_ARGS(blkno, index, cpos, clusters, refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__field(int, index)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, refcount)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__entry->index = index;
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->refcount = refcount;
+	),
+	TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
+		  __entry->cpos, __entry->clusters, __entry->refcount)
+);
+
+#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name)	\
+DEFINE_EVENT(ocfs2__refcount_tree_ops, name,		\
+	TP_PROTO(unsigned long long blkno, int index,	\
+		 unsigned long long cpos,		\
+		 unsigned int count, unsigned int refcount),	\
+	TP_ARGS(blkno, index, cpos, count, refcount))
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
+
+TRACE_EVENT(ocfs2_split_refcount_rec,
+	TP_PROTO(unsigned long long cpos,
+		 unsigned int clusters, unsigned int refcount,
+		 unsigned long long split_cpos,
+		 unsigned int split_clusters, unsigned int split_refcount),
+	TP_ARGS(cpos, clusters, refcount,
+		split_cpos, split_clusters, split_refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, refcount)
+		__field(unsigned long long, split_cpos)
+		__field(unsigned int, split_clusters)
+		__field(unsigned int, split_refcount)
+	),
+	TP_fast_assign(
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->refcount = refcount;
+		__entry->split_cpos = split_cpos;
+		__entry->split_clusters = split_clusters;
+		__entry->split_refcount	= split_refcount;
+	),
+	TP_printk("%llu %u %u %llu %u %u",
+		  __entry->cpos, __entry->clusters, __entry->refcount,
+		  __entry->split_cpos, __entry->split_clusters,
+		  __entry->split_refcount)
+);
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
+
+TRACE_EVENT(ocfs2_decrease_refcount,
+	TP_PROTO(unsigned long long owner,
+		 unsigned long long cpos,
+		 unsigned int len, int delete),
+	TP_ARGS(owner, cpos, len, delete),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, len)
+		__field(int, delete)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->delete = delete;
+	),
+	TP_printk("%llu %llu %u %d",
+		  __entry->owner, __entry->cpos, __entry->len, __entry->delete)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
+
+TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
+	TP_PROTO(int recs_add, unsigned long long cpos,
+		 unsigned int clusters, unsigned long long r_cpos,
+		 unsigned int r_clusters, unsigned int refcount, int index),
+	TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
+	TP_STRUCT__entry(
+		__field(int, recs_add)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned long long, r_cpos)
+		__field(unsigned int, r_clusters)
+		__field(unsigned int, refcount)
+		__field(int, index)
+	),
+	TP_fast_assign(
+		__entry->recs_add = recs_add;
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->r_cpos = r_cpos;
+		__entry->r_clusters = r_clusters;
+		__entry->refcount = refcount;
+		__entry->index = index;
+	),
+	TP_printk("%d %llu %u %llu %u %u %d",
+		  __entry->recs_add, __entry->cpos, __entry->clusters,
+		  __entry->r_cpos, __entry->r_clusters,
+		  __entry->refcount, __entry->index)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
+
+TRACE_EVENT(ocfs2_clear_ext_refcount,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int len, unsigned int p_cluster,
+		 unsigned int ext_flags),
+	TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(unsigned int, p_cluster)
+		__field(unsigned int, ext_flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->p_cluster = p_cluster;
+		__entry->ext_flags = ext_flags;
+	),
+	TP_printk("%llu %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->len,
+		  __entry->p_cluster, __entry->ext_flags)
+);
+
+TRACE_EVENT(ocfs2_replace_clusters,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int old, unsigned int new, unsigned int len,
+		 unsigned int ext_flags),
+	TP_ARGS(ino, cpos, old, new, len, ext_flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, old)
+		__field(unsigned int, new)
+		__field(unsigned int, len)
+		__field(unsigned int, ext_flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->old = old;
+		__entry->new = new;
+		__entry->len = len;
+		__entry->ext_flags = ext_flags;
+	),
+	TP_printk("%llu %u %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->old, __entry->new,
+		  __entry->len, __entry->ext_flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
+
+TRACE_EVENT(ocfs2_refcount_cow_hunk,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int write_len, unsigned int max_cpos,
+		 unsigned int cow_start, unsigned int cow_len),
+	TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, write_len)
+		__field(unsigned int, max_cpos)
+		__field(unsigned int, cow_start)
+		__field(unsigned int, cow_len)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->write_len = write_len;
+		__entry->max_cpos = max_cpos;
+		__entry->cow_start = cow_start;
+		__entry->cow_len = cow_len;
+	),
+	TP_printk("%llu %u %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->write_len,
+		  __entry->max_cpos, __entry->cow_start, __entry->cow_len)
+);
+
+/* End of trace events for fs/ocfs2/refcounttree.c. */
+
+/* Trace events for fs/ocfs2/aops.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__get_block,
+	TP_PROTO(unsigned long long ino, unsigned long long iblock,
+		 void *bh_result, int create),
+	TP_ARGS(ino, iblock, bh_result, create),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, iblock)
+		__field(void *, bh_result)
+		__field(int, create)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->iblock = iblock;
+		__entry->bh_result = bh_result;
+		__entry->create = create;
+	),
+	TP_printk("%llu %llu %p %d",
+		  __entry->ino, __entry->iblock,
+		  __entry->bh_result, __entry->create)
+);
+
+#define DEFINE_OCFS2_GET_BLOCK_EVENT(name)	\
+DEFINE_EVENT(ocfs2__get_block, name,	\
+	TP_PROTO(unsigned long long ino, unsigned long long iblock,	\
+		 void *bh_result, int create),	\
+	TP_ARGS(ino, iblock, bh_result, create))
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
+
+TRACE_EVENT(ocfs2_try_to_write_inline_data,
+	TP_PROTO(unsigned long long ino, unsigned int len,
+		 unsigned long long pos, unsigned int flags),
+	TP_ARGS(ino, len, pos, flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, len)
+		__field(unsigned long long, pos)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->len = len;
+		__entry->pos = pos;
+		__entry->flags = flags;
+	),
+	TP_printk("%llu %u %llu 0x%x",
+		  __entry->ino, __entry->len, __entry->pos, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_write_begin_nolock,
+	TP_PROTO(unsigned long long ino,
+		 long long i_size, unsigned int i_clusters,
+		 unsigned long long pos, unsigned int len,
+		 unsigned int flags, void *page,
+		 unsigned int clusters, unsigned int extents_to_split),
+	TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
+		page, clusters, extents_to_split),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(long long, i_size)
+		__field(unsigned int, i_clusters)
+		__field(unsigned long long, pos)
+		__field(unsigned int, len)
+		__field(unsigned int, flags)
+		__field(void *, page)
+		__field(unsigned int, clusters)
+		__field(unsigned int, extents_to_split)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->i_size = i_size;
+		__entry->i_clusters = i_clusters;
+		__entry->pos = pos;
+		__entry->len = len;
+		__entry->flags = flags;
+		__entry->page = page;
+		__entry->clusters = clusters;
+		__entry->extents_to_split = extents_to_split;
+	),
+	TP_printk("%llu %lld %u %llu %u %u %p %u %u",
+		  __entry->ino, __entry->i_size, __entry->i_clusters,
+		  __entry->pos, __entry->len,
+		  __entry->flags, __entry->page, __entry->clusters,
+		  __entry->extents_to_split)
+);
+
+TRACE_EVENT(ocfs2_write_end_inline,
+	TP_PROTO(unsigned long long ino,
+		 unsigned long long pos, unsigned int copied,
+		 unsigned int id_count, unsigned int features),
+	TP_ARGS(ino, pos, copied, id_count, features),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, pos)
+		__field(unsigned int, copied)
+		__field(unsigned int, id_count)
+		__field(unsigned int, features)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->pos = pos;
+		__entry->copied = copied;
+		__entry->id_count = id_count;
+		__entry->features = features;
+	),
+	TP_printk("%llu %llu %u %u %u",
+		  __entry->ino, __entry->pos, __entry->copied,
+		  __entry->id_count, __entry->features)
+);
+
+/* End of trace events for fs/ocfs2/aops.c. */
+
+/* Trace events for fs/ocfs2/mmap.c. */
+
+TRACE_EVENT(ocfs2_fault,
+	TP_PROTO(unsigned long long ino,
+		 void *area, void *page, unsigned long pgoff),
+	TP_ARGS(ino, area, page, pgoff),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(void *, area)
+		__field(void *, page)
+		__field(unsigned long, pgoff)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->area = area;
+		__entry->page = page;
+		__entry->pgoff = pgoff;
+	),
+	TP_printk("%llu %p %p %lu",
+		  __entry->ino, __entry->area, __entry->page, __entry->pgoff)
+);
+
+/* End of trace events for fs/ocfs2/mmap.c. */
+
+/* Trace events for fs/ocfs2/file.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__file_ops,
+	TP_PROTO(void *inode, void *file, void *dentry,
+		 unsigned long long ino,
+		 unsigned int d_len, const unsigned char *d_name,
+		 unsigned long long para),
+	TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(void *, file)
+		__field(void *, dentry)
+		__field(unsigned long long, ino)
+		__field(unsigned int, d_len)
+		__string(d_name, d_name)
+		__field(unsigned long long, para)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->file = file;
+		__entry->dentry = dentry;
+		__entry->ino = ino;
+		__entry->d_len = d_len;
+		__assign_str(d_name, d_name);
+		__entry->para = para;
+	),
+	TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
+		  __entry->dentry, __entry->ino, __entry->para,
+		  __entry->d_len, __get_str(d_name))
+);
+
+#define DEFINE_OCFS2_FILE_OPS(name)				\
+DEFINE_EVENT(ocfs2__file_ops, name,				\
+TP_PROTO(void *inode, void *file, void *dentry,			\
+	 unsigned long long ino,				\
+	 unsigned int d_len, const unsigned char *d_name,	\
+	 unsigned long long mode),				\
+	TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
+
+TRACE_EVENT(ocfs2_extend_allocation,
+	TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
+		 unsigned int clusters, unsigned int clusters_to_add,
+		 int why, int restart_func),
+	TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ip_blkno)
+		__field(unsigned long long, size)
+		__field(unsigned int, clusters)
+		__field(unsigned int, clusters_to_add)
+		__field(int, why)
+		__field(int, restart_func)
+	),
+	TP_fast_assign(
+		__entry->ip_blkno = ip_blkno;
+		__entry->size = size;
+		__entry->clusters = clusters;
+		__entry->clusters_to_add = clusters_to_add;
+		__entry->why = why;
+		__entry->restart_func = restart_func;
+	),
+	TP_printk("%llu %llu %u %u %d %d",
+		  __entry->ip_blkno, __entry->size, __entry->clusters,
+		  __entry->clusters_to_add, __entry->why, __entry->restart_func)
+);
+
+TRACE_EVENT(ocfs2_extend_allocation_end,
+	TP_PROTO(unsigned long long ino,
+		 unsigned int di_clusters, unsigned long long di_size,
+		 unsigned int ip_clusters, unsigned long long i_size),
+	TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, di_clusters)
+		__field(unsigned long long, di_size)
+		__field(unsigned int, ip_clusters)
+		__field(unsigned long long, i_size)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->di_clusters = di_clusters;
+		__entry->di_size = di_size;
+		__entry->ip_clusters = ip_clusters;
+		__entry->i_size = i_size;
+	),
+	TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
+		  __entry->di_size, __entry->ip_clusters, __entry->i_size)
+);
+
+TRACE_EVENT(ocfs2_write_zero_page,
+	TP_PROTO(unsigned long long ino,
+		 unsigned long long abs_from, unsigned long long abs_to,
+		 unsigned long index, unsigned int zero_from,
+		 unsigned int zero_to),
+	TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, abs_from)
+		__field(unsigned long long, abs_to)
+		__field(unsigned long, index)
+		__field(unsigned int, zero_from)
+		__field(unsigned int, zero_to)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->abs_from = abs_from;
+		__entry->abs_to = abs_to;
+		__entry->index = index;
+		__entry->zero_from = zero_from;
+		__entry->zero_to = zero_to;
+	),
+	TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
+		  __entry->abs_from, __entry->abs_to,
+		  __entry->index, __entry->zero_from, __entry->zero_to)
+);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
+
+TRACE_EVENT(ocfs2_setattr,
+	TP_PROTO(void *inode, void *dentry,
+		 unsigned long long ino,
+		 unsigned int d_len, const unsigned char *d_name,
+		 unsigned int ia_valid, unsigned int ia_mode,
+		 unsigned int ia_uid, unsigned int ia_gid),
+	TP_ARGS(inode, dentry, ino, d_len, d_name,
+		ia_valid, ia_mode, ia_uid, ia_gid),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(void *, dentry)
+		__field(unsigned long long, ino)
+		__field(unsigned int, d_len)
+		__string(d_name, d_name)
+		__field(unsigned int, ia_valid)
+		__field(unsigned int, ia_mode)
+		__field(unsigned int, ia_uid)
+		__field(unsigned int, ia_gid)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->dentry = dentry;
+		__entry->ino = ino;
+		__entry->d_len = d_len;
+		__assign_str(d_name, d_name);
+		__entry->ia_valid = ia_valid;
+		__entry->ia_mode = ia_mode;
+		__entry->ia_uid = ia_uid;
+		__entry->ia_gid = ia_gid;
+	),
+	TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
+		  __entry->dentry, __entry->ino, __entry->d_len,
+		  __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+		  __entry->ia_uid, __entry->ia_gid)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
+
+TRACE_EVENT(ocfs2_prepare_inode_for_write,
+	TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
+		 int appending, unsigned long count,
+		 int *direct_io, int *has_refcount),
+	TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, saved_pos)
+		__field(int, appending)
+		__field(unsigned long, count)
+		__field(int, direct_io)
+		__field(int, has_refcount)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->saved_pos = saved_pos;
+		__entry->appending = appending;
+		__entry->count = count;
+		__entry->direct_io = direct_io ? *direct_io : -1;
+		__entry->has_refcount = has_refcount ? *has_refcount : -1;
+	),
+	TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+		  __entry->saved_pos, __entry->appending, __entry->count,
+		  __entry->direct_io, __entry->has_refcount)
+);
+
+DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+
+/* End of trace events for fs/ocfs2/file.c. */
+
+/* Trace events for fs/ocfs2/inode.c. */
+
+TRACE_EVENT(ocfs2_iget_begin,
+	TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
+	TP_ARGS(ino, flags, sysfile_type),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+		__field(int, sysfile_type)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->flags = flags;
+		__entry->sysfile_type = sysfile_type;
+	),
+	TP_printk("%llu %u %d", __entry->ino,
+		  __entry->flags, __entry->sysfile_type)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
+
+TRACE_EVENT(ocfs2_iget_end,
+	TP_PROTO(void *inode, unsigned long long ino),
+	TP_ARGS(inode, ino),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+	),
+	TP_printk("%p %llu", __entry->inode, __entry->ino)
+);
+
+TRACE_EVENT(ocfs2_find_actor,
+	TP_PROTO(void *inode, unsigned long long ino,
+		 void *args,  unsigned long long fi_blkno),
+	TP_ARGS(inode, ino, args, fi_blkno),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+		__field(void *, args)
+		__field(unsigned long long, fi_blkno)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+		__entry->args = args;
+		__entry->fi_blkno = fi_blkno;
+	),
+	TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
+		  __entry->args, __entry->fi_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+
+TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
+	TP_PROTO(void *task, void *dc_task, unsigned long long ino,
+		 unsigned int flags),
+	TP_ARGS(task, dc_task, ino, flags),
+	TP_STRUCT__entry(
+		__field(void *, task)
+		__field(void *, dc_task)
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->task = task;
+		__entry->dc_task = dc_task;
+		__entry->ino = ino;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
+		  __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
+
+TRACE_EVENT(ocfs2_inode_revalidate,
+	TP_PROTO(void *inode, unsigned long long ino,
+		 unsigned int flags),
+	TP_ARGS(inode, ino, flags),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
+
+/* End of trace events for fs/ocfs2/inode.c. */
+
+/* Trace events for fs/ocfs2/extent_map.c. */
+
+TRACE_EVENT(ocfs2_read_virt_blocks,
+	TP_PROTO(void *inode, unsigned long long vblock, int nr,
+		 void *bhs, unsigned int flags, void *validate),
+	TP_ARGS(inode, vblock, nr, bhs, flags, validate),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, vblock)
+		__field(int, nr)
+		__field(void *, bhs)
+		__field(unsigned int, flags)
+		__field(void *, validate)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->vblock = vblock;
+		__entry->nr = nr;
+		__entry->bhs = bhs;
+		__entry->flags = flags;
+		__entry->validate = validate;
+	),
+	TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
+		  __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
+);
+
+/* End of trace events for fs/ocfs2/extent_map.c. */
+
+/* Trace events for fs/ocfs2/slot_map.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
+
+/* End of trace events for fs/ocfs2/slot_map.c. */
+
+/* Trace events for fs/ocfs2/heartbeat.c. */
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
+
+/* End of trace events for fs/ocfs2/heartbeat.c. */
+
+/* Trace events for fs/ocfs2/super.c. */
+
+TRACE_EVENT(ocfs2_remount,
+	TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
+	TP_ARGS(s_flags, osb_flags, flags),
+	TP_STRUCT__entry(
+		__field(unsigned long, s_flags)
+		__field(unsigned long, osb_flags)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->s_flags = s_flags;
+		__entry->osb_flags = osb_flags;
+		__entry->flags = flags;
+	),
+	TP_printk("%lu %lu %d", __entry->s_flags,
+		  __entry->osb_flags, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_fill_super,
+	TP_PROTO(void *sb, void *data, int silent),
+	TP_ARGS(sb, data, silent),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, data)
+		__field(int, silent)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->data = data;
+		__entry->silent = silent;
+	),
+	TP_printk("%p %p %d", __entry->sb,
+		  __entry->data, __entry->silent)
+);
+
+TRACE_EVENT(ocfs2_parse_options,
+	TP_PROTO(int is_remount, char *options),
+	TP_ARGS(is_remount, options),
+	TP_STRUCT__entry(
+		__field(int, is_remount)
+		__string(options, options)
+	),
+	TP_fast_assign(
+		__entry->is_remount = is_remount;
+		__assign_str(options, options);
+	),
+	TP_printk("%d %s", __entry->is_remount, __get_str(options))
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
+
+TRACE_EVENT(ocfs2_statfs,
+	TP_PROTO(void *sb, void *buf),
+	TP_ARGS(sb, buf),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, buf)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->buf = buf;
+	),
+	TP_printk("%p %p", __entry->sb, __entry->buf)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
+
+TRACE_EVENT(ocfs2_initialize_super,
+	TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
+		 unsigned long long system_dir, int cluster_bits),
+	TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
+	TP_STRUCT__entry(
+		__string(label, label)
+		__string(uuid_str, uuid_str)
+		__field(unsigned long long, root_dir)
+		__field(unsigned long long, system_dir)
+		__field(int, cluster_bits)
+	),
+	TP_fast_assign(
+		__assign_str(label, label);
+		__assign_str(uuid_str, uuid_str);
+		__entry->root_dir = root_dir;
+		__entry->system_dir = system_dir;
+		__entry->cluster_bits = cluster_bits;
+	),
+	TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
+		  __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
+);
+
+/* End of trace events for fs/ocfs2/super.c. */
+
+/* Trace events for fs/ocfs2/xattr.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
+
+TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
+	TP_PROTO(const char *name, int meta, int clusters, int credits),
+	TP_ARGS(name, meta, clusters, credits),
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(int, meta)
+		__field(int, clusters)
+		__field(int, credits)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->meta = meta;
+		__entry->clusters = clusters;
+		__entry->credits = credits;
+	),
+	TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
+		  __entry->clusters, __entry->credits)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__xattr_find,
+	TP_PROTO(unsigned long long ino, const char *name, int name_index,
+		 unsigned int hash, unsigned long long location,
+		 int xe_index),
+	TP_ARGS(ino, name, name_index, hash, location, xe_index),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__string(name, name)
+		__field(int, name_index)
+		__field(unsigned int, hash)
+		__field(unsigned long long, location)
+		__field(int, xe_index)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__assign_str(name, name);
+		__entry->name_index = name_index;
+		__entry->hash = hash;
+		__entry->location = location;
+		__entry->xe_index = xe_index;
+	),
+	TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
+		  __entry->name_index, __entry->hash, __entry->location,
+		  __entry->xe_index)
+);
+
+#define DEFINE_OCFS2_XATTR_FIND_EVENT(name)					\
+DEFINE_EVENT(ocfs2__xattr_find, name,					\
+TP_PROTO(unsigned long long ino, const char *name, int name_index,	\
+	 unsigned int hash, unsigned long long bucket,			\
+	 int xe_index),							\
+	TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
+
+/* End of trace events for fs/ocfs2/xattr.c. */
+
+/* Trace events for fs/ocfs2/reservations.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
+
+TRACE_EVENT(ocfs2_resv_find_window_begin,
+	TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
+		 unsigned int wanted, int empty_root),
+	TP_ARGS(r_start, r_end, goal, wanted, empty_root),
+	TP_STRUCT__entry(
+		__field(unsigned int, r_start)
+		__field(unsigned int, r_end)
+		__field(unsigned int, goal)
+		__field(unsigned int, wanted)
+		__field(int, empty_root)
+	),
+	TP_fast_assign(
+		__entry->r_start = r_start;
+		__entry->r_end = r_end;
+		__entry->goal = goal;
+		__entry->wanted = wanted;
+		__entry->empty_root = empty_root;
+	),
+	TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
+		  __entry->goal, __entry->wanted, __entry->empty_root)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
+
+TRACE_EVENT(ocfs2_cannibalize_resv_end,
+	TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(start, end, len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, start)
+		__field(unsigned int, end)
+		__field(unsigned int, len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->len = len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+		  __entry->len, __entry->last_start, __entry->last_len)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
+	TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
+		 unsigned int r_start, unsigned int r_end, unsigned int r_len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(cstart, cend, clen, r_start, r_end,
+		r_len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, cstart)
+		__field(unsigned int, cend)
+		__field(unsigned int, clen)
+		__field(unsigned int, r_start)
+		__field(unsigned int, r_end)
+		__field(unsigned int, r_len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->cstart = cstart;
+		__entry->cend = cend;
+		__entry->clen = clen;
+		__entry->r_start = r_start;
+		__entry->r_end = r_end;
+		__entry->r_len = r_len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u %u %u %u",
+		  __entry->cstart, __entry->cend, __entry->clen,
+		  __entry->r_start, __entry->r_end, __entry->r_len,
+		  __entry->last_start, __entry->last_len)
+);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
+	TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(start, end, len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, start)
+		__field(unsigned int, end)
+		__field(unsigned int, len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->len = len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+		  __entry->len, __entry->last_start, __entry->last_len)
+);
+
+/* End of trace events for fs/ocfs2/reservations.c. */
+
+/* Trace events for fs/ocfs2/quota_local.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
+
+/* End of trace events for fs/ocfs2/quota_local.c. */
+
+/* Trace events for fs/ocfs2/quota_global.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
+
+TRACE_EVENT(ocfs2_sync_dquot,
+	TP_PROTO(unsigned int dq_id, long long dqb_curspace,
+		 long long spacechange, long long curinodes,
+		 long long inodechange),
+	TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
+	TP_STRUCT__entry(
+		__field(unsigned int, dq_id)
+		__field(long long, dqb_curspace)
+		__field(long long, spacechange)
+		__field(long long, curinodes)
+		__field(long long, inodechange)
+	),
+	TP_fast_assign(
+		__entry->dq_id = dq_id;
+		__entry->dqb_curspace = dqb_curspace;
+		__entry->spacechange = spacechange;
+		__entry->curinodes = curinodes;
+		__entry->inodechange = inodechange;
+	),
+	TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
+		  __entry->dqb_curspace, __entry->spacechange,
+		  __entry->curinodes, __entry->inodechange)
+);
+
+TRACE_EVENT(ocfs2_sync_dquot_helper,
+	TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
+		 const char *s_id),
+	TP_ARGS(dq_id, dq_type, type, s_id),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, dq_id)
+		__field(unsigned int, dq_type)
+		__field(unsigned long, type)
+		__string(s_id, s_id)
+	),
+	TP_fast_assign(
+		__entry->dq_id = dq_id;
+		__entry->dq_type = dq_type;
+		__entry->type = type;
+		__assign_str(s_id, s_id);
+	),
+	TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
+		  __entry->type, __get_str(s_id))
+);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
+
+/* End of trace events for fs/ocfs2/quota_global.c. */
+
+/* Trace events for fs/ocfs2/dir.c. */
+DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
+
+TRACE_EVENT(ocfs2_dx_dir_search,
+	TP_PROTO(unsigned long long ino, int namelen, const char *name,
+		 unsigned int major_hash, unsigned int minor_hash,
+		 unsigned long long blkno),
+	TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(int, namelen)
+		__string(name, name)
+		__field(unsigned int, major_hash)
+		__field(unsigned int,minor_hash)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->major_hash = major_hash;
+		__entry->minor_hash = minor_hash;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu %.*s %u %u %llu", __entry->ino,
+		   __entry->namelen, __get_str(name),
+		  __entry->major_hash, __entry->minor_hash, __entry->blkno)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
+
+TRACE_EVENT(ocfs2_find_files_on_disk,
+	TP_PROTO(int namelen, const char *name, void *blkno,
+		 unsigned long long dir),
+	TP_ARGS(namelen, name, blkno, dir),
+	TP_STRUCT__entry(
+		__field(int, namelen)
+		__string(name, name)
+		__field(void *, blkno)
+		__field(unsigned long long, dir)
+	),
+	TP_fast_assign(
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->blkno = blkno;
+		__entry->dir = dir;
+	),
+	TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
+		  __entry->blkno, __entry->dir)
+);
+
+TRACE_EVENT(ocfs2_check_dir_for_entry,
+	TP_PROTO(unsigned long long dir, int namelen, const char *name),
+	TP_ARGS(dir, namelen, name),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__field(int, namelen)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+	),
+	TP_printk("%llu %.*s", __entry->dir,
+		  __entry->namelen, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
+
+TRACE_EVENT(ocfs2_dx_dir_index_root_block,
+	TP_PROTO(unsigned long long dir,
+		 unsigned int major_hash, unsigned int minor_hash,
+		 int namelen, const char *name, unsigned int num_used),
+	TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__field(unsigned int, major_hash)
+		__field(unsigned int, minor_hash)
+		__field(int, namelen)
+		__string(name, name)
+		__field(unsigned int, num_used)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->major_hash = major_hash;
+		__entry->minor_hash = minor_hash;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->num_used = num_used;
+	),
+	TP_printk("%llu %x %x %.*s %u", __entry->dir,
+		  __entry->major_hash, __entry->minor_hash,
+		   __entry->namelen, __get_str(name), __entry->num_used)
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
+
+/* End of trace events for fs/ocfs2/dir.c. */
+
+/* Trace events for fs/ocfs2/namei.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
+	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+		 unsigned long long dir_blkno, unsigned long long extra),
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(unsigned long long, dir_blkno)
+		__field(unsigned long long, extra)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->dir_blkno = dir_blkno;
+		__entry->extra = extra;
+	),
+	TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
+		  __entry->name_len, __get_str(name),
+		  __entry->dir_blkno, __entry->extra)
+);
+
+#define DEFINE_OCFS2_DENTRY_OPS(name)					\
+DEFINE_EVENT(ocfs2__dentry_ops, name,					\
+TP_PROTO(void *dir, void *dentry, int name_len, const char *name,	\
+	 unsigned long long dir_blkno, unsigned long long extra),	\
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
+
+TRACE_EVENT(ocfs2_mknod,
+	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+		 unsigned long long dir_blkno, unsigned long dev, int mode),
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(unsigned long long, dir_blkno)
+		__field(unsigned long, dev)
+		__field(int, mode)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->dir_blkno = dir_blkno;
+		__entry->dev = dev;
+		__entry->mode = mode;
+	),
+	TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
+		  __entry->name_len, __get_str(name),
+		  __entry->dir_blkno, __entry->dev, __entry->mode)
+);
+
+TRACE_EVENT(ocfs2_link,
+	TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
+		 int name_len, const char *name),
+	TP_ARGS(ino, old_len, old_name, name_len, name),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(int, old_len)
+		__string(old_name, old_name)
+		__field(int, name_len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->old_len = old_len;
+		__assign_str(old_name, old_name);
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+	),
+	TP_printk("%llu %.*s %.*s", __entry->ino,
+		  __entry->old_len, __get_str(old_name),
+		  __entry->name_len, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
+
+TRACE_EVENT(ocfs2_rename,
+	TP_PROTO(void *old_dir, void *old_dentry,
+		 void *new_dir, void *new_dentry,
+		 int old_len, const char *old_name,
+		 int new_len, const char *new_name),
+	TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
+		old_len, old_name, new_len, new_name),
+	TP_STRUCT__entry(
+		__field(void *, old_dir)
+		__field(void *, old_dentry)
+		__field(void *, new_dir)
+		__field(void *, new_dentry)
+		__field(int, old_len)
+		__string(old_name, old_name)
+		__field(int, new_len)
+		__string(new_name, new_name)
+	),
+	TP_fast_assign(
+		__entry->old_dir = old_dir;
+		__entry->old_dentry = old_dentry;
+		__entry->new_dir = new_dir;
+		__entry->new_dentry = new_dentry;
+		__entry->old_len = old_len;
+		__assign_str(old_name, old_name);
+		__entry->new_len = new_len;
+		__assign_str(new_name, new_name);
+	),
+	TP_printk("%p %p %p %p %.*s %.*s",
+		  __entry->old_dir, __entry->old_dentry,
+		  __entry->new_dir, __entry->new_dentry,
+		  __entry->old_len, __get_str(old_name),
+		  __entry->new_len, __get_str(new_name))
+);
+
+TRACE_EVENT(ocfs2_rename_target_exists,
+	TP_PROTO(int new_len, const char *new_name),
+	TP_ARGS(new_len, new_name),
+	TP_STRUCT__entry(
+		__field(int, new_len)
+		__string(new_name, new_name)
+	),
+	TP_fast_assign(
+		__entry->new_len = new_len;
+		__assign_str(new_name, new_name);
+	),
+	TP_printk("%.*s", __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
+
+TRACE_EVENT(ocfs2_rename_over_existing,
+	TP_PROTO(unsigned long long new_blkno, void *new_bh,
+		 unsigned long long newdi_blkno),
+	TP_ARGS(new_blkno, new_bh, newdi_blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, new_blkno)
+		__field(void *, new_bh)
+		__field(unsigned long long, newdi_blkno)
+	),
+	TP_fast_assign(
+		__entry->new_blkno = new_blkno;
+		__entry->new_bh = new_bh;
+		__entry->newdi_blkno = newdi_blkno;
+	),
+	TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
+		  __entry->newdi_blkno)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
+
+TRACE_EVENT(ocfs2_symlink_begin,
+	TP_PROTO(void *dir, void *dentry, const char *symname,
+		 int len, const char *name),
+	TP_ARGS(dir, dentry, symname, len, name),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(const char *, symname)
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->symname = symname;
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
+		  __entry->symname, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_blkno_stringify,
+	TP_PROTO(unsigned long long blkno, const char *name, int namelen),
+	TP_ARGS(blkno, name, namelen),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__string(name, name)
+		__field(int, namelen)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__assign_str(name, name);
+		__entry->namelen = namelen;
+	),
+	TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
+		  __entry->namelen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
+
+TRACE_EVENT(ocfs2_orphan_del,
+	TP_PROTO(unsigned long long dir, const char *name, int namelen),
+	TP_ARGS(dir, name, namelen),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__string(name, name)
+		__field(int, namelen)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__assign_str(name, name);
+		__entry->namelen = namelen;
+	),
+	TP_printk("%llu %s %d", __entry->dir, __get_str(name),
+		  __entry->namelen)
+);
+
+/* End of trace events for fs/ocfs2/namei.c. */
+
+/* Trace events for fs/ocfs2/dcache.c. */
+
+TRACE_EVENT(ocfs2_dentry_revalidate,
+	TP_PROTO(void *dentry, int len, const char *name),
+	TP_ARGS(dentry, len, name),
+	TP_STRUCT__entry(
+		__field(void *, dentry)
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dentry = dentry;
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_revalidate_negative,
+	TP_PROTO(int len, const char *name, unsigned long pgen,
+		 unsigned long gen),
+	TP_ARGS(len, name, pgen, gen),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long, pgen)
+		__field(unsigned long, gen)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->pgen = pgen;
+		__entry->gen = gen;
+	),
+	TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
+		  __entry->pgen, __entry->gen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
+
+TRACE_EVENT(ocfs2_find_local_alias,
+	TP_PROTO(int len, const char *name),
+	TP_ARGS(len, name),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%.*s", __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock,
+	TP_PROTO(int len, const char *name,
+		 unsigned long long parent, void *fsdata),
+	TP_ARGS(len, name, parent, fsdata),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long long, parent)
+		__field(void *, fsdata)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->parent = parent;
+		__entry->fsdata = fsdata;
+	),
+	TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
+		  __entry->parent, __entry->fsdata)
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock_found,
+	TP_PROTO(const char *name, unsigned long long parent,
+		 unsigned long long ino),
+	TP_ARGS(name, parent, ino),
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(unsigned long long, parent)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->parent = parent;
+		__entry->ino = ino;
+	),
+	TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
+);
+/* End of trace events for fs/ocfs2/dcache.c. */
+
+/* Trace events for fs/ocfs2/export.c. */
+
+TRACE_EVENT(ocfs2_get_dentry_begin,
+	TP_PROTO(void *sb, void *handle, unsigned long long blkno),
+	TP_ARGS(sb, handle, blkno),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, handle)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->handle = handle;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
+
+TRACE_EVENT(ocfs2_get_parent,
+	TP_PROTO(void *child, int len, const char *name,
+		 unsigned long long ino),
+	TP_ARGS(child, len, name, ino),
+	TP_STRUCT__entry(
+		__field(void *,	child)
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__entry->child = child;
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->ino = ino;
+	),
+	TP_printk("%p %.*s %llu", __entry->child, __entry->len,
+		  __get_str(name), __entry->ino)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
+
+TRACE_EVENT(ocfs2_encode_fh_begin,
+	TP_PROTO(void *dentry, int name_len, const char *name,
+		 void *fh, int len, int connectable),
+	TP_ARGS(dentry, name_len, name, fh, len, connectable),
+	TP_STRUCT__entry(
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(void *, fh)
+		__field(int, len)
+		__field(int, connectable)
+	),
+	TP_fast_assign(
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->fh = fh;
+		__entry->len = len;
+		__entry->connectable = connectable;
+	),
+	TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
+		  __get_str(name), __entry->fh, __entry->len,
+		  __entry->connectable)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
+
+/* End of trace events for fs/ocfs2/export.c. */
+
+/* Trace events for fs/ocfs2/journal.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
+
+TRACE_EVENT(ocfs2_complete_recovery_slot,
+	TP_PROTO(int slot, unsigned long long la_ino,
+		 unsigned long long tl_ino, void *qrec),
+	TP_ARGS(slot, la_ino, tl_ino, qrec),
+	TP_STRUCT__entry(
+		__field(int, slot)
+		__field(unsigned long long, la_ino)
+		__field(unsigned long long, tl_ino)
+		__field(void *, qrec)
+	),
+	TP_fast_assign(
+		__entry->slot = slot;
+		__entry->la_ino = la_ino;
+		__entry->tl_ino = tl_ino;
+		__entry->qrec = qrec;
+	),
+	TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
+		  __entry->tl_ino, __entry->qrec)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
+
+TRACE_EVENT(ocfs2_recovery_thread,
+	TP_PROTO(int node_num, int osb_node_num, int disable,
+		 void *recovery_thread, int map_set),
+	TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
+	TP_STRUCT__entry(
+		__field(int, node_num)
+		__field(int, osb_node_num)
+		__field(int,disable)
+		__field(void *, recovery_thread)
+		__field(int,map_set)
+	),
+	TP_fast_assign(
+		__entry->node_num = node_num;
+		__entry->osb_node_num = osb_node_num;
+		__entry->disable = disable;
+		__entry->recovery_thread = recovery_thread;
+		__entry->map_set = map_set;
+	),
+	TP_printk("%d %d %d %p %d", __entry->node_num,
+		   __entry->osb_node_num, __entry->disable,
+		   __entry->recovery_thread, __entry->map_set)
+);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
+
+/* End of trace events for fs/ocfs2/journal.c. */
+
+/* Trace events for fs/ocfs2/buffer_head_io.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
+
+TRACE_EVENT(ocfs2_write_block,
+	TP_PROTO(unsigned long long block, void *ci),
+	TP_ARGS(block, ci),
+	TP_STRUCT__entry(
+		__field(unsigned long long, block)
+		__field(void *, ci)
+	),
+	TP_fast_assign(
+		__entry->block = block;
+		__entry->ci = ci;
+	),
+	TP_printk("%llu %p", __entry->block, __entry->ci)
+);
+
+TRACE_EVENT(ocfs2_read_blocks_begin,
+	TP_PROTO(void *ci, unsigned long long block,
+		 unsigned int nr, int flags),
+	TP_ARGS(ci, block, nr, flags),
+	TP_STRUCT__entry(
+		__field(void *, ci)
+		__field(unsigned long long, block)
+		__field(unsigned int, nr)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->ci = ci;
+		__entry->block = block;
+		__entry->nr = nr;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
+		  __entry->nr, __entry->flags)
+);
+
+/* End of trace events for fs/ocfs2/buffer_head_io.c. */
+
+/* Trace events for fs/ocfs2/uptodate.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
+
+TRACE_EVENT(ocfs2_buffer_cached_end,
+	TP_PROTO(int index, void *item),
+	TP_ARGS(index, item),
+	TP_STRUCT__entry(
+		__field(int, index)
+		__field(void *, item)
+	),
+	TP_fast_assign(
+		__entry->index = index;
+		__entry->item = item;
+	),
+	TP_printk("%d %p", __entry->index, __entry->item)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
+
+/* End of trace events for fs/ocfs2/uptodate.c. */
+#endif /* _TRACE_OCFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ocfs2_trace
+#include <trace/define_trace.h>
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index a73f641..279aef6 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -11,7 +11,6 @@
 #include <linux/writeback.h>
 #include <linux/workqueue.h>
 
-#define MLOG_MASK_PREFIX ML_QUOTA
 #include <cluster/masklog.h>
 
 #include "ocfs2_fs.h"
@@ -27,6 +26,7 @@
 #include "super.h"
 #include "buffer_head_io.h"
 #include "quota.h"
+#include "ocfs2_trace.h"
 
 /*
  * Locking of quotas with OCFS2 is rather complex. Here are rules that
@@ -130,8 +130,7 @@
 	struct ocfs2_disk_dqtrailer *dqt =
 		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
 
-	mlog(0, "Validating quota block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -341,8 +340,6 @@
 	u64 pcount;
 	int status;
 
-	mlog_entry_void();
-
 	/* Read global header */
 	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
 			OCFS2_INVALID_SLOT);
@@ -402,7 +399,8 @@
 			      msecs_to_jiffies(oinfo->dqi_syncms));
 
 out_err:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 out_unlock:
 	ocfs2_unlock_global_qf(oinfo, 0);
@@ -508,9 +506,10 @@
 	olditime = dquot->dq_dqb.dqb_itime;
 	oldbtime = dquot->dq_dqb.dqb_btime;
 	ocfs2_global_disk2memdqb(dquot, &dqblk);
-	mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
-	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
-	     dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+	trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace,
+			       (long long)spacechange,
+			       dquot->dq_dqb.dqb_curinodes,
+			       (long long)inodechange);
 	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
 		dquot->dq_dqb.dqb_curspace += spacechange;
 	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
@@ -594,8 +593,8 @@
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	int status = 0;
 
-	mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
-		   dquot->dq_type, type, sb->s_id);
+	trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type,
+				      type, sb->s_id);
 	if (type != dquot->dq_type)
 		goto out;
 	status = ocfs2_lock_global_qf(oinfo, 1);
@@ -621,7 +620,6 @@
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -647,7 +645,7 @@
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type);
 
 	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -660,7 +658,6 @@
 	mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
 	ocfs2_commit_trans(osb, handle);
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -686,7 +683,7 @@
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type);
 
 	mutex_lock(&dquot->dq_lock);
 	/* Check whether we are not racing with some other dqget() */
@@ -722,7 +719,8 @@
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
 	mutex_unlock(&dquot->dq_lock);
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -743,7 +741,7 @@
 	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
 	handle_t *handle;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	trace_ocfs2_acquire_dquot(dquot->dq_id, type);
 	mutex_lock(&dquot->dq_lock);
 	/*
 	 * We need an exclusive lock, because we're going to update use count
@@ -809,7 +807,8 @@
 	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out:
 	mutex_unlock(&dquot->dq_lock);
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -829,7 +828,7 @@
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type);
 
 	/* In case user set some limits, sync dquot immediately to global
 	 * quota file so that information propagates quicker */
@@ -866,7 +865,8 @@
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -877,8 +877,6 @@
 	int status = 0;
 	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
 
-	mlog_entry_void();
-
 	status = ocfs2_lock_global_qf(oinfo, 1);
 	if (status < 0)
 		goto out;
@@ -893,7 +891,8 @@
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc78764..dc8007f 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -8,7 +8,6 @@
 #include <linux/quotaops.h>
 #include <linux/module.h>
 
-#define MLOG_MASK_PREFIX ML_QUOTA
 #include <cluster/masklog.h>
 
 #include "ocfs2_fs.h"
@@ -23,6 +22,7 @@
 #include "quota.h"
 #include "uptodate.h"
 #include "super.h"
+#include "ocfs2_trace.h"
 
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -475,7 +475,7 @@
 	struct ocfs2_recovery_chunk *rchunk, *next;
 	qsize_t spacechange, inodechange;
 
-	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+	trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
 
 	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
 		chunk = rchunk->rc_chunk;
@@ -575,7 +575,8 @@
 	}
 	if (status < 0)
 		free_recovery_list(&(rec->r_list[type]));
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -600,7 +601,7 @@
 	for (type = 0; type < MAXQUOTAS; type++) {
 		if (list_empty(&(rec->r_list[type])))
 			continue;
-		mlog(0, "Recovering quota in slot %d\n", slot_num);
+		trace_ocfs2_finish_quota_recovery(slot_num);
 		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
 		if (!lqinode) {
 			status = -ENOENT;
@@ -882,9 +883,10 @@
 	dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
 					  od->dq_originodes);
 	spin_unlock(&dq_data_lock);
-	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
-	     od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
-	     (long long)le64_to_cpu(dqblk->dqb_inodemod));
+	trace_olq_set_dquot(
+		(unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
+		(unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
+		od->dq_dquot.dq_id);
 }
 
 /* Write dquot to local quota file */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c384d63..5d32749 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -16,7 +16,6 @@
  */
 
 #include <linux/sort.h>
-#define MLOG_MASK_PREFIX ML_REFCOUNT
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "inode.h"
@@ -34,6 +33,7 @@
 #include "aops.h"
 #include "xattr.h"
 #include "namei.h"
+#include "ocfs2_trace.h"
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
@@ -84,8 +84,7 @@
 	struct ocfs2_refcount_block *rb =
 		(struct ocfs2_refcount_block *)bh->b_data;
 
-	mlog(0, "Validating refcount block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -545,8 +544,8 @@
 	while ((node = rb_last(root)) != NULL) {
 		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
 
-		mlog(0, "Purge tree %llu\n",
-		     (unsigned long long) tree->rf_blkno);
+		trace_ocfs2_purge_refcount_trees(
+				(unsigned long long) tree->rf_blkno);
 
 		rb_erase(&tree->rf_node, root);
 		ocfs2_free_refcount_tree(tree);
@@ -575,7 +574,8 @@
 
 	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 
-	mlog(0, "create tree for inode %lu\n", inode->i_ino);
+	trace_ocfs2_create_refcount_tree(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
 	if (ret) {
@@ -646,8 +646,7 @@
 	di->i_refcount_loc = cpu_to_le64(first_blkno);
 	spin_unlock(&oi->ip_lock);
 
-	mlog(0, "created tree for inode %lu, refblock %llu\n",
-	     inode->i_ino, (unsigned long long)first_blkno);
+	trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
@@ -1256,8 +1255,9 @@
 		goto out;
 	}
 
-	mlog(0, "change index %d, old count %u, change %d\n", index,
-	     le32_to_cpu(rec->r_refcount), change);
+	trace_ocfs2_change_refcount_rec(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		index, le32_to_cpu(rec->r_refcount), change);
 	le32_add_cpu(&rec->r_refcount, change);
 
 	if (!rec->r_refcount) {
@@ -1353,8 +1353,8 @@
 
 	ocfs2_journal_dirty(handle, ref_root_bh);
 
-	mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
-	     le16_to_cpu(new_rb->rf_records.rl_used));
+	trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
+		le16_to_cpu(new_rb->rf_records.rl_used));
 
 	*ref_leaf_bh = new_bh;
 	new_bh = NULL;
@@ -1466,9 +1466,9 @@
 			(struct ocfs2_refcount_block *)new_bh->b_data;
 	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
 
-	mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
-	     (unsigned long long)ref_leaf_bh->b_blocknr,
-	     le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
+	trace_ocfs2_divide_leaf_refcount_block(
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
 
 	/*
 	 * XXX: Improvement later.
@@ -1601,8 +1601,8 @@
 
 	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
 
-	mlog(0, "insert new leaf block %llu at %u\n",
-	     (unsigned long long)new_bh->b_blocknr, new_cpos);
+	trace_ocfs2_new_leaf_refcount_block(
+			(unsigned long long)new_bh->b_blocknr, new_cpos);
 
 	/* Insert the new leaf block with the specific offset cpos. */
 	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
@@ -1794,11 +1794,10 @@
 			(le16_to_cpu(rf_list->rl_used) - index) *
 			 sizeof(struct ocfs2_refcount_rec));
 
-	mlog(0, "insert refcount record start %llu, len %u, count %u "
-	     "to leaf block %llu at index %d\n",
-	     (unsigned long long)le64_to_cpu(rec->r_cpos),
-	     le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
-	     (unsigned long long)ref_leaf_bh->b_blocknr, index);
+	trace_ocfs2_insert_refcount_rec(
+		(unsigned long long)ref_leaf_bh->b_blocknr, index,
+		(unsigned long long)le64_to_cpu(rec->r_cpos),
+		le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
 
 	rf_list->rl_recs[index] = *rec;
 
@@ -1850,10 +1849,12 @@
 
 	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
 
-	mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
-	     le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
-	     le64_to_cpu(split_rec->r_cpos),
-	     le32_to_cpu(split_rec->r_clusters));
+	trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
+		le32_to_cpu(orig_rec->r_clusters),
+		le32_to_cpu(orig_rec->r_refcount),
+		le64_to_cpu(split_rec->r_cpos),
+		le32_to_cpu(split_rec->r_clusters),
+		le32_to_cpu(split_rec->r_refcount));
 
 	/*
 	 * If we just need to split the header or tail clusters,
@@ -1967,12 +1968,11 @@
 
 	if (split_rec->r_refcount) {
 		rf_list->rl_recs[index] = *split_rec;
-		mlog(0, "insert refcount record start %llu, len %u, count %u "
-		     "to leaf block %llu at index %d\n",
-		     (unsigned long long)le64_to_cpu(split_rec->r_cpos),
-		     le32_to_cpu(split_rec->r_clusters),
-		     le32_to_cpu(split_rec->r_refcount),
-		     (unsigned long long)ref_leaf_bh->b_blocknr, index);
+		trace_ocfs2_split_refcount_rec_insert(
+			(unsigned long long)ref_leaf_bh->b_blocknr, index,
+			(unsigned long long)le64_to_cpu(split_rec->r_cpos),
+			le32_to_cpu(split_rec->r_clusters),
+			le32_to_cpu(split_rec->r_refcount));
 
 		if (merge)
 			ocfs2_refcount_rec_merge(rb, index);
@@ -1997,7 +1997,7 @@
 	struct ocfs2_refcount_rec rec;
 	unsigned int set_len = 0;
 
-	mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
+	trace_ocfs2_increase_refcount_begin(
 	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
 	     (unsigned long long)cpos, len);
 
@@ -2024,9 +2024,9 @@
 		 */
 		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
 		    set_len <= len) {
-			mlog(0, "increase refcount rec, start %llu, len %u, "
-			     "count %u\n", (unsigned long long)cpos, set_len,
-			     le32_to_cpu(rec.r_refcount));
+			trace_ocfs2_increase_refcount_change(
+				(unsigned long long)cpos, set_len,
+				le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_change_refcount_rec(handle, ci,
 							ref_leaf_bh, index,
 							merge, 1);
@@ -2037,7 +2037,7 @@
 		} else if (!rec.r_refcount) {
 			rec.r_refcount = cpu_to_le32(1);
 
-			mlog(0, "insert refcount rec, start %llu, len %u\n",
+			trace_ocfs2_increase_refcount_insert(
 			     (unsigned long long)le64_to_cpu(rec.r_cpos),
 			     set_len);
 			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
@@ -2055,8 +2055,7 @@
 			rec.r_clusters = cpu_to_le32(set_len);
 			le32_add_cpu(&rec.r_refcount, 1);
 
-			mlog(0, "split refcount rec, start %llu, "
-			     "len %u, count %u\n",
+			trace_ocfs2_increase_refcount_split(
 			     (unsigned long long)le64_to_cpu(rec.r_cpos),
 			     set_len, le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_split_refcount_rec(handle, ci,
@@ -2095,6 +2094,11 @@
 
 	BUG_ON(rb->rf_records.rl_used);
 
+	trace_ocfs2_remove_refcount_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le32_to_cpu(rb->rf_cpos));
+
 	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
 	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
 				  1, meta_ac, dealloc);
@@ -2137,7 +2141,7 @@
 	if (!rb->rf_list.l_next_free_rec) {
 		BUG_ON(rb->rf_clusters);
 
-		mlog(0, "reset refcount tree root %llu to be a record block.\n",
+		trace_ocfs2_restore_refcount_block(
 		     (unsigned long long)ref_root_bh->b_blocknr);
 
 		rb->rf_flags = 0;
@@ -2184,6 +2188,10 @@
 	BUG_ON(cpos + len >
 	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
 
+	trace_ocfs2_decrease_refcount_rec(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)cpos, len);
+
 	if (cpos == le64_to_cpu(rec->r_cpos) &&
 	    len == le32_to_cpu(rec->r_clusters))
 		ret = ocfs2_change_refcount_rec(handle, ci,
@@ -2195,12 +2203,6 @@
 
 		le32_add_cpu(&split.r_refcount, -1);
 
-		mlog(0, "split refcount rec, start %llu, "
-		     "len %u, count %u, original start %llu, len %u\n",
-		     (unsigned long long)le64_to_cpu(split.r_cpos),
-		     len, le32_to_cpu(split.r_refcount),
-		     (unsigned long long)le64_to_cpu(rec->r_cpos),
-		     le32_to_cpu(rec->r_clusters));
 		ret = ocfs2_split_refcount_rec(handle, ci,
 					       ref_root_bh, ref_leaf_bh,
 					       &split, index, 1,
@@ -2239,10 +2241,9 @@
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct buffer_head *ref_leaf_bh = NULL;
 
-	mlog(0, "Tree owner %llu, decrease refcount start %llu, "
-	     "len %u, delete %u\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)cpos, len, delete);
+	trace_ocfs2_decrease_refcount(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)cpos, len, delete);
 
 	while (len) {
 		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -2352,8 +2353,8 @@
 {
 	int ret;
 
-	mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
-	     inode->i_ino, cpos, len, phys);
+	trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
+					   cpos, len, phys);
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -2392,8 +2393,6 @@
 	struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
 	u32 len;
 
-	mlog(0, "start_cpos %llu, clusters %u\n",
-	     (unsigned long long)start_cpos, clusters);
 	while (clusters) {
 		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
 					     cpos, clusters, &rec,
@@ -2427,12 +2426,11 @@
 
 		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
 
-		mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
-		     "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
-		     recs_add, (unsigned long long)cpos, clusters,
-		     (unsigned long long)le64_to_cpu(rec.r_cpos),
-		     le32_to_cpu(rec.r_clusters),
-		     le32_to_cpu(rec.r_refcount), index);
+		trace_ocfs2_calc_refcount_meta_credits_iterate(
+				recs_add, (unsigned long long)cpos, clusters,
+				(unsigned long long)le64_to_cpu(rec.r_cpos),
+				le32_to_cpu(rec.r_clusters),
+				le32_to_cpu(rec.r_refcount), index);
 
 		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
 			  le32_to_cpu(rec.r_clusters)) - cpos;
@@ -2488,7 +2486,6 @@
 	if (!ref_blocks)
 		goto out;
 
-	mlog(0, "we need ref_blocks %d\n", ref_blocks);
 	*meta_add += ref_blocks;
 	*credits += ref_blocks;
 
@@ -2514,6 +2511,10 @@
 	}
 
 out:
+
+	trace_ocfs2_calc_refcount_meta_credits(
+		(unsigned long long)start_cpos, clusters,
+		*meta_add, *credits);
 	brelse(ref_leaf_bh);
 	brelse(prev_bh);
 	return ret;
@@ -2578,8 +2579,7 @@
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d blocks, credits = %d\n",
-	     *ref_blocks, *credits);
+	trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
 
 out:
 	brelse(ref_root_bh);
@@ -2886,8 +2886,7 @@
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
-	     meta_add, num_clusters, *credits);
+	trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
 	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
 						meta_ac);
 	if (ret) {
@@ -2937,8 +2936,8 @@
 	loff_t offset, end, map_end;
 	struct address_space *mapping = context->inode->i_mapping;
 
-	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
-	     new_cluster, new_len, cpos);
+	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+					       new_cluster, new_len);
 
 	readahead_pages =
 		(ocfs2_cow_contig_clusters(sb) <<
@@ -3031,8 +3030,8 @@
 	struct buffer_head *old_bh = NULL;
 	struct buffer_head *new_bh = NULL;
 
-	mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
-	     new_cluster, new_len);
+	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+					       new_cluster, new_len);
 
 	for (i = 0; i < blocks; i++, old_block++, new_block++) {
 		new_bh = sb_getblk(osb->sb, new_block);
@@ -3085,8 +3084,8 @@
 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
 
-	mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
-	     (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+	trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
+				       cpos, len, p_cluster, ext_flags);
 
 	memset(&replace_rec, 0, sizeof(replace_rec));
 	replace_rec.e_cpos = cpu_to_le32(cpos);
@@ -3141,8 +3140,8 @@
 	struct ocfs2_caching_info *ci = context->data_et.et_ci;
 	u64 ino = ocfs2_metadata_cache_owner(ci);
 
-	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
-	     (unsigned long long)ino, cpos, old, new, len, ext_flags);
+	trace_ocfs2_replace_clusters((unsigned long long)ino,
+				     cpos, old, new, len, ext_flags);
 
 	/*If the old clusters is unwritten, no need to duplicate. */
 	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
@@ -3236,8 +3235,8 @@
 	struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
 	struct ocfs2_refcount_rec rec;
 
-	mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
-	     cpos, p_cluster, num_clusters, e_flags);
+	trace_ocfs2_make_clusters_writable(cpos, p_cluster,
+					   num_clusters, e_flags);
 
 	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
 					     &context->data_et,
@@ -3475,9 +3474,9 @@
 		goto out;
 	}
 
-	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
-	     "cow_len %u\n", inode->i_ino,
-	     cpos, write_len, cow_start, cow_len);
+	trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
+				      cpos, write_len, max_cpos,
+				      cow_start, cow_len);
 
 	BUG_ON(cow_len == 0);
 
@@ -3756,8 +3755,7 @@
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d, credits = %d\n",
-	     ref_blocks, credits);
+	trace_ocfs2_add_refcount_flag(ref_blocks, credits);
 
 	if (ref_blocks) {
 		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 3e78db3..41ffd36 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -30,10 +30,10 @@
 #include <linux/bitops.h>
 #include <linux/list.h>
 
-#define MLOG_MASK_PREFIX ML_RESERVATIONS
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
+#include "ocfs2_trace.h"
 
 #ifdef CONFIG_OCFS2_DEBUG_FS
 #define OCFS2_CHECK_RESERVATIONS
@@ -321,8 +321,7 @@
 
 	assert_spin_locked(&resv_lock);
 
-	mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
-	     new->r_len);
+	trace_ocfs2_resv_insert(new->r_start, new->r_len);
 
 	while (*p) {
 		parent = *p;
@@ -423,8 +422,8 @@
 	unsigned int best_start, best_len = 0;
 	int offset, start, found;
 
-	mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
-	     wanted, search_start, search_len, resmap->m_bitmap_len);
+	trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
+						wanted, resmap->m_bitmap_len);
 
 	found = best_start = best_len = 0;
 
@@ -463,7 +462,7 @@
 	*rlen = best_len;
 	*rstart = best_start;
 
-	mlog(0, "Found start: %u len: %u\n", best_start, best_len);
+	trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
 
 	return *rlen;
 }
@@ -487,9 +486,8 @@
 	 * - our window should be last in all reservations
 	 * - need to make sure we don't go past end of bitmap
 	 */
-
-	mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
-	     resv->r_start, ocfs2_resv_end(resv), goal, wanted);
+	trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
+					   goal, wanted, RB_EMPTY_ROOT(root));
 
 	assert_spin_locked(&resv_lock);
 
@@ -498,9 +496,6 @@
 		 * Easiest case - empty tree. We can just take
 		 * whatever window of free bits we want.
 		 */
-
-		mlog(0, "Empty root\n");
-
 		clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
 						   resmap->m_bitmap_len - goal,
 						   &cstart, &clen);
@@ -524,8 +519,6 @@
 	prev_resv = ocfs2_find_resv_lhs(resmap, goal);
 
 	if (prev_resv == NULL) {
-		mlog(0, "Goal on LHS of leftmost window\n");
-
 		/*
 		 * A NULL here means that the search code couldn't
 		 * find a window that starts before goal.
@@ -570,13 +563,15 @@
 		next_resv = NULL;
 	}
 
+	trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
+					  ocfs2_resv_end(prev_resv));
+
 	prev = &prev_resv->r_node;
 
 	/* Now we do a linear search for a window, starting at 'prev_rsv' */
 	while (1) {
 		next = rb_next(prev);
 		if (next) {
-			mlog(0, "One more resv found in linear search\n");
 			next_resv = rb_entry(next,
 					     struct ocfs2_alloc_reservation,
 					     r_node);
@@ -585,7 +580,6 @@
 			gap_end = next_resv->r_start - 1;
 			gap_len = gap_end - gap_start + 1;
 		} else {
-			mlog(0, "No next node\n");
 			/*
 			 * We're at the rightmost edge of the
 			 * tree. See if a reservation between this
@@ -596,6 +590,8 @@
 			gap_end = resmap->m_bitmap_len - 1;
 		}
 
+		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
+					next ? ocfs2_resv_end(next_resv) : -1);
 		/*
 		 * No need to check this gap if we have already found
 		 * a larger region of free bits.
@@ -654,8 +650,9 @@
 	lru_resv = list_first_entry(&resmap->m_lru,
 				    struct ocfs2_alloc_reservation, r_lru);
 
-	mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
-	     lru_resv->r_len, ocfs2_resv_end(lru_resv));
+	trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
+					   lru_resv->r_len,
+					   ocfs2_resv_end(lru_resv));
 
 	/*
 	 * Cannibalize (some or all) of the target reservation and
@@ -684,10 +681,9 @@
 		resv->r_len = shrink;
 	}
 
-	mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
-	     "r_len: %u r_last_start: %u r_last_len: %u\n",
-	     resv->r_start, ocfs2_resv_end(resv), resv->r_len,
-	     resv->r_last_start, resv->r_last_len);
+	trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
+					 resv->r_len, resv->r_last_start,
+					 resv->r_last_len);
 
 	ocfs2_resv_insert(resmap, resv);
 }
@@ -748,7 +744,6 @@
 		if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
 			wanted = *clen;
 
-		mlog(0, "empty reservation, find new window\n");
 		/*
 		 * Try to get a window here. If it works, we must fall
 		 * through and test the bitmap . This avoids some
@@ -757,6 +752,7 @@
 		 * that inode.
 		 */
 		ocfs2_resv_find_window(resmap, resv, wanted);
+		trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
 	}
 
 	BUG_ON(ocfs2_resv_empty(resv));
@@ -813,10 +809,10 @@
 
 	spin_lock(&resv_lock);
 
-	mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
-	     "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
-	     cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
-	     resv->r_len, resv->r_last_start, resv->r_last_len);
+	trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
+					      ocfs2_resv_end(resv), resv->r_len,
+					      resv->r_last_start,
+					      resv->r_last_len);
 
 	BUG_ON(cstart < resv->r_start);
 	BUG_ON(cstart > ocfs2_resv_end(resv));
@@ -833,10 +829,9 @@
 	if (!ocfs2_resv_empty(resv))
 		ocfs2_resv_mark_lru(resmap, resv);
 
-	mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
-	     "r_len: %u r_last_start: %u r_last_len: %u\n",
-	     resv->r_start, ocfs2_resv_end(resv), resv->r_len,
-	     resv->r_last_start, resv->r_last_len);
+	trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
+					    resv->r_len, resv->r_last_start,
+					    resv->r_last_len);
 
 	ocfs2_check_resmap(resmap);
 
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index dacd553..ec55add 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -27,7 +27,6 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "super.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 #include "suballoc.h"
@@ -82,7 +82,6 @@
 		backups++;
 	}
 
-	mlog_exit_void();
 	return backups;
 }
 
@@ -103,8 +102,8 @@
 	u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
 	u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
 
-	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
-		   new_clusters, first_new_cluster);
+	trace_ocfs2_update_last_group_and_inode(new_clusters,
+						first_new_cluster);
 
 	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
 				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -176,7 +175,8 @@
 		le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
 	}
 out:
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -281,8 +281,6 @@
 	u32 first_new_cluster;
 	u64 lgd_blkno;
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
 
@@ -342,7 +340,8 @@
 		goto out_unlock;
 	}
 
-	mlog(0, "extend the last group at %llu, new clusters = %d\n",
+
+	trace_ocfs2_group_extend(
 	     (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
 
 	handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
@@ -377,7 +376,6 @@
 	iput(main_bm_inode);
 
 out:
-	mlog_exit_void();
 	return ret;
 }
 
@@ -472,8 +470,6 @@
 	struct ocfs2_chain_rec *cr;
 	u16 cl_bpc;
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
 
@@ -520,8 +516,8 @@
 		goto out_unlock;
 	}
 
-	mlog(0, "Add a new group  %llu in chain = %u, length = %u\n",
-	     (unsigned long long)input->group, input->chain, input->clusters);
+	trace_ocfs2_group_add((unsigned long long)input->group,
+			       input->chain, input->clusters, input->frees);
 
 	handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
 	if (IS_ERR(handle)) {
@@ -589,6 +585,5 @@
 	iput(main_bm_inode);
 
 out:
-	mlog_exit_void();
 	return ret;
 }
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index ab4e017..26fc001 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -27,7 +27,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -142,8 +142,7 @@
 	BUG_ON(si->si_blocks == 0);
 	BUG_ON(si->si_bh == NULL);
 
-	mlog(0, "Refreshing slot map, reading %u block(s)\n",
-	     si->si_blocks);
+	trace_ocfs2_refresh_slot_info(si->si_blocks);
 
 	/*
 	 * We pass -1 as blocknr because we expect all of si->si_bh to
@@ -381,8 +380,7 @@
 	/* The size checks above should ensure this */
 	BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
 
-	mlog(0, "Slot map needs %u buffers for %llu bytes\n",
-	     si->si_blocks, bytes);
+	trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
 
 	si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
 			    GFP_KERNEL);
@@ -400,8 +398,7 @@
 			goto bail;
 		}
 
-		mlog(0, "Reading slot map block %u at %llu\n", i,
-		     (unsigned long long)blkno);
+		trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
 
 		bh = NULL;  /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
@@ -475,8 +472,6 @@
 	int slot;
 	struct ocfs2_slot_info *si;
 
-	mlog_entry_void();
-
 	si = osb->slot_info;
 
 	spin_lock(&osb->osb_lock);
@@ -505,14 +500,13 @@
 	osb->slot_num = slot;
 	spin_unlock(&osb->osb_lock);
 
-	mlog(0, "taking node slot %d\n", osb->slot_num);
+	trace_ocfs2_find_slot(osb->slot_num);
 
 	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
 	if (status < 0)
 		mlog_errno(status);
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 71998d4..ab6e206 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -44,6 +43,7 @@
 #include "super.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -308,8 +308,8 @@
 	int rc;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
-	mlog(0, "Validating group descriptor %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_group_descriptor(
+					(unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -389,8 +389,6 @@
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct super_block * sb = alloc_inode->i_sb;
 
-	mlog_entry_void();
-
 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
 			    "b_blocknr (%llu)",
@@ -436,7 +434,8 @@
 	 * allocation time. */
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -477,8 +476,8 @@
 
 	/* setup the group */
 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "new descriptor, record %u, at block %llu\n",
-	     alloc_rec, (unsigned long long)bg_blkno);
+	trace_ocfs2_block_group_alloc_contig(
+	     (unsigned long long)bg_blkno, alloc_rec);
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
@@ -657,8 +656,8 @@
 
 	/* setup the group */
 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "new descriptor, record %u, at block %llu\n",
-	     alloc_rec, (unsigned long long)bg_blkno);
+	trace_ocfs2_block_group_alloc_discontig(
+				(unsigned long long)bg_blkno, alloc_rec);
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
@@ -707,8 +706,6 @@
 
 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
 
-	mlog_entry_void();
-
 	cl = &fe->id2.i_chain;
 	status = ocfs2_reserve_clusters_with_limit(osb,
 						   le16_to_cpu(cl->cl_cpg),
@@ -730,8 +727,8 @@
 	}
 
 	if (last_alloc_group && *last_alloc_group != 0) {
-		mlog(0, "use old allocation group %llu for block group alloc\n",
-		     (unsigned long long)*last_alloc_group);
+		trace_ocfs2_block_group_alloc(
+				(unsigned long long)*last_alloc_group);
 		ac->ac_last_group = *last_alloc_group;
 	}
 
@@ -796,7 +793,8 @@
 
 	brelse(bg_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -814,8 +812,6 @@
 	struct ocfs2_dinode *fe;
 	u32 free_bits;
 
-	mlog_entry_void();
-
 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
 	if (!alloc_inode) {
 		mlog_errno(-EINVAL);
@@ -855,16 +851,15 @@
 	if (bits_wanted > free_bits) {
 		/* cluster bitmap never grows */
 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
-			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
-			     bits_wanted, free_bits);
+			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
+								free_bits);
 			status = -ENOSPC;
 			goto bail;
 		}
 
 		if (!(flags & ALLOC_NEW_GROUP)) {
-			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
-			     "and we don't alloc a new group for it.\n",
-			     slot, bits_wanted, free_bits);
+			trace_ocfs2_reserve_suballoc_bits_no_new_group(
+						slot, bits_wanted, free_bits);
 			status = -ENOSPC;
 			goto bail;
 		}
@@ -890,7 +885,8 @@
 bail:
 	brelse(bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1052,7 +1048,8 @@
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1119,8 +1116,8 @@
 		spin_lock(&osb->osb_lock);
 		osb->osb_inode_alloc_group = alloc_group;
 		spin_unlock(&osb->osb_lock);
-		mlog(0, "after reservation, new allocation group is "
-		     "%llu\n", (unsigned long long)alloc_group);
+		trace_ocfs2_reserve_new_inode_new_group(
+			(unsigned long long)alloc_group);
 
 		/*
 		 * Some inodes must be freed by us, so try to allocate
@@ -1152,7 +1149,8 @@
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1189,8 +1187,6 @@
 {
 	int status;
 
-	mlog_entry_void();
-
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
 		status = -ENOMEM;
@@ -1229,7 +1225,8 @@
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1357,15 +1354,12 @@
 	void *bitmap = bg->bg_bitmap;
 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 
-	mlog_entry_void();
-
 	/* All callers get the descriptor via
 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
-	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
-	     num_bits);
+	trace_ocfs2_block_group_set_bits(bit_off, num_bits);
 
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
@@ -1394,7 +1388,8 @@
 	ocfs2_journal_dirty(handle, group_bh);
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1437,10 +1432,10 @@
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
-	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
-	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
-	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
-	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
+	trace_ocfs2_relink_block_group(
+		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+		(unsigned long long)le64_to_cpu(bg->bg_blkno),
+		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1484,7 +1479,8 @@
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1525,10 +1521,10 @@
 		if ((gd_cluster_off + max_bits) >
 		    OCFS2_I(inode)->ip_clusters) {
 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
-			mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
-			     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			     le16_to_cpu(gd->bg_bits),
-			     OCFS2_I(inode)->ip_clusters, max_bits);
+			trace_ocfs2_cluster_group_search_wrong_max_bits(
+				(unsigned long long)le64_to_cpu(gd->bg_blkno),
+				le16_to_cpu(gd->bg_bits),
+				OCFS2_I(inode)->ip_clusters, max_bits);
 		}
 
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
@@ -1542,9 +1538,9 @@
 							  gd_cluster_off +
 							  res->sr_bit_offset +
 							  res->sr_bits);
-			mlog(0, "Checking %llu against %llu\n",
-			     (unsigned long long)blkoff,
-			     (unsigned long long)max_block);
+			trace_ocfs2_cluster_group_search_max_block(
+				(unsigned long long)blkoff,
+				(unsigned long long)max_block);
 			if (blkoff > max_block)
 				return -ENOSPC;
 		}
@@ -1588,9 +1584,9 @@
 		if (!ret && max_block) {
 			blkoff = le64_to_cpu(bg->bg_blkno) +
 				res->sr_bit_offset + res->sr_bits;
-			mlog(0, "Checking %llu against %llu\n",
-			     (unsigned long long)blkoff,
-			     (unsigned long long)max_block);
+			trace_ocfs2_block_group_search_max_block(
+				(unsigned long long)blkoff,
+				(unsigned long long)max_block);
 			if (blkoff > max_block)
 				ret = -ENOSPC;
 		}
@@ -1756,9 +1752,9 @@
 	struct ocfs2_group_desc *bg;
 
 	chain = ac->ac_chain;
-	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
-	     bits_wanted, chain,
-	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
+	trace_ocfs2_search_chain_begin(
+		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+		bits_wanted, chain);
 
 	status = ocfs2_read_group_descriptor(alloc_inode, fe,
 					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
@@ -1799,8 +1795,8 @@
 		goto bail;
 	}
 
-	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
-	     res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+	trace_ocfs2_search_chain_succ(
+		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
 
 	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
 
@@ -1861,8 +1857,9 @@
 		goto bail;
 	}
 
-	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
-	     (unsigned long long)le64_to_cpu(fe->i_blkno));
+	trace_ocfs2_search_chain_end(
+			(unsigned long long)le64_to_cpu(fe->i_blkno),
+			res->sr_bits);
 
 out_loc_only:
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
@@ -1870,7 +1867,8 @@
 	brelse(group_bh);
 	brelse(prev_group_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1888,8 +1886,6 @@
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_dinode *fe;
 
-	mlog_entry_void();
-
 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
 	BUG_ON(!ac->ac_bh);
@@ -1945,8 +1941,7 @@
 		goto bail;
 	}
 
-	mlog(0, "Search of victim chain %u came up with nothing, "
-	     "trying all chains now.\n", victim);
+	trace_ocfs2_claim_suballoc_bits(victim);
 
 	/* If we didn't pick a good victim, then just default to
 	 * searching each chain in order. Don't allow chain relinking
@@ -1984,7 +1979,8 @@
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2021,7 +2017,8 @@
 	*num_bits = res.sr_bits;
 	status = 0;
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2172,8 +2169,8 @@
 		goto out;
 	}
 
-	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
-	     (unsigned long long)di_blkno);
+	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
+					   res->sr_bits);
 
 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
 
@@ -2201,8 +2198,6 @@
 	int status;
 	struct ocfs2_suballoc_result res;
 
-	mlog_entry_void();
-
 	BUG_ON(!ac);
 	BUG_ON(ac->ac_bits_given != 0);
 	BUG_ON(ac->ac_bits_wanted != 1);
@@ -2230,7 +2225,8 @@
 	ocfs2_save_inode_ac_group(dir, ac);
 	status = 0;
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2307,8 +2303,6 @@
 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
 	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
 
-	mlog_entry_void();
-
 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
 
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
@@ -2363,7 +2357,8 @@
 	ac->ac_bits_given += *num_clusters;
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2392,13 +2387,11 @@
 	unsigned int tmp;
 	struct ocfs2_group_desc *undo_bg = NULL;
 
-	mlog_entry_void();
-
 	/* The caller got this descriptor from
 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
-	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
 
 	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
@@ -2463,8 +2456,6 @@
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *group;
 
-	mlog_entry_void();
-
 	/* The alloc_bh comes from ocfs2_free_dinode() or
 	 * ocfs2_free_clusters().  The callers have all locked the
 	 * allocator and gotten alloc_bh from the lock call.  This
@@ -2473,9 +2464,10 @@
 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
-	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
-	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
-	     (unsigned long long)bg_blkno, start_bit);
+	trace_ocfs2_free_suballoc_bits(
+		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+		(unsigned long long)bg_blkno,
+		start_bit, count);
 
 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
 					     &group_bh);
@@ -2511,7 +2503,8 @@
 bail:
 	brelse(group_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2556,11 +2549,8 @@
 
 	/* You can't ever have a contiguous set of clusters
 	 * bigger than a block group bitmap so we never have to worry
-	 * about looping on them. */
-
-	mlog_entry_void();
-
-	/* This is expensive. We can safely remove once this stuff has
+	 * about looping on them.
+	 * This is expensive. We can safely remove once this stuff has
 	 * gotten tested really well. */
 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
 
@@ -2569,10 +2559,9 @@
 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
 				     &bg_start_bit);
 
-	mlog(0, "want to free %u clusters starting at block %llu\n",
-	     num_clusters, (unsigned long long)start_blk);
-	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
-	     (unsigned long long)bg_blkno, bg_start_bit);
+	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
+			(unsigned long long)start_blk,
+			bg_start_bit, num_clusters);
 
 	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
 					   bg_start_bit, bg_blkno,
@@ -2586,7 +2575,8 @@
 					 num_clusters);
 
 out:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2756,7 +2746,7 @@
 	struct buffer_head *inode_bh = NULL;
 	struct ocfs2_dinode *inode_fe;
 
-	mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
+	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
 
 	/* dirty read disk */
 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2793,7 +2783,8 @@
 bail:
 	brelse(inode_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2816,8 +2807,8 @@
 	u64 bg_blkno;
 	int status;
 
-	mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
-		   (unsigned int)bit);
+	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
+				      (unsigned int)bit);
 
 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
@@ -2844,7 +2835,8 @@
 bail:
 	brelse(group_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2869,7 +2861,7 @@
 	struct inode *inode_alloc_inode;
 	struct buffer_head *alloc_bh = NULL;
 
-	mlog_entry("blkno: %llu", (unsigned long long)blkno);
+	trace_ocfs2_test_inode_bit((unsigned long long)blkno);
 
 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
 					     &group_blkno, &suballoc_bit);
@@ -2910,6 +2902,7 @@
 	iput(inode_alloc_inode);
 	brelse(alloc_bh);
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 236ed1b..69fa11b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,7 +42,9 @@
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_SUPER
+#define CREATE_TRACE_POINTS
+#include "ocfs2_trace.h"
+
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -441,8 +443,6 @@
 	int status = 0;
 	int i;
 
-	mlog_entry_void();
-
 	new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
 	if (IS_ERR(new)) {
 		status = PTR_ERR(new);
@@ -478,7 +478,8 @@
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -488,8 +489,6 @@
 	int status = 0;
 	int i;
 
-	mlog_entry_void();
-
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
@@ -508,7 +507,8 @@
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -517,8 +517,6 @@
 	int i;
 	struct inode *inode;
 
-	mlog_entry_void();
-
 	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
 		inode = osb->global_system_inodes[i];
 		if (inode) {
@@ -540,7 +538,7 @@
 	}
 
 	if (!osb->local_system_inodes)
-		goto out;
+		return;
 
 	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
 		if (osb->local_system_inodes[i]) {
@@ -551,9 +549,6 @@
 
 	kfree(osb->local_system_inodes);
 	osb->local_system_inodes = NULL;
-
-out:
-	mlog_exit(0);
 }
 
 /* We're allocating fs objects, use GFP_NOFS */
@@ -684,12 +679,9 @@
 		}
 
 		if (*flags & MS_RDONLY) {
-			mlog(0, "Going to ro mode.\n");
 			sb->s_flags |= MS_RDONLY;
 			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
 		} else {
-			mlog(0, "Making ro filesystem writeable.\n");
-
 			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
 				mlog(ML_ERROR, "Cannot remount RDWR "
 				     "filesystem due to previous errors.\n");
@@ -707,6 +699,7 @@
 			sb->s_flags &= ~MS_RDONLY;
 			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
 		}
+		trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
 		/* Enable quota accounting after remounting RW */
@@ -1032,7 +1025,7 @@
 	char nodestr[8];
 	struct ocfs2_blockcheck_stats stats;
 
-	mlog_entry("%p, %p, %i", sb, data, silent);
+	trace_ocfs2_fill_super(sb, data, silent);
 
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
 		status = -EINVAL;
@@ -1208,7 +1201,6 @@
 			mlog_errno(status);
 			atomic_set(&osb->vol_state, VOLUME_DISABLED);
 			wake_up(&osb->osb_mount_event);
-			mlog_exit(status);
 			return status;
 		}
 	}
@@ -1222,7 +1214,6 @@
 	/* Start this when the mount is almost sure of being successful */
 	ocfs2_orphan_scan_start(osb);
 
-	mlog_exit(status);
 	return status;
 
 read_super_error:
@@ -1237,7 +1228,8 @@
 		ocfs2_dismount_volume(sb, 1);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1320,8 +1312,7 @@
 	char *p;
 	u32 tmp;
 
-	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
-		   options ? options : "(none)");
+	trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
 
 	mopt->commit_interval = 0;
 	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
@@ -1538,7 +1529,6 @@
 	status = 1;
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1629,8 +1619,6 @@
 {
 	int status;
 
-	mlog_entry_void();
-
 	ocfs2_print_version();
 
 	status = init_ocfs2_uptodate_cache();
@@ -1664,10 +1652,9 @@
 	if (status < 0) {
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
+		mlog_errno(status);
 	}
 
-	mlog_exit(status);
-
 	if (status >= 0) {
 		return register_filesystem(&ocfs2_fs_type);
 	} else
@@ -1676,8 +1663,6 @@
 
 static void __exit ocfs2_exit(void)
 {
-	mlog_entry_void();
-
 	if (ocfs2_wq) {
 		flush_workqueue(ocfs2_wq);
 		destroy_workqueue(ocfs2_wq);
@@ -1692,18 +1677,14 @@
 	unregister_filesystem(&ocfs2_fs_type);
 
 	exit_ocfs2_uptodate_cache();
-
-	mlog_exit_void();
 }
 
 static void ocfs2_put_super(struct super_block *sb)
 {
-	mlog_entry("(0x%p)\n", sb);
+	trace_ocfs2_put_super(sb);
 
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1715,7 +1696,7 @@
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
+	trace_ocfs2_statfs(dentry->d_sb, buf);
 
 	osb = OCFS2_SB(dentry->d_sb);
 
@@ -1762,7 +1743,8 @@
 	if (inode)
 		iput(inode);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1882,8 +1864,6 @@
 	int unlock_super = 0;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb))
 		goto leave;
 
@@ -1928,7 +1908,6 @@
 	if (unlock_super)
 		ocfs2_super_unlock(osb, 1);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1938,7 +1917,7 @@
 	struct ocfs2_super *osb = NULL;
 	char nodestr[8];
 
-	mlog_entry("(0x%p)\n", sb);
+	trace_ocfs2_dismount_volume(sb);
 
 	BUG_ON(!sb);
 	osb = OCFS2_SB(sb);
@@ -2090,8 +2069,6 @@
 	struct ocfs2_super *osb;
 	u64 total_blocks;
 
-	mlog_entry_void();
-
 	osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
 	if (!osb) {
 		status = -ENOMEM;
@@ -2155,7 +2132,6 @@
 		status = -EINVAL;
 		goto bail;
 	}
-	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
 
 	ocfs2_orphan_scan_init(osb);
 
@@ -2294,7 +2270,6 @@
 	osb->s_clustersize_bits =
 		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
 	osb->s_clustersize = 1 << osb->s_clustersize_bits;
-	mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
 
 	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
 	    osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -2333,11 +2308,10 @@
 		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
 	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
 	osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
-	mlog(0, "vol_label: %s\n", osb->vol_label);
-	mlog(0, "uuid: %s\n", osb->uuid_str);
-	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
-	     (unsigned long long)osb->root_blkno,
-	     (unsigned long long)osb->system_dir_blkno);
+	trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
+				     (unsigned long long)osb->root_blkno,
+				     (unsigned long long)osb->system_dir_blkno,
+				     osb->s_clustersize_bits);
 
 	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
 	if (!osb->osb_dlm_debug) {
@@ -2380,7 +2354,6 @@
 	}
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -2396,8 +2369,6 @@
 {
 	int status = -EAGAIN;
 
-	mlog_entry_void();
-
 	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
 		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
 		/* We have to do a raw check of the feature here */
@@ -2452,7 +2423,8 @@
 	}
 
 out:
-	mlog_exit(status);
+	if (status && status != -EAGAIN)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2465,8 +2437,6 @@
 						  * recover
 						  * ourselves. */
 
-	mlog_entry_void();
-
 	/* Init our journal object. */
 	status = ocfs2_journal_init(osb->journal, &dirty);
 	if (status < 0) {
@@ -2516,8 +2486,6 @@
 		 * ourselves as mounted. */
 	}
 
-	mlog(0, "Journal loaded.\n");
-
 	status = ocfs2_load_local_alloc(osb);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2549,7 +2517,8 @@
 	if (local_alloc)
 		kfree(local_alloc);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2561,8 +2530,6 @@
  */
 static void ocfs2_delete_osb(struct ocfs2_super *osb)
 {
-	mlog_entry_void();
-
 	/* This function assumes that the caller has the main osb resource */
 
 	ocfs2_free_slot_info(osb);
@@ -2580,8 +2547,6 @@
 	kfree(osb->uuid_str);
 	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
 	memset(osb, 0, sizeof(struct ocfs2_super));
-
-	mlog_exit_void();
 }
 
 /* Put OCFS2 into a readonly state, or (if the user specifies it),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 9975457..5d22872 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -40,7 +40,6 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -62,8 +61,6 @@
 	char *link = NULL;
 	struct ocfs2_dinode *fe;
 
-	mlog_entry_void();
-
 	status = ocfs2_read_inode_block(inode, bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -74,7 +71,6 @@
 	fe = (struct ocfs2_dinode *) (*bh)->b_data;
 	link = (char *) fe->id2.i_symlink;
 bail:
-	mlog_exit(status);
 
 	return link;
 }
@@ -88,8 +84,6 @@
 	struct buffer_head *bh = NULL;
 	struct inode *inode = dentry->d_inode;
 
-	mlog_entry_void();
-
 	link = ocfs2_fast_symlink_getlink(inode, &bh);
 	if (IS_ERR(link)) {
 		ret = PTR_ERR(link);
@@ -104,7 +98,8 @@
 
 	brelse(bh);
 out:
-	mlog_exit(ret);
+	if (ret < 0)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -117,8 +112,6 @@
 	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh = NULL;
 
-	mlog_entry_void();
-
 	BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
 	target = ocfs2_fast_symlink_getlink(inode, &bh);
 	if (IS_ERR(target)) {
@@ -142,7 +135,8 @@
 	nd_set_link(nd, status ? ERR_PTR(status) : link);
 	brelse(bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return NULL;
 }
 
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 902efb2..3d635f4 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -27,7 +27,6 @@
 #include <linux/types.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index a0a120e..52eaf33 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -54,14 +54,13 @@
 #include <linux/buffer_head.h>
 #include <linux/rbtree.h>
 
-#define MLOG_MASK_PREFIX ML_UPTODATE
-
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 
 #include "inode.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 struct ocfs2_meta_cache_item {
 	struct rb_node	c_node;
@@ -152,8 +151,8 @@
 	while ((node = rb_last(root)) != NULL) {
 		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
 
-		mlog(0, "Purge item %llu\n",
-		     (unsigned long long) item->c_block);
+		trace_ocfs2_purge_copied_metadata_tree(
+					(unsigned long long) item->c_block);
 
 		rb_erase(&item->c_node, root);
 		kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -180,9 +179,9 @@
 	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
 	to_purge = ci->ci_num_cached;
 
-	mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
-	     tree ? "array" : "tree",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci));
+	trace_ocfs2_metadata_cache_purge(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		to_purge, tree);
 
 	/* If we're a tree, save off the root so that we can safely
 	 * initialize the cache. We do the work to free tree members
@@ -249,10 +248,10 @@
 
 	ocfs2_metadata_cache_lock(ci);
 
-	mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long) bh->b_blocknr,
-	     !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
+	trace_ocfs2_buffer_cached_begin(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long) bh->b_blocknr,
+		!!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
 
 	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
 		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
@@ -261,7 +260,7 @@
 
 	ocfs2_metadata_cache_unlock(ci);
 
-	mlog(0, "index = %d, item = %p\n", index, item);
+	trace_ocfs2_buffer_cached_end(index, item);
 
 	return (index != -1) || (item != NULL);
 }
@@ -306,8 +305,9 @@
 {
 	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
 
-	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
-	     ci->ci_num_cached);
+	trace_ocfs2_append_cache_array(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, ci->ci_num_cached);
 
 	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
 	ci->ci_num_cached++;
@@ -324,8 +324,9 @@
 	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
 	struct ocfs2_meta_cache_item *tmp;
 
-	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
-	     ci->ci_num_cached);
+	trace_ocfs2_insert_cache_tree(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, ci->ci_num_cached);
 
 	while(*p) {
 		parent = *p;
@@ -389,9 +390,9 @@
 		tree[i] = NULL;
 	}
 
-	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     ci->ci_flags, ci->ci_num_cached);
+	trace_ocfs2_expand_cache(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		ci->ci_flags, ci->ci_num_cached);
 }
 
 /* Slow path function - memory allocation is necessary. See the
@@ -405,9 +406,9 @@
 	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
 		{ NULL, };
 
-	mlog(0, "Owner %llu, block %llu, expand = %d\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)block, expand_tree);
+	trace_ocfs2_set_buffer_uptodate(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, expand_tree);
 
 	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
 	if (!new) {
@@ -433,7 +434,6 @@
 
 	ocfs2_metadata_cache_lock(ci);
 	if (ocfs2_insert_can_use_array(ci)) {
-		mlog(0, "Someone cleared the tree underneath us\n");
 		/* Ok, items were removed from the cache in between
 		 * locks. Detect this and revert back to the fast path */
 		ocfs2_append_cache_array(ci, block);
@@ -490,9 +490,9 @@
 	if (ocfs2_buffer_cached(ci, bh))
 		return;
 
-	mlog(0, "Owner %llu, inserting block %llu\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_set_buffer_uptodate_begin(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)bh->b_blocknr);
 
 	/* No need to recheck under spinlock - insertion is guarded by
 	 * co_io_lock() */
@@ -542,8 +542,9 @@
 	BUG_ON(index >= ci->ci_num_cached);
 	BUG_ON(!ci->ci_num_cached);
 
-	mlog(0, "remove index %d (num_cached = %u\n", index,
-	     ci->ci_num_cached);
+	trace_ocfs2_remove_metadata_array(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		index, ci->ci_num_cached);
 
 	ci->ci_num_cached--;
 
@@ -559,8 +560,9 @@
 static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
 				       struct ocfs2_meta_cache_item *item)
 {
-	mlog(0, "remove block %llu from tree\n",
-	     (unsigned long long) item->c_block);
+	trace_ocfs2_remove_metadata_tree(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)item->c_block);
 
 	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
 	ci->ci_num_cached--;
@@ -573,10 +575,10 @@
 	struct ocfs2_meta_cache_item *item = NULL;
 
 	ocfs2_metadata_cache_lock(ci);
-	mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long) block, ci->ci_num_cached,
-	     ci->ci_flags & OCFS2_CACHE_FL_INLINE);
+	trace_ocfs2_remove_block_from_cache(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long) block, ci->ci_num_cached,
+		ci->ci_flags);
 
 	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
 		index = ocfs2_search_cache_array(ci, block);
@@ -626,9 +628,6 @@
 	if (!ocfs2_uptodate_cachep)
 		return -ENOMEM;
 
-	mlog(0, "%u inlined cache items per inode.\n",
-	     OCFS2_CACHE_INFO_MAX_ARRAY);
-
 	return 0;
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6bb6024..57a215d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,7 +37,6 @@
 #include <linux/string.h>
 #include <linux/security.h>
 
-#define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -57,6 +56,7 @@
 #include "xattr.h"
 #include "refcounttree.h"
 #include "acl.h"
+#include "ocfs2_trace.h"
 
 struct ocfs2_xattr_def_value_root {
 	struct ocfs2_xattr_value_root	xv;
@@ -474,8 +474,7 @@
 	struct ocfs2_xattr_block *xb =
 		(struct ocfs2_xattr_block *)bh->b_data;
 
-	mlog(0, "Validating xattr block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -715,11 +714,11 @@
 	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
 	struct ocfs2_extent_tree et;
 
-	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
-
 	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
 
 	while (clusters_to_add) {
+		trace_ocfs2_xattr_extend_allocation(clusters_to_add);
+
 		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
 				       OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
@@ -754,8 +753,6 @@
 			 */
 			BUG_ON(why == RESTART_META);
 
-			mlog(0, "restarting xattr value extension for %u"
-			     " clusters,.\n", clusters_to_add);
 			credits = ocfs2_calc_extend_credits(inode->i_sb,
 							    &vb->vb_xv->xr_list,
 							    clusters_to_add);
@@ -3246,8 +3243,8 @@
 	}
 
 	meta_add += extra_meta;
-	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
-	     "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
+	trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
+					clusters_add, *credits);
 
 	if (meta_add) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -3887,8 +3884,10 @@
 
 	if (found) {
 		xs->here = &xs->header->xh_entries[index];
-		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-		     (unsigned long long)bucket_blkno(xs->bucket), index);
+		trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
+			name, name_index, name_hash,
+			(unsigned long long)bucket_blkno(xs->bucket),
+			index);
 	} else
 		ret = -ENODATA;
 
@@ -3915,8 +3914,10 @@
 	if (le16_to_cpu(el->l_next_free_rec) == 0)
 		return -ENODATA;
 
-	mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
-	     name, name_hash, name_index);
+	trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
+					name, name_index, name_hash,
+					(unsigned long long)root_bh->b_blocknr,
+					-1);
 
 	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
 				  &num_clusters, el);
@@ -3927,9 +3928,10 @@
 
 	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
 
-	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
-	     "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
-	     first_hash);
+	trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
+					name, name_index, first_hash,
+					(unsigned long long)p_blkno,
+					num_clusters);
 
 	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
 				      p_blkno, first_hash, num_clusters, xs);
@@ -3955,8 +3957,9 @@
 		return -ENOMEM;
 	}
 
-	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
-	     clusters, (unsigned long long)blkno);
+	trace_ocfs2_iterate_xattr_buckets(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)blkno, clusters);
 
 	for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
 		ret = ocfs2_read_xattr_bucket(bucket, blkno);
@@ -3972,8 +3975,7 @@
 		if (i == 0)
 			num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
 
-		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
-		     (unsigned long long)blkno,
+		trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
 		     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, bucket, para);
@@ -4173,9 +4175,9 @@
 	char *src = xb_bh->b_data;
 	char *target = bucket_block(bucket, blks - 1);
 
-	mlog(0, "cp xattr from block %llu to bucket %llu\n",
-	     (unsigned long long)xb_bh->b_blocknr,
-	     (unsigned long long)bucket_blkno(bucket));
+	trace_ocfs2_cp_xattr_block_to_bucket_begin(
+				(unsigned long long)xb_bh->b_blocknr,
+				(unsigned long long)bucket_blkno(bucket));
 
 	for (i = 0; i < blks; i++)
 		memset(bucket_block(bucket, i), 0, blocksize);
@@ -4211,8 +4213,7 @@
 	for (i = 0; i < count; i++)
 		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
 
-	mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
-	     offset, size, off_change);
+	trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
 
 	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
 	     cmp_xe, swap_xe);
@@ -4261,8 +4262,8 @@
 	struct ocfs2_xattr_tree_root *xr;
 	u16 xb_flags = le16_to_cpu(xb->xb_flags);
 
-	mlog(0, "create xattr index block for %llu\n",
-	     (unsigned long long)xb_bh->b_blocknr);
+	trace_ocfs2_xattr_create_index_block_begin(
+				(unsigned long long)xb_bh->b_blocknr);
 
 	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
 	BUG_ON(!xs->bucket);
@@ -4295,8 +4296,7 @@
 	 */
 	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
 
-	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
-	     (unsigned long long)blkno);
+	trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
 
 	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
 	if (ret) {
@@ -4400,8 +4400,7 @@
 	entries = (char *)xh->xh_entries;
 	xh_free_start = le16_to_cpu(xh->xh_free_start);
 
-	mlog(0, "adjust xattr bucket in %llu, count = %u, "
-	     "xh_free_start = %u, xh_name_value_len = %u.\n",
+	trace_ocfs2_defrag_xattr_bucket(
 	     (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
 	     xh_free_start, le16_to_cpu(xh->xh_name_value_len));
 
@@ -4503,8 +4502,9 @@
 	BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
 
-	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
+	trace_ocfs2_mv_xattr_bucket_cross_cluster(
+				(unsigned long long)last_cluster_blkno,
+				(unsigned long long)new_blkno);
 
 	ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
 				     last_cluster_blkno, new_blkno,
@@ -4614,8 +4614,8 @@
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
 
-	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
-	     (unsigned long long)blk, (unsigned long long)new_blk);
+	trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
+					      (unsigned long long)new_blk);
 
 	s_bucket = ocfs2_xattr_bucket_new(inode);
 	t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4714,9 +4714,9 @@
 	 */
 	xe = &xh->xh_entries[start];
 	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
-	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
-	     (int)((char *)xe - (char *)xh),
-	     (int)((char *)xh->xh_entries - (char *)xh));
+	trace_ocfs2_divide_xattr_bucket_move(len,
+			(int)((char *)xe - (char *)xh),
+			(int)((char *)xh->xh_entries - (char *)xh));
 	memmove((char *)xh->xh_entries, (char *)xe, len);
 	xe = &xh->xh_entries[count - start];
 	len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -4788,9 +4788,9 @@
 
 	BUG_ON(s_blkno == t_blkno);
 
-	mlog(0, "cp bucket %llu to %llu, target is %d\n",
-	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
-	     t_is_new);
+	trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
+				    (unsigned long long)t_blkno,
+				    t_is_new);
 
 	s_bucket = ocfs2_xattr_bucket_new(inode);
 	t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4862,8 +4862,8 @@
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
 	struct ocfs2_xattr_bucket *old_first, *new_first;
 
-	mlog(0, "mv xattrs from cluster %llu to %llu\n",
-	     (unsigned long long)last_blk, (unsigned long long)to_blk);
+	trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
+				     (unsigned long long)to_blk);
 
 	BUG_ON(start_bucket >= num_buckets);
 	if (start_bucket) {
@@ -5013,9 +5013,9 @@
 {
 	int ret;
 
-	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     (unsigned long long)bucket_blkno(first), prev_clusters,
-	     (unsigned long long)new_blk);
+	trace_ocfs2_adjust_xattr_cross_cluster(
+			(unsigned long long)bucket_blkno(first),
+			(unsigned long long)new_blk, prev_clusters);
 
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -5088,10 +5088,10 @@
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 
-	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
-	     "previous xattr blkno = %llu\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, (unsigned long long)bucket_blkno(first));
+	trace_ocfs2_add_new_xattr_cluster_begin(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)bucket_blkno(first),
+		prev_cpos, prev_clusters);
 
 	ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
 
@@ -5113,8 +5113,7 @@
 	BUG_ON(num_bits > clusters_to_add);
 
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
-	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
 
 	if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
 	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
@@ -5130,8 +5129,6 @@
 		 */
 		v_start = prev_cpos + prev_clusters;
 		*num_clusters = prev_clusters + num_bits;
-		mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
-		     num_bits);
 	} else {
 		ret = ocfs2_adjust_xattr_cross_cluster(inode,
 						       handle,
@@ -5147,8 +5144,8 @@
 		}
 	}
 
-	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
-	     num_bits, (unsigned long long)block, v_start);
+	trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
+						 v_start, num_bits);
 	ret = ocfs2_insert_extent(handle, &et, v_start, block,
 				  num_bits, 0, ctxt->meta_ac);
 	if (ret < 0) {
@@ -5183,9 +5180,9 @@
 	u64 end_blk;
 	u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
 
-	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", (unsigned long long)target_blk,
-	     (unsigned long long)bucket_blkno(first), num_clusters);
+	trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
+					(unsigned long long)bucket_blkno(first),
+					num_clusters, new_bucket);
 
 	/* The extent must have room for an additional bucket */
 	BUG_ON(new_bucket >=
@@ -5265,8 +5262,8 @@
 	/* The bucket at the front of the extent */
 	struct ocfs2_xattr_bucket *first;
 
-	mlog(0, "Add new xattr bucket starting from %llu\n",
-	     (unsigned long long)bucket_blkno(target));
+	trace_ocfs2_add_new_xattr_bucket(
+				(unsigned long long)bucket_blkno(target));
 
 	/* The first bucket of the original extent */
 	first = ocfs2_xattr_bucket_new(inode);
@@ -5382,8 +5379,8 @@
 	 * modified something.  We have to assume they did, and dirty
 	 * the whole bucket.  This leaves us in a consistent state.
 	 */
-	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
-	     xe_off, (unsigned long long)bucket_blkno(bucket), len);
+	trace_ocfs2_xattr_bucket_value_truncate(
+			(unsigned long long)bucket_blkno(bucket), xe_off, len);
 	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
@@ -5433,8 +5430,9 @@
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
-	mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
-	     cpos, len, (unsigned long long)blkno);
+	trace_ocfs2_rm_xattr_cluster(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)blkno, cpos, len);
 
 	ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
 					       len);
@@ -5538,7 +5536,7 @@
 	int ret;
 	struct ocfs2_xa_loc loc;
 
-	mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
+	trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
 
 	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
 				       xs->not_found ? NULL : xs->here);
@@ -5570,7 +5568,6 @@
 
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -5581,7 +5578,7 @@
 {
 	int ret;
 
-	mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
+	trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
 
 	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
 	if (!ret)
@@ -5637,7 +5634,6 @@
 		mlog_errno(ret);
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -6041,9 +6037,9 @@
 	if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
 		p = &refcount;
 
-	mlog(0, "refcount bucket %llu, count = %u\n",
-	     (unsigned long long)bucket_blkno(bucket),
-	     le16_to_cpu(xh->xh_count));
+	trace_ocfs2_xattr_bucket_value_refcount(
+				(unsigned long long)bucket_blkno(bucket),
+				le16_to_cpu(xh->xh_count));
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
 
@@ -6339,8 +6335,8 @@
 	u32 clusters, cpos, p_cluster, num_clusters;
 	unsigned int ext_flags = 0;
 
-	mlog(0, "reflink xattr in container %llu, count = %u\n",
-	     (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
+	trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
+					 le16_to_cpu(xh->xh_count));
 
 	last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
 	for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
@@ -6540,8 +6536,8 @@
 		goto out;
 	}
 
-	mlog(0, "create new xattr block for inode %llu, index = %d\n",
-	     (unsigned long long)fe_bh->b_blocknr, indexed);
+	trace_ocfs2_create_empty_xattr_block(
+				(unsigned long long)fe_bh->b_blocknr, indexed);
 	ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
 				       ret_bh);
 	if (ret)
@@ -6952,8 +6948,8 @@
 		if (ret)
 			mlog_errno(ret);
 
-		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+		trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
+						  num_clusters, reflink_cpos);
 
 		len -= num_clusters;
 		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
@@ -6982,8 +6978,7 @@
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
-	mlog(0, "reflink xattr buckets %llu len %u\n",
-	     (unsigned long long)blkno, len);
+	trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
 
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7c708a4..2e7addf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -182,7 +182,8 @@
 	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
 
-	vma_stop(priv, vma);
+	if (!IS_ERR(vma))
+		vma_stop(priv, vma);
 	if (priv->task)
 		put_task_struct(priv->task);
 }
diff --git a/include/linux/bch.h b/include/linux/bch.h
new file mode 100644
index 0000000..295b4ef
--- /dev/null
+++ b/include/linux/bch.h
@@ -0,0 +1,79 @@
+/*
+ * Generic binary BCH encoding/decoding library
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Copyright © 2011 Parrot S.A.
+ *
+ * Author: Ivan Djelic <ivan.djelic@parrot.com>
+ *
+ * Description:
+ *
+ * This library provides runtime configurable encoding/decoding of binary
+ * Bose-Chaudhuri-Hocquenghem (BCH) codes.
+*/
+#ifndef _BCH_H
+#define _BCH_H
+
+#include <linux/types.h>
+
+/**
+ * struct bch_control - BCH control structure
+ * @m:          Galois field order
+ * @n:          maximum codeword size in bits (= 2^m-1)
+ * @t:          error correction capability in bits
+ * @ecc_bits:   ecc exact size in bits, i.e. generator polynomial degree (<=m*t)
+ * @ecc_bytes:  ecc max size (m*t bits) in bytes
+ * @a_pow_tab:  Galois field GF(2^m) exponentiation lookup table
+ * @a_log_tab:  Galois field GF(2^m) log lookup table
+ * @mod8_tab:   remainder generator polynomial lookup tables
+ * @ecc_buf:    ecc parity words buffer
+ * @ecc_buf2:   ecc parity words buffer
+ * @xi_tab:     GF(2^m) base for solving degree 2 polynomial roots
+ * @syn:        syndrome buffer
+ * @cache:      log-based polynomial representation buffer
+ * @elp:        error locator polynomial
+ * @poly_2t:    temporary polynomials of degree 2t
+ */
+struct bch_control {
+	unsigned int    m;
+	unsigned int    n;
+	unsigned int    t;
+	unsigned int    ecc_bits;
+	unsigned int    ecc_bytes;
+/* private: */
+	uint16_t       *a_pow_tab;
+	uint16_t       *a_log_tab;
+	uint32_t       *mod8_tab;
+	uint32_t       *ecc_buf;
+	uint32_t       *ecc_buf2;
+	unsigned int   *xi_tab;
+	unsigned int   *syn;
+	int            *cache;
+	struct gf_poly *elp;
+	struct gf_poly *poly_2t[4];
+};
+
+struct bch_control *init_bch(int m, int t, unsigned int prim_poly);
+
+void free_bch(struct bch_control *bch);
+
+void encode_bch(struct bch_control *bch, const uint8_t *data,
+		unsigned int len, uint8_t *ecc);
+
+int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len,
+	       const uint8_t *recv_ecc, const uint8_t *calc_ecc,
+	       const unsigned int *syn, unsigned int *errloc);
+
+#endif /* _BCH_H */
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index ef44c7a..d18d673 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -53,10 +53,10 @@
 
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.9"
+#define REL_VERSION "8.3.10"
 #define API_VERSION 88
 #define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 95
+#define PRO_VERSION_MAX 96
 
 
 enum drbd_io_error_p {
@@ -96,8 +96,14 @@
 	OND_SUSPEND_IO
 };
 
+enum drbd_on_congestion {
+	OC_BLOCK,
+	OC_PULL_AHEAD,
+	OC_DISCONNECT,
+};
+
 /* KEEP the order, do not delete or insert. Only append. */
-enum drbd_ret_codes {
+enum drbd_ret_code {
 	ERR_CODE_BASE		= 100,
 	NO_ERROR		= 101,
 	ERR_LOCAL_ADDR		= 102,
@@ -146,6 +152,9 @@
 	ERR_PERM		= 152,
 	ERR_NEED_APV_93		= 153,
 	ERR_STONITH_AND_PROT_A  = 154,
+	ERR_CONG_NOT_PROTO_A	= 155,
+	ERR_PIC_AFTER_DEP	= 156,
+	ERR_PIC_PEER_DEP	= 157,
 
 	/* insert new ones above this line */
 	AFTER_LAST_ERR_CODE
@@ -199,6 +208,10 @@
 	C_VERIFY_T,
 	C_PAUSED_SYNC_S,
 	C_PAUSED_SYNC_T,
+
+	C_AHEAD,
+	C_BEHIND,
+
 	C_MASK = 31
 };
 
@@ -259,7 +272,7 @@
 	unsigned int i;
 };
 
-enum drbd_state_ret_codes {
+enum drbd_state_rv {
 	SS_CW_NO_NEED = 4,
 	SS_CW_SUCCESS = 3,
 	SS_NOTHING_TO_DO = 2,
@@ -290,7 +303,7 @@
 extern const char *drbd_conn_str(enum drbd_conns);
 extern const char *drbd_role_str(enum drbd_role);
 extern const char *drbd_disk_str(enum drbd_disk_state);
-extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
 
 #define SHARED_SECRET_MAX 64
 
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 4ac33f3..bb264a5 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -16,7 +16,8 @@
 #define DEBUG_RANGE_CHECK 0
 
 #define DRBD_MINOR_COUNT_MIN 1
-#define DRBD_MINOR_COUNT_MAX 255
+#define DRBD_MINOR_COUNT_MAX 256
+#define DRBD_MINOR_COUNT_DEF 32
 
 #define DRBD_DIALOG_REFRESH_MIN 0
 #define DRBD_DIALOG_REFRESH_MAX 600
@@ -129,6 +130,7 @@
 #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
 #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
 #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
+#define DRBD_ON_CONGESTION_DEF OC_BLOCK
 
 #define DRBD_MAX_BIO_BVECS_MIN 0
 #define DRBD_MAX_BIO_BVECS_MAX 128
@@ -154,5 +156,13 @@
 #define DRBD_C_MIN_RATE_MAX     (4 << 20)
 #define DRBD_C_MIN_RATE_DEF     4096
 
+#define DRBD_CONG_FILL_MIN	0
+#define DRBD_CONG_FILL_MAX	(10<<21) /* 10GByte in sectors */
+#define DRBD_CONG_FILL_DEF	0
+
+#define DRBD_CONG_EXTENTS_MIN	DRBD_AL_EXTENTS_MIN
+#define DRBD_CONG_EXTENTS_MAX	DRBD_AL_EXTENTS_MAX
+#define DRBD_CONG_EXTENTS_DEF	DRBD_AL_EXTENTS_DEF
+
 #undef RANGE
 #endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
index ade9110..ab6159e4 100644
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
@@ -56,6 +56,9 @@
 	NL_INTEGER(	39,	T_MAY_IGNORE,	rr_conflict)
 	NL_INTEGER(	40,	T_MAY_IGNORE,	ping_timeo)
 	NL_INTEGER(	67,	T_MAY_IGNORE,	rcvbuf_size)
+	NL_INTEGER(	81,	T_MAY_IGNORE,	on_congestion)
+	NL_INTEGER(	82,	T_MAY_IGNORE,	cong_fill)
+	NL_INTEGER(	83,	T_MAY_IGNORE,	cong_extents)
 	  /* 59 addr_family was available in GIT, never released */
 	NL_BIT(		60,	T_MANDATORY,	mind_af)
 	NL_BIT(		27,	T_MAY_IGNORE,	want_lose)
@@ -66,7 +69,9 @@
 	NL_BIT(		70,	T_MANDATORY,	dry_run)
 )
 
-NL_PACKET(disconnect, 6, )
+NL_PACKET(disconnect, 6,
+	NL_BIT(		84,	T_MAY_IGNORE,	force)
+)
 
 NL_PACKET(resize, 7,
 	NL_INT64(		29,	T_MAY_IGNORE,	resize_size)
@@ -143,9 +148,13 @@
        NL_BIT(		63,	T_MANDATORY,	clear_bm)
 )
 
+#ifdef NL_RESPONSE
+NL_RESPONSE(return_code_only, 27)
+#endif
+
 #undef NL_PACKET
 #undef NL_INTEGER
 #undef NL_INT64
 #undef NL_BIT
 #undef NL_STRING
-
+#undef NL_RESPONSE
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
index fcdff84..f14a165 100644
--- a/include/linux/drbd_tag_magic.h
+++ b/include/linux/drbd_tag_magic.h
@@ -7,6 +7,7 @@
 /* declare packet_type enums */
 enum packet_types {
 #define NL_PACKET(name, number, fields) P_ ## name = number,
+#define NL_RESPONSE(name, number) P_ ## name = number,
 #define NL_INTEGER(pn, pr, member)
 #define NL_INT64(pn, pr, member)
 #define NL_BIT(pn, pr, member)
diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h
index 56f8dea..6e4f77e 100644
--- a/include/linux/mfd/ab8500.h
+++ b/include/linux/mfd/ab8500.h
@@ -139,17 +139,23 @@
 	u8 oldmask[AB8500_NUM_IRQ_REGS];
 };
 
+struct regulator_reg_init;
 struct regulator_init_data;
 
 /**
  * struct ab8500_platform_data - AB8500 platform data
  * @irq_base: start of AB8500 IRQs, AB8500_NR_IRQS will be used
  * @init: board-specific initialization after detection of ab8500
+ * @num_regulator_reg_init: number of regulator init registers
+ * @regulator_reg_init: regulator init registers
+ * @num_regulator: number of regulators
  * @regulator: machine-specific constraints for regulators
  */
 struct ab8500_platform_data {
 	int irq_base;
 	void (*init) (struct ab8500 *);
+	int num_regulator_reg_init;
+	struct ab8500_regulator_reg_init *regulator_reg_init;
 	int num_regulator;
 	struct regulator_init_data *regulator;
 };
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 1408bf8..ad1b19a 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -63,6 +63,24 @@
 extern int mfd_cell_disable(struct platform_device *pdev);
 
 /*
+ * "Clone" multiple platform devices for a single cell. This is to be used
+ * for devices that have multiple users of a cell.  For example, if an mfd
+ * driver wants the cell "foo" to be used by a GPIO driver, an MTD driver,
+ * and a platform driver, the following bit of code would be use after first
+ * calling mfd_add_devices():
+ *
+ * const char *fclones[] = { "foo-gpio", "foo-mtd" };
+ * err = mfd_clone_cells("foo", fclones, ARRAY_SIZE(fclones));
+ *
+ * Each driver (MTD, GPIO, and platform driver) would then register
+ * platform_drivers for "foo-mtd", "foo-gpio", and "foo", respectively.
+ * The cell's .enable/.disable hooks should be used to deal with hardware
+ * resource contention.
+ */
+extern int mfd_clone_cell(const char *cell, const char **clones,
+		size_t n_clones);
+
+/*
  * Given a platform device that's been created by mfd_add_devices(), fetch
  * the mfd_cell that created it.
  */
@@ -87,13 +105,4 @@
 
 extern void mfd_remove_devices(struct device *parent);
 
-/*
- * For MFD drivers with clients sharing access to resources, these create
- * multiple platform devices per cell.  Contention handling must still be
- * handled via drivers (ie, with enable/disable hooks).
- */
-extern int mfd_shared_platform_driver_register(struct platform_driver *drv,
-		const char *cellname);
-extern void mfd_shared_platform_driver_unregister(struct platform_driver *drv);
-
 #endif
diff --git a/include/linux/mfd/max8997-private.h b/include/linux/mfd/max8997-private.h
index 93a9477..69d1010 100644
--- a/include/linux/mfd/max8997-private.h
+++ b/include/linux/mfd/max8997-private.h
@@ -24,6 +24,8 @@
 
 #include <linux/i2c.h>
 
+#define MAX8997_REG_INVALID	(0xff)
+
 enum max8997_pmic_reg {
 	MAX8997_REG_PMIC_ID0	= 0x00,
 	MAX8997_REG_PMIC_ID1	= 0x01,
@@ -313,6 +315,7 @@
 #define MAX8997_REG_BUCK2DVS(x)	(MAX8997_REG_BUCK2DVS1 + (x) - 1)
 #define MAX8997_REG_BUCK5DVS(x)	(MAX8997_REG_BUCK5DVS1 + (x) - 1)
 
+#define MAX8997_NUM_GPIO	12
 struct max8997_dev {
 	struct device *dev;
 	struct i2c_client *i2c; /* 0xcc / PMIC, Battery Control, and FLASH */
@@ -324,11 +327,19 @@
 	int type;
 	struct platform_device *battery; /* battery control (not fuel gauge) */
 
+	int irq;
+	int ono;
+	int irq_base;
 	bool wakeup;
+	struct mutex irqlock;
+	int irq_masks_cur[MAX8997_IRQ_GROUP_NR];
+	int irq_masks_cache[MAX8997_IRQ_GROUP_NR];
 
 	/* For hibernation */
 	u8 reg_dump[MAX8997_REG_PMIC_END + MAX8997_MUIC_REG_END +
 		MAX8997_HAPTIC_REG_END];
+
+	bool gpio_status[MAX8997_NUM_GPIO];
 };
 
 enum max8997_types {
@@ -336,6 +347,10 @@
 	TYPE_MAX8966,
 };
 
+extern int max8997_irq_init(struct max8997_dev *max8997);
+extern void max8997_irq_exit(struct max8997_dev *max8997);
+extern int max8997_irq_resume(struct max8997_dev *max8997);
+
 extern int max8997_read_reg(struct i2c_client *i2c, u8 reg, u8 *dest);
 extern int max8997_bulk_read(struct i2c_client *i2c, u8 reg, int count,
 				u8 *buf);
@@ -344,4 +359,10 @@
 				u8 *buf);
 extern int max8997_update_reg(struct i2c_client *i2c, u8 reg, u8 val, u8 mask);
 
+#define MAX8997_GPIO_INT_BOTH	(0x3 << 4)
+#define MAX8997_GPIO_INT_RISE	(0x2 << 4)
+#define MAX8997_GPIO_INT_FALL	(0x1 << 4)
+
+#define MAX8997_GPIO_INT_MASK	(0x3 << 4)
+#define MAX8997_GPIO_DATA_MASK	(0x1 << 2)
 #endif /*  __LINUX_MFD_MAX8997_PRIV_H */
diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h
index cb671b3..60931d0 100644
--- a/include/linux/mfd/max8997.h
+++ b/include/linux/mfd/max8997.h
@@ -78,8 +78,11 @@
 };
 
 struct max8997_platform_data {
-	bool wakeup;
-	/* IRQ: Not implemented */
+	/* IRQ */
+	int irq_base;
+	int ono;
+	int wakeup;
+
 	/* ---- PMIC ---- */
 	struct max8997_regulator_data *regulators;
 	int num_regulators;
diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h
index 26529eb..1bbd9f2 100644
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -36,6 +36,7 @@
 	struct mtd_info *mtd;
 	struct mutex lock;
 	int devnum;
+	bool bg_stop;
 	unsigned long size;
 	int readonly;
 	int open;
@@ -62,6 +63,7 @@
 		     unsigned long block, char *buffer);
 	int (*discard)(struct mtd_blktrans_dev *dev,
 		       unsigned long block, unsigned nr_blocks);
+	void (*background)(struct mtd_blktrans_dev *dev);
 
 	/* Block layer ioctls */
 	int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);
@@ -85,6 +87,7 @@
 extern int deregister_mtd_blktrans(struct mtd_blktrans_ops *tr);
 extern int add_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
 extern int del_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
+extern int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev);
 
 
 #endif /* __MTD_TRANS_H__ */
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index a9baee6..0d823f2 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -535,6 +535,7 @@
 #define CFI_MFR_CONTINUATION	0x007F
 
 #define CFI_MFR_AMD		0x0001
+#define CFI_MFR_AMIC		0x0037
 #define CFI_MFR_ATMEL		0x001F
 #define CFI_MFR_EON		0x001C
 #define CFI_MFR_FUJITSU		0x0004
diff --git a/include/linux/mtd/latch-addr-flash.h b/include/linux/mtd/latch-addr-flash.h
new file mode 100644
index 0000000..e94b8e1
--- /dev/null
+++ b/include/linux/mtd/latch-addr-flash.h
@@ -0,0 +1,29 @@
+/*
+ * Interface for NOR flash driver whose high address lines are latched
+ *
+ * Copyright © 2008 MontaVista Software, Inc. <source@mvista.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+#ifndef __LATCH_ADDR_FLASH__
+#define __LATCH_ADDR_FLASH__
+
+struct map_info;
+struct mtd_partition;
+
+struct latch_addr_flash_data {
+	unsigned int		width;
+	unsigned int		size;
+
+	int			(*init)(void *data, int cs);
+	void			(*done)(void *data);
+	void			(*set_window)(unsigned long offset, void *data);
+	void			*data;
+
+	unsigned int		nr_parts;
+	struct mtd_partition	*parts;
+};
+
+#endif
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 1f489b2..ae67ef5 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -140,6 +140,7 @@
 	NAND_ECC_HW,
 	NAND_ECC_HW_SYNDROME,
 	NAND_ECC_HW_OOB_FIRST,
+	NAND_ECC_SOFT_BCH,
 } nand_ecc_modes_t;
 
 /*
@@ -339,6 +340,7 @@
  * @prepad:	padding information for syndrome based ecc generators
  * @postpad:	padding information for syndrome based ecc generators
  * @layout:	ECC layout control struct pointer
+ * @priv:	pointer to private ecc control data
  * @hwctl:	function to control hardware ecc generator. Must only
  *		be provided if an hardware ECC is available
  * @calculate:	function for ecc calculation or readback from ecc hardware
@@ -362,6 +364,7 @@
 	int prepad;
 	int postpad;
 	struct nand_ecclayout	*layout;
+	void *priv;
 	void (*hwctl)(struct mtd_info *mtd, int mode);
 	int (*calculate)(struct mtd_info *mtd, const uint8_t *dat,
 			uint8_t *ecc_code);
diff --git a/include/linux/mtd/nand_bch.h b/include/linux/mtd/nand_bch.h
new file mode 100644
index 0000000..74acf53
--- /dev/null
+++ b/include/linux/mtd/nand_bch.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2011 Ivan Djelic <ivan.djelic@parrot.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This file is the header for the NAND BCH ECC implementation.
+ */
+
+#ifndef __MTD_NAND_BCH_H__
+#define __MTD_NAND_BCH_H__
+
+struct mtd_info;
+struct nand_bch_control;
+
+#if defined(CONFIG_MTD_NAND_ECC_BCH)
+
+static inline int mtd_nand_has_bch(void) { return 1; }
+
+/*
+ * Calculate BCH ecc code
+ */
+int nand_bch_calculate_ecc(struct mtd_info *mtd, const u_char *dat,
+			   u_char *ecc_code);
+
+/*
+ * Detect and correct bit errors
+ */
+int nand_bch_correct_data(struct mtd_info *mtd, u_char *dat, u_char *read_ecc,
+			  u_char *calc_ecc);
+/*
+ * Initialize BCH encoder/decoder
+ */
+struct nand_bch_control *
+nand_bch_init(struct mtd_info *mtd, unsigned int eccsize,
+	      unsigned int eccbytes, struct nand_ecclayout **ecclayout);
+/*
+ * Release BCH encoder/decoder resources
+ */
+void nand_bch_free(struct nand_bch_control *nbc);
+
+#else /* !CONFIG_MTD_NAND_ECC_BCH */
+
+static inline int mtd_nand_has_bch(void) { return 0; }
+
+static inline int
+nand_bch_calculate_ecc(struct mtd_info *mtd, const u_char *dat,
+		       u_char *ecc_code)
+{
+	return -1;
+}
+
+static inline int
+nand_bch_correct_data(struct mtd_info *mtd, unsigned char *buf,
+		      unsigned char *read_ecc, unsigned char *calc_ecc)
+{
+	return -1;
+}
+
+static inline struct nand_bch_control *
+nand_bch_init(struct mtd_info *mtd, unsigned int eccsize,
+	      unsigned int eccbytes, struct nand_ecclayout **ecclayout)
+{
+	return NULL;
+}
+
+static inline void nand_bch_free(struct nand_bch_control *nbc) {}
+
+#endif /* CONFIG_MTD_NAND_ECC_BCH */
+
+#endif /* __MTD_NAND_BCH_H__ */
diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h
index ae418e4..52b6f18 100644
--- a/include/linux/mtd/onenand.h
+++ b/include/linux/mtd/onenand.h
@@ -198,6 +198,7 @@
 #define ONENAND_SKIP_UNLOCK_CHECK	(0x0100)
 #define ONENAND_PAGEBUF_ALLOC		(0x1000)
 #define ONENAND_OOBBUF_ALLOC		(0x2000)
+#define ONENAND_SKIP_INITIAL_UNLOCKING	(0x4000)
 
 #define ONENAND_IS_4KB_PAGE(this)					\
 	(this->options & ONENAND_HAS_4KB_PAGE)
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 8023e4e..91af2e4 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -78,7 +78,6 @@
 					    struct page *page,
 					    unsigned int offset,
 					    unsigned int count);
-extern	void nfs_clear_request(struct nfs_page *req);
 extern	void nfs_release_request(struct nfs_page *req);
 
 
diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h
index 6a210f1..76579f9 100644
--- a/include/linux/regulator/ab8500.h
+++ b/include/linux/regulator/ab8500.h
@@ -3,8 +3,8 @@
  *
  * License Terms: GNU General Public License v2
  *
- * Author: Sundar Iyer <sundar.iyer@stericsson.com> for ST-Ericsson
- *
+ * Authors: Sundar Iyer <sundar.iyer@stericsson.com> for ST-Ericsson
+ *          Bengt Jonsson <bengt.g.jonsson@stericsson.com> for ST-Ericsson
  */
 
 #ifndef __LINUX_MFD_AB8500_REGULATOR_H
@@ -17,6 +17,7 @@
 	AB8500_LDO_AUX3,
 	AB8500_LDO_INTCORE,
 	AB8500_LDO_TVOUT,
+	AB8500_LDO_USB,
 	AB8500_LDO_AUDIO,
 	AB8500_LDO_ANAMIC1,
 	AB8500_LDO_ANAMIC2,
@@ -24,4 +25,50 @@
 	AB8500_LDO_ANA,
 	AB8500_NUM_REGULATORS,
 };
+
+/* AB8500 register initialization */
+struct ab8500_regulator_reg_init {
+	int id;
+	u8 value;
+};
+
+#define INIT_REGULATOR_REGISTER(_id, _value)	\
+	{					\
+		.id = _id,			\
+		.value = _value,		\
+	}
+
+/* AB8500 registers */
+enum ab8500_regulator_reg {
+	AB8500_REGUREQUESTCTRL2,
+	AB8500_REGUREQUESTCTRL3,
+	AB8500_REGUREQUESTCTRL4,
+	AB8500_REGUSYSCLKREQ1HPVALID1,
+	AB8500_REGUSYSCLKREQ1HPVALID2,
+	AB8500_REGUHWHPREQ1VALID1,
+	AB8500_REGUHWHPREQ1VALID2,
+	AB8500_REGUHWHPREQ2VALID1,
+	AB8500_REGUHWHPREQ2VALID2,
+	AB8500_REGUSWHPREQVALID1,
+	AB8500_REGUSWHPREQVALID2,
+	AB8500_REGUSYSCLKREQVALID1,
+	AB8500_REGUSYSCLKREQVALID2,
+	AB8500_REGUMISC1,
+	AB8500_VAUDIOSUPPLY,
+	AB8500_REGUCTRL1VAMIC,
+	AB8500_VPLLVANAREGU,
+	AB8500_VREFDDR,
+	AB8500_EXTSUPPLYREGU,
+	AB8500_VAUX12REGU,
+	AB8500_VRF1VAUX3REGU,
+	AB8500_VAUX1SEL,
+	AB8500_VAUX2SEL,
+	AB8500_VRF1VAUX3SEL,
+	AB8500_REGUCTRL2SPARE,
+	AB8500_REGUCTRLDISCH,
+	AB8500_REGUCTRLDISCH2,
+	AB8500_VSMPS1SEL1,
+	AB8500_NUM_REGULATOR_REGISTERS,
+};
+
 #endif
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 7954f6b..9e87c1c 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -153,6 +153,8 @@
 int regulator_is_supported_voltage(struct regulator *regulator,
 				   int min_uV, int max_uV);
 int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV);
+int regulator_set_voltage_time(struct regulator *regulator,
+			       int old_uV, int new_uV);
 int regulator_get_voltage(struct regulator *regulator);
 int regulator_sync_voltage(struct regulator *regulator);
 int regulator_set_current_limit(struct regulator *regulator,
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index b8ed16a..6c433b8 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -63,7 +63,11 @@
  *                    when running with the specified parameters.
  *
  * @enable_time: Time taken for the regulator voltage output voltage to
- *               stabalise after being enabled, in microseconds.
+ *               stabilise after being enabled, in microseconds.
+ * @set_voltage_time_sel: Time taken for the regulator voltage output voltage
+ *               to stabilise after being set to a new value, in microseconds.
+ *               The function provides the from and to voltage selector, the
+ *               function should return the worst case.
  *
  * @set_suspend_voltage: Set the voltage for the regulator when the system
  *                       is suspended.
@@ -103,8 +107,11 @@
 	int (*set_mode) (struct regulator_dev *, unsigned int mode);
 	unsigned int (*get_mode) (struct regulator_dev *);
 
-	/* Time taken to enable the regulator */
+	/* Time taken to enable or set voltage on the regulator */
 	int (*enable_time) (struct regulator_dev *);
+	int (*set_voltage_time_sel) (struct regulator_dev *,
+				     unsigned int old_selector,
+				     unsigned int new_selector);
 
 	/* report regulator status ... most other accessors report
 	 * control inputs, this reports results of combining inputs
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 761c745..c4c4fc4 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -186,6 +186,7 @@
 };
 
 int regulator_suspend_prepare(suspend_state_t state);
+int regulator_suspend_finish(void);
 
 #ifdef CONFIG_REGULATOR
 void regulator_has_full_constraints(void);
diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 430a9cc..e1bad11 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -1031,9 +1031,7 @@
 #define snd_pcm_lib_mmap_iomem	NULL
 #endif
 
-int snd_pcm_lib_mmap_noncached(struct snd_pcm_substream *substream,
-			       struct vm_area_struct *area);
-#define snd_pcm_lib_mmap_vmalloc	snd_pcm_lib_mmap_noncached
+#define snd_pcm_lib_mmap_vmalloc NULL
 
 static inline void snd_pcm_limit_isa_dma_size(int dma, size_t *max)
 {
diff --git a/ipc/util.c b/ipc/util.c
index 8fd1b89..5c0d289 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -317,6 +317,7 @@
 
 /**
  *	ipc_check_perms	-	check security and permissions for an IPC
+ *	@ns: IPC namespace
  *	@ipcp: ipc permission set
  *	@ops: the actual security routine to call
  *	@params: its parameters
@@ -607,6 +608,7 @@
 
 /**
  *	ipcperms	-	check IPC permissions
+ *	@ns: IPC namespace
  *	@ipcp: IPC permission set
  *	@flag: desired permission set.
  *
@@ -769,7 +771,7 @@
 
 /**
  * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd
- * @ids:  the ipc namespace
+ * @ns:  the ipc namespace
  * @ids:  the table of ids where to look for the ipc
  * @id:   the id of the ipc to retrieve
  * @cmd:  the cmd to check
diff --git a/lib/Kconfig b/lib/Kconfig
index 23fa7a3..9c10e38 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -158,6 +158,45 @@
 	boolean
 
 #
+# BCH support is selected if needed
+#
+config BCH
+	tristate
+
+config BCH_CONST_PARAMS
+	boolean
+	help
+	  Drivers may select this option to force specific constant
+	  values for parameters 'm' (Galois field order) and 't'
+	  (error correction capability). Those specific values must
+	  be set by declaring default values for symbols BCH_CONST_M
+	  and BCH_CONST_T.
+	  Doing so will enable extra compiler optimizations,
+	  improving encoding and decoding performance up to 2x for
+	  usual (m,t) values (typically such that m*t < 200).
+	  When this option is selected, the BCH library supports
+	  only a single (m,t) configuration. This is mainly useful
+	  for NAND flash board drivers requiring known, fixed BCH
+	  parameters.
+
+config BCH_CONST_M
+	int
+	range 5 15
+	help
+	  Constant value for Galois field order 'm'. If 'k' is the
+	  number of data bits to protect, 'm' should be chosen such
+	  that (k + m*t) <= 2**m - 1.
+	  Drivers should declare a default value for this symbol if
+	  they select option BCH_CONST_PARAMS.
+
+config BCH_CONST_T
+	int
+	help
+	  Constant value for error correction capability in bits 't'.
+	  Drivers should declare a default value for this symbol if
+	  they select option BCH_CONST_PARAMS.
+
+#
 # Textsearch support is select'ed if needed
 #
 config TEXTSEARCH
diff --git a/lib/Makefile b/lib/Makefile
index d7872b5..ef0f285 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -69,6 +69,7 @@
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
+obj-$(CONFIG_BCH) += bch.o
 obj-$(CONFIG_LZO_COMPRESS) += lzo/
 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
 obj-$(CONFIG_XZ_DEC) += xz/
diff --git a/lib/bch.c b/lib/bch.c
new file mode 100644
index 0000000..bc89dfe
--- /dev/null
+++ b/lib/bch.c
@@ -0,0 +1,1368 @@
+/*
+ * Generic binary BCH encoding/decoding library
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Copyright © 2011 Parrot S.A.
+ *
+ * Author: Ivan Djelic <ivan.djelic@parrot.com>
+ *
+ * Description:
+ *
+ * This library provides runtime configurable encoding/decoding of binary
+ * Bose-Chaudhuri-Hocquenghem (BCH) codes.
+ *
+ * Call init_bch to get a pointer to a newly allocated bch_control structure for
+ * the given m (Galois field order), t (error correction capability) and
+ * (optional) primitive polynomial parameters.
+ *
+ * Call encode_bch to compute and store ecc parity bytes to a given buffer.
+ * Call decode_bch to detect and locate errors in received data.
+ *
+ * On systems supporting hw BCH features, intermediate results may be provided
+ * to decode_bch in order to skip certain steps. See decode_bch() documentation
+ * for details.
+ *
+ * Option CONFIG_BCH_CONST_PARAMS can be used to force fixed values of
+ * parameters m and t; thus allowing extra compiler optimizations and providing
+ * better (up to 2x) encoding performance. Using this option makes sense when
+ * (m,t) are fixed and known in advance, e.g. when using BCH error correction
+ * on a particular NAND flash device.
+ *
+ * Algorithmic details:
+ *
+ * Encoding is performed by processing 32 input bits in parallel, using 4
+ * remainder lookup tables.
+ *
+ * The final stage of decoding involves the following internal steps:
+ * a. Syndrome computation
+ * b. Error locator polynomial computation using Berlekamp-Massey algorithm
+ * c. Error locator root finding (by far the most expensive step)
+ *
+ * In this implementation, step c is not performed using the usual Chien search.
+ * Instead, an alternative approach described in [1] is used. It consists in
+ * factoring the error locator polynomial using the Berlekamp Trace algorithm
+ * (BTA) down to a certain degree (4), after which ad hoc low-degree polynomial
+ * solving techniques [2] are used. The resulting algorithm, called BTZ, yields
+ * much better performance than Chien search for usual (m,t) values (typically
+ * m >= 13, t < 32, see [1]).
+ *
+ * [1] B. Biswas, V. Herbert. Efficient root finding of polynomials over fields
+ * of characteristic 2, in: Western European Workshop on Research in Cryptology
+ * - WEWoRC 2009, Graz, Austria, LNCS, Springer, July 2009, to appear.
+ * [2] [Zin96] V.A. Zinoviev. On the solution of equations of degree 10 over
+ * finite fields GF(2^q). In Rapport de recherche INRIA no 2829, 1996.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include <linux/bch.h>
+
+#if defined(CONFIG_BCH_CONST_PARAMS)
+#define GF_M(_p)               (CONFIG_BCH_CONST_M)
+#define GF_T(_p)               (CONFIG_BCH_CONST_T)
+#define GF_N(_p)               ((1 << (CONFIG_BCH_CONST_M))-1)
+#else
+#define GF_M(_p)               ((_p)->m)
+#define GF_T(_p)               ((_p)->t)
+#define GF_N(_p)               ((_p)->n)
+#endif
+
+#define BCH_ECC_WORDS(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32)
+#define BCH_ECC_BYTES(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 8)
+
+#ifndef dbg
+#define dbg(_fmt, args...)     do {} while (0)
+#endif
+
+/*
+ * represent a polynomial over GF(2^m)
+ */
+struct gf_poly {
+	unsigned int deg;    /* polynomial degree */
+	unsigned int c[0];   /* polynomial terms */
+};
+
+/* given its degree, compute a polynomial size in bytes */
+#define GF_POLY_SZ(_d) (sizeof(struct gf_poly)+((_d)+1)*sizeof(unsigned int))
+
+/* polynomial of degree 1 */
+struct gf_poly_deg1 {
+	struct gf_poly poly;
+	unsigned int   c[2];
+};
+
+/*
+ * same as encode_bch(), but process input data one byte at a time
+ */
+static void encode_bch_unaligned(struct bch_control *bch,
+				 const unsigned char *data, unsigned int len,
+				 uint32_t *ecc)
+{
+	int i;
+	const uint32_t *p;
+	const int l = BCH_ECC_WORDS(bch)-1;
+
+	while (len--) {
+		p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(*data++)) & 0xff);
+
+		for (i = 0; i < l; i++)
+			ecc[i] = ((ecc[i] << 8)|(ecc[i+1] >> 24))^(*p++);
+
+		ecc[l] = (ecc[l] << 8)^(*p);
+	}
+}
+
+/*
+ * convert ecc bytes to aligned, zero-padded 32-bit ecc words
+ */
+static void load_ecc8(struct bch_control *bch, uint32_t *dst,
+		      const uint8_t *src)
+{
+	uint8_t pad[4] = {0, 0, 0, 0};
+	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
+
+	for (i = 0; i < nwords; i++, src += 4)
+		dst[i] = (src[0] << 24)|(src[1] << 16)|(src[2] << 8)|src[3];
+
+	memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords);
+	dst[nwords] = (pad[0] << 24)|(pad[1] << 16)|(pad[2] << 8)|pad[3];
+}
+
+/*
+ * convert 32-bit ecc words to ecc bytes
+ */
+static void store_ecc8(struct bch_control *bch, uint8_t *dst,
+		       const uint32_t *src)
+{
+	uint8_t pad[4];
+	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
+
+	for (i = 0; i < nwords; i++) {
+		*dst++ = (src[i] >> 24);
+		*dst++ = (src[i] >> 16) & 0xff;
+		*dst++ = (src[i] >>  8) & 0xff;
+		*dst++ = (src[i] >>  0) & 0xff;
+	}
+	pad[0] = (src[nwords] >> 24);
+	pad[1] = (src[nwords] >> 16) & 0xff;
+	pad[2] = (src[nwords] >>  8) & 0xff;
+	pad[3] = (src[nwords] >>  0) & 0xff;
+	memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords);
+}
+
+/**
+ * encode_bch - calculate BCH ecc parity of data
+ * @bch:   BCH control structure
+ * @data:  data to encode
+ * @len:   data length in bytes
+ * @ecc:   ecc parity data, must be initialized by caller
+ *
+ * The @ecc parity array is used both as input and output parameter, in order to
+ * allow incremental computations. It should be of the size indicated by member
+ * @ecc_bytes of @bch, and should be initialized to 0 before the first call.
+ *
+ * The exact number of computed ecc parity bits is given by member @ecc_bits of
+ * @bch; it may be less than m*t for large values of t.
+ */
+void encode_bch(struct bch_control *bch, const uint8_t *data,
+		unsigned int len, uint8_t *ecc)
+{
+	const unsigned int l = BCH_ECC_WORDS(bch)-1;
+	unsigned int i, mlen;
+	unsigned long m;
+	uint32_t w, r[l+1];
+	const uint32_t * const tab0 = bch->mod8_tab;
+	const uint32_t * const tab1 = tab0 + 256*(l+1);
+	const uint32_t * const tab2 = tab1 + 256*(l+1);
+	const uint32_t * const tab3 = tab2 + 256*(l+1);
+	const uint32_t *pdata, *p0, *p1, *p2, *p3;
+
+	if (ecc) {
+		/* load ecc parity bytes into internal 32-bit buffer */
+		load_ecc8(bch, bch->ecc_buf, ecc);
+	} else {
+		memset(bch->ecc_buf, 0, sizeof(r));
+	}
+
+	/* process first unaligned data bytes */
+	m = ((unsigned long)data) & 3;
+	if (m) {
+		mlen = (len < (4-m)) ? len : 4-m;
+		encode_bch_unaligned(bch, data, mlen, bch->ecc_buf);
+		data += mlen;
+		len  -= mlen;
+	}
+
+	/* process 32-bit aligned data words */
+	pdata = (uint32_t *)data;
+	mlen  = len/4;
+	data += 4*mlen;
+	len  -= 4*mlen;
+	memcpy(r, bch->ecc_buf, sizeof(r));
+
+	/*
+	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
+	 *
+	 * 31 ...24  23 ...16  15 ... 8  7 ... 0
+	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt
+	 *                               tttttttt  mod g = r0 (precomputed)
+	 *                     zzzzzzzz  00000000  mod g = r1 (precomputed)
+	 *           yyyyyyyy  00000000  00000000  mod g = r2 (precomputed)
+	 * xxxxxxxx  00000000  00000000  00000000  mod g = r3 (precomputed)
+	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
+	 */
+	while (mlen--) {
+		/* input data is read in big-endian format */
+		w = r[0]^cpu_to_be32(*pdata++);
+		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
+		p1 = tab1 + (l+1)*((w >>  8) & 0xff);
+		p2 = tab2 + (l+1)*((w >> 16) & 0xff);
+		p3 = tab3 + (l+1)*((w >> 24) & 0xff);
+
+		for (i = 0; i < l; i++)
+			r[i] = r[i+1]^p0[i]^p1[i]^p2[i]^p3[i];
+
+		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
+	}
+	memcpy(bch->ecc_buf, r, sizeof(r));
+
+	/* process last unaligned bytes */
+	if (len)
+		encode_bch_unaligned(bch, data, len, bch->ecc_buf);
+
+	/* store ecc parity bytes into original parity buffer */
+	if (ecc)
+		store_ecc8(bch, ecc, bch->ecc_buf);
+}
+EXPORT_SYMBOL_GPL(encode_bch);
+
+static inline int modulo(struct bch_control *bch, unsigned int v)
+{
+	const unsigned int n = GF_N(bch);
+	while (v >= n) {
+		v -= n;
+		v = (v & n) + (v >> GF_M(bch));
+	}
+	return v;
+}
+
+/*
+ * shorter and faster modulo function, only works when v < 2N.
+ */
+static inline int mod_s(struct bch_control *bch, unsigned int v)
+{
+	const unsigned int n = GF_N(bch);
+	return (v < n) ? v : v-n;
+}
+
+static inline int deg(unsigned int poly)
+{
+	/* polynomial degree is the most-significant bit index */
+	return fls(poly)-1;
+}
+
+static inline int parity(unsigned int x)
+{
+	/*
+	 * public domain code snippet, lifted from
+	 * http://www-graphics.stanford.edu/~seander/bithacks.html
+	 */
+	x ^= x >> 1;
+	x ^= x >> 2;
+	x = (x & 0x11111111U) * 0x11111111U;
+	return (x >> 28) & 1;
+}
+
+/* Galois field basic operations: multiply, divide, inverse, etc. */
+
+static inline unsigned int gf_mul(struct bch_control *bch, unsigned int a,
+				  unsigned int b)
+{
+	return (a && b) ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
+					       bch->a_log_tab[b])] : 0;
+}
+
+static inline unsigned int gf_sqr(struct bch_control *bch, unsigned int a)
+{
+	return a ? bch->a_pow_tab[mod_s(bch, 2*bch->a_log_tab[a])] : 0;
+}
+
+static inline unsigned int gf_div(struct bch_control *bch, unsigned int a,
+				  unsigned int b)
+{
+	return a ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
+					GF_N(bch)-bch->a_log_tab[b])] : 0;
+}
+
+static inline unsigned int gf_inv(struct bch_control *bch, unsigned int a)
+{
+	return bch->a_pow_tab[GF_N(bch)-bch->a_log_tab[a]];
+}
+
+static inline unsigned int a_pow(struct bch_control *bch, int i)
+{
+	return bch->a_pow_tab[modulo(bch, i)];
+}
+
+static inline int a_log(struct bch_control *bch, unsigned int x)
+{
+	return bch->a_log_tab[x];
+}
+
+static inline int a_ilog(struct bch_control *bch, unsigned int x)
+{
+	return mod_s(bch, GF_N(bch)-bch->a_log_tab[x]);
+}
+
+/*
+ * compute 2t syndromes of ecc polynomial, i.e. ecc(a^j) for j=1..2t
+ */
+static void compute_syndromes(struct bch_control *bch, uint32_t *ecc,
+			      unsigned int *syn)
+{
+	int i, j, s;
+	unsigned int m;
+	uint32_t poly;
+	const int t = GF_T(bch);
+
+	s = bch->ecc_bits;
+
+	/* make sure extra bits in last ecc word are cleared */
+	m = ((unsigned int)s) & 31;
+	if (m)
+		ecc[s/32] &= ~((1u << (32-m))-1);
+	memset(syn, 0, 2*t*sizeof(*syn));
+
+	/* compute v(a^j) for j=1 .. 2t-1 */
+	do {
+		poly = *ecc++;
+		s -= 32;
+		while (poly) {
+			i = deg(poly);
+			for (j = 0; j < 2*t; j += 2)
+				syn[j] ^= a_pow(bch, (j+1)*(i+s));
+
+			poly ^= (1 << i);
+		}
+	} while (s > 0);
+
+	/* v(a^(2j)) = v(a^j)^2 */
+	for (j = 0; j < t; j++)
+		syn[2*j+1] = gf_sqr(bch, syn[j]);
+}
+
+static void gf_poly_copy(struct gf_poly *dst, struct gf_poly *src)
+{
+	memcpy(dst, src, GF_POLY_SZ(src->deg));
+}
+
+static int compute_error_locator_polynomial(struct bch_control *bch,
+					    const unsigned int *syn)
+{
+	const unsigned int t = GF_T(bch);
+	const unsigned int n = GF_N(bch);
+	unsigned int i, j, tmp, l, pd = 1, d = syn[0];
+	struct gf_poly *elp = bch->elp;
+	struct gf_poly *pelp = bch->poly_2t[0];
+	struct gf_poly *elp_copy = bch->poly_2t[1];
+	int k, pp = -1;
+
+	memset(pelp, 0, GF_POLY_SZ(2*t));
+	memset(elp, 0, GF_POLY_SZ(2*t));
+
+	pelp->deg = 0;
+	pelp->c[0] = 1;
+	elp->deg = 0;
+	elp->c[0] = 1;
+
+	/* use simplified binary Berlekamp-Massey algorithm */
+	for (i = 0; (i < t) && (elp->deg <= t); i++) {
+		if (d) {
+			k = 2*i-pp;
+			gf_poly_copy(elp_copy, elp);
+			/* e[i+1](X) = e[i](X)+di*dp^-1*X^2(i-p)*e[p](X) */
+			tmp = a_log(bch, d)+n-a_log(bch, pd);
+			for (j = 0; j <= pelp->deg; j++) {
+				if (pelp->c[j]) {
+					l = a_log(bch, pelp->c[j]);
+					elp->c[j+k] ^= a_pow(bch, tmp+l);
+				}
+			}
+			/* compute l[i+1] = max(l[i]->c[l[p]+2*(i-p]) */
+			tmp = pelp->deg+k;
+			if (tmp > elp->deg) {
+				elp->deg = tmp;
+				gf_poly_copy(pelp, elp_copy);
+				pd = d;
+				pp = 2*i;
+			}
+		}
+		/* di+1 = S(2i+3)+elp[i+1].1*S(2i+2)+...+elp[i+1].lS(2i+3-l) */
+		if (i < t-1) {
+			d = syn[2*i+2];
+			for (j = 1; j <= elp->deg; j++)
+				d ^= gf_mul(bch, elp->c[j], syn[2*i+2-j]);
+		}
+	}
+	dbg("elp=%s\n", gf_poly_str(elp));
+	return (elp->deg > t) ? -1 : (int)elp->deg;
+}
+
+/*
+ * solve a m x m linear system in GF(2) with an expected number of solutions,
+ * and return the number of found solutions
+ */
+static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
+			       unsigned int *sol, int nsol)
+{
+	const int m = GF_M(bch);
+	unsigned int tmp, mask;
+	int rem, c, r, p, k, param[m];
+
+	k = 0;
+	mask = 1 << m;
+
+	/* Gaussian elimination */
+	for (c = 0; c < m; c++) {
+		rem = 0;
+		p = c-k;
+		/* find suitable row for elimination */
+		for (r = p; r < m; r++) {
+			if (rows[r] & mask) {
+				if (r != p) {
+					tmp = rows[r];
+					rows[r] = rows[p];
+					rows[p] = tmp;
+				}
+				rem = r+1;
+				break;
+			}
+		}
+		if (rem) {
+			/* perform elimination on remaining rows */
+			tmp = rows[p];
+			for (r = rem; r < m; r++) {
+				if (rows[r] & mask)
+					rows[r] ^= tmp;
+			}
+		} else {
+			/* elimination not needed, store defective row index */
+			param[k++] = c;
+		}
+		mask >>= 1;
+	}
+	/* rewrite system, inserting fake parameter rows */
+	if (k > 0) {
+		p = k;
+		for (r = m-1; r >= 0; r--) {
+			if ((r > m-1-k) && rows[r])
+				/* system has no solution */
+				return 0;
+
+			rows[r] = (p && (r == param[p-1])) ?
+				p--, 1u << (m-r) : rows[r-p];
+		}
+	}
+
+	if (nsol != (1 << k))
+		/* unexpected number of solutions */
+		return 0;
+
+	for (p = 0; p < nsol; p++) {
+		/* set parameters for p-th solution */
+		for (c = 0; c < k; c++)
+			rows[param[c]] = (rows[param[c]] & ~1)|((p >> c) & 1);
+
+		/* compute unique solution */
+		tmp = 0;
+		for (r = m-1; r >= 0; r--) {
+			mask = rows[r] & (tmp|1);
+			tmp |= parity(mask) << (m-r);
+		}
+		sol[p] = tmp >> 1;
+	}
+	return nsol;
+}
+
+/*
+ * this function builds and solves a linear system for finding roots of a degree
+ * 4 affine monic polynomial X^4+aX^2+bX+c over GF(2^m).
+ */
+static int find_affine4_roots(struct bch_control *bch, unsigned int a,
+			      unsigned int b, unsigned int c,
+			      unsigned int *roots)
+{
+	int i, j, k;
+	const int m = GF_M(bch);
+	unsigned int mask = 0xff, t, rows[16] = {0,};
+
+	j = a_log(bch, b);
+	k = a_log(bch, a);
+	rows[0] = c;
+
+	/* buid linear system to solve X^4+aX^2+bX+c = 0 */
+	for (i = 0; i < m; i++) {
+		rows[i+1] = bch->a_pow_tab[4*i]^
+			(a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^
+			(b ? bch->a_pow_tab[mod_s(bch, j)] : 0);
+		j++;
+		k += 2;
+	}
+	/*
+	 * transpose 16x16 matrix before passing it to linear solver
+	 * warning: this code assumes m < 16
+	 */
+	for (j = 8; j != 0; j >>= 1, mask ^= (mask << j)) {
+		for (k = 0; k < 16; k = (k+j+1) & ~j) {
+			t = ((rows[k] >> j)^rows[k+j]) & mask;
+			rows[k] ^= (t << j);
+			rows[k+j] ^= t;
+		}
+	}
+	return solve_linear_system(bch, rows, roots, 4);
+}
+
+/*
+ * compute root r of a degree 1 polynomial over GF(2^m) (returned as log(1/r))
+ */
+static int find_poly_deg1_roots(struct bch_control *bch, struct gf_poly *poly,
+				unsigned int *roots)
+{
+	int n = 0;
+
+	if (poly->c[0])
+		/* poly[X] = bX+c with c!=0, root=c/b */
+		roots[n++] = mod_s(bch, GF_N(bch)-bch->a_log_tab[poly->c[0]]+
+				   bch->a_log_tab[poly->c[1]]);
+	return n;
+}
+
+/*
+ * compute roots of a degree 2 polynomial over GF(2^m)
+ */
+static int find_poly_deg2_roots(struct bch_control *bch, struct gf_poly *poly,
+				unsigned int *roots)
+{
+	int n = 0, i, l0, l1, l2;
+	unsigned int u, v, r;
+
+	if (poly->c[0] && poly->c[1]) {
+
+		l0 = bch->a_log_tab[poly->c[0]];
+		l1 = bch->a_log_tab[poly->c[1]];
+		l2 = bch->a_log_tab[poly->c[2]];
+
+		/* using z=a/bX, transform aX^2+bX+c into z^2+z+u (u=ac/b^2) */
+		u = a_pow(bch, l0+l2+2*(GF_N(bch)-l1));
+		/*
+		 * let u = sum(li.a^i) i=0..m-1; then compute r = sum(li.xi):
+		 * r^2+r = sum(li.(xi^2+xi)) = sum(li.(a^i+Tr(a^i).a^k)) =
+		 * u + sum(li.Tr(a^i).a^k) = u+a^k.Tr(sum(li.a^i)) = u+a^k.Tr(u)
+		 * i.e. r and r+1 are roots iff Tr(u)=0
+		 */
+		r = 0;
+		v = u;
+		while (v) {
+			i = deg(v);
+			r ^= bch->xi_tab[i];
+			v ^= (1 << i);
+		}
+		/* verify root */
+		if ((gf_sqr(bch, r)^r) == u) {
+			/* reverse z=a/bX transformation and compute log(1/r) */
+			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
+					    bch->a_log_tab[r]+l2);
+			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
+					    bch->a_log_tab[r^1]+l2);
+		}
+	}
+	return n;
+}
+
+/*
+ * compute roots of a degree 3 polynomial over GF(2^m)
+ */
+static int find_poly_deg3_roots(struct bch_control *bch, struct gf_poly *poly,
+				unsigned int *roots)
+{
+	int i, n = 0;
+	unsigned int a, b, c, a2, b2, c2, e3, tmp[4];
+
+	if (poly->c[0]) {
+		/* transform polynomial into monic X^3 + a2X^2 + b2X + c2 */
+		e3 = poly->c[3];
+		c2 = gf_div(bch, poly->c[0], e3);
+		b2 = gf_div(bch, poly->c[1], e3);
+		a2 = gf_div(bch, poly->c[2], e3);
+
+		/* (X+a2)(X^3+a2X^2+b2X+c2) = X^4+aX^2+bX+c (affine) */
+		c = gf_mul(bch, a2, c2);           /* c = a2c2      */
+		b = gf_mul(bch, a2, b2)^c2;        /* b = a2b2 + c2 */
+		a = gf_sqr(bch, a2)^b2;            /* a = a2^2 + b2 */
+
+		/* find the 4 roots of this affine polynomial */
+		if (find_affine4_roots(bch, a, b, c, tmp) == 4) {
+			/* remove a2 from final list of roots */
+			for (i = 0; i < 4; i++) {
+				if (tmp[i] != a2)
+					roots[n++] = a_ilog(bch, tmp[i]);
+			}
+		}
+	}
+	return n;
+}
+
+/*
+ * compute roots of a degree 4 polynomial over GF(2^m)
+ */
+static int find_poly_deg4_roots(struct bch_control *bch, struct gf_poly *poly,
+				unsigned int *roots)
+{
+	int i, l, n = 0;
+	unsigned int a, b, c, d, e = 0, f, a2, b2, c2, e4;
+
+	if (poly->c[0] == 0)
+		return 0;
+
+	/* transform polynomial into monic X^4 + aX^3 + bX^2 + cX + d */
+	e4 = poly->c[4];
+	d = gf_div(bch, poly->c[0], e4);
+	c = gf_div(bch, poly->c[1], e4);
+	b = gf_div(bch, poly->c[2], e4);
+	a = gf_div(bch, poly->c[3], e4);
+
+	/* use Y=1/X transformation to get an affine polynomial */
+	if (a) {
+		/* first, eliminate cX by using z=X+e with ae^2+c=0 */
+		if (c) {
+			/* compute e such that e^2 = c/a */
+			f = gf_div(bch, c, a);
+			l = a_log(bch, f);
+			l += (l & 1) ? GF_N(bch) : 0;
+			e = a_pow(bch, l/2);
+			/*
+			 * use transformation z=X+e:
+			 * z^4+e^4 + a(z^3+ez^2+e^2z+e^3) + b(z^2+e^2) +cz+ce+d
+			 * z^4 + az^3 + (ae+b)z^2 + (ae^2+c)z+e^4+be^2+ae^3+ce+d
+			 * z^4 + az^3 + (ae+b)z^2 + e^4+be^2+d
+			 * z^4 + az^3 +     b'z^2 + d'
+			 */
+			d = a_pow(bch, 2*l)^gf_mul(bch, b, f)^d;
+			b = gf_mul(bch, a, e)^b;
+		}
+		/* now, use Y=1/X to get Y^4 + b/dY^2 + a/dY + 1/d */
+		if (d == 0)
+			/* assume all roots have multiplicity 1 */
+			return 0;
+
+		c2 = gf_inv(bch, d);
+		b2 = gf_div(bch, a, d);
+		a2 = gf_div(bch, b, d);
+	} else {
+		/* polynomial is already affine */
+		c2 = d;
+		b2 = c;
+		a2 = b;
+	}
+	/* find the 4 roots of this affine polynomial */
+	if (find_affine4_roots(bch, a2, b2, c2, roots) == 4) {
+		for (i = 0; i < 4; i++) {
+			/* post-process roots (reverse transformations) */
+			f = a ? gf_inv(bch, roots[i]) : roots[i];
+			roots[i] = a_ilog(bch, f^e);
+		}
+		n = 4;
+	}
+	return n;
+}
+
+/*
+ * build monic, log-based representation of a polynomial
+ */
+static void gf_poly_logrep(struct bch_control *bch,
+			   const struct gf_poly *a, int *rep)
+{
+	int i, d = a->deg, l = GF_N(bch)-a_log(bch, a->c[a->deg]);
+
+	/* represent 0 values with -1; warning, rep[d] is not set to 1 */
+	for (i = 0; i < d; i++)
+		rep[i] = a->c[i] ? mod_s(bch, a_log(bch, a->c[i])+l) : -1;
+}
+
+/*
+ * compute polynomial Euclidean division remainder in GF(2^m)[X]
+ */
+static void gf_poly_mod(struct bch_control *bch, struct gf_poly *a,
+			const struct gf_poly *b, int *rep)
+{
+	int la, p, m;
+	unsigned int i, j, *c = a->c;
+	const unsigned int d = b->deg;
+
+	if (a->deg < d)
+		return;
+
+	/* reuse or compute log representation of denominator */
+	if (!rep) {
+		rep = bch->cache;
+		gf_poly_logrep(bch, b, rep);
+	}
+
+	for (j = a->deg; j >= d; j--) {
+		if (c[j]) {
+			la = a_log(bch, c[j]);
+			p = j-d;
+			for (i = 0; i < d; i++, p++) {
+				m = rep[i];
+				if (m >= 0)
+					c[p] ^= bch->a_pow_tab[mod_s(bch,
+								     m+la)];
+			}
+		}
+	}
+	a->deg = d-1;
+	while (!c[a->deg] && a->deg)
+		a->deg--;
+}
+
+/*
+ * compute polynomial Euclidean division quotient in GF(2^m)[X]
+ */
+static void gf_poly_div(struct bch_control *bch, struct gf_poly *a,
+			const struct gf_poly *b, struct gf_poly *q)
+{
+	if (a->deg >= b->deg) {
+		q->deg = a->deg-b->deg;
+		/* compute a mod b (modifies a) */
+		gf_poly_mod(bch, a, b, NULL);
+		/* quotient is stored in upper part of polynomial a */
+		memcpy(q->c, &a->c[b->deg], (1+q->deg)*sizeof(unsigned int));
+	} else {
+		q->deg = 0;
+		q->c[0] = 0;
+	}
+}
+
+/*
+ * compute polynomial GCD (Greatest Common Divisor) in GF(2^m)[X]
+ */
+static struct gf_poly *gf_poly_gcd(struct bch_control *bch, struct gf_poly *a,
+				   struct gf_poly *b)
+{
+	struct gf_poly *tmp;
+
+	dbg("gcd(%s,%s)=", gf_poly_str(a), gf_poly_str(b));
+
+	if (a->deg < b->deg) {
+		tmp = b;
+		b = a;
+		a = tmp;
+	}
+
+	while (b->deg > 0) {
+		gf_poly_mod(bch, a, b, NULL);
+		tmp = b;
+		b = a;
+		a = tmp;
+	}
+
+	dbg("%s\n", gf_poly_str(a));
+
+	return a;
+}
+
+/*
+ * Given a polynomial f and an integer k, compute Tr(a^kX) mod f
+ * This is used in Berlekamp Trace algorithm for splitting polynomials
+ */
+static void compute_trace_bk_mod(struct bch_control *bch, int k,
+				 const struct gf_poly *f, struct gf_poly *z,
+				 struct gf_poly *out)
+{
+	const int m = GF_M(bch);
+	int i, j;
+
+	/* z contains z^2j mod f */
+	z->deg = 1;
+	z->c[0] = 0;
+	z->c[1] = bch->a_pow_tab[k];
+
+	out->deg = 0;
+	memset(out, 0, GF_POLY_SZ(f->deg));
+
+	/* compute f log representation only once */
+	gf_poly_logrep(bch, f, bch->cache);
+
+	for (i = 0; i < m; i++) {
+		/* add a^(k*2^i)(z^(2^i) mod f) and compute (z^(2^i) mod f)^2 */
+		for (j = z->deg; j >= 0; j--) {
+			out->c[j] ^= z->c[j];
+			z->c[2*j] = gf_sqr(bch, z->c[j]);
+			z->c[2*j+1] = 0;
+		}
+		if (z->deg > out->deg)
+			out->deg = z->deg;
+
+		if (i < m-1) {
+			z->deg *= 2;
+			/* z^(2(i+1)) mod f = (z^(2^i) mod f)^2 mod f */
+			gf_poly_mod(bch, z, f, bch->cache);
+		}
+	}
+	while (!out->c[out->deg] && out->deg)
+		out->deg--;
+
+	dbg("Tr(a^%d.X) mod f = %s\n", k, gf_poly_str(out));
+}
+
+/*
+ * factor a polynomial using Berlekamp Trace algorithm (BTA)
+ */
+static void factor_polynomial(struct bch_control *bch, int k, struct gf_poly *f,
+			      struct gf_poly **g, struct gf_poly **h)
+{
+	struct gf_poly *f2 = bch->poly_2t[0];
+	struct gf_poly *q  = bch->poly_2t[1];
+	struct gf_poly *tk = bch->poly_2t[2];
+	struct gf_poly *z  = bch->poly_2t[3];
+	struct gf_poly *gcd;
+
+	dbg("factoring %s...\n", gf_poly_str(f));
+
+	*g = f;
+	*h = NULL;
+
+	/* tk = Tr(a^k.X) mod f */
+	compute_trace_bk_mod(bch, k, f, z, tk);
+
+	if (tk->deg > 0) {
+		/* compute g = gcd(f, tk) (destructive operation) */
+		gf_poly_copy(f2, f);
+		gcd = gf_poly_gcd(bch, f2, tk);
+		if (gcd->deg < f->deg) {
+			/* compute h=f/gcd(f,tk); this will modify f and q */
+			gf_poly_div(bch, f, gcd, q);
+			/* store g and h in-place (clobbering f) */
+			*h = &((struct gf_poly_deg1 *)f)[gcd->deg].poly;
+			gf_poly_copy(*g, gcd);
+			gf_poly_copy(*h, q);
+		}
+	}
+}
+
+/*
+ * find roots of a polynomial, using BTZ algorithm; see the beginning of this
+ * file for details
+ */
+static int find_poly_roots(struct bch_control *bch, unsigned int k,
+			   struct gf_poly *poly, unsigned int *roots)
+{
+	int cnt;
+	struct gf_poly *f1, *f2;
+
+	switch (poly->deg) {
+		/* handle low degree polynomials with ad hoc techniques */
+	case 1:
+		cnt = find_poly_deg1_roots(bch, poly, roots);
+		break;
+	case 2:
+		cnt = find_poly_deg2_roots(bch, poly, roots);
+		break;
+	case 3:
+		cnt = find_poly_deg3_roots(bch, poly, roots);
+		break;
+	case 4:
+		cnt = find_poly_deg4_roots(bch, poly, roots);
+		break;
+	default:
+		/* factor polynomial using Berlekamp Trace Algorithm (BTA) */
+		cnt = 0;
+		if (poly->deg && (k <= GF_M(bch))) {
+			factor_polynomial(bch, k, poly, &f1, &f2);
+			if (f1)
+				cnt += find_poly_roots(bch, k+1, f1, roots);
+			if (f2)
+				cnt += find_poly_roots(bch, k+1, f2, roots+cnt);
+		}
+		break;
+	}
+	return cnt;
+}
+
+#if defined(USE_CHIEN_SEARCH)
+/*
+ * exhaustive root search (Chien) implementation - not used, included only for
+ * reference/comparison tests
+ */
+static int chien_search(struct bch_control *bch, unsigned int len,
+			struct gf_poly *p, unsigned int *roots)
+{
+	int m;
+	unsigned int i, j, syn, syn0, count = 0;
+	const unsigned int k = 8*len+bch->ecc_bits;
+
+	/* use a log-based representation of polynomial */
+	gf_poly_logrep(bch, p, bch->cache);
+	bch->cache[p->deg] = 0;
+	syn0 = gf_div(bch, p->c[0], p->c[p->deg]);
+
+	for (i = GF_N(bch)-k+1; i <= GF_N(bch); i++) {
+		/* compute elp(a^i) */
+		for (j = 1, syn = syn0; j <= p->deg; j++) {
+			m = bch->cache[j];
+			if (m >= 0)
+				syn ^= a_pow(bch, m+j*i);
+		}
+		if (syn == 0) {
+			roots[count++] = GF_N(bch)-i;
+			if (count == p->deg)
+				break;
+		}
+	}
+	return (count == p->deg) ? count : 0;
+}
+#define find_poly_roots(_p, _k, _elp, _loc) chien_search(_p, len, _elp, _loc)
+#endif /* USE_CHIEN_SEARCH */
+
+/**
+ * decode_bch - decode received codeword and find bit error locations
+ * @bch:      BCH control structure
+ * @data:     received data, ignored if @calc_ecc is provided
+ * @len:      data length in bytes, must always be provided
+ * @recv_ecc: received ecc, if NULL then assume it was XORed in @calc_ecc
+ * @calc_ecc: calculated ecc, if NULL then calc_ecc is computed from @data
+ * @syn:      hw computed syndrome data (if NULL, syndrome is calculated)
+ * @errloc:   output array of error locations
+ *
+ * Returns:
+ *  The number of errors found, or -EBADMSG if decoding failed, or -EINVAL if
+ *  invalid parameters were provided
+ *
+ * Depending on the available hw BCH support and the need to compute @calc_ecc
+ * separately (using encode_bch()), this function should be called with one of
+ * the following parameter configurations -
+ *
+ * by providing @data and @recv_ecc only:
+ *   decode_bch(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc)
+ *
+ * by providing @recv_ecc and @calc_ecc:
+ *   decode_bch(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc)
+ *
+ * by providing ecc = recv_ecc XOR calc_ecc:
+ *   decode_bch(@bch, NULL, @len, NULL, ecc, NULL, @errloc)
+ *
+ * by providing syndrome results @syn:
+ *   decode_bch(@bch, NULL, @len, NULL, NULL, @syn, @errloc)
+ *
+ * Once decode_bch() has successfully returned with a positive value, error
+ * locations returned in array @errloc should be interpreted as follows -
+ *
+ * if (errloc[n] >= 8*len), then n-th error is located in ecc (no need for
+ * data correction)
+ *
+ * if (errloc[n] < 8*len), then n-th error is located in data and can be
+ * corrected with statement data[errloc[n]/8] ^= 1 << (errloc[n] % 8);
+ *
+ * Note that this function does not perform any data correction by itself, it
+ * merely indicates error locations.
+ */
+int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len,
+	       const uint8_t *recv_ecc, const uint8_t *calc_ecc,
+	       const unsigned int *syn, unsigned int *errloc)
+{
+	const unsigned int ecc_words = BCH_ECC_WORDS(bch);
+	unsigned int nbits;
+	int i, err, nroots;
+	uint32_t sum;
+
+	/* sanity check: make sure data length can be handled */
+	if (8*len > (bch->n-bch->ecc_bits))
+		return -EINVAL;
+
+	/* if caller does not provide syndromes, compute them */
+	if (!syn) {
+		if (!calc_ecc) {
+			/* compute received data ecc into an internal buffer */
+			if (!data || !recv_ecc)
+				return -EINVAL;
+			encode_bch(bch, data, len, NULL);
+		} else {
+			/* load provided calculated ecc */
+			load_ecc8(bch, bch->ecc_buf, calc_ecc);
+		}
+		/* load received ecc or assume it was XORed in calc_ecc */
+		if (recv_ecc) {
+			load_ecc8(bch, bch->ecc_buf2, recv_ecc);
+			/* XOR received and calculated ecc */
+			for (i = 0, sum = 0; i < (int)ecc_words; i++) {
+				bch->ecc_buf[i] ^= bch->ecc_buf2[i];
+				sum |= bch->ecc_buf[i];
+			}
+			if (!sum)
+				/* no error found */
+				return 0;
+		}
+		compute_syndromes(bch, bch->ecc_buf, bch->syn);
+		syn = bch->syn;
+	}
+
+	err = compute_error_locator_polynomial(bch, syn);
+	if (err > 0) {
+		nroots = find_poly_roots(bch, 1, bch->elp, errloc);
+		if (err != nroots)
+			err = -1;
+	}
+	if (err > 0) {
+		/* post-process raw error locations for easier correction */
+		nbits = (len*8)+bch->ecc_bits;
+		for (i = 0; i < err; i++) {
+			if (errloc[i] >= nbits) {
+				err = -1;
+				break;
+			}
+			errloc[i] = nbits-1-errloc[i];
+			errloc[i] = (errloc[i] & ~7)|(7-(errloc[i] & 7));
+		}
+	}
+	return (err >= 0) ? err : -EBADMSG;
+}
+EXPORT_SYMBOL_GPL(decode_bch);
+
+/*
+ * generate Galois field lookup tables
+ */
+static int build_gf_tables(struct bch_control *bch, unsigned int poly)
+{
+	unsigned int i, x = 1;
+	const unsigned int k = 1 << deg(poly);
+
+	/* primitive polynomial must be of degree m */
+	if (k != (1u << GF_M(bch)))
+		return -1;
+
+	for (i = 0; i < GF_N(bch); i++) {
+		bch->a_pow_tab[i] = x;
+		bch->a_log_tab[x] = i;
+		if (i && (x == 1))
+			/* polynomial is not primitive (a^i=1 with 0<i<2^m-1) */
+			return -1;
+		x <<= 1;
+		if (x & k)
+			x ^= poly;
+	}
+	bch->a_pow_tab[GF_N(bch)] = 1;
+	bch->a_log_tab[0] = 0;
+
+	return 0;
+}
+
+/*
+ * compute generator polynomial remainder tables for fast encoding
+ */
+static void build_mod8_tables(struct bch_control *bch, const uint32_t *g)
+{
+	int i, j, b, d;
+	uint32_t data, hi, lo, *tab;
+	const int l = BCH_ECC_WORDS(bch);
+	const int plen = DIV_ROUND_UP(bch->ecc_bits+1, 32);
+	const int ecclen = DIV_ROUND_UP(bch->ecc_bits, 32);
+
+	memset(bch->mod8_tab, 0, 4*256*l*sizeof(*bch->mod8_tab));
+
+	for (i = 0; i < 256; i++) {
+		/* p(X)=i is a small polynomial of weight <= 8 */
+		for (b = 0; b < 4; b++) {
+			/* we want to compute (p(X).X^(8*b+deg(g))) mod g(X) */
+			tab = bch->mod8_tab + (b*256+i)*l;
+			data = i << (8*b);
+			while (data) {
+				d = deg(data);
+				/* subtract X^d.g(X) from p(X).X^(8*b+deg(g)) */
+				data ^= g[0] >> (31-d);
+				for (j = 0; j < ecclen; j++) {
+					hi = (d < 31) ? g[j] << (d+1) : 0;
+					lo = (j+1 < plen) ?
+						g[j+1] >> (31-d) : 0;
+					tab[j] ^= hi|lo;
+				}
+			}
+		}
+	}
+}
+
+/*
+ * build a base for factoring degree 2 polynomials
+ */
+static int build_deg2_base(struct bch_control *bch)
+{
+	const int m = GF_M(bch);
+	int i, j, r;
+	unsigned int sum, x, y, remaining, ak = 0, xi[m];
+
+	/* find k s.t. Tr(a^k) = 1 and 0 <= k < m */
+	for (i = 0; i < m; i++) {
+		for (j = 0, sum = 0; j < m; j++)
+			sum ^= a_pow(bch, i*(1 << j));
+
+		if (sum) {
+			ak = bch->a_pow_tab[i];
+			break;
+		}
+	}
+	/* find xi, i=0..m-1 such that xi^2+xi = a^i+Tr(a^i).a^k */
+	remaining = m;
+	memset(xi, 0, sizeof(xi));
+
+	for (x = 0; (x <= GF_N(bch)) && remaining; x++) {
+		y = gf_sqr(bch, x)^x;
+		for (i = 0; i < 2; i++) {
+			r = a_log(bch, y);
+			if (y && (r < m) && !xi[r]) {
+				bch->xi_tab[r] = x;
+				xi[r] = 1;
+				remaining--;
+				dbg("x%d = %x\n", r, x);
+				break;
+			}
+			y ^= ak;
+		}
+	}
+	/* should not happen but check anyway */
+	return remaining ? -1 : 0;
+}
+
+static void *bch_alloc(size_t size, int *err)
+{
+	void *ptr;
+
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (ptr == NULL)
+		*err = 1;
+	return ptr;
+}
+
+/*
+ * compute generator polynomial for given (m,t) parameters.
+ */
+static uint32_t *compute_generator_polynomial(struct bch_control *bch)
+{
+	const unsigned int m = GF_M(bch);
+	const unsigned int t = GF_T(bch);
+	int n, err = 0;
+	unsigned int i, j, nbits, r, word, *roots;
+	struct gf_poly *g;
+	uint32_t *genpoly;
+
+	g = bch_alloc(GF_POLY_SZ(m*t), &err);
+	roots = bch_alloc((bch->n+1)*sizeof(*roots), &err);
+	genpoly = bch_alloc(DIV_ROUND_UP(m*t+1, 32)*sizeof(*genpoly), &err);
+
+	if (err) {
+		kfree(genpoly);
+		genpoly = NULL;
+		goto finish;
+	}
+
+	/* enumerate all roots of g(X) */
+	memset(roots , 0, (bch->n+1)*sizeof(*roots));
+	for (i = 0; i < t; i++) {
+		for (j = 0, r = 2*i+1; j < m; j++) {
+			roots[r] = 1;
+			r = mod_s(bch, 2*r);
+		}
+	}
+	/* build generator polynomial g(X) */
+	g->deg = 0;
+	g->c[0] = 1;
+	for (i = 0; i < GF_N(bch); i++) {
+		if (roots[i]) {
+			/* multiply g(X) by (X+root) */
+			r = bch->a_pow_tab[i];
+			g->c[g->deg+1] = 1;
+			for (j = g->deg; j > 0; j--)
+				g->c[j] = gf_mul(bch, g->c[j], r)^g->c[j-1];
+
+			g->c[0] = gf_mul(bch, g->c[0], r);
+			g->deg++;
+		}
+	}
+	/* store left-justified binary representation of g(X) */
+	n = g->deg+1;
+	i = 0;
+
+	while (n > 0) {
+		nbits = (n > 32) ? 32 : n;
+		for (j = 0, word = 0; j < nbits; j++) {
+			if (g->c[n-1-j])
+				word |= 1u << (31-j);
+		}
+		genpoly[i++] = word;
+		n -= nbits;
+	}
+	bch->ecc_bits = g->deg;
+
+finish:
+	kfree(g);
+	kfree(roots);
+
+	return genpoly;
+}
+
+/**
+ * init_bch - initialize a BCH encoder/decoder
+ * @m:          Galois field order, should be in the range 5-15
+ * @t:          maximum error correction capability, in bits
+ * @prim_poly:  user-provided primitive polynomial (or 0 to use default)
+ *
+ * Returns:
+ *  a newly allocated BCH control structure if successful, NULL otherwise
+ *
+ * This initialization can take some time, as lookup tables are built for fast
+ * encoding/decoding; make sure not to call this function from a time critical
+ * path. Usually, init_bch() should be called on module/driver init and
+ * free_bch() should be called to release memory on exit.
+ *
+ * You may provide your own primitive polynomial of degree @m in argument
+ * @prim_poly, or let init_bch() use its default polynomial.
+ *
+ * Once init_bch() has successfully returned a pointer to a newly allocated
+ * BCH control structure, ecc length in bytes is given by member @ecc_bytes of
+ * the structure.
+ */
+struct bch_control *init_bch(int m, int t, unsigned int prim_poly)
+{
+	int err = 0;
+	unsigned int i, words;
+	uint32_t *genpoly;
+	struct bch_control *bch = NULL;
+
+	const int min_m = 5;
+	const int max_m = 15;
+
+	/* default primitive polynomials */
+	static const unsigned int prim_poly_tab[] = {
+		0x25, 0x43, 0x83, 0x11d, 0x211, 0x409, 0x805, 0x1053, 0x201b,
+		0x402b, 0x8003,
+	};
+
+#if defined(CONFIG_BCH_CONST_PARAMS)
+	if ((m != (CONFIG_BCH_CONST_M)) || (t != (CONFIG_BCH_CONST_T))) {
+		printk(KERN_ERR "bch encoder/decoder was configured to support "
+		       "parameters m=%d, t=%d only!\n",
+		       CONFIG_BCH_CONST_M, CONFIG_BCH_CONST_T);
+		goto fail;
+	}
+#endif
+	if ((m < min_m) || (m > max_m))
+		/*
+		 * values of m greater than 15 are not currently supported;
+		 * supporting m > 15 would require changing table base type
+		 * (uint16_t) and a small patch in matrix transposition
+		 */
+		goto fail;
+
+	/* sanity checks */
+	if ((t < 1) || (m*t >= ((1 << m)-1)))
+		/* invalid t value */
+		goto fail;
+
+	/* select a primitive polynomial for generating GF(2^m) */
+	if (prim_poly == 0)
+		prim_poly = prim_poly_tab[m-min_m];
+
+	bch = kzalloc(sizeof(*bch), GFP_KERNEL);
+	if (bch == NULL)
+		goto fail;
+
+	bch->m = m;
+	bch->t = t;
+	bch->n = (1 << m)-1;
+	words  = DIV_ROUND_UP(m*t, 32);
+	bch->ecc_bytes = DIV_ROUND_UP(m*t, 8);
+	bch->a_pow_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_pow_tab), &err);
+	bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err);
+	bch->mod8_tab  = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err);
+	bch->ecc_buf   = bch_alloc(words*sizeof(*bch->ecc_buf), &err);
+	bch->ecc_buf2  = bch_alloc(words*sizeof(*bch->ecc_buf2), &err);
+	bch->xi_tab    = bch_alloc(m*sizeof(*bch->xi_tab), &err);
+	bch->syn       = bch_alloc(2*t*sizeof(*bch->syn), &err);
+	bch->cache     = bch_alloc(2*t*sizeof(*bch->cache), &err);
+	bch->elp       = bch_alloc((t+1)*sizeof(struct gf_poly_deg1), &err);
+
+	for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
+		bch->poly_2t[i] = bch_alloc(GF_POLY_SZ(2*t), &err);
+
+	if (err)
+		goto fail;
+
+	err = build_gf_tables(bch, prim_poly);
+	if (err)
+		goto fail;
+
+	/* use generator polynomial for computing encoding tables */
+	genpoly = compute_generator_polynomial(bch);
+	if (genpoly == NULL)
+		goto fail;
+
+	build_mod8_tables(bch, genpoly);
+	kfree(genpoly);
+
+	err = build_deg2_base(bch);
+	if (err)
+		goto fail;
+
+	return bch;
+
+fail:
+	free_bch(bch);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(init_bch);
+
+/**
+ *  free_bch - free the BCH control structure
+ *  @bch:    BCH control structure to release
+ */
+void free_bch(struct bch_control *bch)
+{
+	unsigned int i;
+
+	if (bch) {
+		kfree(bch->a_pow_tab);
+		kfree(bch->a_log_tab);
+		kfree(bch->mod8_tab);
+		kfree(bch->ecc_buf);
+		kfree(bch->ecc_buf2);
+		kfree(bch->xi_tab);
+		kfree(bch->syn);
+		kfree(bch->cache);
+		kfree(bch->elp);
+
+		for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
+			kfree(bch->poly_2t[i]);
+
+		kfree(bch);
+	}
+}
+EXPORT_SYMBOL_GPL(free_bch);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ivan Djelic <ivan.djelic@parrot.com>");
+MODULE_DESCRIPTION("Binary BCH encoder/decoder");
diff --git a/mm/memory.c b/mm/memory.c
index 51a5c23..9da8cab 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3715,7 +3715,7 @@
 }
 
 /**
- * @access_remote_vm - access another process' address space
+ * access_remote_vm - access another process' address space
  * @mm:		the mm_struct of the target address space
  * @addr:	start address to access
  * @buf:	source or destination buffer
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index ffb6876..6b43ee7 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -860,8 +860,10 @@
 {
 	if (task->tk_rqstp)
 		xprt_release(task);
-	if (task->tk_msg.rpc_cred)
+	if (task->tk_msg.rpc_cred) {
 		put_rpccred(task->tk_msg.rpc_cred);
+		task->tk_msg.rpc_cred = NULL;
+	}
 	rpc_task_release_client(task);
 }
 
diff --git a/sound/core/init.c b/sound/core/init.c
index 3e65da2..a0080aa 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -848,6 +848,7 @@
 		return -ENOMEM;
 	mfile->file = file;
 	mfile->disconnected_f_op = NULL;
+	INIT_LIST_HEAD(&mfile->shutdown_list);
 	spin_lock(&card->files_lock);
 	if (card->shutdown) {
 		spin_unlock(&card->files_lock);
@@ -883,6 +884,9 @@
 	list_for_each_entry(mfile, &card->files_list, list) {
 		if (mfile->file == file) {
 			list_del(&mfile->list);
+			spin_lock(&shutdown_lock);
+			list_del(&mfile->shutdown_list);
+			spin_unlock(&shutdown_lock);
 			if (mfile->disconnected_f_op)
 				fops_put(mfile->disconnected_f_op);
 			found = mfile;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index ae42b65..fe5c803 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3201,15 +3201,6 @@
 EXPORT_SYMBOL(snd_pcm_lib_mmap_iomem);
 #endif /* SNDRV_PCM_INFO_MMAP */
 
-/* mmap callback with pgprot_noncached */
-int snd_pcm_lib_mmap_noncached(struct snd_pcm_substream *substream,
-			       struct vm_area_struct *area)
-{
-	area->vm_page_prot = pgprot_noncached(area->vm_page_prot);
-	return snd_pcm_default_mmap(substream, area);
-}
-EXPORT_SYMBOL(snd_pcm_lib_mmap_noncached);
-
 /*
  * mmap DMA buffer
  */
diff --git a/sound/oss/dev_table.h b/sound/oss/dev_table.h
index b7617be..0199a31 100644
--- a/sound/oss/dev_table.h
+++ b/sound/oss/dev_table.h
@@ -271,7 +271,7 @@
 	void (*reset) (int dev);
 	void (*hw_control) (int dev, unsigned char *event);
 	int (*load_patch) (int dev, int format, const char __user *addr,
-	     int offs, int count, int pmgr_flag);
+	     int count, int pmgr_flag);
 	void (*aftertouch) (int dev, int voice, int pressure);
 	void (*controller) (int dev, int voice, int ctrl_num, int value);
 	void (*panning) (int dev, int voice, int value);
diff --git a/sound/oss/midi_synth.c b/sound/oss/midi_synth.c
index 3c09374..2292c23 100644
--- a/sound/oss/midi_synth.c
+++ b/sound/oss/midi_synth.c
@@ -476,7 +476,7 @@
 
 int
 midi_synth_load_patch(int dev, int format, const char __user *addr,
-		      int offs, int count, int pmgr_flag)
+		      int count, int pmgr_flag)
 {
 	int             orig_dev = synth_devs[dev]->midi_dev;
 
@@ -491,33 +491,29 @@
 	if (!prefix_cmd(orig_dev, 0xf0))
 		return 0;
 
+	/* Invalid patch format */
 	if (format != SYSEX_PATCH)
-	{
-/*		  printk("MIDI Error: Invalid patch format (key) 0x%x\n", format);*/
 		  return -EINVAL;
-	}
+
+	/* Patch header too short */
 	if (count < hdr_size)
-	{
-/*		printk("MIDI Error: Patch header too short\n");*/
 		return -EINVAL;
-	}
+
 	count -= hdr_size;
 
 	/*
-	 * Copy the header from user space but ignore the first bytes which have
-	 * been transferred already.
+	 * Copy the header from user space
 	 */
 
-	if(copy_from_user(&((char *) &sysex)[offs], &(addr)[offs], hdr_size - offs))
+	if (copy_from_user(&sysex, addr, hdr_size))
 		return -EFAULT;
- 
- 	if (count < sysex.len)
-	{
-/*		printk(KERN_WARNING "MIDI Warning: Sysex record too short (%d<%d)\n", count, (int) sysex.len);*/
+
+	/* Sysex record too short */
+	if ((unsigned)count < (unsigned)sysex.len)
 		sysex.len = count;
-	}
-  	left = sysex.len;
-  	src_offs = 0;
+
+	left = sysex.len;
+	src_offs = 0;
 
 	for (i = 0; i < left && !signal_pending(current); i++)
 	{
diff --git a/sound/oss/midi_synth.h b/sound/oss/midi_synth.h
index 6bc9d00..b64ddd6 100644
--- a/sound/oss/midi_synth.h
+++ b/sound/oss/midi_synth.h
@@ -8,7 +8,7 @@
 void midi_synth_close (int dev);
 void midi_synth_hw_control (int dev, unsigned char *event);
 int midi_synth_load_patch (int dev, int format, const char __user * addr,
-		 int offs, int count, int pmgr_flag);
+		 int count, int pmgr_flag);
 void midi_synth_panning (int dev, int channel, int pressure);
 void midi_synth_aftertouch (int dev, int channel, int pressure);
 void midi_synth_controller (int dev, int channel, int ctrl_num, int value);
diff --git a/sound/oss/opl3.c b/sound/oss/opl3.c
index 938c48c..407cd67 100644
--- a/sound/oss/opl3.c
+++ b/sound/oss/opl3.c
@@ -820,7 +820,7 @@
 }
 
 static int opl3_load_patch(int dev, int format, const char __user *addr,
-		int offs, int count, int pmgr_flag)
+		int count, int pmgr_flag)
 {
 	struct sbi_instrument ins;
 
@@ -830,11 +830,7 @@
 		return -EINVAL;
 	}
 
-	/*
-	 * What the fuck is going on here?  We leave junk in the beginning
-	 * of ins and then check the field pretty close to that beginning?
-	 */
-	if(copy_from_user(&((char *) &ins)[offs], addr + offs, sizeof(ins) - offs))
+	if (copy_from_user(&ins, addr, sizeof(ins)))
 		return -EFAULT;
 
 	if (ins.channel < 0 || ins.channel >= SBFM_MAXINSTR)
@@ -849,6 +845,10 @@
 
 static void opl3_panning(int dev, int voice, int value)
 {
+
+	if (voice < 0 || voice >= devc->nr_voice)
+		return;
+
 	devc->voc[voice].panning = value;
 }
 
@@ -1066,8 +1066,15 @@
 
 static void opl3_setup_voice(int dev, int voice, int chn)
 {
-	struct channel_info *info =
-	&synth_devs[dev]->chn_info[chn];
+	struct channel_info *info;
+
+	if (voice < 0 || voice >= devc->nr_voice)
+		return;
+
+	if (chn < 0 || chn > 15)
+		return;
+
+	info = &synth_devs[dev]->chn_info[chn];
 
 	opl3_set_instr(dev, voice, info->pgm_num);
 
diff --git a/sound/oss/sequencer.c b/sound/oss/sequencer.c
index 5ea1098..30bcfe4 100644
--- a/sound/oss/sequencer.c
+++ b/sound/oss/sequencer.c
@@ -241,7 +241,7 @@
 				return -ENXIO;
 
 			fmt = (*(short *) &event_rec[0]) & 0xffff;
-			err = synth_devs[dev]->load_patch(dev, fmt, buf, p + 4, c, 0);
+			err = synth_devs[dev]->load_patch(dev, fmt, buf + p, c, 0);
 			if (err < 0)
 				return err;
 
diff --git a/sound/pci/asihpi/asihpi.c b/sound/pci/asihpi/asihpi.c
index 0ac1f98..f53a31e 100644
--- a/sound/pci/asihpi/asihpi.c
+++ b/sound/pci/asihpi/asihpi.c
@@ -22,21 +22,6 @@
  *  for any purpose including commercial applications.
  */
 
-/* >0: print Hw params, timer vars. >1: print stream write/copy sizes  */
-#define REALLY_VERBOSE_LOGGING 0
-
-#if REALLY_VERBOSE_LOGGING
-#define VPRINTK1 snd_printd
-#else
-#define VPRINTK1(...)
-#endif
-
-#if REALLY_VERBOSE_LOGGING > 1
-#define VPRINTK2 snd_printd
-#else
-#define VPRINTK2(...)
-#endif
-
 #include "hpi_internal.h"
 #include "hpimsginit.h"
 #include "hpioctl.h"
@@ -57,11 +42,25 @@
 #include <sound/tlv.h>
 #include <sound/hwdep.h>
 
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("AudioScience inc. <support@audioscience.com>");
 MODULE_DESCRIPTION("AudioScience ALSA ASI5000 ASI6000 ASI87xx ASI89xx");
 
+#if defined CONFIG_SND_DEBUG_VERBOSE
+/**
+ * snd_printddd - very verbose debug printk
+ * @format: format string
+ *
+ * Works like snd_printk() for debugging purposes.
+ * Ignored when CONFIG_SND_DEBUG_VERBOSE is not set.
+ * Must set snd module debug parameter to 3 to enable at runtime.
+ */
+#define snd_printddd(format, args...) \
+	__snd_printk(3, __FILE__, __LINE__, format, ##args)
+#else
+#define snd_printddd(format, args...)	do { } while (0)
+#endif
+
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
 static int enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP;
@@ -289,7 +288,6 @@
 #define hpi_handle_error(x)  handle_error(x, __LINE__, __FILE__)
 
 /***************************** GENERAL PCM ****************/
-#if REALLY_VERBOSE_LOGGING
 static void print_hwparams(struct snd_pcm_hw_params *p)
 {
 	snd_printd("HWPARAMS \n");
@@ -304,9 +302,6 @@
 	snd_printd("periods %d \n", params_periods(p));
 	snd_printd("buffer_size %d \n", params_buffer_size(p));
 }
-#else
-#define print_hwparams(x)
-#endif
 
 static snd_pcm_format_t hpi_to_alsa_formats[] = {
 	-1,			/* INVALID */
@@ -381,13 +376,13 @@
 				"No local sampleclock, err %d\n", err);
 		}
 
-		for (idx = 0; idx < 100; idx++) {
-			if (hpi_sample_clock_query_local_rate(
-				h_control, idx, &sample_rate)) {
-				if (!idx)
-					snd_printk(KERN_ERR
-						"Local rate query failed\n");
-
+		for (idx = -1; idx < 100; idx++) {
+			if (idx == -1) {
+				if (hpi_sample_clock_get_sample_rate(h_control,
+								&sample_rate))
+					continue;
+			} else if (hpi_sample_clock_query_local_rate(h_control,
+							idx, &sample_rate)) {
 				break;
 			}
 
@@ -440,8 +435,6 @@
 		}
 	}
 
-	/* printk(KERN_INFO "Supported rates %X %d %d\n",
-	   rates, rate_min, rate_max); */
 	pcmhw->rates = rates;
 	pcmhw->rate_min = rate_min;
 	pcmhw->rate_max = rate_max;
@@ -466,7 +459,7 @@
 	if (err)
 		return err;
 
-	VPRINTK1(KERN_INFO "format %d, %d chans, %d_hz\n",
+	snd_printdd("format %d, %d chans, %d_hz\n",
 				format, params_channels(params),
 				params_rate(params));
 
@@ -489,13 +482,12 @@
 		err = hpi_stream_host_buffer_attach(dpcm->h_stream,
 			params_buffer_bytes(params),  runtime->dma_addr);
 		if (err == 0) {
-			VPRINTK1(KERN_INFO
+			snd_printdd(
 				"stream_host_buffer_attach succeeded %u %lu\n",
 				params_buffer_bytes(params),
 				(unsigned long)runtime->dma_addr);
 		} else {
-			snd_printd(KERN_INFO
-					"stream_host_buffer_attach error %d\n",
+			snd_printd("stream_host_buffer_attach error %d\n",
 					err);
 			return -ENOMEM;
 		}
@@ -504,7 +496,7 @@
 						&dpcm->hpi_buffer_attached,
 						NULL, NULL, NULL);
 
-		VPRINTK1(KERN_INFO "stream_host_buffer_attach status 0x%x\n",
+		snd_printdd("stream_host_buffer_attach status 0x%x\n",
 				dpcm->hpi_buffer_attached);
 	}
 	bytes_per_sec = params_rate(params) * params_channels(params);
@@ -517,7 +509,7 @@
 	dpcm->bytes_per_sec = bytes_per_sec;
 	dpcm->buffer_bytes = params_buffer_bytes(params);
 	dpcm->period_bytes = params_period_bytes(params);
-	VPRINTK1(KERN_INFO "buffer_bytes=%d, period_bytes=%d, bps=%d\n",
+	snd_printdd("buffer_bytes=%d, period_bytes=%d, bps=%d\n",
 			dpcm->buffer_bytes, dpcm->period_bytes, bytes_per_sec);
 
 	return 0;
@@ -573,7 +565,7 @@
 	struct snd_pcm_substream *s;
 	u16 e;
 
-	VPRINTK1(KERN_INFO "%c%d trigger\n",
+	snd_printdd("%c%d trigger\n",
 			SCHR(substream->stream), substream->number);
 	switch (cmd) {
 	case SNDRV_PCM_TRIGGER_START:
@@ -597,7 +589,7 @@
 				* data??
 				*/
 				unsigned int preload = ds->period_bytes * 1;
-				VPRINTK2(KERN_INFO "%d preload x%x\n", s->number, preload);
+				snd_printddd("%d preload x%x\n", s->number, preload);
 				hpi_handle_error(hpi_outstream_write_buf(
 						ds->h_stream,
 						&runtime->dma_area[0],
@@ -607,7 +599,7 @@
 			}
 
 			if (card->support_grouping) {
-				VPRINTK1(KERN_INFO "\t%c%d group\n",
+				snd_printdd("\t%c%d group\n",
 						SCHR(s->stream),
 						s->number);
 				e = hpi_stream_group_add(
@@ -622,7 +614,7 @@
 			} else
 				break;
 		}
-		VPRINTK1(KERN_INFO "start\n");
+		snd_printdd("start\n");
 		/* start the master stream */
 		snd_card_asihpi_pcm_timer_start(substream);
 		if ((substream->stream == SNDRV_PCM_STREAM_CAPTURE) ||
@@ -644,14 +636,14 @@
 			s->runtime->status->state = SNDRV_PCM_STATE_SETUP;
 
 			if (card->support_grouping) {
-				VPRINTK1(KERN_INFO "\t%c%d group\n",
+				snd_printdd("\t%c%d group\n",
 				SCHR(s->stream),
 					s->number);
 				snd_pcm_trigger_done(s, substream);
 			} else
 				break;
 		}
-		VPRINTK1(KERN_INFO "stop\n");
+		snd_printdd("stop\n");
 
 		/* _prepare and _hwparams reset the stream */
 		hpi_handle_error(hpi_stream_stop(dpcm->h_stream));
@@ -664,12 +656,12 @@
 		break;
 
 	case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
-		VPRINTK1(KERN_INFO "pause release\n");
+		snd_printdd("pause release\n");
 		hpi_handle_error(hpi_stream_start(dpcm->h_stream));
 		snd_card_asihpi_pcm_timer_start(substream);
 		break;
 	case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
-		VPRINTK1(KERN_INFO "pause\n");
+		snd_printdd("pause\n");
 		snd_card_asihpi_pcm_timer_stop(substream);
 		hpi_handle_error(hpi_stream_stop(dpcm->h_stream));
 		break;
@@ -741,7 +733,7 @@
 	u16 state;
 	u32 buffer_size, bytes_avail, samples_played, on_card_bytes;
 
-	VPRINTK1(KERN_INFO "%c%d snd_card_asihpi_timer_function\n",
+	snd_printdd("%c%d snd_card_asihpi_timer_function\n",
 				SCHR(substream->stream), substream->number);
 
 	/* find minimum newdata and buffer pos in group */
@@ -770,10 +762,10 @@
 				if ((bytes_avail == 0) &&
 				    (on_card_bytes < ds->pcm_buf_host_rw_ofs)) {
 					hpi_handle_error(hpi_stream_start(ds->h_stream));
-					VPRINTK1(KERN_INFO "P%d start\n", s->number);
+					snd_printdd("P%d start\n", s->number);
 				}
 			} else if (state == HPI_STATE_DRAINED) {
-				VPRINTK1(KERN_WARNING "P%d drained\n",
+				snd_printd(KERN_WARNING "P%d drained\n",
 						s->number);
 				/*snd_pcm_stop(s, SNDRV_PCM_STATE_XRUN);
 				continue; */
@@ -794,13 +786,13 @@
 				newdata);
 		}
 
-		VPRINTK1(KERN_INFO "PB timer hw_ptr x%04lX, appl_ptr x%04lX\n",
+		snd_printdd("hw_ptr x%04lX, appl_ptr x%04lX\n",
 			(unsigned long)frames_to_bytes(runtime,
 						runtime->status->hw_ptr),
 			(unsigned long)frames_to_bytes(runtime,
 						runtime->control->appl_ptr));
 
-		VPRINTK1(KERN_INFO "%d %c%d S=%d, rw=%04X, dma=x%04X, left=x%04X,"
+		snd_printdd("%d %c%d S=%d, rw=%04X, dma=x%04X, left=x%04X,"
 			" aux=x%04X space=x%04X\n",
 			loops, SCHR(s->stream),	s->number,
 			state,	ds->pcm_buf_host_rw_ofs, pcm_buf_dma_ofs, (int)bytes_avail,
@@ -822,7 +814,7 @@
 
 	next_jiffies = max(next_jiffies, 1U);
 	dpcm->timer.expires = jiffies + next_jiffies;
-	VPRINTK1(KERN_INFO "jif %d buf pos x%04X newdata x%04X xfer x%04X\n",
+	snd_printdd("jif %d buf pos x%04X newdata x%04X xfer x%04X\n",
 			next_jiffies, pcm_buf_dma_ofs, newdata, xfercount);
 
 	snd_pcm_group_for_each_entry(s, substream) {
@@ -837,7 +829,7 @@
 		if (xfercount && (on_card_bytes <= ds->period_bytes)) {
 			if (card->support_mmap) {
 				if (s->stream == SNDRV_PCM_STREAM_PLAYBACK) {
-					VPRINTK2(KERN_INFO "P%d write x%04x\n",
+					snd_printddd("P%d write x%04x\n",
 							s->number,
 							ds->period_bytes);
 					hpi_handle_error(
@@ -848,7 +840,7 @@
 							xfercount,
 							&ds->format));
 				} else {
-					VPRINTK2(KERN_INFO "C%d read x%04x\n",
+					snd_printddd("C%d read x%04x\n",
 						s->number,
 						xfercount);
 					hpi_handle_error(
@@ -871,7 +863,7 @@
 static int snd_card_asihpi_playback_ioctl(struct snd_pcm_substream *substream,
 					  unsigned int cmd, void *arg)
 {
-	/* snd_printd(KERN_INFO "Playback ioctl %d\n", cmd); */
+	snd_printdd(KERN_INFO "Playback ioctl %d\n", cmd);
 	return snd_pcm_lib_ioctl(substream, cmd, arg);
 }
 
@@ -881,7 +873,7 @@
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	struct snd_card_asihpi_pcm *dpcm = runtime->private_data;
 
-	VPRINTK1(KERN_INFO "playback prepare %d\n", substream->number);
+	snd_printdd("playback prepare %d\n", substream->number);
 
 	hpi_handle_error(hpi_outstream_reset(dpcm->h_stream));
 	dpcm->pcm_buf_host_rw_ofs = 0;
@@ -898,7 +890,7 @@
 	snd_pcm_uframes_t ptr;
 
 	ptr = bytes_to_frames(runtime, dpcm->pcm_buf_dma_ofs  % dpcm->buffer_bytes);
-	/* VPRINTK2(KERN_INFO "playback_pointer=x%04lx\n", (unsigned long)ptr); */
+	snd_printddd("playback_pointer=x%04lx\n", (unsigned long)ptr);
 	return ptr;
 }
 
@@ -1014,12 +1006,13 @@
 
 	snd_pcm_hw_constraint_step(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE,
 		card->update_interval_frames);
+
 	snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIOD_SIZE,
 		card->update_interval_frames * 2, UINT_MAX);
 
 	snd_pcm_set_sync(substream);
 
-	VPRINTK1(KERN_INFO "playback open\n");
+	snd_printdd("playback open\n");
 
 	return 0;
 }
@@ -1030,7 +1023,7 @@
 	struct snd_card_asihpi_pcm *dpcm = runtime->private_data;
 
 	hpi_handle_error(hpi_outstream_close(dpcm->h_stream));
-	VPRINTK1(KERN_INFO "playback close\n");
+	snd_printdd("playback close\n");
 
 	return 0;
 }
@@ -1050,13 +1043,13 @@
 	if (copy_from_user(runtime->dma_area, src, len))
 		return -EFAULT;
 
-	VPRINTK2(KERN_DEBUG "playback copy%d %u bytes\n",
+	snd_printddd("playback copy%d %u bytes\n",
 			substream->number, len);
 
 	hpi_handle_error(hpi_outstream_write_buf(dpcm->h_stream,
 				runtime->dma_area, len, &dpcm->format));
 
-	dpcm->pcm_buf_host_rw_ofs = dpcm->pcm_buf_host_rw_ofs + len;
+	dpcm->pcm_buf_host_rw_ofs += len;
 
 	return 0;
 }
@@ -1066,16 +1059,11 @@
 					    snd_pcm_uframes_t pos,
 					    snd_pcm_uframes_t count)
 {
-	unsigned int len;
-	struct snd_pcm_runtime *runtime = substream->runtime;
-	struct snd_card_asihpi_pcm *dpcm = runtime->private_data;
-
-	len = frames_to_bytes(runtime, count);
-	VPRINTK1(KERN_INFO "playback silence  %u bytes\n", len);
-
-	memset(runtime->dma_area, 0, len);
-	hpi_handle_error(hpi_outstream_write_buf(dpcm->h_stream,
-				runtime->dma_area, len, &dpcm->format));
+	/* Usually writes silence to DMA buffer, which should be overwritten
+	by real audio later.  Our fifos cannot be overwritten, and are not
+	free-running DMAs. Silence is output on fifo underflow.
+	This callback is still required to allow the copy callback to be used.
+	*/
 	return 0;
 }
 
@@ -1110,7 +1098,7 @@
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	struct snd_card_asihpi_pcm *dpcm = runtime->private_data;
 
-	VPRINTK2(KERN_INFO "capture pointer %d=%d\n",
+	snd_printddd("capture pointer %d=%d\n",
 			substream->number, dpcm->pcm_buf_dma_ofs);
 	/* NOTE Unlike playback can't use actual samples_played
 		for the capture position, because those samples aren't yet in
@@ -1135,7 +1123,7 @@
 	dpcm->pcm_buf_dma_ofs = 0;
 	dpcm->pcm_buf_elapsed_dma_ofs = 0;
 
-	VPRINTK1("Capture Prepare %d\n", substream->number);
+	snd_printdd("Capture Prepare %d\n", substream->number);
 	return 0;
 }
 
@@ -1198,7 +1186,7 @@
 	if (dpcm == NULL)
 		return -ENOMEM;
 
-	VPRINTK1("hpi_instream_open adapter %d stream %d\n",
+	snd_printdd("capture open adapter %d stream %d\n",
 		   card->adapter_index, substream->number);
 
 	err = hpi_handle_error(
@@ -1268,7 +1256,7 @@
 
 	len = frames_to_bytes(runtime, count);
 
-	VPRINTK2(KERN_INFO "capture copy%d %d bytes\n", substream->number, len);
+	snd_printddd("capture copy%d %d bytes\n", substream->number, len);
 	hpi_handle_error(hpi_instream_read_buf(dpcm->h_stream,
 				runtime->dma_area, len));
 
@@ -2887,6 +2875,9 @@
 	if (err)
 		asihpi->update_interval_frames = 512;
 
+	if (!asihpi->support_mmap)
+		asihpi->update_interval_frames *= 2;
+
 	hpi_handle_error(hpi_instream_open(asihpi->adapter_index,
 			     0, &h_stream));
 
@@ -2909,7 +2900,6 @@
 			asihpi->support_mrx
 	      );
 
-
 	err = snd_card_asihpi_pcm_new(asihpi, 0, pcm_substreams);
 	if (err < 0) {
 		snd_printk(KERN_ERR "pcm_new failed\n");
@@ -2944,6 +2934,7 @@
 	sprintf(card->longname, "%s %i",
 			card->shortname, asihpi->adapter_index);
 	err = snd_card_register(card);
+
 	if (!err) {
 		hpi_card->snd_card_asihpi = card;
 		dev++;
diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c
index 734c6ee..2942d2a 100644
--- a/sound/pci/hda/patch_analog.c
+++ b/sound/pci/hda/patch_analog.c
@@ -4256,6 +4256,84 @@
 }
 
 /*
+ * Precision R5500
+ * 0x12 - HP/line-out
+ * 0x13 - speaker (mono)
+ * 0x15 - mic-in
+ */
+
+static struct hda_verb ad1984a_precision_verbs[] = {
+	/* Unmute main output path */
+	{0x03, AC_VERB_SET_AMP_GAIN_MUTE, 0x27}, /* 0dB */
+	{0x21, AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE + 0x1f}, /* 0dB */
+	{0x20, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_UNMUTE(5) + 0x17}, /* 0dB */
+	/* Analog mixer; mute as default */
+	{0x20, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(0)},
+	{0x20, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(1)},
+	{0x20, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(2)},
+	{0x20, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(3)},
+	{0x20, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(4)},
+	/* Select mic as input */
+	{0x0c, AC_VERB_SET_CONNECT_SEL, 0x1},
+	{0x0c, AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE + 0x27}, /* 0dB */
+	/* Configure as mic */
+	{0x15, AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_VREF80},
+	{0x15, AC_VERB_SET_AMP_GAIN_MUTE, 0x7002}, /* raise mic as default */
+	/* HP unmute */
+	{0x12, AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_UNMUTE},
+	/* turn on EAPD */
+	{0x13, AC_VERB_SET_EAPD_BTLENABLE, 0x02},
+	/* unsolicited event for pin-sense */
+	{0x12, AC_VERB_SET_UNSOLICITED_ENABLE, AC_USRSP_EN | AD1884A_HP_EVENT},
+	{ } /* end */
+};
+
+static struct snd_kcontrol_new ad1984a_precision_mixers[] = {
+	HDA_CODEC_VOLUME("Master Playback Volume", 0x21, 0x0, HDA_OUTPUT),
+	HDA_CODEC_MUTE("Master Playback Switch", 0x21, 0x0, HDA_OUTPUT),
+	HDA_CODEC_VOLUME("PCM Playback Volume", 0x20, 0x5, HDA_INPUT),
+	HDA_CODEC_MUTE("PCM Playback Switch", 0x20, 0x5, HDA_INPUT),
+	HDA_CODEC_VOLUME("Mic Playback Volume", 0x20, 0x01, HDA_INPUT),
+	HDA_CODEC_MUTE("Mic Playback Switch", 0x20, 0x01, HDA_INPUT),
+	HDA_CODEC_VOLUME("Mic Boost Volume", 0x15, 0x0, HDA_INPUT),
+	HDA_CODEC_MUTE("Front Playback Switch", 0x12, 0x0, HDA_OUTPUT),
+	HDA_CODEC_VOLUME("Speaker Playback Volume", 0x13, 0x0, HDA_OUTPUT),
+	HDA_CODEC_VOLUME("Capture Volume", 0x0c, 0x0, HDA_OUTPUT),
+	HDA_CODEC_MUTE("Capture Switch", 0x0c, 0x0, HDA_OUTPUT),
+	{ } /* end */
+};
+
+
+/* mute internal speaker if HP is plugged */
+static void ad1984a_precision_automute(struct hda_codec *codec)
+{
+	unsigned int present;
+
+	present = snd_hda_jack_detect(codec, 0x12);
+	snd_hda_codec_amp_stereo(codec, 0x13, HDA_OUTPUT, 0,
+				 HDA_AMP_MUTE, present ? HDA_AMP_MUTE : 0);
+}
+
+
+/* unsolicited event for HP jack sensing */
+static void ad1984a_precision_unsol_event(struct hda_codec *codec,
+					 unsigned int res)
+{
+	if ((res >> 26) != AD1884A_HP_EVENT)
+		return;
+	ad1984a_precision_automute(codec);
+}
+
+/* initialize jack-sensing, too */
+static int ad1984a_precision_init(struct hda_codec *codec)
+{
+	ad198x_init(codec);
+	ad1984a_precision_automute(codec);
+	return 0;
+}
+
+
+/*
  * HP Touchsmart
  * port-A (0x11)      - front hp-out
  * port-B (0x14)      - unused
@@ -4384,6 +4462,7 @@
 	AD1884A_MOBILE,
 	AD1884A_THINKPAD,
 	AD1984A_TOUCHSMART,
+	AD1984A_PRECISION,
 	AD1884A_MODELS
 };
 
@@ -4393,9 +4472,11 @@
 	[AD1884A_MOBILE]	= "mobile",
 	[AD1884A_THINKPAD]	= "thinkpad",
 	[AD1984A_TOUCHSMART]	= "touchsmart",
+	[AD1984A_PRECISION]	= "precision",
 };
 
 static struct snd_pci_quirk ad1884a_cfg_tbl[] = {
+	SND_PCI_QUIRK(0x1028, 0x04ac, "Precision R5500", AD1984A_PRECISION),
 	SND_PCI_QUIRK(0x103c, 0x3030, "HP", AD1884A_MOBILE),
 	SND_PCI_QUIRK(0x103c, 0x3037, "HP 2230s", AD1884A_LAPTOP),
 	SND_PCI_QUIRK(0x103c, 0x3056, "HP", AD1884A_MOBILE),
@@ -4489,6 +4570,14 @@
 		codec->patch_ops.unsol_event = ad1984a_thinkpad_unsol_event;
 		codec->patch_ops.init = ad1984a_thinkpad_init;
 		break;
+	case AD1984A_PRECISION:
+		spec->mixers[0] = ad1984a_precision_mixers;
+		spec->init_verbs[spec->num_init_verbs++] =
+			ad1984a_precision_verbs;
+		spec->multiout.dig_out_nid = 0;
+		codec->patch_ops.unsol_event = ad1984a_precision_unsol_event;
+		codec->patch_ops.init = ad1984a_precision_init;
+		break;
 	case AD1984A_TOUCHSMART:
 		spec->mixers[0] = ad1984a_touchsmart_mixers;
 		spec->init_verbs[0] = ad1984a_touchsmart_verbs;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 5d582de..0ef0035 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -1290,7 +1290,7 @@
 		case 0x10ec0883:
 		case 0x10ec0885:
 		case 0x10ec0887:
-		case 0x10ec0889:
+		/*case 0x10ec0889:*/ /* this causes an SPDIF problem */
 			alc889_coef_init(codec);
 			break;
 		case 0x10ec0888:
diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h
index c0dcfca..c66d3f6 100644
--- a/sound/usb/quirks-table.h
+++ b/sound/usb/quirks-table.h
@@ -1568,6 +1568,46 @@
 	}
 },
 {
+	USB_DEVICE_VENDOR_SPEC(0x0582, 0x0104),
+	.driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) {
+		/* .vendor_name = "Roland", */
+		/* .product_name = "UM-1G", */
+		.ifnum = 0,
+		.type = QUIRK_MIDI_FIXED_ENDPOINT,
+		.data = & (const struct snd_usb_midi_endpoint_info) {
+			.out_cables = 0x0001,
+			.in_cables  = 0x0001
+		}
+	}
+},
+{
+	/* Boss JS-8 Jam Station  */
+	USB_DEVICE(0x0582, 0x0109),
+	.driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) {
+		/* .vendor_name = "BOSS", */
+		/* .product_name = "JS-8", */
+		.ifnum = QUIRK_ANY_INTERFACE,
+		.type = QUIRK_COMPOSITE,
+		.data = (const struct snd_usb_audio_quirk[]) {
+			{
+				.ifnum = 0,
+				.type = QUIRK_AUDIO_STANDARD_INTERFACE
+			},
+			{
+				.ifnum = 1,
+				.type = QUIRK_AUDIO_STANDARD_INTERFACE
+			},
+			{
+				.ifnum = 2,
+				.type = QUIRK_MIDI_STANDARD_INTERFACE
+			},
+			{
+				.ifnum = -1
+			}
+		}
+	}
+},
+{
 	/* has ID 0x0110 when not in Advanced Driver mode */
 	USB_DEVICE_VENDOR_SPEC(0x0582, 0x010f),
 	.driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) {