Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Pull networking fixes from David Miller:

 1) Fix various build warnings in tlan/qed/xen-netback drivers, from
    Arnd Bergmann.

 2) Propagate proper error code in strparser's strp_recv(), from Geert
    Uytterhoeven.

 3) Fix accidental broadcast of RTM_GETTFILTER responses, from Eric
    Dumazet.

 4) Need to use list_for_each_entry_safe() in qed driver, from Wei
    Yongjun.

 5) Openvswitch 802.1AD bug fixes from Jiri Benc.

 6) Cure BUILD_BUG_ON() in mlx5 driver, from Tom Herbert.

 7) Fix UDP ipv6 checksumming in netvsc driver, from Stephen Hemminger.

 8) stmmac driver fixes from Giuseppe CAVALLARO.

 9) Fix access to mangled IP6CB in tcp, from Eric Dumazet.

10) Fix info leaks in tipc and rtnetlink, from Dan Carpenter.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (27 commits)
  net: bridge: add the multicast_flood flag attribute to brport_attrs
  net: axienet: Remove unused parameter from __axienet_device_reset
  liquidio: CN23XX: fix a loop timeout
  net: rtnl: info leak in rtnl_fill_vfinfo()
  tipc: info leak in __tipc_nl_add_udp_addr()
  net: ipv4: Do not drop to make_route if oif is l3mdev
  net: phy: Trigger state machine on state change and not polling.
  ipv6: tcp: restore IP6CB for pktoptions skbs
  netvsc: Remove mistaken udp.h inclusion.
  xen-netback: fix type mismatch warning
  stmmac: fix error check when init ptp
  stmmac: fix ptp init for gmac4
  qed: fix old-style function definition
  netvsc: fix checksum on UDP IPV6
  net_sched: reorder pernet ops and act ops registrations
  xen-netback: fix guest Rx stall detection (after guest Rx refactor)
  drivers/ptp: Fix kernel memory disclosure
  net/mlx5: Add MLX5_ARRAY_SET64 to fix BUILD_BUG_ON
  qmi_wwan: add support for Quectel EC21 and EC25
  openvswitch: add NETIF_F_HW_VLAN_STAG_TX to internal dev
  ...
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index 589b40c..2a27227 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -483,7 +483,7 @@
     <function>get_user()</function>
     /
     <function>put_user()</function>
-    <filename class="headerfile">include/asm/uaccess.h</filename>
+    <filename class="headerfile">include/linux/uaccess.h</filename>
    </title>  
 
    <para>
diff --git a/Documentation/devicetree/bindings/pwm/pwm-meson.txt b/Documentation/devicetree/bindings/pwm/pwm-meson.txt
new file mode 100644
index 0000000..5376a44
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-meson.txt
@@ -0,0 +1,23 @@
+Amlogic Meson PWM Controller
+============================
+
+Required properties:
+- compatible: Shall contain "amlogic,meson8b-pwm" or "amlogic,meson-gxbb-pwm".
+- #pwm-cells: Should be 3. See pwm.txt in this directory for a description of
+  the cells format.
+
+Optional properties:
+- clocks: Could contain one or two parent clock phandles for each of the two
+  PWM channels.
+- clock-names: Could contain at least the "clkin0" and/or "clkin1" names.
+
+Example:
+
+	pwm_ab: pwm@8550 {
+		compatible = "amlogic,meson-gxbb-pwm";
+		reg = <0x0 0x08550 0x0 0x10>;
+		#pwm-cells = <3>;
+		status = "disabled";
+		clocks = <&xtal>, <&xtal>;
+		clock-names = "clkin0", "clkin1";
+	};
diff --git a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt b/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt
index f8f59ba..6f8af2b 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt
@@ -2,8 +2,9 @@
 
 Required properties:
  - compatible: should be "mediatek,<name>-disp-pwm":
-   - "mediatek,mt8173-disp-pwm": found on mt8173 SoC.
+   - "mediatek,mt2701-disp-pwm": found on mt2701 SoC.
    - "mediatek,mt6595-disp-pwm": found on mt6595 SoC.
+   - "mediatek,mt8173-disp-pwm": found on mt8173 SoC.
  - reg: physical base address and length of the controller's registers.
  - #pwm-cells: must be 2. See pwm.txt in this directory for a description of
    the cell format.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-st.txt b/Documentation/devicetree/bindings/pwm/pwm-st.txt
index 84d2fb8..19fce77 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-st.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-st.txt
@@ -13,13 +13,14 @@
 - pinctrl-0: 		List of phandles pointing to pin configuration nodes
 			for PWM module.
 			For Pinctrl properties, please refer to [1].
-- clock-names: 		Set to "pwm".
+- clock-names: 		Valid entries are "pwm" and/or "capture".
 - clocks: 		phandle of the clock used by the PWM module.
 			For Clk properties, please refer to [2].
+- interrupts:		IRQ for the Capture device
 
 Optional properties:
-- st,pwm-num-chan:	Number of available channels. If not passed, the driver
-			will consider single channel by default.
+- st,pwm-num-chan:	Number of available PWM channels.  Default is 0.
+- st,capture-num-chan:	Number of available Capture channels.  Default is 0.
 
 [1] Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
 [2] Documentation/devicetree/bindings/clock/clock-bindings.txt
@@ -38,4 +39,5 @@
 	clocks = <&clk_sysin>;
 	clock-names = "pwm";
 	st,pwm-num-chan = <4>;
+	st,capture-num-chan = <2>;
 };
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt b/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
index cf6068b..f1cbeef 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
@@ -6,6 +6,7 @@
     - "allwinner,sun5i-a10s-pwm"
     - "allwinner,sun5i-a13-pwm"
     - "allwinner,sun7i-a20-pwm"
+    - "allwinner,sun8i-h3-pwm"
   - reg: physical base address and length of the controller's registers
   - #pwm-cells: should be 3. See pwm.txt in this directory for a description of
     the cells format.
diff --git a/Documentation/devicetree/bindings/thermal/max77620_thermal.txt b/Documentation/devicetree/bindings/thermal/max77620_thermal.txt
new file mode 100644
index 0000000..323a3b3
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/max77620_thermal.txt
@@ -0,0 +1,70 @@
+Thermal driver for MAX77620 Power management IC from Maxim Semiconductor.
+
+Maxim Semiconductor MAX77620 supports alarm interrupts when its
+die temperature crosses 120C and 140C. These threshold temperatures
+are not configurable. Device does not provide the real temperature
+of die other than just indicating whether temperature is above or
+below threshold level.
+
+Required properties:
+-------------------
+#thermal-sensor-cells:	Please refer <devicetree/bindings/thermal/thermal.txt>
+			for more details.
+			The value must be 0.
+
+For more details, please refer generic thermal DT binding document
+<devicetree/bindings/thermal/thermal.txt>.
+
+Please refer <devicetree/bindings/mfd/max77620.txt> for mfd DT binding
+document for the MAX77620.
+
+Example:
+--------
+#include <dt-bindings/mfd/max77620.h>
+#include <dt-bindings/thermal/thermal.h>
+...
+
+i2c@7000d000 {
+	spmic: max77620@3c {
+		compatible = "maxim,max77620";
+		:::::
+		#thermal-sensor-cells = <0>;
+		:::
+	};
+};
+
+cool_dev: cool-dev {
+	compatible = "cooling-dev";
+	#cooling-cells = <2>;
+};
+
+thermal-zones {
+	PMIC-Die {
+		polling-delay = <0>;
+		polling-delay-passive = <0>;
+		thermal-sensors = <&spmic>;
+
+		trips {
+			pmic_die_warn_temp_thresh: hot-die {
+				temperature = <120000>;
+				type = "hot";
+				hysteresis = <0>;
+			};
+
+			pmic_die_crit_temp_thresh: critical-die {
+				temperature = <140000>;
+				type = "critical";
+				hysteresis = <0>;
+			};
+		};
+
+		cooling-maps {
+			map0 {
+				trip = <&pmic_die_warn_temp_thresh>;
+				cooling-device = <&cool_dev THERMAL_NO_LIMIT
+						  THERMAL_NO_LIMIT>;
+				contribution = <100>;
+			};
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
index 81f9a51..e2f494d 100644
--- a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
@@ -8,7 +8,9 @@
 is also needed.
 
 Required properties:
-- compatible: "mediatek,mt8173-thermal"
+- compatible:
+  - "mediatek,mt8173-thermal" : For MT8173 family of SoCs
+  - "mediatek,mt2701-thermal" : For MT2701 family of SoCs
 - reg: Address range of the thermal controller
 - interrupts: IRQ for the thermal controller
 - clocks, clock-names: Clocks needed for the thermal controller. required
diff --git a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
index edebfa0..b6c0ae5 100644
--- a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
+++ b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
@@ -10,8 +10,14 @@
 - compatible : For Tegra124, must contain "nvidia,tegra124-soctherm".
   For Tegra132, must contain "nvidia,tegra132-soctherm".
   For Tegra210, must contain "nvidia,tegra210-soctherm".
-- reg : Should contain 1 entry:
+- reg : Should contain at least 2 entries for each entry in reg-names:
   - SOCTHERM register set
+  - Tegra CAR register set: Required for Tegra124 and Tegra210.
+  - CCROC register set: Required for Tegra132.
+- reg-names :  Should contain at least 2 entries:
+  - soctherm-reg
+  - car-reg
+  - ccroc-reg
 - interrupts : Defines the interrupt used by SOCTHERM
 - clocks : Must contain an entry for each entry in clock-names.
   See ../clocks/clock-bindings.txt for details.
@@ -25,17 +31,45 @@
 - #thermal-sensor-cells : Should be 1. See ./thermal.txt for a description
     of this property. See <dt-bindings/thermal/tegra124-soctherm.h> for a
     list of valid values when referring to thermal sensors.
+- throttle-cfgs: A sub-node which is a container of configuration for each
+    hardware throttle events. These events can be set as cooling devices.
+  * throttle events: Sub-nodes must be named as "light" or "heavy".
+      Properties:
+      - nvidia,priority: Each throttle has its own throttle settings, so the
+        SW needs to set priorities for the various throttles so that the HW
+        arbiter can select the final throttle settings.
+        Bigger value indicates higher priority, In general, higher priority
+        translates to lower target frequency. SW needs to ensure that critical
+        thermal alarms are given higher priority, and ensure that there is
+        no race if priority of two vectors is set to the same value.
+        The range of this value is 1~100.
+      - nvidia,cpu-throt-percent: This property is for Tegra124 and Tegra210.
+        It is the throttling depth of pulse skippers, it's the percentage
+        throttling.
+      - nvidia,cpu-throt-level: This property is only for Tegra132, it is the
+        level of pulse skippers, which used to throttle clock frequencies. It
+        indicates cpu clock throttling depth, and the depth can be programmed.
+        Must set as following values:
+        TEGRA_SOCTHERM_THROT_LEVEL_LOW, TEGRA_SOCTHERM_THROT_LEVEL_MED
+        TEGRA_SOCTHERM_THROT_LEVEL_HIGH, TEGRA_SOCTHERM_THROT_LEVEL_NONE
+      - #cooling-cells: Should be 1. This cooling device only support on/off state.
+        See ./thermal.txt for a description of this property.
 
 Note:
 - the "critical" type trip points will be set to SOC_THERM hardware as the
 shut down temperature. Once the temperature of this thermal zone is higher
 than it, the system will be shutdown or reset by hardware.
+- the "hot" type trip points will be set to SOC_THERM hardware as the throttle
+temperature. Once the temperature of this thermal zone is higher
+than it, it will trigger the HW throttle event.
 
 Example :
 
 	soctherm@700e2000 {
 		compatible = "nvidia,tegra124-soctherm";
-		reg = <0x0 0x700e2000 0x0 0x1000>;
+		reg = <0x0 0x700e2000 0x0 0x600  /* SOC_THERM reg_base */
+			0x0 0x60006000 0x0 0x400>; /* CAR reg_base */
+		reg-names = "soctherm-reg", "car-reg";
 		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
 			<&tegra_car TEGRA124_CLK_SOC_THERM>;
@@ -44,6 +78,76 @@
 		reset-names = "soctherm";
 
 		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			/*
+			 * When the "heavy" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in 85% depth
+			 */
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-percent = <85>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * When the "light" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in 50% depth
+			 */
+			throttle_light: light {
+				nvidia,priority = <80>;
+				nvidia,cpu-throt-percent = <50>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * If these two devices are triggered in same time, the HW throttle
+			 * arbiter will select the highest priority as the final throttle
+			 * settings to skip cpu pulse.
+			 */
+		};
+	};
+
+Example: referring to Tegra132's "reg", "reg-names" and "throttle-cfgs" :
+
+	soctherm@700e2000 {
+		compatible = "nvidia,tegra132-soctherm";
+		reg = <0x0 0x700e2000 0x0 0x600  /* SOC_THERM reg_base */
+			0x0 0x70040000 0x0 0x200>; /* CCROC reg_base */
+		reg-names = "soctherm-reg", "ccroc-reg";
+
+		throttle-cfgs {
+			/*
+			 * When the "heavy" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in HIGH level
+			 */
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-level = <TEGRA_SOCTHERM_THROT_LEVEL_HIGH>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * When the "light" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in MED level
+			 */
+			throttle_light: light {
+				nvidia,priority = <80>;
+				nvidia,cpu-throt-level = <TEGRA_SOCTHERM_THROT_LEVEL_MED>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * If these two devices are triggered in same time, the HW throttle
+			 * arbiter will select the highest priority as the final throttle
+			 * settings to skip cpu pulse.
+			 */
+
+		};
 	};
 
 Example: referring to thermal sensors :
@@ -62,6 +166,19 @@
 					hysteresis = <1000>;
 					type = "critical";
 				};
+
+				cpu_throttle_trip: throttle-trip {
+					temperature = <100000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
 			};
                 };
 	};
diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.txt b/Documentation/devicetree/bindings/thermal/qcom-tsens.txt
new file mode 100644
index 0000000..292ed89
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.txt
@@ -0,0 +1,21 @@
+* QCOM SoC Temperature Sensor (TSENS)
+
+Required properties:
+- compatible :
+ - "qcom,msm8916-tsens" : For 8916 Family of SoCs
+ - "qcom,msm8974-tsens" : For 8974 Family of SoCs
+ - "qcom,msm8996-tsens" : For 8996 Family of SoCs
+
+- reg: Address range of the thermal registers
+- #thermal-sensor-cells : Should be 1. See ./thermal.txt for a description.
+- Refer to Documentation/devicetree/bindings/nvmem/nvmem.txt to know how to specify
+nvmem cells
+
+Example:
+tsens: thermal-sensor@900000 {
+		compatible = "qcom,msm8916-tsens";
+		reg = <0x4a8000 0x2000>;
+		nvmem-cells = <&tsens_caldata>, <&tsens_calsel>;
+		nvmem-cell-names = "caldata", "calsel";
+		#thermal-sensor-cells = <1>;
+	};
diff --git a/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt b/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt
index 6d63782..c6ae9c9d 100644
--- a/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt
@@ -7,6 +7,8 @@
 - reg			: Physical base address and size
 
 Optional properties:
+- clocks		: Input clock specifier. Refer to common clock
+			  bindings.
 - clock-frequency	: Frequency of clock in Hz
 - xlnx,wdt-enable-once	: 0 - Watchdog can be restarted
 			  1 - Watchdog can be enabled just once
@@ -17,6 +19,7 @@
 axi-timebase-wdt@40100000 {
 	clock-frequency = <50000000>;
 	compatible = "xlnx,xps-timebase-wdt-1.00.a";
+	clocks = <&clkc 15>;
 	reg = <0x40100000 0x10000>;
 	xlnx,wdt-enable-once = <0x0>;
 	xlnx,wdt-interval = <0x1b>;
diff --git a/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt b/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt
index 039c5ca..b949039 100644
--- a/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt
@@ -9,8 +9,7 @@
 
 Required properties
 
-- compatible 	: Must be one of: "st,stih407-lpc" "st,stih416-lpc"
-				  "st,stih415-lpc" "st,stid127-lpc"
+- compatible 	: Should be: "st,stih407-lpc"
 - reg		: LPC registers base address + size
 - interrupts    : LPC interrupt line number and associated flags
 - clocks	: Clock used by LPC device (See: ../clock/clock-bindings.txt)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a1489e1..58f3c10 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2470,6 +2470,11 @@
 	nfsrootdebug	[NFS] enable nfsroot debugging messages.
 			See Documentation/filesystems/nfs/nfsroot.txt.
 
+	nfs.callback_nr_threads=
+			[NFSv4] set the total number of threads that the
+			NFS client will assign to service NFSv4 callback
+			requests.
+
 	nfs.callback_tcpport=
 			[NFS] set the TCP port on which the NFSv4 callback
 			channel should listen.
@@ -2493,6 +2498,13 @@
 			of returning the full 64-bit number.
 			The default is to return 64-bit inode numbers.
 
+	nfs.max_session_cb_slots=
+			[NFSv4.1] Sets the maximum number of session
+			slots the client will assign to the callback
+			channel. This determines the maximum number of
+			callbacks the client will process in parallel for
+			a particular server.
+
 	nfs.max_session_slots=
 			[NFSv4.1] Sets the maximum number of session slots
 			the client will attempt to negotiate with the server.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index efc3f3d..ef473dc 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -49,6 +49,9 @@
 	.bind: bind the thermal zone device with a thermal cooling device.
 	.unbind: unbind the thermal zone device with a thermal cooling device.
 	.get_temp: get the current temperature of the thermal zone.
+	.set_trips: set the trip points window. Whenever the current temperature
+		    is updated, the trip points immediately below and above the
+		    current temperature are found.
 	.get_mode: get the current mode (enabled/disabled) of the thermal zone.
 	    - "enabled" means the kernel thermal management is enabled.
 	    - "disabled" will prevent kernel thermal driver action upon trip points
@@ -95,6 +98,10 @@
 			get_temp:	a pointer to a function that reads the
 					sensor temperature. This is mandatory
 					callback provided by sensor driver.
+			set_trips:      a pointer to a function that sets a
+					temperature window. When this window is
+					left the driver must inform the thermal
+					core via thermal_zone_device_update.
 			get_trend: 	a pointer to a function that reads the
 					sensor temperature trend.
 			set_emul_temp:	a pointer to a function that sets
@@ -140,6 +147,18 @@
 	Normally this function will not need to be called and the resource
 	management code will ensure that the resource is freed.
 
+1.1.7 int thermal_zone_get_slope(struct thermal_zone_device *tz)
+
+	This interface is used to read the slope attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
+1.1.8 int thermal_zone_get_offset(struct thermal_zone_device *tz)
+
+	This interface is used to read the offset attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
 1.2 thermal cooling device interface
 1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
 		void *devdata, struct thermal_cooling_device_ops *)
diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.txt
index 7f31125..ea27747 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -48,8 +48,10 @@
 	const struct attribute_group **groups;
 	const struct watchdog_info *info;
 	const struct watchdog_ops *ops;
+	const struct watchdog_governor *gov;
 	unsigned int bootstatus;
 	unsigned int timeout;
+	unsigned int pretimeout;
 	unsigned int min_timeout;
 	unsigned int max_timeout;
 	unsigned int min_hw_heartbeat_ms;
@@ -74,9 +76,11 @@
 * info: a pointer to a watchdog_info structure. This structure gives some
   additional information about the watchdog timer itself. (Like it's unique name)
 * ops: a pointer to the list of watchdog operations that the watchdog supports.
+* gov: a pointer to the assigned watchdog device pretimeout governor or NULL.
 * timeout: the watchdog timer's timeout value (in seconds).
   This is the time after which the system will reboot if user space does
   not send a heartbeat request if WDOG_ACTIVE is set.
+* pretimeout: the watchdog timer's pretimeout value (in seconds).
 * min_timeout: the watchdog timer's minimum timeout value (in seconds).
   If set, the minimum configurable value for 'timeout'.
 * max_timeout: the watchdog timer's maximum timeout value (in seconds),
@@ -121,6 +125,7 @@
 	int (*ping)(struct watchdog_device *);
 	unsigned int (*status)(struct watchdog_device *);
 	int (*set_timeout)(struct watchdog_device *, unsigned int);
+	int (*set_pretimeout)(struct watchdog_device *, unsigned int);
 	unsigned int (*get_timeleft)(struct watchdog_device *);
 	int (*restart)(struct watchdog_device *);
 	void (*ref)(struct watchdog_device *) __deprecated;
@@ -188,6 +193,23 @@
   If set_timeout is not provided but, WDIOF_SETTIMEOUT is set, the watchdog
   infrastructure updates the timeout value of the watchdog_device internally
   to the requested value.
+  If the pretimeout feature is used (WDIOF_PRETIMEOUT), then set_timeout must
+  also take care of checking if pretimeout is still valid and set up the timer
+  accordingly. This can't be done in the core without races, so it is the
+  duty of the driver.
+* set_pretimeout: this routine checks and changes the pretimeout value of
+  the watchdog. It is optional because not all watchdogs support pretimeout
+  notification. The timeout value is not an absolute time, but the number of
+  seconds before the actual timeout would happen. It returns 0 on success,
+  -EINVAL for "parameter out of range" and -EIO for "could not write value to
+  the watchdog". A value of 0 disables pretimeout notification.
+  (Note: the WDIOF_PRETIMEOUT needs to be set in the options field of the
+  watchdog's info structure).
+  If the watchdog driver does not have to perform any action but setting the
+  watchdog_device.pretimeout, this callback can be omitted. That means if
+  set_pretimeout is not provided but WDIOF_PRETIMEOUT is set, the watchdog
+  infrastructure updates the pretimeout value of the watchdog_device internally
+  to the requested value.
 * get_timeleft: this routines returns the time that's left before a reset.
 * restart: this routine restarts the machine. It returns 0 on success or a
   negative errno code for failure.
@@ -268,3 +290,14 @@
 * 128: default restart handler, use if no other handler is expected to be
   available, and/or if restart is sufficient to restart the entire system
 * 255: highest priority, will preempt all other restart handlers
+
+To raise a pretimeout notification, the following function should be used:
+
+void watchdog_notify_pretimeout(struct watchdog_device *wdd)
+
+The function can be called in the interrupt context. If watchdog pretimeout
+governor framework (kbuild CONFIG_WATCHDOG_PRETIMEOUT_GOV symbol) is enabled,
+an action is taken by a preconfigured pretimeout governor preassigned to
+the watchdog device. If watchdog pretimeout governor framework is not
+enabled, watchdog_notify_pretimeout() prints a notification message to
+the kernel log buffer.
diff --git a/MAINTAINERS b/MAINTAINERS
index 5e925a2..1fc66f0a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4775,15 +4775,6 @@
 S:	Maintained
 F:	drivers/iommu/exynos-iommu.c
 
-EXYNOS MIPI DISPLAY DRIVERS
-M:	Inki Dae <inki.dae@samsung.com>
-M:	Donghwa Lee <dh09.lee@samsung.com>
-M:	Kyungmin Park <kyungmin.park@samsung.com>
-L:	linux-fbdev@vger.kernel.org
-S:	Maintained
-F:	drivers/video/fbdev/exynos/exynos_mipi*
-F:	include/video/exynos_mipi*
-
 EZchip NPS platform support
 M:	Noam Camus <noamc@ezchip.com>
 S:	Supported
@@ -4962,12 +4953,9 @@
 F:	drivers/net/wan/sdla.c
 
 FRAMEBUFFER LAYER
-M:	Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
 M:	Tomi Valkeinen <tomi.valkeinen@ti.com>
 L:	linux-fbdev@vger.kernel.org
-W:	http://linux-fbdev.sourceforge.net/
 Q:	http://patchwork.kernel.org/project/linux-fbdev/list/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/plagnioj/linux-fbdev.git
 S:	Maintained
 F:	Documentation/fb/
 F:	drivers/video/
@@ -9201,6 +9189,14 @@
 F:	Documentation/devicetree/bindings/pci/versatile.txt
 F:	drivers/pci/host/pci-versatile.c
 
+PCI DRIVER FOR ARMADA 8K
+M:	Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+L:	linux-pci@vger.kernel.org
+L:	linux-arm-kernel@lists.infradead.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/pci/pci-armada8k.txt
+F:	drivers/pci/host/pcie-armada8k.c
+
 PCI DRIVER FOR APPLIEDMICRO XGENE
 M:	Tanmay Inamdar <tinamdar@apm.com>
 L:	linux-pci@vger.kernel.org
@@ -9247,6 +9243,7 @@
 L:	linux-pci@vger.kernel.org
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
+F:	Documentation/devicetree/bindings/pci/aardvark-pci.txt
 F:	drivers/pci/host/pci-aardvark.c
 
 PCI DRIVER FOR NVIDIA TEGRA
diff --git a/Makefile b/Makefile
index addb235..27f97b5 100644
--- a/Makefile
+++ b/Makefile
@@ -621,6 +621,7 @@
 
 KBUILD_CFLAGS	+= $(call cc-option,-fno-delete-null-pointer-checks,)
 KBUILD_CFLAGS	+= $(call cc-disable-warning,maybe-uninitialized,)
+KBUILD_CFLAGS	+= $(call cc-disable-warning,frame-address,)
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS	+= -Os
diff --git a/arch/arm/boot/dts/tegra124-jetson-tk1.dts b/arch/arm/boot/dts/tegra124-jetson-tk1.dts
index e52b824..53994f9 100644
--- a/arch/arm/boot/dts/tegra124-jetson-tk1.dts
+++ b/arch/arm/boot/dts/tegra124-jetson-tk1.dts
@@ -2045,44 +2045,32 @@
 	thermal-zones {
 		cpu {
 			trips {
-				trip {
+				cpu-shutdown-trip {
 					temperature = <101000>;
 					hysteresis = <0>;
 					type = "critical";
 				};
 			};
-
-			cooling-maps {
-				/* There are currently no cooling maps because there are no cooling devices */
-			};
 		};
 
 		mem {
 			trips {
-				trip {
+				mem-shutdown-trip {
 					temperature = <101000>;
 					hysteresis = <0>;
 					type = "critical";
 				};
 			};
-
-			cooling-maps {
-				/* There are currently no cooling maps because there are no cooling devices */
-			};
 		};
 
 		gpu {
 			trips {
-				trip {
+				gpu-shutdown-trip {
 					temperature = <101000>;
 					hysteresis = <0>;
 					type = "critical";
 				};
 			};
-
-			cooling-maps {
-				/* There are currently no cooling maps because there are no cooling devices */
-			};
 		};
 	};
 };
diff --git a/arch/arm/boot/dts/tegra124.dtsi b/arch/arm/boot/dts/tegra124.dtsi
index ea340f9..187a36c 100644
--- a/arch/arm/boot/dts/tegra124.dtsi
+++ b/arch/arm/boot/dts/tegra124.dtsi
@@ -851,7 +851,9 @@
 
 	soctherm: thermal-sensor@700e2000 {
 		compatible = "nvidia,tegra124-soctherm";
-		reg = <0x0 0x700e2000 0x0 0x1000>;
+		reg = <0x0 0x700e2000 0x0 0x600 /* SOC_THERM reg_base */
+			0x0 0x60006000 0x0 0x400>; /* CAR reg_base */
+		reg-names = "soctherm-reg", "car-reg";
 		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
 			<&tegra_car TEGRA124_CLK_SOC_THERM>;
@@ -859,6 +861,15 @@
 		resets = <&tegra_car 78>;
 		reset-names = "soctherm";
 		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-percent = <85>;
+
+				#cooling-cells = <2>;
+			};
+		};
 	};
 
 	dfll: clock@70110000 {
@@ -1154,6 +1165,26 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+			trips {
+				cpu-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+				cpu_throttle_trip: throttle-trip {
+					temperature = <100000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
 		};
 
 		mem {
@@ -1162,6 +1193,21 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_MEM>;
+
+			trips {
+				mem-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
 		};
 
 		gpu {
@@ -1170,6 +1216,26 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_GPU>;
+
+			trips {
+				gpu-shutdown-trip {
+					temperature = <101000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+				gpu_throttle_trip: throttle-trip {
+					temperature = <99000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
 		};
 
 		pllx {
@@ -1178,6 +1244,21 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_PLLX>;
+
+			trips {
+				pllx-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
 		};
 	};
 
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index 4e484f4..c58f684 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -168,8 +168,6 @@
 CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=y
 CONFIG_DRM_NXP_PTN3460=y
 CONFIG_DRM_PARADE_PS8622=y
-CONFIG_EXYNOS_VIDEO=y
-CONFIG_EXYNOS_MIPI_DSI=y
 CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_PLATFORM=y
 CONFIG_BACKLIGHT_PWM=y
diff --git a/arch/arm/mm/fault.h b/arch/arm/mm/fault.h
index 05ec5e0..67532f2 100644
--- a/arch/arm/mm/fault.h
+++ b/arch/arm/mm/fault.h
@@ -23,7 +23,6 @@
 #endif
 
 void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs);
-unsigned long search_exception_table(unsigned long addr);
 void early_abt_enable(void);
 
 #endif	/* __ARCH_ARM_FAULT_H */
diff --git a/arch/arm64/boot/dts/nvidia/tegra132.dtsi b/arch/arm64/boot/dts/nvidia/tegra132.dtsi
index 2013f89..3f3a46a 100644
--- a/arch/arm64/boot/dts/nvidia/tegra132.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra132.dtsi
@@ -4,6 +4,7 @@
 #include <dt-bindings/pinctrl/pinctrl-tegra.h>
 #include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/thermal/tegra124-soctherm.h>
 
 / {
 	compatible = "nvidia,tegra132", "nvidia,tegra124";
@@ -727,8 +728,10 @@
 	};
 
 	soctherm: thermal-sensor@700e2000 {
-		compatible = "nvidia,tegra124-soctherm";
-		reg = <0x0 0x700e2000 0x0 0x1000>;
+		compatible = "nvidia,tegra132-soctherm";
+		reg = <0x0 0x700e2000 0x0 0x600 /* 0: SOC_THERM reg_base */
+			0x0 0x70040000 0x0 0x200>; /* 2: CCROC reg_base */
+		reg-names = "soctherm-reg", "ccroc-reg";
 		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
 			<&tegra_car TEGRA124_CLK_SOC_THERM>;
@@ -736,6 +739,118 @@
 		resets = <&tegra_car 78>;
 		reset-names = "soctherm";
 		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-level = <TEGRA_SOCTHERM_THROT_LEVEL_HIGH>;
+
+				#cooling-cells = <2>;
+			};
+		};
+	};
+
+	thermal-zones {
+		cpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+			trips {
+				cpu_shutdown_trip {
+					temperature = <105000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+
+				cpu_throttle_trip: throttle-trip {
+					temperature = <102000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		mem {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_MEM>;
+
+			trips {
+				mem_shutdown_trip {
+					temperature = <101000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
+		gpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_GPU>;
+
+			trips {
+				gpu_shutdown_trip {
+					temperature = <101000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+
+				gpu_throttle_trip: throttle-trip {
+					temperature = <99000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		pllx {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_PLLX>;
+
+			trips {
+				pllx_shutdown_trip {
+					temperature = <105000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
 	};
 
 	ahub@70300000 {
diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
index f673979..46045fe 100644
--- a/arch/arm64/boot/dts/nvidia/tegra210.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
@@ -3,6 +3,7 @@
 #include <dt-bindings/memory/tegra210-mc.h>
 #include <dt-bindings/pinctrl/pinctrl-tegra.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/thermal/tegra124-soctherm.h>
 
 / {
 	compatible = "nvidia,tegra210";
@@ -1159,4 +1160,130 @@
 				(GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>;
 		interrupt-parent = <&gic>;
 	};
+
+	soctherm: thermal-sensor@700e2000 {
+		compatible = "nvidia,tegra210-soctherm";
+		reg = <0x0 0x700e2000 0x0 0x600 /* SOC_THERM reg_base */
+			0x0 0x60006000 0x0 0x400>; /* CAR reg_base */
+		reg-names = "soctherm-reg", "car-reg";
+		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&tegra_car TEGRA210_CLK_TSENSOR>,
+			<&tegra_car TEGRA210_CLK_SOC_THERM>;
+		clock-names = "tsensor", "soctherm";
+		resets = <&tegra_car 78>;
+		reset-names = "soctherm";
+		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-percent = <85>;
+
+				#cooling-cells = <2>;
+			};
+		};
+	};
+
+	thermal-zones {
+		cpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+			trips {
+				cpu-shutdown-trip {
+					temperature = <102500>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+
+				cpu_throttle_trip: throttle-trip {
+					temperature = <98500>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		mem {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_MEM>;
+
+			trips {
+				mem-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
+		gpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_GPU>;
+
+			trips {
+				gpu-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+
+				gpu_throttle_trip: throttle-trip {
+					temperature = <100000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		pllx {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_PLLX>;
+
+			trips {
+				pllx-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
+	};
 };
diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h
index 07d7a7e..a0513d4 100644
--- a/arch/frv/include/asm/pgtable.h
+++ b/arch/frv/include/asm/pgtable.h
@@ -522,5 +522,6 @@
 #ifndef __ASSEMBLY__
 extern void __init paging_init(void);
 #endif /* !__ASSEMBLY__ */
+#define HAVE_ARCH_UNMAPPED_AREA
 
 #endif /* _ASM_PGTABLE_H */
diff --git a/arch/frv/include/asm/segment.h b/arch/frv/include/asm/segment.h
index 4377c89..2305142 100644
--- a/arch/frv/include/asm/segment.h
+++ b/arch/frv/include/asm/segment.h
@@ -32,7 +32,6 @@
 #define get_ds()		(KERNEL_DS)
 #define get_fs()		(__current_thread_info->addr_limit)
 #define segment_eq(a, b)	((a).seg == (b).seg)
-#define __kernel_ds_p()		segment_eq(get_fs(), KERNEL_DS)
 #define get_addr_limit()	(get_fs().seg)
 
 #define set_fs(_x)					\
diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h
index 87d9e34..c0f4057e 100644
--- a/arch/frv/include/asm/uaccess.h
+++ b/arch/frv/include/asm/uaccess.h
@@ -20,8 +20,6 @@
 #include <asm/segment.h>
 #include <asm/sections.h>
 
-#define HAVE_ARCH_UNMAPPED_AREA	/* we decide where to put mmaps */
-
 #define __ptr(x) ((unsigned long __force *)(x))
 
 #define VERIFY_READ	0
diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h
index 1bdf152..36deeb3 100644
--- a/arch/m68k/include/asm/uaccess_no.h
+++ b/arch/m68k/include/asm/uaccess_no.h
@@ -44,9 +44,6 @@
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 8266767..253a67e 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -71,9 +71,6 @@
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
 #ifndef CONFIG_MMU
 
 /* Check against bounds of physical memory */
diff --git a/arch/mips/include/asm/extable.h b/arch/mips/include/asm/extable.h
new file mode 100644
index 0000000..dce7a62
--- /dev/null
+++ b/arch/mips/include/asm/extable.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_EXTABLE_H
+#define _ASM_EXTABLE_H
+
+struct exception_table_entry
+{
+	unsigned long insn;
+	unsigned long nextinsn;
+};
+
+struct pt_regs;
+extern int fixup_exception(struct pt_regs *regs);
+
+#endif
diff --git a/arch/mips/include/asm/module.h b/arch/mips/include/asm/module.h
index 0aaf9a0..702c273 100644
--- a/arch/mips/include/asm/module.h
+++ b/arch/mips/include/asm/module.h
@@ -3,7 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/elf.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 
 struct mod_arch_specific {
 	/* Data Bus Error exception tables */
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 21a2aab..4daf839 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -16,6 +16,7 @@
 #include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm-eva.h>
+#include <asm/extable.h>
 
 /*
  * The fs value determines whether argument validity checking should be
@@ -1485,12 +1486,4 @@
 	return res;
 }
 
-struct exception_table_entry
-{
-	unsigned long insn;
-	unsigned long nextinsn;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
-
 #endif /* _ASM_UACCESS_H */
diff --git a/arch/mips/lasat/picvue_proc.c b/arch/mips/lasat/picvue_proc.c
index 27533c1..dd292dc 100644
--- a/arch/mips/lasat/picvue_proc.c
+++ b/arch/mips/lasat/picvue_proc.c
@@ -16,6 +16,7 @@
 
 #include <linux/timer.h>
 #include <linux/mutex.h>
+#include <linux/uaccess.h>
 
 #include "picvue.h"
 
diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h
index 769d5ed..b10ba12 100644
--- a/arch/mn10300/include/asm/processor.h
+++ b/arch/mn10300/include/asm/processor.h
@@ -18,7 +18,6 @@
 #include <asm/page.h>
 #include <asm/ptrace.h>
 #include <asm/cpu-regs.h>
-#include <asm/uaccess.h>
 #include <asm/current.h>
 
 /* Forward declaration, a strange C thing */
diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index d012e87..2eedf6f 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h
@@ -38,7 +38,6 @@
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current_thread_info()->addr_limit)
 #define set_fs(x)	(current_thread_info()->addr_limit = (x))
-#define __kernel_ds_p() (current_thread_info()->addr_limit.seg == 0x9FFFFFFF)
 
 #define segment_eq(a, b) ((a).seg == (b).seg)
 
@@ -72,12 +71,6 @@
 #define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0)
 #define __access_ok(addr, size)     (__range_ok((addr), (size)) == 0)
 
-static inline int verify_area(int type, const void *addr, unsigned long size)
-{
-	return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-
 /*
  * The exception table consists of pairs of addresses: the first is the
  * address of an instruction that is allowed to fault, and the second is
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index dfd0301..cd8cb1d 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -75,7 +75,7 @@
 		struct fpucontext *buf;
 		err |= __get_user(buf, &sc->fpucontext);
 		if (buf) {
-			if (verify_area(VERIFY_READ, buf, sizeof(*buf)))
+			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
 				goto badframe;
 			err |= fpu_restore_sigcontext(buf);
 		}
@@ -98,7 +98,7 @@
 	long d0;
 
 	frame = (struct sigframe __user *) current_frame()->sp;
-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.oldmask))
 		goto badframe;
@@ -130,7 +130,7 @@
 	long d0;
 
 	frame = (struct rt_sigframe __user *) current_frame()->sp;
-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h
index 5cc6b4f..140faa1 100644
--- a/arch/openrisc/include/asm/uaccess.h
+++ b/arch/openrisc/include/asm/uaccess.h
@@ -82,10 +82,6 @@
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-extern void sort_exception_table(void);
-
 /*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index e44bdb9..c2c43f7 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -83,10 +83,10 @@
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, (unsigned long)pgd_val(e))
 
 /* This is the size of the initially mapped kernel memory */
-#if defined(CONFIG_64BIT) || defined(CONFIG_SMP)
-#define KERNEL_INITIAL_ORDER	25	/* 1<<25 = 32MB */
+#if defined(CONFIG_64BIT)
+#define KERNEL_INITIAL_ORDER	26	/* 1<<26 = 64MB */
 #else
-#define KERNEL_INITIAL_ORDER	24	/* 1<<24 = 16MB */
+#define KERNEL_INITIAL_ORDER	25	/* 1<<25 = 32MB */
 #endif
 #define KERNEL_INITIAL_SIZE	(1 << KERNEL_INITIAL_ORDER)
 
diff --git a/arch/parisc/include/asm/traps.h b/arch/parisc/include/asm/traps.h
index 5e953ab..6367023 100644
--- a/arch/parisc/include/asm/traps.h
+++ b/arch/parisc/include/asm/traps.h
@@ -11,6 +11,7 @@
 void die_if_kernel(char *str, struct pt_regs *regs, long err);
 
 /* mm/fault.c */
+const char *trap_name(unsigned long code);
 void do_page_fault(struct pt_regs *regs, unsigned long code,
 		unsigned long address);
 #endif
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 97d6b20..378df92 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -458,8 +458,8 @@
 	}
 
 	printk("\n");
-	printk(KERN_CRIT "%s: Code=%d regs=%p (Addr=" RFMT ")\n",
-			msg, code, regs, offset);
+	pr_crit("%s: Code=%d (%s) regs=%p (Addr=" RFMT ")\n",
+		msg, code, trap_name(code), regs, offset);
 	show_regs(regs);
 
 	spin_unlock(&terminate_lock);
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index b37787d..3d6ef1b 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -90,8 +90,9 @@
 	/* Start of data section */
 	_sdata = .;
 
-	RO_DATA_SECTION(8)
-
+	/* Architecturally we need to keep __gp below 0x1000000 and thus
+	 * in front of RO_DATA_SECTION() which stores lots of tracepoint
+	 * and ftrace symbols. */
 #ifdef CONFIG_64BIT
 	. = ALIGN(16);
 	/* Linkage tables */
@@ -106,6 +107,12 @@
 	}
 #endif
 
+	RO_DATA_SECTION(8)
+
+	/* RO because of BUILDTIME_EXTABLE_SORT */
+	EXCEPTION_TABLE(8)
+	NOTES
+
 	/* unwind info */
 	.PARISC.unwind : {
 		__start___unwind = .;
@@ -121,9 +128,6 @@
 	. = ALIGN(HUGEPAGE_SIZE);
 	data_start = .;
 
-	EXCEPTION_TABLE(8)
-	NOTES
-
 	/* Data */
 	RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, PAGE_SIZE)
 
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 47a6ca4..8ff9253 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -14,7 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/uaccess.h>
 
 #include <asm/traps.h>
@@ -204,6 +204,16 @@
 	[28] "Unaligned data reference trap",
 };
 
+const char *trap_name(unsigned long code)
+{
+	const char *t = NULL;
+
+	if (code < ARRAY_SIZE(trap_description))
+		t = trap_description[code];
+
+	return t ? t : "Unknown trap";
+}
+
 /*
  * Print out info about fatal segfaults, if the show_unhandled_signals
  * sysctl is set:
@@ -213,8 +223,6 @@
 		unsigned long address, struct task_struct *tsk,
 		struct vm_area_struct *vma)
 {
-	const char *trap_name = NULL;
-
 	if (!unhandled_signal(tsk, SIGSEGV))
 		return;
 
@@ -226,10 +234,7 @@
 	    tsk->comm, code, address);
 	print_vma_addr(KERN_CONT " in ", regs->iaoq[0]);
 
-	if (code < ARRAY_SIZE(trap_description))
-		trap_name = trap_description[code];
-	pr_warn(KERN_CONT " trap #%lu: %s%c", code,
-		trap_name ? trap_name : "unknown",
+	pr_cont(" trap #%lu: %s%c", code, trap_name(code),
 		vma ? ',':'\n');
 
 	if (vma)
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 356f384..e02ada3 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -105,6 +105,8 @@
 	else
 		panic("get_memblock() failed.\n");
 
+	memset(__va(phys), 0, size);
+
 	return __va(phys);
 }
 
diff --git a/arch/score/include/asm/extable.h b/arch/score/include/asm/extable.h
new file mode 100644
index 0000000..c4423ccf
--- /dev/null
+++ b/arch/score/include/asm/extable.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_SCORE_EXTABLE_H
+#define _ASM_SCORE_EXTABLE_H
+
+struct exception_table_entry {
+	unsigned long insn;
+	unsigned long fixup;
+};
+
+struct pt_regs;
+extern int fixup_exception(struct pt_regs *regs);
+#endif
diff --git a/arch/score/include/asm/module.h b/arch/score/include/asm/module.h
index abf395b..6dc1f29 100644
--- a/arch/score/include/asm/module.h
+++ b/arch/score/include/asm/module.h
@@ -2,7 +2,7 @@
 #define _ASM_SCORE_MODULE_H
 
 #include <linux/list.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 #include <asm-generic/module.h>
 
 struct mod_arch_specific {
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index 01aec8c..db58ab9 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/thread_info.h>
+#include <asm/extable.h>
 
 #define VERIFY_READ		0
 #define VERIFY_WRITE		1
@@ -420,12 +421,5 @@
 		return __strnlen_user(str, len);
 }
 
-struct exception_table_entry {
-	unsigned long insn;
-	unsigned long fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
-
 #endif /* __SCORE_UACCESS_H */
 
diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h
index 92ade79..a38d0c7 100644
--- a/arch/sh/include/asm/uaccess.h
+++ b/arch/sh/include/asm/uaccess.h
@@ -192,8 +192,6 @@
 #endif
 
 int fixup_exception(struct pt_regs *regs);
-/* Returns 0 if exception not found and fixup.unit otherwise.  */
-unsigned long search_exception_table(unsigned long addr);
 const struct exception_table_entry *search_exception_tables(unsigned long addr);
 
 extern void *set_exception_table_vec(unsigned int vec, void *handler);
diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h
index 9331083..3f2d403 100644
--- a/arch/sparc/include/asm/elf_64.h
+++ b/arch/sparc/include/asm/elf_64.h
@@ -7,7 +7,7 @@
 
 #include <asm/ptrace.h>
 #include <asm/processor.h>
-#include <asm/uaccess.h>
+#include <asm/extable_64.h>
 #include <asm/spitfire.h>
 
 /*
diff --git a/arch/sparc/include/asm/extable_64.h b/arch/sparc/include/asm/extable_64.h
new file mode 100644
index 0000000..1121cb0
--- /dev/null
+++ b/arch/sparc/include/asm/extable_64.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_EXTABLE64_H
+#define __ASM_EXTABLE64_H
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue.  No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+        unsigned int insn, fixup;
+};
+
+#endif
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 37a315d..b68acc5 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -13,6 +13,7 @@
 #include <asm/asi.h>
 #include <asm/spitfire.h>
 #include <asm-generic/uaccess-unaligned.h>
+#include <asm/extable_64.h>
 #endif
 
 #ifndef __ASSEMBLY__
@@ -81,23 +82,6 @@
 	return 1;
 }
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-        unsigned int insn, fixup;
-};
-
 void __ret_efault(void);
 void __retl_efault(void);
 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 61518cf..872877d 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,7 +4,6 @@
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
 #include <asm/special_insns.h>
-#include <asm/uaccess.h>
 
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
new file mode 100644
index 0000000..b8ad261
--- /dev/null
+++ b/arch/x86/include/asm/extable.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_EXTABLE_H
+#define _ASM_X86_EXTABLE_H
+/*
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+	int insn, fixup, handler;
+};
+struct pt_regs;
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta)			\
+	do {							\
+		(a)->fixup = (b)->fixup + (delta);		\
+		(b)->fixup = (tmp).fixup - (delta);		\
+		(a)->handler = (b)->handler + (delta);		\
+		(b)->handler = (tmp).handler - (delta);		\
+	} while (0)
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+
+#endif
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 13b6cdd..2f75f30 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,7 @@
 #define _ASM_X86_SECTIONS_H
 
 #include <asm-generic/sections.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 
 extern char __brk_base[], __brk_limit[];
 extern struct exception_table_entry __stop___ex_table[];
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2131c4c..faf3687 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -11,6 +11,7 @@
 #include <asm/asm.h>
 #include <asm/page.h>
 #include <asm/smap.h>
+#include <asm/extable.h>
 
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
@@ -91,37 +92,6 @@
 	likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
- * The exception table consists of triples of addresses relative to the
- * exception table entry itself. The first address is of an instruction
- * that is allowed to fault, the second is the target at which the program
- * should continue. The third is a handler function to deal with the fault
- * caused by the instruction in the first field.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-	int insn, fixup, handler;
-};
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-#define swap_ex_entry_fixup(a, b, tmp, delta)			\
-	do {							\
-		(a)->fixup = (b)->fixup + (delta);		\
-		(b)->fixup = (tmp).fixup - (delta);		\
-		(a)->handler = (b)->handler + (delta);		\
-		(b)->handler = (tmp).handler - (delta);		\
-	} while (0)
-
-extern int fixup_exception(struct pt_regs *regs, int trapnr);
-extern bool ex_has_fault_handler(unsigned long ip);
-extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
-
-/*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
  *
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4dc1334..9f72ca3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -5,7 +5,7 @@
  */
 #include <linux/sched.h>		/* test_thread_flag(), ...	*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
-#include <linux/extable.h>		/* search_exception_table	*/
+#include <linux/extable.h>		/* search_exception_tables	*/
 #include <linux/bootmem.h>		/* max_low_pfn			*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
diff --git a/arch/xtensa/include/asm/asm-uaccess.h b/arch/xtensa/include/asm/asm-uaccess.h
new file mode 100644
index 0000000..a7a1100
--- /dev/null
+++ b/arch/xtensa/include/asm/asm-uaccess.h
@@ -0,0 +1,160 @@
+/*
+ * arch/xtensa/include/asm/asm-uaccess.h
+ *
+ * User space memory access functions
+ *
+ * These routines provide basic accessing functions to the user memory
+ * space for the kernel. This header file provides them as assembly macros.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2001 - 2005 Tensilica Inc.
+ */
+
+#ifndef _XTENSA_ASM_UACCESS_H
+#define _XTENSA_ASM_UACCESS_H
+
+#include <linux/errno.h>
+#include <asm/types.h>
+
+#define VERIFY_READ    0
+#define VERIFY_WRITE   1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+
+/*
+ * These assembly macros mirror the C macros in asm/uaccess.h.  They
+ * should always have identical functionality.  See
+ * arch/xtensa/kernel/sys.S for usage.
+ */
+
+#define KERNEL_DS	0
+#define USER_DS		1
+
+#define get_ds		(KERNEL_DS)
+
+/*
+ * get_fs reads current->thread.current_ds into a register.
+ * On Entry:
+ * 	<ad>	anything
+ * 	<sp>	stack
+ * On Exit:
+ * 	<ad>	contains current->thread.current_ds
+ */
+	.macro	get_fs	ad, sp
+	GET_CURRENT(\ad,\sp)
+#if THREAD_CURRENT_DS > 1020
+	addi	\ad, \ad, TASK_THREAD
+	l32i	\ad, \ad, THREAD_CURRENT_DS - TASK_THREAD
+#else
+	l32i	\ad, \ad, THREAD_CURRENT_DS
+#endif
+	.endm
+
+/*
+ * set_fs sets current->thread.current_ds to some value.
+ * On Entry:
+ *	<at>	anything (temp register)
+ *	<av>	value to write
+ *	<sp>	stack
+ * On Exit:
+ *	<at>	destroyed (actually, current)
+ *	<av>	preserved, value to write
+ */
+	.macro	set_fs	at, av, sp
+	GET_CURRENT(\at,\sp)
+	s32i	\av, \at, THREAD_CURRENT_DS
+	.endm
+
+/*
+ * kernel_ok determines whether we should bypass addr/size checking.
+ * See the equivalent C-macro version in asm/uaccess.h for clarity.
+ * On success, kernel_ok branches to a label indicated by parameter
+ * <success>.  This implies that the macro falls through to the next
+ * instruction on an error.
+ *
+ * Note that while this macro can be used independently, we designed
+ * it for optimal use in the access_ok macro below (i.e., we fall
+ * through on error).
+ *
+ * On Entry:
+ * 	<at>		anything (temp register)
+ * 	<success>	label to branch to on success; implies
+ * 			fall-through macro on error
+ * 	<sp>		stack pointer
+ * On Exit:
+ * 	<at>		destroyed (actually, current->thread.current_ds)
+ */
+
+#if ((KERNEL_DS != 0) || (USER_DS == 0))
+# error Assembly macro kernel_ok fails
+#endif
+	.macro	kernel_ok  at, sp, success
+	get_fs	\at, \sp
+	beqz	\at, \success
+	.endm
+
+/*
+ * user_ok determines whether the access to user-space memory is allowed.
+ * See the equivalent C-macro version in asm/uaccess.h for clarity.
+ *
+ * On error, user_ok branches to a label indicated by parameter
+ * <error>.  This implies that the macro falls through to the next
+ * instruction on success.
+ *
+ * Note that while this macro can be used independently, we designed
+ * it for optimal use in the access_ok macro below (i.e., we fall
+ * through on success).
+ *
+ * On Entry:
+ * 	<aa>	register containing memory address
+ * 	<as>	register containing memory size
+ * 	<at>	temp register
+ * 	<error>	label to branch to on error; implies fall-through
+ * 		macro on success
+ * On Exit:
+ * 	<aa>	preserved
+ * 	<as>	preserved
+ * 	<at>	destroyed (actually, (TASK_SIZE - size))
+ */
+	.macro	user_ok	aa, as, at, error
+	movi	\at, __XTENSA_UL_CONST(TASK_SIZE)
+	bgeu	\as, \at, \error
+	sub	\at, \at, \as
+	bgeu	\aa, \at, \error
+	.endm
+
+/*
+ * access_ok determines whether a memory access is allowed.  See the
+ * equivalent C-macro version in asm/uaccess.h for clarity.
+ *
+ * On error, access_ok branches to a label indicated by parameter
+ * <error>.  This implies that the macro falls through to the next
+ * instruction on success.
+ *
+ * Note that we assume success is the common case, and we optimize the
+ * branch fall-through case on success.
+ *
+ * On Entry:
+ * 	<aa>	register containing memory address
+ * 	<as>	register containing memory size
+ * 	<at>	temp register
+ * 	<sp>	stack pointer
+ * 	<error>	label to branch to on error; implies fall-through
+ * 		macro on success
+ * On Exit:
+ * 	<aa>	preserved
+ * 	<as>	preserved
+ * 	<at>	destroyed
+ */
+	.macro	access_ok  aa, as, at, sp, error
+	kernel_ok  \at, \sp, .Laccess_ok_\@
+	user_ok    \aa, \as, \at, \error
+.Laccess_ok_\@:
+	.endm
+
+#endif	/* _XTENSA_ASM_UACCESS_H */
diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h
index 147b26e..848a3d7 100644
--- a/arch/xtensa/include/asm/uaccess.h
+++ b/arch/xtensa/include/asm/uaccess.h
@@ -17,153 +17,12 @@
 #define _XTENSA_UACCESS_H
 
 #include <linux/errno.h>
-#ifndef __ASSEMBLY__
 #include <linux/prefetch.h>
-#endif
 #include <asm/types.h>
 
 #define VERIFY_READ    0
 #define VERIFY_WRITE   1
 
-#ifdef __ASSEMBLY__
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/processor.h>
-
-/*
- * These assembly macros mirror the C macros that follow below.  They
- * should always have identical functionality.  See
- * arch/xtensa/kernel/sys.S for usage.
- */
-
-#define KERNEL_DS	0
-#define USER_DS		1
-
-#define get_ds		(KERNEL_DS)
-
-/*
- * get_fs reads current->thread.current_ds into a register.
- * On Entry:
- * 	<ad>	anything
- * 	<sp>	stack
- * On Exit:
- * 	<ad>	contains current->thread.current_ds
- */
-	.macro	get_fs	ad, sp
-	GET_CURRENT(\ad,\sp)
-#if THREAD_CURRENT_DS > 1020
-	addi	\ad, \ad, TASK_THREAD
-	l32i	\ad, \ad, THREAD_CURRENT_DS - TASK_THREAD
-#else
-	l32i	\ad, \ad, THREAD_CURRENT_DS
-#endif
-	.endm
-
-/*
- * set_fs sets current->thread.current_ds to some value.
- * On Entry:
- *	<at>	anything (temp register)
- *	<av>	value to write
- *	<sp>	stack
- * On Exit:
- *	<at>	destroyed (actually, current)
- *	<av>	preserved, value to write
- */
-	.macro	set_fs	at, av, sp
-	GET_CURRENT(\at,\sp)
-	s32i	\av, \at, THREAD_CURRENT_DS
-	.endm
-
-/*
- * kernel_ok determines whether we should bypass addr/size checking.
- * See the equivalent C-macro version below for clarity.
- * On success, kernel_ok branches to a label indicated by parameter
- * <success>.  This implies that the macro falls through to the next
- * insruction on an error.
- *
- * Note that while this macro can be used independently, we designed
- * in for optimal use in the access_ok macro below (i.e., we fall
- * through on error).
- *
- * On Entry:
- * 	<at>		anything (temp register)
- * 	<success>	label to branch to on success; implies
- * 			fall-through macro on error
- * 	<sp>		stack pointer
- * On Exit:
- * 	<at>		destroyed (actually, current->thread.current_ds)
- */
-
-#if ((KERNEL_DS != 0) || (USER_DS == 0))
-# error Assembly macro kernel_ok fails
-#endif
-	.macro	kernel_ok  at, sp, success
-	get_fs	\at, \sp
-	beqz	\at, \success
-	.endm
-
-/*
- * user_ok determines whether the access to user-space memory is allowed.
- * See the equivalent C-macro version below for clarity.
- *
- * On error, user_ok branches to a label indicated by parameter
- * <error>.  This implies that the macro falls through to the next
- * instruction on success.
- *
- * Note that while this macro can be used independently, we designed
- * in for optimal use in the access_ok macro below (i.e., we fall
- * through on success).
- *
- * On Entry:
- * 	<aa>	register containing memory address
- * 	<as>	register containing memory size
- * 	<at>	temp register
- * 	<error>	label to branch to on error; implies fall-through
- * 		macro on success
- * On Exit:
- * 	<aa>	preserved
- * 	<as>	preserved
- * 	<at>	destroyed (actually, (TASK_SIZE + 1 - size))
- */
-	.macro	user_ok	aa, as, at, error
-	movi	\at, __XTENSA_UL_CONST(TASK_SIZE)
-	bgeu	\as, \at, \error
-	sub	\at, \at, \as
-	bgeu	\aa, \at, \error
-	.endm
-
-/*
- * access_ok determines whether a memory access is allowed.  See the
- * equivalent C-macro version below for clarity.
- *
- * On error, access_ok branches to a label indicated by parameter
- * <error>.  This implies that the macro falls through to the next
- * instruction on success.
- *
- * Note that we assume success is the common case, and we optimize the
- * branch fall-through case on success.
- *
- * On Entry:
- * 	<aa>	register containing memory address
- * 	<as>	register containing memory size
- * 	<at>	temp register
- * 	<sp>
- * 	<error>	label to branch to on error; implies fall-through
- * 		macro on success
- * On Exit:
- * 	<aa>	preserved
- * 	<as>	preserved
- * 	<at>	destroyed
- */
-	.macro	access_ok  aa, as, at, sp, error
-	kernel_ok  \at, \sp, .Laccess_ok_\@
-	user_ok    \aa, \as, \at, \error
-.Laccess_ok_\@:
-	.endm
-
-#else /* __ASSEMBLY__ not defined */
-
 #include <linux/sched.h>
 
 /*
@@ -495,16 +354,4 @@
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup.unit otherwise.  */
-
-extern unsigned long search_exception_table(unsigned long addr);
-extern void sort_exception_table(void);
-
-/* Returns the new pc */
-#define fixup_exception(map_reg, fixup_unit, pc)                \
-({                                                              \
-	fixup_unit;                                             \
-})
-
-#endif	/* __ASSEMBLY__ */
 #endif	/* _XTENSA_UACCESS_H */
diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S
index a482df5..6911e38 100644
--- a/arch/xtensa/kernel/coprocessor.S
+++ b/arch/xtensa/kernel/coprocessor.S
@@ -17,7 +17,7 @@
 #include <asm/processor.h>
 #include <asm/coprocessor.h>
 #include <asm/thread_info.h>
-#include <asm/uaccess.h>
+#include <asm/asm-uaccess.h>
 #include <asm/unistd.h>
 #include <asm/ptrace.h>
 #include <asm/current.h>
diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index fa04d9d..f5ef3cc 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -17,7 +17,7 @@
 #include <asm/processor.h>
 #include <asm/coprocessor.h>
 #include <asm/thread_info.h>
-#include <asm/uaccess.h>
+#include <asm/asm-uaccess.h>
 #include <asm/unistd.h>
 #include <asm/ptrace.h>
 #include <asm/current.h>
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index f4ebe39..35e8fbc 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -520,7 +520,8 @@
 	if (!tz->tz_enabled)
 		return;
 
-	thermal_zone_device_update(tz->thermal_zone);
+	thermal_zone_device_update(tz->thermal_zone,
+				   THERMAL_EVENT_UNSPECIFIED);
 }
 
 /* sys I/F for generic thermal sysfs support */
diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c
index 480a777..7c19d9b 100644
--- a/drivers/char/tb0219.c
+++ b/drivers/char/tb0219.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/io.h>
 #include <asm/reboot.h>
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3f31ca3..5fa36eb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -471,9 +471,9 @@
 		/* Yes, the mii is overlaid on the ifreq.ifr_ifru */
 		strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
 		mii = if_mii(&ifr);
-		if (IOCTL(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
+		if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
 			mii->reg_num = MII_BMSR;
-			if (IOCTL(slave_dev, &ifr, SIOCGMIIREG) == 0)
+			if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
 				return mii->val_out & BMSR_LSTATUS;
 		}
 	}
diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c
index e4a5b7e..4fce494 100644
--- a/drivers/pci/host/pci-aardvark.c
+++ b/drivers/pci/host/pci-aardvark.c
@@ -230,20 +230,20 @@
 
 static int advk_pcie_wait_for_link(struct advk_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	int retries;
 
 	/* check if the link is up or not */
 	for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
 		if (advk_pcie_link_up(pcie)) {
-			dev_info(&pcie->pdev->dev, "link up\n");
+			dev_info(dev, "link up\n");
 			return 0;
 		}
 
 		usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
 	}
 
-	dev_err(&pcie->pdev->dev, "link never came up\n");
-
+	dev_err(dev, "link never came up\n");
 	return -ETIMEDOUT;
 }
 
@@ -376,6 +376,7 @@
 
 static void advk_pcie_check_pio_status(struct advk_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	u32 reg;
 	unsigned int status;
 	char *strcomp_status, *str_posted;
@@ -407,12 +408,13 @@
 	else
 		str_posted = "Posted";
 
-	dev_err(&pcie->pdev->dev, "%s PIO Response Status: %s, %#x @ %#x\n",
+	dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n",
 		str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS));
 }
 
 static int advk_pcie_wait_pio(struct advk_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	unsigned long timeout;
 
 	timeout = jiffies + msecs_to_jiffies(PIO_TIMEOUT_MS);
@@ -426,7 +428,7 @@
 			return 0;
 	}
 
-	dev_err(&pcie->pdev->dev, "config read/write timed out\n");
+	dev_err(dev, "config read/write timed out\n");
 	return -ETIMEDOUT;
 }
 
@@ -560,10 +562,11 @@
 
 static void advk_pcie_free_msi(struct advk_pcie *pcie, int hwirq)
 {
+	struct device *dev = &pcie->pdev->dev;
+
 	mutex_lock(&pcie->msi_used_lock);
 	if (!test_bit(hwirq, pcie->msi_irq_in_use))
-		dev_err(&pcie->pdev->dev, "trying to free unused MSI#%d\n",
-			hwirq);
+		dev_err(dev, "trying to free unused MSI#%d\n", hwirq);
 	else
 		clear_bit(hwirq, pcie->msi_irq_in_use);
 	mutex_unlock(&pcie->msi_used_lock);
@@ -910,6 +913,7 @@
 
 static int advk_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct advk_pcie *pcie;
 	struct resource *res;
 	struct pci_bus *bus, *child;
@@ -917,31 +921,29 @@
 	struct device_node *msi_node;
 	int ret, irq;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(struct advk_pcie),
-			    GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(struct advk_pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
 	pcie->pdev = pdev;
-	platform_set_drvdata(pdev, pcie);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	pcie->base = devm_ioremap_resource(&pdev->dev, res);
+	pcie->base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->base))
 		return PTR_ERR(pcie->base);
 
 	irq = platform_get_irq(pdev, 0);
-	ret = devm_request_irq(&pdev->dev, irq, advk_pcie_irq_handler,
+	ret = devm_request_irq(dev, irq, advk_pcie_irq_handler,
 			       IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie",
 			       pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to register interrupt\n");
+		dev_err(dev, "Failed to register interrupt\n");
 		return ret;
 	}
 
 	ret = advk_pcie_parse_request_of_pci_ranges(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to parse resources\n");
+		dev_err(dev, "Failed to parse resources\n");
 		return ret;
 	}
 
@@ -949,24 +951,24 @@
 
 	ret = advk_pcie_init_irq_domain(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to initialize irq\n");
+		dev_err(dev, "Failed to initialize irq\n");
 		return ret;
 	}
 
 	ret = advk_pcie_init_msi_irq_domain(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to initialize irq\n");
+		dev_err(dev, "Failed to initialize irq\n");
 		advk_pcie_remove_irq_domain(pcie);
 		return ret;
 	}
 
-	msi_node = of_parse_phandle(pdev->dev.of_node, "msi-parent", 0);
+	msi_node = of_parse_phandle(dev->of_node, "msi-parent", 0);
 	if (msi_node)
 		msi = of_pci_find_msi_chip_by_node(msi_node);
 	else
 		msi = NULL;
 
-	bus = pci_scan_root_bus_msi(&pdev->dev, 0, &advk_pcie_ops,
+	bus = pci_scan_root_bus_msi(dev, 0, &advk_pcie_ops,
 				    pcie, &pcie->resources, &pcie->msi);
 	if (!bus) {
 		advk_pcie_remove_msi_irq_domain(pcie);
@@ -980,7 +982,6 @@
 		pcie_bus_configure_settings(child);
 
 	pci_bus_add_devices(bus);
-
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-dra7xx.c b/drivers/pci/host/pci-dra7xx.c
index 19223ed..9595fad 100644
--- a/drivers/pci/host/pci-dra7xx.c
+++ b/drivers/pci/host/pci-dra7xx.c
@@ -64,11 +64,10 @@
 #define	DRA7XX_CPU_TO_BUS_ADDR				0x0FFFFFFF
 
 struct dra7xx_pcie {
-	void __iomem		*base;
-	struct phy		**phy;
-	int			phy_count;
-	struct device		*dev;
 	struct pcie_port	pp;
+	void __iomem		*base;		/* DT ti_conf */
+	int			phy_count;	/* DT phy-names count */
+	struct phy		**phy;
 };
 
 #define to_dra7xx_pcie(x)	container_of((x), struct dra7xx_pcie, pp)
@@ -84,17 +83,6 @@
 	writel(value, pcie->base + offset);
 }
 
-static inline u32 dra7xx_pcie_readl_rc(struct pcie_port *pp, u32 offset)
-{
-	return readl(pp->dbi_base + offset);
-}
-
-static inline void dra7xx_pcie_writel_rc(struct pcie_port *pp, u32 offset,
-					 u32 value)
-{
-	writel(value, pp->dbi_base + offset);
-}
-
 static int dra7xx_pcie_link_up(struct pcie_port *pp)
 {
 	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
@@ -103,13 +91,14 @@
 	return !!(reg & LINK_UP);
 }
 
-static int dra7xx_pcie_establish_link(struct pcie_port *pp)
+static int dra7xx_pcie_establish_link(struct dra7xx_pcie *dra7xx)
 {
-	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
+	struct pcie_port *pp = &dra7xx->pp;
+	struct device *dev = pp->dev;
 	u32 reg;
 
 	if (dw_pcie_link_up(pp)) {
-		dev_err(pp->dev, "link is already up\n");
+		dev_err(dev, "link is already up\n");
 		return 0;
 	}
 
@@ -120,10 +109,8 @@
 	return dw_pcie_wait_for_link(pp);
 }
 
-static void dra7xx_pcie_enable_interrupts(struct pcie_port *pp)
+static void dra7xx_pcie_enable_interrupts(struct dra7xx_pcie *dra7xx)
 {
-	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
-
 	dra7xx_pcie_writel(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MAIN,
 			   ~INTERRUPTS);
 	dra7xx_pcie_writel(dra7xx,
@@ -142,6 +129,8 @@
 
 static void dra7xx_pcie_host_init(struct pcie_port *pp)
 {
+	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
+
 	pp->io_base &= DRA7XX_CPU_TO_BUS_ADDR;
 	pp->mem_base &= DRA7XX_CPU_TO_BUS_ADDR;
 	pp->cfg0_base &= DRA7XX_CPU_TO_BUS_ADDR;
@@ -149,10 +138,10 @@
 
 	dw_pcie_setup_rc(pp);
 
-	dra7xx_pcie_establish_link(pp);
+	dra7xx_pcie_establish_link(dra7xx);
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		dw_pcie_msi_init(pp);
-	dra7xx_pcie_enable_interrupts(pp);
+	dra7xx_pcie_enable_interrupts(dra7xx);
 }
 
 static struct pcie_host_ops dra7xx_pcie_host_ops = {
@@ -196,8 +185,8 @@
 
 static irqreturn_t dra7xx_pcie_msi_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
-	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
+	struct dra7xx_pcie *dra7xx = arg;
+	struct pcie_port *pp = &dra7xx->pp;
 	u32 reg;
 
 	reg = dra7xx_pcie_readl(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MSI);
@@ -223,51 +212,51 @@
 static irqreturn_t dra7xx_pcie_irq_handler(int irq, void *arg)
 {
 	struct dra7xx_pcie *dra7xx = arg;
+	struct device *dev = dra7xx->pp.dev;
 	u32 reg;
 
 	reg = dra7xx_pcie_readl(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MAIN);
 
 	if (reg & ERR_SYS)
-		dev_dbg(dra7xx->dev, "System Error\n");
+		dev_dbg(dev, "System Error\n");
 
 	if (reg & ERR_FATAL)
-		dev_dbg(dra7xx->dev, "Fatal Error\n");
+		dev_dbg(dev, "Fatal Error\n");
 
 	if (reg & ERR_NONFATAL)
-		dev_dbg(dra7xx->dev, "Non Fatal Error\n");
+		dev_dbg(dev, "Non Fatal Error\n");
 
 	if (reg & ERR_COR)
-		dev_dbg(dra7xx->dev, "Correctable Error\n");
+		dev_dbg(dev, "Correctable Error\n");
 
 	if (reg & ERR_AXI)
-		dev_dbg(dra7xx->dev, "AXI tag lookup fatal Error\n");
+		dev_dbg(dev, "AXI tag lookup fatal Error\n");
 
 	if (reg & ERR_ECRC)
-		dev_dbg(dra7xx->dev, "ECRC Error\n");
+		dev_dbg(dev, "ECRC Error\n");
 
 	if (reg & PME_TURN_OFF)
-		dev_dbg(dra7xx->dev,
+		dev_dbg(dev,
 			"Power Management Event Turn-Off message received\n");
 
 	if (reg & PME_TO_ACK)
-		dev_dbg(dra7xx->dev,
+		dev_dbg(dev,
 			"Power Management Turn-Off Ack message received\n");
 
 	if (reg & PM_PME)
-		dev_dbg(dra7xx->dev,
-			"PM Power Management Event message received\n");
+		dev_dbg(dev, "PM Power Management Event message received\n");
 
 	if (reg & LINK_REQ_RST)
-		dev_dbg(dra7xx->dev, "Link Request Reset\n");
+		dev_dbg(dev, "Link Request Reset\n");
 
 	if (reg & LINK_UP_EVT)
-		dev_dbg(dra7xx->dev, "Link-up state change\n");
+		dev_dbg(dev, "Link-up state change\n");
 
 	if (reg & CFG_BME_EVT)
-		dev_dbg(dra7xx->dev, "CFG 'Bus Master Enable' change\n");
+		dev_dbg(dev, "CFG 'Bus Master Enable' change\n");
 
 	if (reg & CFG_MSE_EVT)
-		dev_dbg(dra7xx->dev, "CFG 'Memory Space Enable' change\n");
+		dev_dbg(dev, "CFG 'Memory Space Enable' change\n");
 
 	dra7xx_pcie_writel(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MAIN, reg);
 
@@ -278,13 +267,9 @@
 				       struct platform_device *pdev)
 {
 	int ret;
-	struct pcie_port *pp;
+	struct pcie_port *pp = &dra7xx->pp;
+	struct device *dev = pp->dev;
 	struct resource *res;
-	struct device *dev = &pdev->dev;
-
-	pp = &dra7xx->pp;
-	pp->dev = dev;
-	pp->ops = &dra7xx_pcie_host_ops;
 
 	pp->irq = platform_get_irq(pdev, 1);
 	if (pp->irq < 0) {
@@ -292,12 +277,11 @@
 		return -EINVAL;
 	}
 
-	ret = devm_request_irq(&pdev->dev, pp->irq,
-			       dra7xx_pcie_msi_irq_handler,
+	ret = devm_request_irq(dev, pp->irq, dra7xx_pcie_msi_irq_handler,
 			       IRQF_SHARED | IRQF_NO_THREAD,
-			       "dra7-pcie-msi",	pp);
+			       "dra7-pcie-msi",	dra7xx);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to request irq\n");
+		dev_err(dev, "failed to request irq\n");
 		return ret;
 	}
 
@@ -314,7 +298,7 @@
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(dra7xx->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -332,6 +316,7 @@
 	void __iomem *base;
 	struct resource *res;
 	struct dra7xx_pcie *dra7xx;
+	struct pcie_port *pp;
 	struct device *dev = &pdev->dev;
 	struct device_node *np = dev->of_node;
 	char name[10];
@@ -343,6 +328,10 @@
 	if (!dra7xx)
 		return -ENOMEM;
 
+	pp = &dra7xx->pp;
+	pp->dev = dev;
+	pp->ops = &dra7xx_pcie_host_ops;
+
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
 		dev_err(dev, "missing IRQ resource\n");
@@ -390,7 +379,6 @@
 
 	dra7xx->base = base;
 	dra7xx->phy = phy;
-	dra7xx->dev = dev;
 	dra7xx->phy_count = phy_count;
 
 	pm_runtime_enable(dev);
@@ -407,7 +395,7 @@
 		ret = devm_gpio_request_one(dev, gpio_sel, gpio_flags,
 					    "pcie_reset");
 		if (ret) {
-			dev_err(&pdev->dev, "gpio%d request failed, ret %d\n",
+			dev_err(dev, "gpio%d request failed, ret %d\n",
 				gpio_sel, ret);
 			goto err_gpio;
 		}
@@ -420,12 +408,11 @@
 	reg &= ~LTSSM_EN;
 	dra7xx_pcie_writel(dra7xx, PCIECTRL_DRA7XX_CONF_DEVICE_CMD, reg);
 
-	platform_set_drvdata(pdev, dra7xx);
-
 	ret = dra7xx_add_pcie_port(dra7xx, pdev);
 	if (ret < 0)
 		goto err_gpio;
 
+	platform_set_drvdata(pdev, dra7xx);
 	return 0;
 
 err_gpio:
@@ -451,9 +438,9 @@
 	u32 val;
 
 	/* clear MSE */
-	val = dra7xx_pcie_readl_rc(pp, PCI_COMMAND);
+	val = dw_pcie_readl_rc(pp, PCI_COMMAND);
 	val &= ~PCI_COMMAND_MEMORY;
-	dra7xx_pcie_writel_rc(pp, PCI_COMMAND, val);
+	dw_pcie_writel_rc(pp, PCI_COMMAND, val);
 
 	return 0;
 }
@@ -465,9 +452,9 @@
 	u32 val;
 
 	/* set MSE */
-	val = dra7xx_pcie_readl_rc(pp, PCI_COMMAND);
+	val = dw_pcie_readl_rc(pp, PCI_COMMAND);
 	val |= PCI_COMMAND_MEMORY;
-	dra7xx_pcie_writel_rc(pp, PCI_COMMAND, val);
+	dw_pcie_writel_rc(pp, PCI_COMMAND, val);
 
 	return 0;
 }
diff --git a/drivers/pci/host/pci-exynos.c b/drivers/pci/host/pci-exynos.c
index 2e2d7f0..f1c544b 100644
--- a/drivers/pci/host/pci-exynos.c
+++ b/drivers/pci/host/pci-exynos.c
@@ -29,13 +29,13 @@
 #define to_exynos_pcie(x)	container_of(x, struct exynos_pcie, pp)
 
 struct exynos_pcie {
-	void __iomem		*elbi_base;
-	void __iomem		*phy_base;
-	void __iomem		*block_base;
+	struct pcie_port	pp;
+	void __iomem		*elbi_base;	/* DT 0th resource */
+	void __iomem		*phy_base;	/* DT 1st resource */
+	void __iomem		*block_base;	/* DT 2nd resource */
 	int			reset_gpio;
 	struct clk		*clk;
 	struct clk		*bus_clk;
-	struct pcie_port	pp;
 };
 
 /* PCIe ELBI registers */
@@ -102,40 +102,40 @@
 #define PCIE_PHY_TRSV3_PD_TSV		(0x1 << 7)
 #define PCIE_PHY_TRSV3_LVCC		0x31c
 
-static inline void exynos_elb_writel(struct exynos_pcie *pcie, u32 val, u32 reg)
+static void exynos_elb_writel(struct exynos_pcie *exynos_pcie, u32 val, u32 reg)
 {
-	writel(val, pcie->elbi_base + reg);
+	writel(val, exynos_pcie->elbi_base + reg);
 }
 
-static inline u32 exynos_elb_readl(struct exynos_pcie *pcie, u32 reg)
+static u32 exynos_elb_readl(struct exynos_pcie *exynos_pcie, u32 reg)
 {
-	return readl(pcie->elbi_base + reg);
+	return readl(exynos_pcie->elbi_base + reg);
 }
 
-static inline void exynos_phy_writel(struct exynos_pcie *pcie, u32 val, u32 reg)
+static void exynos_phy_writel(struct exynos_pcie *exynos_pcie, u32 val, u32 reg)
 {
-	writel(val, pcie->phy_base + reg);
+	writel(val, exynos_pcie->phy_base + reg);
 }
 
-static inline u32 exynos_phy_readl(struct exynos_pcie *pcie, u32 reg)
+static u32 exynos_phy_readl(struct exynos_pcie *exynos_pcie, u32 reg)
 {
-	return readl(pcie->phy_base + reg);
+	return readl(exynos_pcie->phy_base + reg);
 }
 
-static inline void exynos_blk_writel(struct exynos_pcie *pcie, u32 val, u32 reg)
+static void exynos_blk_writel(struct exynos_pcie *exynos_pcie, u32 val, u32 reg)
 {
-	writel(val, pcie->block_base + reg);
+	writel(val, exynos_pcie->block_base + reg);
 }
 
-static inline u32 exynos_blk_readl(struct exynos_pcie *pcie, u32 reg)
+static u32 exynos_blk_readl(struct exynos_pcie *exynos_pcie, u32 reg)
 {
-	return readl(pcie->block_base + reg);
+	return readl(exynos_pcie->block_base + reg);
 }
 
-static void exynos_pcie_sideband_dbi_w_mode(struct pcie_port *pp, bool on)
+static void exynos_pcie_sideband_dbi_w_mode(struct exynos_pcie *exynos_pcie,
+					    bool on)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	if (on) {
 		val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_SLV_AWMISC);
@@ -148,10 +148,10 @@
 	}
 }
 
-static void exynos_pcie_sideband_dbi_r_mode(struct pcie_port *pp, bool on)
+static void exynos_pcie_sideband_dbi_r_mode(struct exynos_pcie *exynos_pcie,
+					    bool on)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	if (on) {
 		val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_SLV_ARMISC);
@@ -164,10 +164,9 @@
 	}
 }
 
-static void exynos_pcie_assert_core_reset(struct pcie_port *pp)
+static void exynos_pcie_assert_core_reset(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_elb_readl(exynos_pcie, PCIE_CORE_RESET);
 	val &= ~PCIE_CORE_RESET_ENABLE;
@@ -177,10 +176,9 @@
 	exynos_elb_writel(exynos_pcie, 0, PCIE_NONSTICKY_RESET);
 }
 
-static void exynos_pcie_deassert_core_reset(struct pcie_port *pp)
+static void exynos_pcie_deassert_core_reset(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_elb_readl(exynos_pcie, PCIE_CORE_RESET);
 	val |= PCIE_CORE_RESET_ENABLE;
@@ -193,18 +191,14 @@
 	exynos_blk_writel(exynos_pcie, 1, PCIE_PHY_MAC_RESET);
 }
 
-static void exynos_pcie_assert_phy_reset(struct pcie_port *pp)
+static void exynos_pcie_assert_phy_reset(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_MAC_RESET);
 	exynos_blk_writel(exynos_pcie, 1, PCIE_PHY_GLOBAL_RESET);
 }
 
-static void exynos_pcie_deassert_phy_reset(struct pcie_port *pp)
+static void exynos_pcie_deassert_phy_reset(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_GLOBAL_RESET);
 	exynos_elb_writel(exynos_pcie, 1, PCIE_PWR_RESET);
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_COMMON_RESET);
@@ -213,10 +207,9 @@
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_TRSV_RESET);
 }
 
-static void exynos_pcie_power_on_phy(struct pcie_port *pp)
+static void exynos_pcie_power_on_phy(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_phy_readl(exynos_pcie, PCIE_PHY_COMMON_POWER);
 	val &= ~PCIE_PHY_COMMON_PD_CMN;
@@ -239,10 +232,9 @@
 	exynos_phy_writel(exynos_pcie, val, PCIE_PHY_TRSV3_POWER);
 }
 
-static void exynos_pcie_power_off_phy(struct pcie_port *pp)
+static void exynos_pcie_power_off_phy(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_phy_readl(exynos_pcie, PCIE_PHY_COMMON_POWER);
 	val |= PCIE_PHY_COMMON_PD_CMN;
@@ -265,10 +257,8 @@
 	exynos_phy_writel(exynos_pcie, val, PCIE_PHY_TRSV3_POWER);
 }
 
-static void exynos_pcie_init_phy(struct pcie_port *pp)
+static void exynos_pcie_init_phy(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-
 	/* DCC feedback control off */
 	exynos_phy_writel(exynos_pcie, 0x29, PCIE_PHY_DCC_FEEDBACK);
 
@@ -305,51 +295,41 @@
 	exynos_phy_writel(exynos_pcie, 0xa0, PCIE_PHY_TRSV3_LVCC);
 }
 
-static void exynos_pcie_assert_reset(struct pcie_port *pp)
+static void exynos_pcie_assert_reset(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+	struct pcie_port *pp = &exynos_pcie->pp;
+	struct device *dev = pp->dev;
 
 	if (exynos_pcie->reset_gpio >= 0)
-		devm_gpio_request_one(pp->dev, exynos_pcie->reset_gpio,
+		devm_gpio_request_one(dev, exynos_pcie->reset_gpio,
 				GPIOF_OUT_INIT_HIGH, "RESET");
 }
 
-static int exynos_pcie_establish_link(struct pcie_port *pp)
+static int exynos_pcie_establish_link(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+	struct pcie_port *pp = &exynos_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 val;
 
 	if (dw_pcie_link_up(pp)) {
-		dev_err(pp->dev, "Link already up\n");
+		dev_err(dev, "Link already up\n");
 		return 0;
 	}
 
-	/* assert reset signals */
-	exynos_pcie_assert_core_reset(pp);
-	exynos_pcie_assert_phy_reset(pp);
-
-	/* de-assert phy reset */
-	exynos_pcie_deassert_phy_reset(pp);
-
-	/* power on phy */
-	exynos_pcie_power_on_phy(pp);
-
-	/* initialize phy */
-	exynos_pcie_init_phy(pp);
+	exynos_pcie_assert_core_reset(exynos_pcie);
+	exynos_pcie_assert_phy_reset(exynos_pcie);
+	exynos_pcie_deassert_phy_reset(exynos_pcie);
+	exynos_pcie_power_on_phy(exynos_pcie);
+	exynos_pcie_init_phy(exynos_pcie);
 
 	/* pulse for common reset */
 	exynos_blk_writel(exynos_pcie, 1, PCIE_PHY_COMMON_RESET);
 	udelay(500);
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_COMMON_RESET);
 
-	/* de-assert core reset */
-	exynos_pcie_deassert_core_reset(pp);
-
-	/* setup root complex */
+	exynos_pcie_deassert_core_reset(exynos_pcie);
 	dw_pcie_setup_rc(pp);
-
-	/* assert reset signal */
-	exynos_pcie_assert_reset(pp);
+	exynos_pcie_assert_reset(exynos_pcie);
 
 	/* assert LTSSM enable */
 	exynos_elb_writel(exynos_pcie, PCIE_ELBI_LTSSM_ENABLE,
@@ -361,27 +341,23 @@
 
 	while (exynos_phy_readl(exynos_pcie, PCIE_PHY_PLL_LOCKED) == 0) {
 		val = exynos_blk_readl(exynos_pcie, PCIE_PHY_PLL_LOCKED);
-		dev_info(pp->dev, "PLL Locked: 0x%x\n", val);
+		dev_info(dev, "PLL Locked: 0x%x\n", val);
 	}
-	/* power off phy */
-	exynos_pcie_power_off_phy(pp);
-
+	exynos_pcie_power_off_phy(exynos_pcie);
 	return -ETIMEDOUT;
 }
 
-static void exynos_pcie_clear_irq_pulse(struct pcie_port *pp)
+static void exynos_pcie_clear_irq_pulse(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_elb_readl(exynos_pcie, PCIE_IRQ_PULSE);
 	exynos_elb_writel(exynos_pcie, val, PCIE_IRQ_PULSE);
 }
 
-static void exynos_pcie_enable_irq_pulse(struct pcie_port *pp)
+static void exynos_pcie_enable_irq_pulse(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	/* enable INTX interrupt */
 	val = IRQ_INTA_ASSERT | IRQ_INTB_ASSERT |
@@ -391,23 +367,24 @@
 
 static irqreturn_t exynos_pcie_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct exynos_pcie *exynos_pcie = arg;
 
-	exynos_pcie_clear_irq_pulse(pp);
+	exynos_pcie_clear_irq_pulse(exynos_pcie);
 	return IRQ_HANDLED;
 }
 
 static irqreturn_t exynos_pcie_msi_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct exynos_pcie *exynos_pcie = arg;
+	struct pcie_port *pp = &exynos_pcie->pp;
 
 	return dw_handle_msi_irq(pp);
 }
 
-static void exynos_pcie_msi_init(struct pcie_port *pp)
+static void exynos_pcie_msi_init(struct exynos_pcie *exynos_pcie)
 {
+	struct pcie_port *pp = &exynos_pcie->pp;
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	dw_pcie_msi_init(pp);
 
@@ -417,60 +394,64 @@
 	exynos_elb_writel(exynos_pcie, val, PCIE_IRQ_EN_LEVEL);
 }
 
-static void exynos_pcie_enable_interrupts(struct pcie_port *pp)
+static void exynos_pcie_enable_interrupts(struct exynos_pcie *exynos_pcie)
 {
-	exynos_pcie_enable_irq_pulse(pp);
+	exynos_pcie_enable_irq_pulse(exynos_pcie);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
-		exynos_pcie_msi_init(pp);
+		exynos_pcie_msi_init(exynos_pcie);
 }
 
-static inline u32 exynos_pcie_readl_rc(struct pcie_port *pp,
-				       void __iomem *dbi_base)
+static u32 exynos_pcie_readl_rc(struct pcie_port *pp, u32 reg)
 {
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 	u32 val;
 
-	exynos_pcie_sideband_dbi_r_mode(pp, true);
-	val = readl(dbi_base);
-	exynos_pcie_sideband_dbi_r_mode(pp, false);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, true);
+	val = readl(pp->dbi_base + reg);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, false);
 	return val;
 }
 
-static inline void exynos_pcie_writel_rc(struct pcie_port *pp,
-					u32 val, void __iomem *dbi_base)
+static void exynos_pcie_writel_rc(struct pcie_port *pp, u32 reg, u32 val)
 {
-	exynos_pcie_sideband_dbi_w_mode(pp, true);
-	writel(val, dbi_base);
-	exynos_pcie_sideband_dbi_w_mode(pp, false);
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, true);
+	writel(val, pp->dbi_base + reg);
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, false);
 }
 
 static int exynos_pcie_rd_own_conf(struct pcie_port *pp, int where, int size,
 				u32 *val)
 {
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 	int ret;
 
-	exynos_pcie_sideband_dbi_r_mode(pp, true);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, true);
 	ret = dw_pcie_cfg_read(pp->dbi_base + where, size, val);
-	exynos_pcie_sideband_dbi_r_mode(pp, false);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, false);
 	return ret;
 }
 
 static int exynos_pcie_wr_own_conf(struct pcie_port *pp, int where, int size,
 				u32 val)
 {
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 	int ret;
 
-	exynos_pcie_sideband_dbi_w_mode(pp, true);
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, true);
 	ret = dw_pcie_cfg_write(pp->dbi_base + where, size, val);
-	exynos_pcie_sideband_dbi_w_mode(pp, false);
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, false);
 	return ret;
 }
 
 static int exynos_pcie_link_up(struct pcie_port *pp)
 {
 	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-	u32 val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_RDLH_LINKUP);
+	u32 val;
 
+	val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_RDLH_LINKUP);
 	if (val == PCIE_ELBI_LTSSM_ENABLE)
 		return 1;
 
@@ -479,8 +460,10 @@
 
 static void exynos_pcie_host_init(struct pcie_port *pp)
 {
-	exynos_pcie_establish_link(pp);
-	exynos_pcie_enable_interrupts(pp);
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+
+	exynos_pcie_establish_link(exynos_pcie);
+	exynos_pcie_enable_interrupts(exynos_pcie);
 }
 
 static struct pcie_host_ops exynos_pcie_host_ops = {
@@ -492,36 +475,38 @@
 	.host_init = exynos_pcie_host_init,
 };
 
-static int __init exynos_add_pcie_port(struct pcie_port *pp,
+static int __init exynos_add_pcie_port(struct exynos_pcie *exynos_pcie,
 				       struct platform_device *pdev)
 {
+	struct pcie_port *pp = &exynos_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	pp->irq = platform_get_irq(pdev, 1);
 	if (!pp->irq) {
-		dev_err(&pdev->dev, "failed to get irq\n");
+		dev_err(dev, "failed to get irq\n");
 		return -ENODEV;
 	}
-	ret = devm_request_irq(&pdev->dev, pp->irq, exynos_pcie_irq_handler,
-				IRQF_SHARED, "exynos-pcie", pp);
+	ret = devm_request_irq(dev, pp->irq, exynos_pcie_irq_handler,
+				IRQF_SHARED, "exynos-pcie", exynos_pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to request irq\n");
+		dev_err(dev, "failed to request irq\n");
 		return ret;
 	}
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		pp->msi_irq = platform_get_irq(pdev, 0);
 		if (!pp->msi_irq) {
-			dev_err(&pdev->dev, "failed to get msi irq\n");
+			dev_err(dev, "failed to get msi irq\n");
 			return -ENODEV;
 		}
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 					exynos_pcie_msi_irq_handler,
 					IRQF_SHARED | IRQF_NO_THREAD,
-					"exynos-pcie", pp);
+					"exynos-pcie", exynos_pcie);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request msi irq\n");
+			dev_err(dev, "failed to request msi irq\n");
 			return ret;
 		}
 	}
@@ -531,7 +516,7 @@
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -540,37 +525,36 @@
 
 static int __init exynos_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct exynos_pcie *exynos_pcie;
 	struct pcie_port *pp;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct resource *elbi_base;
 	struct resource *phy_base;
 	struct resource *block_base;
 	int ret;
 
-	exynos_pcie = devm_kzalloc(&pdev->dev, sizeof(*exynos_pcie),
-				GFP_KERNEL);
+	exynos_pcie = devm_kzalloc(dev, sizeof(*exynos_pcie), GFP_KERNEL);
 	if (!exynos_pcie)
 		return -ENOMEM;
 
 	pp = &exynos_pcie->pp;
-
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	exynos_pcie->reset_gpio = of_get_named_gpio(np, "reset-gpio", 0);
 
-	exynos_pcie->clk = devm_clk_get(&pdev->dev, "pcie");
+	exynos_pcie->clk = devm_clk_get(dev, "pcie");
 	if (IS_ERR(exynos_pcie->clk)) {
-		dev_err(&pdev->dev, "Failed to get pcie rc clock\n");
+		dev_err(dev, "Failed to get pcie rc clock\n");
 		return PTR_ERR(exynos_pcie->clk);
 	}
 	ret = clk_prepare_enable(exynos_pcie->clk);
 	if (ret)
 		return ret;
 
-	exynos_pcie->bus_clk = devm_clk_get(&pdev->dev, "pcie_bus");
+	exynos_pcie->bus_clk = devm_clk_get(dev, "pcie_bus");
 	if (IS_ERR(exynos_pcie->bus_clk)) {
-		dev_err(&pdev->dev, "Failed to get pcie bus clock\n");
+		dev_err(dev, "Failed to get pcie bus clock\n");
 		ret = PTR_ERR(exynos_pcie->bus_clk);
 		goto fail_clk;
 	}
@@ -579,27 +563,27 @@
 		goto fail_clk;
 
 	elbi_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	exynos_pcie->elbi_base = devm_ioremap_resource(&pdev->dev, elbi_base);
+	exynos_pcie->elbi_base = devm_ioremap_resource(dev, elbi_base);
 	if (IS_ERR(exynos_pcie->elbi_base)) {
 		ret = PTR_ERR(exynos_pcie->elbi_base);
 		goto fail_bus_clk;
 	}
 
 	phy_base = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	exynos_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base);
+	exynos_pcie->phy_base = devm_ioremap_resource(dev, phy_base);
 	if (IS_ERR(exynos_pcie->phy_base)) {
 		ret = PTR_ERR(exynos_pcie->phy_base);
 		goto fail_bus_clk;
 	}
 
 	block_base = platform_get_resource(pdev, IORESOURCE_MEM, 2);
-	exynos_pcie->block_base = devm_ioremap_resource(&pdev->dev, block_base);
+	exynos_pcie->block_base = devm_ioremap_resource(dev, block_base);
 	if (IS_ERR(exynos_pcie->block_base)) {
 		ret = PTR_ERR(exynos_pcie->block_base);
 		goto fail_bus_clk;
 	}
 
-	ret = exynos_add_pcie_port(pp, pdev);
+	ret = exynos_add_pcie_port(exynos_pcie, pdev);
 	if (ret < 0)
 		goto fail_bus_clk;
 
diff --git a/drivers/pci/host/pci-imx6.c b/drivers/pci/host/pci-imx6.c
index ead4a5c..c8cefb0 100644
--- a/drivers/pci/host/pci-imx6.c
+++ b/drivers/pci/host/pci-imx6.c
@@ -39,16 +39,15 @@
 };
 
 struct imx6_pcie {
+	struct pcie_port	pp;	/* pp.dbi_base is DT 0th resource */
 	int			reset_gpio;
 	bool			gpio_active_high;
 	struct clk		*pcie_bus;
 	struct clk		*pcie_phy;
 	struct clk		*pcie_inbound_axi;
 	struct clk		*pcie;
-	struct pcie_port	pp;
 	struct regmap		*iomuxc_gpr;
 	enum imx6_pcie_variants variant;
-	void __iomem		*mem_base;
 	u32			tx_deemph_gen1;
 	u32			tx_deemph_gen2_3p5db;
 	u32			tx_deemph_gen2_6db;
@@ -96,14 +95,15 @@
 #define PHY_RX_OVRD_IN_LO_RX_DATA_EN (1 << 5)
 #define PHY_RX_OVRD_IN_LO_RX_PLL_EN (1 << 3)
 
-static int pcie_phy_poll_ack(void __iomem *dbi_base, int exp_val)
+static int pcie_phy_poll_ack(struct imx6_pcie *imx6_pcie, int exp_val)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val;
 	u32 max_iterations = 10;
 	u32 wait_counter = 0;
 
 	do {
-		val = readl(dbi_base + PCIE_PHY_STAT);
+		val = dw_pcie_readl_rc(pp, PCIE_PHY_STAT);
 		val = (val >> PCIE_PHY_STAT_ACK_LOC) & 0x1;
 		wait_counter++;
 
@@ -116,123 +116,126 @@
 	return -ETIMEDOUT;
 }
 
-static int pcie_phy_wait_ack(void __iomem *dbi_base, int addr)
+static int pcie_phy_wait_ack(struct imx6_pcie *imx6_pcie, int addr)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val;
 	int ret;
 
 	val = addr << PCIE_PHY_CTRL_DATA_LOC;
-	writel(val, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, val);
 
 	val |= (0x1 << PCIE_PHY_CTRL_CAP_ADR_LOC);
-	writel(val, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, val);
 
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
 	val = addr << PCIE_PHY_CTRL_DATA_LOC;
-	writel(val, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, val);
 
-	return pcie_phy_poll_ack(dbi_base, 0);
+	return pcie_phy_poll_ack(imx6_pcie, 0);
 }
 
 /* Read from the 16-bit PCIe PHY control registers (not memory-mapped) */
-static int pcie_phy_read(void __iomem *dbi_base, int addr, int *data)
+static int pcie_phy_read(struct imx6_pcie *imx6_pcie, int addr, int *data)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val, phy_ctl;
 	int ret;
 
-	ret = pcie_phy_wait_ack(dbi_base, addr);
+	ret = pcie_phy_wait_ack(imx6_pcie, addr);
 	if (ret)
 		return ret;
 
 	/* assert Read signal */
 	phy_ctl = 0x1 << PCIE_PHY_CTRL_RD_LOC;
-	writel(phy_ctl, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, phy_ctl);
 
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
-	val = readl(dbi_base + PCIE_PHY_STAT);
+	val = dw_pcie_readl_rc(pp, PCIE_PHY_STAT);
 	*data = val & 0xffff;
 
 	/* deassert Read signal */
-	writel(0x00, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, 0x00);
 
-	return pcie_phy_poll_ack(dbi_base, 0);
+	return pcie_phy_poll_ack(imx6_pcie, 0);
 }
 
-static int pcie_phy_write(void __iomem *dbi_base, int addr, int data)
+static int pcie_phy_write(struct imx6_pcie *imx6_pcie, int addr, int data)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 var;
 	int ret;
 
 	/* write addr */
 	/* cap addr */
-	ret = pcie_phy_wait_ack(dbi_base, addr);
+	ret = pcie_phy_wait_ack(imx6_pcie, addr);
 	if (ret)
 		return ret;
 
 	var = data << PCIE_PHY_CTRL_DATA_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* capture data */
 	var |= (0x1 << PCIE_PHY_CTRL_CAP_DAT_LOC);
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
 	/* deassert cap data */
 	var = data << PCIE_PHY_CTRL_DATA_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* wait for ack de-assertion */
-	ret = pcie_phy_poll_ack(dbi_base, 0);
+	ret = pcie_phy_poll_ack(imx6_pcie, 0);
 	if (ret)
 		return ret;
 
 	/* assert wr signal */
 	var = 0x1 << PCIE_PHY_CTRL_WR_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* wait for ack */
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
 	/* deassert wr signal */
 	var = data << PCIE_PHY_CTRL_DATA_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* wait for ack de-assertion */
-	ret = pcie_phy_poll_ack(dbi_base, 0);
+	ret = pcie_phy_poll_ack(imx6_pcie, 0);
 	if (ret)
 		return ret;
 
-	writel(0x0, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, 0x0);
 
 	return 0;
 }
 
-static void imx6_pcie_reset_phy(struct pcie_port *pp)
+static void imx6_pcie_reset_phy(struct imx6_pcie *imx6_pcie)
 {
 	u32 tmp;
 
-	pcie_phy_read(pp->dbi_base, PHY_RX_OVRD_IN_LO, &tmp);
+	pcie_phy_read(imx6_pcie, PHY_RX_OVRD_IN_LO, &tmp);
 	tmp |= (PHY_RX_OVRD_IN_LO_RX_DATA_EN |
 		PHY_RX_OVRD_IN_LO_RX_PLL_EN);
-	pcie_phy_write(pp->dbi_base, PHY_RX_OVRD_IN_LO, tmp);
+	pcie_phy_write(imx6_pcie, PHY_RX_OVRD_IN_LO, tmp);
 
 	usleep_range(2000, 3000);
 
-	pcie_phy_read(pp->dbi_base, PHY_RX_OVRD_IN_LO, &tmp);
+	pcie_phy_read(imx6_pcie, PHY_RX_OVRD_IN_LO, &tmp);
 	tmp &= ~(PHY_RX_OVRD_IN_LO_RX_DATA_EN |
 		  PHY_RX_OVRD_IN_LO_RX_PLL_EN);
-	pcie_phy_write(pp->dbi_base, PHY_RX_OVRD_IN_LO, tmp);
+	pcie_phy_write(imx6_pcie, PHY_RX_OVRD_IN_LO, tmp);
 }
 
 /*  Added for PCI abort handling */
@@ -242,9 +245,9 @@
 	return 0;
 }
 
-static int imx6_pcie_assert_core_reset(struct pcie_port *pp)
+static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val, gpr1, gpr12;
 
 	switch (imx6_pcie->variant) {
@@ -281,10 +284,10 @@
 
 		if ((gpr1 & IMX6Q_GPR1_PCIE_REF_CLK_EN) &&
 		    (gpr12 & IMX6Q_GPR12_PCIE_CTL_2)) {
-			val = readl(pp->dbi_base + PCIE_PL_PFLR);
+			val = dw_pcie_readl_rc(pp, PCIE_PL_PFLR);
 			val &= ~PCIE_PL_PFLR_LINK_STATE_MASK;
 			val |= PCIE_PL_PFLR_FORCE_LINK;
-			writel(val, pp->dbi_base + PCIE_PL_PFLR);
+			dw_pcie_writel_rc(pp, PCIE_PL_PFLR, val);
 
 			regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 					   IMX6Q_GPR12_PCIE_CTL_2, 0 << 10);
@@ -296,20 +299,19 @@
 				   IMX6Q_GPR1_PCIE_REF_CLK_EN, 0 << 16);
 		break;
 	}
-
-	return 0;
 }
 
 static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie)
 {
 	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret = 0;
 
 	switch (imx6_pcie->variant) {
 	case IMX6SX:
 		ret = clk_prepare_enable(imx6_pcie->pcie_inbound_axi);
 		if (ret) {
-			dev_err(pp->dev, "unable to enable pcie_axi clock\n");
+			dev_err(dev, "unable to enable pcie_axi clock\n");
 			break;
 		}
 
@@ -336,32 +338,33 @@
 	return ret;
 }
 
-static int imx6_pcie_deassert_core_reset(struct pcie_port *pp)
+static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	ret = clk_prepare_enable(imx6_pcie->pcie_phy);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie_phy clock\n");
-		goto err_pcie_phy;
+		dev_err(dev, "unable to enable pcie_phy clock\n");
+		return;
 	}
 
 	ret = clk_prepare_enable(imx6_pcie->pcie_bus);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie_bus clock\n");
+		dev_err(dev, "unable to enable pcie_bus clock\n");
 		goto err_pcie_bus;
 	}
 
 	ret = clk_prepare_enable(imx6_pcie->pcie);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie clock\n");
+		dev_err(dev, "unable to enable pcie clock\n");
 		goto err_pcie;
 	}
 
 	ret = imx6_pcie_enable_ref_clk(imx6_pcie);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie ref clock\n");
+		dev_err(dev, "unable to enable pcie ref clock\n");
 		goto err_ref_clk;
 	}
 
@@ -392,7 +395,7 @@
 		break;
 	}
 
-	return 0;
+	return;
 
 err_ref_clk:
 	clk_disable_unprepare(imx6_pcie->pcie);
@@ -400,14 +403,10 @@
 	clk_disable_unprepare(imx6_pcie->pcie_bus);
 err_pcie_bus:
 	clk_disable_unprepare(imx6_pcie->pcie_phy);
-err_pcie_phy:
-	return ret;
 }
 
-static void imx6_pcie_init_phy(struct pcie_port *pp)
+static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
-
 	if (imx6_pcie->variant == IMX6SX)
 		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 				   IMX6SX_GPR12_PCIE_RX_EQ_MASK,
@@ -439,45 +438,52 @@
 			   imx6_pcie->tx_swing_low << 25);
 }
 
-static int imx6_pcie_wait_for_link(struct pcie_port *pp)
+static int imx6_pcie_wait_for_link(struct imx6_pcie *imx6_pcie)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
+
 	/* check if the link is up or not */
 	if (!dw_pcie_wait_for_link(pp))
 		return 0;
 
-	dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
+	dev_dbg(dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R0),
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1));
 	return -ETIMEDOUT;
 }
 
-static int imx6_pcie_wait_for_speed_change(struct pcie_port *pp)
+static int imx6_pcie_wait_for_speed_change(struct imx6_pcie *imx6_pcie)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 tmp;
 	unsigned int retries;
 
 	for (retries = 0; retries < 200; retries++) {
-		tmp = readl(pp->dbi_base + PCIE_LINK_WIDTH_SPEED_CONTROL);
+		tmp = dw_pcie_readl_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL);
 		/* Test if the speed change finished. */
 		if (!(tmp & PORT_LOGIC_SPEED_CHANGE))
 			return 0;
 		usleep_range(100, 1000);
 	}
 
-	dev_err(pp->dev, "Speed change timeout\n");
+	dev_err(dev, "Speed change timeout\n");
 	return -EINVAL;
 }
 
 static irqreturn_t imx6_pcie_msi_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct imx6_pcie *imx6_pcie = arg;
+	struct pcie_port *pp = &imx6_pcie->pp;
 
 	return dw_handle_msi_irq(pp);
 }
 
-static int imx6_pcie_establish_link(struct pcie_port *pp)
+static int imx6_pcie_establish_link(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 tmp;
 	int ret;
 
@@ -486,76 +492,73 @@
 	 * started in Gen2 mode, there is a possibility the devices on the
 	 * bus will not be detected at all.  This happens with PCIe switches.
 	 */
-	tmp = readl(pp->dbi_base + PCIE_RC_LCR);
+	tmp = dw_pcie_readl_rc(pp, PCIE_RC_LCR);
 	tmp &= ~PCIE_RC_LCR_MAX_LINK_SPEEDS_MASK;
 	tmp |= PCIE_RC_LCR_MAX_LINK_SPEEDS_GEN1;
-	writel(tmp, pp->dbi_base + PCIE_RC_LCR);
+	dw_pcie_writel_rc(pp, PCIE_RC_LCR, tmp);
 
 	/* Start LTSSM. */
 	regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 			IMX6Q_GPR12_PCIE_CTL_2, 1 << 10);
 
-	ret = imx6_pcie_wait_for_link(pp);
+	ret = imx6_pcie_wait_for_link(imx6_pcie);
 	if (ret) {
-		dev_info(pp->dev, "Link never came up\n");
+		dev_info(dev, "Link never came up\n");
 		goto err_reset_phy;
 	}
 
 	if (imx6_pcie->link_gen == 2) {
 		/* Allow Gen2 mode after the link is up. */
-		tmp = readl(pp->dbi_base + PCIE_RC_LCR);
+		tmp = dw_pcie_readl_rc(pp, PCIE_RC_LCR);
 		tmp &= ~PCIE_RC_LCR_MAX_LINK_SPEEDS_MASK;
 		tmp |= PCIE_RC_LCR_MAX_LINK_SPEEDS_GEN2;
-		writel(tmp, pp->dbi_base + PCIE_RC_LCR);
+		dw_pcie_writel_rc(pp, PCIE_RC_LCR, tmp);
 	} else {
-		dev_info(pp->dev, "Link: Gen2 disabled\n");
+		dev_info(dev, "Link: Gen2 disabled\n");
 	}
 
 	/*
 	 * Start Directed Speed Change so the best possible speed both link
 	 * partners support can be negotiated.
 	 */
-	tmp = readl(pp->dbi_base + PCIE_LINK_WIDTH_SPEED_CONTROL);
+	tmp = dw_pcie_readl_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL);
 	tmp |= PORT_LOGIC_SPEED_CHANGE;
-	writel(tmp, pp->dbi_base + PCIE_LINK_WIDTH_SPEED_CONTROL);
+	dw_pcie_writel_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, tmp);
 
-	ret = imx6_pcie_wait_for_speed_change(pp);
+	ret = imx6_pcie_wait_for_speed_change(imx6_pcie);
 	if (ret) {
-		dev_err(pp->dev, "Failed to bring link up!\n");
+		dev_err(dev, "Failed to bring link up!\n");
 		goto err_reset_phy;
 	}
 
 	/* Make sure link training is finished as well! */
-	ret = imx6_pcie_wait_for_link(pp);
+	ret = imx6_pcie_wait_for_link(imx6_pcie);
 	if (ret) {
-		dev_err(pp->dev, "Failed to bring link up!\n");
+		dev_err(dev, "Failed to bring link up!\n");
 		goto err_reset_phy;
 	}
 
-	tmp = readl(pp->dbi_base + PCIE_RC_LCSR);
-	dev_info(pp->dev, "Link up, Gen%i\n", (tmp >> 16) & 0xf);
+	tmp = dw_pcie_readl_rc(pp, PCIE_RC_LCSR);
+	dev_info(dev, "Link up, Gen%i\n", (tmp >> 16) & 0xf);
 	return 0;
 
 err_reset_phy:
-	dev_dbg(pp->dev, "PHY DEBUG_R0=0x%08x DEBUG_R1=0x%08x\n",
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
-	imx6_pcie_reset_phy(pp);
-
+	dev_dbg(dev, "PHY DEBUG_R0=0x%08x DEBUG_R1=0x%08x\n",
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R0),
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1));
+	imx6_pcie_reset_phy(imx6_pcie);
 	return ret;
 }
 
 static void imx6_pcie_host_init(struct pcie_port *pp)
 {
-	imx6_pcie_assert_core_reset(pp);
+	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
 
-	imx6_pcie_init_phy(pp);
-
-	imx6_pcie_deassert_core_reset(pp);
-
+	imx6_pcie_assert_core_reset(imx6_pcie);
+	imx6_pcie_init_phy(imx6_pcie);
+	imx6_pcie_deassert_core_reset(imx6_pcie);
 	dw_pcie_setup_rc(pp);
-
-	imx6_pcie_establish_link(pp);
+	imx6_pcie_establish_link(imx6_pcie);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		dw_pcie_msi_init(pp);
@@ -563,7 +566,7 @@
 
 static int imx6_pcie_link_up(struct pcie_port *pp)
 {
-	return readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) &
+	return dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1) &
 			PCIE_PHY_DEBUG_R1_XMLH_LINK_UP;
 }
 
@@ -572,24 +575,26 @@
 	.host_init = imx6_pcie_host_init,
 };
 
-static int __init imx6_add_pcie_port(struct pcie_port *pp,
-			struct platform_device *pdev)
+static int __init imx6_add_pcie_port(struct imx6_pcie *imx6_pcie,
+				     struct platform_device *pdev)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		pp->msi_irq = platform_get_irq_byname(pdev, "msi");
 		if (pp->msi_irq <= 0) {
-			dev_err(&pdev->dev, "failed to get MSI irq\n");
+			dev_err(dev, "failed to get MSI irq\n");
 			return -ENODEV;
 		}
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 				       imx6_pcie_msi_handler,
 				       IRQF_SHARED | IRQF_NO_THREAD,
-				       "mx6-pcie-msi", pp);
+				       "mx6-pcie-msi", imx6_pcie);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request MSI irq\n");
+			dev_err(dev, "failed to request MSI irq\n");
 			return ret;
 		}
 	}
@@ -599,7 +604,7 @@
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -608,75 +613,72 @@
 
 static int __init imx6_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct imx6_pcie *imx6_pcie;
 	struct pcie_port *pp;
-	struct device_node *np = pdev->dev.of_node;
 	struct resource *dbi_base;
-	struct device_node *node = pdev->dev.of_node;
+	struct device_node *node = dev->of_node;
 	int ret;
 
-	imx6_pcie = devm_kzalloc(&pdev->dev, sizeof(*imx6_pcie), GFP_KERNEL);
+	imx6_pcie = devm_kzalloc(dev, sizeof(*imx6_pcie), GFP_KERNEL);
 	if (!imx6_pcie)
 		return -ENOMEM;
 
 	pp = &imx6_pcie->pp;
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	imx6_pcie->variant =
-		(enum imx6_pcie_variants)of_device_get_match_data(&pdev->dev);
+		(enum imx6_pcie_variants)of_device_get_match_data(dev);
 
 	/* Added for PCI abort handling */
 	hook_fault_code(16 + 6, imx6q_pcie_abort_handler, SIGBUS, 0,
 		"imprecise external abort");
 
 	dbi_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base);
+	pp->dbi_base = devm_ioremap_resource(dev, dbi_base);
 	if (IS_ERR(pp->dbi_base))
 		return PTR_ERR(pp->dbi_base);
 
 	/* Fetch GPIOs */
-	imx6_pcie->reset_gpio = of_get_named_gpio(np, "reset-gpio", 0);
-	imx6_pcie->gpio_active_high = of_property_read_bool(np,
+	imx6_pcie->reset_gpio = of_get_named_gpio(node, "reset-gpio", 0);
+	imx6_pcie->gpio_active_high = of_property_read_bool(node,
 						"reset-gpio-active-high");
 	if (gpio_is_valid(imx6_pcie->reset_gpio)) {
-		ret = devm_gpio_request_one(&pdev->dev, imx6_pcie->reset_gpio,
+		ret = devm_gpio_request_one(dev, imx6_pcie->reset_gpio,
 				imx6_pcie->gpio_active_high ?
 					GPIOF_OUT_INIT_HIGH :
 					GPIOF_OUT_INIT_LOW,
 				"PCIe reset");
 		if (ret) {
-			dev_err(&pdev->dev, "unable to get reset gpio\n");
+			dev_err(dev, "unable to get reset gpio\n");
 			return ret;
 		}
 	}
 
 	/* Fetch clocks */
-	imx6_pcie->pcie_phy = devm_clk_get(&pdev->dev, "pcie_phy");
+	imx6_pcie->pcie_phy = devm_clk_get(dev, "pcie_phy");
 	if (IS_ERR(imx6_pcie->pcie_phy)) {
-		dev_err(&pdev->dev,
-			"pcie_phy clock source missing or invalid\n");
+		dev_err(dev, "pcie_phy clock source missing or invalid\n");
 		return PTR_ERR(imx6_pcie->pcie_phy);
 	}
 
-	imx6_pcie->pcie_bus = devm_clk_get(&pdev->dev, "pcie_bus");
+	imx6_pcie->pcie_bus = devm_clk_get(dev, "pcie_bus");
 	if (IS_ERR(imx6_pcie->pcie_bus)) {
-		dev_err(&pdev->dev,
-			"pcie_bus clock source missing or invalid\n");
+		dev_err(dev, "pcie_bus clock source missing or invalid\n");
 		return PTR_ERR(imx6_pcie->pcie_bus);
 	}
 
-	imx6_pcie->pcie = devm_clk_get(&pdev->dev, "pcie");
+	imx6_pcie->pcie = devm_clk_get(dev, "pcie");
 	if (IS_ERR(imx6_pcie->pcie)) {
-		dev_err(&pdev->dev,
-			"pcie clock source missing or invalid\n");
+		dev_err(dev, "pcie clock source missing or invalid\n");
 		return PTR_ERR(imx6_pcie->pcie);
 	}
 
 	if (imx6_pcie->variant == IMX6SX) {
-		imx6_pcie->pcie_inbound_axi = devm_clk_get(&pdev->dev,
+		imx6_pcie->pcie_inbound_axi = devm_clk_get(dev,
 							   "pcie_inbound_axi");
 		if (IS_ERR(imx6_pcie->pcie_inbound_axi)) {
-			dev_err(&pdev->dev,
+			dev_err(dev,
 				"pcie_incbound_axi clock missing or invalid\n");
 			return PTR_ERR(imx6_pcie->pcie_inbound_axi);
 		}
@@ -686,7 +688,7 @@
 	imx6_pcie->iomuxc_gpr =
 		 syscon_regmap_lookup_by_compatible("fsl,imx6q-iomuxc-gpr");
 	if (IS_ERR(imx6_pcie->iomuxc_gpr)) {
-		dev_err(&pdev->dev, "unable to find iomuxc registers\n");
+		dev_err(dev, "unable to find iomuxc registers\n");
 		return PTR_ERR(imx6_pcie->iomuxc_gpr);
 	}
 
@@ -712,12 +714,12 @@
 		imx6_pcie->tx_swing_low = 127;
 
 	/* Limit link speed */
-	ret = of_property_read_u32(pp->dev->of_node, "fsl,max-link-speed",
+	ret = of_property_read_u32(node, "fsl,max-link-speed",
 				   &imx6_pcie->link_gen);
 	if (ret)
 		imx6_pcie->link_gen = 1;
 
-	ret = imx6_add_pcie_port(pp, pdev);
+	ret = imx6_add_pcie_port(imx6_pcie, pdev);
 	if (ret < 0)
 		return ret;
 
@@ -730,7 +732,7 @@
 	struct imx6_pcie *imx6_pcie = platform_get_drvdata(pdev);
 
 	/* bring down link, so bootloader gets clean state in case of reboot */
-	imx6_pcie_assert_core_reset(&imx6_pcie->pp);
+	imx6_pcie_assert_core_reset(imx6_pcie);
 }
 
 static const struct of_device_id imx6_pcie_of_match[] = {
diff --git a/drivers/pci/host/pci-keystone-dw.c b/drivers/pci/host/pci-keystone-dw.c
index 4151509..9397c46 100644
--- a/drivers/pci/host/pci-keystone-dw.c
+++ b/drivers/pci/host/pci-keystone-dw.c
@@ -88,13 +88,24 @@
 	return ks_pcie->app.start + MSI_IRQ;
 }
 
+static u32 ks_dw_app_readl(struct keystone_pcie *ks_pcie, u32 offset)
+{
+	return readl(ks_pcie->va_app_base + offset);
+}
+
+static void ks_dw_app_writel(struct keystone_pcie *ks_pcie, u32 offset, u32 val)
+{
+	writel(val, ks_pcie->va_app_base + offset);
+}
+
 void ks_dw_pcie_handle_msi_irq(struct keystone_pcie *ks_pcie, int offset)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 pending, vector;
 	int src, virq;
 
-	pending = readl(ks_pcie->va_app_base + MSI0_IRQ_STATUS + (offset << 4));
+	pending = ks_dw_app_readl(ks_pcie, MSI0_IRQ_STATUS + (offset << 4));
 
 	/*
 	 * MSI0 status bit 0-3 shows vectors 0, 8, 16, 24, MSI1 status bit
@@ -104,7 +115,7 @@
 		if (BIT(src) & pending) {
 			vector = offset + (src << 3);
 			virq = irq_linear_revmap(pp->irq_domain, vector);
-			dev_dbg(pp->dev, "irq: bit %d, vector %d, virq %d\n",
+			dev_dbg(dev, "irq: bit %d, vector %d, virq %d\n",
 				src, vector, virq);
 			generic_handle_irq(virq);
 		}
@@ -124,9 +135,9 @@
 	offset = d->irq - irq_linear_revmap(pp->irq_domain, 0);
 	update_reg_offset_bit_pos(offset, &reg_offset, &bit_pos);
 
-	writel(BIT(bit_pos),
-	       ks_pcie->va_app_base + MSI0_IRQ_STATUS + (reg_offset << 4));
-	writel(reg_offset + MSI_IRQ_OFFSET, ks_pcie->va_app_base + IRQ_EOI);
+	ks_dw_app_writel(ks_pcie, MSI0_IRQ_STATUS + (reg_offset << 4),
+			 BIT(bit_pos));
+	ks_dw_app_writel(ks_pcie, IRQ_EOI, reg_offset + MSI_IRQ_OFFSET);
 }
 
 void ks_dw_pcie_msi_set_irq(struct pcie_port *pp, int irq)
@@ -135,8 +146,8 @@
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
 
 	update_reg_offset_bit_pos(irq, &reg_offset, &bit_pos);
-	writel(BIT(bit_pos),
-	       ks_pcie->va_app_base + MSI0_IRQ_ENABLE_SET + (reg_offset << 4));
+	ks_dw_app_writel(ks_pcie, MSI0_IRQ_ENABLE_SET + (reg_offset << 4),
+			 BIT(bit_pos));
 }
 
 void ks_dw_pcie_msi_clear_irq(struct pcie_port *pp, int irq)
@@ -145,8 +156,8 @@
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
 
 	update_reg_offset_bit_pos(irq, &reg_offset, &bit_pos);
-	writel(BIT(bit_pos),
-	       ks_pcie->va_app_base + MSI0_IRQ_ENABLE_CLR + (reg_offset << 4));
+	ks_dw_app_writel(ks_pcie, MSI0_IRQ_ENABLE_CLR + (reg_offset << 4),
+			 BIT(bit_pos));
 }
 
 static void ks_dw_pcie_msi_irq_mask(struct irq_data *d)
@@ -215,6 +226,7 @@
 int ks_dw_pcie_msi_host_init(struct pcie_port *pp, struct msi_controller *chip)
 {
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
+	struct device *dev = pp->dev;
 	int i;
 
 	pp->irq_domain = irq_domain_add_linear(ks_pcie->msi_intc_np,
@@ -222,7 +234,7 @@
 					&ks_dw_pcie_msi_domain_ops,
 					chip);
 	if (!pp->irq_domain) {
-		dev_err(pp->dev, "irq domain init failed\n");
+		dev_err(dev, "irq domain init failed\n");
 		return -ENXIO;
 	}
 
@@ -237,47 +249,47 @@
 	int i;
 
 	for (i = 0; i < MAX_LEGACY_IRQS; i++)
-		writel(0x1, ks_pcie->va_app_base + IRQ_ENABLE_SET + (i << 4));
+		ks_dw_app_writel(ks_pcie, IRQ_ENABLE_SET + (i << 4), 0x1);
 }
 
 void ks_dw_pcie_handle_legacy_irq(struct keystone_pcie *ks_pcie, int offset)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 pending;
 	int virq;
 
-	pending = readl(ks_pcie->va_app_base + IRQ_STATUS + (offset << 4));
+	pending = ks_dw_app_readl(ks_pcie, IRQ_STATUS + (offset << 4));
 
 	if (BIT(0) & pending) {
 		virq = irq_linear_revmap(ks_pcie->legacy_irq_domain, offset);
-		dev_dbg(pp->dev, ": irq: irq_offset %d, virq %d\n", offset,
-			virq);
+		dev_dbg(dev, ": irq: irq_offset %d, virq %d\n", offset, virq);
 		generic_handle_irq(virq);
 	}
 
 	/* EOI the INTx interrupt */
-	writel(offset, ks_pcie->va_app_base + IRQ_EOI);
+	ks_dw_app_writel(ks_pcie, IRQ_EOI, offset);
 }
 
-void ks_dw_pcie_enable_error_irq(void __iomem *reg_base)
+void ks_dw_pcie_enable_error_irq(struct keystone_pcie *ks_pcie)
 {
-	writel(ERR_IRQ_ALL, reg_base + ERR_IRQ_ENABLE_SET);
+	ks_dw_app_writel(ks_pcie, ERR_IRQ_ENABLE_SET, ERR_IRQ_ALL);
 }
 
-irqreturn_t ks_dw_pcie_handle_error_irq(struct device *dev,
-					void __iomem *reg_base)
+irqreturn_t ks_dw_pcie_handle_error_irq(struct keystone_pcie *ks_pcie)
 {
 	u32 status;
 
-	status = readl(reg_base + ERR_IRQ_STATUS_RAW) & ERR_IRQ_ALL;
+	status = ks_dw_app_readl(ks_pcie, ERR_IRQ_STATUS_RAW) & ERR_IRQ_ALL;
 	if (!status)
 		return IRQ_NONE;
 
 	if (status & ERR_FATAL_IRQ)
-		dev_err(dev, "fatal error (status %#010x)\n", status);
+		dev_err(ks_pcie->pp.dev, "fatal error (status %#010x)\n",
+			status);
 
 	/* Ack the IRQ; status bits are RW1C */
-	writel(status, reg_base + ERR_IRQ_STATUS);
+	ks_dw_app_writel(ks_pcie, ERR_IRQ_STATUS, status);
 	return IRQ_HANDLED;
 }
 
@@ -322,15 +334,15 @@
  * Since modification of dbi_cs2 involves different clock domain, read the
  * status back to ensure the transition is complete.
  */
-static void ks_dw_pcie_set_dbi_mode(void __iomem *reg_virt)
+static void ks_dw_pcie_set_dbi_mode(struct keystone_pcie *ks_pcie)
 {
 	u32 val;
 
-	writel(DBI_CS2_EN_VAL | readl(reg_virt + CMD_STATUS),
-	       reg_virt + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, DBI_CS2_EN_VAL | val);
 
 	do {
-		val = readl(reg_virt + CMD_STATUS);
+		val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
 	} while (!(val & DBI_CS2_EN_VAL));
 }
 
@@ -340,15 +352,15 @@
  * Since modification of dbi_cs2 involves different clock domain, read the
  * status back to ensure the transition is complete.
  */
-static void ks_dw_pcie_clear_dbi_mode(void __iomem *reg_virt)
+static void ks_dw_pcie_clear_dbi_mode(struct keystone_pcie *ks_pcie)
 {
 	u32 val;
 
-	writel(~DBI_CS2_EN_VAL & readl(reg_virt + CMD_STATUS),
-		     reg_virt + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, ~DBI_CS2_EN_VAL & val);
 
 	do {
-		val = readl(reg_virt + CMD_STATUS);
+		val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
 	} while (val & DBI_CS2_EN_VAL);
 }
 
@@ -357,28 +369,29 @@
 	struct pcie_port *pp = &ks_pcie->pp;
 	u32 start = pp->mem->start, end = pp->mem->end;
 	int i, tr_size;
+	u32 val;
 
 	/* Disable BARs for inbound access */
-	ks_dw_pcie_set_dbi_mode(ks_pcie->va_app_base);
-	writel(0, pp->dbi_base + PCI_BASE_ADDRESS_0);
-	writel(0, pp->dbi_base + PCI_BASE_ADDRESS_1);
-	ks_dw_pcie_clear_dbi_mode(ks_pcie->va_app_base);
+	ks_dw_pcie_set_dbi_mode(ks_pcie);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, 0);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_1, 0);
+	ks_dw_pcie_clear_dbi_mode(ks_pcie);
 
 	/* Set outbound translation size per window division */
-	writel(CFG_PCIM_WIN_SZ_IDX & 0x7, ks_pcie->va_app_base + OB_SIZE);
+	ks_dw_app_writel(ks_pcie, OB_SIZE, CFG_PCIM_WIN_SZ_IDX & 0x7);
 
 	tr_size = (1 << (CFG_PCIM_WIN_SZ_IDX & 0x7)) * SZ_1M;
 
 	/* Using Direct 1:1 mapping of RC <-> PCI memory space */
 	for (i = 0; (i < CFG_PCIM_WIN_CNT) && (start < end); i++) {
-		writel(start | 1, ks_pcie->va_app_base + OB_OFFSET_INDEX(i));
-		writel(0, ks_pcie->va_app_base + OB_OFFSET_HI(i));
+		ks_dw_app_writel(ks_pcie, OB_OFFSET_INDEX(i), start | 1);
+		ks_dw_app_writel(ks_pcie, OB_OFFSET_HI(i), 0);
 		start += tr_size;
 	}
 
 	/* Enable OB translation */
-	writel(OB_XLAT_EN_VAL | readl(ks_pcie->va_app_base + CMD_STATUS),
-	       ks_pcie->va_app_base + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, OB_XLAT_EN_VAL | val);
 }
 
 /**
@@ -418,7 +431,7 @@
 	if (bus != 1)
 		regval |= BIT(24);
 
-	writel(regval, ks_pcie->va_app_base + CFG_SETUP);
+	ks_dw_app_writel(ks_pcie, CFG_SETUP, regval);
 	return pp->va_cfg0_base;
 }
 
@@ -456,19 +469,19 @@
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
 
 	/* Configure and set up BAR0 */
-	ks_dw_pcie_set_dbi_mode(ks_pcie->va_app_base);
+	ks_dw_pcie_set_dbi_mode(ks_pcie);
 
 	/* Enable BAR0 */
-	writel(1, pp->dbi_base + PCI_BASE_ADDRESS_0);
-	writel(SZ_4K - 1, pp->dbi_base + PCI_BASE_ADDRESS_0);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, 1);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, SZ_4K - 1);
 
-	ks_dw_pcie_clear_dbi_mode(ks_pcie->va_app_base);
+	ks_dw_pcie_clear_dbi_mode(ks_pcie);
 
 	 /*
 	  * For BAR0, just setting bus address for inbound writes (MSI) should
 	  * be sufficient.  Use physical address to avoid any conflicts.
 	  */
-	writel(ks_pcie->app.start, pp->dbi_base + PCI_BASE_ADDRESS_0);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, ks_pcie->app.start);
 }
 
 /**
@@ -476,8 +489,9 @@
  */
 int ks_dw_pcie_link_up(struct pcie_port *pp)
 {
-	u32 val = readl(pp->dbi_base + DEBUG0);
+	u32 val;
 
+	val = dw_pcie_readl_rc(pp, DEBUG0);
 	return (val & LTSSM_STATE_MASK) == LTSSM_STATE_L0;
 }
 
@@ -486,13 +500,13 @@
 	u32 val;
 
 	/* Disable Link training */
-	val = readl(ks_pcie->va_app_base + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
 	val &= ~LTSSM_EN_VAL;
-	writel(LTSSM_EN_VAL | val,  ks_pcie->va_app_base + CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, val);
 
 	/* Initiate Link Training */
-	val = readl(ks_pcie->va_app_base + CMD_STATUS);
-	writel(LTSSM_EN_VAL | val,  ks_pcie->va_app_base + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, LTSSM_EN_VAL | val);
 }
 
 /**
@@ -506,12 +520,13 @@
 				struct device_node *msi_intc_np)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
-	struct platform_device *pdev = to_platform_device(pp->dev);
+	struct device *dev = pp->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	struct resource *res;
 
 	/* Index 0 is the config reg. space address */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	pp->dbi_base = devm_ioremap_resource(pp->dev, res);
+	pp->dbi_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pp->dbi_base))
 		return PTR_ERR(pp->dbi_base);
 
@@ -524,7 +539,7 @@
 
 	/* Index 1 is the application reg. space address */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	ks_pcie->va_app_base = devm_ioremap_resource(pp->dev, res);
+	ks_pcie->va_app_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(ks_pcie->va_app_base))
 		return PTR_ERR(ks_pcie->va_app_base);
 
@@ -537,7 +552,7 @@
 					&ks_dw_pcie_legacy_irq_domain_ops,
 					NULL);
 	if (!ks_pcie->legacy_irq_domain) {
-		dev_err(pp->dev, "Failed to add irq domain for legacy irqs\n");
+		dev_err(dev, "Failed to add irq domain for legacy irqs\n");
 		return -EINVAL;
 	}
 
diff --git a/drivers/pci/host/pci-keystone.c b/drivers/pci/host/pci-keystone.c
index 82b461b..043c19a 100644
--- a/drivers/pci/host/pci-keystone.c
+++ b/drivers/pci/host/pci-keystone.c
@@ -89,12 +89,13 @@
 static int ks_pcie_establish_link(struct keystone_pcie *ks_pcie)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	unsigned int retries;
 
 	dw_pcie_setup_rc(pp);
 
 	if (dw_pcie_link_up(pp)) {
-		dev_err(pp->dev, "Link already up\n");
+		dev_err(dev, "Link already up\n");
 		return 0;
 	}
 
@@ -105,7 +106,7 @@
 			return 0;
 	}
 
-	dev_err(pp->dev, "phy link never came up\n");
+	dev_err(dev, "phy link never came up\n");
 	return -ETIMEDOUT;
 }
 
@@ -115,9 +116,10 @@
 	struct keystone_pcie *ks_pcie = irq_desc_get_handler_data(desc);
 	u32 offset = irq - ks_pcie->msi_host_irqs[0];
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
-	dev_dbg(pp->dev, "%s, irq %d\n", __func__, irq);
+	dev_dbg(dev, "%s, irq %d\n", __func__, irq);
 
 	/*
 	 * The chained irq handler installation would have replaced normal
@@ -142,10 +144,11 @@
 	unsigned int irq = irq_desc_get_irq(desc);
 	struct keystone_pcie *ks_pcie = irq_desc_get_handler_data(desc);
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 irq_offset = irq - ks_pcie->legacy_host_irqs[0];
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
-	dev_dbg(pp->dev, ": Handling legacy irq %d\n", irq);
+	dev_dbg(dev, ": Handling legacy irq %d\n", irq);
 
 	/*
 	 * The chained irq handler installation would have replaced normal
@@ -234,7 +237,7 @@
 	}
 
 	if (ks_pcie->error_irq > 0)
-		ks_dw_pcie_enable_error_irq(ks_pcie->va_app_base);
+		ks_dw_pcie_enable_error_irq(ks_pcie);
 }
 
 /*
@@ -302,14 +305,14 @@
 {
 	struct keystone_pcie *ks_pcie = priv;
 
-	return ks_dw_pcie_handle_error_irq(ks_pcie->pp.dev,
-					   ks_pcie->va_app_base);
+	return ks_dw_pcie_handle_error_irq(ks_pcie);
 }
 
 static int __init ks_add_pcie_port(struct keystone_pcie *ks_pcie,
 			 struct platform_device *pdev)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	ret = ks_pcie_get_irq_controller_info(ks_pcie,
@@ -332,12 +335,12 @@
 	 */
 	ks_pcie->error_irq = irq_of_parse_and_map(ks_pcie->np, 0);
 	if (ks_pcie->error_irq <= 0)
-		dev_info(&pdev->dev, "no error IRQ defined\n");
+		dev_info(dev, "no error IRQ defined\n");
 	else {
 		ret = request_irq(ks_pcie->error_irq, pcie_err_irq_handler,
 				  IRQF_SHARED, "pcie-error-irq", ks_pcie);
 		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to request error IRQ %d\n",
+			dev_err(dev, "failed to request error IRQ %d\n",
 				ks_pcie->error_irq);
 			return ret;
 		}
@@ -347,7 +350,7 @@
 	pp->ops = &keystone_pcie_host_ops;
 	ret = ks_dw_pcie_host_init(ks_pcie, ks_pcie->msi_intc_np);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -381,12 +384,12 @@
 	struct phy *phy;
 	int ret;
 
-	ks_pcie = devm_kzalloc(&pdev->dev, sizeof(*ks_pcie),
-				GFP_KERNEL);
+	ks_pcie = devm_kzalloc(dev, sizeof(*ks_pcie), GFP_KERNEL);
 	if (!ks_pcie)
 		return -ENOMEM;
 
 	pp = &ks_pcie->pp;
+	pp->dev = dev;
 
 	/* initialize SerDes Phy if present */
 	phy = devm_phy_get(dev, "pcie-phy");
@@ -408,7 +411,6 @@
 	devm_iounmap(dev, reg_p);
 	devm_release_mem_region(dev, res->start, resource_size(res));
 
-	pp->dev = dev;
 	ks_pcie->np = dev->of_node;
 	platform_set_drvdata(pdev, ks_pcie);
 	ks_pcie->clk = devm_clk_get(dev, "pcie");
diff --git a/drivers/pci/host/pci-keystone.h b/drivers/pci/host/pci-keystone.h
index a5b0cb2..bc54baf 100644
--- a/drivers/pci/host/pci-keystone.h
+++ b/drivers/pci/host/pci-keystone.h
@@ -17,8 +17,8 @@
 #define MAX_LEGACY_HOST_IRQS		4
 
 struct keystone_pcie {
+	struct	pcie_port	pp;		/* pp.dbi_base is DT 0th res */
 	struct	clk		*clk;
-	struct	pcie_port	pp;
 	/* PCI Device ID */
 	u32			device_id;
 	int			num_legacy_host_irqs;
@@ -34,7 +34,7 @@
 	int error_irq;
 
 	/* Application register space */
-	void __iomem		*va_app_base;
+	void __iomem		*va_app_base;	/* DT 1st resource */
 	struct resource		app;
 };
 
@@ -45,9 +45,8 @@
 /* Keystone specific PCI controller APIs */
 void ks_dw_pcie_enable_legacy_irqs(struct keystone_pcie *ks_pcie);
 void ks_dw_pcie_handle_legacy_irq(struct keystone_pcie *ks_pcie, int offset);
-void ks_dw_pcie_enable_error_irq(void __iomem *reg_base);
-irqreturn_t ks_dw_pcie_handle_error_irq(struct device *dev,
-					void __iomem *reg_base);
+void ks_dw_pcie_enable_error_irq(struct keystone_pcie *ks_pcie);
+irqreturn_t ks_dw_pcie_handle_error_irq(struct keystone_pcie *ks_pcie);
 int  ks_dw_pcie_host_init(struct keystone_pcie *ks_pcie,
 			struct device_node *msi_intc_np);
 int ks_dw_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus,
diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c
index 114ba81..2cb7315 100644
--- a/drivers/pci/host/pci-layerscape.c
+++ b/drivers/pci/host/pci-layerscape.c
@@ -45,10 +45,9 @@
 };
 
 struct ls_pcie {
-	void __iomem *dbi;
+	struct pcie_port pp;		/* pp.dbi_base is DT regs */
 	void __iomem *lut;
 	struct regmap *scfg;
-	struct pcie_port pp;
 	const struct ls_pcie_drvdata *drvdata;
 	int index;
 };
@@ -59,7 +58,7 @@
 {
 	u32 header_type;
 
-	header_type = ioread8(pcie->dbi + PCI_HEADER_TYPE);
+	header_type = ioread8(pcie->pp.dbi_base + PCI_HEADER_TYPE);
 	header_type &= 0x7f;
 
 	return header_type == PCI_HEADER_TYPE_BRIDGE;
@@ -68,13 +67,13 @@
 /* Clear multi-function bit */
 static void ls_pcie_clear_multifunction(struct ls_pcie *pcie)
 {
-	iowrite8(PCI_HEADER_TYPE_BRIDGE, pcie->dbi + PCI_HEADER_TYPE);
+	iowrite8(PCI_HEADER_TYPE_BRIDGE, pcie->pp.dbi_base + PCI_HEADER_TYPE);
 }
 
 /* Fix class value */
 static void ls_pcie_fix_class(struct ls_pcie *pcie)
 {
-	iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->dbi + PCI_CLASS_DEVICE);
+	iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->pp.dbi_base + PCI_CLASS_DEVICE);
 }
 
 /* Drop MSG TLP except for Vendor MSG */
@@ -82,9 +81,9 @@
 {
 	u32 val;
 
-	val = ioread32(pcie->dbi + PCIE_STRFMR1);
+	val = ioread32(pcie->pp.dbi_base + PCIE_STRFMR1);
 	val &= 0xDFFFFFFF;
-	iowrite32(val, pcie->dbi + PCIE_STRFMR1);
+	iowrite32(val, pcie->pp.dbi_base + PCIE_STRFMR1);
 }
 
 static int ls1021_pcie_link_up(struct pcie_port *pp)
@@ -106,18 +105,19 @@
 
 static void ls1021_pcie_host_init(struct pcie_port *pp)
 {
+	struct device *dev = pp->dev;
 	struct ls_pcie *pcie = to_ls_pcie(pp);
 	u32 index[2];
 
-	pcie->scfg = syscon_regmap_lookup_by_phandle(pp->dev->of_node,
+	pcie->scfg = syscon_regmap_lookup_by_phandle(dev->of_node,
 						     "fsl,pcie-scfg");
 	if (IS_ERR(pcie->scfg)) {
-		dev_err(pp->dev, "No syscfg phandle specified\n");
+		dev_err(dev, "No syscfg phandle specified\n");
 		pcie->scfg = NULL;
 		return;
 	}
 
-	if (of_property_read_u32_array(pp->dev->of_node,
+	if (of_property_read_u32_array(dev->of_node,
 				       "fsl,pcie-scfg", index, 2)) {
 		pcie->scfg = NULL;
 		return;
@@ -148,18 +148,19 @@
 {
 	struct ls_pcie *pcie = to_ls_pcie(pp);
 
-	iowrite32(1, pcie->dbi + PCIE_DBI_RO_WR_EN);
+	iowrite32(1, pcie->pp.dbi_base + PCIE_DBI_RO_WR_EN);
 	ls_pcie_fix_class(pcie);
 	ls_pcie_clear_multifunction(pcie);
 	ls_pcie_drop_msg_tlp(pcie);
-	iowrite32(0, pcie->dbi + PCIE_DBI_RO_WR_EN);
+	iowrite32(0, pcie->pp.dbi_base + PCIE_DBI_RO_WR_EN);
 }
 
 static int ls_pcie_msi_host_init(struct pcie_port *pp,
 				 struct msi_controller *chip)
 {
+	struct device *dev = pp->dev;
+	struct device_node *np = dev->of_node;
 	struct device_node *msi_node;
-	struct device_node *np = pp->dev->of_node;
 
 	/*
 	 * The MSI domain is set by the generic of_msi_configure().  This
@@ -169,7 +170,7 @@
 	 */
 	msi_node = of_parse_phandle(np, "msi-parent", 0);
 	if (!msi_node) {
-		dev_err(pp->dev, "failed to find msi-parent\n");
+		dev_err(dev, "failed to find msi-parent\n");
 		return -EINVAL;
 	}
 
@@ -212,19 +213,15 @@
 	{ },
 };
 
-static int __init ls_add_pcie_port(struct pcie_port *pp,
-				   struct platform_device *pdev)
+static int __init ls_add_pcie_port(struct ls_pcie *pcie)
 {
+	struct pcie_port *pp = &pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
-	struct ls_pcie *pcie = to_ls_pcie(pp);
-
-	pp->dev = &pdev->dev;
-	pp->dbi_base = pcie->dbi;
-	pp->ops = pcie->drvdata->ops;
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(pp->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -233,38 +230,43 @@
 
 static int __init ls_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	const struct of_device_id *match;
 	struct ls_pcie *pcie;
+	struct pcie_port *pp;
 	struct resource *dbi_base;
 	int ret;
 
-	match = of_match_device(ls_pcie_of_match, &pdev->dev);
+	match = of_match_device(ls_pcie_of_match, dev);
 	if (!match)
 		return -ENODEV;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
+	pp = &pcie->pp;
+	pp->dev = dev;
+
 	dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs");
-	pcie->dbi = devm_ioremap_resource(&pdev->dev, dbi_base);
-	if (IS_ERR(pcie->dbi)) {
-		dev_err(&pdev->dev, "missing *regs* space\n");
-		return PTR_ERR(pcie->dbi);
+	pcie->pp.dbi_base = devm_ioremap_resource(dev, dbi_base);
+	if (IS_ERR(pcie->pp.dbi_base)) {
+		dev_err(dev, "missing *regs* space\n");
+		return PTR_ERR(pcie->pp.dbi_base);
 	}
 
 	pcie->drvdata = match->data;
-	pcie->lut = pcie->dbi + pcie->drvdata->lut_offset;
+	/* drvdata is NULL after kzalloc; read ->ops only once it is set */
+	pp->ops = pcie->drvdata->ops;
+	pcie->lut = pcie->pp.dbi_base + pcie->drvdata->lut_offset;
 
 	if (!ls_pcie_is_bridge(pcie))
 		return -ENODEV;
 
-	ret = ls_add_pcie_port(&pcie->pp, pdev);
+	ret = ls_add_pcie_port(pcie);
 	if (ret < 0)
 		return ret;
 
-	platform_set_drvdata(pdev, pcie);
-
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c
index 307f81d..45a89d9 100644
--- a/drivers/pci/host/pci-mvebu.c
+++ b/drivers/pci/host/pci-mvebu.c
@@ -1190,13 +1190,13 @@
 
 static int mvebu_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct mvebu_pcie *pcie;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct device_node *child;
 	int num, i, ret;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_pcie),
-			    GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
@@ -1206,7 +1206,7 @@
 	/* Get the PCIe memory and I/O aperture */
 	mvebu_mbus_get_pcie_mem_aperture(&pcie->mem);
 	if (resource_size(&pcie->mem) == 0) {
-		dev_err(&pdev->dev, "invalid memory aperture size\n");
+		dev_err(dev, "invalid memory aperture size\n");
 		return -EINVAL;
 	}
 
@@ -1224,20 +1224,18 @@
 	/* Get the bus range */
 	ret = of_pci_parse_bus_range(np, &pcie->busn);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to parse bus-range property: %d\n",
-			ret);
+		dev_err(dev, "failed to parse bus-range property: %d\n", ret);
 		return ret;
 	}
 
-	num = of_get_available_child_count(pdev->dev.of_node);
+	num = of_get_available_child_count(np);
 
-	pcie->ports = devm_kcalloc(&pdev->dev, num, sizeof(*pcie->ports),
-				   GFP_KERNEL);
+	pcie->ports = devm_kcalloc(dev, num, sizeof(*pcie->ports), GFP_KERNEL);
 	if (!pcie->ports)
 		return -ENOMEM;
 
 	i = 0;
-	for_each_available_child_of_node(pdev->dev.of_node, child) {
+	for_each_available_child_of_node(np, child) {
 		struct mvebu_pcie_port *port = &pcie->ports[i];
 
 		ret = mvebu_pcie_parse_port(pcie, port, child);
@@ -1266,8 +1264,7 @@
 
 		port->base = mvebu_pcie_map_registers(pdev, child, port);
 		if (IS_ERR(port->base)) {
-			dev_err(&pdev->dev, "%s: cannot map registers\n",
-				port->name);
+			dev_err(dev, "%s: cannot map registers\n", port->name);
 			port->base = NULL;
 			mvebu_pcie_powerdown(port);
 			continue;
diff --git a/drivers/pci/host/pci-rcar-gen2.c b/drivers/pci/host/pci-rcar-gen2.c
index 597566f..1eeefa4 100644
--- a/drivers/pci/host/pci-rcar-gen2.c
+++ b/drivers/pci/host/pci-rcar-gen2.c
@@ -154,10 +154,11 @@
 static irqreturn_t rcar_pci_err_irq(int irq, void *pw)
 {
 	struct rcar_pci_priv *priv = pw;
+	struct device *dev = priv->dev;
 	u32 status = ioread32(priv->reg + RCAR_PCI_INT_STATUS_REG);
 
 	if (status & RCAR_PCI_INT_ALLERRORS) {
-		dev_err(priv->dev, "error irq: status %08x\n", status);
+		dev_err(dev, "error irq: status %08x\n", status);
 
 		/* clear the error(s) */
 		iowrite32(status & RCAR_PCI_INT_ALLERRORS,
@@ -170,13 +171,14 @@
 
 static void rcar_pci_setup_errirq(struct rcar_pci_priv *priv)
 {
+	struct device *dev = priv->dev;
 	int ret;
 	u32 val;
 
-	ret = devm_request_irq(priv->dev, priv->irq, rcar_pci_err_irq,
+	ret = devm_request_irq(dev, priv->irq, rcar_pci_err_irq,
 			       IRQF_SHARED, "error irq", priv);
 	if (ret) {
-		dev_err(priv->dev, "cannot claim IRQ for error handling\n");
+		dev_err(dev, "cannot claim IRQ for error handling\n");
 		return;
 	}
 
@@ -192,15 +194,16 @@
 static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
 {
 	struct rcar_pci_priv *priv = sys->private_data;
+	struct device *dev = priv->dev;
 	void __iomem *reg = priv->reg;
 	u32 val;
 	int ret;
 
-	pm_runtime_enable(priv->dev);
-	pm_runtime_get_sync(priv->dev);
+	pm_runtime_enable(dev);
+	pm_runtime_get_sync(dev);
 
 	val = ioread32(reg + RCAR_PCI_UNIT_REV_REG);
-	dev_info(priv->dev, "PCI: bus%u revision %x\n", sys->busnr, val);
+	dev_info(dev, "PCI: bus%u revision %x\n", sys->busnr, val);
 
 	/* Disable Direct Power Down State and assert reset */
 	val = ioread32(reg + RCAR_USBCTR_REG) & ~RCAR_USBCTR_DIRPD;
@@ -275,7 +278,7 @@
 
 	/* Add PCI resources */
 	pci_add_resource(&sys->resources, &priv->mem_res);
-	ret = devm_request_pci_bus_resources(priv->dev, &sys->resources);
+	ret = devm_request_pci_bus_resources(dev, &sys->resources);
 	if (ret < 0)
 		return ret;
 
@@ -311,6 +314,7 @@
 static int rcar_pci_parse_map_dma_ranges(struct rcar_pci_priv *pci,
 					 struct device_node *np)
 {
+	struct device *dev = pci->dev;
 	struct of_pci_range range;
 	struct of_pci_range_parser parser;
 	int index = 0;
@@ -331,14 +335,14 @@
 
 		/* Catch HW limitations */
 		if (!(range.flags & IORESOURCE_PREFETCH)) {
-			dev_err(pci->dev, "window must be prefetchable\n");
+			dev_err(dev, "window must be prefetchable\n");
 			return -EINVAL;
 		}
 		if (pci->window_addr) {
 			u32 lowaddr = 1 << (ffs(pci->window_addr) - 1);
 
 			if (lowaddr < pci->window_size) {
-				dev_err(pci->dev, "invalid window size/addr\n");
+				dev_err(dev, "invalid window size/addr\n");
 				return -EINVAL;
 			}
 		}
@@ -350,6 +354,7 @@
 
 static int rcar_pci_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct resource *cfg_res, *mem_res;
 	struct rcar_pci_priv *priv;
 	void __iomem *reg;
@@ -357,7 +362,7 @@
 	void *hw_private[1];
 
 	cfg_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	reg = devm_ioremap_resource(&pdev->dev, cfg_res);
+	reg = devm_ioremap_resource(dev, cfg_res);
 	if (IS_ERR(reg))
 		return PTR_ERR(reg);
 
@@ -368,8 +373,7 @@
 	if (mem_res->start & 0xFFFF)
 		return -EINVAL;
 
-	priv = devm_kzalloc(&pdev->dev,
-			    sizeof(struct rcar_pci_priv), GFP_KERNEL);
+	priv = devm_kzalloc(dev, sizeof(struct rcar_pci_priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
 
@@ -378,10 +382,10 @@
 
 	priv->irq = platform_get_irq(pdev, 0);
 	priv->reg = reg;
-	priv->dev = &pdev->dev;
+	priv->dev = dev;
 
 	if (priv->irq < 0) {
-		dev_err(&pdev->dev, "no valid irq found\n");
+		dev_err(dev, "no valid irq found\n");
 		return priv->irq;
 	}
 
@@ -390,23 +394,23 @@
 	priv->window_pci = 0x40000000;
 	priv->window_size = SZ_1G;
 
-	if (pdev->dev.of_node) {
+	if (dev->of_node) {
 		struct resource busnr;
 		int ret;
 
-		ret = of_pci_parse_bus_range(pdev->dev.of_node, &busnr);
+		ret = of_pci_parse_bus_range(dev->of_node, &busnr);
 		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to parse bus-range\n");
+			dev_err(dev, "failed to parse bus-range\n");
 			return ret;
 		}
 
 		priv->busnr = busnr.start;
 		if (busnr.end != busnr.start)
-			dev_warn(&pdev->dev, "only one bus number supported\n");
+			dev_warn(dev, "only one bus number supported\n");
 
-		ret = rcar_pci_parse_map_dma_ranges(priv, pdev->dev.of_node);
+		ret = rcar_pci_parse_map_dma_ranges(priv, dev->of_node);
 		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to parse dma-range\n");
+			dev_err(dev, "failed to parse dma-range\n");
 			return ret;
 		}
 	} else {
@@ -421,7 +425,7 @@
 	hw.map_irq = rcar_pci_map_irq;
 	hw.ops = &rcar_pci_ops;
 	hw.setup = rcar_pci_setup;
-	pci_common_init_dev(&pdev->dev, &hw);
+	pci_common_init_dev(dev, &hw);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c
index e2a8e4c..8dfccf7 100644
--- a/drivers/pci/host/pci-tegra.c
+++ b/drivers/pci/host/pci-tegra.c
@@ -384,6 +384,7 @@
 static struct tegra_pcie_bus *tegra_pcie_bus_alloc(struct tegra_pcie *pcie,
 						   unsigned int busnr)
 {
+	struct device *dev = pcie->dev;
 	pgprot_t prot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 				 L_PTE_XN | L_PTE_MT_DEV_SHARED | L_PTE_SHARED);
 	phys_addr_t cs = pcie->cs->start;
@@ -413,8 +414,7 @@
 
 		err = ioremap_page_range(virt, virt + SZ_64K, phys, prot);
 		if (err < 0) {
-			dev_err(pcie->dev, "ioremap_page_range() failed: %d\n",
-				err);
+			dev_err(dev, "ioremap_page_range() failed: %d\n", err);
 			goto unmap;
 		}
 	}
@@ -462,6 +462,7 @@
 					int where)
 {
 	struct tegra_pcie *pcie = sys_to_pcie(bus->sysdata);
+	struct device *dev = pcie->dev;
 	void __iomem *addr = NULL;
 
 	if (bus->number == 0) {
@@ -482,8 +483,7 @@
 				addr = (void __iomem *)b->area->addr;
 
 		if (!addr) {
-			dev_err(pcie->dev,
-				"failed to map cfg. space for bus %u\n",
+			dev_err(dev, "failed to map cfg. space for bus %u\n",
 				bus->number);
 			return NULL;
 		}
@@ -584,12 +584,13 @@
 static void tegra_pcie_port_free(struct tegra_pcie_port *port)
 {
 	struct tegra_pcie *pcie = port->pcie;
+	struct device *dev = pcie->dev;
 
-	devm_iounmap(pcie->dev, port->base);
-	devm_release_mem_region(pcie->dev, port->regs.start,
+	devm_iounmap(dev, port->base);
+	devm_release_mem_region(dev, port->regs.start,
 				resource_size(&port->regs));
 	list_del(&port->list);
-	devm_kfree(pcie->dev, port);
+	devm_kfree(dev, port);
 }
 
 /* Tegra PCIE root complex wrongly reports device class */
@@ -612,12 +613,13 @@
 static int tegra_pcie_setup(int nr, struct pci_sys_data *sys)
 {
 	struct tegra_pcie *pcie = sys_to_pcie(sys);
+	struct device *dev = pcie->dev;
 	int err;
 
 	sys->mem_offset = pcie->offset.mem;
 	sys->io_offset = pcie->offset.io;
 
-	err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->io);
+	err = devm_request_resource(dev, &iomem_resource, &pcie->io);
 	if (err < 0)
 		return err;
 
@@ -631,7 +633,7 @@
 				sys->mem_offset);
 	pci_add_resource(&sys->resources, &pcie->busn);
 
-	err = devm_request_pci_bus_resources(pcie->dev, &sys->resources);
+	err = devm_request_pci_bus_resources(dev, &sys->resources);
 	if (err < 0)
 		return err;
 
@@ -672,6 +674,7 @@
 		"Peer2Peer error",
 	};
 	struct tegra_pcie *pcie = arg;
+	struct device *dev = pcie->dev;
 	u32 code, signature;
 
 	code = afi_readl(pcie, AFI_INTR_CODE) & AFI_INTR_CODE_MASK;
@@ -689,11 +692,9 @@
 	 * happen a lot during enumeration
 	 */
 	if (code == AFI_INTR_MASTER_ABORT)
-		dev_dbg(pcie->dev, "%s, signature: %08x\n", err_msg[code],
-			signature);
+		dev_dbg(dev, "%s, signature: %08x\n", err_msg[code], signature);
 	else
-		dev_err(pcie->dev, "%s, signature: %08x\n", err_msg[code],
-			signature);
+		dev_err(dev, "%s, signature: %08x\n", err_msg[code], signature);
 
 	if (code == AFI_INTR_TARGET_ABORT || code == AFI_INTR_MASTER_ABORT ||
 	    code == AFI_INTR_FPCI_DECODE_ERROR) {
@@ -701,9 +702,9 @@
 		u64 address = (u64)fpci << 32 | (signature & 0xfffffffc);
 
 		if (code == AFI_INTR_MASTER_ABORT)
-			dev_dbg(pcie->dev, "  FPCI address: %10llx\n", address);
+			dev_dbg(dev, "  FPCI address: %10llx\n", address);
 		else
-			dev_err(pcie->dev, "  FPCI address: %10llx\n", address);
+			dev_err(dev, "  FPCI address: %10llx\n", address);
 	}
 
 	return IRQ_HANDLED;
@@ -793,6 +794,7 @@
 
 static int tegra_pcie_phy_enable(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	u32 value;
 	int err;
@@ -829,7 +831,7 @@
 	/* wait for the PLL to lock */
 	err = tegra_pcie_pll_wait(pcie, 500);
 	if (err < 0) {
-		dev_err(pcie->dev, "PLL failed to lock: %d\n", err);
+		dev_err(dev, "PLL failed to lock: %d\n", err);
 		return err;
 	}
 
@@ -859,7 +861,7 @@
 	/* override IDDQ */
 	value = pads_readl(pcie, PADS_CTL);
 	value |= PADS_CTL_IDDQ_1L;
-	pads_writel(pcie, PADS_CTL, value);
+	pads_writel(pcie, value, PADS_CTL);
 
 	/* reset PLL */
 	value = pads_readl(pcie, soc->pads_pll_ctl);
@@ -880,8 +882,7 @@
 	for (i = 0; i < port->lanes; i++) {
 		err = phy_power_on(port->phys[i]);
 		if (err < 0) {
-			dev_err(dev, "failed to power on PHY#%u: %d\n", i,
-				err);
+			dev_err(dev, "failed to power on PHY#%u: %d\n", i, err);
 			return err;
 		}
 	}
@@ -909,6 +910,7 @@
 
 static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct tegra_pcie_port *port;
 	int err;
@@ -920,7 +922,7 @@
 			err = tegra_pcie_phy_enable(pcie);
 
 		if (err < 0)
-			dev_err(pcie->dev, "failed to power on PHY: %d\n", err);
+			dev_err(dev, "failed to power on PHY: %d\n", err);
 
 		return err;
 	}
@@ -928,7 +930,7 @@
 	list_for_each_entry(port, &pcie->ports, list) {
 		err = tegra_pcie_port_phy_power_on(port);
 		if (err < 0) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"failed to power on PCIe port %u PHY: %d\n",
 				port->index, err);
 			return err;
@@ -946,6 +948,7 @@
 
 static int tegra_pcie_phy_power_off(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct tegra_pcie_port *port;
 	int err;
 
@@ -956,8 +959,7 @@
 			err = tegra_pcie_phy_disable(pcie);
 
 		if (err < 0)
-			dev_err(pcie->dev, "failed to power off PHY: %d\n",
-				err);
+			dev_err(dev, "failed to power off PHY: %d\n", err);
 
 		return err;
 	}
@@ -965,7 +967,7 @@
 	list_for_each_entry(port, &pcie->ports, list) {
 		err = tegra_pcie_port_phy_power_off(port);
 		if (err < 0) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"failed to power off PCIe port %u PHY: %d\n",
 				port->index, err);
 			return err;
@@ -977,6 +979,7 @@
 
 static int tegra_pcie_enable_controller(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct tegra_pcie_port *port;
 	unsigned long value;
@@ -1016,7 +1019,7 @@
 
 	err = tegra_pcie_phy_power_on(pcie);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to power on PHY(s): %d\n", err);
+		dev_err(dev, "failed to power on PHY(s): %d\n", err);
 		return err;
 	}
 
@@ -1049,13 +1052,14 @@
 
 static void tegra_pcie_power_off(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int err;
 
 	/* TODO: disable and unprepare clocks? */
 
 	err = tegra_pcie_phy_power_off(pcie);
 	if (err < 0)
-		dev_err(pcie->dev, "failed to power off PHY(s): %d\n", err);
+		dev_err(dev, "failed to power off PHY(s): %d\n", err);
 
 	reset_control_assert(pcie->pcie_xrst);
 	reset_control_assert(pcie->afi_rst);
@@ -1065,11 +1069,12 @@
 
 	err = regulator_bulk_disable(pcie->num_supplies, pcie->supplies);
 	if (err < 0)
-		dev_warn(pcie->dev, "failed to disable regulators: %d\n", err);
+		dev_warn(dev, "failed to disable regulators: %d\n", err);
 }
 
 static int tegra_pcie_power_on(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	int err;
 
@@ -1082,13 +1087,13 @@
 	/* enable regulators */
 	err = regulator_bulk_enable(pcie->num_supplies, pcie->supplies);
 	if (err < 0)
-		dev_err(pcie->dev, "failed to enable regulators: %d\n", err);
+		dev_err(dev, "failed to enable regulators: %d\n", err);
 
 	err = tegra_powergate_sequence_power_up(TEGRA_POWERGATE_PCIE,
 						pcie->pex_clk,
 						pcie->pex_rst);
 	if (err) {
-		dev_err(pcie->dev, "powerup sequence failed: %d\n", err);
+		dev_err(dev, "powerup sequence failed: %d\n", err);
 		return err;
 	}
 
@@ -1096,22 +1101,21 @@
 
 	err = clk_prepare_enable(pcie->afi_clk);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to enable AFI clock: %d\n", err);
+		dev_err(dev, "failed to enable AFI clock: %d\n", err);
 		return err;
 	}
 
 	if (soc->has_cml_clk) {
 		err = clk_prepare_enable(pcie->cml_clk);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to enable CML clock: %d\n",
-				err);
+			dev_err(dev, "failed to enable CML clock: %d\n", err);
 			return err;
 		}
 	}
 
 	err = clk_prepare_enable(pcie->pll_e);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to enable PLLE clock: %d\n", err);
+		dev_err(dev, "failed to enable PLLE clock: %d\n", err);
 		return err;
 	}
 
@@ -1120,22 +1124,23 @@
 
 static int tegra_pcie_clocks_get(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 
-	pcie->pex_clk = devm_clk_get(pcie->dev, "pex");
+	pcie->pex_clk = devm_clk_get(dev, "pex");
 	if (IS_ERR(pcie->pex_clk))
 		return PTR_ERR(pcie->pex_clk);
 
-	pcie->afi_clk = devm_clk_get(pcie->dev, "afi");
+	pcie->afi_clk = devm_clk_get(dev, "afi");
 	if (IS_ERR(pcie->afi_clk))
 		return PTR_ERR(pcie->afi_clk);
 
-	pcie->pll_e = devm_clk_get(pcie->dev, "pll_e");
+	pcie->pll_e = devm_clk_get(dev, "pll_e");
 	if (IS_ERR(pcie->pll_e))
 		return PTR_ERR(pcie->pll_e);
 
 	if (soc->has_cml_clk) {
-		pcie->cml_clk = devm_clk_get(pcie->dev, "cml");
+		pcie->cml_clk = devm_clk_get(dev, "cml");
 		if (IS_ERR(pcie->cml_clk))
 			return PTR_ERR(pcie->cml_clk);
 	}
@@ -1145,15 +1150,17 @@
 
 static int tegra_pcie_resets_get(struct tegra_pcie *pcie)
 {
-	pcie->pex_rst = devm_reset_control_get(pcie->dev, "pex");
+	struct device *dev = pcie->dev;
+
+	pcie->pex_rst = devm_reset_control_get(dev, "pex");
 	if (IS_ERR(pcie->pex_rst))
 		return PTR_ERR(pcie->pex_rst);
 
-	pcie->afi_rst = devm_reset_control_get(pcie->dev, "afi");
+	pcie->afi_rst = devm_reset_control_get(dev, "afi");
 	if (IS_ERR(pcie->afi_rst))
 		return PTR_ERR(pcie->afi_rst);
 
-	pcie->pcie_xrst = devm_reset_control_get(pcie->dev, "pcie_x");
+	pcie->pcie_xrst = devm_reset_control_get(dev, "pcie_x");
 	if (IS_ERR(pcie->pcie_xrst))
 		return PTR_ERR(pcie->pcie_xrst);
 
@@ -1162,18 +1169,19 @@
 
 static int tegra_pcie_phys_get_legacy(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int err;
 
-	pcie->phy = devm_phy_optional_get(pcie->dev, "pcie");
+	pcie->phy = devm_phy_optional_get(dev, "pcie");
 	if (IS_ERR(pcie->phy)) {
 		err = PTR_ERR(pcie->phy);
-		dev_err(pcie->dev, "failed to get PHY: %d\n", err);
+		dev_err(dev, "failed to get PHY: %d\n", err);
 		return err;
 	}
 
 	err = phy_init(pcie->phy);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to initialize PHY: %d\n", err);
+		dev_err(dev, "failed to initialize PHY: %d\n", err);
 		return err;
 	}
 
@@ -1256,43 +1264,44 @@
 
 static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	struct resource *pads, *afi, *res;
 	int err;
 
 	err = tegra_pcie_clocks_get(pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to get clocks: %d\n", err);
+		dev_err(dev, "failed to get clocks: %d\n", err);
 		return err;
 	}
 
 	err = tegra_pcie_resets_get(pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to get resets: %d\n", err);
+		dev_err(dev, "failed to get resets: %d\n", err);
 		return err;
 	}
 
 	err = tegra_pcie_phys_get(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to get PHYs: %d\n", err);
+		dev_err(dev, "failed to get PHYs: %d\n", err);
 		return err;
 	}
 
 	err = tegra_pcie_power_on(pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to power up: %d\n", err);
+		dev_err(dev, "failed to power up: %d\n", err);
 		return err;
 	}
 
 	pads = platform_get_resource_byname(pdev, IORESOURCE_MEM, "pads");
-	pcie->pads = devm_ioremap_resource(&pdev->dev, pads);
+	pcie->pads = devm_ioremap_resource(dev, pads);
 	if (IS_ERR(pcie->pads)) {
 		err = PTR_ERR(pcie->pads);
 		goto poweroff;
 	}
 
 	afi = platform_get_resource_byname(pdev, IORESOURCE_MEM, "afi");
-	pcie->afi = devm_ioremap_resource(&pdev->dev, afi);
+	pcie->afi = devm_ioremap_resource(dev, afi);
 	if (IS_ERR(pcie->afi)) {
 		err = PTR_ERR(pcie->afi);
 		goto poweroff;
@@ -1305,7 +1314,7 @@
 		goto poweroff;
 	}
 
-	pcie->cs = devm_request_mem_region(pcie->dev, res->start,
+	pcie->cs = devm_request_mem_region(dev, res->start,
 					   resource_size(res), res->name);
 	if (!pcie->cs) {
 		err = -EADDRNOTAVAIL;
@@ -1315,7 +1324,7 @@
 	/* request interrupt */
 	err = platform_get_irq_byname(pdev, "intr");
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ: %d\n", err);
+		dev_err(dev, "failed to get IRQ: %d\n", err);
 		goto poweroff;
 	}
 
@@ -1323,7 +1332,7 @@
 
 	err = request_irq(pcie->irq, tegra_pcie_isr, IRQF_SHARED, "PCIE", pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to register IRQ: %d\n", err);
+		dev_err(dev, "failed to register IRQ: %d\n", err);
 		goto poweroff;
 	}
 
@@ -1336,6 +1345,7 @@
 
 static int tegra_pcie_put_resources(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int err;
 
 	if (pcie->irq > 0)
@@ -1345,7 +1355,7 @@
 
 	err = phy_exit(pcie->phy);
 	if (err < 0)
-		dev_err(pcie->dev, "failed to teardown PHY: %d\n", err);
+		dev_err(dev, "failed to teardown PHY: %d\n", err);
 
 	return 0;
 }
@@ -1384,6 +1394,7 @@
 static irqreturn_t tegra_pcie_msi_irq(int irq, void *data)
 {
 	struct tegra_pcie *pcie = data;
+	struct device *dev = pcie->dev;
 	struct tegra_msi *msi = &pcie->msi;
 	unsigned int i, processed = 0;
 
@@ -1403,13 +1414,13 @@
 				if (test_bit(index, msi->used))
 					generic_handle_irq(irq);
 				else
-					dev_info(pcie->dev, "unhandled MSI\n");
+					dev_info(dev, "unhandled MSI\n");
 			} else {
 				/*
 				 * that's weird who triggered this?
 				 * just clear it
 				 */
-				dev_info(pcie->dev, "unexpected MSI\n");
+				dev_info(dev, "unexpected MSI\n");
 			}
 
 			/* see if there's any more pending in this vector */
@@ -1488,7 +1499,8 @@
 
 static int tegra_pcie_enable_msi(struct tegra_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct tegra_msi *msi = &pcie->msi;
 	unsigned long base;
@@ -1497,20 +1509,20 @@
 
 	mutex_init(&msi->lock);
 
-	msi->chip.dev = pcie->dev;
+	msi->chip.dev = dev;
 	msi->chip.setup_irq = tegra_msi_setup_irq;
 	msi->chip.teardown_irq = tegra_msi_teardown_irq;
 
-	msi->domain = irq_domain_add_linear(pcie->dev->of_node, INT_PCI_MSI_NR,
+	msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR,
 					    &msi_domain_ops, &msi->chip);
 	if (!msi->domain) {
-		dev_err(&pdev->dev, "failed to create IRQ domain\n");
+		dev_err(dev, "failed to create IRQ domain\n");
 		return -ENOMEM;
 	}
 
 	err = platform_get_irq_byname(pdev, "msi");
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ: %d\n", err);
+		dev_err(dev, "failed to get IRQ: %d\n", err);
 		goto err;
 	}
 
@@ -1519,7 +1531,7 @@
 	err = request_irq(msi->irq, tegra_pcie_msi_irq, IRQF_NO_THREAD,
 			  tegra_msi_irq_chip.name, pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request IRQ: %d\n", err);
+		dev_err(dev, "failed to request IRQ: %d\n", err);
 		goto err;
 	}
 
@@ -1594,46 +1606,47 @@
 static int tegra_pcie_get_xbar_config(struct tegra_pcie *pcie, u32 lanes,
 				      u32 *xbar)
 {
-	struct device_node *np = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node;
 
 	if (of_device_is_compatible(np, "nvidia,tegra124-pcie")) {
 		switch (lanes) {
 		case 0x0000104:
-			dev_info(pcie->dev, "4x1, 1x1 configuration\n");
+			dev_info(dev, "4x1, 1x1 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_X4_X1;
 			return 0;
 
 		case 0x0000102:
-			dev_info(pcie->dev, "2x1, 1x1 configuration\n");
+			dev_info(dev, "2x1, 1x1 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_X2_X1;
 			return 0;
 		}
 	} else if (of_device_is_compatible(np, "nvidia,tegra30-pcie")) {
 		switch (lanes) {
 		case 0x00000204:
-			dev_info(pcie->dev, "4x1, 2x1 configuration\n");
+			dev_info(dev, "4x1, 2x1 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_420;
 			return 0;
 
 		case 0x00020202:
-			dev_info(pcie->dev, "2x3 configuration\n");
+			dev_info(dev, "2x3 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_222;
 			return 0;
 
 		case 0x00010104:
-			dev_info(pcie->dev, "4x1, 1x2 configuration\n");
+			dev_info(dev, "4x1, 1x2 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_411;
 			return 0;
 		}
 	} else if (of_device_is_compatible(np, "nvidia,tegra20-pcie")) {
 		switch (lanes) {
 		case 0x00000004:
-			dev_info(pcie->dev, "single-mode configuration\n");
+			dev_info(dev, "single-mode configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_SINGLE;
 			return 0;
 
 		case 0x00000202:
-			dev_info(pcie->dev, "dual-mode configuration\n");
+			dev_info(dev, "dual-mode configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_DUAL;
 			return 0;
 		}
@@ -1673,7 +1686,8 @@
  */
 static int tegra_pcie_get_legacy_regulators(struct tegra_pcie *pcie)
 {
-	struct device_node *np = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node;
 
 	if (of_device_is_compatible(np, "nvidia,tegra30-pcie"))
 		pcie->num_supplies = 3;
@@ -1681,12 +1695,12 @@
 		pcie->num_supplies = 2;
 
 	if (pcie->num_supplies == 0) {
-		dev_err(pcie->dev, "device %s not supported in legacy mode\n",
+		dev_err(dev, "device %s not supported in legacy mode\n",
 			np->full_name);
 		return -ENODEV;
 	}
 
-	pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+	pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 				      sizeof(*pcie->supplies),
 				      GFP_KERNEL);
 	if (!pcie->supplies)
@@ -1698,8 +1712,7 @@
 	if (pcie->num_supplies > 2)
 		pcie->supplies[2].supply = "avdd";
 
-	return devm_regulator_bulk_get(pcie->dev, pcie->num_supplies,
-				       pcie->supplies);
+	return devm_regulator_bulk_get(dev, pcie->num_supplies, pcie->supplies);
 }
 
 /*
@@ -1713,13 +1726,14 @@
  */
 static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 {
-	struct device_node *np = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node;
 	unsigned int i = 0;
 
 	if (of_device_is_compatible(np, "nvidia,tegra124-pcie")) {
 		pcie->num_supplies = 7;
 
-		pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+		pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 					      sizeof(*pcie->supplies),
 					      GFP_KERNEL);
 		if (!pcie->supplies)
@@ -1746,7 +1760,7 @@
 		pcie->num_supplies = 4 + (need_pexa ? 2 : 0) +
 					 (need_pexb ? 2 : 0);
 
-		pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+		pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 					      sizeof(*pcie->supplies),
 					      GFP_KERNEL);
 		if (!pcie->supplies)
@@ -1769,7 +1783,7 @@
 	} else if (of_device_is_compatible(np, "nvidia,tegra20-pcie")) {
 		pcie->num_supplies = 5;
 
-		pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+		pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 					      sizeof(*pcie->supplies),
 					      GFP_KERNEL);
 		if (!pcie->supplies)
@@ -1782,9 +1796,9 @@
 		pcie->supplies[4].supply = "vddio-pex-clk";
 	}
 
-	if (of_regulator_bulk_available(pcie->dev->of_node, pcie->supplies,
+	if (of_regulator_bulk_available(dev->of_node, pcie->supplies,
 					pcie->num_supplies))
-		return devm_regulator_bulk_get(pcie->dev, pcie->num_supplies,
+		return devm_regulator_bulk_get(dev, pcie->num_supplies,
 					       pcie->supplies);
 
 	/*
@@ -1792,9 +1806,9 @@
 	 * that the device tree complies with an older version of the device
 	 * tree binding.
 	 */
-	dev_info(pcie->dev, "using legacy DT binding for power supplies\n");
+	dev_info(dev, "using legacy DT binding for power supplies\n");
 
-	devm_kfree(pcie->dev, pcie->supplies);
+	devm_kfree(dev, pcie->supplies);
 	pcie->num_supplies = 0;
 
 	return tegra_pcie_get_legacy_regulators(pcie);
@@ -1802,7 +1816,8 @@
 
 static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 {
-	struct device_node *np = pcie->dev->of_node, *port;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node, *port;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct of_pci_range_parser parser;
 	struct of_pci_range range;
@@ -1812,7 +1827,7 @@
 	int err;
 
 	if (of_pci_range_parser_init(&parser, np)) {
-		dev_err(pcie->dev, "missing \"ranges\" property\n");
+		dev_err(dev, "missing \"ranges\" property\n");
 		return -EINVAL;
 	}
 
@@ -1867,8 +1882,7 @@
 
 	err = of_pci_parse_bus_range(np, &pcie->busn);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to parse ranges property: %d\n",
-			err);
+		dev_err(dev, "failed to parse ranges property: %d\n", err);
 		pcie->busn.name = np->name;
 		pcie->busn.start = 0;
 		pcie->busn.end = 0xff;
@@ -1883,15 +1897,14 @@
 
 		err = of_pci_get_devfn(port);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to parse address: %d\n",
-				err);
+			dev_err(dev, "failed to parse address: %d\n", err);
 			return err;
 		}
 
 		index = PCI_SLOT(err);
 
 		if (index < 1 || index > soc->num_ports) {
-			dev_err(pcie->dev, "invalid port number: %d\n", index);
+			dev_err(dev, "invalid port number: %d\n", index);
 			return -EINVAL;
 		}
 
@@ -1899,13 +1912,13 @@
 
 		err = of_property_read_u32(port, "nvidia,num-lanes", &value);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to parse # of lanes: %d\n",
+			dev_err(dev, "failed to parse # of lanes: %d\n",
 				err);
 			return err;
 		}
 
 		if (value > 16) {
-			dev_err(pcie->dev, "invalid # of lanes: %u\n", value);
+			dev_err(dev, "invalid # of lanes: %u\n", value);
 			return -EINVAL;
 		}
 
@@ -1919,14 +1932,13 @@
 		mask |= ((1 << value) - 1) << lane;
 		lane += value;
 
-		rp = devm_kzalloc(pcie->dev, sizeof(*rp), GFP_KERNEL);
+		rp = devm_kzalloc(dev, sizeof(*rp), GFP_KERNEL);
 		if (!rp)
 			return -ENOMEM;
 
 		err = of_address_to_resource(port, 0, &rp->regs);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to parse address: %d\n",
-				err);
+			dev_err(dev, "failed to parse address: %d\n", err);
 			return err;
 		}
 
@@ -1936,7 +1948,7 @@
 		rp->pcie = pcie;
 		rp->np = port;
 
-		rp->base = devm_ioremap_resource(pcie->dev, &rp->regs);
+		rp->base = devm_ioremap_resource(dev, &rp->regs);
 		if (IS_ERR(rp->base))
 			return PTR_ERR(rp->base);
 
@@ -1945,7 +1957,7 @@
 
 	err = tegra_pcie_get_xbar_config(pcie, lanes, &pcie->xbar_config);
 	if (err < 0) {
-		dev_err(pcie->dev, "invalid lane configuration\n");
+		dev_err(dev, "invalid lane configuration\n");
 		return err;
 	}
 
@@ -1964,6 +1976,7 @@
 #define TEGRA_PCIE_LINKUP_TIMEOUT	200	/* up to 1.2 seconds */
 static bool tegra_pcie_port_check_link(struct tegra_pcie_port *port)
 {
+	struct device *dev = port->pcie->dev;
 	unsigned int retries = 3;
 	unsigned long value;
 
@@ -1986,8 +1999,7 @@
 		} while (--timeout);
 
 		if (!timeout) {
-			dev_err(port->pcie->dev, "link %u down, retrying\n",
-				port->index);
+			dev_err(dev, "link %u down, retrying\n", port->index);
 			goto retry;
 		}
 
@@ -2011,11 +2023,12 @@
 
 static int tegra_pcie_enable(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct tegra_pcie_port *port, *tmp;
 	struct hw_pci hw;
 
 	list_for_each_entry_safe(port, tmp, &pcie->ports, list) {
-		dev_info(pcie->dev, "probing port %u, using %u lanes\n",
+		dev_info(dev, "probing port %u, using %u lanes\n",
 			 port->index, port->lanes);
 
 		tegra_pcie_port_enable(port);
@@ -2023,7 +2036,7 @@
 		if (tegra_pcie_port_check_link(port))
 			continue;
 
-		dev_info(pcie->dev, "link %u down, ignoring\n", port->index);
+		dev_info(dev, "link %u down, ignoring\n", port->index);
 
 		tegra_pcie_port_disable(port);
 		tegra_pcie_port_free(port);
@@ -2041,8 +2054,7 @@
 	hw.map_irq = tegra_pcie_map_irq;
 	hw.ops = &tegra_pcie_ops;
 
-	pci_common_init_dev(pcie->dev, &hw);
-
+	pci_common_init_dev(dev, &hw);
 	return 0;
 }
 
@@ -2204,17 +2216,18 @@
 
 static int tegra_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct tegra_pcie *pcie;
 	int err;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->soc = of_device_get_match_data(&pdev->dev);
+	pcie->soc = of_device_get_match_data(dev);
 	INIT_LIST_HEAD(&pcie->buses);
 	INIT_LIST_HEAD(&pcie->ports);
-	pcie->dev = &pdev->dev;
+	pcie->dev = dev;
 
 	err = tegra_pcie_parse_dt(pcie);
 	if (err < 0)
@@ -2222,7 +2235,7 @@
 
 	err = tegra_pcie_get_resources(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request resources: %d\n", err);
+		dev_err(dev, "failed to request resources: %d\n", err);
 		return err;
 	}
 
@@ -2236,27 +2249,23 @@
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		err = tegra_pcie_enable_msi(pcie);
 		if (err < 0) {
-			dev_err(&pdev->dev,
-				"failed to enable MSI support: %d\n",
-				err);
+			dev_err(dev, "failed to enable MSI support: %d\n", err);
 			goto put_resources;
 		}
 	}
 
 	err = tegra_pcie_enable(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to enable PCIe ports: %d\n", err);
+		dev_err(dev, "failed to enable PCIe ports: %d\n", err);
 		goto disable_msi;
 	}
 
 	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
 		err = tegra_pcie_debugfs_init(pcie);
 		if (err < 0)
-			dev_err(&pdev->dev, "failed to setup debugfs: %d\n",
-				err);
+			dev_err(dev, "failed to setup debugfs: %d\n", err);
 	}
 
-	platform_set_drvdata(pdev, pcie);
 	return 0;
 
 disable_msi:
diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c
index a81273c..1de23d7 100644
--- a/drivers/pci/host/pci-xgene.c
+++ b/drivers/pci/host/pci-xgene.c
@@ -76,6 +76,16 @@
 	u32			version;
 };
 
+static u32 xgene_pcie_readl(struct xgene_pcie_port *port, u32 reg)
+{
+	return readl(port->csr_base + reg);
+}
+
+static void xgene_pcie_writel(struct xgene_pcie_port *port, u32 reg, u32 val)
+{
+	writel(val, port->csr_base + reg);
+}
+
 static inline u32 pcie_bar_low_val(u32 addr, u32 flags)
 {
 	return (addr & PCI_BASE_ADDRESS_MEM_MASK) | flags;
@@ -112,9 +122,9 @@
 	if (!pci_is_root_bus(bus))
 		rtdid_val = (b << 8) | (d << 3) | f;
 
-	writel(rtdid_val, port->csr_base + RTDID);
+	xgene_pcie_writel(port, RTDID, rtdid_val);
 	/* read the register back to ensure flush */
-	readl(port->csr_base + RTDID);
+	xgene_pcie_readl(port, RTDID);
 }
 
 /*
@@ -179,28 +189,28 @@
 	.write = pci_generic_config_write32,
 };
 
-static u64 xgene_pcie_set_ib_mask(void __iomem *csr_base, u32 addr,
+static u64 xgene_pcie_set_ib_mask(struct xgene_pcie_port *port, u32 addr,
 				  u32 flags, u64 size)
 {
 	u64 mask = (~(size - 1) & PCI_BASE_ADDRESS_MEM_MASK) | flags;
 	u32 val32 = 0;
 	u32 val;
 
-	val32 = readl(csr_base + addr);
+	val32 = xgene_pcie_readl(port, addr);
 	val = (val32 & 0x0000ffff) | (lower_32_bits(mask) << 16);
-	writel(val, csr_base + addr);
+	xgene_pcie_writel(port, addr, val);
 
-	val32 = readl(csr_base + addr + 0x04);
+	val32 = xgene_pcie_readl(port, addr + 0x04);
 	val = (val32 & 0xffff0000) | (lower_32_bits(mask) >> 16);
-	writel(val, csr_base + addr + 0x04);
+	xgene_pcie_writel(port, addr + 0x04, val);
 
-	val32 = readl(csr_base + addr + 0x04);
+	val32 = xgene_pcie_readl(port, addr + 0x04);
 	val = (val32 & 0x0000ffff) | (upper_32_bits(mask) << 16);
-	writel(val, csr_base + addr + 0x04);
+	xgene_pcie_writel(port, addr + 0x04, val);
 
-	val32 = readl(csr_base + addr + 0x08);
+	val32 = xgene_pcie_readl(port, addr + 0x08);
 	val = (val32 & 0xffff0000) | (upper_32_bits(mask) >> 16);
-	writel(val, csr_base + addr + 0x08);
+	xgene_pcie_writel(port, addr + 0x08, val);
 
 	return mask;
 }
@@ -208,32 +218,32 @@
 static void xgene_pcie_linkup(struct xgene_pcie_port *port,
 				   u32 *lanes, u32 *speed)
 {
-	void __iomem *csr_base = port->csr_base;
 	u32 val32;
 
 	port->link_up = false;
-	val32 = readl(csr_base + PCIECORE_CTLANDSTATUS);
+	val32 = xgene_pcie_readl(port, PCIECORE_CTLANDSTATUS);
 	if (val32 & LINK_UP_MASK) {
 		port->link_up = true;
 		*speed = PIPE_PHY_RATE_RD(val32);
-		val32 = readl(csr_base + BRIDGE_STATUS_0);
+		val32 = xgene_pcie_readl(port, BRIDGE_STATUS_0);
 		*lanes = val32 >> 26;
 	}
 }
 
 static int xgene_pcie_init_port(struct xgene_pcie_port *port)
 {
+	struct device *dev = port->dev;
 	int rc;
 
-	port->clk = clk_get(port->dev, NULL);
+	port->clk = clk_get(dev, NULL);
 	if (IS_ERR(port->clk)) {
-		dev_err(port->dev, "clock not available\n");
+		dev_err(dev, "clock not available\n");
 		return -ENODEV;
 	}
 
 	rc = clk_prepare_enable(port->clk);
 	if (rc) {
-		dev_err(port->dev, "clock enable failed\n");
+		dev_err(dev, "clock enable failed\n");
 		return rc;
 	}
 
@@ -243,15 +253,16 @@
 static int xgene_pcie_map_reg(struct xgene_pcie_port *port,
 			      struct platform_device *pdev)
 {
+	struct device *dev = port->dev;
 	struct resource *res;
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "csr");
-	port->csr_base = devm_ioremap_resource(port->dev, res);
+	port->csr_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(port->csr_base))
 		return PTR_ERR(port->csr_base);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
-	port->cfg_base = devm_ioremap_resource(port->dev, res);
+	port->cfg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(port->cfg_base))
 		return PTR_ERR(port->cfg_base);
 	port->cfg_addr = res->start;
@@ -263,7 +274,7 @@
 				    struct resource *res, u32 offset,
 				    u64 cpu_addr, u64 pci_addr)
 {
-	void __iomem *base = port->csr_base + offset;
+	struct device *dev = port->dev;
 	resource_size_t size = resource_size(res);
 	u64 restype = resource_type(res);
 	u64 mask = 0;
@@ -280,22 +291,24 @@
 	if (size >= min_size)
 		mask = ~(size - 1) | flag;
 	else
-		dev_warn(port->dev, "res size 0x%llx less than minimum 0x%x\n",
+		dev_warn(dev, "res size 0x%llx less than minimum 0x%x\n",
 			 (u64)size, min_size);
 
-	writel(lower_32_bits(cpu_addr), base);
-	writel(upper_32_bits(cpu_addr), base + 0x04);
-	writel(lower_32_bits(mask), base + 0x08);
-	writel(upper_32_bits(mask), base + 0x0c);
-	writel(lower_32_bits(pci_addr), base + 0x10);
-	writel(upper_32_bits(pci_addr), base + 0x14);
+	xgene_pcie_writel(port, offset, lower_32_bits(cpu_addr));
+	xgene_pcie_writel(port, offset + 0x04, upper_32_bits(cpu_addr));
+	xgene_pcie_writel(port, offset + 0x08, lower_32_bits(mask));
+	xgene_pcie_writel(port, offset + 0x0c, upper_32_bits(mask));
+	xgene_pcie_writel(port, offset + 0x10, lower_32_bits(pci_addr));
+	xgene_pcie_writel(port, offset + 0x14, upper_32_bits(pci_addr));
 }
 
-static void xgene_pcie_setup_cfg_reg(void __iomem *csr_base, u64 addr)
+static void xgene_pcie_setup_cfg_reg(struct xgene_pcie_port *port)
 {
-	writel(lower_32_bits(addr), csr_base + CFGBARL);
-	writel(upper_32_bits(addr), csr_base + CFGBARH);
-	writel(EN_REG, csr_base + CFGCTL);
+	u64 addr = port->cfg_addr;
+
+	xgene_pcie_writel(port, CFGBARL, lower_32_bits(addr));
+	xgene_pcie_writel(port, CFGBARH, upper_32_bits(addr));
+	xgene_pcie_writel(port, CFGCTL, EN_REG);
 }
 
 static int xgene_pcie_map_ranges(struct xgene_pcie_port *port,
@@ -310,7 +323,7 @@
 		struct resource *res = window->res;
 		u64 restype = resource_type(res);
 
-		dev_dbg(port->dev, "%pR\n", res);
+		dev_dbg(dev, "%pR\n", res);
 
 		switch (restype) {
 		case IORESOURCE_IO:
@@ -339,17 +352,18 @@
 			return -EINVAL;
 		}
 	}
-	xgene_pcie_setup_cfg_reg(port->csr_base, port->cfg_addr);
-
+	xgene_pcie_setup_cfg_reg(port);
 	return 0;
 }
 
-static void xgene_pcie_setup_pims(void *addr, u64 pim, u64 size)
+static void xgene_pcie_setup_pims(struct xgene_pcie_port *port, u32 pim_reg,
+				  u64 pim, u64 size)
 {
-	writel(lower_32_bits(pim), addr);
-	writel(upper_32_bits(pim) | EN_COHERENCY, addr + 0x04);
-	writel(lower_32_bits(size), addr + 0x10);
-	writel(upper_32_bits(size), addr + 0x14);
+	xgene_pcie_writel(port, pim_reg, lower_32_bits(pim));
+	xgene_pcie_writel(port, pim_reg + 0x04,
+			  upper_32_bits(pim) | EN_COHERENCY);
+	xgene_pcie_writel(port, pim_reg + 0x10, lower_32_bits(size));
+	xgene_pcie_writel(port, pim_reg + 0x14, upper_32_bits(size));
 }
 
 /*
@@ -379,10 +393,10 @@
 static void xgene_pcie_setup_ib_reg(struct xgene_pcie_port *port,
 				    struct of_pci_range *range, u8 *ib_reg_mask)
 {
-	void __iomem *csr_base = port->csr_base;
 	void __iomem *cfg_base = port->cfg_base;
+	struct device *dev = port->dev;
 	void *bar_addr;
-	void *pim_addr;
+	u32 pim_reg;
 	u64 cpu_addr = range->cpu_addr;
 	u64 pci_addr = range->pci_addr;
 	u64 size = range->size;
@@ -393,7 +407,7 @@
 
 	region = xgene_pcie_select_ib_reg(ib_reg_mask, range->size);
 	if (region < 0) {
-		dev_warn(port->dev, "invalid pcie dma-range config\n");
+		dev_warn(dev, "invalid pcie dma-range config\n");
 		return;
 	}
 
@@ -403,29 +417,27 @@
 	bar_low = pcie_bar_low_val((u32)cpu_addr, flags);
 	switch (region) {
 	case 0:
-		xgene_pcie_set_ib_mask(csr_base, BRIDGE_CFG_4, flags, size);
+		xgene_pcie_set_ib_mask(port, BRIDGE_CFG_4, flags, size);
 		bar_addr = cfg_base + PCI_BASE_ADDRESS_0;
 		writel(bar_low, bar_addr);
 		writel(upper_32_bits(cpu_addr), bar_addr + 0x4);
-		pim_addr = csr_base + PIM1_1L;
+		pim_reg = PIM1_1L;
 		break;
 	case 1:
-		bar_addr = csr_base + IBAR2;
-		writel(bar_low, bar_addr);
-		writel(lower_32_bits(mask), csr_base + IR2MSK);
-		pim_addr = csr_base + PIM2_1L;
+		xgene_pcie_writel(port, IBAR2, bar_low);
+		xgene_pcie_writel(port, IR2MSK, lower_32_bits(mask));
+		pim_reg = PIM2_1L;
 		break;
 	case 2:
-		bar_addr = csr_base + IBAR3L;
-		writel(bar_low, bar_addr);
-		writel(upper_32_bits(cpu_addr), bar_addr + 0x4);
-		writel(lower_32_bits(mask), csr_base + IR3MSKL);
-		writel(upper_32_bits(mask), csr_base + IR3MSKL + 0x4);
-		pim_addr = csr_base + PIM3_1L;
+		xgene_pcie_writel(port, IBAR3L, bar_low);
+		xgene_pcie_writel(port, IBAR3L + 0x4, upper_32_bits(cpu_addr));
+		xgene_pcie_writel(port, IR3MSKL, lower_32_bits(mask));
+		xgene_pcie_writel(port, IR3MSKL + 0x4, upper_32_bits(mask));
+		pim_reg = PIM3_1L;
 		break;
 	}
 
-	xgene_pcie_setup_pims(pim_addr, pci_addr, ~(size - 1));
+	xgene_pcie_setup_pims(port, pim_reg, pci_addr, ~(size - 1));
 }
 
 static int pci_dma_range_parser_init(struct of_pci_range_parser *parser,
@@ -463,7 +475,7 @@
 	for_each_of_pci_range(&parser, &range) {
 		u64 end = range.cpu_addr + range.size - 1;
 
-		dev_dbg(port->dev, "0x%08x 0x%016llx..0x%016llx -> 0x%016llx\n",
+		dev_dbg(dev, "0x%08x 0x%016llx..0x%016llx -> 0x%016llx\n",
 			range.flags, range.cpu_addr, end, range.pci_addr);
 		xgene_pcie_setup_ib_reg(port, &range, &ib_reg_mask);
 	}
@@ -476,13 +488,14 @@
 	int i;
 
 	for (i = PIM1_1L; i <= CFGCTL; i += 4)
-		writel(0x0, port->csr_base + i);
+		xgene_pcie_writel(port, i, 0);
 }
 
 static int xgene_pcie_setup(struct xgene_pcie_port *port,
 			    struct list_head *res,
 			    resource_size_t io_base)
 {
+	struct device *dev = port->dev;
 	u32 val, lanes = 0, speed = 0;
 	int ret;
 
@@ -490,7 +503,7 @@
 
 	/* setup the vendor and device IDs correctly */
 	val = (XGENE_PCIE_DEVICEID << 16) | XGENE_PCIE_VENDORID;
-	writel(val, port->csr_base + BRIDGE_CFG_0);
+	xgene_pcie_writel(port, BRIDGE_CFG_0, val);
 
 	ret = xgene_pcie_map_ranges(port, res, io_base);
 	if (ret)
@@ -502,27 +515,28 @@
 
 	xgene_pcie_linkup(port, &lanes, &speed);
 	if (!port->link_up)
-		dev_info(port->dev, "(rc) link down\n");
+		dev_info(dev, "(rc) link down\n");
 	else
-		dev_info(port->dev, "(rc) x%d gen-%d link up\n",
-				lanes, speed + 1);
+		dev_info(dev, "(rc) x%d gen-%d link up\n", lanes, speed + 1);
 	return 0;
 }
 
 static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 {
-	struct device_node *dn = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
+	struct device_node *dn = dev->of_node;
 	struct xgene_pcie_port *port;
 	resource_size_t iobase = 0;
 	struct pci_bus *bus;
 	int ret;
 	LIST_HEAD(res);
 
-	port = devm_kzalloc(&pdev->dev, sizeof(*port), GFP_KERNEL);
+	port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
 	if (!port)
 		return -ENOMEM;
-	port->node = of_node_get(pdev->dev.of_node);
-	port->dev = &pdev->dev;
+
+	port->node = of_node_get(dn);
+	port->dev = dev;
 
 	port->version = XGENE_PCIE_IP_VER_UNKN;
 	if (of_device_is_compatible(port->node, "apm,xgene-pcie"))
@@ -540,7 +554,7 @@
 	if (ret)
 		return ret;
 
-	ret = devm_request_pci_bus_resources(&pdev->dev, &res);
+	ret = devm_request_pci_bus_resources(dev, &res);
 	if (ret)
 		goto error;
 
@@ -548,8 +562,7 @@
 	if (ret)
 		goto error;
 
-	bus = pci_create_root_bus(&pdev->dev, 0,
-					&xgene_pcie_ops, port, &res);
+	bus = pci_create_root_bus(dev, 0, &xgene_pcie_ops, port, &res);
 	if (!bus) {
 		ret = -ENOMEM;
 		goto error;
@@ -558,8 +571,6 @@
 	pci_scan_child_bus(bus);
 	pci_assign_unassigned_bus_resources(bus);
 	pci_bus_add_devices(bus);
-
-	platform_set_drvdata(pdev, port);
 	return 0;
 
 error:
diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c
index c24e965..b0ac4df 100644
--- a/drivers/pci/host/pcie-altera.c
+++ b/drivers/pci/host/pcie-altera.c
@@ -55,15 +55,19 @@
 #define TLP_PAYLOAD_SIZE		0x01
 #define TLP_READ_TAG			0x1d
 #define TLP_WRITE_TAG			0x10
-#define TLP_CFG_DW0(fmttype)		(((fmttype) << 24) | TLP_PAYLOAD_SIZE)
-#define TLP_CFG_DW1(reqid, tag, be)	(((reqid) << 16) | (tag << 8) | (be))
+#define RP_DEVFN			0
+#define TLP_REQ_ID(bus, devfn)		(((bus) << 8) | (devfn))
+#define TLP_CFG_DW0(pcie, bus)						\
+    ((((bus == pcie->root_bus_nr) ? TLP_FMTTYPE_CFGRD0			\
+				    : TLP_FMTTYPE_CFGRD1) << 24) |	\
+     TLP_PAYLOAD_SIZE)
+#define TLP_CFG_DW1(pcie, tag, be)	\
+    (((TLP_REQ_ID(pcie->root_bus_nr,  RP_DEVFN)) << 16) | (tag << 8) | (be))
 #define TLP_CFG_DW2(bus, devfn, offset)	\
 				(((bus) << 24) | ((devfn) << 16) | (offset))
-#define TLP_REQ_ID(bus, devfn)		(((bus) << 8) | (devfn))
 #define TLP_COMP_STATUS(s)		(((s) >> 12) & 7)
 #define TLP_HDR_SIZE			3
 #define TLP_LOOP			500
-#define RP_DEVFN			0
 
 #define LINK_UP_TIMEOUT			HZ
 #define LINK_RETRAIN_TIMEOUT		HZ
@@ -74,7 +78,7 @@
 
 struct altera_pcie {
 	struct platform_device	*pdev;
-	void __iomem		*cra_base;
+	void __iomem		*cra_base;	/* DT Cra */
 	int			irq;
 	u8			root_bus_nr;
 	struct irq_domain	*irq_domain;
@@ -131,7 +135,7 @@
 	cra_writel(pcie, tlp_rp_regdata->ctrl, RP_TX_CNTRL);
 }
 
-static bool altera_pcie_valid_config(struct altera_pcie *pcie,
+static bool altera_pcie_valid_device(struct altera_pcie *pcie,
 				     struct pci_bus *bus, int dev)
 {
 	/* If there is no link, then there is no device */
@@ -218,13 +222,8 @@
 {
 	u32 headers[TLP_HDR_SIZE];
 
-	if (bus == pcie->root_bus_nr)
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGRD0);
-	else
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGRD1);
-
-	headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, RP_DEVFN),
-					TLP_READ_TAG, byte_en);
+	headers[0] = TLP_CFG_DW0(pcie, bus);
+	headers[1] = TLP_CFG_DW1(pcie, TLP_READ_TAG, byte_en);
 	headers[2] = TLP_CFG_DW2(bus, devfn, where);
 
 	tlp_write_packet(pcie, headers, 0, false);
@@ -238,13 +237,13 @@
 	u32 headers[TLP_HDR_SIZE];
 	int ret;
 
-	if (bus == pcie->root_bus_nr)
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGWR0);
-	else
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGWR1);
-
-	headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, RP_DEVFN),
-					TLP_WRITE_TAG, byte_en);
+	/* TLP_CFG_DW0() only encodes CfgRd; writes need a CfgWr Fmt/Type */
+	if (bus == pcie->root_bus_nr)
+		headers[0] = (TLP_FMTTYPE_CFGWR0 << 24) | TLP_PAYLOAD_SIZE;
+	else
+		headers[0] = (TLP_FMTTYPE_CFGWR1 << 24) | TLP_PAYLOAD_SIZE;
+
+	headers[1] = TLP_CFG_DW1(pcie, TLP_WRITE_TAG, byte_en);
 	headers[2] = TLP_CFG_DW2(bus, devfn, where);
 
 	/* check alignment to Qword */
@@ -342,7 +336,7 @@
 	if (altera_pcie_hide_rc_bar(bus, devfn, where))
 		return PCIBIOS_BAD_REGISTER_NUMBER;
 
-	if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn))) {
+	if (!altera_pcie_valid_device(pcie, bus, PCI_SLOT(devfn))) {
 		*value = 0xffffffff;
 		return PCIBIOS_DEVICE_NOT_FOUND;
 	}
@@ -359,7 +353,7 @@
 	if (altera_pcie_hide_rc_bar(bus, devfn, where))
 		return PCIBIOS_BAD_REGISTER_NUMBER;
 
-	if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn)))
+	if (!altera_pcie_valid_device(pcie, bus, PCI_SLOT(devfn)))
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	return _altera_pcie_cfg_write(pcie, bus->number, devfn, where, size,
@@ -394,6 +388,7 @@
 
 static void altera_wait_link_retrain(struct altera_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	u16 reg16;
 	unsigned long start_jiffies;
 
@@ -406,7 +401,7 @@
 			break;
 
 		if (time_after(jiffies, start_jiffies + LINK_RETRAIN_TIMEOUT)) {
-			dev_err(&pcie->pdev->dev, "link retrain timeout\n");
+			dev_err(dev, "link retrain timeout\n");
 			break;
 		}
 		udelay(100);
@@ -419,7 +414,7 @@
 			break;
 
 		if (time_after(jiffies, start_jiffies + LINK_UP_TIMEOUT)) {
-			dev_err(&pcie->pdev->dev, "link up timeout\n");
+			dev_err(dev, "link up timeout\n");
 			break;
 		}
 		udelay(100);
@@ -460,7 +455,6 @@
 {
 	irq_set_chip_and_handler(irq, &dummy_irq_chip, handle_simple_irq);
 	irq_set_chip_data(irq, domain->host_data);
-
 	return 0;
 }
 
@@ -472,12 +466,14 @@
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct altera_pcie *pcie;
+	struct device *dev;
 	unsigned long status;
 	u32 bit;
 	u32 virq;
 
 	chained_irq_enter(chip, desc);
 	pcie = irq_desc_get_handler_data(desc);
+	dev = &pcie->pdev->dev;
 
 	while ((status = cra_readl(pcie, P2A_INT_STATUS)
 		& P2A_INT_STS_ALL) != 0) {
@@ -489,8 +485,7 @@
 			if (virq)
 				generic_handle_irq(virq);
 			else
-				dev_err(&pcie->pdev->dev,
-					"unexpected IRQ, INT%d\n", bit);
+				dev_err(dev, "unexpected IRQ, INT%d\n", bit);
 		}
 	}
 
@@ -549,30 +544,25 @@
 
 static int altera_pcie_parse_dt(struct altera_pcie *pcie)
 {
-	struct resource *cra;
+	struct device *dev = &pcie->pdev->dev;
 	struct platform_device *pdev = pcie->pdev;
+	struct resource *cra;
 
 	cra = platform_get_resource_byname(pdev, IORESOURCE_MEM, "Cra");
-	if (!cra) {
-		dev_err(&pdev->dev, "no Cra memory resource defined\n");
-		return -ENODEV;
-	}
-
-	pcie->cra_base = devm_ioremap_resource(&pdev->dev, cra);
+	pcie->cra_base = devm_ioremap_resource(dev, cra);
 	if (IS_ERR(pcie->cra_base)) {
-		dev_err(&pdev->dev, "failed to map cra memory\n");
+		dev_err(dev, "failed to map cra memory\n");
 		return PTR_ERR(pcie->cra_base);
 	}
 
 	/* setup IRQ */
 	pcie->irq = platform_get_irq(pdev, 0);
 	if (pcie->irq <= 0) {
-		dev_err(&pdev->dev, "failed to get IRQ: %d\n", pcie->irq);
+		dev_err(dev, "failed to get IRQ: %d\n", pcie->irq);
 		return -EINVAL;
 	}
 
 	irq_set_chained_handler_and_data(pcie->irq, altera_pcie_isr, pcie);
-
 	return 0;
 }
 
@@ -583,12 +573,13 @@
 
 static int altera_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct altera_pcie *pcie;
 	struct pci_bus *bus;
 	struct pci_bus *child;
 	int ret;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
@@ -596,7 +587,7 @@
 
 	ret = altera_pcie_parse_dt(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Parsing DT failed\n");
+		dev_err(dev, "Parsing DT failed\n");
 		return ret;
 	}
 
@@ -604,13 +595,13 @@
 
 	ret = altera_pcie_parse_request_of_pci_ranges(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed add resources\n");
+		dev_err(dev, "Failed add resources\n");
 		return ret;
 	}
 
 	ret = altera_pcie_init_irq_domain(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed creating IRQ Domain\n");
+		dev_err(dev, "Failed creating IRQ Domain\n");
 		return ret;
 	}
 
@@ -620,7 +611,7 @@
 	cra_writel(pcie, P2A_INT_ENA_ALL, P2A_INT_ENABLE);
 	altera_pcie_host_init(pcie);
 
-	bus = pci_scan_root_bus(&pdev->dev, pcie->root_bus_nr, &altera_pcie_ops,
+	bus = pci_scan_root_bus(dev, pcie->root_bus_nr, &altera_pcie_ops,
 				pcie, &pcie->resources);
 	if (!bus)
 		return -ENOMEM;
@@ -633,8 +624,6 @@
 		pcie_bus_configure_settings(child);
 
 	pci_bus_add_devices(bus);
-
-	platform_set_drvdata(pdev, pcie);
 	return ret;
 }
 
diff --git a/drivers/pci/host/pcie-armada8k.c b/drivers/pci/host/pcie-armada8k.c
index 0f4f570..0ac0f18 100644
--- a/drivers/pci/host/pcie-armada8k.c
+++ b/drivers/pci/host/pcie-armada8k.c
@@ -29,34 +29,33 @@
 #include "pcie-designware.h"
 
 struct armada8k_pcie {
-	void __iomem *base;
+	struct pcie_port pp;		/* pp.dbi_base is DT ctrl */
 	struct clk *clk;
-	struct pcie_port pp;
 };
 
 #define PCIE_VENDOR_REGS_OFFSET		0x8000
 
-#define PCIE_GLOBAL_CONTROL_REG		0x0
+#define PCIE_GLOBAL_CONTROL_REG		(PCIE_VENDOR_REGS_OFFSET + 0x0)
 #define PCIE_APP_LTSSM_EN		BIT(2)
 #define PCIE_DEVICE_TYPE_SHIFT		4
 #define PCIE_DEVICE_TYPE_MASK		0xF
 #define PCIE_DEVICE_TYPE_RC		0x4 /* Root complex */
 
-#define PCIE_GLOBAL_STATUS_REG		0x8
+#define PCIE_GLOBAL_STATUS_REG		(PCIE_VENDOR_REGS_OFFSET + 0x8)
 #define PCIE_GLB_STS_RDLH_LINK_UP	BIT(1)
 #define PCIE_GLB_STS_PHY_LINK_UP	BIT(9)
 
-#define PCIE_GLOBAL_INT_CAUSE1_REG	0x1C
-#define PCIE_GLOBAL_INT_MASK1_REG	0x20
+#define PCIE_GLOBAL_INT_CAUSE1_REG	(PCIE_VENDOR_REGS_OFFSET + 0x1C)
+#define PCIE_GLOBAL_INT_MASK1_REG	(PCIE_VENDOR_REGS_OFFSET + 0x20)
 #define PCIE_INT_A_ASSERT_MASK		BIT(9)
 #define PCIE_INT_B_ASSERT_MASK		BIT(10)
 #define PCIE_INT_C_ASSERT_MASK		BIT(11)
 #define PCIE_INT_D_ASSERT_MASK		BIT(12)
 
-#define PCIE_ARCACHE_TRC_REG		0x50
-#define PCIE_AWCACHE_TRC_REG		0x54
-#define PCIE_ARUSER_REG			0x5C
-#define PCIE_AWUSER_REG			0x60
+#define PCIE_ARCACHE_TRC_REG		(PCIE_VENDOR_REGS_OFFSET + 0x50)
+#define PCIE_AWCACHE_TRC_REG		(PCIE_VENDOR_REGS_OFFSET + 0x54)
+#define PCIE_ARUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x5C)
+#define PCIE_AWUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x60)
 /*
  * AR/AW Cache defauls: Normal memory, Write-Back, Read / Write
  * allocate
@@ -72,11 +71,10 @@
 
 static int armada8k_pcie_link_up(struct pcie_port *pp)
 {
-	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
 	u32 reg;
 	u32 mask = PCIE_GLB_STS_RDLH_LINK_UP | PCIE_GLB_STS_PHY_LINK_UP;
 
-	reg = readl(pcie->base + PCIE_GLOBAL_STATUS_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_STATUS_REG);
 
 	if ((reg & mask) == mask)
 		return 1;
@@ -85,51 +83,50 @@
 	return 0;
 }
 
-static void armada8k_pcie_establish_link(struct pcie_port *pp)
+static void armada8k_pcie_establish_link(struct armada8k_pcie *pcie)
 {
-	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
-	void __iomem *base = pcie->base;
+	struct pcie_port *pp = &pcie->pp;
 	u32 reg;
 
 	if (!dw_pcie_link_up(pp)) {
 		/* Disable LTSSM state machine to enable configuration */
-		reg = readl(base + PCIE_GLOBAL_CONTROL_REG);
+		reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_CONTROL_REG);
 		reg &= ~(PCIE_APP_LTSSM_EN);
-		writel(reg, base + PCIE_GLOBAL_CONTROL_REG);
+		dw_pcie_writel_rc(pp, PCIE_GLOBAL_CONTROL_REG, reg);
 	}
 
 	/* Set the device to root complex mode */
-	reg = readl(base + PCIE_GLOBAL_CONTROL_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_CONTROL_REG);
 	reg &= ~(PCIE_DEVICE_TYPE_MASK << PCIE_DEVICE_TYPE_SHIFT);
 	reg |= PCIE_DEVICE_TYPE_RC << PCIE_DEVICE_TYPE_SHIFT;
-	writel(reg, base + PCIE_GLOBAL_CONTROL_REG);
+	dw_pcie_writel_rc(pp, PCIE_GLOBAL_CONTROL_REG, reg);
 
 	/* Set the PCIe master AxCache attributes */
-	writel(ARCACHE_DEFAULT_VALUE, base + PCIE_ARCACHE_TRC_REG);
-	writel(AWCACHE_DEFAULT_VALUE, base + PCIE_AWCACHE_TRC_REG);
+	dw_pcie_writel_rc(pp, PCIE_ARCACHE_TRC_REG, ARCACHE_DEFAULT_VALUE);
+	dw_pcie_writel_rc(pp, PCIE_AWCACHE_TRC_REG, AWCACHE_DEFAULT_VALUE);
 
 	/* Set the PCIe master AxDomain attributes */
-	reg = readl(base + PCIE_ARUSER_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_ARUSER_REG);
 	reg &= ~(AX_USER_DOMAIN_MASK << AX_USER_DOMAIN_SHIFT);
 	reg |= DOMAIN_OUTER_SHAREABLE << AX_USER_DOMAIN_SHIFT;
-	writel(reg, base + PCIE_ARUSER_REG);
+	dw_pcie_writel_rc(pp, PCIE_ARUSER_REG, reg);
 
-	reg = readl(base + PCIE_AWUSER_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_AWUSER_REG);
 	reg &= ~(AX_USER_DOMAIN_MASK << AX_USER_DOMAIN_SHIFT);
 	reg |= DOMAIN_OUTER_SHAREABLE << AX_USER_DOMAIN_SHIFT;
-	writel(reg, base + PCIE_AWUSER_REG);
+	dw_pcie_writel_rc(pp, PCIE_AWUSER_REG, reg);
 
 	/* Enable INT A-D interrupts */
-	reg = readl(base + PCIE_GLOBAL_INT_MASK1_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_INT_MASK1_REG);
 	reg |= PCIE_INT_A_ASSERT_MASK | PCIE_INT_B_ASSERT_MASK |
 	       PCIE_INT_C_ASSERT_MASK | PCIE_INT_D_ASSERT_MASK;
-	writel(reg, base + PCIE_GLOBAL_INT_MASK1_REG);
+	dw_pcie_writel_rc(pp, PCIE_GLOBAL_INT_MASK1_REG, reg);
 
 	if (!dw_pcie_link_up(pp)) {
 		/* Configuration done. Start LTSSM */
-		reg = readl(base + PCIE_GLOBAL_CONTROL_REG);
+		reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_CONTROL_REG);
 		reg |= PCIE_APP_LTSSM_EN;
-		writel(reg, base + PCIE_GLOBAL_CONTROL_REG);
+		dw_pcie_writel_rc(pp, PCIE_GLOBAL_CONTROL_REG, reg);
 	}
 
 	/* Wait until the link becomes active again */
@@ -139,15 +136,16 @@
 
 static void armada8k_pcie_host_init(struct pcie_port *pp)
 {
+	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
+
 	dw_pcie_setup_rc(pp);
-	armada8k_pcie_establish_link(pp);
+	armada8k_pcie_establish_link(pcie);
 }
 
 static irqreturn_t armada8k_pcie_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
-	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
-	void __iomem *base = pcie->base;
+	struct armada8k_pcie *pcie = arg;
+	struct pcie_port *pp = &pcie->pp;
 	u32 val;
 
 	/*
@@ -155,8 +153,8 @@
 	 * PCI device. However, they are also latched into the PCIe
 	 * controller, so we simply discard them.
 	 */
-	val = readl(base + PCIE_GLOBAL_INT_CAUSE1_REG);
-	writel(val, base + PCIE_GLOBAL_INT_CAUSE1_REG);
+	val = dw_pcie_readl_rc(pp, PCIE_GLOBAL_INT_CAUSE1_REG);
+	dw_pcie_writel_rc(pp, PCIE_GLOBAL_INT_CAUSE1_REG, val);
 
 	return IRQ_HANDLED;
 }
@@ -166,9 +164,10 @@
 	.host_init = armada8k_pcie_host_init,
 };
 
-static int armada8k_add_pcie_port(struct pcie_port *pp,
+static int armada8k_add_pcie_port(struct armada8k_pcie *pcie,
 				  struct platform_device *pdev)
 {
+	struct pcie_port *pp = &pcie->pp;
 	struct device *dev = &pdev->dev;
 	int ret;
 
@@ -182,7 +181,7 @@
 	}
 
 	ret = devm_request_irq(dev, pp->irq, armada8k_pcie_irq_handler,
-			       IRQF_SHARED, "armada8k-pcie", pp);
+			       IRQF_SHARED, "armada8k-pcie", pcie);
 	if (ret) {
 		dev_err(dev, "failed to request irq %d\n", pp->irq);
 		return ret;
@@ -217,7 +216,6 @@
 
 	pp = &pcie->pp;
 	pp->dev = dev;
-	platform_set_drvdata(pdev, pcie);
 
 	/* Get the dw-pcie unit configuration/control registers base. */
 	base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ctrl");
@@ -228,9 +226,7 @@
 		goto fail;
 	}
 
-	pcie->base = pp->dbi_base + PCIE_VENDOR_REGS_OFFSET;
-
-	ret = armada8k_add_pcie_port(pp, pdev);
+	ret = armada8k_add_pcie_port(pcie, pdev);
 	if (ret)
 		goto fail;
 
diff --git a/drivers/pci/host/pcie-artpec6.c b/drivers/pci/host/pcie-artpec6.c
index 39bf1a6..212786b 100644
--- a/drivers/pci/host/pcie-artpec6.c
+++ b/drivers/pci/host/pcie-artpec6.c
@@ -27,9 +27,9 @@
 #define to_artpec6_pcie(x)	container_of(x, struct artpec6_pcie, pp)
 
 struct artpec6_pcie {
-	struct pcie_port	pp;
-	struct regmap		*regmap;
-	void __iomem		*phy_base;
+	struct pcie_port	pp;		/* pp.dbi_base is DT dbi */
+	struct regmap		*regmap;	/* DT axis,syscon-pcie */
+	void __iomem		*phy_base;	/* DT phy */
 };
 
 /* PCIe Port Logic registers (memory-mapped) */
@@ -65,18 +65,31 @@
 
 #define ARTPEC6_CPU_TO_BUS_ADDR		0x0fffffff
 
-static int artpec6_pcie_establish_link(struct pcie_port *pp)
+static u32 artpec6_pcie_readl(struct artpec6_pcie *artpec6_pcie, u32 offset)
 {
-	struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp);
+	u32 val;
+
+	regmap_read(artpec6_pcie->regmap, offset, &val);
+	return val;
+}
+
+static void artpec6_pcie_writel(struct artpec6_pcie *artpec6_pcie, u32 offset, u32 val)
+{
+	regmap_write(artpec6_pcie->regmap, offset, val);
+}
+
+static int artpec6_pcie_establish_link(struct artpec6_pcie *artpec6_pcie)
+{
+	struct pcie_port *pp = &artpec6_pcie->pp;
 	u32 val;
 	unsigned int retries;
 
 	/* Hold DW core in reset */
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |= PCIECFG_CORE_RESET_REQ;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |=  PCIECFG_RISRCREN |	/* Receiver term. 50 Ohm */
 		PCIECFG_MODE_TX_DRV_EN |
 		PCIECFG_CISRREN |	/* Reference clock term. 100 Ohm */
@@ -84,27 +97,27 @@
 	val |= PCIECFG_REFCLK_ENABLE;
 	val &= ~PCIECFG_DBG_OEN;
 	val &= ~PCIECFG_CLKREQ_B;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 	usleep_range(5000, 6000);
 
-	regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, NOCCFG);
 	val |= NOCCFG_ENABLE_CLK_PCIE;
-	regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+	artpec6_pcie_writel(artpec6_pcie, NOCCFG, val);
 	usleep_range(20, 30);
 
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |= PCIECFG_PCLK_ENABLE | PCIECFG_PLL_ENABLE;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 	usleep_range(6000, 7000);
 
-	regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, NOCCFG);
 	val &= ~NOCCFG_POWER_PCIE_IDLEREQ;
-	regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+	artpec6_pcie_writel(artpec6_pcie, NOCCFG, val);
 
 	retries = 50;
 	do {
 		usleep_range(1000, 2000);
-		regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+		val = artpec6_pcie_readl(artpec6_pcie, NOCCFG);
 		retries--;
 	} while (retries &&
 		(val & (NOCCFG_POWER_PCIE_IDLEACK | NOCCFG_POWER_PCIE_IDLE)));
@@ -117,16 +130,16 @@
 	} while (retries && !(val & PHY_COSPLLLOCK));
 
 	/* Take DW core out of reset */
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val &= ~PCIECFG_CORE_RESET_REQ;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 	usleep_range(100, 200);
 
 	/*
 	 * Enable writing to config regs. This is required as the Synopsys
 	 * driver changes the class code. That register needs DBI write enable.
 	 */
-	writel(DBI_RO_WR_EN, pp->dbi_base + MISC_CONTROL_1_OFF);
+	dw_pcie_writel_rc(pp, MISC_CONTROL_1_OFF, DBI_RO_WR_EN);
 
 	pp->io_base &= ARTPEC6_CPU_TO_BUS_ADDR;
 	pp->mem_base &= ARTPEC6_CPU_TO_BUS_ADDR;
@@ -137,78 +150,69 @@
 	dw_pcie_setup_rc(pp);
 
 	/* assert LTSSM enable */
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |= PCIECFG_LTSSM_ENABLE;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 
 	/* check if the link is up or not */
 	if (!dw_pcie_wait_for_link(pp))
 		return 0;
 
 	dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R0),
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1));
 
 	return -ETIMEDOUT;
 }
 
-static void artpec6_pcie_enable_interrupts(struct pcie_port *pp)
+static void artpec6_pcie_enable_interrupts(struct artpec6_pcie *artpec6_pcie)
 {
+	struct pcie_port *pp = &artpec6_pcie->pp;
+
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		dw_pcie_msi_init(pp);
 }
 
 static void artpec6_pcie_host_init(struct pcie_port *pp)
 {
-	artpec6_pcie_establish_link(pp);
-	artpec6_pcie_enable_interrupts(pp);
-}
+	struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp);
 
-static int artpec6_pcie_link_up(struct pcie_port *pp)
-{
-	u32 rc;
-
-	/*
-	 * Get status from Synopsys IP
-	 * link is debug bit 36, debug register 1 starts at bit 32
-	 */
-	rc = readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) & (0x1 << (36 - 32));
-	if (rc)
-		return 1;
-
-	return 0;
+	artpec6_pcie_establish_link(artpec6_pcie);
+	artpec6_pcie_enable_interrupts(artpec6_pcie);
 }
 
 static struct pcie_host_ops artpec6_pcie_host_ops = {
-	.link_up = artpec6_pcie_link_up,
 	.host_init = artpec6_pcie_host_init,
 };
 
 static irqreturn_t artpec6_pcie_msi_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct artpec6_pcie *artpec6_pcie = arg;
+	struct pcie_port *pp = &artpec6_pcie->pp;
 
 	return dw_handle_msi_irq(pp);
 }
 
-static int artpec6_add_pcie_port(struct pcie_port *pp,
+static int artpec6_add_pcie_port(struct artpec6_pcie *artpec6_pcie,
 				 struct platform_device *pdev)
 {
+	struct pcie_port *pp = &artpec6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		pp->msi_irq = platform_get_irq_byname(pdev, "msi");
 		if (pp->msi_irq <= 0) {
-			dev_err(&pdev->dev, "failed to get MSI irq\n");
+			dev_err(dev, "failed to get MSI irq\n");
 			return -ENODEV;
 		}
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 				       artpec6_pcie_msi_handler,
 				       IRQF_SHARED | IRQF_NO_THREAD,
-				       "artpec6-pcie-msi", pp);
+				       "artpec6-pcie-msi", artpec6_pcie);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request MSI irq\n");
+			dev_err(dev, "failed to request MSI irq\n");
 			return ret;
 		}
 	}
@@ -218,7 +222,7 @@
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -227,41 +231,40 @@
 
 static int artpec6_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct artpec6_pcie *artpec6_pcie;
 	struct pcie_port *pp;
 	struct resource *dbi_base;
 	struct resource *phy_base;
 	int ret;
 
-	artpec6_pcie = devm_kzalloc(&pdev->dev, sizeof(*artpec6_pcie),
-				    GFP_KERNEL);
+	artpec6_pcie = devm_kzalloc(dev, sizeof(*artpec6_pcie), GFP_KERNEL);
 	if (!artpec6_pcie)
 		return -ENOMEM;
 
 	pp = &artpec6_pcie->pp;
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
-	pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base);
+	pp->dbi_base = devm_ioremap_resource(dev, dbi_base);
 	if (IS_ERR(pp->dbi_base))
 		return PTR_ERR(pp->dbi_base);
 
 	phy_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "phy");
-	artpec6_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base);
+	artpec6_pcie->phy_base = devm_ioremap_resource(dev, phy_base);
 	if (IS_ERR(artpec6_pcie->phy_base))
 		return PTR_ERR(artpec6_pcie->phy_base);
 
 	artpec6_pcie->regmap =
-		syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+		syscon_regmap_lookup_by_phandle(dev->of_node,
 						"axis,syscon-pcie");
 	if (IS_ERR(artpec6_pcie->regmap))
 		return PTR_ERR(artpec6_pcie->regmap);
 
-	ret = artpec6_add_pcie_port(pp, pdev);
+	ret = artpec6_add_pcie_port(artpec6_pcie, pdev);
 	if (ret < 0)
 		return ret;
 
-	platform_set_drvdata(pdev, artpec6_pcie);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pcie-designware-plat.c b/drivers/pci/host/pcie-designware-plat.c
index 17da005..537f58a 100644
--- a/drivers/pci/host/pcie-designware-plat.c
+++ b/drivers/pci/host/pcie-designware-plat.c
@@ -25,8 +25,7 @@
 #include "pcie-designware.h"
 
 struct dw_plat_pcie {
-	void __iomem		*mem_base;
-	struct pcie_port	pp;
+	struct pcie_port	pp;	/* pp.dbi_base is DT 0th resource */
 };
 
 static irqreturn_t dw_plat_pcie_msi_irq_handler(int irq, void *arg)
@@ -52,6 +51,7 @@
 static int dw_plat_add_pcie_port(struct pcie_port *pp,
 				 struct platform_device *pdev)
 {
+	struct device *dev = pp->dev;
 	int ret;
 
 	pp->irq = platform_get_irq(pdev, 1);
@@ -63,11 +63,11 @@
 		if (pp->msi_irq < 0)
 			return pp->msi_irq;
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 					dw_plat_pcie_msi_irq_handler,
 					IRQF_SHARED, "dw-plat-pcie-msi", pp);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request MSI IRQ\n");
+			dev_err(dev, "failed to request MSI IRQ\n");
 			return ret;
 		}
 	}
@@ -77,7 +77,7 @@
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -86,31 +86,28 @@
 
 static int dw_plat_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct dw_plat_pcie *dw_plat_pcie;
 	struct pcie_port *pp;
 	struct resource *res;  /* Resource from DT */
 	int ret;
 
-	dw_plat_pcie = devm_kzalloc(&pdev->dev, sizeof(*dw_plat_pcie),
-					GFP_KERNEL);
+	dw_plat_pcie = devm_kzalloc(dev, sizeof(*dw_plat_pcie), GFP_KERNEL);
 	if (!dw_plat_pcie)
 		return -ENOMEM;
 
 	pp = &dw_plat_pcie->pp;
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	dw_plat_pcie->mem_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(dw_plat_pcie->mem_base))
-		return PTR_ERR(dw_plat_pcie->mem_base);
-
-	pp->dbi_base = dw_plat_pcie->mem_base;
+	pp->dbi_base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pp->dbi_base))
+		return PTR_ERR(pp->dbi_base);
 
 	ret = dw_plat_add_pcie_port(pp, pdev);
 	if (ret < 0)
 		return ret;
 
-	platform_set_drvdata(pdev, dw_plat_pcie);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pcie-designware.c b/drivers/pci/host/pcie-designware.c
index 74da71e..035f50c 100644
--- a/drivers/pci/host/pcie-designware.c
+++ b/drivers/pci/host/pcie-designware.c
@@ -141,41 +141,35 @@
 	return PCIBIOS_SUCCESSFUL;
 }
 
-static inline u32 dw_pcie_readl_rc(struct pcie_port *pp, u32 reg)
+u32 dw_pcie_readl_rc(struct pcie_port *pp, u32 reg)
 {
 	if (pp->ops->readl_rc)
-		return pp->ops->readl_rc(pp, pp->dbi_base + reg);
+		return pp->ops->readl_rc(pp, reg);
 
 	return readl(pp->dbi_base + reg);
 }
 
-static inline void dw_pcie_writel_rc(struct pcie_port *pp, u32 val, u32 reg)
+void dw_pcie_writel_rc(struct pcie_port *pp, u32 reg, u32 val)
 {
 	if (pp->ops->writel_rc)
-		pp->ops->writel_rc(pp, val, pp->dbi_base + reg);
+		pp->ops->writel_rc(pp, reg, val);
 	else
 		writel(val, pp->dbi_base + reg);
 }
 
-static inline u32 dw_pcie_readl_unroll(struct pcie_port *pp, u32 index, u32 reg)
+static u32 dw_pcie_readl_unroll(struct pcie_port *pp, u32 index, u32 reg)
 {
 	u32 offset = PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index);
 
-	if (pp->ops->readl_rc)
-		return pp->ops->readl_rc(pp, pp->dbi_base + offset + reg);
-
-	return readl(pp->dbi_base + offset + reg);
+	return dw_pcie_readl_rc(pp, offset + reg);
 }
 
-static inline void dw_pcie_writel_unroll(struct pcie_port *pp, u32 index,
-					 u32 val, u32 reg)
+static void dw_pcie_writel_unroll(struct pcie_port *pp, u32 index, u32 reg,
+				  u32 val)
 {
 	u32 offset = PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index);
 
-	if (pp->ops->writel_rc)
-		pp->ops->writel_rc(pp, val, pp->dbi_base + offset + reg);
-	else
-		writel(val, pp->dbi_base + offset + reg);
+	dw_pcie_writel_rc(pp, offset + reg, val);
 }
 
 static int dw_pcie_rd_own_conf(struct pcie_port *pp, int where, int size,
@@ -202,35 +196,35 @@
 	u32 retries, val;
 
 	if (pp->iatu_unroll_enabled) {
-		dw_pcie_writel_unroll(pp, index,
-			lower_32_bits(cpu_addr), PCIE_ATU_UNR_LOWER_BASE);
-		dw_pcie_writel_unroll(pp, index,
-			upper_32_bits(cpu_addr), PCIE_ATU_UNR_UPPER_BASE);
-		dw_pcie_writel_unroll(pp, index,
-			lower_32_bits(cpu_addr + size - 1), PCIE_ATU_UNR_LIMIT);
-		dw_pcie_writel_unroll(pp, index,
-			lower_32_bits(pci_addr), PCIE_ATU_UNR_LOWER_TARGET);
-		dw_pcie_writel_unroll(pp, index,
-			upper_32_bits(pci_addr), PCIE_ATU_UNR_UPPER_TARGET);
-		dw_pcie_writel_unroll(pp, index,
-			type, PCIE_ATU_UNR_REGION_CTRL1);
-		dw_pcie_writel_unroll(pp, index,
-			PCIE_ATU_ENABLE, PCIE_ATU_UNR_REGION_CTRL2);
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_LOWER_BASE,
+			lower_32_bits(cpu_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_UPPER_BASE,
+			upper_32_bits(cpu_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_LIMIT,
+			lower_32_bits(cpu_addr + size - 1));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_LOWER_TARGET,
+			lower_32_bits(pci_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_UPPER_TARGET,
+			upper_32_bits(pci_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_REGION_CTRL1,
+			type);
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_REGION_CTRL2,
+			PCIE_ATU_ENABLE);
 	} else {
-		dw_pcie_writel_rc(pp, PCIE_ATU_REGION_OUTBOUND | index,
-						PCIE_ATU_VIEWPORT);
-		dw_pcie_writel_rc(pp, lower_32_bits(cpu_addr),
-						PCIE_ATU_LOWER_BASE);
-		dw_pcie_writel_rc(pp, upper_32_bits(cpu_addr),
-						PCIE_ATU_UPPER_BASE);
-		dw_pcie_writel_rc(pp, lower_32_bits(cpu_addr + size - 1),
-						PCIE_ATU_LIMIT);
-		dw_pcie_writel_rc(pp, lower_32_bits(pci_addr),
-						PCIE_ATU_LOWER_TARGET);
-		dw_pcie_writel_rc(pp, upper_32_bits(pci_addr),
-						PCIE_ATU_UPPER_TARGET);
-		dw_pcie_writel_rc(pp, type, PCIE_ATU_CR1);
-		dw_pcie_writel_rc(pp, PCIE_ATU_ENABLE, PCIE_ATU_CR2);
+		dw_pcie_writel_rc(pp, PCIE_ATU_VIEWPORT,
+				  PCIE_ATU_REGION_OUTBOUND | index);
+		dw_pcie_writel_rc(pp, PCIE_ATU_LOWER_BASE,
+				  lower_32_bits(cpu_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_UPPER_BASE,
+				  upper_32_bits(cpu_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_LIMIT,
+				  lower_32_bits(cpu_addr + size - 1));
+		dw_pcie_writel_rc(pp, PCIE_ATU_LOWER_TARGET,
+				  lower_32_bits(pci_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_UPPER_TARGET,
+				  upper_32_bits(pci_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_CR1, type);
+		dw_pcie_writel_rc(pp, PCIE_ATU_CR2, PCIE_ATU_ENABLE);
 	}
 
 	/*
@@ -760,8 +754,8 @@
 	return ret;
 }
 
-static int dw_pcie_valid_config(struct pcie_port *pp,
-				struct pci_bus *bus, int dev)
+static int dw_pcie_valid_device(struct pcie_port *pp, struct pci_bus *bus,
+				int dev)
 {
 	/* If there is no link, then there is no device */
 	if (bus->number != pp->root_bus_nr) {
@@ -781,7 +775,7 @@
 {
 	struct pcie_port *pp = bus->sysdata;
 
-	if (dw_pcie_valid_config(pp, bus, PCI_SLOT(devfn)) == 0) {
+	if (!dw_pcie_valid_device(pp, bus, PCI_SLOT(devfn))) {
 		*val = 0xffffffff;
 		return PCIBIOS_DEVICE_NOT_FOUND;
 	}
@@ -797,7 +791,7 @@
 {
 	struct pcie_port *pp = bus->sysdata;
 
-	if (dw_pcie_valid_config(pp, bus, PCI_SLOT(devfn)) == 0)
+	if (!dw_pcie_valid_device(pp, bus, PCI_SLOT(devfn)))
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	if (bus->number == pp->root_bus_nr)
@@ -835,7 +829,7 @@
 		dev_err(pp->dev, "num-lanes %u: invalid value\n", pp->lanes);
 		return;
 	}
-	dw_pcie_writel_rc(pp, val, PCIE_PORT_LINK_CONTROL);
+	dw_pcie_writel_rc(pp, PCIE_PORT_LINK_CONTROL, val);
 
 	/* set link width speed control register */
 	val = dw_pcie_readl_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL);
@@ -854,30 +848,30 @@
 		val |= PORT_LOGIC_LINK_WIDTH_8_LANES;
 		break;
 	}
-	dw_pcie_writel_rc(pp, val, PCIE_LINK_WIDTH_SPEED_CONTROL);
+	dw_pcie_writel_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, val);
 
 	/* setup RC BARs */
-	dw_pcie_writel_rc(pp, 0x00000004, PCI_BASE_ADDRESS_0);
-	dw_pcie_writel_rc(pp, 0x00000000, PCI_BASE_ADDRESS_1);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, 0x00000004);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_1, 0x00000000);
 
 	/* setup interrupt pins */
 	val = dw_pcie_readl_rc(pp, PCI_INTERRUPT_LINE);
 	val &= 0xffff00ff;
 	val |= 0x00000100;
-	dw_pcie_writel_rc(pp, val, PCI_INTERRUPT_LINE);
+	dw_pcie_writel_rc(pp, PCI_INTERRUPT_LINE, val);
 
 	/* setup bus numbers */
 	val = dw_pcie_readl_rc(pp, PCI_PRIMARY_BUS);
 	val &= 0xff000000;
 	val |= 0x00010100;
-	dw_pcie_writel_rc(pp, val, PCI_PRIMARY_BUS);
+	dw_pcie_writel_rc(pp, PCI_PRIMARY_BUS, val);
 
 	/* setup command register */
 	val = dw_pcie_readl_rc(pp, PCI_COMMAND);
 	val &= 0xffff0000;
 	val |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
 		PCI_COMMAND_MASTER | PCI_COMMAND_SERR;
-	dw_pcie_writel_rc(pp, val, PCI_COMMAND);
+	dw_pcie_writel_rc(pp, PCI_COMMAND, val);
 
 	/*
 	 * If the platform provides ->rd_other_conf, it means the platform
diff --git a/drivers/pci/host/pcie-designware.h b/drivers/pci/host/pcie-designware.h
index c8e5bc6..a567ea2 100644
--- a/drivers/pci/host/pcie-designware.h
+++ b/drivers/pci/host/pcie-designware.h
@@ -54,9 +54,8 @@
 };
 
 struct pcie_host_ops {
-	u32 (*readl_rc)(struct pcie_port *pp, void __iomem *dbi_base);
-	void (*writel_rc)(struct pcie_port *pp,
-			u32 val, void __iomem *dbi_base);
+	u32 (*readl_rc)(struct pcie_port *pp, u32 reg);
+	void (*writel_rc)(struct pcie_port *pp, u32 reg, u32 val);
 	int (*rd_own_conf)(struct pcie_port *pp, int where, int size, u32 *val);
 	int (*wr_own_conf)(struct pcie_port *pp, int where, int size, u32 val);
 	int (*rd_other_conf)(struct pcie_port *pp, struct pci_bus *bus,
@@ -73,6 +72,8 @@
 	int (*msi_host_init)(struct pcie_port *pp, struct msi_controller *chip);
 };
 
+u32 dw_pcie_readl_rc(struct pcie_port *pp, u32 reg);
+void dw_pcie_writel_rc(struct pcie_port *pp, u32 reg, u32 val);
 int dw_pcie_cfg_read(void __iomem *addr, int size, u32 *val);
 int dw_pcie_cfg_write(void __iomem *addr, int size, u32 val);
 irqreturn_t dw_handle_msi_irq(struct pcie_port *pp);
diff --git a/drivers/pci/host/pcie-hisi.c b/drivers/pci/host/pcie-hisi.c
index 7ee9dfc..56154c2 100644
--- a/drivers/pci/host/pcie-hisi.c
+++ b/drivers/pci/host/pcie-hisi.c
@@ -22,51 +22,38 @@
 
 #include "pcie-designware.h"
 
-#define PCIE_LTSSM_LINKUP_STATE				0x11
-#define PCIE_LTSSM_STATE_MASK				0x3F
-#define PCIE_SUBCTRL_SYS_STATE4_REG			0x6818
-#define PCIE_SYS_STATE4						0x31c
-#define PCIE_HIP06_CTRL_OFF					0x1000
+#define PCIE_SUBCTRL_SYS_STATE4_REG		0x6818
+#define PCIE_HIP06_CTRL_OFF			0x1000
+#define PCIE_SYS_STATE4				(PCIE_HIP06_CTRL_OFF + 0x31c)
+#define PCIE_LTSSM_LINKUP_STATE			0x11
+#define PCIE_LTSSM_STATE_MASK			0x3F
 
 #define to_hisi_pcie(x)	container_of(x, struct hisi_pcie, pp)
 
 struct hisi_pcie;
 
 struct pcie_soc_ops {
-	int (*hisi_pcie_link_up)(struct hisi_pcie *pcie);
+	int (*hisi_pcie_link_up)(struct hisi_pcie *hisi_pcie);
 };
 
 struct hisi_pcie {
+	struct pcie_port pp;		/* pp.dbi_base is DT rc_dbi */
 	struct regmap *subctrl;
-	void __iomem *reg_base;
 	u32 port_id;
-	struct pcie_port pp;
 	struct pcie_soc_ops *soc_ops;
 };
 
-static inline void hisi_pcie_apb_writel(struct hisi_pcie *pcie,
-					u32 val, u32 reg)
-{
-	writel(val, pcie->reg_base + reg);
-}
-
-static inline u32 hisi_pcie_apb_readl(struct hisi_pcie *pcie, u32 reg)
-{
-	return readl(pcie->reg_base + reg);
-}
-
 /* HipXX PCIe host only supports 32-bit config access */
 static int hisi_pcie_cfg_read(struct pcie_port *pp, int where, int size,
 			      u32 *val)
 {
 	u32 reg;
 	u32 reg_val;
-	struct hisi_pcie *pcie = to_hisi_pcie(pp);
 	void *walker = &reg_val;
 
 	walker += (where & 0x3);
 	reg = where & ~0x3;
-	reg_val = hisi_pcie_apb_readl(pcie, reg);
+	reg_val = dw_pcie_readl_rc(pp, reg);
 
 	if (size == 1)
 		*val = *(u8 __force *) walker;
@@ -86,21 +73,20 @@
 {
 	u32 reg_val;
 	u32 reg;
-	struct hisi_pcie *pcie = to_hisi_pcie(pp);
 	void *walker = &reg_val;
 
 	walker += (where & 0x3);
 	reg = where & ~0x3;
 	if (size == 4)
-		hisi_pcie_apb_writel(pcie, val, reg);
+		dw_pcie_writel_rc(pp, reg, val);
 	else if (size == 2) {
-		reg_val = hisi_pcie_apb_readl(pcie, reg);
+		reg_val = dw_pcie_readl_rc(pp, reg);
 		*(u16 __force *) walker = val;
-		hisi_pcie_apb_writel(pcie, reg_val, reg);
+		dw_pcie_writel_rc(pp, reg, reg_val);
 	} else if (size == 1) {
-		reg_val = hisi_pcie_apb_readl(pcie, reg);
+		reg_val = dw_pcie_readl_rc(pp, reg);
 		*(u8 __force *) walker = val;
-		hisi_pcie_apb_writel(pcie, reg_val, reg);
+		dw_pcie_writel_rc(pp, reg, reg_val);
 	} else
 		return PCIBIOS_BAD_REGISTER_NUMBER;
 
@@ -119,10 +105,10 @@
 
 static int hisi_pcie_link_up_hip06(struct hisi_pcie *hisi_pcie)
 {
+	struct pcie_port *pp = &hisi_pcie->pp;
 	u32 val;
 
-	val = hisi_pcie_apb_readl(hisi_pcie, PCIE_HIP06_CTRL_OFF +
-			PCIE_SYS_STATE4);
+	val = dw_pcie_readl_rc(pp, PCIE_SYS_STATE4);
 
 	return ((val & PCIE_LTSSM_STATE_MASK) == PCIE_LTSSM_LINKUP_STATE);
 }
@@ -140,19 +126,20 @@
 	.link_up = hisi_pcie_link_up,
 };
 
-static int hisi_add_pcie_port(struct pcie_port *pp,
-				     struct platform_device *pdev)
+static int hisi_add_pcie_port(struct hisi_pcie *hisi_pcie,
+			      struct platform_device *pdev)
 {
+	struct pcie_port *pp = &hisi_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 	u32 port_id;
-	struct hisi_pcie *hisi_pcie = to_hisi_pcie(pp);
 
-	if (of_property_read_u32(pdev->dev.of_node, "port-id", &port_id)) {
-		dev_err(&pdev->dev, "failed to read port-id\n");
+	if (of_property_read_u32(dev->of_node, "port-id", &port_id)) {
+		dev_err(dev, "failed to read port-id\n");
 		return -EINVAL;
 	}
 	if (port_id > 3) {
-		dev_err(&pdev->dev, "Invalid port-id: %d\n", port_id);
+		dev_err(dev, "Invalid port-id: %d\n", port_id);
 		return -EINVAL;
 	}
 	hisi_pcie->port_id = port_id;
@@ -161,7 +148,7 @@
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -170,6 +157,7 @@
 
 static int hisi_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct hisi_pcie *hisi_pcie;
 	struct pcie_port *pp;
 	const struct of_device_id *match;
@@ -177,40 +165,36 @@
 	struct device_driver *driver;
 	int ret;
 
-	hisi_pcie = devm_kzalloc(&pdev->dev, sizeof(*hisi_pcie), GFP_KERNEL);
+	hisi_pcie = devm_kzalloc(dev, sizeof(*hisi_pcie), GFP_KERNEL);
 	if (!hisi_pcie)
 		return -ENOMEM;
 
 	pp = &hisi_pcie->pp;
-	pp->dev = &pdev->dev;
-	driver = (pdev->dev).driver;
+	pp->dev = dev;
+	driver = dev->driver;
 
-	match = of_match_device(driver->of_match_table, &pdev->dev);
+	match = of_match_device(driver->of_match_table, dev);
 	hisi_pcie->soc_ops = (struct pcie_soc_ops *) match->data;
 
 	hisi_pcie->subctrl =
 	syscon_regmap_lookup_by_compatible("hisilicon,pcie-sas-subctrl");
 	if (IS_ERR(hisi_pcie->subctrl)) {
-		dev_err(pp->dev, "cannot get subctrl base\n");
+		dev_err(dev, "cannot get subctrl base\n");
 		return PTR_ERR(hisi_pcie->subctrl);
 	}
 
 	reg = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rc_dbi");
-	hisi_pcie->reg_base = devm_ioremap_resource(&pdev->dev, reg);
-	if (IS_ERR(hisi_pcie->reg_base)) {
-		dev_err(pp->dev, "cannot get rc_dbi base\n");
-		return PTR_ERR(hisi_pcie->reg_base);
+	pp->dbi_base = devm_ioremap_resource(dev, reg);
+	if (IS_ERR(pp->dbi_base)) {
+		dev_err(dev, "cannot get rc_dbi base\n");
+		return PTR_ERR(pp->dbi_base);
 	}
 
-	hisi_pcie->pp.dbi_base = hisi_pcie->reg_base;
-
-	ret = hisi_add_pcie_port(pp, pdev);
+	ret = hisi_add_pcie_port(hisi_pcie, pdev);
 	if (ret)
 		return ret;
 
-	platform_set_drvdata(pdev, hisi_pcie);
-
-	dev_warn(pp->dev, "only 32-bit config accesses supported; smaller writes may corrupt adjacent RW1C fields\n");
+	dev_warn(dev, "only 32-bit config accesses supported; smaller writes may corrupt adjacent RW1C fields\n");
 
 	return 0;
 }
diff --git a/drivers/pci/host/pcie-iproc-bcma.c b/drivers/pci/host/pcie-iproc-bcma.c
index 0d7bee4..8ce0890 100644
--- a/drivers/pci/host/pcie-iproc-bcma.c
+++ b/drivers/pci/host/pcie-iproc-bcma.c
@@ -42,19 +42,24 @@
 
 static int iproc_pcie_bcma_probe(struct bcma_device *bdev)
 {
+	struct device *dev = &bdev->dev;
 	struct iproc_pcie *pcie;
 	LIST_HEAD(res);
 	struct resource res_mem;
 	int ret;
 
-	pcie = devm_kzalloc(&bdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &bdev->dev;
-	bcma_set_drvdata(bdev, pcie);
+	pcie->dev = dev;
 
 	pcie->base = bdev->io_addr;
+	if (!pcie->base) {
+		dev_err(dev, "no controller registers\n");
+		return -ENOMEM;
+	}
+
 	pcie->base_addr = bdev->addr;
 
 	res_mem.start = bdev->addr_s[0];
@@ -67,10 +72,11 @@
 
 	ret = iproc_pcie_setup(pcie, &res);
 	if (ret)
-		dev_err(pcie->dev, "PCIe controller setup failed\n");
+		dev_err(dev, "PCIe controller setup failed\n");
 
 	pci_free_resource_list(&res);
 
+	bcma_set_drvdata(bdev, pcie);
 	return ret;
 }
 
diff --git a/drivers/pci/host/pcie-iproc-platform.c b/drivers/pci/host/pcie-iproc-platform.c
index 1738c52..a3de087 100644
--- a/drivers/pci/host/pcie-iproc-platform.c
+++ b/drivers/pci/host/pcie-iproc-platform.c
@@ -40,35 +40,35 @@
 
 static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	const struct of_device_id *of_id;
 	struct iproc_pcie *pcie;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct resource reg;
 	resource_size_t iobase = 0;
 	LIST_HEAD(res);
 	int ret;
 
-	of_id = of_match_device(iproc_pcie_of_match_table, &pdev->dev);
+	of_id = of_match_device(iproc_pcie_of_match_table, dev);
 	if (!of_id)
 		return -EINVAL;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(struct iproc_pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &pdev->dev;
+	pcie->dev = dev;
 	pcie->type = (enum iproc_pcie_type)of_id->data;
-	platform_set_drvdata(pdev, pcie);
 
 	ret = of_address_to_resource(np, 0, &reg);
 	if (ret < 0) {
-		dev_err(pcie->dev, "unable to obtain controller resources\n");
+		dev_err(dev, "unable to obtain controller resources\n");
 		return ret;
 	}
 
-	pcie->base = devm_ioremap(pcie->dev, reg.start, resource_size(&reg));
+	pcie->base = devm_ioremap(dev, reg.start, resource_size(&reg));
 	if (!pcie->base) {
-		dev_err(pcie->dev, "unable to map controller registers\n");
+		dev_err(dev, "unable to map controller registers\n");
 		return -ENOMEM;
 	}
 	pcie->base_addr = reg.start;
@@ -79,7 +79,7 @@
 		ret = of_property_read_u32(np, "brcm,pcie-ob-axi-offset",
 					   &val);
 		if (ret) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"missing brcm,pcie-ob-axi-offset property\n");
 			return ret;
 		}
@@ -88,7 +88,7 @@
 		ret = of_property_read_u32(np, "brcm,pcie-ob-window-size",
 					   &val);
 		if (ret) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"missing brcm,pcie-ob-window-size property\n");
 			return ret;
 		}
@@ -101,7 +101,7 @@
 	}
 
 	/* PHY use is optional */
-	pcie->phy = devm_phy_get(&pdev->dev, "pcie-phy");
+	pcie->phy = devm_phy_get(dev, "pcie-phy");
 	if (IS_ERR(pcie->phy)) {
 		if (PTR_ERR(pcie->phy) == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
@@ -110,7 +110,7 @@
 
 	ret = of_pci_get_host_bridge_resources(np, 0, 0xff, &res, &iobase);
 	if (ret) {
-		dev_err(pcie->dev,
+		dev_err(dev,
 			"unable to get PCI host bridge resources\n");
 		return ret;
 	}
@@ -119,10 +119,11 @@
 
 	ret = iproc_pcie_setup(pcie, &res);
 	if (ret)
-		dev_err(pcie->dev, "PCIe controller setup failed\n");
+		dev_err(dev, "PCIe controller setup failed\n");
 
 	pci_free_resource_list(&res);
 
+	platform_set_drvdata(pdev, pcie);
 	return ret;
 }
 
diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c
index e167b2f..0b999a9 100644
--- a/drivers/pci/host/pcie-iproc.c
+++ b/drivers/pci/host/pcie-iproc.c
@@ -63,6 +63,8 @@
 #define OARR_SIZE_CFG_SHIFT          1
 #define OARR_SIZE_CFG                BIT(OARR_SIZE_CFG_SHIFT)
 
+#define PCI_EXP_CAP			0xac
+
 #define MAX_NUM_OB_WINDOWS           2
 
 #define IPROC_PCIE_REG_INVALID 0xffff
@@ -258,9 +260,10 @@
 
 static int iproc_pcie_check_link(struct iproc_pcie *pcie, struct pci_bus *bus)
 {
+	struct device *dev = pcie->dev;
 	u8 hdr_type;
 	u32 link_ctrl, class, val;
-	u16 pos, link_status;
+	u16 pos = PCI_EXP_CAP, link_status;
 	bool link_is_active = false;
 
 	/*
@@ -272,14 +275,14 @@
 
 	val = iproc_pcie_read_reg(pcie, IPROC_PCIE_LINK_STATUS);
 	if (!(val & PCIE_PHYLINKUP) || !(val & PCIE_DL_ACTIVE)) {
-		dev_err(pcie->dev, "PHY or data link is INACTIVE!\n");
+		dev_err(dev, "PHY or data link is INACTIVE!\n");
 		return -ENODEV;
 	}
 
 	/* make sure we are not in EP mode */
 	pci_bus_read_config_byte(bus, 0, PCI_HEADER_TYPE, &hdr_type);
 	if ((hdr_type & 0x7f) != PCI_HEADER_TYPE_BRIDGE) {
-		dev_err(pcie->dev, "in EP mode, hdr=%#02x\n", hdr_type);
+		dev_err(dev, "in EP mode, hdr=%#02x\n", hdr_type);
 		return -EFAULT;
 	}
 
@@ -293,30 +296,27 @@
 	pci_bus_write_config_dword(bus, 0, PCI_BRIDGE_CTRL_REG_OFFSET, class);
 
 	/* check link status to see if link is active */
-	pos = pci_bus_find_capability(bus, 0, PCI_CAP_ID_EXP);
 	pci_bus_read_config_word(bus, 0, pos + PCI_EXP_LNKSTA, &link_status);
 	if (link_status & PCI_EXP_LNKSTA_NLW)
 		link_is_active = true;
 
 	if (!link_is_active) {
 		/* try GEN 1 link speed */
-#define PCI_LINK_STATUS_CTRL_2_OFFSET 0x0dc
 #define PCI_TARGET_LINK_SPEED_MASK    0xf
 #define PCI_TARGET_LINK_SPEED_GEN2    0x2
 #define PCI_TARGET_LINK_SPEED_GEN1    0x1
 		pci_bus_read_config_dword(bus, 0,
-					  PCI_LINK_STATUS_CTRL_2_OFFSET,
+					  pos + PCI_EXP_LNKCTL2,
 					  &link_ctrl);
 		if ((link_ctrl & PCI_TARGET_LINK_SPEED_MASK) ==
 		    PCI_TARGET_LINK_SPEED_GEN2) {
 			link_ctrl &= ~PCI_TARGET_LINK_SPEED_MASK;
 			link_ctrl |= PCI_TARGET_LINK_SPEED_GEN1;
 			pci_bus_write_config_dword(bus, 0,
-					   PCI_LINK_STATUS_CTRL_2_OFFSET,
+					   pos + PCI_EXP_LNKCTL2,
 					   link_ctrl);
 			msleep(100);
 
-			pos = pci_bus_find_capability(bus, 0, PCI_CAP_ID_EXP);
 			pci_bus_read_config_word(bus, 0, pos + PCI_EXP_LNKSTA,
 						 &link_status);
 			if (link_status & PCI_EXP_LNKSTA_NLW)
@@ -324,7 +324,7 @@
 		}
 	}
 
-	dev_info(pcie->dev, "link: %s\n", link_is_active ? "UP" : "DOWN");
+	dev_info(dev, "link: %s\n", link_is_active ? "UP" : "DOWN");
 
 	return link_is_active ? 0 : -ENODEV;
 }
@@ -349,12 +349,13 @@
 			       u64 pci_addr, resource_size_t size)
 {
 	struct iproc_pcie_ob *ob = &pcie->ob;
+	struct device *dev = pcie->dev;
 	unsigned i;
 	u64 max_size = (u64)ob->window_size * MAX_NUM_OB_WINDOWS;
 	u64 remainder;
 
 	if (size > max_size) {
-		dev_err(pcie->dev,
+		dev_err(dev,
 			"res size %pap exceeds max supported size 0x%llx\n",
 			&size, max_size);
 		return -EINVAL;
@@ -362,15 +363,14 @@
 
 	div64_u64_rem(size, ob->window_size, &remainder);
 	if (remainder) {
-		dev_err(pcie->dev,
+		dev_err(dev,
 			"res size %pap needs to be multiple of window size %pap\n",
 			&size, &ob->window_size);
 		return -EINVAL;
 	}
 
 	if (axi_addr < ob->axi_offset) {
-		dev_err(pcie->dev,
-			"axi address %pap less than offset %pap\n",
+		dev_err(dev, "axi address %pap less than offset %pap\n",
 			&axi_addr, &ob->axi_offset);
 		return -EINVAL;
 	}
@@ -406,6 +406,7 @@
 static int iproc_pcie_map_ranges(struct iproc_pcie *pcie,
 				 struct list_head *resources)
 {
+	struct device *dev = pcie->dev;
 	struct resource_entry *window;
 	int ret;
 
@@ -425,7 +426,7 @@
 				return ret;
 			break;
 		default:
-			dev_err(pcie->dev, "invalid resource %pR\n", res);
+			dev_err(dev, "invalid resource %pR\n", res);
 			return -EINVAL;
 		}
 	}
@@ -455,26 +456,25 @@
 
 int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 {
+	struct device *dev;
 	int ret;
 	void *sysdata;
 	struct pci_bus *bus;
 
-	if (!pcie || !pcie->dev || !pcie->base)
-		return -EINVAL;
-
-	ret = devm_request_pci_bus_resources(pcie->dev, res);
+	dev = pcie->dev;
+	ret = devm_request_pci_bus_resources(dev, res);
 	if (ret)
 		return ret;
 
 	ret = phy_init(pcie->phy);
 	if (ret) {
-		dev_err(pcie->dev, "unable to initialize PCIe PHY\n");
+		dev_err(dev, "unable to initialize PCIe PHY\n");
 		return ret;
 	}
 
 	ret = phy_power_on(pcie->phy);
 	if (ret) {
-		dev_err(pcie->dev, "unable to power on PCIe PHY\n");
+		dev_err(dev, "unable to power on PCIe PHY\n");
 		goto err_exit_phy;
 	}
 
@@ -486,7 +486,7 @@
 		pcie->reg_offsets = iproc_pcie_reg_paxc;
 		break;
 	default:
-		dev_err(pcie->dev, "incompatible iProc PCIe interface\n");
+		dev_err(dev, "incompatible iProc PCIe interface\n");
 		ret = -EINVAL;
 		goto err_power_off_phy;
 	}
@@ -496,7 +496,7 @@
 	if (pcie->need_ob_cfg) {
 		ret = iproc_pcie_map_ranges(pcie, res);
 		if (ret) {
-			dev_err(pcie->dev, "map failed\n");
+			dev_err(dev, "map failed\n");
 			goto err_power_off_phy;
 		}
 	}
@@ -508,9 +508,9 @@
 	sysdata = pcie;
 #endif
 
-	bus = pci_create_root_bus(pcie->dev, 0, &iproc_pcie_ops, sysdata, res);
+	bus = pci_create_root_bus(dev, 0, &iproc_pcie_ops, sysdata, res);
 	if (!bus) {
-		dev_err(pcie->dev, "unable to create PCI root bus\n");
+		dev_err(dev, "unable to create PCI root bus\n");
 		ret = -ENOMEM;
 		goto err_power_off_phy;
 	}
@@ -518,7 +518,7 @@
 
 	ret = iproc_pcie_check_link(pcie, bus);
 	if (ret) {
-		dev_err(pcie->dev, "no PCIe EP device detected\n");
+		dev_err(dev, "no PCIe EP device detected\n");
 		goto err_rm_root_bus;
 	}
 
@@ -526,7 +526,7 @@
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		if (iproc_pcie_msi_enable(pcie))
-			dev_info(pcie->dev, "not using iProc MSI\n");
+			dev_info(dev, "not using iProc MSI\n");
 
 	pci_scan_child_bus(bus);
 	pci_assign_unassigned_bus_resources(bus);
diff --git a/drivers/pci/host/pcie-qcom.c b/drivers/pci/host/pcie-qcom.c
index 5ec2d44..ef0a84c 100644
--- a/drivers/pci/host/pcie-qcom.c
+++ b/drivers/pci/host/pcie-qcom.c
@@ -86,12 +86,10 @@
 };
 
 struct qcom_pcie {
-	struct pcie_port pp;
-	struct device *dev;
+	struct pcie_port pp;			/* pp.dbi_base is DT dbi */
+	void __iomem *parf;			/* DT parf */
+	void __iomem *elbi;			/* DT elbi */
 	union qcom_pcie_resources res;
-	void __iomem *parf;
-	void __iomem *dbi;
-	void __iomem *elbi;
 	struct phy *phy;
 	struct gpio_desc *reset;
 	struct qcom_pcie_ops *ops;
@@ -136,7 +134,7 @@
 static int qcom_pcie_get_resources_v0(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v0 *res = &pcie->res.v0;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 
 	res->vdda = devm_regulator_get(dev, "vdda");
 	if (IS_ERR(res->vdda))
@@ -188,7 +186,7 @@
 static int qcom_pcie_get_resources_v1(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v1 *res = &pcie->res.v1;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 
 	res->vdda = devm_regulator_get(dev, "vdda");
 	if (IS_ERR(res->vdda))
@@ -237,7 +235,7 @@
 static int qcom_pcie_init_v0(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v0 *res = &pcie->res.v0;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 	u32 val;
 	int ret;
 
@@ -359,7 +357,7 @@
 static int qcom_pcie_init_v1(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v1 *res = &pcie->res.v1;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 	int ret;
 
 	ret = reset_control_deassert(res->core);
@@ -426,7 +424,7 @@
 static int qcom_pcie_link_up(struct pcie_port *pp)
 {
 	struct qcom_pcie *pcie = to_qcom_pcie(pp);
-	u16 val = readw(pcie->dbi + PCIE20_CAP + PCI_EXP_LNKSTA);
+	u16 val = readw(pcie->pp.dbi_base + PCIE20_CAP + PCI_EXP_LNKSTA);
 
 	return !!(val & PCI_EXP_LNKSTA_DLLLA);
 }
@@ -509,8 +507,8 @@
 	if (!pcie)
 		return -ENOMEM;
 
+	pp = &pcie->pp;
 	pcie->ops = (struct qcom_pcie_ops *)of_device_get_match_data(dev);
-	pcie->dev = dev;
 
 	pcie->reset = devm_gpiod_get_optional(dev, "perst", GPIOD_OUT_LOW);
 	if (IS_ERR(pcie->reset))
@@ -522,9 +520,9 @@
 		return PTR_ERR(pcie->parf);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
-	pcie->dbi = devm_ioremap_resource(dev, res);
-	if (IS_ERR(pcie->dbi))
-		return PTR_ERR(pcie->dbi);
+	pp->dbi_base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pp->dbi_base))
+		return PTR_ERR(pp->dbi_base);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
 	pcie->elbi = devm_ioremap_resource(dev, res);
@@ -539,9 +537,7 @@
 	if (ret)
 		return ret;
 
-	pp = &pcie->pp;
 	pp->dev = dev;
-	pp->dbi_base = pcie->dbi;
 	pp->root_bus_nr = -1;
 	pp->ops = &qcom_pcie_dw_ops;
 
@@ -569,8 +565,6 @@
 		return ret;
 	}
 
-	platform_set_drvdata(pdev, pcie);
-
 	return 0;
 }
 
diff --git a/drivers/pci/host/pcie-rcar.c b/drivers/pci/host/pcie-rcar.c
index e06b1d3..62700d1 100644
--- a/drivers/pci/host/pcie-rcar.c
+++ b/drivers/pci/host/pcie-rcar.c
@@ -31,8 +31,6 @@
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
 
-#define DRV_NAME "rcar-pcie"
-
 #define PCIECAR			0x000010
 #define PCIECCTLR		0x000018
 #define  CONFIG_SEND_ENABLE	(1 << 31)
@@ -397,6 +395,7 @@
 
 static void rcar_pcie_force_speedup(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	unsigned int timeout = 1000;
 	u32 macsr;
 
@@ -404,7 +403,7 @@
 		return;
 
 	if (rcar_pci_read_reg(pcie, MACCTLR) & SPEED_CHANGE) {
-		dev_err(pcie->dev, "Speed change already in progress\n");
+		dev_err(dev, "Speed change already in progress\n");
 		return;
 	}
 
@@ -433,7 +432,7 @@
 			rcar_pci_write_reg(pcie, macsr, MACSR);
 
 			if (macsr & SPCHGFAIL)
-				dev_err(pcie->dev, "Speed change failed\n");
+				dev_err(dev, "Speed change failed\n");
 
 			goto done;
 		}
@@ -441,15 +440,16 @@
 		msleep(1);
 	};
 
-	dev_err(pcie->dev, "Speed change timed out\n");
+	dev_err(dev, "Speed change timed out\n");
 
 done:
-	dev_info(pcie->dev, "Current link speed is %s GT/s\n",
+	dev_info(dev, "Current link speed is %s GT/s\n",
 		 (macsr & LINK_SPEED) == LINK_SPEED_5_0GTS ? "5" : "2.5");
 }
 
 static int rcar_pcie_enable(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct pci_bus *bus, *child;
 	LIST_HEAD(res);
 
@@ -461,14 +461,14 @@
 	pci_add_flags(PCI_REASSIGN_ALL_RSRC | PCI_REASSIGN_ALL_BUS);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
-		bus = pci_scan_root_bus_msi(pcie->dev, pcie->root_bus_nr,
+		bus = pci_scan_root_bus_msi(dev, pcie->root_bus_nr,
 				&rcar_pcie_ops, pcie, &res, &pcie->msi.chip);
 	else
-		bus = pci_scan_root_bus(pcie->dev, pcie->root_bus_nr,
+		bus = pci_scan_root_bus(dev, pcie->root_bus_nr,
 				&rcar_pcie_ops, pcie, &res);
 
 	if (!bus) {
-		dev_err(pcie->dev, "Scanning rootbus failed");
+		dev_err(dev, "Scanning rootbus failed");
 		return -ENODEV;
 	}
 
@@ -487,6 +487,7 @@
 
 static int phy_wait_for_ack(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	unsigned int timeout = 100;
 
 	while (timeout--) {
@@ -496,7 +497,7 @@
 		udelay(100);
 	}
 
-	dev_err(pcie->dev, "Access to PCIe phy timed out\n");
+	dev_err(dev, "Access to PCIe phy timed out\n");
 
 	return -ETIMEDOUT;
 }
@@ -697,6 +698,7 @@
 {
 	struct rcar_pcie *pcie = data;
 	struct rcar_msi *msi = &pcie->msi;
+	struct device *dev = pcie->dev;
 	unsigned long reg;
 
 	reg = rcar_pci_read_reg(pcie, PCIEMSIFR);
@@ -717,10 +719,10 @@
 			if (test_bit(index, msi->used))
 				generic_handle_irq(irq);
 			else
-				dev_info(pcie->dev, "unhandled MSI\n");
+				dev_info(dev, "unhandled MSI\n");
 		} else {
 			/* Unknown MSI, just clear it */
-			dev_dbg(pcie->dev, "unexpected MSI\n");
+			dev_dbg(dev, "unexpected MSI\n");
 		}
 
 		/* see if there's any more pending in this vector */
@@ -843,22 +845,22 @@
 
 static int rcar_pcie_enable_msi(struct rcar_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
 	struct rcar_msi *msi = &pcie->msi;
 	unsigned long base;
 	int err, i;
 
 	mutex_init(&msi->lock);
 
-	msi->chip.dev = pcie->dev;
+	msi->chip.dev = dev;
 	msi->chip.setup_irq = rcar_msi_setup_irq;
 	msi->chip.setup_irqs = rcar_msi_setup_irqs;
 	msi->chip.teardown_irq = rcar_msi_teardown_irq;
 
-	msi->domain = irq_domain_add_linear(pcie->dev->of_node, INT_PCI_MSI_NR,
+	msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR,
 					    &msi_domain_ops, &msi->chip);
 	if (!msi->domain) {
-		dev_err(&pdev->dev, "failed to create IRQ domain\n");
+		dev_err(dev, "failed to create IRQ domain\n");
 		return -ENOMEM;
 	}
 
@@ -866,19 +868,19 @@
 		irq_create_mapping(msi->domain, i);
 
 	/* Two irqs are for MSI, but they are also used for non-MSI irqs */
-	err = devm_request_irq(&pdev->dev, msi->irq1, rcar_pcie_msi_irq,
+	err = devm_request_irq(dev, msi->irq1, rcar_pcie_msi_irq,
 			       IRQF_SHARED | IRQF_NO_THREAD,
 			       rcar_msi_irq_chip.name, pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request IRQ: %d\n", err);
+		dev_err(dev, "failed to request IRQ: %d\n", err);
 		goto err;
 	}
 
-	err = devm_request_irq(&pdev->dev, msi->irq2, rcar_pcie_msi_irq,
+	err = devm_request_irq(dev, msi->irq2, rcar_pcie_msi_irq,
 			       IRQF_SHARED | IRQF_NO_THREAD,
 			       rcar_msi_irq_chip.name, pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request IRQ: %d\n", err);
+		dev_err(dev, "failed to request IRQ: %d\n", err);
 		goto err;
 	}
 
@@ -899,32 +901,32 @@
 	return err;
 }
 
-static int rcar_pcie_get_resources(struct platform_device *pdev,
-				   struct rcar_pcie *pcie)
+static int rcar_pcie_get_resources(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct resource res;
 	int err, i;
 
-	err = of_address_to_resource(pdev->dev.of_node, 0, &res);
+	err = of_address_to_resource(dev->of_node, 0, &res);
 	if (err)
 		return err;
 
-	pcie->base = devm_ioremap_resource(&pdev->dev, &res);
+	pcie->base = devm_ioremap_resource(dev, &res);
 	if (IS_ERR(pcie->base))
 		return PTR_ERR(pcie->base);
 
-	pcie->clk = devm_clk_get(&pdev->dev, "pcie");
+	pcie->clk = devm_clk_get(dev, "pcie");
 	if (IS_ERR(pcie->clk)) {
-		dev_err(pcie->dev, "cannot get platform clock\n");
+		dev_err(dev, "cannot get platform clock\n");
 		return PTR_ERR(pcie->clk);
 	}
 	err = clk_prepare_enable(pcie->clk);
 	if (err)
 		return err;
 
-	pcie->bus_clk = devm_clk_get(&pdev->dev, "pcie_bus");
+	pcie->bus_clk = devm_clk_get(dev, "pcie_bus");
 	if (IS_ERR(pcie->bus_clk)) {
-		dev_err(pcie->dev, "cannot get pcie bus clock\n");
+		dev_err(dev, "cannot get pcie bus clock\n");
 		err = PTR_ERR(pcie->bus_clk);
 		goto fail_clk;
 	}
@@ -932,17 +934,17 @@
 	if (err)
 		goto fail_clk;
 
-	i = irq_of_parse_and_map(pdev->dev.of_node, 0);
+	i = irq_of_parse_and_map(dev->of_node, 0);
 	if (!i) {
-		dev_err(pcie->dev, "cannot get platform resources for msi interrupt\n");
+		dev_err(dev, "cannot get platform resources for msi interrupt\n");
 		err = -ENOENT;
 		goto err_map_reg;
 	}
 	pcie->msi.irq1 = i;
 
-	i = irq_of_parse_and_map(pdev->dev.of_node, 1);
+	i = irq_of_parse_and_map(dev->of_node, 1);
 	if (!i) {
-		dev_err(pcie->dev, "cannot get platform resources for msi interrupt\n");
+		dev_err(dev, "cannot get platform resources for msi interrupt\n");
 		err = -ENOENT;
 		goto err_map_reg;
 	}
@@ -1119,60 +1121,60 @@
 
 static int rcar_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct rcar_pcie *pcie;
 	unsigned int data;
 	const struct of_device_id *of_id;
 	int err;
 	int (*hw_init_fn)(struct rcar_pcie *);
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &pdev->dev;
-	platform_set_drvdata(pdev, pcie);
+	pcie->dev = dev;
 
 	INIT_LIST_HEAD(&pcie->resources);
 
 	rcar_pcie_parse_request_of_pci_ranges(pcie);
 
-	err = rcar_pcie_get_resources(pdev, pcie);
+	err = rcar_pcie_get_resources(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request resources: %d\n", err);
+		dev_err(dev, "failed to request resources: %d\n", err);
 		return err;
 	}
 
-	err = rcar_pcie_parse_map_dma_ranges(pcie, pdev->dev.of_node);
+	err = rcar_pcie_parse_map_dma_ranges(pcie, dev->of_node);
 	if (err)
 		return err;
 
-	of_id = of_match_device(rcar_pcie_of_match, pcie->dev);
+	of_id = of_match_device(rcar_pcie_of_match, dev);
 	if (!of_id || !of_id->data)
 		return -EINVAL;
 	hw_init_fn = of_id->data;
 
-	pm_runtime_enable(pcie->dev);
-	err = pm_runtime_get_sync(pcie->dev);
+	pm_runtime_enable(dev);
+	err = pm_runtime_get_sync(dev);
 	if (err < 0) {
-		dev_err(pcie->dev, "pm_runtime_get_sync failed\n");
+		dev_err(dev, "pm_runtime_get_sync failed\n");
 		goto err_pm_disable;
 	}
 
 	/* Failure to get a link might just be that no cards are inserted */
 	err = hw_init_fn(pcie);
 	if (err) {
-		dev_info(&pdev->dev, "PCIe link down\n");
+		dev_info(dev, "PCIe link down\n");
 		err = 0;
 		goto err_pm_put;
 	}
 
 	data = rcar_pci_read_reg(pcie, MACSR);
-	dev_info(&pdev->dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f);
+	dev_info(dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		err = rcar_pcie_enable_msi(pcie);
 		if (err < 0) {
-			dev_err(&pdev->dev,
+			dev_err(dev,
 				"failed to enable MSI support: %d\n",
 				err);
 			goto err_pm_put;
@@ -1186,16 +1188,16 @@
 	return 0;
 
 err_pm_put:
-	pm_runtime_put(pcie->dev);
+	pm_runtime_put(dev);
 
 err_pm_disable:
-	pm_runtime_disable(pcie->dev);
+	pm_runtime_disable(dev);
 	return err;
 }
 
 static struct platform_driver rcar_pcie_driver = {
 	.driver = {
-		.name = DRV_NAME,
+		.name = "rcar-pcie",
 		.of_match_table = rcar_pcie_of_match,
 		.suppress_bind_attrs = true,
 	},
diff --git a/drivers/pci/host/pcie-rockchip.c b/drivers/pci/host/pcie-rockchip.c
index b8c82fc..e0b22da 100644
--- a/drivers/pci/host/pcie-rockchip.c
+++ b/drivers/pci/host/pcie-rockchip.c
@@ -972,7 +972,7 @@
 		return -EINVAL;
 	if (region_no == 0) {
 		if (AXI_REGION_0_SIZE < (2ULL << num_pass_bits))
-		return -EINVAL;
+			return -EINVAL;
 	}
 	if (region_no != 0) {
 		if (AXI_REGION_SIZE < (2ULL << num_pass_bits))
@@ -1091,8 +1091,6 @@
 	if (err)
 		goto err_vpcie;
 
-	platform_set_drvdata(pdev, rockchip);
-
 	rockchip_pcie_enable_interrupts(rockchip);
 
 	err = rockchip_pcie_init_irq_domain(rockchip);
diff --git a/drivers/pci/host/pcie-spear13xx.c b/drivers/pci/host/pcie-spear13xx.c
index 09aed85..3cf197b 100644
--- a/drivers/pci/host/pcie-spear13xx.c
+++ b/drivers/pci/host/pcie-spear13xx.c
@@ -25,10 +25,10 @@
 #include "pcie-designware.h"
 
 struct spear13xx_pcie {
+	struct pcie_port	pp;		/* DT dbi is pp.dbi_base */
 	void __iomem		*app_base;
 	struct phy		*phy;
 	struct clk		*clk;
-	struct pcie_port	pp;
 	bool			is_gen1;
 };
 
@@ -57,96 +57,26 @@
 };
 
 /* CR0 ID */
-#define RX_LANE_FLIP_EN_ID			0
-#define TX_LANE_FLIP_EN_ID			1
-#define SYS_AUX_PWR_DET_ID			2
 #define APP_LTSSM_ENABLE_ID			3
-#define SYS_ATTEN_BUTTON_PRESSED_ID		4
-#define SYS_MRL_SENSOR_STATE_ID			5
-#define SYS_PWR_FAULT_DET_ID			6
-#define SYS_MRL_SENSOR_CHGED_ID			7
-#define SYS_PRE_DET_CHGED_ID			8
-#define SYS_CMD_CPLED_INT_ID			9
-#define APP_INIT_RST_0_ID			11
-#define APP_REQ_ENTR_L1_ID			12
-#define APP_READY_ENTR_L23_ID			13
-#define APP_REQ_EXIT_L1_ID			14
-#define DEVICE_TYPE_EP				(0 << 25)
-#define DEVICE_TYPE_LEP				(1 << 25)
 #define DEVICE_TYPE_RC				(4 << 25)
-#define SYS_INT_ID				29
 #define MISCTRL_EN_ID				30
 #define REG_TRANSLATION_ENABLE			31
 
-/* CR1 ID */
-#define APPS_PM_XMT_TURNOFF_ID			2
-#define APPS_PM_XMT_PME_ID			5
-
 /* CR3 ID */
-#define XMLH_LTSSM_STATE_DETECT_QUIET		0x00
-#define XMLH_LTSSM_STATE_DETECT_ACT		0x01
-#define XMLH_LTSSM_STATE_POLL_ACTIVE		0x02
-#define XMLH_LTSSM_STATE_POLL_COMPLIANCE	0x03
-#define XMLH_LTSSM_STATE_POLL_CONFIG		0x04
-#define XMLH_LTSSM_STATE_PRE_DETECT_QUIET	0x05
-#define XMLH_LTSSM_STATE_DETECT_WAIT		0x06
-#define XMLH_LTSSM_STATE_CFG_LINKWD_START	0x07
-#define XMLH_LTSSM_STATE_CFG_LINKWD_ACEPT	0x08
-#define XMLH_LTSSM_STATE_CFG_LANENUM_WAIT	0x09
-#define XMLH_LTSSM_STATE_CFG_LANENUM_ACEPT	0x0A
-#define XMLH_LTSSM_STATE_CFG_COMPLETE		0x0B
-#define XMLH_LTSSM_STATE_CFG_IDLE		0x0C
-#define XMLH_LTSSM_STATE_RCVRY_LOCK		0x0D
-#define XMLH_LTSSM_STATE_RCVRY_SPEED		0x0E
-#define XMLH_LTSSM_STATE_RCVRY_RCVRCFG		0x0F
-#define XMLH_LTSSM_STATE_RCVRY_IDLE		0x10
-#define XMLH_LTSSM_STATE_L0			0x11
-#define XMLH_LTSSM_STATE_L0S			0x12
-#define XMLH_LTSSM_STATE_L123_SEND_EIDLE	0x13
-#define XMLH_LTSSM_STATE_L1_IDLE		0x14
-#define XMLH_LTSSM_STATE_L2_IDLE		0x15
-#define XMLH_LTSSM_STATE_L2_WAKE		0x16
-#define XMLH_LTSSM_STATE_DISABLED_ENTRY		0x17
-#define XMLH_LTSSM_STATE_DISABLED_IDLE		0x18
-#define XMLH_LTSSM_STATE_DISABLED		0x19
-#define XMLH_LTSSM_STATE_LPBK_ENTRY		0x1A
-#define XMLH_LTSSM_STATE_LPBK_ACTIVE		0x1B
-#define XMLH_LTSSM_STATE_LPBK_EXIT		0x1C
-#define XMLH_LTSSM_STATE_LPBK_EXIT_TIMEOUT	0x1D
-#define XMLH_LTSSM_STATE_HOT_RESET_ENTRY	0x1E
-#define XMLH_LTSSM_STATE_HOT_RESET		0x1F
-#define XMLH_LTSSM_STATE_MASK			0x3F
 #define XMLH_LINK_UP				(1 << 6)
 
-/* CR4 ID */
-#define CFG_MSI_EN_ID				18
-
 /* CR6 */
-#define INTA_CTRL_INT				(1 << 7)
-#define INTB_CTRL_INT				(1 << 8)
-#define INTC_CTRL_INT				(1 << 9)
-#define INTD_CTRL_INT				(1 << 10)
 #define MSI_CTRL_INT				(1 << 26)
 
-/* CR19 ID */
-#define VEN_MSI_REQ_ID				11
-#define VEN_MSI_FUN_NUM_ID			8
-#define VEN_MSI_TC_ID				5
-#define VEN_MSI_VECTOR_ID			0
-#define VEN_MSI_REQ_EN		((u32)0x1 << VEN_MSI_REQ_ID)
-#define VEN_MSI_FUN_NUM_MASK	((u32)0x7 << VEN_MSI_FUN_NUM_ID)
-#define VEN_MSI_TC_MASK		((u32)0x7 << VEN_MSI_TC_ID)
-#define VEN_MSI_VECTOR_MASK	((u32)0x1F << VEN_MSI_VECTOR_ID)
-
 #define EXP_CAP_ID_OFFSET			0x70
 
 #define to_spear13xx_pcie(x)	container_of(x, struct spear13xx_pcie, pp)
 
-static int spear13xx_pcie_establish_link(struct pcie_port *pp)
+static int spear13xx_pcie_establish_link(struct spear13xx_pcie *spear13xx_pcie)
 {
-	u32 val;
-	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+	struct pcie_port *pp = &spear13xx_pcie->pp;
 	struct pcie_app_reg *app_reg = spear13xx_pcie->app_base;
+	u32 val;
 	u32 exp_cap_off = EXP_CAP_ID_OFFSET;
 
 	if (dw_pcie_link_up(pp)) {
@@ -203,9 +133,9 @@
 
 static irqreturn_t spear13xx_pcie_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
-	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+	struct spear13xx_pcie *spear13xx_pcie = arg;
 	struct pcie_app_reg *app_reg = spear13xx_pcie->app_base;
+	struct pcie_port *pp = &spear13xx_pcie->pp;
 	unsigned int status;
 
 	status = readl(&app_reg->int_sts);
@@ -220,9 +150,9 @@
 	return IRQ_HANDLED;
 }
 
-static void spear13xx_pcie_enable_interrupts(struct pcie_port *pp)
+static void spear13xx_pcie_enable_interrupts(struct spear13xx_pcie *spear13xx_pcie)
 {
-	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+	struct pcie_port *pp = &spear13xx_pcie->pp;
 	struct pcie_app_reg *app_reg = spear13xx_pcie->app_base;
 
 	/* Enable MSI interrupt */
@@ -246,8 +176,10 @@
 
 static void spear13xx_pcie_host_init(struct pcie_port *pp)
 {
-	spear13xx_pcie_establish_link(pp);
-	spear13xx_pcie_enable_interrupts(pp);
+	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+
+	spear13xx_pcie_establish_link(spear13xx_pcie);
+	spear13xx_pcie_enable_interrupts(spear13xx_pcie);
 }
 
 static struct pcie_host_ops spear13xx_pcie_host_ops = {
@@ -255,10 +187,11 @@
 	.host_init = spear13xx_pcie_host_init,
 };
 
-static int spear13xx_add_pcie_port(struct pcie_port *pp,
-					 struct platform_device *pdev)
+static int spear13xx_add_pcie_port(struct spear13xx_pcie *spear13xx_pcie,
+				   struct platform_device *pdev)
 {
-	struct device *dev = &pdev->dev;
+	struct pcie_port *pp = &spear13xx_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	pp->irq = platform_get_irq(pdev, 0);
@@ -268,7 +201,7 @@
 	}
 	ret = devm_request_irq(dev, pp->irq, spear13xx_pcie_irq_handler,
 			       IRQF_SHARED | IRQF_NO_THREAD,
-			       "spear1340-pcie", pp);
+			       "spear1340-pcie", spear13xx_pcie);
 	if (ret) {
 		dev_err(dev, "failed to request irq %d\n", pp->irq);
 		return ret;
@@ -288,10 +221,10 @@
 
 static int spear13xx_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct spear13xx_pcie *spear13xx_pcie;
 	struct pcie_port *pp;
-	struct device *dev = &pdev->dev;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct resource *dbi_base;
 	int ret;
 
@@ -323,7 +256,6 @@
 	}
 
 	pp = &spear13xx_pcie->pp;
-
 	pp->dev = dev;
 
 	dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
@@ -338,7 +270,7 @@
 	if (of_property_read_bool(np, "st,pcie-is-gen1"))
 		spear13xx_pcie->is_gen1 = true;
 
-	ret = spear13xx_add_pcie_port(pp, pdev);
+	ret = spear13xx_add_pcie_port(spear13xx_pcie, pdev);
 	if (ret < 0)
 		goto fail_clk;
 
diff --git a/drivers/pci/host/pcie-xilinx-nwl.c b/drivers/pci/host/pcie-xilinx-nwl.c
index 67eae41..43eaa4a 100644
--- a/drivers/pci/host/pcie-xilinx-nwl.c
+++ b/drivers/pci/host/pcie-xilinx-nwl.c
@@ -212,6 +212,7 @@
 
 static int nwl_wait_for_link(struct nwl_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int retries;
 
 	/* check if the link is up or not */
@@ -221,7 +222,7 @@
 		usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
 	}
 
-	dev_err(pcie->dev, "PHY link never came up\n");
+	dev_err(dev, "PHY link never came up\n");
 	return -ETIMEDOUT;
 }
 
@@ -277,6 +278,7 @@
 static irqreturn_t nwl_pcie_misc_handler(int irq, void *data)
 {
 	struct nwl_pcie *pcie = data;
+	struct device *dev = pcie->dev;
 	u32 misc_stat;
 
 	/* Checking for misc interrupts */
@@ -286,45 +288,43 @@
 		return IRQ_NONE;
 
 	if (misc_stat & MSGF_MISC_SR_RXMSG_OVER)
-		dev_err(pcie->dev, "Received Message FIFO Overflow\n");
+		dev_err(dev, "Received Message FIFO Overflow\n");
 
 	if (misc_stat & MSGF_MISC_SR_SLAVE_ERR)
-		dev_err(pcie->dev, "Slave error\n");
+		dev_err(dev, "Slave error\n");
 
 	if (misc_stat & MSGF_MISC_SR_MASTER_ERR)
-		dev_err(pcie->dev, "Master error\n");
+		dev_err(dev, "Master error\n");
 
 	if (misc_stat & MSGF_MISC_SR_I_ADDR_ERR)
-		dev_err(pcie->dev,
-			"In Misc Ingress address translation error\n");
+		dev_err(dev, "In Misc Ingress address translation error\n");
 
 	if (misc_stat & MSGF_MISC_SR_E_ADDR_ERR)
-		dev_err(pcie->dev,
-			"In Misc Egress address translation error\n");
+		dev_err(dev, "In Misc Egress address translation error\n");
 
 	if (misc_stat & MSGF_MISC_SR_FATAL_AER)
-		dev_err(pcie->dev, "Fatal Error in AER Capability\n");
+		dev_err(dev, "Fatal Error in AER Capability\n");
 
 	if (misc_stat & MSGF_MISC_SR_NON_FATAL_AER)
-		dev_err(pcie->dev, "Non-Fatal Error in AER Capability\n");
+		dev_err(dev, "Non-Fatal Error in AER Capability\n");
 
 	if (misc_stat & MSGF_MISC_SR_CORR_AER)
-		dev_err(pcie->dev, "Correctable Error in AER Capability\n");
+		dev_err(dev, "Correctable Error in AER Capability\n");
 
 	if (misc_stat & MSGF_MISC_SR_UR_DETECT)
-		dev_err(pcie->dev, "Unsupported request Detected\n");
+		dev_err(dev, "Unsupported request Detected\n");
 
 	if (misc_stat & MSGF_MISC_SR_NON_FATAL_DEV)
-		dev_err(pcie->dev, "Non-Fatal Error Detected\n");
+		dev_err(dev, "Non-Fatal Error Detected\n");
 
 	if (misc_stat & MSGF_MISC_SR_FATAL_DEV)
-		dev_err(pcie->dev, "Fatal Error Detected\n");
+		dev_err(dev, "Fatal Error Detected\n");
 
 	if (misc_stat & MSGF_MSIC_SR_LINK_AUTO_BWIDTH)
-		dev_info(pcie->dev, "Link Autonomous Bandwidth Management Status bit set\n");
+		dev_info(dev, "Link Autonomous Bandwidth Management Status bit set\n");
 
 	if (misc_stat & MSGF_MSIC_SR_LINK_BWIDTH)
-		dev_info(pcie->dev, "Link Bandwidth Management Status bit set\n");
+		dev_info(dev, "Link Bandwidth Management Status bit set\n");
 
 	/* Clear misc interrupt status */
 	nwl_bridge_writel(pcie, misc_stat, MSGF_MISC_STATUS);
@@ -494,20 +494,21 @@
 static int nwl_pcie_init_msi_irq_domain(struct nwl_pcie *pcie)
 {
 #ifdef CONFIG_PCI_MSI
-	struct fwnode_handle *fwnode = of_node_to_fwnode(pcie->dev->of_node);
+	struct device *dev = pcie->dev;
+	struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
 	struct nwl_msi *msi = &pcie->msi;
 
 	msi->dev_domain = irq_domain_add_linear(NULL, INT_PCI_MSI_NR,
 						&dev_msi_domain_ops, pcie);
 	if (!msi->dev_domain) {
-		dev_err(pcie->dev, "failed to create dev IRQ domain\n");
+		dev_err(dev, "failed to create dev IRQ domain\n");
 		return -ENOMEM;
 	}
 	msi->msi_domain = pci_msi_create_irq_domain(fwnode,
 						    &nwl_msi_domain_info,
 						    msi->dev_domain);
 	if (!msi->msi_domain) {
-		dev_err(pcie->dev, "failed to create msi IRQ domain\n");
+		dev_err(dev, "failed to create msi IRQ domain\n");
 		irq_domain_remove(msi->dev_domain);
 		return -ENOMEM;
 	}
@@ -517,12 +518,13 @@
 
 static int nwl_pcie_init_irq_domain(struct nwl_pcie *pcie)
 {
-	struct device_node *node = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *node = dev->of_node;
 	struct device_node *legacy_intc_node;
 
 	legacy_intc_node = of_get_next_child(node, NULL);
 	if (!legacy_intc_node) {
-		dev_err(pcie->dev, "No legacy intc node found\n");
+		dev_err(dev, "No legacy intc node found\n");
 		return -EINVAL;
 	}
 
@@ -532,7 +534,7 @@
 							pcie);
 
 	if (!pcie->legacy_irq_domain) {
-		dev_err(pcie->dev, "failed to create IRQ domain\n");
+		dev_err(dev, "failed to create IRQ domain\n");
 		return -ENOMEM;
 	}
 
@@ -542,7 +544,8 @@
 
 static int nwl_pcie_enable_msi(struct nwl_pcie *pcie, struct pci_bus *bus)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	struct nwl_msi *msi = &pcie->msi;
 	unsigned long base;
 	int ret;
@@ -557,7 +560,7 @@
 	/* Get msi_1 IRQ number */
 	msi->irq_msi1 = platform_get_irq_byname(pdev, "msi1");
 	if (msi->irq_msi1 < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ#%d\n", msi->irq_msi1);
+		dev_err(dev, "failed to get IRQ#%d\n", msi->irq_msi1);
 		ret = -EINVAL;
 		goto err;
 	}
@@ -568,7 +571,7 @@
 	/* Get msi_0 IRQ number */
 	msi->irq_msi0 = platform_get_irq_byname(pdev, "msi0");
 	if (msi->irq_msi0 < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ#%d\n", msi->irq_msi0);
+		dev_err(dev, "failed to get IRQ#%d\n", msi->irq_msi0);
 		ret = -EINVAL;
 		goto err;
 	}
@@ -579,7 +582,7 @@
 	/* Check for msii_present bit */
 	ret = nwl_bridge_readl(pcie, I_MSII_CAPABILITIES) & MSII_PRESENT;
 	if (!ret) {
-		dev_err(pcie->dev, "MSI not present\n");
+		dev_err(dev, "MSI not present\n");
 		ret = -EIO;
 		goto err;
 	}
@@ -628,13 +631,14 @@
 
 static int nwl_pcie_bridge_init(struct nwl_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	u32 breg_val, ecam_val, first_busno = 0;
 	int err;
 
 	breg_val = nwl_bridge_readl(pcie, E_BREG_CAPABILITIES) & BREG_PRESENT;
 	if (!breg_val) {
-		dev_err(pcie->dev, "BREG is not present\n");
+		dev_err(dev, "BREG is not present\n");
 		return breg_val;
 	}
 
@@ -665,7 +669,7 @@
 
 	ecam_val = nwl_bridge_readl(pcie, E_ECAM_CAPABILITIES) & E_ECAM_PRESENT;
 	if (!ecam_val) {
-		dev_err(pcie->dev, "ECAM is not present\n");
+		dev_err(dev, "ECAM is not present\n");
 		return ecam_val;
 	}
 
@@ -692,23 +696,23 @@
 	writel(ecam_val, (pcie->ecam_base + PCI_PRIMARY_BUS));
 
 	if (nwl_pcie_link_up(pcie))
-		dev_info(pcie->dev, "Link is UP\n");
+		dev_info(dev, "Link is UP\n");
 	else
-		dev_info(pcie->dev, "Link is DOWN\n");
+		dev_info(dev, "Link is DOWN\n");
 
 	/* Get misc IRQ number */
 	pcie->irq_misc = platform_get_irq_byname(pdev, "misc");
 	if (pcie->irq_misc < 0) {
-		dev_err(&pdev->dev, "failed to get misc IRQ %d\n",
+		dev_err(dev, "failed to get misc IRQ %d\n",
 			pcie->irq_misc);
 		return -EINVAL;
 	}
 
-	err = devm_request_irq(pcie->dev, pcie->irq_misc,
+	err = devm_request_irq(dev, pcie->irq_misc,
 			       nwl_pcie_misc_handler, IRQF_SHARED,
 			       "nwl_pcie:misc", pcie);
 	if (err) {
-		dev_err(pcie->dev, "fail to register misc IRQ#%d\n",
+		dev_err(dev, "fail to register misc IRQ#%d\n",
 			pcie->irq_misc);
 		return err;
 	}
@@ -744,31 +748,32 @@
 static int nwl_pcie_parse_dt(struct nwl_pcie *pcie,
 			     struct platform_device *pdev)
 {
-	struct device_node *node = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *node = dev->of_node;
 	struct resource *res;
 	const char *type;
 
 	/* Check for device type */
 	type = of_get_property(node, "device_type", NULL);
 	if (!type || strcmp(type, "pci")) {
-		dev_err(pcie->dev, "invalid \"device_type\" %s\n", type);
+		dev_err(dev, "invalid \"device_type\" %s\n", type);
 		return -EINVAL;
 	}
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "breg");
-	pcie->breg_base = devm_ioremap_resource(pcie->dev, res);
+	pcie->breg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->breg_base))
 		return PTR_ERR(pcie->breg_base);
 	pcie->phys_breg_base = res->start;
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "pcireg");
-	pcie->pcireg_base = devm_ioremap_resource(pcie->dev, res);
+	pcie->pcireg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->pcireg_base))
 		return PTR_ERR(pcie->pcireg_base);
 	pcie->phys_pcie_reg_base = res->start;
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
-	pcie->ecam_base = devm_ioremap_resource(pcie->dev, res);
+	pcie->ecam_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->ecam_base))
 		return PTR_ERR(pcie->ecam_base);
 	pcie->phys_ecam_base = res->start;
@@ -776,8 +781,7 @@
 	/* Get intx IRQ number */
 	pcie->irq_intx = platform_get_irq_byname(pdev, "intx");
 	if (pcie->irq_intx < 0) {
-		dev_err(&pdev->dev, "failed to get intx IRQ %d\n",
-			pcie->irq_intx);
+		dev_err(dev, "failed to get intx IRQ %d\n", pcie->irq_intx);
 		return -EINVAL;
 	}
 
@@ -794,7 +798,8 @@
 
 static int nwl_pcie_probe(struct platform_device *pdev)
 {
-	struct device_node *node = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
+	struct device_node *node = dev->of_node;
 	struct nwl_pcie *pcie;
 	struct pci_bus *bus;
 	struct pci_bus *child;
@@ -802,42 +807,42 @@
 	resource_size_t iobase = 0;
 	LIST_HEAD(res);
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &pdev->dev;
+	pcie->dev = dev;
 	pcie->ecam_value = NWL_ECAM_VALUE_DEFAULT;
 
 	err = nwl_pcie_parse_dt(pcie, pdev);
 	if (err) {
-		dev_err(pcie->dev, "Parsing DT failed\n");
+		dev_err(dev, "Parsing DT failed\n");
 		return err;
 	}
 
 	err = nwl_pcie_bridge_init(pcie);
 	if (err) {
-		dev_err(pcie->dev, "HW Initialization failed\n");
+		dev_err(dev, "HW Initialization failed\n");
 		return err;
 	}
 
 	err = of_pci_get_host_bridge_resources(node, 0, 0xff, &res, &iobase);
 	if (err) {
-		dev_err(pcie->dev, "Getting bridge resources failed\n");
+		dev_err(dev, "Getting bridge resources failed\n");
 		return err;
 	}
 
-	err = devm_request_pci_bus_resources(pcie->dev, &res);
+	err = devm_request_pci_bus_resources(dev, &res);
 	if (err)
 		goto error;
 
 	err = nwl_pcie_init_irq_domain(pcie);
 	if (err) {
-		dev_err(pcie->dev, "Failed creating IRQ Domain\n");
+		dev_err(dev, "Failed creating IRQ Domain\n");
 		goto error;
 	}
 
-	bus = pci_create_root_bus(&pdev->dev, pcie->root_busno,
+	bus = pci_create_root_bus(dev, pcie->root_busno,
 				  &nwl_pcie_ops, pcie, &res);
 	if (!bus) {
 		err = -ENOMEM;
@@ -847,8 +852,7 @@
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		err = nwl_pcie_enable_msi(pcie, bus);
 		if (err < 0) {
-			dev_err(&pdev->dev,
-				"failed to enable MSI support: %d\n", err);
+			dev_err(dev, "failed to enable MSI support: %d\n", err);
 			goto error;
 		}
 	}
@@ -857,7 +861,6 @@
 	list_for_each_entry(child, &bus->children, node)
 		pcie_bus_configure_settings(child);
 	pci_bus_add_devices(bus);
-	platform_set_drvdata(pdev, pcie);
 	return 0;
 
 error:
diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c
index be56803..c8616fa 100644
--- a/drivers/pci/host/pcie-xilinx.c
+++ b/drivers/pci/host/pcie-xilinx.c
@@ -140,10 +140,11 @@
  */
 static void xilinx_pcie_clear_err_interrupts(struct xilinx_pcie_port *port)
 {
+	struct device *dev = port->dev;
 	unsigned long val = pcie_read(port, XILINX_PCIE_REG_RPEFR);
 
 	if (val & XILINX_PCIE_RPEFR_ERR_VALID) {
-		dev_dbg(port->dev, "Requester ID %lu\n",
+		dev_dbg(dev, "Requester ID %lu\n",
 			val & XILINX_PCIE_RPEFR_REQ_ID);
 		pcie_write(port, XILINX_PCIE_RPEFR_ALL_MASK,
 			   XILINX_PCIE_REG_RPEFR);
@@ -228,11 +229,10 @@
 
 /**
  * xilinx_pcie_assign_msi - Allocate MSI number
- * @port: PCIe port structure
  *
  * Return: A valid IRQ on success and error value on failure.
  */
-static int xilinx_pcie_assign_msi(struct xilinx_pcie_port *port)
+static int xilinx_pcie_assign_msi(void)
 {
 	int pos;
 
@@ -275,7 +275,7 @@
 	struct msi_msg msg;
 	phys_addr_t msg_addr;
 
-	hwirq = xilinx_pcie_assign_msi(port);
+	hwirq = xilinx_pcie_assign_msi();
 	if (hwirq < 0)
 		return hwirq;
 
@@ -383,6 +383,7 @@
 static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
 {
 	struct xilinx_pcie_port *port = (struct xilinx_pcie_port *)data;
+	struct device *dev = port->dev;
 	u32 val, mask, status, msi_data;
 
 	/* Read interrupt decode and mask registers */
@@ -394,32 +395,32 @@
 		return IRQ_NONE;
 
 	if (status & XILINX_PCIE_INTR_LINK_DOWN)
-		dev_warn(port->dev, "Link Down\n");
+		dev_warn(dev, "Link Down\n");
 
 	if (status & XILINX_PCIE_INTR_ECRC_ERR)
-		dev_warn(port->dev, "ECRC failed\n");
+		dev_warn(dev, "ECRC failed\n");
 
 	if (status & XILINX_PCIE_INTR_STR_ERR)
-		dev_warn(port->dev, "Streaming error\n");
+		dev_warn(dev, "Streaming error\n");
 
 	if (status & XILINX_PCIE_INTR_HOT_RESET)
-		dev_info(port->dev, "Hot reset\n");
+		dev_info(dev, "Hot reset\n");
 
 	if (status & XILINX_PCIE_INTR_CFG_TIMEOUT)
-		dev_warn(port->dev, "ECAM access timeout\n");
+		dev_warn(dev, "ECAM access timeout\n");
 
 	if (status & XILINX_PCIE_INTR_CORRECTABLE) {
-		dev_warn(port->dev, "Correctable error message\n");
+		dev_warn(dev, "Correctable error message\n");
 		xilinx_pcie_clear_err_interrupts(port);
 	}
 
 	if (status & XILINX_PCIE_INTR_NONFATAL) {
-		dev_warn(port->dev, "Non fatal error message\n");
+		dev_warn(dev, "Non fatal error message\n");
 		xilinx_pcie_clear_err_interrupts(port);
 	}
 
 	if (status & XILINX_PCIE_INTR_FATAL) {
-		dev_warn(port->dev, "Fatal error message\n");
+		dev_warn(dev, "Fatal error message\n");
 		xilinx_pcie_clear_err_interrupts(port);
 	}
 
@@ -429,7 +430,7 @@
 
 		/* Check whether interrupt valid */
 		if (!(val & XILINX_PCIE_RPIFR1_INTR_VALID)) {
-			dev_warn(port->dev, "RP Intr FIFO1 read error\n");
+			dev_warn(dev, "RP Intr FIFO1 read error\n");
 			goto error;
 		}
 
@@ -451,7 +452,7 @@
 		val = pcie_read(port, XILINX_PCIE_REG_RPIFR1);
 
 		if (!(val & XILINX_PCIE_RPIFR1_INTR_VALID)) {
-			dev_warn(port->dev, "RP Intr FIFO1 read error\n");
+			dev_warn(dev, "RP Intr FIFO1 read error\n");
 			goto error;
 		}
 
@@ -471,31 +472,31 @@
 	}
 
 	if (status & XILINX_PCIE_INTR_SLV_UNSUPP)
-		dev_warn(port->dev, "Slave unsupported request\n");
+		dev_warn(dev, "Slave unsupported request\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_UNEXP)
-		dev_warn(port->dev, "Slave unexpected completion\n");
+		dev_warn(dev, "Slave unexpected completion\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_COMPL)
-		dev_warn(port->dev, "Slave completion timeout\n");
+		dev_warn(dev, "Slave completion timeout\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_ERRP)
-		dev_warn(port->dev, "Slave Error Poison\n");
+		dev_warn(dev, "Slave Error Poison\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_CMPABT)
-		dev_warn(port->dev, "Slave Completer Abort\n");
+		dev_warn(dev, "Slave Completer Abort\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_ILLBUR)
-		dev_warn(port->dev, "Slave Illegal Burst\n");
+		dev_warn(dev, "Slave Illegal Burst\n");
 
 	if (status & XILINX_PCIE_INTR_MST_DECERR)
-		dev_warn(port->dev, "Master decode error\n");
+		dev_warn(dev, "Master decode error\n");
 
 	if (status & XILINX_PCIE_INTR_MST_SLVERR)
-		dev_warn(port->dev, "Master slave error\n");
+		dev_warn(dev, "Master slave error\n");
 
 	if (status & XILINX_PCIE_INTR_MST_ERRP)
-		dev_warn(port->dev, "Master error poison\n");
+		dev_warn(dev, "Master error poison\n");
 
 error:
 	/* Clear the Interrupt Decode register */
@@ -554,10 +555,12 @@
  */
 static void xilinx_pcie_init_port(struct xilinx_pcie_port *port)
 {
+	struct device *dev = port->dev;
+
 	if (xilinx_pcie_link_is_up(port))
-		dev_info(port->dev, "PCIe Link is UP\n");
+		dev_info(dev, "PCIe Link is UP\n");
 	else
-		dev_info(port->dev, "PCIe Link is DOWN\n");
+		dev_info(dev, "PCIe Link is DOWN\n");
 
 	/* Disable all interrupts */
 	pcie_write(port, ~XILINX_PCIE_IDR_ALL_MASK,
@@ -627,8 +630,8 @@
  */
 static int xilinx_pcie_probe(struct platform_device *pdev)
 {
-	struct xilinx_pcie_port *port;
 	struct device *dev = &pdev->dev;
+	struct xilinx_pcie_port *port;
 	struct pci_bus *bus;
 	int err;
 	resource_size_t iobase = 0;
@@ -668,15 +671,14 @@
 	if (err)
 		goto error;
 
-	bus = pci_create_root_bus(&pdev->dev, 0,
-				  &xilinx_pcie_ops, port, &res);
+	bus = pci_create_root_bus(dev, 0, &xilinx_pcie_ops, port, &res);
 	if (!bus) {
 		err = -ENOMEM;
 		goto error;
 	}
 
 #ifdef CONFIG_PCI_MSI
-	xilinx_pcie_msi_chip.dev = port->dev;
+	xilinx_pcie_msi_chip.dev = dev;
 	bus->msi = &xilinx_pcie_msi_chip;
 #endif
 	pci_scan_child_bus(bus);
@@ -685,8 +687,6 @@
 	pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci);
 #endif
 	pci_bus_add_devices(bus);
-	platform_set_drvdata(pdev, port);
-
 	return 0;
 
 error:
diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index 460fa67..2acdb0d 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -405,7 +405,7 @@
 	kernelmode = 1;
 
 	thz_dev->polling_delay = interval*1000;
-	thermal_zone_device_update(thz_dev);
+	thermal_zone_device_update(thz_dev, THERMAL_EVENT_UNSPECIFIED);
 	pr_notice("kernel mode fan control ON\n");
 }
 
diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c
index 15f1311..28551f5 100644
--- a/drivers/platform/x86/asus-laptop.c
+++ b/drivers/platform/x86/asus-laptop.c
@@ -932,30 +932,19 @@
 }
 static DEVICE_ATTR_RO(infos);
 
-static int parse_arg(const char *buf, unsigned long count, int *val)
-{
-	if (!count)
-		return 0;
-	if (count > 31)
-		return -EINVAL;
-	if (sscanf(buf, "%i", val) != 1)
-		return -EINVAL;
-	return count;
-}
-
 static ssize_t sysfs_acpi_set(struct asus_laptop *asus,
 			      const char *buf, size_t count,
 			      const char *method)
 {
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv <= 0)
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
 		return rv;
 
 	if (write_acpi_int(asus->handle, method, value))
 		return -ENODEV;
-	return rv;
+	return count;
 }
 
 /*
@@ -975,15 +964,17 @@
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0) {
-		if (write_acpi_int(asus->handle, METHOD_LEDD, value)) {
-			pr_warn("LED display write failed\n");
-			return -ENODEV;
-		}
-		asus->ledd_status = (u32) value;
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
+
+	if (write_acpi_int(asus->handle, METHOD_LEDD, value)) {
+		pr_warn("LED display write failed\n");
+		return -ENODEV;
 	}
-	return rv;
+
+	asus->ledd_status = (u32) value;
+	return count;
 }
 static DEVICE_ATTR_RW(ledd);
 
@@ -1148,10 +1139,12 @@
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0)
-		asus_set_display(asus, value);
-	return rv;
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
+
+	asus_set_display(asus, value);
+	return count;
 }
 static DEVICE_ATTR_WO(display);
 
@@ -1190,11 +1183,12 @@
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0)
-		asus_als_switch(asus, value ? 1 : 0);
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
 
-	return rv;
+	asus_als_switch(asus, value ? 1 : 0);
+	return count;
 }
 static DEVICE_ATTR_RW(ls_switch);
 
@@ -1219,14 +1213,15 @@
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0) {
-		value = (0 < value) ? ((15 < value) ? 15 : value) : 0;
-		/* 0 <= value <= 15 */
-		asus_als_level(asus, value);
-	}
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
 
-	return rv;
+	value = (0 < value) ? ((15 < value) ? 15 : value) : 0;
+	/* 0 <= value <= 15 */
+	asus_als_level(asus, value);
+
+	return count;
 }
 static DEVICE_ATTR_RW(ls_level);
 
@@ -1301,14 +1296,14 @@
 	int rv, value;
 	int ret;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv <= 0)
-		return -EINVAL;
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
 	ret = asus_gps_switch(asus, !!value);
 	if (ret)
 		return ret;
 	rfkill_set_sw_state(asus->gps.rfkill, !value);
-	return rv;
+	return count;
 }
 static DEVICE_ATTR_RW(gps);
 
diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c
index adecc1c..26e4cbc 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -27,6 +27,7 @@
 #include <linux/input/sparse-keymap.h>
 #include <linux/fb.h>
 #include <linux/dmi.h>
+#include <linux/i8042.h>
 
 #include "asus-wmi.h"
 
@@ -55,10 +56,34 @@
 
 static struct quirk_entry *quirks;
 
+static bool asus_q500a_i8042_filter(unsigned char data, unsigned char str,
+			      struct serio *port)
+{
+	static bool extended;
+	bool ret = false;
+
+	if (str & I8042_STR_AUXDATA)
+		return false;
+
+	if (unlikely(data == 0xe1)) {
+		extended = true;
+		ret = true;
+	} else if (unlikely(extended)) {
+		extended = false;
+		ret = true;
+	}
+
+	return ret;
+}
+
 static struct quirk_entry quirk_asus_unknown = {
 	.wapf = 0,
 };
 
+static struct quirk_entry quirk_asus_q500a = {
+	.i8042_filter = asus_q500a_i8042_filter,
+};
+
 /*
  * For those machines that need software to control bt/wifi status
  * and can't adjust brightness through ACPI interface
@@ -87,6 +112,10 @@
 	.no_rfkill = true,
 };
 
+static struct quirk_entry quirk_asus_ux303ub = {
+	.wmi_backlight_native = true,
+};
+
 static int dmi_matched(const struct dmi_system_id *dmi)
 {
 	quirks = dmi->driver_data;
@@ -96,6 +125,15 @@
 static const struct dmi_system_id asus_quirks[] = {
 	{
 		.callback = dmi_matched,
+		.ident = "ASUSTeK COMPUTER INC. Q500A",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Q500A"),
+		},
+		.driver_data = &quirk_asus_q500a,
+	},
+	{
+		.callback = dmi_matched,
 		.ident = "ASUSTeK COMPUTER INC. U32U",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
@@ -351,11 +389,22 @@
 		},
 		.driver_data = &quirk_no_rfkill,
 	},
+	{
+		.callback = dmi_matched,
+		.ident = "ASUSTeK COMPUTER INC. UX303UB",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "UX303UB"),
+		},
+		.driver_data = &quirk_asus_ux303ub,
+	},
 	{},
 };
 
 static void asus_nb_wmi_quirks(struct asus_wmi_driver *driver)
 {
+	int ret;
+
 	quirks = &quirk_asus_unknown;
 	dmi_check_system(asus_quirks);
 
@@ -367,6 +416,15 @@
 		quirks->wapf = wapf;
 	else
 		wapf = quirks->wapf;
+
+	if (quirks->i8042_filter) {
+		ret = i8042_install_filter(quirks->i8042_filter);
+		if (ret) {
+			pr_warn("Unable to install key filter\n");
+			return;
+		}
+		pr_info("Using i8042 filter function for receiving events\n");
+	}
 }
 
 static const struct key_entry asus_nb_wmi_keymap[] = {
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 7c093a0..ce6ca31 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -2084,6 +2084,9 @@
 	if (asus->driver->quirks->wmi_backlight_power)
 		acpi_video_set_dmi_backlight_type(acpi_backlight_vendor);
 
+	if (asus->driver->quirks->wmi_backlight_native)
+		acpi_video_set_dmi_backlight_type(acpi_backlight_native);
+
 	if (acpi_video_get_backlight_type() == acpi_backlight_vendor) {
 		err = asus_wmi_backlight_init(asus);
 		if (err && err != -ENODEV)
diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h
index 5de1df5..0e19014 100644
--- a/drivers/platform/x86/asus-wmi.h
+++ b/drivers/platform/x86/asus-wmi.h
@@ -28,6 +28,7 @@
 #define _ASUS_WMI_H_
 
 #include <linux/platform_device.h>
+#include <linux/i8042.h>
 
 #define ASUS_WMI_KEY_IGNORE (-1)
 #define ASUS_WMI_BRN_DOWN	0x20
@@ -43,6 +44,7 @@
 	bool scalar_panel_brightness;
 	bool store_backlight_power;
 	bool wmi_backlight_power;
+	bool wmi_backlight_native;
 	int wapf;
 	/*
 	 * For machines with AMD graphic chips, it will send out WMI event
@@ -51,6 +53,9 @@
 	 * and let the ACPI interrupt to send out the key event.
 	 */
 	int no_display_toggle;
+
+	bool (*i8042_filter)(unsigned char data, unsigned char str,
+			     struct serio *serio);
 };
 
 struct asus_wmi_driver {
diff --git a/drivers/platform/x86/dell-smo8800.c b/drivers/platform/x86/dell-smo8800.c
index 0aec4fd..37e6460 100644
--- a/drivers/platform/x86/dell-smo8800.c
+++ b/drivers/platform/x86/dell-smo8800.c
@@ -24,6 +24,7 @@
 #include <linux/acpi.h>
 #include <linux/interrupt.h>
 #include <linux/miscdevice.h>
+#include <linux/uaccess.h>
 
 struct smo8800_device {
 	u32 irq;                     /* acpi device irq */
diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c
index 520b58a..e8b1b83 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -100,7 +100,7 @@
 	struct dentry *dir, *file;
 
 	dir = debugfs_create_dir("pmc_core", NULL);
-	if (IS_ERR_OR_NULL(dir))
+	if (!dir)
 		return -ENOMEM;
 
 	pmcdev->dbgfs_dir = dir;
diff --git a/drivers/platform/x86/intel_pmc_ipc.c b/drivers/platform/x86/intel_pmc_ipc.c
index a511d51..0bf51d5 100644
--- a/drivers/platform/x86/intel_pmc_ipc.c
+++ b/drivers/platform/x86/intel_pmc_ipc.c
@@ -522,48 +522,36 @@
 static int ipc_create_punit_device(void)
 {
 	struct platform_device *pdev;
-	int ret;
+	const struct platform_device_info pdevinfo = {
+		.parent = ipcdev.dev,
+		.name = PUNIT_DEVICE_NAME,
+		.id = -1,
+		.res = punit_res_array,
+		.num_res = ARRAY_SIZE(punit_res_array),
+		};
 
-	pdev = platform_device_alloc(PUNIT_DEVICE_NAME, -1);
-	if (!pdev) {
-		dev_err(ipcdev.dev, "Failed to alloc punit platform device\n");
-		return -ENOMEM;
-	}
+	pdev = platform_device_register_full(&pdevinfo);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	pdev->dev.parent = ipcdev.dev;
-	ret = platform_device_add_resources(pdev, punit_res_array,
-					    ARRAY_SIZE(punit_res_array));
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add platform punit resources\n");
-		goto err;
-	}
-
-	ret = platform_device_add(pdev);
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add punit platform device\n");
-		goto err;
-	}
 	ipcdev.punit_dev = pdev;
 
 	return 0;
-err:
-	platform_device_put(pdev);
-	return ret;
 }
 
 static int ipc_create_tco_device(void)
 {
 	struct platform_device *pdev;
 	struct resource *res;
-	int ret;
-
-	pdev = platform_device_alloc(TCO_DEVICE_NAME, -1);
-	if (!pdev) {
-		dev_err(ipcdev.dev, "Failed to alloc tco platform device\n");
-		return -ENOMEM;
-	}
-
-	pdev->dev.parent = ipcdev.dev;
+	const struct platform_device_info pdevinfo = {
+		.parent = ipcdev.dev,
+		.name = TCO_DEVICE_NAME,
+		.id = -1,
+		.res = tco_res,
+		.num_res = ARRAY_SIZE(tco_res),
+		.data = &tco_info,
+		.size_data = sizeof(tco_info),
+		};
 
 	res = tco_res + TCO_RESOURCE_ACPI_IO;
 	res->start = ipcdev.acpi_io_base + TCO_BASE_OFFSET;
@@ -577,45 +565,26 @@
 	res->start = ipcdev.gcr_base + TCO_PMC_OFFSET;
 	res->end = res->start + TCO_PMC_SIZE - 1;
 
-	ret = platform_device_add_resources(pdev, tco_res, ARRAY_SIZE(tco_res));
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add tco platform resources\n");
-		goto err;
-	}
+	pdev = platform_device_register_full(&pdevinfo);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	ret = platform_device_add_data(pdev, &tco_info, sizeof(tco_info));
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add tco platform data\n");
-		goto err;
-	}
-
-	ret = platform_device_add(pdev);
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add tco platform device\n");
-		goto err;
-	}
 	ipcdev.tco_dev = pdev;
 
 	return 0;
-err:
-	platform_device_put(pdev);
-	return ret;
 }
 
 static int ipc_create_telemetry_device(void)
 {
 	struct platform_device *pdev;
 	struct resource *res;
-	int ret;
-
-	pdev = platform_device_alloc(TELEMETRY_DEVICE_NAME, -1);
-	if (!pdev) {
-		dev_err(ipcdev.dev,
-			"Failed to allocate telemetry platform device\n");
-		return -ENOMEM;
-	}
-
-	pdev->dev.parent = ipcdev.dev;
+	const struct platform_device_info pdevinfo = {
+		.parent = ipcdev.dev,
+		.name = TELEMETRY_DEVICE_NAME,
+		.id = -1,
+		.res = telemetry_res,
+		.num_res = ARRAY_SIZE(telemetry_res),
+		};
 
 	res = telemetry_res + TELEMETRY_RESOURCE_PUNIT_SSRAM;
 	res->start = ipcdev.telem_punit_ssram_base;
@@ -625,26 +594,13 @@
 	res->start = ipcdev.telem_pmc_ssram_base;
 	res->end = res->start + ipcdev.telem_pmc_ssram_size - 1;
 
-	ret = platform_device_add_resources(pdev, telemetry_res,
-					    ARRAY_SIZE(telemetry_res));
-	if (ret) {
-		dev_err(ipcdev.dev,
-			"Failed to add telemetry platform resources\n");
-		goto err;
-	}
+	pdev = platform_device_register_full(&pdevinfo);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	ret = platform_device_add(pdev);
-	if (ret) {
-		dev_err(ipcdev.dev,
-			"Failed to add telemetry platform device\n");
-		goto err;
-	}
 	ipcdev.telemetry_dev = pdev;
 
 	return 0;
-err:
-	platform_device_put(pdev);
-	return ret;
 }
 
 static int ipc_create_pmc_devices(void)
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index 9d60a40..074bf2f 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -321,10 +321,9 @@
 static acpi_status tci_raw(struct toshiba_acpi_dev *dev,
 			   const u32 in[TCI_WORDS], u32 out[TCI_WORDS])
 {
+	union acpi_object in_objs[TCI_WORDS], out_objs[TCI_WORDS + 1];
 	struct acpi_object_list params;
-	union acpi_object in_objs[TCI_WORDS];
 	struct acpi_buffer results;
-	union acpi_object out_objs[TCI_WORDS + 1];
 	acpi_status status;
 	int i;
 
@@ -387,9 +386,8 @@
 {
 	u32 in[TCI_WORDS] = { SCI_OPEN, 0, 0, 0, 0, 0 };
 	u32 out[TCI_WORDS];
-	acpi_status status;
+	acpi_status status = tci_raw(dev, in, out);
 
-	status = tci_raw(dev, in, out);
 	if  (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to open SCI failed\n");
 		return 0;
@@ -425,9 +423,8 @@
 {
 	u32 in[TCI_WORDS] = { SCI_CLOSE, 0, 0, 0, 0, 0 };
 	u32 out[TCI_WORDS];
-	acpi_status status;
+	acpi_status status = tci_raw(dev, in, out);
 
-	status = tci_raw(dev, in, out);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to close SCI failed\n");
 		return;
@@ -479,10 +476,15 @@
 
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query Illumination support failed\n");
-	else if (out[0] == TOS_SUCCESS)
-		dev->illumination_supported = 1;
+		return;
+	}
+
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	dev->illumination_supported = 1;
 }
 
 static void toshiba_illumination_set(struct led_classdev *cdev,
@@ -509,7 +511,8 @@
 {
 	struct toshiba_acpi_dev *dev = container_of(cdev,
 			struct toshiba_acpi_dev, led_dev);
-	u32 state, result;
+	u32 result;
+	u32 state;
 
 	/* First request : initialize communication. */
 	if (!sci_open(dev))
@@ -546,24 +549,28 @@
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query kbd illumination support failed\n");
-	} else if (out[0] == TOS_SUCCESS) {
-		/*
-		 * Check for keyboard backlight timeout max value,
-		 * previous kbd backlight implementation set this to
-		 * 0x3c0003, and now the new implementation set this
-		 * to 0x3c001a, use this to distinguish between them.
-		 */
-		if (out[3] == SCI_KBD_TIME_MAX)
-			dev->kbd_type = 2;
-		else
-			dev->kbd_type = 1;
-		/* Get the current keyboard backlight mode */
-		dev->kbd_mode = out[2] & SCI_KBD_MODE_MASK;
-		/* Get the current time (1-60 seconds) */
-		dev->kbd_time = out[2] >> HCI_MISC_SHIFT;
-		/* Flag as supported */
-		dev->kbd_illum_supported = 1;
+		return;
 	}
+
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	/*
+	 * Check the keyboard backlight timeout max value: the previous
+	 * kbd backlight implementation set this to 0x3c0003, whereas the
+	 * new implementation sets it to 0x3c001a. Use this to distinguish
+	 * between them.
+	 */
+	if (out[3] == SCI_KBD_TIME_MAX)
+		dev->kbd_type = 2;
+	else
+		dev->kbd_type = 1;
+	/* Get the current keyboard backlight mode */
+	dev->kbd_mode = out[2] & SCI_KBD_MODE_MASK;
+	/* Get the current time (1-60 seconds) */
+	dev->kbd_time = out[2] >> HCI_MISC_SHIFT;
+	/* Flag as supported */
+	dev->kbd_illum_supported = 1;
 }
 
 static int toshiba_kbd_illum_status_set(struct toshiba_acpi_dev *dev, u32 time)
@@ -672,9 +679,9 @@
 /* Eco Mode support */
 static void toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
 {
-	acpi_status status;
 	u32 in[TCI_WORDS] = { HCI_GET, HCI_ECO_MODE, 0, 0, 0, 0 };
 	u32 out[TCI_WORDS];
+	acpi_status status;
 
 	dev->eco_supported = 0;
 	dev->eco_led_registered = false;
@@ -682,7 +689,10 @@
 	status = tci_raw(dev, in, out);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get ECO led failed\n");
-	} else if (out[0] == TOS_INPUT_DATA_ERROR) {
+		return;
+	}
+
+	if (out[0] == TOS_INPUT_DATA_ERROR) {
 		/*
 		 * If we receive 0x8300 (Input Data Error), it means that the
 		 * LED device is present, but that we just screwed the input
@@ -694,10 +704,15 @@
 		 */
 		in[3] = 1;
 		status = tci_raw(dev, in, out);
-		if (ACPI_FAILURE(status))
+		if (ACPI_FAILURE(status)) {
 			pr_err("ACPI call to get ECO led failed\n");
-		else if (out[0] == TOS_SUCCESS)
-			dev->eco_supported = 1;
+			return;
+		}
+
+		if (out[0] != TOS_SUCCESS)
+			return;
+
+		dev->eco_supported = 1;
 	}
 }
 
@@ -714,10 +729,11 @@
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get ECO led failed\n");
 		return LED_OFF;
-	} else if (out[0] != TOS_SUCCESS) {
-		return LED_OFF;
 	}
 
+	if (out[0] != TOS_SUCCESS)
+		return LED_OFF;
+
 	return out[2] ? LED_FULL : LED_OFF;
 }
 
@@ -751,10 +767,15 @@
 	 * this call also serves as initialization
 	 */
 	status = tci_raw(dev, in, out);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query the accelerometer failed\n");
-	else if (out[0] == TOS_SUCCESS)
-		dev->accelerometer_supported = 1;
+		return;
+	}
+
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	dev->accelerometer_supported = 1;
 }
 
 static int toshiba_accelerometer_get(struct toshiba_acpi_dev *dev,
@@ -769,15 +790,18 @@
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query the accelerometer failed\n");
 		return -EIO;
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS) {
-		*xy = out[2];
-		*z = out[4];
-		return 0;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS)
+		return -EIO;
+
+	*xy = out[2];
+	*z = out[4];
+
+	return 0;
 }
 
 /* Sleep (Charge and Music) utilities support */
@@ -797,24 +821,29 @@
 		pr_err("ACPI call to get USB Sleep and Charge mode failed\n");
 		sci_close(dev);
 		return;
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
+	}
+
+	if (out[0] != TOS_SUCCESS) {
 		sci_close(dev);
 		return;
-	} else if (out[0] == TOS_SUCCESS) {
-		dev->usbsc_mode_base = out[4];
 	}
 
+	dev->usbsc_mode_base = out[4];
+
 	in[5] = SCI_USB_CHARGE_BAT_LVL;
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get USB Sleep and Charge mode failed\n");
-	} else if (out[0] == TOS_SUCCESS) {
-		dev->usbsc_bat_level = out[2];
-		/* Flag as supported */
-		dev->usb_sleep_charge_supported = 1;
+		return;
 	}
 
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	dev->usbsc_bat_level = out[2];
+	/* Flag as supported */
+	dev->usb_sleep_charge_supported = 1;
 }
 
 static int toshiba_usb_sleep_charge_get(struct toshiba_acpi_dev *dev,
@@ -868,14 +897,19 @@
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get USB S&C battery level failed\n");
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS) {
-		*mode = out[2];
-		return 0;
+		return -EIO;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS)
+		return -EIO;
+
+	*mode = out[2];
+
+	return 0;
+
 }
 
 static int toshiba_sleep_functions_status_set(struct toshiba_acpi_dev *dev,
@@ -892,9 +926,12 @@
 	in[5] = SCI_USB_CHARGE_BAT_LVL;
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set USB S&C battery level failed\n");
-	else if (out[0] == TOS_NOT_SUPPORTED)
+		return -EIO;
+	}
+
+	if (out[0] == TOS_NOT_SUPPORTED)
 		return -ENODEV;
 
 	return out[0] == TOS_SUCCESS ? 0 : -EIO;
@@ -915,14 +952,18 @@
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get USB Rapid Charge failed\n");
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS || out[0] == TOS_SUCCESS2) {
-		*state = out[2];
-		return 0;
+		return -EIO;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS && out[0] != TOS_SUCCESS2)
+		return -EIO;
+
+	*state = out[2];
+
+	return 0;
 }
 
 static int toshiba_usb_rapid_charge_set(struct toshiba_acpi_dev *dev,
@@ -939,9 +980,12 @@
 	in[5] = SCI_USB_CHARGE_RAPID_DSP;
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set USB Rapid Charge failed\n");
-	else if (out[0] == TOS_NOT_SUPPORTED)
+		return -EIO;
+	}
+
+	if (out[0] == TOS_NOT_SUPPORTED)
 		return -ENODEV;
 
 	return (out[0] == TOS_SUCCESS || out[0] == TOS_SUCCESS2) ? 0 : -EIO;
@@ -1097,14 +1141,18 @@
 	status = tci_raw(dev, in, out);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get System type failed\n");
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS) {
-		*type = out[3];
-		return 0;
+		return -EIO;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS)
+		return -EIO;
+
+	*type = out[3];
+
+	return 0;
 }
 
 /* Wireless status (RFKill, WLAN, BT, WWAN) */
@@ -1154,7 +1202,6 @@
 	 */
 	in[3] = HCI_WIRELESS_WWAN;
 	status = tci_raw(dev, in, out);
-
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get WWAN status failed\n");
 		return;
@@ -1174,7 +1221,6 @@
 
 	in[3] = HCI_WIRELESS_WWAN_STATUS;
 	status = tci_raw(dev, in, out);
-
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set WWAN status failed\n");
 		return -EIO;
@@ -1193,7 +1239,6 @@
 	 */
 	in[3] = HCI_WIRELESS_WWAN_POWER;
 	status = tci_raw(dev, in, out);
-
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set WWAN power failed\n");
 		return -EIO;
@@ -1216,8 +1261,10 @@
 	dev->max_cooling_method = 0;
 
 	status = tci_raw(dev, in, out);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get Cooling Method failed\n");
+		return;
+	}
 
 	if (out[0] != TOS_SUCCESS && out[0] != TOS_SUCCESS2)
 		return;
@@ -1244,7 +1291,7 @@
 	u32 result = hci_write(dev, HCI_COOLING_METHOD, state);
 
 	if (result == TOS_FAILURE)
-		pr_err("ACPI call to get Cooling Method failed\n");
+		pr_err("ACPI call to set Cooling Method failed\n");
 
 	if (result == TOS_NOT_SUPPORTED)
 		return -ENODEV;
@@ -1282,9 +1329,9 @@
 /* LCD Brightness */
 static int __get_lcd_brightness(struct toshiba_acpi_dev *dev)
 {
+	int brightness = 0;
 	u32 result;
 	u32 value;
-	int brightness = 0;
 
 	if (dev->tr_backlight_supported) {
 		int ret = get_tr_backlight_status(dev, &value);
@@ -1301,10 +1348,10 @@
 		pr_err("ACPI call to get LCD Brightness failed\n");
 	else if (result == TOS_NOT_SUPPORTED)
 		return -ENODEV;
-	if (result == TOS_SUCCESS)
-		return brightness + (value >> HCI_LCD_BRIGHTNESS_SHIFT);
 
-	return -EIO;
+	return result == TOS_SUCCESS ?
+			brightness + (value >> HCI_LCD_BRIGHTNESS_SHIFT) :
+			-EIO;
 }
 
 static int get_lcd_brightness(struct backlight_device *bd)
@@ -1325,15 +1372,15 @@
 
 	levels = dev->backlight_dev->props.max_brightness + 1;
 	value = get_lcd_brightness(dev->backlight_dev);
-	if (value >= 0) {
-		seq_printf(m, "brightness:              %d\n", value);
-		seq_printf(m, "brightness_levels:       %d\n", levels);
-		return 0;
+	if (value < 0) {
+		pr_err("Error reading LCD brightness\n");
+		return value;
 	}
 
-	pr_err("Error reading LCD brightness\n");
+	seq_printf(m, "brightness:              %d\n", value);
+	seq_printf(m, "brightness_levels:       %d\n", levels);
 
-	return -EIO;
+	return 0;
 }
 
 static int lcd_proc_open(struct inode *inode, struct file *file)
@@ -1377,7 +1424,7 @@
 	struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file));
 	char cmd[42];
 	size_t len;
-	int levels = dev->backlight_dev->props.max_brightness + 1;
+	int levels;
 	int value;
 
 	len = min(count, sizeof(cmd) - 1);
@@ -1385,6 +1432,7 @@
 		return -EFAULT;
 	cmd[len] = '\0';
 
+	levels = dev->backlight_dev->props.max_brightness + 1;
 	if (sscanf(cmd, " brightness : %i", &value) != 1 &&
 	    value < 0 && value > levels)
 		return -EINVAL;
@@ -1420,20 +1468,21 @@
 static int video_proc_show(struct seq_file *m, void *v)
 {
 	struct toshiba_acpi_dev *dev = m->private;
+	int is_lcd, is_crt, is_tv;
 	u32 value;
 
-	if (!get_video_status(dev, &value)) {
-		int is_lcd = (value & HCI_VIDEO_OUT_LCD) ? 1 : 0;
-		int is_crt = (value & HCI_VIDEO_OUT_CRT) ? 1 : 0;
-		int is_tv = (value & HCI_VIDEO_OUT_TV) ? 1 : 0;
+	if (get_video_status(dev, &value))
+		return -EIO;
 
-		seq_printf(m, "lcd_out:                 %d\n", is_lcd);
-		seq_printf(m, "crt_out:                 %d\n", is_crt);
-		seq_printf(m, "tv_out:                  %d\n", is_tv);
-		return 0;
-	}
+	is_lcd = (value & HCI_VIDEO_OUT_LCD) ? 1 : 0;
+	is_crt = (value & HCI_VIDEO_OUT_CRT) ? 1 : 0;
+	is_tv = (value & HCI_VIDEO_OUT_TV) ? 1 : 0;
 
-	return -EIO;
+	seq_printf(m, "lcd_out:                 %d\n", is_lcd);
+	seq_printf(m, "crt_out:                 %d\n", is_crt);
+	seq_printf(m, "tv_out:                  %d\n", is_tv);
+
+	return 0;
 }
 
 static int video_proc_open(struct inode *inode, struct file *file)
@@ -1447,10 +1496,8 @@
 	struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file));
 	char *buffer;
 	char *cmd;
+	int lcd_out, crt_out, tv_out;
 	int remain = count;
-	int lcd_out = -1;
-	int crt_out = -1;
-	int tv_out = -1;
 	int value;
 	int ret;
 	u32 video_out;
@@ -1486,6 +1533,7 @@
 
 	kfree(cmd);
 
+	lcd_out = crt_out = tv_out = -1;
 	ret = get_video_status(dev, &video_out);
 	if (!ret) {
 		unsigned int new_video_out = video_out;
@@ -1980,8 +2028,8 @@
 				      const char *buf, size_t count)
 {
 	struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
-	u32 mode;
 	int state;
+	u32 mode;
 	int ret;
 
 	ret = kstrtoint(buf, 0, &state);
@@ -2021,9 +2069,8 @@
 					       char *buf)
 {
 	struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
+	int bat_lvl, status;
 	u32 state;
-	int bat_lvl;
-	int status;
 	int ret;
 	int tmp;
 
diff --git a/drivers/platform/x86/toshiba_bluetooth.c b/drivers/platform/x86/toshiba_bluetooth.c
index 5db495dd..be1d137 100644
--- a/drivers/platform/x86/toshiba_bluetooth.c
+++ b/drivers/platform/x86/toshiba_bluetooth.c
@@ -80,7 +80,9 @@
 	if (ACPI_FAILURE(result)) {
 		pr_err("ACPI call to query Bluetooth presence failed\n");
 		return -ENXIO;
-	} else if (!bt_present) {
+	}
+
+	if (!bt_present) {
 		pr_info("Bluetooth device not present\n");
 		return -ENODEV;
 	}
diff --git a/drivers/platform/x86/toshiba_haps.c b/drivers/platform/x86/toshiba_haps.c
index 7f2afc6..b3dec52 100644
--- a/drivers/platform/x86/toshiba_haps.c
+++ b/drivers/platform/x86/toshiba_haps.c
@@ -59,7 +59,7 @@
 		return -EIO;
 	}
 
-	pr_info("HDD protection level set to: %d\n", level);
+	pr_debug("HDD protection level set to: %d\n", level);
 
 	return 0;
 }
@@ -141,7 +141,7 @@
  */
 static void toshiba_haps_notify(struct acpi_device *device, u32 event)
 {
-	pr_info("Received event: 0x%x", event);
+	pr_debug("Received event: 0x%x", event);
 
 	acpi_bus_generate_netlink_event(device->pnp.device_class,
 					dev_name(&device->dev),
@@ -168,9 +168,13 @@
 	 * A non existent device as well as having (only)
 	 * Solid State Drives can cause the call to fail.
 	 */
-	status = acpi_evaluate_integer(handle, "_STA", NULL,
-				       &hdd_present);
-	if (ACPI_FAILURE(status) || !hdd_present) {
+	status = acpi_evaluate_integer(handle, "_STA", NULL, &hdd_present);
+	if (ACPI_FAILURE(status)) {
+		pr_err("ACPI call to query HDD protection failed\n");
+		return 0;
+	}
+
+	if (!hdd_present) {
 		pr_info("HDD protection not available or using SSD\n");
 		return 0;
 	}
diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 80a566a..bf01288 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -262,6 +262,15 @@
 	  To compile this driver as a module, choose M here: the module
 	  will be called pwm-lpss-platform.
 
+config PWM_MESON
+	tristate "Amlogic Meson PWM driver"
+	depends on ARCH_MESON
+	help
+	  The platform driver for Amlogic Meson PWM controller.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called pwm-meson.
+
 config PWM_MTK_DISP
 	tristate "MediaTek display PWM driver"
 	depends on ARCH_MEDIATEK || COMPILE_TEST
diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile
index feef1dd..1194c54 100644
--- a/drivers/pwm/Makefile
+++ b/drivers/pwm/Makefile
@@ -24,6 +24,7 @@
 obj-$(CONFIG_PWM_LPSS)		+= pwm-lpss.o
 obj-$(CONFIG_PWM_LPSS_PCI)	+= pwm-lpss-pci.o
 obj-$(CONFIG_PWM_LPSS_PLATFORM)	+= pwm-lpss-platform.o
+obj-$(CONFIG_PWM_MESON)		+= pwm-meson.o
 obj-$(CONFIG_PWM_MTK_DISP)	+= pwm-mtk-disp.o
 obj-$(CONFIG_PWM_MXS)		+= pwm-mxs.o
 obj-$(CONFIG_PWM_OMAP_DMTIMER)	+= pwm-omap-dmtimer.o
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 0dbd29e..172ef82 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -339,6 +339,8 @@
 	unsigned int i;
 	int ret = 0;
 
+	pwmchip_sysfs_unexport_children(chip);
+
 	mutex_lock(&pwm_lock);
 
 	for (i = 0; i < chip->npwm; i++) {
diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c
index 6510812..01339c1 100644
--- a/drivers/pwm/pwm-berlin.c
+++ b/drivers/pwm/pwm-berlin.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
+#include <linux/slab.h>
 
 #define BERLIN_PWM_EN			0x0
 #define  BERLIN_PWM_ENABLE		BIT(0)
@@ -27,6 +28,13 @@
 #define BERLIN_PWM_TCNT			0xc
 #define  BERLIN_PWM_MAX_TCNT		65535
 
+struct berlin_pwm_channel {
+	u32 enable;	/* enable state, written back last on resume */
+	u32 ctrl;	/* BERLIN_PWM_CONTROL value saved on suspend */
+	u32 duty;	/* BERLIN_PWM_DUTY value saved on suspend */
+	u32 tcnt;	/* BERLIN_PWM_TCNT value saved on suspend */
+};
+
 struct berlin_pwm_chip {
 	struct pwm_chip chip;
 	struct clk *clk;
@@ -55,6 +63,25 @@
 	writel_relaxed(value, chip->base + channel * 0x10 + offset);
 }
 
+static int berlin_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct berlin_pwm_channel *channel;
+
+	channel = kzalloc(sizeof(*channel), GFP_KERNEL);
+	if (!channel)
+		return -ENOMEM;
+
+	return pwm_set_chip_data(pwm, channel);
+}
+
+static void berlin_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct berlin_pwm_channel *channel = pwm_get_chip_data(pwm);
+
+	pwm_set_chip_data(pwm, NULL);
+	kfree(channel);
+}
+
 static int berlin_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm_dev,
 			     int duty_ns, int period_ns)
 {
@@ -137,6 +164,8 @@
 }
 
 static const struct pwm_ops berlin_pwm_ops = {
+	.request = berlin_pwm_request,
+	.free = berlin_pwm_free,
 	.config = berlin_pwm_config,
 	.set_polarity = berlin_pwm_set_polarity,
 	.enable = berlin_pwm_enable,
@@ -204,12 +233,67 @@
 	return ret;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int berlin_pwm_suspend(struct device *dev)
+{
+	struct berlin_pwm_chip *pwm = dev_get_drvdata(dev);
+	unsigned int i;
+
+	for (i = 0; i < pwm->chip.npwm; i++) {
+		struct berlin_pwm_channel *channel;
+
+		channel = pwm_get_chip_data(&pwm->chip.pwms[i]);
+		if (!channel)	/* PWM not requested: no saved context */
+			continue;
+		/* Last argument is a register offset, hence BERLIN_PWM_EN */
+		channel->enable = berlin_pwm_readl(pwm, i, BERLIN_PWM_EN);
+		channel->ctrl = berlin_pwm_readl(pwm, i, BERLIN_PWM_CONTROL);
+		channel->duty = berlin_pwm_readl(pwm, i, BERLIN_PWM_DUTY);
+		channel->tcnt = berlin_pwm_readl(pwm, i, BERLIN_PWM_TCNT);
+	}
+
+	clk_disable_unprepare(pwm->clk);
+
+	return 0;
+}
+
+static int berlin_pwm_resume(struct device *dev)
+{
+	struct berlin_pwm_chip *pwm = dev_get_drvdata(dev);
+	unsigned int i;
+	int ret;
+
+	ret = clk_prepare_enable(pwm->clk);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < pwm->chip.npwm; i++) {
+		struct berlin_pwm_channel *channel;
+
+		channel = pwm_get_chip_data(&pwm->chip.pwms[i]);
+		if (!channel)	/* PWM not requested: nothing to restore */
+			continue;
+		/* Restore ctrl/duty/tcnt, then the enable register last */
+		berlin_pwm_writel(pwm, i, channel->ctrl, BERLIN_PWM_CONTROL);
+		berlin_pwm_writel(pwm, i, channel->duty, BERLIN_PWM_DUTY);
+		berlin_pwm_writel(pwm, i, channel->tcnt, BERLIN_PWM_TCNT);
+		berlin_pwm_writel(pwm, i, channel->enable, BERLIN_PWM_EN);
+	}
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(berlin_pwm_pm_ops, berlin_pwm_suspend,
+			 berlin_pwm_resume);
+
 static struct platform_driver berlin_pwm_driver = {
 	.probe = berlin_pwm_probe,
 	.remove = berlin_pwm_remove,
 	.driver = {
 		.name = "berlin-pwm",
 		.of_match_table = berlin_pwm_match,
+		.pm = &berlin_pwm_pm_ops,
 	},
 };
 module_platform_driver(berlin_pwm_driver);
diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c
index 99b9acc..f6ca4e8 100644
--- a/drivers/pwm/pwm-cros-ec.c
+++ b/drivers/pwm/pwm-cros-ec.c
@@ -38,7 +38,7 @@
 	struct {
 		struct cros_ec_command msg;
 		struct ec_params_pwm_set_duty params;
-	} buf;
+	} __packed buf;
 	struct ec_params_pwm_set_duty *params = &buf.params;
 	struct cros_ec_command *msg = &buf.msg;
 
@@ -65,7 +65,7 @@
 			struct ec_params_pwm_get_duty params;
 			struct ec_response_pwm_get_duty resp;
 		};
-	} buf;
+	} __packed buf;
 	struct ec_params_pwm_get_duty *params = &buf.params;
 	struct ec_response_pwm_get_duty *resp = &buf.resp;
 	struct cros_ec_command *msg = &buf.msg;
diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c
index 19dc64c..d7f5f7d 100644
--- a/drivers/pwm/pwm-lpc18xx-sct.c
+++ b/drivers/pwm/pwm-lpc18xx-sct.c
@@ -413,14 +413,18 @@
 	}
 
 	for (i = 0; i < lpc18xx_pwm->chip.npwm; i++) {
+		struct lpc18xx_pwm_data *data;
+
 		pwm = &lpc18xx_pwm->chip.pwms[i];
-		pwm->chip_data = devm_kzalloc(lpc18xx_pwm->dev,
-					      sizeof(struct lpc18xx_pwm_data),
-					      GFP_KERNEL);
-		if (!pwm->chip_data) {
+
+		data = devm_kzalloc(lpc18xx_pwm->dev, sizeof(*data),
+				    GFP_KERNEL);
+		if (!data) {
 			ret = -ENOMEM;
 			goto remove_pwmchip;
 		}
+
+		pwm_set_chip_data(pwm, data);
 	}
 
 	platform_set_drvdata(pdev, lpc18xx_pwm);
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c
new file mode 100644
index 0000000..381871b
--- /dev/null
+++ b/drivers/pwm/pwm-meson.c
@@ -0,0 +1,529 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright (c) 2016 BayLibre, SAS.
+ * Author: Neil Armstrong <narmstrong@baylibre.com>
+ * Copyright (C) 2014 Amlogic, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * The full GNU General Public License is included in this distribution
+ * in the file called COPYING.
+ *
+ * BSD LICENSE
+ *
+ * Copyright (c) 2016 BayLibre, SAS.
+ * Author: Neil Armstrong <narmstrong@baylibre.com>
+ * Copyright (C) 2014 Amlogic, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#define REG_PWM_A		0x0
+#define REG_PWM_B		0x4
+#define PWM_HIGH_SHIFT		16
+
+#define REG_MISC_AB		0x8
+#define MISC_B_CLK_EN		BIT(23)
+#define MISC_A_CLK_EN		BIT(15)
+#define MISC_CLK_DIV_MASK	0x7f
+#define MISC_B_CLK_DIV_SHIFT	16
+#define MISC_A_CLK_DIV_SHIFT	8
+#define MISC_B_CLK_SEL_SHIFT	6
+#define MISC_A_CLK_SEL_SHIFT	4
+#define MISC_CLK_SEL_WIDTH	2
+#define MISC_B_EN		BIT(1)
+#define MISC_A_EN		BIT(0)
+
+static const unsigned int mux_reg_shifts[] = {
+	MISC_A_CLK_SEL_SHIFT,
+	MISC_B_CLK_SEL_SHIFT
+};
+
+struct meson_pwm_channel {
+	unsigned int hi;	/* high count: bits 31:16 of REG_PWM_A/B */
+	unsigned int lo;	/* low count: low half of REG_PWM_A/B */
+	u8 pre_div;		/* clock divider field; ratio is pre_div + 1 */
+
+	struct pwm_state state;	/* state cached by .request and .apply */
+
+	struct clk *clk_parent;	/* optional "clkin%u" clock, else NULL */
+	struct clk_mux mux;	/* input selector in REG_MISC_AB */
+	struct clk *clk;	/* clock registered for the mux above */
+};
+
+struct meson_pwm_data {
+	const char * const *parent_names;
+};
+
+struct meson_pwm {
+	struct pwm_chip chip;	/* embedded chip, see to_meson_pwm() */
+	const struct meson_pwm_data *data;	/* per-SoC match data */
+	void __iomem *base;	/* mapped register block */
+	u8 inverter_mask;	/* bit N set: PWM N is normal polarity */
+	spinlock_t lock;	/* taken in .apply; shared with the muxes */
+};
+
+static inline struct meson_pwm *to_meson_pwm(struct pwm_chip *chip)
+{
+	return container_of(chip, struct meson_pwm, chip);
+}
+
+static int meson_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
+	struct device *dev = chip->dev;
+	int err;
+
+	if (!channel)
+		return -ENODEV;	/* chip data not attached to this PWM */
+
+	if (channel->clk_parent) {	/* optional "clkin%u" clock from probe */
+		err = clk_set_parent(channel->clk, channel->clk_parent);
+		if (err < 0) {
+			dev_err(dev, "failed to set parent %s for %s: %d\n",
+				__clk_get_name(channel->clk_parent),
+				__clk_get_name(channel->clk), err);
+			return err;
+		}
+	}
+
+	err = clk_prepare_enable(channel->clk);
+	if (err < 0) {
+		dev_err(dev, "failed to enable clock %s: %d\n",
+			__clk_get_name(channel->clk), err);
+		return err;
+	}
+	/* Seed the cached state; get_state only reports the enable bit */
+	chip->ops->get_state(chip, pwm, &channel->state);
+
+	return 0;
+}
+
+static void meson_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
+
+	if (channel)
+		clk_disable_unprepare(channel->clk);
+}
+
+static int meson_pwm_calc(struct meson_pwm *meson,
+			  struct meson_pwm_channel *channel, unsigned int id,
+			  unsigned int duty, unsigned int period)
+{
+	unsigned int pre_div, cnt, duty_cnt;
+	unsigned long fin_freq = -1, fin_ns;
+
+	if (~(meson->inverter_mask >> id) & 0x1)
+		duty = period - duty;
+
+	if (period == channel->state.period &&
+	    duty == channel->state.duty_cycle)
+		return 0;
+
+	fin_freq = clk_get_rate(channel->clk);
+	if (fin_freq == 0) {
+		dev_err(meson->chip.dev, "invalid source clock frequency\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(meson->chip.dev, "fin_freq: %lu Hz\n", fin_freq);
+	fin_ns = NSEC_PER_SEC / fin_freq;
+
+	/* Calc pre_div with the period */
+	for (pre_div = 0; pre_div < MISC_CLK_DIV_MASK; pre_div++) {
+		cnt = DIV_ROUND_CLOSEST(period, fin_ns * (pre_div + 1));
+		dev_dbg(meson->chip.dev, "fin_ns=%lu pre_div=%u cnt=%u\n",
+			fin_ns, pre_div, cnt);
+		if (cnt <= 0xffff)
+			break;
+	}
+
+	if (pre_div == MISC_CLK_DIV_MASK) {
+		dev_err(meson->chip.dev, "unable to get period pre_div\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(meson->chip.dev, "period=%u pre_div=%u cnt=%u\n", period,
+		pre_div, cnt);
+
+	if (duty == period) {
+		channel->pre_div = pre_div;
+		channel->hi = cnt;
+		channel->lo = 0;
+	} else if (duty == 0) {
+		channel->pre_div = pre_div;
+		channel->hi = 0;
+		channel->lo = cnt;
+	} else {
+		/* Then check if we can have the duty with the same pre_div */
+		duty_cnt = DIV_ROUND_CLOSEST(duty, fin_ns * (pre_div + 1));
+		if (duty_cnt > 0xffff) {
+			dev_err(meson->chip.dev, "unable to get duty cycle\n");
+			return -EINVAL;
+		}
+
+		dev_dbg(meson->chip.dev, "duty=%u pre_div=%u duty_cnt=%u\n",
+			duty, pre_div, duty_cnt);
+
+		channel->pre_div = pre_div;
+		channel->hi = duty_cnt;
+		channel->lo = cnt - duty_cnt;
+	}
+
+	return 0;
+}
+
+static void meson_pwm_enable(struct meson_pwm *meson,
+			     struct meson_pwm_channel *channel,
+			     unsigned int id)
+{
+	u32 value, clk_shift, clk_enable, enable;
+	unsigned int offset;
+
+	switch (id) {
+	case 0:
+		clk_shift = MISC_A_CLK_DIV_SHIFT;
+		clk_enable = MISC_A_CLK_EN;
+		enable = MISC_A_EN;
+		offset = REG_PWM_A;
+		break;
+
+	case 1:
+		clk_shift = MISC_B_CLK_DIV_SHIFT;
+		clk_enable = MISC_B_CLK_EN;
+		enable = MISC_B_EN;
+		offset = REG_PWM_B;
+		break;
+
+	default:
+		return;
+	}
+
+	value = readl(meson->base + REG_MISC_AB);
+	value &= ~(MISC_CLK_DIV_MASK << clk_shift);
+	value |= channel->pre_div << clk_shift;
+	value |= clk_enable;
+	writel(value, meson->base + REG_MISC_AB);
+
+	value = (channel->hi << PWM_HIGH_SHIFT) | channel->lo;
+	writel(value, meson->base + offset);
+
+	value = readl(meson->base + REG_MISC_AB);
+	value |= enable;
+	writel(value, meson->base + REG_MISC_AB);
+}
+
+static void meson_pwm_disable(struct meson_pwm *meson, unsigned int id)
+{
+	u32 value, enable;
+
+	switch (id) {
+	case 0:
+		enable = MISC_A_EN;
+		break;
+
+	case 1:
+		enable = MISC_B_EN;
+		break;
+
+	default:
+		return;
+	}
+
+	value = readl(meson->base + REG_MISC_AB);
+	value &= ~enable;
+	writel(value, meson->base + REG_MISC_AB);
+}
+
+static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+			   struct pwm_state *state)
+{
+	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
+	struct meson_pwm *meson = to_meson_pwm(chip);
+	unsigned long flags;
+	int err = 0;
+
+	if (!state)
+		return -EINVAL;
+
+	spin_lock_irqsave(&meson->lock, flags);
+
+	if (!state->enabled) {
+		meson_pwm_disable(meson, pwm->hwpwm);
+		channel->state.enabled = false;
+
+		goto unlock;
+	}
+
+	if (state->period != channel->state.period ||
+	    state->duty_cycle != channel->state.duty_cycle ||
+	    state->polarity != channel->state.polarity) {
+		if (channel->state.enabled) {
+			meson_pwm_disable(meson, pwm->hwpwm);
+			channel->state.enabled = false;
+		}
+
+		if (state->polarity != channel->state.polarity) {
+			if (state->polarity == PWM_POLARITY_NORMAL)
+				meson->inverter_mask |= BIT(pwm->hwpwm);
+			else
+				meson->inverter_mask &= ~BIT(pwm->hwpwm);
+		}
+
+		err = meson_pwm_calc(meson, channel, pwm->hwpwm,
+				     state->duty_cycle, state->period);
+		if (err < 0)
+			goto unlock;
+
+		channel->state.polarity = state->polarity;
+		channel->state.period = state->period;
+		channel->state.duty_cycle = state->duty_cycle;
+	}
+
+	if (state->enabled && !channel->state.enabled) {
+		meson_pwm_enable(meson, channel, pwm->hwpwm);
+		channel->state.enabled = true;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&meson->lock, flags);
+	return err;
+}
+
+static void meson_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
+				struct pwm_state *state)
+{
+	struct meson_pwm *meson = to_meson_pwm(chip);
+	u32 value, mask;
+
+	if (!state)
+		return;
+
+	switch (pwm->hwpwm) {
+	case 0:
+		mask = MISC_A_EN;
+		break;
+
+	case 1:
+		mask = MISC_B_EN;
+		break;
+
+	default:
+		return;
+	}
+
+	value = readl(meson->base + REG_MISC_AB);
+	state->enabled = (value & mask) != 0;
+}
+
+static const struct pwm_ops meson_pwm_ops = {
+	.request = meson_pwm_request,
+	.free = meson_pwm_free,
+	.apply = meson_pwm_apply,
+	.get_state = meson_pwm_get_state,
+	.owner = THIS_MODULE,
+};
+
+static const char * const pwm_meson8b_parent_names[] = {
+	"xtal", "vid_pll", "fclk_div4", "fclk_div3"
+};
+
+static const struct meson_pwm_data pwm_meson8b_data = {
+	.parent_names = pwm_meson8b_parent_names,
+};
+
+static const char * const pwm_gxbb_parent_names[] = {
+	"xtal", "hdmi_pll", "fclk_div4", "fclk_div3"
+};
+
+static const struct meson_pwm_data pwm_gxbb_data = {
+	.parent_names = pwm_gxbb_parent_names,
+};
+
+static const struct of_device_id meson_pwm_matches[] = {
+	{ .compatible = "amlogic,meson8b-pwm", .data = &pwm_meson8b_data },
+	{ .compatible = "amlogic,meson-gxbb-pwm", .data = &pwm_gxbb_data },
+	{},
+};
+MODULE_DEVICE_TABLE(of, meson_pwm_matches);
+
+static int meson_pwm_init_channels(struct meson_pwm *meson,
+				   struct meson_pwm_channel *channels)
+{
+	struct device *dev = meson->chip.dev;
+	struct device_node *np = dev->of_node;
+	struct clk_init_data init;
+	unsigned int i;
+	char name[255];
+	int err;
+
+	for (i = 0; i < meson->chip.npwm; i++) {
+		struct meson_pwm_channel *channel = &channels[i];
+
+		snprintf(name, sizeof(name), "%s#mux%u", np->full_name, i);
+
+		init.name = name;
+		init.ops = &clk_mux_ops;
+		init.flags = CLK_IS_BASIC;
+		init.parent_names = meson->data->parent_names;
+		init.num_parents = 1 << MISC_CLK_SEL_WIDTH;
+
+		channel->mux.reg = meson->base + REG_MISC_AB;
+		channel->mux.shift = mux_reg_shifts[i];
+		channel->mux.mask = BIT(MISC_CLK_SEL_WIDTH) - 1;
+		channel->mux.flags = 0;
+		channel->mux.lock = &meson->lock;
+		channel->mux.table = NULL;
+		channel->mux.hw.init = &init;
+
+		channel->clk = devm_clk_register(dev, &channel->mux.hw);
+		if (IS_ERR(channel->clk)) {
+			err = PTR_ERR(channel->clk);
+			dev_err(dev, "failed to register %s: %d\n", name, err);
+			return err;
+		}
+
+		snprintf(name, sizeof(name), "clkin%u", i);
+
+		channel->clk_parent = devm_clk_get(dev, name);
+		if (IS_ERR(channel->clk_parent)) {
+			err = PTR_ERR(channel->clk_parent);
+			if (err == -EPROBE_DEFER)
+				return err;
+
+			channel->clk_parent = NULL;
+		}
+	}
+
+	return 0;
+}
+
+static void meson_pwm_add_channels(struct meson_pwm *meson,
+				   struct meson_pwm_channel *channels)
+{
+	unsigned int i;
+
+	for (i = 0; i < meson->chip.npwm; i++)
+		pwm_set_chip_data(&meson->chip.pwms[i], &channels[i]);
+}
+
+static int meson_pwm_probe(struct platform_device *pdev)
+{
+	struct meson_pwm_channel *channels;
+	struct meson_pwm *meson;
+	struct resource *regs;
+	int err;
+
+	meson = devm_kzalloc(&pdev->dev, sizeof(*meson), GFP_KERNEL);
+	if (!meson)
+		return -ENOMEM;
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	meson->base = devm_ioremap_resource(&pdev->dev, regs);
+	if (IS_ERR(meson->base))
+		return PTR_ERR(meson->base);
+
+	/* The lock is taken by .apply and by the clk_mux set up below */
+	spin_lock_init(&meson->lock);
+	meson->chip.dev = &pdev->dev;
+	meson->chip.ops = &meson_pwm_ops;
+	meson->chip.base = -1;
+	meson->chip.npwm = 2;	/* PWM A and PWM B */
+	meson->chip.of_xlate = of_pwm_xlate_with_flags;
+	meson->chip.of_pwm_n_cells = 3;
+	meson->data = of_device_get_match_data(&pdev->dev);
+	meson->inverter_mask = BIT(meson->chip.npwm) - 1;
+
+	channels = devm_kcalloc(&pdev->dev, meson->chip.npwm,
+				sizeof(*channels), GFP_KERNEL);
+	if (!channels)
+		return -ENOMEM;
+
+	err = meson_pwm_init_channels(meson, channels);
+	if (err < 0)
+		return err;
+
+	err = pwmchip_add(&meson->chip);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to register PWM chip: %d\n", err);
+		return err;
+	}
+
+	meson_pwm_add_channels(meson, channels);
+	platform_set_drvdata(pdev, meson);
+
+	return 0;
+}
+
+static int meson_pwm_remove(struct platform_device *pdev)
+{
+	struct meson_pwm *meson = platform_get_drvdata(pdev);
+
+	return pwmchip_remove(&meson->chip);
+}
+
+static struct platform_driver meson_pwm_driver = {
+	.driver = {
+		.name = "meson-pwm",
+		.of_match_table = meson_pwm_matches,
+	},
+	.probe = meson_pwm_probe,
+	.remove = meson_pwm_remove,
+};
+module_platform_driver(meson_pwm_driver);
+
+MODULE_ALIAS("platform:meson-pwm");
+MODULE_DESCRIPTION("Amlogic Meson PWM Generator driver");
+MODULE_AUTHOR("Neil Armstrong <narmstrong@baylibre.com>");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/pwm/pwm-mtk-disp.c b/drivers/pwm/pwm-mtk-disp.c
index 0ad3385..893940d 100644
--- a/drivers/pwm/pwm-mtk-disp.c
+++ b/drivers/pwm/pwm-mtk-disp.c
@@ -18,30 +18,40 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
 #include <linux/slab.h>
 
 #define DISP_PWM_EN		0x00
-#define PWM_ENABLE_MASK		BIT(0)
 
-#define DISP_PWM_COMMIT		0x08
-#define PWM_COMMIT_MASK		BIT(0)
-
-#define DISP_PWM_CON_0		0x10
 #define PWM_CLKDIV_SHIFT	16
 #define PWM_CLKDIV_MAX		0x3ff
 #define PWM_CLKDIV_MASK		(PWM_CLKDIV_MAX << PWM_CLKDIV_SHIFT)
 
-#define DISP_PWM_CON_1		0x14
 #define PWM_PERIOD_BIT_WIDTH	12
 #define PWM_PERIOD_MASK		((1 << PWM_PERIOD_BIT_WIDTH) - 1)
 
 #define PWM_HIGH_WIDTH_SHIFT	16
 #define PWM_HIGH_WIDTH_MASK	(0x1fff << PWM_HIGH_WIDTH_SHIFT)
 
+struct mtk_pwm_data {
+	u32 enable_mask;	/* enable bit(s) in DISP_PWM_EN */
+	unsigned int con0;	/* offset of register with PWM_CLKDIV field */
+	u32 con0_sel;		/* con0 bits set at probe if !has_commit */
+	unsigned int con1;	/* offset of period/high-width register */
+
+	bool has_commit;	/* config needs a commit-register pulse */
+	unsigned int commit;	/* offset of the commit register */
+	unsigned int commit_mask;	/* bits set, then cleared, to commit */
+
+	unsigned int bls_debug;	/* offset touched at probe if !has_commit */
+	u32 bls_debug_mask;	/* bits set in bls_debug at probe */
+};
+
 struct mtk_disp_pwm {
 	struct pwm_chip chip;
+	const struct mtk_pwm_data *data;
 	struct clk *clk_main;
 	struct clk *clk_mm;
 	void __iomem *base;
@@ -106,12 +116,21 @@
 		return err;
 	}
 
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_CON_0, PWM_CLKDIV_MASK,
+	mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
+				 PWM_CLKDIV_MASK,
 				 clk_div << PWM_CLKDIV_SHIFT);
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_CON_1,
-				 PWM_PERIOD_MASK | PWM_HIGH_WIDTH_MASK, value);
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_COMMIT, PWM_COMMIT_MASK, 1);
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_COMMIT, PWM_COMMIT_MASK, 0);
+	mtk_disp_pwm_update_bits(mdp, mdp->data->con1,
+				 PWM_PERIOD_MASK | PWM_HIGH_WIDTH_MASK,
+				 value);
+
+	if (mdp->data->has_commit) {
+		mtk_disp_pwm_update_bits(mdp, mdp->data->commit,
+					 mdp->data->commit_mask,
+					 mdp->data->commit_mask);
+		mtk_disp_pwm_update_bits(mdp, mdp->data->commit,
+					 mdp->data->commit_mask,
+					 0x0);
+	}
 
 	clk_disable(mdp->clk_mm);
 	clk_disable(mdp->clk_main);
@@ -134,7 +153,8 @@
 		return err;
 	}
 
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, PWM_ENABLE_MASK, 1);
+	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
+				 mdp->data->enable_mask);
 
 	return 0;
 }
@@ -143,7 +163,8 @@
 {
 	struct mtk_disp_pwm *mdp = to_mtk_disp_pwm(chip);
 
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, PWM_ENABLE_MASK, 0);
+	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
+				 0x0);
 
 	clk_disable(mdp->clk_mm);
 	clk_disable(mdp->clk_main);
@@ -166,6 +187,8 @@
 	if (!mdp)
 		return -ENOMEM;
 
+	mdp->data = of_device_get_match_data(&pdev->dev);
+
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	mdp->base = devm_ioremap_resource(&pdev->dev, r);
 	if (IS_ERR(mdp->base))
@@ -200,6 +223,19 @@
 
 	platform_set_drvdata(pdev, mdp);
 
+	/*
+	 * For MT2701, disable double buffer before writing register
+	 * and select manual mode and use PWM_PERIOD/PWM_HIGH_WIDTH.
+	 */
+	if (!mdp->data->has_commit) {
+		mtk_disp_pwm_update_bits(mdp, mdp->data->bls_debug,
+					 mdp->data->bls_debug_mask,
+					 mdp->data->bls_debug_mask);
+		mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
+					 mdp->data->con0_sel,
+					 mdp->data->con0_sel);
+	}
+
 	return 0;
 
 disable_clk_mm:
@@ -221,9 +257,30 @@
 	return ret;
 }
 
+static const struct mtk_pwm_data mt2701_pwm_data = {
+	.enable_mask = BIT(16),
+	.con0 = 0xa8,
+	.con0_sel = 0x2,
+	.con1 = 0xac,
+	.has_commit = false,
+	.bls_debug = 0xb0,
+	.bls_debug_mask = 0x3,
+};
+
+static const struct mtk_pwm_data mt8173_pwm_data = {
+	.enable_mask = BIT(0),
+	.con0 = 0x10,
+	.con0_sel = 0x0,
+	.con1 = 0x14,
+	.has_commit = true,
+	.commit = 0x8,
+	.commit_mask = 0x1,
+};
+
 static const struct of_device_id mtk_disp_pwm_of_match[] = {
-	{ .compatible = "mediatek,mt8173-disp-pwm" },
-	{ .compatible = "mediatek,mt6595-disp-pwm" },
+	{ .compatible = "mediatek,mt2701-disp-pwm", .data = &mt2701_pwm_data},
+	{ .compatible = "mediatek,mt6595-disp-pwm", .data = &mt8173_pwm_data},
+	{ .compatible = "mediatek,mt8173-disp-pwm", .data = &mt8173_pwm_data},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, mtk_disp_pwm_of_match);
diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c
index ada2d32..f113cda 100644
--- a/drivers/pwm/pwm-samsung.c
+++ b/drivers/pwm/pwm-samsung.c
@@ -193,9 +193,18 @@
 	 * divider settings and choose the lowest divisor that can generate
 	 * frequencies lower than requested.
 	 */
-	for (div = variant->div_base; div < 4; ++div)
-		if ((rate >> (variant->bits + div)) < freq)
-			break;
+	if (variant->bits < 32) {
+		/* Only for s3c24xx */
+		for (div = variant->div_base; div < 4; ++div)
+			if ((rate >> (variant->bits + div)) < freq)
+				break;
+	} else {
+		/*
+		 * Other variants have enough counter bits to generate any
+		 * requested rate, so no need to check higher divisors.
+		 */
+		div = variant->div_base;
+	}
 
 	pwm_samsung_set_divisor(chip, chan, BIT(div));
 
diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c
index 92abbd5..dd82dc8 100644
--- a/drivers/pwm/pwm-sti.c
+++ b/drivers/pwm/pwm-sti.c
@@ -1,8 +1,10 @@
 /*
- * PWM device driver for ST SoCs.
- * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ * PWM device driver for ST SoCs
  *
- * Copyright (C) 2013-2014 STMicroelectronics (R&D) Limited
+ * Copyright (C) 2013-2016 STMicroelectronics (R&D) Limited
+ *
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *         Lee Jones <lee.jones@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -11,6 +13,7 @@
  */
 
 #include <linux/clk.h>
+#include <linux/interrupt.h>
 #include <linux/math64.h>
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
@@ -18,43 +21,82 @@
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
 #include <linux/regmap.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/wait.h>
 
-#define STI_DS_REG(ch)	(4 * (ch))	/* Channel's Duty Cycle register */
-#define STI_PWMCR	0x50		/* Control/Config register */
-#define STI_INTEN	0x54		/* Interrupt Enable/Disable register */
-#define PWM_PRESCALE_LOW_MASK		0x0f
-#define PWM_PRESCALE_HIGH_MASK		0xf0
+#define PWM_OUT_VAL(x)	(0x00 + (4 * (x))) /* Device's Duty Cycle register */
+#define PWM_CPT_VAL(x)	(0x10 + (4 * (x))) /* Capture value */
+#define PWM_CPT_EDGE(x) (0x30 + (4 * (x))) /* Edge to capture on */
+
+#define STI_PWM_CTRL		0x50	/* Control/Config register */
+#define STI_INT_EN		0x54	/* Interrupt Enable/Disable register */
+#define STI_INT_STA		0x58	/* Interrupt Status register */
+#define PWM_INT_ACK		0x5c
+#define PWM_PRESCALE_LOW_MASK	0x0f
+#define PWM_PRESCALE_HIGH_MASK	0xf0
+#define PWM_CPT_EDGE_MASK	0x03
+#define PWM_INT_ACK_MASK	0x1ff
+
+#define STI_MAX_CPT_DEVS	4
+#define CPT_DC_MAX		0xff
 
 /* Regfield IDs */
 enum {
+	/* Bits in PWM_CTRL */
 	PWMCLK_PRESCALE_LOW,
 	PWMCLK_PRESCALE_HIGH,
-	PWM_EN,
-	PWM_INT_EN,
+	CPTCLK_PRESCALE,
+
+	PWM_OUT_EN,
+	PWM_CPT_EN,
+
+	PWM_CPT_INT_EN,
+	PWM_CPT_INT_STAT,
 
 	/* Keep last */
 	MAX_REGFIELDS
 };
 
+/*
+ * Each capture input can be programmed to detect rising-edge, falling-edge,
+ * either edge or neither edge.
+ */
+enum sti_cpt_edge {
+	CPT_EDGE_DISABLED,
+	CPT_EDGE_RISING,
+	CPT_EDGE_FALLING,
+	CPT_EDGE_BOTH,
+};
+
+struct sti_cpt_ddata {
+	u32 snapshot[3];
+	unsigned int index;
+	struct mutex lock;
+	wait_queue_head_t wait;
+};
+
 struct sti_pwm_compat_data {
 	const struct reg_field *reg_fields;
-	unsigned int num_chan;
+	unsigned int pwm_num_devs;
+	unsigned int cpt_num_devs;
 	unsigned int max_pwm_cnt;
 	unsigned int max_prescale;
 };
 
 struct sti_pwm_chip {
 	struct device *dev;
-	struct clk *clk;
-	unsigned long clk_rate;
+	struct clk *pwm_clk;
+	struct clk *cpt_clk;
 	struct regmap *regmap;
 	struct sti_pwm_compat_data *cdata;
 	struct regmap_field *prescale_low;
 	struct regmap_field *prescale_high;
-	struct regmap_field *pwm_en;
-	struct regmap_field *pwm_int_en;
+	struct regmap_field *pwm_out_en;
+	struct regmap_field *pwm_cpt_en;
+	struct regmap_field *pwm_cpt_int_en;
+	struct regmap_field *pwm_cpt_int_stat;
 	struct pwm_chip chip;
 	struct pwm_device *cur;
 	unsigned long configured;
@@ -64,10 +106,13 @@
 };
 
 static const struct reg_field sti_pwm_regfields[MAX_REGFIELDS] = {
-	[PWMCLK_PRESCALE_LOW]	= REG_FIELD(STI_PWMCR, 0, 3),
-	[PWMCLK_PRESCALE_HIGH]	= REG_FIELD(STI_PWMCR, 11, 14),
-	[PWM_EN]		= REG_FIELD(STI_PWMCR, 9, 9),
-	[PWM_INT_EN]		= REG_FIELD(STI_INTEN, 0, 0),
+	[PWMCLK_PRESCALE_LOW] = REG_FIELD(STI_PWM_CTRL, 0, 3),
+	[PWMCLK_PRESCALE_HIGH] = REG_FIELD(STI_PWM_CTRL, 11, 14),
+	[CPTCLK_PRESCALE] = REG_FIELD(STI_PWM_CTRL, 4, 8),
+	[PWM_OUT_EN] = REG_FIELD(STI_PWM_CTRL, 9, 9),
+	[PWM_CPT_EN] = REG_FIELD(STI_PWM_CTRL, 10, 10),
+	[PWM_CPT_INT_EN] = REG_FIELD(STI_INT_EN, 1, 4),
+	[PWM_CPT_INT_STAT] = REG_FIELD(STI_INT_STA, 1, 4),
 };
 
 static inline struct sti_pwm_chip *to_sti_pwmchip(struct pwm_chip *chip)
@@ -82,61 +127,68 @@
 				unsigned int *prescale)
 {
 	struct sti_pwm_compat_data *cdata = pc->cdata;
-	unsigned long val;
+	unsigned long clk_rate;
+	unsigned long value;
 	unsigned int ps;
 
-	/*
-	 * prescale = ((period_ns * clk_rate) / (10^9 * (max_pwm_count + 1)) - 1
-	 */
-	val = NSEC_PER_SEC / pc->clk_rate;
-	val *= cdata->max_pwm_cnt + 1;
-
-	if (period % val) {
+	clk_rate = clk_get_rate(pc->pwm_clk);
+	if (!clk_rate) {
+		dev_err(pc->dev, "failed to get clock rate\n");
 		return -EINVAL;
-	} else {
-		ps  = period / val - 1;
-		if (ps > cdata->max_prescale)
-			return -EINVAL;
 	}
+
+	/*
+	 * prescale = ((period_ns * clk_rate) / (10^9 * (max_pwm_cnt + 1)) - 1
+	 */
+	value = NSEC_PER_SEC / clk_rate;
+	value *= cdata->max_pwm_cnt + 1;
+
+	if (period % value)
+		return -EINVAL;
+
+	ps  = period / value - 1;
+	if (ps > cdata->max_prescale)
+		return -EINVAL;
+
 	*prescale = ps;
 
 	return 0;
 }
 
 /*
- * For STiH4xx PWM IP, the PWM period is fixed to 256 local clock cycles.
- * The only way to change the period (apart from changing the PWM input clock)
- * is to change the PWM clock prescaler.
- * The prescaler is of 8 bits, so 256 prescaler values and hence
- * 256 possible period values are supported (for a particular clock rate).
- * The requested period will be applied only if it matches one of these
- * 256 values.
+ * For STiH4xx PWM IP, the PWM period is fixed to 256 local clock cycles. The
+ * only way to change the period (apart from changing the PWM input clock) is
+ * to change the PWM clock prescaler.
+ *
+ * The prescaler is of 8 bits, so 256 prescaler values and hence 256 possible
+ * period values are supported (for a particular clock rate). The requested
+ * period will be applied only if it matches one of these 256 values.
  */
 static int sti_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-			 int duty_ns, int period_ns)
+			  int duty_ns, int period_ns)
 {
 	struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
 	struct sti_pwm_compat_data *cdata = pc->cdata;
+	unsigned int ncfg, value, prescale = 0;
 	struct pwm_device *cur = pc->cur;
 	struct device *dev = pc->dev;
-	unsigned int prescale = 0, pwmvalx;
-	int ret;
-	unsigned int ncfg;
 	bool period_same = false;
+	int ret;
 
 	ncfg = hweight_long(pc->configured);
 	if (ncfg)
 		period_same = (period_ns == pwm_get_period(cur));
 
-	/* Allow configuration changes if one of the
-	 * following conditions satisfy.
-	 * 1. No channels have been configured.
-	 * 2. Only one channel has been configured and the new request
-	 *    is for the same channel.
-	 * 3. Only one channel has been configured and the new request is
-	 *    for a new channel and period of the new channel is same as
-	 *    the current configured period.
-	 * 4. More than one channels are configured and period of the new
+	/*
+	 * Allow configuration changes if one of the following conditions
+	 * satisfy.
+	 * 1. No devices have been configured.
+	 * 2. Only one device has been configured and the new request is for
+	 *    the same device.
+	 * 3. Only one device has been configured and the new request is for
+	 *    a new device and period of the new device is same as the current
+	 *    configured period.
+	 * 4. More than one devices are configured and period of the new
 	 *    requestis the same as the current period.
 	 */
 	if (!ncfg ||
@@ -144,7 +196,11 @@
 	    ((ncfg == 1) && (pwm->hwpwm != cur->hwpwm) && period_same) ||
 	    ((ncfg > 1) && period_same)) {
 		/* Enable clock before writing to PWM registers. */
-		ret = clk_enable(pc->clk);
+		ret = clk_enable(pc->pwm_clk);
+		if (ret)
+			return ret;
+
+		ret = clk_enable(pc->cpt_clk);
 		if (ret)
 			return ret;
 
@@ -153,15 +209,15 @@
 			if (ret)
 				goto clk_dis;
 
-			ret =
-			regmap_field_write(pc->prescale_low,
-					   prescale & PWM_PRESCALE_LOW_MASK);
+			value = prescale & PWM_PRESCALE_LOW_MASK;
+
+			ret = regmap_field_write(pc->prescale_low, value);
 			if (ret)
 				goto clk_dis;
 
-			ret =
-			regmap_field_write(pc->prescale_high,
-				(prescale & PWM_PRESCALE_HIGH_MASK) >> 4);
+			value = (prescale & PWM_PRESCALE_HIGH_MASK) >> 4;
+
+			ret = regmap_field_write(pc->prescale_high, value);
 			if (ret)
 				goto clk_dis;
 		}
@@ -172,25 +228,26 @@
 		 * PWM pulse = (max_pwm_count + 1) local cycles,
 		 * that is continuous pulse: signal never goes low.
 		 */
-		pwmvalx = cdata->max_pwm_cnt * duty_ns / period_ns;
+		value = cdata->max_pwm_cnt * duty_ns / period_ns;
 
-		ret = regmap_write(pc->regmap, STI_DS_REG(pwm->hwpwm), pwmvalx);
+		ret = regmap_write(pc->regmap, PWM_OUT_VAL(pwm->hwpwm), value);
 		if (ret)
 			goto clk_dis;
 
-		ret = regmap_field_write(pc->pwm_int_en, 0);
+		ret = regmap_field_write(pc->pwm_cpt_int_en, 0);
 
 		set_bit(pwm->hwpwm, &pc->configured);
 		pc->cur = pwm;
 
-		dev_dbg(dev, "prescale:%u, period:%i, duty:%i, pwmvalx:%u\n",
-			prescale, period_ns, duty_ns, pwmvalx);
+		dev_dbg(dev, "prescale:%u, period:%i, duty:%i, value:%u\n",
+			prescale, period_ns, duty_ns, value);
 	} else {
 		return -EINVAL;
 	}
 
 clk_dis:
-	clk_disable(pc->clk);
+	clk_disable(pc->pwm_clk);
+	clk_disable(pc->cpt_clk);
 	return ret;
 }
 
@@ -201,23 +258,30 @@
 	int ret = 0;
 
 	/*
-	 * Since we have a common enable for all PWM channels,
-	 * do not enable if already enabled.
+	 * Since we have a common enable for all PWM devices, do not enable if
+	 * already enabled.
 	 */
 	mutex_lock(&pc->sti_pwm_lock);
+
 	if (!pc->en_count) {
-		ret = clk_enable(pc->clk);
+		ret = clk_enable(pc->pwm_clk);
 		if (ret)
 			goto out;
 
-		ret = regmap_field_write(pc->pwm_en, 1);
+		ret = clk_enable(pc->cpt_clk);
+		if (ret)
+			goto out;
+
+		ret = regmap_field_write(pc->pwm_out_en, 1);
 		if (ret) {
-			dev_err(dev, "failed to enable PWM device:%d\n",
-				pwm->hwpwm);
+			dev_err(dev, "failed to enable PWM device %u: %d\n",
+				pwm->hwpwm, ret);
 			goto out;
 		}
 	}
+
 	pc->en_count++;
+
 out:
 	mutex_unlock(&pc->sti_pwm_lock);
 	return ret;
@@ -228,13 +292,17 @@
 	struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
 
 	mutex_lock(&pc->sti_pwm_lock);
+
 	if (--pc->en_count) {
 		mutex_unlock(&pc->sti_pwm_lock);
 		return;
 	}
-	regmap_field_write(pc->pwm_en, 0);
 
-	clk_disable(pc->clk);
+	regmap_field_write(pc->pwm_out_en, 0);
+
+	clk_disable(pc->pwm_clk);
+	clk_disable(pc->cpt_clk);
+
 	mutex_unlock(&pc->sti_pwm_lock);
 }
 
@@ -245,7 +313,90 @@
 	clear_bit(pwm->hwpwm, &pc->configured);
 }
 
+/*
+ * Measure period and duty cycle on a capture device; @timeout is in ms.
+ * Returns 0 (zeroed @result if no full cycle was seen) or a negative errno.
+ */
+static int sti_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm,
+			   struct pwm_capture *result, unsigned long timeout)
+{
+	struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
+	struct sti_pwm_compat_data *cdata = pc->cdata;
+	struct sti_cpt_ddata *ddata = pwm_get_chip_data(pwm);
+	struct device *dev = pc->dev;
+	unsigned int effective_ticks;
+	unsigned long long high, low;
+	int ret;
+
+	if (pwm->hwpwm >= cdata->cpt_num_devs) {
+		dev_err(dev, "device %u is not valid\n", pwm->hwpwm);
+		return -EINVAL;
+	}
+
+	mutex_lock(&ddata->lock);
+	ddata->index = 0;
+
+	/* Prepare capture measurement */
+	regmap_write(pc->regmap, PWM_CPT_EDGE(pwm->hwpwm), CPT_EDGE_RISING);
+	regmap_field_write(pc->pwm_cpt_int_en, BIT(pwm->hwpwm));
+
+	/* Enable capture */
+	ret = regmap_field_write(pc->pwm_cpt_en, 1);
+	if (ret) {
+		dev_err(dev, "failed to enable PWM capture %u: %d\n",
+			pwm->hwpwm, ret);
+		goto out;
+	}
+
+	ret = wait_event_interruptible_timeout(ddata->wait, ddata->index > 1,
+					       msecs_to_jiffies(timeout));
+	regmap_write(pc->regmap, PWM_CPT_EDGE(pwm->hwpwm), CPT_EDGE_DISABLED);
+	if (ret == -ERESTARTSYS)
+		goto out;
+	ret = 0;	/* remaining jiffies are not an error code */
+
+	switch (ddata->index) {
+	case 0:
+	case 1:
+		/*
+		 * No full cycle was captured: the input is constant, too slow
+		 * for the timeout, or absent. Report a frequency of 0.
+		 */
+		result->period = 0;
+		result->duty_cycle = 0;
+		break;
+
+	case 2:
+		/* We have everything we need */
+		high = ddata->snapshot[1] - ddata->snapshot[0];
+		low = ddata->snapshot[2] - ddata->snapshot[1];
+
+		effective_ticks = clk_get_rate(pc->cpt_clk);
+		if (!effective_ticks) {
+			ret = -EINVAL;
+			break;
+		}
+
+		result->period = (high + low) * NSEC_PER_SEC;
+		result->period /= effective_ticks;
+		result->duty_cycle = high * NSEC_PER_SEC;
+		result->duty_cycle /= effective_ticks;
+		break;
+
+	default:
+		dev_err(dev, "internal error\n");
+		break;
+	}
+
+out:
+	/* Disable capture */
+	regmap_field_write(pc->pwm_cpt_en, 0);
+	mutex_unlock(&ddata->lock);
+	return ret;
+}
+
 static const struct pwm_ops sti_pwm_ops = {
+	.capture = sti_pwm_capture,
 	.config = sti_pwm_config,
 	.enable = sti_pwm_enable,
 	.disable = sti_pwm_disable,
@@ -253,17 +404,98 @@
 	.owner = THIS_MODULE,
 };
 
+/*
+ * Capture interrupt handler: for every device flagged in the capture interrupt
+ * status, store the capture value and advance the capture state machine.
+ */
+static irqreturn_t sti_pwm_interrupt(int irq, void *data)
+{
+	struct sti_pwm_chip *pc = data;
+	struct device *dev = pc->dev;
+	struct sti_cpt_ddata *ddata;
+	int devicenum;
+	unsigned int cpt_int_stat, reg;
+	irqreturn_t ret = IRQ_NONE;
+
+	/* Never return an errno as irqreturn_t: a failed read is IRQ_NONE */
+	if (regmap_field_read(pc->pwm_cpt_int_stat, &cpt_int_stat))
+		return IRQ_NONE;
+
+	while (cpt_int_stat) {
+		devicenum = ffs(cpt_int_stat) - 1;
+		ddata = pwm_get_chip_data(&pc->chip.pwms[devicenum]);
+
+		/*
+		 * Capture input:
+		 *    _______                   _______
+		 *   |       |                 |       |
+		 * __|       |_________________|       |________
+		 *   ^0      ^1                ^2
+		 *
+		 * Capture starts on the first available rising edge. When a
+		 * capture event occurs, the capture value (CPT_VALx) is stored,
+		 * the index is incremented and the capture edge is toggled.
+		 *
+		 * After the capture, if the index > 1, we have collected the
+		 * necessary data so we signal the thread waiting for it and
+		 * disable the capture by setting the capture edge to none.
+		 */
+
+		regmap_read(pc->regmap, PWM_CPT_VAL(devicenum),
+			    &ddata->snapshot[ddata->index]);
+
+		switch (ddata->index) {
+		case 0:
+		case 1:
+			regmap_read(pc->regmap, PWM_CPT_EDGE(devicenum), &reg);
+			reg ^= PWM_CPT_EDGE_MASK;
+			regmap_write(pc->regmap, PWM_CPT_EDGE(devicenum), reg);
+
+			ddata->index++;
+			break;
+
+		case 2:
+			regmap_write(pc->regmap, PWM_CPT_EDGE(devicenum),
+				     CPT_EDGE_DISABLED);
+			wake_up(&ddata->wait);
+			break;
+
+		default:
+			dev_err(dev, "Internal error\n");
+			break;
+		}
+
+		cpt_int_stat &= ~BIT_MASK(devicenum);
+		ret = IRQ_HANDLED;
+	}
+
+	/* Just ACK everything */
+	regmap_write(pc->regmap, PWM_INT_ACK, PWM_INT_ACK_MASK);
+
+	return ret;
+}
+
 static int sti_pwm_probe_dt(struct sti_pwm_chip *pc)
 {
 	struct device *dev = pc->dev;
 	const struct reg_field *reg_fields;
 	struct device_node *np = dev->of_node;
 	struct sti_pwm_compat_data *cdata = pc->cdata;
-	u32 num_chan;
+	u32 num_devs;
+	int ret;
 
-	of_property_read_u32(np, "st,pwm-num-chan", &num_chan);
-	if (num_chan)
-		cdata->num_chan = num_chan;
+	ret = of_property_read_u32(np, "st,pwm-num-chan", &num_devs);
+	if (!ret)
+		cdata->pwm_num_devs = num_devs;
+
+	ret = of_property_read_u32(np, "st,capture-num-chan", &num_devs);
+	if (!ret)
+		cdata->cpt_num_devs = num_devs;
+
+	if (!cdata->pwm_num_devs && !cdata->cpt_num_devs) {
+		dev_err(dev, "No channels configured\n");
+		return -EINVAL;
+	}
 
 	reg_fields = cdata->reg_fields;
 
@@ -277,15 +509,26 @@
 	if (IS_ERR(pc->prescale_high))
 		return PTR_ERR(pc->prescale_high);
 
-	pc->pwm_en = devm_regmap_field_alloc(dev, pc->regmap,
-					     reg_fields[PWM_EN]);
-	if (IS_ERR(pc->pwm_en))
-		return PTR_ERR(pc->pwm_en);
 
-	pc->pwm_int_en = devm_regmap_field_alloc(dev, pc->regmap,
-						 reg_fields[PWM_INT_EN]);
-	if (IS_ERR(pc->pwm_int_en))
-		return PTR_ERR(pc->pwm_int_en);
+	pc->pwm_out_en = devm_regmap_field_alloc(dev, pc->regmap,
+						 reg_fields[PWM_OUT_EN]);
+	if (IS_ERR(pc->pwm_out_en))
+		return PTR_ERR(pc->pwm_out_en);
+
+	pc->pwm_cpt_en = devm_regmap_field_alloc(dev, pc->regmap,
+						 reg_fields[PWM_CPT_EN]);
+	if (IS_ERR(pc->pwm_cpt_en))
+		return PTR_ERR(pc->pwm_cpt_en);
+
+	pc->pwm_cpt_int_en = devm_regmap_field_alloc(dev, pc->regmap,
+						reg_fields[PWM_CPT_INT_EN]);
+	if (IS_ERR(pc->pwm_cpt_int_en))
+		return PTR_ERR(pc->pwm_cpt_int_en);
+
+	pc->pwm_cpt_int_stat = devm_regmap_field_alloc(dev, pc->regmap,
+						reg_fields[PWM_CPT_INT_STAT]);
+	if (IS_ERR(pc->pwm_cpt_int_stat))
+		return PTR_ERR(pc->pwm_cpt_int_stat);
 
 	return 0;
 }
@@ -302,7 +545,8 @@
 	struct sti_pwm_compat_data *cdata;
 	struct sti_pwm_chip *pc;
 	struct resource *res;
-	int ret;
+	unsigned int i;
+	int irq, ret;
 
 	pc = devm_kzalloc(dev, sizeof(*pc), GFP_KERNEL);
 	if (!pc)
@@ -323,14 +567,28 @@
 	if (IS_ERR(pc->regmap))
 		return PTR_ERR(pc->regmap);
 
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "Failed to obtain IRQ\n");
+		return irq;
+	}
+
+	ret = devm_request_irq(&pdev->dev, irq, sti_pwm_interrupt, 0,
+			       pdev->name, pc);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to request IRQ\n");
+		return ret;
+	}
+
 	/*
 	 * Setup PWM data with default values: some values could be replaced
 	 * with specific ones provided from Device Tree.
 	 */
-	cdata->reg_fields   = &sti_pwm_regfields[0];
+	cdata->reg_fields = sti_pwm_regfields;
 	cdata->max_prescale = 0xff;
-	cdata->max_pwm_cnt  = 255;
-	cdata->num_chan     = 1;
+	cdata->max_pwm_cnt = 255;
+	cdata->pwm_num_devs = 0;
+	cdata->cpt_num_devs = 0;
 
 	pc->cdata = cdata;
 	pc->dev = dev;
@@ -341,36 +599,64 @@
 	if (ret)
 		return ret;
 
-	pc->clk = of_clk_get_by_name(dev->of_node, "pwm");
-	if (IS_ERR(pc->clk)) {
+	if (!cdata->pwm_num_devs)
+		goto skip_pwm;
+
+	pc->pwm_clk = of_clk_get_by_name(dev->of_node, "pwm");
+	if (IS_ERR(pc->pwm_clk)) {
 		dev_err(dev, "failed to get PWM clock\n");
-		return PTR_ERR(pc->clk);
+		return PTR_ERR(pc->pwm_clk);
 	}
 
-	pc->clk_rate = clk_get_rate(pc->clk);
-	if (!pc->clk_rate) {
-		dev_err(dev, "failed to get clock rate\n");
-		return -EINVAL;
-	}
-
-	ret = clk_prepare(pc->clk);
+	ret = clk_prepare(pc->pwm_clk);
 	if (ret) {
 		dev_err(dev, "failed to prepare clock\n");
 		return ret;
 	}
 
+skip_pwm:
+	if (!cdata->cpt_num_devs)
+		goto skip_cpt;
+
+	pc->cpt_clk = of_clk_get_by_name(dev->of_node, "capture");
+	if (IS_ERR(pc->cpt_clk)) {
+		dev_err(dev, "failed to get PWM capture clock\n");
+		return PTR_ERR(pc->cpt_clk);
+	}
+
+	ret = clk_prepare(pc->cpt_clk);
+	if (ret) {
+		dev_err(dev, "failed to prepare clock\n");
+		return ret;
+	}
+
+skip_cpt:
 	pc->chip.dev = dev;
 	pc->chip.ops = &sti_pwm_ops;
 	pc->chip.base = -1;
-	pc->chip.npwm = pc->cdata->num_chan;
+	pc->chip.npwm = pc->cdata->pwm_num_devs;
 	pc->chip.can_sleep = true;
 
 	ret = pwmchip_add(&pc->chip);
 	if (ret < 0) {
-		clk_unprepare(pc->clk);
+		clk_unprepare(pc->pwm_clk);
+		clk_unprepare(pc->cpt_clk);
 		return ret;
 	}
 
+	for (i = 0; i < cdata->cpt_num_devs; i++) {
+		struct sti_cpt_ddata *ddata;
+
+		ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL);
+		if (!ddata)
+			return -ENOMEM;
+
+		init_waitqueue_head(&ddata->wait);
+		mutex_init(&ddata->lock);
+
+		pwm_set_chip_data(&pc->chip.pwms[i], ddata);
+	}
+
 	platform_set_drvdata(pdev, pc);
 
 	return 0;
@@ -381,10 +667,11 @@
 	struct sti_pwm_chip *pc = platform_get_drvdata(pdev);
 	unsigned int i;
 
-	for (i = 0; i < pc->cdata->num_chan; i++)
+	for (i = 0; i < pc->cdata->pwm_num_devs; i++)
 		pwm_disable(&pc->chip.pwms[i]);
 
-	clk_unprepare(pc->clk);
+	clk_unprepare(pc->pwm_clk);
+	clk_unprepare(pc->cpt_clk);
 
 	return pwmchip_remove(&pc->chip);
 }
diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c
index 03a99a5..b0803f6 100644
--- a/drivers/pwm/pwm-sun4i.c
+++ b/drivers/pwm/pwm-sun4i.c
@@ -284,6 +284,12 @@
 	.npwm = 2,
 };
 
+static const struct sun4i_pwm_data sun4i_pwm_data_h3 = {
+	.has_prescaler_bypass = true,
+	.has_rdy = true,
+	.npwm = 1,
+};
+
 static const struct of_device_id sun4i_pwm_dt_ids[] = {
 	{
 		.compatible = "allwinner,sun4i-a10-pwm",
@@ -298,6 +304,9 @@
 		.compatible = "allwinner,sun7i-a20-pwm",
 		.data = &sun4i_pwm_data_a20,
 	}, {
+		.compatible = "allwinner,sun8i-h3-pwm",
+		.data = &sun4i_pwm_data_h3,
+	}, {
 		/* sentinel */
 	},
 };
diff --git a/drivers/pwm/pwm-tipwmss.c b/drivers/pwm/pwm-tipwmss.c
index 829f499..7fa85a1 100644
--- a/drivers/pwm/pwm-tipwmss.c
+++ b/drivers/pwm/pwm-tipwmss.c
@@ -34,7 +34,6 @@
 	struct device_node *node = pdev->dev.of_node;
 
 	pm_runtime_enable(&pdev->dev);
-	pm_runtime_get_sync(&pdev->dev);
 
 	/* Populate all the child nodes here... */
 	ret = of_platform_populate(node, NULL, NULL, &pdev->dev);
@@ -46,31 +45,13 @@
 
 static int pwmss_remove(struct platform_device *pdev)
 {
-	pm_runtime_put_sync(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
 	return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int pwmss_suspend(struct device *dev)
-{
-	pm_runtime_put_sync(dev);
-	return 0;
-}
-
-static int pwmss_resume(struct device *dev)
-{
-	pm_runtime_get_sync(dev);
-	return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(pwmss_pm_ops, pwmss_suspend, pwmss_resume);
-
 static struct platform_driver pwmss_driver = {
 	.driver	= {
 		.name	= "pwmss",
-		.pm	= &pwmss_pm_ops,
 		.of_match_table	= pwmss_of_match,
 	},
 	.probe	= pwmss_probe,
diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c
index 04f7672..7a993b0 100644
--- a/drivers/pwm/pwm-twl.c
+++ b/drivers/pwm/pwm-twl.c
@@ -269,6 +269,22 @@
 		goto out;
 	}
 
+	val |= TWL6030_PWM_TOGGLE(pwm->hwpwm, TWL6030_PWMXEN);
+
+	ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG);
+	if (ret < 0) {
+		dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label);
+		goto out;
+	}
+
+	val &= ~TWL6030_PWM_TOGGLE(pwm->hwpwm, TWL6030_PWMXEN);
+
+	ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG);
+	if (ret < 0) {
+		dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label);
+		goto out;
+	}
+
 	twl->twl6030_toggle3 = val;
 out:
 	mutex_unlock(&twl->mutex);
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
index 18ed725..0296d81 100644
--- a/drivers/pwm/sysfs.c
+++ b/drivers/pwm/sysfs.c
@@ -409,6 +409,24 @@
 	}
 }
 
+/* Unexport every PWM of @chip that is still exported through sysfs. */
+void pwmchip_sysfs_unexport_children(struct pwm_chip *chip)
+{
+	struct device *parent;
+	unsigned int i;
+
+	parent = class_find_device(&pwm_class, NULL, chip,
+				   pwmchip_sysfs_match);
+	if (!parent)
+		return;
+
+	for (i = 0; i < chip->npwm; i++) {
+		if (test_bit(PWMF_EXPORTED, &chip->pwms[i].flags))
+			pwm_unexport_child(parent, &chip->pwms[i]);
+	}
+	put_device(parent);	/* drop the class_find_device() reference */
+}
+
 static int __init pwm_sysfs_init(void)
 {
 	return class_register(&pwm_class);
diff --git a/drivers/regulator/max8973-regulator.c b/drivers/regulator/max8973-regulator.c
index 3958f50..e0c747a 100644
--- a/drivers/regulator/max8973-regulator.c
+++ b/drivers/regulator/max8973-regulator.c
@@ -495,7 +495,8 @@
 {
 	struct max8973_chip *mchip = data;
 
-	thermal_zone_device_update(mchip->tz_device);
+	thermal_zone_device_update(mchip->tz_device,
+				   THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 2d702ca..a13541b 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -186,7 +186,7 @@
 
 config IMX_THERMAL
 	tristate "Temperature sensor driver for Freescale i.MX SoCs"
-	depends on CPU_THERMAL
+	depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST
 	depends on MFD_SYSCON
 	depends on OF
 	help
@@ -195,6 +195,26 @@
 	  cpufreq is used as the cooling device to throttle CPUs when the
 	  passive trip is crossed.
 
+config MAX77620_THERMAL
+	tristate "Temperature sensor driver for Maxim MAX77620 PMIC"
+	depends on MFD_MAX77620
+	depends on OF
+	help
+	  Support for the die junction temperature warning alarm of the Maxim
+	  Semiconductor MAX77620 PMIC. The device generates two alarm
+	  interrupts when the PMIC die temperature crosses the thresholds of
+	  120 degC and 140 degC.
+
+config QORIQ_THERMAL
+	tristate "QorIQ Thermal Monitoring Unit"
+	depends on THERMAL_OF
+	depends on HAS_IOMEM
+	help
+	  Support for Thermal Monitoring Unit (TMU) found on QorIQ platforms.
+	  It supports one critical trip point and one passive trip point. The
+	  cpufreq is used as the cooling device to throttle CPUs when the
+	  passive trip is crossed.
+
 config SPEAR_THERMAL
 	tristate "SPEAr thermal sensor driver"
 	depends on PLAT_SPEAR || COMPILE_TEST
@@ -332,6 +352,16 @@
 source drivers/thermal/int340x_thermal/Kconfig
 endmenu
 
+config INTEL_BXT_PMIC_THERMAL
+	tristate "Intel Broxton PMIC thermal driver"
+	depends on X86 && INTEL_SOC_PMIC && REGMAP
+	help
+	  Select this driver for Intel Broxton PMIC with ADC channels monitoring
+	  system temperature measurements and alerts.
+	  This driver is used for monitoring the ADC channels of PMIC and handles
+	  the alert trip point interrupts and notifies the thermal framework with
+	  the trip point and temperature details of the zone.
+
 config INTEL_PCH_THERMAL
 	tristate "Intel PCH Thermal Reporting Driver"
 	depends on X86 && PCI
@@ -399,4 +429,9 @@
 	  to this driver. This driver reports the temperature by reading ADC
 	  channel and converts it to temperature based on lookup table.
 
+menu "Qualcomm thermal drivers"
+depends on (ARCH_QCOM && OF) || COMPILE_TEST
+source "drivers/thermal/qcom/Kconfig"
+endmenu
+
 endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 10b07c1..c92eb22 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -37,6 +37,8 @@
 obj-$(CONFIG_ARMADA_THERMAL)	+= armada_thermal.o
 obj-$(CONFIG_TANGO_THERMAL)	+= tango_thermal.o
 obj-$(CONFIG_IMX_THERMAL)	+= imx_thermal.o
+obj-$(CONFIG_MAX77620_THERMAL)	+= max77620_thermal.o
+obj-$(CONFIG_QORIQ_THERMAL)	+= qoriq_thermal.o
 obj-$(CONFIG_DB8500_CPUFREQ_COOLING)	+= db8500_cpufreq_cooling.o
 obj-$(CONFIG_INTEL_POWERCLAMP)	+= intel_powerclamp.o
 obj-$(CONFIG_X86_PKG_TEMP_THERMAL)	+= x86_pkg_temp_thermal.o
@@ -45,8 +47,10 @@
 obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL)	+= intel_quark_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)	+= ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
+obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o
 obj-$(CONFIG_INTEL_PCH_THERMAL)	+= intel_pch_thermal.o
 obj-$(CONFIG_ST_THERMAL)	+= st/
+obj-$(CONFIG_QCOM_TSENS)	+= qcom/
 obj-$(CONFIG_TEGRA_SOCTHERM)	+= tegra/
 obj-$(CONFIG_HISI_THERMAL)     += hisi_thermal.o
 obj-$(CONFIG_MTK_THERMAL)	+= mtk_thermal.o
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index a32b417..9ce0e9e 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -74,7 +74,7 @@
  *	cpufreq frequencies.
  * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
  * @node: list_head to link all cpufreq_cooling_device together.
- * @last_load: load measured by the latest call to cpufreq_get_actual_power()
+ * @last_load: load measured by the latest call to cpufreq_get_requested_power()
  * @time_in_idle: previous reading of the absolute time that this cpu was idle
  * @time_in_idle_timestamp: wall time of the last invocation of
  *	get_cpu_idle_time_us()
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 652acd8..e776cea 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -306,7 +306,7 @@
 	if (cur_mode == THERMAL_DEVICE_DISABLED)
 		return;
 
-	thermal_zone_device_update(pzone->therm_dev);
+	thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED);
 	dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n");
 }
 
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 01f0015..81631b1 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -312,7 +312,7 @@
 	unsigned long freq;
 	u32 static_power;
 
-	if (state < 0 || state >= dfc->freq_table_size)
+	if (state >= dfc->freq_table_size)
 		return -EINVAL;
 
 	freq = dfc->freq_table[state];
diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index bb118a1..fc5e505 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -65,7 +65,7 @@
 		if (instance->target == 0 && tz->temperature >= trip_temp)
 			instance->target = 1;
 		else if (instance->target == 1 &&
-				tz->temperature < trip_temp - trip_hyst)
+				tz->temperature <= trip_temp - trip_hyst)
 			instance->target = 0;
 
 		dev_dbg(&instance->cdev->device, "target=%d\n",
diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index 97fad8f..f642966 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -237,7 +237,8 @@
 		if (!data->sensors[i].tzd)
 			continue;
 
-		thermal_zone_device_update(data->sensors[i].tzd);
+		thermal_zone_device_update(data->sensors[i].tzd,
+					   THERMAL_EVENT_UNSPECIFIED);
 	}
 
 	return IRQ_HANDLED;
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index e473548..06912f0 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -246,7 +246,7 @@
 	}
 
 	data->mode = mode;
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return 0;
 }
@@ -457,7 +457,7 @@
 	dev_dbg(&data->tz->device, "THERMAL ALARM: T > %d\n",
 		data->alarm_temp / 1000);
 
-	thermal_zone_device_update(data->tz);
+	thermal_zone_device_update(data->tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/int340x_thermal/int3402_thermal.c b/drivers/thermal/int340x_thermal/int3402_thermal.c
index 69df3d9..8e90b31 100644
--- a/drivers/thermal/int340x_thermal/int3402_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3402_thermal.c
@@ -35,7 +35,8 @@
 	case INT3402_PERF_CHANGED_EVENT:
 		break;
 	case INT3402_THERMAL_EVENT:
-		int340x_thermal_zone_device_update(priv->int340x_zone);
+		int340x_thermal_zone_device_update(priv->int340x_zone,
+						   THERMAL_TRIP_VIOLATED);
 		break;
 	default:
 		break;
diff --git a/drivers/thermal/int340x_thermal/int3403_thermal.c b/drivers/thermal/int340x_thermal/int3403_thermal.c
index 50a7a08..c4890c9 100644
--- a/drivers/thermal/int340x_thermal/int3403_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3403_thermal.c
@@ -25,6 +25,7 @@
 #define INT3403_TYPE_CHARGER		0x0B
 #define INT3403_TYPE_BATTERY		0x0C
 #define INT3403_PERF_CHANGED_EVENT	0x80
+#define INT3403_PERF_TRIP_POINT_CHANGED	0x81
 #define INT3403_THERMAL_EVENT		0x90
 
 /* Preserved structure for future expandbility */
@@ -72,7 +73,13 @@
 	case INT3403_PERF_CHANGED_EVENT:
 		break;
 	case INT3403_THERMAL_EVENT:
-		int340x_thermal_zone_device_update(obj->int340x_zone);
+		int340x_thermal_zone_device_update(obj->int340x_zone,
+						   THERMAL_TRIP_VIOLATED);
+		break;
+	case INT3403_PERF_TRIP_POINT_CHANGED:
+		int340x_thermal_read_trips(obj->int340x_zone);
+		int340x_thermal_zone_device_update(obj->int340x_zone,
+						   THERMAL_TRIP_CHANGED);
 		break;
 	default:
 		dev_err(&priv->pdev->dev, "Unsupported event [0x%x]\n", event);
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
index b9b2666..145a5c53 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
@@ -177,6 +177,42 @@
 	return 0;
 }
 
+int int340x_thermal_read_trips(struct int34x_thermal_zone *int34x_zone)
+{
+	int trip_cnt = int34x_zone->aux_trip_nr;
+	int i;
+
+	int34x_zone->crt_trip_id = -1;
+	if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_CRT",
+					     &int34x_zone->crt_temp))
+		int34x_zone->crt_trip_id = trip_cnt++;
+
+	int34x_zone->hot_trip_id = -1;
+	if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_HOT",
+					     &int34x_zone->hot_temp))
+		int34x_zone->hot_trip_id = trip_cnt++;
+
+	int34x_zone->psv_trip_id = -1;
+	if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_PSV",
+					     &int34x_zone->psv_temp))
+		int34x_zone->psv_trip_id = trip_cnt++;
+
+	for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) {
+		char name[5] = { '_', 'A', 'C', '0' + i, '\0' };
+
+		if (int340x_thermal_get_trip_config(int34x_zone->adev->handle,
+					name,
+					&int34x_zone->act_trips[i].temp))
+			break;
+
+		int34x_zone->act_trips[i].id = trip_cnt++;
+		int34x_zone->act_trips[i].valid = true;
+	}
+
+	return trip_cnt;
+}
+EXPORT_SYMBOL_GPL(int340x_thermal_read_trips);
+
 static struct thermal_zone_params int340x_thermal_params = {
 	.governor_name = "user_space",
 	.no_hwmon = true,
@@ -188,7 +224,7 @@
 	struct int34x_thermal_zone *int34x_thermal_zone;
 	acpi_status status;
 	unsigned long long trip_cnt;
-	int trip_mask = 0, i;
+	int trip_mask = 0;
 	int ret;
 
 	int34x_thermal_zone = kzalloc(sizeof(*int34x_thermal_zone),
@@ -214,28 +250,8 @@
 		int34x_thermal_zone->aux_trip_nr = trip_cnt;
 	}
 
-	int34x_thermal_zone->crt_trip_id = -1;
-	if (!int340x_thermal_get_trip_config(adev->handle, "_CRT",
-					     &int34x_thermal_zone->crt_temp))
-		int34x_thermal_zone->crt_trip_id = trip_cnt++;
-	int34x_thermal_zone->hot_trip_id = -1;
-	if (!int340x_thermal_get_trip_config(adev->handle, "_HOT",
-					     &int34x_thermal_zone->hot_temp))
-		int34x_thermal_zone->hot_trip_id = trip_cnt++;
-	int34x_thermal_zone->psv_trip_id = -1;
-	if (!int340x_thermal_get_trip_config(adev->handle, "_PSV",
-					     &int34x_thermal_zone->psv_temp))
-		int34x_thermal_zone->psv_trip_id = trip_cnt++;
-	for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) {
-		char name[5] = { '_', 'A', 'C', '0' + i, '\0' };
+	trip_cnt = int340x_thermal_read_trips(int34x_thermal_zone);
 
-		if (int340x_thermal_get_trip_config(adev->handle, name,
-				&int34x_thermal_zone->act_trips[i].temp))
-			break;
-
-		int34x_thermal_zone->act_trips[i].id = trip_cnt++;
-		int34x_thermal_zone->act_trips[i].valid = true;
-	}
 	int34x_thermal_zone->lpat_table = acpi_lpat_get_conversion_table(
 								adev->handle);
 
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
index aaadf72..5f3ba47 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
@@ -46,6 +46,7 @@
 struct int34x_thermal_zone *int340x_thermal_zone_add(struct acpi_device *,
 				struct thermal_zone_device_ops *override_ops);
 void int340x_thermal_zone_remove(struct int34x_thermal_zone *);
+int int340x_thermal_read_trips(struct int34x_thermal_zone *int34x_zone);
 
 static inline void int340x_thermal_zone_set_priv_data(
 			struct int34x_thermal_zone *tzone, void *priv_data)
@@ -60,9 +61,10 @@
 }
 
 static inline void int340x_thermal_zone_device_update(
-			struct int34x_thermal_zone *tzone)
+					struct int34x_thermal_zone *tzone,
+					enum thermal_notify_event event)
 {
-	thermal_zone_device_update(tzone->zone);
+	thermal_zone_device_update(tzone->zone, event);
 }
 
 #endif
diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c
index 42c1ac0..ff3b36f 100644
--- a/drivers/thermal/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/int340x_thermal/processor_thermal_device.c
@@ -258,7 +258,8 @@
 	switch (event) {
 	case PROC_POWER_CAPABILITY_CHANGED:
 		proc_thermal_read_ppcc(proc_priv);
-		int340x_thermal_zone_device_update(proc_priv->int340x_zone);
+		int340x_thermal_zone_device_update(proc_priv->int340x_zone,
+				THERMAL_DEVICE_POWER_CAPABILITY_CHANGED);
 		break;
 	default:
 		dev_err(proc_priv->dev, "Unsupported event [0x%x]\n", event);
diff --git a/drivers/thermal/intel_bxt_pmic_thermal.c b/drivers/thermal/intel_bxt_pmic_thermal.c
new file mode 100644
index 0000000..0f19a393
--- /dev/null
+++ b/drivers/thermal/intel_bxt_pmic_thermal.c
@@ -0,0 +1,300 @@
+/*
+ * Intel Broxton PMIC thermal driver
+ *
+ * Copyright (C) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/thermal.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/mfd/intel_soc_pmic.h>
+
+#define BXTWC_THRM0IRQ		0x4E04
+#define BXTWC_THRM1IRQ		0x4E05
+#define BXTWC_THRM2IRQ		0x4E06
+#define BXTWC_MTHRM0IRQ		0x4E12
+#define BXTWC_MTHRM1IRQ		0x4E13
+#define BXTWC_MTHRM2IRQ		0x4E14
+#define BXTWC_STHRM0IRQ		0x4F19
+#define BXTWC_STHRM1IRQ		0x4F1A
+#define BXTWC_STHRM2IRQ		0x4F1B
+
+struct trip_config_map {
+	u16 irq_reg;
+	u16 irq_en;
+	u16 evt_stat;
+	u8 irq_mask;
+	u8 irq_en_mask;
+	u8 evt_mask;
+	u8 trip_num;
+};
+
+struct thermal_irq_map {
+	char handle[20];
+	int num_trips;
+	const struct trip_config_map *trip_config;
+};
+
+struct pmic_thermal_data {
+	const struct thermal_irq_map *maps;
+	int num_maps;
+};
+
+static const struct trip_config_map bxtwc_str0_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x01,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x01,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x01,
+		.trip_num = 0
+	},
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x10,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x10,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x10,
+		.trip_num = 1
+	}
+};
+
+static const struct trip_config_map bxtwc_str1_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x02,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x02,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x02,
+		.trip_num = 0
+	},
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x20,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x20,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x20,
+		.trip_num = 1
+	},
+};
+
+static const struct trip_config_map bxtwc_str2_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x04,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x04,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x04,
+		.trip_num = 0
+	},
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x40,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x40,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x40,
+		.trip_num = 1
+	},
+};
+
+static const struct trip_config_map bxtwc_str3_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM2IRQ,
+		.irq_mask = 0x10,
+		.irq_en = BXTWC_MTHRM2IRQ,
+		.irq_en_mask = 0x10,
+		.evt_stat = BXTWC_STHRM2IRQ,
+		.evt_mask = 0x10,
+		.trip_num = 0
+	},
+};
+
+static const struct thermal_irq_map bxtwc_thermal_irq_map[] = {
+	{
+		.handle = "STR0",
+		.trip_config = bxtwc_str0_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str0_trip_config),
+	},
+	{
+		.handle = "STR1",
+		.trip_config = bxtwc_str1_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str1_trip_config),
+	},
+	{
+		.handle = "STR2",
+		.trip_config = bxtwc_str2_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str2_trip_config),
+	},
+	{
+		.handle = "STR3",
+		.trip_config = bxtwc_str3_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str3_trip_config),
+	},
+};
+
+static const struct pmic_thermal_data bxtwc_thermal_data = {
+	.maps = bxtwc_thermal_irq_map,
+	.num_maps = ARRAY_SIZE(bxtwc_thermal_irq_map),
+};
+
+/*
+ * Threaded handler for the PMIC thermal IRQs: for every trip whose irq_mask
+ * bit is set in its irq_reg, update the named thermal zone and ack the bit.
+ */
+static irqreturn_t pmic_thermal_irq_handler(int irq, void *data)
+{
+	struct platform_device *pdev = data;
+	struct intel_soc_pmic *pmic = dev_get_drvdata(pdev->dev.parent);
+	struct regmap *regmap = pmic->regmap;
+	const struct pmic_thermal_data *td;
+	struct thermal_zone_device *tzd;
+	u16 reg, evt_stat_reg;
+	unsigned int val;	/* regmap_read() stores into an unsigned int */
+	u8 reg_val, mask;
+	int i, j;
+
+	td = (const struct pmic_thermal_data *)
+		platform_get_device_id(pdev)->driver_data;
+
+	/* Resolve thermal irqs */
+	for (i = 0; i < td->num_maps; i++) {
+		for (j = 0; j < td->maps[i].num_trips; j++) {
+			reg = td->maps[i].trip_config[j].irq_reg;
+			mask = td->maps[i].trip_config[j].irq_mask;
+			/*
+			 * Read the irq register to resolve whether the
+			 * interrupt was triggered for this sensor
+			 */
+			if (regmap_read(regmap, reg, &val))
+				return IRQ_HANDLED;
+
+			reg_val = (u8)val;
+			if (!(reg_val & mask))
+				continue;
+
+			/*
+			 * Read the status register that tells whether a high
+			 * or a low event occurred; the value is not used yet
+			 */
+			evt_stat_reg = td->maps[i].trip_config[j].evt_stat;
+			if (regmap_read(regmap, evt_stat_reg, &val))
+				return IRQ_HANDLED;
+
+			tzd = thermal_zone_get_zone_by_name(td->maps[i].handle);
+			if (!IS_ERR(tzd))
+				thermal_zone_device_update(tzd,
+						THERMAL_EVENT_UNSPECIFIED);
+
+			/* Clear the appropriate irq */
+			regmap_write(regmap, reg, reg_val & mask);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* Request all thermal IRQs of the device, then clear each trip's irq_en bit */
+static int pmic_thermal_probe(struct platform_device *pdev)
+{
+	struct regmap_irq_chip_data *regmap_irq_chip;
+	const struct pmic_thermal_data *thermal_data;
+	int ret, irq, virq, i, j, pmic_irq_count;
+	struct device *dev = &pdev->dev;
+	struct intel_soc_pmic *pmic;
+	struct regmap *regmap;
+	u16 reg;
+	u8 mask;
+
+	pmic = dev_get_drvdata(pdev->dev.parent);
+	if (!pmic) {
+		dev_err(dev, "Failed to get struct intel_soc_pmic pointer\n");
+		return -ENODEV;
+	}
+
+	thermal_data = (const struct pmic_thermal_data *)
+				platform_get_device_id(pdev)->driver_data;
+	if (!thermal_data) {
+		dev_err(dev, "No thermal data initialized!!\n");
+		return -ENODEV;
+	}
+
+	regmap = pmic->regmap;
+	regmap_irq_chip = pmic->irq_chip_data_level2;
+
+	pmic_irq_count = 0;
+	while ((irq = platform_get_irq(pdev, pmic_irq_count)) != -ENXIO) {
+		if (irq < 0)	/* any other error is not a usable irq index */
+			return irq;
+		virq = regmap_irq_get_virq(regmap_irq_chip, irq);
+		if (virq < 0) {
+			dev_err(dev, "failed to get virq by irq %d\n", irq);
+			return virq;
+		}
+		ret = devm_request_threaded_irq(&pdev->dev, virq,
+				NULL, pmic_thermal_irq_handler,
+				IRQF_ONESHOT, "pmic_thermal", pdev);
+		if (ret) {
+			dev_err(dev, "request irq(%d) failed: %d\n", virq, ret);
+			return ret;
+		}
+		pmic_irq_count++;
+	}
+
+	/* Enable thermal interrupts: clear irq_en_mask in every irq_en register */
+	for (i = 0; i < thermal_data->num_maps; i++) {
+		for (j = 0; j < thermal_data->maps[i].num_trips; j++) {
+			reg = thermal_data->maps[i].trip_config[j].irq_en;
+			mask = thermal_data->maps[i].trip_config[j].irq_en_mask;
+			ret = regmap_update_bits(regmap, reg, mask, 0x00);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static const struct platform_device_id pmic_thermal_id_table[] = {
+	{
+		.name = "bxt_wcove_thermal",
+		.driver_data = (kernel_ulong_t)&bxtwc_thermal_data,
+	},
+	{},
+};
+
+static struct platform_driver pmic_thermal_driver = {
+	.probe = pmic_thermal_probe,
+	.driver = {
+		.name = "pmic_thermal",
+	},
+	.id_table = pmic_thermal_id_table,
+};
+
+MODULE_DEVICE_TABLE(platform, pmic_thermal_id_table);
+module_platform_driver(pmic_thermal_driver);
+
+MODULE_AUTHOR("Yegnesh S Iyer <yegnesh.s.iyer@intel.com>");
+MODULE_DESCRIPTION("Intel Broxton PMIC Thermal Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/intel_soc_dts_iosf.c b/drivers/thermal/intel_soc_dts_iosf.c
index f72e1db..e0813df 100644
--- a/drivers/thermal/intel_soc_dts_iosf.c
+++ b/drivers/thermal/intel_soc_dts_iosf.c
@@ -391,7 +391,8 @@
 
 		for (i = 0; i < SOC_MAX_DTS_SENSORS; ++i) {
 			pr_debug("TZD update for zone %d\n", i);
-			thermal_zone_device_update(sensors->soc_dts[i].tzone);
+			thermal_zone_device_update(sensors->soc_dts[i].tzone,
+						   THERMAL_EVENT_UNSPECIFIED);
 		}
 	} else
 		spin_unlock_irqrestore(&sensors->intr_notify_lock, flags);
diff --git a/drivers/thermal/max77620_thermal.c b/drivers/thermal/max77620_thermal.c
new file mode 100644
index 0000000..83905ff
--- /dev/null
+++ b/drivers/thermal/max77620_thermal.c
@@ -0,0 +1,166 @@
+/*
+ * Junction temperature thermal driver for Maxim Max77620.
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Author: Laxman Dewangan <ldewangan@nvidia.com>
+ *	   Mallikarjun Kasoju <mkasoju@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/max77620.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#define MAX77620_NORMAL_OPERATING_TEMP	100000
+#define MAX77620_TJALARM1_TEMP		120000
+#define MAX77620_TJALARM2_TEMP		140000
+
+struct max77620_therm_info {
+	struct device			*dev;
+	struct regmap			*rmap;
+	struct thermal_zone_device	*tz_device;
+	int				irq_tjalarm1;
+	int				irq_tjalarm2;
+};
+
+/**
+ * max77620_thermal_read_temp - Read PMIC die temperature.
+ * @data:	Device specific data.
+ * @temp:	Temperature in millidegrees Celsius
+ *
+ * The actual temperature of PMIC die is not available from PMIC.
+ * PMIC only tells whether the die has crossed the threshold level
+ * of 120degC or 140degC.
+ * If no threshold has been crossed then assume die temperature as 100degC
+ * else 120degC or 140degC based on the PMIC die temp threshold status.
+ *
+ * Return: 0 on success, otherwise the negative error from the register read.
+ */
+static int max77620_thermal_read_temp(void *data, int *temp)
+{
+	struct max77620_therm_info *mtherm = data;
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(mtherm->rmap, MAX77620_REG_STATLBT, &val);
+	if (ret < 0) {
+		dev_err(mtherm->dev, "Failed to read STATLBT: %d\n", ret);
+		return ret;
+	}
+
+	/* Check the higher alarm first so 140degC wins when both bits are set */
+	if (val & MAX77620_IRQ_TJALRM2_MASK)
+		*temp = MAX77620_TJALARM2_TEMP;
+	else if (val & MAX77620_IRQ_TJALRM1_MASK)
+		*temp = MAX77620_TJALARM1_TEMP;
+	else
+		*temp = MAX77620_NORMAL_OPERATING_TEMP;
+
+	return 0;
+}
+
+static const struct thermal_zone_of_device_ops max77620_thermal_ops = {
+	.get_temp = max77620_thermal_read_temp,
+};
+
+static irqreturn_t max77620_thermal_irq(int irq, void *data)
+{
+	struct max77620_therm_info *mtherm = data;
+
+	if (irq == mtherm->irq_tjalarm1)
+		dev_warn(mtherm->dev, "Junction Temp Alarm1(120C) occurred\n");
+	else if (irq == mtherm->irq_tjalarm2)
+		dev_crit(mtherm->dev, "Junction Temp Alarm2(140C) occurred\n");
+
+	thermal_zone_device_update(mtherm->tz_device,
+				   THERMAL_EVENT_UNSPECIFIED);
+
+	return IRQ_HANDLED;
+}
+
+static int max77620_thermal_probe(struct platform_device *pdev)
+{
+	struct max77620_therm_info *mtherm;
+	int ret;
+
+	mtherm = devm_kzalloc(&pdev->dev, sizeof(*mtherm), GFP_KERNEL);
+	if (!mtherm)
+		return -ENOMEM;
+
+	mtherm->irq_tjalarm1 = platform_get_irq(pdev, 0);
+	mtherm->irq_tjalarm2 = platform_get_irq(pdev, 1);
+	if ((mtherm->irq_tjalarm1 < 0) || (mtherm->irq_tjalarm2 < 0)) {
+		dev_err(&pdev->dev, "Alarm irq number not available\n");
+		return -EINVAL;
+	}
+
+	pdev->dev.of_node = pdev->dev.parent->of_node;
+
+	mtherm->dev = &pdev->dev;
+	mtherm->rmap = dev_get_regmap(pdev->dev.parent, NULL);
+	if (!mtherm->rmap) {
+		dev_err(&pdev->dev, "Failed to get parent regmap\n");
+		return -ENODEV;
+	}
+
+	mtherm->tz_device = devm_thermal_zone_of_sensor_register(&pdev->dev, 0,
+				mtherm, &max77620_thermal_ops);
+	if (IS_ERR(mtherm->tz_device)) {
+		ret = PTR_ERR(mtherm->tz_device);
+		dev_err(&pdev->dev, "Failed to register thermal zone: %d\n",
+			ret);
+		return ret;
+	}
+
+	ret = devm_request_threaded_irq(&pdev->dev, mtherm->irq_tjalarm1, NULL,
+					max77620_thermal_irq,
+					IRQF_ONESHOT | IRQF_SHARED,
+					dev_name(&pdev->dev), mtherm);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to request irq1: %d\n", ret);
+		return ret;
+	}
+
+	ret = devm_request_threaded_irq(&pdev->dev, mtherm->irq_tjalarm2, NULL,
+					max77620_thermal_irq,
+					IRQF_ONESHOT | IRQF_SHARED,
+					dev_name(&pdev->dev), mtherm);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to request irq2: %d\n", ret);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, mtherm);
+
+	return 0;
+}
+
+/* Platform id table; MODULE_DEVICE_TABLE() emits the aliases for autoloading */
+static const struct platform_device_id max77620_thermal_devtype[] = {
+	{ .name = "max77620-thermal", },
+	{},
+};
+MODULE_DEVICE_TABLE(platform, max77620_thermal_devtype);
+
+static struct platform_driver max77620_thermal_driver = {
+	.driver = { .name = "max77620-thermal", },
+	.probe = max77620_thermal_probe,
+	.id_table = max77620_thermal_devtype,
+};
+
+module_platform_driver(max77620_thermal_driver);
+
+MODULE_DESCRIPTION("Max77620 Junction temperature Thermal driver");
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
+MODULE_AUTHOR("Mallikarjun Kasoju <mkasoju@nvidia.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c
index 262ab0a..34169c3 100644
--- a/drivers/thermal/mtk_thermal.c
+++ b/drivers/thermal/mtk_thermal.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2015 MediaTek Inc.
  * Author: Hanyi Wu <hanyi.wu@mediatek.com>
  *         Sascha Hauer <s.hauer@pengutronix.de>
+ *         Dawei Chien <dawei.chien@mediatek.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,6 +22,7 @@
 #include <linux/nvmem-consumer.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/io.h>
@@ -88,6 +90,7 @@
 #define TEMP_ADCVALIDMASK_VALID_HIGH		BIT(5)
 #define TEMP_ADCVALIDMASK_VALID_POS(bit)	(bit)
 
+/* MT8173 thermal sensors */
 #define MT8173_TS1	0
 #define MT8173_TS2	1
 #define MT8173_TS3	2
@@ -106,7 +109,12 @@
 /* The number of sensing points per bank */
 #define MT8173_NUM_SENSORS_PER_ZONE	4
 
-/* Layout of the fuses providing the calibration data */
+/*
+ * Layout of the fuses providing the calibration data
+ * These macros could be used for both MT8173 and MT2701.
+ * MT8173 has five sensors and need five VTS calibration data,
+ * and MT2701 has three sensors and need three VTS calibration data.
+ */
 #define MT8173_CALIB_BUF0_VALID		BIT(0)
 #define MT8173_CALIB_BUF1_ADC_GE(x)	(((x) >> 22) & 0x3ff)
 #define MT8173_CALIB_BUF0_VTS_TS1(x)	(((x) >> 17) & 0x1ff)
@@ -117,24 +125,50 @@
 #define MT8173_CALIB_BUF0_DEGC_CALI(x)	(((x) >> 1) & 0x3f)
 #define MT8173_CALIB_BUF0_O_SLOPE(x)	(((x) >> 26) & 0x3f)
 
+/* MT2701 thermal sensors */
+#define MT2701_TS1	0
+#define MT2701_TS2	1
+#define MT2701_TSABB	2
+
+/* AUXADC channel 11 is used for the temperature sensors */
+#define MT2701_TEMP_AUXADC_CHANNEL	11
+
+/* The total number of temperature sensors in the MT2701 */
+#define MT2701_NUM_SENSORS	3
+
 #define THERMAL_NAME    "mtk-thermal"
 
+/* The number of sensing points per bank */
+#define MT2701_NUM_SENSORS_PER_ZONE	3
+
 struct mtk_thermal;
 
+struct thermal_bank_cfg {
+	unsigned int num_sensors;
+	const int *sensors;
+};
+
 struct mtk_thermal_bank {
 	struct mtk_thermal *mt;
 	int id;
 };
 
+struct mtk_thermal_data {
+	s32 num_banks;
+	s32 num_sensors;
+	s32 auxadc_channel;
+	const int *sensor_mux_values;
+	const int *msr;
+	const int *adcpnp;
+	struct thermal_bank_cfg bank_data[];
+};
+
 struct mtk_thermal {
 	struct device *dev;
 	void __iomem *thermal_base;
 
 	struct clk *clk_peri_therm;
 	struct clk *clk_auxadc;
-
-	struct mtk_thermal_bank banks[MT8173_NUM_ZONES];
-
 	/* lock: for getting and putting banks */
 	struct mutex lock;
 
@@ -144,16 +178,44 @@
 	s32 o_slope;
 	s32 vts[MT8173_NUM_SENSORS];
 
+	const struct mtk_thermal_data *conf;
+	struct mtk_thermal_bank banks[MT8173_NUM_ZONES]; /* max num_banks */
 };
 
-struct mtk_thermal_bank_cfg {
-	unsigned int num_sensors;
-	unsigned int sensors[MT8173_NUM_SENSORS_PER_ZONE];
+/* MT8173 thermal sensor data: the sensors sampled by each of the banks */
+static const int mt8173_bank_data[MT8173_NUM_ZONES][3] = {
+	{ MT8173_TS2, MT8173_TS3 },
+	{ MT8173_TS2, MT8173_TS4 },
+	{ MT8173_TS1, MT8173_TS2, MT8173_TSABB },
+	{ MT8173_TS2 },
+};
 
-static const int sensor_mux_values[MT8173_NUM_SENSORS] = { 0, 1, 2, 3, 16 };
+static const int mt8173_msr[MT8173_NUM_SENSORS_PER_ZONE] = {
+	TEMP_MSR0, TEMP_MSR1, TEMP_MSR2, TEMP_MSR3 /* one per sense point */
+};
 
-/*
+static const int mt8173_adcpnp[MT8173_NUM_SENSORS_PER_ZONE] = {
+	TEMP_ADCPNP0, TEMP_ADCPNP1, TEMP_ADCPNP2, TEMP_ADCPNP3
+};
+
+static const int mt8173_mux_values[MT8173_NUM_SENSORS] = { 0, 1, 2, 3, 16 };
+
+/* MT2701 thermal sensor data (single bank, three sensing points) */
+static const int mt2701_bank_data[MT2701_NUM_SENSORS] = {
+	MT2701_TS1, MT2701_TS2, MT2701_TSABB
+};
+
+static const int mt2701_msr[MT2701_NUM_SENSORS_PER_ZONE] = {
+	TEMP_MSR0, TEMP_MSR1, TEMP_MSR2
+};
+
+static const int mt2701_adcpnp[MT2701_NUM_SENSORS_PER_ZONE] = {
+	TEMP_ADCPNP0, TEMP_ADCPNP1, TEMP_ADCPNP2
+};
+
+static const int mt2701_mux_values[MT2701_NUM_SENSORS] = { 0, 1, 16 };
+
+/*
  * The MT8173 thermal controller has four banks. Each bank can read up to
  * four temperature sensors simultaneously. The MT8173 has a total of 5
  * temperature sensors. We use each bank to measure a certain area of the
@@ -166,42 +228,53 @@
  * data, and this indeed needs the temperatures of the individual banks
  * for making better decisions.
  */
-static const struct mtk_thermal_bank_cfg bank_data[] = {
-	{
-		.num_sensors = 2,
-		.sensors = { MT8173_TS2, MT8173_TS3 },
-	}, {
-		.num_sensors = 2,
-		.sensors = { MT8173_TS2, MT8173_TS4 },
-	}, {
-		.num_sensors = 3,
-		.sensors = { MT8173_TS1, MT8173_TS2, MT8173_TSABB },
-	}, {
-		.num_sensors = 1,
-		.sensors = { MT8173_TS2 },
+static const struct mtk_thermal_data mt8173_thermal_data = {
+	.auxadc_channel = MT8173_TEMP_AUXADC_CHANNEL,
+	.num_banks = MT8173_NUM_ZONES,
+	.num_sensors = MT8173_NUM_SENSORS,
+	.bank_data = {
+		{
+			.num_sensors = 2,
+			.sensors = mt8173_bank_data[0],
+		}, {
+			.num_sensors = 2,
+			.sensors = mt8173_bank_data[1],
+		}, {
+			.num_sensors = 3,
+			.sensors = mt8173_bank_data[2],
+		}, {
+			.num_sensors = 1,
+			.sensors = mt8173_bank_data[3],
+		},
 	},
+	.msr = mt8173_msr,
+	.adcpnp = mt8173_adcpnp,
+	.sensor_mux_values = mt8173_mux_values,
 };
 
-struct mtk_thermal_sense_point {
-	int msr;
-	int adcpnp;
-};
-
-static const struct mtk_thermal_sense_point
-		sensing_points[MT8173_NUM_SENSORS_PER_ZONE] = {
-	{
-		.msr = TEMP_MSR0,
-		.adcpnp = TEMP_ADCPNP0,
-	}, {
-		.msr = TEMP_MSR1,
-		.adcpnp = TEMP_ADCPNP1,
-	}, {
-		.msr = TEMP_MSR2,
-		.adcpnp = TEMP_ADCPNP2,
-	}, {
-		.msr = TEMP_MSR3,
-		.adcpnp = TEMP_ADCPNP3,
+/*
+ * The MT2701 thermal controller has one bank, which can read up to
+ * three temperature sensors simultaneously. The MT2701 has a total of 3
+ * temperature sensors.
+ *
+ * The thermal core only gets the maximum temperature of this one bank,
+ * so the bank concept wouldn't be necessary here. However, the SVS (Smart
+ * Voltage Scaling) unit makes its decisions based on the same bank
+ * data.
+ */
+static const struct mtk_thermal_data mt2701_thermal_data = {
+	.auxadc_channel = MT2701_TEMP_AUXADC_CHANNEL,
+	.num_banks = 1,
+	.num_sensors = MT2701_NUM_SENSORS,
+	.bank_data = {
+		{
+			.num_sensors = 3,
+			.sensors = mt2701_bank_data,
+		},
 	},
+	.msr = mt2701_msr,
+	.adcpnp = mt2701_adcpnp,
+	.sensor_mux_values = mt2701_mux_values,
 };
 
 /**
@@ -270,13 +343,16 @@
 static int mtk_thermal_bank_temperature(struct mtk_thermal_bank *bank)
 {
 	struct mtk_thermal *mt = bank->mt;
+	const struct mtk_thermal_data *conf = mt->conf;
 	int i, temp = INT_MIN, max = INT_MIN;
 	u32 raw;
 
-	for (i = 0; i < bank_data[bank->id].num_sensors; i++) {
-		raw = readl(mt->thermal_base + sensing_points[i].msr);
+	for (i = 0; i < conf->bank_data[bank->id].num_sensors; i++) {
+		raw = readl(mt->thermal_base + conf->msr[i]);
 
-		temp = raw_to_mcelsius(mt, bank_data[bank->id].sensors[i], raw);
+		temp = raw_to_mcelsius(mt,
+				       conf->bank_data[bank->id].sensors[i],
+				       raw);
 
 		/*
 		 * The first read of a sensor often contains very high bogus
@@ -299,7 +375,7 @@
 	int i;
 	int tempmax = INT_MIN;
 
-	for (i = 0; i < MT8173_NUM_ZONES; i++) {
+	for (i = 0; i < mt->conf->num_banks; i++) {
 		struct mtk_thermal_bank *bank = &mt->banks[i];
 
 		mtk_thermal_get_bank(bank);
@@ -322,7 +398,7 @@
 				  u32 apmixed_phys_base, u32 auxadc_phys_base)
 {
 	struct mtk_thermal_bank *bank = &mt->banks[num];
-	const struct mtk_thermal_bank_cfg *cfg = &bank_data[num];
+	const struct mtk_thermal_data *conf = mt->conf;
 	int i;
 
 	bank->id = num;
@@ -368,7 +444,7 @@
 	 * this value will be stored to TEMP_PNPMUXADDR (TEMP_SPARE0)
 	 * automatically by hw
 	 */
-	writel(BIT(MT8173_TEMP_AUXADC_CHANNEL), mt->thermal_base + TEMP_ADCMUX);
+	writel(BIT(conf->auxadc_channel), mt->thermal_base + TEMP_ADCMUX);
 
 	/* AHB address for auxadc mux selection */
 	writel(auxadc_phys_base + AUXADC_CON1_CLR_V,
@@ -379,18 +455,18 @@
 	       mt->thermal_base + TEMP_PNPMUXADDR);
 
 	/* AHB value for auxadc enable */
-	writel(BIT(MT8173_TEMP_AUXADC_CHANNEL), mt->thermal_base + TEMP_ADCEN);
+	writel(BIT(conf->auxadc_channel), mt->thermal_base + TEMP_ADCEN);
 
 	/* AHB address for auxadc enable (channel 0 immediate mode selected) */
 	writel(auxadc_phys_base + AUXADC_CON1_SET_V,
 	       mt->thermal_base + TEMP_ADCENADDR);
 
 	/* AHB address for auxadc valid bit */
-	writel(auxadc_phys_base + AUXADC_DATA(MT8173_TEMP_AUXADC_CHANNEL),
+	writel(auxadc_phys_base + AUXADC_DATA(conf->auxadc_channel),
 	       mt->thermal_base + TEMP_ADCVALIDADDR);
 
 	/* AHB address for auxadc voltage output */
-	writel(auxadc_phys_base + AUXADC_DATA(MT8173_TEMP_AUXADC_CHANNEL),
+	writel(auxadc_phys_base + AUXADC_DATA(conf->auxadc_channel),
 	       mt->thermal_base + TEMP_ADCVOLTADDR);
 
 	/* read valid & voltage are at the same register */
@@ -407,11 +483,12 @@
 	writel(TEMP_ADCWRITECTRL_ADC_MUX_WRITE,
 	       mt->thermal_base + TEMP_ADCWRITECTRL);
 
-	for (i = 0; i < cfg->num_sensors; i++)
-		writel(sensor_mux_values[cfg->sensors[i]],
-		       mt->thermal_base + sensing_points[i].adcpnp);
+	for (i = 0; i < conf->bank_data[num].num_sensors; i++)
+		writel(conf->sensor_mux_values[conf->bank_data[num].sensors[i]],
+		       mt->thermal_base + conf->adcpnp[i]);
 
-	writel((1 << cfg->num_sensors) - 1, mt->thermal_base + TEMP_MONCTL0);
+	writel((1 << conf->bank_data[num].num_sensors) - 1,
+	       mt->thermal_base + TEMP_MONCTL0);
 
 	writel(TEMP_ADCWRITECTRL_ADC_PNP_WRITE |
 	       TEMP_ADCWRITECTRL_ADC_MUX_WRITE,
@@ -442,7 +519,7 @@
 
 	/* Start with default values */
 	mt->adc_ge = 512;
-	for (i = 0; i < MT8173_NUM_SENSORS; i++)
+	for (i = 0; i < mt->conf->num_sensors; i++)
 		mt->vts[i] = 260;
 	mt->degc_cali = 40;
 	mt->o_slope = 0;
@@ -486,18 +563,37 @@
 	return ret;
 }
 
+static const struct of_device_id mtk_thermal_of_match[] = {
+	{
+		.compatible = "mediatek,mt8173-thermal",
+		.data = (void *)&mt8173_thermal_data,
+	},
+	{
+		.compatible = "mediatek,mt2701-thermal",
+		.data = (void *)&mt2701_thermal_data,
+	}, {
+	},
+};
+MODULE_DEVICE_TABLE(of, mtk_thermal_of_match);
+
 static int mtk_thermal_probe(struct platform_device *pdev)
 {
 	int ret, i;
 	struct device_node *auxadc, *apmixedsys, *np = pdev->dev.of_node;
 	struct mtk_thermal *mt;
 	struct resource *res;
+	const struct of_device_id *of_id;
 	u64 auxadc_phys_base, apmixed_phys_base;
+	struct thermal_zone_device *tzdev;
 
 	mt = devm_kzalloc(&pdev->dev, sizeof(*mt), GFP_KERNEL);
 	if (!mt)
 		return -ENOMEM;
 
+	of_id = of_match_device(mtk_thermal_of_match, &pdev->dev);
+	if (!of_id)	/* no per-SoC data: mt->conf would be NULL below */
+		return -ENODEV;
+	mt->conf = (const struct mtk_thermal_data *)of_id->data;
 	mt->clk_peri_therm = devm_clk_get(&pdev->dev, "therm");
 	if (IS_ERR(mt->clk_peri_therm))
 		return PTR_ERR(mt->clk_peri_therm);
@@ -565,17 +661,23 @@
 		goto err_disable_clk_auxadc;
 	}
 
-	for (i = 0; i < MT8173_NUM_ZONES; i++)
+	for (i = 0; i < mt->conf->num_banks; i++)
 		mtk_thermal_init_bank(mt, i, apmixed_phys_base,
 				      auxadc_phys_base);
 
 	platform_set_drvdata(pdev, mt);
 
-	devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
-					     &mtk_thermal_ops);
+	tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
+						     &mtk_thermal_ops);
+	if (IS_ERR(tzdev)) {
+		ret = PTR_ERR(tzdev);
+		goto err_disable_clk_peri_therm;
+	}
 
 	return 0;
 
+err_disable_clk_peri_therm:
+	clk_disable_unprepare(mt->clk_peri_therm);
 err_disable_clk_auxadc:
 	clk_disable_unprepare(mt->clk_auxadc);
 
@@ -592,13 +694,6 @@
 	return 0;
 }
 
-static const struct of_device_id mtk_thermal_of_match[] = {
-	{
-		.compatible = "mediatek,mt8173-thermal",
-	}, {
-	},
-};
-
 static struct platform_driver mtk_thermal_driver = {
 	.probe = mtk_thermal_probe,
 	.remove = mtk_thermal_remove,
@@ -610,6 +705,7 @@
 
 module_platform_driver(mtk_thermal_driver);
 
+MODULE_AUTHOR("Dawei Chien <dawei.chien@mediatek.com>");
 MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>");
 MODULE_AUTHOR("Hanyi Wu <hanyi.wu@mediatek.com>");
 MODULE_DESCRIPTION("Mediatek thermal driver");
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index b8e509c..d04ec3b 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -101,6 +101,17 @@
 	return data->ops->get_temp(data->sensor_data, temp);
 }
 
+static int of_thermal_set_trips(struct thermal_zone_device *tz,
+				int low, int high)
+{
+	struct __thermal_zone *data = tz->devdata;
+
+	if (!data->ops || !data->ops->set_trips)
+		return -EINVAL;
+
+	return data->ops->set_trips(data->sensor_data, low, high);
+}
+
 /**
  * of_thermal_get_ntrips - function to export number of available trip
  *			   points.
@@ -181,9 +192,6 @@
 {
 	struct __thermal_zone *data = tz->devdata;
 
-	if (!data->ops || !data->ops->set_emul_temp)
-		return -EINVAL;
-
 	return data->ops->set_emul_temp(data->sensor_data, temp);
 }
 
@@ -191,25 +199,11 @@
 				enum thermal_trend *trend)
 {
 	struct __thermal_zone *data = tz->devdata;
-	long dev_trend;
-	int r;
 
 	if (!data->ops->get_trend)
 		return -EINVAL;
 
-	r = data->ops->get_trend(data->sensor_data, &dev_trend);
-	if (r)
-		return r;
-
-	/* TODO: These intervals might have some thresholds, but in core code */
-	if (dev_trend > 0)
-		*trend = THERMAL_TREND_RAISING;
-	else if (dev_trend < 0)
-		*trend = THERMAL_TREND_DROPPING;
-	else
-		*trend = THERMAL_TREND_STABLE;
-
-	return 0;
+	return data->ops->get_trend(data->sensor_data, trip, trend);
 }
 
 static int of_thermal_bind(struct thermal_zone_device *thermal,
@@ -292,7 +286,7 @@
 	mutex_unlock(&tz->lock);
 
 	data->mode = mode;
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return 0;
 }
@@ -427,7 +421,17 @@
 
 	tzd->ops->get_temp = of_thermal_get_temp;
 	tzd->ops->get_trend = of_thermal_get_trend;
-	tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
+
+	/*
+	 * The thermal zone core will calculate the window if they have set the
+	 * optional set_trips pointer.
+	 */
+	if (ops->set_trips)
+		tzd->ops->set_trips = of_thermal_set_trips;
+
+	if (ops->set_emul_temp)
+		tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
+
 	mutex_unlock(&tzd->lock);
 
 	return tzd;
@@ -596,7 +600,7 @@
  * Return: On success returns a valid struct thermal_zone_device,
  * otherwise, it returns a corresponding ERR_PTR(). Caller must
  * check the return value with help of IS_ERR() helper.
- * Registered hermal_zone_device device will automatically be
+ * Registered thermal_zone_device device will automatically be
  * released when device is unbounded.
  */
 struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
diff --git a/drivers/thermal/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom-spmi-temp-alarm.c
index f8a3c60..819c6d5 100644
--- a/drivers/thermal/qcom-spmi-temp-alarm.c
+++ b/drivers/thermal/qcom-spmi-temp-alarm.c
@@ -150,7 +150,7 @@
 {
 	struct qpnp_tm_chip *chip = data;
 
-	thermal_zone_device_update(chip->tz_dev);
+	thermal_zone_device_update(chip->tz_dev, THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/qcom/Kconfig b/drivers/thermal/qcom/Kconfig
new file mode 100644
index 0000000..be32e5a
--- /dev/null
+++ b/drivers/thermal/qcom/Kconfig
@@ -0,0 +1,11 @@
+config QCOM_TSENS
+	tristate "Qualcomm TSENS Temperature Alarm"
+	depends on THERMAL
+	depends on QCOM_QFPROM
+	depends on ARCH_QCOM || COMPILE_TEST
+	help
+	  This enables the thermal sysfs driver for the TSENS device. It shows
+	  up in Sysfs as a thermal zone with multiple trip points. Disabling the
+	  thermal zone device via the mode file results in disabling the sensor.
+	  Also able to set threshold temperature for both hot and cold and update
+	  when a threshold is reached.
diff --git a/drivers/thermal/qcom/Makefile b/drivers/thermal/qcom/Makefile
new file mode 100644
index 0000000..2cc2193
--- /dev/null
+++ b/drivers/thermal/qcom/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_QCOM_TSENS)	+= qcom_tsens.o
+qcom_tsens-y			+= tsens.o tsens-common.o tsens-8916.o tsens-8974.o tsens-8960.o tsens-8996.o
diff --git a/drivers/thermal/qcom/tsens-8916.c b/drivers/thermal/qcom/tsens-8916.c
new file mode 100644
index 0000000..fdf561b
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8916.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include "tsens.h"
+
+/* eeprom layout data for 8916 */
+#define BASE0_MASK	0x0000007f
+#define BASE1_MASK	0xfe000000
+#define BASE0_SHIFT	0
+#define BASE1_SHIFT	25
+
+#define S0_P1_MASK	0x00000f80
+#define S1_P1_MASK	0x003e0000
+#define S2_P1_MASK	0xf8000000
+#define S3_P1_MASK	0x000003e0
+#define S4_P1_MASK	0x000f8000
+
+#define S0_P2_MASK	0x0001f000
+#define S1_P2_MASK	0x07c00000
+#define S2_P2_MASK	0x0000001f
+#define S3_P2_MASK	0x00007c00
+#define S4_P2_MASK	0x01f00000
+
+#define S0_P1_SHIFT	7
+#define S1_P1_SHIFT	17
+#define S2_P1_SHIFT	27
+#define S3_P1_SHIFT	5
+#define S4_P1_SHIFT	15
+
+#define S0_P2_SHIFT	12
+#define S1_P2_SHIFT	22
+#define S2_P2_SHIFT	0
+#define S3_P2_SHIFT	10
+#define S4_P2_SHIFT	20
+
+#define CAL_SEL_MASK	0xe0000000
+#define CAL_SEL_SHIFT	29
+
+/* Decode the "calib"/"calib_sel" qfprom data into per-sensor p1/p2 points */
+static int calibrate_8916(struct tsens_device *tmdev)
+{
+	int base0 = 0, base1 = 0, i;
+	u32 p1[5], p2[5];
+	int mode = 0;
+	u32 *qfprom_cdata, *qfprom_csel; /* TODO confirm: who frees these? */
+
+	qfprom_cdata = (u32 *)qfprom_read(tmdev->dev, "calib");
+	if (IS_ERR(qfprom_cdata))
+		return PTR_ERR(qfprom_cdata);
+	qfprom_csel = (u32 *)qfprom_read(tmdev->dev, "calib_sel");
+	if (IS_ERR(qfprom_csel))
+		return PTR_ERR(qfprom_csel);
+
+	mode = (qfprom_csel[0] & CAL_SEL_MASK) >> CAL_SEL_SHIFT;
+	dev_dbg(tmdev->dev, "calibration mode is %d\n", mode);
+
+	switch (mode) {
+	case TWO_PT_CALIB:
+		base1 = (qfprom_cdata[1] & BASE1_MASK) >> BASE1_SHIFT;
+		p2[0] = (qfprom_cdata[0] & S0_P2_MASK) >> S0_P2_SHIFT;
+		p2[1] = (qfprom_cdata[0] & S1_P2_MASK) >> S1_P2_SHIFT;
+		p2[2] = (qfprom_cdata[1] & S2_P2_MASK) >> S2_P2_SHIFT;
+		p2[3] = (qfprom_cdata[1] & S3_P2_MASK) >> S3_P2_SHIFT;
+		p2[4] = (qfprom_cdata[1] & S4_P2_MASK) >> S4_P2_SHIFT;
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p2[i] = ((base1 + p2[i]) << 3);
+		/* Fall through: the point-1 values are decoded as well */
+	case ONE_PT_CALIB2:	/* p2[] stays unset when entered directly */
+		base0 = (qfprom_cdata[0] & BASE0_MASK);
+		p1[0] = (qfprom_cdata[0] & S0_P1_MASK) >> S0_P1_SHIFT;
+		p1[1] = (qfprom_cdata[0] & S1_P1_MASK) >> S1_P1_SHIFT;
+		p1[2] = (qfprom_cdata[0] & S2_P1_MASK) >> S2_P1_SHIFT;
+		p1[3] = (qfprom_cdata[1] & S3_P1_MASK) >> S3_P1_SHIFT;
+		p1[4] = (qfprom_cdata[1] & S4_P1_MASK) >> S4_P1_SHIFT;
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p1[i] = (((base0) + p1[i]) << 3);
+		break;
+	default:	/* any other mode: fixed p1/p2 for every sensor */
+		for (i = 0; i < tmdev->num_sensors; i++) {
+			p1[i] = 500;
+			p2[i] = 780;
+		}
+		break;
+	}
+
+	compute_intercept_slope(tmdev, p1, p2, mode);
+
+	return 0;
+}
+
+static const struct tsens_ops ops_8916 = {
+	.init		= init_common,
+	.calibrate	= calibrate_8916,
+	.get_temp	= get_temp_common,
+};
+
+const struct tsens_data data_8916 = {
+	.num_sensors	= 5,
+	.ops		= &ops_8916,
+	.hw_ids		= (unsigned int []){0, 1, 2, 4, 5 },
+};
diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c
new file mode 100644
index 0000000..0451277
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8960.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/regmap.h>
+#include <linux/thermal.h>
+#include "tsens.h"
+
+#define CAL_MDEGC		30000
+
+#define CONFIG_ADDR		0x3640
+#define CONFIG_ADDR_8660	0x3620
+/* CONFIG_ADDR bitmasks */
+#define CONFIG			0x9b
+#define CONFIG_MASK		0xf
+#define CONFIG_8660		1
+#define CONFIG_SHIFT_8660	28
+#define CONFIG_MASK_8660	(3 << CONFIG_SHIFT_8660)
+
+#define STATUS_CNTL_ADDR_8064	0x3660
+#define CNTL_ADDR		0x3620
+/* CNTL_ADDR bitmasks */
+#define EN			BIT(0)
+#define SW_RST			BIT(1)
+#define SENSOR0_EN		BIT(3)
+#define SLP_CLK_ENA		BIT(26)
+#define SLP_CLK_ENA_8660	BIT(24)
+#define MEASURE_PERIOD		1
+#define SENSOR0_SHIFT		3
+
+/* INT_STATUS_ADDR bitmasks */
+#define MIN_STATUS_MASK		BIT(0)
+#define LOWER_STATUS_CLR	BIT(1)
+#define UPPER_STATUS_CLR	BIT(2)
+#define MAX_STATUS_MASK		BIT(3)
+
+#define THRESHOLD_ADDR		0x3624
+/* THRESHOLD_ADDR bitmasks */
+#define THRESHOLD_MAX_LIMIT_SHIFT	24
+#define THRESHOLD_MIN_LIMIT_SHIFT	16
+#define THRESHOLD_UPPER_LIMIT_SHIFT	8
+#define THRESHOLD_LOWER_LIMIT_SHIFT	0
+
+/* Initial temperature threshold values */
+#define LOWER_LIMIT_TH		0x50
+#define UPPER_LIMIT_TH		0xdf
+#define MIN_LIMIT_TH		0x0
+#define MAX_LIMIT_TH		0xff
+
+#define S0_STATUS_ADDR		0x3628
+#define INT_STATUS_ADDR		0x363c
+#define TRDY_MASK		BIT(7)
+#define TIMEOUT_US		100
+
+/*
+ * Save the THRESHOLD and CNTL registers in tmdev->ctx, then clear the
+ * enable and sleep-clock bits in CNTL.
+ * Returns 0 or the first regmap error.
+ */
+static int suspend_8960(struct tsens_device *tmdev)
+{
+	struct tsens_context *saved = &tmdev->ctx;
+	unsigned int off_bits = EN;
+	int err;
+
+	err = regmap_read(tmdev->map, THRESHOLD_ADDR, &saved->threshold);
+	if (err)
+		return err;
+
+	err = regmap_read(tmdev->map, CNTL_ADDR, &saved->control);
+	if (err)
+		return err;
+
+	/* The sleep-clock enable bit differs on the single-sensor variant */
+	off_bits |= (tmdev->num_sensors > 1) ? SLP_CLK_ENA : SLP_CLK_ENA_8660;
+
+	/* regmap_update_bits() already yields 0 or an error code */
+	return regmap_update_bits(tmdev->map, CNTL_ADDR, off_bits, 0);
+}
+
+/*
+ * Set SW_RST, then write back the THRESHOLD and CNTL values held in
+ * tmdev->ctx. Returns 0 or the first regmap error.
+ */
+static int resume_8960(struct tsens_device *tmdev)
+{
+	struct regmap *regs = tmdev->map;
+	int err;
+
+	err = regmap_update_bits(regs, CNTL_ADDR, SW_RST, SW_RST);
+	if (err)
+		return err;
+
+	/*
+	 * Only the multi-sensor parts have a separate CONFIG register; on
+	 * 8660 the config bits live in CNTL and come back with the CNTL
+	 * write below.
+	 */
+	if (tmdev->num_sensors > 1) {
+		err = regmap_update_bits(regs, CONFIG_ADDR, CONFIG_MASK, CONFIG);
+		if (err)
+			return err;
+	}
+
+	err = regmap_write(regs, THRESHOLD_ADDR, tmdev->ctx.threshold);
+	if (err)
+		return err;
+	return regmap_write(regs, CNTL_ADDR, tmdev->ctx.control);
+}
+
+/*
+ * Set the enable bit of sensor @id, plus the block enable and sleep-clock
+ * bits, in CNTL. CNTL is written twice: first with SW_RST forced on, then
+ * with the final value, in which SW_RST is whatever was read back.
+ * Returns 0 or the first regmap error.
+ */
+static int enable_8960(struct tsens_device *tmdev, int id)
+{
+	u32 cntl, slp_clk;
+	int err;
+
+	err = regmap_read(tmdev->map, CNTL_ADDR, &cntl);
+	if (err)
+		return err;
+
+	err = regmap_write(tmdev->map, CNTL_ADDR, cntl | SW_RST);
+	if (err)
+		return err;
+
+	/* the single-sensor variant uses the _8660 sleep-clock bit */
+	slp_clk = (tmdev->num_sensors > 1) ? SLP_CLK_ENA : SLP_CLK_ENA_8660;
+	cntl |= BIT(id + SENSOR0_SHIFT) | slp_clk | EN;
+
+	return regmap_write(tmdev->map, CNTL_ADDR, cntl);
+}
+
+/*
+ * Clear the block enable bit, every per-sensor enable bit and the
+ * sleep-clock bit in CNTL. Best effort: regmap failures are not reported.
+ */
+static void disable_8960(struct tsens_device *tmdev)
+{
+	u32 cntl;
+	u32 clear;
+
+	if (regmap_read(tmdev->map, CNTL_ADDR, &cntl))
+		return;
+
+	/* one enable bit per sensor, starting at SENSOR0_SHIFT */
+	clear = GENMASK(tmdev->num_sensors - 1, 0) << SENSOR0_SHIFT;
+	clear |= EN;
+	if (tmdev->num_sensors > 1)
+		clear |= SLP_CLK_ENA;
+	else
+		clear |= SLP_CLK_ENA_8660;
+
+	cntl &= ~clear;
+	regmap_write(tmdev->map, CNTL_ADDR, cntl);
+}
+
+/*
+ * Look up the regmap, record each sensor's status register address, reset
+ * the block and enable all sensors. Returns 0, -ENODEV or a regmap error.
+ */
+static int init_8960(struct tsens_device *tmdev)
+{
+	int ret, i;
+	u32 reg_cntl;
+
+	tmdev->map = dev_get_regmap(tmdev->dev, NULL);
+	if (!tmdev->map)
+		return -ENODEV;
+
+	/*
+	 * Status registers are 4 bytes apart starting at S0_STATUS_ADDR;
+	 * from sensor 5 on they sit 40 bytes further, past the control
+	 * registers. The base is applied to every sensor: ->status starts
+	 * out zeroed, so sensors 0-4 would otherwise get offsets 0..16.
+	 */
+	for (i = 0; i < tmdev->num_sensors; i++) {
+		tmdev->sensor[i].status = S0_STATUS_ADDR + i * 4;
+		if (i >= 5)
+			tmdev->sensor[i].status += 40;
+	}
+
+	reg_cntl = SW_RST;
+	ret = regmap_update_bits(tmdev->map, CNTL_ADDR, SW_RST, reg_cntl);
+	if (ret)
+		return ret;
+
+	if (tmdev->num_sensors > 1) {
+		reg_cntl |= SLP_CLK_ENA | (MEASURE_PERIOD << 18);
+		reg_cntl &= ~SW_RST;
+		ret = regmap_update_bits(tmdev->map, CONFIG_ADDR,
+					 CONFIG_MASK, CONFIG);
+		if (ret)	/* do not let the CNTL write below hide this */
+			return ret;
+	} else {
+		reg_cntl |= SLP_CLK_ENA_8660 | (MEASURE_PERIOD << 16);
+		reg_cntl &= ~CONFIG_MASK_8660;
+		reg_cntl |= CONFIG_8660 << CONFIG_SHIFT_8660;
+	}
+
+	reg_cntl |= GENMASK(tmdev->num_sensors - 1, 0) << SENSOR0_SHIFT;
+	ret = regmap_write(tmdev->map, CNTL_ADDR, reg_cntl);
+	if (ret)
+		return ret;
+	return regmap_write(tmdev->map, CNTL_ADDR, reg_cntl | EN);
+}
+
+/*
+ * Load one calibration byte per sensor from the "calib" nvmem cell (falling
+ * back to "calib_backup") into sensor[].offset. Returns 0 or -errno.
+ */
+static int calibrate_8960(struct tsens_device *tmdev)
+{
+	u32 i;
+	char *data = qfprom_read(tmdev->dev, "calib");
+
+	if (IS_ERR(data))
+		data = qfprom_read(tmdev->dev, "calib_backup");
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	for (i = 0; i < tmdev->num_sensors; i++)	/* bytes are unsigned codes */
+		tmdev->sensor[i].offset = (u8)data[i];
+	kfree(data);	/* qfprom_read() hands over an allocated buffer */
+	return 0;
+}
+
+/* Linear fit: ADC code on x, millidegC on y, through (s->offset, CAL_MDEGC) */
+static inline int code_to_mdegC(u32 adc_code, const struct tsens_sensor *s)
+{
+	/* slope comes from the thermal zone this sensor is registered with */
+	const int m = thermal_zone_get_slope(s->tzd);
+	/* y-intercept: the line yields CAL_MDEGC at code s->offset */
+	const int c = CAL_MDEGC - m * s->offset;
+
+	return adc_code * m + c;
+}
+
+/* Wait for TRDY (up to TIMEOUT_US), then read and convert the sensor code */
+static int get_temp_8960(struct tsens_device *tmdev, int id, int *temp)
+{
+	const struct tsens_sensor *sensor = &tmdev->sensor[id];
+	unsigned long deadline = jiffies + usecs_to_jiffies(TIMEOUT_US);
+	u32 adc;
+	u32 status;
+	int err;
+
+	do {
+		err = regmap_read(tmdev->map, INT_STATUS_ADDR, &status);
+		if (err)
+			return err;
+		if (status & TRDY_MASK) {
+			err = regmap_read(tmdev->map, sensor->status, &adc);
+			if (!err)
+				*temp = code_to_mdegC(adc, sensor);
+			return err;
+		}
+	} while (time_before(jiffies, deadline));
+
+	return -ETIMEDOUT;
+}
+
+static const struct tsens_ops ops_8960 = {
+	.init		= init_8960,
+	.calibrate	= calibrate_8960,
+	.get_temp	= get_temp_8960,
+	.enable		= enable_8960,	/* called per sensor after zone registration */
+	.disable	= disable_8960,	/* called from tsens_remove() */
+	.suspend	= suspend_8960,
+	.resume		= resume_8960,
+};
+
+const struct tsens_data data_8960 = {	/* also the fallback when no DT match */
+	.num_sensors	= 11,
+	.ops		= &ops_8960,
+};
diff --git a/drivers/thermal/qcom/tsens-8974.c b/drivers/thermal/qcom/tsens-8974.c
new file mode 100644
index 0000000..9baf77e
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8974.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include "tsens.h"
+
+/* eeprom layout data for 8974 */
+#define BASE1_MASK		0xff
+#define S0_P1_MASK		0x3f00
+#define S1_P1_MASK		0xfc000
+#define S2_P1_MASK		0x3f00000
+#define S3_P1_MASK		0xfc000000
+#define S4_P1_MASK		0x3f
+#define S5_P1_MASK		0xfc0
+#define S6_P1_MASK		0x3f000
+#define S7_P1_MASK		0xfc0000
+#define S8_P1_MASK		0x3f000000
+#define S8_P1_MASK_BKP		0x3f
+#define S9_P1_MASK		0x3f
+#define S9_P1_MASK_BKP		0xfc0
+#define S10_P1_MASK		0xfc0
+#define S10_P1_MASK_BKP		0x3f000
+#define CAL_SEL_0_1		0xc0000000
+#define CAL_SEL_2		0x40000000
+#define CAL_SEL_SHIFT		30
+#define CAL_SEL_SHIFT_2		28
+
+#define S0_P1_SHIFT		8
+#define S1_P1_SHIFT		14
+#define S2_P1_SHIFT		20
+#define S3_P1_SHIFT		26
+#define S5_P1_SHIFT		6
+#define S6_P1_SHIFT		12
+#define S7_P1_SHIFT		18
+#define S8_P1_SHIFT		24
+#define S9_P1_BKP_SHIFT		6
+#define S10_P1_SHIFT		6
+#define S10_P1_BKP_SHIFT	12
+
+#define BASE2_SHIFT		12
+#define BASE2_BKP_SHIFT		18
+#define S0_P2_SHIFT		20
+#define S0_P2_BKP_SHIFT		26
+#define S1_P2_SHIFT		26
+#define S2_P2_BKP_SHIFT		6
+#define S3_P2_SHIFT		6
+#define S3_P2_BKP_SHIFT		12
+#define S4_P2_SHIFT		12
+#define S4_P2_BKP_SHIFT		18
+#define S5_P2_SHIFT		18
+#define S5_P2_BKP_SHIFT		24
+#define S6_P2_SHIFT		24
+#define S7_P2_BKP_SHIFT		6
+#define S8_P2_SHIFT		6
+#define S8_P2_BKP_SHIFT		12
+#define S9_P2_SHIFT		12
+#define S9_P2_BKP_SHIFT		18
+#define S10_P2_SHIFT		18
+#define S10_P2_BKP_SHIFT	24
+
+#define BASE2_MASK		0xff000
+#define BASE2_BKP_MASK		0xfc0000
+#define S0_P2_MASK		0x3f00000
+#define S0_P2_BKP_MASK		0xfc000000
+#define S1_P2_MASK		0xfc000000
+#define S1_P2_BKP_MASK		0x3f
+#define S2_P2_MASK		0x3f
+#define S2_P2_BKP_MASK		0xfc0
+#define S3_P2_MASK		0xfc0
+#define S3_P2_BKP_MASK		0x3f000
+#define S4_P2_MASK		0x3f000
+#define S4_P2_BKP_MASK		0xfc0000
+#define S5_P2_MASK		0xfc0000
+#define S5_P2_BKP_MASK		0x3f000000
+#define S6_P2_MASK		0x3f000000
+#define S6_P2_BKP_MASK		0x3f
+#define S7_P2_MASK		0x3f
+#define S7_P2_BKP_MASK		0xfc0
+#define S8_P2_MASK		0xfc0
+#define S8_P2_BKP_MASK		0x3f000
+#define S9_P2_MASK		0x3f000
+#define S9_P2_BKP_MASK		0xfc0000
+#define S10_P2_MASK		0xfc0000
+#define S10_P2_BKP_MASK		0x3f000000
+
+#define BKP_SEL			0x3
+#define BKP_REDUN_SEL		0xe0000000
+#define BKP_REDUN_SHIFT		29
+
+#define BIT_APPEND		0x3
+
+/*
+ * Turn the "calib"/"calib_backup" nvmem cells into per-sensor slope/offset.
+ */
+static int calibrate_8974(struct tsens_device *tmdev)
+{
+	int base1 = 0, base2 = 0, mode = 0, i;
+	u32 p1[11], p2[11];
+	u32 *calib, *bkp;
+
+	calib = (u32 *)qfprom_read(tmdev->dev, "calib");
+	if (IS_ERR(calib))
+		return PTR_ERR(calib);
+	bkp = (u32 *)qfprom_read(tmdev->dev, "calib_backup");
+	if (IS_ERR(bkp)) {
+		kfree(calib);	/* the first buffer is ours to release */
+		return PTR_ERR(bkp);
+	}
+
+	if (((bkp[1] & BKP_REDUN_SEL) >> BKP_REDUN_SHIFT) == BKP_SEL) {
+		mode = (calib[4] & CAL_SEL_0_1) >> CAL_SEL_SHIFT;
+		mode |= (calib[5] & CAL_SEL_2) >> CAL_SEL_SHIFT_2;
+
+		switch (mode) {
+		case TWO_PT_CALIB:
+			base2 = (bkp[2] & BASE2_BKP_MASK) >> BASE2_BKP_SHIFT;
+			p2[0] = (bkp[2] & S0_P2_BKP_MASK) >> S0_P2_BKP_SHIFT;
+			p2[1] = (bkp[3] & S1_P2_BKP_MASK);
+			p2[2] = (bkp[3] & S2_P2_BKP_MASK) >> S2_P2_BKP_SHIFT;
+			p2[3] = (bkp[3] & S3_P2_BKP_MASK) >> S3_P2_BKP_SHIFT;
+			p2[4] = (bkp[3] & S4_P2_BKP_MASK) >> S4_P2_BKP_SHIFT;
+			p2[5] = (calib[4] & S5_P2_BKP_MASK) >> S5_P2_BKP_SHIFT;
+			p2[6] = (calib[5] & S6_P2_BKP_MASK);
+			p2[7] = (calib[5] & S7_P2_BKP_MASK) >> S7_P2_BKP_SHIFT;
+			p2[8] = (calib[5] & S8_P2_BKP_MASK) >> S8_P2_BKP_SHIFT;
+			p2[9] = (calib[5] & S9_P2_BKP_MASK) >> S9_P2_BKP_SHIFT;
+			p2[10] = (calib[5] & S10_P2_BKP_MASK) >> S10_P2_BKP_SHIFT;
+			/* Fall through */
+		case ONE_PT_CALIB:
+		case ONE_PT_CALIB2:
+			base1 = bkp[0] & BASE1_MASK;
+			p1[0] = (bkp[0] & S0_P1_MASK) >> S0_P1_SHIFT;
+			p1[1] = (bkp[0] & S1_P1_MASK) >> S1_P1_SHIFT;
+			p1[2] = (bkp[0] & S2_P1_MASK) >> S2_P1_SHIFT;
+			p1[3] = (bkp[0] & S3_P1_MASK) >> S3_P1_SHIFT;
+			p1[4] = (bkp[1] & S4_P1_MASK);
+			p1[5] = (bkp[1] & S5_P1_MASK) >> S5_P1_SHIFT;
+			p1[6] = (bkp[1] & S6_P1_MASK) >> S6_P1_SHIFT;
+			p1[7] = (bkp[1] & S7_P1_MASK) >> S7_P1_SHIFT;
+			p1[8] = bkp[2] & S8_P1_MASK_BKP;	/* bits 5:0, no shift */
+			p1[9] = (bkp[2] & S9_P1_MASK_BKP) >> S9_P1_BKP_SHIFT;
+			p1[10] = (bkp[2] & S10_P1_MASK_BKP) >> S10_P1_BKP_SHIFT;
+			break;
+		}
+	} else {
+		mode = (calib[1] & CAL_SEL_0_1) >> CAL_SEL_SHIFT;
+		mode |= (calib[3] & CAL_SEL_2) >> CAL_SEL_SHIFT_2;
+
+		switch (mode) {
+		case TWO_PT_CALIB:
+			base2 = (calib[2] & BASE2_MASK) >> BASE2_SHIFT;
+			p2[0] = (calib[2] & S0_P2_MASK) >> S0_P2_SHIFT;
+			p2[1] = (calib[2] & S1_P2_MASK) >> S1_P2_SHIFT;
+			p2[2] = (calib[3] & S2_P2_MASK);
+			p2[3] = (calib[3] & S3_P2_MASK) >> S3_P2_SHIFT;
+			p2[4] = (calib[3] & S4_P2_MASK) >> S4_P2_SHIFT;
+			p2[5] = (calib[3] & S5_P2_MASK) >> S5_P2_SHIFT;
+			p2[6] = (calib[3] & S6_P2_MASK) >> S6_P2_SHIFT;
+			p2[7] = (calib[4] & S7_P2_MASK);
+			p2[8] = (calib[4] & S8_P2_MASK) >> S8_P2_SHIFT;
+			p2[9] = (calib[4] & S9_P2_MASK) >> S9_P2_SHIFT;
+			p2[10] = (calib[4] & S10_P2_MASK) >> S10_P2_SHIFT;
+			/* Fall through */
+		case ONE_PT_CALIB:
+		case ONE_PT_CALIB2:
+			base1 = calib[0] & BASE1_MASK;
+			p1[0] = (calib[0] & S0_P1_MASK) >> S0_P1_SHIFT;
+			p1[1] = (calib[0] & S1_P1_MASK) >> S1_P1_SHIFT;
+			p1[2] = (calib[0] & S2_P1_MASK) >> S2_P1_SHIFT;
+			p1[3] = (calib[0] & S3_P1_MASK) >> S3_P1_SHIFT;
+			p1[4] = (calib[1] & S4_P1_MASK);
+			p1[5] = (calib[1] & S5_P1_MASK) >> S5_P1_SHIFT;
+			p1[6] = (calib[1] & S6_P1_MASK) >> S6_P1_SHIFT;
+			p1[7] = (calib[1] & S7_P1_MASK) >> S7_P1_SHIFT;
+			p1[8] = (calib[1] & S8_P1_MASK) >> S8_P1_SHIFT;
+			p1[9] = (calib[2] & S9_P1_MASK);
+			p1[10] = (calib[2] & S10_P1_MASK) >> S10_P1_SHIFT;
+			break;
+		}
+	}
+
+	switch (mode) {
+	case ONE_PT_CALIB:
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p1[i] += (base1 << 2) | BIT_APPEND;
+		break;
+	case TWO_PT_CALIB:
+		for (i = 0; i < tmdev->num_sensors; i++) {
+			p2[i] += base2;
+			p2[i] <<= 2;
+			p2[i] |= BIT_APPEND;
+		}
+		/* Fall through */
+	case ONE_PT_CALIB2:
+		for (i = 0; i < tmdev->num_sensors; i++) {
+			p1[i] += base1;
+			p1[i] <<= 2;
+			p1[i] |= BIT_APPEND;
+		}
+		break;
+	default:
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p2[i] = 780;
+		p1[0] = 502;
+		p1[1] = 509;
+		p1[2] = 503;
+		p1[3] = 509;
+		p1[4] = 505;
+		p1[5] = 509;
+		p1[6] = 507;
+		p1[7] = 510;
+		p1[8] = 508;
+		p1[9] = 509;
+		p1[10] = 508;
+		break;
+	}
+
+	compute_intercept_slope(tmdev, p1, p2, mode);
+	kfree(bkp);	/* p1/p2 hold copies; the raw cells are done with */
+	kfree(calib);
+	return 0;
+}
+
+static const struct tsens_ops ops_8974 = {	/* no enable/disable/PM hooks */
+	.init		= init_common,
+	.calibrate	= calibrate_8974,
+	.get_temp	= get_temp_common,
+};
+
+const struct tsens_data data_8974 = {
+	.num_sensors	= 11,	/* matches the p1[11]/p2[11] arrays in calibrate_8974() */
+	.ops		= &ops_8974,	/* no .hw_ids: hw id equals the sensor index */
+};
diff --git a/drivers/thermal/qcom/tsens-8996.c b/drivers/thermal/qcom/tsens-8996.c
new file mode 100644
index 0000000..e1f7781
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8996.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include "tsens.h"
+
+#define STATUS_OFFSET	0x10a0	/* status register of hw sensor 0 */
+#define LAST_TEMP_MASK	0xfff	/* temperature field of a status word */
+#define STATUS_VALID_BIT	BIT(21)	/* set when the reading can be used as is */
+#define CODE_SIGN_BIT		BIT(11)	/* top bit of the temperature field */
+
+/*
+ * get_temp_8996() - read one sensor's temperature
+ * @tmdev: tsens device
+ * @id: index into tmdev->sensor[]
+ * @temp: where the result, in millidegrees Celsius, is stored
+ *
+ * The status word is read up to three times and the first read with
+ * STATUS_VALID_BIT set wins. If none is valid, the first reading is used
+ * unless it differs from the second while the second and third agree.
+ *
+ * Return: 0 on success or the regmap_read() error.
+ */
+static int get_temp_8996(struct tsens_device *tmdev, int id, int *temp)
+{
+	struct tsens_sensor *s = &tmdev->sensor[id];
+	unsigned int sensor_addr = STATUS_OFFSET + s->hw_id * 4;
+	int reading[3], last_temp, ret, i;
+	u32 code;
+
+	for (i = 0; i < 3; i++) {
+		ret = regmap_read(tmdev->map, sensor_addr, &code);
+		if (ret)
+			return ret;
+		reading[i] = code & LAST_TEMP_MASK;
+		if (code & STATUS_VALID_BIT) {
+			last_temp = reading[i];
+			goto done;
+		}
+	}
+
+	/* No valid sample: take a value two consecutive reads agree on */
+	if (reading[0] != reading[1] && reading[1] == reading[2])
+		last_temp = reading[2];
+	else
+		last_temp = reading[0];
+
+done:
+	/*
+	 * CODE_SIGN_BIT is the sign of the 12-bit reading. Extend it by
+	 * setting every bit above the field; OR-ing in ~CODE_SIGN_BIT
+	 * instead would set the low 11 bits too and turn every negative
+	 * reading into -1.
+	 */
+	if (last_temp & CODE_SIGN_BIT)
+		last_temp |= ~LAST_TEMP_MASK;
+
+	/* Temperatures are in deciCelsius */
+	*temp = last_temp * 100;
+
+	return 0;
+}
+
+static const struct tsens_ops ops_8996 = {
+	.init		= init_common,
+	.get_temp	= get_temp_8996,	/* no .calibrate: tsens_probe() skips it */
+};
+
+const struct tsens_data data_8996 = {
+	.num_sensors	= 13,
+	.ops		= &ops_8996,	/* no .hw_ids: hw id equals the sensor index */
+};
diff --git a/drivers/thermal/qcom/tsens-common.c b/drivers/thermal/qcom/tsens-common.c
new file mode 100644
index 0000000..b1449ad
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-common.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include "tsens.h"
+
+#define S0_ST_ADDR		0x1030	/* status register of hw sensor 0 */
+#define SN_ADDR_OFFSET		0x4	/* stride between status registers */
+#define SN_ST_TEMP_MASK		0x3ff	/* ADC code field of a status word */
+#define CAL_DEGC_PT1		30	/* temperature of calibration point 1 */
+#define CAL_DEGC_PT2		120	/* temperature of calibration point 2 */
+#define SLOPE_FACTOR		1000	/* fixed-point scale for slope/offset */
+#define SLOPE_DEFAULT		3200	/* slope when not two-point calibrated */
+
+/* Read nvmem cell @cname of @dev: a buffer the caller kfree()s, or an ERR_PTR() */
+char *qfprom_read(struct device *dev, const char *cname)
+{
+	struct nvmem_cell *cell;
+	size_t len;	/* nvmem_cell_read() takes a size_t *; value is unused */
+	char *ret;
+
+	cell = nvmem_cell_get(dev, cname);
+	if (IS_ERR(cell))
+		return ERR_CAST(cell);
+
+	ret = nvmem_cell_read(cell, &len);
+	nvmem_cell_put(cell);
+	return ret;
+}
+
+/*
+ * Derive each sensor's slope and offset from its calibration points. Meant
+ * for devices whose calibration data comes from qfprom; the others take
+ * slope and offset from tz->tzp->slope and tz->tzp->offset.
+ *
+ * Only TWO_PT_CALIB yields a measured slope; any other @mode keeps
+ * SLOPE_DEFAULT. The offset is always computed from @p1. Both arrays
+ * are printed with dev_dbg() whatever the mode.
+ */
+void compute_intercept_slope(struct tsens_device *tmdev, u32 *p1,
+			     u32 *p2, u32 mode)
+{
+	const int span = CAL_DEGC_PT2 - CAL_DEGC_PT1;
+	struct tsens_sensor *s;
+	int i, delta;
+
+	for (i = 0; i < tmdev->num_sensors; i++) {
+		s = &tmdev->sensor[i];
+		dev_dbg(tmdev->dev,
+			"sensor%d - data_point1:%#x data_point2:%#x\n",
+			i, p1[i], p2[i]);
+
+		s->slope = SLOPE_DEFAULT;
+		if (mode == TWO_PT_CALIB) {
+			/* scaled code difference per degree between the points */
+			delta = p2[i] - p1[i];
+			delta *= SLOPE_FACTOR;
+			s->slope = delta / span;
+		}
+
+		/* scaled code at point 1 minus slope times its temperature */
+		s->offset = (p1[i] * SLOPE_FACTOR) - (CAL_DEGC_PT1 * s->slope);
+		dev_dbg(tmdev->dev, "offset:%d\n", s->offset);
+	}
+}
+
+/*
+ * Convert an ADC code to degrees Celsius with the sensor's slope/offset:
+ * degC = (code * SLOPE_FACTOR - offset) / slope, with half the slope added
+ * towards the sign of the numerator before the (truncating) division.
+ */
+static inline int code_to_degc(u32 adc_code, const struct tsens_sensor *s)
+{
+	const int divisor = s->slope;
+	int scaled = (adc_code * SLOPE_FACTOR) - s->offset;
+
+	/* a zero numerator is divided as is */
+	if (scaled > 0)
+		scaled += divisor / 2;
+	else if (scaled < 0)
+		scaled -= divisor / 2;
+
+	return scaled / divisor;
+}
+
+/*
+ * Read sensor @id's status register and store its temperature, in
+ * millidegrees Celsius, in @temp. Returns 0 or the regmap_read() error.
+ */
+int get_temp_common(struct tsens_device *tmdev, int id, int *temp)
+{
+	const struct tsens_sensor *sensor = &tmdev->sensor[id];
+	u32 status;
+	int err;
+
+	err = regmap_read(tmdev->map,
+			  S0_ST_ADDR + sensor->hw_id * SN_ADDR_OFFSET, &status);
+	if (err)
+		return err;
+	*temp = code_to_degc(status & SN_ST_TEMP_MASK, sensor) * 1000;
+	return 0;
+}
+
+static const struct regmap_config tsens_config = {	/* used by init_common() */
+	.reg_bits	= 32,	/* register address width */
+	.val_bits	= 32,	/* register value width */
+	.reg_stride	= 4,	/* valid addresses are multiples of 4 */
+};
+
+/*
+ * Map the first register range and wrap it in an MMIO regmap. Not __init:
+ * tsens_probe() calls it via ops->init, possibly after init memory is freed.
+ */
+int init_common(struct tsens_device *tmdev)
+{
+	void __iomem *base = of_iomap(tmdev->dev->of_node, 0);
+
+	if (!base)
+		return -EINVAL;
+	tmdev->map = devm_regmap_init_mmio(tmdev->dev, base, &tsens_config);
+	if (!IS_ERR(tmdev->map))
+		return 0;
+	iounmap(base);	/* the mapping itself is not device-managed */
+	return PTR_ERR(tmdev->map);
+}
diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
new file mode 100644
index 0000000..3f9fe6a
--- /dev/null
+++ b/drivers/thermal/qcom/tsens.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+#include "tsens.h"
+
+/* .get_temp hook: @data is the tsens_sensor passed at zone registration */
+static int tsens_get_temp(void *data, int *temp)
+{
+	const struct tsens_sensor *sensor = data;
+
+	return sensor->tmdev->ops->get_temp(sensor->tmdev, sensor->id, temp);
+}
+
+/* .get_trend hook; -ENOTSUPP when the SoC ops provide no get_trend */
+static int tsens_get_trend(void *p, int trip, enum thermal_trend *trend)
+{
+	const struct tsens_sensor *sensor = p;
+	const struct tsens_ops *ops = sensor->tmdev->ops;
+
+	if (!ops->get_trend)
+		return -ENOTSUPP;
+	return ops->get_trend(sensor->tmdev, sensor->id, trend);
+}
+
+/* PM suspend: forward to the SoC-specific hook when there is one */
+static int __maybe_unused tsens_suspend(struct device *dev)
+{
+	struct tsens_device *tmdev = dev_get_drvdata(dev);
+
+	if (!tmdev->ops || !tmdev->ops->suspend)
+		return 0;
+	return tmdev->ops->suspend(tmdev);
+}
+
+/* PM resume: forward to the SoC-specific hook when there is one */
+static int __maybe_unused tsens_resume(struct device *dev)
+{
+	struct tsens_device *tmdev = dev_get_drvdata(dev);
+
+	if (!tmdev->ops || !tmdev->ops->resume)
+		return 0;
+	return tmdev->ops->resume(tmdev);
+}
+
+static SIMPLE_DEV_PM_OPS(tsens_pm_ops, tsens_suspend, tsens_resume);
+
+static const struct of_device_id tsens_table[] = {
+	{
+		.compatible = "qcom,msm8916-tsens",
+		.data = &data_8916,
+	}, {
+		.compatible = "qcom,msm8974-tsens",
+		.data = &data_8974,
+	}, {
+		.compatible = "qcom,msm8996-tsens",
+		.data = &data_8996,
+	},
+	{}	/* sentinel; tsens_probe() uses data_8960 when nothing matches */
+};
+MODULE_DEVICE_TABLE(of, tsens_table);
+
+static const struct thermal_zone_of_device_ops tsens_of_ops = {
+	.get_temp = tsens_get_temp,
+	.get_trend = tsens_get_trend,	/* -ENOTSUPP unless the SoC ops provide it */
+};
+
+/*
+ * Register a thermal zone per sensor and enable each sensor that got one;
+ * sensors whose registration fails are skipped. Always returns 0.
+ */
+static int tsens_register(struct tsens_device *tmdev)
+{
+	struct tsens_sensor *s;
+	struct thermal_zone_device *tzd;
+	int i;
+
+	for (i = 0; i < tmdev->num_sensors; i++) {
+		s = &tmdev->sensor[i];
+		s->tmdev = tmdev;
+		s->id = i;
+		tzd = devm_thermal_zone_of_sensor_register(tmdev->dev, i, s,
+							   &tsens_of_ops);
+		if (IS_ERR(tzd))
+			continue;	/* best effort: ->tzd is left unset */
+		s->tzd = tzd;
+		if (tmdev->ops->enable)
+			tmdev->ops->enable(tmdev, i);
+	}
+	return 0;
+}
+
+/*
+ * Bind a TSENS instance: pick the per-SoC data, allocate the device with
+ * one tsens_sensor per sensor, run the init and (optional) calibrate
+ * callbacks, then register the thermal zones.
+ *
+ * Returns the result of tsens_register() on success, -EINVAL/-ENOMEM on
+ * setup failure, or the negative value returned by init/calibrate.
+ */
+static int tsens_probe(struct platform_device *pdev)
+{
+	const struct of_device_id *match;
+	const struct tsens_data *data = &data_8960;
+	struct tsens_device *tmdev;
+	struct device *dev;
+	int ret, i;
+
+	/* A platform device without a DT node of its own uses its parent */
+	dev = pdev->dev.of_node ? &pdev->dev : pdev->dev.parent;
+
+	/* No entry in tsens_table means the 8960 data stays selected */
+	match = of_match_node(tsens_table, dev->of_node);
+	if (match)
+		data = match->data;
+
+	if (!data->num_sensors) {
+		dev_err(dev, "invalid number of sensors\n");
+		return -EINVAL;
+	}
+
+	tmdev = devm_kzalloc(dev, sizeof(*tmdev) + data->num_sensors *
+			     sizeof(tmdev->sensor[0]), GFP_KERNEL);
+	if (!tmdev)
+		return -ENOMEM;
+
+	tmdev->dev = dev;
+	tmdev->ops = data->ops;
+	tmdev->num_sensors = data->num_sensors;
+
+	/* hw ids default to the sensor index unless the SoC data lists them */
+	for (i = 0; i < tmdev->num_sensors; i++)
+		tmdev->sensor[i].hw_id = data->hw_ids ? data->hw_ids[i] : i;
+
+	if (!tmdev->ops || !tmdev->ops->init || !tmdev->ops->get_temp)
+		return -EINVAL;
+
+	ret = tmdev->ops->init(tmdev);
+	if (ret < 0) {
+		dev_err(dev, "tsens init failed\n");
+		return ret;
+	}
+
+	if (tmdev->ops->calibrate) {
+		ret = tmdev->ops->calibrate(tmdev);
+		if (ret < 0) {
+			dev_err(dev, "tsens calibration failed\n");
+			return ret;
+		}
+	}
+
+	ret = tsens_register(tmdev);
+
+	/* drvdata is what the PM callbacks and tsens_remove() look up */
+	platform_set_drvdata(pdev, tmdev);
+
+	return ret;
+}
+
+/* Unbind: let the SoC-specific code switch the sensors off, if it can */
+static int tsens_remove(struct platform_device *pdev)
+{
+	struct tsens_device *tmdev = platform_get_drvdata(pdev);
+
+	if (tmdev->ops->disable != NULL)
+		tmdev->ops->disable(tmdev);
+	return 0;
+}
+
+static struct platform_driver tsens_driver = {
+	.probe = tsens_probe,
+	.remove = tsens_remove,
+	.driver = {
+		.name = "qcom-tsens",	/* same name as the platform: module alias */
+		.pm	= &tsens_pm_ops,	/* tsens_suspend()/tsens_resume() */
+		.of_match_table = tsens_table,
+	},
+};
+module_platform_driver(tsens_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("QCOM Temperature Sensor driver");
+MODULE_ALIAS("platform:qcom-tsens");
diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h
new file mode 100644
index 0000000..911c197
--- /dev/null
+++ b/drivers/thermal/qcom/tsens.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef __QCOM_TSENS_H__
+#define __QCOM_TSENS_H__
+
+#define ONE_PT_CALIB		0x1
+#define ONE_PT_CALIB2		0x2
+#define TWO_PT_CALIB		0x3
+
+#include <linux/slab.h>
+#include <linux/thermal.h>
+struct tsens_device;
+
+struct tsens_sensor {
+	struct tsens_device		*tmdev;	/* owning device */
+	struct thermal_zone_device	*tzd;	/* zone from registration, if any */
+	int				offset;	/* calibration offset/intercept */
+	int				id;	/* index in tmdev->sensor[] */
+	int				hw_id;	/* hardware sensor number */
+	int				slope;	/* set by compute_intercept_slope() */
+	u32				status;	/* status register address (8960) */
+};
+
+/**
+ * struct tsens_ops - operations as supported by the tsens device
+ * @init: Function to initialize the tsens device
+ * @calibrate: Function to calibrate the tsens device (may be NULL)
+ * @get_temp: Function which returns the temp in millidegC
+ * @enable: Function to enable (clocks/power) one sensor of the tsens device
+ * @disable: Function to disable the tsens device
+ * @suspend: Function to suspend the tsens device
+ * @resume: Function to resume the tsens device
+ * @get_trend: Function to get the thermal/temp trend
+ */
+struct tsens_ops {
+	/* mandatory callbacks (tsens_probe() tolerates a NULL calibrate) */
+	int (*init)(struct tsens_device *);
+	int (*calibrate)(struct tsens_device *);
+	int (*get_temp)(struct tsens_device *, int, int *);
+	/* optional callbacks */
+	int (*enable)(struct tsens_device *, int);
+	void (*disable)(struct tsens_device *);
+	int (*suspend)(struct tsens_device *);
+	int (*resume)(struct tsens_device *);
+	int (*get_trend)(struct tsens_device *, int, enum thermal_trend *);
+};
+
+/**
+ * struct tsens_data - tsens instance specific data
+ * @num_sensors: Max number of sensors supported by platform
+ * @ops: operations the tsens instance supports
+ * @hw_ids: Hardware ids of the sensors if not simply 0..n-1 (NULL otherwise)
+ */
+struct tsens_data {
+	const u32		num_sensors;	/* 0 is rejected by tsens_probe() */
+	const struct tsens_ops	*ops;
+	unsigned int		*hw_ids;	/* num_sensors entries when set */
+};
+
+/* Registers to be saved/restored across a context loss */
+struct tsens_context {
+	int	threshold;	/* THRESHOLD_ADDR contents, saved by suspend_8960() */
+	int	control;	/* CNTL_ADDR contents, saved by suspend_8960() */
+};
+
+struct tsens_device {	/* one TSENS block; allocated with its sensors */
+	struct device			*dev;
+	u32				num_sensors;	/* entries in sensor[] */
+	struct regmap			*map;
+	struct regmap_field		*status_field;
+	struct tsens_context		ctx;	/* suspend/resume register copy */
+	bool				trdy;
+	const struct tsens_ops		*ops;
+	struct tsens_sensor		sensor[];	/* C99 flexible array member */
+};
+
+char *qfprom_read(struct device *, const char *);	/* nvmem cell contents or ERR_PTR() */
+void compute_intercept_slope(struct tsens_device *, u32 *, u32 *, u32);	/* p1, p2, mode */
+int init_common(struct tsens_device *);	/* sets up the MMIO regmap */
+int get_temp_common(struct tsens_device *, int, int *);	/* sensor index, result */
+
+extern const struct tsens_data data_8916, data_8974, data_8960, data_8996;
+
+#endif /* __QCOM_TSENS_H__ */
diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
new file mode 100644
index 0000000..644ba52
--- /dev/null
+++ b/drivers/thermal/qoriq_thermal.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2016 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define SITES_MAX	16
+
+/*
+ * QorIQ TMU Registers
+ */
+struct qoriq_tmu_site_regs {
+	u32 tritsr;		/* Immediate Temperature Site Register */
+	u32 tratsr;		/* Average Temperature Site Register */
+	u8 res0[0x8];
+};
+
+struct qoriq_tmu_regs {
+	u32 tmr;		/* Mode Register */
+#define TMR_DISABLE	0x0
+#define TMR_ME		0x80000000
+#define TMR_ALPF	0x0c000000
+	u32 tsr;		/* Status Register */
+	u32 tmtmir;		/* Temperature measurement interval Register */
+#define TMTMIR_DEFAULT	0x0000000f
+	u8 res0[0x14];
+	u32 tier;		/* Interrupt Enable Register */
+#define TIER_DISABLE	0x0
+	u32 tidr;		/* Interrupt Detect Register */
+	u32 tiscr;		/* Interrupt Site Capture Register */
+	u32 ticscr;		/* Interrupt Critical Site Capture Register */
+	u8 res1[0x10];
+	u32 tmhtcrh;		/* High Temperature Capture Register */
+	u32 tmhtcrl;		/* Low Temperature Capture Register */
+	u8 res2[0x8];
+	u32 tmhtitr;		/* High Temperature Immediate Threshold */
+	u32 tmhtatr;		/* High Temperature Average Threshold */
+	u32 tmhtactr;	/* High Temperature Average Crit Threshold */
+	u8 res3[0x24];
+	u32 ttcfgr;		/* Temperature Configuration Register */
+	u32 tscfgr;		/* Sensor Configuration Register */
+	u8 res4[0x78];
+	struct qoriq_tmu_site_regs site[SITES_MAX];
+	u8 res5[0x9f8];
+	u32 ipbrr0;		/* IP Block Revision Register 0 */
+	u32 ipbrr1;		/* IP Block Revision Register 1 */
+	u8 res6[0x310];
+	u32 ttr0cr;		/* Temperature Range 0 Control Register */
+	u32 ttr1cr;		/* Temperature Range 1 Control Register */
+	u32 ttr2cr;		/* Temperature Range 2 Control Register */
+	u32 ttr3cr;		/* Temperature Range 3 Control Register */
+};
+
+/*
+ * Thermal zone data
+ */
+struct qoriq_tmu_data {
+	struct thermal_zone_device *tz;
+	struct qoriq_tmu_regs __iomem *regs;
+	int sensor_id;
+	bool little_endian;
+};
+
+static void tmu_write(struct qoriq_tmu_data *p, u32 val, void __iomem *addr)
+{
+	if (p->little_endian)
+		iowrite32(val, addr);
+	else
+		iowrite32be(val, addr);
+}
+
+static u32 tmu_read(struct qoriq_tmu_data *p, void __iomem *addr)
+{
+	if (p->little_endian)
+		return ioread32(addr);
+	else
+		return ioread32be(addr);
+}
+
+/* .get_temp callback: report (TRITSR & 0xff) * 1000 for the probed site */
+static int tmu_get_temp(void *p, int *temp)
+{
+	struct qoriq_tmu_data *data = p;
+	u32 val = tmu_read(data, &data->regs->site[data->sensor_id].tritsr);
+
+	*temp = (val & 0xff) * 1000;
+
+	return 0;
+}
+
+/*
+ * Return the first "thermal-sensors" specifier cell of the first child of
+ * the "thermal-zones" node (0 if it has no cells), or an error code.
+ */
+static int qoriq_tmu_get_sensor_id(void)
+{
+	int ret, id;
+	struct of_phandle_args sensor_specs;
+	struct device_node *np, *sensor_np;
+
+	np = of_find_node_by_name(NULL, "thermal-zones");
+	if (!np)
+		return -ENODEV;
+
+	sensor_np = of_get_next_child(np, NULL);
+	ret = of_parse_phandle_with_args(sensor_np, "thermal-sensors",
+			"#thermal-sensor-cells", 0, &sensor_specs);
+	if (ret) {
+		of_node_put(np);
+		of_node_put(sensor_np);
+		return ret;
+	}
+
+	id = sensor_specs.args_count >= 1 ? sensor_specs.args[0] : 0;
+	WARN(sensor_specs.args_count > 1,
+			"%s: too many cells in sensor specifier %d\n",
+			sensor_specs.np->name, sensor_specs.args_count);
+
+	/* sensor_specs.np was returned with a reference held */
+	of_node_put(sensor_specs.np);
+	of_node_put(np);
+	of_node_put(sensor_np);
+	return id;
+}
+
+static int qoriq_tmu_calibration(struct platform_device *pdev)
+{
+	int i, val, len;
+	u32 range[4];
+	const u32 *calibration;
+	struct device_node *np = pdev->dev.of_node;
+	struct qoriq_tmu_data *data = platform_get_drvdata(pdev);
+
+	if (of_property_read_u32_array(np, "fsl,tmu-range", range, 4)) {
+		dev_err(&pdev->dev, "missing calibration range.\n");
+		return -ENODEV;
+	}
+
+	/* Init temperature range registers */
+	tmu_write(data, range[0], &data->regs->ttr0cr);
+	tmu_write(data, range[1], &data->regs->ttr1cr);
+	tmu_write(data, range[2], &data->regs->ttr2cr);
+	tmu_write(data, range[3], &data->regs->ttr3cr);
+
+	calibration = of_get_property(np, "fsl,tmu-calibration", &len);
+	if (calibration == NULL || len % 8) {
+		dev_err(&pdev->dev, "invalid calibration data.\n");
+		return -ENODEV;
+	}
+
+	for (i = 0; i < len; i += 8, calibration += 2) {
+		val = of_read_number(calibration, 1);
+		tmu_write(data, val, &data->regs->ttcfgr);
+		val = of_read_number(calibration + 1, 1);
+		tmu_write(data, val, &data->regs->tscfgr);
+	}
+
+	return 0;
+}
+
+static void qoriq_tmu_init_device(struct qoriq_tmu_data *data)
+{
+	/* Disable interrupt, using polling instead */
+	tmu_write(data, TIER_DISABLE, &data->regs->tier);
+
+	/* Set update_interval */
+	tmu_write(data, TMTMIR_DEFAULT, &data->regs->tmtmir);
+
+	/* Disable monitoring */
+	tmu_write(data, TMR_DISABLE, &data->regs->tmr);
+}
+
+static struct thermal_zone_of_device_ops tmu_tz_ops = {
+	.get_temp = tmu_get_temp,
+};
+
+/*
+ * Map the TMU, initialize and calibrate it, then register the sensor.
+ */
+static int qoriq_tmu_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct qoriq_tmu_data *data;
+	struct device_node *np = pdev->dev.of_node;
+	u32 site = 0;
+
+	if (!np) {
+		dev_err(&pdev->dev, "Device OF-Node is NULL");
+		return -ENODEV;
+	}
+
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, data);
+
+	data->little_endian = of_property_read_bool(np, "little-endian");
+
+	/* The id indexes regs->site[] and picks the TMR site bit below */
+	data->sensor_id = qoriq_tmu_get_sensor_id();
+	if (data->sensor_id < 0 || data->sensor_id >= SITES_MAX) {
+		dev_err(&pdev->dev, "Failed to get sensor id\n");
+		ret = -ENODEV;
+		goto err_iomap;
+	}
+
+	data->regs = of_iomap(np, 0);
+	if (!data->regs) {
+		dev_err(&pdev->dev, "Failed to get memory region\n");
+		ret = -ENODEV;
+		goto err_iomap;
+	}
+
+	qoriq_tmu_init_device(data);	/* TMU initialization */
+
+	ret = qoriq_tmu_calibration(pdev);	/* TMU calibration */
+	if (ret < 0)
+		goto err_tmu;
+
+	data->tz = thermal_zone_of_sensor_register(&pdev->dev, data->sensor_id,
+				data, &tmu_tz_ops);
+	if (IS_ERR(data->tz)) {
+		ret = PTR_ERR(data->tz);
+		dev_err(&pdev->dev,
+			"Failed to register thermal zone device %d\n", ret);
+		goto err_tmu;
+	}
+
+	/* Enable monitoring */
+	site |= 0x1 << (15 - data->sensor_id);
+	tmu_write(data, site | TMR_ME | TMR_ALPF, &data->regs->tmr);
+
+	return 0;
+
+err_tmu:
+	iounmap(data->regs);
+
+err_iomap:
+	platform_set_drvdata(pdev, NULL);
+
+	return ret;
+}
+
+static int qoriq_tmu_remove(struct platform_device *pdev)
+{
+	struct qoriq_tmu_data *data = platform_get_drvdata(pdev);
+
+	thermal_zone_of_sensor_unregister(&pdev->dev, data->tz);
+
+	/* Disable monitoring */
+	tmu_write(data, TMR_DISABLE, &data->regs->tmr);
+
+	iounmap(data->regs);
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/* PM sleep hook: clear TMR_ME, leaving the other TMR bits as they were */
+static int qoriq_tmu_suspend(struct device *dev)
+{
+	struct qoriq_tmu_data *data = dev_get_drvdata(dev);
+	u32 tmr = tmu_read(data, &data->regs->tmr);
+
+	/* Disable monitoring */
+	tmr &= ~TMR_ME;
+	tmu_write(data, tmr, &data->regs->tmr);
+
+	return 0;
+}
+
+/* PM resume hook: set TMR_ME again, leaving the other TMR bits untouched */
+static int qoriq_tmu_resume(struct device *dev)
+{
+	struct qoriq_tmu_data *data = dev_get_drvdata(dev);
+	u32 tmr = tmu_read(data, &data->regs->tmr);
+
+	/* Enable monitoring */
+	tmr |= TMR_ME;
+	tmu_write(data, tmr, &data->regs->tmr);
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(qoriq_tmu_pm_ops,
+			 qoriq_tmu_suspend, qoriq_tmu_resume);
+
+static const struct of_device_id qoriq_tmu_match[] = {
+	{ .compatible = "fsl,qoriq-tmu", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, qoriq_tmu_match);
+
+static struct platform_driver qoriq_tmu = {
+	.driver	= {
+		.name		= "qoriq_thermal",
+		.pm		= &qoriq_tmu_pm_ops,
+		.of_match_table	= qoriq_tmu_match,
+	},
+	.probe	= qoriq_tmu_probe,
+	.remove	= qoriq_tmu_remove,
+};
+module_platform_driver(qoriq_tmu);
+
+MODULE_AUTHOR("Jia Hongtao <hongtao.jia@nxp.com>");
+MODULE_DESCRIPTION("QorIQ Thermal Monitoring Unit driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c
index 5f81792..73e5fee 100644
--- a/drivers/thermal/rcar_thermal.c
+++ b/drivers/thermal/rcar_thermal.c
@@ -31,6 +31,8 @@
 #include <linux/spinlock.h>
 #include <linux/thermal.h>
 
+#include "thermal_hwmon.h"
+
 #define IDLE_INTERVAL	5000
 
 #define COMMON_STR	0x00
@@ -75,6 +77,8 @@
 #define rcar_priv_to_dev(priv)		((priv)->common->dev)
 #define rcar_has_irq_support(priv)	((priv)->common->base)
 #define rcar_id_to_shift(priv)		((priv)->id * 8)
+#define rcar_of_data(dev)		((unsigned long)of_device_get_match_data(dev))
+#define rcar_use_of_thermal(dev)	(rcar_of_data(dev) == USE_OF_THERMAL)
 
 #define USE_OF_THERMAL	1
 static const struct of_device_id rcar_thermal_dt_ids[] = {
@@ -354,7 +358,8 @@
 		return;
 
 	if (nctemp != cctemp)
-		thermal_zone_device_update(priv->zone);
+		thermal_zone_device_update(priv->zone,
+					   THERMAL_EVENT_UNSPECIFIED);
 }
 
 static u32 rcar_thermal_had_changed(struct rcar_thermal_priv *priv, u32 status)
@@ -415,7 +420,10 @@
 
 	rcar_thermal_for_each_priv(priv, common) {
 		rcar_thermal_irq_disable(priv);
-		thermal_zone_device_unregister(priv->zone);
+		if (rcar_use_of_thermal(dev))
+			thermal_remove_hwmon_sysfs(priv->zone);
+		else
+			thermal_zone_device_unregister(priv->zone);
 	}
 
 	pm_runtime_put(dev);
@@ -430,7 +438,6 @@
 	struct rcar_thermal_priv *priv;
 	struct device *dev = &pdev->dev;
 	struct resource *res, *irq;
-	unsigned long of_data = (unsigned long)of_device_get_match_data(dev);
 	int mres = 0;
 	int i;
 	int ret = -ENODEV;
@@ -491,7 +498,7 @@
 		if (ret < 0)
 			goto error_unregister;
 
-		if (of_data == USE_OF_THERMAL)
+		if (rcar_use_of_thermal(dev))
 			priv->zone = devm_thermal_zone_of_sensor_register(
 						dev, i, priv,
 						&rcar_thermal_zone_of_ops);
@@ -508,6 +515,17 @@
 			goto error_unregister;
 		}
 
+		if (rcar_use_of_thermal(dev)) {
+			/*
+			 * thermal_zone doesn't enable hwmon by default,
+			 * but enable it here to stay compatible
+			 */
+			priv->zone->tzp->no_hwmon = false;
+			ret = thermal_add_hwmon_sysfs(priv->zone);
+			if (ret)
+				goto error_unregister;
+		}
+
 		rcar_thermal_irq_enable(priv);
 
 		list_move_tail(&priv->list, &common->head);
diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index 5d491f1..e227a9f 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -96,6 +96,7 @@
  * @initialize: SoC special initialize tsadc controller method
  * @irq_ack: clear the interrupt
  * @get_temp: get the temperature
+ * @set_alarm_temp: set the high temperature interrupt
  * @set_tshut_temp: set the hardware-controlled shutdown temperature
  * @set_tshut_mode: set the hardware-controlled shutdown mode
  * @table: the chip-specific conversion table
@@ -119,6 +120,8 @@
 	/* Per-sensor methods */
 	int (*get_temp)(struct chip_tsadc_table table,
 			int chn, void __iomem *reg, int *temp);
+	void (*set_alarm_temp)(struct chip_tsadc_table table,
+			       int chn, void __iomem *reg, int temp);
 	void (*set_tshut_temp)(struct chip_tsadc_table table,
 			       int chn, void __iomem *reg, int temp);
 	void (*set_tshut_mode)(int chn, void __iomem *reg, enum tshut_mode m);
@@ -183,6 +186,7 @@
 #define TSADCV2_INT_EN				0x08
 #define TSADCV2_INT_PD				0x0c
 #define TSADCV2_DATA(chn)			(0x20 + (chn) * 0x04)
+#define TSADCV2_COMP_INT(chn)		        (0x30 + (chn) * 0x04)
 #define TSADCV2_COMP_SHUT(chn)		        (0x40 + (chn) * 0x04)
 #define TSADCV2_HIGHT_INT_DEBOUNCE		0x60
 #define TSADCV2_HIGHT_TSHUT_DEBOUNCE		0x64
@@ -207,18 +211,21 @@
 
 #define TSADCV2_HIGHT_INT_DEBOUNCE_COUNT	4
 #define TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT	4
-#define TSADCV2_AUTO_PERIOD_TIME		250 /* msec */
-#define TSADCV2_AUTO_PERIOD_HT_TIME		50  /* msec */
+#define TSADCV2_AUTO_PERIOD_TIME		250 /* 250ms */
+#define TSADCV2_AUTO_PERIOD_HT_TIME		50  /* 50ms */
+#define TSADCV3_AUTO_PERIOD_TIME		1875 /* 2.5ms */
+#define TSADCV3_AUTO_PERIOD_HT_TIME		1875 /* 2.5ms */
+
 #define TSADCV2_USER_INTER_PD_SOC		0x340 /* 13 clocks */
 
 #define GRF_SARADC_TESTBIT			0x0e644
 #define GRF_TSADC_TESTBIT_L			0x0e648
 #define GRF_TSADC_TESTBIT_H			0x0e64c
 
-#define GRF_TSADC_TSEN_PD_ON			(0x30003 << 0)
-#define GRF_TSADC_TSEN_PD_OFF			(0x30000 << 0)
 #define GRF_SARADC_TESTBIT_ON			(0x10001 << 2)
 #define GRF_TSADC_TESTBIT_H_ON			(0x10001 << 2)
+#define GRF_TSADC_VCM_EN_L			(0x10001 << 7)
+#define GRF_TSADC_VCM_EN_H			(0x10001 << 7)
 
 /**
  * struct tsadc_table - code to temperature conversion table
@@ -394,13 +401,17 @@
 				   int temp)
 {
 	int high, low, mid;
+	u32 error = 0;
 
 	low = 0;
 	high = table.length - 1;
 	mid = (high + low) / 2;
 
-	if (temp < table.id[low].temp || temp > table.id[high].temp)
-		return 0;
+	/* Return mask code data when the temp is over table range */
+	if (temp < table.id[low].temp || temp > table.id[high].temp) {
+		error = table.data_mask;
+		goto exit;
+	}
 
 	while (low <= high) {
 		if (temp == table.id[mid].temp)
@@ -412,7 +423,9 @@
 		mid = (low + high) / 2;
 	}
 
-	return 0;
+exit:
+	pr_err("Invalid the conversion, error=%d\n", error);
+	return error;
 }
 
 static int rk_tsadcv2_code_to_temp(struct chip_tsadc_table table, u32 code,
@@ -543,14 +556,34 @@
 		/* Set interleave value to workround ic time sync issue */
 		writel_relaxed(TSADCV2_USER_INTER_PD_SOC, regs +
 			       TSADCV2_USER_CON);
+
+		writel_relaxed(TSADCV2_AUTO_PERIOD_TIME,
+			       regs + TSADCV2_AUTO_PERIOD);
+		writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_INT_DEBOUNCE);
+		writel_relaxed(TSADCV2_AUTO_PERIOD_HT_TIME,
+			       regs + TSADCV2_AUTO_PERIOD_HT);
+		writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
+
 	} else {
-		regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_ON);
-		mdelay(10);
-		regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_OFF);
+		/* Enable the voltage common mode feature */
+		regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_VCM_EN_L);
+		regmap_write(grf, GRF_TSADC_TESTBIT_H, GRF_TSADC_VCM_EN_H);
+
 		usleep_range(15, 100); /* The spec note says at least 15 us */
 		regmap_write(grf, GRF_SARADC_TESTBIT, GRF_SARADC_TESTBIT_ON);
 		regmap_write(grf, GRF_TSADC_TESTBIT_H, GRF_TSADC_TESTBIT_H_ON);
 		usleep_range(90, 200); /* The spec note says at least 90 us */
+
+		writel_relaxed(TSADCV3_AUTO_PERIOD_TIME,
+			       regs + TSADCV2_AUTO_PERIOD);
+		writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_INT_DEBOUNCE);
+		writel_relaxed(TSADCV3_AUTO_PERIOD_HT_TIME,
+			       regs + TSADCV2_AUTO_PERIOD_HT);
+		writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
 	}
 
 	if (tshut_polarity == TSHUT_HIGH_ACTIVE)
@@ -559,14 +592,6 @@
 	else
 		writel_relaxed(0U & ~TSADCV2_AUTO_TSHUT_POLARITY_HIGH,
 			       regs + TSADCV2_AUTO_CON);
-
-	writel_relaxed(TSADCV2_AUTO_PERIOD_TIME, regs + TSADCV2_AUTO_PERIOD);
-	writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
-		       regs + TSADCV2_HIGHT_INT_DEBOUNCE);
-	writel_relaxed(TSADCV2_AUTO_PERIOD_HT_TIME,
-		       regs + TSADCV2_AUTO_PERIOD_HT);
-	writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
-		       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
 }
 
 static void rk_tsadcv2_irq_ack(void __iomem *regs)
@@ -628,12 +653,34 @@
 	return rk_tsadcv2_code_to_temp(table, val, temp);
 }
 
+static void rk_tsadcv2_alarm_temp(struct chip_tsadc_table table,
+				  int chn, void __iomem *regs, int temp)
+{
+	u32 alarm_value, int_en;
+
+	/* Make sure the value is valid */
+	alarm_value = rk_tsadcv2_temp_to_code(table, temp);
+	if (alarm_value == table.data_mask)
+		return;
+
+	writel_relaxed(alarm_value & table.data_mask,
+		       regs + TSADCV2_COMP_INT(chn));
+
+	int_en = readl_relaxed(regs + TSADCV2_INT_EN);
+	int_en |= TSADCV2_INT_SRC_EN(chn);
+	writel_relaxed(int_en, regs + TSADCV2_INT_EN);
+}
+
 static void rk_tsadcv2_tshut_temp(struct chip_tsadc_table table,
 				  int chn, void __iomem *regs, int temp)
 {
 	u32 tshut_value, val;
 
+	/* Make sure the value is valid */
 	tshut_value = rk_tsadcv2_temp_to_code(table, temp);
+	if (tshut_value == table.data_mask)
+		return;
+
 	writel_relaxed(tshut_value, regs + TSADCV2_COMP_SHUT(chn));
 
 	/* TSHUT will be valid */
@@ -670,6 +717,7 @@
 	.irq_ack = rk_tsadcv3_irq_ack,
 	.control = rk_tsadcv3_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -694,6 +742,7 @@
 	.irq_ack = rk_tsadcv2_irq_ack,
 	.control = rk_tsadcv2_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -718,6 +767,7 @@
 	.irq_ack = rk_tsadcv3_irq_ack,
 	.control = rk_tsadcv3_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -742,6 +792,7 @@
 	.irq_ack = rk_tsadcv2_irq_ack,
 	.control = rk_tsadcv2_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -766,6 +817,7 @@
 	.irq_ack = rk_tsadcv3_irq_ack,
 	.control = rk_tsadcv3_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -821,11 +873,27 @@
 	thermal->chip->irq_ack(thermal->regs);
 
 	for (i = 0; i < thermal->chip->chn_num; i++)
-		thermal_zone_device_update(thermal->sensors[i].tzd);
+		thermal_zone_device_update(thermal->sensors[i].tzd,
+					   THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
 
+static int rockchip_thermal_set_trips(void *_sensor, int low, int high)
+{
+	struct rockchip_thermal_sensor *sensor = _sensor;
+	struct rockchip_thermal_data *thermal = sensor->thermal;
+	const struct rockchip_tsadc_chip *tsadc = thermal->chip;
+
+	dev_dbg(&thermal->pdev->dev, "%s: sensor %d: low: %d, high %d\n",
+		__func__, sensor->id, low, high);
+
+	tsadc->set_alarm_temp(tsadc->table,
+			      sensor->id, thermal->regs, high);
+
+	return 0;
+}
+
 static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 {
 	struct rockchip_thermal_sensor *sensor = _sensor;
@@ -843,6 +911,7 @@
 
 static const struct thermal_zone_of_device_ops rockchip_of_thermal_ops = {
 	.get_temp = rockchip_thermal_get_temp,
+	.set_trips = rockchip_thermal_set_trips,
 };
 
 static int rockchip_configure_from_dt(struct device *dev,
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index f3ce94e..ad1186d 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -225,7 +225,7 @@
 		return;
 	}
 
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	mutex_lock(&tz->lock);
 	/* Find the level for which trip happened */
diff --git a/drivers/thermal/st/st_thermal_memmap.c b/drivers/thermal/st/st_thermal_memmap.c
index fc0c9e1..91d4231 100644
--- a/drivers/thermal/st/st_thermal_memmap.c
+++ b/drivers/thermal/st/st_thermal_memmap.c
@@ -42,7 +42,8 @@
 {
 	struct st_thermal_sensor *sensor = sdata;
 
-	thermal_zone_device_update(sensor->thermal_dev);
+	thermal_zone_device_update(sensor->thermal_dev,
+				   THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/tango_thermal.c b/drivers/thermal/tango_thermal.c
index 70e0d9f..201304a 100644
--- a/drivers/thermal/tango_thermal.c
+++ b/drivers/thermal/tango_thermal.c
@@ -64,6 +64,12 @@
 	.get_temp	= tango_get_temp,
 };
 
+static void tango_thermal_init(struct tango_thermal_priv *priv)
+{
+	writel(0, priv->base + TEMPSI_CFG);
+	writel(CMD_ON, priv->base + TEMPSI_CMD);
+}
+
 static int tango_thermal_probe(struct platform_device *pdev)
 {
 	struct resource *res;
@@ -79,14 +85,22 @@
 	if (IS_ERR(priv->base))
 		return PTR_ERR(priv->base);
 
+	platform_set_drvdata(pdev, priv);
 	priv->thresh_idx = IDX_MIN;
-	writel(0, priv->base + TEMPSI_CFG);
-	writel(CMD_ON, priv->base + TEMPSI_CMD);
+	tango_thermal_init(priv);
 
 	tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, priv, &ops);
 	return PTR_ERR_OR_ZERO(tzdev);
 }
 
+static int __maybe_unused tango_thermal_resume(struct device *dev)
+{
+	tango_thermal_init(dev_get_drvdata(dev));
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(tango_thermal_pm, NULL, tango_thermal_resume);
+
 static const struct of_device_id tango_sensor_ids[] = {
 	{
 		.compatible = "sigma,smp8758-thermal",
@@ -99,6 +113,7 @@
 	.driver	= {
 		.name		= "tango-thermal",
 		.of_match_table	= tango_sensor_ids,
+		.pm		= &tango_thermal_pm,
 	},
 };
 
diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
index b865172..7d2db23 100644
--- a/drivers/thermal/tegra/soctherm.c
+++ b/drivers/thermal/tegra/soctherm.c
@@ -30,6 +30,7 @@
 
 #include <dt-bindings/thermal/tegra124-soctherm.h>
 
+#include "../thermal_core.h"
 #include "soctherm.h"
 
 #define SENSOR_CONFIG0				0
@@ -67,35 +68,228 @@
 #define READBACK_ADD_HALF			BIT(7)
 #define READBACK_NEGATE				BIT(0)
 
+/*
+ * THERMCTL_LEVEL0_GROUP_CPU is defined in soctherm.h
+ * because it will be used by tegraxxx_soctherm.c
+ */
+#define THERMCTL_LVL0_CPU0_EN_MASK		BIT(8)
+#define THERMCTL_LVL0_CPU0_CPU_THROT_MASK	(0x3 << 5)
+#define THERMCTL_LVL0_CPU0_CPU_THROT_LIGHT	0x1
+#define THERMCTL_LVL0_CPU0_CPU_THROT_HEAVY	0x2
+#define THERMCTL_LVL0_CPU0_GPU_THROT_MASK	(0x3 << 3)
+#define THERMCTL_LVL0_CPU0_GPU_THROT_LIGHT	0x1
+#define THERMCTL_LVL0_CPU0_GPU_THROT_HEAVY	0x2
+#define THERMCTL_LVL0_CPU0_MEM_THROT_MASK	BIT(2)
+#define THERMCTL_LVL0_CPU0_STATUS_MASK		0x3
+
+#define THERMCTL_LVL0_UP_STATS			0x10
+#define THERMCTL_LVL0_DN_STATS			0x14
+
+#define THERMCTL_STATS_CTL			0x94
+#define STATS_CTL_CLR_DN			0x8
+#define STATS_CTL_EN_DN				0x4
+#define STATS_CTL_CLR_UP			0x2
+#define STATS_CTL_EN_UP				0x1
+
+#define THROT_GLOBAL_CFG			0x400
+#define THROT_GLOBAL_ENB_MASK			BIT(0)
+
+#define CPU_PSKIP_STATUS			0x418
+#define XPU_PSKIP_STATUS_M_MASK			(0xff << 12)
+#define XPU_PSKIP_STATUS_N_MASK			(0xff << 4)
+#define XPU_PSKIP_STATUS_SW_OVERRIDE_MASK	BIT(1)
+#define XPU_PSKIP_STATUS_ENABLED_MASK		BIT(0)
+
+#define THROT_PRIORITY_LOCK			0x424
+#define THROT_PRIORITY_LOCK_PRIORITY_MASK	0xff
+
+#define THROT_STATUS				0x428
+#define THROT_STATUS_BREACH_MASK		BIT(12)
+#define THROT_STATUS_STATE_MASK			(0xff << 4)
+#define THROT_STATUS_ENABLED_MASK		BIT(0)
+
+#define THROT_PSKIP_CTRL_LITE_CPU		0x430
+#define THROT_PSKIP_CTRL_ENABLE_MASK            BIT(31)
+#define THROT_PSKIP_CTRL_DIVIDEND_MASK          (0xff << 8)
+#define THROT_PSKIP_CTRL_DIVISOR_MASK           0xff
+#define THROT_PSKIP_CTRL_VECT_GPU_MASK          (0x7 << 16)
+#define THROT_PSKIP_CTRL_VECT_CPU_MASK          (0x7 << 8)
+#define THROT_PSKIP_CTRL_VECT2_CPU_MASK         0x7
+
+#define THROT_VECT_NONE				0x0 /* 3'b000 */
+#define THROT_VECT_LOW				0x1 /* 3'b001 */
+#define THROT_VECT_MED				0x3 /* 3'b011 */
+#define THROT_VECT_HIGH				0x7 /* 3'b111 */
+
+#define THROT_PSKIP_RAMP_LITE_CPU		0x434
+#define THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK	BIT(31)
+#define THROT_PSKIP_RAMP_DURATION_MASK		(0xffff << 8)
+#define THROT_PSKIP_RAMP_STEP_MASK		0xff
+
+#define THROT_PRIORITY_LITE			0x444
+#define THROT_PRIORITY_LITE_PRIO_MASK		0xff
+
+#define THROT_DELAY_LITE			0x448
+#define THROT_DELAY_LITE_DELAY_MASK		0xff
+
+/* car register offsets needed for enabling HW throttling */
+#define CAR_SUPER_CCLKG_DIVIDER			0x36c
+#define CDIVG_USE_THERM_CONTROLS_MASK		BIT(30)
+
+/* ccroc register offsets needed for enabling HW throttling for Tegra132 */
+#define CCROC_SUPER_CCLKG_DIVIDER		0x024
+
+#define CCROC_GLOBAL_CFG			0x148
+
+#define CCROC_THROT_PSKIP_RAMP_CPU		0x150
+#define CCROC_THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK	BIT(31)
+#define CCROC_THROT_PSKIP_RAMP_DURATION_MASK	(0xffff << 8)
+#define CCROC_THROT_PSKIP_RAMP_STEP_MASK	0xff
+
+#define CCROC_THROT_PSKIP_CTRL_CPU		0x154
+#define CCROC_THROT_PSKIP_CTRL_ENB_MASK		BIT(31)
+#define CCROC_THROT_PSKIP_CTRL_DIVIDEND_MASK	(0xff << 8)
+#define CCROC_THROT_PSKIP_CTRL_DIVISOR_MASK	0xff
+
 /* get val from register(r) mask bits(m) */
 #define REG_GET_MASK(r, m)	(((r) & (m)) >> (ffs(m) - 1))
 /* set val(v) to mask bits(m) of register(r) */
 #define REG_SET_MASK(r, m, v)	(((r) & ~(m)) | \
 				 (((v) & (m >> (ffs(m) - 1))) << (ffs(m) - 1)))
 
+/* get dividend from the depth */
+#define THROT_DEPTH_DIVIDEND(depth)	((256 * (100 - (depth)) / 100) - 1)
+
+/* get THROT_PSKIP_xxx offset per LIGHT/HEAVY throt and CPU/GPU dev */
+#define THROT_OFFSET			0x30
+#define THROT_PSKIP_CTRL(throt, dev)	(THROT_PSKIP_CTRL_LITE_CPU + \
+					(THROT_OFFSET * throt) + (8 * dev))
+#define THROT_PSKIP_RAMP(throt, dev)	(THROT_PSKIP_RAMP_LITE_CPU + \
+					(THROT_OFFSET * throt) + (8 * dev))
+
+/* get THROT_xxx_CTRL offset per LIGHT/HEAVY throt */
+#define THROT_PRIORITY_CTRL(throt)	(THROT_PRIORITY_LITE + \
+					(THROT_OFFSET * throt))
+#define THROT_DELAY_CTRL(throt)		(THROT_DELAY_LITE + \
+					(THROT_OFFSET * throt))
+
+/* get CCROC_THROT_PSKIP_xxx offset per HIGH/MED/LOW vect */
+#define CCROC_THROT_OFFSET			0x0c
+#define CCROC_THROT_PSKIP_CTRL_CPU_REG(vect)    (CCROC_THROT_PSKIP_CTRL_CPU + \
+						(CCROC_THROT_OFFSET * vect))
+#define CCROC_THROT_PSKIP_RAMP_CPU_REG(vect)    (CCROC_THROT_PSKIP_RAMP_CPU + \
+						(CCROC_THROT_OFFSET * vect))
+
+/* get THERMCTL_LEVELx offset per CPU/GPU/MEM/TSENSE rg and LEVEL0~3 lv */
+#define THERMCTL_LVL_REGS_SIZE		0x20
+#define THERMCTL_LVL_REG(rg, lv)	((rg) + ((lv) * THERMCTL_LVL_REGS_SIZE))
+
 static const int min_low_temp = -127000;
 static const int max_high_temp = 127000;
 
+enum soctherm_throttle_id {
+	THROTTLE_LIGHT = 0,
+	THROTTLE_HEAVY,
+	THROTTLE_SIZE,
+};
+
+enum soctherm_throttle_dev_id {
+	THROTTLE_DEV_CPU = 0,
+	THROTTLE_DEV_GPU,
+	THROTTLE_DEV_SIZE,
+};
+
+static const char *const throt_names[] = {
+	[THROTTLE_LIGHT] = "light",
+	[THROTTLE_HEAVY] = "heavy",
+};
+
+struct tegra_soctherm;
 struct tegra_thermctl_zone {
 	void __iomem *reg;
 	struct device *dev;
+	struct tegra_soctherm *ts;
 	struct thermal_zone_device *tz;
 	const struct tegra_tsensor_group *sg;
 };
 
+struct soctherm_throt_cfg {
+	const char *name;
+	unsigned int id;
+	u8 priority;
+	u8 cpu_throt_level;
+	u32 cpu_throt_depth;
+	struct thermal_cooling_device *cdev;
+	bool init;
+};
+
 struct tegra_soctherm {
 	struct reset_control *reset;
 	struct clk *clock_tsensor;
 	struct clk *clock_soctherm;
 	void __iomem *regs;
-	struct thermal_zone_device **thermctl_tzs;
+	void __iomem *clk_regs;
+	void __iomem *ccroc_regs;
 
 	u32 *calib;
+	struct thermal_zone_device **thermctl_tzs;
 	struct tegra_soctherm_soc *soc;
 
+	struct soctherm_throt_cfg throt_cfgs[THROTTLE_SIZE];
+
 	struct dentry *debugfs_dir;
 };
 
+/**
+ * clk_writel() - writes a value to a CAR register
+ * @ts: pointer to a struct tegra_soctherm
+ * @value: the value to write
+ * @reg: the register offset
+ *
+ * Writes @value to @reg.  No return value.
+ */
+static inline void clk_writel(struct tegra_soctherm *ts, u32 value, u32 reg)
+{
+	writel(value, (ts->clk_regs + reg));
+}
+
+/**
+ * clk_readl() - reads specified register from CAR IP block
+ * @ts: pointer to a struct tegra_soctherm
+ * @reg: register address to be read
+ *
+ * Return: the value of the register
+ */
+static inline u32 clk_readl(struct tegra_soctherm *ts, u32 reg)
+{
+	return readl(ts->clk_regs + reg);
+}
+
+/**
+ * ccroc_writel() - writes a value to a CCROC register
+ * @ts: pointer to a struct tegra_soctherm
+ * @value: the value to write
+ * @reg: the register offset
+ *
+ * Writes @value to @reg.  No return value.
+ */
+static inline void ccroc_writel(struct tegra_soctherm *ts, u32 value, u32 reg)
+{
+	writel(value, (ts->ccroc_regs + reg));
+}
+
+/**
+ * ccroc_readl() - reads specified register from CCROC IP block
+ * @ts: pointer to a struct tegra_soctherm
+ * @reg: register address to be read
+ *
+ * Return: the value of the register
+ */
+static inline u32 ccroc_readl(struct tegra_soctherm *ts, u32 reg)
+{
+	return readl(ts->ccroc_regs + reg);
+}
+
 static void enable_tsensor(struct tegra_soctherm *tegra, unsigned int i)
 {
 	const struct tegra_tsensor *sensor = &tegra->soc->tsensors[i];
@@ -150,11 +344,17 @@
 static int
 thermtrip_program(struct device *dev, const struct tegra_tsensor_group *sg,
 		  int trip_temp);
+static int
+throttrip_program(struct device *dev, const struct tegra_tsensor_group *sg,
+		  struct soctherm_throt_cfg *stc, int trip_temp);
+static struct soctherm_throt_cfg *
+find_throttle_cfg_by_name(struct tegra_soctherm *ts, const char *name);
 
 static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
 {
 	struct tegra_thermctl_zone *zone = data;
 	struct thermal_zone_device *tz = zone->tz;
+	struct tegra_soctherm *ts = zone->ts;
 	const struct tegra_tsensor_group *sg = zone->sg;
 	struct device *dev = zone->dev;
 	enum thermal_trip_type type;
@@ -167,10 +367,29 @@
 	if (ret)
 		return ret;
 
-	if (type != THERMAL_TRIP_CRITICAL)
-		return 0;
+	if (type == THERMAL_TRIP_CRITICAL) {
+		return thermtrip_program(dev, sg, temp);
+	} else if (type == THERMAL_TRIP_HOT) {
+		int i;
 
-	return thermtrip_program(dev, sg, temp);
+		for (i = 0; i < THROTTLE_SIZE; i++) {
+			struct thermal_cooling_device *cdev;
+			struct soctherm_throt_cfg *stc;
+
+			if (!ts->throt_cfgs[i].init)
+				continue;
+
+			cdev = ts->throt_cfgs[i].cdev;
+			if (get_thermal_instance(tz, cdev, trip))
+				stc = find_throttle_cfg_by_name(ts, cdev->type);
+			else
+				continue;
+
+			return throttrip_program(dev, sg, stc, temp);
+		}
+	}
+
+	return 0;
 }
 
 static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
@@ -238,14 +457,110 @@
 }
 
 /**
+ * throttrip_program() - Configures the hardware to throttle the
+ * pulse if a given sensor group reaches a given temperature
+ * @dev: ptr to the struct device for the SOC_THERM IP block
+ * @sg: pointer to the sensor group to set the throttle trip temperature for
+ * @stc: pointer to the throttle config that needs to be triggered
+ * @trip_temp: the temperature in millicelsius to trigger the thermal trip at
+ *
+ * Sets the thermal trip threshold and throttle event of the given sensor
+ * group. If this threshold is crossed, the hardware will trigger the
+ * throttle.
+ *
+ * Note that, although @trip_temp is specified in millicelsius, the
+ * hardware is programmed in degrees Celsius.
+ *
+ * Return: 0 upon success, or %-EINVAL upon failure.
+ */
+static int throttrip_program(struct device *dev,
+			     const struct tegra_tsensor_group *sg,
+			     struct soctherm_throt_cfg *stc,
+			     int trip_temp)
+{
+	struct tegra_soctherm *ts;	/* set once @dev is validated */
+	int temp, cpu_throt, gpu_throt;
+	unsigned int throt;
+	u32 r, reg_off;
+
+	if (!dev || !sg || !stc || !stc->init)
+		return -EINVAL;
+
+	ts = dev_get_drvdata(dev);
+	temp = enforce_temp_range(dev, trip_temp) / ts->soc->thresh_grain;
+
+	/* Hardcode LIGHT on LEVEL1 and HEAVY on LEVEL2 */
+	throt = stc->id;
+	reg_off = THERMCTL_LVL_REG(sg->thermctl_lvl0_offset, throt + 1);
+
+	if (throt == THROTTLE_LIGHT) {
+		cpu_throt = THERMCTL_LVL0_CPU0_CPU_THROT_LIGHT;
+		gpu_throt = THERMCTL_LVL0_CPU0_GPU_THROT_LIGHT;
+	} else {
+		cpu_throt = THERMCTL_LVL0_CPU0_CPU_THROT_HEAVY;
+		gpu_throt = THERMCTL_LVL0_CPU0_GPU_THROT_HEAVY;
+		if (throt != THROTTLE_HEAVY)
+			dev_warn(dev, "invalid throt id %u - assuming HEAVY\n",
+				 throt);
+	}
+
+	r = readl(ts->regs + reg_off);
+	r = REG_SET_MASK(r, sg->thermctl_lvl0_up_thresh_mask, temp);
+	r = REG_SET_MASK(r, sg->thermctl_lvl0_dn_thresh_mask, temp);
+	r = REG_SET_MASK(r, THERMCTL_LVL0_CPU0_CPU_THROT_MASK, cpu_throt);
+	r = REG_SET_MASK(r, THERMCTL_LVL0_CPU0_GPU_THROT_MASK, gpu_throt);
+	r = REG_SET_MASK(r, THERMCTL_LVL0_CPU0_EN_MASK, 1);
+	writel(r, ts->regs + reg_off);
+
+	return 0;
+}
+
+static struct soctherm_throt_cfg *
+find_throttle_cfg_by_name(struct tegra_soctherm *ts, const char *name)
+{
+	unsigned int i;
+
+	/* All THROTTLE_SIZE slots get a name; there is no NULL sentinel. */
+	for (i = 0; i < THROTTLE_SIZE && ts->throt_cfgs[i].name; i++)
+		if (!strcmp(ts->throt_cfgs[i].name, name))
+			return &ts->throt_cfgs[i];
+	return NULL;
+}
+
+static int get_hot_temp(struct thermal_zone_device *tz, int *trip, int *temp)
+{
+	enum thermal_trip_type kind;
+	int count, idx, err;
+
+	count = of_thermal_get_ntrips(tz);
+	if (count <= 0)
+		return -EINVAL;
+
+	/* Only the first trip of type "hot" is considered. */
+	for (idx = 0; idx < count; idx++) {
+		if (tz->ops->get_trip_type(tz, idx, &kind))
+			return -EINVAL;
+		if (kind != THERMAL_TRIP_HOT)
+			continue;
+		err = tz->ops->get_trip_temp(tz, idx, temp);
+		if (err)
+			return err;
+		*trip = idx;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/**
  * tegra_soctherm_set_hwtrips() - set HW trip point from DT data
  * @dev: struct device * of the SOC_THERM instance
  *
  * Configure the SOC_THERM HW trip points, setting "THERMTRIP"
- * trip points , using "critical" type trip_temp from thermal
- * zone.
- * After they have been configured, THERMTRIP will take action
- * when the configured SoC thermal sensor group reaches a
+ * and "THROTTLE" trip points, using "critical" or "hot" type trip_temp
+ * from thermal zone.
+ * After they have been configured, THERMTRIP or THROTTLE will take
+ * action when the configured SoC thermal sensor group reaches a
  * certain temperature.
  *
  * Return: 0 upon success, or a negative error code on failure.
@@ -254,19 +569,24 @@
  * THERMTRIP has been enabled successfully when a message similar to
  * this one appears on the serial console:
  * "thermtrip: will shut down when sensor group XXX reaches YYYYYY mC"
+ * THROTTLE has been enabled successfully when a message similar to
+ * this one appears on the serial console:
+ * "throttrip: will throttle when sensor group XXX reaches YYYYYY mC"
  */
 static int tegra_soctherm_set_hwtrips(struct device *dev,
 				      const struct tegra_tsensor_group *sg,
 				      struct thermal_zone_device *tz)
 {
-	int temperature;
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	struct soctherm_throt_cfg *stc;
+	int i, trip, temperature;
 	int ret;
 
 	ret = tz->ops->get_crit_temp(tz, &temperature);
 	if (ret) {
 		dev_warn(dev, "thermtrip: %s: missing critical temperature\n",
 			 sg->name);
-		return ret;
+		goto set_throttle;
 	}
 
 	ret = thermtrip_program(dev, sg, temperature);
@@ -280,6 +600,43 @@
 		 "thermtrip: will shut down when %s reaches %d mC\n",
 		 sg->name, temperature);
 
+set_throttle:
+	ret = get_hot_temp(tz, &trip, &temperature);
+	if (ret) {
+		dev_warn(dev, "throttrip: %s: missing hot temperature\n",
+			 sg->name);
+		return 0;
+	}
+
+	for (i = 0; i < THROTTLE_SIZE; i++) {
+		struct thermal_cooling_device *cdev;
+
+		if (!ts->throt_cfgs[i].init)
+			continue;
+
+		cdev = ts->throt_cfgs[i].cdev;
+		if (get_thermal_instance(tz, cdev, trip))
+			stc = find_throttle_cfg_by_name(ts, cdev->type);
+		else
+			continue;
+
+		ret = throttrip_program(dev, sg, stc, temperature);
+		if (ret) {
+			dev_err(dev, "throttrip: %s: error during enable\n",
+				sg->name);
+			return ret;
+		}
+
+		dev_info(dev,
+			 "throttrip: will throttle when %s reaches %d mC\n",
+			 sg->name, temperature);
+		break;
+	}
+
+	if (i == THROTTLE_SIZE)
+		dev_warn(dev, "throttrip: %s: missing throttle cdev\n",
+			 sg->name);
+
 	return 0;
 }
 
@@ -291,7 +648,7 @@
 	const struct tegra_tsensor *tsensors = ts->soc->tsensors;
 	const struct tegra_tsensor_group **ttgs = ts->soc->ttgs;
 	u32 r, state;
-	int i;
+	int i, level;
 
 	seq_puts(s, "-----TSENSE (convert HW)-----\n");
 
@@ -365,6 +722,81 @@
 	state = REG_GET_MASK(r, SENSOR_TEMP2_MEM_TEMP_MASK);
 	seq_printf(s, " MEM(%d)\n", translate_temp(state));
 
+	for (i = 0; i < ts->soc->num_ttgs; i++) {
+		seq_printf(s, "%s:\n", ttgs[i]->name);
+		for (level = 0; level < 4; level++) {
+			s32 v;
+			u32 mask;
+			u16 off = ttgs[i]->thermctl_lvl0_offset;
+
+			r = readl(ts->regs + THERMCTL_LVL_REG(off, level));
+
+			mask = ttgs[i]->thermctl_lvl0_up_thresh_mask;
+			state = REG_GET_MASK(r, mask);
+			v = sign_extend32(state, ts->soc->bptt - 1);
+			v *= ts->soc->thresh_grain;
+			seq_printf(s, "   %d: Up/Dn(%d /", level, v);
+
+			mask = ttgs[i]->thermctl_lvl0_dn_thresh_mask;
+			state = REG_GET_MASK(r, mask);
+			v = sign_extend32(state, ts->soc->bptt - 1);
+			v *= ts->soc->thresh_grain;
+			seq_printf(s, "%d ) ", v);
+
+			mask = THERMCTL_LVL0_CPU0_EN_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_printf(s, "En(%d) ", state);
+
+			mask = THERMCTL_LVL0_CPU0_CPU_THROT_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_puts(s, "CPU Throt");
+			if (!state)
+				seq_printf(s, "(%s) ", "none");
+			else if (state == THERMCTL_LVL0_CPU0_CPU_THROT_LIGHT)
+				seq_printf(s, "(%s) ", "L");
+			else if (state == THERMCTL_LVL0_CPU0_CPU_THROT_HEAVY)
+				seq_printf(s, "(%s) ", "H");
+			else
+				seq_printf(s, "(%s) ", "H+L");
+
+			mask = THERMCTL_LVL0_CPU0_GPU_THROT_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_puts(s, "GPU Throt");
+			if (!state)
+				seq_printf(s, "(%s) ", "none");
+			else if (state == THERMCTL_LVL0_CPU0_GPU_THROT_LIGHT)
+				seq_printf(s, "(%s) ", "L");
+			else if (state == THERMCTL_LVL0_CPU0_GPU_THROT_HEAVY)
+				seq_printf(s, "(%s) ", "H");
+			else
+				seq_printf(s, "(%s) ", "H+L");
+
+			mask = THERMCTL_LVL0_CPU0_STATUS_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_printf(s, "Status(%s)\n",
+				   state == 0 ? "LO" :
+				   state == 1 ? "In" :
+				   state == 2 ? "Res" : "HI");
+		}
+	}
+
+	r = readl(ts->regs + THERMCTL_STATS_CTL);
+	seq_printf(s, "STATS: Up(%s) Dn(%s)\n",
+		   r & STATS_CTL_EN_UP ? "En" : "--",
+		   r & STATS_CTL_EN_DN ? "En" : "--");
+
+	for (level = 0; level < 4; level++) {
+		u16 off;
+
+		off = THERMCTL_LVL0_UP_STATS;
+		r = readl(ts->regs + THERMCTL_LVL_REG(off, level));
+		seq_printf(s, "  Level_%d Up(%d) ", level, r);
+
+		off = THERMCTL_LVL0_DN_STATS;
+		r = readl(ts->regs + THERMCTL_LVL_REG(off, level));
+		seq_printf(s, "Dn(%d)\n", r);
+	}
+
 	r = readl(ts->regs + THERMCTL_THERMTRIP_CTL);
 	state = REG_GET_MASK(r, ttgs[0]->thermtrip_any_en_mask);
 	seq_printf(s, "Thermtrip Any En(%d)\n", state);
@@ -376,6 +808,32 @@
 		seq_printf(s, "Thresh(%d)\n", state);
 	}
 
+	r = readl(ts->regs + THROT_GLOBAL_CFG);
+	seq_puts(s, "\n");
+	seq_printf(s, "GLOBAL THROTTLE CONFIG: 0x%08x\n", r);
+
+	seq_puts(s, "---------------------------------------------------\n");
+	r = readl(ts->regs + THROT_STATUS);
+	state = REG_GET_MASK(r, THROT_STATUS_BREACH_MASK);
+	seq_printf(s, "THROT STATUS: breach(%d) ", state);
+	state = REG_GET_MASK(r, THROT_STATUS_STATE_MASK);
+	seq_printf(s, "state(%d) ", state);
+	state = REG_GET_MASK(r, THROT_STATUS_ENABLED_MASK);
+	seq_printf(s, "enabled(%d)\n", state);
+
+	r = readl(ts->regs + CPU_PSKIP_STATUS);
+	if (ts->soc->use_ccroc) {
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK);
+		seq_printf(s, "CPU PSKIP STATUS: enabled(%d)\n", state);
+	} else {
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_M_MASK);
+		seq_printf(s, "CPU PSKIP STATUS: M(%d) ", state);
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_N_MASK);
+		seq_printf(s, "N(%d) ", state);
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK);
+		seq_printf(s, "enabled(%d)\n", state);
+	}
+
 	return 0;
 }
 
@@ -449,6 +907,326 @@
 	return 0;
 }
 
+static int throt_get_cdev_max_state(struct thermal_cooling_device *cdev,
+				    unsigned long *max_state)
+{
+	*max_state = 1;	/* highest state is always reported as 1 */
+	return 0;
+}
+
+/* Report state 1 when THROT_STATUS has a non-zero state field, else 0. */
+static int throt_get_cdev_cur_state(struct thermal_cooling_device *cdev,
+				    unsigned long *cur_state)
+{
+	struct tegra_soctherm *ts = cdev->devdata;
+	u32 status;
+	bool active;
+
+	status = readl(ts->regs + THROT_STATUS);
+	active = REG_GET_MASK(status, THROT_STATUS_STATE_MASK) != 0;
+	*cur_state = active ? 1 : 0;
+
+	return 0;
+}
+
+static int throt_set_cdev_state(struct thermal_cooling_device *cdev,
+				unsigned long cur_state)
+{
+	return 0;	/* requested state is ignored; always succeeds */
+}
+
+static const struct thermal_cooling_device_ops throt_cooling_ops = {
+	.get_max_state = throt_get_cdev_max_state,
+	.get_cur_state = throt_get_cdev_cur_state,
+	.set_cur_state = throt_set_cdev_state,
+};
+
+/**
+ * soctherm_init_hw_throt_cdev() - register HW throttles as cooling devices
+ * @pdev: platform device whose "throttle-cfgs" DT child node is parsed
+ */
+static void soctherm_init_hw_throt_cdev(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	struct device_node *np_stc, *np_stcc;
+	const char *name;
+	u32 val;
+	int i, r;
+
+	for (i = 0; i < THROTTLE_SIZE; i++) {
+		ts->throt_cfgs[i].name = throt_names[i];
+		ts->throt_cfgs[i].id = i;
+		ts->throt_cfgs[i].init = false;
+	}
+
+	np_stc = of_get_child_by_name(dev->of_node, "throttle-cfgs");
+	if (!np_stc) {
+		dev_info(dev,
+			 "throttle-cfg: no throttle-cfgs - not enabling\n");
+		return;
+	}
+
+	for_each_child_of_node(np_stc, np_stcc) {
+		struct soctherm_throt_cfg *stc;
+		struct thermal_cooling_device *tcd;
+
+		name = np_stcc->name;
+		stc = find_throttle_cfg_by_name(ts, name);
+		if (!stc) {
+			dev_err(dev,
+				"throttle-cfg: could not find %s\n", name);
+			continue;
+		}
+
+		r = of_property_read_u32(np_stcc, "nvidia,priority", &val);
+		if (r) {
+			dev_info(dev,
+				 "throttle-cfg: %s: missing priority\n", name);
+			continue;
+		}
+		stc->priority = val;
+
+		if (ts->soc->use_ccroc) {
+			r = of_property_read_u32(np_stcc,
+						 "nvidia,cpu-throt-level",
+						 &val);
+			if (r) {
+				dev_info(dev,
+					 "throttle-cfg: %s: missing cpu-throt-level\n",
+					 name);
+				continue;
+			}
+			stc->cpu_throt_level = val;
+		} else {
+			r = of_property_read_u32(np_stcc,
+						 "nvidia,cpu-throt-percent",
+						 &val);
+			if (r) {
+				dev_info(dev,
+					 "throttle-cfg: %s: missing cpu-throt-percent\n",
+					 name);
+				continue;
+			}
+			stc->cpu_throt_depth = val;
+		}
+
+		/* No of_node_put() here: the iterator drops np_stcc itself. */
+		tcd = thermal_of_cooling_device_register(np_stcc,
+							 (char *)name, ts,
+							 &throt_cooling_ops);
+		if (IS_ERR_OR_NULL(tcd)) {
+			dev_err(dev,
+				"throttle-cfg: %s: failed to register cooling device\n",
+				name);
+			continue;
+		}
+
+		stc->cdev = tcd;
+		stc->init = true;
+	}
+
+	of_node_put(np_stc);
+}
+
+/**
+ * throttlectl_cpu_level_cfg() - programs CCROC NV_THERM level config
+ * @ts: pointer to a struct tegra_soctherm
+ * @level: the throttle level to configure: LOW, MED or HIGH
+ *
+ * Programs the CCROC NV_THERM pulse skipper registers of @level with a
+ * fixed throttle depth: 50 for TEGRA_SOCTHERM_THROT_LEVEL_LOW, 75 for
+ * TEGRA_SOCTHERM_THROT_LEVEL_MED and 80 for
+ * TEGRA_SOCTHERM_THROT_LEVEL_HIGH.  Any other @level, including
+ * TEGRA_SOCTHERM_THROT_LEVEL_NONE, returns without touching registers.
+ */
+static void throttlectl_cpu_level_cfg(struct tegra_soctherm *ts, int level)
+{
+	u8 depth, dividend;
+	u32 ramp, ctrl;
+
+	/* Only LOW, MED and HIGH are programmed; anything else is a no-op */
+	if (level == TEGRA_SOCTHERM_THROT_LEVEL_LOW)
+		depth = 50;
+	else if (level == TEGRA_SOCTHERM_THROT_LEVEL_MED)
+		depth = 75;
+	else if (level == TEGRA_SOCTHERM_THROT_LEVEL_HIGH)
+		depth = 80;
+	else
+		return;
+
+	/*
+	 * THROT_DEPTH_DIVIDEND() converts the depth into the value that is
+	 * written to the dividend field of the control register below.
+	 */
+	dividend = THROT_DEPTH_DIVIDEND(depth);
+
+	/* setup PSKIP ramp in ccroc nv_therm registers */
+	ramp = ccroc_readl(ts, CCROC_THROT_PSKIP_RAMP_CPU_REG(level));
+	ramp = REG_SET_MASK(ramp, CCROC_THROT_PSKIP_RAMP_DURATION_MASK, 0xff);
+	ramp = REG_SET_MASK(ramp, CCROC_THROT_PSKIP_RAMP_STEP_MASK, 0xf);
+	ccroc_writel(ts, ramp, CCROC_THROT_PSKIP_RAMP_CPU_REG(level));
+
+	/* then enable PSKIP control with the dividend and a 0xff divisor */
+	ctrl = ccroc_readl(ts, CCROC_THROT_PSKIP_CTRL_CPU_REG(level));
+	ctrl = REG_SET_MASK(ctrl, CCROC_THROT_PSKIP_CTRL_ENB_MASK, 1);
+	ctrl = REG_SET_MASK(ctrl, CCROC_THROT_PSKIP_CTRL_DIVIDEND_MASK,
+			    dividend);
+	ctrl = REG_SET_MASK(ctrl, CCROC_THROT_PSKIP_CTRL_DIVISOR_MASK, 0xff);
+	ccroc_writel(ts, ctrl, CCROC_THROT_PSKIP_CTRL_CPU_REG(level));
+}
+
+/**
+ * throttlectl_cpu_level_select() - program CPU pulse skipper config
+ * @ts: pointer to a struct tegra_soctherm
+ * @throt: the LIGHT/HEAVY of throttle event id
+ *
+ * Pulse skippers are used to throttle clock frequencies.  This function
+ * is used on SoCs which have CPU-local pulse skipper control, such as
+ * T13x. It maps the cpu_throt_level configured for @throt to a Low,
+ * Medium or High throttling vector (none for any other level), enables
+ * the pulse skipper with it, and sets PSKIP_BYPASS mode as per HW spec.
+ */
+static void throttlectl_cpu_level_select(struct tegra_soctherm *ts,
+					 enum soctherm_throttle_id throt)
+{
+	u32 r, throt_vect;
+
+	/* Denver:CCROC NV_THERM interface N:3 Mapping */
+	switch (ts->throt_cfgs[throt].cpu_throt_level) {
+	case TEGRA_SOCTHERM_THROT_LEVEL_LOW:
+		throt_vect = THROT_VECT_LOW;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_MED:
+		throt_vect = THROT_VECT_MED;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_HIGH:
+		throt_vect = THROT_VECT_HIGH;
+		break;
+	default:
+		throt_vect = THROT_VECT_NONE;
+		break;
+	}
+
+	r = readl(ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_ENABLE_MASK, 1);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_VECT_CPU_MASK, throt_vect);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_VECT2_CPU_MASK, throt_vect);
+	writel(r, ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+
+	/* bypass sequencer in soc_therm as it is programmed in ccroc */
+	r = REG_SET_MASK(0, THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK, 1);
+	writel(r, ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU));
+}
+
+/**
+ * throttlectl_cpu_mn() - program CPU pulse skipper configuration
+ * @ts: pointer to a struct tegra_soctherm
+ * @throt: the LIGHT/HEAVY of throttle event id
+ *
+ * Pulse skippers are used to throttle clock frequencies.  This function
+ * programs the pulse skippers from the cpu_throt_depth configured for
+ * @throt.  It is used for CPUs that have "remote" pulse skipper
+ * control, e.g., the CPU pulse skipper is controlled by the SOC_THERM
+ * IP block.  (SOC_THERM is located outside the CPU complex.)
+ */
+static void throttlectl_cpu_mn(struct tegra_soctherm *ts,
+			       enum soctherm_throttle_id throt)
+{
+	u32 r;
+	int depth;
+	u8 dividend;
+
+	depth = ts->throt_cfgs[throt].cpu_throt_depth;
+	dividend = THROT_DEPTH_DIVIDEND(depth);
+
+	r = readl(ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_ENABLE_MASK, 1);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_DIVIDEND_MASK, dividend);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_DIVISOR_MASK, 0xff);
+	writel(r, ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+
+	r = readl(ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU));
+	r = REG_SET_MASK(r, THROT_PSKIP_RAMP_DURATION_MASK, 0xff);
+	r = REG_SET_MASK(r, THROT_PSKIP_RAMP_STEP_MASK, 0xf);
+	writel(r, ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU));
+}
+
+/**
+ * soctherm_throttle_program() - programs pulse skippers' configuration
+ * @ts: pointer to a struct tegra_soctherm
+ * @throt: the LIGHT/HEAVY of the throttle event id.
+ *
+ * Programs the pulse skipper, priority and delay of an initialized @throt.
+ */
+static void soctherm_throttle_program(struct tegra_soctherm *ts,
+				      enum soctherm_throttle_id throt)
+{
+	const struct soctherm_throt_cfg *cfg = &ts->throt_cfgs[throt];
+	u32 val, locked;
+
+	/* Skip throttles whose config was never marked as initialized */
+	if (!cfg->init)
+		return;
+
+	if (ts->soc->use_ccroc)
+		throttlectl_cpu_level_select(ts, throt);
+	else
+		throttlectl_cpu_mn(ts, throt);
+
+	val = REG_SET_MASK(0, THROT_PRIORITY_LITE_PRIO_MASK, cfg->priority);
+	writel(val, ts->regs + THROT_PRIORITY_CTRL(throt));
+
+	val = REG_SET_MASK(0, THROT_DELAY_LITE_DELAY_MASK, 0);
+	writel(val, ts->regs + THROT_DELAY_CTRL(throt));
+
+	/* Raise the priority lock only when it is below this priority */
+	locked = readl(ts->regs + THROT_PRIORITY_LOCK);
+	locked = REG_GET_MASK(locked, THROT_PRIORITY_LOCK_PRIORITY_MASK);
+	if (locked >= cfg->priority)
+		return;
+	val = REG_SET_MASK(0, THROT_PRIORITY_LOCK_PRIORITY_MASK, cfg->priority);
+	writel(val, ts->regs + THROT_PRIORITY_LOCK);
+}
+
+static void tegra_soctherm_throttle(struct device *dev)
+{
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	u32 v;
+	int i;
+
+	/* CCROC only: set up the LOW, MED and HIGH NV_THERM levels first */
+	if (ts->soc->use_ccroc) {
+		throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_LOW);
+		throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_MED);
+		throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_HIGH);
+	}
+
+	/* Program each throttle id; the callee skips uninitialized ones */
+	for (i = 0; i < THROTTLE_SIZE; i++)
+		soctherm_throttle_program(ts, i);
+
+	v = REG_SET_MASK(0, THROT_GLOBAL_ENB_MASK, 1);
+	if (ts->soc->use_ccroc) {
+		ccroc_writel(ts, v, CCROC_GLOBAL_CFG);
+
+		v = ccroc_readl(ts, CCROC_SUPER_CCLKG_DIVIDER);
+		v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1);
+		ccroc_writel(ts, v, CCROC_SUPER_CCLKG_DIVIDER);
+	} else {
+		writel(v, ts->regs + THROT_GLOBAL_CFG);
+
+		v = clk_readl(ts, CAR_SUPER_CCLKG_DIVIDER);
+		v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1);
+		clk_writel(ts, v, CAR_SUPER_CCLKG_DIVIDER);
+	}
+
+	/* stats collection: write the CLR and EN bits for both UP and DN */
+	v = STATS_CTL_CLR_DN | STATS_CTL_EN_DN |
+	    STATS_CTL_CLR_UP | STATS_CTL_EN_UP;
+	writel(v, ts->regs + THERMCTL_STATS_CTL);
+}
+
 static void soctherm_init(struct platform_device *pdev)
 {
 	struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
@@ -475,6 +1253,9 @@
 	}
 	writel(pdiv, tegra->regs + SENSOR_PDIV);
 	writel(hotspot, tegra->regs + SENSOR_HOTSPOT_OFF);
+
+	/* Configure hw throttle */
+	tegra_soctherm_throttle(&pdev->dev);
 }
 
 static const struct of_device_id tegra_soctherm_of_match[] = {
@@ -527,10 +1308,31 @@
 
 	tegra->soc = soc;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+					   "soctherm-reg");
 	tegra->regs = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(tegra->regs))
+	if (IS_ERR(tegra->regs)) {
+		dev_err(&pdev->dev, "can't get soctherm registers");
 		return PTR_ERR(tegra->regs);
+	}
+
+	if (!tegra->soc->use_ccroc) {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   "car-reg");
+		tegra->clk_regs = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(tegra->clk_regs)) {
+			dev_err(&pdev->dev, "can't get car clk registers");
+			return PTR_ERR(tegra->clk_regs);
+		}
+	} else {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   "ccroc-reg");
+		tegra->ccroc_regs = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(tegra->ccroc_regs)) {
+			dev_err(&pdev->dev, "can't get ccroc registers");
+			return PTR_ERR(tegra->ccroc_regs);
+		}
+	}
 
 	tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
 	if (IS_ERR(tegra->reset)) {
@@ -580,6 +1382,8 @@
 	if (err)
 		return err;
 
+	soctherm_init_hw_throt_cdev(pdev);
+
 	soctherm_init(pdev);
 
 	for (i = 0; i < soc->num_ttgs; ++i) {
@@ -593,6 +1397,7 @@
 		zone->reg = tegra->regs + soc->ttgs[i]->sensor_temp_offset;
 		zone->dev = &pdev->dev;
 		zone->sg = soc->ttgs[i];
+		zone->ts = tegra;
 
 		z = devm_thermal_zone_of_sensor_register(&pdev->dev,
 							 soc->ttgs[i]->id, zone,
@@ -608,7 +1413,9 @@
 		tegra->thermctl_tzs[soc->ttgs[i]->id] = z;
 
 		/* Configure hw trip points */
-		tegra_soctherm_set_hwtrips(&pdev->dev, soc->ttgs[i], z);
+		err = tegra_soctherm_set_hwtrips(&pdev->dev, soc->ttgs[i], z);
+		if (err)
+			goto disable_clocks;
 	}
 
 	soctherm_debug_init(pdev);
@@ -661,7 +1468,12 @@
 		struct thermal_zone_device *tz;
 
 		tz = tegra->thermctl_tzs[soc->ttgs[i]->id];
-		tegra_soctherm_set_hwtrips(dev, soc->ttgs[i], tz);
+		err = tegra_soctherm_set_hwtrips(dev, soc->ttgs[i], tz);
+		if (err) {
+			dev_err(&pdev->dev,
+				"Resume failed: set hwtrips failed\n");
+			return err;
+		}
 	}
 
 	return 0;
diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h
index 28e18ec..e96ca73 100644
--- a/drivers/thermal/tegra/soctherm.h
+++ b/drivers/thermal/tegra/soctherm.h
@@ -15,6 +15,11 @@
 #ifndef __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
 #define __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
 
+#define THERMCTL_LEVEL0_GROUP_CPU               0x0
+#define THERMCTL_LEVEL0_GROUP_GPU		0x4
+#define THERMCTL_LEVEL0_GROUP_MEM		0x8
+#define THERMCTL_LEVEL0_GROUP_TSENSE		0xc
+
 #define SENSOR_CONFIG2                          8
 #define SENSOR_CONFIG2_THERMA_MASK		(0xffff << 16)
 #define SENSOR_CONFIG2_THERMA_SHIFT		16
@@ -65,6 +70,9 @@
 	u32 thermtrip_enable_mask;
 	u32 thermtrip_any_en_mask;
 	u32 thermtrip_threshold_mask;
+	u16 thermctl_lvl0_offset;
+	u32 thermctl_lvl0_up_thresh_mask;
+	u32 thermctl_lvl0_dn_thresh_mask;
 };
 
 struct tegra_tsensor_configuration {
@@ -103,6 +111,8 @@
 	const unsigned int num_ttgs;
 	const struct tegra_soctherm_fuse *tfuse;
 	const int thresh_grain;
+	const unsigned int bptt;
+	const bool use_ccroc;
 };
 
 int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse,
diff --git a/drivers/thermal/tegra/tegra124-soctherm.c b/drivers/thermal/tegra/tegra124-soctherm.c
index beb9d36..3676863 100644
--- a/drivers/thermal/tegra/tegra124-soctherm.c
+++ b/drivers/thermal/tegra/tegra124-soctherm.c
@@ -28,7 +28,11 @@
 #define TEGRA124_THERMTRIP_CPU_THRESH_MASK	(0xff << 8)
 #define TEGRA124_THERMTRIP_TSENSE_THRESH_MASK	0xff
 
+#define TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK	(0xff << 17)
+#define TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK	(0xff << 9)
+
 #define TEGRA124_THRESH_GRAIN			1000
+#define TEGRA124_BPTT				8
 
 static const struct tegra_tsensor_configuration tegra124_tsensor_config = {
 	.tall = 16300,
@@ -51,6 +55,9 @@
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_CPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_CPU_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra124_tsensor_group_gpu = {
@@ -66,6 +73,9 @@
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_GPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra124_tsensor_group_pll = {
@@ -79,6 +89,9 @@
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_TSENSE_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_TSENSE_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra124_tsensor_group_mem = {
@@ -94,6 +107,9 @@
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_MEM_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group *tegra124_tsensor_groups[] = {
@@ -193,4 +209,6 @@
 	.num_ttgs = ARRAY_SIZE(tegra124_tsensor_groups),
 	.tfuse = &tegra124_soctherm_fuse,
 	.thresh_grain = TEGRA124_THRESH_GRAIN,
+	.bptt = TEGRA124_BPTT,
+	.use_ccroc = false,
 };
diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c
index e2aa84e..97fa305 100644
--- a/drivers/thermal/tegra/tegra132-soctherm.c
+++ b/drivers/thermal/tegra/tegra132-soctherm.c
@@ -28,7 +28,11 @@
 #define TEGRA132_THERMTRIP_CPU_THRESH_MASK	(0xff << 8)
 #define TEGRA132_THERMTRIP_TSENSE_THRESH_MASK	0xff
 
+#define TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK	(0xff << 17)
+#define TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK	(0xff << 9)
+
 #define TEGRA132_THRESH_GRAIN			1000
+#define TEGRA132_BPTT				8
 
 static const struct tegra_tsensor_configuration tegra132_tsensor_config = {
 	.tall = 16300,
@@ -51,6 +55,9 @@
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_CPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_CPU_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = {
@@ -66,6 +73,9 @@
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_GPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra132_tsensor_group_pll = {
@@ -79,6 +89,9 @@
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_TSENSE_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_TSENSE_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra132_tsensor_group_mem = {
@@ -94,6 +107,9 @@
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_MEM_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group *tegra132_tsensor_groups[] = {
@@ -193,4 +209,6 @@
 	.num_ttgs = ARRAY_SIZE(tegra132_tsensor_groups),
 	.tfuse = &tegra132_soctherm_fuse,
 	.thresh_grain = TEGRA132_THRESH_GRAIN,
+	.bptt = TEGRA132_BPTT,
+	.use_ccroc = true,
 };
diff --git a/drivers/thermal/tegra/tegra210-soctherm.c b/drivers/thermal/tegra/tegra210-soctherm.c
index 19cc0ab..ad53169 100644
--- a/drivers/thermal/tegra/tegra210-soctherm.c
+++ b/drivers/thermal/tegra/tegra210-soctherm.c
@@ -29,7 +29,11 @@
 #define TEGRA210_THERMTRIP_CPU_THRESH_MASK	(0x1ff << 9)
 #define TEGRA210_THERMTRIP_TSENSE_THRESH_MASK	0x1ff
 
+#define TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK	(0x1ff << 18)
+#define TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK	(0x1ff << 9)
+
 #define TEGRA210_THRESH_GRAIN			500
+#define TEGRA210_BPTT				9
 
 static const struct tegra_tsensor_configuration tegra210_tsensor_config = {
 	.tall = 16300,
@@ -52,6 +56,9 @@
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_CPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_CPU_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra210_tsensor_group_gpu = {
@@ -67,6 +74,9 @@
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_GPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra210_tsensor_group_pll = {
@@ -80,6 +90,9 @@
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_TSENSE_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_TSENSE_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra210_tsensor_group_mem = {
@@ -95,6 +108,9 @@
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_MEM_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group *tegra210_tsensor_groups[] = {
@@ -194,4 +210,6 @@
 	.num_ttgs = ARRAY_SIZE(tegra210_tsensor_groups),
 	.tfuse = &tegra210_soctherm_fuse,
 	.thresh_grain = TEGRA210_THRESH_GRAIN,
+	.bptt = TEGRA210_BPTT,
+	.use_ccroc = false,
 };
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index e2fc616..226b0b4ac 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -520,6 +520,56 @@
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
+void thermal_zone_set_trips(struct thermal_zone_device *tz)
+{
+	int low = -INT_MAX;	/* lower bound if no trip lies below */
+	int high = INT_MAX;	/* upper bound if no trip lies above */
+	int trip_temp, hysteresis;
+	int i, ret;
+
+	mutex_lock(&tz->lock);
+
+	if (!tz->ops->set_trips || !tz->ops->get_trip_hyst)
+		goto exit;
+
+	for (i = 0; i < tz->trips; i++) {
+		int trip_low;
+
+		tz->ops->get_trip_temp(tz, i, &trip_temp);
+		tz->ops->get_trip_hyst(tz, i, &hysteresis);
+
+		trip_low = trip_temp - hysteresis;
+
+		if (trip_low < tz->temperature && trip_low > low)
+			low = trip_low;
+
+		if (trip_temp > tz->temperature && trip_temp < high)
+			high = trip_temp;
+	}
+
+	/* Same window as recorded last time: skip the set_trips() call */
+	if (tz->prev_low_trip == low && tz->prev_high_trip == high)
+		goto exit;
+
+	tz->prev_low_trip = low;
+	tz->prev_high_trip = high;
+
+	dev_dbg(&tz->device,
+		"new temperature boundaries: %d < x < %d\n", low, high);
+
+	/*
+	 * Set a temperature window. When this window is left the driver
+	 * must inform the thermal core via thermal_zone_device_update.
+	 */
+	ret = tz->ops->set_trips(tz, low, high);
+	if (ret)
+		dev_err(&tz->device, "Failed to set trips: %d\n", ret);
+
+exit:
+	mutex_unlock(&tz->lock);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_set_trips);
+
 static void update_temperature(struct thermal_zone_device *tz)
 {
 	int temp, ret;
@@ -557,7 +607,8 @@
 		pos->initialized = false;
 }
 
-void thermal_zone_device_update(struct thermal_zone_device *tz)
+void thermal_zone_device_update(struct thermal_zone_device *tz,
+				enum thermal_notify_event event)
 {
 	int count;
 
@@ -569,6 +620,10 @@
 
 	update_temperature(tz);
 
+	thermal_zone_set_trips(tz);
+
+	tz->notify_event = event;
+
 	for (count = 0; count < tz->trips; count++)
 		handle_thermal_trip(tz, count);
 }
@@ -579,7 +634,7 @@
 	struct thermal_zone_device *tz = container_of(work, struct
 						      thermal_zone_device,
 						      poll_queue.work);
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 }
 
 /* sys I/F for thermal zone */
@@ -703,7 +758,7 @@
 	if (ret)
 		return ret;
 
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return count;
 }
@@ -754,6 +809,9 @@
 	 */
 	ret = tz->ops->set_trip_hyst(tz, trip, temperature);
 
+	if (!ret)
+		thermal_zone_set_trips(tz);
+
 	return ret ? ret : count;
 }
 
@@ -822,7 +880,7 @@
 
 	tz->forced_passive = state;
 
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return count;
 }
@@ -913,7 +971,7 @@
 	}
 
 	if (!ret)
-		thermal_zone_device_update(tz);
+		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return ret ? ret : count;
 }
@@ -1509,7 +1567,8 @@
 	mutex_lock(&thermal_list_lock);
 	list_for_each_entry(pos, &thermal_tz_list, node)
 		if (atomic_cmpxchg(&pos->need_update, 1, 0))
-			thermal_zone_device_update(pos);
+			thermal_zone_device_update(pos,
+						   THERMAL_EVENT_UNSPECIFIED);
 	mutex_unlock(&thermal_list_lock);
 
 	return cdev;
@@ -1952,7 +2011,7 @@
 	thermal_zone_device_reset(tz);
 	/* Update the new thermal zone and mark it as already updated. */
 	if (atomic_cmpxchg(&tz->need_update, 1, 0))
-		thermal_zone_device_update(tz);
+		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return tz;
 
@@ -2069,6 +2128,36 @@
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name);
 
+/**
+ * thermal_zone_get_slope - look up the slope parameter of a thermal zone
+ * @tz: thermal zone device to query, may be NULL
+ *
+ * Return: the slope stored in the zone's parameters, or 1 when @tz is NULL
+ * or carries no parameters.
+ */
+int thermal_zone_get_slope(struct thermal_zone_device *tz)
+{
+	if (!tz || !tz->tzp)
+		return 1;
+	return tz->tzp->slope;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_slope);
+
+/**
+ * thermal_zone_get_offset - look up the offset parameter of a thermal zone
+ * @tz: thermal zone device to query, may be NULL
+ *
+ * Return: the offset stored in the zone's parameters, or 0 when @tz is NULL
+ * or carries no parameters.
+ */
+int thermal_zone_get_offset(struct thermal_zone_device *tz)
+{
+	if (!tz || !tz->tzp)
+		return 0;
+	return tz->tzp->offset;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_offset);
+
 #ifdef CONFIG_NET
 static const struct genl_multicast_group thermal_event_mcgrps[] = {
 	{ .name = THERMAL_GENL_MCAST_GROUP_NAME, },
@@ -2209,7 +2298,8 @@
 		atomic_set(&in_suspend, 0);
 		list_for_each_entry(tz, &thermal_tz_list, node) {
 			thermal_zone_device_reset(tz);
-			thermal_zone_device_update(tz);
+			thermal_zone_device_update(tz,
+						   THERMAL_EVENT_UNSPECIFIED);
 		}
 		break;
 	default:
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index 15c0a9a..0586bd0 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -52,7 +52,7 @@
 	struct ti_thermal_data *data = container_of(work,
 					struct ti_thermal_data, thermal_wq);
 
-	thermal_zone_device_update(data->ti_thermal);
+	thermal_zone_device_update(data->ti_thermal, THERMAL_EVENT_UNSPECIFIED);
 
 	dev_dbg(&data->ti_thermal->device, "updated thermal zone %s\n",
 		data->ti_thermal->type);
@@ -205,7 +205,7 @@
 	data->mode = mode;
 	ti_bandgap_write_update_interval(bgp, data->sensor_id,
 					data->ti_thermal->polling_delay);
-	thermal_zone_device_update(data->ti_thermal);
+	thermal_zone_device_update(data->ti_thermal, THERMAL_EVENT_UNSPECIFIED);
 	dev_dbg(&thermal->device, "thermal polling set for duration=%d msec\n",
 		data->ti_thermal->polling_delay);
 
@@ -239,7 +239,7 @@
 	return 0;
 }
 
-static int __ti_thermal_get_trend(void *p, long *trend)
+static int __ti_thermal_get_trend(void *p, int trip, enum thermal_trend *trend)
 {
 	struct ti_thermal_data *data = p;
 	struct ti_bandgap *bgp;
@@ -252,22 +252,6 @@
 	if (ret)
 		return ret;
 
-	*trend = tr;
-
-	return 0;
-}
-
-/* Get the temperature trend callback functions for thermal zone */
-static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
-				int trip, enum thermal_trend *trend)
-{
-	int ret;
-	long tr;
-
-	ret = __ti_thermal_get_trend(thermal->devdata, &tr);
-	if (ret)
-		return ret;
-
 	if (tr > 0)
 		*trend = THERMAL_TREND_RAISING;
 	else if (tr < 0)
@@ -278,6 +262,13 @@
 	return 0;
 }
 
+/* Thermal zone trend callback: forward to the devdata-based helper */
+static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
+				int trip, enum thermal_trend *trend)
+{
+	return __ti_thermal_get_trend(thermal->devdata, trip, trend);
+}
+
 /* Get critical temperature callback functions for thermal zone */
 static int ti_thermal_get_crit_temp(struct thermal_zone_device *thermal,
 				    int *temp)
diff --git a/drivers/thermal/user_space.c b/drivers/thermal/user_space.c
index 10adcdd..c908150 100644
--- a/drivers/thermal/user_space.c
+++ b/drivers/thermal/user_space.c
@@ -23,19 +23,30 @@
  */
 
 #include <linux/thermal.h>
-
+#include <linux/slab.h>
 #include "thermal_core.h"
 
 /**
  * notify_user_space - Notifies user space about thermal events
  * @tz - thermal_zone_device
+ * @trip - Trip point index
  *
  * This function notifies the user space through UEvents.
  */
 static int notify_user_space(struct thermal_zone_device *tz, int trip)
 {
+	char *thermal_prop[5];	/* four KEY=value strings plus terminator */
+	int i;
+
 	mutex_lock(&tz->lock);
-	kobject_uevent(&tz->device.kobj, KOBJ_CHANGE);
+	thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s", tz->type);
+	thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d", tz->temperature);
+	thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP=%d", trip);
+	thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d", tz->notify_event);
+	thermal_prop[4] = NULL;	/* NOTE(review): kasprintf results unchecked */
+	kobject_uevent_env(&tz->device.kobj, KOBJ_CHANGE, thermal_prop);
+	for (i = 0; i < 4; ++i)	/* release the four formatted strings */
+		kfree(thermal_prop[i]);
 	mutex_unlock(&tz->lock);
 	return 0;
 }
diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
index 97f0a2b..95f4c1b 100644
--- a/drivers/thermal/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/x86_pkg_temp_thermal.c
@@ -348,7 +348,8 @@
 	}
 	if (notify) {
 		pr_debug("thermal_zone_device_update\n");
-		thermal_zone_device_update(phdev->tzone);
+		thermal_zone_device_update(phdev->tzone,
+					   THERMAL_EVENT_UNSPECIFIED);
 	}
 }
 
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index 88b008f..af2f117 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -284,12 +284,14 @@
 config FB_ARMCLCD
 	tristate "ARM PrimeCell PL110 support"
 	depends on ARM || ARM64 || COMPILE_TEST
-	depends on FB && ARM_AMBA
+	depends on FB && ARM_AMBA && HAS_IOMEM
 	select FB_CFB_FILLRECT
 	select FB_CFB_COPYAREA
 	select FB_CFB_IMAGEBLIT
 	select FB_MODE_HELPERS if OF
 	select VIDEOMODE_HELPERS if OF
+	select BACKLIGHT_LCD_SUPPORT if OF
+	select BACKLIGHT_CLASS_DEVICE if OF
 	help
 	  This framebuffer device driver is for the ARM PrimeCell PL110
 	  Colour LCD controller.  ARM PrimeCells provide the building
@@ -305,6 +307,8 @@
 	def_bool ARCH_VERSATILE || ARCH_REALVIEW || ARCH_VEXPRESS || ARCH_INTEGRATOR
 	depends on ARM
 	depends on FB_ARMCLCD && FB=y
+	select REGMAP
+	select MFD_SYSCON
 
 config FB_ACORN
 	bool "Acorn VIDC support"
@@ -2443,7 +2447,6 @@
 
 source "drivers/video/fbdev/omap/Kconfig"
 source "drivers/video/fbdev/omap2/Kconfig"
-source "drivers/video/fbdev/exynos/Kconfig"
 source "drivers/video/fbdev/mmp/Kconfig"
 
 config FB_SH_MOBILE_MERAM
diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile
index f673186..ee8c814 100644
--- a/drivers/video/fbdev/Makefile
+++ b/drivers/video/fbdev/Makefile
@@ -6,8 +6,6 @@
 
 obj-y				+= core/
 
-obj-$(CONFIG_EXYNOS_VIDEO)     += exynos/
-
 obj-$(CONFIG_FB_MACMODES)      += macmodes.o
 obj-$(CONFIG_FB_WMT_GE_ROPS)   += wmt_ge_rops.o
 
@@ -79,6 +77,7 @@
 obj-$(CONFIG_FB_PVR2)             += pvr2fb.o
 obj-$(CONFIG_FB_VOODOO1)          += sstfb.o
 obj-$(CONFIG_FB_ARMCLCD)	  += amba-clcd.o
+obj-$(CONFIG_ARCH_NOMADIK)	  += amba-clcd-nomadik.o
 obj-$(CONFIG_PLAT_VERSATILE_CLCD) += amba-clcd-versatile.o
 obj-$(CONFIG_FB_GOLDFISH)         += goldfishfb.o
 obj-$(CONFIG_FB_68328)            += 68328fb.o
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.c b/drivers/video/fbdev/amba-clcd-nomadik.c
new file mode 100644
index 0000000..0c06fca
--- /dev/null
+++ b/drivers/video/fbdev/amba-clcd-nomadik.c
@@ -0,0 +1,259 @@
+#include <linux/amba/bus.h>
+#include <linux/amba/clcd.h>
+#include <linux/gpio/consumer.h>
+#include <linux/of.h>
+#include <linux/of_graph.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
+
+#include "amba-clcd-nomadik.h"
+
+static struct gpio_desc *grestb;
+static struct gpio_desc *scen;
+static struct gpio_desc *scl;
+static struct gpio_desc *sda;
+
+/*
+ * Bit-bang one TPG110 register access over the SCEN/SCL/SDA GPIO lines.
+ * @write: true to send @outval to the register, false to read it back
+ * @address: register address; bits 5..0 are clocked out MSB first
+ * @outval: byte to send on a write, unused on a read
+ * Returns the byte sampled from SDA on a read, 0 on a write.
+ */
+static u8 tpg110_readwrite_reg(bool write, u8 address, u8 outval)
+{
+	int i;
+	u8 inval = 0;
+
+	/* Assert SCEN */
+	gpiod_set_value_cansleep(scen, 1);
+	ndelay(150);
+	/* Hammer out the address */
+	for (i = 5; i >= 0; i--) {
+		gpiod_set_value_cansleep(sda, !!(address & BIT(i)));
+		ndelay(150);
+		/* Send an SCL pulse */
+		gpiod_set_value_cansleep(scl, 1);
+		ndelay(160);
+		gpiod_set_value_cansleep(scl, 0);
+		ndelay(160);
+	}
+
+	/* Direction bit: SDA low selects a write, SDA high a read */
+	gpiod_set_value_cansleep(sda, !write);
+	ndelay(150);
+	/* Send an SCL pulse */
+	gpiod_set_value_cansleep(scl, 1);
+	ndelay(160);
+	gpiod_set_value_cansleep(scl, 0);
+	ndelay(160);
+
+	if (!write)
+		/* HiZ turn-around cycle */
+		gpiod_direction_input(sda);
+	ndelay(150);
+	/* Send an SCL pulse */
+	gpiod_set_value_cansleep(scl, 1);
+	ndelay(160);
+	gpiod_set_value_cansleep(scl, 0);
+	ndelay(160);
+
+	/* Hammer in/out the data */
+	for (i = 7; i >= 0; i--) {
+		int value;
+
+		if (write) {
+			value = !!(outval & BIT(i));
+			gpiod_set_value_cansleep(sda, value);
+		} else {
+			/* SDA may sleep, like the other lines */
+			value = gpiod_get_value_cansleep(sda);
+			if (value)
+				inval |= BIT(i);
+		}
+		ndelay(150);
+		/* Send an SCL pulse */
+		gpiod_set_value_cansleep(scl, 1);
+		ndelay(160);
+		gpiod_set_value_cansleep(scl, 0);
+		ndelay(160);
+	}
+
+	gpiod_direction_output(sda, 0);
+	/* Deassert SCEN */
+	gpiod_set_value_cansleep(scen, 0);
+	/* Satisfies SCEN pulse width */
+	udelay(1);
+
+	return inval;
+}
+
+static u8 tpg110_read_reg(u8 address)
+{
+	return tpg110_readwrite_reg(false, address, 0); /* outval unused */
+}
+
+static void tpg110_write_reg(u8 address, u8 outval)
+{
+	tpg110_readwrite_reg(true, address, outval); /* result ignored */
+}
+
+static void tpg110_startup(struct device *dev)
+{
+	u8 val;
+
+	dev_info(dev, "TPG110 display enable\n");
+	/* De-assert the reset signal */
+	gpiod_set_value_cansleep(grestb, 0);
+	mdelay(1);
+	dev_info(dev, "de-asserted GRESTB\n");
+
+	/* Test communication: register 0x00 must read back 0x55 */
+	tpg110_write_reg(0x00, 0x55);
+	val = tpg110_read_reg(0x00);
+	if (val == 0x55)
+		dev_info(dev, "passed communication test\n");
+	val = tpg110_read_reg(0x01);
+	dev_info(dev, "TPG110 chip ID: %d version: %d\n",
+		val >> 4, val & 0x0f);
+
+	/* Show display resolution from the low three bits of register 0x02 */
+	val = tpg110_read_reg(0x02);
+	val &= 7;
+	switch (val) {
+	case 0x0:
+		dev_info(dev, "IN 400x240 RGB -> OUT 800x480 RGB (dual scan)\n");
+		break;
+	case 0x1:
+		dev_info(dev, "IN 480x272 RGB -> OUT 800x480 RGB (dual scan)\n");
+		break;
+	case 0x4:
+		dev_info(dev, "480x640 RGB\n");
+		break;
+	case 0x5:
+		dev_info(dev, "480x272 RGB\n");
+		break;
+	case 0x6:
+		dev_info(dev, "640x480 RGB\n");
+		break;
+	case 0x7:
+		dev_info(dev, "800x480 RGB\n");
+		break;
+	default:
+		dev_info(dev, "ILLEGAL RESOLUTION\n");
+		break;
+	}
+
+	val = tpg110_read_reg(0x03);
+	dev_info(dev, "resolution is controlled by %s\n",
+		(val & BIT(7)) ? "software" : "hardware");
+}
+
+static void tpg110_enable(struct clcd_fb *fb)
+{
+	static bool started;
+	u8 ctrl;
+
+	/* Run the one-time startup sequence on the first enable only */
+	if (!started) {
+		tpg110_startup(&fb->dev->dev);
+		started = true;
+	}
+
+	/* Set bit 0 of register 0x03 to take the chip out of standby */
+	ctrl = tpg110_read_reg(0x03);
+	ctrl |= BIT(0);
+	tpg110_write_reg(0x03, ctrl);
+}
+
+static void tpg110_disable(struct clcd_fb *fb)
+{
+	u8 ctrl;
+
+	dev_info(&fb->dev->dev, "TPG110 display disable\n");
+
+	/* Clear bit 0 of register 0x03 to put the chip into standby */
+	ctrl = tpg110_read_reg(0x03) & ~BIT(0);
+	tpg110_write_reg(0x03, ctrl);
+}
+
+static void tpg110_init(struct device *dev, struct device_node *np,
+			struct clcd_board *board)
+{
+	dev_info(dev, "TPG110 display init\n");
+
+	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode);
+	if (IS_ERR(grestb)) {
+		dev_err(dev, "no GRESTB GPIO\n");
+		return;	/* board->enable/disable stay unset */
+	}
+	/* This asserts the GRESTB signal, putting the display into reset */
+	gpiod_direction_output(grestb, 1);
+
+	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode);
+	if (IS_ERR(scen)) {
+		dev_err(dev, "no SCEN GPIO\n");
+		return;
+	}
+	gpiod_direction_output(scen, 0);	/* SCEN starts deasserted */
+	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode);
+	if (IS_ERR(scl)) {
+		dev_err(dev, "no SCL GPIO\n");
+		return;
+	}
+	gpiod_direction_output(scl, 0);		/* clock idles low */
+	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode);
+	if (IS_ERR(sda)) {
+		dev_err(dev, "no SDA GPIO\n");
+		return;
+	}
+	gpiod_direction_output(sda, 0);		/* data starts as output, low */
+	board->enable = tpg110_enable;	/* set only if all GPIOs were found */
+	board->disable = tpg110_disable;
+}
+
+int nomadik_clcd_init_panel(struct clcd_fb *fb,
+			    struct device_node *endpoint)
+{
+	struct device_node *panel;
+
+	panel = of_graph_get_remote_port_parent(endpoint);
+	if (!panel)
+		return -ENODEV;
+	if (of_device_is_compatible(panel, "tpo,tpg110"))
+		tpg110_init(&fb->dev->dev, panel, fb->board);
+	else
+		dev_info(&fb->dev->dev, "unknown panel\n");
+
+	/* Unknown panels are tolerated; drop the lookup reference */
+	of_node_put(panel);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nomadik_clcd_init_panel);
+
+#define PMU_CTRL_OFFSET 0x0000
+#define PMU_CTRL_LCDNDIF BIT(26)
+
+int nomadik_clcd_init_board(struct amba_device *adev,
+			    struct clcd_board *board)
+{
+	struct device *dev = &adev->dev;
+	struct regmap *pmu;
+
+	dev_info(dev, "Nomadik CLCD board init\n");
+	/* Look up the PMU syscon that holds the mux control bit */
+	pmu = syscon_regmap_lookup_by_compatible("stericsson,nomadik-pmu");
+	if (IS_ERR(pmu)) {
+		dev_err(dev, "could not find PMU syscon regmap\n");
+		return PTR_ERR(pmu);
+	}
+
+	/* Clearing LCDNDIF selects CLCD mode in the PMU mux */
+	regmap_update_bits(pmu, PMU_CTRL_OFFSET, PMU_CTRL_LCDNDIF, 0);
+	dev_info(dev, "set PMU mux to CLCD mode\n");
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nomadik_clcd_init_board);
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.h b/drivers/video/fbdev/amba-clcd-nomadik.h
new file mode 100644
index 0000000..50aa9bd
--- /dev/null
+++ b/drivers/video/fbdev/amba-clcd-nomadik.h
@@ -0,0 +1,24 @@
+#ifndef _AMBA_CLCD_NOMADIK_H
+#define _AMBA_CLCD_NOMADIK_H
+
+#include <linux/amba/bus.h>
+
+#ifdef CONFIG_ARCH_NOMADIK
+int nomadik_clcd_init_board(struct amba_device *adev,
+			     struct clcd_board *board);
+int nomadik_clcd_init_panel(struct clcd_fb *fb,
+			    struct device_node *endpoint);
+#else /* !CONFIG_ARCH_NOMADIK: stubs that do nothing and return 0 */
+static inline int nomadik_clcd_init_board(struct amba_device *adev,
+					  struct clcd_board *board)
+{
+	return 0;
+}
+static inline int nomadik_clcd_init_panel(struct clcd_fb *fb,
+					  struct device_node *endpoint)
+{
+	return 0;
+}
+#endif /* CONFIG_ARCH_NOMADIK */
+
+#endif /* _AMBA_CLCD_NOMADIK_H */
diff --git a/drivers/video/fbdev/amba-clcd-versatile.c b/drivers/video/fbdev/amba-clcd-versatile.c
index a8a22da..19ad864 100644
--- a/drivers/video/fbdev/amba-clcd-versatile.c
+++ b/drivers/video/fbdev/amba-clcd-versatile.c
@@ -3,6 +3,12 @@
 #include <linux/amba/bus.h>
 #include <linux/amba/clcd.h>
 #include <linux/platform_data/video-clcd-versatile.h>
+#include <linux/of.h>
+#include <linux/of_graph.h>
+#include <linux/regmap.h>
+#include <linux/mfd/syscon.h>
+#include <linux/bitops.h>
+#include "amba-clcd-versatile.h"
 
 static struct clcd_panel vga = {
 	.mode		= {
@@ -178,3 +184,392 @@
 	dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
 		    fb->fb.fix.smem_start);
 }
+
+#ifdef CONFIG_OF
+
+static struct regmap *versatile_syscon_map;
+static struct regmap *versatile_ib2_map;
+
+/*
+ * We detect the different syscon types from the compatible strings.
+ */
+enum versatile_clcd {
+	INTEGRATOR_CLCD_CM,
+	VERSATILE_CLCD,
+	REALVIEW_CLCD_EB,
+	REALVIEW_CLCD_PB1176,
+	REALVIEW_CLCD_PB11MP,
+	REALVIEW_CLCD_PBA8,
+	REALVIEW_CLCD_PBX,
+};
+
+static const struct of_device_id versatile_clcd_of_match[] = {
+	{
+		.compatible = "arm,core-module-integrator",
+		.data = (void *)INTEGRATOR_CLCD_CM,
+	},
+	{
+		.compatible = "arm,versatile-sysreg",
+		.data = (void *)VERSATILE_CLCD,
+	},
+	{
+		.compatible = "arm,realview-eb-syscon",
+		.data = (void *)REALVIEW_CLCD_EB,
+	},
+	{
+		.compatible = "arm,realview-pb1176-syscon",
+		.data = (void *)REALVIEW_CLCD_PB1176,
+	},
+	{
+		.compatible = "arm,realview-pb11mp-syscon",
+		.data = (void *)REALVIEW_CLCD_PB11MP,
+	},
+	{
+		.compatible = "arm,realview-pba8-syscon",
+		.data = (void *)REALVIEW_CLCD_PBA8,
+	},
+	{
+		.compatible = "arm,realview-pbx-syscon",
+		.data = (void *)REALVIEW_CLCD_PBX,
+	},
+	{},
+};
+
+/*
+ * Core module CLCD control on the Integrator/CP, bits
+ * 8 thru 19 of the CM_CONTROL register controls a bunch
+ * of CLCD settings.
+ */
+#define INTEGRATOR_HDR_CTRL_OFFSET	0x0C
+#define INTEGRATOR_CLCD_LCDBIASEN	BIT(8)
+#define INTEGRATOR_CLCD_LCDBIASUP	BIT(9)
+#define INTEGRATOR_CLCD_LCDBIASDN	BIT(10)
+/* Bits 11,12,13 controls the LCD type */
+#define INTEGRATOR_CLCD_LCDMUX_MASK	(BIT(11)|BIT(12)|BIT(13))
+#define INTEGRATOR_CLCD_LCDMUX_LCD24	BIT(11)
+#define INTEGRATOR_CLCD_LCDMUX_VGA565	BIT(12)
+#define INTEGRATOR_CLCD_LCDMUX_SHARP	(BIT(11)|BIT(12))
+#define INTEGRATOR_CLCD_LCDMUX_VGA555	BIT(13)
+#define INTEGRATOR_CLCD_LCDMUX_VGA24	(BIT(11)|BIT(12)|BIT(13))
+#define INTEGRATOR_CLCD_LCD0_EN		BIT(14)
+#define INTEGRATOR_CLCD_LCD1_EN		BIT(15)
+/* R/L flip on Sharp */
+#define INTEGRATOR_CLCD_LCD_STATIC1	BIT(16)
+/* U/D flip on Sharp */
+#define INTEGRATOR_CLCD_LCD_STATIC2	BIT(17)
+/* No connection on Sharp */
+#define INTEGRATOR_CLCD_LCD_STATIC	BIT(18)
+/* 0 = 24bit VGA, 1 = 18bit VGA */
+#define INTEGRATOR_CLCD_LCD_N24BITEN	BIT(19)
+
+#define INTEGRATOR_CLCD_MASK		(INTEGRATOR_CLCD_LCDBIASEN | \
+					 INTEGRATOR_CLCD_LCDBIASUP | \
+					 INTEGRATOR_CLCD_LCDBIASDN | \
+					 INTEGRATOR_CLCD_LCDMUX_MASK | \
+					 INTEGRATOR_CLCD_LCD0_EN | \
+					 INTEGRATOR_CLCD_LCD1_EN | \
+					 INTEGRATOR_CLCD_LCD_STATIC1 | \
+					 INTEGRATOR_CLCD_LCD_STATIC2 | \
+					 INTEGRATOR_CLCD_LCD_STATIC | \
+					 INTEGRATOR_CLCD_LCD_N24BITEN)
+
+static void integrator_clcd_enable(struct clcd_fb *fb)
+{
+	const struct fb_var_screeninfo *var = &fb->fb.var;
+	u32 bpp = var->bits_per_pixel;
+	/* FIXME: really needed? */
+	u32 ctrl = INTEGRATOR_CLCD_LCD_STATIC1 | INTEGRATOR_CLCD_LCD_STATIC2 |
+		   INTEGRATOR_CLCD_LCD0_EN | INTEGRATOR_CLCD_LCD1_EN;
+
+	dev_info(&fb->dev->dev, "enable Integrator CLCD connectors\n");
+
+	if (bpp <= 8 || (bpp == 16 && var->green.length == 5)) {
+		/* Pseudocolor, RGB555, BGR555 */
+		ctrl |= INTEGRATOR_CLCD_LCDMUX_VGA555;
+	} else if (bpp <= 16) {
+		/* truecolor RGB565 */
+		ctrl |= INTEGRATOR_CLCD_LCDMUX_VGA565;
+	} else {
+		/* no idea for this, don't trust the docs */
+		ctrl = 0;
+	}
+
+	/* Rewrite every CLCD field of the core module control register */
+	regmap_update_bits(versatile_syscon_map, INTEGRATOR_HDR_CTRL_OFFSET,
+			   INTEGRATOR_CLCD_MASK, ctrl);
+}
+
+/*
+ * This configuration register in the Versatile and RealView
+ * family is uniformly present but appears more and more
+ * unutilized starting with the RealView series.
+ */
+#define SYS_CLCD			0x50
+#define SYS_CLCD_MODE_MASK		(BIT(0)|BIT(1))
+#define SYS_CLCD_MODE_888		0
+#define SYS_CLCD_MODE_5551		BIT(0)
+#define SYS_CLCD_MODE_565_R_LSB		BIT(1)
+#define SYS_CLCD_MODE_565_B_LSB		(BIT(0)|BIT(1))
+#define SYS_CLCD_CONNECTOR_MASK		(BIT(2)|BIT(3)|BIT(4)|BIT(5))
+#define SYS_CLCD_NLCDIOON		BIT(2)
+#define SYS_CLCD_VDDPOSSWITCH		BIT(3)
+#define SYS_CLCD_PWR3V5SWITCH		BIT(4)
+#define SYS_CLCD_VDDNEGSWITCH		BIT(5)
+#define SYS_CLCD_TSNSS			BIT(6) /* touchscreen enable */
+#define SYS_CLCD_SSPEXP			BIT(7) /* SSP expansion enable */
+
+/* The Versatile can detect the connected panel type */
+#define SYS_CLCD_CLCDID_MASK		(BIT(8)|BIT(9)|BIT(10)|BIT(11)|BIT(12))
+#define SYS_CLCD_ID_SANYO_3_8		(0x00 << 8)
+#define SYS_CLCD_ID_SHARP_8_4		(0x01 << 8)
+#define SYS_CLCD_ID_EPSON_2_2		(0x02 << 8)
+#define SYS_CLCD_ID_SANYO_2_5		(0x07 << 8)
+#define SYS_CLCD_ID_VGA			(0x1f << 8)
+
+#define SYS_CLCD_TSNDAV			BIT(13) /* data ready from TS */
+
+/* IB2 control register for the Versatile daughterboard */
+#define IB2_CTRL			0x00
+#define IB2_CTRL_LCD_SD			BIT(1) /* 1 = shut down LCD */
+#define IB2_CTRL_LCD_BL_ON		BIT(0)
+#define IB2_CTRL_LCD_MASK		(BIT(0)|BIT(1))
+
+static void versatile_clcd_disable(struct clcd_fb *fb)
+{
+	struct device *dev = &fb->dev->dev;
+
+	dev_info(dev, "disable Versatile CLCD connectors\n");
+	/* Clear every connector control bit in SYS_CLCD */
+	regmap_update_bits(versatile_syscon_map, SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK, 0);
+
+	/* Only an IB2 daughterboard needs its display turned off as well */
+	if (!versatile_ib2_map)
+		return;
+	dev_info(dev, "disable IB2 display\n");
+	/* Setting LCD_SD within LCD_MASK also clears the backlight bit */
+	regmap_update_bits(versatile_ib2_map, IB2_CTRL, IB2_CTRL_LCD_MASK,
+			   IB2_CTRL_LCD_SD);
+}
+
+/*
+ * Enable callback for the Versatile: route the pixel format through the
+ * SYS_CLCD mux, switch the connector on and, when an IB2 daughterboard
+ * regmap was found, turn on its backlight bit.
+ */
+static void versatile_clcd_enable(struct clcd_fb *fb)
+{
+	struct device *dev = &fb->dev->dev;
+	const struct fb_var_screeninfo *var = &fb->fb.var;
+	u32 mode;
+
+	dev_info(dev, "enable Versatile CLCD connectors\n");
+
+	/* The green channel width tells the pixel formats apart */
+	if (var->green.length == 5) {
+		mode = SYS_CLCD_MODE_5551;
+	} else if (var->green.length == 6) {
+		mode = var->red.offset == 0 ? SYS_CLCD_MODE_565_R_LSB :
+					      SYS_CLCD_MODE_565_B_LSB;
+	} else {
+		/* 8 bits of green, and any other width, selects 888 */
+		mode = SYS_CLCD_MODE_888;
+	}
+
+	/* Set up the MUX */
+	regmap_update_bits(versatile_syscon_map, SYS_CLCD,
+			   SYS_CLCD_MODE_MASK, mode);
+
+	/* Then enable the display */
+	regmap_update_bits(versatile_syscon_map, SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK,
+			   SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH);
+
+	/* Only an IB2 daughterboard needs its display turned on as well */
+	if (!versatile_ib2_map)
+		return;
+
+	dev_info(dev, "enable IB2 display\n");
+	/* Backlight on; the same write clears the shutdown bit */
+	regmap_update_bits(versatile_ib2_map, IB2_CTRL, IB2_CTRL_LCD_MASK,
+			   IB2_CTRL_LCD_BL_ON);
+}
+
+static void versatile_clcd_decode(struct clcd_fb *fb, struct clcd_regs *regs)
+{
+	clcdfb_decode(fb, regs);	/* shared decode first, then patch cntl */
+
+	/* Always clear BGR for RGB565: we do the routing externally */
+	if (fb->fb.var.green.length == 6)
+		regs->cntl &= ~CNTL_BGR;
+}
+
+static void realview_clcd_disable(struct clcd_fb *fb)
+{
+	struct device *dev = &fb->dev->dev;
+
+	dev_info(dev, "disable RealView CLCD connectors\n");
+	regmap_update_bits(versatile_syscon_map, SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK, 0);
+}
+
+static void realview_clcd_enable(struct clcd_fb *fb)
+{
+	const u32 on = SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH;
+
+	dev_info(&fb->dev->dev, "enable RealView CLCD connectors\n");
+	regmap_update_bits(versatile_syscon_map, SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK, on);
+}
+
+struct versatile_panel {
+	u32 id;			/* SYS_CLCD_ID_* value matched against SYS_CLCD */
+	char *compatible;	/* DT compatible expected for this panel */
+	bool ib2;		/* panel implies an IB2 daughterboard */
+};
+
+static const struct versatile_panel versatile_panels[] = {
+	{
+		.id = SYS_CLCD_ID_VGA,
+		.compatible = "VGA",
+	},
+	{
+		.id = SYS_CLCD_ID_SANYO_3_8,
+		.compatible = "sanyo,tm38qv67a02a",
+	},
+	{
+		.id = SYS_CLCD_ID_SHARP_8_4,
+		.compatible = "sharp,lq084v1dg21",
+	},
+	{
+		.id = SYS_CLCD_ID_EPSON_2_2,
+		.compatible = "epson,l2f50113t00",
+	},
+	{
+		.id = SYS_CLCD_ID_SANYO_2_5,
+		.compatible = "sanyo,alr252rgt",
+		.ib2 = true,
+	},
+};
+
+static void versatile_panel_probe(struct device *dev,
+				  struct device_node *endpoint)
+{
+	struct versatile_panel const *vpanel = NULL;
+	struct device_node *panel = NULL;
+	u32 val;
+	int ret;
+	int i;
+
+	/*
+	 * The Versatile CLCD has a panel auto-detection mechanism.
+	 * We use this and look for the compatible panel in the
+	 * device tree.
+	 */
+	ret = regmap_read(versatile_syscon_map, SYS_CLCD, &val);
+	if (ret) {
+		dev_err(dev, "cannot read CLCD syscon register\n");
+		return;
+	}
+	val &= SYS_CLCD_CLCDID_MASK;
+
+	/* First find corresponding panel information */
+	for (i = 0; i < ARRAY_SIZE(versatile_panels); i++) {
+		vpanel = &versatile_panels[i];
+
+		if (val == vpanel->id) {
+			dev_info(dev, "autodetected panel \"%s\"\n",
+				 vpanel->compatible);
+			break;
+		}
+	}
+	if (i == ARRAY_SIZE(versatile_panels)) {
+		dev_err(dev, "could not auto-detect panel\n");
+		return;
+	}
+
+	panel = of_graph_get_remote_port_parent(endpoint);
+	if (!panel) {
+		dev_err(dev, "could not locate panel in DT\n");
+		return;
+	}
+	if (!of_device_is_compatible(panel, vpanel->compatible))
+		dev_err(dev, "panel in DT is not compatible with the "
+			"auto-detected panel, continuing anyway\n");
+	of_node_put(panel);
+
+	/*
+	 * If we have a Sanyo 2.5" panel, assume that we're running
+	 * on an IB2 and proceed to look for the IB2 syscon regmap.
+	 */
+	if (!vpanel->ib2)
+		return;
+
+	versatile_ib2_map = syscon_regmap_lookup_by_compatible(
+		"arm,versatile-ib2-syscon");
+	if (IS_ERR(versatile_ib2_map)) {
+		dev_err(dev, "could not locate IB2 control register\n");
+		versatile_ib2_map = NULL;
+		return;
+	}
+}
+
+int versatile_clcd_init_panel(struct clcd_fb *fb,
+			      struct device_node *endpoint)
+{
+	const struct of_device_id *clcd_id;
+	enum versatile_clcd versatile_clcd_type;
+	struct device_node *np;
+	struct regmap *map;
+	struct device *dev = &fb->dev->dev;
+
+	np = of_find_matching_node_and_match(NULL, versatile_clcd_of_match,
+					     &clcd_id);
+	if (!np) {
+		dev_err(dev, "no Versatile syscon node\n");
+		return -ENODEV;
+	}
+	versatile_clcd_type = (enum versatile_clcd)clcd_id->data;
+	map = syscon_node_to_regmap(np);
+	of_node_put(np);	/* drop the reference taken by the node lookup */
+	if (IS_ERR(map)) {
+		dev_err(dev, "no Versatile syscon regmap\n");
+		return PTR_ERR(map);
+	}
+
+	switch (versatile_clcd_type) {
+	case INTEGRATOR_CLCD_CM:
+		versatile_syscon_map = map;
+		fb->board->enable = integrator_clcd_enable;
+		/* Override the caps, we have only these */
+		fb->board->caps = CLCD_CAP_5551 | CLCD_CAP_RGB565 |
+			CLCD_CAP_888;
+		dev_info(dev, "set up callbacks for Integrator PL110\n");
+		break;
+	case VERSATILE_CLCD:
+		versatile_syscon_map = map;
+		fb->board->enable = versatile_clcd_enable;
+		fb->board->disable = versatile_clcd_disable;
+		fb->board->decode = versatile_clcd_decode;
+		versatile_panel_probe(dev, endpoint);
+		dev_info(dev, "set up callbacks for Versatile\n");
+		break;
+	case REALVIEW_CLCD_EB:
+	case REALVIEW_CLCD_PB1176:
+	case REALVIEW_CLCD_PB11MP:
+	case REALVIEW_CLCD_PBA8:
+	case REALVIEW_CLCD_PBX:
+		versatile_syscon_map = map;
+		fb->board->enable = realview_clcd_enable;
+		fb->board->disable = realview_clcd_disable;
+		dev_info(dev, "set up callbacks for RealView PL111\n");
+		break;
+	default:
+		dev_info(dev, "unknown Versatile system controller\n");
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(versatile_clcd_init_panel);
+#endif
diff --git a/drivers/video/fbdev/amba-clcd-versatile.h b/drivers/video/fbdev/amba-clcd-versatile.h
new file mode 100644
index 0000000..1b14359
--- /dev/null
+++ b/drivers/video/fbdev/amba-clcd-versatile.h
@@ -0,0 +1,22 @@
+/*
+ * Special local versatile callbacks
+ */
+#ifndef _AMBA_CLCD_VERSATILE_H
+#define _AMBA_CLCD_VERSATILE_H
+
+#include <linux/of.h>
+#include <linux/amba/bus.h>
+#include <linux/platform_data/video-clcd-versatile.h>
+
+#if defined(CONFIG_PLAT_VERSATILE_CLCD) && defined(CONFIG_OF)
+int versatile_clcd_init_panel(struct clcd_fb *fb,
+			      struct device_node *endpoint);
+#else
+static inline int versatile_clcd_init_panel(struct clcd_fb *fb,
+				struct device_node *endpoint)
+{
+	return 0;
+}
+#endif
+
+#endif /* _AMBA_CLCD_VERSATILE_H */
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c
index 9b15886..ec2671d 100644
--- a/drivers/video/fbdev/amba-clcd.c
+++ b/drivers/video/fbdev/amba-clcd.c
@@ -30,10 +30,14 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_graph.h>
+#include <linux/backlight.h>
 #include <video/display_timing.h>
 #include <video/of_display_timing.h>
 #include <video/videomode.h>
 
+#include "amba-clcd-nomadik.h"
+#include "amba-clcd-versatile.h"
+
 #define to_clcd(info)	container_of(info, struct clcd_fb, fb)
 
 /* This is limited to 16 characters when displayed by X startup */
@@ -71,6 +75,11 @@
 	if (fb->board->disable)
 		fb->board->disable(fb);
 
+	if (fb->panel->backlight) {
+		fb->panel->backlight->props.power = FB_BLANK_POWERDOWN;
+		backlight_update_status(fb->panel->backlight);
+	}
+
 	val = readl(fb->regs + fb->off_cntl);
 	if (val & CNTL_LCDPWR) {
 		val &= ~CNTL_LCDPWR;
@@ -117,6 +126,14 @@
 	writel(cntl, fb->regs + fb->off_cntl);
 
 	/*
+	 * Turn on backlight
+	 */
+	if (fb->panel->backlight) {
+		fb->panel->backlight->props.power = FB_BLANK_UNBLANK;
+		backlight_update_status(fb->panel->backlight);
+	}
+
+	/*
 	 * finally, enable the interface.
 	 */
 	if (fb->board->enable)
@@ -211,6 +228,15 @@
 			var->blue.length = 4;
 		}
 		break;
+	case 24:
+		if (fb->vendor->packed_24_bit_pixels) {
+			var->red.length = 8;
+			var->green.length = 8;
+			var->blue.length = 8;
+		} else {
+			ret = -EINVAL;
+		}
+		break;
 	case 32:
 		/* If we can't do 888, reject */
 		caps &= CLCD_CAP_888;
@@ -297,6 +323,12 @@
 
 	clcdfb_disable(fb);
 
+	/* Some variants must be clocked here */
+	if (fb->vendor->clock_timregs && !fb->clk_enabled) {
+		fb->clk_enabled = true;
+		clk_enable(fb->clk);
+	}
+
 	writel(regs.tim0, fb->regs + CLCD_TIM0);
 	writel(regs.tim1, fb->regs + CLCD_TIM1);
 	writel(regs.tim2, fb->regs + CLCD_TIM2);
@@ -551,7 +583,7 @@
 
 #ifdef CONFIG_OF
 static int clcdfb_of_get_dpi_panel_mode(struct device_node *node,
-		struct fb_videomode *mode)
+		struct clcd_panel *clcd_panel)
 {
 	int err;
 	struct display_timing timing;
@@ -563,10 +595,31 @@
 
 	videomode_from_timing(&timing, &video);
 
-	err = fb_videomode_from_videomode(&video, mode);
+	err = fb_videomode_from_videomode(&video, &clcd_panel->mode);
 	if (err)
 		return err;
 
+	/* Set up some inversion flags */
+	if (timing.flags & DISPLAY_FLAGS_PIXDATA_NEGEDGE)
+		clcd_panel->tim2 |= TIM2_IPC;
+	else if (!(timing.flags & DISPLAY_FLAGS_PIXDATA_POSEDGE))
+		/*
+		 * To preserve backwards compatibility, the IPC (inverted
+		 * pixel clock) flag needs to be set on any display that
+		 * doesn't explicitly specify that the pixel clock is
+		 * active on the negative or positive edge.
+		 */
+		clcd_panel->tim2 |= TIM2_IPC;
+
+	if (timing.flags & DISPLAY_FLAGS_HSYNC_LOW)
+		clcd_panel->tim2 |= TIM2_IHS;
+
+	if (timing.flags & DISPLAY_FLAGS_VSYNC_LOW)
+		clcd_panel->tim2 |= TIM2_IVS;
+
+	if (timing.flags & DISPLAY_FLAGS_DE_LOW)
+		clcd_panel->tim2 |= TIM2_IOE;
+
 	return 0;
 }
 
@@ -576,11 +629,34 @@
 			mode->refresh);
 }
 
+static int clcdfb_of_get_backlight(struct device_node *endpoint,
+				   struct clcd_panel *clcd_panel)
+{
+	struct device_node *panel, *backlight;
+
+	panel = of_graph_get_remote_port_parent(endpoint);
+	if (!panel)
+		return -ENODEV;
+
+	/* Look up the optional backlight phandle */
+	backlight = of_parse_phandle(panel, "backlight", 0);
+	of_node_put(panel);
+	if (backlight) {
+		clcd_panel->backlight = of_find_backlight_by_node(backlight);
+		of_node_put(backlight);
+
+		if (!clcd_panel->backlight)
+			return -EPROBE_DEFER;
+	}
+	return 0;
+}
+
 static int clcdfb_of_get_mode(struct device *dev, struct device_node *endpoint,
-		struct fb_videomode *mode)
+		struct clcd_panel *clcd_panel)
 {
 	int err;
 	struct device_node *panel;
+	struct fb_videomode *mode;
 	char *name;
 	int len;
 
@@ -590,11 +666,12 @@
 
 	/* Only directly connected DPI panels supported for now */
 	if (of_device_is_compatible(panel, "panel-dpi"))
-		err = clcdfb_of_get_dpi_panel_mode(panel, mode);
+		err = clcdfb_of_get_dpi_panel_mode(panel, clcd_panel);
 	else
 		err = -ENOENT;
 	if (err)
 		return err;
+	mode = &clcd_panel->mode;
 
 	len = clcdfb_snprintf_mode(NULL, 0, mode);
 	name = devm_kzalloc(dev, len + 1, GFP_KERNEL);
@@ -616,6 +693,7 @@
 	} panels[] = {
 		{ 0x110, 1,  7, 13, CLCD_CAP_5551 },
 		{ 0x110, 0,  8, 16, CLCD_CAP_888 },
+		{ 0x110, 16, 8, 0,  CLCD_CAP_888 },
 		{ 0x111, 4, 14, 20, CLCD_CAP_444 },
 		{ 0x111, 3, 11, 19, CLCD_CAP_444 | CLCD_CAP_5551 },
 		{ 0x111, 3, 10, 19, CLCD_CAP_444 | CLCD_CAP_5551 |
@@ -625,8 +703,8 @@
 	};
 	int i;
 
-	/* Bypass pixel clock divider, data output on the falling edge */
-	fb->panel->tim2 = TIM2_BCD | TIM2_IPC;
+	/* Bypass pixel clock divider */
+	fb->panel->tim2 |= TIM2_BCD;
 
 	/* TFT display, vert. comp. interrupt at the start of the back porch */
 	fb->panel->cntl |= CNTL_LCDTFT | CNTL_LCDVCOMP(1);
@@ -643,6 +721,49 @@
 			fb->panel->caps = panels[i].caps;
 	}
 
+	/*
+	 * If we actually physically connected the R lines to B and
+	 * vice versa
+	 */
+	if (r0 != 0 && b0 == 0)
+		fb->panel->bgr_connection = true;
+
+	if (fb->panel->caps && fb->vendor->st_bitmux_control) {
+		/*
+		 * Set up the special bits for the Nomadik control register
+		 * (other platforms tend to do this through an external
+		 * register).
+		 */
+
+		/* Offset of the highest used color */
+		int maxoff = max3(r0, g0, b0);
+		/* Most significant bit out, highest used bit */
+		int msb = 0;
+
+		if (fb->panel->caps & CLCD_CAP_888) {
+			msb = maxoff + 8 - 1;
+		} else if (fb->panel->caps & CLCD_CAP_565) {
+			msb = maxoff + 5 - 1;
+			fb->panel->cntl |= CNTL_ST_1XBPP_565;
+		} else if (fb->panel->caps & CLCD_CAP_5551) {
+			msb = maxoff + 5 - 1;
+			fb->panel->cntl |= CNTL_ST_1XBPP_5551;
+		} else if (fb->panel->caps & CLCD_CAP_444) {
+			msb = maxoff + 4 - 1;
+			fb->panel->cntl |= CNTL_ST_1XBPP_444;
+		}
+
+		/* Send out as many bits as we need */
+		if (msb > 17)
+			fb->panel->cntl |= CNTL_ST_CDWID_24;
+		else if (msb > 15)
+			fb->panel->cntl |= CNTL_ST_CDWID_18;
+		else if (msb > 11)
+			fb->panel->cntl |= CNTL_ST_CDWID_16;
+		else
+			fb->panel->cntl |= CNTL_ST_CDWID_12;
+	}
+
 	return fb->panel->caps ? 0 : -EINVAL;
 }
 
@@ -658,11 +779,24 @@
 	if (!fb->panel)
 		return -ENOMEM;
 
+	/*
+	 * Fetch the panel endpoint.
+	 */
 	endpoint = of_graph_get_next_endpoint(fb->dev->dev.of_node, NULL);
 	if (!endpoint)
 		return -ENODEV;
 
-	err = clcdfb_of_get_mode(&fb->dev->dev, endpoint, &fb->panel->mode);
+	if (fb->vendor->init_panel) {
+		err = fb->vendor->init_panel(fb, endpoint);
+		if (err)
+			return err;
+	}
+
+	err = clcdfb_of_get_backlight(endpoint, fb->panel);
+	if (err)
+		return err;
+
+	err = clcdfb_of_get_mode(&fb->dev->dev, endpoint, fb->panel);
 	if (err)
 		return err;
 
@@ -693,11 +827,11 @@
 
 	if (of_property_read_u32_array(endpoint,
 			"arm,pl11x,tft-r0g0b0-pads",
-			tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) == 0)
-		return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0],
-				 tft_r0b0g0[1],  tft_r0b0g0[2]);
+			tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) != 0)
+		return -ENOENT;
 
-	return -ENOENT;
+	return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0],
+					tft_r0b0g0[1],  tft_r0b0g0[2]);
 }
 
 static int clcdfb_of_vram_setup(struct clcd_fb *fb)
@@ -818,6 +952,7 @@
 static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id)
 {
 	struct clcd_board *board = dev_get_platdata(&dev->dev);
+	struct clcd_vendor_data *vendor = id->data;
 	struct clcd_fb *fb;
 	int ret;
 
@@ -827,6 +962,12 @@
 	if (!board)
 		return -EINVAL;
 
+	if (vendor->init_board) {
+		ret = vendor->init_board(dev, board);
+		if (ret)
+			return ret;
+	}
+
 	ret = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32));
 	if (ret)
 		goto out;
@@ -845,17 +986,18 @@
 	}
 
 	fb->dev = dev;
+	fb->vendor = vendor;
 	fb->board = board;
 
-	dev_info(&fb->dev->dev, "PL%03x rev%u at 0x%08llx\n",
-		amba_part(dev), amba_rev(dev),
+	dev_info(&fb->dev->dev, "PL%03x designer %02x rev%u at 0x%08llx\n",
+		amba_part(dev), amba_manf(dev), amba_rev(dev),
 		(unsigned long long)dev->res.start);
 
 	ret = fb->board->setup(fb);
 	if (ret)
 		goto free_fb;
 
-	ret = clcdfb_register(fb); 
+	ret = clcdfb_register(fb);
 	if (ret == 0) {
 		amba_set_drvdata(dev, fb);
 		goto out;
@@ -891,10 +1033,30 @@
 	return 0;
 }
 
+static struct clcd_vendor_data vendor_arm = {
+	/* Sets up the versatile board displays */
+	.init_panel = versatile_clcd_init_panel,
+};
+
+static struct clcd_vendor_data vendor_nomadik = {
+	.clock_timregs = true,
+	.packed_24_bit_pixels = true,
+	.st_bitmux_control = true,
+	.init_board = nomadik_clcd_init_board,
+	.init_panel = nomadik_clcd_init_panel,
+};
+
 static struct amba_id clcdfb_id_table[] = {
 	{
 		.id	= 0x00041110,
 		.mask	= 0x000ffffe,
+		.data	= &vendor_arm,
+	},
+	/* ST Electronics Nomadik variant */
+	{
+		.id	= 0x00180110,
+		.mask	= 0x00fffffe,
+		.data	= &vendor_nomadik,
 	},
 	{ 0, 0 },
 };
diff --git a/drivers/video/fbdev/arcfb.c b/drivers/video/fbdev/arcfb.c
index 1b0b233..1928cb2 100644
--- a/drivers/video/fbdev/arcfb.c
+++ b/drivers/video/fbdev/arcfb.c
@@ -79,7 +79,7 @@
 	spinlock_t lock;
 };
 
-static struct fb_fix_screeninfo arcfb_fix = {
+static const struct fb_fix_screeninfo arcfb_fix = {
 	.id =		"arcfb",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_MONO01,
@@ -89,7 +89,7 @@
 	.accel =	FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo arcfb_var = {
+static const struct fb_var_screeninfo arcfb_var = {
 	.xres		= 128,
 	.yres		= 64,
 	.xres_virtual	= 128,
diff --git a/drivers/video/fbdev/asiliantfb.c b/drivers/video/fbdev/asiliantfb.c
index 7e8ddf0..91eea45 100644
--- a/drivers/video/fbdev/asiliantfb.c
+++ b/drivers/video/fbdev/asiliantfb.c
@@ -474,7 +474,7 @@
 		write_fr(chips_init_fr[i].addr, chips_init_fr[i].data);
 }
 
-static struct fb_fix_screeninfo asiliantfb_fix = {
+static const struct fb_fix_screeninfo asiliantfb_fix = {
 	.id =		"Asiliant 69000",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_PSEUDOCOLOR,
@@ -483,7 +483,7 @@
 	.smem_len =	0x200000,	/* 2MB */
 };
 
-static struct fb_var_screeninfo asiliantfb_var = {
+static const struct fb_var_screeninfo asiliantfb_var = {
 	.xres 		= 640,
 	.yres 		= 480,
 	.xres_virtual 	= 640,
diff --git a/drivers/video/fbdev/aty/aty128fb.c b/drivers/video/fbdev/aty/aty128fb.c
index 0a46268..fa07242 100644
--- a/drivers/video/fbdev/aty/aty128fb.c
+++ b/drivers/video/fbdev/aty/aty128fb.c
@@ -93,7 +93,7 @@
 
 #ifndef CONFIG_PPC_PMAC
 /* default mode */
-static struct fb_var_screeninfo default_var = {
+static const struct fb_var_screeninfo default_var = {
 	/* 640x480, 60 Hz, Non-Interlaced (25.175 MHz dotclock) */
 	640, 480, 640, 480, 0, 0, 8, 0,
 	{0, 8, 0}, {0, 8, 0}, {0, 8, 0}, {0, 0, 0},
@@ -104,7 +104,7 @@
 #else /* CONFIG_PPC_PMAC */
 /* default to 1024x768 at 75Hz on PPC - this will work
  * on the iMac, the usual 640x480 @ 60Hz doesn't. */
-static struct fb_var_screeninfo default_var = {
+static const struct fb_var_screeninfo default_var = {
 	/* 1024x768, 75 Hz, Non-Interlaced (78.75 MHz dotclock) */
 	1024, 768, 1024, 768, 0, 0, 8, 0,
 	{0, 8, 0}, {0, 8, 0}, {0, 8, 0}, {0, 0, 0},
@@ -375,7 +375,7 @@
 	.name = "64-bit DDR SGRAM",
 };
 
-static struct fb_fix_screeninfo aty128fb_fix = {
+static const struct fb_fix_screeninfo aty128fb_fix = {
 	.id		= "ATY Rage128",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_PSEUDOCOLOR,
diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c
index f34ed47f..11026e7 100644
--- a/drivers/video/fbdev/aty/atyfb_base.c
+++ b/drivers/video/fbdev/aty/atyfb_base.c
@@ -212,7 +212,7 @@
 	unsigned long prot_mask;
 };
 
-static struct fb_fix_screeninfo atyfb_fix = {
+static const struct fb_fix_screeninfo atyfb_fix = {
 	.id		= "ATY Mach64",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_PSEUDOCOLOR,
diff --git a/drivers/video/fbdev/aty/radeon_monitor.c b/drivers/video/fbdev/aty/radeon_monitor.c
index f1ce229..278b421 100644
--- a/drivers/video/fbdev/aty/radeon_monitor.c
+++ b/drivers/video/fbdev/aty/radeon_monitor.c
@@ -4,7 +4,7 @@
 
 #include "../edid.h"
 
-static struct fb_var_screeninfo radeonfb_default_var = {
+static const struct fb_var_screeninfo radeonfb_default_var = {
 	.xres		= 640,
 	.yres		= 480,
 	.xres_virtual	= 640,
diff --git a/drivers/video/fbdev/au1200fb.c b/drivers/video/fbdev/au1200fb.c
index f9507b1..6c2b2ca 100644
--- a/drivers/video/fbdev/au1200fb.c
+++ b/drivers/video/fbdev/au1200fb.c
@@ -43,6 +43,7 @@
 #include <linux/ctype.h>
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/mach-au1x00/au1000.h>
 #include <asm/mach-au1x00/au1200fb.h>	/* platform_data */
diff --git a/drivers/video/fbdev/bfin_adv7393fb.c b/drivers/video/fbdev/bfin_adv7393fb.c
index e2d7d03..542ffad 100644
--- a/drivers/video/fbdev/bfin_adv7393fb.c
+++ b/drivers/video/fbdev/bfin_adv7393fb.c
@@ -375,7 +375,6 @@
 {
 	int ret = 0;
 	struct proc_dir_entry *entry;
-	int num_modes = ARRAY_SIZE(known_modes);
 
 	struct adv7393fb_device *fbdev = NULL;
 
@@ -384,7 +383,7 @@
 		return -EINVAL;
 	}
 
-	if (mode > num_modes) {
+	if (mode >= ARRAY_SIZE(known_modes)) {
 		dev_err(&client->dev, "mode %d: not supported", mode);
 		return -EFAULT;
 	}
@@ -797,7 +796,7 @@
 
 static int __init bfin_adv7393_fb_driver_init(void)
 {
-#if  defined(CONFIG_I2C_BLACKFIN_TWI) || defined(CONFIG_I2C_BLACKFIN_TWI_MODULE)
+#if IS_ENABLED(CONFIG_I2C_BLACKFIN_TWI)
 	request_module("i2c-bfin-twi");
 #else
 	request_module("i2c-gpio");
diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c
index 924bad4..37a37c4 100644
--- a/drivers/video/fbdev/efifb.c
+++ b/drivers/video/fbdev/efifb.c
@@ -50,9 +50,9 @@
 		return 1;
 
 	if (regno < 16) {
-		red   >>= 8;
-		green >>= 8;
-		blue  >>= 8;
+		red   >>= 16 - info->var.red.length;
+		green >>= 16 - info->var.green.length;
+		blue  >>= 16 - info->var.blue.length;
 		((u32 *)(info->pseudo_palette))[regno] =
 			(red   << info->var.red.offset)   |
 			(green << info->var.green.offset) |
diff --git a/drivers/video/fbdev/exynos/Kconfig b/drivers/video/fbdev/exynos/Kconfig
deleted file mode 100644
index d916bef..0000000
--- a/drivers/video/fbdev/exynos/Kconfig
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# Exynos Video configuration
-#
-
-menuconfig EXYNOS_VIDEO
-	tristate "Exynos Video driver support"
-	depends on ARCH_S5PV210 || ARCH_EXYNOS
-	help
-	  This enables support for EXYNOS Video device.
-
-if EXYNOS_VIDEO
-
-#
-# MIPI DSI driver
-#
-
-config EXYNOS_MIPI_DSI
-	tristate "EXYNOS MIPI DSI driver support."
-	select GENERIC_PHY
-	help
-	  This enables support for MIPI-DSI device.
-
-config EXYNOS_LCD_S6E8AX0
-	tristate "S6E8AX0 MIPI AMOLED LCD Driver"
-	depends on EXYNOS_MIPI_DSI && BACKLIGHT_CLASS_DEVICE
-	depends on (LCD_CLASS_DEVICE = y)
-	default n
-	help
-	  If you have an S6E8AX0 MIPI AMOLED LCD Panel, say Y to enable its
-	  LCD control driver.
-
-endif # EXYNOS_VIDEO
diff --git a/drivers/video/fbdev/exynos/Makefile b/drivers/video/fbdev/exynos/Makefile
deleted file mode 100644
index 02d8dc5..0000000
--- a/drivers/video/fbdev/exynos/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for the exynos video drivers.
-#
-
-obj-$(CONFIG_EXYNOS_MIPI_DSI)		+= exynos-mipi-dsi-mod.o
-
-exynos-mipi-dsi-mod-objs		+= exynos_mipi_dsi.o exynos_mipi_dsi_common.o \
-					   exynos_mipi_dsi_lowlevel.o
-obj-$(CONFIG_EXYNOS_LCD_S6E8AX0)	+= s6e8ax0.o
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi.c
deleted file mode 100644
index 92e4af3..0000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi.c
+++ /dev/null
@@ -1,574 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi.c
- *
- * Samsung SoC MIPI-DSIM driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/clk.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/fb.h>
-#include <linux/ctype.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/memory.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/kthread.h>
-#include <linux/notifier.h>
-#include <linux/phy/phy.h>
-#include <linux/regulator/consumer.h>
-#include <linux/pm_runtime.h>
-#include <linux/err.h>
-
-#include <video/exynos_mipi_dsim.h>
-
-#include "exynos_mipi_dsi_common.h"
-#include "exynos_mipi_dsi_lowlevel.h"
-
-struct mipi_dsim_ddi {
-	int				bus_id;
-	struct list_head		list;
-	struct mipi_dsim_lcd_device	*dsim_lcd_dev;
-	struct mipi_dsim_lcd_driver	*dsim_lcd_drv;
-};
-
-static LIST_HEAD(dsim_ddi_list);
-
-static DEFINE_MUTEX(mipi_dsim_lock);
-
-static struct mipi_dsim_platform_data *to_dsim_plat(struct platform_device
-							*pdev)
-{
-	return pdev->dev.platform_data;
-}
-
-static struct regulator_bulk_data supplies[] = {
-	{ .supply = "vdd11", },
-	{ .supply = "vdd18", },
-};
-
-static int exynos_mipi_regulator_enable(struct mipi_dsim_device *dsim)
-{
-	int ret;
-
-	mutex_lock(&dsim->lock);
-	ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies);
-	mutex_unlock(&dsim->lock);
-
-	return ret;
-}
-
-static int exynos_mipi_regulator_disable(struct mipi_dsim_device *dsim)
-{
-	int ret;
-
-	mutex_lock(&dsim->lock);
-	ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies);
-	mutex_unlock(&dsim->lock);
-
-	return ret;
-}
-
-/* update all register settings to MIPI DSI controller. */
-static void exynos_mipi_update_cfg(struct mipi_dsim_device *dsim)
-{
-	/*
-	 * data from Display controller(FIMD) is not transferred in video mode
-	 * but in case of command mode, all settings is not updated to
-	 * registers.
-	 */
-	exynos_mipi_dsi_stand_by(dsim, 0);
-
-	exynos_mipi_dsi_init_dsim(dsim);
-	exynos_mipi_dsi_init_link(dsim);
-
-	exynos_mipi_dsi_set_hs_enable(dsim);
-
-	/* set display timing. */
-	exynos_mipi_dsi_set_display_mode(dsim, dsim->dsim_config);
-
-	exynos_mipi_dsi_init_interrupt(dsim);
-
-	/*
-	 * data from Display controller(FIMD) is transferred in video mode
-	 * but in case of command mode, all settings are updated to registers.
-	 */
-	exynos_mipi_dsi_stand_by(dsim, 1);
-}
-
-static int exynos_mipi_dsi_early_blank_mode(struct mipi_dsim_device *dsim,
-		int power)
-{
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	switch (power) {
-	case FB_BLANK_POWERDOWN:
-		if (dsim->suspended)
-			return 0;
-
-		if (client_drv && client_drv->suspend)
-			client_drv->suspend(client_dev);
-
-		clk_disable(dsim->clock);
-
-		exynos_mipi_regulator_disable(dsim);
-
-		dsim->suspended = true;
-
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-static int exynos_mipi_dsi_blank_mode(struct mipi_dsim_device *dsim, int power)
-{
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	switch (power) {
-	case FB_BLANK_UNBLANK:
-		if (!dsim->suspended)
-			return 0;
-
-		/* lcd panel power on. */
-		if (client_drv && client_drv->power_on)
-			client_drv->power_on(client_dev, 1);
-
-		exynos_mipi_regulator_enable(dsim);
-
-		/* enable MIPI-DSI PHY. */
-		phy_power_on(dsim->phy);
-
-		clk_enable(dsim->clock);
-
-		exynos_mipi_update_cfg(dsim);
-
-		/* set lcd panel sequence commands. */
-		if (client_drv && client_drv->set_sequence)
-			client_drv->set_sequence(client_dev);
-
-		dsim->suspended = false;
-
-		break;
-	case FB_BLANK_NORMAL:
-		/* TODO. */
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_register_lcd_device(struct mipi_dsim_lcd_device *lcd_dev)
-{
-	struct mipi_dsim_ddi *dsim_ddi;
-
-	if (!lcd_dev->name) {
-		pr_err("dsim_lcd_device name is NULL.\n");
-		return -EFAULT;
-	}
-
-	dsim_ddi = kzalloc(sizeof(struct mipi_dsim_ddi), GFP_KERNEL);
-	if (!dsim_ddi) {
-		pr_err("failed to allocate dsim_ddi object.\n");
-		return -ENOMEM;
-	}
-
-	dsim_ddi->dsim_lcd_dev = lcd_dev;
-
-	mutex_lock(&mipi_dsim_lock);
-	list_add_tail(&dsim_ddi->list, &dsim_ddi_list);
-	mutex_unlock(&mipi_dsim_lock);
-
-	return 0;
-}
-
-static struct mipi_dsim_ddi *exynos_mipi_dsi_find_lcd_device(
-					struct mipi_dsim_lcd_driver *lcd_drv)
-{
-	struct mipi_dsim_ddi *dsim_ddi, *next;
-	struct mipi_dsim_lcd_device *lcd_dev;
-
-	mutex_lock(&mipi_dsim_lock);
-
-	list_for_each_entry_safe(dsim_ddi, next, &dsim_ddi_list, list) {
-		if (!dsim_ddi)
-			goto out;
-
-		lcd_dev = dsim_ddi->dsim_lcd_dev;
-		if (!lcd_dev)
-			continue;
-
-		if ((strcmp(lcd_drv->name, lcd_dev->name)) == 0) {
-			/**
-			 * bus_id would be used to identify
-			 * connected bus.
-			 */
-			dsim_ddi->bus_id = lcd_dev->bus_id;
-			mutex_unlock(&mipi_dsim_lock);
-
-			return dsim_ddi;
-		}
-
-		list_del(&dsim_ddi->list);
-		kfree(dsim_ddi);
-	}
-
-out:
-	mutex_unlock(&mipi_dsim_lock);
-
-	return NULL;
-}
-
-int exynos_mipi_dsi_register_lcd_driver(struct mipi_dsim_lcd_driver *lcd_drv)
-{
-	struct mipi_dsim_ddi *dsim_ddi;
-
-	if (!lcd_drv->name) {
-		pr_err("dsim_lcd_driver name is NULL.\n");
-		return -EFAULT;
-	}
-
-	dsim_ddi = exynos_mipi_dsi_find_lcd_device(lcd_drv);
-	if (!dsim_ddi) {
-		pr_err("mipi_dsim_ddi object not found.\n");
-		return -EFAULT;
-	}
-
-	dsim_ddi->dsim_lcd_drv = lcd_drv;
-
-	pr_info("registered panel driver(%s) to mipi-dsi driver.\n",
-		lcd_drv->name);
-
-	return 0;
-
-}
-EXPORT_SYMBOL_GPL(exynos_mipi_dsi_register_lcd_driver);
-
-static struct mipi_dsim_ddi *exynos_mipi_dsi_bind_lcd_ddi(
-						struct mipi_dsim_device *dsim,
-						const char *name)
-{
-	struct mipi_dsim_ddi *dsim_ddi, *next;
-	struct mipi_dsim_lcd_driver *lcd_drv;
-	struct mipi_dsim_lcd_device *lcd_dev;
-	int ret;
-
-	mutex_lock(&dsim->lock);
-
-	list_for_each_entry_safe(dsim_ddi, next, &dsim_ddi_list, list) {
-		lcd_drv = dsim_ddi->dsim_lcd_drv;
-		lcd_dev = dsim_ddi->dsim_lcd_dev;
-		if (!lcd_drv || !lcd_dev ||
-			(dsim->id != dsim_ddi->bus_id))
-				continue;
-
-		dev_dbg(dsim->dev, "lcd_drv->id = %d, lcd_dev->id = %d\n",
-				lcd_drv->id, lcd_dev->id);
-		dev_dbg(dsim->dev, "lcd_dev->bus_id = %d, dsim->id = %d\n",
-				lcd_dev->bus_id, dsim->id);
-
-		if ((strcmp(lcd_drv->name, name) == 0)) {
-			lcd_dev->master = dsim;
-
-			lcd_dev->dev.parent = dsim->dev;
-			dev_set_name(&lcd_dev->dev, "%s", lcd_drv->name);
-
-			ret = device_register(&lcd_dev->dev);
-			if (ret < 0) {
-				dev_err(dsim->dev,
-					"can't register %s, status %d\n",
-					dev_name(&lcd_dev->dev), ret);
-				mutex_unlock(&dsim->lock);
-
-				return NULL;
-			}
-
-			dsim->dsim_lcd_dev = lcd_dev;
-			dsim->dsim_lcd_drv = lcd_drv;
-
-			mutex_unlock(&dsim->lock);
-
-			return dsim_ddi;
-		}
-	}
-
-	mutex_unlock(&dsim->lock);
-
-	return NULL;
-}
-
-/* define MIPI-DSI Master operations. */
-static struct mipi_dsim_master_ops master_ops = {
-	.cmd_read			= exynos_mipi_dsi_rd_data,
-	.cmd_write			= exynos_mipi_dsi_wr_data,
-	.get_dsim_frame_done		= exynos_mipi_dsi_get_frame_done_status,
-	.clear_dsim_frame_done		= exynos_mipi_dsi_clear_frame_done,
-	.set_early_blank_mode		= exynos_mipi_dsi_early_blank_mode,
-	.set_blank_mode			= exynos_mipi_dsi_blank_mode,
-};
-
-static int exynos_mipi_dsi_probe(struct platform_device *pdev)
-{
-	struct resource *res;
-	struct mipi_dsim_device *dsim;
-	struct mipi_dsim_config *dsim_config;
-	struct mipi_dsim_platform_data *dsim_pd;
-	struct mipi_dsim_ddi *dsim_ddi;
-	int ret = -EINVAL;
-
-	dsim = devm_kzalloc(&pdev->dev, sizeof(struct mipi_dsim_device),
-				GFP_KERNEL);
-	if (!dsim) {
-		dev_err(&pdev->dev, "failed to allocate dsim object.\n");
-		return -ENOMEM;
-	}
-
-	dsim->pd = to_dsim_plat(pdev);
-	dsim->dev = &pdev->dev;
-	dsim->id = pdev->id;
-
-	/* get mipi_dsim_platform_data. */
-	dsim_pd = (struct mipi_dsim_platform_data *)dsim->pd;
-	if (dsim_pd == NULL) {
-		dev_err(&pdev->dev, "failed to get platform data for dsim.\n");
-		return -EINVAL;
-	}
-	/* get mipi_dsim_config. */
-	dsim_config = dsim_pd->dsim_config;
-	if (dsim_config == NULL) {
-		dev_err(&pdev->dev, "failed to get dsim config data.\n");
-		return -EINVAL;
-	}
-
-	dsim->dsim_config = dsim_config;
-	dsim->master_ops = &master_ops;
-
-	mutex_init(&dsim->lock);
-
-	ret = devm_regulator_bulk_get(&pdev->dev, ARRAY_SIZE(supplies),
-					supplies);
-	if (ret) {
-		dev_err(&pdev->dev, "Failed to get regulators: %d\n", ret);
-		return ret;
-	}
-
-	dsim->phy = devm_phy_get(&pdev->dev, "dsim");
-	if (IS_ERR(dsim->phy))
-		return PTR_ERR(dsim->phy);
-
-	dsim->clock = devm_clk_get(&pdev->dev, "dsim0");
-	if (IS_ERR(dsim->clock)) {
-		dev_err(&pdev->dev, "failed to get dsim clock source\n");
-		return -ENODEV;
-	}
-
-	clk_enable(dsim->clock);
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-	dsim->reg_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(dsim->reg_base)) {
-		ret = PTR_ERR(dsim->reg_base);
-		goto error;
-	}
-
-	mutex_init(&dsim->lock);
-
-	/* bind lcd ddi matched with panel name. */
-	dsim_ddi = exynos_mipi_dsi_bind_lcd_ddi(dsim, dsim_pd->lcd_panel_name);
-	if (!dsim_ddi) {
-		dev_err(&pdev->dev, "mipi_dsim_ddi object not found.\n");
-		ret = -EINVAL;
-		goto error;
-	}
-
-	ret = platform_get_irq(pdev, 0);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "failed to request dsim irq resource\n");
-		goto error;
-	}
-	dsim->irq = ret;
-
-	init_completion(&dsim_wr_comp);
-	init_completion(&dsim_rd_comp);
-	platform_set_drvdata(pdev, dsim);
-
-	ret = devm_request_irq(&pdev->dev, dsim->irq,
-			exynos_mipi_dsi_interrupt_handler,
-			IRQF_SHARED, dev_name(&pdev->dev), dsim);
-	if (ret != 0) {
-		dev_err(&pdev->dev, "failed to request dsim irq\n");
-		ret = -EINVAL;
-		goto error;
-	}
-
-	/* enable interrupts */
-	exynos_mipi_dsi_init_interrupt(dsim);
-
-	/* initialize mipi-dsi client(lcd panel). */
-	if (dsim_ddi->dsim_lcd_drv && dsim_ddi->dsim_lcd_drv->probe)
-		dsim_ddi->dsim_lcd_drv->probe(dsim_ddi->dsim_lcd_dev);
-
-	/* in case mipi-dsi has been enabled by bootloader */
-	if (dsim_pd->enabled) {
-		exynos_mipi_regulator_enable(dsim);
-		goto done;
-	}
-
-	/* lcd panel power on. */
-	if (dsim_ddi->dsim_lcd_drv && dsim_ddi->dsim_lcd_drv->power_on)
-		dsim_ddi->dsim_lcd_drv->power_on(dsim_ddi->dsim_lcd_dev, 1);
-
-	exynos_mipi_regulator_enable(dsim);
-
-	/* enable MIPI-DSI PHY. */
-	phy_power_on(dsim->phy);
-
-	exynos_mipi_update_cfg(dsim);
-
-	/* set lcd panel sequence commands. */
-	if (dsim_ddi->dsim_lcd_drv && dsim_ddi->dsim_lcd_drv->set_sequence)
-		dsim_ddi->dsim_lcd_drv->set_sequence(dsim_ddi->dsim_lcd_dev);
-
-	dsim->suspended = false;
-
-done:
-	platform_set_drvdata(pdev, dsim);
-
-	dev_dbg(&pdev->dev, "%s() completed successfully (%s mode)\n", __func__,
-		dsim_config->e_interface == DSIM_COMMAND ? "CPU" : "RGB");
-
-	return 0;
-
-error:
-	clk_disable(dsim->clock);
-	return ret;
-}
-
-static int exynos_mipi_dsi_remove(struct platform_device *pdev)
-{
-	struct mipi_dsim_device *dsim = platform_get_drvdata(pdev);
-	struct mipi_dsim_ddi *dsim_ddi, *next;
-	struct mipi_dsim_lcd_driver *dsim_lcd_drv;
-
-	clk_disable(dsim->clock);
-
-	list_for_each_entry_safe(dsim_ddi, next, &dsim_ddi_list, list) {
-		if (dsim_ddi) {
-			if (dsim->id != dsim_ddi->bus_id)
-				continue;
-
-			dsim_lcd_drv = dsim_ddi->dsim_lcd_drv;
-
-			if (dsim_lcd_drv->remove)
-				dsim_lcd_drv->remove(dsim_ddi->dsim_lcd_dev);
-
-			kfree(dsim_ddi);
-		}
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int exynos_mipi_dsi_suspend(struct device *dev)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct mipi_dsim_device *dsim = platform_get_drvdata(pdev);
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	disable_irq(dsim->irq);
-
-	if (dsim->suspended)
-		return 0;
-
-	if (client_drv && client_drv->suspend)
-		client_drv->suspend(client_dev);
-
-	/* disable MIPI-DSI PHY. */
-	phy_power_off(dsim->phy);
-
-	clk_disable(dsim->clock);
-
-	exynos_mipi_regulator_disable(dsim);
-
-	dsim->suspended = true;
-
-	return 0;
-}
-
-static int exynos_mipi_dsi_resume(struct device *dev)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct mipi_dsim_device *dsim = platform_get_drvdata(pdev);
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	enable_irq(dsim->irq);
-
-	if (!dsim->suspended)
-		return 0;
-
-	/* lcd panel power on. */
-	if (client_drv && client_drv->power_on)
-		client_drv->power_on(client_dev, 1);
-
-	exynos_mipi_regulator_enable(dsim);
-
-	/* enable MIPI-DSI PHY. */
-	phy_power_on(dsim->phy);
-
-	clk_enable(dsim->clock);
-
-	exynos_mipi_update_cfg(dsim);
-
-	/* set lcd panel sequence commands. */
-	if (client_drv && client_drv->set_sequence)
-		client_drv->set_sequence(client_dev);
-
-	dsim->suspended = false;
-
-	return 0;
-}
-#endif
-
-static const struct dev_pm_ops exynos_mipi_dsi_pm_ops = {
-	SET_SYSTEM_SLEEP_PM_OPS(exynos_mipi_dsi_suspend, exynos_mipi_dsi_resume)
-};
-
-static struct platform_driver exynos_mipi_dsi_driver = {
-	.probe = exynos_mipi_dsi_probe,
-	.remove = exynos_mipi_dsi_remove,
-	.driver = {
-		   .name = "exynos-mipi-dsim",
-		   .pm = &exynos_mipi_dsi_pm_ops,
-	},
-};
-
-module_platform_driver(exynos_mipi_dsi_driver);
-
-MODULE_AUTHOR("InKi Dae <inki.dae@samsung.com>");
-MODULE_DESCRIPTION("Samsung SoC MIPI-DSI driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c
deleted file mode 100644
index 2358a2f..0000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c
+++ /dev/null
@@ -1,880 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi_common.c
- *
- * Samsung SoC MIPI-DSI common driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/fb.h>
-#include <linux/ctype.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/memory.h>
-#include <linux/delay.h>
-#include <linux/irqreturn.h>
-#include <linux/kthread.h>
-
-#include <video/mipi_display.h>
-#include <video/exynos_mipi_dsim.h>
-
-#include "exynos_mipi_dsi_regs.h"
-#include "exynos_mipi_dsi_lowlevel.h"
-#include "exynos_mipi_dsi_common.h"
-
-#define MIPI_FIFO_TIMEOUT	msecs_to_jiffies(250)
-#define MIPI_RX_FIFO_READ_DONE  0x30800002
-#define MIPI_MAX_RX_FIFO        20
-#define MHZ			(1000 * 1000)
-#define FIN_HZ			(24 * MHZ)
-
-#define DFIN_PLL_MIN_HZ		(6 * MHZ)
-#define DFIN_PLL_MAX_HZ		(12 * MHZ)
-
-#define DFVCO_MIN_HZ		(500 * MHZ)
-#define DFVCO_MAX_HZ		(1000 * MHZ)
-
-#define TRY_GET_FIFO_TIMEOUT	(5000 * 2)
-#define TRY_FIFO_CLEAR		(10)
-
-/* MIPI-DSIM status types. */
-enum {
-	DSIM_STATE_INIT,	/* should be initialized. */
-	DSIM_STATE_STOP,	/* CPU and LCDC are LP mode. */
-	DSIM_STATE_HSCLKEN,	/* HS clock was enabled. */
-	DSIM_STATE_ULPS
-};
-
-/* define DSI lane types. */
-enum {
-	DSIM_LANE_CLOCK = (1 << 0),
-	DSIM_LANE_DATA0 = (1 << 1),
-	DSIM_LANE_DATA1 = (1 << 2),
-	DSIM_LANE_DATA2 = (1 << 3),
-	DSIM_LANE_DATA3 = (1 << 4)
-};
-
-static unsigned int dpll_table[15] = {
-	100, 120, 170, 220, 270,
-	320, 390, 450, 510, 560,
-	640, 690, 770, 870, 950
-};
-
-irqreturn_t exynos_mipi_dsi_interrupt_handler(int irq, void *dev_id)
-{
-	struct mipi_dsim_device *dsim = dev_id;
-	unsigned int intsrc, intmsk;
-
-	intsrc = exynos_mipi_dsi_read_interrupt(dsim);
-	intmsk = exynos_mipi_dsi_read_interrupt_mask(dsim);
-	intmsk = ~intmsk & intsrc;
-
-	if (intsrc & INTMSK_RX_DONE) {
-		complete(&dsim_rd_comp);
-		dev_dbg(dsim->dev, "MIPI INTMSK_RX_DONE\n");
-	}
-	if (intsrc & INTMSK_FIFO_EMPTY) {
-		complete(&dsim_wr_comp);
-		dev_dbg(dsim->dev, "MIPI INTMSK_FIFO_EMPTY\n");
-	}
-
-	exynos_mipi_dsi_clear_interrupt(dsim, intmsk);
-
-	return IRQ_HANDLED;
-}
-
-/*
- * write long packet to mipi dsi slave
- * @dsim: mipi dsim device structure.
- * @data0: packet data to send.
- * @data1: size of packet data
- */
-static void exynos_mipi_dsi_long_data_wr(struct mipi_dsim_device *dsim,
-		const unsigned char *data0, unsigned int data_size)
-{
-	unsigned int data_cnt = 0, payload = 0;
-
-	/* in case that data count is more then 4 */
-	for (data_cnt = 0; data_cnt < data_size; data_cnt += 4) {
-		/*
-		 * after sending 4bytes per one time,
-		 * send remainder data less then 4.
-		 */
-		if ((data_size - data_cnt) < 4) {
-			if ((data_size - data_cnt) == 3) {
-				payload = data0[data_cnt] |
-				    data0[data_cnt + 1] << 8 |
-					data0[data_cnt + 2] << 16;
-			dev_dbg(dsim->dev, "count = 3 payload = %x, %x %x %x\n",
-				payload, data0[data_cnt],
-				data0[data_cnt + 1],
-				data0[data_cnt + 2]);
-			} else if ((data_size - data_cnt) == 2) {
-				payload = data0[data_cnt] |
-					data0[data_cnt + 1] << 8;
-			dev_dbg(dsim->dev,
-				"count = 2 payload = %x, %x %x\n", payload,
-				data0[data_cnt],
-				data0[data_cnt + 1]);
-			} else if ((data_size - data_cnt) == 1) {
-				payload = data0[data_cnt];
-			}
-
-			exynos_mipi_dsi_wr_tx_data(dsim, payload);
-		/* send 4bytes per one time. */
-		} else {
-			payload = data0[data_cnt] |
-				data0[data_cnt + 1] << 8 |
-				data0[data_cnt + 2] << 16 |
-				data0[data_cnt + 3] << 24;
-
-			dev_dbg(dsim->dev,
-				"count = 4 payload = %x, %x %x %x %x\n",
-				payload, *(u8 *)(data0 + data_cnt),
-				data0[data_cnt + 1],
-				data0[data_cnt + 2],
-				data0[data_cnt + 3]);
-
-			exynos_mipi_dsi_wr_tx_data(dsim, payload);
-		}
-	}
-}
-
-int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	const unsigned char *data0, unsigned int data_size)
-{
-	unsigned int check_rx_ack = 0;
-
-	if (dsim->state == DSIM_STATE_ULPS) {
-		dev_err(dsim->dev, "state is ULPS.\n");
-
-		return -EINVAL;
-	}
-
-	/* FIXME!!! why does it need this delay? */
-	msleep(20);
-
-	mutex_lock(&dsim->lock);
-
-	switch (data_id) {
-	/* short packet types of packet types for command. */
-	case MIPI_DSI_GENERIC_SHORT_WRITE_0_PARAM:
-	case MIPI_DSI_GENERIC_SHORT_WRITE_1_PARAM:
-	case MIPI_DSI_GENERIC_SHORT_WRITE_2_PARAM:
-	case MIPI_DSI_DCS_SHORT_WRITE:
-	case MIPI_DSI_DCS_SHORT_WRITE_PARAM:
-	case MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE:
-		exynos_mipi_dsi_wr_tx_header(dsim, data_id, data0[0], data0[1]);
-		if (check_rx_ack) {
-			/* process response func should be implemented */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-
-	/* general command */
-	case MIPI_DSI_COLOR_MODE_OFF:
-	case MIPI_DSI_COLOR_MODE_ON:
-	case MIPI_DSI_SHUTDOWN_PERIPHERAL:
-	case MIPI_DSI_TURN_ON_PERIPHERAL:
-		exynos_mipi_dsi_wr_tx_header(dsim, data_id, data0[0], data0[1]);
-		if (check_rx_ack) {
-			/* process response func should be implemented. */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-
-	/* packet types for video data */
-	case MIPI_DSI_V_SYNC_START:
-	case MIPI_DSI_V_SYNC_END:
-	case MIPI_DSI_H_SYNC_START:
-	case MIPI_DSI_H_SYNC_END:
-	case MIPI_DSI_END_OF_TRANSMISSION:
-		mutex_unlock(&dsim->lock);
-		return 0;
-
-	/* long packet type and null packet */
-	case MIPI_DSI_NULL_PACKET:
-	case MIPI_DSI_BLANKING_PACKET:
-		mutex_unlock(&dsim->lock);
-		return 0;
-	case MIPI_DSI_GENERIC_LONG_WRITE:
-	case MIPI_DSI_DCS_LONG_WRITE:
-	{
-		unsigned int size, payload = 0;
-		reinit_completion(&dsim_wr_comp);
-
-		size = data_size * 4;
-
-		/* if data count is less then 4, then send 3bytes data.  */
-		if (data_size < 4) {
-			payload = data0[0] |
-				data0[1] << 8 |
-				data0[2] << 16;
-
-			exynos_mipi_dsi_wr_tx_data(dsim, payload);
-
-			dev_dbg(dsim->dev, "count = %d payload = %x,%x %x %x\n",
-				data_size, payload, data0[0],
-				data0[1], data0[2]);
-
-		/* in case that data count is more then 4 */
-		} else
-			exynos_mipi_dsi_long_data_wr(dsim, data0, data_size);
-
-		/* put data into header fifo */
-		exynos_mipi_dsi_wr_tx_header(dsim, data_id, data_size & 0xff,
-			(data_size & 0xff00) >> 8);
-
-		if (!wait_for_completion_interruptible_timeout(&dsim_wr_comp,
-							MIPI_FIFO_TIMEOUT)) {
-			dev_warn(dsim->dev, "command write timeout.\n");
-			mutex_unlock(&dsim->lock);
-			return -EAGAIN;
-		}
-
-		if (check_rx_ack) {
-			/* process response func should be implemented. */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-	}
-
-	/* packet typo for video data */
-	case MIPI_DSI_PACKED_PIXEL_STREAM_16:
-	case MIPI_DSI_PACKED_PIXEL_STREAM_18:
-	case MIPI_DSI_PIXEL_STREAM_3BYTE_18:
-	case MIPI_DSI_PACKED_PIXEL_STREAM_24:
-		if (check_rx_ack) {
-			/* process response func should be implemented. */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-	default:
-		dev_warn(dsim->dev,
-			"data id %x is not supported current DSI spec.\n",
-			data_id);
-
-		mutex_unlock(&dsim->lock);
-		return -EINVAL;
-	}
-}
-
-static unsigned int exynos_mipi_dsi_long_data_rd(struct mipi_dsim_device *dsim,
-		unsigned int req_size, unsigned int rx_data, u8 *rx_buf)
-{
-	unsigned int rcv_pkt, i, j;
-	u16 rxsize;
-
-	/* for long packet */
-	rxsize = (u16)((rx_data & 0x00ffff00) >> 8);
-	dev_dbg(dsim->dev, "mipi dsi rx size : %d\n", rxsize);
-	if (rxsize != req_size) {
-		dev_dbg(dsim->dev,
-			"received size mismatch received: %d, requested: %d\n",
-			rxsize, req_size);
-		goto err;
-	}
-
-	for (i = 0; i < (rxsize >> 2); i++) {
-		rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-		dev_dbg(dsim->dev, "received pkt : %08x\n", rcv_pkt);
-		for (j = 0; j < 4; j++) {
-			rx_buf[(i * 4) + j] =
-					(u8)(rcv_pkt >> (j * 8)) & 0xff;
-			dev_dbg(dsim->dev, "received value : %02x\n",
-					(rcv_pkt >> (j * 8)) & 0xff);
-		}
-	}
-	if (rxsize % 4) {
-		rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-		dev_dbg(dsim->dev, "received pkt : %08x\n", rcv_pkt);
-		for (j = 0; j < (rxsize % 4); j++) {
-			rx_buf[(i * 4) + j] =
-					(u8)(rcv_pkt >> (j * 8)) & 0xff;
-			dev_dbg(dsim->dev, "received value : %02x\n",
-					(rcv_pkt >> (j * 8)) & 0xff);
-		}
-	}
-
-	return rxsize;
-
-err:
-	return -EINVAL;
-}
-
-static unsigned int exynos_mipi_dsi_response_size(unsigned int req_size)
-{
-	switch (req_size) {
-	case 1:
-		return MIPI_DSI_RX_GENERIC_SHORT_READ_RESPONSE_1BYTE;
-	case 2:
-		return MIPI_DSI_RX_GENERIC_SHORT_READ_RESPONSE_2BYTE;
-	default:
-		return MIPI_DSI_RX_GENERIC_LONG_READ_RESPONSE;
-	}
-}
-
-int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	unsigned int data0, unsigned int req_size, u8 *rx_buf)
-{
-	unsigned int rx_data, rcv_pkt, i;
-	u8 response = 0;
-	u16 rxsize;
-
-	if (dsim->state == DSIM_STATE_ULPS) {
-		dev_err(dsim->dev, "state is ULPS.\n");
-
-		return -EINVAL;
-	}
-
-	/* FIXME!!! */
-	msleep(20);
-
-	mutex_lock(&dsim->lock);
-	reinit_completion(&dsim_rd_comp);
-	exynos_mipi_dsi_rd_tx_header(dsim,
-		MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE, req_size);
-
-	response = exynos_mipi_dsi_response_size(req_size);
-
-	switch (data_id) {
-	case MIPI_DSI_GENERIC_READ_REQUEST_0_PARAM:
-	case MIPI_DSI_GENERIC_READ_REQUEST_1_PARAM:
-	case MIPI_DSI_GENERIC_READ_REQUEST_2_PARAM:
-	case MIPI_DSI_DCS_READ:
-		exynos_mipi_dsi_rd_tx_header(dsim,
-			data_id, data0);
-		/* process response func should be implemented. */
-		break;
-	default:
-		dev_warn(dsim->dev,
-			"data id %x is not supported current DSI spec.\n",
-			data_id);
-
-		mutex_unlock(&dsim->lock);
-		return -EINVAL;
-	}
-
-	if (!wait_for_completion_interruptible_timeout(&dsim_rd_comp,
-				MIPI_FIFO_TIMEOUT)) {
-		pr_err("RX done interrupt timeout\n");
-		mutex_unlock(&dsim->lock);
-		return 0;
-	}
-
-	msleep(20);
-
-	rx_data = exynos_mipi_dsi_rd_rx_fifo(dsim);
-
-	if ((u8)(rx_data & 0xff) != response) {
-		printk(KERN_ERR
-			"mipi dsi wrong response rx_data : %x, response:%x\n",
-			rx_data, response);
-		goto clear_rx_fifo;
-	}
-
-	if (req_size <= 2) {
-		/* for short packet */
-		for (i = 0; i < req_size; i++)
-			rx_buf[i] = (rx_data >> (8 + (i * 8))) & 0xff;
-		rxsize = req_size;
-	} else {
-		/* for long packet */
-		rxsize = exynos_mipi_dsi_long_data_rd(dsim, req_size, rx_data,
-							rx_buf);
-		if (rxsize != req_size)
-			goto clear_rx_fifo;
-	}
-
-	rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-
-	msleep(20);
-
-	if (rcv_pkt != MIPI_RX_FIFO_READ_DONE) {
-		dev_info(dsim->dev,
-			"Can't found RX FIFO READ DONE FLAG : %x\n", rcv_pkt);
-		goto clear_rx_fifo;
-	}
-
-	mutex_unlock(&dsim->lock);
-
-	return rxsize;
-
-clear_rx_fifo:
-	i = 0;
-	while (1) {
-		rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-		if ((rcv_pkt == MIPI_RX_FIFO_READ_DONE)
-				|| (i > MIPI_MAX_RX_FIFO))
-			break;
-		dev_dbg(dsim->dev,
-				"mipi dsi clear rx fifo : %08x\n", rcv_pkt);
-		i++;
-	}
-	dev_info(dsim->dev,
-		"mipi dsi rx done count : %d, rcv_pkt : %08x\n", i, rcv_pkt);
-
-	mutex_unlock(&dsim->lock);
-
-	return 0;
-}
-
-static int exynos_mipi_dsi_pll_on(struct mipi_dsim_device *dsim,
-				unsigned int enable)
-{
-	int sw_timeout;
-
-	if (enable) {
-		sw_timeout = 1000;
-
-		exynos_mipi_dsi_enable_pll(dsim, 1);
-		while (1) {
-			sw_timeout--;
-			if (exynos_mipi_dsi_is_pll_stable(dsim))
-				return 0;
-			if (sw_timeout == 0)
-				return -EINVAL;
-		}
-	} else
-		exynos_mipi_dsi_enable_pll(dsim, 0);
-
-	return 0;
-}
-
-static unsigned long exynos_mipi_dsi_change_pll(struct mipi_dsim_device *dsim,
-	unsigned int pre_divider, unsigned int main_divider,
-	unsigned int scaler)
-{
-	unsigned long dfin_pll, dfvco, dpll_out;
-	unsigned int i, freq_band = 0xf;
-
-	dfin_pll = (FIN_HZ / pre_divider);
-
-	/******************************************************
-	 *	Serial Clock(=ByteClk X 8)	FreqBand[3:0] *
-	 ******************************************************
-	 *	~ 99.99 MHz			0000
-	 *	100 ~ 119.99 MHz		0001
-	 *	120 ~ 159.99 MHz		0010
-	 *	160 ~ 199.99 MHz		0011
-	 *	200 ~ 239.99 MHz		0100
-	 *	140 ~ 319.99 MHz		0101
-	 *	320 ~ 389.99 MHz		0110
-	 *	390 ~ 449.99 MHz		0111
-	 *	450 ~ 509.99 MHz		1000
-	 *	510 ~ 559.99 MHz		1001
-	 *	560 ~ 639.99 MHz		1010
-	 *	640 ~ 689.99 MHz		1011
-	 *	690 ~ 769.99 MHz		1100
-	 *	770 ~ 869.99 MHz		1101
-	 *	870 ~ 949.99 MHz		1110
-	 *	950 ~ 1000 MHz			1111
-	 ******************************************************/
-	if (dfin_pll < DFIN_PLL_MIN_HZ || dfin_pll > DFIN_PLL_MAX_HZ) {
-		dev_warn(dsim->dev, "fin_pll range should be 6MHz ~ 12MHz\n");
-		exynos_mipi_dsi_enable_afc(dsim, 0, 0);
-	} else {
-		if (dfin_pll < 7 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x1);
-		else if (dfin_pll < 8 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x0);
-		else if (dfin_pll < 9 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x3);
-		else if (dfin_pll < 10 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x2);
-		else if (dfin_pll < 11 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x5);
-		else
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x4);
-	}
-
-	dfvco = dfin_pll * main_divider;
-	dev_dbg(dsim->dev, "dfvco = %lu, dfin_pll = %lu, main_divider = %d\n",
-				dfvco, dfin_pll, main_divider);
-	if (dfvco < DFVCO_MIN_HZ || dfvco > DFVCO_MAX_HZ)
-		dev_warn(dsim->dev, "fvco range should be 500MHz ~ 1000MHz\n");
-
-	dpll_out = dfvco / (1 << scaler);
-	dev_dbg(dsim->dev, "dpll_out = %lu, dfvco = %lu, scaler = %d\n",
-		dpll_out, dfvco, scaler);
-
-	for (i = 0; i < ARRAY_SIZE(dpll_table); i++) {
-		if (dpll_out < dpll_table[i] * MHZ) {
-			freq_band = i;
-			break;
-		}
-	}
-
-	dev_dbg(dsim->dev, "freq_band = %d\n", freq_band);
-
-	exynos_mipi_dsi_pll_freq(dsim, pre_divider, main_divider, scaler);
-
-	exynos_mipi_dsi_hs_zero_ctrl(dsim, 0);
-	exynos_mipi_dsi_prep_ctrl(dsim, 0);
-
-	/* Freq Band */
-	exynos_mipi_dsi_pll_freq_band(dsim, freq_band);
-
-	/* Stable time */
-	exynos_mipi_dsi_pll_stable_time(dsim, dsim->dsim_config->pll_stable_time);
-
-	/* Enable PLL */
-	dev_dbg(dsim->dev, "FOUT of mipi dphy pll is %luMHz\n",
-		(dpll_out / MHZ));
-
-	return dpll_out;
-}
-
-static int exynos_mipi_dsi_set_clock(struct mipi_dsim_device *dsim,
-	unsigned int byte_clk_sel, unsigned int enable)
-{
-	unsigned int esc_div;
-	unsigned long esc_clk_error_rate;
-	unsigned long hs_clk = 0, byte_clk = 0, escape_clk = 0;
-
-	if (enable) {
-		dsim->e_clk_src = byte_clk_sel;
-
-		/* Escape mode clock and byte clock source */
-		exynos_mipi_dsi_set_byte_clock_src(dsim, byte_clk_sel);
-
-		/* DPHY, DSIM Link : D-PHY clock out */
-		if (byte_clk_sel == DSIM_PLL_OUT_DIV8) {
-			hs_clk = exynos_mipi_dsi_change_pll(dsim,
-				dsim->dsim_config->p, dsim->dsim_config->m,
-				dsim->dsim_config->s);
-			if (hs_clk == 0) {
-				dev_err(dsim->dev,
-					"failed to get hs clock.\n");
-				return -EINVAL;
-			}
-
-			byte_clk = hs_clk / 8;
-			exynos_mipi_dsi_enable_pll_bypass(dsim, 0);
-			exynos_mipi_dsi_pll_on(dsim, 1);
-		/* DPHY : D-PHY clock out, DSIM link : external clock out */
-		} else if (byte_clk_sel == DSIM_EXT_CLK_DIV8) {
-			dev_warn(dsim->dev, "this project is not support\n");
-			dev_warn(dsim->dev,
-				"external clock source for MIPI DSIM.\n");
-		} else if (byte_clk_sel == DSIM_EXT_CLK_BYPASS) {
-			dev_warn(dsim->dev, "this project is not support\n");
-			dev_warn(dsim->dev,
-				"external clock source for MIPI DSIM\n");
-		}
-
-		/* escape clock divider */
-		esc_div = byte_clk / (dsim->dsim_config->esc_clk);
-		dev_dbg(dsim->dev,
-			"esc_div = %d, byte_clk = %lu, esc_clk = %lu\n",
-			esc_div, byte_clk, dsim->dsim_config->esc_clk);
-		if ((byte_clk / esc_div) >= (20 * MHZ) ||
-				(byte_clk / esc_div) >
-					dsim->dsim_config->esc_clk)
-			esc_div += 1;
-
-		escape_clk = byte_clk / esc_div;
-		dev_dbg(dsim->dev,
-			"escape_clk = %lu, byte_clk = %lu, esc_div = %d\n",
-			escape_clk, byte_clk, esc_div);
-
-		/* enable escape clock. */
-		exynos_mipi_dsi_enable_byte_clock(dsim, 1);
-
-		/* enable byte clk and escape clock */
-		exynos_mipi_dsi_set_esc_clk_prs(dsim, 1, esc_div);
-		/* escape clock on lane */
-		exynos_mipi_dsi_enable_esc_clk_on_lane(dsim,
-			(DSIM_LANE_CLOCK | dsim->data_lane), 1);
-
-		dev_dbg(dsim->dev, "byte clock is %luMHz\n",
-			(byte_clk / MHZ));
-		dev_dbg(dsim->dev, "escape clock that user's need is %lu\n",
-			(dsim->dsim_config->esc_clk / MHZ));
-		dev_dbg(dsim->dev, "escape clock divider is %x\n", esc_div);
-		dev_dbg(dsim->dev, "escape clock is %luMHz\n",
-			((byte_clk / esc_div) / MHZ));
-
-		if ((byte_clk / esc_div) > escape_clk) {
-			esc_clk_error_rate = escape_clk /
-				(byte_clk / esc_div);
-			dev_warn(dsim->dev, "error rate is %lu over.\n",
-				(esc_clk_error_rate / 100));
-		} else if ((byte_clk / esc_div) < (escape_clk)) {
-			esc_clk_error_rate = (byte_clk / esc_div) /
-				escape_clk;
-			dev_warn(dsim->dev, "error rate is %lu under.\n",
-				(esc_clk_error_rate / 100));
-		}
-	} else {
-		exynos_mipi_dsi_enable_esc_clk_on_lane(dsim,
-			(DSIM_LANE_CLOCK | dsim->data_lane), 0);
-		exynos_mipi_dsi_set_esc_clk_prs(dsim, 0, 0);
-
-		/* disable escape clock. */
-		exynos_mipi_dsi_enable_byte_clock(dsim, 0);
-
-		if (byte_clk_sel == DSIM_PLL_OUT_DIV8)
-			exynos_mipi_dsi_pll_on(dsim, 0);
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_init_dsim(struct mipi_dsim_device *dsim)
-{
-	dsim->state = DSIM_STATE_INIT;
-
-	switch (dsim->dsim_config->e_no_data_lane) {
-	case DSIM_DATA_LANE_1:
-		dsim->data_lane = DSIM_LANE_DATA0;
-		break;
-	case DSIM_DATA_LANE_2:
-		dsim->data_lane = DSIM_LANE_DATA0 | DSIM_LANE_DATA1;
-		break;
-	case DSIM_DATA_LANE_3:
-		dsim->data_lane = DSIM_LANE_DATA0 | DSIM_LANE_DATA1 |
-			DSIM_LANE_DATA2;
-		break;
-	case DSIM_DATA_LANE_4:
-		dsim->data_lane = DSIM_LANE_DATA0 | DSIM_LANE_DATA1 |
-			DSIM_LANE_DATA2 | DSIM_LANE_DATA3;
-		break;
-	default:
-		dev_info(dsim->dev, "data lane is invalid.\n");
-		return -EINVAL;
-	}
-
-	exynos_mipi_dsi_sw_reset(dsim);
-	exynos_mipi_dsi_func_reset(dsim);
-
-	exynos_mipi_dsi_dp_dn_swap(dsim, 0);
-
-	return 0;
-}
-
-void exynos_mipi_dsi_init_interrupt(struct mipi_dsim_device *dsim)
-{
-	unsigned int src = 0;
-
-	src = (INTSRC_SFR_FIFO_EMPTY | INTSRC_RX_DATA_DONE);
-	exynos_mipi_dsi_set_interrupt(dsim, src, 1);
-
-	src = 0;
-	src = ~(INTMSK_RX_DONE | INTMSK_FIFO_EMPTY);
-	exynos_mipi_dsi_set_interrupt_mask(dsim, src, 1);
-}
-
-int exynos_mipi_dsi_enable_frame_done_int(struct mipi_dsim_device *dsim,
-	unsigned int enable)
-{
-	/* enable only frame done interrupt */
-	exynos_mipi_dsi_set_interrupt_mask(dsim, INTMSK_FRAME_DONE, enable);
-
-	return 0;
-}
-
-void exynos_mipi_dsi_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-
-	/* consider Main display and Sub display. */
-
-	exynos_mipi_dsi_set_main_stand_by(dsim, enable);
-}
-
-int exynos_mipi_dsi_set_display_mode(struct mipi_dsim_device *dsim,
-	struct mipi_dsim_config *dsim_config)
-{
-	struct mipi_dsim_platform_data *dsim_pd;
-	struct fb_videomode *timing;
-
-	dsim_pd = (struct mipi_dsim_platform_data *)dsim->pd;
-	timing = (struct fb_videomode *)dsim_pd->lcd_panel_info;
-
-	/* in case of VIDEO MODE (RGB INTERFACE), it sets polarities. */
-	if (dsim_config->e_interface == (u32) DSIM_VIDEO) {
-		if (dsim_config->auto_vertical_cnt == 0) {
-			exynos_mipi_dsi_set_main_disp_vporch(dsim,
-				dsim_config->cmd_allow,
-				timing->lower_margin,
-				timing->upper_margin);
-			exynos_mipi_dsi_set_main_disp_hporch(dsim,
-				timing->right_margin,
-				timing->left_margin);
-			exynos_mipi_dsi_set_main_disp_sync_area(dsim,
-				timing->vsync_len,
-				timing->hsync_len);
-		}
-	}
-
-	exynos_mipi_dsi_set_main_disp_resol(dsim, timing->xres,
-			timing->yres);
-
-	exynos_mipi_dsi_display_config(dsim, dsim_config);
-
-	dev_info(dsim->dev, "lcd panel ==> width = %d, height = %d\n",
-			timing->xres, timing->yres);
-
-	return 0;
-}
-
-int exynos_mipi_dsi_init_link(struct mipi_dsim_device *dsim)
-{
-	unsigned int time_out = 100;
-
-	switch (dsim->state) {
-	case DSIM_STATE_INIT:
-		exynos_mipi_dsi_init_fifo_pointer(dsim, 0x1f);
-
-		/* dsi configuration */
-		exynos_mipi_dsi_init_config(dsim);
-		exynos_mipi_dsi_enable_lane(dsim, DSIM_LANE_CLOCK, 1);
-		exynos_mipi_dsi_enable_lane(dsim, dsim->data_lane, 1);
-
-		/* set clock configuration */
-		exynos_mipi_dsi_set_clock(dsim, dsim->dsim_config->e_byte_clk, 1);
-
-		/* check clock and data lane state are stop state */
-		while (!(exynos_mipi_dsi_is_lane_state(dsim))) {
-			time_out--;
-			if (time_out == 0) {
-				dev_err(dsim->dev,
-					"DSI Master is not stop state.\n");
-				dev_err(dsim->dev,
-					"Check initialization process\n");
-
-				return -EINVAL;
-			}
-		}
-		if (time_out != 0) {
-			dev_info(dsim->dev,
-				"DSI Master driver has been completed.\n");
-			dev_info(dsim->dev, "DSI Master state is stop state\n");
-		}
-
-		dsim->state = DSIM_STATE_STOP;
-
-		/* BTA sequence counters */
-		exynos_mipi_dsi_set_stop_state_counter(dsim,
-			dsim->dsim_config->stop_holding_cnt);
-		exynos_mipi_dsi_set_bta_timeout(dsim,
-			dsim->dsim_config->bta_timeout);
-		exynos_mipi_dsi_set_lpdr_timeout(dsim,
-			dsim->dsim_config->rx_timeout);
-
-		return 0;
-	default:
-		dev_info(dsim->dev, "DSI Master is already init.\n");
-		return 0;
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_set_hs_enable(struct mipi_dsim_device *dsim)
-{
-	if (dsim->state != DSIM_STATE_STOP) {
-		dev_warn(dsim->dev, "DSIM is not in stop state.\n");
-		return 0;
-	}
-
-	if (dsim->e_clk_src == DSIM_EXT_CLK_BYPASS) {
-		dev_warn(dsim->dev, "clock source is external bypass.\n");
-		return 0;
-	}
-
-	dsim->state = DSIM_STATE_HSCLKEN;
-
-	 /* set LCDC and CPU transfer mode to HS. */
-	exynos_mipi_dsi_set_lcdc_transfer_mode(dsim, 0);
-	exynos_mipi_dsi_set_cpu_transfer_mode(dsim, 0);
-	exynos_mipi_dsi_enable_hs_clock(dsim, 1);
-
-	return 0;
-}
-
-int exynos_mipi_dsi_set_data_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int mode)
-{
-	if (mode) {
-		if (dsim->state != DSIM_STATE_HSCLKEN) {
-			dev_err(dsim->dev, "HS Clock lane is not enabled.\n");
-			return -EINVAL;
-		}
-
-		exynos_mipi_dsi_set_lcdc_transfer_mode(dsim, 0);
-	} else {
-		if (dsim->state == DSIM_STATE_INIT || dsim->state ==
-			DSIM_STATE_ULPS) {
-			dev_err(dsim->dev,
-				"DSI Master is not STOP or HSDT state.\n");
-			return -EINVAL;
-		}
-
-		exynos_mipi_dsi_set_cpu_transfer_mode(dsim, 0);
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim)
-{
-	return _exynos_mipi_dsi_get_frame_done_status(dsim);
-}
-
-int exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim)
-{
-	_exynos_mipi_dsi_clear_frame_done(dsim);
-
-	return 0;
-}
-
-int exynos_mipi_dsi_fifo_clear(struct mipi_dsim_device *dsim,
-				unsigned int val)
-{
-	int try = TRY_FIFO_CLEAR;
-
-	exynos_mipi_dsi_sw_reset_release(dsim);
-	exynos_mipi_dsi_func_reset(dsim);
-
-	do {
-		if (exynos_mipi_dsi_get_sw_reset_release(dsim)) {
-			exynos_mipi_dsi_init_interrupt(dsim);
-			dev_dbg(dsim->dev, "reset release done.\n");
-			return 0;
-		}
-	} while (--try);
-
-	dev_err(dsim->dev, "failed to clear dsim fifo.\n");
-	return -EAGAIN;
-}
-
-MODULE_AUTHOR("InKi Dae <inki.dae@samsung.com>");
-MODULE_DESCRIPTION("Samsung SoC MIPI-DSI common driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h b/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h
deleted file mode 100644
index 4125522..0000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* linux/drivers/video/exynos_mipi_dsi_common.h
- *
- * Header file for Samsung SoC MIPI-DSI common driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSI_COMMON_H
-#define _EXYNOS_MIPI_DSI_COMMON_H
-
-static DECLARE_COMPLETION(dsim_rd_comp);
-static DECLARE_COMPLETION(dsim_wr_comp);
-
-int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	const unsigned char *data0, unsigned int data_size);
-int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	unsigned int data0, unsigned int req_size, u8 *rx_buf);
-irqreturn_t exynos_mipi_dsi_interrupt_handler(int irq, void *dev_id);
-void exynos_mipi_dsi_init_interrupt(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_init_dsim(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable);
-int exynos_mipi_dsi_set_display_mode(struct mipi_dsim_device *dsim,
-			struct mipi_dsim_config *dsim_info);
-int exynos_mipi_dsi_init_link(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_set_hs_enable(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_set_data_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int mode);
-int exynos_mipi_dsi_enable_frame_done_int(struct mipi_dsim_device *dsim,
-	unsigned int enable);
-int exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim);
-
-extern struct fb_info *registered_fb[FB_MAX] __read_mostly;
-
-int exynos_mipi_dsi_fifo_clear(struct mipi_dsim_device *dsim,
-				unsigned int val);
-
-#endif /* _EXYNOS_MIPI_DSI_COMMON_H */
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c
deleted file mode 100644
index c148d06..0000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c
+++ /dev/null
@@ -1,618 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi_lowlevel.c
- *
- * Samsung SoC MIPI-DSI lowlevel driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/delay.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/ctype.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-
-#include <video/exynos_mipi_dsim.h>
-
-#include "exynos_mipi_dsi_regs.h"
-#include "exynos_mipi_dsi_lowlevel.h"
-
-void exynos_mipi_dsi_func_reset(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_SWRST);
-
-	reg |= DSIM_FUNCRST;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SWRST);
-}
-
-void exynos_mipi_dsi_sw_reset(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_SWRST);
-
-	reg |= DSIM_SWRST;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SWRST);
-}
-
-void exynos_mipi_dsi_sw_reset_release(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	reg |= INTSRC_SW_RST_RELEASE;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-int exynos_mipi_dsi_get_sw_reset_release(struct mipi_dsim_device *dsim)
-{
-	return (readl(dsim->reg_base + EXYNOS_DSIM_INTSRC)) &
-			INTSRC_SW_RST_RELEASE;
-}
-
-unsigned int exynos_mipi_dsi_read_interrupt_mask(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_INTMSK);
-
-	return reg;
-}
-
-void exynos_mipi_dsi_set_interrupt_mask(struct mipi_dsim_device *dsim,
-		unsigned int mode, unsigned int mask)
-{
-	unsigned int reg = 0;
-
-	if (mask)
-		reg |= mode;
-	else
-		reg &= ~mode;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTMSK);
-}
-
-void exynos_mipi_dsi_init_fifo_pointer(struct mipi_dsim_device *dsim,
-		unsigned int cfg)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_FIFOCTRL);
-
-	writel(reg & ~(cfg), dsim->reg_base + EXYNOS_DSIM_FIFOCTRL);
-	mdelay(10);
-	reg |= cfg;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_FIFOCTRL);
-}
-
-/*
- * this function set PLL P, M and S value in D-PHY
- */
-void exynos_mipi_dsi_set_phy_tunning(struct mipi_dsim_device *dsim,
-		unsigned int value)
-{
-	writel(DSIM_AFC_CTL(value), dsim->reg_base + EXYNOS_DSIM_PHYACCHR);
-}
-
-void exynos_mipi_dsi_set_main_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-
-	reg &= ~DSIM_MAIN_STAND_BY;
-
-	if (enable)
-		reg |= DSIM_MAIN_STAND_BY;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-}
-
-void exynos_mipi_dsi_set_main_disp_resol(struct mipi_dsim_device *dsim,
-	unsigned int width_resol, unsigned int height_resol)
-{
-	unsigned int reg;
-
-	/* standby should be set after configuration so set to not ready*/
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MDRESOL)) &
-		~(DSIM_MAIN_STAND_BY);
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-
-	reg &= ~((0x7ff << 16) | (0x7ff << 0));
-	reg |= DSIM_MAIN_VRESOL(height_resol) | DSIM_MAIN_HRESOL(width_resol);
-
-	reg |= DSIM_MAIN_STAND_BY;
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-}
-
-void exynos_mipi_dsi_set_main_disp_vporch(struct mipi_dsim_device *dsim,
-	unsigned int cmd_allow, unsigned int vfront, unsigned int vback)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MVPORCH)) &
-		~((DSIM_CMD_ALLOW_MASK) | (DSIM_STABLE_VFP_MASK) |
-		(DSIM_MAIN_VBP_MASK));
-
-	reg |= (DSIM_CMD_ALLOW_SHIFT(cmd_allow & 0xf) |
-		DSIM_STABLE_VFP_SHIFT(vfront & 0x7ff) |
-		DSIM_MAIN_VBP_SHIFT(vback & 0x7ff));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MVPORCH);
-}
-
-void exynos_mipi_dsi_set_main_disp_hporch(struct mipi_dsim_device *dsim,
-	unsigned int front, unsigned int back)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MHPORCH)) &
-		~((DSIM_MAIN_HFP_MASK) | (DSIM_MAIN_HBP_MASK));
-
-	reg |= DSIM_MAIN_HFP_SHIFT(front) | DSIM_MAIN_HBP_SHIFT(back);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MHPORCH);
-}
-
-void exynos_mipi_dsi_set_main_disp_sync_area(struct mipi_dsim_device *dsim,
-	unsigned int vert, unsigned int hori)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MSYNC)) &
-		~((DSIM_MAIN_VSA_MASK) | (DSIM_MAIN_HSA_MASK));
-
-	reg |= (DSIM_MAIN_VSA_SHIFT(vert & 0x3ff) |
-		DSIM_MAIN_HSA_SHIFT(hori));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MSYNC);
-}
-
-void exynos_mipi_dsi_set_sub_disp_resol(struct mipi_dsim_device *dsim,
-	unsigned int vert, unsigned int hori)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_SDRESOL)) &
-		~(DSIM_SUB_STANDY_MASK);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SDRESOL);
-
-	reg &= ~(DSIM_SUB_VRESOL_MASK) | ~(DSIM_SUB_HRESOL_MASK);
-	reg |= (DSIM_SUB_VRESOL_SHIFT(vert & 0x7ff) |
-		DSIM_SUB_HRESOL_SHIFT(hori & 0x7ff));
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SDRESOL);
-
-	reg |= DSIM_SUB_STANDY_SHIFT(1);
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SDRESOL);
-}
-
-void exynos_mipi_dsi_init_config(struct mipi_dsim_device *dsim)
-{
-	struct mipi_dsim_config *dsim_config = dsim->dsim_config;
-
-	unsigned int cfg = (readl(dsim->reg_base + EXYNOS_DSIM_CONFIG)) &
-		~((1 << 28) | (0x1f << 20) | (0x3 << 5));
-
-	cfg =	((DSIM_AUTO_FLUSH(dsim_config->auto_flush)) |
-		(DSIM_EOT_DISABLE(dsim_config->eot_disable)) |
-		(DSIM_AUTO_MODE_SHIFT(dsim_config->auto_vertical_cnt)) |
-		(DSIM_HSE_MODE_SHIFT(dsim_config->hse)) |
-		(DSIM_HFP_MODE_SHIFT(dsim_config->hfp)) |
-		(DSIM_HBP_MODE_SHIFT(dsim_config->hbp)) |
-		(DSIM_HSA_MODE_SHIFT(dsim_config->hsa)) |
-		(DSIM_NUM_OF_DATALANE_SHIFT(dsim_config->e_no_data_lane)));
-
-	writel(cfg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-void exynos_mipi_dsi_display_config(struct mipi_dsim_device *dsim,
-				struct mipi_dsim_config *dsim_config)
-{
-	u32 reg = (readl(dsim->reg_base + EXYNOS_DSIM_CONFIG)) &
-		~((0x3 << 26) | (1 << 25) | (0x3 << 18) | (0x7 << 12) |
-		(0x3 << 16) | (0x7 << 8));
-
-	if (dsim_config->e_interface == DSIM_VIDEO)
-		reg |= (1 << 25);
-	else if (dsim_config->e_interface == DSIM_COMMAND)
-		reg &= ~(1 << 25);
-	else {
-		dev_err(dsim->dev, "unknown lcd type.\n");
-		return;
-	}
-
-	/* main lcd */
-	reg |= ((u8) (dsim_config->e_burst_mode) & 0x3) << 26 |
-		((u8) (dsim_config->e_virtual_ch) & 0x3) << 18 |
-		((u8) (dsim_config->e_pixel_format) & 0x7) << 12;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-void exynos_mipi_dsi_enable_lane(struct mipi_dsim_device *dsim, unsigned int lane,
-	unsigned int enable)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_CONFIG);
-
-	if (enable)
-		reg |= DSIM_LANE_ENx(lane);
-	else
-		reg &= ~DSIM_LANE_ENx(lane);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-
-void exynos_mipi_dsi_set_data_lane_number(struct mipi_dsim_device *dsim,
-	unsigned int count)
-{
-	unsigned int cfg;
-
-	/* get the data lane number. */
-	cfg = DSIM_NUM_OF_DATALANE_SHIFT(count);
-
-	writel(cfg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-void exynos_mipi_dsi_enable_afc(struct mipi_dsim_device *dsim, unsigned int enable,
-	unsigned int afc_code)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_PHYACCHR);
-
-	if (enable) {
-		reg |= (1 << 14);
-		reg &= ~(0x7 << 5);
-		reg |= (afc_code & 0x7) << 5;
-	} else
-		reg &= ~(1 << 14);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PHYACCHR);
-}
-
-void exynos_mipi_dsi_enable_pll_bypass(struct mipi_dsim_device *dsim,
-	unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_PLL_BYPASS_SHIFT(0x1));
-
-	reg |= DSIM_PLL_BYPASS_SHIFT(enable);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_set_pll_pms(struct mipi_dsim_device *dsim, unsigned int p,
-	unsigned int m, unsigned int s)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-
-	reg |= ((p & 0x3f) << 13) | ((m & 0x1ff) << 4) | ((s & 0x7) << 1);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_pll_freq_band(struct mipi_dsim_device *dsim,
-		unsigned int freq_band)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(DSIM_FREQ_BAND_SHIFT(0x1f));
-
-	reg |= DSIM_FREQ_BAND_SHIFT(freq_band & 0x1f);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_pll_freq(struct mipi_dsim_device *dsim,
-		unsigned int pre_divider, unsigned int main_divider,
-		unsigned int scaler)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(0x7ffff << 1);
-
-	reg |= (pre_divider & 0x3f) << 13 | (main_divider & 0x1ff) << 4 |
-		(scaler & 0x7) << 1;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_pll_stable_time(struct mipi_dsim_device *dsim,
-	unsigned int lock_time)
-{
-	writel(lock_time, dsim->reg_base + EXYNOS_DSIM_PLLTMR);
-}
-
-void exynos_mipi_dsi_enable_pll(struct mipi_dsim_device *dsim, unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(DSIM_PLL_EN_SHIFT(0x1));
-
-	reg |= DSIM_PLL_EN_SHIFT(enable & 0x1);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_set_byte_clock_src(struct mipi_dsim_device *dsim,
-		unsigned int src)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_BYTE_CLK_SRC_SHIFT(0x3));
-
-	reg |= (DSIM_BYTE_CLK_SRC_SHIFT(src));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_enable_byte_clock(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_BYTE_CLKEN_SHIFT(0x1));
-
-	reg |= DSIM_BYTE_CLKEN_SHIFT(enable);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_set_esc_clk_prs(struct mipi_dsim_device *dsim,
-		unsigned int enable, unsigned int prs_val)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_ESC_CLKEN_SHIFT(0x1) | 0xffff);
-
-	reg |= DSIM_ESC_CLKEN_SHIFT(enable);
-	if (enable)
-		reg |= prs_val;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_enable_esc_clk_on_lane(struct mipi_dsim_device *dsim,
-		unsigned int lane_sel, unsigned int enable)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-
-	if (enable)
-		reg |= DSIM_LANE_ESC_CLKEN(lane_sel);
-	else
-
-		reg &= ~DSIM_LANE_ESC_CLKEN(lane_sel);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_force_dphy_stop_state(struct mipi_dsim_device *dsim,
-	unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE)) &
-		~(DSIM_FORCE_STOP_STATE_SHIFT(0x1));
-
-	reg |= (DSIM_FORCE_STOP_STATE_SHIFT(enable & 0x1));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-unsigned int exynos_mipi_dsi_is_lane_state(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_STATUS);
-
-	/**
-	 * check clock and data lane states.
-	 * if MIPI-DSI controller was enabled at bootloader then
-	 * TX_READY_HS_CLK is enabled otherwise STOP_STATE_CLK.
-	 * so it should be checked for two case.
-	 */
-	if ((reg & DSIM_STOP_STATE_DAT(0xf)) &&
-			((reg & DSIM_STOP_STATE_CLK) ||
-			 (reg & DSIM_TX_READY_HS_CLK)))
-		return 1;
-
-	return 0;
-}
-
-void exynos_mipi_dsi_set_stop_state_counter(struct mipi_dsim_device *dsim,
-		unsigned int cnt_val)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE)) &
-		~(DSIM_STOP_STATE_CNT_SHIFT(0x7ff));
-
-	reg |= (DSIM_STOP_STATE_CNT_SHIFT(cnt_val & 0x7ff));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-void exynos_mipi_dsi_set_bta_timeout(struct mipi_dsim_device *dsim,
-		unsigned int timeout)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_TIMEOUT)) &
-		~(DSIM_BTA_TOUT_SHIFT(0xff));
-
-	reg |= (DSIM_BTA_TOUT_SHIFT(timeout));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_TIMEOUT);
-}
-
-void exynos_mipi_dsi_set_lpdr_timeout(struct mipi_dsim_device *dsim,
-		unsigned int timeout)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_TIMEOUT)) &
-		~(DSIM_LPDR_TOUT_SHIFT(0xffff));
-
-	reg |= (DSIM_LPDR_TOUT_SHIFT(timeout));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_TIMEOUT);
-}
-
-void exynos_mipi_dsi_set_cpu_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int lp)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-
-	reg &= ~DSIM_CMD_LPDT_LP;
-
-	if (lp)
-		reg |= DSIM_CMD_LPDT_LP;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-void exynos_mipi_dsi_set_lcdc_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int lp)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-
-	reg &= ~DSIM_TX_LPDT_LP;
-
-	if (lp)
-		reg |= DSIM_TX_LPDT_LP;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-void exynos_mipi_dsi_enable_hs_clock(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_TX_REQUEST_HSCLK_SHIFT(0x1));
-
-	reg |= DSIM_TX_REQUEST_HSCLK_SHIFT(enable);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_dp_dn_swap(struct mipi_dsim_device *dsim,
-		unsigned int swap_en)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_PHYACCHR1);
-
-	reg &= ~(0x3 << 0);
-	reg |= (swap_en & 0x3) << 0;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PHYACCHR1);
-}
-
-void exynos_mipi_dsi_hs_zero_ctrl(struct mipi_dsim_device *dsim,
-		unsigned int hs_zero)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(0xf << 28);
-
-	reg |= ((hs_zero & 0xf) << 28);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_prep_ctrl(struct mipi_dsim_device *dsim, unsigned int prep)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(0x7 << 20);
-
-	reg |= ((prep & 0x7) << 20);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-unsigned int exynos_mipi_dsi_read_interrupt(struct mipi_dsim_device *dsim)
-{
-	return readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-void exynos_mipi_dsi_clear_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	reg |= src;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-void exynos_mipi_dsi_set_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src, unsigned int enable)
-{
-	unsigned int reg = 0;
-
-	if (enable)
-		reg |= src;
-	else
-		reg &= ~src;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-unsigned int exynos_mipi_dsi_is_pll_stable(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_STATUS);
-
-	return reg & (1 << 31) ? 1 : 0;
-}
-
-unsigned int exynos_mipi_dsi_get_fifo_state(struct mipi_dsim_device *dsim)
-{
-	return readl(dsim->reg_base + EXYNOS_DSIM_FIFOCTRL) & ~(0x1f);
-}
-
-void exynos_mipi_dsi_wr_tx_header(struct mipi_dsim_device *dsim,
-	unsigned int di, unsigned int data0, unsigned int data1)
-{
-	unsigned int reg = (data1 << 16) | (data0 << 8) | ((di & 0x3f) << 0);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PKTHDR);
-}
-
-void exynos_mipi_dsi_rd_tx_header(struct mipi_dsim_device *dsim,
-	unsigned int di, unsigned int data0)
-{
-	unsigned int reg = (data0 << 8) | (di << 0);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PKTHDR);
-}
-
-unsigned int exynos_mipi_dsi_rd_rx_fifo(struct mipi_dsim_device *dsim)
-{
-	return readl(dsim->reg_base + EXYNOS_DSIM_RXFIFO);
-}
-
-unsigned int _exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	return (reg & INTSRC_FRAME_DONE) ? 1 : 0;
-}
-
-void _exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	writel(reg | INTSRC_FRAME_DONE, dsim->reg_base +
-		EXYNOS_DSIM_INTSRC);
-}
-
-void exynos_mipi_dsi_wr_tx_data(struct mipi_dsim_device *dsim,
-		unsigned int tx_data)
-{
-	writel(tx_data, dsim->reg_base + EXYNOS_DSIM_PAYLOAD);
-}
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h b/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h
deleted file mode 100644
index 8546070..0000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi_lowlevel.h
- *
- * Header file for Samsung SoC MIPI-DSI lowlevel driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSI_LOWLEVEL_H
-#define _EXYNOS_MIPI_DSI_LOWLEVEL_H
-
-void exynos_mipi_dsi_func_reset(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_sw_reset(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_sw_reset_release(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_get_sw_reset_release(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_set_interrupt_mask(struct mipi_dsim_device *dsim,
-	unsigned int mode, unsigned int mask);
-void exynos_mipi_dsi_set_data_lane_number(struct mipi_dsim_device *dsim,
-					unsigned int count);
-void exynos_mipi_dsi_init_fifo_pointer(struct mipi_dsim_device *dsim,
-					unsigned int cfg);
-void exynos_mipi_dsi_set_phy_tunning(struct mipi_dsim_device *dsim,
-				unsigned int value);
-void exynos_mipi_dsi_set_phy_tunning(struct mipi_dsim_device *dsim,
-				unsigned int value);
-void exynos_mipi_dsi_set_main_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable);
-void exynos_mipi_dsi_set_main_disp_resol(struct mipi_dsim_device *dsim,
-		unsigned int width_resol, unsigned int height_resol);
-void exynos_mipi_dsi_set_main_disp_vporch(struct mipi_dsim_device *dsim,
-	unsigned int cmd_allow, unsigned int vfront, unsigned int vback);
-void exynos_mipi_dsi_set_main_disp_hporch(struct mipi_dsim_device *dsim,
-			unsigned int front, unsigned int back);
-void exynos_mipi_dsi_set_main_disp_sync_area(struct mipi_dsim_device *dsim,
-				unsigned int vert, unsigned int hori);
-void exynos_mipi_dsi_set_sub_disp_resol(struct mipi_dsim_device *dsim,
-				unsigned int vert, unsigned int hori);
-void exynos_mipi_dsi_init_config(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_display_config(struct mipi_dsim_device *dsim,
-				struct mipi_dsim_config *dsim_config);
-void exynos_mipi_dsi_set_data_lane_number(struct mipi_dsim_device *dsim,
-				unsigned int count);
-void exynos_mipi_dsi_enable_lane(struct mipi_dsim_device *dsim, unsigned int lane,
-				unsigned int enable);
-void exynos_mipi_dsi_enable_afc(struct mipi_dsim_device *dsim, unsigned int enable,
-				unsigned int afc_code);
-void exynos_mipi_dsi_enable_pll_bypass(struct mipi_dsim_device *dsim,
-				unsigned int enable);
-void exynos_mipi_dsi_set_pll_pms(struct mipi_dsim_device *dsim, unsigned int p,
-				unsigned int m, unsigned int s);
-void exynos_mipi_dsi_pll_freq_band(struct mipi_dsim_device *dsim,
-				unsigned int freq_band);
-void exynos_mipi_dsi_pll_freq(struct mipi_dsim_device *dsim,
-			unsigned int pre_divider, unsigned int main_divider,
-			unsigned int scaler);
-void exynos_mipi_dsi_pll_stable_time(struct mipi_dsim_device *dsim,
-			unsigned int lock_time);
-void exynos_mipi_dsi_enable_pll(struct mipi_dsim_device *dsim,
-					unsigned int enable);
-void exynos_mipi_dsi_set_byte_clock_src(struct mipi_dsim_device *dsim,
-					unsigned int src);
-void exynos_mipi_dsi_enable_byte_clock(struct mipi_dsim_device *dsim,
-					unsigned int enable);
-void exynos_mipi_dsi_set_esc_clk_prs(struct mipi_dsim_device *dsim,
-				unsigned int enable, unsigned int prs_val);
-void exynos_mipi_dsi_enable_esc_clk_on_lane(struct mipi_dsim_device *dsim,
-				unsigned int lane_sel, unsigned int enable);
-void exynos_mipi_dsi_force_dphy_stop_state(struct mipi_dsim_device *dsim,
-				unsigned int enable);
-unsigned int exynos_mipi_dsi_is_lane_state(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_set_stop_state_counter(struct mipi_dsim_device *dsim,
-				unsigned int cnt_val);
-void exynos_mipi_dsi_set_bta_timeout(struct mipi_dsim_device *dsim,
-				unsigned int timeout);
-void exynos_mipi_dsi_set_lpdr_timeout(struct mipi_dsim_device *dsim,
-				unsigned int timeout);
-void exynos_mipi_dsi_set_lcdc_transfer_mode(struct mipi_dsim_device *dsim,
-					unsigned int lp);
-void exynos_mipi_dsi_set_cpu_transfer_mode(struct mipi_dsim_device *dsim,
-					unsigned int lp);
-void exynos_mipi_dsi_enable_hs_clock(struct mipi_dsim_device *dsim,
-				unsigned int enable);
-void exynos_mipi_dsi_dp_dn_swap(struct mipi_dsim_device *dsim,
-				unsigned int swap_en);
-void exynos_mipi_dsi_hs_zero_ctrl(struct mipi_dsim_device *dsim,
-				unsigned int hs_zero);
-void exynos_mipi_dsi_prep_ctrl(struct mipi_dsim_device *dsim, unsigned int prep);
-unsigned int exynos_mipi_dsi_read_interrupt(struct mipi_dsim_device *dsim);
-unsigned int exynos_mipi_dsi_read_interrupt_mask(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_clear_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src);
-void exynos_mipi_dsi_set_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src, unsigned int enable);
-unsigned int exynos_mipi_dsi_is_pll_stable(struct mipi_dsim_device *dsim);
-unsigned int exynos_mipi_dsi_get_fifo_state(struct mipi_dsim_device *dsim);
-unsigned int _exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim);
-void _exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_wr_tx_header(struct mipi_dsim_device *dsim, unsigned int di,
-				unsigned int data0, unsigned int data1);
-void exynos_mipi_dsi_wr_tx_data(struct mipi_dsim_device *dsim,
-		unsigned int tx_data);
-void exynos_mipi_dsi_rd_tx_header(struct mipi_dsim_device *dsim,
-		unsigned int data0, unsigned int data1);
-unsigned int exynos_mipi_dsi_rd_rx_fifo(struct mipi_dsim_device *dsim);
-
-#endif /* _EXYNOS_MIPI_DSI_LOWLEVEL_H */
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h b/drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h
deleted file mode 100644
index 4227106..0000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* linux/driver/video/exynos/exynos_mipi_dsi_regs.h
- *
- * Register definition file for Samsung MIPI-DSIM driver
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSI_REGS_H
-#define _EXYNOS_MIPI_DSI_REGS_H
-
-#define EXYNOS_DSIM_STATUS		0x0	/* Status register */
-#define EXYNOS_DSIM_SWRST		0x4	/* Software reset register */
-#define EXYNOS_DSIM_CLKCTRL		0x8	/* Clock control register */
-#define EXYNOS_DSIM_TIMEOUT		0xc	/* Time out register */
-#define EXYNOS_DSIM_CONFIG		0x10	/* Configuration register */
-#define EXYNOS_DSIM_ESCMODE		0x14	/* Escape mode register */
-
-/* Main display image resolution register */
-#define EXYNOS_DSIM_MDRESOL		0x18
-#define EXYNOS_DSIM_MVPORCH		0x1c	/* Main display Vporch register */
-#define EXYNOS_DSIM_MHPORCH		0x20	/* Main display Hporch register */
-#define EXYNOS_DSIM_MSYNC		0x24	/* Main display sync area register */
-
-/* Sub display image resolution register */
-#define EXYNOS_DSIM_SDRESOL		0x28
-#define EXYNOS_DSIM_INTSRC		0x2c	/* Interrupt source register */
-#define EXYNOS_DSIM_INTMSK		0x30	/* Interrupt mask register */
-#define EXYNOS_DSIM_PKTHDR		0x34	/* Packet Header FIFO register */
-#define EXYNOS_DSIM_PAYLOAD		0x38	/* Payload FIFO register */
-#define EXYNOS_DSIM_RXFIFO		0x3c	/* Read FIFO register */
-#define EXYNOS_DSIM_FIFOTHLD		0x40	/* FIFO threshold level register */
-#define EXYNOS_DSIM_FIFOCTRL		0x44	/* FIFO status and control register */
-
-/* FIFO memory AC characteristic register */
-#define EXYNOS_DSIM_PLLCTRL		0x4c	/* PLL control register */
-#define EXYNOS_DSIM_PLLTMR		0x50	/* PLL timer register */
-#define EXYNOS_DSIM_PHYACCHR		0x54	/* D-PHY AC characteristic register */
-#define EXYNOS_DSIM_PHYACCHR1		0x58	/* D-PHY AC characteristic register1 */
-
-/* DSIM_STATUS */
-#define DSIM_STOP_STATE_DAT(x)		(((x) & 0xf) << 0)
-#define DSIM_STOP_STATE_CLK		(1 << 8)
-#define DSIM_TX_READY_HS_CLK		(1 << 10)
-
-/* DSIM_SWRST */
-#define DSIM_FUNCRST			(1 << 16)
-#define DSIM_SWRST			(1 << 0)
-
-/* EXYNOS_DSIM_TIMEOUT */
-#define DSIM_LPDR_TOUT_SHIFT(x)		((x) << 0)
-#define DSIM_BTA_TOUT_SHIFT(x)		((x) << 16)
-
-/* EXYNOS_DSIM_CLKCTRL */
-#define DSIM_LANE_ESC_CLKEN(x)		(((x) & 0x1f) << 19)
-#define DSIM_BYTE_CLKEN_SHIFT(x)	((x) << 24)
-#define DSIM_BYTE_CLK_SRC_SHIFT(x)	((x) <<	25)
-#define DSIM_PLL_BYPASS_SHIFT(x)	((x) <<	27)
-#define DSIM_ESC_CLKEN_SHIFT(x)		((x) << 28)
-#define DSIM_TX_REQUEST_HSCLK_SHIFT(x)	((x) << 31)
-
-/* EXYNOS_DSIM_CONFIG */
-#define DSIM_LANE_ENx(x)		(((x) & 0x1f) << 0)
-#define DSIM_NUM_OF_DATALANE_SHIFT(x)	((x) << 5)
-#define DSIM_HSA_MODE_SHIFT(x)		((x) << 20)
-#define DSIM_HBP_MODE_SHIFT(x)		((x) << 21)
-#define DSIM_HFP_MODE_SHIFT(x)		((x) << 22)
-#define DSIM_HSE_MODE_SHIFT(x)		((x) << 23)
-#define DSIM_AUTO_MODE_SHIFT(x)		((x) << 24)
-#define DSIM_EOT_DISABLE(x)		((x) << 28)
-#define DSIM_AUTO_FLUSH(x)		((x) << 29)
-
-#define DSIM_NUM_OF_DATA_LANE(x)	((x) << DSIM_NUM_OF_DATALANE_SHIFT)
-
-/* EXYNOS_DSIM_ESCMODE */
-#define DSIM_TX_LPDT_LP			(1 << 6)
-#define DSIM_CMD_LPDT_LP		(1 << 7)
-#define DSIM_FORCE_STOP_STATE_SHIFT(x)	((x) << 20)
-#define DSIM_STOP_STATE_CNT_SHIFT(x)	((x) << 21)
-
-/* EXYNOS_DSIM_MDRESOL */
-#define DSIM_MAIN_STAND_BY		(1 << 31)
-#define DSIM_MAIN_VRESOL(x)		(((x) & 0x7ff) << 16)
-#define DSIM_MAIN_HRESOL(x)		(((x) & 0X7ff) << 0)
-
-/* EXYNOS_DSIM_MVPORCH */
-#define DSIM_CMD_ALLOW_SHIFT(x)		((x) << 28)
-#define DSIM_STABLE_VFP_SHIFT(x)	((x) << 16)
-#define DSIM_MAIN_VBP_SHIFT(x)		((x) << 0)
-#define DSIM_CMD_ALLOW_MASK		(0xf << 28)
-#define DSIM_STABLE_VFP_MASK		(0x7ff << 16)
-#define DSIM_MAIN_VBP_MASK		(0x7ff << 0)
-
-/* EXYNOS_DSIM_MHPORCH */
-#define DSIM_MAIN_HFP_SHIFT(x)		((x) << 16)
-#define DSIM_MAIN_HBP_SHIFT(x)		((x) << 0)
-#define DSIM_MAIN_HFP_MASK		((0xffff) << 16)
-#define DSIM_MAIN_HBP_MASK		((0xffff) << 0)
-
-/* EXYNOS_DSIM_MSYNC */
-#define DSIM_MAIN_VSA_SHIFT(x)		((x) << 22)
-#define DSIM_MAIN_HSA_SHIFT(x)		((x) << 0)
-#define DSIM_MAIN_VSA_MASK		((0x3ff) << 22)
-#define DSIM_MAIN_HSA_MASK		((0xffff) << 0)
-
-/* EXYNOS_DSIM_SDRESOL */
-#define DSIM_SUB_STANDY_SHIFT(x)	((x) << 31)
-#define DSIM_SUB_VRESOL_SHIFT(x)	((x) << 16)
-#define DSIM_SUB_HRESOL_SHIFT(x)	((x) << 0)
-#define DSIM_SUB_STANDY_MASK		((0x1) << 31)
-#define DSIM_SUB_VRESOL_MASK		((0x7ff) << 16)
-#define DSIM_SUB_HRESOL_MASK		((0x7ff) << 0)
-
-/* EXYNOS_DSIM_INTSRC */
-#define INTSRC_PLL_STABLE		(1 << 31)
-#define INTSRC_SW_RST_RELEASE		(1 << 30)
-#define INTSRC_SFR_FIFO_EMPTY		(1 << 29)
-#define INTSRC_FRAME_DONE		(1 << 24)
-#define INTSRC_RX_DATA_DONE		(1 << 18)
-
-/* EXYNOS_DSIM_INTMSK */
-#define INTMSK_FIFO_EMPTY		(1 << 29)
-#define INTMSK_BTA			(1 << 25)
-#define INTMSK_FRAME_DONE		(1 << 24)
-#define INTMSK_RX_TIMEOUT		(1 << 21)
-#define INTMSK_BTA_TIMEOUT		(1 << 20)
-#define INTMSK_RX_DONE			(1 << 18)
-#define INTMSK_RX_TE			(1 << 17)
-#define INTMSK_RX_ACK			(1 << 16)
-#define INTMSK_RX_ECC_ERR		(1 << 15)
-#define INTMSK_RX_CRC_ERR		(1 << 14)
-
-/* EXYNOS_DSIM_FIFOCTRL */
-#define SFR_HEADER_EMPTY		(1 << 22)
-
-/* EXYNOS_DSIM_PHYACCHR */
-#define DSIM_AFC_CTL(x)			(((x) & 0x7) << 5)
-
-/* EXYNOS_DSIM_PLLCTRL */
-#define DSIM_PLL_EN_SHIFT(x)		((x) << 23)
-#define DSIM_FREQ_BAND_SHIFT(x)		((x) << 24)
-
-#endif /* _EXYNOS_MIPI_DSI_REGS_H */
diff --git a/drivers/video/fbdev/exynos/s6e8ax0.c b/drivers/video/fbdev/exynos/s6e8ax0.c
deleted file mode 100644
index de2f3e7..0000000
--- a/drivers/video/fbdev/exynos/s6e8ax0.c
+++ /dev/null
@@ -1,887 +0,0 @@
-/* linux/drivers/video/exynos/s6e8ax0.c
- *
- * MIPI-DSI based s6e8ax0 AMOLED lcd 4.65 inch panel driver.
- *
- * Inki Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/ctype.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/lcd.h>
-#include <linux/fb.h>
-#include <linux/backlight.h>
-#include <linux/regulator/consumer.h>
-
-#include <video/mipi_display.h>
-#include <video/exynos_mipi_dsim.h>
-
-#define LDI_MTP_LENGTH		24
-#define DSIM_PM_STABLE_TIME	10
-#define MIN_BRIGHTNESS		0
-#define MAX_BRIGHTNESS		24
-#define GAMMA_TABLE_COUNT	26
-
-#define POWER_IS_ON(pwr)	((pwr) == FB_BLANK_UNBLANK)
-#define POWER_IS_OFF(pwr)	((pwr) == FB_BLANK_POWERDOWN)
-#define POWER_IS_NRM(pwr)	((pwr) == FB_BLANK_NORMAL)
-
-#define lcd_to_master(a)	(a->dsim_dev->master)
-#define lcd_to_master_ops(a)	((lcd_to_master(a))->master_ops)
-
-enum {
-	DSIM_NONE_STATE = 0,
-	DSIM_RESUME_COMPLETE = 1,
-	DSIM_FRAME_DONE = 2,
-};
-
-struct s6e8ax0 {
-	struct device	*dev;
-	unsigned int			power;
-	unsigned int			id;
-	unsigned int			gamma;
-	unsigned int			acl_enable;
-	unsigned int			cur_acl;
-
-	struct lcd_device	*ld;
-	struct backlight_device	*bd;
-
-	struct mipi_dsim_lcd_device	*dsim_dev;
-	struct lcd_platform_data	*ddi_pd;
-	struct mutex			lock;
-	bool  enabled;
-};
-
-
-static struct regulator_bulk_data supplies[] = {
-	{ .supply = "vdd3", },
-	{ .supply = "vci", },
-};
-
-static void s6e8ax0_regulator_enable(struct s6e8ax0 *lcd)
-{
-	int ret = 0;
-	struct lcd_platform_data *pd = NULL;
-
-	pd = lcd->ddi_pd;
-	mutex_lock(&lcd->lock);
-	if (!lcd->enabled) {
-		ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies);
-		if (ret)
-			goto out;
-
-		lcd->enabled = true;
-	}
-	msleep(pd->power_on_delay);
-out:
-	mutex_unlock(&lcd->lock);
-}
-
-static void s6e8ax0_regulator_disable(struct s6e8ax0 *lcd)
-{
-	int ret = 0;
-
-	mutex_lock(&lcd->lock);
-	if (lcd->enabled) {
-		ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies);
-		if (ret)
-			goto out;
-
-		lcd->enabled = false;
-	}
-out:
-	mutex_unlock(&lcd->lock);
-}
-
-static const unsigned char s6e8ax0_22_gamma_30[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xf5, 0x00, 0xff, 0xad, 0xaf,
-	0xbA, 0xc3, 0xd8, 0xc5, 0x9f, 0xc6, 0x9e, 0xc1, 0xdc, 0xc0,
-	0x00, 0x61, 0x00, 0x5a, 0x00, 0x74,
-};
-
-static const unsigned char s6e8ax0_22_gamma_50[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xe8, 0x1f, 0xf7, 0xad, 0xc0,
-	0xb5, 0xc4, 0xdc, 0xc4, 0x9e, 0xc6, 0x9c, 0xbb, 0xd8, 0xbb,
-	0x00, 0x70, 0x00, 0x68, 0x00, 0x86,
-};
-
-static const unsigned char s6e8ax0_22_gamma_60[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xde, 0x1f, 0xef, 0xad, 0xc4,
-	0xb3, 0xc3, 0xdd, 0xc4, 0x9e, 0xc6, 0x9c, 0xbc, 0xd6, 0xba,
-	0x00, 0x75, 0x00, 0x6e, 0x00, 0x8d,
-};
-
-static const unsigned char s6e8ax0_22_gamma_70[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xd8, 0x1f, 0xe7, 0xaf, 0xc8,
-	0xb4, 0xc4, 0xdd, 0xc3, 0x9d, 0xc6, 0x9c, 0xbb, 0xd6, 0xb9,
-	0x00, 0x7a, 0x00, 0x72, 0x00, 0x93,
-};
-
-static const unsigned char s6e8ax0_22_gamma_80[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xc9, 0x1f, 0xde, 0xae, 0xc9,
-	0xb1, 0xc3, 0xdd, 0xc2, 0x9d, 0xc5, 0x9b, 0xbc, 0xd6, 0xbb,
-	0x00, 0x7f, 0x00, 0x77, 0x00, 0x99,
-};
-
-static const unsigned char s6e8ax0_22_gamma_90[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xc7, 0x1f, 0xd9, 0xb0, 0xcc,
-	0xb2, 0xc3, 0xdc, 0xc1, 0x9c, 0xc6, 0x9c, 0xbc, 0xd4, 0xb9,
-	0x00, 0x83, 0x00, 0x7b, 0x00, 0x9e,
-};
-
-static const unsigned char s6e8ax0_22_gamma_100[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xbd, 0x80, 0xcd, 0xba, 0xce,
-	0xb3, 0xc4, 0xde, 0xc3, 0x9c, 0xc4, 0x9, 0xb8, 0xd3, 0xb6,
-	0x00, 0x88, 0x00, 0x80, 0x00, 0xa5,
-};
-
-static const unsigned char s6e8ax0_22_gamma_120[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb9, 0x95, 0xc8, 0xb1, 0xcf,
-	0xb2, 0xc6, 0xdf, 0xc5, 0x9b, 0xc3, 0x99, 0xb6, 0xd2, 0xb6,
-	0x00, 0x8f, 0x00, 0x86, 0x00, 0xac,
-};
-
-static const unsigned char s6e8ax0_22_gamma_130[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb7, 0xa0, 0xc7, 0xb1, 0xd0,
-	0xb2, 0xc4, 0xdd, 0xc3, 0x9a, 0xc3, 0x98, 0xb6, 0xd0, 0xb4,
-	0x00, 0x92, 0x00, 0x8a, 0x00, 0xb1,
-};
-
-static const unsigned char s6e8ax0_22_gamma_140[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb7, 0xa0, 0xc5, 0xb2, 0xd0,
-	0xb3, 0xc3, 0xde, 0xc3, 0x9b, 0xc2, 0x98, 0xb6, 0xd0, 0xb4,
-	0x00, 0x95, 0x00, 0x8d, 0x00, 0xb5,
-};
-
-static const unsigned char s6e8ax0_22_gamma_150[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb3, 0xa0, 0xc2, 0xb2, 0xd0,
-	0xb2, 0xc1, 0xdd, 0xc2, 0x9b, 0xc2, 0x98, 0xb4, 0xcf, 0xb1,
-	0x00, 0x99, 0x00, 0x90, 0x00, 0xba,
-};
-
-static const unsigned char s6e8ax0_22_gamma_160[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xa5, 0xbf, 0xb0, 0xd0,
-	0xb1, 0xc3, 0xde, 0xc2, 0x99, 0xc1, 0x97, 0xb4, 0xce, 0xb1,
-	0x00, 0x9c, 0x00, 0x93, 0x00, 0xbe,
-};
-
-static const unsigned char s6e8ax0_22_gamma_170[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb5, 0xbf, 0xb1, 0xd1,
-	0xb1, 0xc3, 0xde, 0xc3, 0x99, 0xc0, 0x96, 0xb4, 0xce, 0xb1,
-	0x00, 0x9f, 0x00, 0x96, 0x00, 0xc2,
-};
-
-static const unsigned char s6e8ax0_22_gamma_180[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb7, 0xbe, 0xb3, 0xd2,
-	0xb3, 0xc3, 0xde, 0xc2, 0x97, 0xbf, 0x95, 0xb4, 0xcd, 0xb1,
-	0x00, 0xa2, 0x00, 0x99, 0x00, 0xc5,
-};
-
-static const unsigned char s6e8ax0_22_gamma_190[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb9, 0xbe, 0xb2, 0xd2,
-	0xb2, 0xc3, 0xdd, 0xc3, 0x98, 0xbf, 0x95, 0xb2, 0xcc, 0xaf,
-	0x00, 0xa5, 0x00, 0x9c, 0x00, 0xc9,
-};
-
-static const unsigned char s6e8ax0_22_gamma_200[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb9, 0xbc, 0xb2, 0xd2,
-	0xb1, 0xc4, 0xdd, 0xc3, 0x97, 0xbe, 0x95, 0xb1, 0xcb, 0xae,
-	0x00, 0xa8, 0x00, 0x9f, 0x00, 0xcd,
-};
-
-static const unsigned char s6e8ax0_22_gamma_210[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xc1, 0xbd, 0xb1, 0xd1,
-	0xb1, 0xc2, 0xde, 0xc2, 0x97, 0xbe, 0x94, 0xB0, 0xc9, 0xad,
-	0x00, 0xae, 0x00, 0xa4, 0x00, 0xd4,
-};
-
-static const unsigned char s6e8ax0_22_gamma_220[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xc7, 0xbd, 0xb1, 0xd1,
-	0xb1, 0xc2, 0xdd, 0xc2, 0x97, 0xbd, 0x94, 0xb0, 0xc9, 0xad,
-	0x00, 0xad, 0x00, 0xa2, 0x00, 0xd3,
-};
-
-static const unsigned char s6e8ax0_22_gamma_230[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xc3, 0xbd, 0xb2, 0xd1,
-	0xb1, 0xc3, 0xdd, 0xc1, 0x96, 0xbd, 0x94, 0xb0, 0xc9, 0xad,
-	0x00, 0xb0, 0x00, 0xa7, 0x00, 0xd7,
-};
-
-static const unsigned char s6e8ax0_22_gamma_240[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xcb, 0xbd, 0xb1, 0xd2,
-	0xb1, 0xc3, 0xdD, 0xc2, 0x95, 0xbd, 0x93, 0xaf, 0xc8, 0xab,
-	0x00, 0xb3, 0x00, 0xa9, 0x00, 0xdb,
-};
-
-static const unsigned char s6e8ax0_22_gamma_250[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb3, 0xcc, 0xbe, 0xb0, 0xd2,
-	0xb0, 0xc3, 0xdD, 0xc2, 0x94, 0xbc, 0x92, 0xae, 0xc8, 0xab,
-	0x00, 0xb6, 0x00, 0xab, 0x00, 0xde,
-};
-
-static const unsigned char s6e8ax0_22_gamma_260[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb3, 0xd0, 0xbe, 0xaf, 0xd1,
-	0xaf, 0xc2, 0xdd, 0xc1, 0x96, 0xbc, 0x93, 0xaf, 0xc8, 0xac,
-	0x00, 0xb7, 0x00, 0xad, 0x00, 0xe0,
-};
-
-static const unsigned char s6e8ax0_22_gamma_270[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb2, 0xcF, 0xbd, 0xb0, 0xd2,
-	0xaf, 0xc2, 0xdc, 0xc1, 0x95, 0xbd, 0x93, 0xae, 0xc6, 0xaa,
-	0x00, 0xba, 0x00, 0xb0, 0x00, 0xe4,
-};
-
-static const unsigned char s6e8ax0_22_gamma_280[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb2, 0xd0, 0xbd, 0xaf, 0xd0,
-	0xad, 0xc4, 0xdd, 0xc3, 0x95, 0xbd, 0x93, 0xac, 0xc5, 0xa9,
-	0x00, 0xbd, 0x00, 0xb2, 0x00, 0xe7,
-};
-
-static const unsigned char s6e8ax0_22_gamma_300[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb5, 0xd3, 0xbd, 0xb1, 0xd2,
-	0xb0, 0xc0, 0xdc, 0xc0, 0x94, 0xba, 0x91, 0xac, 0xc5, 0xa9,
-	0x00, 0xc2, 0x00, 0xb7, 0x00, 0xed,
-};
-
-static const unsigned char *s6e8ax0_22_gamma_table[] = {
-	s6e8ax0_22_gamma_30,
-	s6e8ax0_22_gamma_50,
-	s6e8ax0_22_gamma_60,
-	s6e8ax0_22_gamma_70,
-	s6e8ax0_22_gamma_80,
-	s6e8ax0_22_gamma_90,
-	s6e8ax0_22_gamma_100,
-	s6e8ax0_22_gamma_120,
-	s6e8ax0_22_gamma_130,
-	s6e8ax0_22_gamma_140,
-	s6e8ax0_22_gamma_150,
-	s6e8ax0_22_gamma_160,
-	s6e8ax0_22_gamma_170,
-	s6e8ax0_22_gamma_180,
-	s6e8ax0_22_gamma_190,
-	s6e8ax0_22_gamma_200,
-	s6e8ax0_22_gamma_210,
-	s6e8ax0_22_gamma_220,
-	s6e8ax0_22_gamma_230,
-	s6e8ax0_22_gamma_240,
-	s6e8ax0_22_gamma_250,
-	s6e8ax0_22_gamma_260,
-	s6e8ax0_22_gamma_270,
-	s6e8ax0_22_gamma_280,
-	s6e8ax0_22_gamma_300,
-};
-
-static void s6e8ax0_panel_cond(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-
-	static const unsigned char data_to_send[] = {
-		0xf8, 0x3d, 0x35, 0x00, 0x00, 0x00, 0x93, 0x00, 0x3c, 0x7d,
-		0x08, 0x27, 0x7d, 0x3f, 0x00, 0x00, 0x00, 0x20, 0x04, 0x08,
-		0x6e, 0x00, 0x00, 0x00, 0x02, 0x08, 0x08, 0x23, 0x23, 0xc0,
-		0xc8, 0x08, 0x48, 0xc1, 0x00, 0xc1, 0xff, 0xff, 0xc8
-	};
-	static const unsigned char data_to_send_panel_reverse[] = {
-		0xf8, 0x19, 0x35, 0x00, 0x00, 0x00, 0x93, 0x00, 0x3c, 0x7d,
-		0x08, 0x27, 0x7d, 0x3f, 0x00, 0x00, 0x00, 0x20, 0x04, 0x08,
-		0x6e, 0x00, 0x00, 0x00, 0x02, 0x08, 0x08, 0x23, 0x23, 0xc0,
-		0xc1, 0x01, 0x41, 0xc1, 0x00, 0xc1, 0xf6, 0xf6, 0xc1
-	};
-
-	if (lcd->dsim_dev->panel_reverse)
-		ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-				data_to_send_panel_reverse,
-				ARRAY_SIZE(data_to_send_panel_reverse));
-	else
-		ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-				data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_display_cond(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf2, 0x80, 0x03, 0x0d
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-/* Gamma 2.2 Setting (200cd, 7500K, 10MPCD) */
-static void s6e8ax0_gamma_cond(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	unsigned int gamma = lcd->bd->props.brightness;
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-			s6e8ax0_22_gamma_table[gamma],
-			GAMMA_TABLE_COUNT);
-}
-
-static void s6e8ax0_gamma_update(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf7, 0x03
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE_PARAM, data_to_send,
-		ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond1(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xd1, 0xfe, 0x80, 0x00, 0x01, 0x0b, 0x00, 0x00, 0x40,
-		0x0d, 0x00, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond2(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xb6, 0x0c, 0x02, 0x03, 0x32, 0xff, 0x44, 0x44, 0xc0,
-		0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond3(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe1, 0x10, 0x1c, 0x17, 0x08, 0x1d
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond4(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe2, 0xed, 0x07, 0xc3, 0x13, 0x0d, 0x03
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond5(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf4, 0xcf, 0x0a, 0x12, 0x10, 0x19, 0x33, 0x02
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-static void s6e8ax0_etc_cond6(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe3, 0x40
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE_PARAM,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond7(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe4, 0x00, 0x00, 0x14, 0x80, 0x00, 0x00, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_elvss_set(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xb1, 0x04, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_elvss_nvm_set(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xd9, 0x5c, 0x20, 0x0c, 0x0f, 0x41, 0x00, 0x10, 0x11,
-		0x12, 0xd1, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcb, 0xed,
-		0x64, 0xaf
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_sleep_in(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x10, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_sleep_out(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x11, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_display_on(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x29, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_display_off(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x28, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_apply_level2_key(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf0, 0x5a, 0x5a
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_acl_on(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xc0, 0x01
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_acl_off(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xc0, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-/* Full white 50% reducing setting */
-static void s6e8ax0_acl_ctrl_set(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	/* Full white 50% reducing setting */
-	static const unsigned char cutoff_50[] = {
-		0xc1, 0x47, 0x53, 0x13, 0x53, 0x00, 0x00, 0x02, 0xcf,
-		0x00, 0x00, 0x04, 0xff,	0x00, 0x00, 0x00, 0x00, 0x00,
-		0x01, 0x08, 0x0f, 0x16, 0x1d, 0x24, 0x2a, 0x31, 0x38,
-		0x3f, 0x46
-	};
-	/* Full white 45% reducing setting */
-	static const unsigned char cutoff_45[] = {
-		0xc1, 0x47, 0x53, 0x13, 0x53, 0x00, 0x00, 0x02, 0xcf,
-		0x00, 0x00, 0x04, 0xff,	0x00, 0x00, 0x00, 0x00, 0x00,
-		0x01, 0x07, 0x0d, 0x13, 0x19, 0x1f, 0x25, 0x2b, 0x31,
-		0x37, 0x3d
-	};
-	/* Full white 40% reducing setting */
-	static const unsigned char cutoff_40[] = {
-		0xc1, 0x47, 0x53, 0x13, 0x53, 0x00, 0x00, 0x02, 0xcf,
-		0x00, 0x00, 0x04, 0xff,	0x00, 0x00, 0x00, 0x00, 0x00,
-		0x01, 0x06, 0x0c, 0x11, 0x16, 0x1c, 0x21, 0x26, 0x2b,
-		0x31, 0x36
-	};
-
-	if (lcd->acl_enable) {
-		if (lcd->cur_acl == 0) {
-			if (lcd->gamma == 0 || lcd->gamma == 1) {
-				s6e8ax0_acl_off(lcd);
-				dev_dbg(&lcd->ld->dev,
-					"cur_acl=%d\n", lcd->cur_acl);
-			} else
-				s6e8ax0_acl_on(lcd);
-		}
-		switch (lcd->gamma) {
-		case 0: /* 30cd */
-			s6e8ax0_acl_off(lcd);
-			lcd->cur_acl = 0;
-			break;
-		case 1 ... 3: /* 50cd ~ 90cd */
-			ops->cmd_write(lcd_to_master(lcd),
-				MIPI_DSI_DCS_LONG_WRITE,
-				cutoff_40,
-				ARRAY_SIZE(cutoff_40));
-			lcd->cur_acl = 40;
-			break;
-		case 4 ... 7: /* 120cd ~ 210cd */
-			ops->cmd_write(lcd_to_master(lcd),
-				MIPI_DSI_DCS_LONG_WRITE,
-				cutoff_45,
-				ARRAY_SIZE(cutoff_45));
-			lcd->cur_acl = 45;
-			break;
-		case 8 ... 10: /* 220cd ~ 300cd */
-			ops->cmd_write(lcd_to_master(lcd),
-				MIPI_DSI_DCS_LONG_WRITE,
-				cutoff_50,
-				ARRAY_SIZE(cutoff_50));
-			lcd->cur_acl = 50;
-			break;
-		default:
-			break;
-		}
-	} else {
-		s6e8ax0_acl_off(lcd);
-		lcd->cur_acl = 0;
-		dev_dbg(&lcd->ld->dev, "cur_acl = %d\n", lcd->cur_acl);
-	}
-}
-
-static void s6e8ax0_read_id(struct s6e8ax0 *lcd, u8 *mtp_id)
-{
-	unsigned int ret;
-	unsigned int addr = 0xd1;	/* MTP ID */
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-
-	ret = ops->cmd_read(lcd_to_master(lcd),
-			MIPI_DSI_GENERIC_READ_REQUEST_1_PARAM,
-			addr, 3, mtp_id);
-}
-
-static int s6e8ax0_panel_init(struct s6e8ax0 *lcd)
-{
-	s6e8ax0_apply_level2_key(lcd);
-	s6e8ax0_sleep_out(lcd);
-	msleep(1);
-	s6e8ax0_panel_cond(lcd);
-	s6e8ax0_display_cond(lcd);
-	s6e8ax0_gamma_cond(lcd);
-	s6e8ax0_gamma_update(lcd);
-
-	s6e8ax0_etc_cond1(lcd);
-	s6e8ax0_etc_cond2(lcd);
-	s6e8ax0_etc_cond3(lcd);
-	s6e8ax0_etc_cond4(lcd);
-	s6e8ax0_etc_cond5(lcd);
-	s6e8ax0_etc_cond6(lcd);
-	s6e8ax0_etc_cond7(lcd);
-
-	s6e8ax0_elvss_nvm_set(lcd);
-	s6e8ax0_elvss_set(lcd);
-
-	s6e8ax0_acl_ctrl_set(lcd);
-	s6e8ax0_acl_on(lcd);
-
-	/* if ID3 value is not 33h, branch private elvss mode */
-	msleep(lcd->ddi_pd->power_on_delay);
-
-	return 0;
-}
-
-static int s6e8ax0_update_gamma_ctrl(struct s6e8ax0 *lcd, int brightness)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-			s6e8ax0_22_gamma_table[brightness],
-			ARRAY_SIZE(s6e8ax0_22_gamma_table));
-
-	/* update gamma table. */
-	s6e8ax0_gamma_update(lcd);
-	lcd->gamma = brightness;
-
-	return 0;
-}
-
-static int s6e8ax0_gamma_ctrl(struct s6e8ax0 *lcd, int gamma)
-{
-	s6e8ax0_update_gamma_ctrl(lcd, gamma);
-
-	return 0;
-}
-
-static int s6e8ax0_set_power(struct lcd_device *ld, int power)
-{
-	struct s6e8ax0 *lcd = lcd_get_data(ld);
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	int ret = 0;
-
-	if (power != FB_BLANK_UNBLANK && power != FB_BLANK_POWERDOWN &&
-			power != FB_BLANK_NORMAL) {
-		dev_err(lcd->dev, "power value should be 0, 1 or 4.\n");
-		return -EINVAL;
-	}
-
-	if ((power == FB_BLANK_UNBLANK) && ops->set_blank_mode) {
-		/* LCD power on */
-		if ((POWER_IS_ON(power) && POWER_IS_OFF(lcd->power))
-			|| (POWER_IS_ON(power) && POWER_IS_NRM(lcd->power))) {
-			ret = ops->set_blank_mode(lcd_to_master(lcd), power);
-			if (!ret && lcd->power != power)
-				lcd->power = power;
-		}
-	} else if ((power == FB_BLANK_POWERDOWN) && ops->set_early_blank_mode) {
-		/* LCD power off */
-		if ((POWER_IS_OFF(power) && POWER_IS_ON(lcd->power)) ||
-		(POWER_IS_ON(lcd->power) && POWER_IS_NRM(power))) {
-			ret = ops->set_early_blank_mode(lcd_to_master(lcd),
-							power);
-			if (!ret && lcd->power != power)
-				lcd->power = power;
-		}
-	}
-
-	return ret;
-}
-
-static int s6e8ax0_get_power(struct lcd_device *ld)
-{
-	struct s6e8ax0 *lcd = lcd_get_data(ld);
-
-	return lcd->power;
-}
-
-static int s6e8ax0_set_brightness(struct backlight_device *bd)
-{
-	int ret = 0, brightness = bd->props.brightness;
-	struct s6e8ax0 *lcd = bl_get_data(bd);
-
-	if (brightness < MIN_BRIGHTNESS ||
-		brightness > bd->props.max_brightness) {
-		dev_err(lcd->dev, "lcd brightness should be %d to %d.\n",
-			MIN_BRIGHTNESS, MAX_BRIGHTNESS);
-		return -EINVAL;
-	}
-
-	ret = s6e8ax0_gamma_ctrl(lcd, brightness);
-	if (ret) {
-		dev_err(&bd->dev, "lcd brightness setting failed.\n");
-		return -EIO;
-	}
-
-	return ret;
-}
-
-static struct lcd_ops s6e8ax0_lcd_ops = {
-	.set_power = s6e8ax0_set_power,
-	.get_power = s6e8ax0_get_power,
-};
-
-static const struct backlight_ops s6e8ax0_backlight_ops = {
-	.update_status = s6e8ax0_set_brightness,
-};
-
-static void s6e8ax0_power_on(struct mipi_dsim_lcd_device *dsim_dev, int power)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	msleep(lcd->ddi_pd->power_on_delay);
-
-	/* lcd power on */
-	if (power)
-		s6e8ax0_regulator_enable(lcd);
-	else
-		s6e8ax0_regulator_disable(lcd);
-
-	msleep(lcd->ddi_pd->reset_delay);
-
-	/* lcd reset */
-	if (lcd->ddi_pd->reset)
-		lcd->ddi_pd->reset(lcd->ld);
-	msleep(5);
-}
-
-static void s6e8ax0_set_sequence(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	s6e8ax0_panel_init(lcd);
-	s6e8ax0_display_on(lcd);
-
-	lcd->power = FB_BLANK_UNBLANK;
-}
-
-static int s6e8ax0_probe(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd;
-	int ret;
-	u8 mtp_id[3] = {0, };
-
-	lcd = devm_kzalloc(&dsim_dev->dev, sizeof(struct s6e8ax0), GFP_KERNEL);
-	if (!lcd) {
-		dev_err(&dsim_dev->dev, "failed to allocate s6e8ax0 structure.\n");
-		return -ENOMEM;
-	}
-
-	lcd->dsim_dev = dsim_dev;
-	lcd->ddi_pd = (struct lcd_platform_data *)dsim_dev->platform_data;
-	lcd->dev = &dsim_dev->dev;
-
-	mutex_init(&lcd->lock);
-
-	ret = devm_regulator_bulk_get(lcd->dev, ARRAY_SIZE(supplies), supplies);
-	if (ret) {
-		dev_err(lcd->dev, "Failed to get regulators: %d\n", ret);
-		return ret;
-	}
-
-	lcd->ld = devm_lcd_device_register(lcd->dev, "s6e8ax0", lcd->dev, lcd,
-			&s6e8ax0_lcd_ops);
-	if (IS_ERR(lcd->ld)) {
-		dev_err(lcd->dev, "failed to register lcd ops.\n");
-		return PTR_ERR(lcd->ld);
-	}
-
-	lcd->bd = devm_backlight_device_register(lcd->dev, "s6e8ax0-bl",
-				lcd->dev, lcd, &s6e8ax0_backlight_ops, NULL);
-	if (IS_ERR(lcd->bd)) {
-		dev_err(lcd->dev, "failed to register backlight ops.\n");
-		return PTR_ERR(lcd->bd);
-	}
-
-	lcd->bd->props.max_brightness = MAX_BRIGHTNESS;
-	lcd->bd->props.brightness = MAX_BRIGHTNESS;
-
-	s6e8ax0_read_id(lcd, mtp_id);
-	if (mtp_id[0] == 0x00)
-		dev_err(lcd->dev, "read id failed\n");
-
-	dev_info(lcd->dev, "Read ID : %x, %x, %x\n",
-			mtp_id[0], mtp_id[1], mtp_id[2]);
-
-	if (mtp_id[2] == 0x33)
-		dev_info(lcd->dev,
-			"ID-3 is 0xff does not support dynamic elvss\n");
-	else
-		dev_info(lcd->dev,
-			"ID-3 is 0x%x support dynamic elvss\n", mtp_id[2]);
-
-	lcd->acl_enable = 1;
-	lcd->cur_acl = 0;
-
-	dev_set_drvdata(&dsim_dev->dev, lcd);
-
-	dev_dbg(lcd->dev, "probed s6e8ax0 panel driver.\n");
-
-	return 0;
-}
-
-static int __maybe_unused s6e8ax0_suspend(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	s6e8ax0_sleep_in(lcd);
-	msleep(lcd->ddi_pd->power_off_delay);
-	s6e8ax0_display_off(lcd);
-
-	s6e8ax0_regulator_disable(lcd);
-
-	return 0;
-}
-
-static int __maybe_unused s6e8ax0_resume(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	s6e8ax0_sleep_out(lcd);
-	msleep(lcd->ddi_pd->power_on_delay);
-
-	s6e8ax0_regulator_enable(lcd);
-	s6e8ax0_set_sequence(dsim_dev);
-
-	return 0;
-}
-
-static struct mipi_dsim_lcd_driver s6e8ax0_dsim_ddi_driver = {
-	.name = "s6e8ax0",
-	.id = -1,
-
-	.power_on = s6e8ax0_power_on,
-	.set_sequence = s6e8ax0_set_sequence,
-	.probe = s6e8ax0_probe,
-	.suspend = IS_ENABLED(CONFIG_PM) ? s6e8ax0_suspend : NULL,
-	.resume = IS_ENABLED(CONFIG_PM) ? s6e8ax0_resume : NULL,
-};
-
-static int s6e8ax0_init(void)
-{
-	exynos_mipi_dsi_register_lcd_driver(&s6e8ax0_dsim_ddi_driver);
-
-	return 0;
-}
-
-static void s6e8ax0_exit(void)
-{
-	return;
-}
-
-module_init(s6e8ax0_init);
-module_exit(s6e8ax0_exit);
-
-MODULE_AUTHOR("Donghwa Lee <dh09.lee@samsung.com>");
-MODULE_AUTHOR("Inki Dae <inki.dae@samsung.com>");
-MODULE_DESCRIPTION("MIPI-DSI based s6e8ax0 AMOLED LCD Panel Driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/hecubafb.c b/drivers/video/fbdev/hecubafb.c
index e4031ef..8577195 100644
--- a/drivers/video/fbdev/hecubafb.c
+++ b/drivers/video/fbdev/hecubafb.c
@@ -47,7 +47,7 @@
 #define DPY_W 600
 #define DPY_H 800
 
-static struct fb_fix_screeninfo hecubafb_fix = {
+static const struct fb_fix_screeninfo hecubafb_fix = {
 	.id =		"hecubafb",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_MONO01,
@@ -58,7 +58,7 @@
 	.accel =	FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo hecubafb_var = {
+static const struct fb_var_screeninfo hecubafb_var = {
 	.xres		= DPY_W,
 	.yres		= DPY_H,
 	.xres_virtual	= DPY_W,
diff --git a/drivers/video/fbdev/hgafb.c b/drivers/video/fbdev/hgafb.c
index 15d3ccf..4630285 100644
--- a/drivers/video/fbdev/hgafb.c
+++ b/drivers/video/fbdev/hgafb.c
@@ -106,7 +106,7 @@
 
 /* Framebuffer driver structures */
 
-static struct fb_var_screeninfo hga_default_var = {
+static const struct fb_var_screeninfo hga_default_var = {
 	.xres		= 720,
 	.yres 		= 348,
 	.xres_virtual 	= 720,
diff --git a/drivers/video/fbdev/i740fb.c b/drivers/video/fbdev/i740fb.c
index cf5ccd0..7bc5f60 100644
--- a/drivers/video/fbdev/i740fb.c
+++ b/drivers/video/fbdev/i740fb.c
@@ -82,7 +82,7 @@
 #define DACSPEED24_SD	128
 #define DACSPEED32	86
 
-static struct fb_fix_screeninfo i740fb_fix = {
+static const struct fb_fix_screeninfo i740fb_fix = {
 	.id =		"i740fb",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_TRUECOLOR,
diff --git a/drivers/video/fbdev/i810/i810_main.c b/drivers/video/fbdev/i810/i810_main.c
index 025b882..483ab25 100644
--- a/drivers/video/fbdev/i810/i810_main.c
+++ b/drivers/video/fbdev/i810/i810_main.c
@@ -1691,7 +1691,7 @@
 	if (!(par->i810_gtt.i810_cursor_memory = 
 	      agp_allocate_memory(bridge, par->cursor_heap.size >> 12,
 				  AGP_PHYSICAL_MEMORY))) {
-		printk("i810fb_alloc_cursormem:  can't allocate" 
+		printk("i810fb_alloc_cursormem:  can't allocate "
 		       "cursor memory\n");
 		agp_backend_release(bridge);
 		return -ENOMEM;
diff --git a/drivers/video/fbdev/intelfb/intelfbdrv.c b/drivers/video/fbdev/intelfb/intelfbdrv.c
index bf20744..ff2a5d2 100644
--- a/drivers/video/fbdev/intelfb/intelfbdrv.c
+++ b/drivers/video/fbdev/intelfb/intelfbdrv.c
@@ -1301,11 +1301,6 @@
 		break;
 	}
 
-	if (v.xoffset < 0)
-		v.xoffset = 0;
-	if (v.yoffset < 0)
-		v.yoffset = 0;
-
 	if (v.xoffset > v.xres_virtual - v.xres)
 		v.xoffset = v.xres_virtual - v.xres;
 	if (v.yoffset > v.yres_virtual - v.yres)
diff --git a/drivers/video/fbdev/kyro/fbdev.c b/drivers/video/fbdev/kyro/fbdev.c
index 5bb0153..f77478f 100644
--- a/drivers/video/fbdev/kyro/fbdev.c
+++ b/drivers/video/fbdev/kyro/fbdev.c
@@ -44,7 +44,7 @@
 	.accel		= FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo kyro_var = {
+static const struct fb_var_screeninfo kyro_var = {
 	/* 640x480, 16bpp @ 60 Hz */
 	.xres		= 640,
 	.yres		= 480,
diff --git a/drivers/video/fbdev/matrox/matroxfb_Ti3026.c b/drivers/video/fbdev/matrox/matroxfb_Ti3026.c
index 195ad7c..68fa037 100644
--- a/drivers/video/fbdev/matrox/matroxfb_Ti3026.c
+++ b/drivers/video/fbdev/matrox/matroxfb_Ti3026.c
@@ -372,7 +372,7 @@
 
 	DBG(__func__)
 
-	memcpy(hw->DACreg, MGADACbpp32, sizeof(hw->DACreg));
+	memcpy(hw->DACreg, MGADACbpp32, sizeof(MGADACbpp32));
 	switch (minfo->fbcon.var.bits_per_pixel) {
 		case 4:	hw->DACreg[POS3026_XLATCHCTRL] = TVP3026_XLATCHCTRL_16_1;	/* or _8_1, they are same */
 			hw->DACreg[POS3026_XTRUECOLORCTRL] = TVP3026_XTRUECOLORCTRL_PSEUDOCOLOR;
diff --git a/drivers/video/fbdev/matrox/matroxfb_g450.c b/drivers/video/fbdev/matrox/matroxfb_g450.c
index cff0546..f108ae6 100644
--- a/drivers/video/fbdev/matrox/matroxfb_g450.c
+++ b/drivers/video/fbdev/matrox/matroxfb_g450.c
@@ -433,7 +433,7 @@
 		0x00,	/* 3E written multiple times */
 		0x00,	/* 3F not written */
 	} };
-	static struct mavenregs ntscregs = { {
+	static const struct mavenregs ntscregs = { {
 		0x21, 0xF0, 0x7C, 0x1F,	/* 00: chroma subcarrier */
 		0x00,
 		0x00,	/* test */
diff --git a/drivers/video/fbdev/mb862xx/mb862xx-i2c.c b/drivers/video/fbdev/mb862xx/mb862xx-i2c.c
index c87e17a..ba96c44 100644
--- a/drivers/video/fbdev/mb862xx/mb862xx-i2c.c
+++ b/drivers/video/fbdev/mb862xx/mb862xx-i2c.c
@@ -157,17 +157,10 @@
 
 int mb862xx_i2c_init(struct mb862xxfb_par *par)
 {
-	int ret;
-
 	mb862xx_i2c_adapter.algo_data = par;
 	par->adap = &mb862xx_i2c_adapter;
 
-	ret = i2c_add_adapter(par->adap);
-	if (ret < 0) {
-		dev_err(par->dev, "failed to add %s\n",
-			mb862xx_i2c_adapter.name);
-	}
-	return ret;
+	return i2c_add_adapter(par->adap);
 }
 
 void mb862xx_i2c_exit(struct mb862xxfb_par *par)
diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c
index f91b1db..8778e01 100644
--- a/drivers/video/fbdev/mx3fb.c
+++ b/drivers/video/fbdev/mx3fb.c
@@ -845,7 +845,7 @@
 		if (fbi->var.sync & FB_SYNC_SHARP_MODE)
 			mode = IPU_PANEL_SHARP_TFT;
 
-		dev_dbg(fbi->device, "pixclock = %ul Hz\n",
+		dev_dbg(fbi->device, "pixclock = %u Hz\n",
 			(u32) (PICOS2KHZ(fbi->var.pixclock) * 1000UL));
 
 		if (sdc_init_panel(mx3fb, mode,
diff --git a/drivers/video/fbdev/mxsfb.c b/drivers/video/fbdev/mxsfb.c
index 4e6608c..7846f0e 100644
--- a/drivers/video/fbdev/mxsfb.c
+++ b/drivers/video/fbdev/mxsfb.c
@@ -800,6 +800,7 @@
 			struct fb_videomode *vmode)
 {
 	int ret;
+	struct device *dev = &host->pdev->dev;
 	struct fb_info *fb_info = &host->fb_info;
 	struct fb_var_screeninfo *var = &fb_info->var;
 	dma_addr_t fb_phys;
@@ -825,12 +826,10 @@
 
 	/* Memory allocation for framebuffer */
 	fb_size = SZ_2M;
-	fb_virt = alloc_pages_exact(fb_size, GFP_DMA);
+	fb_virt = dma_alloc_wc(dev, PAGE_ALIGN(fb_size), &fb_phys, GFP_KERNEL);
 	if (!fb_virt)
 		return -ENOMEM;
 
-	fb_phys = virt_to_phys(fb_virt);
-
 	fb_info->fix.smem_start = fb_phys;
 	fb_info->screen_base = fb_virt;
 	fb_info->screen_size = fb_info->fix.smem_len = fb_size;
@@ -843,9 +842,11 @@
 
 static void mxsfb_free_videomem(struct mxsfb_info *host)
 {
+	struct device *dev = &host->pdev->dev;
 	struct fb_info *fb_info = &host->fb_info;
 
-	free_pages_exact(fb_info->screen_base, fb_info->fix.smem_len);
+	dma_free_wc(dev, fb_info->screen_size, fb_info->screen_base,
+		    fb_info->fix.smem_start);
 }
 
 static const struct platform_device_id mxsfb_devtype[] = {
diff --git a/drivers/video/fbdev/offb.c b/drivers/video/fbdev/offb.c
index fb60a8f..906c6e7 100644
--- a/drivers/video/fbdev/offb.c
+++ b/drivers/video/fbdev/offb.c
@@ -625,6 +625,21 @@
 	if (address == OF_BAD_ADDR && addr_prop)
 		address = (u64)addr_prop;
 	if (address != OF_BAD_ADDR) {
+#ifdef CONFIG_PCI
+		const __be32 *vidp, *didp;
+		u32 vid, did;
+		struct pci_dev *pdev;
+
+		vidp = of_get_property(dp, "vendor-id", NULL);
+		didp = of_get_property(dp, "device-id", NULL);
+		if (vidp && didp) {
+			vid = be32_to_cpup(vidp);
+			did = be32_to_cpup(didp);
+			pdev = pci_get_device(vid, did, NULL);
+			if (!pdev || pci_enable_device(pdev))
+				return;
+		}
+#endif
 		/* kludge for valkyrie */
 		if (strcmp(dp->name, "valkyrie") == 0)
 			address += 0x1000;
diff --git a/drivers/video/fbdev/omap/lcd_mipid.c b/drivers/video/fbdev/omap/lcd_mipid.c
index 0e4cee9a..c81f150 100644
--- a/drivers/video/fbdev/omap/lcd_mipid.c
+++ b/drivers/video/fbdev/omap/lcd_mipid.c
@@ -60,7 +60,6 @@
 	struct mutex		mutex;
 	struct lcd_panel	panel;
 
-	struct workqueue_struct	*esd_wq;
 	struct delayed_work	esd_work;
 	void			(*esd_check)(struct mipid_device *m);
 };
@@ -390,7 +389,7 @@
 static void mipid_esd_start_check(struct mipid_device *md)
 {
 	if (md->esd_check != NULL)
-		queue_delayed_work(md->esd_wq, &md->esd_work,
+		schedule_delayed_work(&md->esd_work,
 				   MIPID_ESD_CHECK_PERIOD);
 }
 
@@ -476,11 +475,6 @@
 	struct mipid_device *md = to_mipid_device(panel);
 
 	md->fbdev = fbdev;
-	md->esd_wq = create_singlethread_workqueue("mipid_esd");
-	if (md->esd_wq == NULL) {
-		dev_err(&md->spi->dev, "can't create ESD workqueue\n");
-		return -ENOMEM;
-	}
 	INIT_DELAYED_WORK(&md->esd_work, mipid_esd_work);
 	mutex_init(&md->mutex);
 
@@ -500,7 +494,6 @@
 
 	if (md->enabled)
 		mipid_esd_stop_check(md);
-	destroy_workqueue(md->esd_wq);
 }
 
 static struct lcd_panel mipid_panel = {
diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
index b58012b..8b81069 100644
--- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
+++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
@@ -75,8 +75,6 @@
 
 	bool intro_printed;
 
-	struct workqueue_struct *workqueue;
-
 	bool ulps_enabled;
 	unsigned ulps_timeout;
 	struct delayed_work ulps_work;
@@ -232,7 +230,7 @@
 static void dsicm_queue_ulps_work(struct panel_drv_data *ddata)
 {
 	if (ddata->ulps_timeout > 0)
-		queue_delayed_work(ddata->workqueue, &ddata->ulps_work,
+		schedule_delayed_work(&ddata->ulps_work,
 				msecs_to_jiffies(ddata->ulps_timeout));
 }
 
@@ -1244,11 +1242,6 @@
 		dev_dbg(dev, "Using GPIO TE\n");
 	}
 
-	ddata->workqueue = create_singlethread_workqueue("dsicm_wq");
-	if (ddata->workqueue == NULL) {
-		dev_err(dev, "can't create workqueue\n");
-		return -ENOMEM;
-	}
 	INIT_DELAYED_WORK(&ddata->ulps_work, dsicm_ulps_work);
 
 	dsicm_hw_reset(ddata);
@@ -1262,7 +1255,7 @@
 				dev, ddata, &dsicm_bl_ops, &props);
 		if (IS_ERR(bldev)) {
 			r = PTR_ERR(bldev);
-			goto err_bl;
+			goto err_reg;
 		}
 
 		ddata->bldev = bldev;
@@ -1285,8 +1278,6 @@
 err_sysfs_create:
 	if (bldev != NULL)
 		backlight_device_unregister(bldev);
-err_bl:
-	destroy_workqueue(ddata->workqueue);
 err_reg:
 	return r;
 }
@@ -1316,7 +1307,6 @@
 	omap_dss_put_device(ddata->in);
 
 	dsicm_cancel_ulps_work(ddata);
-	destroy_workqueue(ddata->workqueue);
 
 	/* reset, to be sure that the panel is in a valid state */
 	dsicm_hw_reset(ddata);
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c b/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c
index 3691bde..a864608 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c
@@ -644,6 +644,7 @@
 {
 
 	int r;
+	long time_left;
 	DECLARE_COMPLETION_ONSTACK(completion);
 
 	r = omap_dispc_register_isr(dispc_irq_wait_handler, &completion,
@@ -652,15 +653,15 @@
 	if (r)
 		return r;
 
-	timeout = wait_for_completion_interruptible_timeout(&completion,
+	time_left = wait_for_completion_interruptible_timeout(&completion,
 			timeout);
 
 	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
 
-	if (timeout == 0)
+	if (time_left == 0)
 		return -ETIMEDOUT;
 
-	if (timeout == -ERESTARTSYS)
+	if (time_left == -ERESTARTSYS)
 		return -ERESTARTSYS;
 
 	return 0;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
index 9e4800a..30d49f3 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
@@ -1167,7 +1167,6 @@
 {
 	struct dsi_data *dsi = dsi_get_dsidrv_data(dsidev);
 	struct regulator *vdds_dsi;
-	int r;
 
 	if (dsi->vdds_dsi_reg != NULL)
 		return 0;
@@ -1180,13 +1179,6 @@
 		return PTR_ERR(vdds_dsi);
 	}
 
-	r = regulator_set_voltage(vdds_dsi, 1800000, 1800000);
-	if (r) {
-		devm_regulator_put(vdds_dsi);
-		DSSERR("can't set the DSI regulator voltage\n");
-		return r;
-	}
-
 	dsi->vdds_dsi_reg = vdds_dsi;
 
 	return 0;
@@ -5348,7 +5340,7 @@
 
 	dsi->phy_base = devm_ioremap(&dsidev->dev, res->start,
 		resource_size(res));
-	if (!dsi->proto_base) {
+	if (!dsi->phy_base) {
 		DSSERR("can't ioremap DSI PHY\n");
 		return -ENOMEM;
 	}
@@ -5368,7 +5360,7 @@
 
 	dsi->pll_base = devm_ioremap(&dsidev->dev, res->start,
 		resource_size(res));
-	if (!dsi->proto_base) {
+	if (!dsi->pll_base) {
 		DSSERR("can't ioremap DSI PLL\n");
 		return -ENOMEM;
 	}
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
index 926a6f2..156a254 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
@@ -100,7 +100,6 @@
 
 static int hdmi_init_regulator(void)
 {
-	int r;
 	struct regulator *reg;
 
 	if (hdmi.vdda_reg != NULL)
@@ -114,13 +113,6 @@
 		return PTR_ERR(reg);
 	}
 
-	r = regulator_set_voltage(reg, 1800000, 1800000);
-	if (r) {
-		devm_regulator_put(reg);
-		DSSWARN("can't set the regulator voltage\n");
-		return r;
-	}
-
 	hdmi.vdda_reg = reg;
 
 	return 0;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
index 0ee829a..4da36bc 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
@@ -119,7 +119,6 @@
 
 static int hdmi_init_regulator(void)
 {
-	int r;
 	struct regulator *reg;
 
 	if (hdmi.vdda_reg != NULL)
@@ -131,13 +130,6 @@
 		return PTR_ERR(reg);
 	}
 
-	r = regulator_set_voltage(reg, 1800000, 1800000);
-	if (r) {
-		devm_regulator_put(reg);
-		DSSWARN("can't set the regulator voltage\n");
-		return r;
-	}
-
 	hdmi.vdda_reg = reg;
 
 	return 0;
diff --git a/drivers/video/fbdev/pm2fb.c b/drivers/video/fbdev/pm2fb.c
index aa8d288..1a4070f 100644
--- a/drivers/video/fbdev/pm2fb.c
+++ b/drivers/video/fbdev/pm2fb.c
@@ -113,7 +113,7 @@
 /*
  * Default video mode. In case the modedb doesn't work.
  */
-static struct fb_var_screeninfo pm2fb_var = {
+static const struct fb_var_screeninfo pm2fb_var = {
 	/* "640x480, 8 bpp @ 60 Hz */
 	.xres =			640,
 	.yres =			480,
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index 2c0487f..ef73f14 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -2125,7 +2125,7 @@
 
 	timings = of_get_display_timings(disp);
 	if (!timings)
-		goto out;
+		return -EINVAL;
 
 	ret = -ENOMEM;
 	info->modes = kmalloc_array(timings->num_timings,
@@ -2186,6 +2186,7 @@
 	ret = of_property_read_u32(np, "bus-width", &bus_width);
 	if (ret) {
 		dev_err(dev, "no bus-width specified: %d\n", ret);
+		of_node_put(np);
 		return ret;
 	}
 
diff --git a/drivers/video/fbdev/s1d13xxxfb.c b/drivers/video/fbdev/s1d13xxxfb.c
index 96aa46d..5d6179e 100644
--- a/drivers/video/fbdev/s1d13xxxfb.c
+++ b/drivers/video/fbdev/s1d13xxxfb.c
@@ -83,7 +83,7 @@
 /*
  * here we define the default struct fb_fix_screeninfo
  */
-static struct fb_fix_screeninfo s1d13xxxfb_fix = {
+static const struct fb_fix_screeninfo s1d13xxxfb_fix = {
 	.id		= S1D_FBID,
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_PSEUDOCOLOR,
@@ -929,7 +929,7 @@
 		s1dfb->disp_save = kmalloc(info->fix.smem_len, GFP_KERNEL);
 
 	if (!s1dfb->disp_save) {
-		printk(KERN_ERR PFX "no memory to save screen");
+		printk(KERN_ERR PFX "no memory to save screen\n");
 		return -ENOMEM;
 	}
 
diff --git a/drivers/video/fbdev/s3c2410fb.c b/drivers/video/fbdev/s3c2410fb.c
index 0dd86be..a67e456 100644
--- a/drivers/video/fbdev/s3c2410fb.c
+++ b/drivers/video/fbdev/s3c2410fb.c
@@ -767,7 +767,7 @@
 	return IRQ_HANDLED;
 }
 
-#ifdef CONFIG_CPU_FREQ
+#ifdef CONFIG_ARM_S3C24XX_CPUFREQ
 
 static int s3c2410fb_cpufreq_transition(struct notifier_block *nb,
 					unsigned long val, void *data)
diff --git a/drivers/video/fbdev/s3c2410fb.h b/drivers/video/fbdev/s3c2410fb.h
index 47a17bd..cdd11e2 100644
--- a/drivers/video/fbdev/s3c2410fb.h
+++ b/drivers/video/fbdev/s3c2410fb.h
@@ -32,7 +32,7 @@
 	unsigned long		clk_rate;
 	unsigned int		palette_ready;
 
-#ifdef CONFIG_CPU_FREQ
+#ifdef CONFIG_ARM_S3C24XX_CPUFREQ
 	struct notifier_block	freq_transition;
 #endif
 
diff --git a/drivers/video/fbdev/savage/savagefb_driver.c b/drivers/video/fbdev/savage/savagefb_driver.c
index 6c77ab0..c30a91c 100644
--- a/drivers/video/fbdev/savage/savagefb_driver.c
+++ b/drivers/video/fbdev/savage/savagefb_driver.c
@@ -1660,7 +1660,7 @@
 
 /* --------------------------------------------------------------------- */
 
-static struct fb_var_screeninfo savagefb_var800x600x8 = {
+static const struct fb_var_screeninfo savagefb_var800x600x8 = {
 	.accel_flags =	FB_ACCELF_TEXT,
 	.xres =		800,
 	.yres =		600,
diff --git a/drivers/video/fbdev/simplefb.c b/drivers/video/fbdev/simplefb.c
index e9cf199..61f799a 100644
--- a/drivers/video/fbdev/simplefb.c
+++ b/drivers/video/fbdev/simplefb.c
@@ -33,14 +33,14 @@
 #include <linux/parser.h>
 #include <linux/regulator/consumer.h>
 
-static struct fb_fix_screeninfo simplefb_fix = {
+static const struct fb_fix_screeninfo simplefb_fix = {
 	.id		= "simple",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_TRUECOLOR,
 	.accel		= FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo simplefb_var = {
+static const struct fb_var_screeninfo simplefb_var = {
 	.height		= -1,
 	.width		= -1,
 	.activate	= FB_ACTIVATE_NOW,
@@ -74,8 +74,14 @@
 	return 0;
 }
 
+struct simplefb_par;
+static void simplefb_clocks_destroy(struct simplefb_par *par);
+static void simplefb_regulators_destroy(struct simplefb_par *par);
+
 static void simplefb_destroy(struct fb_info *info)
 {
+	simplefb_regulators_destroy(info->par);
+	simplefb_clocks_destroy(info->par);
 	if (info->screen_base)
 		iounmap(info->screen_base);
 }
@@ -487,11 +493,8 @@
 static int simplefb_remove(struct platform_device *pdev)
 {
 	struct fb_info *info = platform_get_drvdata(pdev);
-	struct simplefb_par *par = info->par;
 
 	unregister_framebuffer(info);
-	simplefb_regulators_destroy(par);
-	simplefb_clocks_destroy(par);
 	framebuffer_release(info);
 
 	return 0;
diff --git a/drivers/video/fbdev/sm712fb.c b/drivers/video/fbdev/sm712fb.c
index 86ae1d4..73cb4ff 100644
--- a/drivers/video/fbdev/sm712fb.c
+++ b/drivers/video/fbdev/sm712fb.c
@@ -56,7 +56,7 @@
 
 void __iomem *smtc_regbaseaddress;	/* Memory Map IO starting address */
 
-static struct fb_var_screeninfo smtcfb_var = {
+static const struct fb_var_screeninfo smtcfb_var = {
 	.xres           = 1024,
 	.yres           = 600,
 	.xres_virtual   = 1024,
diff --git a/drivers/video/fbdev/smscufx.c b/drivers/video/fbdev/smscufx.c
index 9279e5f..ec2e7e3 100644
--- a/drivers/video/fbdev/smscufx.c
+++ b/drivers/video/fbdev/smscufx.c
@@ -1761,10 +1761,8 @@
 static void ufx_usb_disconnect(struct usb_interface *interface)
 {
 	struct ufx_data *dev;
-	struct fb_info *info;
 
 	dev = usb_get_intfdata(interface);
-	info = dev->info;
 
 	pr_debug("USB disconnect starting\n");
 
diff --git a/drivers/video/fbdev/ssd1307fb.c b/drivers/video/fbdev/ssd1307fb.c
index a9c45c8..2925d5c 100644
--- a/drivers/video/fbdev/ssd1307fb.c
+++ b/drivers/video/fbdev/ssd1307fb.c
@@ -64,7 +64,7 @@
 	u32 contrast;
 	u32 dclk_div;
 	u32 dclk_frq;
-	struct ssd1307fb_deviceinfo *device_info;
+	const struct ssd1307fb_deviceinfo *device_info;
 	struct i2c_client *client;
 	u32 height;
 	struct fb_info *info;
@@ -84,7 +84,7 @@
 	u8	data[0];
 };
 
-static struct fb_fix_screeninfo ssd1307fb_fix = {
+static const struct fb_fix_screeninfo ssd1307fb_fix = {
 	.id		= "Solomon SSD1307",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_MONO10,
@@ -94,7 +94,7 @@
 	.accel		= FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo ssd1307fb_var = {
+static const struct fb_var_screeninfo ssd1307fb_var = {
 	.bits_per_pixel	= 1,
 };
 
@@ -559,8 +559,7 @@
 	par->info = info;
 	par->client = client;
 
-	par->device_info = (struct ssd1307fb_deviceinfo *)of_match_device(
-			ssd1307fb_of_match, &client->dev)->data;
+	par->device_info = of_device_get_match_data(&client->dev);
 
 	par->reset = of_get_named_gpio(client->dev.of_node,
 					 "reset-gpios", 0);
diff --git a/drivers/video/fbdev/tdfxfb.c b/drivers/video/fbdev/tdfxfb.c
index 621fa44..d5fa313 100644
--- a/drivers/video/fbdev/tdfxfb.c
+++ b/drivers/video/fbdev/tdfxfb.c
@@ -82,7 +82,7 @@
 #define VOODOO3_MAX_PIXCLOCK 300000
 #define VOODOO5_MAX_PIXCLOCK 350000
 
-static struct fb_fix_screeninfo tdfx_fix = {
+static const struct fb_fix_screeninfo tdfx_fix = {
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_PSEUDOCOLOR,
 	.ypanstep =	1,
@@ -90,7 +90,7 @@
 	.accel =	FB_ACCEL_3DFX_BANSHEE
 };
 
-static struct fb_var_screeninfo tdfx_var = {
+static const struct fb_var_screeninfo tdfx_var = {
 	/* "640x480, 8 bpp @ 60 Hz */
 	.xres =		640,
 	.yres =		480,
diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c
index 178ae93..98af9e0 100644
--- a/drivers/video/fbdev/uvesafb.c
+++ b/drivers/video/fbdev/uvesafb.c
@@ -33,7 +33,7 @@
 static char v86d_path[PATH_MAX] = "/sbin/v86d";
 static char v86d_started;	/* has v86d been started by uvesafb? */
 
-static struct fb_fix_screeninfo uvesafb_fix = {
+static const struct fb_fix_screeninfo uvesafb_fix = {
 	.id	= "VESA VGA",
 	.type	= FB_TYPE_PACKED_PIXELS,
 	.accel	= FB_ACCEL_NONE,
diff --git a/drivers/video/fbdev/vfb.c b/drivers/video/fbdev/vfb.c
index b9c2f81..da653a0 100644
--- a/drivers/video/fbdev/vfb.c
+++ b/drivers/video/fbdev/vfb.c
@@ -35,76 +35,23 @@
 static void *videomemory;
 static u_long videomemorysize = VIDEOMEMSIZE;
 module_param(videomemorysize, ulong, 0);
+MODULE_PARM_DESC(videomemorysize, "RAM available to frame buffer (in bytes)");
 
-/**********************************************************************
- *
- * Memory management
- *
- **********************************************************************/
-static void *rvmalloc(unsigned long size)
-{
-	void *mem;
-	unsigned long adr;
+static char *mode_option = NULL;
+module_param(mode_option, charp, 0);
+MODULE_PARM_DESC(mode_option, "Preferred video mode (e.g. 640x480-8@60)");
 
-	size = PAGE_ALIGN(size);
-	mem = vmalloc_32(size);
-	if (!mem)
-		return NULL;
-
-	/*
-	 * VFB must clear memory to prevent kernel info
-	 * leakage into userspace
-	 * VGA-based drivers MUST NOT clear memory if
-	 * they want to be able to take over vgacon
-	 */
-
-	memset(mem, 0, size);
-	adr = (unsigned long) mem;
-	while (size > 0) {
-		SetPageReserved(vmalloc_to_page((void *)adr));
-		adr += PAGE_SIZE;
-		size -= PAGE_SIZE;
-	}
-
-	return mem;
-}
-
-static void rvfree(void *mem, unsigned long size)
-{
-	unsigned long adr;
-
-	if (!mem)
-		return;
-
-	adr = (unsigned long) mem;
-	while ((long) size > 0) {
-		ClearPageReserved(vmalloc_to_page((void *)adr));
-		adr += PAGE_SIZE;
-		size -= PAGE_SIZE;
-	}
-	vfree(mem);
-}
-
-static struct fb_var_screeninfo vfb_default = {
+static const struct fb_videomode vfb_default = {
 	.xres =		640,
 	.yres =		480,
-	.xres_virtual =	640,
-	.yres_virtual =	480,
-	.bits_per_pixel = 8,
-	.red =		{ 0, 8, 0 },
-      	.green =	{ 0, 8, 0 },
-      	.blue =		{ 0, 8, 0 },
-      	.activate =	FB_ACTIVATE_TEST,
-      	.height =	-1,
-      	.width =	-1,
-      	.pixclock =	20000,
-      	.left_margin =	64,
-      	.right_margin =	64,
-      	.upper_margin =	32,
-      	.lower_margin =	32,
-      	.hsync_len =	64,
-      	.vsync_len =	2,
-      	.vmode =	FB_VMODE_NONINTERLACED,
+	.pixclock =	20000,
+	.left_margin =	64,
+	.right_margin =	64,
+	.upper_margin =	32,
+	.lower_margin =	32,
+	.hsync_len =	64,
+	.vsync_len =	2,
+	.vmode =	FB_VMODE_NONINTERLACED,
 };
 
 static struct fb_fix_screeninfo vfb_fix = {
@@ -119,6 +66,7 @@
 
 static bool vfb_enable __initdata = 0;	/* disabled by default */
 module_param(vfb_enable, bool, 0);
+MODULE_PARM_DESC(vfb_enable, "Enable Virtual FB driver");
 
 static int vfb_check_var(struct fb_var_screeninfo *var,
 			 struct fb_info *info);
@@ -421,35 +369,7 @@
 static int vfb_mmap(struct fb_info *info,
 		    struct vm_area_struct *vma)
 {
-	unsigned long start = vma->vm_start;
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long page, pos;
-
-	if (vma->vm_pgoff > (~0UL >> PAGE_SHIFT))
-		return -EINVAL;
-	if (size > info->fix.smem_len)
-		return -EINVAL;
-	if (offset > info->fix.smem_len - size)
-		return -EINVAL;
-
-	pos = (unsigned long)info->fix.smem_start + offset;
-
-	while (size > 0) {
-		page = vmalloc_to_pfn((void *)pos);
-		if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED)) {
-			return -EAGAIN;
-		}
-		start += PAGE_SIZE;
-		pos += PAGE_SIZE;
-		if (size > PAGE_SIZE)
-			size -= PAGE_SIZE;
-		else
-			size = 0;
-	}
-
-	return 0;
-
+	return remap_vmalloc_range(vma, (void *)info->fix.smem_start, vma->vm_pgoff);
 }
 
 #ifndef MODULE
@@ -477,6 +397,8 @@
 		/* Test disable for backwards compatibility */
 		if (!strcmp(this_opt, "disable"))
 			vfb_enable = 0;
+		else
+			mode_option = this_opt;
 	}
 	return 1;
 }
@@ -489,12 +411,13 @@
 static int vfb_probe(struct platform_device *dev)
 {
 	struct fb_info *info;
+	unsigned int size = PAGE_ALIGN(videomemorysize);
 	int retval = -ENOMEM;
 
 	/*
 	 * For real video cards we use ioremap.
 	 */
-	if (!(videomemory = rvmalloc(videomemorysize)))
+	if (!(videomemory = vmalloc_32_user(size)))
 		return retval;
 
 	info = framebuffer_alloc(sizeof(u32) * 256, &dev->dev);
@@ -504,11 +427,13 @@
 	info->screen_base = (char __iomem *)videomemory;
 	info->fbops = &vfb_ops;
 
-	retval = fb_find_mode(&info->var, info, NULL,
-			      NULL, 0, NULL, 8);
+	if (!fb_find_mode(&info->var, info, mode_option,
+			  NULL, 0, &vfb_default, 8)){
+		fb_err(info, "Unable to find usable video mode.\n");
+		retval = -EINVAL;
+		goto err1;
+	}
 
-	if (!retval || (retval == 4))
-		info->var = vfb_default;
 	vfb_fix.smem_start = (unsigned long) videomemory;
 	vfb_fix.smem_len = videomemorysize;
 	info->fix = vfb_fix;
@@ -533,7 +458,7 @@
 err1:
 	framebuffer_release(info);
 err:
-	rvfree(videomemory, videomemorysize);
+	vfree(videomemory);
 	return retval;
 }
 
@@ -543,7 +468,7 @@
 
 	if (info) {
 		unregister_framebuffer(info);
-		rvfree(videomemory, videomemorysize);
+		vfree(videomemory);
 		fb_dealloc_cmap(&info->cmap);
 		framebuffer_release(info);
 	}
diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c
index 283d335..5f0690c 100644
--- a/drivers/video/fbdev/vga16fb.c
+++ b/drivers/video/fbdev/vga16fb.c
@@ -85,7 +85,7 @@
 };
 
 /* name should not depend on EGA/VGA */
-static struct fb_fix_screeninfo vga16fb_fix = {
+static const struct fb_fix_screeninfo vga16fb_fix = {
 	.id		= "VGA16 VGA",
 	.smem_start	= VGA_FB_PHYS,
 	.smem_len	= VGA_FB_PHYS_LEN,
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 50dbaa8..fdd3228 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1844,4 +1844,53 @@
 
 	  Most people will say N.
 
+comment "Watchdog Pretimeout Governors"
+
+config WATCHDOG_PRETIMEOUT_GOV
+	bool "Enable watchdog pretimeout governors"
+	help
+	  This option allows selecting watchdog pretimeout governors.
+
+if WATCHDOG_PRETIMEOUT_GOV
+
+choice
+	prompt "Default Watchdog Pretimeout Governor"
+	default WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC
+	help
+	  This option selects a default watchdog pretimeout governor.
+	  The governor takes its action if a watchdog is capable
+	  of reporting a pretimeout event.
+
+config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP
+	bool "noop"
+	select WATCHDOG_PRETIMEOUT_GOV_NOOP
+	help
+	  Use noop watchdog pretimeout governor by default. If noop
+	  governor is selected by a user, write a short message to
+	  the kernel log buffer and don't do any system changes.
+
+config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC
+	bool "panic"
+	select WATCHDOG_PRETIMEOUT_GOV_PANIC
+	help
+	  Use panic watchdog pretimeout governor by default. If
+	  a watchdog pretimeout event happens, consider that
+	  the watchdog feeder is dead and a reboot is unavoidable.
+
+endchoice
+
+config WATCHDOG_PRETIMEOUT_GOV_NOOP
+	tristate "Noop watchdog pretimeout governor"
+	help
+	  Noop watchdog pretimeout governor, only an informational
+	  message is added to kernel log buffer.
+
+config WATCHDOG_PRETIMEOUT_GOV_PANIC
+	tristate "Panic watchdog pretimeout governor"
+	help
+	  Panic watchdog pretimeout governor, on watchdog pretimeout
+	  event put the kernel into panic.
+
+endif # WATCHDOG_PRETIMEOUT_GOV
+
 endif # WATCHDOG
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index cba0043..caa9f4a 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -3,9 +3,15 @@
 #
 
 # The WatchDog Timer Driver Core.
-watchdog-objs	+= watchdog_core.o watchdog_dev.o
 obj-$(CONFIG_WATCHDOG_CORE)	+= watchdog.o
 
+watchdog-objs	+= watchdog_core.o watchdog_dev.o
+
+watchdog-$(CONFIG_WATCHDOG_PRETIMEOUT_GOV)	+= watchdog_pretimeout.o
+
+obj-$(CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP)	+= pretimeout_noop.o
+obj-$(CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC)	+= pretimeout_panic.o
+
 # Only one watchdog can succeed. We probe the ISA/PCI/USB based
 # watchdog-cards first, then the architecture specific watchdog
 # drivers and then the architecture independent "softdog" driver.
diff --git a/drivers/watchdog/asm9260_wdt.c b/drivers/watchdog/asm9260_wdt.c
index c9686b2..d0b59ba 100644
--- a/drivers/watchdog/asm9260_wdt.c
+++ b/drivers/watchdog/asm9260_wdt.c
@@ -389,7 +389,6 @@
 static struct platform_driver asm9260_wdt_driver = {
 	.driver = {
 		.name = "asm9260-wdt",
-		.owner = THIS_MODULE,
 		.of_match_table	= asm9260_wdt_of_match,
 	},
 	.probe = asm9260_wdt_probe,
diff --git a/drivers/watchdog/ath79_wdt.c b/drivers/watchdog/ath79_wdt.c
index 835d310..e2209bf 100644
--- a/drivers/watchdog/ath79_wdt.c
+++ b/drivers/watchdog/ath79_wdt.c
@@ -35,6 +35,7 @@
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
+#include <linux/uaccess.h>
 
 #define DRIVER_NAME	"ath79-wdt"
 
diff --git a/drivers/watchdog/bcm7038_wdt.c b/drivers/watchdog/bcm7038_wdt.c
index 4245b65..e238df4 100644
--- a/drivers/watchdog/bcm7038_wdt.c
+++ b/drivers/watchdog/bcm7038_wdt.c
@@ -107,7 +107,7 @@
 				WDIOF_MAGICCLOSE
 };
 
-static struct watchdog_ops bcm7038_wdt_ops = {
+static const struct watchdog_ops bcm7038_wdt_ops = {
 	.owner		= THIS_MODULE,
 	.start		= bcm7038_wdt_start,
 	.stop		= bcm7038_wdt_stop,
diff --git a/drivers/watchdog/cadence_wdt.c b/drivers/watchdog/cadence_wdt.c
index 4dda902..98acef7 100644
--- a/drivers/watchdog/cadence_wdt.c
+++ b/drivers/watchdog/cadence_wdt.c
@@ -269,7 +269,7 @@
 };
 
 /* Watchdog Core Ops */
-static struct watchdog_ops cdns_wdt_ops = {
+static const struct watchdog_ops cdns_wdt_ops = {
 	.owner = THIS_MODULE,
 	.start = cdns_wdt_start,
 	.stop = cdns_wdt_stop,
@@ -424,8 +424,10 @@
 	struct platform_device *pdev = to_platform_device(dev);
 	struct cdns_wdt *wdt = platform_get_drvdata(pdev);
 
-	cdns_wdt_stop(&wdt->cdns_wdt_device);
-	clk_disable_unprepare(wdt->clk);
+	if (watchdog_active(&wdt->cdns_wdt_device)) {
+		cdns_wdt_stop(&wdt->cdns_wdt_device);
+		clk_disable_unprepare(wdt->clk);
+	}
 
 	return 0;
 }
@@ -442,12 +444,14 @@
 	struct platform_device *pdev = to_platform_device(dev);
 	struct cdns_wdt *wdt = platform_get_drvdata(pdev);
 
-	ret = clk_prepare_enable(wdt->clk);
-	if (ret) {
-		dev_err(dev, "unable to enable clock\n");
-		return ret;
+	if (watchdog_active(&wdt->cdns_wdt_device)) {
+		ret = clk_prepare_enable(wdt->clk);
+		if (ret) {
+			dev_err(dev, "unable to enable clock\n");
+			return ret;
+		}
+		cdns_wdt_start(&wdt->cdns_wdt_device);
 	}
-	cdns_wdt_start(&wdt->cdns_wdt_device);
 
 	return 0;
 }
diff --git a/drivers/watchdog/dw_wdt.c b/drivers/watchdog/dw_wdt.c
index 2acb51c..3c6a3de 100644
--- a/drivers/watchdog/dw_wdt.c
+++ b/drivers/watchdog/dw_wdt.c
@@ -54,6 +54,7 @@
 struct dw_wdt {
 	void __iomem		*regs;
 	struct clk		*clk;
+	unsigned long		rate;
 	struct notifier_block	restart_handler;
 	struct watchdog_device	wdd;
 };
@@ -72,7 +73,7 @@
 	 * There are 16 possible timeout values in 0..15 where the number of
 	 * cycles is 2 ^ (16 + i) and the watchdog counts down.
 	 */
-	return (1U << (16 + top)) / clk_get_rate(dw_wdt->clk);
+	return (1U << (16 + top)) / dw_wdt->rate;
 }
 
 static int dw_wdt_get_top(struct dw_wdt *dw_wdt)
@@ -163,7 +164,7 @@
 	struct dw_wdt *dw_wdt = to_dw_wdt(wdd);
 
 	return readl(dw_wdt->regs + WDOG_CURRENT_COUNT_REG_OFFSET) /
-		clk_get_rate(dw_wdt->clk);
+		dw_wdt->rate;
 }
 
 static const struct watchdog_info dw_wdt_ident = {
@@ -231,6 +232,12 @@
 	if (ret)
 		return ret;
 
+	dw_wdt->rate = clk_get_rate(dw_wdt->clk);
+	if (dw_wdt->rate == 0) {
+		ret = -EINVAL;
+		goto out_disable_clk;
+	}
+
 	wdd = &dw_wdt->wdd;
 	wdd->info = &dw_wdt_ident;
 	wdd->ops = &dw_wdt_ops;
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 8f89bd8..70c7194 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -39,7 +39,7 @@
 #include <asm/nmi.h>
 #include <asm/frame.h>
 
-#define HPWDT_VERSION			"1.3.3"
+#define HPWDT_VERSION			"1.4.0"
 #define SECS_TO_TICKS(secs)		((secs) * 1000 / 128)
 #define TICKS_TO_SECS(ticks)		((ticks) * 128 / 1000)
 #define HPWDT_MAX_TIMER			TICKS_TO_SECS(65535)
@@ -814,7 +814,8 @@
 	 * not run on a legacy ASM box.
 	 * So we only support the G5 ProLiant servers and higher.
 	 */
-	if (dev->subsystem_vendor != PCI_VENDOR_ID_HP) {
+	if (dev->subsystem_vendor != PCI_VENDOR_ID_HP &&
+	    dev->subsystem_vendor != PCI_VENDOR_ID_HP_3PAR) {
 		dev_warn(&dev->dev,
 			"This server does not have an iLO2+ ASIC.\n");
 		return -ENODEV;
@@ -823,7 +824,8 @@
 	/*
 	 * Ignore all auxilary iLO devices with the following PCI ID
 	 */
-	if (dev->subsystem_device == 0x1979)
+	if (dev->subsystem_vendor == PCI_VENDOR_ID_HP &&
+	    dev->subsystem_device == 0x1979)
 		return -ENODEV;
 
 	if (pci_enable_device(dev)) {
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 54cab18..06fcb6c 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -629,7 +629,7 @@
 	return 0;
 }
 
-static struct dev_pm_ops iTCO_wdt_pm = {
+static const struct dev_pm_ops iTCO_wdt_pm = {
 	.suspend_noirq = iTCO_wdt_suspend_noirq,
 	.resume_noirq = iTCO_wdt_resume_noirq,
 };
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
index 62f346b..4874b0f 100644
--- a/drivers/watchdog/imx2_wdt.c
+++ b/drivers/watchdog/imx2_wdt.c
@@ -24,6 +24,7 @@
 #include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -37,18 +38,23 @@
 
 #define IMX2_WDT_WCR		0x00		/* Control Register */
 #define IMX2_WDT_WCR_WT		(0xFF << 8)	/* -> Watchdog Timeout Field */
-#define IMX2_WDT_WCR_WDA	(1 << 5)	/* -> External Reset WDOG_B */
-#define IMX2_WDT_WCR_SRS	(1 << 4)	/* -> Software Reset Signal */
-#define IMX2_WDT_WCR_WRE	(1 << 3)	/* -> WDOG Reset Enable */
-#define IMX2_WDT_WCR_WDE	(1 << 2)	/* -> Watchdog Enable */
-#define IMX2_WDT_WCR_WDZST	(1 << 0)	/* -> Watchdog timer Suspend */
+#define IMX2_WDT_WCR_WDA	BIT(5)		/* -> External Reset WDOG_B */
+#define IMX2_WDT_WCR_SRS	BIT(4)		/* -> Software Reset Signal */
+#define IMX2_WDT_WCR_WRE	BIT(3)		/* -> WDOG Reset Enable */
+#define IMX2_WDT_WCR_WDE	BIT(2)		/* -> Watchdog Enable */
+#define IMX2_WDT_WCR_WDZST	BIT(0)		/* -> Watchdog timer Suspend */
 
 #define IMX2_WDT_WSR		0x02		/* Service Register */
 #define IMX2_WDT_SEQ1		0x5555		/* -> service sequence 1 */
 #define IMX2_WDT_SEQ2		0xAAAA		/* -> service sequence 2 */
 
 #define IMX2_WDT_WRSR		0x04		/* Reset Status Register */
-#define IMX2_WDT_WRSR_TOUT	(1 << 1)	/* -> Reset due to Timeout */
+#define IMX2_WDT_WRSR_TOUT	BIT(1)		/* -> Reset due to Timeout */
+
+#define IMX2_WDT_WICR		0x06		/* Interrupt Control Register */
+#define IMX2_WDT_WICR_WIE	BIT(15)		/* -> Interrupt Enable */
+#define IMX2_WDT_WICR_WTIS	BIT(14)		/* -> Interrupt Status */
+#define IMX2_WDT_WICR_WICT	0xFF		/* -> Interrupt Count Timeout */
 
 #define IMX2_WDT_WMCR		0x08		/* Misc Register */
 
@@ -80,6 +86,12 @@
 	.options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
 };
 
+static const struct watchdog_info imx2_wdt_pretimeout_info = {
+	.identity = "imx2+ watchdog",
+	.options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE |
+		   WDIOF_PRETIMEOUT,
+};
+
 static int imx2_wdt_restart(struct watchdog_device *wdog, unsigned long action,
 			    void *data)
 {
@@ -169,6 +181,35 @@
 	return 0;
 }
 
+static int imx2_wdt_set_pretimeout(struct watchdog_device *wdog,
+				   unsigned int new_pretimeout)
+{
+	struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
+
+	if (new_pretimeout >= IMX2_WDT_MAX_TIME)
+		return -EINVAL;
+
+	wdog->pretimeout = new_pretimeout;
+
+	regmap_update_bits(wdev->regmap, IMX2_WDT_WICR,
+			   IMX2_WDT_WICR_WIE | IMX2_WDT_WICR_WICT,
+			   IMX2_WDT_WICR_WIE | (new_pretimeout << 1));
+	return 0;
+}
+
+static irqreturn_t imx2_wdt_isr(int irq, void *wdog_arg)
+{
+	struct watchdog_device *wdog = wdog_arg;
+	struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
+
+	regmap_write_bits(wdev->regmap, IMX2_WDT_WICR,
+			  IMX2_WDT_WICR_WTIS, IMX2_WDT_WICR_WTIS);
+
+	watchdog_notify_pretimeout(wdog);
+
+	return IRQ_HANDLED;
+}
+
 static int imx2_wdt_start(struct watchdog_device *wdog)
 {
 	struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
@@ -188,6 +229,7 @@
 	.start = imx2_wdt_start,
 	.ping = imx2_wdt_ping,
 	.set_timeout = imx2_wdt_set_timeout,
+	.set_pretimeout = imx2_wdt_set_pretimeout,
 	.restart = imx2_wdt_restart,
 };
 
@@ -236,6 +278,12 @@
 	wdog->max_hw_heartbeat_ms = IMX2_WDT_MAX_TIME * 1000;
 	wdog->parent		= &pdev->dev;
 
+	ret = platform_get_irq(pdev, 0);
+	if (ret > 0)
+		if (!devm_request_irq(&pdev->dev, ret, imx2_wdt_isr, 0,
+				      dev_name(&pdev->dev), wdog))
+			wdog->info = &imx2_wdt_pretimeout_info;
+
 	ret = clk_prepare_enable(wdev->clk);
 	if (ret)
 		return ret;
diff --git a/drivers/watchdog/kempld_wdt.c b/drivers/watchdog/kempld_wdt.c
index 5bf931c..8e302d0 100644
--- a/drivers/watchdog/kempld_wdt.c
+++ b/drivers/watchdog/kempld_wdt.c
@@ -430,7 +430,7 @@
 			WDIOF_PRETIMEOUT
 };
 
-static struct watchdog_ops kempld_wdt_ops = {
+static const struct watchdog_ops kempld_wdt_ops = {
 	.owner		= THIS_MODULE,
 	.start		= kempld_wdt_start,
 	.stop		= kempld_wdt_stop,
diff --git a/drivers/watchdog/mt7621_wdt.c b/drivers/watchdog/mt7621_wdt.c
index 4a2290f..d5735c1 100644
--- a/drivers/watchdog/mt7621_wdt.c
+++ b/drivers/watchdog/mt7621_wdt.c
@@ -139,7 +139,6 @@
 	if (!IS_ERR(mt7621_wdt_reset))
 		reset_control_deassert(mt7621_wdt_reset);
 
-	mt7621_wdt_dev.dev = &pdev->dev;
 	mt7621_wdt_dev.bootstatus = mt7621_wdt_bootcause();
 
 	watchdog_init_timeout(&mt7621_wdt_dev, mt7621_wdt_dev.max_timeout,
diff --git a/drivers/watchdog/of_xilinx_wdt.c b/drivers/watchdog/of_xilinx_wdt.c
index b2e1b4c..fae7fe9 100644
--- a/drivers/watchdog/of_xilinx_wdt.c
+++ b/drivers/watchdog/of_xilinx_wdt.c
@@ -10,6 +10,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -45,6 +46,7 @@
 	u32 wdt_interval;
 	spinlock_t spinlock;
 	struct watchdog_device xilinx_wdt_wdd;
+	struct clk		*clk;
 };
 
 static int xilinx_wdt_start(struct watchdog_device *wdd)
@@ -195,16 +197,30 @@
 	spin_lock_init(&xdev->spinlock);
 	watchdog_set_drvdata(xilinx_wdt_wdd, xdev);
 
+	xdev->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(xdev->clk)) {
+		if (PTR_ERR(xdev->clk) == -ENOENT)
+			xdev->clk = NULL;
+		else
+			return PTR_ERR(xdev->clk);
+	}
+
+	rc = clk_prepare_enable(xdev->clk);
+	if (rc) {
+		dev_err(&pdev->dev, "unable to enable clock\n");
+		return rc;
+	}
+
 	rc = xwdt_selftest(xdev);
 	if (rc == XWT_TIMER_FAILED) {
 		dev_err(&pdev->dev, "SelfTest routine error\n");
-		return rc;
+		goto err_clk_disable;
 	}
 
 	rc = watchdog_register_device(xilinx_wdt_wdd);
 	if (rc) {
 		dev_err(&pdev->dev, "Cannot register watchdog (err=%d)\n", rc);
-		return rc;
+		goto err_clk_disable;
 	}
 
 	dev_info(&pdev->dev, "Xilinx Watchdog Timer at %p with timeout %ds\n",
@@ -213,6 +229,10 @@
 	platform_set_drvdata(pdev, xdev);
 
 	return 0;
+err_clk_disable:
+	clk_disable_unprepare(xdev->clk);
+
+	return rc;
 }
 
 static int xwdt_remove(struct platform_device *pdev)
@@ -220,6 +240,7 @@
 	struct xwdt_device *xdev = platform_get_drvdata(pdev);
 
 	watchdog_unregister_device(&xdev->xilinx_wdt_wdd);
+	clk_disable_unprepare(xdev->clk);
 
 	return 0;
 }
diff --git a/drivers/watchdog/pretimeout_noop.c b/drivers/watchdog/pretimeout_noop.c
new file mode 100644
index 0000000..85f5299
--- /dev/null
+++ b/drivers/watchdog/pretimeout_noop.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015-2016 Mentor Graphics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/watchdog.h>
+
+#include "watchdog_pretimeout.h"
+
+/**
+ * pretimeout_noop - No operation on watchdog pretimeout event
+ * @wdd: watchdog_device
+ *
+ * This function prints a message about pretimeout to kernel log.
+ */
+static void pretimeout_noop(struct watchdog_device *wdd)
+{
+	pr_alert("watchdog%d: pretimeout event\n", wdd->id);
+}
+
+static struct watchdog_governor watchdog_gov_noop = {
+	.name		= "noop",
+	.pretimeout	= pretimeout_noop,
+};
+
+static int __init watchdog_gov_noop_register(void)
+{
+	return watchdog_register_governor(&watchdog_gov_noop);
+}
+
+static void __exit watchdog_gov_noop_unregister(void)
+{
+	watchdog_unregister_governor(&watchdog_gov_noop);
+}
+module_init(watchdog_gov_noop_register);
+module_exit(watchdog_gov_noop_unregister);
+
+MODULE_AUTHOR("Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>");
+MODULE_DESCRIPTION("Noop watchdog pretimeout governor");
+MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/pretimeout_panic.c b/drivers/watchdog/pretimeout_panic.c
new file mode 100644
index 0000000..0c197a1
--- /dev/null
+++ b/drivers/watchdog/pretimeout_panic.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015-2016 Mentor Graphics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/watchdog.h>
+
+#include "watchdog_pretimeout.h"
+
+/**
+ * pretimeout_panic - Panic on watchdog pretimeout event
+ * @wdd: watchdog_device
+ *
+ * Panic, watchdog has not been fed till pretimeout event.
+ */
+static void pretimeout_panic(struct watchdog_device *wdd)
+{
+	panic("watchdog pretimeout event\n");
+}
+
+static struct watchdog_governor watchdog_gov_panic = {
+	.name		= "panic",
+	.pretimeout	= pretimeout_panic,
+};
+
+static int __init watchdog_gov_panic_register(void)
+{
+	return watchdog_register_governor(&watchdog_gov_panic);
+}
+
+static void __exit watchdog_gov_panic_unregister(void)
+{
+	watchdog_unregister_governor(&watchdog_gov_panic);
+}
+module_init(watchdog_gov_panic_register);
+module_exit(watchdog_gov_panic_unregister);
+
+MODULE_AUTHOR("Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>");
+MODULE_DESCRIPTION("Panic watchdog pretimeout governor");
+MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/rn5t618_wdt.c b/drivers/watchdog/rn5t618_wdt.c
index d1c1227..0805ee2 100644
--- a/drivers/watchdog/rn5t618_wdt.c
+++ b/drivers/watchdog/rn5t618_wdt.c
@@ -136,7 +136,7 @@
 	.identity	= DRIVER_NAME,
 };
 
-static struct watchdog_ops rn5t618_wdt_ops = {
+static const struct watchdog_ops rn5t618_wdt_ops = {
 	.owner          = THIS_MODULE,
 	.start          = rn5t618_wdt_start,
 	.stop           = rn5t618_wdt_stop,
diff --git a/drivers/watchdog/rt2880_wdt.c b/drivers/watchdog/rt2880_wdt.c
index 1967919..14b4fd4 100644
--- a/drivers/watchdog/rt2880_wdt.c
+++ b/drivers/watchdog/rt2880_wdt.c
@@ -158,7 +158,6 @@
 
 	rt288x_wdt_freq = clk_get_rate(rt288x_wdt_clk) / RALINK_WDT_PRESCALE;
 
-	rt288x_wdt_dev.dev = &pdev->dev;
 	rt288x_wdt_dev.bootstatus = rt288x_wdt_bootcause();
 	rt288x_wdt_dev.max_timeout = (0xfffful / rt288x_wdt_freq);
 	rt288x_wdt_dev.parent = &pdev->dev;
diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c
index b067edf..c7bdc98 100644
--- a/drivers/watchdog/softdog.c
+++ b/drivers/watchdog/softdog.c
@@ -72,10 +72,27 @@
 static struct timer_list softdog_ticktock =
 		TIMER_INITIALIZER(softdog_fire, 0, 0);
 
+static struct watchdog_device softdog_dev;
+
+static void softdog_pretimeout(unsigned long data)
+{
+	watchdog_notify_pretimeout(&softdog_dev);
+}
+
+static struct timer_list softdog_preticktock =
+		TIMER_INITIALIZER(softdog_pretimeout, 0, 0);
+
 static int softdog_ping(struct watchdog_device *w)
 {
 	if (!mod_timer(&softdog_ticktock, jiffies + (w->timeout * HZ)))
 		__module_get(THIS_MODULE);
+
+	if (w->pretimeout)
+		mod_timer(&softdog_preticktock, jiffies +
+			  (w->timeout - w->pretimeout) * HZ);
+	else
+		del_timer(&softdog_preticktock);
+
 	return 0;
 }
 
@@ -84,15 +101,18 @@
 	if (del_timer(&softdog_ticktock))
 		module_put(THIS_MODULE);
 
+	del_timer(&softdog_preticktock);
+
 	return 0;
 }
 
 static struct watchdog_info softdog_info = {
 	.identity = "Software Watchdog",
-	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE,
+	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE |
+		   WDIOF_PRETIMEOUT,
 };
 
-static struct watchdog_ops softdog_ops = {
+static const struct watchdog_ops softdog_ops = {
 	.owner = THIS_MODULE,
 	.start = softdog_ping,
 	.stop = softdog_stop,
diff --git a/drivers/watchdog/st_lpc_wdt.c b/drivers/watchdog/st_lpc_wdt.c
index 14e9bad..e6100e4 100644
--- a/drivers/watchdog/st_lpc_wdt.c
+++ b/drivers/watchdog/st_lpc_wdt.c
@@ -52,27 +52,6 @@
 	bool warm_reset;
 };
 
-static struct st_wdog_syscfg stid127_syscfg = {
-	.reset_type_reg		= 0x004,
-	.reset_type_mask	= BIT(2),
-	.enable_reg		= 0x000,
-	.enable_mask		= BIT(2),
-};
-
-static struct st_wdog_syscfg stih415_syscfg = {
-	.reset_type_reg		= 0x0B8,
-	.reset_type_mask	= BIT(6),
-	.enable_reg		= 0x0B4,
-	.enable_mask		= BIT(7),
-};
-
-static struct st_wdog_syscfg stih416_syscfg = {
-	.reset_type_reg		= 0x88C,
-	.reset_type_mask	= BIT(6),
-	.enable_reg		= 0x888,
-	.enable_mask		= BIT(7),
-};
-
 static struct st_wdog_syscfg stih407_syscfg = {
 	.enable_reg		= 0x204,
 	.enable_mask		= BIT(19),
@@ -83,18 +62,6 @@
 		.compatible = "st,stih407-lpc",
 		.data = &stih407_syscfg,
 	},
-	{
-		.compatible = "st,stih416-lpc",
-		.data = &stih416_syscfg,
-	},
-	{
-		.compatible = "st,stih415-lpc",
-		.data = &stih415_syscfg,
-	},
-	{
-		.compatible = "st,stid127-lpc",
-		.data = &stid127_syscfg,
-	},
 	{},
 };
 MODULE_DEVICE_TABLE(of, st_wdog_match);
diff --git a/drivers/watchdog/tegra_wdt.c b/drivers/watchdog/tegra_wdt.c
index 9ec5760..2d53c3f 100644
--- a/drivers/watchdog/tegra_wdt.c
+++ b/drivers/watchdog/tegra_wdt.c
@@ -178,7 +178,7 @@
 	.identity	= "Tegra Watchdog",
 };
 
-static struct watchdog_ops tegra_wdt_ops = {
+static const struct watchdog_ops tegra_wdt_ops = {
 	.owner = THIS_MODULE,
 	.start = tegra_wdt_start,
 	.stop = tegra_wdt_stop,
diff --git a/drivers/watchdog/txx9wdt.c b/drivers/watchdog/txx9wdt.c
index c2da880..6f7a9de 100644
--- a/drivers/watchdog/txx9wdt.c
+++ b/drivers/watchdog/txx9wdt.c
@@ -112,7 +112,7 @@
 		txx9_imclk = NULL;
 		goto exit;
 	}
-	ret = clk_enable(txx9_imclk);
+	ret = clk_prepare_enable(txx9_imclk);
 	if (ret) {
 		clk_put(txx9_imclk);
 		txx9_imclk = NULL;
@@ -144,7 +144,7 @@
 	return 0;
 exit:
 	if (txx9_imclk) {
-		clk_disable(txx9_imclk);
+		clk_disable_unprepare(txx9_imclk);
 		clk_put(txx9_imclk);
 	}
 	return ret;
@@ -153,7 +153,7 @@
 static int __exit txx9wdt_remove(struct platform_device *dev)
 {
 	watchdog_unregister_device(&txx9wdt);
-	clk_disable(txx9_imclk);
+	clk_disable_unprepare(txx9_imclk);
 	clk_put(txx9_imclk);
 	return 0;
 }
diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c
index 09e8003..ef2ecaf 100644
--- a/drivers/watchdog/w83627hf_wdt.c
+++ b/drivers/watchdog/w83627hf_wdt.c
@@ -302,7 +302,7 @@
 	.identity = "W83627HF Watchdog",
 };
 
-static struct watchdog_ops wdt_ops = {
+static const struct watchdog_ops wdt_ops = {
 	.owner = THIS_MODULE,
 	.start = wdt_start,
 	.stop = wdt_stop,
diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index 6abb83c..74265b2 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -349,7 +349,7 @@
 	struct watchdog_device **rcwdd;
 	int ret;
 
-	rcwdd = devres_alloc(devm_watchdog_unregister_device, sizeof(*wdd),
+	rcwdd = devres_alloc(devm_watchdog_unregister_device, sizeof(*rcwdd),
 			     GFP_KERNEL);
 	if (!rcwdd)
 		return -ENOMEM;
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 040bf83..32930a0 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -49,6 +49,7 @@
 #include <linux/uaccess.h>	/* For copy_to_user/put_user/... */
 
 #include "watchdog_core.h"
+#include "watchdog_pretimeout.h"
 
 /*
  * struct watchdog_core_data - watchdog core internal data
@@ -335,10 +336,14 @@
 	if (watchdog_timeout_invalid(wdd, timeout))
 		return -EINVAL;
 
-	if (wdd->ops->set_timeout)
+	if (wdd->ops->set_timeout) {
 		err = wdd->ops->set_timeout(wdd, timeout);
-	else
+	} else {
 		wdd->timeout = timeout;
+		/* Disable pretimeout if it doesn't fit the new timeout */
+		if (wdd->pretimeout >= wdd->timeout)
+			wdd->pretimeout = 0;
+	}
 
 	watchdog_update_worker(wdd);
 
@@ -346,6 +351,31 @@
 }
 
 /*
+ *	watchdog_set_pretimeout: set the watchdog timer pretimeout
+ *	@wdd: the watchdog device to set the timeout for
+ *	@timeout: pretimeout to set in seconds
+ */
+
+static int watchdog_set_pretimeout(struct watchdog_device *wdd,
+				   unsigned int timeout)
+{
+	/* Devices that do not advertise WDIOF_PRETIMEOUT are rejected */
+	if (!(wdd->info->options & WDIOF_PRETIMEOUT))
+		return -EOPNOTSUPP;
+
+	/* Validation is delegated to watchdog_pretimeout_invalid() */
+	if (watchdog_pretimeout_invalid(wdd, timeout))
+		return -EINVAL;
+
+	/* A driver hook, when present, performs the update itself */
+	if (wdd->ops->set_pretimeout)
+		return wdd->ops->set_pretimeout(wdd, timeout);
+
+	wdd->pretimeout = timeout;
+	return 0;
+}
+
+/*
  *	watchdog_get_timeleft: wrapper to get the time left before a reboot
  *	@wdd: the watchdog device to get the remaining time from
  *	@timeleft: the time that's left
@@ -429,6 +459,15 @@
 }
 static DEVICE_ATTR_RO(timeout);
 
+static ssize_t pretimeout_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	const struct watchdog_device *wd = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%u\n", wd->pretimeout);	/* value + newline */
+}
+static DEVICE_ATTR_RO(pretimeout);
+
 static ssize_t identity_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
@@ -450,6 +489,36 @@
 }
 static DEVICE_ATTR_RO(state);
 
+/* sysfs: emit what watchdog_pretimeout_available_governors_get() writes */
+static ssize_t pretimeout_available_governors_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return watchdog_pretimeout_available_governors_get(buf);
+}
+static DEVICE_ATTR_RO(pretimeout_available_governors);
+
+/* sysfs: emit what watchdog_pretimeout_governor_get() writes for this wdd */
+static ssize_t pretimeout_governor_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct watchdog_device *wd = dev_get_drvdata(dev);
+
+	return watchdog_pretimeout_governor_get(wd, buf);
+}
+
+/* sysfs: hand buf to the governor setter; success consumes all of count */
+static ssize_t pretimeout_governor_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct watchdog_device *wd = dev_get_drvdata(dev);
+	int err = watchdog_pretimeout_governor_set(wd, buf);
+
+	return err ? err : count;
+}
+static DEVICE_ATTR_RW(pretimeout_governor);
+
 static umode_t wdt_is_visible(struct kobject *kobj, struct attribute *attr,
 				int n)
 {
@@ -459,6 +528,14 @@
 
 	if (attr == &dev_attr_timeleft.attr && !wdd->ops->get_timeleft)
 		mode = 0;
+	else if (attr == &dev_attr_pretimeout.attr &&
+		 !(wdd->info->options & WDIOF_PRETIMEOUT))
+		mode = 0;
+	else if ((attr == &dev_attr_pretimeout_governor.attr ||
+		  attr == &dev_attr_pretimeout_available_governors.attr) &&
+		 (!(wdd->info->options & WDIOF_PRETIMEOUT) ||
+		  !IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_GOV)))
+		mode = 0;
 
 	return mode;
 }
@@ -466,10 +543,13 @@
 	&dev_attr_state.attr,
 	&dev_attr_identity.attr,
 	&dev_attr_timeout.attr,
+	&dev_attr_pretimeout.attr,
 	&dev_attr_timeleft.attr,
 	&dev_attr_bootstatus.attr,
 	&dev_attr_status.attr,
 	&dev_attr_nowayout.attr,
+	&dev_attr_pretimeout_governor.attr,
+	&dev_attr_pretimeout_available_governors.attr,
 	NULL,
 };
 
@@ -646,6 +726,16 @@
 			break;
 		err = put_user(val, p);
 		break;
+	case WDIOC_SETPRETIMEOUT:
+		if (get_user(val, p)) {
+			err = -EFAULT;
+			break;
+		}
+		err = watchdog_set_pretimeout(wdd, val);
+		break;
+	case WDIOC_GETPRETIMEOUT:
+		err = put_user(wdd->pretimeout, p);
+		break;
 	default:
 		err = -ENOTTY;
 		break;
@@ -937,6 +1027,12 @@
 		return PTR_ERR(dev);
 	}
 
+	ret = watchdog_register_pretimeout(wdd);
+	if (ret) {
+		device_destroy(&watchdog_class, devno);
+		watchdog_cdev_unregister(wdd);
+	}
+
 	return ret;
 }
 
@@ -950,6 +1046,7 @@
 
 void watchdog_dev_unregister(struct watchdog_device *wdd)
 {
+	watchdog_unregister_pretimeout(wdd);
 	device_destroy(&watchdog_class, wdd->wd_data->cdev.dev);
 	watchdog_cdev_unregister(wdd);
 }
diff --git a/drivers/watchdog/watchdog_pretimeout.c b/drivers/watchdog/watchdog_pretimeout.c
new file mode 100644
index 0000000..9db07bf
--- /dev/null
+++ b/drivers/watchdog/watchdog_pretimeout.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2015-2016 Mentor Graphics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/watchdog.h>
+
+#include "watchdog_pretimeout.h"
+
+/* Default watchdog pretimeout governor */
+static struct watchdog_governor *default_gov;
+
+/* The spinlock protects default_gov, wdd->gov and pretimeout_list */
+static DEFINE_SPINLOCK(pretimeout_lock);
+
+/* List of watchdog devices, which can generate a pretimeout event */
+static LIST_HEAD(pretimeout_list);
+
+struct watchdog_pretimeout {
+	struct watchdog_device		*wdd;
+	struct list_head		entry;
+};
+
+/* The mutex protects governor list and serializes external interfaces */
+static DEFINE_MUTEX(governor_lock);
+
+/* List of the registered watchdog pretimeout governors */
+static LIST_HEAD(governor_list);
+
+struct governor_priv {
+	struct watchdog_governor	*gov;
+	struct list_head		entry;
+};
+
+static struct governor_priv *find_governor_by_name(const char *gov_name)
+{
+	struct governor_priv *cur;
+
+	list_for_each_entry(cur, &governor_list, entry) {
+		if (sysfs_streq(gov_name, cur->gov->name))
+			return cur;	/* first entry with this name */
+	}
+	return NULL;			/* nothing on governor_list matched */
+}
+
+/* Write one governor name per line into buf, a PAGE_SIZE sysfs buffer */
+int watchdog_pretimeout_available_governors_get(char *buf)
+{
+	struct governor_priv *priv;
+	int count = 0;
+
+	mutex_lock(&governor_lock);
+	list_for_each_entry(priv, &governor_list, entry)
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
+				   priv->gov->name);
+	mutex_unlock(&governor_lock);
+
+	return count;
+}
+
+int watchdog_pretimeout_governor_get(struct watchdog_device *wdd, char *buf)
+{
+	int len;
+
+	/* wdd->gov is guarded by pretimeout_lock; no governor prints nothing */
+	spin_lock_irq(&pretimeout_lock);
+	len = wdd->gov ? sprintf(buf, "%s\n", wdd->gov->name) : 0;
+	spin_unlock_irq(&pretimeout_lock);
+
+	return len;
+}
+
+int watchdog_pretimeout_governor_set(struct watchdog_device *wdd,
+				     const char *buf)
+{
+	struct governor_priv *found;
+	int ret = -EINVAL;
+
+	/* The governor list is only walked with governor_lock held */
+	mutex_lock(&governor_lock);
+
+	found = find_governor_by_name(buf);
+	if (found) {
+		/* Rebind under the spinlock that guards wdd->gov */
+		spin_lock_irq(&pretimeout_lock);
+		wdd->gov = found->gov;
+		spin_unlock_irq(&pretimeout_lock);
+		ret = 0;
+	}
+	mutex_unlock(&governor_lock);
+
+	return ret;
+}
+
+/*
+ * Invoke the pretimeout hook of the governor bound to wdd, if there is one.
+ * The hook runs with pretimeout_lock held (taken via spin_lock_irqsave()).
+ */
+void watchdog_notify_pretimeout(struct watchdog_device *wdd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pretimeout_lock, flags);
+	if (wdd->gov)
+		wdd->gov->pretimeout(wdd);
+	spin_unlock_irqrestore(&pretimeout_lock, flags);
+}
+EXPORT_SYMBOL_GPL(watchdog_notify_pretimeout);
+
+int watchdog_register_governor(struct watchdog_governor *gov)
+{
+	struct watchdog_pretimeout *p;
+	struct governor_priv *node;
+	int ret = 0;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	mutex_lock(&governor_lock);
+	/* A second governor with an already registered name is refused */
+	if (find_governor_by_name(gov->name)) {
+		kfree(node);
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	node->gov = gov;
+	list_add(&node->entry, &governor_list);
+
+	if (strncmp(gov->name, WATCHDOG_PRETIMEOUT_DEFAULT_GOV,
+		    WATCHDOG_GOV_NAME_MAXLEN) == 0) {
+		/* New default: hand it to every device that has no governor */
+		spin_lock_irq(&pretimeout_lock);
+		default_gov = gov;
+		list_for_each_entry(p, &pretimeout_list, entry)
+			if (!p->wdd->gov)
+				p->wdd->gov = default_gov;
+		spin_unlock_irq(&pretimeout_lock);
+	}
+unlock:
+	mutex_unlock(&governor_lock);
+	return ret;
+}
+EXPORT_SYMBOL(watchdog_register_governor);
+
+void watchdog_unregister_governor(struct watchdog_governor *gov)
+{
+	struct watchdog_pretimeout *p;
+	struct governor_priv *node, *next;
+
+	mutex_lock(&governor_lock);
+	/* Unlink and free the list node wrapping gov, if one exists */
+	list_for_each_entry_safe(node, next, &governor_list, entry) {
+		if (node->gov != gov)
+			continue;
+		list_del(&node->entry);
+		kfree(node);
+		break;
+	}
+
+	/* Devices still bound to gov are switched over to default_gov */
+	spin_lock_irq(&pretimeout_lock);
+	list_for_each_entry(p, &pretimeout_list, entry)
+		if (p->wdd->gov == gov)
+			p->wdd->gov = default_gov;
+	spin_unlock_irq(&pretimeout_lock);
+	mutex_unlock(&governor_lock);
+}
+EXPORT_SYMBOL(watchdog_unregister_governor);
+
+int watchdog_register_pretimeout(struct watchdog_device *wdd)
+{
+	struct watchdog_pretimeout *node;
+
+	/* Devices without WDIOF_PRETIMEOUT are not put on pretimeout_list */
+	if (!(wdd->info->options & WDIOF_PRETIMEOUT))
+		return 0;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+	node->wdd = wdd;
+
+	spin_lock_irq(&pretimeout_lock);
+	list_add(&node->entry, &pretimeout_list);
+	wdd->gov = default_gov;
+	spin_unlock_irq(&pretimeout_lock);
+	return 0;
+}
+
+/* Unbind wdd's governor and release its pretimeout_list entry, if any */
+void watchdog_unregister_pretimeout(struct watchdog_device *wdd)
+{
+	struct watchdog_pretimeout *p, *t;
+
+	if (!(wdd->info->options & WDIOF_PRETIMEOUT))
+		return;
+
+	spin_lock_irq(&pretimeout_lock);
+	wdd->gov = NULL;
+
+	list_for_each_entry_safe(p, t, &pretimeout_list, entry) {
+		if (p->wdd == wdd) {
+			list_del(&p->entry);
+			kfree(p);	/* p is a real entry only on a match */
+			break;
+		}
+	}
+	spin_unlock_irq(&pretimeout_lock);
+}
diff --git a/drivers/watchdog/watchdog_pretimeout.h b/drivers/watchdog/watchdog_pretimeout.h
new file mode 100644
index 0000000..a5a32b3
--- /dev/null
+++ b/drivers/watchdog/watchdog_pretimeout.h
@@ -0,0 +1,60 @@
+#ifndef __WATCHDOG_PRETIMEOUT_H
+#define __WATCHDOG_PRETIMEOUT_H
+
+#define WATCHDOG_GOV_NAME_MAXLEN	20	/* size of watchdog_governor.name[] */
+
+struct watchdog_device;
+
+struct watchdog_governor {
+	const char	name[WATCHDOG_GOV_NAME_MAXLEN];
+	void		(*pretimeout)(struct watchdog_device *wdd);
+};
+
+#if IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_GOV)
+/* Interfaces for pretimeout governors: add/remove a governor by descriptor */
+int watchdog_register_governor(struct watchdog_governor *gov);
+void watchdog_unregister_governor(struct watchdog_governor *gov);
+
+/* Interfaces to watchdog_dev.c: device tracking and sysfs backends */
+int watchdog_register_pretimeout(struct watchdog_device *wdd);
+void watchdog_unregister_pretimeout(struct watchdog_device *wdd);
+int watchdog_pretimeout_available_governors_get(char *buf);
+int watchdog_pretimeout_governor_get(struct watchdog_device *wdd, char *buf);
+int watchdog_pretimeout_governor_set(struct watchdog_device *wdd,
+				     const char *buf);
+
+#if IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP)
+#define WATCHDOG_PRETIMEOUT_DEFAULT_GOV		"noop"
+#elif IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC)
+#define WATCHDOG_PRETIMEOUT_DEFAULT_GOV		"panic"
+#endif /* default governor name selection */
+
+#else /* !CONFIG_WATCHDOG_PRETIMEOUT_GOV: stubs for watchdog_dev.c */
+static inline int watchdog_register_pretimeout(struct watchdog_device *wdd)
+{
+	return 0;
+}
+
+static inline void watchdog_unregister_pretimeout(struct watchdog_device *wdd)
+{
+}
+
+static inline int watchdog_pretimeout_available_governors_get(char *buf)
+{
+	return -EINVAL;
+}
+
+static inline int watchdog_pretimeout_governor_get(struct watchdog_device *wdd,
+						   char *buf)
+{
+	return -EINVAL;
+}
+
+static inline int watchdog_pretimeout_governor_set(struct watchdog_device *wdd,
+						   const char *buf)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_WATCHDOG_PRETIMEOUT_GOV */
+
+#endif /* __WATCHDOG_PRETIMEOUT_H */
diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c
index fa1efef..b4e0cea 100644
--- a/drivers/watchdog/ziirave_wdt.c
+++ b/drivers/watchdog/ziirave_wdt.c
@@ -18,7 +18,10 @@
  * GNU General Public License for more details.
  */
 
+#include <linux/delay.h>
 #include <linux/i2c.h>
+#include <linux/ihex.h>
+#include <linux/firmware.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -36,6 +39,8 @@
 #define ZIIRAVE_STATE_OFF	0x1
 #define ZIIRAVE_STATE_ON	0x2
 
+#define ZIIRAVE_FW_NAME		"ziirave_wdt.fw"
+
 static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL,
 				  "host request", NULL, "illegal configuration",
 				  "illegal instruction", "illegal trap",
@@ -50,12 +55,35 @@
 #define ZIIRAVE_WDT_PING		0x9
 #define ZIIRAVE_WDT_RESET_DURATION	0xa
 
+#define ZIIRAVE_FIRM_PKT_TOTAL_SIZE	20
+#define ZIIRAVE_FIRM_PKT_DATA_SIZE	16
+#define ZIIRAVE_FIRM_FLASH_MEMORY_START	0x1600
+#define ZIIRAVE_FIRM_FLASH_MEMORY_END	0x2bbf
+
+/* Received and ready for next Download packet. */
+#define ZIIRAVE_FIRM_DOWNLOAD_ACK	1
+/* Currently writing to flash. Retry Download status in a moment! */
+#define ZIIRAVE_FIRM_DOWNLOAD_BUSY	2
+
+/* Wait for ACK timeout in ms */
+#define ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT	50
+
+/* Firmware commands */
+#define ZIIRAVE_CMD_DOWNLOAD_START		0x10
+#define ZIIRAVE_CMD_DOWNLOAD_END		0x11
+#define ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR	0x12
+#define ZIIRAVE_CMD_DOWNLOAD_READ_BYTE		0x13
+#define ZIIRAVE_CMD_RESET_PROCESSOR		0x0b
+#define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER		0x0c
+#define ZIIRAVE_CMD_DOWNLOAD_PACKET		0x0e
+
 struct ziirave_wdt_rev {
 	unsigned char major;
 	unsigned char minor;
 };
 
 struct ziirave_wdt_data {
+	struct mutex sysfs_mutex;
 	struct watchdog_device wdd;
 	struct ziirave_wdt_rev bootloader_rev;
 	struct ziirave_wdt_rev firmware_rev;
@@ -146,6 +174,293 @@
 	return ret;
 }
 
+static int ziirave_firm_wait_for_ack(struct watchdog_device *wdd)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	unsigned long expiry;
+	int status;
+
+	expiry = jiffies + msecs_to_jiffies(ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT);
+	for (;;) {
+		if (time_after(jiffies, expiry))
+			return -ETIMEDOUT;
+
+		usleep_range(5000, 10000);
+
+		status = i2c_smbus_read_byte(client);
+		if (status < 0) {
+			dev_err(&client->dev, "Failed to read byte\n");
+			return status;
+		}
+		if (status != ZIIRAVE_FIRM_DOWNLOAD_BUSY)
+			return status == ZIIRAVE_FIRM_DOWNLOAD_ACK ? 0 : -EIO;
+	}
+}
+
+/*
+ * Issue DOWNLOAD_SET_READ_ADDR; the 16-bit address is sent low byte first.
+ */
+static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u16 addr)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	u8 address[2] = { addr & 0xff, (addr >> 8) & 0xff };
+
+	return i2c_smbus_write_block_data(client,
+					  ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR,
+					  ARRAY_SIZE(address), address);
+}
+
+static int ziirave_firm_write_block_data(struct watchdog_device *wdd,
+					 u8 command, u8 length, const u8 *data,
+					 bool wait_for_ack)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	int err = i2c_smbus_write_block_data(client, command, length, data);
+
+	if (err) {
+		dev_err(&client->dev,
+			"Failed to send command 0x%02x: %d\n", command, err);
+		return err;
+	}
+
+	if (!wait_for_ack)
+		return 0;	/* caller did not ask to wait for an ACK */
+
+	return ziirave_firm_wait_for_ack(wdd);
+}
+
+/* One-byte payload wrapper around ziirave_firm_write_block_data() */
+static int ziirave_firm_write_byte(struct watchdog_device *wdd, u8 command,
+				   u8 byte, bool wait_for_ack)
+{
+	return ziirave_firm_write_block_data(wdd, command, 1, &byte,
+					     wait_for_ack);
+}
+
+/*
+ * ziirave_firm_write_pkt() - Assemble one firmware packet and send it
+ *
+ * The 20-byte packet handed to the device is laid out as:
+ *     Length | Addr0 | Addr1 | Data0 .. Data15 | Checksum |
+ * with
+ *     Length: number of valid data bytes in this packet.
+ *     Addr0: low byte of the target address.
+ *     Addr1: high byte of the target address.
+ *     Data0 .. Data15: 16-byte data field, zero padded.
+ *     Checksum: 8-bit sum of all preceding packet bytes.
+ */
+static int ziirave_firm_write_pkt(struct watchdog_device *wdd,
+				  const struct ihex_binrec *rec)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	u8 packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE] = { 0 };
+	const u16 len = be16_to_cpu(rec->len);
+	u8 idx, sum = 0;
+	u16 addr;
+	int err;
+
+	/* The payload has to fit the fixed-size data field */
+	if (len > ZIIRAVE_FIRM_PKT_DATA_SIZE)
+		return -EMSGSIZE;
+
+	/* Header: length, then the halved record address (low byte first) */
+	addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1;
+	packet[0] = (u8)len;
+	packet[1] = addr & 0xff;
+	packet[2] = (addr & 0xff00) >> 8;
+
+	memcpy(packet + 3, rec->data, len);
+
+	/* Last byte: 8-bit sum of everything in front of it */
+	for (idx = 0; idx < ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1; idx++)
+		sum += packet[idx];
+	packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1] = sum;
+
+	err = ziirave_firm_write_block_data(wdd, ZIIRAVE_CMD_DOWNLOAD_PACKET,
+					    ARRAY_SIZE(packet), packet, true);
+	if (err)
+		dev_err(&client->dev,
+		      "Failed to write firmware packet at address 0x%04x: %d\n",
+		      addr, err);
+
+	return err;
+}
+
+/* Read back every record that falls in the flash window; compare with fw */
+static int ziirave_firm_verify(struct watchdog_device *wdd,
+			       const struct firmware *fw)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	u8 readback[ZIIRAVE_FIRM_PKT_DATA_SIZE];
+	const struct ihex_binrec *rec;
+	int idx, ret;
+	u16 addr;
+
+	/* A NULL record or a zero-length record ends the walk */
+	for (rec = (void *)fw->data; rec && be16_to_cpu(rec->len);
+	     rec = ihex_next_binrec(rec)) {
+		addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1;
+		if (addr < ZIIRAVE_FIRM_FLASH_MEMORY_START ||
+		    addr > ZIIRAVE_FIRM_FLASH_MEMORY_END)
+			continue;
+
+		ret = ziirave_firm_set_read_addr(wdd, addr);
+		if (ret) {
+			dev_err(&client->dev,
+				"Failed to send SET_READ_ADDR command: %d\n",
+				ret);
+			return ret;
+		}
+
+		for (idx = 0; idx < ARRAY_SIZE(readback); idx++) {
+			ret = i2c_smbus_read_byte_data(client,
+						ZIIRAVE_CMD_DOWNLOAD_READ_BYTE);
+			if (ret < 0) {
+				dev_err(&client->dev,
+					"Failed to READ DATA: %d\n", ret);
+				return ret;
+			}
+			readback[idx] = ret;
+		}
+
+		/* Only the record's own length takes part in the comparison */
+		if (memcmp(readback, rec->data, be16_to_cpu(rec->len))) {
+			dev_err(&client->dev,
+				"Firmware mismatch at address 0x%04x\n", addr);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Flash fw through the bootloader, verify it, then reset the processor */
+static int ziirave_firm_upload(struct watchdog_device *wdd,
+			       const struct firmware *fw)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	int ret, words_till_page_break;
+	const struct ihex_binrec *rec;
+	struct ihex_binrec *rec_new;
+
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, 1,
+				      false);
+	if (ret)
+		return ret;
+
+	msleep(500);
+
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_START, 1, true);
+	if (ret)
+		return ret;
+
+	msleep(500);
+
+	for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) {
+		/* Zero length marks end of records */
+		if (!be16_to_cpu(rec->len))
+			break;
+
+		/* Check max data size */
+		if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE) {
+			dev_err(&client->dev, "Firmware packet too long (%d)\n",
+				be16_to_cpu(rec->len));
+			return -EMSGSIZE;
+		}
+
+		/* Calculate words till page break */
+		words_till_page_break = (64 - ((be32_to_cpu(rec->addr) >> 1) &
+					 0x3f));
+		if ((be16_to_cpu(rec->len) >> 1) > words_till_page_break) {
+			/*
+			 * The data crosses a page boundary, so it is split in
+			 * two blocks of data. Create a packet with the first
+			 * block of data.
+			 */
+			rec_new = kzalloc(sizeof(struct ihex_binrec) +
+					  (words_till_page_break << 1),
+					  GFP_KERNEL);
+			if (!rec_new)
+				return -ENOMEM;
+
+			rec_new->len = cpu_to_be16(words_till_page_break << 1);
+			rec_new->addr = rec->addr;
+			memcpy(rec_new->data, rec->data,
+			       be16_to_cpu(rec_new->len));
+
+			ret = ziirave_firm_write_pkt(wdd, rec_new);
+			kfree(rec_new);
+			if (ret)
+				return ret;
+
+			/* Create a packet with the second block of data */
+			rec_new = kzalloc(sizeof(struct ihex_binrec) +
+					  be16_to_cpu(rec->len) -
+					  (words_till_page_break << 1),
+					  GFP_KERNEL);
+			if (!rec_new)
+				return -ENOMEM;
+
+			/* Remaining bytes: subtract in CPU order, convert once */
+			rec_new->len = cpu_to_be16(be16_to_cpu(rec->len) -
+					(words_till_page_break << 1));
+			rec_new->addr = cpu_to_be32(be32_to_cpu(rec->addr) +
+					(words_till_page_break << 1));
+
+			memcpy(rec_new->data,
+			       rec->data + (words_till_page_break << 1),
+			       be16_to_cpu(rec_new->len));
+
+			ret = ziirave_firm_write_pkt(wdd, rec_new);
+			kfree(rec_new);
+			if (ret)
+				return ret;
+		} else {
+			ret = ziirave_firm_write_pkt(wdd, rec);
+			if (ret)
+				return ret;
+		}
+	}
+
+	/* For end of download, the length field will be set to 0 */
+	rec_new = kzalloc(sizeof(struct ihex_binrec) + 1, GFP_KERNEL);
+	if (!rec_new)
+		return -ENOMEM;
+
+	ret = ziirave_firm_write_pkt(wdd, rec_new);
+	kfree(rec_new);
+	if (ret) {
+		dev_err(&client->dev, "Failed to send EMPTY packet: %d\n", ret);
+		return ret;
+	}
+
+	/* This sleep seems to be required */
+	msleep(20);
+
+	/* Start firmware verification */
+	ret = ziirave_firm_verify(wdd, fw);
+	if (ret) {
+		dev_err(&client->dev,
+			"Failed to verify firmware: %d\n", ret);
+		return ret;
+	}
+
+	/* End download operation */
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_END, 1, false);
+	if (ret)
+		return ret;
+
+	/* Reset the processor */
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_RESET_PROCESSOR, 1,
+				      false);
+	if (ret)
+		return ret;
+
+	msleep(500);
+
+	return 0;
+}
+
 static const struct watchdog_info ziirave_wdt_info = {
 	.options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
 	.identity = "Zodiac RAVE Watchdog",
@@ -166,9 +481,18 @@
 {
 	struct i2c_client *client = to_i2c_client(dev->parent);
 	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	int ret;
 
-	return sprintf(buf, "02.%02u.%02u", w_priv->firmware_rev.major,
-		       w_priv->firmware_rev.minor);
+	ret = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "02.%02u.%02u", w_priv->firmware_rev.major,
+		      w_priv->firmware_rev.minor);
+
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+	return ret;
 }
 
 static DEVICE_ATTR(firmware_version, S_IRUGO, ziirave_wdt_sysfs_show_firm,
@@ -180,9 +504,18 @@
 {
 	struct i2c_client *client = to_i2c_client(dev->parent);
 	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	int ret;
 
-	return sprintf(buf, "01.%02u.%02u", w_priv->bootloader_rev.major,
-		       w_priv->bootloader_rev.minor);
+	ret = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "01.%02u.%02u", w_priv->bootloader_rev.major,
+		      w_priv->bootloader_rev.minor);
+
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+	return ret;
 }
 
 static DEVICE_ATTR(bootloader_version, S_IRUGO, ziirave_wdt_sysfs_show_boot,
@@ -194,17 +527,81 @@
 {
 	struct i2c_client *client = to_i2c_client(dev->parent);
 	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	int ret;
 
-	return sprintf(buf, "%s", ziirave_reasons[w_priv->reset_reason]);
+	ret = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "%s", ziirave_reasons[w_priv->reset_reason]);
+
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+	return ret;
 }
 
 static DEVICE_ATTR(reset_reason, S_IRUGO, ziirave_wdt_sysfs_show_reason,
 		   NULL);
 
+/* sysfs "update_firmware": any write flashes ZIIRAVE_FW_NAME; buf is unused */
+static ssize_t ziirave_wdt_sysfs_store_firm(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev->parent);
+	struct ziirave_wdt_data *priv = i2c_get_clientdata(client);
+	const struct firmware *image;
+	int err;
+
+	err = request_ihex_firmware(&image, ZIIRAVE_FW_NAME, dev);
+	if (err) {
+		dev_err(&client->dev, "Failed to request ihex firmware\n");
+		return err;
+	}
+
+	err = mutex_lock_interruptible(&priv->sysfs_mutex);
+	if (err)
+		goto out_release;
+
+	err = ziirave_firm_upload(&priv->wdd, image);
+	if (err) {
+		dev_err(&client->dev, "The firmware update failed: %d\n", err);
+		goto out_unlock;
+	}
+
+	/* Refresh the cached firmware revision from the device */
+	err = ziirave_wdt_revision(client, &priv->firmware_rev,
+				   ZIIRAVE_WDT_FIRM_VER_MAJOR);
+	if (err) {
+		dev_err(&client->dev, "Failed to read firmware version: %d\n",
+			err);
+		goto out_unlock;
+	}
+
+	dev_info(&client->dev, "Firmware updated to version 02.%02u.%02u\n",
+		 priv->firmware_rev.major, priv->firmware_rev.minor);
+
+	/* Re-apply the timeout currently stored in the watchdog device */
+	err = ziirave_wdt_set_timeout(&priv->wdd, priv->wdd.timeout);
+	if (err)
+		dev_err(&client->dev, "Failed to set timeout: %d\n", err);
+
+out_unlock:
+	mutex_unlock(&priv->sysfs_mutex);
+out_release:
+	release_firmware(image);
+
+	return err ? err : count;
+}
+
+static DEVICE_ATTR(update_firmware, S_IWUSR, NULL,
+		   ziirave_wdt_sysfs_store_firm);
+
 static struct attribute *ziirave_wdt_attrs[] = {
 	&dev_attr_firmware_version.attr,
 	&dev_attr_bootloader_version.attr,
 	&dev_attr_reset_reason.attr,
+	&dev_attr_update_firmware.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(ziirave_wdt);
@@ -252,6 +649,8 @@
 	if (!w_priv)
 		return -ENOMEM;
 
+	mutex_init(&w_priv->sysfs_mutex);
+
 	w_priv->wdd.info = &ziirave_wdt_info;
 	w_priv->wdd.ops = &ziirave_wdt_ops;
 	w_priv->wdd.min_timeout = ZIIRAVE_TIMEOUT_MIN;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c1e9f29..f2d7402 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1209,6 +1209,8 @@
 COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
 COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
 COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT)
 /* Big R */
 COMPATIBLE_IOCTL(RNDGETENTCNT)
 COMPATIBLE_IOCTL(RNDADDTOENTCNT)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 207ba8d..a4b531b 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -428,10 +428,10 @@
 	if (!nop || !nop->fh_to_dentry)
 		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
-	if (!result)
-		result = ERR_PTR(-ESTALE);
-	if (IS_ERR(result))
-		return result;
+	if (PTR_ERR(result) == -ENOMEM)
+		return ERR_CAST(result);
+	if (IS_ERR_OR_NULL(result))
+		return ERR_PTR(-ESTALE);
 
 	if (d_is_dir(result)) {
 		/*
@@ -541,6 +541,8 @@
 
  err_result:
 	dput(result);
+	if (err != -ENOMEM)
+		err = -ESTALE;
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(exportfs_decode_fh);
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 5f7b053..6de1570 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -76,7 +76,7 @@
 
 	dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
 
-	complete_all(&dreq->completion);
+	complete(&dreq->completion);
 	nfs_cache_defer_req_put(dreq);
 }
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 52a2831..532d8e2 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -31,8 +31,6 @@
 struct nfs_callback_data {
 	unsigned int users;
 	struct svc_serv *serv;
-	struct svc_rqst *rqst;
-	struct task_struct *task;
 };
 
 static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
@@ -89,15 +87,6 @@
 	return 0;
 }
 
-/*
- * Prepare to bring up the NFSv4 callback service
- */
-static struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv)
-{
-	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-}
-
 #if defined(CONFIG_NFS_V4_1)
 /*
  * The callback service for NFSv4.1 callbacks
@@ -139,29 +128,6 @@
 	return 0;
 }
 
-/*
- * Bring up the NFSv4.1 callback service
- */
-static struct svc_rqst *
-nfs41_callback_up(struct svc_serv *serv)
-{
-	struct svc_rqst *rqstp;
-
-	INIT_LIST_HEAD(&serv->sv_cb_list);
-	spin_lock_init(&serv->sv_cb_lock);
-	init_waitqueue_head(&serv->sv_cb_waitq);
-	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
-	return rqstp;
-}
-
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
-		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
-	*rqstpp = nfs41_callback_up(serv);
-	*callback_svc = nfs41_callback_svc;
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct svc_serv *serv)
 {
@@ -173,13 +139,6 @@
 		xprt->bc_serv = serv;
 }
 #else
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
-		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
-	*rqstpp = ERR_PTR(-ENOTSUPP);
-	*callback_svc = ERR_PTR(-ENOTSUPP);
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct svc_serv *serv)
 {
@@ -189,45 +148,22 @@
 static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 				  struct svc_serv *serv)
 {
-	struct svc_rqst *rqstp;
-	int (*callback_svc)(void *vrqstp);
-	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	int nrservs = nfs_callback_nr_threads;
 	int ret;
 
 	nfs_callback_bc_serv(minorversion, xprt, serv);
 
-	if (cb_info->task)
+	if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
+		nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
+
+	if (serv->sv_nrthreads-1 == nrservs)
 		return 0;
 
-	switch (minorversion) {
-	case 0:
-		/* v4.0 callback setup */
-		rqstp = nfs4_callback_up(serv);
-		callback_svc = nfs4_callback_svc;
-		break;
-	default:
-		nfs_minorversion_callback_svc_setup(serv,
-				&rqstp, &callback_svc);
-	}
-
-	if (IS_ERR(rqstp))
-		return PTR_ERR(rqstp);
-
-	svc_sock_update_bufs(serv);
-
-	cb_info->serv = serv;
-	cb_info->rqst = rqstp;
-	cb_info->task = kthread_create(callback_svc, cb_info->rqst,
-				    "nfsv4.%u-svc", minorversion);
-	if (IS_ERR(cb_info->task)) {
-		ret = PTR_ERR(cb_info->task);
-		svc_exit_thread(cb_info->rqst);
-		cb_info->rqst = NULL;
-		cb_info->task = NULL;
+	ret = serv->sv_ops->svo_setup(serv, NULL, nrservs);
+	if (ret) {
+		serv->sv_ops->svo_setup(serv, NULL, 0);
 		return ret;
 	}
-	rqstp->rq_task = cb_info->task;
-	wake_up_process(cb_info->task);
 	dprintk("nfs_callback_up: service started\n");
 	return 0;
 }
@@ -281,19 +217,41 @@
 	return ret;
 }
 
-static struct svc_serv_ops nfs_cb_sv_ops = {
+static struct svc_serv_ops nfs40_cb_sv_ops = {
+	.svo_function		= nfs4_callback_svc,
 	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
+	.svo_setup		= svc_set_num_threads,
+	.svo_module		= THIS_MODULE,
 };
+#if defined(CONFIG_NFS_V4_1)
+static struct svc_serv_ops nfs41_cb_sv_ops = {
+	.svo_function		= nfs41_callback_svc,
+	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
+	.svo_setup		= svc_set_num_threads,
+	.svo_module		= THIS_MODULE,
+};
+
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+	[0] = &nfs40_cb_sv_ops,
+	[1] = &nfs41_cb_sv_ops,
+};
+#else
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+	[0] = &nfs40_cb_sv_ops,
+	[1] = NULL,
+};
+#endif
 
 static struct svc_serv *nfs_callback_create_svc(int minorversion)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 	struct svc_serv *serv;
+	struct svc_serv_ops *sv_ops;
 
 	/*
 	 * Check whether we're already up and running.
 	 */
-	if (cb_info->task) {
+	if (cb_info->serv) {
 		/*
 		 * Note: increase service usage, because later in case of error
 		 * svc_destroy() will be called.
@@ -302,6 +260,17 @@
 		return cb_info->serv;
 	}
 
+	switch (minorversion) {
+	case 0:
+		sv_ops = nfs4_cb_sv_ops[0];
+		break;
+	default:
+		sv_ops = nfs4_cb_sv_ops[1];
+	}
+
+	if (sv_ops == NULL)
+		return ERR_PTR(-ENOTSUPP);
+
 	/*
 	 * Sanity check: if there's no task,
 	 * we should be the first user ...
@@ -310,11 +279,12 @@
 		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
 			cb_info->users);
 
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
 	if (!serv) {
 		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
 		return ERR_PTR(-ENOMEM);
 	}
+	cb_info->serv = serv;
 	/* As there is only one thread we need to over-ride the
 	 * default maximum of 80 connections
 	 */
@@ -357,6 +327,8 @@
 	 * thread exits.
 	 */
 err_net:
+	if (!cb_info->users)
+		cb_info->serv = NULL;
 	svc_destroy(serv);
 err_create:
 	mutex_unlock(&nfs_callback_mutex);
@@ -374,18 +346,18 @@
 void nfs_callback_down(int minorversion, struct net *net)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct svc_serv *serv;
 
 	mutex_lock(&nfs_callback_mutex);
-	nfs_callback_down_net(minorversion, cb_info->serv, net);
+	serv = cb_info->serv;
+	nfs_callback_down_net(minorversion, serv, net);
 	cb_info->users--;
-	if (cb_info->users == 0 && cb_info->task != NULL) {
-		kthread_stop(cb_info->task);
-		dprintk("nfs_callback_down: service stopped\n");
-		svc_exit_thread(cb_info->rqst);
+	if (cb_info->users == 0) {
+		svc_get(serv);
+		serv->sv_ops->svo_setup(serv, NULL, 0);
+		svc_destroy(serv);
 		dprintk("nfs_callback_down: service destroyed\n");
 		cb_info->serv = NULL;
-		cb_info->rqst = NULL;
-		cb_info->task = NULL;
 	}
 	mutex_unlock(&nfs_callback_mutex);
 }
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5fe1cec..c701c30 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -179,6 +179,15 @@
 	struct cb_devicenotifyargs *args,
 	void *dummy, struct cb_process_state *cps);
 
+struct cb_notify_lock_args {
+	struct nfs_fh			cbnl_fh;
+	struct nfs_lowner		cbnl_owner;
+	bool				cbnl_valid;
+};
+
+extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args,
+					 void *dummy,
+					 struct cb_process_state *cps);
 #endif /* CONFIG_NFS_V4_1 */
 extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
@@ -198,6 +207,9 @@
 #define NFS41_BC_MIN_CALLBACKS 1
 #define NFS41_BC_MAX_CALLBACKS 1
 
+#define NFS4_MIN_NR_CALLBACK_THREADS 1
+
 extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_nr_threads;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f953ef6..e9aa235e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -628,4 +628,20 @@
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
 }
+
+__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy,
+				 struct cb_process_state *cps)
+{
+	if (!cps->clp) /* set in cb_sequence */
+		return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	/* Don't wake anybody if the string looked bogus */
+	if (args->cbnl_valid)
+		__wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args);
+
+	return htonl(NFS4_OK);
+}
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 656f68f..eb094c6 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -35,6 +35,7 @@
 					 (1 + 3) * 4) // seqid, 3 slotids
 #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_RECALLSLOT_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_NOTIFY_LOCK_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #endif /* CONFIG_NFS_V4_1 */
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -72,7 +73,7 @@
 	return xdr_ressize_check(rqstp, p);
 }
 
-static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
+static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
 {
 	__be32 *p;
 
@@ -534,6 +535,49 @@
 	return 0;
 }
 
+static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+	__be32		*p;
+	unsigned int	len;
+
+	p = read_buf(xdr, 12);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+
+	p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
+	len = be32_to_cpu(*p);
+
+	p = read_buf(xdr, len);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+
+	/* Only try to decode if the length is right */
+	if (len == 20) {
+		p += 2;	/* skip "lock id:" */
+		args->cbnl_owner.s_dev = be32_to_cpu(*p++);
+		xdr_decode_hyper(p, &args->cbnl_owner.id);
+		args->cbnl_valid = true;
+	} else {
+		args->cbnl_owner.s_dev = 0;
+		args->cbnl_owner.id = 0;
+		args->cbnl_valid = false;
+	}
+	return 0;
+}
+
+static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+	__be32 status;
+
+	status = decode_fh(xdr, &args->cbnl_fh);
+	if (unlikely(status != 0))
+		goto out;
+	status = decode_lockowner(xdr, args);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -746,6 +790,7 @@
 	case OP_CB_RECALL_SLOT:
 	case OP_CB_LAYOUTRECALL:
 	case OP_CB_NOTIFY_DEVICEID:
+	case OP_CB_NOTIFY_LOCK:
 		*op = &callback_ops[op_nr];
 		break;
 
@@ -753,7 +798,6 @@
 	case OP_CB_PUSH_DELEG:
 	case OP_CB_RECALLABLE_OBJ_AVAIL:
 	case OP_CB_WANTS_CANCELLED:
-	case OP_CB_NOTIFY_LOCK:
 		return htonl(NFS4ERR_NOTSUPP);
 
 	default:
@@ -1006,6 +1050,11 @@
 		.decode_args = (callback_decode_arg_t)decode_recallslot_args,
 		.res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
 	},
+	[OP_CB_NOTIFY_LOCK] = {
+		.process_op = (callback_process_op_t)nfs4_callback_notify_lock,
+		.decode_args = (callback_decode_arg_t)decode_notify_lock_args,
+		.res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ,
+	},
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1e10678..7555ba8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -313,7 +313,10 @@
 			continue;
 		/* Match the full socket address */
 		if (!rpc_cmp_addr_port(sap, clap))
-			continue;
+			/* Match all xprt_switch full socket addresses */
+			if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
+							   sap))
+				continue;
 
 		atomic_inc(&clp->cl_count);
 		return clp;
@@ -785,7 +788,8 @@
 	}
 
 	fsinfo.fattr = fattr;
-	fsinfo.layouttype = 0;
+	fsinfo.nlayouttypes = 0;
+	memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -1078,7 +1082,7 @@
 	idr_init(&nn->cb_ident_idr);
 #endif
 	spin_lock_init(&nn->nfs_client_lock);
-	nn->boot_time = CURRENT_TIME;
+	nn->boot_time = ktime_get_real();
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 322c258..dff600a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,6 +41,17 @@
 	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
+static bool
+nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+		fmode_t flags)
+{
+	if (delegation != NULL && (delegation->type & flags) == flags &&
+	    !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		return true;
+	return false;
+}
+
 static int
 nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
@@ -50,8 +61,7 @@
 	flags &= FMODE_READ|FMODE_WRITE;
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL && (delegation->type & flags) == flags &&
-	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+	if (nfs4_is_valid_delegation(delegation, flags)) {
 		if (mark)
 			nfs_mark_delegation_referenced(delegation);
 		ret = 1;
@@ -185,15 +195,13 @@
 			rcu_read_unlock();
 			put_rpccred(oldcred);
 			trace_nfs4_reclaim_delegation(inode, res->delegation_type);
-		} else {
-			/* We appear to have raced with a delegation return. */
-			spin_unlock(&delegation->lock);
-			rcu_read_unlock();
-			nfs_inode_set_delegation(inode, cred, res);
+			return;
 		}
-	} else {
-		rcu_read_unlock();
+		/* We appear to have raced with a delegation return. */
+		spin_unlock(&delegation->lock);
 	}
+	rcu_read_unlock();
+	nfs_inode_set_delegation(inode, cred, res);
 }
 
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -642,28 +650,49 @@
 	rcu_read_unlock();
 }
 
-static void nfs_revoke_delegation(struct inode *inode)
+static void nfs_mark_delegation_revoked(struct nfs_server *server,
+		struct nfs_delegation *delegation)
 {
-	struct nfs_delegation *delegation;
-	rcu_read_lock();
-	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL) {
-		set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
-		nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
-	}
-	rcu_read_unlock();
+	set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+	delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
+	nfs_mark_return_delegation(server, delegation);
 }
 
-void nfs_remove_bad_delegation(struct inode *inode)
+static bool nfs_revoke_delegation(struct inode *inode,
+		const nfs4_stateid *stateid)
+{
+	struct nfs_delegation *delegation;
+	nfs4_stateid tmp;
+	bool ret = false;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL)
+		goto out;
+	if (stateid == NULL) {
+		nfs4_stateid_copy(&tmp, &delegation->stateid);
+		stateid = &tmp;
+	} else if (!nfs4_stateid_match(stateid, &delegation->stateid))
+		goto out;
+	nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
+	ret = true;
+out:
+	rcu_read_unlock();
+	if (ret)
+		nfs_inode_find_state_and_recover(inode, stateid);
+	return ret;
+}
+
+void nfs_remove_bad_delegation(struct inode *inode,
+		const nfs4_stateid *stateid)
 {
 	struct nfs_delegation *delegation;
 
-	nfs_revoke_delegation(inode);
+	if (!nfs_revoke_delegation(inode, stateid))
+		return;
 	delegation = nfs_inode_detach_delegation(inode);
-	if (delegation) {
-		nfs_inode_find_state_and_recover(inode, &delegation->stateid);
+	if (delegation)
 		nfs_free_delegation(delegation);
-	}
 }
 EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
 
@@ -786,8 +815,15 @@
 {
 	struct nfs_delegation *delegation;
 
-	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		/*
+		 * If the delegation may have been admin revoked, then we
+		 * cannot reclaim it.
+		 */
+		if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags))
+			continue;
 		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+	}
 }
 
 /**
@@ -851,6 +887,141 @@
 	rcu_read_unlock();
 }
 
+static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
+{
+	return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) |
+				BIT(NFS4CLNT_LEASE_EXPIRED) |
+				BIT(NFS4CLNT_SESSION_RESET))) != 0;
+}
+
+static void nfs_mark_test_expired_delegation(struct nfs_server *server,
+	    struct nfs_delegation *delegation)
+{
+	if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE)
+		return;
+	clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+	set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
+}
+
+static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server,
+		struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation)
+		nfs_mark_test_expired_delegation(server, delegation);
+	rcu_read_unlock();
+
+}
+
+static void nfs_delegation_mark_test_expired_server(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+		nfs_mark_test_expired_delegation(server, delegation);
+}
+
+/**
+ * nfs_mark_test_expired_all_delegations - mark all delegations for testing
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * marks them as needing to be checked for validity.
+ */
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_delegation_mark_test_expired_server(server);
+	rcu_read_unlock();
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+	const struct nfs4_minor_version_ops *ops = clp->cl_mvops;
+	struct nfs_delegation *delegation;
+	struct nfs_server *server;
+	struct inode *inode;
+	struct rpc_cred *cred;
+	nfs4_stateid stateid;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry_rcu(delegation, &server->delegations,
+								super_list) {
+			if (test_bit(NFS_DELEGATION_RETURNING,
+						&delegation->flags))
+				continue;
+			if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+						&delegation->flags) == 0)
+				continue;
+			if (!nfs_sb_active(server->super))
+				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL) {
+				rcu_read_unlock();
+				nfs_sb_deactive(server->super);
+				goto restart;
+			}
+			cred = get_rpccred_rcu(delegation->cred);
+			nfs4_stateid_copy(&stateid, &delegation->stateid);
+			clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+			rcu_read_unlock();
+			if (cred != NULL &&
+			    ops->test_and_free_expired(server, &stateid, cred) < 0) {
+				nfs_revoke_delegation(inode, &stateid);
+				nfs_inode_find_state_and_recover(inode, &stateid);
+			}
+			put_rpccred(cred);
+			if (nfs4_server_rebooted(clp)) {
+				nfs_inode_mark_test_expired_delegation(server,inode);
+				iput(inode);
+				nfs_sb_deactive(server->super);
+				return;
+			}
+			iput(inode);
+			nfs_sb_deactive(server->super);
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_delegation *delegation;
+	bool found = false;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation &&
+	    nfs4_stateid_match_other(&delegation->stateid, stateid)) {
+		nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation);
+		found = true;
+	}
+	rcu_read_unlock();
+	if (found)
+		nfs4_schedule_state_manager(clp);
+}
+
 /**
  * nfs_delegations_present - check for existence of delegations
  * @clp: client state handle
@@ -893,7 +1064,7 @@
 	flags &= FMODE_READ|FMODE_WRITE;
 	rcu_read_lock();
 	delegation = rcu_dereference(nfsi->delegation);
-	ret = (delegation != NULL && (delegation->type & flags) == flags);
+	ret = nfs4_is_valid_delegation(delegation, flags);
 	if (ret) {
 		nfs4_stateid_copy(dst, &delegation->stateid);
 		nfs_mark_delegation_referenced(delegation);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 64724d2..e9d5557 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -32,6 +32,7 @@
 	NFS_DELEGATION_REFERENCED,
 	NFS_DELEGATION_RETURNING,
 	NFS_DELEGATION_REVOKED,
+	NFS_DELEGATION_TEST_EXPIRED,
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -47,11 +48,14 @@
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 int nfs_client_return_marked_delegations(struct nfs_client *clp);
 int nfs_delegations_present(struct nfs_client *clp);
-void nfs_remove_bad_delegation(struct inode *inode);
+void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid);
 
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_reap_expired_delegations(struct nfs_client *clp);
+
 /* NFSv4 delegation-related procedures */
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
@@ -62,6 +66,8 @@
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 bool nfs4_delegation_flush_on_close(const struct inode *inode);
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid);
 
 #endif
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e0bf0..5f1af4c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -435,11 +435,11 @@
 		return 0;
 
 	nfsi = NFS_I(inode);
-	if (entry->fattr->fileid == nfsi->fileid)
-		return 1;
-	if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
-		return 1;
-	return 0;
+	if (entry->fattr->fileid != nfsi->fileid)
+		return 0;
+	if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0)
+		return 0;
+	return 1;
 }
 
 static
@@ -496,6 +496,14 @@
 		return;
 	if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
 		return;
+	if (filename.len == 0)
+		return;
+	/* Validate that the name doesn't contain any illegal '\0' */
+	if (strnlen(filename.name, filename.len) != filename.len)
+		return;
+	/* ...or '/' */
+	if (strnchr(filename.name, filename.len, '/'))
+		return;
 	if (filename.name[0] == '.') {
 		if (filename.len == 1)
 			return;
@@ -517,6 +525,8 @@
 					&entry->fattr->fsid))
 			goto out;
 		if (nfs_same_file(dentry, entry)) {
+			if (!entry->fh->size)
+				goto out;
 			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 			status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
 			if (!status)
@@ -529,6 +539,10 @@
 			goto again;
 		}
 	}
+	if (!entry->fh->size) {
+		d_lookup_done(dentry);
+		goto out;
+	}
 
 	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
 	alias = d_splice_alias(inode, dentry);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 72b7d13..bd81bcf 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -387,7 +387,7 @@
 		dreq->iocb->ki_complete(dreq->iocb, res, 0);
 	}
 
-	complete_all(&dreq->completion);
+	complete(&dreq->completion);
 
 	nfs_direct_req_release(dreq);
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2efbdde..9ea85ae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -520,7 +520,9 @@
 	.invalidatepage = nfs_invalidate_page,
 	.releasepage = nfs_release_page,
 	.direct_IO = nfs_direct_IO,
+#ifdef CONFIG_MIGRATION
 	.migratepage = nfs_migrate_page,
+#endif
 	.launder_page = nfs_launder_page,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
@@ -685,11 +687,6 @@
 	goto out;
 }
 
-static int do_vfs_lock(struct file *file, struct file_lock *fl)
-{
-	return locks_lock_file_wait(file, fl);
-}
-
 static int
 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@@ -722,7 +719,7 @@
 	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
-		status = do_vfs_lock(filp, fl);
+		status = locks_lock_file_wait(filp, fl);
 	return status;
 }
 
@@ -747,7 +744,7 @@
 	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
-		status = do_vfs_lock(filp, fl);
+		status = locks_lock_file_wait(filp, fl);
 	if (status < 0)
 		goto out;
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 51b5136..98ace12 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1080,7 +1080,7 @@
 	case -NFS4ERR_BAD_STATEID:
 		if (state == NULL)
 			break;
-		nfs_remove_bad_delegation(state->inode);
+		nfs_remove_bad_delegation(state->inode, NULL);
 	case -NFS4ERR_OPENMODE:
 		if (state == NULL)
 			break;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a6acce6..80bcc0b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -534,12 +534,9 @@
 }
 #endif
 
-
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
 		struct page *, struct page *, enum migrate_mode);
-#else
-#define nfs_migrate_page NULL
 #endif
 
 static inline int
@@ -562,7 +559,6 @@
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 			    const struct nfs_client_initdata *);
 extern int nfs40_walk_client_list(struct nfs_client *clp,
@@ -571,6 +567,9 @@
 extern int nfs41_walk_client_list(struct nfs_client *clp,
 				struct nfs_client **result,
 				struct rpc_cred *cred);
+extern int nfs4_test_session_trunk(struct rpc_clnt *,
+				struct rpc_xprt *,
+				void *);
 
 static inline struct inode *nfs_igrab_and_active(struct inode *inode)
 {
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index f0e06e4..fbce0d8 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,7 +29,7 @@
 	int cb_users[NFS4_MAX_MINOR_VERSION + 1];
 #endif
 	spinlock_t nfs_client_lock;
-	struct timespec boot_time;
+	ktime_t boot_time;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *proc_nfsfs;
 #endif
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 64b43b4..608501971 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -443,6 +443,7 @@
 	task = rpc_run_task(&task_setup);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
+	rpc_put_task(task);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9bf64ea..9b3a82a 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -39,6 +39,7 @@
 	NFS4CLNT_BIND_CONN_TO_SESSION,
 	NFS4CLNT_MOVED,
 	NFS4CLNT_LEASE_MOVED,
+	NFS4CLNT_DELEGATION_EXPIRED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -57,8 +58,11 @@
 			struct nfs_fsinfo *);
 	void	(*free_lock_state)(struct nfs_server *,
 			struct nfs4_lock_state *);
+	int	(*test_and_free_expired)(struct nfs_server *,
+			nfs4_stateid *, struct rpc_cred *);
 	struct nfs_seqid *
 		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	int	(*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *);
 	const struct rpc_call_ops *call_sync_ops;
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -156,6 +160,7 @@
 	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
 	NFS_STATE_POSIX_LOCKS,		/* Posix locks are supported */
 	NFS_STATE_RECOVERY_FAILED,	/* OPEN stateid state recovery failed */
+	NFS_STATE_MAY_NOTIFY_LOCK,	/* server may CB_NOTIFY_LOCK */
 };
 
 struct nfs4_state {
@@ -203,6 +208,11 @@
 		struct rpc_cred *);
 };
 
+struct nfs4_add_xprt_data {
+	struct nfs_client	*clp;
+	struct rpc_cred		*cred;
+};
+
 struct nfs4_state_maintenance_ops {
 	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
 	struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
@@ -278,6 +288,8 @@
 		struct nfs_fsinfo *fsinfo);
 extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
 				  bool sync);
+extern int nfs4_detect_session_trunking(struct nfs_client *clp,
+		struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt);
 
 static inline bool
 is_ds_only_client(struct nfs_client *clp)
@@ -439,7 +451,7 @@
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
 extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
-extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
@@ -471,6 +483,7 @@
 struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *);
 extern bool nfs4_disable_idmapping;
 extern unsigned short max_session_slots;
+extern unsigned short max_session_cb_slots;
 extern unsigned short send_implementation_id;
 extern bool recover_lost_locks;
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index cd3b7cf..074ac71 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -199,6 +199,9 @@
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 	clp->cl_mig_gen = 1;
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	init_waitqueue_head(&clp->cl_lock_waitq);
+#endif
 	return clp;
 
 error:
@@ -562,15 +565,15 @@
 /*
  * Returns true if the client IDs match
  */
-static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
+static bool nfs4_match_clientids(u64 a, u64 b)
 {
-	if (a->cl_clientid != b->cl_clientid) {
+	if (a != b) {
 		dprintk("NFS: --> %s client ID %llx does not match %llx\n",
-			__func__, a->cl_clientid, b->cl_clientid);
+			__func__, a, b);
 		return false;
 	}
 	dprintk("NFS: --> %s client ID %llx matches %llx\n",
-		__func__, a->cl_clientid, b->cl_clientid);
+		__func__, a, b);
 	return true;
 }
 
@@ -578,17 +581,15 @@
  * Returns true if the server major ids match
  */
 static bool
-nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+				struct nfs41_server_owner *o2)
 {
-	struct nfs41_server_owner *o1 = a->cl_serverowner;
-	struct nfs41_server_owner *o2 = b->cl_serverowner;
-
 	if (o1->major_id_sz != o2->major_id_sz)
 		goto out_major_mismatch;
 	if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
 		goto out_major_mismatch;
 
-	dprintk("NFS: --> %s server owners match\n", __func__);
+	dprintk("NFS: --> %s server owner major IDs match\n", __func__);
 	return true;
 
 out_major_mismatch:
@@ -597,6 +598,100 @@
 	return false;
 }
 
+/*
+ * Returns true if server minor ids match
+ */
+static bool
+nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1,
+				struct nfs41_server_owner *o2)
+{
+	/* Check eir_server_owner so_minor_id */
+	if (o1->minor_id != o2->minor_id)
+		goto out_minor_mismatch;
+
+	dprintk("NFS: --> %s server owner minor IDs match\n", __func__);
+	return true;
+
+out_minor_mismatch:
+	dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__);
+	return false;
+}
+
+/*
+ * Returns true if the server scopes match
+ */
+static bool
+nfs4_check_server_scope(struct nfs41_server_scope *s1,
+			struct nfs41_server_scope *s2)
+{
+	if (s1->server_scope_sz != s2->server_scope_sz)
+		goto out_scope_mismatch;
+	if (memcmp(s1->server_scope, s2->server_scope,
+		   s1->server_scope_sz) != 0)
+		goto out_scope_mismatch;
+
+	dprintk("NFS: --> %s server scopes match\n", __func__);
+	return true;
+
+out_scope_mismatch:
+	dprintk("NFS: --> %s server scopes do not match\n",
+		__func__);
+	return false;
+}
+
+/**
+ * nfs4_detect_session_trunking - Checks for session trunking.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
+ *
+ * @clp:    original mount nfs_client
+ * @res:    result structure from an exchange_id using the original mount
+ *          nfs_client with a new multi_addr transport
+ *
+ * Returns zero on success, otherwise -EINVAL
+ *
+ * Note: since the exchange_id for the new multi_addr transport uses the
+ * same nfs_client from the original mount, the cl_owner_id is reused,
+ * so eir_clientowner is the same.
+ */
+int nfs4_detect_session_trunking(struct nfs_client *clp,
+				 struct nfs41_exchange_id_res *res,
+				 struct rpc_xprt *xprt)
+{
+	/* Check eir_clientid */
+	if (!nfs4_match_clientids(clp->cl_clientid, res->clientid))
+		goto out_err;
+
+	/* Check eir_server_owner so_major_id */
+	if (!nfs4_check_serverowner_major_id(clp->cl_serverowner,
+					     res->server_owner))
+		goto out_err;
+
+	/* Check eir_server_owner so_minor_id */
+	if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner,
+					     res->server_owner))
+		goto out_err;
+
+	/* Check eir_server_scope */
+	if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope))
+		goto out_err;
+
+	/* Session trunking passed, add the xprt */
+	rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt);
+
+	pr_info("NFS:  %s: Session trunking succeeded for %s\n",
+		clp->cl_hostname,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	return 0;
+out_err:
+	pr_info("NFS:  %s: Session trunking failed for %s\n", clp->cl_hostname,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	return -EINVAL;
+}
+
 /**
  * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
  *
@@ -650,7 +745,7 @@
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
-		if (!nfs4_match_clientids(pos, new))
+		if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid))
 			continue;
 
 		/*
@@ -658,7 +753,8 @@
 		 * client id trunking. In either case, we want to fall back
 		 * to using the existing nfs_client.
 		 */
-		if (!nfs4_check_clientid_trunking(pos, new))
+		if (!nfs4_check_serverowner_major_id(pos->cl_serverowner,
+						     new->cl_serverowner))
 			continue;
 
 		/* Unlike NFSv4.0, we know that NFSv4.1 always uses the
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e32752..ad917bd7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -99,8 +99,8 @@
 #ifdef CONFIG_NFS_V4_1
 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
 		struct rpc_cred *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
-		struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+		struct rpc_cred *, bool);
 #endif
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -328,6 +328,33 @@
 	kunmap_atomic(start);
 }
 
+static void nfs4_test_and_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+
+	ops->test_and_free_expired(server, stateid, cred);
+}
+
+static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	stateid->type = NFS4_REVOKED_STATEID_TYPE;
+	nfs4_test_and_free_stateid(server, stateid, cred);
+}
+
+static void nfs4_free_revoked_stateid(struct nfs_server *server,
+		const nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	nfs4_stateid tmp;
+
+	nfs4_stateid_copy(&tmp, stateid);
+	__nfs4_free_revoked_stateid(server, &tmp, cred);
+}
+
 static long nfs4_update_delay(long *timeout)
 {
 	long ret;
@@ -370,13 +397,23 @@
 	exception->delay = 0;
 	exception->recovering = 0;
 	exception->retry = 0;
+
+	if (stateid == NULL && state != NULL)
+		stateid = &state->stateid;
+
 	switch(errorcode) {
 		case 0:
 			return 0;
-		case -NFS4ERR_OPENMODE:
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_BAD_STATEID:
+			if (inode != NULL && stateid != NULL) {
+				nfs_inode_find_state_and_recover(inode,
+						stateid);
+				goto wait_on_recovery;
+			}
+		case -NFS4ERR_OPENMODE:
 			if (inode) {
 				int err;
 
@@ -395,12 +432,6 @@
 			if (ret < 0)
 				break;
 			goto wait_on_recovery;
-		case -NFS4ERR_EXPIRED:
-			if (state != NULL) {
-				ret = nfs4_schedule_stateid_recovery(server, state);
-				if (ret < 0)
-					break;
-			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
@@ -616,6 +647,7 @@
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 
+	slot->privileged = args->sa_privileged ? 1 : 0;
 	args->sa_slot = slot;
 	res->sr_slot = slot;
 
@@ -723,12 +755,20 @@
 	/* Check the SEQUENCE operation status */
 	switch (res->sr_status) {
 	case 0:
+		/* If previous op on slot was interrupted and we reused
+		 * the seq# and got a reply from the cache, then retry
+		 */
+		if (task->tk_status == -EREMOTEIO && interrupted) {
+			++slot->seq_nr;
+			goto retry_nowait;
+		}
 		/* Update the slot's sequence and clientid lease timer */
 		slot->seq_done = 1;
 		clp = session->clp;
 		do_renew_lease(clp, res->sr_timestamp);
 		/* Check sequence flags */
-		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+				!!slot->privileged);
 		nfs41_update_target_slotid(slot->table, slot, res);
 		break;
 	case 1:
@@ -875,6 +915,7 @@
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 
+	slot->privileged = args->sa_privileged ? 1 : 0;
 	args->sa_slot = slot;
 
 	dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
@@ -1353,6 +1394,19 @@
 	nfs4_state_set_mode_locked(state, state->state | fmode);
 }
 
+#ifdef CONFIG_NFS_V4_1
+static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+{
+	if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags))
+		return true;
+	if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags))
+		return true;
+	if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags))
+		return true;
+	return false;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1369,11 +1423,12 @@
 }
 
 static bool nfs_need_update_open_stateid(struct nfs4_state *state,
-		nfs4_stateid *stateid)
+		const nfs4_stateid *stateid, nfs4_stateid *freeme)
 {
 	if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
 		return true;
 	if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+		nfs4_stateid_copy(freeme, &state->open_stateid);
 		nfs_test_and_clear_all_open_stateid(state);
 		return true;
 	}
@@ -1437,7 +1492,9 @@
 		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
 }
 
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+		const nfs4_stateid *stateid, fmode_t fmode,
+		nfs4_stateid *freeme)
 {
 	switch (fmode) {
 		case FMODE_READ:
@@ -1449,14 +1506,18 @@
 		case FMODE_READ|FMODE_WRITE:
 			set_bit(NFS_O_RDWR_STATE, &state->flags);
 	}
-	if (!nfs_need_update_open_stateid(state, stateid))
+	if (!nfs_need_update_open_stateid(state, stateid, freeme))
 		return;
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		nfs4_stateid_copy(&state->stateid, stateid);
 	nfs4_stateid_copy(&state->open_stateid, stateid);
 }
 
-static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
+static void __update_open_stateid(struct nfs4_state *state,
+		const nfs4_stateid *open_stateid,
+		const nfs4_stateid *deleg_stateid,
+		fmode_t fmode,
+		nfs4_stateid *freeme)
 {
 	/*
 	 * Protect the call to nfs4_state_set_mode_locked and
@@ -1469,16 +1530,22 @@
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
-		nfs_set_open_stateid_locked(state, open_stateid, fmode);
+		nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
 	write_sequnlock(&state->seqlock);
 	update_open_stateflags(state, fmode);
 	spin_unlock(&state->owner->so_lock);
 }
 
-static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+static int update_open_stateid(struct nfs4_state *state,
+		const nfs4_stateid *open_stateid,
+		const nfs4_stateid *delegation,
+		fmode_t fmode)
 {
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *deleg_cur;
+	nfs4_stateid freeme = {0};
 	int ret = 0;
 
 	fmode &= (FMODE_READ|FMODE_WRITE);
@@ -1500,7 +1567,8 @@
 		goto no_delegation_unlock;
 
 	nfs_mark_delegation_referenced(deleg_cur);
-	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+	__update_open_stateid(state, open_stateid, &deleg_cur->stateid,
+			fmode, &freeme);
 	ret = 1;
 no_delegation_unlock:
 	spin_unlock(&deleg_cur->lock);
@@ -1508,11 +1576,14 @@
 	rcu_read_unlock();
 
 	if (!ret && open_stateid != NULL) {
-		__update_open_stateid(state, open_stateid, NULL, fmode);
+		__update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
 		ret = 1;
 	}
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
-		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+		nfs4_schedule_state_manager(clp);
+	if (freeme.type != 0)
+		nfs4_test_and_free_stateid(server, &freeme,
+				state->owner->so_cred);
 
 	return ret;
 }
@@ -1889,7 +1960,6 @@
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_STALE_STATEID:
 			set_bit(NFS_DELEGATED_STATE, &state->flags);
-		case -NFS4ERR_EXPIRED:
 			/* Don't recall a delegation if it was lost */
 			nfs4_schedule_lease_recovery(server->nfs_client);
 			return -EAGAIN;
@@ -1901,6 +1971,7 @@
 			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_OPENMODE:
 			nfs_inode_find_state_and_recover(state->inode,
@@ -2382,9 +2453,10 @@
 	return ret;
 }
 
-static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
 {
-	nfs_remove_bad_delegation(state->inode);
+	nfs_remove_bad_delegation(state->inode, stateid);
 	write_seqlock(&state->seqlock);
 	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
 	write_sequnlock(&state->seqlock);
@@ -2394,7 +2466,7 @@
 static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
 {
 	if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
-		nfs_finish_clear_delegation_stateid(state);
+		nfs_finish_clear_delegation_stateid(state, NULL);
 }
 
 static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2404,7 +2476,45 @@
 	return nfs4_open_expired(sp, state);
 }
 
+static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	return -NFS4ERR_BAD_STATEID;
+}
+
 #if defined(CONFIG_NFS_V4_1)
+static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	int status;
+
+	switch (stateid->type) {
+	default:
+		break;
+	case NFS4_INVALID_STATEID_TYPE:
+	case NFS4_SPECIAL_STATEID_TYPE:
+		return -NFS4ERR_BAD_STATEID;
+	case NFS4_REVOKED_STATEID_TYPE:
+		goto out_free;
+	}
+
+	status = nfs41_test_stateid(server, stateid, cred);
+	switch (status) {
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+		break;
+	default:
+		return status;
+	}
+out_free:
+	/* Ack the revoked state to the server */
+	nfs41_free_stateid(server, stateid, cred, true);
+	return -NFS4ERR_EXPIRED;
+}
+
 static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
@@ -2422,23 +2532,68 @@
 	}
 
 	nfs4_stateid_copy(&stateid, &delegation->stateid);
-	cred = get_rpccred(delegation->cred);
-	rcu_read_unlock();
-	status = nfs41_test_stateid(server, &stateid, cred);
-	trace_nfs4_test_delegation_stateid(state, NULL, status);
-
-	if (status != NFS_OK) {
-		/* Free the stateid unless the server explicitly
-		 * informs us the stateid is unrecognized. */
-		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, &stateid, cred);
-		nfs_finish_clear_delegation_stateid(state);
+	if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+		rcu_read_unlock();
+		nfs_finish_clear_delegation_stateid(state, &stateid);
+		return;
 	}
 
+	if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	cred = get_rpccred(delegation->cred);
+	rcu_read_unlock();
+	status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
+	trace_nfs4_test_delegation_stateid(state, NULL, status);
+	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+		nfs_finish_clear_delegation_stateid(state, &stateid);
+
 	put_rpccred(cred);
 }
 
 /**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+	int status, ret = NFS_OK;
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server = NFS_SERVER(state->inode);
+
+	if (!test_bit(LK_STATE_IN_USE, &state->flags))
+		goto out;
+	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+			status = nfs41_test_and_free_expired_stateid(server,
+					&lsp->ls_stateid,
+					cred);
+			trace_nfs4_test_lock_stateid(state, lsp, status);
+			if (status == -NFS4ERR_EXPIRED ||
+			    status == -NFS4ERR_BAD_STATEID) {
+				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+				lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+				if (!recover_lost_locks)
+					set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+			} else if (status != NFS_OK) {
+				ret = status;	/* any other error stops the scan */
+				break;
+			}
+		}
+	}
+out:
+	return ret;
+}
+
+/**
  * nfs41_check_open_stateid - possibly free an open stateid
  *
  * @state: NFSv4 state for an inode
@@ -2453,26 +2608,28 @@
 	struct rpc_cred *cred = state->owner->so_cred;
 	int status;
 
-	/* If a state reset has been done, test_stateid is unneeded */
-	if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) &&
-	    (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) &&
-	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
+	if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
+		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)  {
+			if (nfs4_have_delegation(state->inode, state->state))
+				return NFS_OK;
+			return -NFS4ERR_OPENMODE;
+		}
 		return -NFS4ERR_BAD_STATEID;
-
-	status = nfs41_test_stateid(server, stateid, cred);
+	}
+	status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
 	trace_nfs4_test_open_stateid(state, NULL, status);
-	if (status != NFS_OK) {
-		/* Free the stateid unless the server explicitly
-		 * informs us the stateid is unrecognized. */
-		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid, cred);
-
+	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
 		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
 		clear_bit(NFS_O_RDWR_STATE, &state->flags);
 		clear_bit(NFS_OPEN_STATE, &state->flags);
+		stateid->type = NFS4_INVALID_STATEID_TYPE;
 	}
-	return status;
+	if (status != NFS_OK)
+		return status;
+	if (nfs_open_stateid_recover_openmode(state))
+		return -NFS4ERR_OPENMODE;
+	return NFS_OK;
 }
 
 static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2480,6 +2637,9 @@
 	int status;
 
 	nfs41_check_delegation_stateid(state);
+	status = nfs41_check_expired_locks(state);
+	if (status != NFS_OK)
+		return status;
 	status = nfs41_check_open_stateid(state);
 	if (status != NFS_OK)
 		status = nfs4_open_expired(sp, state);
@@ -2537,6 +2697,8 @@
 		goto out;
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+	if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+		set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
 
 	dentry = opendata->dentry;
 	if (d_really_is_negative(dentry)) {
@@ -2899,9 +3061,12 @@
 			break;
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_EXPIRED:
+			nfs4_free_revoked_stateid(server,
+					&calldata->arg.stateid,
+					task->tk_msg.rpc_cred);
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_EXPIRED:
 			if (!nfs4_stateid_match(&calldata->arg.stateid,
 						&state->open_stateid)) {
 				rpc_restart_call_prepare(task);
@@ -4312,7 +4477,7 @@
 	if (error == 0) {
 		/* block layout checks this! */
 		server->pnfs_blksize = fsinfo->blksize;
-		set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+		set_pnfs_layoutdriver(server, fhandle, fsinfo);
 	}
 
 	return error;
@@ -4399,24 +4564,25 @@
 	return false;
 }
 
-void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
-{
-	nfs_invalidate_atime(hdr->inode);
-}
-
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct nfs_server *server = NFS_SERVER(hdr->inode);
 
 	trace_nfs4_read(hdr, task->tk_status);
-	if (nfs4_async_handle_error(task, server,
-				    hdr->args.context->state,
-				    NULL) == -EAGAIN) {
-		rpc_restart_call_prepare(task);
-		return -EAGAIN;
+	if (task->tk_status < 0) {
+		struct nfs4_exception exception = {
+			.inode = hdr->inode,
+			.state = hdr->args.context->state,
+			.stateid = &hdr->args.stateid,
+		};
+		task->tk_status = nfs4_async_handle_exception(task,
+				server, task->tk_status, &exception);
+		if (exception.retry) {
+			rpc_restart_call_prepare(task);
+			return -EAGAIN;
+		}
 	}
 
-	__nfs4_read_done_cb(hdr);
 	if (task->tk_status > 0)
 		renew_lease(server, hdr->timestamp);
 	return 0;
@@ -4445,6 +4611,8 @@
 		return -EAGAIN;
 	if (nfs4_read_stateid_changed(task, &hdr->args))
 		return -EAGAIN;
+	if (task->tk_status > 0)
+		nfs_invalidate_atime(hdr->inode);
 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
 				    nfs4_read_done_cb(task, hdr);
 }
@@ -4482,11 +4650,19 @@
 	struct inode *inode = hdr->inode;
 
 	trace_nfs4_write(hdr, task->tk_status);
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-				    hdr->args.context->state,
-				    NULL) == -EAGAIN) {
-		rpc_restart_call_prepare(task);
-		return -EAGAIN;
+	if (task->tk_status < 0) {
+		struct nfs4_exception exception = {
+			.inode = hdr->inode,
+			.state = hdr->args.context->state,
+			.stateid = &hdr->args.stateid,
+		};
+		task->tk_status = nfs4_async_handle_exception(task,
+				NFS_SERVER(inode), task->tk_status,
+				&exception);
+		if (exception.retry) {
+			rpc_restart_call_prepare(task);
+			return -EAGAIN;
+		}
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), hdr->timestamp);
@@ -5123,12 +5299,14 @@
 	if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
 		/* An impossible timestamp guarantees this value
 		 * will never match a generated boot time. */
-		verf[0] = 0;
-		verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
+		verf[0] = cpu_to_be32(U32_MAX);
+		verf[1] = cpu_to_be32(U32_MAX);
 	} else {
 		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
-		verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
-		verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
+		u64 ns = ktime_to_ns(nn->boot_time);
+
+		verf[0] = cpu_to_be32(ns >> 32);
+		verf[1] = cpu_to_be32(ns);
 	}
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
@@ -5393,10 +5571,13 @@
 		renew_lease(data->res.server, data->timestamp);
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_EXPIRED:
+		nfs4_free_revoked_stateid(data->res.server,
+				data->args.stateid,
+				task->tk_msg.rpc_cred);
 	case -NFS4ERR_BAD_STATEID:
 	case -NFS4ERR_OLD_STATEID:
 	case -NFS4ERR_STALE_STATEID:
-	case -NFS4ERR_EXPIRED:
 		task->tk_status = 0;
 		if (data->roc)
 			pnfs_roc_set_barrier(data->inode, data->roc_barrier);
@@ -5528,22 +5709,6 @@
 	return err;
 }
 
-#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
-#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
-
-/* 
- * sleep, with exponential backoff, and retry the LOCK operation. 
- */
-static unsigned long
-nfs4_set_lock_task_retry(unsigned long timeout)
-{
-	freezable_schedule_timeout_killable_unsafe(timeout);
-	timeout <<= 1;
-	if (timeout > NFS4_LOCK_MAXTIMEOUT)
-		return NFS4_LOCK_MAXTIMEOUT;
-	return timeout;
-}
-
 static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
 	struct inode *inode = state->inode;
@@ -5600,11 +5765,6 @@
 	return err;
 }
 
-static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
-{
-	return locks_lock_inode_wait(inode, fl);
-}
-
 struct nfs4_unlockdata {
 	struct nfs_locku_args arg;
 	struct nfs_locku_res res;
@@ -5657,14 +5817,18 @@
 	switch (task->tk_status) {
 		case 0:
 			renew_lease(calldata->server, calldata->timestamp);
-			do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
+			locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
 			if (nfs4_update_lock_stateid(calldata->lsp,
 					&calldata->res.stateid))
 				break;
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_EXPIRED:
+			nfs4_free_revoked_stateid(calldata->server,
+					&calldata->arg.stateid,
+					task->tk_msg.rpc_cred);
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_STALE_STATEID:
-		case -NFS4ERR_EXPIRED:
 			if (!nfs4_stateid_match(&calldata->arg.stateid,
 						&calldata->lsp->ls_stateid))
 				rpc_restart_call_prepare(task);
@@ -5765,7 +5929,7 @@
 	mutex_lock(&sp->so_delegreturn_mutex);
 	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
 	down_read(&nfsi->rwsem);
-	if (do_vfs_lock(inode, request) == -ENOENT) {
+	if (locks_lock_inode_wait(inode, request) == -ENOENT) {
 		up_read(&nfsi->rwsem);
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		goto out;
@@ -5906,7 +6070,7 @@
 				data->timestamp);
 		if (data->arg.new_lock) {
 			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
-			if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
+			if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) {
 				rpc_restart_call_prepare(task);
 				break;
 			}
@@ -5965,6 +6129,7 @@
 {
 	switch (error) {
 	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
 		if (new_lock_owner != 0 ||
@@ -5973,7 +6138,6 @@
 		break;
 	case -NFS4ERR_STALE_STATEID:
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
-	case -NFS4ERR_EXPIRED:
 		nfs4_schedule_lease_recovery(server->nfs_client);
 	};
 }
@@ -6083,52 +6247,19 @@
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/**
- * nfs41_check_expired_locks - possibly free a lock stateid
- *
- * @state: NFSv4 state for an inode
- *
- * Returns NFS_OK if recovery for this stateid is now finished.
- * Otherwise a negative NFS4ERR value is returned.
- */
-static int nfs41_check_expired_locks(struct nfs4_state *state)
-{
-	int status, ret = -NFS4ERR_BAD_STATEID;
-	struct nfs4_lock_state *lsp;
-	struct nfs_server *server = NFS_SERVER(state->inode);
-
-	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
-		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
-
-			status = nfs41_test_stateid(server,
-					&lsp->ls_stateid,
-					cred);
-			trace_nfs4_test_lock_stateid(state, lsp, status);
-			if (status != NFS_OK) {
-				/* Free the stateid unless the server
-				 * informs us the stateid is unrecognized. */
-				if (status != -NFS4ERR_BAD_STATEID)
-					nfs41_free_stateid(server,
-							&lsp->ls_stateid,
-							cred);
-				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
-				ret = status;
-			}
-		}
-	};
-
-	return ret;
-}
-
 static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
 {
-	int status = NFS_OK;
+	struct nfs4_lock_state *lsp;
+	int status;
 
-	if (test_bit(LK_STATE_IN_USE, &state->flags))
-		status = nfs41_check_expired_locks(state);
-	if (status != NFS_OK)
-		status = nfs4_lock_expired(state, request);
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		return status;
+	lsp = request->fl_u.nfs4_fl.owner;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+	    test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+		return 0;
+	status = nfs4_lock_expired(state, request);
 	return status;
 }
 #endif
@@ -6138,17 +6269,10 @@
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs4_state_owner *sp = state->owner;
 	unsigned char fl_flags = request->fl_flags;
-	int status = -ENOLCK;
+	int status;
 
-	if ((fl_flags & FL_POSIX) &&
-			!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
-		goto out;
-	/* Is this a delegated open? */
-	status = nfs4_set_lock_state(state, request);
-	if (status != 0)
-		goto out;
 	request->fl_flags |= FL_ACCESS;
-	status = do_vfs_lock(state->inode, request);
+	status = locks_lock_inode_wait(state->inode, request);
 	if (status < 0)
 		goto out;
 	mutex_lock(&sp->so_delegreturn_mutex);
@@ -6157,7 +6281,7 @@
 		/* Yes: cache locks! */
 		/* ...but avoid races with delegation recall... */
 		request->fl_flags = fl_flags & ~FL_SLEEP;
-		status = do_vfs_lock(state->inode, request);
+		status = locks_lock_inode_wait(state->inode, request);
 		up_read(&nfsi->rwsem);
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		goto out;
@@ -6188,12 +6312,124 @@
 	return err;
 }
 
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+static int
+nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+			struct file_lock *request)
+{
+	int		status = -ERESTARTSYS;
+	unsigned long	timeout = NFS4_LOCK_MINTIMEOUT;
+
+	while(!signalled()) {
+		status = nfs4_proc_setlk(state, cmd, request);
+		if ((status != -EAGAIN) || IS_SETLK(cmd))
+			break;
+		freezable_schedule_timeout_interruptible(timeout);
+		timeout *= 2;
+		timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+		status = -ERESTARTSYS;
+	}
+	return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+struct nfs4_lock_waiter {
+	struct task_struct	*task;
+	struct inode		*inode;
+	struct nfs_lowner	*owner;
+	bool			notified;
+};
+
+static int
+nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+{
+	int ret;
+	struct cb_notify_lock_args *cbnl = key;
+	struct nfs4_lock_waiter	*waiter	= wait->private;
+	struct nfs_lowner	*lowner = &cbnl->cbnl_owner,
+				*wowner = waiter->owner;
+
+	/* Return 0 (no wakeup) unless the callback names this lock owner */
+	if (lowner->clientid != wowner->clientid ||
+	    lowner->id != wowner->id		 ||
+	    lowner->s_dev != wowner->s_dev)
+		return 0;
+
+	/* Likewise skip callbacks whose filehandle is not the waiter's inode */
+	if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+		return 0;
+
+	waiter->notified = true;
+
+	/* point "private" at the task for autoremove_wake_function, then restore */
+	wait->private = waiter->task;
+	ret = autoremove_wake_function(wait, mode, flags, key);
+	wait->private = waiter;
+	return ret;
+}
+
+static int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	int status = -ERESTARTSYS;
+	unsigned long flags;
+	struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs_client *clp = server->nfs_client;
+	wait_queue_head_t *q = &clp->cl_lock_waitq;
+	struct nfs_lowner owner = { .clientid = clp->cl_clientid,
+				    .id = lsp->ls_seqid.owner_id,
+				    .s_dev = server->s_dev };
+	struct nfs4_lock_waiter waiter = { .task  = current,
+					   .inode = state->inode,
+					   .owner = &owner,
+					   .notified = false };
+	wait_queue_t wait;
+
+	/* Don't bother with waitqueue if we don't expect a callback */
+	if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+		return nfs4_retry_setlk_simple(state, cmd, request);
+
+	init_wait(&wait);
+	wait.private = &waiter;
+	wait.func = nfs4_wake_lock_waiter;
+	add_wait_queue(q, &wait);
+
+	while(!signalled()) {
+		waiter.notified = false;	/* re-arm, else we spin */
+		status = nfs4_proc_setlk(state, cmd, request);
+		if ((status != -EAGAIN) || IS_SETLK(cmd))
+			break;
+		status = -ERESTARTSYS;
+		spin_lock_irqsave(&q->lock, flags);
+		if (waiter.notified) {
+			spin_unlock_irqrestore(&q->lock, flags);
+			continue;
+		}
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irqrestore(&q->lock, flags);
+		/* Task state was set under q->lock; do not set it again */
+		freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT);
+	}
+
+	finish_wait(q, &wait);
+	return status;
+}
+#else /* !CONFIG_NFS_V4_1 */
+static inline int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	return nfs4_retry_setlk_simple(state, cmd, request);
+}
+#endif
+
 static int
 nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 {
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
-	unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
 	int status;
 
 	/* verify open state */
@@ -6220,6 +6456,11 @@
 
 	if (state == NULL)
 		return -ENOLCK;
+
+	if ((request->fl_flags & FL_POSIX) &&
+	    !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+		return -ENOLCK;
+
 	/*
 	 * Don't rely on the VFS having checked the file open mode,
 	 * since it won't do this for flock() locks.
@@ -6234,16 +6475,11 @@
 			return -EBADF;
 	}
 
-	do {
-		status = nfs4_proc_setlk(state, cmd, request);
-		if ((status != -EAGAIN) || IS_SETLK(cmd))
-			break;
-		timeout = nfs4_set_lock_task_retry(timeout);
-		status = -ERESTARTSYS;
-		if (signalled())
-			break;
-	} while(status < 0);
-	return status;
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		return status;
+
+	return nfs4_retry_setlk(state, cmd, request);
 }
 
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
@@ -7104,114 +7340,56 @@
 	return 0;
 }
 
-/*
- * _nfs4_proc_exchange_id()
- *
- * Wrapper for EXCHANGE_ID operation.
- */
-static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
-	u32 sp4_how)
+struct nfs41_exchange_id_data {
+	struct nfs41_exchange_id_res res;
+	struct nfs41_exchange_id_args args;
+	struct rpc_xprt *xprt;
+	int rpc_status;
+};
+
+static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
 {
-	nfs4_verifier verifier;
-	struct nfs41_exchange_id_args args = {
-		.verifier = &verifier,
-		.client = clp,
-#ifdef CONFIG_NFS_V4_1_MIGRATION
-		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			 EXCHGID4_FLAG_BIND_PRINC_STATEID |
-			 EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
-		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			 EXCHGID4_FLAG_BIND_PRINC_STATEID,
-#endif
-	};
-	struct nfs41_exchange_id_res res = {
-		0
-	};
-	int status;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-		.rpc_cred = cred,
-	};
+	struct nfs41_exchange_id_data *cdata =
+					(struct nfs41_exchange_id_data *)data;
+	struct nfs_client *clp = cdata->args.client;
+	int status = task->tk_status;
 
-	nfs4_init_boot_verifier(clp, &verifier);
-
-	status = nfs4_init_uniform_client_string(clp);
-	if (status)
-		goto out;
-
-	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
-		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		clp->cl_owner_id);
-
-	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
-					GFP_NOFS);
-	if (unlikely(res.server_owner == NULL)) {
-		status = -ENOMEM;
-		goto out;
-	}
-
-	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
-					GFP_NOFS);
-	if (unlikely(res.server_scope == NULL)) {
-		status = -ENOMEM;
-		goto out_server_owner;
-	}
-
-	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
-	if (unlikely(res.impl_id == NULL)) {
-		status = -ENOMEM;
-		goto out_server_scope;
-	}
-
-	switch (sp4_how) {
-	case SP4_NONE:
-		args.state_protect.how = SP4_NONE;
-		break;
-
-	case SP4_MACH_CRED:
-		args.state_protect = nfs4_sp4_mach_cred_request;
-		break;
-
-	default:
-		/* unsupported! */
-		WARN_ON_ONCE(1);
-		status = -EINVAL;
-		goto out_impl_id;
-	}
-
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	trace_nfs4_exchange_id(clp, status);
-	if (status == 0)
-		status = nfs4_check_cl_exchange_flags(res.flags);
 
 	if (status == 0)
-		status = nfs4_sp4_select_mode(clp, &res.state_protect);
+		status = nfs4_check_cl_exchange_flags(cdata->res.flags);
+
+	if (cdata->xprt && status == 0) {
+		status = nfs4_detect_session_trunking(clp, &cdata->res,
+						      cdata->xprt);
+		goto out;
+	}
+
+	if (status  == 0)
+		status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
 
 	if (status == 0) {
-		clp->cl_clientid = res.clientid;
-		clp->cl_exchange_flags = res.flags;
+		clp->cl_clientid = cdata->res.clientid;
+		clp->cl_exchange_flags = cdata->res.flags;
 		/* Client ID is not confirmed */
-		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+		if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
 			clear_bit(NFS4_SESSION_ESTABLISHED,
-					&clp->cl_session->session_state);
-			clp->cl_seqid = res.seqid;
+			&clp->cl_session->session_state);
+			clp->cl_seqid = cdata->res.seqid;
 		}
 
 		kfree(clp->cl_serverowner);
-		clp->cl_serverowner = res.server_owner;
-		res.server_owner = NULL;
+		clp->cl_serverowner = cdata->res.server_owner;
+		cdata->res.server_owner = NULL;
 
 		/* use the most recent implementation id */
 		kfree(clp->cl_implid);
-		clp->cl_implid = res.impl_id;
-		res.impl_id = NULL;
+		clp->cl_implid = cdata->res.impl_id;
+		cdata->res.impl_id = NULL;
 
 		if (clp->cl_serverscope != NULL &&
 		    !nfs41_same_server_scope(clp->cl_serverscope,
-					     res.server_scope)) {
+					cdata->res.server_scope)) {
 			dprintk("%s: server_scope mismatch detected\n",
 				__func__);
 			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
@@ -7220,17 +7398,147 @@
 		}
 
 		if (clp->cl_serverscope == NULL) {
-			clp->cl_serverscope = res.server_scope;
-			res.server_scope = NULL;
+			clp->cl_serverscope = cdata->res.server_scope;
+			cdata->res.server_scope = NULL;
 		}
+		/* Save the EXCHANGE_ID verifier for session trunk tests */
+		memcpy(clp->cl_confirm.data, cdata->args.verifier->data,
+		       sizeof(clp->cl_confirm.data));
+	}
+out:
+	cdata->rpc_status = status;
+	return;
+}
+
+static void nfs4_exchange_id_release(void *data)
+{
+	struct nfs41_exchange_id_data *cdata = data;
+	struct nfs_client *clp = cdata->args.client;
+
+	if (cdata->xprt) {
+		xprt_put(cdata->xprt);
+		rpc_clnt_xprt_switch_put(clp->cl_rpcclient);
+	}
+	kfree(cdata->res.impl_id);
+	kfree(cdata->res.server_scope);
+	kfree(cdata->res.server_owner);
+	kfree(cdata);
+	nfs_put_client(clp);	/* last: clp is dereferenced above */
+}
+
+static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+	.rpc_call_done = nfs4_exchange_id_done,
+	.rpc_release = nfs4_exchange_id_release,
+};
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+			u32 sp4_how, struct rpc_xprt *xprt)
+{
+	nfs4_verifier verifier;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
+		.rpc_cred = cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.callback_ops = &nfs4_exchange_id_call_ops,
+		.rpc_message = &msg,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+	};
+	struct nfs41_exchange_id_data *calldata;
+	struct rpc_task *task;
+	int status = -EIO;
+
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		goto out;
+
+	status = -ENOMEM;
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (!calldata)
+		goto out;
+
+	if (!xprt)
+		nfs4_init_boot_verifier(clp, &verifier);
+
+	status = nfs4_init_uniform_client_string(clp);
+	if (status)
+		goto out_calldata;
+
+	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
+		clp->cl_rpcclient->cl_auth->au_ops->au_name,
+		clp->cl_owner_id);
+
+	calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+						GFP_NOFS);
+	status = -ENOMEM;
+	if (unlikely(calldata->res.server_owner == NULL))
+		goto out_calldata;
+
+	calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+					GFP_NOFS);
+	if (unlikely(calldata->res.server_scope == NULL))
+		goto out_server_owner;
+
+	calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+	if (unlikely(calldata->res.impl_id == NULL))
+		goto out_server_scope;
+
+	switch (sp4_how) {
+	case SP4_NONE:
+		calldata->args.state_protect.how = SP4_NONE;
+		break;
+
+	case SP4_MACH_CRED:
+		calldata->args.state_protect = nfs4_sp4_mach_cred_request;
+		break;
+
+	default:
+		/* unsupported! */
+		WARN_ON_ONCE(1);
+		status = -EINVAL;
+		goto out_impl_id;
+	}
+	if (xprt) {
+		calldata->xprt = xprt;
+		task_setup_data.rpc_xprt = xprt;
+		task_setup_data.flags =
+				RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC;
+		calldata->args.verifier = &clp->cl_confirm;
+	} else {
+		calldata->args.verifier = &verifier;
+	}
+	calldata->args.client = clp;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			       EXCHGID4_FLAG_BIND_PRINC_STATEID |
+			       EXCHGID4_FLAG_SUPP_MOVED_MIGR;
+#else
+	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			       EXCHGID4_FLAG_BIND_PRINC_STATEID;
+#endif
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	task_setup_data.callback_data = calldata;
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);	/* ->rpc_release already ran */
+		goto out;
 	}
 
-out_impl_id:
-	kfree(res.impl_id);
-out_server_scope:
-	kfree(res.server_scope);
-out_server_owner:
-	kfree(res.server_owner);
+	if (!xprt) {
+		status = rpc_wait_for_completion_task(task);
+		if (!status)
+			status = calldata->rpc_status;
+	} else	/* session trunking test */
+		status = calldata->rpc_status;
+
+	rpc_put_task(task);
 out:
 	if (clp->cl_implid != NULL)
 		dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -7240,6 +7548,16 @@
 			clp->cl_implid->date.nseconds);
 	dprintk("NFS reply exchange_id: %d\n", status);
 	return status;
+
+out_impl_id:
+	kfree(calldata->res.impl_id);
+out_server_scope:
+	kfree(calldata->res.server_scope);
+out_server_owner:
+	kfree(calldata->res.server_owner);
+out_calldata:
+	kfree(calldata);
+	goto out;
 }
 
 /*
@@ -7262,15 +7580,46 @@
 	/* try SP4_MACH_CRED if krb5i/p	*/
 	if (authflavor == RPC_AUTH_GSS_KRB5I ||
 	    authflavor == RPC_AUTH_GSS_KRB5P) {
-		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
 		if (!status)
 			return 0;
 	}
 
 	/* try SP4_NONE */
-	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
 }
 
+/**
+ * nfs4_test_session_trunk
+ *
+ * This is an add_xprt_test() test function called from
+ * rpc_clnt_setup_test_and_add_xprt.
+ *
+ * The rpc_xprt_switch is referenced by rpc_clnt_setup_test_and_add_xprt
+ * and is dereferenced in nfs4_exchange_id_release
+ *
+ * Upon success, add the new transport to the rpc_clnt
+ *
+ * @clnt: struct rpc_clnt to get new transport
+ * @xprt: the rpc_xprt to test
+ * @data: call data for _nfs4_proc_exchange_id.
+ */
+int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+			    void *data)
+{
+	struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+	u32 sp4_how;
+
+	dprintk("--> %s try %s\n", __func__,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+
+	/* Test connection for session trunking. Async exchange_id call */
+	return  _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+}
+EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
+
 static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
 		struct rpc_cred *cred)
 {
@@ -7463,7 +7812,7 @@
 	args->bc_attrs.max_resp_sz = max_bc_payload;
 	args->bc_attrs.max_resp_sz_cached = 0;
 	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
-	args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
+	args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1);
 
 	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@ -7510,10 +7859,9 @@
 		return -EINVAL;
 	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
 		return -EINVAL;
-	/* These would render the backchannel useless: */
-	if (rcvd->max_ops != sent->max_ops)
+	if (rcvd->max_ops > sent->max_ops)
 		return -EINVAL;
-	if (rcvd->max_reqs != sent->max_reqs)
+	if (rcvd->max_reqs > sent->max_reqs)
 		return -EINVAL;
 out:
 	return 0;
@@ -7982,6 +8330,8 @@
 	case -NFS4ERR_RECALLCONFLICT:
 		status = -ERECALLCONFLICT;
 		break;
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
 		exception->timeout = 0;
@@ -7993,6 +8343,7 @@
 					&lgp->args.ctx->state->stateid)) {
 			spin_unlock(&inode->i_lock);
 			exception->state = lgp->args.ctx->state;
+			exception->stateid = &lgp->args.stateid;
 			break;
 		}
 
@@ -8591,6 +8942,24 @@
 	return -res.status;
 }
 
+static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+		int err, struct nfs4_exception *exception)
+{
+	exception->retry = 0;
+	switch(err) {
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		nfs4_handle_exception(server, err, exception);
+		break;
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_DEADSESSION:
+		nfs4_do_handle_exception(server, err, exception);
+	}
+}
+
 /**
  * nfs41_test_stateid - perform a TEST_STATEID operation
  *
@@ -8610,9 +8979,7 @@
 	int err;
 	do {
 		err = _nfs41_test_stateid(server, stateid, cred);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		nfs4_handle_delay_or_session_error(server, err, &exception);
 	} while (exception.retry);
 	return err;
 }
@@ -8657,7 +9024,7 @@
 };
 
 static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
-		nfs4_stateid *stateid,
+		const nfs4_stateid *stateid,
 		struct rpc_cred *cred,
 		bool privileged)
 {
@@ -8687,7 +9054,7 @@
 
 	msg.rpc_argp = &data->args;
 	msg.rpc_resp = &data->res;
-	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	if (privileged)
 		nfs4_set_sequence_privileged(&data->args.seq_args);
 
@@ -8700,38 +9067,31 @@
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to release
  * @cred: credential
+ * @is_recovery: set to true if this call needs to be privileged
  *
- * Returns NFS_OK if the server freed "stateid".  Otherwise a
- * negative NFS4ERR value is returned.
+ * Note: this function is always asynchronous.
  */
 static int nfs41_free_stateid(struct nfs_server *server,
-		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const nfs4_stateid *stateid,
+		struct rpc_cred *cred,
+		bool is_recovery)
 {
 	struct rpc_task *task;
-	int ret;
 
-	task = _nfs41_free_stateid(server, stateid, cred, true);
+	task = _nfs41_free_stateid(server, stateid, cred, is_recovery);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
-	ret = rpc_wait_for_completion_task(task);
-	if (!ret)
-		ret = task->tk_status;
 	rpc_put_task(task);
-	return ret;
+	return 0;
 }
 
 static void
 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
-	struct rpc_task *task;
 	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
 
-	task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+	nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
 	nfs4_free_lock_state(server, lsp);
-	if (IS_ERR(task))
-		return;
-	rpc_put_task(task);
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8835,6 +9195,7 @@
 	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
 	.free_lock_state = nfs4_release_lockowner,
+	.test_and_free_expired = nfs40_test_and_free_expired_stateid,
 	.alloc_seqid = nfs_alloc_seqid,
 	.call_sync_ops = &nfs40_call_sync_ops,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
@@ -8862,7 +9223,9 @@
 	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
+	.test_and_free_expired = nfs41_test_and_free_expired_stateid,
 	.alloc_seqid = nfs_alloc_no_seqid,
+	.session_trunk = nfs4_test_session_trunk,
 	.call_sync_ops = &nfs41_call_sync_ops,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8891,7 +9254,9 @@
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
 	.call_sync_ops = &nfs41_call_sync_ops,
+	.test_and_free_expired = nfs41_test_and_free_expired_stateid,
 	.alloc_seqid = nfs_alloc_no_seqid,
+	.session_trunk = nfs4_test_session_trunk,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index f703b75..dae3855 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -9,6 +9,7 @@
 
 /* maximum number of slots to use */
 #define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
 #define NFS4_MAX_SLOT_TABLE (1024U)
 #define NFS4_NO_SLOT ((u32)-1)
 
@@ -22,6 +23,7 @@
 	u32			slot_nr;
 	u32		 	seq_nr;
 	unsigned int		interrupted : 1,
+				privileged : 1,
 				seq_done : 1;
 };
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cada00a..5f4281e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -991,6 +991,8 @@
 {
 	int ret;
 
+	if (!nfs4_valid_open_stateid(state))
+		return -EIO;
 	if (cred != NULL)
 		*cred = NULL;
 	ret = nfs4_copy_lock_stateid(dst, state, lockowner);
@@ -1303,6 +1305,8 @@
 static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
 
+	if (!nfs4_valid_open_stateid(state))
+		return 0;
 	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
 	/* Don't recover state that expired before the reboot */
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
@@ -1316,6 +1320,8 @@
 
 int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
 {
+	if (!nfs4_valid_open_stateid(state))
+		return 0;
 	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
 	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
 	set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
@@ -1327,9 +1333,8 @@
 {
 	struct nfs_client *clp = server->nfs_client;
 
-	if (!nfs4_valid_open_stateid(state))
+	if (!nfs4_state_mark_reclaim_nograce(clp, state))
 		return -EBADF;
-	nfs4_state_mark_reclaim_nograce(clp, state);
 	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
 			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
@@ -1337,6 +1342,35 @@
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
 
+static struct nfs4_lock_state *
+nfs_state_find_lock_state_by_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_lock_state *pos;
+
+	list_for_each_entry(pos, &state->lock_states, ls_locks) {
+		if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags))
+			continue;
+		if (nfs4_stateid_match_other(&pos->ls_stateid, stateid))
+			return pos;
+	}
+	return NULL;
+}
+
+static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	bool found = false;
+
+	if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+		spin_lock(&state->state_lock);
+		if (nfs_state_find_lock_state_by_stateid(state, stateid))
+			found = true;
+		spin_unlock(&state->state_lock);
+	}
+	return found;
+}
+
 void nfs_inode_find_state_and_recover(struct inode *inode,
 		const nfs4_stateid *stateid)
 {
@@ -1351,14 +1385,18 @@
 		state = ctx->state;
 		if (state == NULL)
 			continue;
-		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+		if (nfs4_stateid_match_other(&state->stateid, stateid) &&
+		    nfs4_state_mark_reclaim_nograce(clp, state)) {
+			found = true;
 			continue;
-		if (!nfs4_stateid_match(&state->stateid, stateid))
-			continue;
-		nfs4_state_mark_reclaim_nograce(clp, state);
-		found = true;
+		}
+		if (nfs_state_lock_state_matches_stateid(state, stateid) &&
+		    nfs4_state_mark_reclaim_nograce(clp, state))
+			found = true;
 	}
 	spin_unlock(&inode->i_lock);
+
+	nfs_inode_find_delegation_state_and_recover(inode, stateid);
 	if (found)
 		nfs4_schedule_state_manager(clp);
 }
@@ -1498,6 +1536,9 @@
 					__func__, status);
 			case -ENOENT:
 			case -ENOMEM:
+			case -EACCES:
+			case -EROFS:
+			case -EIO:
 			case -ESTALE:
 				/* Open state on this file cannot be recovered */
 				nfs4_state_mark_recovery_failed(state, status);
@@ -1656,15 +1697,9 @@
 	put_rpccred(cred);
 }
 
-static void nfs_delegation_clear_all(struct nfs_client *clp)
-{
-	nfs_delegation_mark_reclaim(clp);
-	nfs_delegation_reap_unclaimed(clp);
-}
-
 static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 {
-	nfs_delegation_clear_all(clp);
+	nfs_mark_test_expired_all_delegations(clp);
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
@@ -2195,7 +2230,7 @@
 
 static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
 {
-	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+	nfs4_state_start_reclaim_nograce(clp);
 	nfs4_schedule_state_manager(clp);
 
 	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
@@ -2227,13 +2262,22 @@
 		nfs4_schedule_state_manager(clp);
 }
 
-void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags,
+		bool recovery)
 {
 	if (!flags)
 		return;
 
 	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
 		__func__, clp->cl_hostname, clp->cl_clientid, flags);
+	/*
+	 * If we're called from the state manager thread, then assume we're
+	 * already handling the RECLAIM_NEEDED and/or STATE_REVOKED.
+	 * Those flags are expected to remain set until we're done
+	 * recovering (see RFC5661, section 18.46.3).
+	 */
+	if (recovery)
+		goto out_recovery;
 
 	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
 		nfs41_handle_server_reboot(clp);
@@ -2246,6 +2290,7 @@
 		nfs4_schedule_lease_moved_recovery(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
+out_recovery:
 	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
 		nfs41_handle_backchannel_fault(clp);
 	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
@@ -2410,6 +2455,13 @@
 			nfs4_state_end_reclaim_reboot(clp);
 		}
 
+		/* Detect expired delegations... */
+		if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
+			section = "detect expired delegations";
+			nfs_reap_expired_delegations(clp);
+			continue;
+		}
+
 		/* Now recover expired state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
 			section = "reclaim nograce";
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7bd3a5c..fc89e5e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1850,7 +1850,7 @@
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	*p++ = cpu_to_be32(nn->boot_time.tv_nsec);	/* stamp */
+	*p++ = cpu_to_be32(ktime_to_ns(nn->boot_time));	/* stamp */
 	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
@@ -4725,34 +4725,37 @@
 }
 
 /*
- * Decode potentially multiple layout types. Currently we only support
- * one layout driver per file system.
+ * Decode potentially multiple layout types.
  */
-static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
-					 uint32_t *layouttype)
+static int decode_pnfs_layout_types(struct xdr_stream *xdr,
+				    struct nfs_fsinfo *fsinfo)
 {
 	__be32 *p;
-	int num;
+	uint32_t i;
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	num = be32_to_cpup(p);
+	fsinfo->nlayouttypes = be32_to_cpup(p);
 
 	/* pNFS is not supported by the underlying file system */
-	if (num == 0) {
-		*layouttype = 0;
+	if (fsinfo->nlayouttypes == 0)
 		return 0;
-	}
-	if (num > 1)
-		printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
-			"drivers per filesystem not supported\n", __func__);
 
 	/* Decode and set first layout type, move xdr->p past unused types */
-	p = xdr_inline_decode(xdr, num * 4);
+	p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*layouttype = be32_to_cpup(p);
+
+	/* If we get too many, then just cap it at the max */
+	if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
+		printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n",
+			__func__, fsinfo->nlayouttypes);
+		fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES;
+	}
+
+	for(i = 0; i < fsinfo->nlayouttypes; ++i)
+		fsinfo->layouttype[i] = be32_to_cpup(p++);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -4764,7 +4767,7 @@
  * Note we must ensure that layouttype is set in any non-error case.
  */
 static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
-				uint32_t *layouttype)
+				struct nfs_fsinfo *fsinfo)
 {
 	int status = 0;
 
@@ -4772,10 +4775,9 @@
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
 		return -EIO;
 	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-		status = decode_first_pnfs_layout_type(xdr, layouttype);
+		status = decode_pnfs_layout_types(xdr, fsinfo);
 		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
-	} else
-		*layouttype = 0;
+	}
 	return status;
 }
 
@@ -4856,7 +4858,7 @@
 	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
 	if (status != 0)
 		goto xdr_error;
-	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+	status = decode_attr_pnfstype(xdr, bitmap, fsinfo);
 	if (status != 0)
 		goto xdr_error;
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2c93a85..56b2d96 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/sort.h>
 #include "internal.h"
 #include "pnfs.h"
 #include "iostat.h"
@@ -99,35 +100,79 @@
 }
 
 /*
+ * When the server sends a list of layout types, we choose one in the order
+ * given in the list below.
+ *
+ * FIXME: should this list be configurable in some fashion? module param?
+ * 	  mount option? something else?
+ */
+static const u32 ld_prefs[] = {
+	LAYOUT_SCSI,
+	LAYOUT_BLOCK_VOLUME,
+	LAYOUT_OSD2_OBJECTS,
+	LAYOUT_FLEX_FILES,
+	LAYOUT_NFSV4_1_FILES,
+	0
+};
+
+static int
+ld_cmp(const void *e1, const void *e2)
+{
+	u32 ld1 = *((const u32 *)e1);
+	u32 ld2 = *((const u32 *)e2);
+	int i;
+
+	/* Earlier ld_prefs entry sorts first; identical types compare equal */
+	for (i = 0; ld_prefs[i] != 0; i++) {
+		if (ld1 == ld_prefs[i])
+			return ld1 == ld2 ? 0 : -1;
+		if (ld2 == ld_prefs[i])
+			return 1;
+	}
+	return 0;
+}
+
+/*
  * Try to set the server's pnfs module to the pnfs layout type specified by id.
  * Currently only one pNFS layout driver per filesystem is supported.
  *
- * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ * @ids array of layout types supported by MDS.
  */
 void
 set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
-		      u32 id)
+		      struct nfs_fsinfo *fsinfo)
 {
 	struct pnfs_layoutdriver_type *ld_type = NULL;
+	u32 id;
+	int i;
 
-	if (id == 0)
-		goto out_no_driver;
 	if (!(server->nfs_client->cl_exchange_flags &
 		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
-		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
-			__func__, id, server->nfs_client->cl_exchange_flags);
+		printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
+			__func__, server->nfs_client->cl_exchange_flags);
 		goto out_no_driver;
 	}
-	ld_type = find_pnfs_driver(id);
-	if (!ld_type) {
-		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+
+	sort(fsinfo->layouttype, fsinfo->nlayouttypes,
+		sizeof(*fsinfo->layouttype), ld_cmp, NULL);
+
+	for (i = 0; i < fsinfo->nlayouttypes; i++) {
+		id = fsinfo->layouttype[i];
 		ld_type = find_pnfs_driver(id);
 		if (!ld_type) {
-			dprintk("%s: No pNFS module found for %u.\n",
-				__func__, id);
-			goto out_no_driver;
+			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
+					id);
+			ld_type = find_pnfs_driver(id);
 		}
+		if (ld_type)
+			break;
 	}
+
+	if (!ld_type) {
+		dprintk("%s: No pNFS module found!\n", __func__);
+		goto out_no_driver;
+	}
+
 	server->pnfs_curr_ld = ld_type;
 	if (ld_type->set_layoutdriver
 	    && ld_type->set_layoutdriver(server, mntfh)) {
@@ -2185,10 +2230,8 @@
  */
 void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-	if (likely(!hdr->pnfs_error)) {
-		__nfs4_read_done_cb(hdr);
+	if (likely(!hdr->pnfs_error))
 		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
-	}
 	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
 	if (unlikely(hdr->pnfs_error))
 		pnfs_ld_handle_read_error(hdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 31d99b2..5c29551 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -236,7 +236,7 @@
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
 void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
 
-void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -657,7 +657,8 @@
 }
 
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
-					 const struct nfs_fh *mntfh, u32 id)
+					 const struct nfs_fh *mntfh,
+					 struct nfs_fsinfo *fsinfo)
 {
 }
 
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f3468b5..53b4705 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -690,13 +690,50 @@
 		dprintk("%s: DS %s: trying address %s\n",
 			__func__, ds->ds_remotestr, da->da_remotestr);
 
-		clp = nfs4_set_ds_client(mds_srv,
-					(struct sockaddr *)&da->da_addr,
-					da->da_addrlen, IPPROTO_TCP,
-					timeo, retrans, minor_version,
-					au_flavor);
-		if (!IS_ERR(clp))
-			break;
+		if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) {
+			struct xprt_create xprt_args = {
+				.ident = XPRT_TRANSPORT_TCP,
+				.net = clp->cl_net,
+				.dstaddr = (struct sockaddr *)&da->da_addr,
+				.addrlen = da->da_addrlen,
+				.servername = clp->cl_hostname,
+			};
+			struct nfs4_add_xprt_data xprtdata = {
+				.clp = clp,
+				.cred = nfs4_get_clid_cred(clp),
+			};
+			struct rpc_add_xprt_test rpcdata = {
+				.add_xprt_test = clp->cl_mvops->session_trunk,
+				.data = &xprtdata,
+			};
+
+			/*
+			 * Test this address for session trunking and
+			 * add as an alias
+			 */
+			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+					  rpc_clnt_setup_test_and_add_xprt,
+					  &rpcdata);
+			if (xprtdata.cred)
+				put_rpccred(xprtdata.cred);
+		} else {
+			clp = nfs4_set_ds_client(mds_srv,
+						(struct sockaddr *)&da->da_addr,
+						da->da_addrlen, IPPROTO_TCP,
+						timeo, retrans, minor_version,
+						au_flavor);
+			if (IS_ERR(clp))
+				continue;
+
+			status = nfs4_init_ds_session(clp,
+					mds_srv->nfs_client->cl_lease_time);
+			if (status) {
+				nfs_put_client(clp);
+				clp = ERR_PTR(-EIO);
+				continue;
+			}
+
+		}
 	}
 
 	if (IS_ERR(clp)) {
@@ -704,18 +741,11 @@
 		goto out;
 	}
 
-	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
-	if (status)
-		goto out_put;
-
 	smp_wmb();
 	ds->ds_clp = clp;
 	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
 out:
 	return status;
-out_put:
-	nfs_put_client(clp);
-	goto out;
 }
 
 /*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d396013..001796b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2848,19 +2848,23 @@
  * NFS client for backwards compatibility
  */
 unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_nr_threads;
 /* Default cache timeout is 10 minutes */
 unsigned int nfs_idmap_cache_timeout = 600;
 /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
 bool nfs4_disable_idmapping = true;
 unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
 unsigned short send_implementation_id = 1;
 char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
 bool recover_lost_locks = false;
 
+EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
 EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
 EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
 EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
 EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(max_session_cb_slots);
 EXPORT_SYMBOL_GPL(send_implementation_id);
 EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
 EXPORT_SYMBOL_GPL(recover_lost_locks);
@@ -2887,6 +2891,9 @@
 #define param_check_portnr(name, p) __param_check(name, p, unsigned int);
 
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
+MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be "
+		"assigned to the NFSv4 callback channels.");
 module_param(nfs_idmap_cache_timeout, int, 0644);
 module_param(nfs4_disable_idmapping, bool, 0644);
 module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
@@ -2896,6 +2903,9 @@
 module_param(max_session_slots, ushort, 0644);
 MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
 		"requests the client will negotiate");
+module_param(max_session_cb_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 "
+		"callbacks the client will process for a given server");
 module_param(send_implementation_id, ushort, 0644);
 MODULE_PARM_DESC(send_implementation_id,
 		"Send implementation ID with NFSv4.1 exchange_id");
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index df880e9..b672873 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -126,6 +126,7 @@
 const struct nfsd4_layout_ops ff_layout_ops = {
 	.notify_types		=
 			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+	.disable_recalls	= true,
 	.proc_getdeviceinfo	= nfsd4_ff_proc_getdeviceinfo,
 	.encode_getdeviceinfo	= nfsd4_ff_encode_getdeviceinfo,
 	.proc_layoutget		= nfsd4_ff_proc_layoutget,
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 5fbf3bb..b10d557 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -84,6 +84,7 @@
 	struct list_head client_lru;
 	struct list_head close_lru;
 	struct list_head del_recall_lru;
+	struct list_head blocked_locks_lru;
 
 	struct delayed_work laundromat_work;
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 04c68d9..211dc2a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -448,7 +448,7 @@
 {
 	int status;
 
-	if (cb->cb_minorversion == 0)
+	if (cb->cb_clp->cl_minorversion == 0)
 		return 0;
 
 	status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
@@ -485,7 +485,7 @@
 	const struct nfs4_delegation *dp = cb_to_delegation(cb);
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = cb->cb_clp->cl_cb_ident,
-		.minorversion = cb->cb_minorversion,
+		.minorversion = cb->cb_clp->cl_minorversion,
 	};
 
 	encode_cb_compound4args(xdr, &hdr);
@@ -594,7 +594,7 @@
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = 0,
-		.minorversion = cb->cb_minorversion,
+		.minorversion = cb->cb_clp->cl_minorversion,
 	};
 
 	encode_cb_compound4args(xdr, &hdr);
@@ -623,6 +623,62 @@
 }
 #endif /* CONFIG_NFSD_PNFS */
 
+static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so)
+{
+	__be32	*p;
+
+	p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len);
+	p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8);
+	xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len);
+}
+
+static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					const struct nfsd4_callback *cb)
+{
+	const struct nfsd4_blocked_lock *nbl =
+		container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner;
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_clp->cl_minorversion,
+	};
+
+	__be32 *p;
+
+	BUG_ON(hdr.minorversion == 0);
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_CB_NOTIFY_LOCK);
+	encode_nfs_fh4(xdr, &nbl->nbl_fh);
+	encode_stateowner(xdr, &lo->lo_owner);
+	hdr.nops++;
+
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	if (cb) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status || cb->cb_seq_status))
+			return status;
+	}
+	return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
+}
+
 /*
  * RPC procedure tables
  */
@@ -643,6 +699,7 @@
 #ifdef CONFIG_NFSD_PNFS
 	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
 #endif
+	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
 };
 
 static struct rpc_version nfs_cb_version4 = {
@@ -862,7 +919,6 @@
 	struct nfs4_client *clp = cb->cb_clp;
 	u32 minorversion = clp->cl_minorversion;
 
-	cb->cb_minorversion = minorversion;
 	/*
 	 * cb_seq_status is only set in decode_cb_sequence4res,
 	 * and so will remain 1 if an rpc level failure occurs.
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2be9602..42aace4 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -174,7 +174,8 @@
 	list_del_init(&ls->ls_perfile);
 	spin_unlock(&fp->fi_lock);
 
-	vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+	if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+		vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
 	fput(ls->ls_file);
 
 	if (ls->ls_recalled)
@@ -189,6 +190,9 @@
 	struct file_lock *fl;
 	int status;
 
+	if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+		return 0;
+
 	fl = locks_alloc_lock();
 	if (!fl)
 		return -ENOMEM;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 1fb2227..abb09b5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1010,47 +1010,97 @@
 }
 
 static __be32
-nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
-		struct nfsd4_clone *clone)
+nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		  stateid_t *src_stateid, struct file **src,
+		  stateid_t *dst_stateid, struct file **dst)
 {
-	struct file *src, *dst;
 	__be32 status;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
-					    &clone->cl_src_stateid, RD_STATE,
-					    &src, NULL);
+					    src_stateid, RD_STATE, src, NULL);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
 		goto out;
 	}
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
-					    &clone->cl_dst_stateid, WR_STATE,
-					    &dst, NULL);
+					    dst_stateid, WR_STATE, dst, NULL);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
 		goto out_put_src;
 	}
 
 	/* fix up for NFS-specific error code */
-	if (!S_ISREG(file_inode(src)->i_mode) ||
-	    !S_ISREG(file_inode(dst)->i_mode)) {
+	if (!S_ISREG(file_inode(*src)->i_mode) ||
+	    !S_ISREG(file_inode(*dst)->i_mode)) {
 		status = nfserr_wrong_type;
 		goto out_put_dst;
 	}
 
+out:
+	return status;
+out_put_dst:
+	fput(*dst);
+out_put_src:
+	fput(*src);
+	goto out;
+}
+
+static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_clone *clone)
+{
+	struct file *src, *dst;
+	__be32 status;
+
+	status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
+				   &clone->cl_dst_stateid, &dst);
+	if (status)
+		goto out;
+
 	status = nfsd4_clone_file_range(src, clone->cl_src_pos,
 			dst, clone->cl_dst_pos, clone->cl_count);
 
-out_put_dst:
 	fput(dst);
-out_put_src:
 	fput(src);
 out:
 	return status;
 }
 
 static __be32
+nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_copy *copy)
+{
+	struct file *src, *dst;
+	__be32 status;
+	ssize_t bytes;
+
+	status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
+				   &copy->cp_dst_stateid, &dst);
+	if (status)
+		goto out;
+
+	bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
+			dst, copy->cp_dst_pos, copy->cp_count);
+
+	if (bytes < 0)
+		status = nfserrno(bytes);
+	else {
+		copy->cp_res.wr_bytes_written = bytes;
+		copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+		copy->cp_consecutive = 1;
+		copy->cp_synchronous = 1;
+		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+		status = nfs_ok;
+	}
+
+	fput(src);
+	fput(dst);
+out:
+	return status;
+}
+
+static __be32
 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfsd4_fallocate *fallocate, int flags)
 {
@@ -1966,6 +2016,18 @@
 		op_encode_channel_attrs_maxsz) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* wr_callback */ +
+		op_encode_stateid_maxsz /* wr_callback */ +
+		2 /* wr_count */ +
+		1 /* wr_committed */ +
+		op_encode_verifier_maxsz +
+		1 /* cr_consecutive */ +
+		1 /* cr_synchronous */) * sizeof(__be32);
+}
+
 #ifdef CONFIG_NFSD_PNFS
 /*
  * At this stage we don't really know what layout driver will handle the request,
@@ -2328,6 +2390,12 @@
 		.op_name = "OP_CLONE",
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+	[OP_COPY] = {
+		.op_func = (nfsd4op_func)nfsd4_copy,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_COPY",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize,
+	},
 	[OP_SEEK] = {
 		.op_func = (nfsd4op_func)nfsd4_seek,
 		.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 39bfaba..9752beb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -99,6 +99,7 @@
 static void free_session(struct nfsd4_session *);
 
 static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
 
 static bool is_session_dead(struct nfsd4_session *ses)
 {
@@ -210,6 +211,105 @@
 	spin_unlock(&nn->client_lock);
 }
 
+/*
+ * Search @lo's lo_blocked list for a record whose filehandle matches @fh.
+ * A match is unlinked from both lo_blocked and the LRU while
+ * nn->client_lock is held; posix_unblock_lock() is then called on its
+ * file_lock after the spinlock has been dropped.
+ *
+ * Returns the unlinked record, or NULL when nothing matched.
+ */
+static struct nfsd4_blocked_lock *
+find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+			struct nfsd_net *nn)
+{
+	struct nfsd4_blocked_lock *cur, *found = NULL;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(cur, &lo->lo_blocked, nbl_list) {
+		if (fh_match(fh, &cur->nbl_fh)) {
+			list_del_init(&cur->nbl_list);
+			list_del_init(&cur->nbl_lru);
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&nn->client_lock);
+	if (found)
+		posix_unblock_lock(&found->nbl_lock);
+	return found;
+}
+
+/*
+ * Return a blocked-lock record for @fh: either one taken off @lo's list by
+ * find_blocked_lock(), or a freshly allocated one.
+ *
+ * kmalloc() leaves the memory uninitialized, so both list heads of a new
+ * record are set up here; the caller's exit path may run list_del_init()
+ * on them before the record has ever been queued.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct nfsd4_blocked_lock *
+find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+			struct nfsd_net *nn)
+{
+	struct nfsd4_blocked_lock *nbl;
+
+	nbl = find_blocked_lock(lo, fh, nn);
+	if (!nbl) {
+		nbl = kmalloc(sizeof(*nbl), GFP_KERNEL);
+		if (nbl) {
+			INIT_LIST_HEAD(&nbl->nbl_list);
+			INIT_LIST_HEAD(&nbl->nbl_lru);
+			fh_copy_shallow(&nbl->nbl_fh, fh);
+			locks_init_lock(&nbl->nbl_lock);
+			nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
+					&nfsd4_cb_notify_lock_ops,
+					NFSPROC4_CLNT_CB_NOTIFY_LOCK);
+		}
+	}
+	return nbl;
+}
+
+static void
+free_blocked_lock(struct nfsd4_blocked_lock *nbl)
+{
+	locks_release_private(&nbl->nbl_lock);
+	kfree(nbl);
+}
+
+static int
+nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	/*
+	 * Since this is just an optimization, we don't try very hard if it
+	 * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
+	 * just quit trying on anything else.
+	 */
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, 1 * HZ);
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void
+nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb)
+{
+	struct nfsd4_blocked_lock	*nbl = container_of(cb,
+						struct nfsd4_blocked_lock, nbl_cb);
+
+	free_blocked_lock(nbl);
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+	.done		= nfsd4_cb_notify_lock_done,
+	.release	= nfsd4_cb_notify_lock_release,
+};
+
 static inline struct nfs4_stateowner *
 nfs4_get_stateowner(struct nfs4_stateowner *sop)
 {
@@ -3224,9 +3304,10 @@
 		goto out;
 	/* cases below refer to rfc 3530 section 14.2.34: */
 	if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
-		if (conf && !unconf) /* case 2: probable retransmit */
+		if (conf && same_verf(&confirm, &conf->cl_confirm)) {
+			/* case 2: probable retransmit */
 			status = nfs_ok;
-		else /* case 4: client hasn't noticed we rebooted yet? */
+		} else /* case 4: client hasn't noticed we rebooted yet? */
 			status = nfserr_stale_clientid;
 		goto out;
 	}
@@ -4410,9 +4491,11 @@
 	* To finish the open response, we just need to set the rflags.
 	*/
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
-	    !nfsd4_has_session(&resp->cstate))
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK;
+	else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
+
 	if (dp)
 		nfs4_put_stid(&dp->dl_stid);
 	if (stp)
@@ -4501,6 +4584,7 @@
 	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct nfs4_ol_stateid *stp;
+	struct nfsd4_blocked_lock *nbl;
 	struct list_head *pos, *next, reaplist;
 	time_t cutoff = get_seconds() - nn->nfsd4_lease;
 	time_t t, new_timeo = nn->nfsd4_lease;
@@ -4569,6 +4653,41 @@
 	}
 	spin_unlock(&nn->client_lock);
 
+	/*
+	 * It's possible for a client to try and acquire an already held lock
+	 * that is being held for a long time, and then lose interest in it.
+	 * So, we clean out any un-revisited request after a lease period
+	 * under the assumption that the client is no longer interested.
+	 *
+	 * RFC5661, sec. 9.6 states that the client must not rely on getting
+	 * notifications and must continue to poll for locks, even when the
+	 * server supports them. Thus this shouldn't lead to clients blocking
+	 * indefinitely once the lock does become free.
+	 */
+	BUG_ON(!list_empty(&reaplist));
+	spin_lock(&nn->client_lock);
+	while (!list_empty(&nn->blocked_locks_lru)) {
+		nbl = list_first_entry(&nn->blocked_locks_lru,
+					struct nfsd4_blocked_lock, nbl_lru);
+		if (time_after((unsigned long)nbl->nbl_time,
+			       (unsigned long)cutoff)) {
+			t = nbl->nbl_time - cutoff;
+			new_timeo = min(new_timeo, t);
+			break;
+		}
+		list_move(&nbl->nbl_lru, &reaplist);
+		list_del_init(&nbl->nbl_list);
+	}
+	spin_unlock(&nn->client_lock);
+	/* The expired entries now sit on the local reaplist; drain that. */
+	while (!list_empty(&reaplist)) {
+		nbl = list_first_entry(&reaplist,
+					struct nfsd4_blocked_lock, nbl_lru);
+		list_del_init(&nbl->nbl_lru);
+		posix_unblock_lock(&nbl->nbl_lock);
+		free_blocked_lock(nbl);
+	}
+
 	new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
 	return new_timeo;
 }
@@ -5309,7 +5428,31 @@
 		nfs4_put_stateowner(&lo->lo_owner);
 }
 
+static void
+nfsd4_lm_notify(struct file_lock *fl)
+{
+	struct nfs4_lockowner		*lo = (struct nfs4_lockowner *)fl->fl_owner;
+	struct net			*net = lo->lo_owner.so_client->net;
+	struct nfsd_net			*nn = net_generic(net, nfsd_net_id);
+	struct nfsd4_blocked_lock	*nbl = container_of(fl,
+						struct nfsd4_blocked_lock, nbl_lock);
+	bool queue = false;
+
+	/* An empty list means that something else is going to be using it */
+	spin_lock(&nn->client_lock);
+	if (!list_empty(&nbl->nbl_list)) {
+		list_del_init(&nbl->nbl_list);
+		list_del_init(&nbl->nbl_lru);
+		queue = true;
+	}
+	spin_unlock(&nn->client_lock);
+
+	if (queue)
+		nfsd4_run_cb(&nbl->nbl_cb);
+}
+
 static const struct lock_manager_operations nfsd_posix_mng_ops  = {
+	.lm_notify = nfsd4_lm_notify,
 	.lm_get_owner = nfsd4_fl_get_owner,
 	.lm_put_owner = nfsd4_fl_put_owner,
 };
@@ -5407,6 +5550,7 @@
 	lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
 	if (!lo)
 		return NULL;
+	INIT_LIST_HEAD(&lo->lo_blocked);
 	INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
 	lo->lo_owner.so_is_open_owner = 0;
 	lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
@@ -5588,12 +5732,15 @@
 	struct nfs4_ol_stateid *open_stp = NULL;
 	struct nfs4_file *fp;
 	struct file *filp = NULL;
+	struct nfsd4_blocked_lock *nbl = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
 	__be32 status = 0;
 	int lkflg;
 	int err;
 	bool new = false;
+	unsigned char fl_type;
+	unsigned int fl_flags = FL_POSIX;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
@@ -5658,46 +5805,55 @@
 	if (!locks_in_grace(net) && lock->lk_reclaim)
 		goto out;
 
-	file_lock = locks_alloc_lock();
-	if (!file_lock) {
-		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
-		status = nfserr_jukebox;
-		goto out;
-	}
-
 	fp = lock_stp->st_stid.sc_file;
 	switch (lock->lk_type) {
-		case NFS4_READ_LT:
 		case NFS4_READW_LT:
+			if (nfsd4_has_session(cstate))
+				fl_flags |= FL_SLEEP;
+			/* Fallthrough */
+		case NFS4_READ_LT:
 			spin_lock(&fp->fi_lock);
 			filp = find_readable_file_locked(fp);
 			if (filp)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
 			spin_unlock(&fp->fi_lock);
-			file_lock->fl_type = F_RDLCK;
+			fl_type = F_RDLCK;
 			break;
-		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
+			if (nfsd4_has_session(cstate))
+				fl_flags |= FL_SLEEP;
+			/* Fallthrough */
+		case NFS4_WRITE_LT:
 			spin_lock(&fp->fi_lock);
 			filp = find_writeable_file_locked(fp);
 			if (filp)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
 			spin_unlock(&fp->fi_lock);
-			file_lock->fl_type = F_WRLCK;
+			fl_type = F_WRLCK;
 			break;
 		default:
 			status = nfserr_inval;
 		goto out;
 	}
+
 	if (!filp) {
 		status = nfserr_openmode;
 		goto out;
 	}
 
+	nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
+	if (!nbl) {
+		dprintk("NFSD: %s: unable to allocate block!\n", __func__);
+		status = nfserr_jukebox;
+		goto out;
+	}
+
+	file_lock = &nbl->nbl_lock;
+	file_lock->fl_type = fl_type;
 	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
 	file_lock->fl_pid = current->tgid;
 	file_lock->fl_file = filp;
-	file_lock->fl_flags = FL_POSIX;
+	file_lock->fl_flags = fl_flags;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = lock->lk_offset;
 	file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
@@ -5710,18 +5866,33 @@
 		goto out;
 	}
 
+	if (fl_flags & FL_SLEEP) {
+		/*
+		 * Record the time in seconds: the expiry scan compares
+		 * nbl_time against a cutoff computed from get_seconds().
+		 */
+		nbl->nbl_time = get_seconds();
+		spin_lock(&nn->client_lock);
+		list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
+		list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
+		spin_unlock(&nn->client_lock);
+	}
+
 	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
-	switch (-err) {
+	switch (err) {
 	case 0: /* success! */
 		nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
 		status = 0;
 		break;
-	case (EAGAIN):		/* conflock holds conflicting lock */
+	case FILE_LOCK_DEFERRED:
+		nbl = NULL;
+		/* Fallthrough */
+	case -EAGAIN:		/* conflock holds conflicting lock */
 		status = nfserr_denied;
 		dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
 		nfs4_set_lock_denied(conflock, &lock->lk_denied);
 		break;
-	case (EDEADLK):
+	case -EDEADLK:
 		status = nfserr_deadlock;
 		break;
 	default:
@@ -5730,6 +5897,16 @@
 		break;
 	}
 out:
+	if (nbl) {
+		/* dequeue it if we queued it before */
+		if (fl_flags & FL_SLEEP) {
+			spin_lock(&nn->client_lock);
+			list_del_init(&nbl->nbl_list);
+			list_del_init(&nbl->nbl_lru);
+			spin_unlock(&nn->client_lock);
+		}
+		free_blocked_lock(nbl);
+	}
 	if (filp)
 		fput(filp);
 	if (lock_stp) {
@@ -5753,8 +5930,6 @@
 	if (open_stp)
 		nfs4_put_stid(&open_stp->st_stid);
 	nfsd4_bump_seqid(cstate, status);
-	if (file_lock)
-		locks_free_lock(file_lock);
 	if (conflock)
 		locks_free_lock(conflock);
 	return status;
@@ -6768,6 +6943,7 @@
 	INIT_LIST_HEAD(&nn->client_lru);
 	INIT_LIST_HEAD(&nn->close_lru);
 	INIT_LIST_HEAD(&nn->del_recall_lru);
+	INIT_LIST_HEAD(&nn->blocked_locks_lru);
 	spin_lock_init(&nn->client_lock);
 
 	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
@@ -6865,6 +7041,7 @@
 	struct nfs4_delegation *dp = NULL;
 	struct list_head *pos, *next, reaplist;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct nfsd4_blocked_lock *nbl;
 
 	cancel_delayed_work_sync(&nn->laundromat_work);
 	locks_end_grace(&nn->nfsd4_manager);
@@ -6885,6 +7062,24 @@
 		nfs4_put_stid(&dp->dl_stid);
 	}
 
+	BUG_ON(!list_empty(&reaplist));
+	spin_lock(&nn->client_lock);
+	while (!list_empty(&nn->blocked_locks_lru)) {
+		nbl = list_first_entry(&nn->blocked_locks_lru,
+					struct nfsd4_blocked_lock, nbl_lru);
+		list_move(&nbl->nbl_lru, &reaplist);
+		list_del_init(&nbl->nbl_list);
+	}
+	spin_unlock(&nn->client_lock);
+	/* Every entry was moved onto the local reaplist above; drain that. */
+	while (!list_empty(&reaplist)) {
+		nbl = list_first_entry(&reaplist,
+					struct nfsd4_blocked_lock, nbl_lru);
+		list_del_init(&nbl->nbl_lru);
+		posix_unblock_lock(&nbl->nbl_lock);
+		free_blocked_lock(nbl);
+	}
+
 	nfsd4_client_tracking_exit(net);
 	nfs4_state_destroy_net(net);
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0aa0236..c2d2895 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1694,6 +1694,30 @@
 }
 
 static __be32
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+{
+	DECODE_HEAD;
+	unsigned int tmp;
+
+	status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+	if (status)
+		return status;
+	status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
+	p = xdr_decode_hyper(p, &copy->cp_src_pos);
+	p = xdr_decode_hyper(p, &copy->cp_dst_pos);
+	p = xdr_decode_hyper(p, &copy->cp_count);
+	copy->cp_consecutive = be32_to_cpup(p++);
+	copy->cp_synchronous = be32_to_cpup(p++);
+	tmp = be32_to_cpup(p); /* Source server list not supported */
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 {
 	DECODE_HEAD;
@@ -1793,7 +1817,7 @@
 
 	/* new operations for NFSv4.2 */
 	[OP_ALLOCATE]		= (nfsd4_dec)nfsd4_decode_fallocate,
-	[OP_COPY]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_COPY]		= (nfsd4_dec)nfsd4_decode_copy,
 	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DEALLOCATE]		= (nfsd4_dec)nfsd4_decode_fallocate,
 	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4062,7 +4086,7 @@
 	u32 starting_len = xdr->buf->len, needed_len;
 	__be32 *p;
 
-	dprintk("%s: err %d\n", __func__, nfserr);
+	dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
 	if (nfserr)
 		goto out;
 
@@ -4202,6 +4226,58 @@
 #endif /* CONFIG_NFSD_PNFS */
 
+/*
+ * Encode a write result: one zero word, then the 64-bit count of bytes
+ * written, the stable_how word and the fixed-size write verifier.
+ *
+ * Returns nfserr_resource when the reply buffer has no room for them.
+ */
 static __be32
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+
+	*p++ = cpu_to_be32(0);
+	p = xdr_encode_hyper(p, write->wr_bytes_written);
+	*p++ = cpu_to_be32(write->wr_stable_how);
+	p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+				    NFS4_VERIFIER_SIZE);
+	return nfs_ok;
+}
+
+/*
+ * Encode the COPY result.  Nothing is encoded when @nfserr is already set;
+ * otherwise the write result is followed by the cp_consecutive and
+ * cp_synchronous words.
+ *
+ * Returns @nfserr if it was set on entry, nfs_ok on success, or
+ * nfserr_resource when the reply buffer runs out of room.
+ */
+static __be32
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_copy *copy)
+{
+	__be32 *p;
+
+	if (!nfserr) {
+		nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+		if (nfserr)
+			return nfserr;
+
+		p = xdr_reserve_space(&resp->xdr, 4 + 4);
+		/* xdr_reserve_space() yields NULL when the buffer is full */
+		if (!p)
+			return nfserr_resource;
+		*p++ = cpu_to_be32(copy->cp_consecutive);
+		*p++ = cpu_to_be32(copy->cp_synchronous);
+	}
+	return nfserr;
+}
+
+static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  struct nfsd4_seek *seek)
 {
@@ -4300,7 +4359,7 @@
 
 	/* NFSv4.2 operations */
 	[OP_ALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_COPY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_COPY]		= (nfsd4_enc)nfsd4_encode_copy,
 	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_DEALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 65ad016..36b2af9 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1216,6 +1216,8 @@
 		goto out_idmap_error;
 	nn->nfsd4_lease = 90;	/* default lease time */
 	nn->nfsd4_grace = 90;
+	nn->clverifier_counter = prandom_u32();
+	nn->clientid_counter = prandom_u32();
 	return 0;
 
 out_idmap_error:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08188743..010aff5 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -789,6 +789,7 @@
 		{ nfserr_toosmall, -ETOOSMALL },
 		{ nfserr_serverfault, -ESERVERFAULT },
 		{ nfserr_serverfault, -ENFILE },
+		{ nfserr_io, -EUCLEAN },
 	};
 	int	i;
 
@@ -796,7 +797,7 @@
 		if (nfs_errtbl[i].syserr == errno)
 			return nfs_errtbl[i].nfserr;
 	}
-	WARN(1, "nfsd: non-standard errno: %d\n", errno);
+	WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
 	return nfserr_io;
 }
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 45007ac..a2b65fc 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -366,14 +366,21 @@
 };
 #endif
 
+/* Only used under nfsd_mutex, so this atomic may be overkill: */
+static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
+
 static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+	/* check if the notifier still has clients */
+	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
+		unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
 #if IS_ENABLED(CONFIG_IPV6)
-	unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+		unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
 #endif
+	}
+
 	/*
 	 * write_ports can create the server without actually starting
 	 * any threads--if we get shut down before any threads are
@@ -488,10 +495,13 @@
 	}
 
 	set_max_drc();
-	register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+	/* check if the notifier is already set */
+	if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
+		register_inetaddr_notifier(&nfsd_inetaddr_notifier);
 #if IS_ENABLED(CONFIG_IPV6)
-	register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+		register_inet6addr_notifier(&nfsd_inet6addr_notifier);
 #endif
+	}
 	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
 	return 0;
 }
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 0c2a716..d27a5aa 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -19,6 +19,7 @@
 
 struct nfsd4_layout_ops {
 	u32		notify_types;
+	bool		disable_recalls;
 
 	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
 			struct svc_rqst *rqstp,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index b95adf9..c939936 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -63,7 +63,6 @@
 
 struct nfsd4_callback {
 	struct nfs4_client *cb_clp;
-	u32 cb_minorversion;
 	struct rpc_message cb_msg;
 	const struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
@@ -441,11 +440,11 @@
 /*
  * Represents a generic "lockowner". Similar to an openowner. References to it
  * are held by the lock stateids that are created on its behalf. This object is
- * a superset of the nfs4_stateowner struct (or would be if it needed any extra
- * fields).
+ * a superset of the nfs4_stateowner struct.
  */
 struct nfs4_lockowner {
-	struct nfs4_stateowner	lo_owner; /* must be first element */
+	struct nfs4_stateowner	lo_owner;	/* must be first element */
+	struct list_head	lo_blocked;	/* blocked file_locks */
 };
 
 static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
@@ -572,6 +571,7 @@
 	NFSPROC4_CLNT_CB_RECALL,
 	NFSPROC4_CLNT_CB_LAYOUT,
 	NFSPROC4_CLNT_CB_SEQUENCE,
+	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 };
 
 /* Returns true iff a is later than b: */
@@ -580,6 +580,20 @@
 	return (s32)(a->si_generation - b->si_generation) > 0;
 }
 
+/*
+ * When a client tries to get a lock on a file, we set one of these objects
+ * on the blocking lock. When the lock becomes free, we can then issue a
+ * CB_NOTIFY_LOCK to the client.
+ */
+struct nfsd4_blocked_lock {
+	struct list_head	nbl_list;
+	struct list_head	nbl_lru;
+	unsigned long		nbl_time;
+	struct file_lock	nbl_lock;
+	struct knfsd_fh		nbl_fh;
+	struct nfsd4_callback	nbl_cb;
+};
+
 struct nfsd4_compound_state;
 struct nfsd_net;
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ff476e6..8ca642f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -513,6 +513,22 @@
 			count));
 }
 
+ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
+			     u64 dst_pos, u64 count)
+{
+
+	/*
+	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
+	 * thread and client rpc slot.  The choice of 4MB is somewhat
+	 * arbitrary.  We might instead base this on r/wsize, or make it
+	 * tunable, or use a time instead of a byte limit, or implement
+	 * asynchronous copy.  In theory a client could also recognize a
+	 * limit like this and pipeline multiple COPY requests.
+	 */
+	count = min_t(u64, count, 1 << 22);
+	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+}
+
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			   struct file *file, loff_t offset, loff_t len,
 			   int flags)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 3cbb1b3..0bf9e7b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -96,6 +96,8 @@
 				struct svc_fh *res);
 __be32		nfsd_link(struct svc_rqst *, struct svc_fh *,
 				char *, int, struct svc_fh *);
+ssize_t		nfsd_copy_file_range(struct file *, u64,
+				     struct file *, u64, u64);
 __be32		nfsd_rename(struct svc_rqst *,
 				struct svc_fh *, char *, int,
 				struct svc_fh *, char *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index beea0c5..8fda4ab 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -503,6 +503,28 @@
 	u64		cl_count;
 };
 
+struct nfsd42_write_res {
+	u64			wr_bytes_written;
+	u32			wr_stable_how;
+	nfs4_verifier		wr_verifier;
+};
+
+struct nfsd4_copy {
+	/* request */
+	stateid_t	cp_src_stateid;
+	stateid_t	cp_dst_stateid;
+	u64		cp_src_pos;
+	u64		cp_dst_pos;
+	u64		cp_count;
+
+	/* both */
+	bool		cp_consecutive;
+	bool		cp_synchronous;
+
+	/* response */
+	struct nfsd42_write_res	cp_res;
+};
+
 struct nfsd4_seek {
 	/* request */
 	stateid_t	seek_stateid;
@@ -568,6 +590,7 @@
 		struct nfsd4_fallocate		allocate;
 		struct nfsd4_fallocate		deallocate;
 		struct nfsd4_clone		clone;
+		struct nfsd4_copy		copy;
 		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c47f6fd..49b719d 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -28,3 +28,12 @@
 #define NFS4_dec_cb_layout_sz		(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+
+#define NFS4_enc_cb_notify_lock_sz	(cb_compound_enc_hdr_sz +        \
+					cb_sequence_enc_sz +             \
+					2 + 1 +				 \
+					XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+					enc_nfs4_fh_sz)
+#define NFS4_dec_cb_notify_lock_sz	(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
diff --git a/fs/open.c b/fs/open.c
index a7719cf..d3ed817 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -267,6 +267,11 @@
 	    (mode & ~FALLOC_FL_INSERT_RANGE))
 		return -EINVAL;
 
+	/* Unshare range should only be used with allocate mode. */
+	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
+	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
+		return -EINVAL;
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 584e87e..26ef195 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -55,6 +55,8 @@
 				   xfs_ag_resv.o \
 				   xfs_rmap.o \
 				   xfs_rmap_btree.o \
+				   xfs_refcount.o \
+				   xfs_refcount_btree.o \
 				   xfs_sb.o \
 				   xfs_symlink_remote.o \
 				   xfs_trans_resv.o \
@@ -88,6 +90,7 @@
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
+				   xfs_reflink.o \
 				   xfs_stats.o \
 				   xfs_super.o \
 				   xfs_symlink.o \
@@ -100,16 +103,20 @@
 # low-level transaction/log code
 xfs-y				+= xfs_log.o \
 				   xfs_log_cil.o \
+				   xfs_bmap_item.o \
 				   xfs_buf_item.o \
 				   xfs_extfree_item.o \
 				   xfs_icreate_item.o \
 				   xfs_inode_item.o \
+				   xfs_refcount_item.o \
 				   xfs_rmap_item.o \
 				   xfs_log_recover.o \
 				   xfs_trans_ail.o \
+				   xfs_trans_bmap.o \
 				   xfs_trans_buf.o \
 				   xfs_trans_extfree.o \
 				   xfs_trans_inode.o \
+				   xfs_trans_refcount.o \
 				   xfs_trans_rmap.o \
 
 # optional features
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e3ae0f2..e5ebc37 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -38,6 +38,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -108,7 +109,9 @@
 	trace_xfs_ag_resv_critical(pag, type, avail);
 
 	/* Critically low if less than 10% or max btree height remains. */
-	return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+	return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL,
+			XFS_RANDOM_AG_RESV_CRITICAL);
 }
 
 /*
@@ -228,6 +231,11 @@
 	if (pag->pag_meta_resv.ar_asked == 0) {
 		ask = used = 0;
 
+		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
+				pag->pag_agno, &ask, &used);
+		if (error)
+			goto out;
+
 		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 				ask, used);
 		if (error)
@@ -238,6 +246,11 @@
 	if (pag->pag_agfl_resv.ar_asked == 0) {
 		ask = used = 0;
 
+		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
+				&ask, &used);
+		if (error)
+			goto out;
+
 		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
 		if (error)
 			goto out;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ca75dc9..effb64c 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -52,10 +52,23 @@
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
 		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
 
+unsigned int
+xfs_refc_block(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return XFS_RMAP_BLOCK(mp) + 1;
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		return XFS_FIBT_BLOCK(mp) + 1;
+	return XFS_IBT_BLOCK(mp) + 1;
+}
+
 xfs_extlen_t
 xfs_prealloc_blocks(
 	struct xfs_mount	*mp)
 {
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		return xfs_refc_block(mp) + 1;
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 		return XFS_RMAP_BLOCK(mp) + 1;
 	if (xfs_sb_version_hasfinobt(&mp->m_sb))
@@ -115,6 +128,8 @@
 		blocks++;		/* finobt root block */
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 		blocks++; 		/* rmap root block */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		blocks++;		/* refcount root block */
 
 	return mp->m_sb.sb_agblocks - blocks;
 }
@@ -2321,6 +2336,9 @@
 		offsetof(xfs_agf_t, agf_btreeblks),
 		offsetof(xfs_agf_t, agf_uuid),
 		offsetof(xfs_agf_t, agf_rmap_blocks),
+		offsetof(xfs_agf_t, agf_refcount_blocks),
+		offsetof(xfs_agf_t, agf_refcount_root),
+		offsetof(xfs_agf_t, agf_refcount_level),
 		/* needed so that we don't log the whole rest of the structure: */
 		offsetof(xfs_agf_t, agf_spare64),
 		sizeof(xfs_agf_t)
@@ -2458,6 +2476,10 @@
 	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
 		return false;
 
+	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+		return false;
+
 	return true;;
 
 }
@@ -2578,6 +2600,7 @@
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		pag->pagf_levels[XFS_BTNUM_RMAPi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
 		spin_lock_init(&pag->pagb_lock);
 		pag->pagb_count = 0;
 		pag->pagb_tree = RB_ROOT;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9d7f61d..c27344c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -48,6 +48,7 @@
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
 #include "xfs_ag_resv.h"
+#include "xfs_refcount.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
@@ -140,7 +141,8 @@
  */
 static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
 {
-	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	return whichfork != XFS_COW_FORK &&
+		XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
 		XFS_IFORK_NEXTENTS(ip, whichfork) >
 			XFS_IFORK_MAXEXT(ip, whichfork);
 }
@@ -150,7 +152,8 @@
  */
 static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
 {
-	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+	return whichfork != XFS_COW_FORK &&
+		XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
 		XFS_IFORK_NEXTENTS(ip, whichfork) <=
 			XFS_IFORK_MAXEXT(ip, whichfork);
 }
@@ -640,6 +643,7 @@
 
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(whichfork != XFS_COW_FORK);
 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
 	rblock = ifp->if_broot;
@@ -706,6 +710,7 @@
 	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
 
 	mp = ip->i_mount;
+	ASSERT(whichfork != XFS_COW_FORK);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
 
@@ -748,6 +753,7 @@
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
 	} else if (dfops->dop_low) {
+try_another_ag:
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = *firstblock;
 	} else {
@@ -762,6 +768,21 @@
 		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 		return error;
 	}
+
+	/*
+	 * During a CoW operation, the allocation and bmbt updates occur in
+	 * different transactions.  The mapping code tries to put new bmbt
+	 * blocks near extents being mapped, but the only way to guarantee this
+	 * is if the alloc and the mapping happen in a single transaction that
+	 * has a block reservation.  That isn't the case here, so if we run out
+	 * of space we'll try again with another AG.
+	 */
+	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+	    args.fsbno == NULLFSBLOCK &&
+	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+		dfops->dop_low = true;
+		goto try_another_ag;
+	}
 	/*
 	 * Allocation can't fail, the space was reserved.
 	 */
@@ -837,6 +858,7 @@
 {
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 
+	ASSERT(whichfork != XFS_COW_FORK);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
 	ASSERT(ifp->if_bytes == 0);
 	ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
@@ -896,6 +918,7 @@
 	 * file currently fits in an inode.
 	 */
 	if (*firstblock == NULLFSBLOCK) {
+try_another_ag:
 		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
 		args.type = XFS_ALLOCTYPE_START_BNO;
 	} else {
@@ -908,6 +931,19 @@
 	if (error)
 		goto done;
 
+	/*
+	 * During a CoW operation, the allocation and bmbt updates occur in
+	 * different transactions.  The mapping code tries to put new bmbt
+	 * blocks near extents being mapped, but the only way to guarantee this
+	 * is if the alloc and the mapping happen in a single transaction that
+	 * has a block reservation.  That isn't the case here, so if we run out
+	 * of space we'll try again with another AG.
+	 */
+	if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
+	    args.fsbno == NULLFSBLOCK &&
+	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+		goto try_another_ag;
+	}
 	/* Can't fail, the space was reserved. */
 	ASSERT(args.fsbno != NULLFSBLOCK);
 	ASSERT(args.len == 1);
@@ -1670,7 +1706,8 @@
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
-	struct xfs_bmalloca	*bma)
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
 {
 	struct xfs_bmbt_irec	*new = &bma->got;
 	int			diff;	/* temp value */
@@ -1688,11 +1725,14 @@
 	xfs_filblks_t		temp=0;	/* value for da_new calculations */
 	xfs_filblks_t		temp2=0;/* value for da_new calculations */
 	int			tmp_rval;	/* partial logging flags */
-	int			whichfork = XFS_DATA_FORK;
 	struct xfs_mount	*mp;
+	xfs_extnum_t		*nextents;
 
 	mp = bma->ip->i_mount;
 	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	ASSERT(whichfork != XFS_ATTR_FORK);
+	nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
+						&bma->ip->i_d.di_nextents);
 
 	ASSERT(bma->idx >= 0);
 	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1706,6 +1746,9 @@
 #define	RIGHT		r[1]
 #define	PREV		r[2]
 
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
+
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
@@ -1792,7 +1835,7 @@
 		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
 		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
-		bma->ip->i_d.di_nextents--;
+		(*nextents)--;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -1894,7 +1937,7 @@
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -1964,7 +2007,7 @@
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2048,7 +2091,7 @@
 		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2117,7 +2160,7 @@
 		RIGHT.br_blockcount = temp2;
 		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
 		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2215,7 +2258,8 @@
 
 	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
-	bma->logflags |= rval;
+	if (whichfork != XFS_COW_FORK)
+		bma->logflags |= rval;
 	return error;
 #undef	LEFT
 #undef	RIGHT
@@ -2759,6 +2803,7 @@
 STATIC void
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
+	int			whichfork,
 	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
 {
@@ -2770,8 +2815,10 @@
 	int			state;  /* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
 
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
 	state = 0;
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 	ASSERT(isnullstartblock(new->br_startblock));
 
 	/*
@@ -2789,7 +2836,7 @@
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 
@@ -2923,6 +2970,7 @@
 	ASSERT(!isnullstartblock(new->br_startblock));
 	ASSERT(!bma->cur ||
 	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+	ASSERT(whichfork != XFS_COW_FORK);
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
@@ -3648,7 +3696,9 @@
 	else if (mp->m_dalign)
 		stripe_align = mp->m_dalign;
 
-	if (xfs_alloc_is_userdata(ap->datatype))
+	if (ap->flags & XFS_BMAPI_COWFORK)
+		align = xfs_get_cowextsz_hint(ap->ip);
+	else if (xfs_alloc_is_userdata(ap->datatype))
 		align = xfs_get_extsz_hint(ap->ip);
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3856,7 +3906,8 @@
 		ASSERT(nullfb || fb_agno == args.agno ||
 		       (ap->dfops->dop_low && fb_agno < args.agno));
 		ap->length = args.len;
-		ap->ip->i_d.di_nblocks += args.len;
+		if (!(ap->flags & XFS_BMAPI_COWFORK))
+			ap->ip->i_d.di_nblocks += args.len;
 		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 		if (ap->wasdel)
 			ap->ip->i_delayed_blks -= args.len;
@@ -3876,6 +3927,63 @@
 }
 
 /*
+ * For a remap operation, just "allocate" an extent at the address that the
+ * caller passed in, and ensure that the AGFL is the right size.  The caller
+ * will then map the "allocated" extent into the file somewhere.
+ */
+STATIC int
+xfs_bmap_remap_alloc(
+	struct xfs_bmalloca	*ap)
+{
+	struct xfs_trans	*tp = ap->tp;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_fsblock_t		bno;	/* fs-wide block number */
+	struct xfs_alloc_arg	args;
+	int			error;
+
+	/*
+	 * Validate that the block number is legal - this enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 * *ap->firstblock is a filesystem-wide block number, so keep it in an
+	 * xfs_fsblock_t: a narrower AG-relative type would lose the AG bits
+	 * before the value is split into (agno, agbno) and stored in blkno.
+	 */
+	memset(&args, 0, sizeof(struct xfs_alloc_arg));
+	args.tp = ap->tp;
+	args.mp = ap->tp->t_mountp;
+	bno = *ap->firstblock;
+	args.agno = XFS_FSB_TO_AGNO(mp, bno);
+	args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	if (args.agno >= mp->m_sb.sb_agcount ||
+	    args.agbno >= mp->m_sb.sb_agblocks)
+		return -EFSCORRUPTED;
+
+	/* "Allocate" the extent from the range we passed in. */
+	trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
+	ap->blkno = bno;
+	ap->ip->i_d.di_nblocks += ap->length;
+	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+
+	/* Fix the freelist, like a real allocator does. */
+	args.datatype = ap->datatype;
+	args.pag = xfs_perag_get(args.mp, args.agno);
+	ASSERT(args.pag);
+
+	/*
+	 * The freelist fixing code will decline the allocation if
+	 * the size and shape of the free space doesn't allow for
+	 * allocating the extent and updating all the metadata that
+	 * happens during an allocation.  We're remapping, not
+	 * allocating, so skip that check by pretending to be freeing.
+	 */
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	xfs_perag_put(args.pag);
+	if (error)
+		trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
+	return error;
+}
+
+/*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
  * It figures out where to ask the underlying allocator to put the new extent.
  */
@@ -3883,6 +3991,8 @@
 xfs_bmap_alloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
 {
+	if (ap->flags & XFS_BMAPI_REMAP)
+		return xfs_bmap_remap_alloc(ap);
 	if (XFS_IS_REALTIME_INODE(ap->ip) &&
 	    xfs_alloc_is_userdata(ap->datatype))
 		return xfs_bmap_rtalloc(ap);
@@ -4012,12 +4122,11 @@
 	int			error;
 	int			eof;
 	int			n = 0;
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
+	int			whichfork = xfs_bmapi_whichfork(flags);
 
 	ASSERT(*nmap >= 1);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
-			   XFS_BMAPI_IGSTATE)));
+			   XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK)));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
@@ -4035,6 +4144,16 @@
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
+	/* No CoW fork?  Return a hole. */
+	if (whichfork == XFS_COW_FORK && !ifp) {
+		mval->br_startoff = bno;
+		mval->br_startblock = HOLESTARTBLOCK;
+		mval->br_blockcount = len;
+		mval->br_state = XFS_EXT_NORM;
+		*nmap = 1;
+		return 0;
+	}
+
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, whichfork);
 		if (error)
@@ -4084,6 +4203,7 @@
 int
 xfs_bmapi_reserve_delalloc(
 	struct xfs_inode	*ip,
+	int			whichfork,
 	xfs_fileoff_t		aoff,
 	xfs_filblks_t		len,
 	struct xfs_bmbt_irec	*got,
@@ -4092,7 +4212,7 @@
 	int			eof)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	xfs_extlen_t		alen;
 	xfs_extlen_t		indlen;
 	char			rt = XFS_IS_REALTIME_INODE(ip);
@@ -4104,7 +4224,10 @@
 		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
 
 	/* Figure out the extent size, adjust alen */
-	extsz = xfs_get_extsz_hint(ip);
+	if (whichfork == XFS_COW_FORK)
+		extsz = xfs_get_cowextsz_hint(ip);
+	else
+		extsz = xfs_get_extsz_hint(ip);
 	if (extsz) {
 		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
 					       1, 0, &aoff, &alen);
@@ -4151,7 +4274,7 @@
 	got->br_startblock = nullstartblock(indlen);
 	got->br_blockcount = alen;
 	got->br_state = XFS_EXT_NORM;
-	xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+	xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
 
 	/*
 	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
@@ -4182,8 +4305,7 @@
 	struct xfs_bmalloca	*bma)
 {
 	struct xfs_mount	*mp = bma->ip->i_mount;
-	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
+	int			whichfork = xfs_bmapi_whichfork(bma->flags);
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 	int			tmp_logflags = 0;
 	int			error;
@@ -4278,7 +4400,7 @@
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
 	if (bma->wasdel)
-		error = xfs_bmap_add_extent_delay_real(bma);
+		error = xfs_bmap_add_extent_delay_real(bma, whichfork);
 	else
 		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
 
@@ -4308,8 +4430,7 @@
 	xfs_filblks_t		len,
 	int			flags)
 {
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
+	int			whichfork = xfs_bmapi_whichfork(flags);
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 	int			tmp_logflags = 0;
 	int			error;
@@ -4325,6 +4446,8 @@
 			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
 		return 0;
 
+	ASSERT(whichfork != XFS_COW_FORK);
+
 	/*
 	 * Modify (by adding) the state flag, if writing.
 	 */
@@ -4431,8 +4554,7 @@
 	orig_mval = mval;
 	orig_nmap = *nmap;
 #endif
-	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-		XFS_ATTR_FORK : XFS_DATA_FORK;
+	whichfork = xfs_bmapi_whichfork(flags);
 
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
@@ -4441,6 +4563,11 @@
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
+	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
+	ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
+	ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
 
 	/* zeroing is for currently only for data extents, not metadata */
 	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4502,6 +4629,14 @@
 		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
 
 		/*
+		 * Make sure we only reflink into a hole.
+		 */
+		if (flags & XFS_BMAPI_REMAP)
+			ASSERT(inhole);
+		if (flags & XFS_BMAPI_COWFORK)
+			ASSERT(!inhole);
+
+		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
@@ -4531,6 +4666,17 @@
 				goto error0;
 			if (bma.blkno == NULLFSBLOCK)
 				break;
+
+			/*
+			 * If this is a CoW allocation, record the data in
+			 * the refcount btree for orphan recovery.
+			 */
+			if (whichfork == XFS_COW_FORK) {
+				error = xfs_refcount_alloc_cow_extent(mp, dfops,
+						bma.blkno, bma.length);
+				if (error)
+					goto error0;
+			}
 		}
 
 		/* Deal with the allocated space we found.  */
@@ -4696,7 +4842,8 @@
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
-	int			whichfork) /* data or attr fork */
+	int			whichfork, /* data or attr fork */
+	int			bflags)	/* bmapi flags */
 {
 	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
 	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
@@ -4725,6 +4872,8 @@
 
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
+	else if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
@@ -4805,6 +4954,7 @@
 		/*
 		 * Matches the whole extent.  Delete the entry.
 		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_iext_remove(ip, *idx, 1,
 				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
 		--*idx;
@@ -4988,9 +5138,16 @@
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
-	if (do_fx)
-		xfs_bmap_add_free(mp, dfops, del->br_startblock,
-				del->br_blockcount, NULL);
+	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
+			error = xfs_refcount_decrease_extent(mp, dfops, del);
+			if (error)
+				goto done;
+		} else
+			xfs_bmap_add_free(mp, dfops, del->br_startblock,
+					del->br_blockcount, NULL);
+	}
+
 	/*
 	 * Adjust inode # blocks in the file.
 	 */
@@ -4999,7 +5156,7 @@
 	/*
 	 * Adjust quota data.
 	 */
-	if (qfield)
+	if (qfield && !(bflags & XFS_BMAPI_REMAP))
 		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
 
 	/*
@@ -5014,6 +5171,175 @@
 	return error;
 }
 
+/*
+ * Remove the range described by @del from the extent in @ip's CoW fork
+ * that contains it, trimming or splitting that extent as needed.  This is
+ * modelled on xfs_bmap_del_extent.
+ */
+int
+xfs_bunmapi_cow(
+	struct xfs_inode		*ip,
+	struct xfs_bmbt_irec		*del)
+{
+	xfs_filblks_t			da_new;
+	xfs_filblks_t			da_old;
+	xfs_fsblock_t			del_endblock = 0;
+	xfs_fileoff_t			del_endoff;
+	int				delay;
+	struct xfs_bmbt_rec_host	*ep;
+	int				error;
+	struct xfs_bmbt_irec		got;
+	xfs_fileoff_t			got_endoff;
+	struct xfs_ifork		*ifp;
+	struct xfs_mount		*mp;
+	xfs_filblks_t			nblks;
+	struct xfs_bmbt_irec		new;
+	/* REFERENCED */
+	uint				qfield;
+	xfs_filblks_t			temp;
+	xfs_filblks_t			temp2;
+	int				state = BMAP_COWFORK;
+	int				eof;
+	xfs_extnum_t			eidx;
+
+	mp = ip->i_mount;
+	XFS_STATS_INC(mp, xs_del_exlist);
+
+	ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof,
+			&eidx, &got, &new);
+
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp;
+	ASSERT((eidx >= 0) && (eidx < ifp->if_bytes /
+		(uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(del->br_blockcount > 0);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
+	qfield = 0;
+	error = 0;
+	/* Real extent: no indirect-block reservation to track. */
+	if (!delay) {
+		nblks = del->br_blockcount;
+		qfield = XFS_TRANS_DQ_BCOUNT;
+		/* Set up del_endblock for later. */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		da_old = da_new = 0;
+	} else {
+		da_old = startblockval(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+	}
+	qfield = qfield;
+	nblks = nblks;
+
+	/*
+	 * Set flag value to use in switch statement.
+	 * Left-contig is 2, right-contig is 1.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK);
+		--eidx;
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+		} else {
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = nullstartblock((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						nullstartblock((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						nullstartblock((int)temp2);
+				}
+			}
+		}
+		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+		xfs_iext_insert(ip, eidx + 1, 1, &new, state);
+		++eidx;
+		break;
+	}
+
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new)
+		xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
+
+	return error;
+}
+
 /*
  * Unmap (remove) blocks from a file.
  * If nexts is nonzero then the number of extents to remove is limited to
@@ -5021,17 +5347,16 @@
  * *done is set.
  */
 int						/* error */
-xfs_bunmapi(
+__xfs_bunmapi(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	struct xfs_inode	*ip,		/* incore inode */
 	xfs_fileoff_t		bno,		/* starting offset to unmap */
-	xfs_filblks_t		len,		/* length to unmap in file */
+	xfs_filblks_t		*rlen,		/* i/o: amount remaining */
 	int			flags,		/* misc flags */
 	xfs_extnum_t		nexts,		/* number of extents max */
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
-	struct xfs_defer_ops	*dfops,		/* i/o: list extents to free */
-	int			*done)		/* set if not done yet */
+	struct xfs_defer_ops	*dfops)		/* i/o: deferred updates */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
 	xfs_bmbt_irec_t		del;		/* extent being deleted */
@@ -5053,11 +5378,12 @@
 	int			wasdel;		/* was a delayed alloc extent */
 	int			whichfork;	/* data or attribute fork */
 	xfs_fsblock_t		sum;
+	xfs_filblks_t		len = *rlen;	/* length to unmap in file */
 
 	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
 
-	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-		XFS_ATTR_FORK : XFS_DATA_FORK;
+	whichfork = xfs_bmapi_whichfork(flags);
+	ASSERT(whichfork != XFS_COW_FORK);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if (unlikely(
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5079,7 +5405,7 @@
 		return error;
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	if (nextents == 0) {
-		*done = 1;
+		*rlen = 0;
 		return 0;
 	}
 	XFS_STATS_INC(mp, xs_blk_unmap);
@@ -5324,7 +5650,7 @@
 			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
 
 		error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
-				&tmp_logflags, whichfork);
+				&tmp_logflags, whichfork, flags);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5350,7 +5676,10 @@
 			extno++;
 		}
 	}
-	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+	if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0)
+		*rlen = 0;
+	else
+		*rlen = bno - start + 1;
 
 	/*
 	 * Convert to a btree if necessary.
@@ -5406,6 +5735,27 @@
 	return error;
 }
 
+/* Unmap a file range; *done is set once nothing remains to unmap. */
+int
+xfs_bunmapi(
+	xfs_trans_t		*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_extnum_t		nexts,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_defer_ops	*dfops,
+	int			*done)
+{
+	int			rc;
+
+	/* __xfs_bunmapi() rewrites len with the amount still mapped. */
+	rc = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock, dfops);
+	*done = !len;
+	return rc;
+}
+
 /*
  * Determine whether an extent shift can be accomplished by a merge with the
  * extent that precedes the target hole of the shift.
@@ -5985,3 +6335,146 @@
 	xfs_trans_cancel(tp);
 	return error;
 }
+
+/* Only real extents (not holes or delalloc) get a deferred bmap update. */
+static bool
+xfs_bmap_is_update_needed(
+	struct xfs_bmbt_irec	*bmap)
+{
+	return !(bmap->br_startblock == HOLESTARTBLOCK ||
+		 bmap->br_startblock == DELAYSTARTBLOCK);
+}
+
+/* Queue a deferred bmap intent of @type for @ip on @dfops. */
+static int
+__xfs_bmap_add(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	struct xfs_bmbt_irec		*bmap)
+{
+	struct xfs_bmap_intent		*intent;
+	xfs_fsblock_t			fsb = bmap->br_startblock;
+	int				error;
+
+	trace_xfs_bmap_defer(mp, XFS_FSB_TO_AGNO(mp, fsb), type,
+			XFS_FSB_TO_AGBNO(mp, fsb), ip->i_ino, whichfork,
+			bmap->br_startoff, bmap->br_blockcount,
+			bmap->br_state);
+
+	/* Build the work item describing this mapping change. */
+	intent = kmem_alloc(sizeof(*intent), KM_SLEEP | KM_NOFS);
+	INIT_LIST_HEAD(&intent->bi_list);
+	intent->bi_bmap = *bmap;
+	intent->bi_whichfork = whichfork;
+	intent->bi_owner = ip;
+	intent->bi_type = type;
+
+	/* The item is only queued if xfs_defer_join() succeeds. */
+	error = xfs_defer_join(dfops, ip);
+	if (!error) {
+		xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP,
+				&intent->bi_list);
+		return 0;
+	}
+
+	kmem_free(intent);
+	return error;
+}
+
+/* Queue a deferred update mapping extent @imap into @ip's data fork. */
+int
+xfs_bmap_map_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap)
+{
+	/* Hole and delalloc start blocks have nothing to map. */
+	if (!xfs_bmap_is_update_needed(imap))
+		return 0;
+	return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip,
+			XFS_DATA_FORK, imap);
+}
+
+/* Queue a deferred update unmapping extent @imap from @ip's data fork. */
+int
+xfs_bmap_unmap_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*imap)
+{
+	/* Hole and delalloc start blocks have nothing to unmap. */
+	if (!xfs_bmap_is_update_needed(imap))
+		return 0;
+	return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip,
+			XFS_DATA_FORK, imap);
+}
+
+/*
+ * Process one deferred bmap operation: map (XFS_BMAP_MAP) or unmap
+ * (XFS_BMAP_UNMAP) the described extent, always with XFS_BMAPI_REMAP set.
+ */
+int
+xfs_bmap_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dfops,
+	struct xfs_inode		*ip,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state)
+{
+	struct xfs_bmbt_irec		bmap;
+	int				nimaps = 1;
+	xfs_fsblock_t			firstfsb = NULLFSBLOCK;
+	int				flags = XFS_BMAPI_REMAP;
+	int				done;
+	int				error = 0;
+
+	bmap.br_startblock = startblock;
+	bmap.br_startoff = startoff;
+	bmap.br_blockcount = blockcount;
+	bmap.br_state = state;
+
+	trace_xfs_bmap_deferred(tp->t_mountp,
+			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+			ip->i_ino, whichfork, startoff, blockcount, state);
+
+	if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+		return -EFSCORRUPTED;
+	if (whichfork == XFS_ATTR_FORK)
+		flags |= XFS_BMAPI_ATTRFORK;
+
+	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE,
+			XFS_RANDOM_BMAP_FINISH_ONE))
+		return -EIO;
+
+	switch (type) {
+	case XFS_BMAP_MAP:
+		firstfsb = bmap.br_startblock;
+		error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
+					bmap.br_blockcount, flags, &firstfsb,
+					bmap.br_blockcount, &bmap, &nimaps,
+					dfops);
+		break;
+	case XFS_BMAP_UNMAP:
+		/* Unmap has no target block; firstfsb stays NULLFSBLOCK. */
+		error = xfs_bunmapi(tp, ip, bmap.br_startoff,
+				bmap.br_blockcount, flags, 1, &firstfsb,
+				dfops, &done);
+		ASSERT(done);
+		break;
+	default:
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+	}
+
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8395f6e..f97db71 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -97,6 +97,19 @@
  */
 #define XFS_BMAPI_ZERO		0x080
 
+/*
+ * Map the inode offset to the block given in ap->firstblock.  Primarily
+ * used for reflink.  The range must be in a hole, and this flag cannot be
+ * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
+ *
+ * For bunmapi, this flag unmaps the range without adjusting quota, reducing
+ * refcount, or freeing the blocks.
+ */
+#define XFS_BMAPI_REMAP		0x100
+
+/* Map something in the CoW fork. */
+#define XFS_BMAPI_COWFORK	0x200
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -105,12 +118,24 @@
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
-	{ XFS_BMAPI_ZERO,	"ZERO" }
+	{ XFS_BMAPI_ZERO,	"ZERO" }, \
+	{ XFS_BMAPI_REMAP,	"REMAP" }, \
+	{ XFS_BMAPI_COWFORK,	"COWFORK" }
 
 
 static inline int xfs_bmapi_aflag(int w)
 {
-	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK :
+	       (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
+}
+
+static inline int xfs_bmapi_whichfork(int bmapi_flags)
+{
+	if (bmapi_flags & XFS_BMAPI_COWFORK)
+		return XFS_COW_FORK;
+	else if (bmapi_flags & XFS_BMAPI_ATTRFORK)
+		return XFS_ATTR_FORK;
+	return XFS_DATA_FORK;
 }
 
 /*
@@ -131,13 +156,15 @@
 #define BMAP_LEFT_VALID		(1 << 6)
 #define BMAP_RIGHT_VALID	(1 << 7)
 #define BMAP_ATTRFORK		(1 << 8)
+#define BMAP_COWFORK		(1 << 9)
 
 #define XFS_BMAP_EXT_FLAGS \
 	{ BMAP_LEFT_CONTIG,	"LC" }, \
 	{ BMAP_RIGHT_CONTIG,	"RC" }, \
 	{ BMAP_LEFT_FILLING,	"LF" }, \
 	{ BMAP_RIGHT_FILLING,	"RF" }, \
-	{ BMAP_ATTRFORK,	"ATTR" }
+	{ BMAP_ATTRFORK,	"ATTR" }, \
+	{ BMAP_COWFORK,		"COW" }
 
 
 /*
@@ -186,10 +213,15 @@
 		xfs_fsblock_t *firstblock, xfs_extlen_t total,
 		struct xfs_bmbt_irec *mval, int *nmap,
 		struct xfs_defer_ops *dfops);
+int	__xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+		struct xfs_defer_ops *dfops);
 int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
 		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
 		struct xfs_defer_ops *dfops, int *done);
+int	xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del);
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
@@ -203,8 +235,31 @@
 	xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
 		int fork, int *eofp, xfs_extnum_t *lastxp,
 		struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
-int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
-		xfs_filblks_t len, struct xfs_bmbt_irec *got,
-		struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
+int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
+		xfs_fileoff_t aoff, xfs_filblks_t len,
+		struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
+		xfs_extnum_t *lastx, int eof);
+
+enum xfs_bmap_intent_type {
+	XFS_BMAP_MAP = 1,
+	XFS_BMAP_UNMAP,
+};
+
+struct xfs_bmap_intent {
+	struct list_head			bi_list;
+	enum xfs_bmap_intent_type		bi_type;
+	struct xfs_inode			*bi_owner;
+	int					bi_whichfork;
+	struct xfs_bmbt_irec			bi_bmap;
+};
+
+int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
+		struct xfs_inode *ip, enum xfs_bmap_intent_type type,
+		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+		xfs_filblks_t blockcount, xfs_exntst_t state);
+int	xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+int	xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cd85274..8007d2b 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -453,6 +453,7 @@
 
 	if (args.fsbno == NULLFSBLOCK) {
 		args.fsbno = be64_to_cpu(start->l);
+try_another_ag:
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		/*
 		 * Make sure there is sufficient room left in the AG to
@@ -482,6 +483,22 @@
 	if (error)
 		goto error0;
 
+	/*
+	 * During a CoW operation, the allocation and bmbt updates occur in
+	 * different transactions.  The mapping code tries to put new bmbt
+	 * blocks near extents being mapped, but the only way to guarantee this
+	 * is if the alloc and the mapping happen in a single transaction that
+	 * has a block reservation.  That isn't the case here, so if we run out
+	 * of space we'll try again with another AG.
+	 */
+	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+	    args.fsbno == NULLFSBLOCK &&
+	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+		cur->bc_private.b.dfops->dop_low = true;
+		args.fsbno = cur->bc_private.b.firstblock;
+		goto try_another_ag;
+	}
+
 	if (args.fsbno == NULLFSBLOCK && args.minleft) {
 		/*
 		 * Could not find an AG with enough free space to satisfy
@@ -777,6 +794,7 @@
 {
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	struct xfs_btree_cur	*cur;
+	ASSERT(whichfork != XFS_COW_FORK);
 
 	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index aa1752f..5c8e6f2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -45,9 +45,10 @@
  */
 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
 	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
-	  XFS_FIBT_MAGIC },
+	  XFS_FIBT_MAGIC, 0 },
 	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
-	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
+	  XFS_REFC_CRC_MAGIC }
 };
 #define xfs_btree_magic(cur) \
 	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -1216,6 +1217,9 @@
 	case XFS_BTNUM_RMAP:
 		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
 		break;
+	case XFS_BTNUM_REFC:
+		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
+		break;
 	default:
 		ASSERT(0);
 	}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3f8556a..c2b01d1 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -49,6 +49,7 @@
 	struct xfs_inobt_key		inobt;
 	struct xfs_rmap_key		rmap;
 	struct xfs_rmap_key		__rmap_bigkey[2];
+	struct xfs_refcount_key		refc;
 };
 
 union xfs_btree_rec {
@@ -57,6 +58,7 @@
 	struct xfs_alloc_rec		alloc;
 	struct xfs_inobt_rec		inobt;
 	struct xfs_rmap_rec		rmap;
+	struct xfs_refcount_rec		refc;
 };
 
 /*
@@ -72,6 +74,7 @@
 #define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
 #define	XFS_BTNUM_FINO	((xfs_btnum_t)XFS_BTNUM_FINOi)
 #define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi)
+#define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 
 /*
  * For logging record fields.
@@ -105,6 +108,7 @@
 	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
 	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
 	case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
+	case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
 	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
 	}       \
 } while (0)
@@ -127,6 +131,8 @@
 		__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
 	case XFS_BTNUM_RMAP:	\
 		__XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
+	case XFS_BTNUM_REFC:	\
+		__XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
 	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
 	}       \
 } while (0)
@@ -217,6 +223,15 @@
 	struct xfs_bmbt_irec		b;
 	struct xfs_inobt_rec_incore	i;
 	struct xfs_rmap_irec		r;
+	struct xfs_refcount_irec	rc;
+};
+
+/* Per-AG btree private information. */
+union xfs_btree_cur_private {
+	struct {
+		unsigned long	nr_ops;		/* # record updates */
+		int		shape_changes;	/* # of extent splits */
+	} refc;
 };
 
 /*
@@ -243,6 +258,7 @@
 			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
 			struct xfs_defer_ops *dfops;	/* deferred updates */
 			xfs_agnumber_t	agno;	/* ag number */
+			union xfs_btree_cur_private	priv;
 		} a;
 		struct {			/* needed for BMAP */
 			struct xfs_inode *ip;	/* pointer to our inode */
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index e96533d..f6e93ef 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -51,6 +51,8 @@
  * find all the space it needs.
  */
 enum xfs_defer_ops_type {
+	XFS_DEFER_OPS_TYPE_BMAP,
+	XFS_DEFER_OPS_TYPE_REFCOUNT,
 	XFS_DEFER_OPS_TYPE_RMAP,
 	XFS_DEFER_OPS_TYPE_FREE,
 	XFS_DEFER_OPS_TYPE_MAX,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 270fb5c..f6547fc 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -456,9 +456,11 @@
 
 #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)		/* free inode btree */
 #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
+#define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
-		 XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+		 XFS_SB_FEAT_RO_COMPAT_REFLINK)
 #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
 static inline bool
 xfs_sb_has_ro_compat_feature(
@@ -546,6 +548,12 @@
 		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
 }
 
+static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
+}
+
 /*
  * end of superblock version macros
  */
@@ -641,14 +649,17 @@
 	uuid_t		agf_uuid;	/* uuid of filesystem */
 
 	__be32		agf_rmap_blocks;	/* rmapbt blocks used */
-	__be32		agf_padding;		/* padding */
+	__be32		agf_refcount_blocks;	/* refcountbt blocks used */
+
+	__be32		agf_refcount_root;	/* refcount tree root block */
+	__be32		agf_refcount_level;	/* refcount btree levels */
 
 	/*
 	 * reserve some contiguous space for future logged fields before we add
 	 * the unlogged fields. This makes the range logging via flags and
 	 * structure offsets much simpler.
 	 */
-	__be64		agf_spare64[15];
+	__be64		agf_spare64[14];
 
 	/* unlogged fields, written during buffer writeback. */
 	__be64		agf_lsn;	/* last write sequence */
@@ -674,8 +685,11 @@
 #define	XFS_AGF_BTREEBLKS	0x00000800
 #define	XFS_AGF_UUID		0x00001000
 #define	XFS_AGF_RMAP_BLOCKS	0x00002000
-#define	XFS_AGF_SPARE64		0x00004000
-#define	XFS_AGF_NUM_BITS	15
+#define	XFS_AGF_REFCOUNT_BLOCKS	0x00004000
+#define	XFS_AGF_REFCOUNT_ROOT	0x00008000
+#define	XFS_AGF_REFCOUNT_LEVEL	0x00010000
+#define	XFS_AGF_SPARE64		0x00020000
+#define	XFS_AGF_NUM_BITS	18
 #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
 
 #define XFS_AGF_FLAGS \
@@ -693,6 +707,9 @@
 	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
 	{ XFS_AGF_UUID,		"UUID" }, \
 	{ XFS_AGF_RMAP_BLOCKS,	"RMAP_BLOCKS" }, \
+	{ XFS_AGF_REFCOUNT_BLOCKS,	"REFCOUNT_BLOCKS" }, \
+	{ XFS_AGF_REFCOUNT_ROOT,	"REFCOUNT_ROOT" }, \
+	{ XFS_AGF_REFCOUNT_LEVEL,	"REFCOUNT_LEVEL" }, \
 	{ XFS_AGF_SPARE64,	"SPARE64" }
 
 /* disk block (xfs_daddr_t) in the AG */
@@ -885,7 +902,8 @@
 	__be64		di_changecount;	/* number of attribute changes */
 	__be64		di_lsn;		/* flush sequence */
 	__be64		di_flags2;	/* more random flags */
-	__u8		di_pad2[16];	/* more padding for future expansion */
+	__be32		di_cowextsize;	/* basic cow extent size for file */
+	__u8		di_pad2[12];	/* more padding for future expansion */
 
 	/* fields only written to during inode creation */
 	xfs_timestamp_t	di_crtime;	/* time created */
@@ -1041,9 +1059,14 @@
  * 16 bits of the XFS_XFLAG_s range.
  */
 #define XFS_DIFLAG2_DAX_BIT	0	/* use DAX for this inode */
+#define XFS_DIFLAG2_REFLINK_BIT	1	/* file's blocks may be shared */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
 #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
 
-#define XFS_DIFLAG2_ANY		(XFS_DIFLAG2_DAX)
+#define XFS_DIFLAG2_ANY \
+	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
 
 /*
  * Inode number format:
@@ -1353,7 +1376,9 @@
 #define XFS_RMAP_OWN_AG		(-5ULL)	/* AG freespace btree blocks */
 #define XFS_RMAP_OWN_INOBT	(-6ULL)	/* Inode btree blocks */
 #define XFS_RMAP_OWN_INODES	(-7ULL)	/* Inode chunk */
-#define XFS_RMAP_OWN_MIN	(-8ULL) /* guard */
+#define XFS_RMAP_OWN_REFC	(-8ULL) /* refcount tree */
+#define XFS_RMAP_OWN_COW	(-9ULL) /* cow allocations */
+#define XFS_RMAP_OWN_MIN	(-10ULL) /* guard */
 
 #define XFS_RMAP_NON_INODE_OWNER(owner)	(!!((owner) & (1ULL << 63)))
 
@@ -1434,6 +1459,62 @@
 	 XFS_IBT_BLOCK(mp) + 1)
 
 /*
+ * Reference Count Btree format definitions
+ *
+ */
+#define	XFS_REFC_CRC_MAGIC	0x52334643	/* 'R3FC' */
+
+unsigned int xfs_refc_block(struct xfs_mount *mp);
+
+/*
+ * Data record/key structure
+ *
+ * Each record associates a range of physical blocks (starting at
+ * rc_startblock and ending rc_blockcount blocks later) with a reference
+ * count (rc_refcount).  Extents that are being used to stage a copy on
+ * write (CoW) operation are recorded in the refcount btree with a
+ * refcount of 1.  All other records must have a refcount > 1 and must
+ * track an extent mapped only by file data forks.
+ *
+ * Extents with a single owner (attributes, metadata, non-shared file
+ * data) are not tracked here.  Free space is also not tracked here.
+ * This is consistent with pre-reflink XFS.
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock.  This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COW_START		((xfs_agblock_t)(1U << 31))
+#define REFCNTBT_COWFLAG_BITLEN		1
+#define REFCNTBT_AGBLOCK_BITLEN		31
+
+struct xfs_refcount_rec {
+	__be32		rc_startblock;	/* starting block number */
+	__be32		rc_blockcount;	/* count of blocks */
+	__be32		rc_refcount;	/* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+	__be32		rc_startblock;	/* starting block number */
+};
+
+struct xfs_refcount_irec {
+	xfs_agblock_t	rc_startblock;	/* starting block number */
+	xfs_extlen_t	rc_blockcount;	/* count of blocks */
+	xfs_nlink_t	rc_refcount;	/* number of inodes linked here */
+};
+
+#define MAXREFCOUNT	((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN	((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
+/*
  * BMAP Btree format definitions
  *
  * This includes both the root block definition that sits inside an inode fork
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 7945505..b72dc82 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -81,14 +81,16 @@
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
 #define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
+#define BMV_IF_COWFORK		0x20	/* return CoW fork rather than data */
 #define BMV_IF_VALID	\
 	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
-	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
 
 /*	bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
 #define BMV_OF_DELALLOC		0x2	/* segment = delayed allocation */
 #define BMV_OF_LAST		0x4	/* segment is the last in the file */
+#define BMV_OF_SHARED		0x8	/* segment shared with another file */
 
 /*
  * Structure for XFS_IOC_FSSETDM.
@@ -206,7 +208,8 @@
 #define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
 #define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks	*/
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* Reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK	0x100000 /* files can share blocks */
 
 /*
  * Minimum and maximum sizes need for growth checks.
@@ -275,7 +278,8 @@
 #define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
 	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
 	__u16		bs_projid_hi;	/* higher part of project id	*/
-	unsigned char	bs_pad[10];	/* pad space, unused		*/
+	unsigned char	bs_pad[6];	/* pad space, unused		*/
+	__u32		bs_cowextsize;	/* cow extent size		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 4b9769e..8de9a3a 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -256,6 +256,7 @@
 		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
 		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
 		to->di_flags2 = be64_to_cpu(from->di_flags2);
+		to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
 	}
 }
 
@@ -305,7 +306,7 @@
 		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 		to->di_flags2 = cpu_to_be64(from->di_flags2);
-
+		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 		to->di_ino = cpu_to_be64(ip->i_ino);
 		to->di_lsn = cpu_to_be64(lsn);
 		memset(to->di_pad2, 0, sizeof(to->di_pad2));
@@ -357,6 +358,7 @@
 		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 		to->di_flags2 = cpu_to_be64(from->di_flags2);
+		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 		to->di_ino = cpu_to_be64(from->di_ino);
 		to->di_lsn = cpu_to_be64(from->di_lsn);
 		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
@@ -373,6 +375,9 @@
 	struct xfs_inode	*ip,
 	struct xfs_dinode	*dip)
 {
+	uint16_t		flags;
+	uint64_t		flags2;
+
 	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 		return false;
 
@@ -389,6 +394,23 @@
 		return false;
 	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
 		return false;
+
+	flags = be16_to_cpu(dip->di_flags);
+	flags2 = be64_to_cpu(dip->di_flags2);
+
+	/* don't allow reflink/cowextsize if we don't have reflink */
+	if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
+            !xfs_sb_version_hasreflink(&mp->m_sb))
+		return false;
+
+	/* don't let reflink and realtime mix */
+	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+		return false;
+
+	/* don't let reflink and dax mix */
+	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX))
+		return false;
+
 	return true;
 }
 
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 7c4dd32..62d9d46 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -47,6 +47,7 @@
 	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
 
 	__uint64_t	di_flags2;	/* more random flags */
+	__uint32_t	di_cowextsize;	/* basic cow extent size for file */
 
 	xfs_ictimestamp_t di_crtime;	/* time created */
 };
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bbcc8c7..5dd56d3 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -121,6 +121,26 @@
 		return -EFSCORRUPTED;
 	}
 
+	if (unlikely(xfs_is_reflink_inode(ip) &&
+	    (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %llu, wrong file type for reflink.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
+	if (unlikely(xfs_is_reflink_inode(ip) &&
+	    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %llu, has reflink+realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
 	switch (VFS_I(ip)->i_mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
@@ -186,9 +206,14 @@
 		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 		return -EFSCORRUPTED;
 	}
-	if (error) {
+	if (error)
 		return error;
+
+	if (xfs_is_reflink_inode(ip)) {
+		ASSERT(ip->i_cowfp == NULL);
+		xfs_ifork_init_cow(ip);
 	}
+
 	if (!XFS_DFORK_Q(dip))
 		return 0;
 
@@ -208,7 +233,8 @@
 			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 					     XFS_ERRLEVEL_LOW,
 					     ip->i_mount, dip);
-			return -EFSCORRUPTED;
+			error = -EFSCORRUPTED;
+			break;
 		}
 
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -226,6 +252,9 @@
 	if (error) {
 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 		ip->i_afp = NULL;
+		if (ip->i_cowfp)
+			kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+		ip->i_cowfp = NULL;
 		xfs_idestroy_fork(ip, XFS_DATA_FORK);
 	}
 	return error;
@@ -740,6 +769,9 @@
 	if (whichfork == XFS_ATTR_FORK) {
 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 		ip->i_afp = NULL;
+	} else if (whichfork == XFS_COW_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+		ip->i_cowfp = NULL;
 	}
 }
 
@@ -927,6 +959,19 @@
 	}
 }
 
+/* Map bmap state flags onto the inode fork they select. */
+struct xfs_ifork *
+xfs_iext_state_to_fork(
+	struct xfs_inode	*ip,
+	int			state)
+{
+	if (state & BMAP_COWFORK)
+		return ip->i_cowfp;
+	if (state & BMAP_ATTRFORK)
+		return ip->i_afp;
+	return &ip->i_df;
+}
+
 /*
  * Insert new item(s) into the extent records for incore inode
  * fork 'ifp'.  'count' new items are inserted at index 'idx'.
@@ -939,7 +984,7 @@
 	xfs_bmbt_irec_t	*new,		/* items to insert */
 	int		state)		/* type of extent conversion */
 {
-	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);
 	xfs_extnum_t	i;		/* extent record index */
 
 	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
@@ -1189,7 +1234,7 @@
 	int		ext_diff,	/* number of extents to remove */
 	int		state)		/* type of extent conversion */
 {
-	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);
 	xfs_extnum_t	nextents;	/* number of extents in file */
 	int		new_size;	/* size of extents after removal */
 
@@ -1934,3 +1979,20 @@
 		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
 	}
 }
+
+/*
+ * Set up an empty extents-format CoW fork unless the inode already has one.
+ */
+void
+xfs_ifork_init_cow(
+	struct xfs_inode	*ip)
+{
+	if (ip->i_cowfp)
+		return;
+
+	ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
+				       KM_SLEEP | KM_NOFS);
+	ip->i_cowfp->if_flags = XFS_IFEXTENTS;
+	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
+	ip->i_cnextents = 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index f95e072..c9476f5 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -92,7 +92,9 @@
 #define XFS_IFORK_PTR(ip,w)		\
 	((w) == XFS_DATA_FORK ? \
 		&(ip)->i_df : \
-		(ip)->i_afp)
+		((w) == XFS_ATTR_FORK ? \
+			(ip)->i_afp : \
+			(ip)->i_cowfp))
 #define XFS_IFORK_DSIZE(ip) \
 	(XFS_IFORK_Q(ip) ? \
 		XFS_IFORK_BOFF(ip) : \
@@ -105,26 +107,38 @@
 #define XFS_IFORK_SIZE(ip,w) \
 	((w) == XFS_DATA_FORK ? \
 		XFS_IFORK_DSIZE(ip) : \
-		XFS_IFORK_ASIZE(ip))
+		((w) == XFS_ATTR_FORK ? \
+			XFS_IFORK_ASIZE(ip) : \
+			0))
 #define XFS_IFORK_FORMAT(ip,w) \
 	((w) == XFS_DATA_FORK ? \
 		(ip)->i_d.di_format : \
-		(ip)->i_d.di_aformat)
+		((w) == XFS_ATTR_FORK ? \
+			(ip)->i_d.di_aformat : \
+			(ip)->i_cformat))
 #define XFS_IFORK_FMT_SET(ip,w,n) \
 	((w) == XFS_DATA_FORK ? \
 		((ip)->i_d.di_format = (n)) : \
-		((ip)->i_d.di_aformat = (n)))
+		((w) == XFS_ATTR_FORK ? \
+			((ip)->i_d.di_aformat = (n)) : \
+			((ip)->i_cformat = (n))))
 #define XFS_IFORK_NEXTENTS(ip,w) \
 	((w) == XFS_DATA_FORK ? \
 		(ip)->i_d.di_nextents : \
-		(ip)->i_d.di_anextents)
+		((w) == XFS_ATTR_FORK ? \
+			(ip)->i_d.di_anextents : \
+			(ip)->i_cnextents))
 #define XFS_IFORK_NEXT_SET(ip,w,n) \
 	((w) == XFS_DATA_FORK ? \
 		((ip)->i_d.di_nextents = (n)) : \
-		((ip)->i_d.di_anextents = (n)))
+		((w) == XFS_ATTR_FORK ? \
+			((ip)->i_d.di_anextents = (n)) : \
+			((ip)->i_cnextents = (n))))
 #define XFS_IFORK_MAXEXT(ip, w) \
 	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
 
+struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
+
 int		xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
 void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
 				struct xfs_inode_log_item *, int);
@@ -169,4 +183,6 @@
 
 extern struct kmem_zone	*xfs_ifork_zone;
 
+extern void xfs_ifork_init_cow(struct xfs_inode *ip);
+
 #endif	/* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index fc5eef8..083cdd6 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -112,7 +112,11 @@
 #define XLOG_REG_TYPE_ICREATE		20
 #define XLOG_REG_TYPE_RUI_FORMAT	21
 #define XLOG_REG_TYPE_RUD_FORMAT	22
-#define XLOG_REG_TYPE_MAX		22
+#define XLOG_REG_TYPE_CUI_FORMAT	23
+#define XLOG_REG_TYPE_CUD_FORMAT	24
+#define XLOG_REG_TYPE_BUI_FORMAT	25
+#define XLOG_REG_TYPE_BUD_FORMAT	26
+#define XLOG_REG_TYPE_MAX		26
 
 /*
  * Flags to log operation header
@@ -231,6 +235,10 @@
 #define	XFS_LI_ICREATE		0x123f
 #define	XFS_LI_RUI		0x1240	/* rmap update intent */
 #define	XFS_LI_RUD		0x1241
+#define	XFS_LI_CUI		0x1242	/* refcount update intent */
+#define	XFS_LI_CUD		0x1243
+#define	XFS_LI_BUI		0x1244	/* bmbt update intent */
+#define	XFS_LI_BUD		0x1245
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -242,7 +250,11 @@
 	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }, \
 	{ XFS_LI_ICREATE,	"XFS_LI_ICREATE" }, \
 	{ XFS_LI_RUI,		"XFS_LI_RUI" }, \
-	{ XFS_LI_RUD,		"XFS_LI_RUD" }
+	{ XFS_LI_RUD,		"XFS_LI_RUD" }, \
+	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \
+	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
+	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
+	{ XFS_LI_BUD,		"XFS_LI_BUD" }
 
 /*
  * Inode Log Item Format definitions.
@@ -411,7 +423,8 @@
 	__uint64_t	di_changecount;	/* number of attribute changes */
 	xfs_lsn_t	di_lsn;		/* flush sequence */
 	__uint64_t	di_flags2;	/* more random flags */
-	__uint8_t	di_pad2[16];	/* more padding for future expansion */
+	__uint32_t	di_cowextsize;	/* basic cow extent size for file */
+	__uint8_t	di_pad2[12];	/* more padding for future expansion */
 
 	/* fields only written to during inode creation */
 	xfs_ictimestamp_t di_crtime;	/* time created */
@@ -622,8 +635,11 @@
 
 /* rmap me_flags: upper bits are flags, lower byte is type code */
 #define XFS_RMAP_EXTENT_MAP		1
+#define XFS_RMAP_EXTENT_MAP_SHARED	2
 #define XFS_RMAP_EXTENT_UNMAP		3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED	4
 #define XFS_RMAP_EXTENT_CONVERT		5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED	6
 #define XFS_RMAP_EXTENT_ALLOC		7
 #define XFS_RMAP_EXTENT_FREE		8
 #define XFS_RMAP_EXTENT_TYPE_MASK	0xFF
@@ -671,6 +687,102 @@
 };
 
 /*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+	__uint64_t		pe_startblock;
+	__uint32_t		pe_len;
+	__uint32_t		pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK	0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS	(XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log.  The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+	__uint16_t		cui_type;	/* cui log item type */
+	__uint16_t		cui_size;	/* size of this item */
+	__uint32_t		cui_nextents;	/* # of cui_extents entries */
+	__uint64_t		cui_id;		/* cui identifier */
+	struct xfs_phys_extent	cui_extents[];	/* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_cui_log_format) +
+			nr * sizeof(struct xfs_phys_extent);
+}
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log.  It is fixed size: there is no extent array, only the id of
+ * the corresponding cui (cud_cui_id).
+ */
+struct xfs_cud_log_format {
+	__uint16_t		cud_type;	/* cud log item type */
+	__uint16_t		cud_size;	/* size of this item */
+	__uint32_t		__pad;
+	__uint64_t		cud_cui_id;	/* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK	0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK	(1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN	(1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS		(XFS_BMAP_EXTENT_TYPE_MASK | \
+					 XFS_BMAP_EXTENT_ATTR_FORK | \
+					 XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out a bui log item in the
+ * log.  The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
+ */
+struct xfs_bui_log_format {
+	__uint16_t		bui_type;	/* bui log item type */
+	__uint16_t		bui_size;	/* size of this item */
+	__uint32_t		bui_nextents;	/* # of bui_extents entries */
+	__uint64_t		bui_id;		/* bui identifier */
+	struct xfs_map_extent	bui_extents[];	/* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_bui_log_format) +
+			nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out a bud log item in the
+ * log.  It is fixed size: there is no extent array, only the id of
+ * the corresponding bui (bud_bui_id).
+ */
+struct xfs_bud_log_format {
+	__uint16_t		bud_type;	/* bud log item type */
+	__uint16_t		bud_size;	/* size of this item */
+	__uint32_t		__pad;
+	__uint64_t		bud_bui_id;	/* id of corresponding bui */
+};
+
+/*
  * Dquot Log format definitions.
  *
  * The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 0000000..b177ef3
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+	XFS_REFCOUNT_ADJUST_INCREASE	= 1,
+	XFS_REFCOUNT_ADJUST_DECREASE	= -1,
+	XFS_REFCOUNT_ADJUST_COW_ALLOC	= 0,
+	XFS_REFCOUNT_ADJUST_COW_FREE	= -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen,
+		struct xfs_defer_ops *dfops);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen,
+		struct xfs_defer_ops *dfops);
+
+/*
+ * Look up the first record less than or equal to bno in the btree
+ * given by cur.  stat is passed through to xfs_btree_lookup().
+ */
+int
+xfs_refcount_lookup_le(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	int			*stat)
+{
+	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+			XFS_LOOKUP_LE);
+	cur->bc_rec.rc.rc_startblock = bno;
+	cur->bc_rec.rc.rc_blockcount = 0;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to bno in the btree
+ * given by cur.  stat is passed through to xfs_btree_lookup().
+ */
+int
+xfs_refcount_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	int			*stat)
+{
+	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+			XFS_LOOKUP_GE);
+	cur->bc_rec.rc.rc_startblock = bno;
+	cur->bc_rec.rc.rc_blockcount = 0;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/* Decode an on-disk (big-endian) refcount record into in-core form. */
+static inline void
+xfs_refcount_btrec_to_irec(
+	union xfs_btree_rec		*rec,
+	struct xfs_refcount_irec	*irec)
+{
+	irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+	irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
+	irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
+}
+
+/*
+ * Decode the cursor's current record into irec (only when *stat == 1).
+ */
+int
+xfs_refcount_get_rec(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec,
+	int				*stat)
+{
+	union xfs_btree_rec		*disk_rec;
+	int				error;
+
+	error = xfs_btree_get_rec(cur, &disk_rec, stat);
+	if (error || *stat != 1)
+		return error;
+
+	xfs_refcount_btrec_to_irec(disk_rec, irec);
+	trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec);
+	return 0;
+}
+
+/*
+ * Overwrite the record at the cursor with the contents of irec, converted
+ * to on-disk (big-endian) form.
+ * Returns 0 on success or the error reported by xfs_btree_update().
+ */
+STATIC int
+xfs_refcount_update(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec)
+{
+	union xfs_btree_rec	rec;
+	int			error;
+
+	trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
+	rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+	rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+	rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+	error = xfs_btree_update(cur, &rec);
+	if (error)
+		trace_xfs_refcount_update_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Insert irec at the cursor; returns 0, a btree error or -EFSCORRUPTED.
+ */
+STATIC int
+xfs_refcount_insert(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec,
+	int				*i)
+{
+	int				error;
+
+	trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+	cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
+	cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
+	cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+	error = xfs_btree_insert(cur, i);
+	if (error)	/* report the real failure, not a bogus corruption */
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+out_error:
+	if (error)
+		trace_xfs_refcount_insert_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Remove the record referred to by cur, then set the pointer to the spot
+ * where the record could be re-inserted, in case we want to increment or
+ * decrement the cursor.
+ * Returns 0, the error from the btree code, or -EFSCORRUPTED.
+ */
+STATIC int
+xfs_refcount_delete(
+	struct xfs_btree_cur	*cur,
+	int			*i)
+{
+	struct xfs_refcount_irec	irec;
+	int			found_rec;
+	int			error;
+
+	error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+	error = xfs_btree_delete(cur, i);
+	if (error)	/* report the real failure, not a bogus corruption */
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+	error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+out_error:
+	if (error)
+		trace_xfs_refcount_delete_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * >1 reference counts for extents of physical blocks.  In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+-----+ +--+--------+---------
+ *  2  |   | 3 |  2  | |17|   55   |   10
+ * ----+   +---+-----+ +--+--------+---------
+ * X axis is physical blocks number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted.  For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+-----+ +--+--------+----+----
+ *  2  |   | 3 |  2  | |17|   55   | 10 | 10
+ * ----+   +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once.  Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ *      <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ *  2  |"1"| 3 |  2  |1|17|   55   | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ *      ^
+ *
+ * For each extent that falls within the interval range, figure out
+ * which extent is to the left or the right of that extent.  Now we
+ * have a left, current, and right extent.  If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so.  If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those.  In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ *      <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ *    2    | 3 |  2  |1|17|   55   | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *          ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2.  If we were
+ * incrementing, the end result looks like this:
+ *
+ *      <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ *    2    | 4 |  3  |2|18|   56   | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks as such:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+       +--+--------+----+----
+ *  2  |   | 2 |       |16|   54   |  9 | 10
+ * ----+   +---+       +--+--------+----+----
+ *      DDDD    111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
+
+/* First block past the end of this extent (startblock + blockcount). */
+static inline xfs_agblock_t
+xfs_refc_next(
+	struct xfs_refcount_irec	*rc)
+{
+	return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split the extent straddling agbno in two; *shape_changed reports a split.
+ */
+STATIC int
+xfs_refcount_split_extent(
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			agbno,
+	bool				*shape_changed)
+{
+	struct xfs_refcount_irec	rcext, tmp;
+	int				found_rec;
+	int				error;
+
+	*shape_changed = false;
+	error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+		return 0;
+
+	*shape_changed = true;
+	trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+			&rcext, agbno);
+
+	/* Establish the right extent. */
+	tmp = rcext;
+	tmp.rc_startblock = agbno;
+	tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+	error = xfs_refcount_update(cur, &tmp);
+	if (error)
+		goto out_error;
+
+	/* Insert the left extent. */
+	tmp = rcext;
+	tmp.rc_blockcount = agbno - rcext.rc_startblock;
+	error = xfs_refcount_insert(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	return error;
+
+out_error:
+	trace_xfs_refcount_split_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Collapse the left, center, and right extents into the left record.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*center,
+	struct xfs_refcount_irec	*right,
+	unsigned long long		extlen,
+	xfs_agblock_t			*agbno,
+	xfs_extlen_t			*aglen)
+{
+	int				stat;
+	int				error;
+
+	trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+			cur->bc_private.a.agno, left, center, right);
+
+	/*
+	 * Neither the center nor the right extent may stay in the btree.
+	 * Look up the first record at or past the center's start block and
+	 * delete it.  A synthesized center has no record of its own, so
+	 * that deletion already took out the right extent.  A center with
+	 * refcount > 1 is taken to be a real record and is what was just
+	 * deleted, so a second deletion removes the right extent as well.
+	 */
+	error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
+			&stat);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+
+	error = xfs_refcount_delete(cur, &stat);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+
+	if (center->rc_refcount >= 2) {
+		error = xfs_refcount_delete(cur, &stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1,
+				out_error);
+	}
+
+	/* Stretch the left record over the whole merged range. */
+	error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+			&stat);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+
+	left->rc_blockcount = extlen;
+	error = xfs_refcount_update(cur, left);
+	if (error)
+		goto out_error;
+
+	*aglen = 0;	/* the entire adjustment range was merged away */
+	return error;
+
+out_error:
+	trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Fold the extent at the start of the range (cleft) into its left neighbour.
+ */
+STATIC int
+xfs_refcount_merge_left_extent(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*cleft,
+	xfs_agblock_t			*agbno,
+	xfs_extlen_t			*aglen)
+{
+	int				stat;
+	int				error;
+
+	trace_xfs_refcount_merge_left_extent(cur->bc_mp,
+			cur->bc_private.a.agno, left, cleft);
+
+	/* A cleft with refcount > 1 was not synthesized; delete it. */
+	if (cleft->rc_refcount >= 2) {
+		error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
+				&stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1,
+				out_error);
+
+		error = xfs_refcount_delete(cur, &stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1,
+				out_error);
+	}
+
+	/* Lengthen the left record by the size of cleft. */
+	error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+			&stat);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+
+	left->rc_blockcount += cleft->rc_blockcount;
+	error = xfs_refcount_update(cur, left);
+	if (error)
+		goto out_error;
+
+	*agbno += cleft->rc_blockcount;	/* range now starts after cleft */
+	*aglen -= cleft->rc_blockcount;
+	return error;
+
+out_error:
+	trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Fold the extent at the end of the range (cright) into its right neighbour.
+ */
+STATIC int
+xfs_refcount_merge_right_extent(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*right,
+	struct xfs_refcount_irec	*cright,
+	xfs_agblock_t			*agbno,
+	xfs_extlen_t			*aglen)
+{
+	int				stat;
+	int				error;
+
+	trace_xfs_refcount_merge_right_extent(cur->bc_mp,
+			cur->bc_private.a.agno, cright, right);
+
+	/*
+	 * A cright with refcount > 1 was not synthesized, so the record
+	 * ending at agbno+aglen has to be deleted from the btree.
+	 */
+	if (cright->rc_refcount >= 2) {
+		error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
+			&stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1,
+				out_error);
+
+		error = xfs_refcount_delete(cur, &stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1,
+				out_error);
+	}
+
+	/* Pull the start of the right record back over cright. */
+	error = xfs_refcount_lookup_le(cur, right->rc_startblock,
+			&stat);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+
+	right->rc_startblock -= cright->rc_blockcount;
+	right->rc_blockcount += cright->rc_blockcount;
+	error = xfs_refcount_update(cur, right);
+	if (error)
+		goto out_error;
+
+	*aglen -= cright->rc_blockcount;	/* range now ends before cright */
+	return error;
+
+out_error:
+	trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+#define XFS_FIND_RCEXT_SHARED	1	/* neighbour must have refcount >= 2 */
+#define XFS_FIND_RCEXT_COW	2	/* neighbour must have refcount <= 1 */
+/*
+ * Find the record ending exactly at agbno (left) and the extent starting at
+ * agbno (cleft).  This function assumes that we've already split any extent
+ * crossing agbno.  Both outputs keep rc_startblock == NULLAGBLOCK unless a
+ * record ends at agbno and passes the XFS_FIND_RCEXT_* refcount filter in
+ * flags; in that case cleft is read from the btree or synthesized.
+ */
+STATIC int
+xfs_refcount_find_left_extents(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*cleft,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			aglen,
+	int				flags)
+{
+	struct xfs_refcount_irec	tmp;
+	int				error;
+	int				found_rec;
+
+	left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
+	error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	if (xfs_refc_next(&tmp) != agbno)	/* does not end at agbno */
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+		return 0;
+	/* We have a left extent; retrieve (or invent) the next right one */
+	*left = tmp;
+
+	error = xfs_btree_increment(cur, 0, &found_rec);
+	if (error)
+		goto out_error;
+	if (found_rec) {
+		error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+
+		/* if tmp starts at the start of our range, just use that */
+		if (tmp.rc_startblock == agbno)
+			*cleft = tmp;
+		else {
+			/*
+			 * There's a gap in the refcntbt at the start of the
+			 * range we're interested in (refcount == 1) so
+			 * synthesize the implied extent and pass it back.
+			 * We assume here that the agbno/aglen range was
+			 * passed in from a data fork extent mapping and
+			 * therefore is allocated to exactly one owner.
+			 */
+			cleft->rc_startblock = agbno;
+			cleft->rc_blockcount = min(aglen,
+					tmp.rc_startblock - agbno);
+			cleft->rc_refcount = 1;
+		}
+	} else {
+		/* No record after left; pretend one spans the range. */
+		cleft->rc_startblock = agbno;
+		cleft->rc_blockcount = aglen;
+		cleft->rc_refcount = 1;
+	}
+	trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+			left, cleft, agbno);
+	return error;
+
+out_error:
+	trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Find the record starting exactly at agbno + aglen (right) and the extent
+ * ending there (cright).  This function assumes that we've already split any
+ * extents crossing agbno + aglen.  Both outputs keep rc_startblock ==
+ * NULLAGBLOCK unless a record starts at agbno + aglen and passes the
+ * XFS_FIND_RCEXT_* refcount filter in flags; cright may be synthesized.
+ */
+STATIC int
+xfs_refcount_find_right_extents(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*right,
+	struct xfs_refcount_irec	*cright,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			aglen,
+	int				flags)
+{
+	struct xfs_refcount_irec	tmp;
+	int				error;
+	int				found_rec;
+
+	right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
+	error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	if (tmp.rc_startblock != agbno + aglen)	/* not adjacent to the end */
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+		return 0;
+	/* We have a right extent; retrieve (or invent) the next left one */
+	*right = tmp;
+
+	error = xfs_btree_decrement(cur, 0, &found_rec);
+	if (error)
+		goto out_error;
+	if (found_rec) {
+		error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+
+		/* if tmp ends at the end of our range, just use that */
+		if (xfs_refc_next(&tmp) == agbno + aglen)
+			*cright = tmp;
+		else {
+			/*
+			 * There's a gap in the refcntbt at the end of the
+			 * range we're interested in (refcount == 1) so
+			 * create the implied extent and pass it back.
+			 * We assume here that the agbno/aglen range was
+			 * passed in from a data fork extent mapping and
+			 * therefore is allocated to exactly one owner.
+			 */
+			cright->rc_startblock = max(agbno, xfs_refc_next(&tmp));
+			cright->rc_blockcount = right->rc_startblock -
+					cright->rc_startblock;
+			cright->rc_refcount = 1;
+		}
+	} else {
+		/* No record before right; pretend one spans the range. */
+		cright->rc_startblock = agbno;
+		cright->rc_blockcount = aglen;
+		cright->rc_refcount = 1;
+	}
+	trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+			cright, right, agbno + aglen);
+	return error;
+
+out_error:
+	trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/* True if @rc was filled in, i.e. its start block is not NULLAGBLOCK. */
+static inline bool
+xfs_refc_valid(
+	struct xfs_refcount_irec	*rc)
+{
+	return !(rc->rc_startblock == NULLAGBLOCK);
+}
+
+/*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ * A neighbour is merged only if its refcount equals the boundary extent's
+ * refcount plus @adjust and the combined length stays below MAXREFCEXTLEN.
+ * The merge helpers update *agbno and *aglen to exclude whatever they
+ * merged, and *shape_changed is set once any merge is attempted.
+ */
+STATIC int
+xfs_refcount_merge_extents(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		*agbno,
+	xfs_extlen_t		*aglen,
+	enum xfs_refc_adjust_op adjust,
+	int			flags,
+	bool			*shape_changed)
+{
+	struct xfs_refcount_irec	left = {0}, cleft = {0};
+	struct xfs_refcount_irec	cright = {0}, right = {0};
+	int				error;
+	unsigned long long		ulen;
+	bool				cequal;
+
+	*shape_changed = false;
+	/*
+	 * Find the extents just below [left] and just above [cleft] agbno,
+	 * and just below [cright] and just above [right] (agbno + aglen).
+	 */
+	error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
+			*aglen, flags);
+	if (error)
+		return error;
+	error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
+			*aglen, flags);
+	if (error)
+		return error;
+
+	/* No left or right extent to merge; exit. */
+	if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right))
+		return 0;
+
+	cequal = (cleft.rc_startblock == cright.rc_startblock) &&
+		 (cleft.rc_blockcount == cright.rc_blockcount);
+
+	/* Try to merge left, cleft, and right.  cleft must == cright. */
+	ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+			right.rc_blockcount;
+	if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+	    xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+	    left.rc_refcount == cleft.rc_refcount + adjust &&
+	    right.rc_refcount == cleft.rc_refcount + adjust &&
+	    ulen < MAXREFCEXTLEN) {
+		*shape_changed = true;
+		return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+				&right, ulen, agbno, aglen);
+	}
+
+	/* Try to merge left and cleft. */
+	ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+	if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+	    left.rc_refcount == cleft.rc_refcount + adjust &&
+	    ulen < MAXREFCEXTLEN) {
+		*shape_changed = true;
+		error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+				agbno, aglen);
+		if (error)
+			return error;
+
+		/* cleft == cright is gone now, so skip the right merge. */
+		if (cequal)
+			return 0;
+	}
+
+	/* Try to merge cright and right. */
+	ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+	if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+	    right.rc_refcount == cright.rc_refcount + adjust &&
+	    ulen < MAXREFCEXTLEN) {
+		*shape_changed = true;
+		return xfs_refcount_merge_right_extent(cur, &right, &cright,
+				agbno, aglen);
+	}
+
+	return error;
+}
+
+/*
+ * Decide whether the transaction can absorb another refcount record
+ * update.  Dirtying too many records in a single transaction overruns
+ * its log reservation and crashes the fs.  A record adds 12 bytes to
+ * the log (plus any key updates); the check below charges each
+ * operation 32 bytes to be conservative.  Space must also be left for
+ * btree splits on both ends of the range and for the CUD and a new
+ * CUI, so every shape change is charged a block-sized overhead too.
+ *
+ * XXX: This is a pretty hand-wavy estimate.  Wrongly answering true
+ * costs a shutdown FS; wrongly answering false only costs more
+ * transaction rolls than might be necessary.  So err on the side of
+ * answering false.
+ */
+static bool
+xfs_refcount_still_have_space(
+	struct xfs_btree_cur		*cur)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	unsigned long			overhead;
+
+	overhead = cur->bc_private.a.priv.refc.shape_changes *
+			xfs_allocfree_log_count(mp, 1);
+	overhead *= mp->m_sb.sb_blocksize;
+
+	/*
+	 * With continue-update error injection active, stop after 2 updates.
+	 */
+	if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+	    XFS_TEST_ERROR(false, mp,
+			XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE,
+			XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE))
+		return false;
+
+	if (cur->bc_private.a.priv.refc.nr_ops == 0)
+		return true;
+	if (overhead > cur->bc_tp->t_log_res)
+		return false;
+	return cur->bc_private.a.priv.refc.nr_ops * 32 <
+		cur->bc_tp->t_log_res - overhead;
+}
+
+/*
+ * Adjust the refcounts of the records inside [agbno, agbno + aglen].  By
+ * now the caller should have split any records crossing the ends of the
+ * range, merged with adjacent extents, and updated agbno/aglen to match.
+ * *agbno and *aglen are advanced past each extent as it is processed, so
+ * if we stop early because the transaction is out of space they describe
+ * the work that remains.  Blocks whose refcount drops to zero are handed
+ * to xfs_bmap_add_free() along with @dfops and @oinfo.
+ */
+STATIC int
+xfs_refcount_adjust_extents(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		*agbno,
+	xfs_extlen_t		*aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_refcount_irec	ext, tmp;
+	int				error;
+	int				found_rec, found_tmp;
+	xfs_fsblock_t			fsbno;
+
+	/* Merging did all the work already. */
+	if (*aglen == 0)
+		return 0;
+
+	error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+	if (error)
+		goto out_error;
+
+	while (*aglen > 0 && xfs_refcount_still_have_space(cur)) {
+		error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+		if (error)
+			goto out_error;
+		if (!found_rec) {
+			ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+			ext.rc_blockcount = 0;
+			ext.rc_refcount = 0;
+		}
+
+		/* Holes in the refcount tree imply refcount == 1 records. */
+		if (ext.rc_startblock != *agbno) {
+			tmp.rc_startblock = *agbno;
+			tmp.rc_blockcount = min(*aglen,
+					ext.rc_startblock - *agbno);
+			tmp.rc_refcount = 1 + adj;
+			trace_xfs_refcount_modify_extent(cur->bc_mp,
+					cur->bc_private.a.agno, &tmp);
+
+			/* Cover the hole (incr) or free the range (decr). */
+			if (tmp.rc_refcount) {
+				error = xfs_refcount_insert(cur, &tmp,
+						&found_tmp);
+				if (error)
+					goto out_error;
+				XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+						found_tmp == 1, out_error);
+				cur->bc_private.a.priv.refc.nr_ops++;
+			} else {
+				fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+						cur->bc_private.a.agno,
+						tmp.rc_startblock);
+				xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+						tmp.rc_blockcount, oinfo);
+			}
+
+			(*agbno) += tmp.rc_blockcount;
+			(*aglen) -= tmp.rc_blockcount;
+
+			error = xfs_refcount_lookup_ge(cur, *agbno,
+					&found_rec);
+			if (error)
+				goto out_error;
+		}
+
+		/* Stop if there's nothing left to modify */
+		if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+			break;
+
+		/*
+		 * Every record left inside the range must fit within it; an
+		 * empty or overlong record would make *aglen wrap around.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, ext.rc_blockcount != 0 &&
+				ext.rc_blockcount <= *aglen, out_error);
+
+		/* Update the record (incr) or free the blocks (decr). */
+		if (ext.rc_refcount == MAXREFCOUNT)
+			goto skip;	/* at MAXREFCOUNT; leave it unchanged */
+		ext.rc_refcount += adj;
+		trace_xfs_refcount_modify_extent(cur->bc_mp,
+				cur->bc_private.a.agno, &ext);
+		if (ext.rc_refcount > 1) {
+			error = xfs_refcount_update(cur, &ext);
+			if (error)
+				goto out_error;
+			cur->bc_private.a.priv.refc.nr_ops++;
+		} else if (ext.rc_refcount == 1) {
+			error = xfs_refcount_delete(cur, &found_rec);
+			if (error)
+				goto out_error;
+			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+					found_rec == 1, out_error);
+			cur->bc_private.a.priv.refc.nr_ops++;
+			goto advloop;
+		} else {
+			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+					cur->bc_private.a.agno,
+					ext.rc_startblock);
+			xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+					ext.rc_blockcount, oinfo);
+		}
+
+skip:
+		error = xfs_btree_increment(cur, 0, &found_rec);
+		if (error)
+			goto out_error;
+
+advloop:
+		(*agbno) += ext.rc_blockcount;
+		(*aglen) -= ext.rc_blockcount;
+	}
+
+	return error;
+out_error:
+	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/* Raise or lower the reference count of a range of AG blocks. */
+STATIC int
+xfs_refcount_adjust(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	xfs_agblock_t		*new_agbno,
+	xfs_extlen_t		*new_aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_owner_info	*oinfo)
+{
+	bool			changed;
+	bool			any_change = false;
+	int			error;
+
+	*new_agbno = agbno;
+	*new_aglen = aglen;
+	if (adj != XFS_REFCOUNT_ADJUST_INCREASE)
+		trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+				agbno, aglen);
+	else
+		trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+				agbno, aglen);
+
+	/*
+	 * Split any record that straddles the start or the end of the range.
+	 */
+	error = xfs_refcount_split_extent(cur, agbno, &changed);
+	if (error)
+		goto out_error;
+	if (changed)
+		any_change = true;
+
+	error = xfs_refcount_split_extent(cur, agbno + aglen, &changed);
+	if (error)
+		goto out_error;
+	if (changed)
+		any_change = true;
+
+	/*
+	 * Try to merge the ends of the range into their neighbours.
+	 */
+	error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
+			XFS_FIND_RCEXT_SHARED, &changed);
+	if (error)
+		goto out_error;
+	if (changed)
+		any_change = true;
+	if (any_change)
+		cur->bc_private.a.priv.refc.shape_changes++;
+
+	/* With both ends dealt with, adjust whatever lies in between. */
+	error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
+			adj, dfops, oinfo);
+	if (error)
+		goto out_error;
+
+	return 0;
+
+out_error:
+	trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+			error, _RET_IP_);
+	return error;
+}
+
+/* Tear down the cursor handed back by xfs_refcount_finish_one. */
+void
+xfs_refcount_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp = rcur ? rcur->bc_private.a.agbp : NULL;
+
+	if (!rcur)
+		return;
+	/* On error, also release the AGF buffer the cursor was using. */
+	xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	if (error)
+		xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred refcount operations.  We pass back the
+ * btree cursor in *pcur to maintain our lock on the btree between
+ * calls.  This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.  A cursor for a different AG is cleaned up and replaced, but
+ * its nr_ops and shape_changes counters carry over to the new cursor.
+ * *new_fsb and *new_len report what is left of the extent afterwards;
+ * the CoW operations always report *new_len == 0.
+ */
+int
+xfs_refcount_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur;
+	struct xfs_buf			*agbp = NULL;
+	int				error = 0;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			bno;
+	xfs_agblock_t			new_agbno;
+	unsigned long			nr_ops = 0;
+	int				shape_changes = 0;
+
+	agno = XFS_FSB_TO_AGNO(mp, startblock);
+	ASSERT(agno != NULLAGNUMBER);
+	bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+	trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
+			type, XFS_FSB_TO_AGBNO(mp, startblock),
+			blockcount);
+
+	if (XFS_TEST_ERROR(false, mp,
+			XFS_ERRTAG_REFCOUNT_FINISH_ONE,
+			XFS_RANDOM_REFCOUNT_FINISH_ONE))
+		return -EIO;
+
+	/* Get a cursor if we have none or ours is for a different AG. */
+	rcur = *pcur;
+	if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+		nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
+		shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+		xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+		rcur = NULL;
+		*pcur = NULL;
+	}
+	if (rcur == NULL) {
+		error = xfs_alloc_read_agf(tp->t_mountp, tp, agno,
+				XFS_ALLOC_FLAG_FREEING, &agbp);
+		if (error)
+			return error;
+		if (!agbp)
+			return -EFSCORRUPTED;
+
+		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops);
+		if (!rcur) {
+			error = -ENOMEM;
+			goto out_cur;
+		}
+		rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
+		rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+	}
+	*pcur = rcur;
+
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+		error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+			new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL);
+		*new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+		break;
+	case XFS_REFCOUNT_DECREASE:
+		error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+			new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL);
+		*new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+		break;
+	case XFS_REFCOUNT_ALLOC_COW:
+		*new_fsb = startblock + blockcount;
+		*new_len = 0;
+		error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops);
+		break;
+	case XFS_REFCOUNT_FREE_COW:
+		*new_fsb = startblock + blockcount;
+		*new_len = 0;
+		error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops);
+		break;
+	default:
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+	}
+	if (!error && *new_len > 0)
+		trace_xfs_refcount_finish_one_leftover(mp, agno, type,
+				bno, blockcount, new_agbno, *new_len);
+	return error;
+
+out_cur:
+	xfs_trans_brelse(tp, agbp);
+
+	return error;
+}
+
+/*
+ * Queue a refcount update intent on the deferred operations list.
+ */
+static int
+__xfs_refcount_add(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount)
+{
+	struct xfs_refcount_intent	*intent;
+
+	trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock),
+			type, XFS_FSB_TO_AGBNO(mp, startblock),
+			blockcount);
+
+	intent = kmem_alloc(sizeof(*intent), KM_SLEEP | KM_NOFS);
+	/* Describe the extent and the operation to perform on it. */
+	intent->ri_blockcount = blockcount;
+	intent->ri_startblock = startblock;
+	intent->ri_type = type;
+	INIT_LIST_HEAD(&intent->ri_list);
+
+	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &intent->ri_list);
+	return 0;
+}
+
+/*
+ * Queue a refcount increase for the blocks that a file extent maps.
+ */
+int
+xfs_refcount_increase_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	struct xfs_bmbt_irec		*PREV)
+{
+	/* Nothing to do unless the filesystem has the reflink feature. */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE,
+				PREV->br_startblock, PREV->br_blockcount);
+	return 0;
+}
+
+/*
+ * Queue a refcount decrease for the blocks that a file extent maps.
+ */
+int
+xfs_refcount_decrease_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	struct xfs_bmbt_irec		*PREV)
+{
+	/* Nothing to do unless the filesystem has the reflink feature. */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE,
+				PREV->br_startblock, PREV->br_blockcount);
+	return 0;
+}
+
+/*
+ * Search an AG extent for the lowest-numbered run of shared blocks and
+ * report it through fbno/flen.  With find_end_of_shared set, the run is
+ * extended across contiguous refcount records to give the longest
+ * contiguous shared extent; otherwise only the first record found is
+ * reported.  If the range holds no shared blocks, *fbno is set to
+ * NULLAGBLOCK and *flen to 0.
+ */
+int
+xfs_refcount_find_shared(
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			aglen,
+	xfs_agblock_t			*fbno,
+	xfs_extlen_t			*flen,
+	bool				find_end_of_shared)
+{
+	struct xfs_refcount_irec	rec;
+	int				stat;
+	int				found;
+	int				error;
+
+	trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+			agbno, aglen);
+
+	/* Assume nothing is shared until a record says otherwise */
+	*fbno = NULLAGBLOCK;
+	*flen = 0;
+
+	/* Look for a refcount record that starts at or before agbno */
+	error = xfs_refcount_lookup_le(cur, agbno, &found);
+	if (error)
+		goto out_error;
+	if (found == 0) {
+		/* Nothing to the left, so step to the next record */
+		error = xfs_btree_increment(cur, 0, &found);
+		if (error)
+			goto out_error;
+		if (found == 0)
+			goto done;
+	}
+	error = xfs_refcount_get_rec(cur, &rec, &stat);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+
+	/* A record that stops short of agbno is no use; try the next one */
+	if (xfs_refc_next(&rec) <= agbno) {
+		error = xfs_btree_increment(cur, 0, &found);
+		if (error)
+			goto out_error;
+		if (found == 0)
+			goto done;
+		error = xfs_refcount_get_rec(cur, &rec, &stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+	}
+
+	/* A record lying wholly past the range means nothing is shared */
+	if (rec.rc_startblock >= agbno + aglen)
+		goto done;
+
+	/* This record overlaps the range; clip its front to agbno */
+	if (rec.rc_startblock < agbno) {
+		rec.rc_blockcount = xfs_refc_next(&rec) - agbno;
+		rec.rc_startblock = agbno;
+	}
+
+	*fbno = rec.rc_startblock;
+	*flen = min(rec.rc_blockcount, agbno + aglen - *fbno);
+	if (!find_end_of_shared)
+		goto done;
+
+	/* Keep absorbing records for as long as they are contiguous */
+	while (*fbno + *flen < agbno + aglen) {
+		error = xfs_btree_increment(cur, 0, &found);
+		if (error)
+			goto out_error;
+		if (found == 0)
+			break;
+		error = xfs_refcount_get_rec(cur, &rec, &stat);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, stat == 1, out_error);
+		if (!(rec.rc_startblock < agbno + aglen &&
+		      rec.rc_startblock == *fbno + *flen))
+			break;
+		*flen = min(*flen + rec.rc_blockcount, agbno + aglen - *fbno);
+	}
+
+done:
+	trace_xfs_refcount_find_shared_result(cur->bc_mp,
+			cur->bc_private.a.agno, *fbno, *flen);
+
+out_error:
+	if (error)
+		trace_xfs_refcount_find_shared_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Recovering CoW Blocks After a Crash
+ *
+ * Due to the way that the copy on write mechanism works, there's a window of
+ * opportunity in which we can lose track of allocated blocks during a crash.
+ * Because CoW uses delayed allocation in the in-core CoW fork, writeback
+ * causes blocks to be allocated and stored in the CoW fork.  The blocks are
+ * no longer in the free space btree but are not otherwise recorded anywhere
+ * until the write completes and the blocks are mapped into the file.  A crash
+ * in between allocation and remapping results in the replacement blocks being
+ * lost.  This situation is exacerbated by the CoW extent size hint because
+ * allocations can hang around for a long time.
+ *
+ * However, there is a place where we can record these allocations before they
+ * become mappings -- the reference count btree.  The btree does not record
+ * extents with refcount == 1, so we can record allocations with a refcount of
+ * 1.  Blocks being used for CoW writeout cannot be shared, so there should be
+ * no conflict with shared block records.  These mappings should be created
+ * when we allocate blocks to the CoW fork and deleted when they're removed
+ * from the CoW fork.
+ *
+ * Minor nit: records for in-progress CoW allocations and records for shared
+ * extents must never be merged, to preserve the property that (except for CoW
+ * allocations) there are no refcount btree entries with refcount == 1.  The
+ * only time this could potentially happen is when unsharing a block that's
+ * adjacent to CoW allocations, so we must be careful to avoid this.
+ *
+ * At mount time we recover lost CoW allocations by searching the refcount
+ * btree for these refcount == 1 mappings.  These represent CoW allocations
+ * that were in progress at the time the filesystem went down, so we can free
+ * them to get the space back.
+ *
+ * This mechanism is superior to creating EFIs for unmapped CoW extents for
+ * several reasons -- first, EFIs pin the tail of the log and would have to be
+ * periodically relogged to avoid filling up the log.  Second, CoW completions
+ * will have to file an EFD and create new EFIs for whatever remains in the
+ * CoW fork; this partially takes care of (1) but extent-size reservations
+ * will have to periodically relog even if there's no writeout in progress.
+ * This can happen if the CoW extent size hint is set, which you really want.
+ * Third, EFIs cannot currently be automatically relogged into newer
+ * transactions to advance the log tail.  Fourth, stuffing the log full of
+ * EFIs places an upper bound on the number of CoW allocations that can be
+ * held filesystem-wide at any given time.  Recording them in the refcount
+ * btree doesn't require us to maintain any state in memory and doesn't pin
+ * the log.
+ */
+/*
+ * Adjust the refcounts of CoW allocations.  These are "magic" in that
+ * nothing else in the filesystem references them, so we stash them in
+ * the refcount btree with a refcount of 1 until file remapping (or CoW
+ * cancellation) happens.  @dfops and @oinfo are not used here.
+ */
+STATIC int
+xfs_refcount_adjust_cow_extents(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_refcount_irec	ext, tmp;
+	int				error;
+	int				found_rec, found_tmp;
+
+	if (aglen == 0)
+		return 0;
+
+	/* Find the first refcount record at or after agbno, if any */
+	error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+	if (error)
+		goto out_error;
+	error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec) {
+		ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
+				XFS_REFC_COW_START;
+		ext.rc_blockcount = 0;	/* placeholder: no record found */
+		ext.rc_refcount = 0;
+	}
+
+	switch (adj) {
+	case XFS_REFCOUNT_ADJUST_COW_ALLOC:
+		/* Adding a CoW reservation, there should be nothing here. */
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+				ext.rc_startblock >= agbno + aglen, out_error);
+
+		tmp.rc_startblock = agbno;
+		tmp.rc_blockcount = aglen;
+		tmp.rc_refcount = 1;
+		trace_xfs_refcount_modify_extent(cur->bc_mp,
+				cur->bc_private.a.agno, &tmp);
+
+		error = xfs_refcount_insert(cur, &tmp,
+				&found_tmp);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+				found_tmp == 1, out_error);
+		break;
+	case XFS_REFCOUNT_ADJUST_COW_FREE:
+		/* Removing a CoW reservation, there should be one extent. */
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+			ext.rc_startblock == agbno, out_error);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+			ext.rc_blockcount == aglen, out_error);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+			ext.rc_refcount == 1, out_error);
+
+		ext.rc_refcount = 0;
+		trace_xfs_refcount_modify_extent(cur->bc_mp,
+				cur->bc_private.a.agno, &ext);
+		error = xfs_refcount_delete(cur, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+				found_rec == 1, out_error);
+		break;
+	default:
+		ASSERT(0);	/* NOTE(review): error stays 0 on this path */
+	}
+
+	return error;
+out_error:
+	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Add or remove refcount btree entries for CoW reservations.  The range is
+ * shifted by XFS_REFC_COW_START before the btree is touched.
+ */
+STATIC int
+xfs_refcount_adjust_cow(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops)
+{
+	bool			reshaped;	/* filled by helpers, not read */
+	int			error;
+
+	/* CoW staging records are keyed at an offset from the given agbno. */
+	agbno += XFS_REFC_COW_START;
+
+	/*
+	 * Each step below runs only if every earlier step returned zero.
+	 *
+	 * First ensure that no rcextent crosses the start boundary of the
+	 * adjustment range...
+	 */
+	error = xfs_refcount_split_extent(cur, agbno, &reshaped);
+
+	/* ...nor the end boundary. */
+	if (!error)
+		error = xfs_refcount_split_extent(cur, agbno + aglen,
+				&reshaped);
+
+	/* Try to merge with the left or right extents of the range. */
+	if (!error)
+		error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
+				XFS_FIND_RCEXT_COW, &reshaped);
+
+	/* With the ends taken care of, adjust the middle extents. */
+	if (!error)
+		error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen,
+				adj, dfops, NULL);
+
+	/* Any failure is traced once here before being handed back. */
+	if (error)
+		trace_xfs_refcount_adjust_cow_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+
+	return error;
+}
+
+/*
+ * Record a CoW allocation in the refcount btree.  If the filesystem has an
+ * rmap btree, also hand the extent to xfs_rmap_alloc_extent() with owner
+ * XFS_RMAP_OWN_COW.
+ */
+STATIC int
+__xfs_refcount_cow_alloc(
+	struct xfs_btree_cur	*rcur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	struct xfs_defer_ops	*dfops)
+{
+	struct xfs_mount	*mp = rcur->bc_mp;
+	xfs_agnumber_t		agno = rcur->bc_private.a.agno;
+	int			error;
+
+	trace_xfs_refcount_cow_increase(mp, agno, agbno, aglen);
+
+	/* Add refcount btree reservation */
+	error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+			XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
+	if (error)
+		return error;
+
+	/* The rmap entry is only needed when the rmap btree exists. */
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	/* Add rmap entry */
+	return xfs_rmap_alloc_extent(mp, dfops, agno, agbno, aglen,
+			XFS_RMAP_OWN_COW);
+}
+
+/*
+ * Remove a CoW allocation from the refcount btree.  If the filesystem has an
+ * rmap btree, also hand the extent to xfs_rmap_free_extent() with owner
+ * XFS_RMAP_OWN_COW.
+ */
+STATIC int
+__xfs_refcount_cow_free(
+	struct xfs_btree_cur	*rcur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	struct xfs_defer_ops	*dfops)
+{
+	struct xfs_mount	*mp = rcur->bc_mp;
+	xfs_agnumber_t		agno = rcur->bc_private.a.agno;
+	int			error;
+
+	trace_xfs_refcount_cow_decrease(mp, agno, agbno, aglen);
+
+	/* Remove refcount btree reservation */
+	error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+			XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
+	if (error)
+		return error;
+
+	/* The rmap entry only exists when the rmap btree does. */
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	/* Remove rmap entry */
+	return xfs_rmap_free_extent(mp, dfops, agno, agbno, aglen,
+			XFS_RMAP_OWN_COW);
+}
+
+/* Pass an XFS_REFCOUNT_ALLOC_COW request for [fsb, len] to __xfs_refcount_add(). */
+int
+xfs_refcount_alloc_cow_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_fsblock_t			fsb,
+	xfs_extlen_t			len)
+{
+	/* Without the reflink feature this is a successful no-op. */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
+				fsb, len);
+	return 0;
+}
+
+/* Pass an XFS_REFCOUNT_FREE_COW request for [fsb, len] to __xfs_refcount_add(). */
+int
+xfs_refcount_free_cow_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_fsblock_t			fsb,
+	xfs_extlen_t			len)
+{
+	/* Without the reflink feature this is a successful no-op. */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW,
+				fsb, len);
+	return 0;
+}
+
+struct xfs_refcount_recovery {
+	struct list_head		rr_list;	/* link in the caller's list */
+	struct xfs_refcount_irec	rr_rrec;	/* in-core copy of the record */
+};
+
+/* Range-query callback: append an in-core copy of @rec to the list in @priv. */
+STATIC int
+xfs_refcount_recover_extent(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_rec		*rec,
+	void				*priv)
+{
+	struct xfs_refcount_recovery	*item;
+	struct list_head		*leftovers = priv;
+
+	/* Only records with a refcount of exactly 1 are accepted here. */
+	if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+		return -EFSCORRUPTED;
+
+	item = kmem_alloc(sizeof(*item), KM_SLEEP);
+	xfs_refcount_btrec_to_irec(rec, &item->rr_rrec);
+	list_add_tail(&item->rr_list, leftovers);
+	return 0;
+}
+
+/* Find and remove leftover CoW reservations in AG @agno. */
+int
+xfs_refcount_recover_cow_leftovers(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno)
+{
+	struct xfs_trans		*tp;
+	struct xfs_btree_cur		*cur;
+	struct xfs_buf			*agbp;
+	struct xfs_refcount_recovery	*rr, *n;
+	struct list_head		debris;		/* records found */
+	union xfs_btree_irec		low;		/* query range start */
+	union xfs_btree_irec		high;		/* query range end */
+	struct xfs_defer_ops		dfops;
+	xfs_fsblock_t			fsb;
+	xfs_agblock_t			agbno;
+	int				error;
+
+	if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+		return -EOPNOTSUPP;
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+	/* Collect every record keyed from XFS_REFC_COW_START upward. */
+	INIT_LIST_HEAD(&debris);
+	memset(&low, 0, sizeof(low));
+	memset(&high, 0, sizeof(high));
+	low.rc.rc_startblock = XFS_REFC_COW_START;
+	high.rc.rc_startblock = -1U;
+	error = xfs_btree_query_range(cur, &low, &high,
+			xfs_refcount_recover_extent, &debris);
+	if (error)
+		goto out_cursor;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+
+	/* Now free each leftover in a transaction of its own */
+	list_for_each_entry(rr, &debris, rr_list) {
+		/* Set up transaction. */
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+		if (error)
+			goto out_free;
+
+		trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec);
+
+		/* Free the orphan record (its key minus COW_START is agbno) */
+		xfs_defer_init(&dfops, &fsb);
+		agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
+		fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
+		error = xfs_refcount_free_cow_extent(mp, &dfops, fsb,
+				rr->rr_rrec.rc_blockcount);
+		if (error)
+			goto out_defer;
+
+		/* Free the block. */
+		xfs_bmap_add_free(mp, &dfops, fsb,
+				rr->rr_rrec.rc_blockcount, NULL);
+
+		error = xfs_defer_finish(&tp, &dfops, NULL);
+		if (error)
+			goto out_defer;
+
+		error = xfs_trans_commit(tp);
+		if (error)
+			goto out_free;
+	}
+
+out_free:
+	/* Free the leftover list; the success path also falls through to here */
+	list_for_each_entry_safe(rr, n, &debris, rr_list) {
+		list_del(&rr->rr_list);
+		kmem_free(rr);
+	}
+	return error;
+
+out_cursor:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_buf_relse(agbp);
+	goto out_free;
+
+out_defer:
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	goto out_free;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
new file mode 100644
index 0000000..098dc668
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_H__
+#define __XFS_REFCOUNT_H__
+
+extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
+		xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
+		xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
+		struct xfs_refcount_irec *irec, int *stat);
+
+enum xfs_refcount_intent_type {
+	XFS_REFCOUNT_INCREASE = 1,	/* numbering starts at 1, not 0 */
+	XFS_REFCOUNT_DECREASE,
+	XFS_REFCOUNT_ALLOC_COW,	/* used by xfs_refcount_alloc_cow_extent() */
+	XFS_REFCOUNT_FREE_COW,	/* used by xfs_refcount_free_cow_extent() */
+};
+
+struct xfs_refcount_intent {
+	struct list_head			ri_list;
+	enum xfs_refcount_intent_type		ri_type;	/* operation kind */
+	xfs_fsblock_t				ri_startblock;	/* presumably first block of the extent - confirm */
+	xfs_extlen_t				ri_blockcount;	/* presumably its length in blocks - confirm */
+};
+
+extern int xfs_refcount_increase_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+extern int xfs_refcount_decrease_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+
+extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
+		struct xfs_btree_cur *rcur, int error);
+extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+		struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type,
+		xfs_fsblock_t startblock, xfs_extlen_t blockcount,
+		xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len,
+		struct xfs_btree_cur **pcur);
+
+extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+		xfs_extlen_t *flen, bool find_end_of_shared);
+
+extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
+extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
+extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+		xfs_agnumber_t agno);
+
+#endif	/* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
new file mode 100644
index 0000000..453bb27
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_rmap.h"
+
+static struct xfs_btree_cur *	/* new cursor with @cur's mp, tp and AG state */
+xfs_refcountbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_private.a.dfops);
+}
+
+STATIC void	/* record a new root block and a level change of @inc */
+xfs_refcountbt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_perag	*pag;
+
+	pag = xfs_perag_get(cur->bc_mp, be32_to_cpu(agf->agf_seqno));
+	ASSERT(ptr->s != 0);
+	/* Keep the AGF buffer fields and the in-core perag level in step. */
+	agf->agf_refcount_root = ptr->s;
+	be32_add_cpu(&agf->agf_refcount_level, inc);
+	pag->pagf_refcount_level += inc;
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(cur->bc_tp, agbp,
+			XFS_AGF_REFCOUNT_LEVEL | XFS_AGF_REFCOUNT_ROOT);
+}
+
+STATIC int	/* 0 or error; *stat tells whether a block was obtained */
+xfs_refcountbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,		/* not referenced below */
+	union xfs_btree_ptr	*new,		/* out: new block, big-endian agbno */
+	int			*stat)		/* out: 1 if allocated, else 0 */
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_alloc_arg	args;		/* block allocation args */
+	int			error;		/* error return value */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	memset(&args, 0, sizeof(args));
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+			xfs_refc_block(args.mp));
+	args.firstblock = args.fsbno;
+	xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+	args.minlen = args.maxlen = args.prod = 1;	/* ask for one block */
+	args.resv = XFS_AG_RESV_METADATA;
+
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		goto out_error;
+	trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+			args.agbno, 1);
+	if (args.fsbno == NULLFSBLOCK) {	/* nothing was allocated */
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.agno == cur->bc_private.a.agno);
+	ASSERT(args.len == 1);
+
+	new->s = cpu_to_be32(args.agbno);
+	be32_add_cpu(&agf->agf_refcount_blocks, 1);	/* count the new block */
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out_error:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Free one refcount btree block: drop the AGF block counter, log that AGF
+ * field, then free the extent with owner XFS_RMAP_OWN_REFC.
+ */
+STATIC int
+xfs_refcountbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+	struct xfs_owner_info	owner;
+
+	trace_xfs_refcountbt_free_block(mp, cur->bc_private.a.agno,
+			XFS_FSB_TO_AGBNO(mp, fsbno), 1);
+	xfs_rmap_ag_owner(&owner, XFS_RMAP_OWN_REFC);
+	be32_add_cpu(&agf->agf_refcount_blocks, -1);
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+	return xfs_free_extent(cur->bc_tp, fsbno, 1, &owner,
+			XFS_AG_RESV_METADATA);
+}
+
+STATIC int	/* m_refc_mnr[0] for level 0, m_refc_mnr[1] for any other */
+xfs_refcountbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_refc_mnr[!!level];
+}
+
+STATIC int	/* m_refc_mxr[0] for level 0, m_refc_mxr[1] for any other */
+xfs_refcountbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_refc_mxr[!!level];
+}
+
+STATIC void	/* low key of a record is its start block */
+xfs_refcountbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->refc.rc_startblock = rec->refc.rc_startblock;	/* no byte swap */
+}
+
+STATIC void	/* high key = last block covered: start + count - 1 */
+xfs_refcountbt_init_high_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	__u32			first = be32_to_cpu(rec->refc.rc_startblock);
+	__u32			count = be32_to_cpu(rec->refc.rc_blockcount);
+	__u32			last = first + count - 1;
+
+	key->refc.rc_startblock = cpu_to_be32(last);
+}
+
+STATIC void	/* fill @rec (big-endian) from the cursor's cur->bc_rec.rc */
+xfs_refcountbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+	rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+	rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void	/* load the AGF's refcount root pointer into @ptr */
+xfs_refcountbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+	ASSERT(agf->agf_refcount_root != 0);
+
+	ptr->s = agf->agf_refcount_root;	/* copied without byte swapping */
+}
+
+STATIC __int64_t	/* @key's start block minus the cursor record's */
+xfs_refcountbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	__int64_t		key_start = be32_to_cpu(key->refc.rc_startblock);
+	__int64_t		cur_start = cur->bc_rec.rc.rc_startblock;
+
+	return key_start - cur_start;
+}
+
+STATIC __int64_t	/* k1's start block minus k2's, as a signed 64-bit value */
+xfs_refcountbt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+			  be32_to_cpu(k2->refc.rc_startblock);
+}
+
+/*
+ * Checks shared by the read and write verifiers: magic, reflink feature bit,
+ * v5 short-block header, level bound, then xfs_btree_sblock_verify().
+ */
+STATIC bool
+xfs_refcountbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level, limit;
+
+	if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC) ||
+	    !xfs_sb_version_hasreflink(&mp->m_sb) ||
+	    !xfs_btree_sblock_v5hdr_verify(bp))
+		return false;
+
+	/* Use the per-AG level when pagf_init is set, else the mount maximum. */
+	level = be16_to_cpu(block->bb_level);
+	limit = (pag && pag->pagf_init) ? pag->pagf_refcount_level :
+					  mp->m_refc_maxlevels;
+	if (level >= limit)
+		return false;
+	return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+}
+
+STATIC void	/* read-side verifier: CRC check first, then structure */
+xfs_refcountbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_refcountbt_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (!bp->b_error)
+		return;
+	trace_xfs_btree_corrupt(bp, _RET_IP_);
+	xfs_verifier_error(bp);
+}
+
+STATIC void	/* write-side verifier: structure check, then CRC calculation */
+xfs_refcountbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (xfs_refcountbt_verify(bp)) {
+		xfs_btree_sblock_calc_crc(bp);
+		return;
+	}
+
+	trace_xfs_btree_corrupt(bp, _RET_IP_);
+	xfs_buf_ioerror(bp, -EFSCORRUPTED);
+	xfs_verifier_error(bp);
+}
+
+const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
+	.name			= "xfs_refcountbt",
+	.verify_read		= xfs_refcountbt_read_verify,	/* CRC, then structure */
+	.verify_write		= xfs_refcountbt_write_verify,	/* structure, then CRC */
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int	/* nonzero iff k1's start block is strictly below k2's */
+xfs_refcountbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->refc.rc_startblock) <
+	       be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int	/* nonzero iff r1 ends at or before the start of r2 */
+xfs_refcountbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return  be32_to_cpu(r1->refc.rc_startblock) +
+		be32_to_cpu(r1->refc.rc_blockcount) <=
+		be32_to_cpu(r2->refc.rc_startblock);
+}
+#endif
+
+static const struct xfs_btree_ops xfs_refcountbt_ops = {	/* set as bc_ops */
+	.rec_len		= sizeof(struct xfs_refcount_rec),
+	.key_len		= sizeof(struct xfs_refcount_key),
+
+	.dup_cursor		= xfs_refcountbt_dup_cursor,
+	.set_root		= xfs_refcountbt_set_root,
+	.alloc_block		= xfs_refcountbt_alloc_block,
+	.free_block		= xfs_refcountbt_free_block,
+	.get_minrecs		= xfs_refcountbt_get_minrecs,
+	.get_maxrecs		= xfs_refcountbt_get_maxrecs,
+	.init_key_from_rec	= xfs_refcountbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_refcountbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_refcountbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_refcountbt_init_ptr_from_cur,
+	.key_diff		= xfs_refcountbt_key_diff,
+	.buf_ops		= &xfs_refcountbt_buf_ops,
+	.diff_two_keys		= xfs_refcountbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)	/* same guard as the definitions */
+	.keys_inorder		= xfs_refcountbt_keys_inorder,
+	.recs_inorder		= xfs_refcountbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new refcount btree cursor for AG @agno from the AGF in @agbp.
+ */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agnumber_t		agno,
+	struct xfs_defer_ops	*dfops)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+
+	/* Generic cursor fields; the tree height comes from the AGF. */
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+	cur->bc_mp = mp;
+	cur->bc_tp = tp;
+	cur->bc_ops = &xfs_refcountbt_ops;
+	cur->bc_btnum = XFS_BTNUM_REFC;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+
+	/* Per-AG private fields, including the refcount counters. */
+	cur->bc_private.a.agno = agno;
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.dfops = dfops;
+	cur->bc_private.a.priv.refc.nr_ops = 0;
+	cur->bc_private.a.priv.refc.shape_changes = 0;
+
+	return cur;
+}
+
+/*
+ * Calculate the number of entries that fit in a refcount btree block of
+ * @blocklen bytes: records if @leaf, otherwise key + pointer pairs.
+ */
+int
+xfs_refcountbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	bool			leaf)
+{
+	size_t	sz = leaf ? sizeof(struct xfs_refcount_rec) :
+		     sizeof(struct xfs_refcount_key) + sizeof(xfs_refcount_ptr_t);
+
+	blocklen -= XFS_REFCOUNT_BLOCK_LEN;	/* header takes no entries */
+	return blocklen / sz;
+}
+
+/* Compute the max refcount btree height and store it in m_refc_maxlevels. */
+void
+xfs_refcountbt_compute_maxlevels(
+	struct xfs_mount		*mp)
+{
+	mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
+			mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the refcount btree size for @len records, using m_refc_mnr. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size (sb_agblocks records).
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+	struct xfs_mount	*mp)
+{
+	/*
+	 * Zero m_refc_mxr[0]: we're uninitialized, which can happen in mkfs.
+	 */
+	return mp->m_refc_mxr[0] ?
+		xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks) : 0;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ * Both figures are added to whatever @ask and @used already hold.  Returns
+ * zero, or the error from reading the AGF.
+ */
+int
+xfs_refcountbt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	struct xfs_buf		*agbp;
+	int			error;
+
+	/* Filesystems without the reflink feature contribute nothing. */
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	/* Note that @ask is bumped even if reading the AGF fails below. */
+	*ask += xfs_refcountbt_max_size(mp);
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+
+	/* Read the block count out of the AGF before releasing the buffer. */
+	*used += be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_refcount_blocks);
+	xfs_buf_relse(agbp);
+
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
new file mode 100644
index 0000000..3be7768
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_BTREE_H__
+#define	__XFS_REFCOUNT_BTREE_H__
+
+/*
+ * Reference Count Btree on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size (defined as XFS_BTREE_SBLOCK_CRC_LEN)
+ */
+#define XFS_REFCOUNT_BLOCK_LEN	XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.  @index is
+ * 1-based, and the pointer array starts after space for @maxrecs keys.
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_REFCOUNT_REC_ADDR(block, index) \
+	((struct xfs_refcount_rec *) \
+		((char *)(block) + \
+		 XFS_REFCOUNT_BLOCK_LEN + \
+		 (((index) - 1) * sizeof(struct xfs_refcount_rec))))
+
+#define XFS_REFCOUNT_KEY_ADDR(block, index) \
+	((struct xfs_refcount_key *) \
+		((char *)(block) + \
+		 XFS_REFCOUNT_BLOCK_LEN + \
+		 ((index) - 1) * sizeof(struct xfs_refcount_key)))
+
+#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \
+	((xfs_refcount_ptr_t *) \
+		((char *)(block) + \
+		 XFS_REFCOUNT_BLOCK_LEN + \
+		 (maxrecs) * sizeof(struct xfs_refcount_key) + \
+		 ((index) - 1) * sizeof(xfs_refcount_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno,
+		struct xfs_defer_ops *dfops);
+extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
+		bool leaf);
+extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
+#endif	/* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 73d0540..3a8cc71 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -148,6 +148,37 @@
 	return error;
 }
 
+STATIC int	/* look up the rmap matching all given fields and delete it */
+xfs_rmap_delete(
+	struct xfs_btree_cur	*rcur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		len,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	int			i;		/* stat from lookup / delete */
+	int			error;
+
+	trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+			len, owner, offset, flags);
+
+	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+	if (error)
+		goto done;
+	XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);	/* must exist */
+
+	error = xfs_btree_delete(rcur, &i);
+	if (error)
+		goto done;
+	XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+done:
+	if (error)		/* success also reaches this label */
+		trace_xfs_rmap_delete_error(rcur->bc_mp,
+				rcur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
 static int
 xfs_rmap_btrec_to_irec(
 	union xfs_btree_rec	*rec,
@@ -180,6 +211,160 @@
 	return xfs_rmap_btrec_to_irec(rec, irec);
 }
 
+struct xfs_find_left_neighbor_info {
+	struct xfs_rmap_irec	high;	/* query key and match criteria */
+	struct xfs_rmap_irec	*irec;	/* out: copy of the matching record */
+	int			*stat;	/* out: set to 1 on a match */
+};
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_find_left_neighbor_helper(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*rec,
+	void			*priv)
+{
+	struct xfs_find_left_neighbor_info	*info = priv;
+
+	trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
+			cur->bc_private.a.agno, rec->rm_startblock,
+			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+			rec->rm_flags);
+
+	if (rec->rm_owner != info->high.rm_owner)	/* different owner */
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+	    rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;	/* last offset differs */
+
+	*info->irec = *rec;		/* match: hand back a copy and stop */
+	*info->stat = 1;
+	return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and adjacent physical and logical
+ * block ranges.  *stat is 1 if @irec was filled in, 0 otherwise.
+ */
+int
+xfs_rmap_find_left_neighbor(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags,
+	struct xfs_rmap_irec	*irec,
+	int			*stat)
+{
+	struct xfs_find_left_neighbor_info	info;
+	int			error;
+
+	*stat = 0;
+	if (bno == 0)		/* bno - 1 below would not be a valid block */
+		return 0;
+	info.high.rm_startblock = bno - 1;
+	info.high.rm_owner = owner;
+	if (!XFS_RMAP_NON_INODE_OWNER(owner) &&
+	    !(flags & XFS_RMAP_BMBT_BLOCK)) {
+		if (offset == 0)	/* likewise for offset - 1 */
+			return 0;
+		info.high.rm_offset = offset - 1;
+	} else
+		info.high.rm_offset = 0;
+	info.high.rm_flags = flags;
+	info.high.rm_blockcount = 0;
+	info.irec = irec;
+	info.stat = stat;
+
+	trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
+			cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+
+	error = xfs_rmap_query_range(cur, &info.high, &info.high,
+			xfs_rmap_find_left_neighbor_helper, &info);
+	if (error == XFS_BTREE_QUERY_RANGE_ABORT)	/* helper found a match */
+		error = 0;
+	if (*stat)
+		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+				cur->bc_private.a.agno, irec->rm_startblock,
+				irec->rm_blockcount, irec->rm_owner,
+				irec->rm_offset, irec->rm_flags);
+	return error;
+}
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_lookup_le_range_helper(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*rec,
+	void			*priv)
+{
+	struct xfs_find_left_neighbor_info	*info = priv;
+
+	trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
+			cur->bc_private.a.agno, rec->rm_startblock,
+			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+			rec->rm_flags);
+
+	if (rec->rm_owner != info->high.rm_owner)	/* different owner */
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+	    (rec->rm_offset > info->high.rm_offset ||
+	     rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;	/* offset not covered */
+
+	*info->irec = *rec;		/* match: hand back a copy and stop */
+	*info->stat = 1;
+	return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and overlapping physical and logical
+ * block ranges.  This is the overlapping-interval version of
+ * xfs_rmap_lookup_le.
+ */
+int
+xfs_rmap_lookup_le_range(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags,
+	struct xfs_rmap_irec	*irec,
+	int			*stat)		/* out: 1 if @irec was filled in */
+{
+	struct xfs_find_left_neighbor_info	info;
+	int			error;
+
+	info.high.rm_startblock = bno;
+	info.high.rm_owner = owner;
+	if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK))
+		info.high.rm_offset = offset;
+	else
+		info.high.rm_offset = 0;	/* offset is not compared here */
+	info.high.rm_flags = flags;
+	info.high.rm_blockcount = 0;
+	*stat = 0;
+	info.irec = irec;
+	info.stat = stat;
+
+	trace_xfs_rmap_lookup_le_range(cur->bc_mp,
+			cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+	error = xfs_rmap_query_range(cur, &info.high, &info.high,
+			xfs_rmap_lookup_le_range_helper, &info);
+	if (error == XFS_BTREE_QUERY_RANGE_ABORT)	/* helper found a match */
+		error = 0;
+	if (*stat)
+		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+				cur->bc_private.a.agno, irec->rm_startblock,
+				irec->rm_blockcount, irec->rm_owner,
+				irec->rm_offset, irec->rm_flags);
+	return error;
+}
+
 /*
  * Find the extent in the rmap btree and remove it.
  *
@@ -1093,11 +1278,704 @@
 	return error;
 }
 
+/*
+ * Convert an unwritten extent to a real extent or vice versa.  This variant
+ * serves XFS_RMAP_CONVERT_SHARED requests, where rmap records may overlap;
+ * requests with no possibility of overlap are sent to xfs_rmap_convert.
+ * The unwritten argument gives the state of the existing mapping, so true
+ * converts unwritten to real and false converts real to unwritten.
+ */
+STATIC int
+xfs_rmap_convert_shared(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	bool			unwritten,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_rmap_irec	r[4];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+					/* new is 3 */
+	uint64_t		owner;
+	uint64_t		offset;
+	uint64_t		new_endoff;
+	unsigned int		oldext;
+	unsigned int		newext;
+	unsigned int		flags = 0;
+	int			i;
+	int			state = 0;
+	int			error;
+
+	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+	ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+	new_endoff = offset + len;
+	trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+
+	/*
+	 * For the initial lookup, look for an exact match or the left-adjacent
+	 * record for our insertion point. This will also give us the record for
+	 * start block contiguity tests.
+	 */
+	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+			&PREV, &i);
+	if (error)
+		goto done;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+	ASSERT(PREV.rm_offset <= offset);
+	ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+	ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+	newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+	/*
+	 * Set flags determining what part of the previous oldext allocation
+	 * extent is being replaced by a newext allocation.
+	 */
+	if (PREV.rm_offset == offset)
+		state |= RMAP_LEFT_FILLING;
+	if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+		state |= RMAP_RIGHT_FILLING;
+
+	/* Is there a left record that abuts our range? */
+	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext,
+			&LEFT, &i);
+	if (error)
+		goto done;
+	if (i) {
+		state |= RMAP_LEFT_VALID;
+		XFS_WANT_CORRUPTED_GOTO(mp,
+				LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+				done);
+		if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
+			state |= RMAP_LEFT_CONTIG;
+	}
+
+	/* Is there a right record that abuts our range? */
+	error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+			newext, &i);
+	if (error)
+		goto done;
+	if (i) {
+		state |= RMAP_RIGHT_VALID;
+		error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+				done);
+		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+				cur->bc_private.a.agno, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
+		if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+			state |= RMAP_RIGHT_CONTIG;
+	}
+
+	/* check that left + prev + right is not too long */
+	if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+			 RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+	    (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+	     RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+	    (unsigned long)LEFT.rm_blockcount + len +
+	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+		state &= ~RMAP_RIGHT_CONTIG;
+
+	trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+			_RET_IP_);
+	/* Switch out based on the FILLING and CONTIG state bits. */
+	switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+			 RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+	     RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
+		if (error)
+			goto done;
+		error = xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
+		if (error)
+			goto done;
+		NEW = LEFT;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		error = xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
+		if (error)
+			goto done;
+		NEW = LEFT;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += PREV.rm_blockcount;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
+		if (error)
+			goto done;
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += RIGHT.rm_blockcount;
+		NEW.rm_flags = RIGHT.rm_flags;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_flags = newext;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is contiguous.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_delete(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW.rm_offset += len;
+		NEW.rm_startblock += len;
+		NEW.rm_blockcount -= len;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW = LEFT;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += len;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is not contiguous.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_delete(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW.rm_offset += len;
+		NEW.rm_startblock += len;
+		NEW.rm_blockcount -= len;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount = offset - NEW.rm_offset;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		NEW = RIGHT;
+		error = xfs_rmap_delete(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW.rm_offset = offset;
+		NEW.rm_startblock = bno;
+		NEW.rm_blockcount += len;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_RIGHT_FILLING:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is not contiguous.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount -= len;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+		if (error)
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Setting the middle part of a previous oldext extent to
+		 * newext.  Contiguity is impossible here.
+		 * One extent becomes three extents.
+		 */
+		/* new right extent - oldext */
+		NEW.rm_startblock = bno + len;
+		NEW.rm_owner = owner;
+		NEW.rm_offset = new_endoff;
+		NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+				new_endoff;
+		NEW.rm_flags = PREV.rm_flags;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+				NEW.rm_flags);
+		if (error)
+			goto done;
+		/* new left extent - oldext */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount = offset - NEW.rm_offset;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		/* new middle extent - newext */
+		NEW.rm_startblock = bno;
+		NEW.rm_blockcount = len;
+		NEW.rm_owner = owner;
+		NEW.rm_offset = offset;
+		NEW.rm_flags = newext;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+				NEW.rm_flags);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+	case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+	case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+	case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+	case RMAP_LEFT_CONTIG:
+	case RMAP_RIGHT_CONTIG:
+		/* These cases are all impossible. */
+		ASSERT(0);
+	}
+
+	trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+done:
+	if (error)
+		trace_xfs_rmap_convert_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
 #undef	NEW
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
 
+/*
+ * Find an extent in the rmap btree and unmap it.  For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so the caller uses xfs_rmap_unmap instead.
+ */
+STATIC int
+xfs_rmap_unmap_shared(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	bool			unwritten,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_rmap_irec	ltrec;
+	uint64_t		ltoff;
+	int			error = 0;
+	int			i;
+	uint64_t		owner;
+	uint64_t		offset;
+	unsigned int		flags;
+
+	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+	if (unwritten)
+		flags |= XFS_RMAP_UNWRITTEN;
+	trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+
+	/*
+	 * We should always have a left record because there's a static record
+	 * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+	 * will not ever be removed from the tree.
+	 */
+	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+			&ltrec, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+	ltoff = ltrec.rm_offset;
+
+	/* Make sure the extent we found covers the entire freeing range. */
+	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+		ltrec.rm_startblock + ltrec.rm_blockcount >=
+		bno + len, out_error);
+
+	/* Make sure the owner matches what we expect to find in the tree. */
+	XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error);
+
+	/* Make sure the unwritten flag matches. */
+	XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+			(ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+	/* Check the offset. */
+	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error);
+	XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount,
+			out_error);
+
+	if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+		/* Exact match, simply remove the record from rmap tree. */
+		error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
+		if (error)
+			goto out_error;
+	} else if (ltrec.rm_startblock == bno) {
+		/*
+		 * Overlap left hand side of extent: delete the record and
+		 * re-add it with the start moved and the length trimmed.
+		 *
+		 *       ltbno                ltlen
+		 * Orig:    |oooooooooooooooooooo|
+		 * Freeing: |fffffffff|
+		 * Result:            |rrrrrrrrrr|
+		 *         bno       len
+		 */
+
+		/* Delete prev rmap. */
+		error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
+		if (error)
+			goto out_error;
+
+		/* Add an rmap at the new offset. */
+		ltrec.rm_startblock += len;
+		ltrec.rm_blockcount -= len;
+		ltrec.rm_offset += len;
+		error = xfs_rmap_insert(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
+		if (error)
+			goto out_error;
+	} else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+		/*
+		 * Overlap right hand side of extent: trim the length and
+		 * update the current record.
+		 *
+		 *       ltbno                ltlen
+		 * Orig:    |oooooooooooooooooooo|
+		 * Freeing:            |fffffffff|
+		 * Result:  |rrrrrrrrrr|
+		 *                    bno       len
+		 */
+		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+		ltrec.rm_blockcount -= len;
+		error = xfs_rmap_update(cur, &ltrec);
+		if (error)
+			goto out_error;
+	} else {
+		/*
+		 * Overlap middle of extent: trim the length of the existing
+		 * record to the length of the new left-extent size, then
+		 * insert a new record containing the remaining right-extent
+		 * space.
+		 *
+		 *       ltbno                ltlen
+		 * Orig:    |oooooooooooooooooooo|
+		 * Freeing:       |fffffffff|
+		 * Result:  |rrrrr|         |rrrr|
+		 *               bno       len
+		 */
+		xfs_extlen_t	orig_len = ltrec.rm_blockcount;
+
+		/* Shrink the left side of the rmap */
+		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+		ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+		error = xfs_rmap_update(cur, &ltrec);
+		if (error)
+			goto out_error;
+
+		/* Add an rmap at the new offset */
+		error = xfs_rmap_insert(cur, bno + len,
+				orig_len - len - ltrec.rm_blockcount,
+				ltrec.rm_owner, offset + len,
+				ltrec.rm_flags);
+		if (error)
+			goto out_error;
+	}
+
+	trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+out_error:
+	if (error)
+		trace_xfs_rmap_unmap_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Find an extent in the rmap btree and map it.  For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so the caller uses xfs_rmap_map instead.
+ */
+STATIC int
+xfs_rmap_map_shared(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	bool			unwritten,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_rmap_irec	ltrec;
+	struct xfs_rmap_irec	gtrec;
+	int			have_gt;
+	int			have_lt;
+	int			error = 0;
+	int			i;
+	uint64_t		owner;
+	uint64_t		offset;
+	unsigned int		flags = 0;
+
+	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+	if (unwritten)
+		flags |= XFS_RMAP_UNWRITTEN;
+	trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+
+	/* Is there a left record that abuts our range? */
+	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+			&ltrec, &have_lt);
+	if (error)
+		goto out_error;
+	if (have_lt &&
+	    !xfs_rmap_is_mergeable(&ltrec, owner, flags))
+		have_lt = 0;
+
+	/* Is there a right record that abuts our range? */
+	error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+			flags, &have_gt);
+	if (error)
+		goto out_error;
+	if (have_gt) {
+		error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+			cur->bc_private.a.agno, gtrec.rm_startblock,
+			gtrec.rm_blockcount, gtrec.rm_owner,
+			gtrec.rm_offset, gtrec.rm_flags);
+
+		if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+			have_gt = 0;
+	}
+
+	if (have_lt &&
+	    ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+	    ltrec.rm_offset + ltrec.rm_blockcount == offset) {
+		/*
+		 * Left edge contiguous, merge into left record.
+		 *
+		 *       ltbno     ltlen
+		 * orig:   |ooooooooo|
+		 * adding:           |aaaaaaaaa|
+		 * result: |rrrrrrrrrrrrrrrrrrr|
+		 *                  bno       len
+		 */
+		ltrec.rm_blockcount += len;
+		if (have_gt &&
+		    bno + len == gtrec.rm_startblock &&
+		    offset + len == gtrec.rm_offset) {
+			/*
+			 * Right edge also contiguous, delete right record
+			 * and merge into left record.
+			 *
+			 *       ltbno     ltlen    gtbno     gtlen
+			 * orig:   |ooooooooo|         |ooooooooo|
+			 * adding:           |aaaaaaaaa|
+			 * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+			 */
+			ltrec.rm_blockcount += gtrec.rm_blockcount;
+			error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+					gtrec.rm_blockcount, gtrec.rm_owner,
+					gtrec.rm_offset, gtrec.rm_flags);
+			if (error)
+				goto out_error;
+		}
+
+		/* Point the cursor back to the left record and update. */
+		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+		error = xfs_rmap_update(cur, &ltrec);
+		if (error)
+			goto out_error;
+	} else if (have_gt &&
+		   bno + len == gtrec.rm_startblock &&
+		   offset + len == gtrec.rm_offset) {
+		/*
+		 * Right edge contiguous, merge into right record.
+		 *
+		 *                 gtbno     gtlen
+		 * Orig:             |ooooooooo|
+		 * adding: |aaaaaaaaa|
+		 * Result: |rrrrrrrrrrrrrrrrrrr|
+		 *        bno       len
+		 */
+		/* Delete the old record. */
+		error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+				gtrec.rm_blockcount, gtrec.rm_owner,
+				gtrec.rm_offset, gtrec.rm_flags);
+		if (error)
+			goto out_error;
+
+		/* Move the start and re-add it. */
+		gtrec.rm_startblock = bno;
+		gtrec.rm_blockcount += len;
+		gtrec.rm_offset = offset;
+		error = xfs_rmap_insert(cur, gtrec.rm_startblock,
+				gtrec.rm_blockcount, gtrec.rm_owner,
+				gtrec.rm_offset, gtrec.rm_flags);
+		if (error)
+			goto out_error;
+	} else {
+		/*
+		 * No contiguous edge with identical owner, insert
+		 * a new record for this mapping.
+		 */
+		error = xfs_rmap_insert(cur, bno, len, owner, offset, flags);
+		if (error)
+			goto out_error;
+	}
+
+	trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+out_error:
+	if (error)
+		trace_xfs_rmap_map_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
 struct xfs_rmap_query_range_info {
 	xfs_rmap_query_range_fn	fn;
 	void				*priv;
@@ -1237,15 +2115,27 @@
 	case XFS_RMAP_MAP:
 		error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
 		break;
+	case XFS_RMAP_MAP_SHARED:
+		error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
+				&oinfo);
+		break;
 	case XFS_RMAP_FREE:
 	case XFS_RMAP_UNMAP:
 		error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
 				&oinfo);
 		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
+				&oinfo);
+		break;
 	case XFS_RMAP_CONVERT:
 		error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
 				&oinfo);
 		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		error = xfs_rmap_convert_shared(rcur, bno, blockcount,
+				!unwritten, &oinfo);
+		break;
 	default:
 		ASSERT(0);
 		error = -EFSCORRUPTED;
@@ -1263,9 +2153,10 @@
  */
 static bool
 xfs_rmap_update_is_needed(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	int			whichfork)
 {
-	return xfs_sb_version_hasrmapbt(&mp->m_sb);
+	return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;
 }
 
 /*
@@ -1311,10 +2202,11 @@
 	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return 0;
 
-	return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+			XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1327,10 +2219,11 @@
 	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return 0;
 
-	return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+			XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1343,10 +2236,11 @@
 	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return 0;
 
-	return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+			XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1362,7 +2256,7 @@
 {
 	struct xfs_bmbt_irec	bmap;
 
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
 		return 0;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
@@ -1386,7 +2280,7 @@
 {
 	struct xfs_bmbt_irec	bmap;
 
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
 		return 0;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 71cf99a..7899305 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -206,4 +206,11 @@
 		xfs_fsblock_t startblock, xfs_filblks_t blockcount,
 		xfs_exntst_t state, struct xfs_btree_cur **pcur);
 
+int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		uint64_t owner, uint64_t offset, unsigned int flags,
+		struct xfs_rmap_irec *irec, int	*stat);
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		uint64_t owner, uint64_t offset, unsigned int flags,
+		struct xfs_rmap_irec *irec, int	*stat);
+
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 17b8eeb..83e672f 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -35,6 +35,7 @@
 #include "xfs_cksum.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
 
 /*
  * Reverse map btree.
@@ -512,6 +513,83 @@
 xfs_rmapbt_compute_maxlevels(
 	struct xfs_mount		*mp)
 {
-	mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
-			mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+	/*
+	 * On a non-reflink filesystem, the maximum number of rmap
+	 * records is the number of blocks in the AG, hence the max
+	 * rmapbt height is log_$maxrecs($agblocks).  However, with
+	 * reflink each AG block can have up to 2^32 (per the refcount
+	 * record format) owners, which means that theoretically we
+	 * could face up to 2^64 rmap records.
+	 *
+	 * That effectively means that the max rmapbt height must be
+	 * XFS_BTREE_MAXLEVELS.  "Fortunately" we'll run out of AG
+	 * blocks to feed the rmapbt long before the rmapbt reaches
+	 * maximum height.  The reflink code uses ag_resv_critical to
+	 * disallow reflinking when less than 10% of the per-AG metadata
+	 * block reservation remains, since the fallback is a regular file copy.
+	 */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
+	else
+		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the rmap btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rmap_mxr[0] == 0)
+		return 0;
+
+	return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Add this btree's block reservation to *ask and its current size to *used.
+ */
+int
+xfs_rmapbt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_agf		*agf;
+	xfs_extlen_t		pool_len;
+	xfs_extlen_t		tree_len;
+	int			error;
+
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	/* Reserve 1% of the AG or enough for 1 block per record. */
+	pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
+	*ask += pool_len;
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+	xfs_buf_relse(agbp);
+
+	*used += tree_len;
+
+	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index e73a553..2a9ac47 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -58,4 +58,11 @@
 int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
 extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
+		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif	/* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4aecc5f..a70aec9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -38,6 +38,8 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -737,6 +739,13 @@
 	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
 	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
 
+	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+			true);
+	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+			false);
+	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
+	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+
 	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
 	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
 					sbp->sb_inopblock);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 0c5b30b..c6f4eb4 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@
 extern const struct xfs_buf_ops xfs_agfl_buf_ops;
 extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
 extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -122,6 +123,7 @@
 #define	XFS_INO_REF		2
 #define	XFS_ATTR_BTREE_REF	1
 #define	XFS_DQUOT_REF		1
+#define	XFS_REFC_BTREE_REF	1
 
 /*
  * Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 301ef2f..b456cca 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -67,13 +67,14 @@
  * Per-extent log reservation for the btree changes involved in freeing or
  * allocating an extent.  In classic XFS there were two trees that will be
  * modified (bnobt + cntbt).  With rmap enabled, there are three trees
- * (rmapbt).  The number of blocks reserved is based on the formula:
+ * (rmapbt).  With reflink, there are four trees (refcountbt).  The number of
+ * blocks reserved is based on the formula:
  *
  * num trees * ((2 blocks/level * max depth) - 1)
  *
  * Keep in mind that max depth is calculated separately for each type of tree.
  */
-static uint
+uint
 xfs_allocfree_log_count(
 	struct xfs_mount *mp,
 	uint		num_ops)
@@ -83,6 +84,8 @@
 	blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 		blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
 
 	return blocks;
 }
@@ -809,11 +812,18 @@
 	 * require a permanent reservation on space.
 	 */
 	resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
-	resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+	else
+		resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
 	resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
-	resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		resp->tr_itruncate.tr_logcount =
+				XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+	else
+		resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
 	resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -870,7 +880,10 @@
 	resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
-	resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+	else
+		resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
 	resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0eb46ed..b7e5357 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -87,6 +87,7 @@
 #define	XFS_DEFAULT_LOG_COUNT		1
 #define	XFS_DEFAULT_PERM_LOG_COUNT	2
 #define	XFS_ITRUNCATE_LOG_COUNT		2
+#define	XFS_ITRUNCATE_LOG_COUNT_REFLINK	8
 #define XFS_INACTIVE_LOG_COUNT		2
 #define	XFS_CREATE_LOG_COUNT		2
 #define	XFS_CREATE_TMPFILE_LOG_COUNT	2
@@ -96,11 +97,13 @@
 #define	XFS_LINK_LOG_COUNT		2
 #define	XFS_RENAME_LOG_COUNT		2
 #define	XFS_WRITE_LOG_COUNT		2
+#define	XFS_WRITE_LOG_COUNT_REFLINK	8
 #define	XFS_ADDAFORK_LOG_COUNT		2
 #define	XFS_ATTRINVAL_LOG_COUNT		1
 #define	XFS_ATTRSET_LOG_COUNT		3
 #define	XFS_ATTRRM_LOG_COUNT		3
 
 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
 
 #endif	/* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 41e0428..7917f6e 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,6 +21,8 @@
 /*
  * Components of space reservations.
  */
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
+		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
 #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
 		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
 #define	XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -28,6 +30,13 @@
 	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
 	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
 	  XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+	  XFS_EXTENTADD_SPACE_RES(mp,w) + \
+	 ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+	  (mp)->m_rmap_maxlevels)
 #define	XFS_DAENTER_1B(mp,w)	\
 	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
 #define	XFS_DAENTER_DBS(mp,w)	\
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3d50364..8d74870 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -90,6 +90,7 @@
  */
 #define	XFS_DATA_FORK	0
 #define	XFS_ATTR_FORK	1
+#define	XFS_COW_FORK	2
 
 /*
  * Min numbers of data/attr fork btree root pointers.
@@ -109,7 +110,7 @@
 
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
-	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 struct xfs_name {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4a28fa9..3e57a56 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
@@ -39,6 +40,7 @@
 /* flags for direct write completions */
 #define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
 #define XFS_DIO_FLAG_APPEND	(1 << 1)
+#define XFS_DIO_FLAG_COW	(1 << 2)
 
 /*
  * structure owned by writepages passed to individual writepage calls
@@ -287,6 +289,25 @@
 		error = -EIO;
 
 	/*
+	 * For a CoW extent, we need to move the mapping from the CoW fork
+	 * to the data fork.  If instead an error happened, just dump the
+	 * new blocks.
+	 */
+	if (ioend->io_type == XFS_IO_COW) {
+		if (error)
+			goto done;
+		if (ioend->io_bio->bi_error) {
+			error = xfs_reflink_cancel_cow_range(ip,
+					ioend->io_offset, ioend->io_size);
+			goto done;
+		}
+		error = xfs_reflink_end_cow(ip, ioend->io_offset,
+				ioend->io_size);
+		if (error)
+			goto done;
+	}
+
+	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
 	 * Detecting and handling completion IO errors is done individually
@@ -301,7 +322,8 @@
 	} else if (ioend->io_append_trans) {
 		error = xfs_setfilesize_ioend(ioend, error);
 	} else {
-		ASSERT(!xfs_ioend_is_append(ioend));
+		ASSERT(!xfs_ioend_is_append(ioend) ||
+		       ioend->io_type == XFS_IO_COW);
 	}
 
 done:
@@ -315,7 +337,7 @@
 	struct xfs_ioend	*ioend = bio->bi_private;
 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 
-	if (ioend->io_type == XFS_IO_UNWRITTEN)
+	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 	else if (ioend->io_append_trans)
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -341,6 +363,7 @@
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
+	ASSERT(type != XFS_IO_COW);
 	if (type == XFS_IO_UNWRITTEN)
 		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
@@ -355,6 +378,13 @@
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 				imap, &nimaps, bmapi_flags);
+	/*
+	 * Truncate an overwrite extent if there's a pending CoW
+	 * reservation before the end of this extent.  This forces us
+	 * to come back to writepage to take care of the CoW.
+	 */
+	if (nimaps && type == XFS_IO_OVERWRITE)
+		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (error)
@@ -362,7 +392,8 @@
 
 	if (type == XFS_IO_DELALLOC &&
 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
-		error = xfs_iomap_write_allocate(ip, offset, imap);
+		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
+				imap);
 		if (!error)
 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 		return error;
@@ -737,6 +768,56 @@
 	return;
 }
 
+static int
+xfs_map_cow(
+	struct xfs_writepage_ctx *wpc,
+	struct inode		*inode,
+	loff_t			offset,
+	unsigned int		*new_type)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_bmbt_irec	imap;
+	bool			is_cow = false, need_alloc = false;
+	int			error;
+
+	/*
+	 * If we already have a valid COW mapping keep using it.
+	 */
+	if (wpc->io_type == XFS_IO_COW) {
+		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
+		if (wpc->imap_valid) {
+			*new_type = XFS_IO_COW;
+			return 0;
+		}
+	}
+
+	/*
+	 * Else we need to check if there is a COW mapping at this offset.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!is_cow)
+		return 0;
+
+	/*
+	 * And if the COW mapping has a delayed extent here we need to
+	 * allocate real space for it now.
+	 */
+	if (need_alloc) {
+		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
+				&imap);
+		if (error)
+			return error;
+	}
+
+	wpc->io_type = *new_type = XFS_IO_COW;
+	wpc->imap_valid = true;
+	wpc->imap = imap;
+	return 0;
+}
+
 /*
  * We implement an immediate ioend submission policy here to avoid needing to
  * chain multiple ioends and hence nest mempool allocations which can violate
@@ -769,6 +850,7 @@
 	int			error = 0;
 	int			count = 0;
 	int			uptodate = 1;
+	unsigned int		new_type;
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
@@ -789,22 +871,13 @@
 			continue;
 		}
 
-		if (buffer_unwritten(bh)) {
-			if (wpc->io_type != XFS_IO_UNWRITTEN) {
-				wpc->io_type = XFS_IO_UNWRITTEN;
-				wpc->imap_valid = false;
-			}
-		} else if (buffer_delay(bh)) {
-			if (wpc->io_type != XFS_IO_DELALLOC) {
-				wpc->io_type = XFS_IO_DELALLOC;
-				wpc->imap_valid = false;
-			}
-		} else if (buffer_uptodate(bh)) {
-			if (wpc->io_type != XFS_IO_OVERWRITE) {
-				wpc->io_type = XFS_IO_OVERWRITE;
-				wpc->imap_valid = false;
-			}
-		} else {
+		if (buffer_unwritten(bh))
+			new_type = XFS_IO_UNWRITTEN;
+		else if (buffer_delay(bh))
+			new_type = XFS_IO_DELALLOC;
+		else if (buffer_uptodate(bh))
+			new_type = XFS_IO_OVERWRITE;
+		else {
 			if (PageUptodate(page))
 				ASSERT(buffer_mapped(bh));
 			/*
@@ -817,6 +890,17 @@
 			continue;
 		}
 
+		if (xfs_is_reflink_inode(XFS_I(inode))) {
+			error = xfs_map_cow(wpc, inode, offset, &new_type);
+			if (error)
+				goto out;
+		}
+
+		if (wpc->io_type != new_type) {
+			wpc->io_type = new_type;
+			wpc->imap_valid = false;
+		}
+
 		if (wpc->imap_valid)
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 							 offset);
@@ -1107,18 +1191,24 @@
 	struct inode		*inode,
 	struct buffer_head	*bh_result,
 	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
+	xfs_off_t		offset,
+	bool			is_cow)
 {
 	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
 	xfs_off_t		size = bh_result->b_size;
 
 	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
-		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
+		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
+		XFS_IO_OVERWRITE, imap);
 
 	if (ISUNWRITTEN(imap)) {
 		*flags |= XFS_DIO_FLAG_UNWRITTEN;
 		set_buffer_defer_completion(bh_result);
-	} else if (offset + size > i_size_read(inode) || offset + size < 0) {
+	} else if (is_cow) {
+		*flags |= XFS_DIO_FLAG_COW;
+		set_buffer_defer_completion(bh_result);
+	}
+	if (offset + size > i_size_read(inode) || offset + size < 0) {
 		*flags |= XFS_DIO_FLAG_APPEND;
 		set_buffer_defer_completion(bh_result);
 	}
@@ -1164,6 +1254,44 @@
 	bh_result->b_size = mapping_size;
 }
 
+/* Bounce unaligned directio writes to the page cache. */
+static int
+xfs_bounce_unaligned_dio_write(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_bmbt_irec	irec;
+	xfs_fileoff_t		delta;
+	bool			shared;
+	bool			x;
+	int			error;
+
+	irec = *imap;
+	if (offset_fsb > irec.br_startoff) {
+		delta = offset_fsb - irec.br_startoff;
+		irec.br_blockcount -= delta;
+		irec.br_startblock += delta;
+		irec.br_startoff = offset_fsb;
+	}
+	error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
+	if (error)
+		return error;
+
+	/*
+	 * We're here because we're trying to do a directio write to a
+	 * region that isn't aligned to a filesystem block.  If any part
+	 * of the extent is shared, fall back to buffered mode to handle
+	 * the RMW.  This is done by returning -EREMCHG ("remote addr
+	 * changed"), which is caught further up the call stack.
+	 */
+	if (shared) {
+		trace_xfs_reflink_bounce_dio_write(ip, imap);
+		return -EREMCHG;
+	}
+	return 0;
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1183,6 +1311,8 @@
 	xfs_off_t		offset;
 	ssize_t			size;
 	int			new = 0;
+	bool			is_cow = false;
+	bool			need_alloc = false;
 
 	BUG_ON(create && !direct);
 
@@ -1208,8 +1338,26 @@
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
-				&imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (create && direct && xfs_is_reflink_inode(ip))
+		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
+					&need_alloc);
+	if (!is_cow) {
+		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+					&imap, &nimaps, XFS_BMAPI_ENTIRE);
+		/*
+		 * Truncate an overwrite extent if there's a pending CoW
+		 * reservation before the end of this extent.  This
+		 * forces us to come back to get_blocks to take care of
+		 * the CoW.
+		 */
+		if (create && direct && nimaps &&
+		    imap.br_startblock != HOLESTARTBLOCK &&
+		    imap.br_startblock != DELAYSTARTBLOCK &&
+		    !ISUNWRITTEN(&imap))
+			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
+					&imap);
+	}
+	ASSERT(!need_alloc);
 	if (error)
 		goto out_unlock;
 
@@ -1261,6 +1409,13 @@
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK &&
 	    (create || !ISUNWRITTEN(&imap))) {
+		if (create && direct && !is_cow) {
+			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
+					&imap);
+			if (error)
+				return error;
+		}
+
 		xfs_map_buffer(inode, bh_result, &imap, offset);
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
@@ -1269,7 +1424,8 @@
 			if (dax_fault)
 				ASSERT(!ISUNWRITTEN(&imap));
 			else
-				xfs_map_direct(inode, bh_result, &imap, offset);
+				xfs_map_direct(inode, bh_result, &imap, offset,
+						is_cow);
 		}
 	}
 
@@ -1391,11 +1547,14 @@
 		i_size_write(inode, offset + size);
 	spin_unlock(&ip->i_flags_lock);
 
+	if (flags & XFS_DIO_FLAG_COW)
+		error = xfs_reflink_end_cow(ip, offset, size);
 	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
 		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (flags & XFS_DIO_FLAG_APPEND) {
+	}
+	if (flags & XFS_DIO_FLAG_APPEND) {
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
 		error = xfs_setfilesize(ip, offset, size);
@@ -1425,6 +1584,17 @@
 
 	trace_xfs_vm_bmap(XFS_I(inode));
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	/*
+	 * The swap code (ab-)uses ->bmap to get a block mapping and then
+	 * bypasses the file system for actual I/O.  We really can't allow
+	 * that on reflink inodes, so we have to skip out here.  And yes,
+	 * 0 is the magic code for a bmap error..
+	 */
+	if (xfs_is_reflink_inode(ip)) {
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+		return 0;
+	}
 	filemap_write_and_wait(mapping);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 1950e3b..b3c6634 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -28,13 +28,15 @@
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
+	XFS_IO_COW,		/* covers copy-on-write extent */
 };
 
 #define XFS_IO_TYPES \
 	{ XFS_IO_INVALID,		"invalid" }, \
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
-	{ XFS_IO_OVERWRITE,		"overwrite" }
+	{ XFS_IO_OVERWRITE,		"overwrite" }, \
+	{ XFS_IO_COW,			"CoW" }
 
 /*
  * Structure for buffered I/O completions.
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
new file mode 100644
index 0000000..9bf57c7
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+
+
+kmem_zone_t	*xfs_bui_zone;
+kmem_zone_t	*xfs_bud_zone;
+
+static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_bui_log_item, bui_item);
+}
+
+void
+xfs_bui_item_free(
+	struct xfs_bui_log_item	*buip)
+{
+	kmem_zone_free(xfs_bui_zone, buip);
+}
+
+STATIC void
+xfs_bui_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_bui_log_item	*buip = BUI_ITEM(lip);
+
+	*nvecs += 1;
+	*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bui log item. We use only 1 iovec, and we point that
+ * at the bui_log_format structure embedded in the bui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bui item have been filled.
+ */
+STATIC void
+xfs_bui_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_bui_log_item	*buip = BUI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(atomic_read(&buip->bui_next_extent) ==
+			buip->bui_format.bui_nextents);
+
+	buip->bui_format.bui_type = XFS_LI_BUI;
+	buip->bui_format.bui_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format,
+			xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents));
+}
+
+/*
+ * Pinning has no meaning for a bui item, so just return.
+ */
+STATIC void
+xfs_bui_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * The unpin operation is the last place a BUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the BUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the BUI to either construct
+ * and commit the BUD or drop the BUD's reference in the event of error. Simply
+ * drop the log's BUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_bui_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_bui_log_item	*buip = BUI_ITEM(lip);
+
+	xfs_bui_release(buip);
+}
+
+/*
+ * BUI items have no locking or pushing.  However, since BUIs are pulled from
+ * the AIL when their corresponding BUDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the BUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_bui_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The BUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, a BUD isn't going to be
+ * constructed and thus we free the BUI here directly.
+ */
+STATIC void
+xfs_bui_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_bui_item_free(BUI_ITEM(lip));
+}
+
+/*
+ * The BUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_bui_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+/*
+ * The BUI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_bui_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all bui log items.
+ */
+static const struct xfs_item_ops xfs_bui_item_ops = {
+	.iop_size	= xfs_bui_item_size,
+	.iop_format	= xfs_bui_item_format,
+	.iop_pin	= xfs_bui_item_pin,
+	.iop_unpin	= xfs_bui_item_unpin,
+	.iop_unlock	= xfs_bui_item_unlock,
+	.iop_committed	= xfs_bui_item_committed,
+	.iop_push	= xfs_bui_item_push,
+	.iop_committing = xfs_bui_item_committing,
+};
+
+/*
+ * Allocate and initialize a bui item with XFS_BUI_MAX_FAST_EXTENTS extents.
+ */
+struct xfs_bui_log_item *
+xfs_bui_init(
+	struct xfs_mount		*mp)
+
+{
+	struct xfs_bui_log_item		*buip;
+
+	buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+
+	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
+	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
+	buip->bui_format.bui_id = (uintptr_t)(void *)buip;
+	atomic_set(&buip->bui_next_extent, 0);
+	atomic_set(&buip->bui_refcount, 2);
+
+	return buip;
+}
+
+/*
+ * Freeing the BUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the BUI may not yet have been placed in the AIL
+ * when called by xfs_bui_release() from BUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the BUI.
+ */
+void
+xfs_bui_release(
+	struct xfs_bui_log_item	*buip)
+{
+	if (atomic_dec_and_test(&buip->bui_refcount)) {
+		xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
+		xfs_bui_item_free(buip);
+	}
+}
+
+static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_bud_log_item, bud_item);
+}
+
+STATIC void
+xfs_bud_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_bud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bud log item. We use only 1 iovec, and we point that
+ * at the bud_log_format structure embedded in the bud item.
+ * Unlike the bui item, no extent-slot check is made here; the format
+ * structure is simply copied at its fixed size.
+ */
+STATIC void
+xfs_bud_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	budp->bud_format.bud_type = XFS_LI_BUD;
+	budp->bud_format.bud_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format,
+			sizeof(struct xfs_bud_log_format));
+}
+
+/*
+ * Pinning has no meaning for a bud item, so just return.
+ */
+STATIC void
+xfs_bud_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for a bud item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_bud_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on a bud item.  It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_bud_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The BUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the BUI and free the
+ * BUD.
+ */
+STATIC void
+xfs_bud_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
+
+	if (lip->li_flags & XFS_LI_ABORTED) {
+		xfs_bui_release(budp->bud_buip);
+		kmem_zone_free(xfs_bud_zone, budp);
+	}
+}
+
+/*
+ * When the bud item is committed to disk, all we need to do is delete our
+ * reference to our partner bui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_bud_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
+
+	/*
+	 * Drop the BUI reference regardless of whether the BUD has been
+	 * aborted. Once the BUD transaction is constructed, it is the sole
+	 * responsibility of the BUD to release the BUI (even if the BUI is
+	 * aborted due to log I/O error).
+	 */
+	xfs_bui_release(budp->bud_buip);
+	kmem_zone_free(xfs_bud_zone, budp);
+
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The BUD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_bud_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all bud log items.
+ */
+static const struct xfs_item_ops xfs_bud_item_ops = {
+	.iop_size	= xfs_bud_item_size,
+	.iop_format	= xfs_bud_item_format,
+	.iop_pin	= xfs_bud_item_pin,
+	.iop_unpin	= xfs_bud_item_unpin,
+	.iop_unlock	= xfs_bud_item_unlock,
+	.iop_committed	= xfs_bud_item_committed,
+	.iop_push	= xfs_bud_item_push,
+	.iop_committing = xfs_bud_item_committing,
+};
+
+/*
+ * Allocate and initialize a bud item tied to the given bui item.
+ */
+struct xfs_bud_log_item *
+xfs_bud_init(
+	struct xfs_mount		*mp,
+	struct xfs_bui_log_item		*buip)
+
+{
+	struct xfs_bud_log_item	*budp;
+
+	budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
+	xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops);
+	budp->bud_buip = buip;
+	budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+	return budp;
+}
+
+/*
+ * Process a bmap update intent item that was recovered from the log.
+ * We need to update some inode's bmbt.
+ */
+int
+xfs_bui_recover(
+	struct xfs_mount		*mp,
+	struct xfs_bui_log_item		*buip)
+{
+	int				error = 0;
+	unsigned int			bui_type;
+	struct xfs_map_extent		*bmap;
+	xfs_fsblock_t			startblock_fsb;
+	xfs_fsblock_t			inode_fsb;
+	bool				op_ok;
+	struct xfs_bud_log_item		*budp;
+	enum xfs_bmap_intent_type	type;
+	int				whichfork;
+	xfs_exntst_t			state;
+	struct xfs_trans		*tp;
+	struct xfs_inode		*ip = NULL;
+	struct xfs_defer_ops		dfops;
+	xfs_fsblock_t			firstfsb;
+
+	ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+
+	/* Only one mapping operation per BUI... */
+	if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+		set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+		xfs_bui_release(buip);
+		return -EIO;
+	}
+
+	/*
+	 * First check the validity of the extent described by the
+	 * BUI.  If anything is bad, then toss the BUI.
+	 */
+	bmap = &buip->bui_format.bui_extents[0];
+	startblock_fsb = XFS_BB_TO_FSB(mp,
+			   XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
+	inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
+			XFS_INO_TO_FSB(mp, bmap->me_owner)));
+	switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		op_ok = true;
+		break;
+	default:
+		op_ok = false;
+		break;
+	}
+	if (!op_ok || startblock_fsb == 0 ||
+	    bmap->me_len == 0 ||
+	    inode_fsb == 0 ||
+	    startblock_fsb >= mp->m_sb.sb_dblocks ||
+	    bmap->me_len >= mp->m_sb.sb_agblocks ||
+	    inode_fsb >= mp->m_sb.sb_dblocks ||
+	    (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
+		/*
+		 * This will pull the BUI from the AIL and
+		 * free the memory associated with it.
+		 */
+		set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+		xfs_bui_release(buip);
+		return -EIO;
+	}
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	if (error)
+		return error;
+	budp = xfs_trans_get_bud(tp, buip);
+
+	/* Grab the inode. */
+	error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
+	if (error)
+		goto err_inode;
+
+	if (VFS_I(ip)->i_nlink == 0)
+		xfs_iflags_set(ip, XFS_IRECOVERY);
+	xfs_defer_init(&dfops, &firstfsb);
+
+	/* Process deferred bmap item. */
+	state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+			XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+	whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+			XFS_ATTR_FORK : XFS_DATA_FORK;
+	bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+	switch (bui_type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		type = bui_type;
+		break;
+	default:
+		error = -EFSCORRUPTED;
+		goto err_dfops;
+	}
+	xfs_trans_ijoin(tp, ip, 0);
+
+	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+			ip, whichfork, bmap->me_startoff,
+			bmap->me_startblock, bmap->me_len,
+			state);
+	if (error)
+		goto err_dfops;
+
+	/* Finish transaction, free inodes. */
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto err_dfops;
+
+	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
+
+	return error;
+
+err_dfops:
+	xfs_defer_cancel(&dfops);
+err_inode:
+	xfs_trans_cancel(tp);
+	if (ip) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		IRELE(ip);
+	}
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
new file mode 100644
index 0000000..c867daa
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef	__XFS_BMAP_ITEM_H__
+#define	__XFS_BMAP_ITEM_H__
+
+/*
+ * There are (currently) two pairs of bmap btree redo item types: map & unmap.
+ * The common abbreviations for these are BUI (bmap update intent) and BUD
+ * (bmap update done).  The redo item type is encoded in the flags field of
+ * each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * bmbt metadata updates in the non-first transaction.
+ */
+
+/* kernel only BUI/BUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define	XFS_BUI_MAX_FAST_EXTENTS	1
+
+/*
+ * Define BUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define	XFS_BUI_RECOVERED		1
+
+/*
+ * This is the "bmap update intent" log item.  It is used to log the fact that
+ * some bmbt mappings need to change.  It is used in conjunction with the
+ * "bmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_bui_log_item {
+	struct xfs_log_item		bui_item;
+	atomic_t			bui_refcount;
+	atomic_t			bui_next_extent;
+	unsigned long			bui_flags;	/* misc flags */
+	struct xfs_bui_log_format	bui_format;
+};
+
+static inline size_t
+xfs_bui_log_item_sizeof(
+	unsigned int		nr)
+{
+	return offsetof(struct xfs_bui_log_item, bui_format) +
+			xfs_bui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "bmap update done" log item.  It is used to log the fact that
+ * some bmbt updates mentioned in an earlier bui item have been performed.
+ */
+struct xfs_bud_log_item {
+	struct xfs_log_item		bud_item;
+	struct xfs_bui_log_item		*bud_buip;
+	struct xfs_bud_log_format	bud_format;
+};
+
+extern struct kmem_zone	*xfs_bui_zone;
+extern struct kmem_zone	*xfs_bud_zone;
+
+struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
+struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
+		struct xfs_bui_log_item *);
+void xfs_bui_item_free(struct xfs_bui_log_item *);
+void xfs_bui_release(struct xfs_bui_log_item *);
+int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip);
+
+#endif	/* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e827d65..552465e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -42,6 +42,9 @@
 #include "xfs_icache.h"
 #include "xfs_log.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_iomap.h"
+#include "xfs_reflink.h"
+#include "xfs_refcount.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -389,11 +392,13 @@
 STATIC int
 xfs_getbmapx_fix_eof_hole(
 	xfs_inode_t		*ip,		/* xfs incore inode pointer */
+	int			whichfork,
 	struct getbmapx		*out,		/* output structure */
 	int			prealloced,	/* this is a file with
 						 * preallocated data space */
 	__int64_t		end,		/* last block requested */
-	xfs_fsblock_t		startblock)
+	xfs_fsblock_t		startblock,
+	bool			moretocome)
 {
 	__int64_t		fixlen;
 	xfs_mount_t		*mp;		/* file system mount point */
@@ -418,8 +423,9 @@
 		else
 			out->bmv_block = xfs_fsb_to_db(ip, startblock);
 		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
-		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+		ifp = XFS_IFORK_PTR(ip, whichfork);
+		if (!moretocome &&
+		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
 		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
 			out->bmv_oflags |= BMV_OF_LAST;
 	}
@@ -427,6 +433,81 @@
 	return 1;
 }
 
+/* Adjust the reported bmap around shared/unshared extent transitions. */
+STATIC int
+xfs_getbmap_adjust_shared(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	struct xfs_bmbt_irec		*map,
+	struct getbmapx			*out,
+	struct xfs_bmbt_irec		*next_map)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			ebno;
+	xfs_extlen_t			elen;
+	xfs_extlen_t			nlen;
+	int				error;
+
+	next_map->br_startblock = NULLFSBLOCK;
+	next_map->br_startoff = NULLFILEOFF;
+	next_map->br_blockcount = 0;
+
+	/* Only written data blocks can be shared. */
+	if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
+	    map->br_startblock == DELAYSTARTBLOCK ||
+	    map->br_startblock == HOLESTARTBLOCK ||
+	    ISUNWRITTEN(map))
+		return 0;
+
+	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
+	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
+	error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount,
+			&ebno, &elen, true);
+	if (error)
+		return error;
+
+	if (ebno == NULLAGBLOCK) {
+		/* No shared blocks at all. */
+		return 0;
+	} else if (agbno == ebno) {
+		/*
+		 * Shared extent at (agbno, elen).  Shrink the reported
+		 * extent length and prepare to move the start of map[i]
+		 * to agbno+elen, with the aim of (re)formatting the new
+		 * map[i] the next time through the inner loop.
+		 */
+		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
+		out->bmv_oflags |= BMV_OF_SHARED;
+		if (elen != map->br_blockcount) {
+			*next_map = *map;
+			next_map->br_startblock += elen;
+			next_map->br_startoff += elen;
+			next_map->br_blockcount -= elen;
+		}
+		map->br_blockcount -= elen;
+	} else {
+		/*
+		 * There's an unshared extent (agbno, ebno - agbno)
+		 * followed by shared extent at (ebno, elen).  Shrink
+		 * the reported extent length to cover only the unshared
+		 * extent and prepare to move up the start of map[i] to
+		 * ebno, with the aim of (re)formatting the new map[i]
+		 * the next time through the inner loop.
+		 */
+		*next_map = *map;
+		nlen = ebno - agbno;
+		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
+		next_map->br_startblock += nlen;
+		next_map->br_startoff += nlen;
+		next_map->br_blockcount -= nlen;
+		map->br_blockcount -= nlen;
+	}
+
+	return 0;
+}
+
 /*
  * Get inode's extents as described in bmv, and format for output.
  * Calls formatter to fill the user's buffer until all extents
@@ -459,12 +540,28 @@
 	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
 	int			cur_ext = 0;
+	struct xfs_bmbt_irec	inject_map;
 
 	mp = ip->i_mount;
 	iflags = bmv->bmv_iflags;
-	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
-	if (whichfork == XFS_ATTR_FORK) {
+#ifndef DEBUG
+	/* Only allow CoW fork queries if we're debugging. */
+	if (iflags & BMV_IF_COWFORK)
+		return -EINVAL;
+#endif
+	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
+		return -EINVAL;
+
+	if (iflags & BMV_IF_ATTRFORK)
+		whichfork = XFS_ATTR_FORK;
+	else if (iflags & BMV_IF_COWFORK)
+		whichfork = XFS_COW_FORK;
+	else
+		whichfork = XFS_DATA_FORK;
+
+	switch (whichfork) {
+	case XFS_ATTR_FORK:
 		if (XFS_IFORK_Q(ip)) {
 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
 			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
@@ -480,7 +577,20 @@
 
 		prealloced = 0;
 		fixlen = 1LL << 32;
-	} else {
+		break;
+	case XFS_COW_FORK:
+		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
+			return -EINVAL;
+
+		if (xfs_get_cowextsz_hint(ip)) {
+			prealloced = 1;
+			fixlen = mp->m_super->s_maxbytes;
+		} else {
+			prealloced = 0;
+			fixlen = XFS_ISIZE(ip);
+		}
+		break;
+	default:
 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -494,6 +604,7 @@
 			prealloced = 0;
 			fixlen = XFS_ISIZE(ip);
 		}
+		break;
 	}
 
 	if (bmv->bmv_length == -1) {
@@ -520,7 +631,8 @@
 		return -ENOMEM;
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	if (whichfork == XFS_DATA_FORK) {
+	switch (whichfork) {
+	case XFS_DATA_FORK:
 		if (!(iflags & BMV_IF_DELALLOC) &&
 		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
 			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -538,8 +650,14 @@
 		}
 
 		lock = xfs_ilock_data_map_shared(ip);
-	} else {
+		break;
+	case XFS_COW_FORK:
+		lock = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, lock);
+		break;
+	case XFS_ATTR_FORK:
 		lock = xfs_ilock_attr_map_shared(ip);
+		break;
 	}
 
 	/*
@@ -581,7 +699,8 @@
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
-		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+		for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
+				cur_ext < bmv->bmv_count; i++) {
 			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
 				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@@ -614,9 +733,16 @@
 				goto out_free_map;
 			}
 
-			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
-					prealloced, bmvend,
-					map[i].br_startblock))
+			/* Is this a shared block? */
+			error = xfs_getbmap_adjust_shared(ip, whichfork,
+					&map[i], &out[cur_ext], &inject_map);
+			if (error)
+				goto out_free_map;
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
+					&out[cur_ext], prealloced, bmvend,
+					map[i].br_startblock,
+					inject_map.br_startblock != NULLFSBLOCK))
 				goto out_free_map;
 
 			bmv->bmv_offset =
@@ -636,11 +762,16 @@
 				continue;
 			}
 
-			nexleft--;
+			if (inject_map.br_startblock != NULLFSBLOCK) {
+				map[i] = inject_map;
+				i--;
+			} else
+				nexleft--;
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
-	} while (nmap && nexleft && bmv->bmv_length);
+	} while (nmap && nexleft && bmv->bmv_length &&
+		 cur_ext < bmv->bmv_count);
 
  out_free_map:
 	kmem_free(map);
@@ -1433,8 +1564,8 @@
  */
 static int
 xfs_swap_extents_check_format(
-	xfs_inode_t	*ip,	/* target inode */
-	xfs_inode_t	*tip)	/* tmp inode */
+	struct xfs_inode	*ip,	/* target inode */
+	struct xfs_inode	*tip)	/* tmp inode */
 {
 
 	/* Should never get a local format */
@@ -1450,6 +1581,13 @@
 		return -EINVAL;
 
 	/*
+	 * If we have to use the (expensive) rmap swap method, we can
+	 * handle any number of extents and any format.
+	 */
+	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+		return 0;
+
+	/*
 	 * if the target inode is in extent form and the temp inode is in btree
 	 * form then we will end up with the target inode in the wrong format
 	 * as we already know there are less extents in the temp inode.
@@ -1518,125 +1656,161 @@
 	return 0;
 }
 
-int
-xfs_swap_extents(
-	xfs_inode_t	*ip,	/* target inode */
-	xfs_inode_t	*tip,	/* tmp inode */
-	xfs_swapext_t	*sxp)
+/*
+ * Move extents from one file to another, when rmap is enabled.
+ */
+STATIC int
+xfs_swap_extent_rmap(
+	struct xfs_trans		**tpp,
+	struct xfs_inode		*ip,
+	struct xfs_inode		*tip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_trans_t	*tp;
-	xfs_bstat_t	*sbp = &sxp->sx_stat;
-	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	int		src_log_flags, target_log_flags;
-	int		error = 0;
-	int		aforkblks = 0;
-	int		taforkblks = 0;
-	__uint64_t	tmp;
-	int		lock_flags;
-
-	/* XXX: we can't do this with rmap, will fix later */
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		return -EOPNOTSUPP;
-
-	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-	if (!tempifp) {
-		error = -ENOMEM;
-		goto out;
-	}
+	struct xfs_bmbt_irec		irec;
+	struct xfs_bmbt_irec		uirec;
+	struct xfs_bmbt_irec		tirec;
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	xfs_filblks_t			count_fsb;
+	xfs_fsblock_t			firstfsb;
+	struct xfs_defer_ops		dfops;
+	int				error;
+	xfs_filblks_t			ilen;
+	xfs_filblks_t			rlen;
+	int				nimaps;
+	__uint64_t			tip_flags2;
 
 	/*
-	 * Lock the inodes against other IO, page faults and truncate to
-	 * begin with.  Then we can ensure the inodes are flushed and have no
-	 * page cache safely. Once we have done this we can take the ilocks and
-	 * do the rest of the checks.
+	 * If the source file has shared blocks, we must flag the donor
+	 * file as having shared blocks so that we get the shared-block
+	 * rmap functions when we go to fix up the rmaps.  The flags
+	 * will be switched for real later.
 	 */
-	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
+	tip_flags2 = tip->i_d.di_flags2;
+	if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
+		tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
 
-	/* Verify that both files have the same format */
-	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
-		error = -EINVAL;
-		goto out_unlock;
+	offset_fsb = 0;
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
+	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+	while (count_fsb) {
+		/* Read extent from the donor file */
+		nimaps = 1;
+		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
+				&nimaps, 0);
+		if (error)
+			goto out;
+		ASSERT(nimaps == 1);
+		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
+
+		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
+		ilen = tirec.br_blockcount;
+
+		/* Swap mappings with the source file, piece by piece. */
+		while (tirec.br_blockcount) {
+			xfs_defer_init(&dfops, &firstfsb);
+			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
+
+			/* Read extent from the source file */
+			nimaps = 1;
+			error = xfs_bmapi_read(ip, tirec.br_startoff,
+					tirec.br_blockcount, &irec,
+					&nimaps, 0);
+			if (error)
+				goto out_defer;
+			ASSERT(nimaps == 1);
+			ASSERT(tirec.br_startoff == irec.br_startoff);
+			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
+
+			/* Trim the extent. */
+			uirec = tirec;
+			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
+					tirec.br_blockcount,
+					irec.br_blockcount);
+			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+
+			/* Remove the mapping from the donor file. */
+			error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
+					tip, &uirec);
+			if (error)
+				goto out_defer;
+
+			/* Remove the mapping from the source file. */
+			error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
+					ip, &irec);
+			if (error)
+				goto out_defer;
+
+			/* Map the donor file's blocks into the source file. */
+			error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
+					ip, &uirec);
+			if (error)
+				goto out_defer;
+
+			/* Map the source file's blocks into the donor file. */
+			error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
+					tip, &irec);
+			if (error)
+				goto out_defer;
+
+			error = xfs_defer_finish(tpp, &dfops, ip);
+			if (error)
+				goto out_defer;
+
+			tirec.br_startoff += rlen;
+			if (tirec.br_startblock != HOLESTARTBLOCK &&
+			    tirec.br_startblock != DELAYSTARTBLOCK)
+				tirec.br_startblock += rlen;
+			tirec.br_blockcount -= rlen;
+		}
+
+		/* Roll on... */
+		count_fsb -= ilen;
+		offset_fsb += ilen;
 	}
 
-	/* Verify both files are either real-time or non-realtime */
-	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
+	tip->i_d.di_flags2 = tip_flags2;
+	return 0;
 
-	error = xfs_swap_extent_flush(ip);
-	if (error)
-		goto out_unlock;
-	error = xfs_swap_extent_flush(tip);
-	if (error)
-		goto out_unlock;
+out_defer:
+	xfs_defer_cancel(&dfops);
+out:
+	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
+	tip->i_d.di_flags2 = tip_flags2;
+	return error;
+}
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-	if (error)
-		goto out_unlock;
+/* Swap the extents of two files by swapping data forks. */
+STATIC int
+xfs_swap_extent_forks(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_inode	*tip,
+	int			*src_log_flags,
+	int			*target_log_flags)
+{
+	struct xfs_ifork	tempifp, *ifp, *tifp;
+	int			aforkblks = 0;
+	int			taforkblks = 0;
+	__uint64_t		tmp;
+	int			error;
 
 	/*
-	 * Lock and join the inodes to the tansaction so that transaction commit
-	 * or cancel will unlock the inodes from this point onwards.
-	 */
-	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-	lock_flags |= XFS_ILOCK_EXCL;
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ijoin(tp, tip, lock_flags);
-
-
-	/* Verify all data are being swapped */
-	if (sxp->sx_offset != 0 ||
-	    sxp->sx_length != ip->i_d.di_size ||
-	    sxp->sx_length != tip->i_d.di_size) {
-		error = -EFAULT;
-		goto out_trans_cancel;
-	}
-
-	trace_xfs_swap_extent_before(ip, 0);
-	trace_xfs_swap_extent_before(tip, 1);
-
-	/* check inode formats now that data is flushed */
-	error = xfs_swap_extents_check_format(ip, tip);
-	if (error) {
-		xfs_notice(mp,
-		    "%s: inode 0x%llx format is incompatible for exchanging.",
-				__func__, ip->i_ino);
-		goto out_trans_cancel;
-	}
-
-	/*
-	 * Compare the current change & modify times with that
-	 * passed in.  If they differ, we abort this swap.
-	 * This is the mechanism used to ensure the calling
-	 * process that the file was not changed out from
-	 * under it.
-	 */
-	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
-	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
-	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
-	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-		error = -EBUSY;
-		goto out_trans_cancel;
-	}
-	/*
 	 * Count the number of extended attribute blocks
 	 */
 	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
 	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK,
+				&aforkblks);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
 	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
-			&taforkblks);
+				&taforkblks);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 
 	/*
@@ -1645,31 +1819,23 @@
 	 * buffers, and so the validation done on read will expect the owner
 	 * field to be correctly set. Once we change the owners, we can swap the
 	 * inode forks.
-	 *
-	 * Note the trickiness in setting the log flags - we set the owner log
-	 * flag on the opposite inode (i.e. the inode we are setting the new
-	 * owner to be) because once we swap the forks and log that, log
-	 * recovery is going to see the fork as owned by the swapped inode,
-	 * not the pre-swapped inodes.
 	 */
-	src_log_flags = XFS_ILOG_CORE;
-	target_log_flags = XFS_ILOG_CORE;
 	if (ip->i_d.di_version == 3 &&
 	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		target_log_flags |= XFS_ILOG_DOWNER;
+		(*target_log_flags) |= XFS_ILOG_DOWNER;
 		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
 					      tip->i_ino, NULL);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 
 	if (tip->i_d.di_version == 3 &&
 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		src_log_flags |= XFS_ILOG_DOWNER;
+		(*src_log_flags) |= XFS_ILOG_DOWNER;
 		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
 					      ip->i_ino, NULL);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 
 	/*
@@ -1677,9 +1843,9 @@
 	 */
 	ifp = &ip->i_df;
 	tifp = &tip->i_df;
-	*tempifp = *ifp;	/* struct copy */
+	tempifp = *ifp;		/* struct copy */
 	*ifp = *tifp;		/* struct copy */
-	*tifp = *tempifp;	/* struct copy */
+	*tifp = tempifp;	/* struct copy */
 
 	/*
 	 * Fix the on-disk inode values
@@ -1719,12 +1885,12 @@
 			ifp->if_u1.if_extents =
 				ifp->if_u2.if_inline_ext;
 		}
-		src_log_flags |= XFS_ILOG_DEXT;
+		(*src_log_flags) |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		ASSERT(ip->i_d.di_version < 3 ||
-		       (src_log_flags & XFS_ILOG_DOWNER));
-		src_log_flags |= XFS_ILOG_DBROOT;
+		       (*src_log_flags & XFS_ILOG_DOWNER));
+		(*src_log_flags) |= XFS_ILOG_DBROOT;
 		break;
 	}
 
@@ -1738,15 +1904,166 @@
 			tifp->if_u1.if_extents =
 				tifp->if_u2.if_inline_ext;
 		}
-		target_log_flags |= XFS_ILOG_DEXT;
+		(*target_log_flags) |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		target_log_flags |= XFS_ILOG_DBROOT;
+		(*target_log_flags) |= XFS_ILOG_DBROOT;
 		ASSERT(tip->i_d.di_version < 3 ||
-		       (target_log_flags & XFS_ILOG_DOWNER));
+		       (*target_log_flags & XFS_ILOG_DOWNER));
 		break;
 	}
 
+	return 0;
+}
+
+int
+xfs_swap_extents(
+	struct xfs_inode	*ip,	/* target inode */
+	struct xfs_inode	*tip,	/* tmp inode */
+	struct xfs_swapext	*sxp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_bstat	*sbp = &sxp->sx_stat;
+	int			src_log_flags, target_log_flags;
+	int			error = 0;
+	int			lock_flags;
+	struct xfs_ifork	*cowfp;
+	__uint64_t		f;
+	int			resblks;
+
+	/*
+	 * Lock the inodes against other IO, page faults and truncate to
+	 * begin with.  Then we can ensure the inodes are flushed and have no
+	 * page cache safely. Once we have done this we can take the ilocks and
+	 * do the rest of the checks.
+	 */
+	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
+
+	/* Verify that both files have the same file type */
+	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Verify both files are either real-time or non-realtime */
+	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	error = xfs_swap_extent_flush(ip);
+	if (error)
+		goto out_unlock;
+	error = xfs_swap_extent_flush(tip);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Extent "swapping" with rmap requires a permanent reservation and
+	 * a block reservation because it's really just a remap operation
+	 * performed with log redo items!
+	 */
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		/*
+		 * Conceptually this shouldn't affect the shape of either
+		 * bmbt, but since we atomically move extents one by one,
+		 * we reserve enough space to rebuild both trees.
+		 */
+		resblks = XFS_SWAP_RMAP_SPACE_RES(mp,
+				XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK),
+				XFS_DATA_FORK) +
+			  XFS_SWAP_RMAP_SPACE_RES(mp,
+				XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+				XFS_DATA_FORK);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+				0, 0, &tp);
+	} else
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+				0, 0, &tp);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Lock the inodes and join them to the transaction with no lock flags;
+	 * they are unlocked explicitly on the way out of this function.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_ijoin(tp, tip, 0);
+
+
+	/* Verify all data are being swapped */
+	if (sxp->sx_offset != 0 ||
+	    sxp->sx_length != ip->i_d.di_size ||
+	    sxp->sx_length != tip->i_d.di_size) {
+		error = -EFAULT;
+		goto out_trans_cancel;
+	}
+
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
+	/* check inode formats now that data is flushed */
+	error = xfs_swap_extents_check_format(ip, tip);
+	if (error) {
+		xfs_notice(mp,
+		    "%s: inode 0x%llx format is incompatible for exchanging.",
+				__func__, ip->i_ino);
+		goto out_trans_cancel;
+	}
+
+	/*
+	 * Compare the current change & modify times with that
+	 * passed in.  If they differ, we abort this swap.
+	 * This is the mechanism used to ensure the calling
+	 * process that the file was not changed out from
+	 * under it.
+	 */
+	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+		error = -EBUSY;
+		goto out_trans_cancel;
+	}
+
+	/*
+	 * Note the trickiness in setting the log flags - we set the owner log
+	 * flag on the opposite inode (i.e. the inode we are setting the new
+	 * owner to be) because once we swap the forks and log that, log
+	 * recovery is going to see the fork as owned by the swapped inode,
+	 * not the pre-swapped inodes.
+	 */
+	src_log_flags = XFS_ILOG_CORE;
+	target_log_flags = XFS_ILOG_CORE;
+
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+		error = xfs_swap_extent_rmap(&tp, ip, tip);
+	else
+		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
+				&target_log_flags);
+	if (error)
+		goto out_trans_cancel;
+
+	/* Do we have to swap reflink flags? */
+	if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^
+	    (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) {
+		f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+		ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+		tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+		tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+		cowfp = ip->i_cowfp;
+		ip->i_cowfp = tip->i_cowfp;
+		tip->i_cowfp = cowfp;
+		xfs_inode_set_cowblocks_tag(ip);
+		xfs_inode_set_cowblocks_tag(tip);
+	}
+
 	xfs_trans_log_inode(tp, ip,  src_log_flags);
 	xfs_trans_log_inode(tp, tip, target_log_flags);
 
@@ -1761,16 +2078,16 @@
 
 	trace_xfs_swap_extent_after(ip, 0);
 	trace_xfs_swap_extent_after(tip, 1);
-out:
-	kmem_free(tempifp);
+
+	xfs_iunlock(ip, lock_flags);
+	xfs_iunlock(tip, lock_flags);
 	return error;
 
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+
 out_unlock:
 	xfs_iunlock(ip, lock_flags);
 	xfs_iunlock(tip, lock_flags);
-	goto out;
-
-out_trans_cancel:
-	xfs_trans_cancel(tp);
-	goto out;
+	return error;
 }
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f44f799..2981698 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -84,7 +84,8 @@
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count))
+		return -EFSCORRUPTED;
 
 	/*
 	 * If the block number in the offset is out of range, we're done.
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 3d22470..05f8666 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -92,7 +92,11 @@
 #define XFS_ERRTAG_BMAPIFORMAT				21
 #define XFS_ERRTAG_FREE_EXTENT				22
 #define XFS_ERRTAG_RMAP_FINISH_ONE			23
-#define XFS_ERRTAG_MAX					24
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE		24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE			25
+#define XFS_ERRTAG_BMAP_FINISH_ONE			26
+#define XFS_ERRTAG_AG_RESV_CRITICAL			27
+#define XFS_ERRTAG_MAX					28
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -121,6 +125,10 @@
 #define	XFS_RANDOM_BMAPIFORMAT				XFS_RANDOM_DEFAULT
 #define XFS_RANDOM_FREE_EXTENT				1
 #define XFS_RANDOM_RMAP_FINISH_ONE			1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE		1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE			1
+#define XFS_RANDOM_BMAP_FINISH_ONE			1
+#define XFS_RANDOM_AG_RESV_CRITICAL			4
 
 #ifdef DEBUG
 extern int xfs_error_test_active;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2bc58b3..a314fc7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
 #include "xfs_iomap.h"
+#include "xfs_reflink.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -634,6 +635,13 @@
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
+	/* If this is a block-aligned directio CoW, remap immediately. */
+	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
+		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
+		if (ret)
+			goto out;
+	}
+
 	data = *from;
 	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 			xfs_get_blocks_direct, xfs_end_io_direct_write,
@@ -735,6 +743,9 @@
 		enospc = xfs_inode_free_quota_eofblocks(ip);
 		if (enospc)
 			goto write_retry;
+		enospc = xfs_inode_free_quota_cowblocks(ip);
+		if (enospc)
+			goto write_retry;
 	} else if (ret == -ENOSPC && !enospc) {
 		struct xfs_eofblocks eofb = {0};
 
@@ -774,10 +785,20 @@
 
 	if (IS_DAX(inode))
 		ret = xfs_file_dax_write(iocb, from);
-	else if (iocb->ki_flags & IOCB_DIRECT)
+	else if (iocb->ki_flags & IOCB_DIRECT) {
+		/*
+		 * Allow a directio write to fall back to a buffered
+		 * write *only* in the case that we're doing a reflink
+		 * CoW.  In all other directio scenarios we do not
+		 * allow an operation to fall back to buffered mode.
+		 */
 		ret = xfs_file_dio_aio_write(iocb, from);
-	else
+		if (ret == -EREMCHG)
+			goto buffered;
+	} else {
+buffered:
 		ret = xfs_file_buffered_aio_write(iocb, from);
+	}
 
 	if (ret > 0) {
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
@@ -791,7 +812,7 @@
 #define	XFS_FALLOC_FL_SUPPORTED						\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
-		 FALLOC_FL_INSERT_RANGE)
+		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
 
 STATIC long
 xfs_file_fallocate(
@@ -881,9 +902,15 @@
 
 		if (mode & FALLOC_FL_ZERO_RANGE)
 			error = xfs_zero_file_space(ip, offset, len);
-		else
+		else {
+			if (mode & FALLOC_FL_UNSHARE_RANGE) {
+				error = xfs_reflink_unshare(ip, offset, len);
+				if (error)
+					goto out_unlock;
+			}
 			error = xfs_alloc_file_space(ip, offset, len,
 						     XFS_BMAPI_PREALLOC);
+		}
 		if (error)
 			goto out_unlock;
 	}
@@ -920,6 +947,189 @@
 	return error;
 }
 
+/*
+ * Wait for direct I/O to drain, then write back and wait on the given range.
+ */
+static int
+xfs_file_wait_for_io(
+	struct inode	*inode,
+	loff_t		offset,
+	size_t		len)
+{
+	loff_t		rounding;
+	loff_t		ioffset;
+	loff_t		iendoffset;
+	loff_t		bs;
+	int		ret;
+
+	bs = inode->i_sb->s_blocksize;
+	inode_dio_wait(inode);
+
+	rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
+	ioffset = round_down(offset, rounding);
+	iendoffset = round_up(offset + len, rounding) - 1;
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+					   iendoffset);
+	return ret;
+}
+
+/* Hook up to the VFS reflink function */
+STATIC int
+xfs_file_share_range(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	u64		len,
+	bool		is_dedupe)
+{
+	struct inode	*inode_in;
+	struct inode	*inode_out;
+	ssize_t		ret;
+	loff_t		bs;
+	loff_t		isize;
+	int		same_inode;
+	loff_t		blen;
+	unsigned int	flags = 0;
+
+	inode_in = file_inode(file_in);
+	inode_out = file_inode(file_out);
+	bs = inode_out->i_sb->s_blocksize;
+
+	/* Don't touch certain kinds of inodes */
+	if (IS_IMMUTABLE(inode_out))
+		return -EPERM;
+	if (IS_SWAPFILE(inode_in) ||
+	    IS_SWAPFILE(inode_out))
+		return -ETXTBSY;
+
+	/* Reflink only works within this filesystem. */
+	if (inode_in->i_sb != inode_out->i_sb)
+		return -EXDEV;
+	same_inode = (inode_in->i_ino == inode_out->i_ino);
+
+	/* Don't reflink dirs, pipes, sockets... */
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+		return -EINVAL;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	/* Don't share DAX file data for now. */
+	if (IS_DAX(inode_in) || IS_DAX(inode_out))
+		return -EINVAL;
+
+	/* Are we going all the way to the end? */
+	isize = i_size_read(inode_in);
+	if (isize == 0)
+		return 0;
+	if (len == 0)
+		len = isize - pos_in;
+
+	/* Ensure offsets don't wrap and the input is inside i_size */
+	if (pos_in + len < pos_in || pos_out + len < pos_out ||
+	    pos_in + len > isize)
+		return -EINVAL;
+
+	/* Don't allow dedupe past EOF in the dest file */
+	if (is_dedupe) {
+		loff_t	disize;
+
+		disize = i_size_read(inode_out);
+		if (pos_out >= disize || pos_out + len > disize)
+			return -EINVAL;
+	}
+
+	/* If we're linking to EOF, continue to the block boundary. */
+	if (pos_in + len == isize)
+		blen = ALIGN(isize, bs) - pos_in;
+	else
+		blen = len;
+
+	/* Only reflink if we're aligned to block boundaries */
+	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+		return -EINVAL;
+
+	/* Don't allow overlapped reflink within the same file */
+	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
+		return -EINVAL;
+
+	/* Wait for the completion of any pending IOs on both files */
+	ret = xfs_file_wait_for_io(inode_in, pos_in, len);
+	if (ret)
+		goto out;
+	ret = xfs_file_wait_for_io(inode_out, pos_out, len);
+	if (ret)
+		goto out;
+
+	if (is_dedupe)
+		flags |= XFS_REFLINK_DEDUPE;
+	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
+			pos_out, len, flags);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_copy_range(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	size_t		len,
+	unsigned int	flags)
+{
+	int		error;
+
+	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+				     len, false);
+	if (error)
+		return error;
+	return len;
+}
+
+STATIC int
+xfs_file_clone_range(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	u64		len)
+{
+	return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+				     len, false);
+}
+
+#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+STATIC ssize_t
+xfs_file_dedupe_range(
+	struct file	*src_file,
+	u64		loff,
+	u64		len,
+	struct file	*dst_file,
+	u64		dst_loff)
+{
+	int		error;
+
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > XFS_MAX_DEDUPE_LEN)
+		len = XFS_MAX_DEDUPE_LEN;
+
+	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+				     len, true);
+	if (error)
+		return error;
+	return len;
+}
 
 STATIC int
 xfs_file_open(
@@ -1581,6 +1791,9 @@
 	.fsync		= xfs_file_fsync,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
+	.copy_file_range = xfs_file_copy_range,
+	.clone_file_range = xfs_file_clone_range,
+	.dedupe_file_range = xfs_file_dedupe_range,
 };
 
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 94ac06f..93d12fa 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -43,6 +43,7 @@
 #include "xfs_log.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
 
 /*
  * File system operations
@@ -108,7 +109,9 @@
 			(xfs_sb_version_hassparseinodes(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_SPINODES : 0) |
 			(xfs_sb_version_hasrmapbt(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_RMAPBT : 0);
+				XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) |
+			(xfs_sb_version_hasreflink(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_REFLINK : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 				mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -259,6 +262,12 @@
 		agf->agf_longest = cpu_to_be32(tmpsize);
 		if (xfs_sb_version_hascrc(&mp->m_sb))
 			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+			agf->agf_refcount_root = cpu_to_be32(
+					xfs_refc_block(mp));
+			agf->agf_refcount_level = cpu_to_be32(1);
+			agf->agf_refcount_blocks = cpu_to_be32(1);
+		}
 
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
@@ -450,6 +459,17 @@
 			rrec->rm_offset = 0;
 			be16_add_cpu(&block->bb_numrecs, 1);
 
+			/* account for refc btree root */
+			if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+				rrec = XFS_RMAP_REC_ADDR(block, 5);
+				rrec->rm_startblock = cpu_to_be32(
+						xfs_refc_block(mp));
+				rrec->rm_blockcount = cpu_to_be32(1);
+				rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+				rrec->rm_offset = 0;
+				be16_add_cpu(&block->bb_numrecs, 1);
+			}
+
 			error = xfs_bwrite(bp);
 			xfs_buf_relse(bp);
 			if (error)
@@ -507,6 +527,28 @@
 				goto error0;
 		}
 
+		/*
+		 * refcount btree root block
+		 */
+		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+			bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_refcountbt_buf_ops);
+			if (!bp) {
+				error = -ENOMEM;
+				goto error0;
+			}
+
+			xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC,
+					     0, 0, agno,
+					     XFS_BTREE_CRC_BLOCKS);
+
+			error = xfs_bwrite(bp);
+			xfs_buf_relse(bp);
+			if (error)
+				goto error0;
+		}
 	}
 	xfs_trans_agblocks_delta(tp, nfree);
 	/*
@@ -589,6 +631,11 @@
 	xfs_set_low_space_thresholds(mp);
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
+	/* Reserve AG metadata blocks. */
+	error = xfs_fs_reserve_ag_blocks(mp);
+	if (error && error != -ENOSPC)
+		goto out;
+
 	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {
 		error = 0;
@@ -639,6 +686,8 @@
 			continue;
 		}
 	}
+
+ out:
 	return saved_error ? saved_error : error;
 
  error0:
@@ -948,3 +997,59 @@
 	"Please umount the filesystem and rectify the problem(s)");
 	}
 }
+
+/*
+ * Reserve free space for per-AG metadata.  Tries every AG; first error wins.
+ */
+int
+xfs_fs_reserve_ag_blocks(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			first_err = 0;
+	int			ret;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+		ret = xfs_ag_resv_init(pag);
+		xfs_perag_put(pag);
+		if (!first_err)
+			first_err = ret;
+	}
+	/* ENOSPC is handed back quietly; anything else shuts down the fs. */
+	if (first_err != 0 && first_err != -ENOSPC) {
+		xfs_warn(mp,
+	"Error %d reserving per-AG metadata reserve pool.", first_err);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	}
+
+	return first_err;
+}
+
+/*
+ * Free space reserved for per-AG metadata.  Tries every AG; first error wins.
+ */
+int
+xfs_fs_unreserve_ag_blocks(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			first_err = 0;
+	int			ret;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+		ret = xfs_ag_resv_free(pag);
+		xfs_perag_put(pag);
+		if (!first_err)
+			first_err = ret;
+	}
+	/* A failure here is only warned about; there is no shutdown. */
+	if (first_err != 0)
+		xfs_warn(mp,
+	"Error %d freeing per-AG metadata reserve pool.", first_err);
+
+	return first_err;
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index f32713f..f349158 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -26,4 +26,7 @@
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
 
+extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 4d41b24..687a4b0 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,8 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second) with the exception of eofb_timer, which is measured in
- * seconds.
+ * 100ths of a second) with the exception of eofb_timer and cowb_timer, which
+ * are measured in seconds.
  */
 xfs_param_t xfs_params = {
 			  /*	MIN		DFLT		MAX	*/
@@ -42,6 +42,7 @@
 	.inherit_nodfrg	= {	0,		1,		1	},
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.eofb_timer	= {	1,		300,		3600*24},
+	.cowb_timer	= {	1,		1800,		3600*24},
 };
 
 struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 65b2e3f..14796b7 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,7 @@
 #include "xfs_bmap_util.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
+#include "xfs_reflink.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -76,6 +77,9 @@
 	ip->i_mount = mp;
 	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
 	ip->i_afp = NULL;
+	ip->i_cowfp = NULL;
+	ip->i_cnextents = 0;
+	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
 	ip->i_delayed_blks = 0;
@@ -101,6 +105,8 @@
 
 	if (ip->i_afp)
 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	if (ip->i_cowfp)
+		xfs_idestroy_fork(ip, XFS_COW_FORK);
 
 	if (ip->i_itemp) {
 		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
@@ -787,6 +793,33 @@
 	xfs_queue_eofblocks(mp);
 }
 
+/*
+ * Background scanning to trim preallocated CoW space. This is queued
+ * based on the 'speculative_cow_prealloc_lifetime' tunable (30m by default).
+ * (We'll just piggyback on the post-EOF prealloc space workqueue.)
+ */
+STATIC void
+xfs_queue_cowblocks(
+	struct xfs_mount *mp)
+{
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
+		queue_delayed_work(mp->m_eofblocks_workqueue,
+				   &mp->m_cowblocks_work,
+				   msecs_to_jiffies(xfs_cowb_secs * 1000));
+	rcu_read_unlock();
+}
+
+void
+xfs_cowblocks_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+				struct xfs_mount, m_cowblocks_work);
+	xfs_icache_free_cowblocks(mp, NULL);	/* NULL eofb: no filtering */
+	xfs_queue_cowblocks(mp);		/* re-arm if still tagged */
+}
+
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
@@ -1343,18 +1376,30 @@
 	return ret;
 }
 
-int
-xfs_icache_free_eofblocks(
+static int
+__xfs_icache_free_eofblocks(
 	struct xfs_mount	*mp,
-	struct xfs_eofblocks	*eofb)
+	struct xfs_eofblocks	*eofb,
+	int			(*execute)(struct xfs_inode *ip, int flags,
+					   void *args),
+	int			tag)
 {
 	int flags = SYNC_TRYLOCK;
 
 	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
 		flags = SYNC_WAIT;
 
-	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
-					 eofb, XFS_ICI_EOFBLOCKS_TAG);
+	return xfs_inode_ag_iterator_tag(mp, execute, flags,
+					 eofb, tag);
+}
+
+int
+xfs_icache_free_eofblocks(
+	struct xfs_mount	*mp,
+	struct xfs_eofblocks	*eofb)
+{
+	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
+			XFS_ICI_EOFBLOCKS_TAG);
 }
 
 /*
@@ -1363,9 +1408,11 @@
  * failure. We make a best effort by including each quota under low free space
  * conditions (less than 1% free space) in the scan.
  */
-int
-xfs_inode_free_quota_eofblocks(
-	struct xfs_inode *ip)
+static int
+__xfs_inode_free_quota_eofblocks(
+	struct xfs_inode	*ip,
+	int			(*execute)(struct xfs_mount *mp,
+					   struct xfs_eofblocks	*eofb))
 {
 	int scan = 0;
 	struct xfs_eofblocks eofb = {0};
@@ -1401,14 +1448,25 @@
 	}
 
 	if (scan)
-		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+		execute(ip->i_mount, &eofb);
 
 	return scan;
 }
 
-void
-xfs_inode_set_eofblocks_tag(
-	xfs_inode_t	*ip)
+int
+xfs_inode_free_quota_eofblocks(
+	struct xfs_inode *ip)
+{
+	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
+}
+
+static void
+__xfs_inode_set_eofblocks_tag(
+	xfs_inode_t	*ip,
+	void		(*execute)(struct xfs_mount *mp),
+	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+				  int error, unsigned long caller_ip),
+	int		tag)
 {
 	struct xfs_mount *mp = ip->i_mount;
 	struct xfs_perag *pag;
@@ -1426,26 +1484,22 @@
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
-	trace_xfs_inode_set_eofblocks_tag(ip);
 
-	tagged = radix_tree_tagged(&pag->pag_ici_root,
-				   XFS_ICI_EOFBLOCKS_TAG);
+	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
 	radix_tree_tag_set(&pag->pag_ici_root,
-			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			   XFS_ICI_EOFBLOCKS_TAG);
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
 	if (!tagged) {
 		/* propagate the eofblocks tag up into the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
 		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
 				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				   XFS_ICI_EOFBLOCKS_TAG);
+				   tag);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
 		/* kick off background trimming */
-		xfs_queue_eofblocks(ip->i_mount);
+		execute(ip->i_mount);
 
-		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
-					      -1, _RET_IP_);
+		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
 	}
 
 	spin_unlock(&pag->pag_ici_lock);
@@ -1453,9 +1507,22 @@
 }
 
 void
-xfs_inode_clear_eofblocks_tag(
+xfs_inode_set_eofblocks_tag(
 	xfs_inode_t	*ip)
 {
+	trace_xfs_inode_set_eofblocks_tag(ip);
+	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
+			trace_xfs_perag_set_eofblocks,
+			XFS_ICI_EOFBLOCKS_TAG);
+}
+
+static void
+__xfs_inode_clear_eofblocks_tag(
+	xfs_inode_t	*ip,
+	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+				    int error, unsigned long caller_ip),
+	int		tag)
+{
 	struct xfs_mount *mp = ip->i_mount;
 	struct xfs_perag *pag;
 
@@ -1465,23 +1532,141 @@
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
-	trace_xfs_inode_clear_eofblocks_tag(ip);
 
 	radix_tree_tag_clear(&pag->pag_ici_root,
-			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			     XFS_ICI_EOFBLOCKS_TAG);
-	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
+	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
 		/* clear the eofblocks tag from the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
 		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
 				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				     XFS_ICI_EOFBLOCKS_TAG);
+				     tag);
 		spin_unlock(&ip->i_mount->m_perag_lock);
-		trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
-					       -1, _RET_IP_);
+		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
 	}
 
 	spin_unlock(&pag->pag_ici_lock);
 	xfs_perag_put(pag);
 }
 
+void
+xfs_inode_clear_eofblocks_tag(
+	xfs_inode_t	*ip)
+{
+	trace_xfs_inode_clear_eofblocks_tag(ip);
+	return __xfs_inode_clear_eofblocks_tag(ip,
+			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+/*
+ * Automatic CoW Reservation Freeing
+ *
+ * These functions automatically garbage collect leftover CoW reservations
+ * that were made on behalf of a cowextsize hint when we start to run out
+ * of quota or when the reservations sit around for too long.  If the file
+ * has dirty pages or is undergoing writeback, its CoW reservations will
+ * be retained.
+ *
+ * The actual garbage collection piggybacks off the same code that runs
+ * the speculative EOF preallocation garbage collector.
+ */
+STATIC int
+xfs_inode_free_cowblocks(
+	struct xfs_inode	*ip,
+	int			flags,		/* not consulted below */
+	void			*args)
+{
+	int ret;
+	struct xfs_eofblocks *eofb = args;
+	bool need_iolock = true;
+	int match;
+
+	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+
+	if (!xfs_reflink_has_real_cow_blocks(ip)) { /* nothing to free: untag */
+		trace_xfs_inode_free_cowblocks_invalid(ip);
+		xfs_inode_clear_cowblocks_tag(ip);
+		return 0;
+	}
+
+	/*
+	 * If the mapping is dirty or under writeback we cannot touch the
+	 * CoW fork.  Leave it alone if we're in the midst of a directio.
+	 */
+	if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
+	    atomic_read(&VFS_I(ip)->i_dio_count))
+		return 0;
+
+	if (eofb) {
+		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+			match = xfs_inode_match_id_union(ip, eofb);
+		else
+			match = xfs_inode_match_id(ip, eofb);
+		if (!match)
+			return 0;
+
+		/* skip the inode if the file size is too small */
+		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+			return 0;
+
+		/*
+		 * A scan owner implies we already hold the iolock. Skip the
+		 * locking below to avoid deadlock. This also eliminates the
+		 * possibility of EAGAIN being returned.
+		 */
+		if (eofb->eof_scan_owner == ip->i_ino)
+			need_iolock = false;
+	}
+
+	/* Free the CoW blocks, locking the inode unless told not to above */
+	if (need_iolock) {
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	}
+
+	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+
+	if (need_iolock) {
+		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	}
+
+	return ret;
+}
+
+int
+xfs_icache_free_cowblocks(
+	struct xfs_mount	*mp,
+	struct xfs_eofblocks	*eofb)	/* may be NULL */
+{
+	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
+			XFS_ICI_COWBLOCKS_TAG);	/* reuses eofblocks walker */
+}
+
+int
+xfs_inode_free_quota_cowblocks(
+	struct xfs_inode *ip)	/* returns nonzero if a scan was run */
+{
+	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
+}
+
+void
+xfs_inode_set_cowblocks_tag(
+	xfs_inode_t	*ip)	/* NOTE(review): traced as eofblocks events */
+{
+	trace_xfs_inode_set_eofblocks_tag(ip);
+	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
+			trace_xfs_perag_set_eofblocks,
+			XFS_ICI_COWBLOCKS_TAG);
+}
+
+void
+xfs_inode_clear_cowblocks_tag(
+	xfs_inode_t	*ip)	/* NOTE(review): traced as eofblocks events */
+{
+	trace_xfs_inode_clear_eofblocks_tag(ip);
+	return __xfs_inode_clear_eofblocks_tag(ip,
+			trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 05bac99..a1e02f4 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,6 +40,7 @@
 					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
 #define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
+#define XFS_ICI_COWBLOCKS_TAG	2	/* inode can have cow blocks to gc */
 
 /*
  * Flags for xfs_iget()
@@ -70,6 +71,12 @@
 void xfs_eofblocks_worker(struct work_struct *);
 void xfs_queue_eofblocks(struct xfs_mount *);
 
+void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
+void xfs_cowblocks_worker(struct work_struct *);
+
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, int flags, void *args),
 	int flags, void *args);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 624e1df..4e560e6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,6 +49,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -77,6 +78,29 @@
 }
 
 /*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two.  If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+	struct xfs_inode	*ip)
+{
+	xfs_extlen_t		cowextsz = 0, extsz;
+
+	/* The CoW hint only counts while its inode flag is set. */
+	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+		cowextsz = ip->i_d.di_cowextsize;
+	extsz = xfs_get_extsz_hint(ip);
+
+	/* The larger hint wins; zero falls back to the default. */
+	if (cowextsz < extsz)
+		cowextsz = extsz;
+	return cowextsz ? cowextsz : XFS_DEFAULT_COWEXTSZ_HINT;
+}
+
+/*
  * These two are wrapper routines around the xfs_ilock() routine used to
  * centralize some grungy code.  They are used in places that wish to lock the
  * inode solely for reading the extents.  The reason these places can't just
@@ -651,6 +675,8 @@
 	if (di_flags2 & XFS_DIFLAG2_ANY) {
 		if (di_flags2 & XFS_DIFLAG2_DAX)
 			flags |= FS_XFLAG_DAX;
+		if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+			flags |= FS_XFLAG_COWEXTSIZE;
 	}
 
 	if (has_attr)
@@ -834,6 +860,7 @@
 	if (ip->i_d.di_version == 3) {
 		inode->i_version = 1;
 		ip->i_d.di_flags2 = 0;
+		ip->i_d.di_cowextsize = 0;
 		ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
 	}
@@ -896,6 +923,15 @@
 			ip->i_d.di_flags |= di_flags;
 			ip->i_d.di_flags2 |= di_flags2;
 		}
+		if (pip &&
+		    (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
+		    pip->i_d.di_version == 3 &&
+		    ip->i_d.di_version == 3) {
+			if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+				ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+				ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+			}
+		}
 		/* FALLTHROUGH */
 	case S_IFLNK:
 		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
@@ -1586,6 +1622,20 @@
 			goto out;
 	}
 
+	/* Remove all pending CoW reservations. */
+	error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
+			last_block);
+	if (error)
+		goto out;
+
+	/*
+	 * Clear the reflink flag if we truncated everything.
+	 */
+	if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+		xfs_inode_clear_cowblocks_tag(ip);
+	}
+
 	/*
 	 * Always re-log the inode so that our permanent transaction can keep
 	 * on rolling it forward in the log.
@@ -1850,6 +1900,7 @@
 	}
 
 	mp = ip->i_mount;
+	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8f30d25..f14c1de 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -47,6 +47,7 @@
 
 	/* Extent information. */
 	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
+	xfs_ifork_t		*i_cowfp;	/* copy on write extents */
 	xfs_ifork_t		i_df;		/* data fork */
 
 	/* operations vectors */
@@ -65,6 +66,9 @@
 
 	struct xfs_icdinode	i_d;		/* most of ondisk inode */
 
+	xfs_extnum_t		i_cnextents;	/* # of extents in cow fork */
+	unsigned int		i_cformat;	/* format of cow fork */
+
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
 } xfs_inode_t;
@@ -202,6 +206,11 @@
 	return XFS_PROJID_DEFAULT;
 }
 
+static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
+{
+	return (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) != 0;
+}
+
 /*
  * In-core inode flags.
  */
@@ -217,6 +226,12 @@
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
 #define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
 #define XFS_IEOFBLOCKS		(1 << 10)/* has the preallocblocks tag set */
+/*
+ * If this unlinked inode is in the middle of recovery, don't let drop_inode
+ * truncate and free the inode.  This can happen if we iget the inode during
+ * log recovery to replay a bmap operation on the inode.
+ */
+#define XFS_IRECOVERY		(1 << 11)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -411,6 +426,7 @@
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
 
 int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
 			       xfs_nlink_t, xfs_dev_t, prid_t, int,
@@ -474,4 +490,7 @@
 
 extern struct kmem_zone	*xfs_inode_zone;
 
+/* The default CoW extent size hint. */
+#define XFS_DEFAULT_COWEXTSZ_HINT 32
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 892c2ac..9610e9c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -368,7 +368,7 @@
 		to->di_crtime.t_sec = from->di_crtime.t_sec;
 		to->di_crtime.t_nsec = from->di_crtime.t_nsec;
 		to->di_flags2 = from->di_flags2;
-
+		to->di_cowextsize = from->di_cowextsize;
 		to->di_ino = ip->i_ino;
 		to->di_lsn = lsn;
 		memset(to->di_pad2, 0, sizeof(to->di_pad2));
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0d9021f..c245bed 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -903,6 +903,8 @@
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	fa.fsx_xflags = xfs_ip2xflags(ip);
 	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+	fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
+			ip->i_mount->m_sb.sb_blocklog;
 	fa.fsx_projid = xfs_get_projid(ip);
 
 	if (attr) {
@@ -973,12 +975,13 @@
 	if (ip->i_d.di_version < 3)
 		return;
 
-	di_flags2 = 0;
+	di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
 	if (xflags & FS_XFLAG_DAX)
 		di_flags2 |= XFS_DIFLAG2_DAX;
+	if (xflags & FS_XFLAG_COWEXTSIZE)
+		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
 
 	ip->i_d.di_flags2 = di_flags2;
-
 }
 
 STATIC void
@@ -1031,6 +1034,14 @@
 			return -EINVAL;
 	}
 
+	/* Clear reflink if we are actually able to set the rt flag. */
+	if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	/* Don't allow us to set DAX mode for a reflinked file for now. */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
+		return -EINVAL;
+
 	/*
 	 * Can't modify an immutable/append-only file unless
 	 * we have appropriate permission.
@@ -1219,6 +1230,56 @@
 	return 0;
 }
 
+/*
+ * CoW extent size hint validation rules are:
+ *
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ *    The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ *    for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. CoW extsize hint of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ *    alignment extending the extent beyond the limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_cowextsize(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fsblock_t		cowextsize_fsb;
+
+	if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
+		return 0;
+
+	/* Rule 1: needs a reflink filesystem and a v3 inode. */
+	if (!xfs_sb_version_hasreflink(&mp->m_sb) || ip->i_d.di_version != 3)
+		return -EINVAL;
+
+	/* Rule 2: regular files and directories only. */
+	if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
+		return -EINVAL;
+
+	/* Rule 4: a zero hint just drops the flag from the request. */
+	if (fa->fsx_cowextsize == 0) {
+		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+		return 0;
+	}
+
+	/* The hint, in blocks, may not exceed MAXEXTLEN ... */
+	cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+	if (cowextsize_fsb > MAXEXTLEN)
+		return -EINVAL;
+
+	/* ... nor half an AG (rule 6), and must be whole blocks (rule 5). */
+	if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2 ||
+	    fa->fsx_cowextsize % mp->m_sb.sb_blocksize)
+		return -EINVAL;
+	return 0;
+}
+
 static int
 xfs_ioctl_setattr_check_projid(
 	struct xfs_inode	*ip,
@@ -1311,6 +1372,10 @@
 	if (code)
 		goto error_trans_cancel;
 
+	code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+	if (code)
+		goto error_trans_cancel;
+
 	code = xfs_ioctl_setattr_xflags(tp, ip, fa);
 	if (code)
 		goto error_trans_cancel;
@@ -1346,6 +1411,12 @@
 		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
 	else
 		ip->i_d.di_extsize = 0;
+	if (ip->i_d.di_version == 3 &&
+	    (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+		ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
+				mp->m_sb.sb_blocklog;
+	else
+		ip->i_d.di_cowextsize = 0;
 
 	code = xfs_trans_commit(tp);
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c08253e..d907eb9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -39,6 +39,7 @@
 #include "xfs_quota.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
+#include "xfs_reflink.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
@@ -70,7 +71,7 @@
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 }
 
-static xfs_extlen_t
+xfs_extlen_t
 xfs_eof_alignment(
 	struct xfs_inode	*ip,
 	xfs_extlen_t		extsize)
@@ -609,7 +610,7 @@
 	}
 
 retry:
-	error = xfs_bmapi_reserve_delalloc(ip, offset_fsb,
+	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
 			end_fsb - offset_fsb, &got,
 			&prev, &idx, eof);
 	switch (error) {
@@ -666,6 +667,7 @@
 int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
+	int		whichfork,
 	xfs_off_t	offset,
 	xfs_bmbt_irec_t *imap)
 {
@@ -678,8 +680,12 @@
 	xfs_trans_t	*tp;
 	int		nimaps;
 	int		error = 0;
+	int		flags = 0;
 	int		nres;
 
+	if (whichfork == XFS_COW_FORK)
+		flags |= XFS_BMAPI_COWFORK;
+
 	/*
 	 * Make sure that the dquots are there.
 	 */
@@ -773,7 +779,7 @@
 			 * pointer that the caller gave to us.
 			 */
 			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, 0, &first_block,
+						count_fsb, flags, &first_block,
 						nres, imap, &nimaps,
 						&dfops);
 			if (error)
@@ -955,14 +961,22 @@
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
+	bool			shared, trimmed;
 	int			nimaps = 1, error = 0;
 	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if ((flags & IOMAP_WRITE) &&
-	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
+	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+		error = xfs_reflink_reserve_cow_range(ip, offset, length);
+		if (error < 0)
+			return error;
+	}
+
+	if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
+		   !xfs_get_extsz_hint(ip)) {
+		/* Reserve delalloc blocks for regular writeback. */
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 				iomap);
 	}
@@ -976,7 +990,14 @@
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);
 
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
-			       &nimaps, XFS_BMAPI_ENTIRE);
+			       &nimaps, 0);
+	if (error) {
+		xfs_iunlock(ip, lockmode);
+		return error;
+	}
+
+	/* Trim the mapping to the nearest shared extent boundary. */
+	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
 	if (error) {
 		xfs_iunlock(ip, lockmode);
 		return error;
@@ -1015,6 +1036,8 @@
 	}
 
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	if (shared)
+		iomap->flags |= IOMAP_F_SHARED;
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6498be4..6d45cf0 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,12 +25,13 @@
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
+int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
+xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 extern struct iomap_ops xfs_iomap_ops;
 extern struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5da95e..405a65c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1159,6 +1159,7 @@
 		inode->i_flags |= S_NOATIME;
 	if (S_ISREG(inode->i_mode) &&
 	    ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+	    !xfs_is_reflink_inode(ip) &&
 	    (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
 	     ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
 		inode->i_flags |= S_DAX;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ce73eb3..66e8817 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -66,7 +66,7 @@
 	if (!buffer || xfs_internal_inum(mp, ino))
 		return -EINVAL;
 
-	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+	buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
 	if (!buf)
 		return -ENOMEM;
 
@@ -111,6 +111,12 @@
 	buf->bs_aextents = dic->di_anextents;
 	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
 
+	if (dic->di_version == 3) {
+		if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+			buf->bs_cowextsize = dic->di_cowextsize <<
+					mp->m_sb.sb_blocklog;
+	}
+
 	switch (dic->di_format) {
 	case XFS_DINODE_FMT_DEV:
 		buf->bs_rdev = ip->i_df.if_u2.if_rdev;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index b8d64d5..68640fb 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -116,6 +116,7 @@
 #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 #define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val
 #define xfs_eofb_secs		xfs_params.eofb_timer.val
+#define xfs_cowb_secs		xfs_params.cowb_timer.val
 
 #define current_cpu()		(raw_smp_processor_id())
 #define current_pid()		(current->pid)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 846483d..9b3d7c7 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,8 @@
 #include "xfs_dir2.h"
 #include "xfs_rmap_item.h"
 #include "xfs_buf_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
@@ -1924,6 +1926,10 @@
 		case XFS_LI_EFI:
 		case XFS_LI_RUI:
 		case XFS_LI_RUD:
+		case XFS_LI_CUI:
+		case XFS_LI_CUD:
+		case XFS_LI_BUI:
+		case XFS_LI_BUD:
 			trace_xfs_log_recover_item_reorder_tail(log,
 							trans, item, pass);
 			list_move_tail(&item->ri_list, &inode_list);
@@ -2242,6 +2248,7 @@
 	case XFS_ABTB_MAGIC:
 	case XFS_ABTC_MAGIC:
 	case XFS_RMAP_CRC_MAGIC:
+	case XFS_REFC_CRC_MAGIC:
 	case XFS_IBT_CRC_MAGIC:
 	case XFS_IBT_MAGIC: {
 		struct xfs_btree_block *btb = blk;
@@ -2415,6 +2422,9 @@
 		case XFS_RMAP_CRC_MAGIC:
 			bp->b_ops = &xfs_rmapbt_buf_ops;
 			break;
+		case XFS_REFC_CRC_MAGIC:
+			bp->b_ops = &xfs_refcountbt_buf_ops;
+			break;
 		default:
 			warnmsg = "Bad btree block magic!";
 			break;
@@ -3547,6 +3557,242 @@
 }
 
 /*
+ * Copy a CUI format buffer from the given buf into the destination
+ * CUI format structure.  The CUI/CUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_cui_copy_format(
+	struct xfs_log_iovec		*buf,
+	struct xfs_cui_log_format	*dst_cui_fmt)
+{
+	struct xfs_cui_log_format	*src_cui_fmt;
+	uint				len;
+
+	src_cui_fmt = buf->i_addr;
+	len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+
+	if (buf->i_len == len) {
+		memcpy(dst_cui_fmt, src_cui_fmt, len);
+		return 0;
+	}
+	return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	int				error;
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_cui_log_item		*cuip;
+	struct xfs_cui_log_format	*cui_formatp;
+
+	cui_formatp = item->ri_buf[0].i_addr;
+
+	cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+	error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
+	if (error) {
+		xfs_cui_item_free(cuip);
+		return error;
+	}
+	atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+	spin_lock(&log->l_ailp->xa_lock);
+	/*
+	 * The CUI has two references. One for the CUD and one for CUI to ensure
+	 * it makes it into the AIL. Insert the CUI into the AIL directly and
+	 * drop the CUI reference. Note that xfs_trans_ail_update() drops the
+	 * AIL lock.
+	 */
+	xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
+	xfs_cui_release(cuip);
+	return 0;
+}
+
+
+/*
+ * This routine is called when a CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	struct xfs_cud_log_format	*cud_formatp;
+	struct xfs_cui_log_item		*cuip = NULL;
+	struct xfs_log_item		*lip;
+	__uint64_t			cui_id;
+	struct xfs_ail_cursor		cur;
+	struct xfs_ail			*ailp = log->l_ailp;
+
+	cud_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
+		return -EFSCORRUPTED;
+	cui_id = cud_formatp->cud_cui_id;
+
+	/*
+	 * Search for the CUI with the id in the CUD format structure in the
+	 * AIL.
+	 */
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_CUI) {
+			cuip = (struct xfs_cui_log_item *)lip;
+			if (cuip->cui_format.cui_id == cui_id) {
+				/*
+				 * Drop the CUD reference to the CUI. This
+				 * removes the CUI from the AIL and frees it.
+				 */
+				spin_unlock(&ailp->xa_lock);
+				xfs_cui_release(cuip);
+				spin_lock(&ailp->xa_lock);
+				break;
+			}
+		}
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+
+	return 0;
+}
+
+/*
+ * Copy a BUI format buffer from the given buf into the destination
+ * BUI format structure.  The BUI/BUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_bui_copy_format(
+	struct xfs_log_iovec		*buf,
+	struct xfs_bui_log_format	*dst_bui_fmt)
+{
+	struct xfs_bui_log_format	*src_bui_fmt;
+	uint				len;
+
+	src_bui_fmt = buf->i_addr;
+	len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+
+	if (buf->i_len == len) {
+		memcpy(dst_bui_fmt, src_bui_fmt, len);
+		return 0;
+	}
+	return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent bmap update
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_bui_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	int				error;
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_bui_log_item		*buip;
+	struct xfs_bui_log_format	*bui_formatp;
+
+	bui_formatp = item->ri_buf[0].i_addr;
+
+	if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+		return -EFSCORRUPTED;
+	buip = xfs_bui_init(mp);
+	error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
+	if (error) {
+		xfs_bui_item_free(buip);
+		return error;
+	}
+	atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+
+	spin_lock(&log->l_ailp->xa_lock);
+	/*
+	 * The BUI has two references. One for the BUD and one for BUI to ensure
+	 * it makes it into the AIL. Insert the BUI into the AIL directly and
+	 * drop the BUI reference. Note that xfs_trans_ail_update() drops the
+	 * AIL lock.
+	 */
+	xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
+	xfs_bui_release(buip);
+	return 0;
+}
+
+
+/*
+ * This routine is called when a BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_bud_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	struct xfs_bud_log_format	*bud_formatp;
+	struct xfs_bui_log_item		*buip = NULL;
+	struct xfs_log_item		*lip;
+	__uint64_t			bui_id;
+	struct xfs_ail_cursor		cur;
+	struct xfs_ail			*ailp = log->l_ailp;
+
+	bud_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
+		return -EFSCORRUPTED;
+	bui_id = bud_formatp->bud_bui_id;
+
+	/*
+	 * Search for the BUI with the id in the BUD format structure in the
+	 * AIL.
+	 */
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_BUI) {
+			buip = (struct xfs_bui_log_item *)lip;
+			if (buip->bui_format.bui_id == bui_id) {
+				/*
+				 * Drop the BUD reference to the BUI. This
+				 * removes the BUI from the AIL and frees it.
+				 */
+				spin_unlock(&ailp->xa_lock);
+				xfs_bui_release(buip);
+				spin_lock(&ailp->xa_lock);
+				break;
+			}
+		}
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+
+	return 0;
+}
+
+/*
  * This routine is called when an inode create format structure is found in a
  * committed transaction in the log.  It's purpose is to initialise the inodes
  * being allocated on disk. This requires us to get inode cluster buffers that
@@ -3773,6 +4019,10 @@
 	case XFS_LI_QUOTAOFF:
 	case XFS_LI_RUI:
 	case XFS_LI_RUD:
+	case XFS_LI_CUI:
+	case XFS_LI_CUD:
+	case XFS_LI_BUI:
+	case XFS_LI_BUD:
 	default:
 		break;
 	}
@@ -3798,6 +4048,10 @@
 	case XFS_LI_ICREATE:
 	case XFS_LI_RUI:
 	case XFS_LI_RUD:
+	case XFS_LI_CUI:
+	case XFS_LI_CUD:
+	case XFS_LI_BUI:
+	case XFS_LI_BUD:
 		/* nothing to do in pass 1 */
 		return 0;
 	default:
@@ -3832,6 +4086,14 @@
 		return xlog_recover_rui_pass2(log, item, trans->r_lsn);
 	case XFS_LI_RUD:
 		return xlog_recover_rud_pass2(log, item);
+	case XFS_LI_CUI:
+		return xlog_recover_cui_pass2(log, item, trans->r_lsn);
+	case XFS_LI_CUD:
+		return xlog_recover_cud_pass2(log, item);
+	case XFS_LI_BUI:
+		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
+	case XFS_LI_BUD:
+		return xlog_recover_bud_pass2(log, item);
 	case XFS_LI_DQUOT:
 		return xlog_recover_dquot_pass2(log, buffer_list, item,
 						trans->r_lsn);
@@ -4419,12 +4681,94 @@
 	spin_lock(&ailp->xa_lock);
 }
 
+/* Recover the CUI if necessary. */
+STATIC int
+xlog_recover_process_cui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_cui_log_item		*cuip;
+	int				error;
+
+	/*
+	 * Skip CUIs that we've already processed.
+	 */
+	cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+	if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
+		return 0;
+
+	spin_unlock(&ailp->xa_lock);
+	error = xfs_cui_recover(mp, cuip);
+	spin_lock(&ailp->xa_lock);
+
+	return error;
+}
+
+/* Release the CUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_cui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_cui_log_item		*cuip;
+
+	cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+
+	spin_unlock(&ailp->xa_lock);
+	xfs_cui_release(cuip);
+	spin_lock(&ailp->xa_lock);
+}
+
+/* Recover the BUI if necessary. */
+STATIC int
+xlog_recover_process_bui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_bui_log_item		*buip;
+	int				error;
+
+	/*
+	 * Skip BUIs that we've already processed.
+	 */
+	buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+	if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
+		return 0;
+
+	spin_unlock(&ailp->xa_lock);
+	error = xfs_bui_recover(mp, buip);
+	spin_lock(&ailp->xa_lock);
+
+	return error;
+}
+
+/* Release the BUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_bui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_bui_log_item		*buip;
+
+	buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+
+	spin_unlock(&ailp->xa_lock);
+	xfs_bui_release(buip);
+	spin_lock(&ailp->xa_lock);
+}
+
 /* Is this log item a deferred action intent? */
 static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
 {
 	switch (lip->li_type) {
 	case XFS_LI_EFI:
 	case XFS_LI_RUI:
+	case XFS_LI_CUI:
+	case XFS_LI_BUI:
 		return true;
 	default:
 		return false;
@@ -4488,6 +4832,12 @@
 		case XFS_LI_RUI:
 			error = xlog_recover_process_rui(log->l_mp, ailp, lip);
 			break;
+		case XFS_LI_CUI:
+			error = xlog_recover_process_cui(log->l_mp, ailp, lip);
+			break;
+		case XFS_LI_BUI:
+			error = xlog_recover_process_bui(log->l_mp, ailp, lip);
+			break;
 		}
 		if (error)
 			goto out;
@@ -4535,6 +4885,12 @@
 		case XFS_LI_RUI:
 			xlog_recover_cancel_rui(log->l_mp, ailp, lip);
 			break;
+		case XFS_LI_CUI:
+			xlog_recover_cancel_cui(log->l_mp, ailp, lip);
+			break;
+		case XFS_LI_BUI:
+			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
+			break;
 		}
 
 		lip = xfs_trans_ail_cursor_next(ailp, &cur);
@@ -4613,6 +4969,7 @@
 	if (error)
 		goto fail_iput;
 
+	xfs_iflags_clear(ip, XFS_IRECOVERY);
 	ASSERT(VFS_I(ip)->i_nlink == 0);
 	ASSERT(VFS_I(ip)->i_mode != 0);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 56e85a6..fc78739 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,8 @@
 #include "xfs_icache.h"
 #include "xfs_sysfs.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_reflink.h"
 
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -684,6 +686,7 @@
 	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
 	xfs_ialloc_compute_maxlevels(mp);
 	xfs_rmapbt_compute_maxlevels(mp);
+	xfs_refcountbt_compute_maxlevels(mp);
 
 	xfs_set_maxicount(mp);
 
@@ -923,6 +926,15 @@
 	}
 
 	/*
+	 * During the second phase of log recovery, we need iget and
+	 * iput to behave like they do for an active filesystem.
+	 * xfs_fs_drop_inode needs to be able to prevent the deletion
+	 * of inodes before we're done replaying log items on those
+	 * inodes.
+	 */
+	mp->m_super->s_flags |= MS_ACTIVE;
+
+	/*
 	 * Finish recovering the file system.  This part needed to be delayed
 	 * until after the root and real-time bitmap inodes were consistently
 	 * read in.
@@ -974,10 +986,28 @@
 		if (error)
 			xfs_warn(mp,
 	"Unable to allocate reserve blocks. Continuing without reserve pool.");
+
+		/* Recover any CoW blocks that never got remapped. */
+		error = xfs_reflink_recover_cow(mp);
+		if (error) {
+			xfs_err(mp,
+	"Error %d recovering leftover CoW allocations.", error);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			goto out_quota;
+		}
+
+		/* Reserve AG blocks for future btree expansion. */
+		error = xfs_fs_reserve_ag_blocks(mp);
+		if (error && error != -ENOSPC)
+			goto out_agresv;
 	}
 
 	return 0;
 
+ out_agresv:
+	xfs_fs_unreserve_ag_blocks(mp);
+ out_quota:
+	xfs_qm_unmount_quotas(mp);
  out_rtunmount:
 	xfs_rtunmount_inodes(mp);
  out_rele_rip:
@@ -1019,7 +1049,9 @@
 	int			error;
 
 	cancel_delayed_work_sync(&mp->m_eofblocks_work);
+	cancel_delayed_work_sync(&mp->m_cowblocks_work);
 
+	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 041d949..819b80b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -124,10 +124,13 @@
 	uint			m_inobt_mnr[2];	/* min inobt btree records */
 	uint			m_rmap_mxr[2];	/* max rmap btree records */
 	uint			m_rmap_mnr[2];	/* min rmap btree records */
+	uint			m_refc_mxr[2];	/* max refc btree records */
+	uint			m_refc_mnr[2];	/* min refc btree records */
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* max inobt btree levels. */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
+	uint			m_refc_maxlevels; /* max refcount btree level */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
 	uint			m_alloc_set_aside; /* space we can't use */
 	uint			m_ag_max_usable; /* max space per AG */
@@ -161,6 +164,8 @@
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct delayed_work	m_eofblocks_work; /* background eof blocks
 						     trimming */
+	struct delayed_work	m_cowblocks_work; /* background cow blocks
+						     trimming */
 	bool			m_update_sb;	/* sb needs update in mount */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
@@ -399,6 +404,9 @@
 	struct xfs_ag_resv	pag_meta_resv;
 	/* Blocks reserved for just AGFL-based metadata. */
 	struct xfs_ag_resv	pag_agfl_resv;
+
+	/* reference count */
+	__uint8_t		pagf_refcount_level;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 69e2986..0c381d7 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -49,6 +49,8 @@
 	XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr,		56);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key,		4);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec,		16);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key,		4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec,		12);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key,		20);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec,		24);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp,		8);
@@ -56,6 +58,7 @@
 	XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t,			4);
 	XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t,			8);
 	XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t,			4);
+	XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t,			4);
 
 	/* dir/attr trees */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0f14b2e..93a7aaf 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -114,6 +114,13 @@
 		return -ENXIO;
 
 	/*
+	 * The pNFS block layout spec actually supports reflink like
+	 * functionality, but the Linux pNFS server doesn't implement it yet.
+	 */
+	if (xfs_is_reflink_inode(ip))
+		return -ENXIO;
+
+	/*
 	 * Lock out any other I/O before we flush and invalidate the pagecache,
 	 * and then hand out a layout to the remote system.  This is very
 	 * similar to direct I/O, except that the synchronization is much more
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
new file mode 100644
index 0000000..fe86a66
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.c
@@ -0,0 +1,539 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_log.h"
+#include "xfs_refcount.h"
+
+
+kmem_zone_t	*xfs_cui_zone;
+kmem_zone_t	*xfs_cud_zone;
+
+static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_cui_log_item, cui_item);
+}
+
+void
+xfs_cui_item_free(
+	struct xfs_cui_log_item	*cuip)
+{
+	if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
+		kmem_free(cuip);
+	else
+		kmem_zone_free(xfs_cui_zone, cuip);
+}
+
+STATIC void
+xfs_cui_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip);
+
+	*nvecs += 1;
+	*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cui log item. We use only 1 iovec, and we point that
+ * at the cui_log_format structure embedded in the cui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the cui item have been filled.
+ */
+STATIC void
+xfs_cui_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(atomic_read(&cuip->cui_next_extent) ==
+			cuip->cui_format.cui_nextents);
+
+	cuip->cui_format.cui_type = XFS_LI_CUI;
+	cuip->cui_format.cui_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
+			xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents));
+}
+
+/*
+ * Pinning has no meaning for a cui item, so just return.
+ */
+STATIC void
+xfs_cui_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * The unpin operation is the last place a CUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the CUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the CUI to either construct
+ * and commit the CUD or drop the CUD's reference in the event of error. Simply
+ * drop the log's CUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_cui_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip);
+
+	xfs_cui_release(cuip);
+}
+
+/*
+ * CUI items have no locking or pushing.  However, since CUIs are pulled from
+ * the AIL when their corresponding CUDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the CUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_cui_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The CUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, a CUD isn't going to be
+ * constructed and thus we free the CUI here directly.
+ */
+STATIC void
+xfs_cui_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_cui_item_free(CUI_ITEM(lip));
+}
+
+/*
+ * The CUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_cui_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+/*
+ * The CUI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_cui_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all cui log items.
+ */
+static const struct xfs_item_ops xfs_cui_item_ops = {
+	.iop_size	= xfs_cui_item_size,
+	.iop_format	= xfs_cui_item_format,
+	.iop_pin	= xfs_cui_item_pin,
+	.iop_unpin	= xfs_cui_item_unpin,
+	.iop_unlock	= xfs_cui_item_unlock,
+	.iop_committed	= xfs_cui_item_committed,
+	.iop_push	= xfs_cui_item_push,
+	.iop_committing = xfs_cui_item_committing,
+};
+
+/*
+ * Allocate and initialize a cui item with the given number of extents.
+ */
+struct xfs_cui_log_item *
+xfs_cui_init(
+	struct xfs_mount		*mp,
+	uint				nextents)
+
+{
+	struct xfs_cui_log_item		*cuip;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
+		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
+				KM_SLEEP);
+	else
+		cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+
+	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+	cuip->cui_format.cui_nextents = nextents;
+	cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
+	atomic_set(&cuip->cui_next_extent, 0);
+	atomic_set(&cuip->cui_refcount, 2);
+
+	return cuip;
+}
+
+/*
+ * Freeing the CUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the CUI may not yet have been placed in the AIL
+ * when called by xfs_cui_release() from CUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the CUI.
+ */
+void
+xfs_cui_release(
+	struct xfs_cui_log_item	*cuip)
+{
+	if (atomic_dec_and_test(&cuip->cui_refcount)) {
+		xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
+		xfs_cui_item_free(cuip);
+	}
+}
+
+static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_cud_log_item, cud_item);
+}
+
+STATIC void
+xfs_cud_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_cud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cud log item. We use only 1 iovec, and we point that
+ * at the cud_log_format structure embedded in the cud item.
+ * The cud format is a fixed size and no extent slots are checked
+ * here, so there is nothing to assert before copying it.
+ */
+STATIC void
+xfs_cud_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	cudp->cud_format.cud_type = XFS_LI_CUD;
+	cudp->cud_format.cud_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
+			sizeof(struct xfs_cud_log_format));
+}
+
+/*
+ * Pinning has no meaning for a cud item, so just return.
+ */
+STATIC void
+xfs_cud_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for a cud item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_cud_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on a cud item.  It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_cud_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The CUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the CUI and free the
+ * CUD.
+ */
+STATIC void
+xfs_cud_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
+
+	if (lip->li_flags & XFS_LI_ABORTED) {
+		xfs_cui_release(cudp->cud_cuip);
+		kmem_zone_free(xfs_cud_zone, cudp);
+	}
+}
+
+/*
+ * When the cud item is committed to disk, all we need to do is delete our
+ * reference to our partner cui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_cud_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
+
+	/*
+	 * Drop the CUI reference regardless of whether the CUD has been
+	 * aborted. Once the CUD transaction is constructed, it is the sole
+	 * responsibility of the CUD to release the CUI (even if the CUI is
+	 * aborted due to log I/O error).
+	 */
+	xfs_cui_release(cudp->cud_cuip);
+	kmem_zone_free(xfs_cud_zone, cudp);
+
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The CUD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_cud_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all cud log items.
+ */
+static const struct xfs_item_ops xfs_cud_item_ops = {
+	.iop_size	= xfs_cud_item_size,
+	.iop_format	= xfs_cud_item_format,
+	.iop_pin	= xfs_cud_item_pin,
+	.iop_unpin	= xfs_cud_item_unpin,
+	.iop_unlock	= xfs_cud_item_unlock,
+	.iop_committed	= xfs_cud_item_committed,
+	.iop_push	= xfs_cud_item_push,
+	.iop_committing = xfs_cud_item_committing,
+};
+
+/*
+ * Allocate and initialize a cud item linked to the given cui item.
+ */
+struct xfs_cud_log_item *
+xfs_cud_init(
+	struct xfs_mount		*mp,
+	struct xfs_cui_log_item		*cuip)
+
+{
+	struct xfs_cud_log_item	*cudp;
+
+	cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
+	xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops);
+	cudp->cud_cuip = cuip;
+	cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+	return cudp;
+}
+
+/*
+ * Process a refcount update intent item that was recovered from the log.
+ * We need to update the refcountbt.
+ */
+int
+xfs_cui_recover(
+	struct xfs_mount		*mp,
+	struct xfs_cui_log_item		*cuip)
+{
+	int				i;
+	int				error = 0;
+	unsigned int			refc_type;
+	struct xfs_phys_extent		*refc;
+	xfs_fsblock_t			startblock_fsb;
+	bool				op_ok;
+	struct xfs_cud_log_item		*cudp;
+	struct xfs_trans		*tp;
+	struct xfs_btree_cur		*rcur = NULL;
+	enum xfs_refcount_intent_type	type;
+	xfs_fsblock_t			firstfsb;
+	xfs_fsblock_t			new_fsb;
+	xfs_extlen_t			new_len;
+	struct xfs_bmbt_irec		irec;
+	struct xfs_defer_ops		dfops;
+	bool				requeue_only = false;
+
+	ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
+
+	/*
+	 * First check the validity of the extents described by the
+	 * CUI.  If any are bad, then assume that all are bad and
+	 * just toss the CUI.
+	 */
+	for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+		refc = &cuip->cui_format.cui_extents[i];
+		startblock_fsb = XFS_BB_TO_FSB(mp,
+				   XFS_FSB_TO_DADDR(mp, refc->pe_startblock));
+		switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
+		case XFS_REFCOUNT_INCREASE:
+		case XFS_REFCOUNT_DECREASE:
+		case XFS_REFCOUNT_ALLOC_COW:
+		case XFS_REFCOUNT_FREE_COW:
+			op_ok = true;
+			break;
+		default:
+			op_ok = false;
+			break;
+		}
+		if (!op_ok || startblock_fsb == 0 ||
+		    refc->pe_len == 0 ||
+		    startblock_fsb >= mp->m_sb.sb_dblocks ||
+		    refc->pe_len >= mp->m_sb.sb_agblocks ||
+		    (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
+			/*
+			 * This will pull the CUI from the AIL and
+			 * free the memory associated with it.
+			 */
+			set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+			xfs_cui_release(cuip);
+			return -EIO;
+		}
+	}
+
+	/*
+	 * Under normal operation, refcount updates are deferred, so we
+	 * wouldn't be adding them directly to a transaction.  All
+	 * refcount updates manage reservation usage internally and
+	 * dynamically by deferring work that won't fit in the
+	 * transaction.  Normally, any work that needs to be deferred
+	 * gets attached to the same defer_ops that scheduled the
+	 * refcount update.  However, we're in log recovery here, so we
+	 * create our own defer_ops and use that to finish up any
+	 * work that doesn't fit.
+	 */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	if (error)
+		return error;
+	cudp = xfs_trans_get_cud(tp, cuip);
+
+	xfs_defer_init(&dfops, &firstfsb);
+	for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+		refc = &cuip->cui_format.cui_extents[i];
+		refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+		switch (refc_type) {
+		case XFS_REFCOUNT_INCREASE:
+		case XFS_REFCOUNT_DECREASE:
+		case XFS_REFCOUNT_ALLOC_COW:
+		case XFS_REFCOUNT_FREE_COW:
+			type = refc_type;
+			break;
+		default:
+			error = -EFSCORRUPTED;
+			goto abort_error;
+		}
+		if (requeue_only) {
+			new_fsb = refc->pe_startblock;
+			new_len = refc->pe_len;
+		} else
+			error = xfs_trans_log_finish_refcount_update(tp, cudp,
+				&dfops, type, refc->pe_startblock, refc->pe_len,
+				&new_fsb, &new_len, &rcur);
+		if (error)
+			goto abort_error;
+
+		/* Requeue what we didn't finish. */
+		if (new_len > 0) {
+			irec.br_startblock = new_fsb;
+			irec.br_blockcount = new_len;
+			switch (type) {
+			case XFS_REFCOUNT_INCREASE:
+				error = xfs_refcount_increase_extent(
+						tp->t_mountp, &dfops, &irec);
+				break;
+			case XFS_REFCOUNT_DECREASE:
+				error = xfs_refcount_decrease_extent(
+						tp->t_mountp, &dfops, &irec);
+				break;
+			case XFS_REFCOUNT_ALLOC_COW:
+				error = xfs_refcount_alloc_cow_extent(
+						tp->t_mountp, &dfops,
+						irec.br_startblock,
+						irec.br_blockcount);
+				break;
+			case XFS_REFCOUNT_FREE_COW:
+				error = xfs_refcount_free_cow_extent(
+						tp->t_mountp, &dfops,
+						irec.br_startblock,
+						irec.br_blockcount);
+				break;
+			default:
+				ASSERT(0);
+			}
+			if (error)
+				goto abort_error;
+			requeue_only = true;
+		}
+	}
+
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto abort_error;
+	set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+	error = xfs_trans_commit(tp);
+	return error;
+
+abort_error:
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	return error;
+}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
new file mode 100644
index 0000000..5b74ddd
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef	__XFS_REFCOUNT_ITEM_H__
+#define	__XFS_REFCOUNT_ITEM_H__
+
+/*
+ * There are (currently) four refcount btree redo item types: increase,
+ * decrease, alloc CoW and free CoW.  The log items for these are CUI
+ * (refcount update intent) and CUD (refcount update done).  The redo
+ * item type is encoded in the pe_flags field of each logged extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same
+ * transaction that records the associated refcountbt updates.
+ *
+ * Should the system crash after the commit of the first transaction
+ * but before the commit of the final transaction in a series, log
+ * recovery will use the redo information recorded by the intent items
+ * to replay the refcountbt metadata updates.
+ */
+
+/* kernel only CUI/CUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define	XFS_CUI_MAX_FAST_EXTENTS	16
+
+/*
+ * Define CUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define	XFS_CUI_RECOVERED		1
+
+/*
+ * This is the "refcount update intent" log item.  It is used to log
+ * the fact that some refcount btree updates need to be made.  It is
+ * used in conjunction with the "refcount update done" log item
+ * described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item;
+ * see the comments about that structure (in xfs_extfree_item.h) for
+ * more details.
+ */
+struct xfs_cui_log_item {
+	struct xfs_log_item		cui_item;
+	atomic_t			cui_refcount;
+	atomic_t			cui_next_extent;
+	unsigned long			cui_flags;	/* XFS_CUI_* bits */
+	struct xfs_cui_log_format	cui_format;
+};
+
+static inline size_t
+xfs_cui_log_item_sizeof(
+	unsigned int		nr)
+{
+	size_t	hdr = offsetof(struct xfs_cui_log_item, cui_format);
+	return hdr + xfs_cui_log_format_sizeof(nr);	/* header + format */
+}
+
+/*
+ * This is the "refcount update done" log item.  It is used to log the
+ * fact that some refcountbt updates mentioned in an earlier CUI
+ * (refcount update intent) item have been performed.
+ */
+struct xfs_cud_log_item {
+	struct xfs_log_item		cud_item;
+	struct xfs_cui_log_item		*cud_cuip;
+	struct xfs_cud_log_format	cud_format;
+};
+
+extern struct kmem_zone	*xfs_cui_zone;
+extern struct kmem_zone	*xfs_cud_zone;
+
+struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
+struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
+		struct xfs_cui_log_item *);
+void xfs_cui_item_free(struct xfs_cui_log_item *);
+void xfs_cui_release(struct xfs_cui_log_item *);
+int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip);
+
+#endif	/* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
new file mode 100644
index 0000000..5965e94
--- /dev/null
+++ b/fs/xfs/xfs_reflink.c
@@ -0,0 +1,1688 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
+#include "xfs_iomap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_sb.h"
+#include "xfs_ag_resv.h"
+
+/*
+ * Copy on Write of Shared Blocks
+ *
+ * XFS must preserve "the usual" file semantics even when two files share
+ * the same physical blocks.  This means that a write to one file must not
+ * alter the blocks in a different file; the way that we'll do that is
+ * through the use of a copy-on-write mechanism.  At a high level, that
+ * means that when we want to write to a shared block, we allocate a new
+ * block, write the data to the new block, and if that succeeds we map the
+ * new block into the file.
+ *
+ * XFS provides a "delayed allocation" mechanism that defers the allocation
+ * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
+ * possible.  This reduces fragmentation by enabling the filesystem to ask
+ * for bigger chunks less often, which is exactly what we want for CoW.
+ *
+ * The delalloc mechanism begins when the kernel wants to make a block
+ * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
+ * create a delalloc mapping, which is a regular in-core extent, but without
+ * a real startblock.  (For delalloc mappings, the startblock encodes both
+ * a flag that this is a delalloc mapping, and a worst-case estimate of how
+ * many blocks might be required to put the mapping into the BMBT.)  delalloc
+ * mappings are a reservation against the free space in the filesystem;
+ * adjacent mappings can also be combined into fewer larger mappings.
+ *
+ * When dirty pages are being written out (typically in writepage), the
+ * delalloc reservations are converted into real mappings by allocating
+ * blocks and replacing the delalloc mapping with real ones.  A delalloc
+ * mapping can be replaced by several real ones if the free space is
+ * fragmented.
+ *
+ * We want to adapt the delalloc mechanism for copy-on-write, since the
+ * write paths are similar.  The first two steps (creating the reservation
+ * and allocating the blocks) are exactly the same as delalloc except that
+ * the mappings must be stored in a separate CoW fork because we do not want
+ * to disturb the mapping in the data fork until we're sure that the write
+ * succeeded.  IO completion in this case is the process of removing the old
+ * mapping from the data fork and moving the new mapping from the CoW fork to
+ * the data fork.  This will be discussed shortly.
+ *
+ * For now, unaligned directio writes will be bounced back to the page cache.
+ * Block-aligned directio writes will use the same mechanism as buffered
+ * writes.
+ *
+ * CoW remapping must be done after the data block write completes,
+ * because we don't want to destroy the old data fork map until we're sure
+ * the new block has been written.  Since the new mappings are kept in a
+ * separate fork, we can simply iterate these mappings to find the ones
+ * that cover the file blocks that we just CoW'd.  For each extent, simply
+ * unmap the corresponding range in the data fork, map the new range into
+ * the data fork, and remove the extent from the CoW fork.
+ *
+ * Since the remapping operation can be applied to an arbitrary file
+ * range, we record the need for the remap step as a flag in the ioend
+ * instead of declaring a new IO type.  This is required for direct io
+ * because we only have ioend for the whole dio, and we have to be able to
+ * remember the presence of unwritten blocks and CoW blocks with a single
+ * ioend structure.  Better yet, the more ground we can cover with one
+ * ioend, the better.
+ */
+
+/*
+ * Search the refcount btree of AG agno for shared blocks inside the
+ * extent [agbno, agbno + aglen) and report the lowest-numbered shared
+ * run through fbno/flen.  If find_end_of_shared is true, return the
+ * longest contiguous extent of shared blocks.  If nothing in the range
+ * is shared, fbno and flen are set to NULLAGBLOCK and 0, respectively.
+ */
+int
+xfs_reflink_find_shared(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	xfs_agblock_t		*fbno,
+	xfs_extlen_t		*flen,
+	bool			find_end_of_shared)
+{
+	struct xfs_btree_cur	*rcur;
+	struct xfs_buf		*agf_bp;
+	int			ret;
+
+	/* The refcount btree cursor is built on top of the AGF buffer. */
+	ret = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+	if (ret)
+		return ret;
+	rcur = xfs_refcountbt_init_cursor(mp, NULL, agf_bp, agno, NULL);
+	ret = xfs_refcount_find_shared(rcur, agbno, aglen, fbno, flen,
+			find_end_of_shared);
+	if (ret)
+		xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+	else
+		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+	xfs_buf_relse(agf_bp);
+	return ret;
+}
+
+/*
+ * Trim the mapping to the next block where there's a change in the
+ * shared/unshared status: find the lowest-numbered extent of shared
+ * blocks that overlaps the mapping.  If it covers the start of the
+ * mapping, trim the mapping to the end of the shared extent; if it
+ * begins partway through, trim the mapping to the start of the shared
+ * extent; if nothing overlaps, leave the mapping as it is.  Whenever 0
+ * is returned, *shared says whether the start of the mapping is shared
+ * and *trimmed whether the mapping was shortened (both always written).
+ */
+int
+xfs_reflink_trim_around_shared(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*irec,
+	bool			*shared,
+	bool			*trimmed)
+{
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		fbno;
+	xfs_extlen_t		flen;
+	int			error = 0;
+
+	/* Holes, unwritten, and delalloc extents cannot be shared */
+	if (!xfs_is_reflink_inode(ip) ||
+	    ISUNWRITTEN(irec) ||
+	    irec->br_startblock == HOLESTARTBLOCK ||
+	    irec->br_startblock == DELAYSTARTBLOCK) {
+		*shared = *trimmed = false;
+		return 0;
+	}
+
+	trace_xfs_reflink_trim_around_shared(ip, irec);
+
+	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
+	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+	aglen = irec->br_blockcount;
+
+	error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
+			aglen, &fbno, &flen, true);
+	if (error)
+		return error;
+
+	*shared = *trimmed = false;
+	if (fbno == NULLAGBLOCK) {
+		/* No shared blocks at all. */
+		return 0;
+	} else if (fbno == agbno) {
+		/*
+		 * The start of this extent is shared.  Truncate the
+		 * mapping at the end of the shared region so that a
+		 * subsequent iteration starts at the start of the
+		 * unshared region.
+		 */
+		irec->br_blockcount = flen;
+		*shared = true;
+		if (flen != aglen)
+			*trimmed = true;
+		return 0;
+	} else {
+		/*
+		 * There's a shared extent midway through this extent.
+		 * Truncate the mapping at the start of the shared
+		 * extent so that a subsequent iteration starts at the
+		 * start of the shared region.
+		 */
+		irec->br_blockcount = fbno - agbno;
+		*trimmed = true;
+		return 0;
+	}
+}
+
+/* Create a CoW reservation for a range of blocks; caller holds the ILOCK. */
+static int
+__xfs_reflink_reserve_cow(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*offset_fsb,	/* in: start, out: next */
+	xfs_fileoff_t		end_fsb,
+	bool			*skipped)	/* set if not shared */
+{
+	struct xfs_bmbt_irec	got, prev, imap;
+	xfs_fileoff_t		orig_end_fsb;
+	int			nimaps, eof = 0, error = 0;
+	bool			shared = false, trimmed = false;
+	xfs_extnum_t		idx;
+	xfs_extlen_t		align;
+
+	/* Already reserved?  Skip the refcount btree access. */
+	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
+			&got, &prev);
+	if (!eof && got.br_startoff <= *offset_fsb) {
+		end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
+		trace_xfs_reflink_cow_found(ip, &got);
+		goto done;
+	}
+
+	/* Read one mapping of the file for this range (no COWFORK flag). */
+	nimaps = 1;
+	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
+			&imap, &nimaps, 0);
+	if (error)
+		goto out_unlock;
+	ASSERT(nimaps == 1);
+
+	/* Trim the mapping to the nearest shared extent boundary. */
+	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
+	if (error)
+		goto out_unlock;
+
+	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
+
+	/* Not shared?  Just report the (potentially capped) extent. */
+	if (!shared) {
+		*skipped = true;
+		goto done;
+	}
+
+	/*
+	 * Fork all the shared blocks from our write offset until the end of
+	 * the extent.
+	 */
+	error = xfs_qm_dqattach_locked(ip, 0);
+	if (error)
+		goto out_unlock;
+
+	align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
+	if (align)
+		end_fsb = roundup_64(end_fsb, align);
+
+retry:
+	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
+			end_fsb - *offset_fsb, &got,
+			&prev, &idx, eof);
+	switch (error) {
+	case 0:
+		break;
+	case -ENOSPC:
+	case -EDQUOT:
+		/* retry once with the unrounded end, i.e. no preallocation */
+		trace_xfs_reflink_cow_enospc(ip, &imap);
+		if (end_fsb != orig_end_fsb) {
+			end_fsb = orig_end_fsb;
+			goto retry;
+		}
+		/*FALLTHRU*/
+	default:
+		goto out_unlock;
+	}
+
+	if (end_fsb != orig_end_fsb)
+		xfs_inode_set_cowblocks_tag(ip);
+
+	trace_xfs_reflink_cow_alloc(ip, &got);
+done:		/* tell the caller how far this call got */
+	*offset_fsb = end_fsb;
+out_unlock:	/* name only: no lock is dropped here */
+	return error;
+}
+
+/* Create CoW reservations for the byte range [offset, offset + count). */
+int
+xfs_reflink_reserve_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	bool			skipped = false;
+	int			error = 0;	/* empty range: return 0 */
+
+	trace_xfs_reflink_reserve_cow_range(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	while (offset_fsb < end_fsb) {	/* helper advances offset_fsb */
+		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb,
+				&skipped);
+		if (error) {
+			trace_xfs_reflink_reserve_cow_range_error(ip, error,
+				_RET_IP_);
+			break;
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
+/* Reserve and allocate CoW blocks for one extent starting at *offset_fsb. */
+static int
+__xfs_reflink_allocate_cow(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*offset_fsb,	/* in: start, out: next */
+	xfs_fileoff_t		end_fsb)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	struct xfs_defer_ops	dfops;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		first_block;
+	xfs_fileoff_t		next_fsb;
+	int			nimaps = 1, error;
+	bool			skipped = false;
+
+	xfs_defer_init(&dfops, &first_block);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	next_fsb = *offset_fsb;
+	error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped);
+	if (error)
+		goto out_trans_cancel;
+
+	if (skipped) {	/* not shared: nothing to allocate */
+		*offset_fsb = next_fsb;
+		goto out_trans_cancel;	/* error == 0 here */
+	}
+
+	xfs_trans_ijoin(tp, ip, 0);
+	error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb,
+			XFS_BMAPI_COWFORK, &first_block,
+			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
+			&imap, &nimaps, &dfops);
+	if (error)
+		goto out_trans_cancel;
+
+	/* We might not have been able to map the whole delalloc extent */
+	*offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb);
+
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_trans_commit(tp);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+out_trans_cancel:	/* also used to drop tp on the skipped path */
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
+
+/* Allocate all CoW reservations covering a byte range of a file. */
+int
+xfs_reflink_allocate_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		cur_fsb;
+	xfs_fileoff_t		last_fsb;
+	int			ret;
+
+	ASSERT(xfs_is_reflink_inode(ip));
+	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
+
+	/* Make sure the inode's dquots are attached first. */
+	ret = xfs_qm_dqattach(ip, 0);
+	if (ret)
+		return ret;
+
+	/* Convert the byte range into filesystem block units. */
+	cur_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+	/* The helper advances cur_fsb; stop at the first error. */
+	while (cur_fsb < last_fsb) {
+		ret = __xfs_reflink_allocate_cow(ip, &cur_fsb, last_fsb);
+		if (!ret)
+			continue;
+		trace_xfs_reflink_allocate_cow_range_error(ip, ret, _RET_IP_);
+		return ret;
+	}
+	return ret;
+}
+
+/*
+ * Look up the CoW fork extent covering a byte offset of a file and report
+ * whether that reservation still needs real blocks allocated.
+ */
+bool
+xfs_reflink_find_cow_mapping(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	struct xfs_bmbt_irec		*imap,
+	bool				*need_alloc)
+{
+	struct xfs_ifork		*cow_ifp;
+	struct xfs_bmbt_rec_host	*rec;
+	struct xfs_bmbt_irec		got;
+	xfs_fileoff_t			fsb;
+	xfs_extnum_t			idx;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	/* Search the CoW fork for the block holding this offset. */
+	fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	rec = xfs_iext_bno_to_ext(cow_ifp, fsb, &idx);
+	if (!rec)
+		return false;
+
+	/* The record that was found must actually contain the block. */
+	xfs_bmbt_get_all(rec, &got);
+	if (fsb < got.br_startoff ||
+	    fsb >= got.br_startoff + got.br_blockcount)
+		return false;
+
+	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
+			&got);
+
+	/* A null startblock means the extent is still delalloc. */
+	*imap = got;
+	*need_alloc = isnullstartblock(got.br_startblock) ? true : false;
+	return true;
+}
+
+/*
+ * Trim imap to end where the next CoW fork extent past offset_fsb starts.
+ */
+int
+xfs_reflink_trim_irec_to_next_cow(
+	struct xfs_inode		*ip,
+	xfs_fileoff_t			offset_fsb,
+	struct xfs_bmbt_irec		*imap)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_rec_host	*gotp;
+	xfs_extnum_t			idx;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;
+
+	/* Find the extent in the CoW fork. */
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
+	if (!gotp)
+		return 0;
+	xfs_bmbt_get_all(gotp, &irec);
+
+	/* Found extent starts before offset_fsb; step to the next one. */
+	if (irec.br_startoff < offset_fsb) {
+		idx++;
+		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			return 0;
+		gotp = xfs_iext_get_ext(ifp, idx);
+		xfs_bmbt_get_all(gotp, &irec);
+	}
+
+	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
+		return 0;
+	/* NOTE(review): assumes irec.br_startoff > imap->br_startoff. */
+	imap->br_blockcount = irec.br_startoff - imap->br_startoff;
+	trace_xfs_reflink_trim_irec(ip, imap);
+
+	return 0;	/* every path returns 0 */
+}
+
+/*
+ * Cancel all pending CoW reservations in file blocks [offset_fsb, end_fsb).
+ */
+int
+xfs_reflink_cancel_cow_blocks(
+	struct xfs_inode		*ip,
+	struct xfs_trans		**tpp,	/* may be rolled */
+	xfs_fileoff_t			offset_fsb,
+	xfs_fileoff_t			end_fsb)
+{
+	struct xfs_bmbt_irec		irec;
+	xfs_filblks_t			count_fsb;
+	xfs_fsblock_t			firstfsb;
+	struct xfs_defer_ops		dfops;
+	int				error = 0;
+	int				nimaps;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;
+
+	/* Walk the CoW fork one mapping at a time. */
+	while (offset_fsb < end_fsb) {
+		nimaps = 1;
+		count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+		error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec,
+				&nimaps, XFS_BMAPI_COWFORK);
+		if (error)
+			break;
+		ASSERT(nimaps == 1);
+
+		trace_xfs_reflink_cancel_cow(ip, &irec);
+
+		if (irec.br_startblock == DELAYSTARTBLOCK) {
+			/* Free a delayed allocation (result is not checked). */
+			xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount,
+					false);
+			ip->i_delayed_blks -= irec.br_blockcount;
+
+			/* Remove the mapping from the CoW fork. */
+			error = xfs_bunmapi_cow(ip, &irec);
+			if (error)
+				break;
+		} else if (irec.br_startblock == HOLESTARTBLOCK) {
+			/* hole in the CoW fork: nothing to cancel */
+		} else {
+			xfs_trans_ijoin(*tpp, ip, 0);
+			xfs_defer_init(&dfops, &firstfsb);
+
+			/* Free the CoW orphan record. */
+			error = xfs_refcount_free_cow_extent(ip->i_mount,
+					&dfops, irec.br_startblock,
+					irec.br_blockcount);
+			if (error)
+				break;
+
+			xfs_bmap_add_free(ip->i_mount, &dfops,
+					irec.br_startblock, irec.br_blockcount,
+					NULL);
+
+			/* Update quota accounting */
+			xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
+					-(long)irec.br_blockcount);
+
+			/* Roll the transaction; *tpp may change here. */
+			error = xfs_defer_finish(tpp, &dfops, ip);
+			if (error) {
+				xfs_defer_cancel(&dfops);
+				break;
+			}
+
+			/* Remove the mapping from the CoW fork. */
+			error = xfs_bunmapi_cow(ip, &irec);
+			if (error)
+				break;
+		}
+
+		/* Roll on... */
+		offset_fsb = irec.br_startoff + irec.br_blockcount;
+	}
+
+	return error;
+}
+
+/*
+ * Cancel all pending CoW reservations for some byte range of an inode.
+ */
+int
+xfs_reflink_cancel_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_trans	*tp;
+	xfs_fileoff_t		offset_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error;
+
+	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	if (count == NULLFILEOFF)	/* sentinel passes through as-is */
+		end_fsb = NULLFILEOFF;
+	else
+		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+	/* Start a rolling transaction to remove the mappings */
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+			0, 0, 0, &tp);
+	if (error)
+		goto out;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* Scrape out the old CoW reservations; tp may be rolled. */
+	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
+	if (error)
+		goto out_cancel;
+
+	error = xfs_trans_commit(tp);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;	/* a commit failure is returned without tracing */
+
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:	/* also reached when xfs_trans_alloc fails */
+	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Remap CoW fork blocks in a byte range into the data fork after a CoW.
+ */
+int
+xfs_reflink_end_cow(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	xfs_off_t			count)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_bmbt_irec		uirec;
+	struct xfs_trans		*tp;
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	xfs_filblks_t			count_fsb;
+	xfs_fsblock_t			firstfsb;
+	struct xfs_defer_ops		dfops;
+	int				error;
+	unsigned int			resblks;
+	xfs_filblks_t			ilen;
+	xfs_filblks_t			rlen;
+	int				nimaps;
+
+	trace_xfs_reflink_end_cow(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+	/* Start a rolling transaction to switch the mappings */
+	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+			resblks, 0, 0, &tp);
+	if (error)
+		goto out;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* Go find the old extent in the CoW fork. */
+	while (offset_fsb < end_fsb) {
+		/* Read the next mapping from the CoW fork */
+		nimaps = 1;
+		count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+		error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec,
+				&nimaps, XFS_BMAPI_COWFORK);
+		if (error)
+			goto out_cancel;
+		ASSERT(nimaps == 1);
+
+		ASSERT(irec.br_startblock != DELAYSTARTBLOCK);
+		trace_xfs_reflink_cow_remap(ip, &irec);
+
+		/*
+		 * We can have a hole in the CoW fork if part of a directio
+		 * write is CoW but part of it isn't.
+		 */
+		rlen = ilen = irec.br_blockcount;
+		if (irec.br_startblock == HOLESTARTBLOCK)
+			goto next_extent;
+
+		/* Unmap the old blocks in the data fork. */
+		while (rlen) {
+			xfs_defer_init(&dfops, &firstfsb);
+			error = __xfs_bunmapi(tp, ip, irec.br_startoff,
+					&rlen, 0, 1, &firstfsb, &dfops);
+			if (error)
+				goto out_defer;
+
+			/*
+			 * Trim the extent to whatever got unmapped.
+			 * Remember, bunmapi works backwards.
+			 */
+			uirec.br_startblock = irec.br_startblock + rlen;
+			uirec.br_startoff = irec.br_startoff + rlen;
+			uirec.br_blockcount = irec.br_blockcount - rlen;
+			irec.br_blockcount = rlen;
+			trace_xfs_reflink_cow_remap_piece(ip, &uirec);
+
+			/* Free the CoW orphan record. */
+			error = xfs_refcount_free_cow_extent(tp->t_mountp,
+					&dfops, uirec.br_startblock,
+					uirec.br_blockcount);
+			if (error)
+				goto out_defer;
+
+			/* Map the new blocks into the data fork. */
+			error = xfs_bmap_map_extent(tp->t_mountp, &dfops,
+					ip, &uirec);
+			if (error)
+				goto out_defer;
+
+			/* Remove the mapping from the CoW fork. */
+			error = xfs_bunmapi_cow(ip, &uirec);
+			if (error)
+				goto out_defer;
+
+			error = xfs_defer_finish(&tp, &dfops, ip);
+			if (error)
+				goto out_defer;
+		}
+
+next_extent:
+		/* Roll on, using the length saved before irec was trimmed. */
+		offset_fsb = irec.br_startoff + ilen;
+	}
+
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		goto out;
+	return 0;
+
+out_defer:
+	xfs_defer_cancel(&dfops);
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:	/* every failure, including commit and trans_alloc, is traced */
+	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Clean out CoW reservations left behind in every AG of the filesystem.
+ */
+int
+xfs_reflink_recover_cow(
+	struct xfs_mount	*mp)
+{
+	xfs_agnumber_t		ag = 0;
+	int			ret = 0;
+
+	/* Without the reflink feature there is nothing to clean up. */
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	/* Visit each AG in turn and stop at the first failure. */
+	while (!ret && ag < mp->m_sb.sb_agcount) {
+		ret = xfs_refcount_recover_cow_leftovers(mp, ag);
+		ag++;
+	}
+	return ret;
+}
+
+/*
+ * Reflinking (Block) Ranges of Two Files Together
+ *
+ * First, ensure that the reflink flag is set on both inodes.  The flag is an
+ * optimization to avoid unnecessary refcount btree lookups in the write path.
+ *
+ * Now we can iteratively remap the range of extents (and holes) in src to the
+ * corresponding ranges in dest.  Let drange and srange denote the ranges of
+ * logical blocks in dest and src touched by the reflink operation.
+ *
+ * While the length of drange is greater than zero,
+ *    - Read src's bmbt at the start of srange ("imap")
+ *    - If imap doesn't exist, make imap appear to start at the end of srange
+ *      with zero length.
+ *    - If imap starts before srange, advance imap to start at srange.
+ *    - If imap goes beyond srange, truncate imap to end at the end of srange.
+ *    - Punch (imap start - srange start + imap len) blocks from dest at
+ *      offset (drange start).
+ *    - If imap points to a real range of pblks,
+ *         > Increase the refcount of the imap's pblks
+ *         > Map imap's pblks into dest at the offset
+ *           (drange start + imap start - srange start)
+ *    - Advance drange and srange by (imap start - srange start + imap len)
+ *
+ * Finally, if the reflink made dest longer, update both the in-core and
+ * on-disk file sizes.
+ *
+ * ASCII Art Demonstration:
+ *
+ * Let's say we want to reflink this source file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS (src file)
+ *   <-------------------->
+ *
+ * into this destination file:
+ *
+ * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
+ *        <-------------------->
+ * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
+ * Observe that the range has different logical offsets in either file.
+ *
+ * Consider that the first extent in the source file doesn't line up with our
+ * reflink range.  Unmapping and remapping are separate operations, so we can
+ * unmap more blocks from the destination file than we remap.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *   <------->
+ * --DDDDD---------DDDDD--DDD
+ *        <------->
+ *
+ * Now remap the source extent into the destination file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *   <------->
+ * --DDDDD--SSSSSSSDDDDD--DDD
+ *        <------->
+ *
+ * Do likewise with the second hole and extent in our range.  Holes in the
+ * unmap range don't affect our operation.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *            <---->
+ * --DDDDD--SSSSSSS-SSSSS-DDD
+ *                 <---->
+ *
+ * Finally, unmap and remap part of the third extent.  This will increase the
+ * size of the destination file.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *                  <----->
+ * --DDDDD--SSSSSSS-SSSSS----SSS
+ *                       <----->
+ *
+ * Once we update the destination file's i_size, we're done.
+ */
+
+/*
+ * Set the reflink flag on src and dest (which may be the same inode).
+ */
+STATIC int
+xfs_reflink_set_inode_flag(
+	struct xfs_inode	*src,
+	struct xfs_inode	*dest)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	int			error;
+	struct xfs_trans	*tp;
+
+	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
+		return 0;	/* both already flagged */
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
+		goto out_error;
+
+	/* Take the ILOCK on both inodes (only once if they are the same) */
+	if (src->i_ino == dest->i_ino)
+		xfs_ilock(src, XFS_ILOCK_EXCL);
+	else
+		xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL);
+
+	if (!xfs_is_reflink_inode(src)) {
+		trace_xfs_reflink_set_inode_flag(src);
+		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
+		src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
+		xfs_ifork_init_cow(src);
+	} else		/* not joined to tp: unlock it here */
+		xfs_iunlock(src, XFS_ILOCK_EXCL);
+
+	if (src->i_ino == dest->i_ino)	/* same inode: handled above */
+		goto commit_flags;
+
+	if (!xfs_is_reflink_inode(dest)) {
+		trace_xfs_reflink_set_inode_flag(dest);
+		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+		dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+		xfs_ifork_init_cow(dest);
+	} else		/* not joined to tp: unlock it here */
+		xfs_iunlock(dest, XFS_ILOCK_EXCL);
+
+commit_flags:
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out_error;
+	return error;
+
+out_error:	/* the trace always names dest, whichever step failed */
+	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Update destination inode size & cowextsize hint, if necessary.
+ */
+STATIC int
+xfs_reflink_update_dest(
+	struct xfs_inode	*dest,
+	xfs_off_t		newlen,	/* only ever grows i_size */
+	xfs_extlen_t		cowextsize)	/* 0: leave hint alone */
+{
+	struct xfs_mount	*mp = dest->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+		return 0;	/* nothing to change */
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
+		goto out_error;
+
+	xfs_ilock(dest, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+	if (newlen > i_size_read(VFS_I(dest))) {
+		trace_xfs_reflink_update_inode_size(dest, newlen);
+		i_size_write(VFS_I(dest), newlen);
+		dest->i_d.di_size = newlen;
+	}
+
+	if (cowextsize) {
+		dest->i_d.di_cowextsize = cowextsize;
+		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+	}
+
+	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out_error;
+	return error;
+
+out_error:	/* reached for both trans_alloc and commit failures */
+	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Check whether this AG still has enough reserve to take on a reflink.
+ * The refcount btree has already reserved all the space it needs, but the
+ * rmap btree can grow without bound, so further reflinks are refused with
+ * -ENOSPC once the AG is down to its btree reserves.
+ */
+static int
+xfs_reflink_ag_has_free_space(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_perag	*pag;
+	bool			low;
+
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	pag = xfs_perag_get(mp, agno);
+	/* A critical AGFL or metadata reservation rules the reflink out. */
+	low = xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) ||
+	      xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA);
+	xfs_perag_put(pag);
+	return low ? -ENOSPC : 0;
+}
+
+/*
+ * Unmap a range of blocks from a file, then map other blocks into the hole.
+ * The range to unmap is (destoff : irec->br_startoff + irec->br_blockcount).
+ * The extent irec is mapped into dest at irec->br_startoff.
+ */
+STATIC int
+xfs_reflink_remap_extent(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*irec,
+	xfs_fileoff_t		destoff,
+	xfs_off_t		new_isize)	/* byte cap on any i_size bump */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		firstfsb;
+	unsigned int		resblks;
+	struct xfs_defer_ops	dfops;
+	struct xfs_bmbt_irec	uirec;
+	bool			real_extent;
+	xfs_filblks_t		rlen;
+	xfs_filblks_t		unmap_len;
+	xfs_off_t		newlen;
+	int			error;
+
+	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
+	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
+
+	/* Only real extents are remapped; anything else is just punched out. */
+	real_extent =  (irec->br_startblock != HOLESTARTBLOCK &&
+			irec->br_startblock != DELAYSTARTBLOCK &&
+			!ISUNWRITTEN(irec));
+
+	/* No reflinking if we're low on space */
+	if (real_extent) {
+		error = xfs_reflink_ag_has_free_space(mp,
+				XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+		if (error)
+			goto out;
+	}
+
+	/* Start a rolling transaction to switch the mappings */
+	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	if (error)
+		goto out;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);	/* flags 0: we unlock ip ourselves */
+
+	/* If we're not just clearing space, then do we have enough quota? */
+	if (real_extent) {
+		error = xfs_trans_reserve_quota_nblks(tp, ip,
+				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out_cancel;
+	}
+
+	trace_xfs_reflink_remap(ip, irec->br_startoff,
+				irec->br_blockcount, irec->br_startblock);
+
+	/* Unmap the old blocks in the data fork. */
+	rlen = unmap_len;
+	while (rlen) {
+		xfs_defer_init(&dfops, &firstfsb);
+		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1,
+				&firstfsb, &dfops);
+		if (error)
+			goto out_defer;
+
+		/*
+		 * Trim the extent to whatever got unmapped.
+		 * Remember, bunmapi works backwards.
+		 */
+		uirec.br_startblock = irec->br_startblock + rlen;
+		uirec.br_startoff = irec->br_startoff + rlen;
+		uirec.br_blockcount = unmap_len - rlen;
+		unmap_len = rlen;	/* what is still left to unmap */
+
+		/* If this isn't a real mapping, we're done. */
+		if (!real_extent || uirec.br_blockcount == 0)
+			goto next_extent;
+
+		trace_xfs_reflink_remap(ip, uirec.br_startoff,
+				uirec.br_blockcount, uirec.br_startblock);
+
+		/* Update the refcount tree */
+		error = xfs_refcount_increase_extent(mp, &dfops, &uirec);
+		if (error)
+			goto out_defer;
+
+		/* Map the new blocks into the data fork. */
+		error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec);
+		if (error)
+			goto out_defer;
+
+		/* Update quota accounting. */
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+				uirec.br_blockcount);
+
+		/* Update dest isize if needed, but never past new_isize. */
+		newlen = XFS_FSB_TO_B(mp,
+				uirec.br_startoff + uirec.br_blockcount);
+		newlen = min_t(xfs_off_t, newlen, new_isize);
+		if (newlen > i_size_read(VFS_I(ip))) {
+			trace_xfs_reflink_update_inode_size(ip, newlen);
+			i_size_write(VFS_I(ip), newlen);
+			ip->i_d.di_size = newlen;
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		}
+
+next_extent:
+		/* Process all the deferred stuff. */
+		error = xfs_defer_finish(&tp, &dfops, ip);
+		if (error)
+			goto out_defer;
+	}
+
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		goto out;
+	return 0;
+
+out_defer:
+	xfs_defer_cancel(&dfops);
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Iteratively remap one file's extents (and holes) to another's.
+ */
+STATIC int
+xfs_reflink_remap_blocks(
+	struct xfs_inode	*src,
+	xfs_fileoff_t		srcoff,
+	struct xfs_inode	*dest,
+	xfs_fileoff_t		destoff,
+	xfs_filblks_t		len,		/* in fs blocks */
+	xfs_off_t		new_isize)	/* in bytes */
+{
+	struct xfs_bmbt_irec	imap;
+	int			nimaps;
+	int			error = 0;
+	xfs_filblks_t		range_len;
+
+	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
+	while (len) {
+		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
+				dest, destoff);
+		/* Read extent from the source file */
+		nimaps = 1;
+		xfs_ilock(src, XFS_ILOCK_EXCL);
+		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
+		xfs_iunlock(src, XFS_ILOCK_EXCL);
+		if (error)
+			goto err;
+		ASSERT(nimaps == 1);
+
+		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+				&imap);
+
+		/* Translate imap into the destination file. */
+		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
+		imap.br_startoff += destoff - srcoff;
+
+		/* Clear dest from destoff to the end of imap and map it in. */
+		error = xfs_reflink_remap_extent(dest, &imap, destoff,
+				new_isize);
+		if (error)
+			goto err;
+
+		if (fatal_signal_pending(current)) {
+			error = -EINTR;	/* checked after each remapped extent */
+			goto err;
+		}
+
+		/* Advance drange/srange */
+		srcoff += range_len;
+		destoff += range_len;
+		len -= range_len;
+	}
+
+	return 0;
+
+err:
+	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Read the page covering @offset into the page cache.  Returns the page locked
+ * (caller unlocks and puts it), or an ERR_PTR; -EIO if it is not uptodate.
+ */
+static struct page *
+xfs_get_page(
+	struct inode	*inode,
+	xfs_off_t	offset)
+{
+	struct address_space	*mapping;
+	struct page		*page;
+	pgoff_t			n;
+
+	n = offset >> PAGE_SHIFT;	/* byte offset -> page index */
+	mapping = inode->i_mapping;
+	page = read_mapping_page(mapping, n, NULL);
+	if (IS_ERR(page))
+		return page;
+	if (!PageUptodate(page)) {
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+	lock_page(page);
+	return page;
+}
+
+/*
+ * Compare len bytes of two files, chunk by chunk; *is_same gets the verdict.
+ */
+static int
+xfs_compare_extents(
+	struct inode	*src,
+	xfs_off_t	srcoff,
+	struct inode	*dest,
+	xfs_off_t	destoff,
+	xfs_off_t	len,
+	bool		*is_same)	/* written only when 0 is returned */
+{
+	xfs_off_t	src_poff;
+	xfs_off_t	dest_poff;
+	void		*src_addr;
+	void		*dest_addr;
+	struct page	*src_page;
+	struct page	*dest_page;
+	xfs_off_t	cmp_len;
+	bool		same;
+	int		error;
+
+	error = -EINVAL;	/* always overwritten before out_error */
+	same = true;
+	while (len) {
+		src_poff = srcoff & (PAGE_SIZE - 1);
+		dest_poff = destoff & (PAGE_SIZE - 1);
+		cmp_len = min(PAGE_SIZE - src_poff,
+			      PAGE_SIZE - dest_poff);	/* stay in both pages */
+		cmp_len = min(cmp_len, len);
+		ASSERT(cmp_len > 0);
+
+		trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
+				XFS_I(dest), destoff);
+
+		src_page = xfs_get_page(src, srcoff);
+		if (IS_ERR(src_page)) {
+			error = PTR_ERR(src_page);
+			goto out_error;
+		}
+		dest_page = xfs_get_page(dest, destoff);
+		if (IS_ERR(dest_page)) {
+			error = PTR_ERR(dest_page);
+			unlock_page(src_page);	/* undo xfs_get_page(src) */
+			put_page(src_page);
+			goto out_error;
+		}
+		src_addr = kmap_atomic(src_page);
+		dest_addr = kmap_atomic(dest_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dest_page);
+
+		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+			same = false;
+
+		kunmap_atomic(dest_addr);
+		kunmap_atomic(src_addr);
+		unlock_page(dest_page);
+		unlock_page(src_page);
+		put_page(dest_page);
+		put_page(src_page);
+
+		if (!same)
+			break;	/* stop at the first differing chunk */
+
+		srcoff += cmp_len;
+		destoff += cmp_len;
+		len -= cmp_len;
+	}
+
+	*is_same = same;
+	return 0;
+
+out_error:
+	trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Link a byte range of src into dest; XFS_REFLINK_DEDUPE requires equal data.
+ */
+int
+xfs_reflink_remap_range(
+	struct xfs_inode	*src,
+	xfs_off_t		srcoff,
+	struct xfs_inode	*dest,
+	xfs_off_t		destoff,
+	xfs_off_t		len,
+	unsigned int		flags)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		sfsbno, dfsbno;
+	xfs_filblks_t		fsblen;
+	int			error;
+	xfs_extlen_t		cowextsize;
+	bool			is_same;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Don't reflink realtime inodes */
+	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+		return -EINVAL;
+
+	if (flags & ~XFS_REFLINK_ALL)
+		return -EINVAL;	/* unknown flag bits */
+
+	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
+
+	/* Lock both files against IO */
+	if (src->i_ino == dest->i_ino) {
+		xfs_ilock(src, XFS_IOLOCK_EXCL);
+		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
+	} else {
+		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
+	}
+
+	/*
+	 * Check that the extents are the same.
+	 */
+	if (flags & XFS_REFLINK_DEDUPE) {
+		is_same = false;
+		error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
+				destoff, len, &is_same);
+		if (error)
+			goto out_error;
+		if (!is_same) {
+			error = -EBADE;	/* contents differ: refuse to dedupe */
+			goto out_error;
+		}
+	}
+
+	error = xfs_reflink_set_inode_flag(src, dest);
+	if (error)
+		goto out_error;
+
+	/*
+	 * Invalidate the page cache so that we can clear any CoW mappings
+	 * in the destination file.
+	 */
+	truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff,
+				   PAGE_ALIGN(destoff + len) - 1);
+
+	dfsbno = XFS_B_TO_FSBT(mp, destoff);
+	sfsbno = XFS_B_TO_FSBT(mp, srcoff);
+	fsblen = XFS_B_TO_FSB(mp, len);
+	error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
+			destoff + len);	/* new_isize cap, in bytes */
+	if (error)
+		goto out_error;
+
+	/*
+	 * Carry the cowextsize hint from src to dest if we're sharing the
+	 * entire source file to the entire destination file, the source file
+	 * has a cowextsize hint, and the destination file does not.
+	 */
+	cowextsize = 0;
+	if (srcoff == 0 && len == i_size_read(VFS_I(src)) &&
+	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    destoff == 0 && len >= i_size_read(VFS_I(dest)) &&
+	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+		cowextsize = src->i_d.di_cowextsize;
+
+	error = xfs_reflink_update_dest(dest, destoff + len, cowextsize);
+	if (error)
+		goto out_error;	/* same place we fall through to */
+
+out_error:	/* reached on success too: the locks are always dropped */
+	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(src, XFS_IOLOCK_EXCL);
+	if (src->i_ino != dest->i_ino) {
+		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
+	}
+	if (error)
+		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * The user wants to preemptively CoW all shared blocks in this file,
+ * which enables us to turn off the reflink flag.  Iterate all extents
+ * which are not prealloc/delalloc to see which ranges are mentioned in
+ * the refcount tree, then read those blocks into the pagecache and dirty
+ * them; the caller fsyncs and updates the inode flag.  Called with ILOCK_EXCL
+ * held, dropped around iomap_file_dirty().  What if we run out of memory? :)
+ */
+STATIC int
+xfs_reflink_dirty_extents(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		fbno,
+	xfs_filblks_t		end,
+	xfs_off_t		isize)	/* bytes; dirtying is clamped to it */
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		rbno;
+	xfs_extlen_t		rlen;
+	xfs_off_t		fpos;
+	xfs_off_t		flen;
+	struct xfs_bmbt_irec	map[2];	/* [0] lookup result, [1] rest to scan */
+	int			nmaps;
+	int			error = 0;
+
+	while (end - fbno > 0) {
+		nmaps = 1;
+		/*
+		 * Look for extents in the file.  Skip holes, delalloc, or
+		 * unwritten extents; they can't be reflinked.
+		 */
+		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
+		if (error)
+			goto out;
+		if (nmaps == 0)
+			break;
+		if (map[0].br_startblock == HOLESTARTBLOCK ||
+		    map[0].br_startblock == DELAYSTARTBLOCK ||
+		    ISUNWRITTEN(&map[0]))
+			goto next;
+
+		map[1] = map[0];
+		while (map[1].br_blockcount) {
+			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
+			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
+			aglen = map[1].br_blockcount;
+
+			error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+					&rbno, &rlen, true);
+			if (error)
+				goto out;
+			if (rbno == NULLAGBLOCK)
+				break;	/* nothing shared in the rest of map[1] */
+
+			/* Dirty the pages */
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
+					(rbno - agbno));
+			flen = XFS_FSB_TO_B(mp, rlen);
+			if (fpos + flen > isize)
+				flen = isize - fpos;
+			error = iomap_file_dirty(VFS_I(ip), fpos, flen,
+					&xfs_iomap_ops);
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			if (error)
+				goto out;
+
+			/* Advance map[1] past the shared run just dirtied. */
+			map[1].br_blockcount -= (rbno - agbno + rlen);
+			map[1].br_startoff += (rbno - agbno + rlen);
+			map[1].br_startblock += (rbno - agbno + rlen);
+		}
+
+next:
+		fbno = map[0].br_startoff + map[0].br_blockcount;
+	}
+out:
+	return error;
+}
+
+/* Clear the reflink flag if no extent is shared any more; else do nothing. */
+int
+xfs_reflink_clear_inode_flag(
+	struct xfs_inode	*ip,
+	struct xfs_trans	**tpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		fbno;
+	xfs_filblks_t		end;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		rbno;
+	xfs_extlen_t		rlen;
+	struct xfs_bmbt_irec	map;
+	int			nmaps;
+	int			error = 0;
+
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	fbno = 0;
+	end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));	/* scan up to i_size */
+	while (end - fbno > 0) {
+		nmaps = 1;
+		/*
+		 * Look for extents in the file.  Skip holes, delalloc, or
+		 * unwritten extents; they can't be reflinked.
+		 */
+		error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0);
+		if (error)
+			return error;
+		if (nmaps == 0)
+			break;
+		if (map.br_startblock == HOLESTARTBLOCK ||
+		    map.br_startblock == DELAYSTARTBLOCK ||
+		    ISUNWRITTEN(&map))
+			goto next;
+
+		agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
+		agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock);
+		aglen = map.br_blockcount;
+
+		error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+				&rbno, &rlen, false);
+		if (error)
+			return error;
+		/* Is there still a shared block here? */
+		if (rbno != NULLAGBLOCK)
+			return 0;	/* yes: leave the flag set */
+next:
+		fbno = map.br_startoff + map.br_blockcount;
+	}
+
+	/*
+	 * We didn't find any shared blocks so turn off the reflink flag.
+	 * First, get rid of any leftover CoW mappings.
+	 */
+	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
+	if (error)
+		return error;
+
+	/* Clear the inode flag. */
+	trace_xfs_reflink_unset_inode_flag(ip);
+	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+	xfs_inode_clear_cowblocks_tag(ip);
+	xfs_trans_ijoin(*tpp, ip, 0);
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+
+	return error;	/* error is 0 here */
+}
+
+/*
+ * Allocate a transaction, take the ILOCK and clear the inode reflink flag if
+ * there are no shared extents.  NOTE(review): no size check is visible here.
+ */
+STATIC int
+xfs_reflink_try_clear_inode_flag(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error = 0;
+
+	/* Start a rolling transaction to remove the mappings */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);	/* flags 0: we unlock ip ourselves */
+
+	error = xfs_reflink_clear_inode_flag(ip, &tp);
+	if (error)
+		goto cancel;
+
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out;	/* commit failed: skip the cancel, just unlock */
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+cancel:
+	xfs_trans_cancel(tp);
+out:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Pre-COW all shared blocks within a given byte range of a file and turn off
+ * the reflink flag if we unshare all of the file's blocks.
+ */
+int
+xfs_reflink_unshare(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		fbno;
+	xfs_filblks_t		end;
+	xfs_off_t		isize;
+	int			error;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;	/* not a reflink inode: nothing to do */
+
+	trace_xfs_reflink_unshare(ip, offset, len);
+
+	inode_dio_wait(VFS_I(ip));
+
+	/* Try to CoW the selected ranges */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	fbno = XFS_B_TO_FSBT(mp, offset);
+	isize = i_size_read(VFS_I(ip));
+	end = XFS_B_TO_FSB(mp, offset + len);
+	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
+	if (error)
+		goto out_unlock;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/* Wait for the IO to finish */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		goto out;
+
+	/* Turn off the reflink flag if possible. */
+	error = xfs_reflink_try_clear_inode_flag(ip);
+	if (error)
+		goto out;
+
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Does this inode's CoW fork hold any extent with a non-null startblock?
+ */
+bool
+xfs_reflink_has_real_cow_blocks(
+	struct xfs_inode		*ip)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_rec_host	*gotp;
+	xfs_extnum_t			idx;
+
+	if (!xfs_is_reflink_inode(ip))
+		return false;
+
+	/* Walk the CoW fork extents, starting the lookup at file offset 0. */
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
+	while (gotp) {
+		xfs_bmbt_get_all(gotp, &irec);
+
+		if (!isnullstartblock(irec.br_startblock))
+			return true;
+
+		/* Roll on... */
+		idx++;
+		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			break;	/* idx ran past the last extent record */
+		gotp = xfs_iext_get_ext(ifp, idx);
+	}
+
+	return false;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
new file mode 100644
index 0000000..5dc3c8a
--- /dev/null
+++ b/fs/xfs/xfs_reflink.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_REFLINK_H
+#define __XFS_REFLINK_H 1
+
+extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+		xfs_extlen_t *flen, bool find_maximal);
+extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+		struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+
+extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
+extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
+extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
+		struct xfs_bmbt_irec *imap, bool *need_alloc);
+extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
+
+extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+		struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+		xfs_fileoff_t end_fsb);
+extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
+extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
+extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+#define XFS_REFLINK_DEDUPE	1	/* only reflink if contents match */
+#define XFS_REFLINK_ALL		(XFS_REFLINK_DEDUPE)	/* all valid flag bits */
+extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff,
+		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
+		unsigned int flags);	/* bits outside XFS_REFLINK_ALL: -EINVAL */
+extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
+		struct xfs_trans **tpp);
+extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t len);	/* offset and len are in bytes */
+
+extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
+
+#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0432a45..73c8278 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -441,8 +441,11 @@
 				   XFS_FSB_TO_DADDR(mp, rmap->me_startblock));
 		switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
 		case XFS_RMAP_EXTENT_MAP:
+		case XFS_RMAP_EXTENT_MAP_SHARED:
 		case XFS_RMAP_EXTENT_UNMAP:
+		case XFS_RMAP_EXTENT_UNMAP_SHARED:
 		case XFS_RMAP_EXTENT_CONVERT:
+		case XFS_RMAP_EXTENT_CONVERT_SHARED:
 		case XFS_RMAP_EXTENT_ALLOC:
 		case XFS_RMAP_EXTENT_FREE:
 			op_ok = true;
@@ -481,12 +484,21 @@
 		case XFS_RMAP_EXTENT_MAP:
 			type = XFS_RMAP_MAP;
 			break;
+		case XFS_RMAP_EXTENT_MAP_SHARED:
+			type = XFS_RMAP_MAP_SHARED;
+			break;
 		case XFS_RMAP_EXTENT_UNMAP:
 			type = XFS_RMAP_UNMAP;
 			break;
+		case XFS_RMAP_EXTENT_UNMAP_SHARED:
+			type = XFS_RMAP_UNMAP_SHARED;
+			break;
 		case XFS_RMAP_EXTENT_CONVERT:
 			type = XFS_RMAP_CONVERT;
 			break;
+		case XFS_RMAP_EXTENT_CONVERT_SHARED:
+			type = XFS_RMAP_CONVERT_SHARED;
+			break;
 		case XFS_RMAP_EXTENT_ALLOC:
 			type = XFS_RMAP_ALLOC;
 			break;
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 6e812fe0..12d48cd 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -62,6 +62,7 @@
 		{ "ibt2",		XFSSTAT_END_IBT_V2		},
 		{ "fibt2",		XFSSTAT_END_FIBT_V2		},
 		{ "rmapbt",		XFSSTAT_END_RMAP_V2		},
+		{ "refcntbt",		XFSSTAT_END_REFCOUNT		},
 		/* we print both series of quota information together */
 		{ "qm",			XFSSTAT_END_QM			},
 	};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 657865f..79ad2e6 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -213,7 +213,23 @@
 	__uint32_t		xs_rmap_2_alloc;
 	__uint32_t		xs_rmap_2_free;
 	__uint32_t		xs_rmap_2_moves;
-#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_RMAP_V2+6)
+#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + 15)
+	__uint32_t		xs_refcbt_2_lookup;
+	__uint32_t		xs_refcbt_2_compare;
+	__uint32_t		xs_refcbt_2_insrec;
+	__uint32_t		xs_refcbt_2_delrec;
+	__uint32_t		xs_refcbt_2_newroot;
+	__uint32_t		xs_refcbt_2_killroot;
+	__uint32_t		xs_refcbt_2_increment;
+	__uint32_t		xs_refcbt_2_decrement;
+	__uint32_t		xs_refcbt_2_lshift;
+	__uint32_t		xs_refcbt_2_rshift;
+	__uint32_t		xs_refcbt_2_split;
+	__uint32_t		xs_refcbt_2_join;
+	__uint32_t		xs_refcbt_2_alloc;
+	__uint32_t		xs_refcbt_2_free;
+	__uint32_t		xs_refcbt_2_moves;
+#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)
 	__uint32_t		xs_qm_dqreclaims;
 	__uint32_t		xs_qm_dqreclaim_misses;
 	__uint32_t		xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2d092f9..ade4691 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,9 @@
 #include "xfs_sysfs.h"
 #include "xfs_ondisk.h"
 #include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_reflink.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -936,6 +939,7 @@
 	struct inode		*inode)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
 
 	trace_xfs_destroy_inode(ip);
 
@@ -943,6 +947,14 @@
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 
+	if (xfs_is_reflink_inode(ip)) {
+		error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+		if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
+			xfs_warn(ip->i_mount,
+"Error %d while evicting CoW blocks for inode %llu.",
+					error, ip->i_ino);
+	}
+
 	xfs_inactive(ip);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -1006,6 +1018,16 @@
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
+	/*
+	 * If this unlinked inode is in the middle of recovery, don't
+	 * drop the inode just yet; log recovery will take care of
+	 * that.  See the comment for this inode flag.
+	 */
+	if (ip->i_flags & XFS_IRECOVERY) {
+		ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+		return 0;
+	}
+
 	return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
 }
 
@@ -1296,10 +1318,31 @@
 		xfs_restore_resvblks(mp);
 		xfs_log_work_queue(mp);
 		xfs_queue_eofblocks(mp);
+
+		/* Recover any CoW blocks that never got remapped. */
+		error = xfs_reflink_recover_cow(mp);
+		if (error) {
+			xfs_err(mp,
+	"Error %d recovering leftover CoW allocations.", error);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
+		/* Create the per-AG metadata reservation pool. */
+		error = xfs_fs_reserve_ag_blocks(mp);
+		if (error && error != -ENOSPC)
+			return error;
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/* Free the per-AG metadata reservation pool. */
+		error = xfs_fs_unreserve_ag_blocks(mp);
+		if (error) {
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
 		/*
 		 * Before we sync the metadata, we need to free up the reserve
 		 * block pool so that the used block count in the superblock on
@@ -1490,6 +1533,7 @@
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+	INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
 
 	mp->m_super = sb;
@@ -1572,6 +1616,9 @@
 			"DAX unsupported by block device. Turning off DAX.");
 			mp->m_flags &= ~XFS_MOUNT_DAX;
 		}
+		if (xfs_sb_version_hasreflink(&mp->m_sb))
+			xfs_alert(mp,
+		"DAX and reflink have not been tested together!");
 	}
 
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
@@ -1585,6 +1632,10 @@
 	"EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
 	}
 
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		xfs_alert(mp,
+	"EXPERIMENTAL reflink feature enabled. Use at your own risk!");
+
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
@@ -1788,8 +1839,38 @@
 	if (!xfs_rui_zone)
 		goto out_destroy_rud_zone;
 
+	xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item),
+			"xfs_cud_item");
+	if (!xfs_cud_zone)
+		goto out_destroy_rui_zone;
+
+	xfs_cui_zone = kmem_zone_init(
+			xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
+			"xfs_cui_item");
+	if (!xfs_cui_zone)
+		goto out_destroy_cud_zone;
+
+	xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item),
+			"xfs_bud_item");
+	if (!xfs_bud_zone)
+		goto out_destroy_cui_zone;
+
+	xfs_bui_zone = kmem_zone_init(
+			xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
+			"xfs_bui_item");
+	if (!xfs_bui_zone)
+		goto out_destroy_bud_zone;
+
 	return 0;
 
+ out_destroy_bud_zone:
+	kmem_zone_destroy(xfs_bud_zone);
+ out_destroy_cui_zone:
+	kmem_zone_destroy(xfs_cui_zone);
+ out_destroy_cud_zone:
+	kmem_zone_destroy(xfs_cud_zone);
+ out_destroy_rui_zone:
+	kmem_zone_destroy(xfs_rui_zone);
  out_destroy_rud_zone:
 	kmem_zone_destroy(xfs_rud_zone);
  out_destroy_icreate_zone:
@@ -1832,6 +1913,10 @@
 	 * destroy caches.
 	 */
 	rcu_barrier();
+	kmem_zone_destroy(xfs_bui_zone);
+	kmem_zone_destroy(xfs_bud_zone);
+	kmem_zone_destroy(xfs_cui_zone);
+	kmem_zone_destroy(xfs_cud_zone);
 	kmem_zone_destroy(xfs_rui_zone);
 	kmem_zone_destroy(xfs_rud_zone);
 	kmem_zone_destroy(xfs_icreate_zone);
@@ -1885,6 +1970,8 @@
 
 	xfs_extent_free_init_defer_op();
 	xfs_rmap_update_init_defer_op();
+	xfs_refcount_update_init_defer_op();
+	xfs_bmap_update_init_defer_op();
 
 	xfs_dir_startup();
 
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index aed74d3..afe1f66 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -184,6 +184,15 @@
 		.extra1		= &xfs_params.eofb_timer.min,
 		.extra2		= &xfs_params.eofb_timer.max,
 	},
+	{
+		.procname	= "speculative_cow_prealloc_lifetime",
+		.data		= &xfs_params.cowb_timer.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.cowb_timer.min,
+		.extra2		= &xfs_params.cowb_timer.max,
+	},
 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
 	{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index ffef453..984a349 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -48,6 +48,7 @@
 	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
 	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
+	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
 } xfs_param_t;
 
 /*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 16093c7..ad188d3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -39,6 +39,7 @@
 struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
 struct xfs_btree_cur;
+struct xfs_refcount_irec;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -135,6 +136,8 @@
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
 
 DECLARE_EVENT_CLASS(xfs_ag_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -268,10 +271,10 @@
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
-		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ?
-						ip->i_afp : &ip->i_df;
+		struct xfs_ifork	*ifp;
 		struct xfs_bmbt_irec	r;
 
+		ifp = xfs_iext_state_to_fork(ip, state);
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
@@ -686,6 +689,9 @@
 DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
 DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
@@ -2581,10 +2587,20 @@
 DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
 DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
 DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range);
 DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
 
+/* deferred bmbt updates */
+#define DEFINE_BMAP_DEFERRED_EVENT	DEFINE_RMAP_DEFERRED_EVENT
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
+
 /* per-AG reservation */
 DECLARE_EVENT_CLASS(xfs_ag_resv_class,
 	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
@@ -2639,6 +2655,728 @@
 DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
 DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
 
+/* refcount tracepoint classes */
+
+/* agbno/aglen-based traces reuse the discard trace class (DEFINE_DISCARD_EVENT) */
+#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
+
+/* ag btree lookup tracepoint class: logs agno, agbno and lookup direction */
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+	{ XFS_LOOKUP_EQ,	"eq" }, \
+	{ XFS_LOOKUP_LE,	"le" }, \
+	{ XFS_LOOKUP_GE,	"ge" }
+DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_lookup_t dir),
+	TP_ARGS(mp, agno, agbno, dir),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_lookup_t, dir)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->dir = dir;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
+		  __entry->dir)
+)
+
+#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
+DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_lookup_t dir), \
+	TP_ARGS(mp, agno, agbno, dir))
+
+/* single-rcext tracepoint class: logs one refcount record (agbno, len, refcount) */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *irec),
+	TP_ARGS(mp, agno, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startblock = irec->rc_startblock;
+		__entry->blockcount = irec->rc_blockcount;
+		__entry->refcount = irec->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *irec), \
+	TP_ARGS(mp, agno, irec))
+
+/* single-rcext and an agbno tracepoint class: one refcount record plus a separate agbno */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno),
+	TP_ARGS(mp, agno, irec, agbno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+		__field(xfs_agblock_t, agbno)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startblock = irec->rc_startblock;
+		__entry->blockcount = irec->rc_blockcount;
+		__entry->refcount = irec->rc_refcount;
+		__entry->agbno = agbno;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount,
+		  __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \
+	TP_ARGS(mp, agno, irec, agbno))
+
+/* double-rcext tracepoint class: logs two refcount records (i1, i2) in one event */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2),
+	TP_ARGS(mp, agno, i1, i2),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, i1_startblock)
+		__field(xfs_extlen_t, i1_blockcount)
+		__field(xfs_nlink_t, i1_refcount)
+		__field(xfs_agblock_t, i2_startblock)
+		__field(xfs_extlen_t, i2_blockcount)
+		__field(xfs_nlink_t, i2_refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->i1_startblock = i1->rc_startblock;
+		__entry->i1_blockcount = i1->rc_blockcount;
+		__entry->i1_refcount = i1->rc_refcount;
+		__entry->i2_startblock = i2->rc_startblock;
+		__entry->i2_blockcount = i2->rc_blockcount;
+		__entry->i2_refcount = i2->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->i1_startblock,
+		  __entry->i1_blockcount,
+		  __entry->i1_refcount,
+		  __entry->i2_startblock,
+		  __entry->i2_blockcount,
+		  __entry->i2_refcount)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \
+	TP_ARGS(mp, agno, i1, i2))
+
+/* double-rcext and an agbno tracepoint class: two refcount records plus a separate agbno */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+		 xfs_agblock_t agbno),
+	TP_ARGS(mp, agno, i1, i2, agbno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, i1_startblock)
+		__field(xfs_extlen_t, i1_blockcount)
+		__field(xfs_nlink_t, i1_refcount)
+		__field(xfs_agblock_t, i2_startblock)
+		__field(xfs_extlen_t, i2_blockcount)
+		__field(xfs_nlink_t, i2_refcount)
+		__field(xfs_agblock_t, agbno)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->i1_startblock = i1->rc_startblock;
+		__entry->i1_blockcount = i1->rc_blockcount;
+		__entry->i1_refcount = i1->rc_refcount;
+		__entry->i2_startblock = i2->rc_startblock;
+		__entry->i2_blockcount = i2->rc_blockcount;
+		__entry->i2_refcount = i2->rc_refcount;
+		__entry->agbno = agbno;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u @ agbno %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->i1_startblock,
+		  __entry->i1_blockcount,
+		  __entry->i1_refcount,
+		  __entry->i2_startblock,
+		  __entry->i2_blockcount,
+		  __entry->i2_refcount,
+		  __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+		 xfs_agblock_t agbno), \
+	TP_ARGS(mp, agno, i1, i2, agbno))
+
+/* triple-rcext tracepoint class: logs three refcount records (i1, i2, i3) in one event */
+DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+		 struct xfs_refcount_irec *i3),
+	TP_ARGS(mp, agno, i1, i2, i3),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, i1_startblock)
+		__field(xfs_extlen_t, i1_blockcount)
+		__field(xfs_nlink_t, i1_refcount)
+		__field(xfs_agblock_t, i2_startblock)
+		__field(xfs_extlen_t, i2_blockcount)
+		__field(xfs_nlink_t, i2_refcount)
+		__field(xfs_agblock_t, i3_startblock)
+		__field(xfs_extlen_t, i3_blockcount)
+		__field(xfs_nlink_t, i3_refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->i1_startblock = i1->rc_startblock;
+		__entry->i1_blockcount = i1->rc_blockcount;
+		__entry->i1_refcount = i1->rc_refcount;
+		__entry->i2_startblock = i2->rc_startblock;
+		__entry->i2_blockcount = i2->rc_blockcount;
+		__entry->i2_refcount = i2->rc_refcount;
+		__entry->i3_startblock = i3->rc_startblock;
+		__entry->i3_blockcount = i3->rc_blockcount;
+		__entry->i3_refcount = i3->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->i1_startblock,
+		  __entry->i1_blockcount,
+		  __entry->i1_refcount,
+		  __entry->i2_startblock,
+		  __entry->i2_blockcount,
+		  __entry->i2_refcount,
+		  __entry->i3_startblock,
+		  __entry->i3_blockcount,
+		  __entry->i3_refcount)
+);
+
+#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+		 struct xfs_refcount_irec *i3), \
+	TP_ARGS(mp, agno, i1, i2, i3))
+
+/* refcount btree tracepoints: block alloc/free, lookup, and per-record operations */
+DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
+DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
+
+/* refcount adjustment tracepoints: each operation event has a matching *_error event */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
+DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
+DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+
+/* reflink helpers: find-shared events, then deferred refcount update events */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
+/* logs the update type, an extent (agbno, len) and a second extent (new_agbno, new_len) */
+TRACE_EVENT(xfs_refcount_finish_one_leftover,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 int type, xfs_agblock_t agbno, xfs_extlen_t len,
+		 xfs_agblock_t new_agbno, xfs_extlen_t new_len),
+	TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, type)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(xfs_agblock_t, new_agbno)
+		__field(xfs_extlen_t, new_len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->type = type;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->new_agbno = new_agbno;
+		__entry->new_len = new_len;
+	),
+	TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->new_agbno,
+		  __entry->new_len)
+);
+
+/* simple inode-based error/%ip tracepoint class: logs ino, error code and caller address */
+DECLARE_EVENT_CLASS(xfs_inode_error_class,
+	TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip),
+	TP_ARGS(ip, error, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, error)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->error = error;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino 0x%llx error %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->error,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_INODE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_inode_error_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int error, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip, error, caller_ip))
+
+/* reflink allocator: logs the inode plus the fsbno and length passed in, all in hex */
+TRACE_EVENT(xfs_bmap_remap_alloc,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
+		 xfs_extlen_t len),
+	TP_ARGS(ip, fsbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsblock_t, fsbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->fsbno = fsbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->fsbno,
+		  __entry->len)
+);
+DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
+
+/* reflink tracepoint classes */
+
+/* two-file io tracepoint class: logs ino, VFS i_size, i_d.di_size and offset for src and dest */
+DECLARE_EVENT_CLASS(xfs_double_io_class,
+	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len,
+		 struct xfs_inode *dest, xfs_off_t doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_ino)
+		__field(loff_t, src_isize)
+		__field(loff_t, src_disize)
+		__field(loff_t, src_offset)
+		__field(size_t, len)	/* NOTE(review): len arrives as xfs_off_t; size_t may be narrower on 32-bit -- confirm */
+		__field(xfs_ino_t, dest_ino)
+		__field(loff_t, dest_isize)
+		__field(loff_t, dest_disize)
+		__field(loff_t, dest_offset)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src)->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = VFS_I(src)->i_size;
+		__entry->src_disize = src->i_d.di_size;
+		__entry->src_offset = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = VFS_I(dest)->i_size;
+		__entry->dest_disize = dest->i_d.di_size;
+		__entry->dest_offset = doffset;
+	),
+	TP_printk("dev %d:%d count %zd "
+		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> "
+		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->src_disize,
+		  __entry->src_offset,
+		  __entry->dest_ino,
+		  __entry->dest_isize,
+		  __entry->dest_disize,
+		  __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_double_io_class, name,	\
+	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \
+		 struct xfs_inode *dest, xfs_off_t doffset), \
+	TP_ARGS(src, soffset, len, dest, doffset))
+
+/* two-file vfs io tracepoint class: takes VFS inodes and reads sizes with i_size_read() */
+DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
+	TP_PROTO(struct inode *src, u64 soffset, u64 len,
+		 struct inode *dest, u64 doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, src_ino)
+		__field(loff_t, src_isize)
+		__field(loff_t, src_offset)
+		__field(size_t, len)	/* NOTE(review): len arrives as u64; size_t may be narrower on 32-bit -- confirm */
+		__field(unsigned long, dest_ino)
+		__field(loff_t, dest_isize)
+		__field(loff_t, dest_offset)
+	),
+	TP_fast_assign(
+		__entry->dev = src->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = i_size_read(src);
+		__entry->src_offset = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = i_size_read(dest);
+		__entry->dest_offset = doffset;
+	),
+	TP_printk("dev %d:%d count %zd "
+		  "ino 0x%lx isize 0x%llx offset 0x%llx -> "
+		  "ino 0x%lx isize 0x%llx offset 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->src_offset,
+		  __entry->dest_ino,
+		  __entry->dest_isize,
+		  __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_VFS_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_double_vfs_io_class, name,	\
+	TP_PROTO(struct inode *src, u64 soffset, u64 len, \
+		 struct inode *dest, u64 doffset), \
+	TP_ARGS(src, soffset, len, dest, doffset))
+
+/* CoW write tracepoint class: logs ino, lblk, pblk, len (hex) and new_pblk (decimal) */
+DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
+	TP_ARGS(ip, lblk, pblk, len, new_pblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_fsblock_t, pblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, new_pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->pblk = pblk;
+		__entry->len = len;
+		__entry->new_pblk = new_pblk;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
+		  "len 0x%x new_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->pblk,
+		  __entry->len,
+		  __entry->new_pblk)
+)
+
+#define DEFINE_COW_EVENT(name)	\
+DEFINE_EVENT(xfs_copy_on_write_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk), \
+	TP_ARGS(ip, lblk, pblk, len, new_pblk))
+
+/* inode/irec events: log an inode with the br_startoff, br_blockcount and br_startblock of an irec */
+DECLARE_EVENT_CLASS(xfs_inode_irec_class,
+	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = irec->br_startoff;
+		__entry->len = irec->br_blockcount;
+		__entry->pblk = irec->br_startblock;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len,
+		  __entry->pblk)
+);
+#define DEFINE_INODE_IREC_EVENT(name) \
+DEFINE_EVENT(xfs_inode_irec_class, name, \
+	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, irec))
+
+/* refcount/reflink tracepoint definitions */
+
+/* reflink tracepoints: class-based events plus three one-off TRACE_EVENTs with their own layouts */
+DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
+DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
+DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
+DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+	TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
+		 xfs_filblks_t len, struct xfs_inode *dest,
+		 xfs_fileoff_t doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_ino)
+		__field(xfs_fileoff_t, src_lblk)
+		__field(xfs_filblks_t, len)
+		__field(xfs_ino_t, dest_ino)
+		__field(xfs_fileoff_t, dest_lblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src)->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_lblk = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_lblk = doffset;
+	),
+	TP_printk("dev %d:%d len 0x%llx "
+		  "ino 0x%llx offset 0x%llx blocks -> "
+		  "ino 0x%llx offset 0x%llx blocks",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_lblk,
+		  __entry->dest_ino,
+		  __entry->dest_lblk)
+);
+TRACE_EVENT(xfs_reflink_punch_range,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+		 xfs_extlen_t len),
+	TP_ARGS(ip, lblk, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len)
+);
+TRACE_EVENT(xfs_reflink_remap,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
+	TP_ARGS(ip, lblk, len, new_pblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, new_pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->len = len;
+		__entry->new_pblk = new_pblk;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len,
+		  __entry->new_pblk)
+);
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+
+/* dedupe tracepoints: extent comparison and its error event */
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
+
+/* ioctl tracepoints: these take VFS inodes (xfs_double_vfs_io_class) */
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
+TRACE_EVENT(xfs_ioctl_clone,	/* logs ino and i_size_read() size of both VFS inodes */
+	TP_PROTO(struct inode *src, struct inode *dest),
+	TP_ARGS(src, dest),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, src_ino)
+		__field(loff_t, src_isize)
+		__field(unsigned long, dest_ino)
+		__field(loff_t, dest_isize)
+	),
+	TP_fast_assign(
+		__entry->dev = src->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = i_size_read(src);
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = i_size_read(dest);
+	),
+	TP_printk("dev %d:%d "
+		  "ino 0x%lx isize 0x%llx -> "
+		  "ino 0x%lx isize 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->dest_ino,
+		  __entry->dest_isize)
+);
+
+/* unshare tracepoints: io/page events followed by their *_error events */
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
+DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
+
+/* copy on write: irec, range, io, fork-buf and cancel events with their *_error events */
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
+
+DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range);
+DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
+
+DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
+DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
+
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece);
+
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
+
+DEFINE_COW_EVENT(xfs_reflink_fork_buf);
+DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
+
+DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
+
+/* rmap swapext tracepoints: irec events and one error event */
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
+DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e2bf86a..61b7fbd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -36,6 +36,11 @@
 struct xfs_rud_log_item;
 struct xfs_rui_log_item;
 struct xfs_btree_cur;
+struct xfs_cui_log_item;
+struct xfs_cud_log_item;
+struct xfs_defer_ops;
+struct xfs_bui_log_item;
+struct xfs_bud_log_item;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -248,4 +253,28 @@
 		xfs_fsblock_t startblock, xfs_filblks_t blockcount,
 		xfs_exntst_t state, struct xfs_btree_cur **pcur);
 
+/* refcount updates: defer-op registration, CUD allocation, and finishing one update */
+enum xfs_refcount_intent_type;
+
+void xfs_refcount_update_init_defer_op(void);
+struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
+		struct xfs_cui_log_item *cuip);
+int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
+		struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops,
+		enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
+		xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
+		xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+
+/* mapping (bmap) updates: defer-op registration, BUD allocation, and finishing one update */
+enum xfs_bmap_intent_type;
+
+void xfs_bmap_update_init_defer_op(void);
+struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
+		struct xfs_bui_log_item *buip);
+int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
+		struct xfs_bud_log_item *budp, struct xfs_defer_ops *dfops,
+		enum xfs_bmap_intent_type type, struct xfs_inode *ip,
+		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+		xfs_filblks_t blockcount, xfs_exntst_t state);
+
 #endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
new file mode 100644
index 0000000..6408e7d
--- /dev/null
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * Allocate a "bmap update done" (BUD) log item for the intent @buip
+ * via xfs_bud_init() and add it to transaction @tp; returns the BUD.
+ */
+struct xfs_bud_log_item *
+xfs_trans_get_bud(
+	struct xfs_trans		*tp,
+	struct xfs_bui_log_item		*buip)
+{
+	struct xfs_bud_log_item		*budp;
+
+	budp = xfs_bud_init(tp->t_mountp, buip);
+	xfs_trans_add_item(tp, &budp->bud_item);
+	return budp;
+}
+
+/*
+ * Finish a bmap update via xfs_bmap_finish_one() and log it to the BUD.
+ * The transaction and the BUD are marked dirty whether or not the update
+ * succeeds, to support the BUI/BUD lifecycle rules; returns the error code.
+ */
+int
+xfs_trans_log_finish_bmap_update(
+	struct xfs_trans		*tp,
+	struct xfs_bud_log_item		*budp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state)
+{
+	int				error;
+
+	error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff,
+			startblock, blockcount, state);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the BUI and frees the BUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/* Sort bmap intents by owner inode number; returns <0, 0 or >0. */
+static int
+xfs_bmap_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	xfs_ino_t			ia, ib;
+
+	ia = container_of(a, struct xfs_bmap_intent, bi_list)->bi_owner->i_ino;
+	ib = container_of(b, struct xfs_bmap_intent, bi_list)->bi_owner->i_ino;
+	/* Compare, don't subtract: a 64-bit delta truncated to int can flip sign. */
+	return (ia > ib) - (ia < ib);
+}
+
+/* Allocate a BUI via xfs_bui_init() and add it to the transaction; returns the BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip;
+
+	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+	ASSERT(tp != NULL);
+
+	buip = xfs_bui_init(tp->t_mountp);
+	ASSERT(buip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &buip->bui_item);
+	return buip;
+}
+
+/* Set me_flags from the intent type, then OR in the unwritten and attr-fork bits. */
+static void
+xfs_trans_set_bmap_flags(
+	struct xfs_map_extent		*bmap,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_exntst_t			state)
+{
+	bmap->me_flags = 0;
+	switch (type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		bmap->me_flags = type;
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (state == XFS_EXT_UNWRITTEN)
+		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+	if (whichfork == XFS_ATTR_FORK)
+		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Copy one deferred bmap intent into the next free extent slot of the BUI. */
+STATIC void
+xfs_bmap_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_bui_log_item		*buip = intent;
+	struct xfs_bmap_intent		*bmap;
+	uint				next_extent;
+	struct xfs_map_extent		*map;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+	ASSERT(next_extent < buip->bui_format.bui_nextents);
+	map = &buip->bui_format.bui_extents[next_extent];
+	map->me_owner = bmap->bi_owner->i_ino;
+	map->me_startblock = bmap->bi_bmap.br_startblock;
+	map->me_startoff = bmap->bi_bmap.br_startoff;
+	map->me_len = bmap->bi_bmap.br_blockcount;
+	xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+			bmap->bi_bmap.br_state);
+}
+
+/* Get a BUD so we can process all the deferred bmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_bud(tp, intent);
+}
+
+/* Process a deferred bmap update; the intent is freed whether or not it succeeds. */
+STATIC int
+xfs_bmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_bmap_intent		*bmap;
+	int				error;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+			bmap->bi_type,
+			bmap->bi_owner, bmap->bi_whichfork,
+			bmap->bi_bmap.br_startoff,
+			bmap->bi_bmap.br_startblock,
+			bmap->bi_bmap.br_blockcount,
+			bmap->bi_bmap.br_state);
+	kmem_free(bmap);
+	return error;
+}
+
+/* Abort a pending BUI by releasing it with xfs_bui_release(). */
+STATIC void
+xfs_bmap_update_abort_intent(
+	void				*intent)
+{
+	xfs_bui_release(intent);
+}
+
+/* Cancel a deferred bmap update by freeing its intent. */
+STATIC void
+xfs_bmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_bmap_intent		*bmap;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	kmem_free(bmap);
+}
+
+static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_BMAP,
+	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,	/* create_intent asserts count equals this */
+	.diff_items	= xfs_bmap_update_diff_items,	/* orders intents by owner inode number */
+	.create_intent	= xfs_bmap_update_create_intent,
+	.abort_intent	= xfs_bmap_update_abort_intent,
+	.log_item	= xfs_bmap_update_log_item,
+	.create_done	= xfs_bmap_update_create_done,
+	.finish_item	= xfs_bmap_update_finish_item,	/* frees the intent */
+	.cancel_item	= xfs_bmap_update_cancel_item,	/* frees the intent */
+};
+
+/* Register the bmap update deferred op type with xfs_defer_init_op_type(). */
+void
+xfs_bmap_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
new file mode 100644
index 0000000..94c1877
--- /dev/null
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_refcount_item.h"
+#include "xfs_alloc.h"
+#include "xfs_refcount.h"
+
+/*
+ * This routine is called to allocate a "refcount update done"
+ * log item.
+ */
+struct xfs_cud_log_item *
+xfs_trans_get_cud(
+	struct xfs_trans		*tp,
+	struct xfs_cui_log_item		*cuip)
+{
+	struct xfs_cud_log_item		*cudp;
+
+	cudp = xfs_cud_init(tp->t_mountp, cuip);
+	xfs_trans_add_item(tp, &cudp->cud_item);
+	return cudp;
+}
+
+/*
+ * Finish a refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_refcount_update(
+	struct xfs_trans		*tp,
+	struct xfs_cud_log_item		*cudp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	int				error;
+
+	error = xfs_refcount_finish_one(tp, dop, type, startblock,
+			blockcount, new_fsb, new_len, pcur);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the CUI and frees the CUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_refcount_intent	*ra;
+	struct xfs_refcount_intent	*rb;
+
+	ra = container_of(a, struct xfs_refcount_intent, ri_list);
+	rb = container_of(b, struct xfs_refcount_intent, ri_list);
+	return  XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Get a CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_cui_log_item		*cuip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	cuip = xfs_cui_init(tp->t_mountp, count);
+	ASSERT(cuip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &cuip->cui_item);
+	return cuip;
+}
+
+/* Set the phys extent flags for this refcount update. */
+static void
+xfs_trans_set_refcount_flags(
+	struct xfs_phys_extent		*refc,
+	enum xfs_refcount_intent_type	type)
+{
+	refc->pe_flags = 0;
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+	case XFS_REFCOUNT_DECREASE:
+	case XFS_REFCOUNT_ALLOC_COW:
+	case XFS_REFCOUNT_FREE_COW:
+		refc->pe_flags |= type;
+		break;
+	default:
+		ASSERT(0);
+	}
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_cui_log_item		*cuip = intent;
+	struct xfs_refcount_intent	*refc;
+	uint				next_extent;
+	struct xfs_phys_extent		*ext;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+	ASSERT(next_extent < cuip->cui_format.cui_nextents);
+	ext = &cuip->cui_format.cui_extents[next_extent];
+	ext->pe_startblock = refc->ri_startblock;
+	ext->pe_len = refc->ri_blockcount;
+	xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
+/* Get a CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_cud(tp, intent);
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_refcount_intent	*refc;
+	xfs_fsblock_t			new_fsb;
+	xfs_extlen_t			new_aglen;
+	int				error;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	error = xfs_trans_log_finish_refcount_update(tp, done_item, dop,
+			refc->ri_type,
+			refc->ri_startblock,
+			refc->ri_blockcount,
+			&new_fsb, &new_aglen,
+			(struct xfs_btree_cur **)state);
+	/* Did we run out of reservation?  Requeue what we didn't finish. */
+	if (!error && new_aglen > 0) {
+		ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+		       refc->ri_type == XFS_REFCOUNT_DECREASE);
+		refc->ri_startblock = new_fsb;
+		refc->ri_blockcount = new_aglen;
+		return -EAGAIN;
+	}
+	kmem_free(refc);
+	return error;
+}
+
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+	struct xfs_trans	*tp,
+	void			*state,
+	int			error)
+{
+	struct xfs_btree_cur	*rcur = state;
+
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+	void				*intent)
+{
+	xfs_cui_release(intent);
+}
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*refc;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	kmem_free(refc);
+}
+
+static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_REFCOUNT,
+	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_refcount_update_diff_items,
+	.create_intent	= xfs_refcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.log_item	= xfs_refcount_update_log_item,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_refcount_update_finish_item,
+	.finish_cleanup = xfs_refcount_update_finish_cleanup,
+	.cancel_item	= xfs_refcount_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_refcount_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 5a50ef8..9ead064 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -48,12 +48,21 @@
 	case XFS_RMAP_MAP:
 		rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
 		break;
+	case XFS_RMAP_MAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+		break;
 	case XFS_RMAP_UNMAP:
 		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
 		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+		break;
 	case XFS_RMAP_CONVERT:
 		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
 		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+		break;
 	case XFS_RMAP_ALLOC:
 		rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
 		break;
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index 6df9b07..cc6bb31 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -69,10 +69,6 @@
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
-
 /*
  * architectures with an MMU should override these two
  */
diff --git a/include/dt-bindings/thermal/tegra124-soctherm.h b/include/dt-bindings/thermal/tegra124-soctherm.h
index 729ab9f..2a99f1d 100644
--- a/include/dt-bindings/thermal/tegra124-soctherm.h
+++ b/include/dt-bindings/thermal/tegra124-soctherm.h
@@ -11,4 +11,9 @@
 #define TEGRA124_SOCTHERM_SENSOR_PLLX 3
 #define TEGRA124_SOCTHERM_SENSOR_NUM 4
 
+#define TEGRA_SOCTHERM_THROT_LEVEL_LOW  0
+#define TEGRA_SOCTHERM_THROT_LEVEL_MED  1
+#define TEGRA_SOCTHERM_THROT_LEVEL_HIGH 2
+#define TEGRA_SOCTHERM_THROT_LEVEL_NONE -1
+
 #endif
diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
index e82e3ee..1035879 100644
--- a/include/linux/amba/clcd.h
+++ b/include/linux/amba/clcd.h
@@ -67,6 +67,17 @@
 #define CNTL_LDMAFIFOTIME	(1 << 15)
 #define CNTL_WATERMARK		(1 << 16)
 
+/* ST Microelectronics variant bits */
+#define CNTL_ST_1XBPP_444	0x0
+#define CNTL_ST_1XBPP_5551	(1 << 17)
+#define CNTL_ST_1XBPP_565	(1 << 18)
+#define CNTL_ST_CDWID_12	0x0
+#define CNTL_ST_CDWID_16	(1 << 19)
+#define CNTL_ST_CDWID_18	(1 << 20)
+#define CNTL_ST_CDWID_24	((1 << 19)|(1 << 20))
+#define CNTL_ST_CEAEN		(1 << 21)
+#define CNTL_ST_LCDBPP24_PACKED	(6 << 1)
+
 enum {
 	/* individual formats */
 	CLCD_CAP_RGB444		= (1 << 0),
@@ -93,6 +104,8 @@
 	CLCD_CAP_ALL		= CLCD_CAP_BGR | CLCD_CAP_RGB,
 };
 
+struct backlight_device;
+
 struct clcd_panel {
 	struct fb_videomode	mode;
 	signed short		width;	/* width in mm */
@@ -105,6 +118,13 @@
 				fixedtimings:1,
 				grayscale:1;
 	unsigned int		connector;
+	struct backlight_device	*backlight;
+	/*
+	 * If the B/R lines are switched between the CLCD
+	 * and the panel we need to know this and not try to
+	 * compensate with the BGR bit in the control register.
+	 */
+	bool			bgr_connection;
 };
 
 struct clcd_regs {
@@ -170,11 +190,38 @@
 struct amba_device;
 struct clk;
 
+/**
+ * struct clcd_vendor_data - holds hardware (IP-block) vendor-specific
+ * variant information
+ *
+ * @clock_timregs: the CLCD needs to be clocked when accessing the
+ * timer registers, or the hardware will hang.
+ * @packed_24_bit_pixels: this variant supports 24bit packed pixel data,
+ * so that RGB accesses 3 bytes at a time, not just on even 32bit
+ * boundaries, packing the pixel data in memory. ST Microelectronics
+ * have this.
+ * @st_bitmux_control: ST Microelectronics have implemented output
+ * bit line multiplexing into the CLCD control register. This indicates
+ * that we need to use this.
+ * @init_board: custom board init function for this variant
+ * @init_panel: custom panel init function for this variant
+ */
+struct clcd_vendor_data {
+	bool	clock_timregs;
+	bool	packed_24_bit_pixels;
+	bool	st_bitmux_control;
+	int	(*init_board)(struct amba_device *adev,
+			      struct clcd_board *board);
+	int	(*init_panel)(struct clcd_fb *fb,
+			      struct device_node *panel);
+};
+
 /* this data structure describes each frame buffer device we find */
 struct clcd_fb {
 	struct fb_info		fb;
 	struct amba_device	*dev;
 	struct clk		*clk;
+	struct clcd_vendor_data	*vendor;
 	struct clcd_panel	*panel;
 	struct clcd_board	*board;
 	void			*board_data;
@@ -231,16 +278,22 @@
 	if (var->grayscale)
 		val |= CNTL_LCDBW;
 
-	if (fb->panel->caps && fb->board->caps &&
-	    var->bits_per_pixel >= 16) {
+	if (fb->panel->caps && fb->board->caps && var->bits_per_pixel >= 16) {
 		/*
 		 * if board and panel supply capabilities, we can support
-		 * changing BGR/RGB depending on supplied parameters
+		 * changing BGR/RGB depending on supplied parameters. Here
+		 * we switch to what the framebuffer is providing if need
+		 * be, so if the framebuffer is BGR but the display connection
+		 * is RGB (first case) we switch it around. Vice versa mutatis
+		 * mutandis if the framebuffer is RGB but the display connection
+		 * is BGR, we flip it around.
 		 */
 		if (var->red.offset == 0)
 			val &= ~CNTL_BGR;
 		else
 			val |= CNTL_BGR;
+		if (fb->panel->bgr_connection)
+			val ^= CNTL_BGR;
 	}
 
 	switch (var->bits_per_pixel) {
@@ -270,6 +323,10 @@
 		else
 			val |= CNTL_LCDBPP16_444;
 		break;
+	case 24:
+		/* Modified variant supporting 24 bit packed pixels */
+		val |= CNTL_ST_LCDBPP24_PACKED;
+		break;
 	case 32:
 		val |= CNTL_LCDBPP24;
 		break;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index b03c062..5ab958c 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -157,12 +157,13 @@
  *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
  *    fragment (@fh, @fh_len). It should return a &struct dentry which refers
  *    to the same file that the file handle fragment refers to.  If it cannot,
- *    it should return a %NULL pointer if the file was found but no acceptable
- *    &dentries were available, or an %ERR_PTR error code indicating why it
- *    couldn't be found (e.g. %ENOENT or %ENOMEM).  Any suitable dentry can be
- *    returned including, if necessary, a new dentry created with d_alloc_root.
- *    The caller can then find any other extant dentries by following the
- *    d_alias links.
+ *    it should return a %NULL pointer if the file cannot be found, or an
+ *    %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred.
+ *    Any other error code is treated like %NULL, and will cause an %ESTALE error
+ *    for callers of exportfs_decode_fh().
+ *    Any suitable dentry can be returned including, if necessary, a new dentry
+ *    created with d_alloc_root.  The caller can then find any other extant
+ *    dentries by following the d_alias links.
  *
  * fh_to_parent:
  *    Same as @fh_to_dentry, except that it returns a pointer to the parent
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 9961110..7494dc6 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -25,6 +25,7 @@
 					 FALLOC_FL_PUNCH_HOLE |		\
 					 FALLOC_FL_COLLAPSE_RANGE |	\
 					 FALLOC_FL_ZERO_RANGE |		\
-					 FALLOC_FL_INSERT_RANGE)
+					 FALLOC_FL_INSERT_RANGE |	\
+					 FALLOC_FL_UNSHARE_RANGE)
 
 #endif /* _FALLOC_H_ */
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index c6564ad..9094faf 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -67,6 +67,7 @@
 		NFS4_DELEGATION_STATEID_TYPE,
 		NFS4_LAYOUT_STATEID_TYPE,
 		NFS4_PNFS_DS_STATEID_TYPE,
+		NFS4_REVOKED_STATEID_TYPE,
 	} type;
 };
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 14a762d..b34097c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -103,6 +103,9 @@
 #define NFS_SP4_MACH_CRED_WRITE    5	/* WRITE */
 #define NFS_SP4_MACH_CRED_COMMIT   6	/* COMMIT */
 #define NFS_SP4_MACH_CRED_PNFS_CLEANUP  7 /* LAYOUTRETURN */
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	wait_queue_head_t	cl_lock_waitq;
+#endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4 */
 
 	/* Our own IP address, as a null-terminated string.
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7cc0dee..beb1e10 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -125,6 +125,11 @@
 		| NFS_ATTR_FATTR_V4_SECURITY_LABEL)
 
 /*
+ * Maximal number of supported layout drivers.
+ */
+#define NFS_MAX_LAYOUT_TYPES 8
+
+/*
  * Info on the file system
  */
 struct nfs_fsinfo {
@@ -139,7 +144,8 @@
 	__u64			maxfilesize;
 	struct timespec		time_delta; /* server time granularity */
 	__u32			lease_time; /* in seconds */
-	__u32			layouttype; /* supported pnfs layout driver */
+	__u32			nlayouttypes; /* number of layouttypes */
+	__u32			layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */
 	__u32			blksize; /* preferred pnfs io block size */
 	__u32			clone_blksize; /* granularity of a CLONE operation */
 };
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index f1bbae0..2c6c511 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -641,6 +641,7 @@
 #ifdef CONFIG_PWM_SYSFS
 void pwmchip_sysfs_export(struct pwm_chip *chip);
 void pwmchip_sysfs_unexport(struct pwm_chip *chip);
+void pwmchip_sysfs_unexport_children(struct pwm_chip *chip);
 #else
 static inline void pwmchip_sysfs_export(struct pwm_chip *chip)
 {
@@ -649,6 +650,10 @@
 static inline void pwmchip_sysfs_unexport(struct pwm_chip *chip)
 {
 }
+
+static inline void pwmchip_sysfs_unexport_children(struct pwm_chip *chip)
+{
+}
 #endif /* CONFIG_PWM_SYSFS */
 
 #endif /* __LINUX_PWM_H */
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 4ccf184..b1bc62b 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -131,6 +131,7 @@
 	struct rpc_auth *	(*create)(struct rpc_auth_create_args *, struct rpc_clnt *);
 	void			(*destroy)(struct rpc_auth *);
 
+	int			(*hash_cred)(struct auth_cred *, unsigned int);
 	struct rpc_cred *	(*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
 	struct rpc_cred *	(*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
 	int			(*list_pseudoflavors)(rpc_authflavor_t *, int);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 5c02b06..85cc819 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -125,6 +125,13 @@
 	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
 };
 
+struct rpc_add_xprt_test {
+	int (*add_xprt_test)(struct rpc_clnt *,
+		struct rpc_xprt *,
+		void *calldata);
+	void *data;
+};
+
 /* Values for "flags" field */
 #define RPC_CLNT_CREATE_HARDRTRY	(1UL << 0)
 #define RPC_CLNT_CREATE_AUTOBIND	(1UL << 2)
@@ -198,6 +205,16 @@
 void		rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
 			unsigned long timeo);
 
+int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
+			struct rpc_xprt_switch *,
+			struct rpc_xprt *,
+			void *);
+
 const char *rpc_proc_name(const struct rpc_task *task);
+
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+			const struct sockaddr *sap);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
index 3b1ff38..cfda6adc 100644
--- a/include/linux/sunrpc/rpc_rdma.h
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -41,10 +41,15 @@
 #define _LINUX_SUNRPC_RPC_RDMA_H
 
 #include <linux/types.h>
+#include <linux/bitops.h>
 
 #define RPCRDMA_VERSION		1
 #define rpcrdma_version		cpu_to_be32(RPCRDMA_VERSION)
 
+enum {
+	RPCRDMA_V1_DEF_INLINE_SIZE	= 1024,
+};
+
 struct rpcrdma_segment {
 	__be32 rs_handle;	/* Registered memory handle */
 	__be32 rs_length;	/* Length of the chunk in bytes */
@@ -129,4 +134,38 @@
 #define rdma_done	cpu_to_be32(RDMA_DONE)
 #define rdma_error	cpu_to_be32(RDMA_ERROR)
 
+/*
+ * Private extension to RPC-over-RDMA Version One.
+ * Message passed during RDMA-CM connection set-up.
+ *
+ * Add new fields at the end, and don't permute existing
+ * fields.
+ */
+struct rpcrdma_connect_private {
+	__be32			cp_magic;
+	u8			cp_version;
+	u8			cp_flags;
+	u8			cp_send_size;
+	u8			cp_recv_size;
+} __packed;
+
+#define rpcrdma_cmp_magic	__cpu_to_be32(0xf6ab0e18)
+
+enum {
+	RPCRDMA_CMP_VERSION		= 1,
+	RPCRDMA_CMP_F_SND_W_INV_OK	= BIT(0),
+};
+
+static inline u8
+rpcrdma_encode_buffer_size(unsigned int size)
+{
+	return (size >> 10) - 1;
+}
+
+static inline unsigned int
+rpcrdma_decode_buffer_size(u8 val)
+{
+	return ((unsigned int)val + 1) << 10;
+}
+
 #endif				/* _LINUX_SUNRPC_RPC_RDMA_H */
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 817af0b..7ba040c 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -239,8 +239,8 @@
 					void *);
 void		rpc_wake_up_status(struct rpc_wait_queue *, int);
 void		rpc_delay(struct rpc_task *, unsigned long);
-void *		rpc_malloc(struct rpc_task *, size_t);
-void		rpc_free(void *);
+int		rpc_malloc(struct rpc_task *);
+void		rpc_free(struct rpc_task *);
 int		rpciod_up(void);
 void		rpciod_down(void);
 int		__rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index d6917b8..cc3ae16 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -86,6 +86,7 @@
 	unsigned long flags;
 	enum dma_data_direction direction;
 	int count;
+	unsigned int mapped_sges;
 	struct ib_sge sge[RPCSVC_MAXPAGES];
 	struct page *pages[RPCSVC_MAXPAGES];
 };
@@ -136,6 +137,7 @@
 	int		     sc_ord;		/* RDMA read limit */
 	int                  sc_max_sge;
 	int                  sc_max_sge_rd;	/* max sge for read target */
+	bool		     sc_snd_w_inv;	/* OK to use Send With Invalidate */
 
 	atomic_t             sc_sq_count;	/* Number of SQ WR on queue */
 	unsigned int	     sc_sq_depth;	/* Depth of SQ */
@@ -193,6 +195,14 @@
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/* Track DMA maps for this transport and context */
+static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
+					   struct svc_rdma_op_ctxt *ctxt)
+{
+	ctxt->mapped_sges++;
+	atomic_inc(&rdma->sc_dma_used);
+}
+
 /* svc_rdma_backchannel.c */
 extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
 				    struct rpcrdma_msg *rmsgp,
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 70c6b92..56c48c8 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -67,6 +67,18 @@
 			len;		/* Length of XDR encoded message */
 };
 
+static inline void
+xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
+{
+	buf->head[0].iov_base = start;
+	buf->head[0].iov_len = len;
+	buf->tail[0].iov_len = 0;
+	buf->page_len = 0;
+	buf->flags = 0;
+	buf->len = 0;
+	buf->buflen = len;
+}
+
 /*
  * pre-xdr'ed macros.
  */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a16070d..a5da60b 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -83,9 +83,11 @@
 	void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
 	struct list_head	rq_list;
 
-	__u32 *			rq_buffer;	/* XDR encode buffer */
-	size_t			rq_callsize,
-				rq_rcvsize;
+	void			*rq_xprtdata;	/* Per-xprt private data */
+	void			*rq_buffer;	/* Call XDR encode buffer */
+	size_t			rq_callsize;
+	void			*rq_rbuffer;	/* Reply XDR decode buffer */
+	size_t			rq_rcvsize;
 	size_t			rq_xmit_bytes_sent;	/* total bytes sent */
 	size_t			rq_reply_bytes_recvd;	/* total reply bytes */
 							/* received */
@@ -127,8 +129,8 @@
 	void		(*rpcbind)(struct rpc_task *task);
 	void		(*set_port)(struct rpc_xprt *xprt, unsigned short port);
 	void		(*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
-	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
-	void		(*buf_free)(void *buffer);
+	int		(*buf_alloc)(struct rpc_task *task);
+	void		(*buf_free)(struct rpc_task *task);
 	int		(*send_request)(struct rpc_task *task);
 	void		(*set_retrans_timeout)(struct rpc_task *task);
 	void		(*timer)(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index 5a9acff..507418c 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -66,4 +66,6 @@
 extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi);
 extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi);
 
+extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+		const struct sockaddr *sap);
 #endif
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 39267dc..221b7a2 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -53,8 +53,8 @@
 #define RPCRDMA_MAX_SLOT_TABLE	(256U)
 
 #define RPCRDMA_MIN_INLINE  (1024)	/* min inline thresh */
-#define RPCRDMA_DEF_INLINE  (1024)	/* default inline thresh */
-#define RPCRDMA_MAX_INLINE  (3068)	/* max inline thresh */
+#define RPCRDMA_DEF_INLINE  (4096)	/* default inline thresh */
+#define RPCRDMA_MAX_INLINE  (65536)	/* max inline thresh */
 
 /* Memory registration strategies, by number.
  * This is part of a kernel / user space API. Do not remove. */
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index ee517be..511182a 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -92,12 +92,24 @@
 	THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */
 };
 
+/* Thermal notification reason */
+enum thermal_notify_event {
+	THERMAL_EVENT_UNSPECIFIED, /* Unspecified event */
+	THERMAL_EVENT_TEMP_SAMPLE, /* New Temperature sample */
+	THERMAL_TRIP_VIOLATED, /* TRIP Point violation */
+	THERMAL_TRIP_CHANGED, /* TRIP Point temperature changed */
+	THERMAL_DEVICE_DOWN, /* Thermal device is down */
+	THERMAL_DEVICE_UP, /* Thermal device is up after a down event */
+	THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */
+};
+
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
 	int (*unbind) (struct thermal_zone_device *,
 		       struct thermal_cooling_device *);
 	int (*get_temp) (struct thermal_zone_device *, int *);
+	int (*set_trips) (struct thermal_zone_device *, int, int);
 	int (*get_mode) (struct thermal_zone_device *,
 			 enum thermal_device_mode *);
 	int (*set_mode) (struct thermal_zone_device *,
@@ -168,6 +180,10 @@
  * @last_temperature:	previous temperature read
  * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
  * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
+ * @prev_low_trip:	the low current temperature if you've crossed a passive
+ *			trip point.
+ * @prev_high_trip:	the above current temperature if you've crossed a
+ *			passive trip point.
  * @forced_passive:	If > 0, temperature at which to switch on all ACPI
  *			processor cooling devices.  Currently only used by the
  *			step-wise governor.
@@ -182,6 +198,7 @@
  * @lock:	lock to protect thermal_instances list
  * @node:	node in thermal_tz_list (in thermal_core.c)
  * @poll_queue:	delayed work for polling
+ * @notify_event: Last notification event
  */
 struct thermal_zone_device {
 	int id;
@@ -199,6 +216,8 @@
 	int last_temperature;
 	int emul_temperature;
 	int passive;
+	int prev_low_trip;
+	int prev_high_trip;
 	unsigned int forced_passive;
 	atomic_t need_update;
 	struct thermal_zone_device_ops *ops;
@@ -210,6 +229,7 @@
 	struct mutex lock;
 	struct list_head node;
 	struct delayed_work poll_queue;
+	enum thermal_notify_event notify_event;
 };
 
 /**
@@ -333,6 +353,9 @@
  *
  * Optional:
  * @get_trend: a pointer to a function that reads the sensor temperature trend.
+ * @set_trips: a pointer to a function that sets a temperature window. When
+ *	       this window is left the driver must inform the thermal core via
+ *	       thermal_zone_device_update.
  * @set_emul_temp: a pointer to a function that sets sensor emulated
  *		   temperature.
  * @set_trip_temp: a pointer to a function that sets the trip temperature on
@@ -340,7 +363,8 @@
  */
 struct thermal_zone_of_device_ops {
 	int (*get_temp)(void *, int *);
-	int (*get_trend)(void *, long *);
+	int (*get_trend)(void *, int, enum thermal_trend *);
+	int (*set_trips)(void *, int, int);
 	int (*set_emul_temp)(void *, int);
 	int (*set_trip_temp)(void *, int, int);
 };
@@ -425,7 +449,9 @@
 				     unsigned int);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
-void thermal_zone_device_update(struct thermal_zone_device *);
+void thermal_zone_device_update(struct thermal_zone_device *,
+				enum thermal_notify_event);
+void thermal_zone_set_trips(struct thermal_zone_device *);
 
 struct thermal_cooling_device *thermal_cooling_device_register(char *, void *,
 		const struct thermal_cooling_device_ops *);
@@ -435,6 +461,8 @@
 void thermal_cooling_device_unregister(struct thermal_cooling_device *);
 struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name);
 int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
+int thermal_zone_get_slope(struct thermal_zone_device *tz);
+int thermal_zone_get_offset(struct thermal_zone_device *tz);
 
 int get_tz_trend(struct thermal_zone_device *, int);
 struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
@@ -473,7 +501,10 @@
 	struct thermal_zone_device *tz, int trip,
 	struct thermal_cooling_device *cdev)
 { return -ENODEV; }
-static inline void thermal_zone_device_update(struct thermal_zone_device *tz)
+static inline void thermal_zone_device_update(struct thermal_zone_device *tz,
+					      enum thermal_notify_event event)
+{ }
+static inline void thermal_zone_set_trips(struct thermal_zone_device *tz)
 { }
 static inline struct thermal_cooling_device *
 thermal_cooling_device_register(char *type, void *devdata,
@@ -492,6 +523,12 @@
 static inline int thermal_zone_get_temp(
 		struct thermal_zone_device *tz, int *temp)
 { return -ENODEV; }
+static inline int thermal_zone_get_slope(
+		struct thermal_zone_device *tz)
+{ return -ENODEV; }
+static inline int thermal_zone_get_offset(
+		struct thermal_zone_device *tz)
+{ return -ENODEV; }
 static inline int get_tz_trend(struct thermal_zone_device *tz, int trip)
 { return -ENODEV; }
 static inline struct thermal_instance *
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 7047bc7..35a4d81 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -19,6 +19,7 @@
 struct watchdog_ops;
 struct watchdog_device;
 struct watchdog_core_data;
+struct watchdog_governor;
 
 /** struct watchdog_ops - The watchdog-devices operations
  *
@@ -28,6 +29,7 @@
  * @ping:	The routine that sends a keepalive ping to the watchdog device.
  * @status:	The routine that shows the status of the watchdog device.
  * @set_timeout:The routine for setting the watchdog devices timeout value (in seconds).
+ * @set_pretimeout:The routine for setting the watchdog devices pretimeout.
  * @get_timeleft:The routine that gets the time left before a reset (in seconds).
  * @restart:	The routine for restarting the machine.
  * @ioctl:	The routines that handles extra ioctl calls.
@@ -46,6 +48,7 @@
 	int (*ping)(struct watchdog_device *);
 	unsigned int (*status)(struct watchdog_device *);
 	int (*set_timeout)(struct watchdog_device *, unsigned int);
+	int (*set_pretimeout)(struct watchdog_device *, unsigned int);
 	unsigned int (*get_timeleft)(struct watchdog_device *);
 	int (*restart)(struct watchdog_device *, unsigned long, void *);
 	long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long);
@@ -59,8 +62,10 @@
  *		watchdog device.
  * @info:	Pointer to a watchdog_info structure.
  * @ops:	Pointer to the list of watchdog operations.
+ * @gov:	Pointer to watchdog pretimeout governor.
  * @bootstatus:	Status of the watchdog device at boot.
  * @timeout:	The watchdog devices timeout value (in seconds).
+ * @pretimeout: The watchdog devices pretimeout value.
  * @min_timeout:The watchdog devices minimum timeout value (in seconds).
  * @max_timeout:The watchdog devices maximum timeout value (in seconds)
  *		as configurable from user space. Only relevant if
@@ -94,8 +99,10 @@
 	const struct attribute_group **groups;
 	const struct watchdog_info *info;
 	const struct watchdog_ops *ops;
+	const struct watchdog_governor *gov;
 	unsigned int bootstatus;
 	unsigned int timeout;
+	unsigned int pretimeout;
 	unsigned int min_timeout;
 	unsigned int max_timeout;
 	unsigned int min_hw_heartbeat_ms;
@@ -163,6 +170,13 @@
 		 t > wdd->max_timeout);
 }
 
+/* Use the following function to check if a pretimeout value is invalid */
+static inline bool watchdog_pretimeout_invalid(struct watchdog_device *wdd,
+					       unsigned int t)
+{
+	return t && wdd->timeout && t >= wdd->timeout;
+}
+
 /* Use the following functions to manipulate watchdog driver specific data */
 static inline void watchdog_set_drvdata(struct watchdog_device *wdd, void *data)
 {
@@ -174,6 +188,16 @@
 	return wdd->driver_data;
 }
 
+/* Use the following functions to report watchdog pretimeout event */
+#if IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_GOV)
+void watchdog_notify_pretimeout(struct watchdog_device *wdd);
+#else
+static inline void watchdog_notify_pretimeout(struct watchdog_device *wdd)
+{
+	pr_alert("watchdog%d: pretimeout event\n", wdd->id);
+}
+#endif
+
 /* drivers/watchdog/watchdog_core.c */
 void watchdog_set_restart_priority(struct watchdog_device *wdd, int priority);
 extern int watchdog_init_timeout(struct watchdog_device *wdd,
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 6360c25..f32f7ef 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -37,18 +37,6 @@
 #ifndef __long_aligned
 #define __long_aligned __attribute__((aligned((sizeof(long)))))
 #endif
-/*
- * Less bad way to call ioctl from within the kernel; this needs to be
- * done some other way to get the call out of interrupt context.
- * Needs "ioctl" variable to be supplied by calling context.
- */
-#define IOCTL(dev, arg, cmd) ({		\
-	int res = 0;			\
-	mm_segment_t fs = get_fs();	\
-	set_fs(get_ds());		\
-	res = ioctl(dev, arg, cmd);	\
-	set_fs(fs);			\
-	res; })
 
 #define BOND_MODE(bond) ((bond)->params.mode)
 
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 3e445a7..b075f60 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -58,4 +58,22 @@
  */
 #define FALLOC_FL_INSERT_RANGE		0x20
 
+/*
+ * FALLOC_FL_UNSHARE_RANGE is used to unshare shared blocks within the
+ * file size without overwriting any existing data. The purpose of this
+ * call is to preemptively reallocate any blocks that are subject to
+ * copy-on-write.
+ *
+ * Different filesystems may implement different limitations on the
+ * granularity of the operation. Most will limit operations to filesystem
+ * block size boundaries, but this boundary may be larger or smaller
+ * depending on the filesystem and/or the configuration of the filesystem
+ * or file.
+ *
+ * This flag can only be used with allocate-mode fallocate, which is
+ * to say that it cannot be used with the punch, zero, collapse, or
+ * insert range modes.
+ */
+#define FALLOC_FL_UNSHARE_RANGE		0x40
+
 #endif /* _UAPI_FALLOC_H_ */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 2473272..acb2b61 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -158,7 +158,8 @@
 	__u32		fsx_extsize;	/* extsize field value (get/set)*/
 	__u32		fsx_nextents;	/* nextents field value (get)	*/
 	__u32		fsx_projid;	/* project identifier (get/set) */
-	unsigned char	fsx_pad[12];
+	__u32		fsx_cowextsize;	/* CoW extsize field value (get/set)*/
+	unsigned char	fsx_pad[8];
 };
 
 /*
@@ -179,6 +180,7 @@
 #define FS_XFLAG_NODEFRAG	0x00002000	/* do not defragment */
 #define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
 #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
+#define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is
diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h
index 2b871e0..4ae6279 100644
--- a/include/uapi/linux/nfs4.h
+++ b/include/uapi/linux/nfs4.h
@@ -39,8 +39,9 @@
 #define NFS4_FH_VOL_MIGRATION		0x0004
 #define NFS4_FH_VOL_RENAME		0x0008
 
-#define NFS4_OPEN_RESULT_CONFIRM 0x0002
-#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004
+#define NFS4_OPEN_RESULT_CONFIRM		0x0002
+#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX		0x0004
+#define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK	0x0020
 
 #define NFS4_SHARE_ACCESS_MASK	0x000F
 #define NFS4_SHARE_ACCESS_READ	0x0001
diff --git a/include/video/exynos_mipi_dsim.h b/include/video/exynos_mipi_dsim.h
deleted file mode 100644
index 6a578f8..0000000
--- a/include/video/exynos_mipi_dsim.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/* include/video/exynos_mipi_dsim.h
- *
- * Platform data header for Samsung SoC MIPI-DSIM.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSIM_H
-#define _EXYNOS_MIPI_DSIM_H
-
-#include <linux/device.h>
-#include <linux/fb.h>
-
-#define PANEL_NAME_SIZE		(32)
-
-/*
- * Enumerate display interface type.
- *
- * DSIM_COMMAND means cpu interface and rgb interface for DSIM_VIDEO.
- *
- * P.S. MIPI DSI Master has two display controller intefaces, RGB Interface
- *	for main display and CPU Interface(same as I80 Interface) for main
- *	and sub display.
- */
-enum mipi_dsim_interface_type {
-	DSIM_COMMAND,
-	DSIM_VIDEO
-};
-
-enum mipi_dsim_virtual_ch_no {
-	DSIM_VIRTUAL_CH_0,
-	DSIM_VIRTUAL_CH_1,
-	DSIM_VIRTUAL_CH_2,
-	DSIM_VIRTUAL_CH_3
-};
-
-enum mipi_dsim_burst_mode_type {
-	DSIM_NON_BURST_SYNC_EVENT,
-	DSIM_BURST_SYNC_EVENT,
-	DSIM_NON_BURST_SYNC_PULSE,
-	DSIM_BURST,
-	DSIM_NON_VIDEO_MODE
-};
-
-enum mipi_dsim_no_of_data_lane {
-	DSIM_DATA_LANE_1,
-	DSIM_DATA_LANE_2,
-	DSIM_DATA_LANE_3,
-	DSIM_DATA_LANE_4
-};
-
-enum mipi_dsim_byte_clk_src {
-	DSIM_PLL_OUT_DIV8,
-	DSIM_EXT_CLK_DIV8,
-	DSIM_EXT_CLK_BYPASS
-};
-
-enum mipi_dsim_pixel_format {
-	DSIM_CMD_3BPP,
-	DSIM_CMD_8BPP,
-	DSIM_CMD_12BPP,
-	DSIM_CMD_16BPP,
-	DSIM_VID_16BPP_565,
-	DSIM_VID_18BPP_666PACKED,
-	DSIM_18BPP_666LOOSELYPACKED,
-	DSIM_24BPP_888
-};
-
-/*
- * struct mipi_dsim_config - interface for configuring mipi-dsi controller.
- *
- * @auto_flush: enable or disable Auto flush of MD FIFO using VSYNC pulse.
- * @eot_disable: enable or disable EoT packet in HS mode.
- * @auto_vertical_cnt: specifies auto vertical count mode.
- *	in Video mode, the vertical line transition uses line counter
- *	configured by VSA, VBP, and Vertical resolution.
- *	If this bit is set to '1', the line counter does not use VSA and VBP
- *	registers.(in command mode, this variable is ignored)
- * @hse: set horizontal sync event mode.
- *	In VSYNC pulse and Vporch area, MIPI DSI master transfers only HSYNC
- *	start packet to MIPI DSI slave at MIPI DSI spec1.1r02.
- *	this bit transfers HSYNC end packet in VSYNC pulse and Vporch area
- *	(in mommand mode, this variable is ignored)
- * @hfp: specifies HFP disable mode.
- *	if this variable is set, DSI master ignores HFP area in VIDEO mode.
- *	(in command mode, this variable is ignored)
- * @hbp: specifies HBP disable mode.
- *	if this variable is set, DSI master ignores HBP area in VIDEO mode.
- *	(in command mode, this variable is ignored)
- * @hsa: specifies HSA disable mode.
- *	if this variable is set, DSI master ignores HSA area in VIDEO mode.
- *	(in command mode, this variable is ignored)
- * @cma_allow: specifies the number of horizontal lines, where command packet
- *	transmission is allowed after Stable VFP period.
- * @e_interface: specifies interface to be used.(CPU or RGB interface)
- * @e_virtual_ch: specifies virtual channel number that main or
- *	sub diaplsy uses.
- * @e_pixel_format: specifies pixel stream format for main or sub display.
- * @e_burst_mode: selects Burst mode in Video mode.
- *	in Non-burst mode, RGB data area is filled with RGB data and NULL
- *	packets, according to input bandwidth of RGB interface.
- *	In Burst mode, RGB data area is filled with RGB data only.
- * @e_no_data_lane: specifies data lane count to be used by Master.
- * @e_byte_clk: select byte clock source. (it must be DSIM_PLL_OUT_DIV8)
- *	DSIM_EXT_CLK_DIV8 and DSIM_EXT_CLK_BYPASSS are not supported.
- * @pll_stable_time: specifies the PLL Timer for stability of the ganerated
- *	clock(System clock cycle base)
- *	if the timer value goes to 0x00000000, the clock stable bit of status
- *	and interrupt register is set.
- * @esc_clk: specifies escape clock frequency for getting the escape clock
- *	prescaler value.
- * @stop_holding_cnt: specifies the interval value between transmitting
- *	read packet(or write "set_tear_on" command) and BTA request.
- *	after transmitting read packet or write "set_tear_on" command,
- *	BTA requests to D-PHY automatically. this counter value specifies
- *	the interval between them.
- * @bta_timeout: specifies the timer for BTA.
- *	this register specifies time out from BTA request to change
- *	the direction with respect to Tx escape clock.
- * @rx_timeout: specifies the timer for LP Rx mode timeout.
- *	this register specifies time out on how long RxValid deasserts,
- *	after RxLpdt asserts with respect to Tx escape clock.
- *	- RxValid specifies Rx data valid indicator.
- *	- RxLpdt specifies an indicator that D-PHY is under RxLpdt mode.
- *	- RxValid and RxLpdt specifies signal from D-PHY.
- */
-struct mipi_dsim_config {
-	unsigned char			auto_flush;
-	unsigned char			eot_disable;
-
-	unsigned char			auto_vertical_cnt;
-	unsigned char			hse;
-	unsigned char			hfp;
-	unsigned char			hbp;
-	unsigned char			hsa;
-	unsigned char			cmd_allow;
-
-	enum mipi_dsim_interface_type	e_interface;
-	enum mipi_dsim_virtual_ch_no	e_virtual_ch;
-	enum mipi_dsim_pixel_format	e_pixel_format;
-	enum mipi_dsim_burst_mode_type	e_burst_mode;
-	enum mipi_dsim_no_of_data_lane	e_no_data_lane;
-	enum mipi_dsim_byte_clk_src	e_byte_clk;
-
-	/*
-	 * ===========================================
-	 * |    P    |    M    |    S    |    MHz    |
-	 * -------------------------------------------
-	 * |    3    |   100   |    3    |    100    |
-	 * |    3    |   100   |    2    |    200    |
-	 * |    3    |    63   |    1    |    252    |
-	 * |    4    |   100   |    1    |    300    |
-	 * |    4    |   110   |    1    |    330    |
-	 * |   12    |   350   |    1    |    350    |
-	 * |    3    |   100   |    1    |    400    |
-	 * |    4    |   150   |    1    |    450    |
-	 * |    6    |   118   |    1    |    472    |
-	 * |	3    |   120   |    1    |    480    |
-	 * |   12    |   250   |    0    |    500    |
-	 * |    4    |   100   |    0    |    600    |
-	 * |    3    |    81   |    0    |    648    |
-	 * |    3    |    88   |    0    |    704    |
-	 * |    3    |    90   |    0    |    720    |
-	 * |    3    |   100   |    0    |    800    |
-	 * |   12    |   425   |    0    |    850    |
-	 * |    4    |   150   |    0    |    900    |
-	 * |   12    |   475   |    0    |    950    |
-	 * |    6    |   250   |    0    |   1000    |
-	 * -------------------------------------------
-	 */
-
-	/*
-	 * pms could be calculated as the following.
-	 * M * 24 / P * 2 ^ S = MHz
-	 */
-	unsigned char			p;
-	unsigned short			m;
-	unsigned char			s;
-
-	unsigned int			pll_stable_time;
-	unsigned long			esc_clk;
-
-	unsigned short			stop_holding_cnt;
-	unsigned char			bta_timeout;
-	unsigned short			rx_timeout;
-};
-
-/*
- * struct mipi_dsim_device - global interface for mipi-dsi driver.
- *
- * @dev: driver model representation of the device.
- * @id: unique device id.
- * @clock: pointer to MIPI-DSI clock of clock framework.
- * @irq: interrupt number to MIPI-DSI controller.
- * @reg_base: base address to memory mapped SRF of MIPI-DSI controller.
- *	(virtual address)
- * @lock: the mutex protecting this data structure.
- * @dsim_info: infomation for configuring mipi-dsi controller.
- * @master_ops: callbacks to mipi-dsi operations.
- * @dsim_lcd_dev: pointer to activated ddi device.
- *	(it would be registered by mipi-dsi driver.)
- * @dsim_lcd_drv: pointer to activated_ddi driver.
- *	(it would be registered by mipi-dsi driver.)
- * @lcd_info: pointer to mipi_lcd_info structure.
- * @state: specifies status of MIPI-DSI controller.
- *	the status could be RESET, INIT, STOP, HSCLKEN and ULPS.
- * @data_lane: specifiec enabled data lane number.
- *	this variable would be set by driver according to e_no_data_lane
- *	automatically.
- * @e_clk_src: select byte clock source.
- * @pd: pointer to MIPI-DSI driver platform data.
- * @phy: pointer to the MIPI-DSI PHY
- */
-struct mipi_dsim_device {
-	struct device			*dev;
-	int				id;
-	struct clk			*clock;
-	unsigned int			irq;
-	void __iomem			*reg_base;
-	struct mutex			lock;
-
-	struct mipi_dsim_config		*dsim_config;
-	struct mipi_dsim_master_ops	*master_ops;
-	struct mipi_dsim_lcd_device	*dsim_lcd_dev;
-	struct mipi_dsim_lcd_driver	*dsim_lcd_drv;
-
-	unsigned int			state;
-	unsigned int			data_lane;
-	unsigned int			e_clk_src;
-	bool				suspended;
-
-	struct mipi_dsim_platform_data	*pd;
-	struct phy			*phy;
-};
-
-/*
- * struct mipi_dsim_platform_data - interface to platform data
- *	for mipi-dsi driver.
- *
- * @lcd_panel_name: specifies lcd panel name registered to mipi-dsi driver.
- *	lcd panel driver searched would be actived.
- * @dsim_config: pointer of structure for configuring mipi-dsi controller.
- * @enabled: indicate whether mipi controller got enabled or not.
- * @lcd_panel_info: pointer for lcd panel specific structure.
- *	this structure specifies width, height, timing and polarity and so on.
- */
-struct mipi_dsim_platform_data {
-	char				lcd_panel_name[PANEL_NAME_SIZE];
-
-	struct mipi_dsim_config		*dsim_config;
-	unsigned int			enabled;
-	void				*lcd_panel_info;
-};
-
-/*
- * struct mipi_dsim_master_ops - callbacks to mipi-dsi operations.
- *
- * @cmd_write: transfer command to lcd panel at LP mode.
- * @cmd_read: read command from rx register.
- * @get_dsim_frame_done: get the status that all screen data have been
- *	transferred to mipi-dsi.
- * @clear_dsim_frame_done: clear frame done status.
- * @get_fb_frame_done: get frame done status of display controller.
- * @trigger: trigger display controller.
- *	- this one would be used only in case of CPU mode.
- *  @set_early_blank_mode: set framebuffer blank mode.
- *	- this callback should be called prior to fb_blank() by a client driver
- *	only if needing.
- *  @set_blank_mode: set framebuffer blank mode.
- *	- this callback should be called after fb_blank() by a client driver
- *	only if needing.
- */
-
-struct mipi_dsim_master_ops {
-	int (*cmd_write)(struct mipi_dsim_device *dsim, unsigned int data_id,
-		const unsigned char *data0, unsigned int data1);
-	int (*cmd_read)(struct mipi_dsim_device *dsim, unsigned int data_id,
-		unsigned int data0, unsigned int req_size, u8 *rx_buf);
-	int (*get_dsim_frame_done)(struct mipi_dsim_device *dsim);
-	int (*clear_dsim_frame_done)(struct mipi_dsim_device *dsim);
-
-	int (*get_fb_frame_done)(struct fb_info *info);
-	void (*trigger)(struct fb_info *info);
-	int (*set_early_blank_mode)(struct mipi_dsim_device *dsim, int power);
-	int (*set_blank_mode)(struct mipi_dsim_device *dsim, int power);
-};
-
-/*
- * device structure for mipi-dsi based lcd panel.
- *
- * @name: name of the device to use with this device, or an
- *	alias for that name.
- * @dev: driver model representation of the device.
- * @id: id of device to be registered.
- * @bus_id: bus id for identifing connected bus
- *	and this bus id should be same as id of mipi_dsim_device.
- * @irq: irq number for signaling when framebuffer transfer of
- *	lcd panel module is completed.
- *	this irq would be used only for MIPI-DSI based CPU mode lcd panel.
- * @master: pointer to mipi-dsi master device object.
- * @platform_data: lcd panel specific platform data.
- */
-struct mipi_dsim_lcd_device {
-	char			*name;
-	struct device		dev;
-	int			id;
-	int			bus_id;
-	int			irq;
-	int			panel_reverse;
-
-	struct mipi_dsim_device *master;
-	void			*platform_data;
-};
-
-/*
- * driver structure for mipi-dsi based lcd panel.
- *
- * this structure should be registered by lcd panel driver.
- * mipi-dsi driver seeks lcd panel registered through name field
- * and calls these callback functions in appropriate time.
- *
- * @name: name of the driver to use with this device, or an
- *	alias for that name.
- * @id: id of driver to be registered.
- *	this id would be used for finding device object registered.
- */
-struct mipi_dsim_lcd_driver {
-	char			*name;
-	int			id;
-
-	void	(*power_on)(struct mipi_dsim_lcd_device *dsim_dev, int enable);
-	void	(*set_sequence)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*probe)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*remove)(struct mipi_dsim_lcd_device *dsim_dev);
-	void	(*shutdown)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*suspend)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*resume)(struct mipi_dsim_lcd_device *dsim_dev);
-};
-
-/*
- * register mipi_dsim_lcd_device to mipi-dsi master.
- */
-int exynos_mipi_dsi_register_lcd_device(struct mipi_dsim_lcd_device
-						*lcd_dev);
-/**
- * register mipi_dsim_lcd_driver object defined by lcd panel driver
- * to mipi-dsi driver.
- */
-int exynos_mipi_dsi_register_lcd_driver(struct mipi_dsim_lcd_driver
-						*lcd_drv);
-#endif /* _EXYNOS_MIPI_DSIM_H */
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 992ab9d..e579808 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,8 +1,4 @@
 
-# We are fully aware of the dangers of __builtin_return_address()
-FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
-KBUILD_CFLAGS += $(FRAME_CFLAGS)
-
 # Do not instrument the tracer itself:
 
 ifdef CONFIG_FUNCTION_TRACER
diff --git a/mm/Makefile b/mm/Makefile
index 2ca1faf..295bd7a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,9 +21,6 @@
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
 
-# Since __builtin_frame_address does work as used, disable the warning.
-CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
-
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a7e42f9..2bff63a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -551,7 +551,7 @@
 			*entry, *new;
 	unsigned int nr;
 
-	nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+	nr = auth->au_ops->hash_cred(acred, cache->hashbits);
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 83dffea..f1df983 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -78,6 +78,14 @@
 	return auth->au_ops->lookup_cred(auth, acred, lookupflags);
 }
 
+static int
+generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+	return hash_64(from_kgid(&init_user_ns, acred->gid) |
+		((u64)from_kuid(&init_user_ns, acred->uid) <<
+			(sizeof(gid_t) * 8)), hashbits);
+}
+
 /*
  * Lookup generic creds for current process
  */
@@ -258,6 +266,7 @@
 static const struct rpc_authops generic_auth_ops = {
 	.owner = THIS_MODULE,
 	.au_name = "Generic",
+	.hash_cred = generic_hash_cred,
 	.lookup_cred = generic_lookup_cred,
 	.crcreate = generic_create_cred,
 	.key_timeout = generic_key_timeout,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 976c781..d8bd97a 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1298,6 +1298,12 @@
 	gss_destroy_nullcred(cred);
 }
 
+static int
+gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+	return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
+}
+
 /*
  * Lookup RPCSEC_GSS cred for the current process
  */
@@ -1982,6 +1988,7 @@
 	.au_name	= "RPCSEC_GSS",
 	.create		= gss_create,
 	.destroy	= gss_destroy,
+	.hash_cred	= gss_hash_cred,
 	.lookup_cred	= gss_lookup_cred,
 	.crcreate	= gss_create_cred,
 	.list_pseudoflavors = gss_mech_list_pseudoflavors,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index a1d768a9..306fc0f 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -46,6 +46,14 @@
 	rpcauth_clear_credcache(auth->au_credcache);
 }
 
+static int
+unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+	return hash_64(from_kgid(&init_user_ns, acred->gid) |
+		((u64)from_kuid(&init_user_ns, acred->uid) <<
+			(sizeof(gid_t) * 8)), hashbits);
+}
+
 /*
  * Lookup AUTH_UNIX creds for current process
  */
@@ -220,6 +228,7 @@
 	.au_name	= "UNIX",
 	.create		= unx_create,
 	.destroy	= unx_destroy,
+	.hash_cred	= unx_hash_cred,
 	.lookup_cred	= unx_lookup_cred,
 	.crcreate	= unx_create_cred,
 };
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 229956b..ac701c2 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -76,13 +76,7 @@
 	page = alloc_page(gfp_flags);
 	if (page == NULL)
 		return -ENOMEM;
-	buf->head[0].iov_base = page_address(page);
-	buf->head[0].iov_len = PAGE_SIZE;
-	buf->tail[0].iov_base = NULL;
-	buf->tail[0].iov_len = 0;
-	buf->page_len = 0;
-	buf->len = 0;
-	buf->buflen = PAGE_SIZE;
+	xdr_buf_init(buf, page_address(page), PAGE_SIZE);
 	return 0;
 }
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4d8e11f..8aabe12 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -353,7 +353,7 @@
 	spin_unlock(&cache_list_lock);
 
 	/* start the cleaning process */
-	schedule_delayed_work(&cache_cleaner, 0);
+	queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0);
 }
 EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
 
@@ -476,7 +476,8 @@
 		delay = 0;
 
 	if (delay)
-		schedule_delayed_work(&cache_cleaner, delay);
+		queue_delayed_work(system_power_efficient_wq,
+				   &cache_cleaner, delay);
 }
 
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 66f23b3..34dd7b2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -184,7 +184,6 @@
 				   struct super_block *sb)
 {
 	struct dentry *dentry;
-	int err = 0;
 
 	switch (event) {
 	case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@
 		printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
 		return -ENOTSUPP;
 	}
-	return err;
+	return 0;
 }
 
 static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
@@ -988,7 +987,6 @@
 {
 
 	if (clnt != NULL) {
-		rpc_task_release_client(task);
 		if (task->tk_xprt == NULL)
 			task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
 		task->tk_client = clnt;
@@ -1693,6 +1691,7 @@
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+	int status;
 
 	dprint_status(task);
 
@@ -1718,11 +1717,14 @@
 	req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
 	req->rq_rcvsize <<= 2;
 
-	req->rq_buffer = xprt->ops->buf_alloc(task,
-					req->rq_callsize + req->rq_rcvsize);
-	if (req->rq_buffer != NULL)
-		return;
+	status = xprt->ops->buf_alloc(task);
 	xprt_inject_disconnect(xprt);
+	if (status == 0)
+		return;
+	if (status != -ENOMEM) {
+		rpc_exit(task, status);
+		return;
+	}
 
 	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
@@ -1748,18 +1750,6 @@
 	task->tk_rqstp->rq_bytes_sent = 0;
 }
 
-static inline void
-rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
-{
-	buf->head[0].iov_base = start;
-	buf->head[0].iov_len = len;
-	buf->tail[0].iov_len = 0;
-	buf->page_len = 0;
-	buf->flags = 0;
-	buf->len = 0;
-	buf->buflen = len;
-}
-
 /*
  * 3.	Encode arguments of an RPC call
  */
@@ -1772,12 +1762,12 @@
 
 	dprint_status(task);
 
-	rpc_xdr_buf_init(&req->rq_snd_buf,
-			 req->rq_buffer,
-			 req->rq_callsize);
-	rpc_xdr_buf_init(&req->rq_rcv_buf,
-			 (char *)req->rq_buffer + req->rq_callsize,
-			 req->rq_rcvsize);
+	xdr_buf_init(&req->rq_snd_buf,
+		     req->rq_buffer,
+		     req->rq_callsize);
+	xdr_buf_init(&req->rq_rcv_buf,
+		     req->rq_rbuffer,
+		     req->rq_rcvsize);
 
 	p = rpc_encode_header(task);
 	if (p == NULL) {
@@ -2616,6 +2606,70 @@
 EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
 
 /**
+ * rpc_clnt_setup_test_and_add_xprt()
+ *
+ * This is an rpc_clnt_add_xprt setup() function which returns 1 so:
+ *   1) caller of the test function must dereference the rpc_xprt_switch
+ *   and the rpc_xprt.
+ *   2) test function must call rpc_xprt_switch_add_xprt, usually in
+ *   the rpc_call_done routine.
+ *
+ * Upon success (return of 1), the test function adds the new
+ * transport to the rpc_clnt xprt switch
+ *
+ * @clnt: struct rpc_clnt to get the new transport
+ * @xps:  the rpc_xprt_switch to hold the new transport
+ * @xprt: the rpc_xprt to test
+ * @data: a struct rpc_add_xprt_test pointer that holds the test function
+ *        and test function call data
+ */
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
+				     struct rpc_xprt_switch *xps,
+				     struct rpc_xprt *xprt, void *data)
+{
+	struct rpc_cred *cred;
+	struct rpc_task *task;
+	struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
+	int status = -EADDRINUSE;
+
+	xprt = xprt_get(xprt);
+	xprt_switch_get(xps);
+
+	if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+		goto out_err;
+
+	/* Test the connection */
+	cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	task = rpc_call_null_helper(clnt, xprt, cred,
+				    RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+				    NULL, NULL);
+	put_rpccred(cred);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		goto out_err;
+	}
+	status = task->tk_status;
+	rpc_put_task(task);
+
+	if (status < 0)
+		goto out_err;
+
+	/* rpc_xprt_switch and rpc_xprt are dereferenced by add_xprt_test() */
+	xtest->add_xprt_test(clnt, xprt, xtest->data);
+
+	/* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
+	return 1;
+out_err:
+	/* Log while our xprt reference is still held, then drop both refs */
+	pr_info("RPC:   rpc_clnt_test_xprt failed: %d addr %s not added\n",
+		status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	return status;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
+
+/**
  * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
  * @clnt: pointer to struct rpc_clnt
  * @xprtargs: pointer to struct xprt_create
@@ -2697,6 +2751,57 @@
 }
 EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
 
+/**
+ * rpc_clnt_xprt_switch_put - call xprt_switch_put() on a client's xprt switch
+ * @clnt: pointer to struct rpc_clnt
+ *
+ * The switch pointer is read with rcu_dereference(), so the access is
+ * wrapped in an RCU read-side critical section.
+ */
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
+{
+	rcu_read_lock();
+	xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
+
+/**
+ * rpc_clnt_xprt_switch_add_xprt - add a transport to a client's xprt switch
+ * @clnt: pointer to struct rpc_clnt
+ * @xprt: pointer to struct rpc_xprt to hand to rpc_xprt_switch_add_xprt()
+ */
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	rcu_read_lock();
+	rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+				 xprt);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
+
+/**
+ * rpc_clnt_xprt_switch_has_addr - look up an address in the xprt switch
+ * @clnt: pointer to struct rpc_clnt
+ * @sap: socket address to look for
+ *
+ * Returns the result of rpc_xprt_switch_has_addr() for the xprt switch
+ * currently attached to @clnt.
+ */
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+				   const struct sockaddr *sap)
+{
+	struct rpc_xprt_switch *xps;
+	bool ret;
+
+	rcu_read_lock();
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+	ret = rpc_xprt_switch_has_addr(xps, sap);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static void rpc_show_header(void)
 {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9ae5885..5db68b3 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -849,14 +849,17 @@
 }
 
 /**
- * rpc_malloc - allocate an RPC buffer
- * @task: RPC task that will use this buffer
- * @size: requested byte size
+ * rpc_malloc - allocate RPC buffer resources
+ * @task: RPC task
+ *
+ * A single memory region is allocated, which is split between the
+ * RPC call and RPC reply that this task is being used for. When
+ * this RPC is retired, the memory is released by calling rpc_free.
  *
  * To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL and suppressing warning if the request cannot be serviced
- * immediately.
- * The caller can arrange to sleep in a way that is safe for rpciod.
+ * returning -ENOMEM and suppressing warning if the request cannot
+ * be serviced immediately. The caller can arrange to sleep in a
+ * way that is safe for rpciod.
  *
  * Most requests are 'small' (under 2KiB) and can be serviced from a
  * mempool, ensuring that NFS reads and writes can always proceed,
@@ -865,8 +868,10 @@
  * In order to avoid memory starvation triggering more writebacks of
  * NFS requests, we avoid using GFP_KERNEL.
  */
-void *rpc_malloc(struct rpc_task *task, size_t size)
+int rpc_malloc(struct rpc_task *task)
 {
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
 	struct rpc_buffer *buf;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
 
@@ -880,28 +885,28 @@
 		buf = kmalloc(size, gfp);
 
 	if (!buf)
-		return NULL;
+		return -ENOMEM;
 
 	buf->len = size;
 	dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
 			task->tk_pid, size, buf);
-	return &buf->data;
+	rqst->rq_buffer = buf->data;
+	rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(rpc_malloc);
 
 /**
- * rpc_free - free buffer allocated via rpc_malloc
- * @buffer: buffer to free
+ * rpc_free - free RPC buffer resources allocated via rpc_malloc
+ * @task: RPC task
  *
  */
-void rpc_free(void *buffer)
+void rpc_free(struct rpc_task *task)
 {
+	void *buffer = task->tk_rqstp->rq_buffer;
 	size_t size;
 	struct rpc_buffer *buf;
 
-	if (!buffer)
-		return;
-
 	buf = container_of(buffer, struct rpc_buffer, data);
 	size = buf->len;
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4..7c8070e 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -401,6 +401,21 @@
 }
 EXPORT_SYMBOL_GPL(svc_bind);
 
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+	INIT_LIST_HEAD(&serv->sv_cb_list);
+	spin_lock_init(&serv->sv_cb_lock);
+	init_waitqueue_head(&serv->sv_cb_waitq);
+}
+#else
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+}
+#endif
+
 /*
  * Create an RPC service
  */
@@ -443,6 +458,8 @@
 	init_timer(&serv->sv_temptimer);
 	spin_lock_init(&serv->sv_lock);
 
+	__svc_init_bc(serv);
+
 	serv->sv_nrpools = npools;
 	serv->sv_pools =
 		kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c4f3cc0..7f1071e 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -767,7 +767,7 @@
 	newbase -= xdr->buf->page_base;
 
 	if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
-		xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+		xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
 }
 
 static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -776,7 +776,7 @@
 		xdr_set_next_page(xdr);
 	else if (xdr->iov == xdr->buf->head) {
 		if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
-			xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+			xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
 	}
 	return xdr->p != xdr->end;
 }
@@ -859,12 +859,15 @@
 static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
 {
 	__be32 *p;
-	void *cpdest = xdr->scratch.iov_base;
+	char *cpdest = xdr->scratch.iov_base;
 	size_t cplen = (char *)xdr->end - (char *)xdr->p;
 
 	if (nbytes > xdr->scratch.iov_len)
 		return NULL;
-	memcpy(cpdest, xdr->p, cplen);
+	p = __xdr_inline_decode(xdr, cplen);
+	if (p == NULL)
+		return NULL;
+	memcpy(cpdest, p, cplen);
 	cpdest += cplen;
 	nbytes -= cplen;
 	if (!xdr_set_next_buffer(xdr))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ea244b2..685e6d2 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1295,7 +1295,7 @@
 	xprt_schedule_autodisconnect(xprt);
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
-		xprt->ops->buf_free(req->rq_buffer);
+		xprt->ops->buf_free(task);
 	xprt_inject_disconnect(xprt);
 	if (req->rq_cred != NULL)
 		put_rpccred(req->rq_cred);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 66c9d63..ae92a9e 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -15,6 +15,7 @@
 #include <asm/cmpxchg.h>
 #include <linux/spinlock.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/xprtmultipath.h>
 
 typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
@@ -49,7 +50,8 @@
 	if (xprt == NULL)
 		return;
 	spin_lock(&xps->xps_lock);
-	if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
+	if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
+	    !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
 		xprt_switch_add_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
 }
@@ -232,6 +234,26 @@
 	return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
 }
 
+bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+			      const struct sockaddr *sap)
+{
+	struct list_head *head;
+	struct rpc_xprt *pos;
+
+	if (xps == NULL || sap == NULL)
+		return false;
+
+	head = &xps->xps_xprt_list;
+	list_for_each_entry_rcu(pos, head, xprt_switch) {
+		if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) {
+			pr_info("RPC:   addr %s already in xprt switch\n",
+				pos->address_strings[RPC_DISPLAY_ADDR]);
+			return true;
+		}
+	}
+	return false;
+}
+
 static
 struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
 		const struct rpc_xprt *cur)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 87762d9..2c472e1 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -27,7 +27,7 @@
 	list_del(&req->rl_all);
 	spin_unlock(&buf->rb_reqslock);
 
-	rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+	rpcrdma_destroy_req(req);
 
 	kfree(rqst);
 }
@@ -35,10 +35,8 @@
 static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 				 struct rpc_rqst *rqst)
 {
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_req *req;
-	struct xdr_buf *buf;
 	size_t size;
 
 	req = rpcrdma_create_req(r_xprt);
@@ -46,30 +44,19 @@
 		return PTR_ERR(req);
 	req->rl_backchannel = true;
 
-	size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
-	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+				  DMA_TO_DEVICE, GFP_KERNEL);
 	if (IS_ERR(rb))
 		goto out_fail;
 	req->rl_rdmabuf = rb;
 
-	size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
-	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	size = r_xprt->rx_data.inline_rsize;
+	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
 	if (IS_ERR(rb))
 		goto out_fail;
-	rb->rg_owner = req;
 	req->rl_sendbuf = rb;
-	/* so that rpcr_to_rdmar works when receiving a request */
-	rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
-
-	buf = &rqst->rq_snd_buf;
-	buf->head[0].iov_base = rqst->rq_buffer;
-	buf->head[0].iov_len = 0;
-	buf->tail[0].iov_base = NULL;
-	buf->tail[0].iov_len = 0;
-	buf->page_len = 0;
-	buf->len = 0;
-	buf->buflen = size;
-
+	xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size);
+	rpcrdma_set_xprtdata(rqst, req);
 	return 0;
 
 out_fail:
@@ -219,7 +206,6 @@
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct rpcrdma_msg *headerp;
-	size_t rpclen;
 
 	headerp = rdmab_to_msg(req->rl_rdmabuf);
 	headerp->rm_xid = rqst->rq_xid;
@@ -231,26 +217,9 @@
 	headerp->rm_body.rm_chunks[1] = xdr_zero;
 	headerp->rm_body.rm_chunks[2] = xdr_zero;
 
-	rpclen = rqst->rq_svec[0].iov_len;
-
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-	pr_info("RPC:       %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
-		__func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
-	pr_info("RPC:       %s: RPC/RDMA: %*ph\n",
-		__func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
-	pr_info("RPC:       %s:      RPC: %*ph\n",
-		__func__, (int)rpclen, rqst->rq_svec[0].iov_base);
-#endif
-
-	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
-	req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
-	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
-	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
-	req->rl_send_iov[1].length = rpclen;
-	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
-	req->rl_niovs = 2;
+	if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
+				       &rqst->rq_snd_buf, rpcrdma_noch))
+		return -EIO;
 	return 0;
 }
 
@@ -402,7 +371,7 @@
 out_short:
 	pr_warn("RPC/RDMA short backward direction call\n");
 
-	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
 		xprt_disconnect_done(xprt);
 	else
 		pr_warn("RPC:       %s: reposting rep %p\n",
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 21cb3b1..1ebb09e 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -160,9 +160,8 @@
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	    struct rpcrdma_create_data_internal *cdata)
 {
-	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
-						      RPCRDMA_MAX_DATA_SEGS /
-						      RPCRDMA_MAX_FMR_SGES));
+	ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+				RPCRDMA_MAX_FMR_SGES);
 	return 0;
 }
 
@@ -274,6 +273,7 @@
 	 */
 	list_for_each_entry(mw, &req->rl_registered, mw_list)
 		list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
+	r_xprt->rx_stats.local_inv_needed++;
 	rc = ib_unmap_fmr(&unmap_list);
 	if (rc)
 		goto out_reset;
@@ -331,4 +331,5 @@
 	.ro_init_mr			= fmr_op_init_mr,
 	.ro_release_mr			= fmr_op_release_mr,
 	.ro_displayname			= "fmr",
+	.ro_send_w_inv_ok		= 0,
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 892b5e1..2109495 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -67,6 +67,8 @@
  * pending send queue WRs before the transport is reconnected.
  */
 
+#include <linux/sunrpc/rpc_rdma.h>
+
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -161,7 +163,7 @@
 		return PTR_ERR(f->fr_mr);
 	}
 
-	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, f);
 	f->fr_state = FRMR_IS_INVALID;
 	return 0;
 }
@@ -242,9 +244,8 @@
 					       depth;
 	}
 
-	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
-						      RPCRDMA_MAX_DATA_SEGS /
-						      ia->ri_max_frmr_depth));
+	ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+				ia->ri_max_frmr_depth);
 	return 0;
 }
 
@@ -329,7 +330,7 @@
 	frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
 	if (wc->status != IB_WC_SUCCESS)
 		__frwr_sendcompletion_flush(wc, frmr, "localinv");
-	complete_all(&frmr->fr_linv_done);
+	complete(&frmr->fr_linv_done);
 }
 
 /* Post a REG_MR Work Request to register a memory region
@@ -396,7 +397,7 @@
 		goto out_mapmr_err;
 
 	dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-		__func__, mw, mw->mw_nents, mr->length);
+		__func__, frmr, mw->mw_nents, mr->length);
 
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
@@ -449,6 +450,8 @@
 	struct rpcrdma_frmr *f = &mw->frmr;
 	struct ib_send_wr *invalidate_wr;
 
+	dprintk("RPC:       %s: invalidating frmr %p\n", __func__, f);
+
 	f->fr_state = FRMR_IS_INVALID;
 	invalidate_wr = &f->fr_invwr;
 
@@ -472,6 +475,7 @@
 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+	struct rpcrdma_rep *rep = req->rl_reply;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw, *tmp;
 	struct rpcrdma_frmr *f;
@@ -487,6 +491,12 @@
 	f = NULL;
 	invalidate_wrs = pos = prev = NULL;
 	list_for_each_entry(mw, &req->rl_registered, mw_list) {
+		if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
+		    (mw->mw_handle == rep->rr_inv_rkey)) {
+			mw->frmr.fr_state = FRMR_IS_INVALID;
+			continue;
+		}
+
 		pos = __frwr_prepare_linv_wr(mw);
 
 		if (!invalidate_wrs)
@@ -496,6 +506,8 @@
 		prev = pos;
 		f = &mw->frmr;
 	}
+	if (!f)
+		goto unmap;
 
 	/* Strong send queue ordering guarantees that when the
 	 * last WR in the chain completes, all WRs in the chain
@@ -510,6 +522,7 @@
 	 * replaces the QP. The RPC reply handler won't call us
 	 * unless ri_id->qp is a valid pointer.
 	 */
+	r_xprt->rx_stats.local_inv_needed++;
 	rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
 	if (rc)
 		goto reset_mrs;
@@ -521,6 +534,8 @@
 	 */
 unmap:
 	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+		dprintk("RPC:       %s: unmapping frmr %p\n",
+			__func__, &mw->frmr);
 		list_del_init(&mw->mw_list);
 		ib_dma_unmap_sg(ia->ri_device,
 				mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -576,4 +591,5 @@
 	.ro_init_mr			= frwr_op_init_mr,
 	.ro_release_mr			= frwr_op_release_mr,
 	.ro_displayname			= "frwr",
+	.ro_send_w_inv_ok		= RPCRDMA_CMP_F_SND_W_INV_OK,
 };
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a47f170..d987c2d 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-enum rpcrdma_chunktype {
-	rpcrdma_noch = 0,
-	rpcrdma_readch,
-	rpcrdma_areadch,
-	rpcrdma_writech,
-	rpcrdma_replych
-};
-
 static const char transfertypes[][12] = {
 	"inline",	/* no chunks */
 	"read list",	/* some argument via rdma read */
@@ -118,10 +110,12 @@
 	return size;
 }
 
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
-				  struct rpcrdma_create_data_internal *cdata,
-				  unsigned int maxsegs)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
 {
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	unsigned int maxsegs = ia->ri_max_segs;
+
 	ia->ri_max_inline_write = cdata->inline_wsize -
 				  rpcrdma_max_call_header_size(maxsegs);
 	ia->ri_max_inline_read = cdata->inline_rsize -
@@ -155,42 +149,6 @@
 	return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
 }
 
-static int
-rpcrdma_tail_pullup(struct xdr_buf *buf)
-{
-	size_t tlen = buf->tail[0].iov_len;
-	size_t skip = tlen & 3;
-
-	/* Do not include the tail if it is only an XDR pad */
-	if (tlen < 4)
-		return 0;
-
-	/* xdr_write_pages() adds a pad at the beginning of the tail
-	 * if the content in "buf->pages" is unaligned. Force the
-	 * tail's actual content to land at the next XDR position
-	 * after the head instead.
-	 */
-	if (skip) {
-		unsigned char *src, *dst;
-		unsigned int count;
-
-		src = buf->tail[0].iov_base;
-		dst = buf->head[0].iov_base;
-		dst += buf->head[0].iov_len;
-
-		src += skip;
-		tlen -= skip;
-
-		dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
-			__func__, skip, dst, src, tlen);
-
-		for (count = tlen; count; count--)
-			*dst++ = *src++;
-	}
-
-	return tlen;
-}
-
 /* Split "vec" on page boundaries into segments. FMR registers pages,
  * not a byte range. Other modes coalesce these segments into a single
  * MR when they can.
@@ -229,7 +187,8 @@
 
 static int
 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
-	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
+	bool reminv_expected)
 {
 	int len, n, p, page_base;
 	struct page **ppages;
@@ -271,6 +230,13 @@
 	if (type == rpcrdma_readch)
 		return n;
 
+	/* When encoding the Write list, some servers need to see an extra
+	 * segment for odd-length Write chunks; the upper layer provides
+	 * tail iovec space for it. Skip the tail if reminv_expected is set.
+	 */
+	if (type == rpcrdma_writech && reminv_expected)
+		return n;
+
 	if (xdrbuf->tail[0].iov_len) {
 		/* the rpcrdma protocol allows us to omit any trailing
 		 * xdr pad bytes, saving the server an RDMA operation. */
@@ -327,7 +293,7 @@
 	if (rtype == rpcrdma_areadch)
 		pos = 0;
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -391,7 +357,8 @@
 	seg = req->rl_segments;
 	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
 				     rqst->rq_rcv_buf.head[0].iov_len,
-				     wtype, seg);
+				     wtype, seg,
+				     r_xprt->rx_ia.ri_reminv_expected);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -456,7 +423,8 @@
 	}
 
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+				     r_xprt->rx_ia.ri_reminv_expected);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -491,74 +459,184 @@
 	return iptr;
 }
 
-/*
- * Copy write data inline.
- * This function is used for "small" requests. Data which is passed
- * to RPC via iovecs (or page list) is copied directly into the
- * pre-registered memory buffer for this request. For small amounts
- * of data, this is efficient. The cutoff value is tunable.
+/* Prepare the RPC-over-RDMA header SGE.
  */
-static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
+static bool
+rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+			u32 len)
 {
-	int i, npages, curlen;
-	int copy_len;
-	unsigned char *srcp, *destp;
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-	int page_base;
-	struct page **ppages;
+	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
+	struct ib_sge *sge = &req->rl_send_sge[0];
 
-	destp = rqst->rq_svec[0].iov_base;
-	curlen = rqst->rq_svec[0].iov_len;
-	destp += curlen;
+	if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
+		if (!__rpcrdma_dma_map_regbuf(ia, rb))
+			return false;
+		sge->addr = rdmab_addr(rb);
+		sge->lkey = rdmab_lkey(rb);
+	}
+	sge->length = len;
 
-	dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
-		__func__, destp, rqst->rq_slen, curlen);
+	ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+				      sge->length, DMA_TO_DEVICE);
+	req->rl_send_wr.num_sge++;
+	return true;
+}
 
-	copy_len = rqst->rq_snd_buf.page_len;
+/* Prepare the Send SGEs. The head and tail iovecs, and each entry
+ * in the page list, each get their own SGE.
+ */
+static bool
+rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+			 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+{
+	unsigned int sge_no, page_base, len, remaining;
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+	struct ib_device *device = ia->ri_device;
+	struct ib_sge *sge = req->rl_send_sge;
+	u32 lkey = ia->ri_pd->local_dma_lkey;
+	struct page *page, **ppages;
 
-	if (rqst->rq_snd_buf.tail[0].iov_len) {
-		curlen = rqst->rq_snd_buf.tail[0].iov_len;
-		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
-			memmove(destp + copy_len,
-				rqst->rq_snd_buf.tail[0].iov_base, curlen);
-			r_xprt->rx_stats.pullup_copy_count += curlen;
+	/* The head iovec is straightforward, as it is already
+	 * DMA-mapped. Sync the content that has changed.
+	 */
+	if (!rpcrdma_dma_map_regbuf(ia, rb))
+		return false;
+	sge_no = 1;
+	sge[sge_no].addr = rdmab_addr(rb);
+	sge[sge_no].length = xdr->head[0].iov_len;
+	sge[sge_no].lkey = rdmab_lkey(rb);
+	ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+				      sge[sge_no].length, DMA_TO_DEVICE);
+
+	/* If there is a Read chunk, the page list is being handled
+	 * via explicit RDMA, and thus is skipped here. However, the
+	 * tail iovec may include an XDR pad for the page list, as
+	 * well as additional content, and may not reside in the
+	 * same page as the head iovec.
+	 */
+	if (rtype == rpcrdma_readch) {
+		len = xdr->tail[0].iov_len;
+
+		/* Do not include the tail if it is only an XDR pad */
+		if (len < 4)
+			goto out;
+
+		page = virt_to_page(xdr->tail[0].iov_base);
+		page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+
+		/* If the content in the page list is an odd length,
+		 * xdr_write_pages() has added a pad at the beginning
+		 * of the tail iovec. Force the tail's non-pad content
+		 * to land at the next XDR position in the Send message.
+		 */
+		page_base += len & 3;
+		len -= len & 3;
+		goto map_tail;
+	}
+
+	/* If there is a page list present, temporarily DMA map
+	 * and prepare an SGE for each page to be sent.
+	 */
+	if (xdr->page_len) {
+		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+		page_base = xdr->page_base & ~PAGE_MASK;
+		remaining = xdr->page_len;
+		while (remaining) {
+			sge_no++;
+			if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
+				goto out_mapping_overflow;
+
+			len = min_t(u32, PAGE_SIZE - page_base, remaining);
+			sge[sge_no].addr = ib_dma_map_page(device, *ppages,
+							   page_base, len,
+							   DMA_TO_DEVICE);
+			if (ib_dma_mapping_error(device, sge[sge_no].addr))
+				goto out_mapping_err;
+			sge[sge_no].length = len;
+			sge[sge_no].lkey = lkey;
+
+			req->rl_mapped_sges++;
+			ppages++;
+			remaining -= len;
+			page_base = 0;
 		}
-		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
-			__func__, destp + copy_len, curlen);
-		rqst->rq_svec[0].iov_len += curlen;
 	}
-	r_xprt->rx_stats.pullup_copy_count += copy_len;
 
-	page_base = rqst->rq_snd_buf.page_base;
-	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
-	page_base &= ~PAGE_MASK;
-	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
-	for (i = 0; copy_len && i < npages; i++) {
-		curlen = PAGE_SIZE - page_base;
-		if (curlen > copy_len)
-			curlen = copy_len;
-		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
-			__func__, i, destp, copy_len, curlen);
-		srcp = kmap_atomic(ppages[i]);
-		memcpy(destp, srcp+page_base, curlen);
-		kunmap_atomic(srcp);
-		rqst->rq_svec[0].iov_len += curlen;
-		destp += curlen;
-		copy_len -= curlen;
-		page_base = 0;
+	/* The tail iovec is not always constructed in the same
+	 * page where the head iovec resides (see, for example,
+	 * gss_wrap_req_priv). To neatly accommodate that case,
+	 * DMA map it separately.
+	 */
+	if (xdr->tail[0].iov_len) {
+		page = virt_to_page(xdr->tail[0].iov_base);
+		page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+		len = xdr->tail[0].iov_len;
+
+map_tail:
+		sge_no++;
+		sge[sge_no].addr = ib_dma_map_page(device, page,
+						   page_base, len,
+						   DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(device, sge[sge_no].addr))
+			goto out_mapping_err;
+		sge[sge_no].length = len;
+		sge[sge_no].lkey = lkey;
+		req->rl_mapped_sges++;
 	}
-	/* header now contains entire send message */
+
+out:
+	req->rl_send_wr.num_sge = sge_no + 1;
+	return true;
+
+out_mapping_overflow:
+	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
+	return false;
+
+out_mapping_err:
+	pr_err("rpcrdma: Send mapping error\n");
+	return false;
+}
+
+bool
+rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+			  u32 hdrlen, struct xdr_buf *xdr,
+			  enum rpcrdma_chunktype rtype)
+{
+	req->rl_send_wr.num_sge = 0;
+	req->rl_mapped_sges = 0;
+
+	if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
+		goto out_map;
+
+	if (rtype != rpcrdma_areadch)
+		if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
+			goto out_map;
+
+	return true;
+
+out_map:
+	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+	return false;
+}
+
+void
+rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+	struct ib_device *device = ia->ri_device;
+	struct ib_sge *sge;
+	int count;
+
+	sge = &req->rl_send_sge[2];
+	for (count = req->rl_mapped_sges; count--; sge++)
+		ib_dma_unmap_page(device, sge->addr, sge->length,
+				  DMA_TO_DEVICE);
+	req->rl_mapped_sges = 0;
 }
 
 /*
  * Marshal a request: the primary job of this routine is to choose
  * the transfer modes. See comments below.
  *
- * Prepares up to two IOVs per Call message:
- *
- *  [0] -- RPC RDMA header
- *  [1] -- the RPC header/data
- *
  * Returns zero on success, otherwise a negative errno.
  */
 
@@ -626,12 +704,11 @@
 	 */
 	if (rpcrdma_args_inline(r_xprt, rqst)) {
 		rtype = rpcrdma_noch;
-		rpcrdma_inline_pullup(rqst);
-		rpclen = rqst->rq_svec[0].iov_len;
+		rpclen = rqst->rq_snd_buf.len;
 	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
 		rtype = rpcrdma_readch;
-		rpclen = rqst->rq_svec[0].iov_len;
-		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
+		rpclen = rqst->rq_snd_buf.head[0].iov_len +
+			 rqst->rq_snd_buf.tail[0].iov_len;
 	} else {
 		r_xprt->rx_stats.nomsg_call_count++;
 		headerp->rm_type = htonl(RDMA_NOMSG);
@@ -673,34 +750,18 @@
 		goto out_unmap;
 	hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
 
-	if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-		goto out_overflow;
-
 	dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
 		rqst->rq_task->tk_pid, __func__,
 		transfertypes[rtype], transfertypes[wtype],
 		hdrlen, rpclen);
 
-	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
-	req->rl_send_iov[0].length = hdrlen;
-	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
-	req->rl_niovs = 1;
-	if (rtype == rpcrdma_areadch)
-		return 0;
-
-	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
-	req->rl_send_iov[1].length = rpclen;
-	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
-	req->rl_niovs = 2;
+	if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+				       &rqst->rq_snd_buf, rtype)) {
+		iptr = ERR_PTR(-EIO);
+		goto out_unmap;
+	}
 	return 0;
 
-out_overflow:
-	pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
-		hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
-	iptr = ERR_PTR(-EIO);
-
 out_unmap:
 	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
 	return PTR_ERR(iptr);
@@ -916,8 +977,10 @@
  * allowed to timeout, to discover the errors at that time.
  */
 void
-rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+rpcrdma_reply_handler(struct work_struct *work)
 {
+	struct rpcrdma_rep *rep =
+			container_of(work, struct rpcrdma_rep, rr_work);
 	struct rpcrdma_msg *headerp;
 	struct rpcrdma_req *req;
 	struct rpc_rqst *rqst;
@@ -1132,6 +1195,6 @@
 
 repost:
 	r_xprt->rx_stats.bad_reply_count++;
-	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
 		rpcrdma_recv_buffer_put(rep);
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a2a7519..2d8545c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -129,7 +129,7 @@
 		ret = -EIO;
 		goto out_unmap;
 	}
-	atomic_inc(&rdma->sc_dma_used);
+	svc_rdma_count_mappings(rdma, ctxt);
 
 	memset(&send_wr, 0, sizeof(send_wr));
 	ctxt->cqe.done = svc_rdma_wc_send;
@@ -159,33 +159,34 @@
 /* Server-side transport endpoint wants a whole page for its send
  * buffer. The client RPC code constructs the RPC header in this
  * buffer before it invokes ->send_request.
- *
- * Returns NULL if there was a temporary allocation failure.
  */
-static void *
-xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+static int
+xprt_rdma_bc_allocate(struct rpc_task *task)
 {
 	struct rpc_rqst *rqst = task->tk_rqstp;
 	struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+	size_t size = rqst->rq_callsize;
 	struct svcxprt_rdma *rdma;
 	struct page *page;
 
 	rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
 
-	/* Prevent an infinite loop: try to make this case work */
-	if (size > PAGE_SIZE)
+	if (size > PAGE_SIZE) {
 		WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
 			  size);
+		return -EINVAL;
+	}
 
 	page = alloc_page(RPCRDMA_DEF_GFP);
 	if (!page)
-		return NULL;
+		return -ENOMEM;
 
-	return page_address(page);
+	rqst->rq_buffer = page_address(page);
+	return 0;
 }
 
 static void
-xprt_rdma_bc_free(void *buffer)
+xprt_rdma_bc_free(struct rpc_task *task)
 {
 	/* No-op: ctxt and page have already been freed. */
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2c25606..ad1df97 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -159,7 +159,7 @@
 					   ctxt->sge[pno].addr);
 		if (ret)
 			goto err;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 
 		ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
 		ctxt->sge[pno].length = len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 54d53330..f5a91ed 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -225,6 +225,48 @@
 	return rp_ary;
 }
 
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one rkey to invalidate.
+ *
+ * Find a candidate rkey to invalidate when sending a reply.  Picks the
+ * first rkey it finds in the chunk lists.
+ *
+ * Returns zero if RPC's chunk lists are empty.
+ */
+static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
+				 struct rpcrdma_write_array *wr_ary,
+				 struct rpcrdma_write_array *rp_ary)
+{
+	struct rpcrdma_read_chunk *rd_ary;
+	struct rpcrdma_segment *arg_ch;
+	u32 inv_rkey;
+
+	inv_rkey = 0;
+
+	rd_ary = svc_rdma_get_read_chunk(rdma_argp);
+	if (rd_ary) {
+		inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle);
+		goto out;
+	}
+
+	if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
+		arg_ch = &wr_ary->wc_array[0].wc_target;
+		inv_rkey = be32_to_cpu(arg_ch->rs_handle);
+		goto out;
+	}
+
+	if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
+		arg_ch = &rp_ary->wc_array[0].wc_target;
+		inv_rkey = be32_to_cpu(arg_ch->rs_handle);
+		goto out;
+	}
+
+out:
+	dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey);
+	return inv_rkey;
+}
+
 /* Assumptions:
  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
  */
@@ -280,7 +322,7 @@
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 sge[sge_no].addr))
 			goto err;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 		sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
 		ctxt->count++;
 		sge_off = 0;
@@ -464,7 +506,8 @@
 		      struct page *page,
 		      struct rpcrdma_msg *rdma_resp,
 		      struct svc_rdma_req_map *vec,
-		      int byte_count)
+		      int byte_count,
+		      u32 inv_rkey)
 {
 	struct svc_rdma_op_ctxt *ctxt;
 	struct ib_send_wr send_wr;
@@ -489,7 +532,7 @@
 			    ctxt->sge[0].length, DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
 		goto err;
-	atomic_inc(&rdma->sc_dma_used);
+	svc_rdma_count_mappings(rdma, ctxt);
 
 	ctxt->direction = DMA_TO_DEVICE;
 
@@ -505,7 +548,7 @@
 		if (ib_dma_mapping_error(rdma->sc_cm_id->device,
 					 ctxt->sge[sge_no].addr))
 			goto err;
-		atomic_inc(&rdma->sc_dma_used);
+		svc_rdma_count_mappings(rdma, ctxt);
 		ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
 		ctxt->sge[sge_no].length = sge_bytes;
 	}
@@ -523,23 +566,9 @@
 		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
 		ctxt->count++;
 		rqstp->rq_respages[page_no] = NULL;
-		/*
-		 * If there are more pages than SGE, terminate SGE
-		 * list so that svc_rdma_unmap_dma doesn't attempt to
-		 * unmap garbage.
-		 */
-		if (page_no+1 >= sge_no)
-			ctxt->sge[page_no+1].length = 0;
 	}
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
-	/* The loop above bumps sc_dma_used for each sge. The
-	 * xdr_buf.tail gets a separate sge, but resides in the
-	 * same page as xdr_buf.head. Don't count it twice.
-	 */
-	if (sge_no > ctxt->count)
-		atomic_dec(&rdma->sc_dma_used);
-
 	if (sge_no > rdma->sc_max_sge) {
 		pr_err("svcrdma: Too many sges (%d)\n", sge_no);
 		goto err;
@@ -549,7 +578,11 @@
 	send_wr.wr_cqe = &ctxt->cqe;
 	send_wr.sg_list = ctxt->sge;
 	send_wr.num_sge = sge_no;
-	send_wr.opcode = IB_WR_SEND;
+	if (inv_rkey) {
+		send_wr.opcode = IB_WR_SEND_WITH_INV;
+		send_wr.ex.invalidate_rkey = inv_rkey;
+	} else
+		send_wr.opcode = IB_WR_SEND;
 	send_wr.send_flags =  IB_SEND_SIGNALED;
 
 	ret = svc_rdma_send(rdma, &send_wr);
@@ -581,6 +614,7 @@
 	int inline_bytes;
 	struct page *res_page;
 	struct svc_rdma_req_map *vec;
+	u32 inv_rkey;
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
@@ -591,6 +625,10 @@
 	wr_ary = svc_rdma_get_write_array(rdma_argp);
 	rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
 
+	inv_rkey = 0;
+	if (rdma->sc_snd_w_inv)
+		inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
+
 	/* Build an req vec for the XDR */
 	vec = svc_rdma_get_req_map(rdma);
 	ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
@@ -633,9 +671,9 @@
 		goto err1;
 
 	ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
-			 inline_bytes);
+			 inline_bytes, inv_rkey);
 	if (ret < 0)
-		goto err1;
+		goto err0;
 
 	svc_rdma_put_req_map(rdma, vec);
 	dprintk("svcrdma: send_reply returns %d\n", ret);
@@ -692,7 +730,7 @@
 		svc_rdma_put_context(ctxt, 1);
 		return;
 	}
-	atomic_inc(&xprt->sc_dma_used);
+	svc_rdma_count_mappings(xprt, ctxt);
 
 	/* Prepare SEND WR */
 	memset(&err_wr, 0, sizeof(err_wr));
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index eb2857f..6864fb9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -198,6 +198,7 @@
 
 out:
 	ctxt->count = 0;
+	ctxt->mapped_sges = 0;
 	ctxt->frmr = NULL;
 	return ctxt;
 
@@ -221,22 +222,27 @@
 void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
 	struct svcxprt_rdma *xprt = ctxt->xprt;
-	int i;
-	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+	struct ib_device *device = xprt->sc_cm_id->device;
+	u32 lkey = xprt->sc_pd->local_dma_lkey;
+	unsigned int i, count;
+
+	for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
 		/*
 		 * Unmap the DMA addr in the SGE if the lkey matches
 		 * the local_dma_lkey, otherwise, ignore it since it is
 		 * an FRMR lkey and will be unmapped later when the
 		 * last WR that uses it completes.
 		 */
-		if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
-			atomic_dec(&xprt->sc_dma_used);
-			ib_dma_unmap_page(xprt->sc_cm_id->device,
+		if (ctxt->sge[i].lkey == lkey) {
+			count++;
+			ib_dma_unmap_page(device,
 					    ctxt->sge[i].addr,
 					    ctxt->sge[i].length,
 					    ctxt->direction);
 		}
 	}
+	ctxt->mapped_sges = 0;
+	atomic_sub(count, &xprt->sc_dma_used);
 }
 
 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -600,7 +606,7 @@
 				     DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
 			goto err_put_ctxt;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 		ctxt->sge[sge_no].addr = pa;
 		ctxt->sge[sge_no].length = PAGE_SIZE;
 		ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
@@ -642,6 +648,26 @@
 	return ret;
 }
 
+static void
+svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
+			       struct rdma_conn_param *param)
+{
+	const struct rpcrdma_connect_private *pmsg = param->private_data;
+
+	if (pmsg &&
+	    pmsg->cp_magic == rpcrdma_cmp_magic &&
+	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+		newxprt->sc_snd_w_inv = pmsg->cp_flags &
+					RPCRDMA_CMP_F_SND_W_INV_OK;
+
+		dprintk("svcrdma: client send_size %u, recv_size %u "
+			"remote inv %ssupported\n",
+			rpcrdma_decode_buffer_size(pmsg->cp_send_size),
+			rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
+			newxprt->sc_snd_w_inv ? "" : "un");
+	}
+}
+
 /*
  * This function handles the CONNECT_REQUEST event on a listening
  * endpoint. It is passed the cma_id for the _new_ connection. The context in
@@ -653,7 +679,8 @@
  * will call the recvfrom method on the listen xprt which will accept the new
  * connection.
  */
-static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id,
+			       struct rdma_conn_param *param)
 {
 	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
 	struct svcxprt_rdma *newxprt;
@@ -669,9 +696,10 @@
 	new_cma_id->context = newxprt;
 	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
 		newxprt, newxprt->sc_cm_id, listen_xprt);
+	svc_rdma_parse_connect_private(newxprt, param);
 
 	/* Save client advertised inbound read limit for use later in accept. */
-	newxprt->sc_ord = client_ird;
+	newxprt->sc_ord = param->initiator_depth;
 
 	/* Set the local and remote addresses in the transport */
 	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
@@ -706,8 +734,7 @@
 		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
 			"event = %s (%d)\n", cma_id, cma_id->context,
 			rdma_event_msg(event->event), event->event);
-		handle_connect_req(cma_id,
-				   event->param.conn.initiator_depth);
+		handle_connect_req(cma_id, &event->param.conn);
 		break;
 
 	case RDMA_CM_EVENT_ESTABLISHED:
@@ -941,6 +968,7 @@
 	struct svcxprt_rdma *listen_rdma;
 	struct svcxprt_rdma *newxprt = NULL;
 	struct rdma_conn_param conn_param;
+	struct rpcrdma_connect_private pmsg;
 	struct ib_qp_init_attr qp_attr;
 	struct ib_device *dev;
 	unsigned int i;
@@ -1070,7 +1098,8 @@
 			dev->attrs.max_fast_reg_page_list_len;
 		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
 		newxprt->sc_reader = rdma_read_chunk_frmr;
-	}
+	} else
+		newxprt->sc_snd_w_inv = false;
 
 	/*
 	 * Determine if a DMA MR is required and if so, what privs are required
@@ -1094,11 +1123,20 @@
 	/* Swap out the handler */
 	newxprt->sc_cm_id->event_handler = rdma_cma_handler;
 
+	/* Construct RDMA-CM private message */
+	pmsg.cp_magic = rpcrdma_cmp_magic;
+	pmsg.cp_version = RPCRDMA_CMP_VERSION;
+	pmsg.cp_flags = 0;
+	pmsg.cp_send_size = pmsg.cp_recv_size =
+		rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);
+
 	/* Accept Connection */
 	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 0;
 	conn_param.initiator_depth = newxprt->sc_ord;
+	conn_param.private_data = &pmsg;
+	conn_param.private_data_len = sizeof(pmsg);
 	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
 	if (ret) {
 		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 81f0e87..ed5e285 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -97,7 +97,7 @@
 		.data		= &xprt_rdma_max_inline_read,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_inline_size,
 		.extra2		= &max_inline_size,
 	},
@@ -106,7 +106,7 @@
 		.data		= &xprt_rdma_max_inline_write,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_inline_size,
 		.extra2		= &max_inline_size,
 	},
@@ -477,115 +477,152 @@
 	}
 }
 
-/*
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
- *
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
  */
-static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    gfp_t flags)
 {
-	struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	size_t size = RPCRDMA_HDRBUF_SIZE;
 	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_rdmabuf)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_rdmabuf = rb;
+	return true;
+}
+
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(req->rl_sendbuf);
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_sendbuf = rb;
+	return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
+ *
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
+ *
+ * A regbuf is used here to remember the buffer size.
+ */
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(req->rl_recvbuf);
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_recvbuf = rb;
+	return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ *        0:	Success; rq_buffer points to RPC buffer to use
+ *  -ENOMEM:	Out of memory, call again later
+ *     -EIO:	A permanent error occurred, do not retry
+ *
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
+ *
+ * xprt_rdma_allocate provides buffers that are not yet mapped for
+ * DMA; rpcrdma_dma_map_regbuf maps them and sets the lkey on demand.
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_req *req;
-	size_t min_size;
 	gfp_t flags;
 
 	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 	if (req == NULL)
-		return NULL;
+		return -ENOMEM;
 
 	flags = RPCRDMA_DEF_GFP;
 	if (RPC_IS_SWAPPER(task))
 		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
-	if (req->rl_rdmabuf == NULL)
-		goto out_rdmabuf;
-	if (req->rl_sendbuf == NULL)
-		goto out_sendbuf;
-	if (size > req->rl_sendbuf->rg_size)
-		goto out_sendbuf;
+	if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
+		goto out_fail;
+	if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+		goto out_fail;
+	if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
+		goto out_fail;
 
-out:
-	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
+	dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+		task->tk_pid, __func__, rqst->rq_callsize,
+		rqst->rq_rcvsize, req);
+
 	req->rl_connect_cookie = 0;	/* our reserved value */
-	req->rl_task = task;
-	return req->rl_sendbuf->rg_base;
-
-out_rdmabuf:
-	min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
-	if (IS_ERR(rb))
-		goto out_fail;
-	req->rl_rdmabuf = rb;
-
-out_sendbuf:
-	/* XDR encoding and RPC/RDMA marshaling of this request has not
-	 * yet occurred. Thus a lower bound is needed to prevent buffer
-	 * overrun during marshaling.
-	 *
-	 * RPC/RDMA marshaling may choose to send payload bearing ops
-	 * inline, if the result is smaller than the inline threshold.
-	 * The value of the "size" argument accounts for header
-	 * requirements but not for the payload in these cases.
-	 *
-	 * Likewise, allocate enough space to receive a reply up to the
-	 * size of the inline threshold.
-	 *
-	 * It's unlikely that both the send header and the received
-	 * reply will be large, but slush is provided here to allow
-	 * flexibility when marshaling.
-	 */
-	min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
-	min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-	if (size < min_size)
-		size = min_size;
-
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
-	if (IS_ERR(rb))
-		goto out_fail;
-	rb->rg_owner = req;
-
-	r_xprt->rx_stats.hardway_register_count += size;
-	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
-	req->rl_sendbuf = rb;
-	goto out;
+	rpcrdma_set_xprtdata(rqst, req);
+	rqst->rq_buffer = req->rl_sendbuf->rg_base;
+	rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+	return 0;
 
 out_fail:
 	rpcrdma_buffer_put(req);
-	return NULL;
+	return -ENOMEM;
 }
 
-/*
- * This function returns all RDMA resources to the pool.
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
  */
 static void
-xprt_rdma_free(void *buffer)
+xprt_rdma_free(struct rpc_task *task)
 {
-	struct rpcrdma_req *req;
-	struct rpcrdma_xprt *r_xprt;
-	struct rpcrdma_regbuf *rb;
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	if (buffer == NULL)
-		return;
-
-	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
-	req = rb->rg_owner;
 	if (req->rl_backchannel)
 		return;
 
-	r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-
 	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
-					    !RPC_IS_ASYNC(req->rl_task));
-
+	ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+	rpcrdma_unmap_sges(ia, req);
 	rpcrdma_buffer_put(req);
 }
 
@@ -685,10 +722,11 @@
 		   r_xprt->rx_stats.failed_marshal_count,
 		   r_xprt->rx_stats.bad_reply_count,
 		   r_xprt->rx_stats.nomsg_call_count);
-	seq_printf(seq, "%lu %lu %lu\n",
+	seq_printf(seq, "%lu %lu %lu %lu\n",
 		   r_xprt->rx_stats.mrs_recovered,
 		   r_xprt->rx_stats.mrs_orphaned,
-		   r_xprt->rx_stats.mrs_allocated);
+		   r_xprt->rx_stats.mrs_allocated,
+		   r_xprt->rx_stats.local_inv_needed);
 }
 
 static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index be3178e..ec74289 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -129,15 +129,6 @@
 		       wc->status, wc->vendor_err);
 }
 
-static void
-rpcrdma_receive_worker(struct work_struct *work)
-{
-	struct rpcrdma_rep *rep =
-			container_of(work, struct rpcrdma_rep, rr_work);
-
-	rpcrdma_reply_handler(rep);
-}
-
 /* Perform basic sanity checking to avoid using garbage
  * to update the credit grant value.
  */
@@ -161,13 +152,13 @@
 }
 
 /**
- * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+ * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
  * @cq:	completion queue (ignored)
  * @wc:	completed WR
  *
  */
 static void
-rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
+rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct ib_cqe *cqe = wc->wr_cqe;
 	struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
@@ -185,6 +176,9 @@
 		__func__, rep, wc->byte_len);
 
 	rep->rr_len = wc->byte_len;
+	rep->rr_wc_flags = wc->wc_flags;
+	rep->rr_inv_rkey = wc->ex.invalidate_rkey;
+
 	ib_dma_sync_single_for_cpu(rep->rr_device,
 				   rdmab_addr(rep->rr_rdmabuf),
 				   rep->rr_len, DMA_FROM_DEVICE);
@@ -204,6 +198,36 @@
 	goto out_schedule;
 }
 
+static void
+rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
+			       struct rdma_conn_param *param)
+{
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	const struct rpcrdma_connect_private *pmsg = param->private_data;
+	unsigned int rsize, wsize;
+
+	/* Default settings for RPC-over-RDMA Version One */
+	r_xprt->rx_ia.ri_reminv_expected = false;
+	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+
+	if (pmsg &&
+	    pmsg->cp_magic == rpcrdma_cmp_magic &&
+	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+		r_xprt->rx_ia.ri_reminv_expected = true;
+		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+	}
+
+	if (rsize < cdata->inline_rsize)
+		cdata->inline_rsize = rsize;
+	if (wsize < cdata->inline_wsize)
+		cdata->inline_wsize = wsize;
+	pr_info("rpcrdma: max send %u, max recv %u\n",
+		cdata->inline_wsize, cdata->inline_rsize);
+	rpcrdma_set_max_header_sizes(r_xprt);
+}
+
 static int
 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -244,6 +268,7 @@
 			" (%d initiator)\n",
 			__func__, attr->max_dest_rd_atomic,
 			attr->max_rd_atomic);
+		rpcrdma_update_connect_private(xprt, &event->param.conn);
 		goto connected;
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 		connstate = -ENOTCONN;
@@ -454,11 +479,12 @@
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 				struct rpcrdma_create_data_internal *cdata)
 {
+	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
 	struct ib_cq *sendcq, *recvcq;
 	unsigned int max_qp_wr;
 	int rc;
 
-	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
+	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
 		dprintk("RPC:       %s: insufficient sge's available\n",
 			__func__);
 		return -ENOMEM;
@@ -487,7 +513,7 @@
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
-	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
+	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -536,9 +562,14 @@
 	/* Initialize cma parameters */
 	memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
 
-	/* RPC/RDMA does not use private data */
-	ep->rep_remote_cma.private_data = NULL;
-	ep->rep_remote_cma.private_data_len = 0;
+	/* Prepare RDMA-CM private message */
+	pmsg->cp_magic = rpcrdma_cmp_magic;
+	pmsg->cp_version = RPCRDMA_CMP_VERSION;
+	pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
+	pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
+	pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+	ep->rep_remote_cma.private_data = pmsg;
+	ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
 
 	/* Client offers RDMA Read but does not initiate */
 	ep->rep_remote_cma.initiator_depth = 0;
@@ -849,6 +880,10 @@
 	req->rl_cqe.done = rpcrdma_wc_send;
 	req->rl_buffer = &r_xprt->rx_buf;
 	INIT_LIST_HEAD(&req->rl_registered);
+	req->rl_send_wr.next = NULL;
+	req->rl_send_wr.wr_cqe = &req->rl_cqe;
+	req->rl_send_wr.sg_list = req->rl_send_sge;
+	req->rl_send_wr.opcode = IB_WR_SEND;
 	return req;
 }
 
@@ -865,17 +900,21 @@
 	if (rep == NULL)
 		goto out;
 
-	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
-					       GFP_KERNEL);
+	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+					       DMA_FROM_DEVICE, GFP_KERNEL);
 	if (IS_ERR(rep->rr_rdmabuf)) {
 		rc = PTR_ERR(rep->rr_rdmabuf);
 		goto out_free;
 	}
 
 	rep->rr_device = ia->ri_device;
-	rep->rr_cqe.done = rpcrdma_receive_wc;
+	rep->rr_cqe.done = rpcrdma_wc_receive;
 	rep->rr_rxprt = r_xprt;
-	INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
+	INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
+	rep->rr_recv_wr.next = NULL;
+	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+	rep->rr_recv_wr.num_sge = 1;
 	return rep;
 
 out_free:
@@ -966,17 +1005,18 @@
 }
 
 static void
-rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
 {
-	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+	rpcrdma_free_regbuf(rep->rr_rdmabuf);
 	kfree(rep);
 }
 
 void
-rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+rpcrdma_destroy_req(struct rpcrdma_req *req)
 {
-	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
-	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+	rpcrdma_free_regbuf(req->rl_recvbuf);
+	rpcrdma_free_regbuf(req->rl_sendbuf);
+	rpcrdma_free_regbuf(req->rl_rdmabuf);
 	kfree(req);
 }
 
@@ -1009,15 +1049,13 @@
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-
 	cancel_delayed_work_sync(&buf->rb_recovery_worker);
 
 	while (!list_empty(&buf->rb_recv_bufs)) {
 		struct rpcrdma_rep *rep;
 
 		rep = rpcrdma_buffer_get_rep_locked(buf);
-		rpcrdma_destroy_rep(ia, rep);
+		rpcrdma_destroy_rep(rep);
 	}
 	buf->rb_send_count = 0;
 
@@ -1030,7 +1068,7 @@
 		list_del(&req->rl_all);
 
 		spin_unlock(&buf->rb_reqslock);
-		rpcrdma_destroy_req(ia, req);
+		rpcrdma_destroy_req(req);
 		spin_lock(&buf->rb_reqslock);
 	}
 	spin_unlock(&buf->rb_reqslock);
@@ -1129,7 +1167,7 @@
 	struct rpcrdma_buffer *buffers = req->rl_buffer;
 	struct rpcrdma_rep *rep = req->rl_reply;
 
-	req->rl_niovs = 0;
+	req->rl_send_wr.num_sge = 0;
 	req->rl_reply = NULL;
 
 	spin_lock(&buffers->rb_lock);
@@ -1171,70 +1209,81 @@
 	spin_unlock(&buffers->rb_lock);
 }
 
-/*
- * Wrappers for internal-use kmalloc memory registration, used by buffer code.
- */
-
 /**
- * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
- * @ia: controlling rpcrdma_ia
+ * rpcrdma_alloc_regbuf - allocate DMA-mappable memory for SEND/RECV buffers
  * @size: size of buffer to be allocated, in bytes
+ * @direction: direction of data movement
  * @flags: GFP flags
  *
- * Returns pointer to private header of an area of internally
- * registered memory, or an ERR_PTR. The registered buffer follows
- * the end of the private header.
+ * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
+ * can be persistently DMA-mapped for I/O.
  *
  * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
- * receiving the payload of RDMA RECV operations. regbufs are not
- * used for RDMA READ/WRITE operations, thus are registered only for
- * LOCAL access.
+ * receiving the payload of RDMA RECV operations. During Long Calls
+ * or Replies they may be registered externally via ro_map.
  */
 struct rpcrdma_regbuf *
-rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+		     gfp_t flags)
 {
 	struct rpcrdma_regbuf *rb;
-	struct ib_sge *iov;
 
 	rb = kmalloc(sizeof(*rb) + size, flags);
 	if (rb == NULL)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
-	iov = &rb->rg_iov;
-	iov->addr = ib_dma_map_single(ia->ri_device,
-				      (void *)rb->rg_base, size,
-				      DMA_BIDIRECTIONAL);
-	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-		goto out_free;
+	rb->rg_device = NULL;
+	rb->rg_direction = direction;
+	rb->rg_iov.length = size;
 
-	iov->length = size;
-	iov->lkey = ia->ri_pd->local_dma_lkey;
-	rb->rg_size = size;
-	rb->rg_owner = NULL;
 	return rb;
+}
 
-out_free:
-	kfree(rb);
-out:
-	return ERR_PTR(-ENOMEM);
+/**
+ * __rpcrdma_dma_map_regbuf - DMA-map a regbuf
+ * @ia: controlling rpcrdma_ia
+ * @rb: regbuf to be mapped
+ */
+bool
+__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+	if (rb->rg_direction == DMA_NONE)
+		return false;
+
+	rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+					    (void *)rb->rg_base,
+					    rdmab_length(rb),
+					    rb->rg_direction);
+	if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+		return false;
+
+	rb->rg_device = ia->ri_device;
+	rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+	return true;
+}
+
+static void
+rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+{
+	if (!rpcrdma_regbuf_is_mapped(rb))
+		return;
+
+	ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
+			    rdmab_length(rb), rb->rg_direction);
+	rb->rg_device = NULL;
 }
 
 /**
  * rpcrdma_free_regbuf - deregister and free registered buffer
- * @ia: controlling rpcrdma_ia
  * @rb: regbuf to be deregistered and freed
  */
 void
-rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
 {
-	struct ib_sge *iov;
-
 	if (!rb)
 		return;
 
-	iov = &rb->rg_iov;
-	ib_dma_unmap_single(ia->ri_device,
-			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
+	rpcrdma_dma_unmap_regbuf(rb);
 	kfree(rb);
 }
 
@@ -1248,39 +1297,28 @@
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_req *req)
 {
-	struct ib_device *device = ia->ri_device;
-	struct ib_send_wr send_wr, *send_wr_fail;
-	struct rpcrdma_rep *rep = req->rl_reply;
-	struct ib_sge *iov = req->rl_send_iov;
-	int i, rc;
+	struct ib_send_wr *send_wr = &req->rl_send_wr;
+	struct ib_send_wr *send_wr_fail;
+	int rc;
 
-	if (rep) {
-		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+	if (req->rl_reply) {
+		rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
 		if (rc)
 			return rc;
 		req->rl_reply = NULL;
 	}
 
-	send_wr.next = NULL;
-	send_wr.wr_cqe = &req->rl_cqe;
-	send_wr.sg_list = iov;
-	send_wr.num_sge = req->rl_niovs;
-	send_wr.opcode = IB_WR_SEND;
-
-	for (i = 0; i < send_wr.num_sge; i++)
-		ib_dma_sync_single_for_device(device, iov[i].addr,
-					      iov[i].length, DMA_TO_DEVICE);
 	dprintk("RPC:       %s: posting %d s/g entries\n",
-		__func__, send_wr.num_sge);
+		__func__, send_wr->num_sge);
 
 	if (DECR_CQCOUNT(ep) > 0)
-		send_wr.send_flags = 0;
+		send_wr->send_flags = 0;
 	else { /* Provider must take a send completion every now and then */
 		INIT_CQCOUNT(ep);
-		send_wr.send_flags = IB_SEND_SIGNALED;
+		send_wr->send_flags = IB_SEND_SIGNALED;
 	}
 
-	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+	rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
 	if (rc)
 		goto out_postsend_err;
 	return 0;
@@ -1290,32 +1328,24 @@
 	return -ENOTCONN;
 }
 
-/*
- * (Re)post a receive buffer.
- */
 int
 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
-		     struct rpcrdma_ep *ep,
 		     struct rpcrdma_rep *rep)
 {
-	struct ib_recv_wr recv_wr, *recv_wr_fail;
+	struct ib_recv_wr *recv_wr_fail;
 	int rc;
 
-	recv_wr.next = NULL;
-	recv_wr.wr_cqe = &rep->rr_cqe;
-	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
-	recv_wr.num_sge = 1;
-
-	ib_dma_sync_single_for_cpu(ia->ri_device,
-				   rdmab_addr(rep->rr_rdmabuf),
-				   rdmab_length(rep->rr_rdmabuf),
-				   DMA_BIDIRECTIONAL);
-
-	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+	if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
+		goto out_map;
+	rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
 	if (rc)
 		goto out_postrecv;
 	return 0;
 
+out_map:
+	pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
+	return -EIO;
+
 out_postrecv:
 	pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
 	return -ENOTCONN;
@@ -1333,7 +1363,6 @@
 {
 	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 	struct rpcrdma_rep *rep;
 	int rc;
 
@@ -1344,7 +1373,7 @@
 		rep = rpcrdma_buffer_get_rep_locked(buffers);
 		spin_unlock(&buffers->rb_lock);
 
-		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+		rc = rpcrdma_ep_post_recv(ia, rep);
 		if (rc)
 			goto out_rc;
 	}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a71b0f5..0d35b76 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,9 +70,11 @@
 	struct ib_pd		*ri_pd;
 	struct completion	ri_done;
 	int			ri_async_rc;
+	unsigned int		ri_max_segs;
 	unsigned int		ri_max_frmr_depth;
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
+	bool			ri_reminv_expected;
 	struct ib_qp_attr	ri_qp_attr;
 	struct ib_qp_init_attr	ri_qp_init_attr;
 };
@@ -87,6 +89,7 @@
 	int			rep_connected;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t 	rep_connect_wait;
+	struct rpcrdma_connect_private	rep_cm_private;
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
@@ -112,9 +115,9 @@
  */
 
 struct rpcrdma_regbuf {
-	size_t			rg_size;
-	struct rpcrdma_req	*rg_owner;
 	struct ib_sge		rg_iov;
+	struct ib_device	*rg_device;
+	enum dma_data_direction	rg_direction;
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 };
 
@@ -162,7 +165,10 @@
  * The smallest inline threshold is 1024 bytes, ensuring that
  * at least 750 bytes are available for RPC messages.
  */
-#define RPCRDMA_MAX_HDR_SEGS	(8)
+enum {
+	RPCRDMA_MAX_HDR_SEGS = 8,
+	RPCRDMA_HDRBUF_SIZE = 256,
+};
 
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
@@ -182,10 +188,13 @@
 struct rpcrdma_rep {
 	struct ib_cqe		rr_cqe;
 	unsigned int		rr_len;
+	int			rr_wc_flags;
+	u32			rr_inv_rkey;
 	struct ib_device	*rr_device;
 	struct rpcrdma_xprt	*rr_rxprt;
 	struct work_struct	rr_work;
 	struct list_head	rr_list;
+	struct ib_recv_wr	rr_recv_wr;
 	struct rpcrdma_regbuf	*rr_rdmabuf;
 };
 
@@ -276,19 +285,30 @@
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
-#define RPCRDMA_MAX_IOVS	(2)
+/* Reserve enough Send SGEs to send a maximum size inline request:
+ * - RPC-over-RDMA header
+ * - xdr_buf head iovec
+ * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - xdr_buf tail iovec
+ */
+enum {
+	RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
+	RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+	RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
+};
 
 struct rpcrdma_buffer;
 struct rpcrdma_req {
 	struct list_head	rl_free;
-	unsigned int		rl_niovs;
+	unsigned int		rl_mapped_sges;
 	unsigned int		rl_connect_cookie;
-	struct rpc_task		*rl_task;
 	struct rpcrdma_buffer	*rl_buffer;
-	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
-	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
-	struct rpcrdma_regbuf	*rl_rdmabuf;
-	struct rpcrdma_regbuf	*rl_sendbuf;
+	struct rpcrdma_rep	*rl_reply;
+	struct ib_send_wr	rl_send_wr;
+	struct ib_sge		rl_send_sge[RPCRDMA_MAX_SEND_SGES];
+	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
+	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */
+	struct rpcrdma_regbuf	*rl_recvbuf;	/* rq_rcv_buf */
 
 	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;
@@ -298,14 +318,16 @@
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 
+static inline void
+rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
+{
+	rqst->rq_xprtdata = req;
+}
+
 static inline struct rpcrdma_req *
 rpcr_to_rdmar(struct rpc_rqst *rqst)
 {
-	void *buffer = rqst->rq_buffer;
-	struct rpcrdma_regbuf *rb;
-
-	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
-	return rb->rg_owner;
+	return rqst->rq_xprtdata;
 }
 
 /*
@@ -356,15 +378,6 @@
 	unsigned int	padding;	/* non-rdma write header padding */
 };
 
-#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
-	(rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
-
-#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
-	(rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
-
-#define RPCRDMA_INLINE_PAD_VALUE(rq)\
-	rpcx_to_rdmad(rq->rq_xprt).padding
-
 /*
  * Statistics for RPCRDMA
  */
@@ -386,6 +399,7 @@
 	unsigned long		mrs_recovered;
 	unsigned long		mrs_orphaned;
 	unsigned long		mrs_allocated;
+	unsigned long		local_inv_needed;
 };
 
 /*
@@ -409,6 +423,7 @@
 				      struct rpcrdma_mw *);
 	void		(*ro_release_mr)(struct rpcrdma_mw *);
 	const char	*ro_displayname;
+	const int	ro_send_w_inv_ok;
 };
 
 extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
@@ -461,15 +476,14 @@
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
 				struct rpcrdma_req *);
-int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
-				struct rpcrdma_rep *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
 
 /*
  * Buffer calls - xprtrdma/verbs.c
  */
 struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
 struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
-void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
+void rpcrdma_destroy_req(struct rpcrdma_req *);
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 
@@ -482,10 +496,24 @@
 
 void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
 
-struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
-					    size_t, gfp_t);
-void rpcrdma_free_regbuf(struct rpcrdma_ia *,
-			 struct rpcrdma_regbuf *);
+struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
+					    gfp_t);
+bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
+void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
+
+static inline bool
+rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_device != NULL;
+}
+
+static inline bool
+rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+	if (likely(rpcrdma_regbuf_is_mapped(rb)))
+		return true;
+	return __rpcrdma_dma_map_regbuf(ia, rb);
+}
 
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
@@ -507,15 +535,25 @@
  */
 void rpcrdma_connect_worker(struct work_struct *);
 void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct rpcrdma_rep *);
+void rpcrdma_reply_handler(struct work_struct *);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
+			       u32, struct xdr_buf *, enum rpcrdma_chunktype);
+void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
 int rpcrdma_marshal_req(struct rpc_rqst *);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
-				  struct rpcrdma_create_data_internal *,
-				  unsigned int);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf16883..0137af1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -473,7 +473,16 @@
 	spin_unlock_bh(&xprt->transport_lock);
 
 	/* Race breaker in case memory is freed before above code is called */
-	sk->sk_write_space(sk);
+	if (ret == -EAGAIN) {
+		struct socket_wq *wq;
+
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
+		rcu_read_unlock();
+
+		sk->sk_write_space(sk);
+	}
 	return ret;
 }
 
@@ -2533,35 +2542,38 @@
  * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
  * to use the server side send routines.
  */
-static void *bc_malloc(struct rpc_task *task, size_t size)
+static int bc_malloc(struct rpc_task *task)
 {
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	size_t size = rqst->rq_callsize;
 	struct page *page;
 	struct rpc_buffer *buf;
 
-	WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer));
-	if (size > PAGE_SIZE - sizeof(struct rpc_buffer))
-		return NULL;
+	if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) {
+		WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n",
+			  size);
+		return -EINVAL;
+	}
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page)
-		return NULL;
+		return -ENOMEM;
 
 	buf = page_address(page);
 	buf->len = PAGE_SIZE;
 
-	return buf->data;
+	rqst->rq_buffer = buf->data;
+	return 0;
 }
 
 /*
  * Free the space allocated in the bc_alloc routine
  */
-static void bc_free(void *buffer)
+static void bc_free(struct rpc_task *task)
 {
+	void *buffer = task->tk_rqstp->rq_buffer;
 	struct rpc_buffer *buf;
 
-	if (!buffer)
-		return;
-
 	buf = container_of(buffer, struct rpc_buffer, data);
 	free_page((unsigned long)buf);
 }