Merge "defconfig: msm: Disable config EXT2_FS and EXT3_FS for sdm670"
diff --git a/arch/arm/boot/dts/qcom/sdxpoorwills-ion.dtsi b/arch/arm/boot/dts/qcom/sdxpoorwills-ion.dtsi
index a09b149..6957063 100644
--- a/arch/arm/boot/dts/qcom/sdxpoorwills-ion.dtsi
+++ b/arch/arm/boot/dts/qcom/sdxpoorwills-ion.dtsi
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -26,5 +26,17 @@
memory-region = <&audio_mem>;
qcom,ion-heap-type = "DMA";
};
+
+ qcom,ion-heap@27 { /* QSEECOM HEAP */
+ reg = <27>;
+ memory-region = <&qseecom_mem>;
+ qcom,ion-heap-type = "DMA";
+ };
+
+ qcom,ion-heap@19 { /* QSEECOM TA HEAP */
+ reg = <19>;
+ memory-region = <&qseecom_ta_mem>;
+ qcom,ion-heap-type = "DMA";
+ };
};
};
diff --git a/arch/arm/boot/dts/qcom/sdxpoorwills-ttp.dtsi b/arch/arm/boot/dts/qcom/sdxpoorwills-ttp.dtsi
index 7f49b6d..fa8f3a4 100644
--- a/arch/arm/boot/dts/qcom/sdxpoorwills-ttp.dtsi
+++ b/arch/arm/boot/dts/qcom/sdxpoorwills-ttp.dtsi
@@ -12,3 +12,12 @@
#include "sdxpoorwills-mtp.dtsi"
+&vbus_detect {
+ status = "okay";
+};
+
+&usb {
+ status = "okay";
+ qcom,connector-type-uAB;
+ extcon = <0>, <0>, <0>, <&vbus_detect>;
+};
diff --git a/arch/arm/boot/dts/qcom/sdxpoorwills.dtsi b/arch/arm/boot/dts/qcom/sdxpoorwills.dtsi
index eefea0f..9584c15 100644
--- a/arch/arm/boot/dts/qcom/sdxpoorwills.dtsi
+++ b/arch/arm/boot/dts/qcom/sdxpoorwills.dtsi
@@ -18,6 +18,7 @@
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
#include <dt-bindings/clock/qcom,aop-qmp.h>
+#include <dt-bindings/msm/msm-bus-ids.h>
#define MHZ_TO_MBPS(mhz, w) ((mhz * 1000000 * w) / (1024 * 1024))
@@ -79,6 +80,20 @@
reusable;
size = <0x400000>;
};
+
+ qseecom_mem: qseecom_region@0 {
+ compatible = "shared-dma-pool";
+ reusable;
+ alignment = <0x400000>;
+ size = <0x1400000>;
+ };
+
+ qseecom_ta_mem: qseecom_ta_region@0 {
+ compatible = "shared-dma-pool";
+ reusable;
+ alignment = <0x400000>;
+ size = <0x1000000>;
+ };
};
cpus {
@@ -926,33 +941,43 @@
qcom,bandwidth-vote-for-ipa;
qcom,msm-bus,name = "ipa";
qcom,msm-bus,num-cases = <5>;
- qcom,msm-bus,num-paths = <4>;
+ qcom,msm-bus,num-paths = <5>;
qcom,msm-bus,vectors-KBps =
/* No vote */
- <90 512 0 0>,
- <90 585 0 0>,
- <1 676 0 0>,
- <143 777 0 0>,
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_SNOC_MEM_NOC_GC 0 0>,
+ <MSM_BUS_MASTER_SNOC_GC_MEM_NOC MSM_BUS_SLAVE_EBI_CH0 0 0>,
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_OCIMEM 0 0>,
+ <MSM_BUS_MASTER_AMPSS_M0 MSM_BUS_SLAVE_IPA_CFG 0 0>,
+ <MSM_BUS_MASTER_IPA_CORE MSM_BUS_SLAVE_IPA_CORE 0 0>,
+
/* SVS2 */
- <90 512 900000 1800000>,
- <90 585 300000 600000>,
- <1 676 90000 179000>, /*gcc_config_noc_clk_src */
- <143 777 0 120>, /* IB defined for IPA2X_clk in MHz*/
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_SNOC_MEM_NOC_GC 240000 480000>,
+ <MSM_BUS_MASTER_SNOC_GC_MEM_NOC MSM_BUS_SLAVE_EBI_CH0 900000 1800000>,
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_OCIMEM 300000 600000>,
+ <MSM_BUS_MASTER_AMPSS_M0 MSM_BUS_SLAVE_IPA_CFG 90000 179000>,
+ <MSM_BUS_MASTER_IPA_CORE MSM_BUS_SLAVE_IPA_CORE 0 120>,
+
/* SVS */
- <90 512 1530000 3060000>,
- <90 585 400000 800000>,
- <1 676 100000 199000>,
- <143 777 0 250>, /* IB defined for IPA2X_clk in MHz*/
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_SNOC_MEM_NOC_GC 360000 720000>,
+ <MSM_BUS_MASTER_SNOC_GC_MEM_NOC MSM_BUS_SLAVE_EBI_CH0 1530000 3060000>,
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_OCIMEM 400000 800000>,
+ <MSM_BUS_MASTER_AMPSS_M0 MSM_BUS_SLAVE_IPA_CFG 100000 199000>,
+ <MSM_BUS_MASTER_IPA_CORE MSM_BUS_SLAVE_IPA_CORE 0 250>,
+
/* NOMINAL */
- <90 512 2592000 5184000>,
- <90 585 800000 1600000>,
- <1 676 200000 399000>,
- <143 777 0 440>, /* IB defined for IPA2X_clk in MHz*/
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_SNOC_MEM_NOC_GC 780000 1560000>,
+ <MSM_BUS_MASTER_SNOC_GC_MEM_NOC MSM_BUS_SLAVE_EBI_CH0 2592000 5184000>,
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_OCIMEM 800000 1600000>,
+ <MSM_BUS_MASTER_AMPSS_M0 MSM_BUS_SLAVE_IPA_CFG 200000 399000>,
+ <MSM_BUS_MASTER_IPA_CORE MSM_BUS_SLAVE_IPA_CORE 0 440>,
+
/* TURBO */
- <90 512 2592000 5184000>,
- <90 585 960000 1920000>,
- <1 676 266000 531000>,
- <143 777 0 500>; /* IB defined for IPA clk in MHz*/
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_SNOC_MEM_NOC_GC 960000 1920000>,
+ <MSM_BUS_MASTER_SNOC_GC_MEM_NOC MSM_BUS_SLAVE_EBI_CH0 2592000 5184000>,
+ <MSM_BUS_MASTER_IPA MSM_BUS_SLAVE_OCIMEM 960000 1920000>,
+ <MSM_BUS_MASTER_AMPSS_M0 MSM_BUS_SLAVE_IPA_CFG 266000 531000>,
+ <MSM_BUS_MASTER_IPA_CORE MSM_BUS_SLAVE_IPA_CORE 0 500>;
+
qcom,bus-vector-names = "MIN", "SVS2", "SVS", "NOMINAL",
"TURBO";
qcom,throughput-threshold = <310 600 1000>;
@@ -1141,6 +1166,33 @@
clock-names = "iface_clk";
};
+	qcom_seecom: qseecom@90000000 {
+ compatible = "qcom,qseecom";
+ reg = <0x90000000 0x600000>;
+ reg-names = "secapp-region";
+ qcom,hlos-num-ce-hw-instances = <1>;
+ qcom,hlos-ce-hw-instance = <0>;
+ qcom,qsee-ce-hw-instance = <0>;
+ qcom,no-clock-support;
+ qcom,msm-bus,name = "qseecom-noc";
+ qcom,msm-bus,num-cases = <4>;
+ qcom,msm-bus,num-paths = <1>;
+ clocks = <&clock_gcc GCC_CE1_CLK>,
+ <&clock_gcc GCC_CE1_CLK>,
+ <&clock_gcc GCC_CE1_AHB_CLK>,
+ <&clock_gcc GCC_CE1_AXI_CLK>;
+ qcom,msm-bus,vectors-KBps =
+ <125 512 0 0>,
+ <125 512 20000 40000>,
+ <125 512 30000 80000>,
+ <125 512 40000 100000>;
+ clock-names = "core_clk_src", "core_clk",
+ "iface_clk", "bus_clk";
+ qcom,ce-opp-freq = <171430000>;
+ qcom,qsee-reentrancy-support = <2>;
+ status = "disabled";
+ };
+
qcom_cedev: qcedev@1de0000 {
compatible = "qcom,qcedev";
reg = <0x1de0000 0x20000>,
diff --git a/arch/arm/configs/msm8909-perf_defconfig b/arch/arm/configs/msm8909-perf_defconfig
index e21e912..444b80c 100644
--- a/arch/arm/configs/msm8909-perf_defconfig
+++ b/arch/arm/configs/msm8909-perf_defconfig
@@ -9,8 +9,6 @@
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_RCU_EXPERT=y
CONFIG_RCU_FAST_NO_HZ=y
-CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ALL=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_LOG_CPU_MAX_BUF_SHIFT=17
@@ -26,8 +24,6 @@
CONFIG_SCHED_TUNE=y
CONFIG_DEFAULT_USE_ENERGY_AWARE=y
CONFIG_BLK_DEV_INITRD=y
-# CONFIG_RD_BZIP2 is not set
-# CONFIG_RD_LZMA is not set
# CONFIG_RD_XZ is not set
# CONFIG_RD_LZO is not set
# CONFIG_RD_LZ4 is not set
@@ -38,7 +34,7 @@
CONFIG_EMBEDDED=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
CONFIG_CC_STACKPROTECTOR_REGULAR=y
CONFIG_ARCH_MMAP_RND_BITS=16
CONFIG_MODULES=y
@@ -55,7 +51,6 @@
CONFIG_SCHED_MC=y
CONFIG_PREEMPT=y
CONFIG_AEABI=y
-CONFIG_HIGHMEM=y
CONFIG_ARM_MODULE_PLTS=y
CONFIG_CMA=y
CONFIG_ZSMALLOC=y
@@ -67,7 +62,6 @@
CONFIG_CPU_IDLE=y
CONFIG_VFP=y
CONFIG_NEON=y
-CONFIG_KERNEL_MODE_NEON=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_PM_AUTOSLEEP=y
CONFIG_PM_WAKELOCKS=y
@@ -88,7 +82,6 @@
CONFIG_IP_PNP_DHCP=y
CONFIG_INET_AH=y
CONFIG_INET_ESP=y
-CONFIG_INET_IPCOMP=y
CONFIG_INET_DIAG_DESTROY=y
CONFIG_IPV6_ROUTER_PREF=y
CONFIG_IPV6_ROUTE_INFO=y
@@ -118,14 +111,12 @@
CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_CT=y
CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
CONFIG_NETFILTER_XT_TARGET_HARDIDLETIMER=y
-CONFIG_NETFILTER_XT_TARGET_LOG=y
CONFIG_NETFILTER_XT_TARGET_MARK=y
CONFIG_NETFILTER_XT_TARGET_NFLOG=y
CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_NOTRACK=y
-CONFIG_NETFILTER_XT_TARGET_TEE=y
CONFIG_NETFILTER_XT_TARGET_TPROXY=y
CONFIG_NETFILTER_XT_TARGET_TRACE=y
CONFIG_NETFILTER_XT_TARGET_SECMARK=y
@@ -157,6 +148,8 @@
CONFIG_NETFILTER_XT_MATCH_TIME=y
CONFIG_NETFILTER_XT_MATCH_U32=y
CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_DUP_IPV4=y
+CONFIG_NF_LOG_IPV4=y
CONFIG_IP_NF_IPTABLES=y
CONFIG_IP_NF_MATCH_AH=y
CONFIG_IP_NF_MATCH_ECN=y
@@ -175,6 +168,8 @@
CONFIG_IP_NF_ARPFILTER=y
CONFIG_IP_NF_ARP_MANGLE=y
CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_DUP_IPV6=y
+CONFIG_NF_LOG_IPV6=y
CONFIG_IP6_NF_IPTABLES=y
CONFIG_IP6_NF_MATCH_RPFILTER=y
CONFIG_IP6_NF_FILTER=y
@@ -191,11 +186,8 @@
CONFIG_NET_SCHED=y
CONFIG_NET_SCH_HTB=y
CONFIG_NET_SCH_PRIO=y
-CONFIG_NET_SCH_MULTIQ=y
-CONFIG_NET_SCH_INGRESS=y
CONFIG_NET_CLS_FW=y
CONFIG_NET_CLS_U32=y
-CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_FLOW=y
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_CMP=y
@@ -216,7 +208,6 @@
CONFIG_CFG80211=y
CONFIG_CFG80211_INTERNAL_REGDB=y
CONFIG_RFKILL=y
-CONFIG_NFC_NQ=y
CONFIG_IPC_ROUTER=y
CONFIG_IPC_ROUTER_SECURITY=y
CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y
@@ -227,27 +218,17 @@
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_HDCP_QSEECOM=y
CONFIG_QSEECOM=y
-CONFIG_UID_SYS_STATS=y
CONFIG_MEMORY_STATE_TIME=y
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
CONFIG_CHR_DEV_SG=y
CONFIG_CHR_DEV_SCH=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SCAN_ASYNC=y
-CONFIG_SCSI_UFSHCD=y
-CONFIG_SCSI_UFSHCD_PLATFORM=y
-CONFIG_SCSI_UFS_QCOM=y
-CONFIG_SCSI_UFS_QCOM_ICE=y
-CONFIG_SCSI_UFSHCD_CMD_LOGGING=y
CONFIG_MD=y
CONFIG_BLK_DEV_DM=y
-CONFIG_DM_DEBUG=y
CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
CONFIG_TUN=y
@@ -268,23 +249,19 @@
CONFIG_CNSS_SDIO=y
CONFIG_CLD_HL_SDIO_CORE=y
CONFIG_INPUT_EVDEV=y
-CONFIG_INPUT_EVBUG=y
+CONFIG_INPUT_EVBUG=m
CONFIG_KEYBOARD_GPIO=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TOUCHSCREEN=y
CONFIG_INPUT_MISC=y
CONFIG_INPUT_QPNP_POWER_ON=y
CONFIG_INPUT_UINPUT=y
-CONFIG_SERIAL_MSM=y
-CONFIG_SERIAL_MSM_CONSOLE=y
CONFIG_SERIAL_MSM_SMD=y
CONFIG_DIAG_CHAR=y
CONFIG_DIAG_USES_SMD=y
CONFIG_HW_RANDOM=y
-CONFIG_HW_RANDOM_MSM_LEGACY=y
CONFIG_MSM_SMD_PKT=y
CONFIG_MSM_ADSPRPC=y
-CONFIG_MSM_RDBG=y
CONFIG_I2C_CHARDEV=y
CONFIG_I2C_MSM_V2=y
CONFIG_SPI=y
@@ -346,10 +323,8 @@
CONFIG_MSM_ISPIF=y
CONFIG_QCOM_KGSL=y
CONFIG_FB=y
-CONFIG_FB_VIRTUAL=y
CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_BACKLIGHT_CLASS_DEVICE=y
-CONFIG_LOGO=y
CONFIG_SOUND=y
CONFIG_SND=y
CONFIG_SND_DYNAMIC_MINORS=y
@@ -375,10 +350,6 @@
CONFIG_USB_EHCI_HCD=y
CONFIG_USB_EHCI_MSM=y
CONFIG_USB_STORAGE=y
-CONFIG_USB_DWC3=y
-CONFIG_USB_DWC3_GADGET=y
-CONFIG_NOP_USB_XCEIV=y
-CONFIG_DUAL_ROLE_USB_INTF=y
CONFIG_USB_GADGET=y
CONFIG_USB_GADGET_DEBUG_FILES=y
CONFIG_USB_GADGET_DEBUG_FS=y
@@ -386,8 +357,6 @@
CONFIG_USB_CI13XXX_MSM=y
CONFIG_USB_CONFIGFS=y
CONFIG_USB_CONFIGFS_SERIAL=y
-CONFIG_USB_CONFIGFS_NCM=y
-CONFIG_USB_CONFIGFS_RMNET_BAM=y
CONFIG_USB_CONFIGFS_MASS_STORAGE=y
CONFIG_USB_CONFIGFS_F_FS=y
CONFIG_USB_CONFIGFS_UEVENT=y
@@ -400,7 +369,6 @@
CONFIG_MMC_CLKGATE=y
CONFIG_MMC_BLOCK_MINORS=32
CONFIG_MMC_BLOCK_DEFERRED_RESUME=y
-CONFIG_MMC_TEST=y
CONFIG_MMC_SDHCI=y
CONFIG_MMC_SDHCI_PLTFM=y
CONFIG_MMC_SDHCI_MSM=y
@@ -420,8 +388,6 @@
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
CONFIG_ION=y
CONFIG_ION_MSM=y
-CONFIG_IPA=y
-CONFIG_RMNET_IPA=y
CONFIG_SPS=y
CONFIG_SPS_SUPPORT_NDP_BAM=y
CONFIG_QPNP_REVID=y
@@ -443,23 +409,16 @@
CONFIG_MSM_SMEM=y
CONFIG_MSM_SMD=y
CONFIG_MSM_SMD_DEBUG=y
-CONFIG_MSM_GLINK=y
CONFIG_MSM_TZ_SMMU=y
-CONFIG_MSM_GLINK_LOOPBACK_SERVER=y
-CONFIG_MSM_GLINK_SMEM_NATIVE_XPRT=y
-CONFIG_MSM_GLINK_SPI_XPRT=y
CONFIG_TRACER_PKT=y
CONFIG_MSM_SMP2P=y
CONFIG_MSM_IPC_ROUTER_SMD_XPRT=y
-CONFIG_MSM_IPC_ROUTER_GLINK_XPRT=y
CONFIG_MSM_QMI_INTERFACE=y
-CONFIG_MSM_GLINK_PKT=y
CONFIG_MSM_SUBSYSTEM_RESTART=y
CONFIG_MSM_PIL=y
CONFIG_MSM_PIL_SSR_GENERIC=y
CONFIG_MSM_PIL_MSS_QDSP6V5=y
CONFIG_MSM_EVENT_TIMER=y
-CONFIG_QTI_RPM_STATS_LOG=y
CONFIG_QCOM_FORCE_WDOG_BITE_ON_PANIC=y
CONFIG_MSM_BAM_DMUX=y
CONFIG_CNSS_CRYPTO=y
@@ -468,58 +427,30 @@
CONFIG_QTI_MPM=y
CONFIG_ANDROID=y
CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_STM=y
CONFIG_SENSORS_SSC=y
CONFIG_MSM_TZ_LOG=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT3_FS=y
+CONFIG_EXT4_FS=y
CONFIG_EXT4_FS_SECURITY=y
CONFIG_QUOTA=y
CONFIG_QUOTA_NETLINK_INTERFACE=y
CONFIG_FUSE_FS=y
-CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=y
CONFIG_TMPFS=y
-CONFIG_ECRYPT_FS=y
-CONFIG_ECRYPT_FS_MESSAGING=y
CONFIG_SDCARD_FS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_FRAME_WARN=2048
-CONFIG_PAGE_OWNER=y
-CONFIG_PAGE_OWNER_ENABLE_DEFAULT=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_LOCKUP_DETECTOR=y
-# CONFIG_DETECT_HUNG_TASK is not set
+CONFIG_DEBUG_FS=y
CONFIG_WQ_WATCHDOG=y
CONFIG_PANIC_TIMEOUT=5
CONFIG_PANIC_ON_SCHED_BUG=y
CONFIG_PANIC_ON_RT_THROTTLING=y
-CONFIG_SCHEDSTATS=y
-CONFIG_SCHED_STACK_END_CHECK=y
# CONFIG_DEBUG_PREEMPT is not set
-# CONFIG_FTRACE is not set
-CONFIG_LKDTM=y
-CONFIG_PANIC_ON_DATA_CORRUPTION=y
-# CONFIG_ARM_UNWIND is not set
-CONFIG_PID_IN_CONTEXTIDR=y
+CONFIG_STACKTRACE=y
CONFIG_DEBUG_SET_MODULE_RONX=y
-CONFIG_CORESIGHT=y
-CONFIG_CORESIGHT_LINK_AND_SINK_TMC=y
-CONFIG_CORESIGHT_SINK_TPIU=y
-CONFIG_CORESIGHT_SOURCE_ETM3X=y
-CONFIG_CORESIGHT_REMOTE_ETM=y
-CONFIG_CORESIGHT_REMOTE_ETM_DEFAULT_ENABLE=0
-CONFIG_CORESIGHT_QCOM_REPLICATOR=y
-CONFIG_CORESIGHT_DBGUI=y
-CONFIG_CORESIGHT_STM=y
-CONFIG_CORESIGHT_TPDA=y
-CONFIG_CORESIGHT_TPDM=y
-CONFIG_CORESIGHT_CTI=y
-CONFIG_CORESIGHT_EVENT=y
-CONFIG_CORESIGHT_HWEVENT=y
CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
CONFIG_SECURITY=y
CONFIG_LSM_MMAP_MIN_ADDR=4096
@@ -538,10 +469,4 @@
CONFIG_CRYPTO_DEV_QCEDEV=y
CONFIG_CRYPTO_DEV_OTA_CRYPTO=y
CONFIG_CRYPTO_DEV_QCOM_ICE=y
-CONFIG_ARM_CRYPTO=y
-CONFIG_CRYPTO_SHA1_ARM_NEON=y
-CONFIG_CRYPTO_SHA2_ARM_CE=y
-CONFIG_CRYPTO_AES_ARM_BS=y
-CONFIG_CRYPTO_AES_ARM_CE=y
-CONFIG_XZ_DEC=y
CONFIG_QMI_ENCDEC=y
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 80f39e9..f72ce30 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -129,6 +129,7 @@
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
help
ARM 64-bit (AArch64) Linux support.
diff --git a/arch/arm64/boot/dts/qcom/apq8053-lite-dragon.dtsi b/arch/arm64/boot/dts/qcom/apq8053-lite-dragon.dtsi
index bd48f09..0b77a20 100644
--- a/arch/arm64/boot/dts/qcom/apq8053-lite-dragon.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8053-lite-dragon.dtsi
@@ -103,6 +103,21 @@
pinctrl-1 = <&sec_tlmm_lines_sus>;
};
+ gpio_keys {
+ compatible = "gpio-keys";
+ input-name = "gpio-keys";
+ pinctrl-names = "default";
+ pinctrl-0 = <&gpio_key_active>;
+ vol_up {
+ label = "volume_up";
+ gpios = <&tlmm 85 0x1>;
+ linux,input-type = <1>;
+ linux,code = <115>;
+ debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
+ };
+ };
};
&firmware {
@@ -358,14 +373,6 @@
};
&spmi_bus {
- qcom,pm8953@0 {
- qcom,power-on@800 {
- qcom,resin-gpiobase = <1019>;
- qcom,pon_2 {
- /delete-property/ linux,code;
- };
- };
- };
qcom,pmi8950@2 {
qcom,leds@a100 {
compatible = "qcom,leds-qpnp";
diff --git a/arch/arm64/boot/dts/qcom/msm8917-cdp.dtsi b/arch/arm64/boot/dts/qcom/msm8917-cdp.dtsi
index fde4847..513e995 100644
--- a/arch/arm64/boot/dts/qcom/msm8917-cdp.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8917-cdp.dtsi
@@ -17,9 +17,8 @@
gpio_keys {
compatible = "gpio-keys";
input-name = "gpio-keys";
- pinctrl-names = "tlmm_gpio_key_active","tlmm_gpio_key_suspend";
+ pinctrl-names = "default";
pinctrl-0 = <&gpio_key_active>;
- pinctrl-1 = <&gpio_key_suspend>;
camera_focus {
label = "camera_focus";
@@ -27,6 +26,8 @@
linux,input-type = <1>;
linux,code = <0x210>;
debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
};
camera_snapshot {
@@ -35,6 +36,8 @@
linux,input-type = <1>;
linux,code = <0x2fe>;
debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
};
vol_up {
@@ -43,6 +46,8 @@
linux,input-type = <1>;
linux,code = <115>;
debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
};
home {
@@ -51,6 +56,8 @@
linux,input-type = <1>;
linux,code = <102>;
debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
};
};
diff --git a/arch/arm64/boot/dts/qcom/msm8917-mtp.dtsi b/arch/arm64/boot/dts/qcom/msm8917-mtp.dtsi
index 164b781..c79b9f8 100644
--- a/arch/arm64/boot/dts/qcom/msm8917-mtp.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8917-mtp.dtsi
@@ -80,6 +80,43 @@
qcom,dig-vtg-min = <1800000>;
qcom,dig-vtg-max = <1800000>;
};
+
+ gpio_keys {
+ compatible = "gpio-keys";
+ input-name = "gpio-keys";
+ pinctrl-names = "default";
+ pinctrl-0 = <&gpio_key_active>;
+
+ camera_focus {
+ label = "camera_focus";
+ gpios = <&tlmm 128 0x1>;
+ linux,input-type = <1>;
+ linux,code = <0x210>;
+ debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
+ };
+
+ camera_snapshot {
+ label = "camera_snapshot";
+ gpios = <&tlmm 127 0x1>;
+ linux,input-type = <1>;
+ linux,code = <0x2fe>;
+ debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
+ };
+
+ vol_up {
+ label = "volume_up";
+ gpios = <&tlmm 91 0x1>;
+ linux,input-type = <1>;
+ linux,code = <115>;
+ debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
+ };
+ };
};
#include "msm8937-mdss-panels.dtsi"
diff --git a/arch/arm64/boot/dts/qcom/msm8917-qrd.dtsi b/arch/arm64/boot/dts/qcom/msm8917-qrd.dtsi
index 431a5e5..1e5393b 100644
--- a/arch/arm64/boot/dts/qcom/msm8917-qrd.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8917-qrd.dtsi
@@ -88,9 +88,8 @@
gpio_keys {
compatible = "gpio-keys";
input-name = "gpio-keys";
- pinctrl-names = "tlmm_gpio_key_active","tlmm_gpio_key_suspend";
+ pinctrl-names = "default";
pinctrl-0 = <&gpio_key_active>;
- pinctrl-1 = <&gpio_key_suspend>;
vol_up {
label = "volume_up";
@@ -98,6 +97,8 @@
linux,input-type = <1>;
linux,code = <115>;
debounce-interval = <15>;
+ linux,can-disable;
+ gpio-key,wakeup;
};
};
};
diff --git a/arch/arm64/boot/dts/qcom/msm8953-mdss-panels.dtsi b/arch/arm64/boot/dts/qcom/msm8953-mdss-panels.dtsi
index a80b4fe..7bc181c 100644
--- a/arch/arm64/boot/dts/qcom/msm8953-mdss-panels.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8953-mdss-panels.dtsi
@@ -124,6 +124,12 @@
24 1f 08 09 05 03 04 a0
24 1f 08 09 05 03 04 a0
24 1c 08 09 05 03 04 a0];
+ qcom,mdss-dsi-h-front-porch = <52>;
+ qcom,mdss-dsi-h-back-porch = <48>;
+ qcom,mdss-dsi-h-pulse-width = <8>;
+ qcom,mdss-dsi-v-back-porch = <8>;
+ qcom,mdss-dsi-v-front-porch = <8>;
+ qcom,mdss-dsi-v-pulse-width = <4>;
qcom,esd-check-enabled;
qcom,mdss-dsi-panel-status-check-mode = "reg_read";
qcom,mdss-dsi-panel-status-command = [06 01 00 01 00 00 01 0a];
diff --git a/arch/arm64/boot/dts/qcom/msm8953.dtsi b/arch/arm64/boot/dts/qcom/msm8953.dtsi
index c3178e0..c256518 100644
--- a/arch/arm64/boot/dts/qcom/msm8953.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8953.dtsi
@@ -1464,8 +1464,10 @@
sdhc_1: sdhci@7824900 {
compatible = "qcom,sdhci-msm";
- reg = <0x7824900 0x500>, <0x7824000 0x800>, <0x7824e00 0x200>;
- reg-names = "hc_mem", "core_mem", "cmdq_mem";
+ reg = <0x7824900 0x500>, <0x7824000 0x800>, <0x7824e00 0x200>,
+ <0x0119d000 0x4>;
+ reg-names = "hc_mem", "core_mem", "cmdq_mem",
+ "tlmm_mem";
interrupts = <0 123 0>, <0 138 0>;
interrupt-names = "hc_irq", "pwr_irq";
diff --git a/arch/arm64/boot/dts/qcom/sda670-hdk.dtsi b/arch/arm64/boot/dts/qcom/sda670-hdk.dtsi
index eec80f6..ef8667b 100644
--- a/arch/arm64/boot/dts/qcom/sda670-hdk.dtsi
+++ b/arch/arm64/boot/dts/qcom/sda670-hdk.dtsi
@@ -61,3 +61,46 @@
status = "ok";
};
+
+&qusb_phy0 {
+ qcom,qusb-phy-host-init-seq =
+ /* <value reg_offset> */
+ <0x23 0x210 /* PWR_CTRL1 */
+ 0x03 0x04 /* PLL_ANALOG_CONTROLS_TWO */
+ 0x7c 0x18c /* PLL_CLOCK_INVERTERS */
+ 0x80 0x2c /* PLL_CMODE */
+ 0x0a 0x184 /* PLL_LOCK_DELAY */
+ 0x19 0xb4 /* PLL_DIGITAL_TIMERS_TWO */
+ 0x40 0x194 /* PLL_BIAS_CONTROL_1 */
+ 0x20 0x198 /* PLL_BIAS_CONTROL_2 */
+ 0x21 0x214 /* PWR_CTRL2 */
+ 0x00 0x220 /* IMP_CTRL1 */
+ 0x58 0x224 /* IMP_CTRL2 */
+ 0x77 0x240 /* TUNE1 */
+ 0x29 0x244 /* TUNE2 */
+ 0xca 0x248 /* TUNE3 */
+ 0x04 0x24c /* TUNE4 */
+ 0x03 0x250 /* TUNE5 */
+ 0x00 0x23c /* CHG_CTRL2 */
+ 0x22 0x210>; /* PWR_CTRL1 */
+ qcom,qusb-phy-init-seq =
+ /* <value reg_offset> */
+ <0x23 0x210 /* PWR_CTRL1 */
+ 0x03 0x04 /* PLL_ANALOG_CONTROLS_TWO */
+ 0x7c 0x18c /* PLL_CLOCK_INVERTERS */
+ 0x80 0x2c /* PLL_CMODE */
+ 0x0a 0x184 /* PLL_LOCK_DELAY */
+ 0x19 0xb4 /* PLL_DIGITAL_TIMERS_TWO */
+ 0x40 0x194 /* PLL_BIAS_CONTROL_1 */
+ 0x20 0x198 /* PLL_BIAS_CONTROL_2 */
+ 0x21 0x214 /* PWR_CTRL2 */
+ 0x25 0x220 /* IMP_CTRL1 */
+ 0x58 0x224 /* IMP_CTRL2 */
+ 0x65 0x240 /* TUNE1 */
+ 0x29 0x244 /* TUNE2 */
+ 0xca 0x248 /* TUNE3 */
+ 0x04 0x24c /* TUNE4 */
+ 0x03 0x250 /* TUNE5 */
+ 0x00 0x23c /* CHG_CTRL2 */
+ 0x22 0x210>; /* PWR_CTRL1 */
+};
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 6befc9c..c1d02d1 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -253,14 +253,12 @@
#define VM_FAULT_BADMAP 0x010000
#define VM_FAULT_BADACCESS 0x020000
-static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
+static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags,
struct task_struct *tsk)
{
- struct vm_area_struct *vma;
int fault;
- vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
if (unlikely(!vma))
goto out;
@@ -318,6 +316,7 @@
int fault, sig, code;
unsigned long vm_flags = VM_READ | VM_WRITE;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ struct vm_area_struct *vma = NULL;
if (notify_page_fault(regs, esr))
return 0;
@@ -356,6 +355,14 @@
}
/*
+ * let's try a speculative page fault without grabbing the
+ * mmap_sem.
+ */
+ fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
+ if (fault != VM_FAULT_RETRY)
+ goto done;
+
+ /*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
* we can bug out early if this is from code which shouldn't.
@@ -377,19 +384,44 @@
#endif
}
- fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
+ if (!vma || !can_reuse_spf_vma(vma, addr))
+ vma = find_vma(mm, addr);
- /*
- * If we need to retry but a fatal signal is pending, handle the
- * signal first. We do not need to release the mmap_sem because it
- * would already be released in __lock_page_or_retry in mm/filemap.c.
- */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
- if (!user_mode(regs))
- goto no_context;
- return 0;
+ fault = __do_page_fault(vma, addr, mm_flags, vm_flags, tsk);
+ if (fault & VM_FAULT_RETRY) {
+ /*
+ * If we need to retry but a fatal signal is pending, handle the
+ * signal first. We do not need to release the mmap_sem because
+ * it would already be released in __lock_page_or_retry in
+ * mm/filemap.c.
+ */
+
+ if (fatal_signal_pending(current)) {
+ if (!user_mode(regs))
+ goto no_context;
+ return 0;
+ }
+
+ /*
+ * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
+ * starvation.
+ */
+ if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
+ mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ mm_flags |= FAULT_FLAG_TRIED;
+
+ /*
+ * Do not try to reuse this vma and fetch it
+ * again since we will release the mmap_sem.
+ */
+ vma = NULL;
+ goto retry;
+ }
}
+ up_read(&mm->mmap_sem);
+done:
+
/*
* Major/minor page fault accounting is only done on the initial
* attempt. If we go through a retry, it is extremely likely that the
@@ -407,19 +439,8 @@
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
addr);
}
- if (fault & VM_FAULT_RETRY) {
- /*
- * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
- * starvation.
- */
- mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
- mm_flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
}
- up_read(&mm->mmap_sem);
-
/*
* Handle the "normal" case first - VM_FAULT_MAJOR
*/
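
For reference, a condensed sketch of the flow this hunk establishes: the speculative path is tried first without mmap_sem, and the classic path is the fallback. This is illustrative only — the no_context path, signal handling and perf accounting are omitted, and the helper names follow the declarations added in include/linux/mm.h below.

static int do_page_fault_sketch(struct mm_struct *mm, unsigned long addr,
				unsigned int mm_flags, unsigned long vm_flags,
				struct task_struct *tsk)
{
	struct vm_area_struct *vma = NULL;
	int fault;

	/* 1. Lockless attempt: may complete the fault without mmap_sem. */
	fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
	if (fault != VM_FAULT_RETRY)
		return fault;

retry:
	/* 2. Classic path under mmap_sem. */
	down_read(&mm->mmap_sem);

	/* Reuse the VMA found speculatively only if it is still valid. */
	if (!vma || !can_reuse_spf_vma(vma, addr))
		vma = find_vma(mm, addr);

	fault = __do_page_fault(vma, addr, mm_flags, vm_flags, tsk);
	if ((fault & VM_FAULT_RETRY) && (mm_flags & FAULT_FLAG_ALLOW_RETRY)) {
		/*
		 * mmap_sem was already released on the VM_FAULT_RETRY path
		 * (see __lock_page_or_retry()), so only retry once.
		 */
		mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
		mm_flags |= FAULT_FLAG_TRIED;
		vma = NULL;	/* do not reuse across the dropped lock */
		goto retry;
	}

	up_read(&mm->mmap_sem);
	return fault;
}
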
diff --git a/drivers/cpuidle/lpm-levels-legacy.c b/drivers/cpuidle/lpm-levels-legacy.c
index 006a5ef..26cb52a 100644
--- a/drivers/cpuidle/lpm-levels-legacy.c
+++ b/drivers/cpuidle/lpm-levels-legacy.c
@@ -643,9 +643,11 @@
cpumask_copy(&cpumask, cpumask_of(cpu));
nextcpu = level->disable_dynamic_routing ? NULL : &cpumask;
- if (sys_pm_ops && sys_pm_ops->enter)
- if ((sys_pm_ops->enter(nextcpu)))
- return -EBUSY;
+ if (sys_pm_ops && sys_pm_ops->enter) {
+ ret = sys_pm_ops->enter(nextcpu);
+ if (ret)
+ goto failed_set_mode;
+ }
if (cluster->no_saw_devices && !use_psci)
msm_spm_set_rpm_hs(true);
diff --git a/drivers/mailbox/qcom-rpmh-mailbox.c b/drivers/mailbox/qcom-rpmh-mailbox.c
index c81fe52..9f699db 100644
--- a/drivers/mailbox/qcom-rpmh-mailbox.c
+++ b/drivers/mailbox/qcom-rpmh-mailbox.c
@@ -695,9 +695,10 @@
}
/* sanity check to ensure the seq is same */
for (j = 1; j < len; j++) {
- WARN((tcs->cmd_addr[i + j] != cmd[j].addr),
- "Message does not match previous sequence.\n");
+ if (tcs->cmd_addr[i + j] != cmd[j].addr) {
+ pr_debug("Message does not match previous sequence.\n");
return -EINVAL;
+ }
}
found = true;
break;
@@ -725,12 +726,12 @@
do {
slot = bitmap_find_next_zero_area(tcs->slots, MAX_TCS_SLOTS,
n, msg->num_payload, 0);
- if (slot == MAX_TCS_SLOTS)
+ if (slot >= MAX_TCS_SLOTS)
break;
n += tcs->ncpt;
} while (slot + msg->num_payload - 1 >= n);
- return (slot != MAX_TCS_SLOTS) ? slot : -ENOMEM;
+ return (slot < MAX_TCS_SLOTS) ? slot : -ENOMEM;
}
static int tcs_mbox_write(struct mbox_chan *chan, struct tcs_mbox_msg *msg,
diff --git a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util_32.c b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util_32.c
index 55a10ca..21bac16 100644
--- a/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util_32.c
+++ b/drivers/media/platform/msm/camera_v2/isp/msm_isp_axi_util_32.c
@@ -1602,6 +1602,8 @@
vfe_dev->hw_info->vfe_ops.core_ops.reg_update(vfe_dev, 0xF);
msm_isp_update_camif_output_count(vfe_dev, stream_cfg_cmd);
msm_isp_update_rdi_output_count(vfe_dev, stream_cfg_cmd);
+ /*Configure UB*/
+ vfe_dev->hw_info->vfe_ops.axi_ops.cfg_ub(vfe_dev);
if (camif_update == ENABLE_CAMIF) {
atomic_set(&vfe_dev->error_info.overflow_state,
NO_OVERFLOW);
@@ -1735,7 +1737,6 @@
{
int rc = 0;
struct msm_vfe_axi_stream_cfg_cmd *stream_cfg_cmd = arg;
- struct msm_vfe_axi_shared_data *axi_data = &vfe_dev->axi_data;
enum msm_isp_camif_update_state camif_update;
rc = msm_isp_axi_check_stream_state(vfe_dev, stream_cfg_cmd);
@@ -1744,10 +1745,6 @@
return rc;
}
- if (axi_data->num_active_stream == 0) {
- /*Configure UB*/
- vfe_dev->hw_info->vfe_ops.axi_ops.cfg_ub(vfe_dev);
- }
camif_update = msm_isp_get_camif_update_state(vfe_dev, stream_cfg_cmd);
if (stream_cfg_cmd->cmd == START_STREAM) {
diff --git a/drivers/media/platform/msm/camera_v2/sensor/csiphy/msm_csiphy.c b/drivers/media/platform/msm/camera_v2/sensor/csiphy/msm_csiphy.c
index 0cfddb3..35d5984 100644
--- a/drivers/media/platform/msm/camera_v2/sensor/csiphy/msm_csiphy.c
+++ b/drivers/media/platform/msm/camera_v2/sensor/csiphy/msm_csiphy.c
@@ -1706,10 +1706,8 @@
CDBG("%s:%d called\n", __func__, __LINE__);
if (csiphy_dev->csiphy_state == CSIPHY_POWER_UP) {
- pr_err("%s: csiphy invalid state %d\n", __func__,
+ pr_err("%s: csiphy current state %d\n", __func__,
csiphy_dev->csiphy_state);
- rc = -EINVAL;
- return rc;
}
CDBG("%s:%d called\n", __func__, __LINE__);
diff --git a/drivers/power/supply/qcom/qpnp-smb5.c b/drivers/power/supply/qcom/qpnp-smb5.c
index 9840396..ab4990a 100644
--- a/drivers/power/supply/qcom/qpnp-smb5.c
+++ b/drivers/power/supply/qcom/qpnp-smb5.c
@@ -985,6 +985,10 @@
pr_err("Failed to force 5V\n");
else
chg->pulse_cnt = 0;
+ } else {
+ /* USB absent & flash not-active - vote 100mA */
+ vote(chg->usb_icl_votable, SW_ICL_MAX_VOTER,
+ true, SDP_100_MA);
}
pr_debug("flash active VBUS 5V restriction %s\n",
diff --git a/drivers/power/supply/qcom/qpnp-smbcharger.c b/drivers/power/supply/qcom/qpnp-smbcharger.c
index 40cfd9c..71b8b8b 100644
--- a/drivers/power/supply/qcom/qpnp-smbcharger.c
+++ b/drivers/power/supply/qcom/qpnp-smbcharger.c
@@ -632,6 +632,18 @@
mutex_unlock(&chip->pm_lock);
};
+static bool is_bms_psy_present(struct smbchg_chip *chip)
+{
+ if (chip->bms_psy)
+ return true;
+
+ if (chip->bms_psy_name)
+ chip->bms_psy = power_supply_get_by_name(
+ (char *)chip->bms_psy_name);
+
+ return chip->bms_psy ? true : false;
+}
+
enum pwr_path_type {
UNKNOWN = 0,
PWR_PATH_BATTERY = 1,
@@ -3748,17 +3760,11 @@
static void smbchg_external_power_changed(struct power_supply *psy)
{
struct smbchg_chip *chip = power_supply_get_drvdata(psy);
- union power_supply_propval prop = {0,};
- int rc, current_limit = 0, soc;
- enum power_supply_type usb_supply_type;
- char *usb_type_name = "null";
-
- if (chip->bms_psy_name)
- chip->bms_psy =
- power_supply_get_by_name((char *)chip->bms_psy_name);
+ int rc, soc;
smbchg_aicl_deglitch_wa_check(chip);
- if (chip->bms_psy) {
+
+ if (is_bms_psy_present(chip)) {
check_battery_type(chip);
soc = get_prop_batt_capacity(chip);
if (chip->previous_soc != soc) {
@@ -3773,37 +3779,8 @@
rc);
}
- rc = power_supply_get_property(chip->usb_psy,
- POWER_SUPPLY_PROP_CHARGING_ENABLED, &prop);
- if (rc == 0)
- vote(chip->usb_suspend_votable, POWER_SUPPLY_EN_VOTER,
- !prop.intval, 0);
-
- current_limit = chip->usb_current_max / 1000;
-
- /* Override if type-c charger used */
- if (chip->typec_current_ma > 500 &&
- current_limit < chip->typec_current_ma)
- current_limit = chip->typec_current_ma;
-
- read_usb_type(chip, &usb_type_name, &usb_supply_type);
-
- if (usb_supply_type != POWER_SUPPLY_TYPE_USB)
- goto skip_current_for_non_sdp;
-
- pr_smb(PR_MISC, "usb type = %s current_limit = %d\n",
- usb_type_name, current_limit);
-
- rc = vote(chip->usb_icl_votable, PSY_ICL_VOTER, true,
- current_limit);
- if (rc < 0)
- pr_err("Couldn't update USB PSY ICL vote rc=%d\n", rc);
-
-skip_current_for_non_sdp:
+ /* adjust vfloat */
smbchg_vfloat_adjust_check(chip);
-
- if (chip->batt_psy)
- power_supply_changed(chip->batt_psy);
}
static int smbchg_otg_regulator_enable(struct regulator_dev *rdev)
@@ -5754,6 +5731,21 @@
}
}
+static int smbchg_set_sdp_current(struct smbchg_chip *chip, int current_ma)
+{
+ if (chip->usb_supply_type == POWER_SUPPLY_TYPE_USB) {
+ /* Override if type-c charger used */
+ if (chip->typec_current_ma > 500 &&
+ current_ma < chip->typec_current_ma) {
+ current_ma = chip->typec_current_ma;
+ }
+ pr_smb(PR_MISC, "from USB current_ma = %d\n", current_ma);
+ vote(chip->usb_icl_votable, PSY_ICL_VOTER, true, current_ma);
+ }
+
+ return 0;
+}
+
static int smbchg_usb_get_property(struct power_supply *psy,
enum power_supply_property psp,
union power_supply_propval *val)
@@ -5762,7 +5754,12 @@
switch (psp) {
case POWER_SUPPLY_PROP_CURRENT_MAX:
- val->intval = chip->usb_current_max;
+ case POWER_SUPPLY_PROP_SDP_CURRENT_MAX:
+ if (chip->usb_icl_votable)
+ val->intval = get_client_vote(chip->usb_icl_votable,
+ PSY_ICL_VOTER) * 1000;
+ else
+ val->intval = 0;
break;
case POWER_SUPPLY_PROP_PRESENT:
val->intval = chip->usb_present;
@@ -5792,17 +5789,16 @@
struct smbchg_chip *chip = power_supply_get_drvdata(psy);
switch (psp) {
- case POWER_SUPPLY_PROP_CURRENT_MAX:
- chip->usb_current_max = val->intval;
- break;
case POWER_SUPPLY_PROP_ONLINE:
chip->usb_online = val->intval;
break;
+ case POWER_SUPPLY_PROP_CURRENT_MAX:
+ case POWER_SUPPLY_PROP_SDP_CURRENT_MAX:
+ smbchg_set_sdp_current(chip, val->intval / 1000);
default:
return -EINVAL;
}
- power_supply_changed(psy);
return 0;
}
@@ -5812,6 +5808,7 @@
{
switch (psp) {
case POWER_SUPPLY_PROP_CURRENT_MAX:
+ case POWER_SUPPLY_PROP_SDP_CURRENT_MAX:
return 1;
default:
break;
@@ -5833,6 +5830,7 @@
POWER_SUPPLY_PROP_TYPE,
POWER_SUPPLY_PROP_REAL_TYPE,
POWER_SUPPLY_PROP_HEALTH,
+ POWER_SUPPLY_PROP_SDP_CURRENT_MAX,
};
#define CHARGE_OUTPUT_VTG_RATIO 840
diff --git a/drivers/power/supply/qcom/schgm-flash.c b/drivers/power/supply/qcom/schgm-flash.c
index eed70d3..9dd619c 100644
--- a/drivers/power/supply/qcom/schgm-flash.c
+++ b/drivers/power/supply/qcom/schgm-flash.c
@@ -101,6 +101,11 @@
}
}
+bool is_flash_active(struct smb_charger *chg)
+{
+ return chg->flash_active ? true : false;
+}
+
int schgm_flash_get_vreg_ok(struct smb_charger *chg, int *val)
{
int rc, vreg_state;
diff --git a/drivers/power/supply/qcom/schgm-flash.h b/drivers/power/supply/qcom/schgm-flash.h
index b6fff6c..be3953b 100644
--- a/drivers/power/supply/qcom/schgm-flash.h
+++ b/drivers/power/supply/qcom/schgm-flash.h
@@ -47,6 +47,7 @@
int schgm_flash_get_vreg_ok(struct smb_charger *chg, int *val);
int schgm_flash_init(struct smb_charger *chg);
+bool is_flash_active(struct smb_charger *chg);
irqreturn_t schgm_flash_default_irq_handler(int irq, void *data);
irqreturn_t schgm_flash_ilim2_irq_handler(int irq, void *data);
diff --git a/drivers/power/supply/qcom/smb-lib.c b/drivers/power/supply/qcom/smb-lib.c
index 612c3dd..5b94ff2 100644
--- a/drivers/power/supply/qcom/smb-lib.c
+++ b/drivers/power/supply/qcom/smb-lib.c
@@ -4757,6 +4757,7 @@
{
int rc;
u8 stat4, stat5;
+ bool lock = false;
struct smb_charger *chg = container_of(work, struct smb_charger,
rdstd_cc2_detach_work);
@@ -4819,9 +4820,28 @@
rc = smblib_masked_write(chg, TYPE_C_INTRPT_ENB_SOFTWARE_CTRL_REG,
EXIT_SNK_BASED_ON_CC_BIT, 0);
smblib_reg_block_restore(chg, cc2_detach_settings);
- mutex_lock(&chg->lock);
+
+ /*
+ * Mutex acquisition deadlock can happen while cancelling this work
+ * during pd_hard_reset from the function smblib_cc2_sink_removal_exit
+ * which is called in the same lock context that we try to acquire in
+ * this work routine.
+ * Check if this work is running during pd_hard_reset and use trylock
+	 * instead of mutex_lock to prevent any deadlock if the mutex is already
+ * held.
+ */
+ if (chg->pd_hard_reset) {
+ if (mutex_trylock(&chg->lock))
+ lock = true;
+ } else {
+ mutex_lock(&chg->lock);
+ lock = true;
+ }
+
smblib_usb_typec_change(chg);
- mutex_unlock(&chg->lock);
+
+ if (lock)
+ mutex_unlock(&chg->lock);
return;
rerun:
diff --git a/drivers/power/supply/qcom/smb5-lib.c b/drivers/power/supply/qcom/smb5-lib.c
index c70d51e8..94e2ccb 100644
--- a/drivers/power/supply/qcom/smb5-lib.c
+++ b/drivers/power/supply/qcom/smb5-lib.c
@@ -22,6 +22,7 @@
#include "smb5-lib.h"
#include "smb5-reg.h"
#include "battery.h"
+#include "schgm-flash.h"
#include "step-chg-jeita.h"
#include "storm-watch.h"
@@ -705,7 +706,6 @@
return 0;
}
-#define SDP_100_MA 100000
static void smblib_uusb_removal(struct smb_charger *chg)
{
int rc;
@@ -730,7 +730,8 @@
/* reset both usbin current and voltage votes */
vote(chg->pl_enable_votable_indirect, USBIN_I_VOTER, false, 0);
vote(chg->pl_enable_votable_indirect, USBIN_V_VOTER, false, 0);
- vote(chg->usb_icl_votable, SW_ICL_MAX_VOTER, true, SDP_100_MA);
+ vote(chg->usb_icl_votable, SW_ICL_MAX_VOTER, true,
+ is_flash_active(chg) ? SDP_CURRENT_UA : SDP_100_MA);
vote(chg->usb_icl_votable, SW_QC3_VOTER, false, 0);
/* reconfigure allowed voltage for HVDCP */
@@ -2159,13 +2160,6 @@
return 0;
}
-#define SDP_CURRENT_UA 500000
-#define CDP_CURRENT_UA 1500000
-#define DCP_CURRENT_UA 1500000
-#define HVDCP_CURRENT_UA 3000000
-#define TYPEC_DEFAULT_CURRENT_UA 900000
-#define TYPEC_MEDIUM_CURRENT_UA 1500000
-#define TYPEC_HIGH_CURRENT_UA 3000000
static int get_rp_based_dcp_current(struct smb_charger *chg, int typec_mode)
{
int rp_ua;
@@ -2205,6 +2199,7 @@
int usb_current)
{
int rc = 0, rp_ua, typec_mode;
+ union power_supply_propval val = {0, };
if (chg->real_charger_type == POWER_SUPPLY_TYPE_USB_FLOAT) {
if (usb_current == -ETIMEDOUT) {
@@ -2259,8 +2254,16 @@
return rc;
}
} else {
- rc = vote(chg->usb_icl_votable, USB_PSY_VOTER,
- true, usb_current);
+ rc = smblib_get_prop_usb_present(chg, &val);
+ if (!rc && !val.intval)
+ return 0;
+
+ /* if flash is active force 500mA */
+ if ((usb_current < SDP_CURRENT_UA) && is_flash_active(chg))
+ usb_current = SDP_CURRENT_UA;
+
+ rc = vote(chg->usb_icl_votable, USB_PSY_VOTER, true,
+ usb_current);
if (rc < 0) {
pr_err("Couldn't vote ICL USB_PSY_VOTER rc=%d\n", rc);
return rc;
@@ -3028,9 +3031,12 @@
* enumeration is done.
*/
if (!is_client_vote_enabled(chg->usb_icl_votable,
- USB_PSY_VOTER))
+ USB_PSY_VOTER)) {
+ /* if flash is active force 500mA */
vote(chg->usb_icl_votable, USB_PSY_VOTER, true,
- SDP_100_MA);
+ is_flash_active(chg) ?
+ SDP_CURRENT_UA : SDP_100_MA);
+ }
vote(chg->usb_icl_votable, SW_ICL_MAX_VOTER, false, 0);
break;
case POWER_SUPPLY_TYPE_USB_CDP:
@@ -3235,7 +3241,8 @@
cancel_delayed_work_sync(&chg->pl_enable_work);
/* reset input current limit voters */
- vote(chg->usb_icl_votable, SW_ICL_MAX_VOTER, true, SDP_100_MA);
+ vote(chg->usb_icl_votable, SW_ICL_MAX_VOTER, true,
+ is_flash_active(chg) ? SDP_CURRENT_UA : SDP_100_MA);
vote(chg->usb_icl_votable, PD_VOTER, false, 0);
vote(chg->usb_icl_votable, USB_PSY_VOTER, false, 0);
vote(chg->usb_icl_votable, DCP_VOTER, false, 0);
diff --git a/drivers/power/supply/qcom/smb5-lib.h b/drivers/power/supply/qcom/smb5-lib.h
index 35e5dd3..a0e835f 100644
--- a/drivers/power/supply/qcom/smb5-lib.h
+++ b/drivers/power/supply/qcom/smb5-lib.h
@@ -73,6 +73,15 @@
#define VBAT_TO_VRAW_ADC(v) div_u64((u64)v * 1000000UL, 194637UL)
+#define SDP_100_MA 100000
+#define SDP_CURRENT_UA 500000
+#define CDP_CURRENT_UA 1500000
+#define DCP_CURRENT_UA 1500000
+#define HVDCP_CURRENT_UA 3000000
+#define TYPEC_DEFAULT_CURRENT_UA 900000
+#define TYPEC_MEDIUM_CURRENT_UA 1500000
+#define TYPEC_HIGH_CURRENT_UA 3000000
+
enum smb_mode {
PARALLEL_MASTER = 0,
PARALLEL_SLAVE,
diff --git a/drivers/soc/qcom/msm_smem.c b/drivers/soc/qcom/msm_smem.c
index 1bbd751..959aab9 100644
--- a/drivers/soc/qcom/msm_smem.c
+++ b/drivers/soc/qcom/msm_smem.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -26,6 +26,7 @@
#include <soc/qcom/subsystem_notif.h>
#include <soc/qcom/subsystem_restart.h>
#include <soc/qcom/ramdump.h>
+#include <soc/qcom/scm.h>
#include <soc/qcom/smem.h>
@@ -1085,12 +1086,15 @@
void *handle;
struct restart_notifier_block *nb;
- if (smem_dev)
- smem_ramdump_dev = create_ramdump_device("smem", smem_dev);
- if (IS_ERR_OR_NULL(smem_ramdump_dev)) {
- LOG_ERR("%s: Unable to create smem ramdump device.\n",
- __func__);
- smem_ramdump_dev = NULL;
+ if (scm_is_secure_device()) {
+ if (smem_dev)
+ smem_ramdump_dev = create_ramdump_device("smem",
+ smem_dev);
+ if (IS_ERR_OR_NULL(smem_ramdump_dev)) {
+ LOG_ERR("%s: Unable to create smem ramdump device.\n",
+ __func__);
+ smem_ramdump_dev = NULL;
+ }
}
for (i = 0; i < ARRAY_SIZE(restart_notifiers); i++) {
diff --git a/drivers/soc/qcom/spcom.c b/drivers/soc/qcom/spcom.c
index d568014..f2597e3 100644
--- a/drivers/soc/qcom/spcom.c
+++ b/drivers/soc/qcom/spcom.c
@@ -2764,7 +2764,7 @@
ret = spcom_register_chardev();
if (ret) {
pr_err("create character device failed.\n");
- goto fail_reg_chardev;
+ goto fail_while_chardev_reg;
}
link_info.glink_link_state_notif_cb = spcom_link_state_notif_cb;
@@ -2802,6 +2802,7 @@
fail_reg_chardev:
pr_err("Failed to init driver.\n");
spcom_unregister_chrdev();
+fail_while_chardev_reg:
kfree(dev);
spcom_dev = NULL;
diff --git a/fs/exec.c b/fs/exec.c
index 3e2de29..d27f5e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -306,7 +306,7 @@
vma->vm_start = vma->vm_end - PAGE_SIZE;
vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
err = insert_vm_struct(mm, vma);
if (err)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c585e7e..4fc0895 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1123,8 +1123,11 @@
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- vma->vm_flags &= ~VM_SOFTDIRTY;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags,
+ vma->vm_flags & ~VM_SOFTDIRTY);
vma_set_page_prot(vma);
+ vm_write_end(vma);
}
downgrade_write(&mm->mmap_sem);
break;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 9d9c032..00b661d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -499,8 +499,10 @@
vma = prev;
else
prev = vma;
- vma->vm_flags = new_flags;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ vm_write_end(vma);
}
up_write(&mm->mmap_sem);
mmput(mm);
@@ -895,8 +897,10 @@
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
+ vm_write_end(vma);
skip:
prev = vma;
@@ -1033,8 +1037,10 @@
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ vm_write_end(vma);
skip:
prev = vma;
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index a4e7ca0..6cfdfca 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -7,7 +7,7 @@
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
- return !!(vma->vm_flags & VM_HUGETLB);
+ return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
}
#else
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ae8d475..df8e0b0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -89,14 +89,14 @@
#ifdef CONFIG_NUMA_BALANCING
extern bool pmd_trans_migrating(pmd_t pmd);
extern int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node);
+ struct fault_env *fe, int node);
#else
static inline bool pmd_trans_migrating(pmd_t pmd)
{
return false;
}
static inline int migrate_misplaced_page(struct page *page,
- struct vm_area_struct *vma, int node)
+ struct fault_env *fe, int node)
{
return -EAGAIN; /* can't migrate now */
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b328cca..dd0e119 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -284,6 +284,8 @@
#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
+/* Speculative fault, not holding mmap_sem */
+#define FAULT_FLAG_SPECULATIVE 0x200
/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -300,7 +302,6 @@
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */
void __user *virtual_address; /* Faulting virtual address */
-
struct page *cow_page; /* Handler may choose to COW */
struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE
@@ -341,6 +342,17 @@
* page table to avoid allocation from
* atomic context.
*/
+ /*
+ * These entries are required when handling speculative page fault.
+ * This way the page handling is done using consistent field values.
+ */
+ unsigned long vma_flags;
+ pgprot_t vma_page_prot;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ unsigned int sequence;
+ pmd_t orig_pmd; /* value of PMD at the time of fault */
+ pte_t orig_pte; /* Value of PTE at the time of fault */
+#endif
};
/*
@@ -623,9 +635,9 @@
* pte_mkwrite. But get_user_pages can cause write faults for mappings
* that do not have writing enabled, when used by access_process_vm.
*/
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
{
- if (likely(vma->vm_flags & VM_WRITE))
+ if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
}
@@ -1118,6 +1130,7 @@
#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+#define VM_FAULT_PTNOTSAME 0x4000 /* Page table entries have changed */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
@@ -1167,8 +1180,23 @@
pgoff_t last_index; /* Highest page->index to unmap */
};
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte);
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, unsigned long vma_flags);
+static inline struct page *vm_normal_page(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ return __vm_normal_page(vma, addr, pte, vma->vm_flags);
+}
+
+static inline void INIT_VMA(struct vm_area_struct *vma)
+{
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ seqcount_init(&vma->vm_sequence);
+ atomic_set(&vma->vm_ref_count, 1);
+#endif
+}
+
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@@ -1238,6 +1266,47 @@
unmap_mapping_range(mapping, holebegin, holelen, 0);
}
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+ write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+ int subclass)
+{
+ write_seqcount_begin_nested(&vma->vm_sequence, subclass);
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+ write_seqcount_end(&vma->vm_sequence);
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+ raw_write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+ raw_write_seqcount_end(&vma->vm_sequence);
+}
+#else
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+ int subclass)
+{
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
@@ -1249,6 +1318,43 @@
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern int __handle_speculative_fault(struct mm_struct *mm,
+ unsigned long address,
+ unsigned int flags,
+ struct vm_area_struct **vma);
+static inline int handle_speculative_fault(struct mm_struct *mm,
+ unsigned long address,
+ unsigned int flags,
+ struct vm_area_struct **vma)
+{
+ /*
+ * Try speculative page fault for multithreaded user space task only.
+ */
+ if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) {
+ *vma = NULL;
+ return VM_FAULT_RETRY;
+ }
+ return __handle_speculative_fault(mm, address, flags, vma);
+}
+extern bool can_reuse_spf_vma(struct vm_area_struct *vma,
+ unsigned long address);
+#else
+static inline int handle_speculative_fault(struct mm_struct *mm,
+ unsigned long address,
+ unsigned int flags,
+ struct vm_area_struct **vma)
+{
+ return VM_FAULT_RETRY;
+}
+static inline bool can_reuse_spf_vma(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return false;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
@@ -1957,16 +2063,29 @@
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand);
+ struct vm_area_struct *expand, bool keep_locked);
static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
- return __vma_adjust(vma, start, end, pgoff, insert, NULL);
+ return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
}
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+
+extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
- unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
+ unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+ pgoff_t pgoff, struct mempolicy *mpol, struct vm_userfaultfd_ctx uff,
+ const char __user *user, bool keep_locked);
+
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
+ struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+ unsigned long vm_flags, struct anon_vma *anon, struct file *file,
+ pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff,
+ const char __user *user)
+{
+ return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
+ pol, uff, user, false);
+}
+
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5942478..cb2cc30 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -365,6 +365,10 @@
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ seqcount_t vm_sequence;
+ atomic_t vm_ref_count; /* see vma_get(), vma_put() */
+#endif
};
struct core_thread {
@@ -403,6 +407,9 @@
struct mm_struct {
struct vm_area_struct *mmap; /* list of VMAs */
struct rb_root mm_rb;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ rwlock_t mm_rb_lock;
+#endif
u32 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 791c547..9dbf9c3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -427,8 +427,8 @@
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
- pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
- pgoff += vma->vm_pgoff;
+ pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+ pgoff += READ_ONCE(vma->vm_pgoff);
return pgoff;
}
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 71fd2b3..92a297c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -168,8 +168,16 @@
unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long, bool);
+void __page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, bool compound);
+static inline void page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address, bool compound)
+{
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ __page_add_new_anon_rmap(page, vma, address, compound);
+}
+
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 92d1fde..7b488ec 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -313,8 +313,14 @@
extern void add_page_to_unevictable_list(struct page *page);
-extern void lru_cache_add_active_or_unevictable(struct page *page,
- struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+ unsigned long vma_flags);
+
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
+{
+ return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index a9c2e4c..4c679792 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -99,6 +99,9 @@
VMACACHE_FIND_HITS,
VMACACHE_FULL_FLUSHES,
#endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ SPECULATIVE_PGFAULT,
+#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index 0000000..a9643b3
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include <linux/tracepoint.h>
+#include <linux/mm.h>
+
+DECLARE_EVENT_CLASS(spf,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, caller)
+ __field(unsigned long, vm_start)
+ __field(unsigned long, vm_end)
+ __field(unsigned long, address)
+ ),
+
+ TP_fast_assign(
+ __entry->caller = caller;
+ __entry->vm_start = vma->vm_start;
+ __entry->vm_end = vma->vm_end;
+ __entry->address = address;
+ ),
+
+ TP_printk("ip:%lx vma:%lx-%lx address:%lx",
+ __entry->caller, __entry->vm_start, __entry->vm_end,
+ __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_pte_lock,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_pmd_changed,
+
+ TP_PROTO(unsigned long caller,
+ struct vm_area_struct *vma, unsigned long address),
+
+ TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
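
Each DEFINE_EVENT above expands to a trace_<name>() helper. A minimal sketch of how a caller in the fault path would emit one of these events — the surrounding function is illustrative, not part of this patch:

#include <trace/events/pagefault.h>

/* Illustrative helper: record where the speculative path had to bail out. */
static bool sketch_try_pte_lock(struct fault_env *fe)
{
	if (!spin_trylock(fe->ptl)) {
		trace_spf_pte_lock(_RET_IP_, fe->vma, fe->address);
		return false;
	}
	return true;
}
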
diff --git a/kernel/fork.c b/kernel/fork.c
index 79fdfd8..965c091 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -614,7 +614,7 @@
if (!tmp)
goto fail_nomem;
*tmp = *mpnt;
- INIT_LIST_HEAD(&tmp->anon_vma_chain);
+ INIT_VMA(tmp);
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
@@ -764,6 +764,9 @@
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ rwlock_init(&mm->mm_rb_lock);
+#endif
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
diff --git a/mm/Kconfig b/mm/Kconfig
index 051f7bc..3fe3ac4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -747,3 +747,25 @@
(addr, addr + size-bytes) of the process.
Any other value is ignored.
+
+config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+ def_bool n
+
+config SPECULATIVE_PAGE_FAULT
+ bool "Speculative page faults"
+ default y
+ depends on ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+ depends on MMU && SMP
+ help
+ Try to handle user space page faults without holding the mmap_sem.
+
+	  This should allow better concurrency for massively threaded processes,
+	  since the page fault handler will not wait for other threads' memory
+	  layout changes to be done, assuming that such a change is done in
+	  another part of the process's memory space. This type of page fault is
+	  named a speculative page fault.
+
+	  If the speculative page fault fails because a concurrent change is
+	  detected or because the underlying PMD or PTE tables are not yet
+	  allocated, the speculative handling is aborted and a classic page
+	  fault is tried instead.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7243728..a557862 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -958,8 +958,8 @@
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t entry;
- entry = mk_pte(pages[i], vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = mk_pte(pages[i], fe->vma_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), fe->vma_flags);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
@@ -1679,7 +1679,7 @@
entry = pte_swp_mksoft_dirty(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
- entry = maybe_mkwrite(entry, vma);
+ entry = maybe_mkwrite(entry, vma->vm_flags);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f..4d21629 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -16,6 +16,9 @@
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
+#endif
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 6aa1c51..d9ac6de 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,26 @@
int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern struct vm_area_struct *get_vma(struct mm_struct *mm,
+ unsigned long addr);
+extern void put_vma(struct vm_area_struct *vma);
+
+static inline bool vma_has_changed(struct fault_env *fe)
+{
+ int ret = RB_EMPTY_NODE(&fe->vma->vm_rb);
+ unsigned int seq = READ_ONCE(fe->vma->vm_sequence.sequence);
+
+ /*
+ * Matches both the wmb in write_seqlock_{begin,end}() and
+ * the wmb in vma_rb_erase().
+ */
+ smp_rmb();
+
+ return ret || seq != fe->sequence;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
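
The vma_has_changed() helper above is the read side of the VMA sequence-count protocol used throughout this series. The following sketch is illustrative only and not part of the patch (the function name is hypothetical); it shows the intended pattern: snapshot the sequence before the speculative page-table walk, bail out if a writer is in progress, and re-validate before committing.

/* Sketch only: reader-side use of the vm_sequence snapshot. */
static bool spf_walk_example(struct fault_env *fe)
{
	/* Snapshot the VMA's sequence count before any speculative work. */
	fe->sequence = raw_read_seqcount(&fe->vma->vm_sequence);
	if (fe->sequence & 1)
		return false;	/* a writer (e.g. __vma_adjust) is in progress */

	/* ... speculative walk of the page tables goes here ... */

	/* Re-validate: if the VMA was changed or erased, fall back. */
	return !vma_has_changed(fe);
}
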
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1df37ee..6b58aaf 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -887,6 +887,8 @@
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
+ .vma_flags = vma->vm_flags,
+ .vma_page_prot = vma->vm_page_prot,
};
/* we only decide to swapin, if there is enough young ptes */
@@ -1011,6 +1013,7 @@
if (mm_find_pmd(mm, address) != pmd)
goto out;
+ vm_write_begin(vma);
anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
@@ -1046,6 +1049,7 @@
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ vm_write_end(vma);
result = SCAN_FAIL;
goto out;
}
@@ -1080,6 +1084,7 @@
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
+ vm_write_end(vma);
*hpage = NULL;
diff --git a/mm/madvise.c b/mm/madvise.c
index 59d1aae..ee7ad9b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -135,8 +135,10 @@
/*
* vm_flags is protected by the mmap_sem held in write mode.
*/
- vma->vm_flags = new_flags;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, new_flags);
+ vm_write_end(vma);
out:
if (error == -ENOMEM)
error = -EAGAIN;
@@ -404,9 +406,11 @@
.private = tlb,
};
+ vm_write_begin(vma);
tlb_start_vma(tlb, vma);
walk_page_range(addr, end, &free_walk);
tlb_end_vma(tlb, vma);
+ vm_write_end(vma);
}
static int madvise_free_single_vma(struct vm_area_struct *vma,
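
The madvise changes above show the writer-side convention adopted across mm/: any VMA field the speculative handler may read locklessly is updated with WRITE_ONCE() inside a vm_write_begin()/vm_write_end() pair. A minimal sketch of that convention (the helper name is hypothetical):

/* Sketch only: bump the VMA sequence count around a lockless-readable update. */
static void example_set_vm_flags(struct vm_area_struct *vma,
				 unsigned long newflags)
{
	vm_write_begin(vma);			/* makes vm_sequence odd */
	WRITE_ONCE(vma->vm_flags, newflags);	/* store seen by lockless readers */
	vm_write_end(vma);			/* makes vm_sequence even again */
}
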
diff --git a/mm/memory.c b/mm/memory.c
index cc6ab38..69c6c45 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -75,6 +75,9 @@
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagefault.h>
+
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
@@ -549,7 +552,9 @@
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
+ vm_write_begin(vma);
unlink_anon_vmas(vma);
+ vm_write_end(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
@@ -563,7 +568,9 @@
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+ vm_write_begin(vma);
unlink_anon_vmas(vma);
+ vm_write_end(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
@@ -689,7 +696,8 @@
if (page)
dump_page(page, "bad pte");
pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma,
+ mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
@@ -703,7 +711,8 @@
}
/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ * __vm_normal_page -- This function gets the "struct page" associated with
+ * a pte.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
@@ -749,8 +758,8 @@
#else
# define HAVE_PTE_SPECIAL 0
#endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, unsigned long vma_flags)
{
unsigned long pfn = pte_pfn(pte);
@@ -759,7 +768,7 @@
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
@@ -767,9 +776,13 @@
}
/* !HAVE_PTE_SPECIAL case follows: */
+ /*
+ * This part should never get called when CONFIG_SPECULATIVE_PAGE_FAULT
+ * is set. This is mainly because we can't rely on vm_start.
+ */
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
+ if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
@@ -778,7 +791,7 @@
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (!is_cow_mapping(vma_flags))
return NULL;
}
}
@@ -1285,6 +1298,7 @@
unsigned long next;
BUG_ON(addr >= end);
+ vm_write_begin(vma);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -1294,6 +1308,7 @@
next = zap_pud_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+ vm_write_end(vma);
}
@@ -1961,6 +1976,145 @@
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static bool pte_spinlock(struct mm_struct *mm,
+ struct fault_env *fe)
+{
+ bool ret = false;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pmd_t pmdval;
+#endif
+
+ /* Check if vma is still valid */
+ if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) {
+ fe->ptl = pte_lockptr(mm, fe->pmd);
+ spin_lock(fe->ptl);
+ return true;
+ }
+
+ local_irq_disable();
+ if (vma_has_changed(fe)) {
+ trace_spf_vma_changed(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * We check if the pmd value is still the same to ensure that there
+ * is not a huge collapse operation in progress behind our back.
+ */
+ pmdval = READ_ONCE(*fe->pmd);
+ if (!pmd_same(pmdval, fe->orig_pmd)) {
+ trace_spf_pmd_changed(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+#endif
+
+ fe->ptl = pte_lockptr(mm, fe->pmd);
+ if (unlikely(!spin_trylock(fe->ptl))) {
+ trace_spf_pte_lock(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+
+ if (vma_has_changed(fe)) {
+ spin_unlock(fe->ptl);
+ trace_spf_vma_changed(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
+}
+
+static bool pte_map_lock(struct mm_struct *mm,
+ struct fault_env *fe)
+{
+ bool ret = false;
+ pte_t *pte;
+ spinlock_t *ptl;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pmd_t pmdval;
+#endif
+
+ if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) {
+ fe->pte = pte_offset_map_lock(mm, fe->pmd,
+ fe->address, &fe->ptl);
+ return true;
+ }
+
+ /*
+ * The first vma_has_changed() guarantees the page-tables are still
+ * valid, having IRQs disabled ensures they stay around, hence the
+ * second vma_has_changed() to make sure they are still valid once
+ * we've got the lock. After that a concurrent zap_pte_range() will
+ * block on the PTL and thus we're safe.
+ */
+ local_irq_disable();
+ if (vma_has_changed(fe)) {
+ trace_spf_vma_changed(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * We check if the pmd value is still the same to ensure that there
+ * is not a huge collapse operation in progress behind our back.
+ */
+ pmdval = READ_ONCE(*fe->pmd);
+ if (!pmd_same(pmdval, fe->orig_pmd)) {
+ trace_spf_pmd_changed(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+#endif
+
+ /*
+ * Same as pte_offset_map_lock() except that we call
+ * spin_trylock() in place of spin_lock() to avoid a race with
+ * the unmap path, which may hold the lock and wait for this CPU
+ * to invalidate the TLB while this CPU has irqs disabled.
+ * Since we are in a speculative path, accept that it could fail.
+ */
+ ptl = pte_lockptr(mm, fe->pmd);
+ pte = pte_offset_map(fe->pmd, fe->address);
+ if (unlikely(!spin_trylock(ptl))) {
+ pte_unmap(pte);
+ trace_spf_pte_lock(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+
+ if (vma_has_changed(fe)) {
+ pte_unmap_unlock(pte, ptl);
+ trace_spf_vma_changed(_RET_IP_, fe->vma, fe->address);
+ goto out;
+ }
+
+ fe->pte = pte;
+ fe->ptl = ptl;
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
+}
+#else
+static inline bool pte_spinlock(struct mm_struct *mm,
+ struct fault_env *fe)
+{
+ fe->ptl = pte_lockptr(mm, fe->pmd);
+ spin_lock(fe->ptl);
+ return true;
+}
+
+static inline bool pte_map_lock(struct mm_struct *mm,
+ struct fault_env *fe)
+{
+ fe->pte = pte_offset_map_lock(mm, fe->pmd,
+ fe->address, &fe->ptl);
+ return true;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
/*
* handle_pte_fault chooses page fault handler according to an entry which was
* read non-atomically. Before making any commitment, on those architectures
@@ -1968,21 +2122,30 @@
* parts, do_swap_page must check under lock before unmapping the pte and
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
+ *
+ * pte_unmap_same() returns:
+ * 0 if the PTEs are the same
+ * VM_FAULT_PTNOTSAME if the PTEs are different
+ * VM_FAULT_RETRY if the VMA has changed behind our back during
+ * speculative page fault handling.
*/
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct mm_struct *mm, struct fault_env *fe,
+ pte_t orig_pte)
{
- int same = 1;
+ int ret = 0;
+
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- same = pte_same(*page_table, orig_pte);
- spin_unlock(ptl);
+ if (pte_spinlock(mm, fe)) {
+ if (!pte_same(*fe->pte, orig_pte))
+ ret = VM_FAULT_PTNOTSAME;
+ spin_unlock(fe->ptl);
+ } else
+ ret = VM_FAULT_RETRY;
}
#endif
- pte_unmap(page_table);
- return same;
+ pte_unmap(fe->pte);
+ return ret;
}
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
@@ -2085,7 +2248,7 @@
flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = maybe_mkwrite(pte_mkdirty(entry), fe->vma_flags);
if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
update_mmu_cache(vma, fe->address, fe->pte);
pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2145,24 +2308,25 @@
const unsigned long mmun_start = fe->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+ int ret = VM_FAULT_OOM;
if (unlikely(anon_vma_prepare(vma)))
- goto oom;
+ goto out;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
if (!new_page)
- goto oom;
+ goto out;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
fe->address);
if (!new_page)
- goto oom;
+ goto out;
cow_user_page(new_page, old_page, fe->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
- goto oom_free_new;
+ goto out_free_new;
__SetPageUptodate(new_page);
@@ -2171,7 +2335,10 @@
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
+ if (!pte_map_lock(mm, fe)) {
+ ret = VM_FAULT_RETRY;
+ goto out_uncharge;
+ }
if (likely(pte_same(*fe->pte, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2183,8 +2350,8 @@
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = mk_pte(new_page, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = mk_pte(new_page, fe->vma_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), fe->vma_flags);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
@@ -2192,9 +2359,9 @@
* thread doing COW.
*/
ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ __page_add_new_anon_rmap(new_page, vma, fe->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ __lru_cache_add_active_or_unevictable(new_page, fe->vma_flags);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -2245,7 +2412,7 @@
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (fe->vma_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page);
@@ -2254,12 +2421,14 @@
put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
-oom_free_new:
+out_uncharge:
+ mem_cgroup_cancel_charge(new_page, memcg, false);
+out_free_new:
put_page(new_page);
-oom:
+out:
if (old_page)
put_page(old_page);
- return VM_FAULT_OOM;
+ return ret;
}
/*
@@ -2284,8 +2453,8 @@
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
if (ret & VM_FAULT_ERROR)
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(vma->vm_mm, fe))
+ return VM_FAULT_RETRY;
/*
* We might have raced with another page fault while we
* released the pte_offset_map_lock.
@@ -2361,7 +2530,8 @@
struct vm_area_struct *vma = fe->vma;
struct page *old_page;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
+ old_page = __vm_normal_page(vma, fe->address, orig_pte,
+ fe->vma_flags);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -2370,7 +2540,7 @@
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ if ((fe->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(fe, orig_pte);
@@ -2388,8 +2558,11 @@
get_page(old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
+ if (!pte_map_lock(vma->vm_mm, fe)) {
+ unlock_page(old_page);
+ put_page(old_page);
+ return VM_FAULT_RETRY;
+ }
if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2413,7 +2586,7 @@
return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
}
unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ } else if (unlikely((fe->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(fe, orig_pte, old_page);
}
@@ -2523,9 +2696,17 @@
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ ret = pte_unmap_same(vma->vm_mm, fe, orig_pte);
+ if (ret) {
+ /*
+ * If pte != orig_pte, this means another thread did the
+ * swap operation behind our back.
+ * So nothing else to do.
+ */
+ if (ret == VM_FAULT_PTNOTSAME)
+ ret = 0;
goto out;
-
+ }
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -2545,11 +2726,16 @@
GFP_HIGHUSER_MOVABLE, vma, fe->address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte
- * while we released the pte lock.
+ * Back out if the VMA has changed behind our back during
+ * a speculative page fault or if somebody else
+ * faulted in this pte while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
+ if (!pte_map_lock(vma->vm_mm, fe)) {
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
if (likely(pte_same(*fe->pte, orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2603,10 +2789,13 @@
}
/*
- * Back out if somebody else already faulted in this pte.
+ * Back out if the VMA has changed behind our back during a speculative
+ * page fault or if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(vma->vm_mm, fe)) {
+ ret = VM_FAULT_RETRY;
+ goto out_cancel_cgroup;
+ }
if (unlikely(!pte_same(*fe->pte, orig_pte)))
goto out_nomap;
@@ -2627,9 +2816,9 @@
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
- pte = mk_pte(page, vma->vm_page_prot);
+ pte = mk_pte(page, fe->vma_page_prot);
if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ pte = maybe_mkwrite(pte_mkdirty(pte), fe->vma_flags);
fe->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
@@ -2643,14 +2832,14 @@
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ __page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ __lru_cache_add_active_or_unevictable(page, fe->vma_flags);
}
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ (fe->vma_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
@@ -2680,8 +2869,9 @@
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(fe->pte, fe->ptl);
+out_cancel_cgroup:
+ mem_cgroup_cancel_charge(page, memcg, false);
out_page:
unlock_page(page);
out_release:
@@ -2703,10 +2893,11 @@
struct vm_area_struct *vma = fe->vma;
struct mem_cgroup *memcg;
struct page *page;
+ int ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
- if (vma->vm_flags & VM_SHARED)
+ if (fe->vma_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/*
@@ -2730,11 +2921,19 @@
if (!(fe->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
- vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ fe->vma_page_prot));
+ if (!pte_map_lock(vma->vm_mm, fe))
+ return VM_FAULT_RETRY;
if (!pte_none(*fe->pte))
goto unlock;
+ /*
+ * Don't call the userfaultfd during the speculative path.
+ * We already checked that the VMA is not managed through
+ * userfaultfd, but it may be set behind our back once we have
+ * locked the pte. In such a case we can ignore it this time.
+ */
+ if (fe->flags & FAULT_FLAG_SPECULATIVE)
+ goto setpte;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2760,17 +2959,19 @@
*/
__SetPageUptodate(page);
- entry = mk_pte(page, vma->vm_page_prot);
- if (vma->vm_flags & VM_WRITE)
+ entry = mk_pte(page, fe->vma_page_prot);
+ if (fe->vma_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ if (!pte_map_lock(vma->vm_mm, fe)) {
+ ret = VM_FAULT_RETRY;
goto release;
+ }
+ if (!pte_none(*fe->pte))
+ goto unlock_and_release;
/* Deliver the page fault to userland, check inside PT lock */
- if (userfaultfd_missing(vma)) {
+ if (!(fe->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {
pte_unmap_unlock(fe->pte, fe->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -2778,9 +2979,9 @@
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ __page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ __lru_cache_add_active_or_unevictable(page, fe->vma_flags);
setpte:
set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
@@ -2788,11 +2989,13 @@
update_mmu_cache(vma, fe->address, fe->pte);
unlock:
pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
+ return ret;
+unlock_and_release:
+ pte_unmap_unlock(fe->pte, fe->ptl);
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- goto unlock;
+ return ret;
oom_free_page:
put_page(page);
oom:
@@ -2897,8 +3100,9 @@
* pte_none() under vmf->ptl protection when we return to
* alloc_set_pte().
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(vma->vm_mm, fe))
+ return VM_FAULT_RETRY;
+
return 0;
}
@@ -2937,7 +3141,7 @@
for (i = 0; i < HPAGE_PMD_NR; i++)
flush_icache_page(vma, page + i);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = mk_huge_pmd(page, fe->vma_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -3005,15 +3209,15 @@
return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);
+ entry = mk_pte(page, fe->vma_page_prot);
if (write)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = maybe_mkwrite(pte_mkdirty(entry), fe->vma_flags);
/* copy-on-write page */
- if (write && !(vma->vm_flags & VM_SHARED)) {
+ if (write && !(fe->vma_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ __page_add_new_anon_rmap(page, vma, fe->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ __lru_cache_add_active_or_unevictable(page, fe->vma_flags);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -3027,7 +3231,7 @@
}
static unsigned long fault_around_bytes __read_mostly =
- rounddown_pow_of_two(4096);
+ rounddown_pow_of_two(65536);
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
@@ -3301,7 +3505,7 @@
return VM_FAULT_SIGBUS;
if (!(fe->flags & FAULT_FLAG_WRITE))
return do_read_fault(fe, pgoff);
- if (!(vma->vm_flags & VM_SHARED))
+ if (!(fe->vma_flags & VM_SHARED))
return do_cow_fault(fe, pgoff);
return do_shared_fault(fe, pgoff);
}
@@ -3341,22 +3545,22 @@
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
+ if (!pte_spinlock(vma->vm_mm, fe))
+ return VM_FAULT_RETRY;
if (unlikely(!pte_same(*fe->pte, pte))) {
pte_unmap_unlock(fe->pte, fe->ptl);
goto out;
}
/* Make it present again */
- pte = pte_modify(pte, vma->vm_page_prot);
+ pte = pte_modify(pte, fe->vma_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
update_mmu_cache(vma, fe->address, fe->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = __vm_normal_page(vma, fe->address, pte, fe->vma_flags);
if (!page) {
pte_unmap_unlock(fe->pte, fe->ptl);
return 0;
@@ -3383,7 +3587,7 @@
* Flag if the page is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ if (page_mapcount(page) > 1 && (fe->vma_flags & VM_SHARED))
flags |= TNF_SHARED;
last_cpupid = page_cpupid_last(page);
@@ -3397,7 +3601,7 @@
}
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
+ migrated = migrate_misplaced_page(page, fe, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
@@ -3430,7 +3634,7 @@
fe->flags);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
+ VM_BUG_ON_VMA(fe->vma_flags & VM_SHARED, fe->vma);
split_huge_pmd(fe->vma, fe->pmd, fe->address);
return VM_FAULT_FALLBACK;
@@ -3458,17 +3662,26 @@
*/
static int handle_pte_fault(struct fault_env *fe)
{
- pte_t entry;
+ pte_t uninitialized_var(entry);
if (unlikely(pmd_none(*fe->pmd))) {
/*
+ * In the case of the speculative page fault handler we abort
+ * the speculative path immediately as the pmd is probably
+ * about to be converted into a huge one. We will try
+ * again holding the mmap_sem (which implies that the collapse
+ * operation is done).
+ */
+ if (fe->flags & FAULT_FLAG_SPECULATIVE)
+ return VM_FAULT_RETRY;
+ /*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
fe->pte = NULL;
- } else {
+ } else if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) {
/* See comment in pte_alloc_one_map() */
if (pmd_devmap_trans_unstable(fe->pmd))
return 0;
@@ -3477,6 +3690,9 @@
* pmd from under us anymore at this point because we hold the
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
+ * This is not applicable to the speculative page fault handler
+ * but in that case, the pte is fetched earlier in
+ * handle_speculative_fault().
*/
fe->pte = pte_offset_map(fe->pmd, fe->address);
@@ -3500,18 +3716,24 @@
if (!fe->pte) {
if (vma_is_anonymous(fe->vma))
return do_anonymous_page(fe);
+ else if (fe->flags & FAULT_FLAG_SPECULATIVE)
+ return VM_FAULT_RETRY;
else
return do_fault(fe);
}
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ if (fe->flags & FAULT_FLAG_SPECULATIVE)
+ entry = fe->orig_pte;
+#endif
if (!pte_present(entry))
return do_swap_page(fe, entry);
if (pte_protnone(entry) && vma_is_accessible(fe->vma))
return do_numa_page(fe, entry);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
+ if (!pte_spinlock(fe->vma->vm_mm, fe))
+ return VM_FAULT_RETRY;
if (unlikely(!pte_same(*fe->pte, entry)))
goto unlock;
if (fe->flags & FAULT_FLAG_WRITE) {
@@ -3551,6 +3773,8 @@
.vma = vma,
.address = address,
.flags = flags,
+ .vma_flags = vma->vm_flags,
+ .vma_page_prot = vma->vm_page_prot,
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -3570,7 +3794,9 @@
} else {
pmd_t orig_pmd = *fe.pmd;
int ret;
-
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ fe.sequence = raw_read_seqcount(&vma->vm_sequence);
+#endif
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
@@ -3591,6 +3817,247 @@
return handle_pte_fault(&fe);
}
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+
+#ifndef __HAVE_ARCH_PTE_SPECIAL
+/* This is required by vm_normal_page() */
+#error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL"
+#endif
+/*
+ * vm_normal_page() adds some processing which should be done while
+ * holding the mmap_sem.
+ */
+
+/*
+ * Tries to handle the page fault in a speculative way, without grabbing the
+ * mmap_sem.
+ * When VM_FAULT_RETRY is returned, the vma pointer is valid and this vma must
+ * be checked later when the mmap_sem has been grabbed by calling
+ * can_reuse_spf_vma().
+ * This is needed as the returned vma is kept in memory until the call to
+ * can_reuse_spf_vma() is made.
+ */
+int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+ unsigned int flags, struct vm_area_struct **vma)
+{
+ struct fault_env fe = {
+ .address = address,
+ };
+ pgd_t *pgd, pgdval;
+ pud_t *pud, pudval;
+ int seq, ret;
+
+ /* Clear flags that may lead to releasing the mmap_sem to retry */
+ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
+ flags |= FAULT_FLAG_SPECULATIVE;
+
+ *vma = get_vma(mm, address);
+ if (!*vma)
+ return VM_FAULT_RETRY;
+ fe.vma = *vma;
+
+ /* rmb <-> seqlock,vma_rb_erase() */
+ seq = raw_read_seqcount(&fe.vma->vm_sequence);
+ if (seq & 1) {
+ trace_spf_vma_changed(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ /*
+ * Can't call vm_ops services as we don't know what they would do
+ * with the VMA.
+ * This includes huge pages from hugetlbfs.
+ */
+ if (fe.vma->vm_ops) {
+ trace_spf_vma_notsup(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ /*
+ * __anon_vma_prepare() requires the mmap_sem to be held
+ * because vm_next and vm_prev must be safe. This can't be guaranteed
+ * in the speculative path.
+ */
+ if (unlikely(!fe.vma->anon_vma)) {
+ trace_spf_vma_notsup(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ fe.vma_flags = READ_ONCE(fe.vma->vm_flags);
+ fe.vma_page_prot = READ_ONCE(fe.vma->vm_page_prot);
+
+ /* Can't call userland page fault handler in the speculative path */
+ if (unlikely(fe.vma_flags & VM_UFFD_MISSING)) {
+ trace_spf_vma_notsup(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ if (fe.vma_flags & VM_GROWSDOWN || fe.vma_flags & VM_GROWSUP) {
+ /*
+ * This could be detected by checking the address against the VMA's
+ * boundaries but we want to trace it as not supported instead
+ * of changed.
+ */
+ trace_spf_vma_notsup(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ if (address < READ_ONCE(fe.vma->vm_start)
+ || READ_ONCE(fe.vma->vm_end) <= address) {
+ trace_spf_vma_changed(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ if (!arch_vma_access_permitted(fe.vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ goto out_segv;
+
+ /* This one is required to check that the VMA has write access set */
+ if (flags & FAULT_FLAG_WRITE) {
+ if (unlikely(!(fe.vma_flags & VM_WRITE)))
+ goto out_segv;
+ } else if (unlikely(!(fe.vma_flags & (VM_READ|VM_EXEC|VM_WRITE))))
+ goto out_segv;
+
+#ifdef CONFIG_NUMA
+ struct mempolicy *pol;
+
+ /*
+ * MPOL_INTERLEAVE implies additional checks in
+ * mpol_misplaced() which are not compatible with the
+ * speculative page fault processing.
+ */
+ pol = __get_vma_policy(fe.vma, address);
+ if (!pol)
+ pol = get_task_policy(current);
+
+ if (pol && pol->mode == MPOL_INTERLEAVE) {
+ trace_spf_vma_notsup(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+#endif
+
+ /*
+ * Do a speculative lookup of the PTE entry.
+ */
+ local_irq_disable();
+ pgd = pgd_offset(mm, address);
+ pgdval = READ_ONCE(*pgd);
+ if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
+ goto out_walk;
+
+ pud = pud_offset(pgd, address);
+ pudval = READ_ONCE(*pud);
+ if (pud_none(pudval) || unlikely(pud_bad(pudval)))
+ goto out_walk;
+
+ fe.pmd = pmd_offset(pud, address);
+ fe.orig_pmd = READ_ONCE(*fe.pmd);
+ /*
+ * pmd_none could mean that a hugepage collapse is in progress
+ * behind our back as collapse_huge_page() marks it before
+ * invalidating the pte (which is done once the IPI is caught
+ * by all CPUs and we have interrupts disabled).
+ * For this reason we cannot handle THP in a speculative way since we
+ * can't safely identify an in-progress collapse operation done
+ * behind our back on that PMD.
+ * Regarding the order of the following checks, see comment in
+ * pmd_devmap_trans_unstable()
+ */
+ if (unlikely(pmd_devmap(fe.orig_pmd) ||
+ pmd_none(fe.orig_pmd) || pmd_trans_huge(fe.orig_pmd)))
+ goto out_walk;
+
+ /*
+ * The above does not allocate/instantiate page-tables because doing so
+ * would lead to the possibility of instantiating page-tables after
+ * free_pgtables() -- and consequently leaking them.
+ *
+ * The result is that we take at least one !speculative fault per PMD
+ * in order to instantiate it.
+ */
+
+ fe.pte = pte_offset_map(fe.pmd, address);
+ fe.orig_pte = READ_ONCE(*fe.pte);
+ barrier(); /* See comment in handle_pte_fault() */
+ if (pte_none(fe.orig_pte)) {
+ pte_unmap(fe.pte);
+ fe.pte = NULL;
+ }
+
+ fe.sequence = seq;
+ fe.flags = flags;
+
+ local_irq_enable();
+
+ /*
+ * We need to re-validate the VMA after checking the bounds, otherwise
+ * we might have a false positive on the bounds.
+ */
+ if (read_seqcount_retry(&fe.vma->vm_sequence, seq)) {
+ trace_spf_vma_changed(_RET_IP_, fe.vma, address);
+ return VM_FAULT_RETRY;
+ }
+
+ mem_cgroup_oom_enable();
+ ret = handle_pte_fault(&fe);
+ mem_cgroup_oom_disable();
+
+ /*
+ * If there is no need to retry, don't return the vma to the caller.
+ */
+ if (ret != VM_FAULT_RETRY) {
+ count_vm_event(SPECULATIVE_PGFAULT);
+ put_vma(fe.vma);
+ *vma = NULL;
+ }
+
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
+ return ret;
+
+out_walk:
+ trace_spf_vma_notsup(_RET_IP_, fe.vma, address);
+ local_irq_enable();
+ return VM_FAULT_RETRY;
+
+out_segv:
+ trace_spf_vma_access(_RET_IP_, fe.vma, address);
+ /*
+ * We don't return VM_FAULT_RETRY so the caller is not expected to
+ * retrieve the fetched VMA.
+ */
+ put_vma(fe.vma);
+ *vma = NULL;
+ return VM_FAULT_SIGSEGV;
+}
+
+/*
+ * This is used to know if the vma fetched in the speculative page fault handler
+ * is still valid when trying the regular fault path while holding the
+ * mmap_sem.
+ * The call to put_vma(vma) must be made after checking the vma's fields, as
+ * the vma may be freed by put_vma(). In such a case it is expected that false
+ * is returned.
+ */
+bool can_reuse_spf_vma(struct vm_area_struct *vma, unsigned long address)
+{
+ bool ret;
+
+ ret = !RB_EMPTY_NODE(&vma->vm_rb) &&
+ vma->vm_start <= address && address < vma->vm_end;
+ put_vma(vma);
+ return ret;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
/*
* By the time we get here, we already hold the mm semaphore
*
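
__handle_speculative_fault() and can_reuse_spf_vma() above define the contract an architecture's fault handler is expected to follow; the arch wiring itself is not part of this hunk. A hedged sketch of such a caller (the function name is hypothetical, and the usual access/growsdown checks are elided) could look like:

/* Sketch only: arch-side fallback from the speculative to the classic path. */
static int example_do_user_page_fault(struct mm_struct *mm,
				      unsigned long address, unsigned int flags)
{
	struct vm_area_struct *vma = NULL;
	int fault;

	fault = __handle_speculative_fault(mm, address, flags, &vma);
	if (fault != VM_FAULT_RETRY)
		return fault;			/* handled without mmap_sem */

	down_read(&mm->mmap_sem);
	/* Reuse the fetched VMA only if it still covers this address. */
	if (!vma || !can_reuse_spf_vma(vma, address))
		vma = find_vma(mm, address);
	if (vma)
		fault = handle_mm_fault(vma, address, flags);
	else
		fault = VM_FAULT_SIGSEGV;	/* no mapping: report the fault */
	up_read(&mm->mmap_sem);
	return fault;
}
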
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d0adeef..ecbe6ec 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -442,8 +442,11 @@
struct vm_area_struct *vma;
down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ vm_write_begin(vma);
mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
+ vm_write_end(vma);
+ }
up_write(&mm->mmap_sem);
}
@@ -601,9 +604,11 @@
{
int nr_updated;
+ vm_write_begin(vma);
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+ vm_write_end(vma);
return nr_updated;
}
@@ -704,6 +709,7 @@
if (IS_ERR(new))
return PTR_ERR(new);
+ vm_write_begin(vma);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
@@ -711,11 +717,17 @@
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_sem */
+ /*
+ * The speculative page fault handler accesses this field without
+ * holding the mmap_sem.
+ */
+ WRITE_ONCE(vma->vm_policy, new);
+ vm_write_end(vma);
mpol_put(old);
return 0;
err_out:
+ vm_write_end(vma);
mpol_put(new);
return err;
}
@@ -1586,23 +1598,28 @@
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
- struct mempolicy *pol = NULL;
+ struct mempolicy *pol;
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- pol = vma->vm_ops->get_policy(vma, addr);
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
+ if (!vma)
+ return NULL;
- /*
- * shmem_alloc_page() passes MPOL_F_SHARED policy with
- * a pseudo vma whose vma->vm_ops=NULL. Take a reference
- * count on these policies which will be dropped by
- * mpol_cond_put() later
- */
- if (mpol_needs_cond_ref(pol))
- mpol_get(pol);
- }
+ if (vma->vm_ops && vma->vm_ops->get_policy)
+ return vma->vm_ops->get_policy(vma, addr);
+
+ /*
+ * This could be called without holding the mmap_sem in the
+ * speculative page fault handler's path.
+ */
+ pol = READ_ONCE(vma->vm_policy);
+ if (pol) {
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
}
return pol;
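
Because __get_vma_policy() can now run without mmap_sem, the speculative handler only has to read the policy and refuse the cases it cannot evaluate locklessly. A small sketch mirroring the NUMA test in __handle_speculative_fault() (the helper name is hypothetical):

/* Sketch only: lockless policy check used to decide whether SPF is allowed. */
static bool example_policy_allows_spf(struct vm_area_struct *vma,
				      unsigned long address)
{
	struct mempolicy *pol = __get_vma_policy(vma, address);

	if (!pol)
		pol = get_task_policy(current);

	/* MPOL_INTERLEAVE needs mpol_misplaced(), which requires mmap_sem. */
	return !(pol && pol->mode == MPOL_INTERLEAVE);
}
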
diff --git a/mm/migrate.c b/mm/migrate.c
index eb1f043..595b456 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -241,7 +241,7 @@
/* Recheck VMA as permissions can change since migration started */
if (is_write_migration_entry(entry))
- pte = maybe_mkwrite(pte, vma);
+ pte = maybe_mkwrite(pte, vma->vm_flags);
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
@@ -1855,7 +1855,7 @@
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_page(struct page *page, struct fault_env *fe,
int node)
{
pg_data_t *pgdat = NODE_DATA(node);
@@ -1868,7 +1868,7 @@
* with execute permissions as they are probably shared libraries.
*/
if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
- (vma->vm_flags & VM_EXEC))
+ (fe->vma_flags & VM_EXEC))
goto out;
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 9cdd063..f648acb 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -438,7 +438,9 @@
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
+ vm_write_end(vma);
while (start < end) {
struct page *page;
@@ -563,10 +565,11 @@
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
-
- if (lock)
- vma->vm_flags = newflags;
- else
+ if (lock) {
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, newflags);
+ vm_write_end(vma);
+ } else
munlock_vma_pages_range(vma, start, end);
out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 5df92da..f549597 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -160,6 +160,27 @@
}
}
+static void __free_vma(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+void put_vma(struct vm_area_struct *vma)
+{
+ if (atomic_dec_and_test(&vma->vm_ref_count))
+ __free_vma(vma);
+}
+#else
+static inline void put_vma(struct vm_area_struct *vma)
+{
+ __free_vma(vma);
+}
+#endif
+
/*
* Close a vm structure and free it, returning the next.
*/
@@ -170,10 +191,7 @@
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
- fput(vma->vm_file);
- mpol_put(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ put_vma(vma);
return next;
}
@@ -391,6 +409,14 @@
#define validate_mm(mm) do { } while (0)
#endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock)
+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
+#else
+#define mm_rb_write_lock(mm) do { } while (0)
+#define mm_rb_write_unlock(mm) do { } while (0)
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
@@ -409,26 +435,37 @@
}
static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
+ struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
+
/* All rb_subtree_gap values must be consistent prior to insertion */
validate_mm_rb(root, NULL);
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
/*
* Note rb_erase_augmented is a fairly large inline function,
* so make sure we instantiate it only once with our desired
* augmented rbtree callbacks.
*/
+ mm_rb_write_lock(mm);
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+ mm_rb_write_unlock(mm); /* wmb */
+
+ /*
+ * Ensure the removal is complete before clearing the node.
+ * Matched by vma_has_changed()/handle_speculative_fault().
+ */
+ RB_CLEAR_NODE(&vma->vm_rb);
}
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
+ struct mm_struct *mm,
struct vm_area_struct *ignore)
{
/*
@@ -436,21 +473,21 @@
* with the possible exception of the "next" vma being erased if
* next->vm_start was reduced.
*/
- validate_mm_rb(root, ignore);
+ validate_mm_rb(&mm->mm_rb, ignore);
- __vma_rb_erase(vma, root);
+ __vma_rb_erase(vma, mm);
}
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
+ struct mm_struct *mm)
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
* with the possible exception of the vma being erased.
*/
- validate_mm_rb(root, vma);
+ validate_mm_rb(&mm->mm_rb, vma);
- __vma_rb_erase(vma, root);
+ __vma_rb_erase(vma, mm);
}
/*
@@ -565,10 +602,12 @@
* immediately update the gap to the correct value. Finally we
* rebalance the rbtree after all augmented values have been set.
*/
+ mm_rb_write_lock(mm);
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
vma->rb_subtree_gap = 0;
vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
+ vma_rb_insert(vma, mm);
+ mm_rb_write_unlock(mm);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -644,7 +683,7 @@
{
struct vm_area_struct *next;
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+ vma_rb_erase_ignore(vma, mm, ignore);
next = vma->vm_next;
if (has_prev)
prev->vm_next = next;
@@ -678,7 +717,7 @@
*/
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
- struct vm_area_struct *expand)
+ struct vm_area_struct *expand, bool keep_locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@@ -690,6 +729,30 @@
long adjust_next = 0;
int remove_next = 0;
+ /*
+ * Why use the vm_raw_write*() functions here? To avoid lockdep's warning.
+ *
+ * Lockdep is complaining about a theoretical lock dependency, involving
+ * 3 locks:
+ * mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
+ *
+ * Here are the major paths leading to this dependency:
+ * 1. __vma_adjust() mmap_sem -> vm_sequence -> i_mmap_rwsem
+ * 2. move_vmap() mmap_sem -> vm_sequence -> fs_reclaim
+ * 3. __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem
+ * 4. unmap_mapping_range() i_mmap_rwsem -> vm_sequence
+ *
+ * So there is no way to solve this easily, especially because in
+ * unmap_mapping_range() the i_mmap_rwsem is grabbed while the impacted
+ * VMAs are not yet known.
+ * However, the way vm_seq is used guarantees that we will
+ * never block on it since we just check for its value and never wait
+ * for it to move, see vma_has_changed() and handle_speculative_fault().
+ */
+ vm_raw_write_begin(vma);
+ if (next)
+ vm_raw_write_begin(next);
+
if (next && !insert) {
struct vm_area_struct *exporter = NULL, *importer = NULL;
@@ -770,8 +833,12 @@
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
- if (error)
+ if (error) {
+ if (next && next != vma)
+ vm_raw_write_end(next);
+ vm_raw_write_end(vma);
return error;
+ }
}
}
again:
@@ -817,17 +884,18 @@
}
if (start != vma->vm_start) {
- vma->vm_start = start;
+ WRITE_ONCE(vma->vm_start, start);
start_changed = true;
}
if (end != vma->vm_end) {
- vma->vm_end = end;
+ WRITE_ONCE(vma->vm_end, end);
end_changed = true;
}
- vma->vm_pgoff = pgoff;
+ WRITE_ONCE(vma->vm_pgoff, pgoff);
if (adjust_next) {
- next->vm_start += adjust_next << PAGE_SHIFT;
- next->vm_pgoff += adjust_next;
+ WRITE_ONCE(next->vm_start,
+ next->vm_start + (adjust_next << PAGE_SHIFT));
+ WRITE_ONCE(next->vm_pgoff, next->vm_pgoff + adjust_next);
}
if (root) {
@@ -892,15 +960,13 @@
}
if (remove_next) {
- if (file) {
+ if (file)
uprobe_munmap(next, next->vm_start, next->vm_end);
- fput(file);
- }
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ vm_raw_write_end(next);
+ put_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@@ -914,6 +980,8 @@
* "vma->vm_next" gap must be updated.
*/
next = vma->vm_next;
+ if (next)
+ vm_raw_write_begin(next);
} else {
/*
* For the scope of the comment "next" and
@@ -960,6 +1028,11 @@
if (insert && file)
uprobe_mmap(insert);
+ if (next && next != vma)
+ vm_raw_write_end(next);
+ if (!keep_locked)
+ vm_raw_write_end(vma);
+
validate_mm(mm);
return 0;
@@ -1099,13 +1172,13 @@
* parameter) may establish ptes with the wrong permissions of NNNN
* instead of the right permissions of XXXX.
*/
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char __user *anon_name)
+ const char __user *anon_name, bool keep_locked)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1155,10 +1228,11 @@
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
- prev);
+ prev, keep_locked);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
+ end, prev->vm_pgoff, NULL, prev,
+ keep_locked);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1176,10 +1250,12 @@
anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
+ addr, prev->vm_pgoff, NULL, next,
+ keep_locked);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
+ next->vm_pgoff - pglen, NULL, next,
+ keep_locked);
/*
* In case 3 area is already equal to next and
* this is a noop, but in case 8 "area" has
@@ -1672,7 +1748,7 @@
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
if (file) {
if (vm_flags & VM_DENYWRITE) {
@@ -1725,13 +1801,15 @@
out:
perf_event_mmap(vma);
+ vm_write_begin(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ WRITE_ONCE(vma->vm_flags,
+ vma->vm_flags & VM_LOCKED_CLEAR_MASK);
}
if (file)
@@ -1744,9 +1822,10 @@
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
- vma->vm_flags |= VM_SOFTDIRTY;
+ WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
vma_set_page_prot(vma);
+ vm_write_end(vma);
return addr;
@@ -2118,15 +2197,11 @@
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *__find_vma(struct mm_struct *mm,
+ unsigned long addr)
{
struct rb_node *rb_node;
- struct vm_area_struct *vma;
-
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
+ struct vm_area_struct *vma = NULL;
rb_node = mm->mm_rb.rb_node;
@@ -2144,13 +2219,40 @@
rb_node = rb_node->rb_right;
}
+ return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ /* Check the cache first. */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ vma = __find_vma(mm, addr);
if (vma)
vmacache_update(addr, vma);
return vma;
}
-
EXPORT_SYMBOL(find_vma);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL;
+
+ read_lock(&mm->mm_rb_lock);
+ vma = __find_vma(mm, addr);
+ if (vma)
+ atomic_inc(&vma->vm_ref_count);
+ read_unlock(&mm->mm_rb_lock);
+
+ return vma;
+}
+#endif
+
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
@@ -2380,8 +2482,8 @@
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
+ WRITE_ONCE(vma->vm_start, address);
+ WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
@@ -2523,7 +2625,7 @@
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
- vma_rb_erase(vma, &mm->mm_rb);
+ vma_rb_erase(vma, mm);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
@@ -2563,7 +2665,7 @@
/* most fields are the same, copy all, and then fixup */
*new = *vma;
- INIT_LIST_HEAD(&new->anon_vma_chain);
+ INIT_VMA(new);
if (new_below)
new->vm_end = addr;
@@ -2920,7 +3022,7 @@
return -ENOMEM;
}
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
@@ -3083,9 +3185,21 @@
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
+
+ /* There are 3 cases to manage here:
+ * AAAA AAAA AAAA AAAA
+ * PPPP.... PPPP......NNNN PPPP....NNNN PP........NN
+ * PPPPPPPP(A) PPPP..NNNNNNNN(B) PPPPPPPPPPPP(1) NULL
+ * PPPPPPPPNNNN(2)
+ * PPPPNNNNNNNN(3)
+ *
+ * new_vma == prev in case A,1,2
+ * new_vma == next in case B,3
+ */
+ new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff,
+ vma_policy(vma), vma->vm_userfaultfd_ctx,
+ vma_get_anon_name(vma), true);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3118,13 +3232,22 @@
new_vma->vm_pgoff = pgoff;
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
- INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ INIT_VMA(new_vma);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
+ /*
+ * As the VMA is linked right now, it may be hit by the
+ * speculative page fault handler. But we don't want it to
+ * start mapping pages in this area until the caller has
+ * potentially moved the ptes from the moved VMA. To prevent
+ * that we protect it right now, and let the caller unprotect
+ * it once the move is done.
+ */
+ vm_raw_write_begin(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
*need_rmap_locks = false;
}
@@ -3256,7 +3379,7 @@
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
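
get_vma()/put_vma() above give lockless users a pinned VMA: the lookup walks the rbtree with mm_rb_lock held for reading and elevates vm_ref_count, so the VMA cannot be freed until the matching put. A usage sketch (the helper name is hypothetical):

/* Sketch only: reference-counted VMA lookup without taking mmap_sem. */
static bool example_addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = get_vma(mm, addr);
	bool mapped = false;

	if (vma) {
		/* get_vma() returns the first VMA with addr < vm_end. */
		mapped = READ_ONCE(vma->vm_start) <= addr &&
			 addr < READ_ONCE(vma->vm_end);
		put_vma(vma);	/* drop the vm_ref_count pin */
	}
	return mapped;
}
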
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 1f2c969..60b16418 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -329,12 +329,14 @@
* vm_flags and vm_page_prot are protected by the mmap_sem
* held in write mode.
*/
- vma->vm_flags = newflags;
+ vm_write_begin(vma);
+ WRITE_ONCE(vma->vm_flags, newflags);
dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
+ vm_write_end(vma);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
diff --git a/mm/mremap.c b/mm/mremap.c
index 1597671..2302762 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -289,6 +289,14 @@
if (!new_vma)
return -ENOMEM;
+ /* new_vma is returned protected by copy_vma, to prevent speculative
+ * page faults from being handled in the destination area before we move
+ * the ptes. Now, we must also protect the source VMA since we don't want
+ * pages to be mapped behind our back while we are copying the PTEs.
+ */
+ if (vma != new_vma)
+ vm_raw_write_begin(vma);
+
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
if (moved_len < old_len) {
@@ -305,6 +313,8 @@
*/
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
true);
+ if (vma != new_vma)
+ vm_raw_write_end(vma);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
@@ -312,7 +322,10 @@
} else {
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
+ if (vma != new_vma)
+ vm_raw_write_end(vma);
}
+ vm_raw_write_end(new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 44265e0..d033ee8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1243,7 +1243,7 @@
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
vma->vm_flags = vm_flags;
vma->vm_pgoff = pgoff;
diff --git a/mm/rmap.c b/mm/rmap.c
index 4d19dd1..24470a6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1266,7 +1266,7 @@
}
/**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
@@ -1276,12 +1276,11 @@
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
-void page_add_new_anon_rmap(struct page *page,
+void __page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
{
int nr = compound ? hpage_nr_pages(page) : 1;
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
diff --git a/mm/swap.c b/mm/swap.c
index 6f22754..5827225 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -468,12 +468,12 @@
* directly back onto it's zone's unevictable list, it does NOT use a
* per cpu pagevec.
*/
-void lru_cache_add_active_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+ unsigned long vma_flags)
{
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+ if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
SetPageActive(page);
lru_cache_add(page);
return;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5ac5846..35f882d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -467,6 +467,10 @@
* the readahead.
*
* Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ * This is needed to ensure the VMA will not be freed behind our back. In the case
+ * of the speculative page fault handler, this cannot happen, even if we don't
+ * hold the mmap_sem. Callees are assumed to take care of reading VMA's fields
+ * using READ_ONCE() to read consistent values.
*/
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8bd62ed..3d128da 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1088,7 +1088,10 @@
"vmacache_find_hits",
"vmacache_full_flushes",
#endif
-#endif /* CONFIG_VM_EVENTS_COUNTERS */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ "speculative_pgfault"
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
diff --git a/security/pfe/pfk_ice.c b/security/pfe/pfk_ice.c
index 16ed516..e1d0100 100644
--- a/security/pfe/pfk_ice.c
+++ b/security/pfe/pfk_ice.c
@@ -121,7 +121,7 @@
goto out;
}
- ret = scm_call2(smc_id, &desc);
+ ret = scm_call2_noretry(smc_id, &desc);
if (ret) {
pr_err("%s: Set Key Error: %d\n", __func__, ret);
@@ -134,7 +134,7 @@
smc_id = TZ_ES_INVALIDATE_ICE_KEY_ID;
desc.arginfo = TZ_ES_INVALIDATE_ICE_KEY_PARAM_ID;
desc.args[0] = index;
- ret1 = scm_call2(smc_id, &desc);
+ ret1 = scm_call2_noretry(smc_id, &desc);
if (ret1)
pr_err("%s: Invalidate Key Error: %d\n", __func__,
ret1);
@@ -175,7 +175,7 @@
return ret;
}
- ret = scm_call2(smc_id, &desc);
+ ret = scm_call2_noretry(smc_id, &desc);
if (ret) {
pr_err("%s: Error: 0x%x\n", __func__, ret);