mm: swap: don't delay swap free for fast swap devices There are a couple of issues with swapcache usage when ZRAM is used as a swap device. 1) The kernel does a swap readahead which can be around 6 to 8 pages depending on total RAM, which is not required for zram since accesses are fast. 2) The kernel delays the freeing up of swapcache expecting a later hit, which again is useless in the case of zram. 3) This is not related to swapcache, but zram usage itself. As mentioned in (2) the kernel delays freeing of swapcache, but along with that it delays freeing of the zram compressed page also, i.e. there can be 2 copies, though one is compressed. This patch addresses these issues using two new flags, QUEUE_FLAG_FAST and SWP_FAST, to indicate that accesses to the device will be fast and cheap, and instructs the swap layer to free up swap space aggressively and not to do readahead. Change-Id: I5d2d5176a5f9420300bb2f843f6ecbdb25ea80e4 Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>

commit: bb730a17fa1e3fd4047ba72e0f1795883754e4e6 [log] [tgz]
author: Vinayak Menon <vinmenon@codeaurora.org> Wed Feb 25 19:43:59 2015 +0530
committer: Vinayak Menon <vinmenon@codeaurora.org> Wed Jul 19 18:41:26 2017 +0530
tree: 8ec897a31083a82c7ba69a4267eb788b9ca9ce32
parent: 99fddf1fcf7087d9d07d5fc9950944616528afb7 [diff]
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index c9914d65..3868665 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c

@@ -1271,6 +1271,7 @@
 	zram->disk->private_data = zram;
 	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
 
+	__set_bit(QUEUE_FLAG_FAST, &zram->disk->queue->queue_flags);
 	/* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
 	set_capacity(zram->disk, 0);
 	/* zram devices sort of resembles non-rotational disks */

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fb910c6..0693c3e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h

@@ -508,6 +508,7 @@
 #define QUEUE_FLAG_FUA	       24	/* device supports FUA writes */
 #define QUEUE_FLAG_FLUSH_NQ    25	/* flush not queueuable */
 #define QUEUE_FLAG_DAX         26	/* device supports DAX */
+#define QUEUE_FLAG_FAST        27	/* fast block device (e.g. ram based) */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -598,6 +599,7 @@
 #define blk_queue_secure_erase(q) \
 	(test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags))
 #define blk_queue_dax(q)	test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
+#define blk_queue_fast(q)	test_bit(QUEUE_FLAG_FAST, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 55ff559..d0af005 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h

@@ -151,8 +151,9 @@
 	SWP_AREA_DISCARD = (1 << 8),	/* single-time swap area discards */
 	SWP_PAGE_DISCARD = (1 << 9),	/* freed swap page-cluster discards */
 	SWP_STABLE_WRITES = (1 << 10),	/* no overwrite PG_writeback pages */
+	SWP_FAST	= (1 << 11),	/* blkdev access is fast and cheap */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 11),	/* refcount in scan_swap_map */
+	SWP_SCANNING	= (1 << 12),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
@@ -389,10 +390,18 @@
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+extern bool is_swap_fast(swp_entry_t entry);
 
 /* Swap 50% full? Release swapcache more aggressively.. */
-static inline bool vm_swap_full(void)
+static inline bool vm_swap_full(struct swap_info_struct *si)
 {
+	/*
+	 * If the swap device is fast, return true
+	 * not to delay swap free.
+	 */
+	if (si->flags & SWP_FAST)
+		return true;
+
 	return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
 }
 
@@ -428,7 +437,7 @@
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
 #define total_swapcache_pages()			0UL
-#define vm_swap_full()				0
+#define vm_swap_full(si)			0
 
 #define si_swapinfo(val) \
 	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
@@ -579,7 +588,7 @@
 
 static inline bool mem_cgroup_swap_full(struct page *page)
 {
-	return vm_swap_full();
+	return vm_swap_full(page_swap_info(page));
 }
 #endif
 

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fdc790a..3b38b73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c

@@ -5996,7 +5996,7 @@
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-	if (vm_swap_full())
+	if (vm_swap_full(page_swap_info(page)))
 		return true;
 	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return false;

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 35d7e0e..8d8b850 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c

@@ -475,7 +475,8 @@
 	unsigned long entry_offset = swp_offset(entry);
 	unsigned long offset = entry_offset;
 	unsigned long start_offset, end_offset;
 	unsigned long mask;
 	struct blk_plug plug;
 
-	mask = swapin_nr_pages(offset) - 1;
+	/* No readahead window for fast (SWP_FAST) swap devices. */
+	mask = is_swap_fast(entry) ? 0 : swapin_nr_pages(offset) - 1;

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d76b2a1..daf1480 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c

@@ -97,6 +97,26 @@
 	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
+bool is_swap_fast(swp_entry_t entry)	/* true only if entry's swap area has SWP_FAST */
+{
+	struct swap_info_struct *p;
+	unsigned long type;
+
+	if (non_swap_entry(entry))
+		return false;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)	/* guard the swap_info[] index below */
+		return false;
+
+	p = swap_info[type];
+
+	if (p->flags & SWP_FAST)	/* set at swapon when the queue has QUEUE_FLAG_FAST */
+		return true;
+
+	return false;
+}
+
 /* returns 1 if swap entry is freed */
 static int
 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
@@ -573,7 +593,7 @@
 		scan_base = offset = si->lowest_bit;
 
 	/* reuse swap entry of cache-only swap if not busy. */
-	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+	if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
 		spin_unlock(&si->lock);
 		swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -613,7 +633,8 @@
 			spin_lock(&si->lock);
 			goto checks;
 		}
-		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+		if (vm_swap_full(si) &&
+			si->swap_map[offset] == SWAP_HAS_CACHE) {
 			spin_lock(&si->lock);
 			goto checks;
 		}
@@ -628,7 +649,8 @@
 			spin_lock(&si->lock);
 			goto checks;
 		}
-		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+		if (vm_swap_full(si) &&
+			si->swap_map[offset] == SWAP_HAS_CACHE) {
 			spin_lock(&si->lock);
 			goto checks;
 		}
@@ -2537,6 +2559,9 @@
 				pr_err("swapon: discard_swap(%p): %d\n",
 					p, err);
 		}
+
+		if (blk_queue_fast(bdev_get_queue(p->bdev)))
+			p->flags |= SWP_FAST;
 	}
 
 	mutex_lock(&swapon_mutex);
commit	bb730a17fa1e3fd4047ba72e0f1795883754e4e6	[log] [tgz]
author	Vinayak Menon <vinmenon@codeaurora.org>	Wed Feb 25 19:43:59 2015 +0530
committer	Vinayak Menon <vinmenon@codeaurora.org>	Wed Jul 19 18:41:26 2017 +0530
tree	8ec897a31083a82c7ba69a4267eb788b9ca9ce32
parent	99fddf1fcf7087d9d07d5fc9950944616528afb7 [diff]