Blame - mm/zswap.c - kernel/msm-4.9

blob: deda2b671e128600c817714754a6d58decb0c50a [file] [log] [blame]

Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	1	/*
				2	* zswap.c - zswap driver file
				3	*
				4	* zswap is a backend for frontswap that takes pages that are in the process
				5	* of being swapped out and attempts to compress and store them in a
				6	* RAM-based memory pool. This can result in a significant I/O reduction on
				7	* the swap device and, in the case where decompressing from RAM is faster
				8	* than reading from the swap device, can also improve workload performance.
				9	*
				10	* Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
				11	*
				12	* This program is free software; you can redistribute it and/or
				13	* modify it under the terms of the GNU General Public License
				14	* as published by the Free Software Foundation; either version 2
				15	* of the License, or (at your option) any later version.
				16	*
				17	* This program is distributed in the hope that it will be useful,
				18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				20	* GNU General Public License for more details.
				21	*/
				22
				23	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				24
				25	#include <linux/module.h>
				26	#include <linux/cpu.h>
				27	#include <linux/highmem.h>
				28	#include <linux/slab.h>
				29	#include <linux/spinlock.h>
				30	#include <linux/types.h>
				31	#include <linux/atomic.h>
				32	#include <linux/frontswap.h>
				33	#include <linux/rbtree.h>
				34	#include <linux/swap.h>
				35	#include <linux/crypto.h>
				36	#include <linux/mempool.h>
				37	#include <linux/zbud.h>
				38
				39	#include <linux/mm_types.h>
				40	#include <linux/page-flags.h>
				41	#include <linux/swapops.h>
				42	#include <linux/writeback.h>
				43	#include <linux/pagemap.h>
				44
				45	/*********************************
				46	* statistics
				47	**********************************/
				48	/* Number of memory pages used by the compressed pool */
				49	static u64 zswap_pool_pages;
				50	/* The number of compressed pages currently stored in zswap */
				51	static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
				52
				53	/*
				54	* The statistics below are not protected from concurrent access for
				55	* performance reasons so they may not be a 100% accurate. However,
				56	* they do provide useful information on roughly how many times a
				57	* certain event is occurring.
				58	*/
				59
				60	/* Pool limit was hit (see zswap_max_pool_percent) */
				61	static u64 zswap_pool_limit_hit;
				62	/* Pages written back when pool limit was reached */
				63	static u64 zswap_written_back_pages;
				64	/* Store failed due to a reclaim failure after pool limit was reached */
				65	static u64 zswap_reject_reclaim_fail;
				66	/* Compressed page was too big for the allocator to (optimally) store */
				67	static u64 zswap_reject_compress_poor;
				68	/* Store failed because underlying allocator could not get memory */
				69	static u64 zswap_reject_alloc_fail;
				70	/* Store failed because the entry metadata could not be allocated (rare) */
				71	static u64 zswap_reject_kmemcache_fail;
				72	/* Duplicate store was encountered (rare) */
				73	static u64 zswap_duplicate_entry;
				74
				75	/*********************************
				76	* tunables
				77	**********************************/
				78	/* Enable/disable zswap (disabled by default, fixed at boot for now) */
				79	static bool zswap_enabled __read_mostly;
				80	module_param_named(enabled, zswap_enabled, bool, 0);
				81
				82	/* Compressor to be used by zswap (fixed at boot for now) */
				83	#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
				84	static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
				85	module_param_named(compressor, zswap_compressor, charp, 0);
				86
				87	/* The maximum percentage of memory that the compressed pool can occupy */
				88	static unsigned int zswap_max_pool_percent = 20;
				89	module_param_named(max_pool_percent,
				90	zswap_max_pool_percent, uint, 0644);
				91
				92	/*********************************
				93	* compression functions
				94	**********************************/
				95	/* per-cpu compression transforms */
				96	static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
				97
				98	enum comp_op {
				99	ZSWAP_COMPOP_COMPRESS,
				100	ZSWAP_COMPOP_DECOMPRESS
				101	};
				102
				103	static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
				104	u8 dst, unsigned int dlen)
				105	{
				106	struct crypto_comp *tfm;
				107	int ret;
				108
				109	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
				110	switch (op) {
				111	case ZSWAP_COMPOP_COMPRESS:
				112	ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
				113	break;
				114	case ZSWAP_COMPOP_DECOMPRESS:
				115	ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
				116	break;
				117	default:
				118	ret = -EINVAL;
				119	}
				120
				121	put_cpu();
				122	return ret;
				123	}
				124
				125	static int __init zswap_comp_init(void)
				126	{
				127	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
				128	pr_info("%s compressor not available\n", zswap_compressor);
				129	/* fall back to default compressor */
				130	zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
				131	if (!crypto_has_comp(zswap_compressor, 0, 0))
				132	/* can't even load the default compressor */
				133	return -ENODEV;
				134	}
				135	pr_info("using %s compressor\n", zswap_compressor);
				136
				137	/* alloc percpu transforms */
				138	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
				139	if (!zswap_comp_pcpu_tfms)
				140	return -ENOMEM;
				141	return 0;
				142	}
				143
				144	static void zswap_comp_exit(void)
				145	{
				146	/* free percpu transforms */
				147	if (zswap_comp_pcpu_tfms)
				148	free_percpu(zswap_comp_pcpu_tfms);
				149	}
				150
				151	/*********************************
				152	* data structures
				153	**********************************/
				154	/*
				155	* struct zswap_entry
				156	*
				157	* This structure contains the metadata for tracking a single compressed
				158	* page within zswap.
				159	*
				160	* rbnode - links the entry into red-black tree for the appropriate swap type
				161	* refcount - the number of outstanding reference to the entry. This is needed
				162	* to protect against premature freeing of the entry by code
				163	* concurent calls to load, invalidate, and writeback. The lock
				164	* for the zswap_tree structure that contains the entry must
				165	* be held while changing the refcount. Since the lock must
				166	* be held, there is no reason to also make refcount atomic.
				167	* offset - the swap offset for the entry. Index into the red-black tree.
				168	* handle - zsmalloc allocation handle that stores the compressed page data
				169	* length - the length in bytes of the compressed page data. Needed during
				170	* decompression
				171	*/
				172	struct zswap_entry {
				173	struct rb_node rbnode;
				174	pgoff_t offset;
				175	int refcount;
				176	unsigned int length;
				177	unsigned long handle;
				178	};
				179
				180	struct zswap_header {
				181	swp_entry_t swpentry;
				182	};
				183
				184	/*
				185	* The tree lock in the zswap_tree struct protects a few things:
				186	* - the rbtree
				187	* - the refcount field of each entry in the tree
				188	*/
				189	struct zswap_tree {
				190	struct rb_root rbroot;
				191	spinlock_t lock;
				192	struct zbud_pool *pool;
				193	};
				194
				195	static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
				196
				197	/*********************************
				198	* zswap entry functions
				199	**********************************/
				200	static struct kmem_cache *zswap_entry_cache;
				201
				202	static int zswap_entry_cache_create(void)
				203	{
				204	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
				205	return (zswap_entry_cache == NULL);
				206	}
				207
				208	static void zswap_entry_cache_destory(void)
				209	{
				210	kmem_cache_destroy(zswap_entry_cache);
				211	}
				212
				213	static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
				214	{
				215	struct zswap_entry *entry;
				216	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
				217	if (!entry)
				218	return NULL;
				219	entry->refcount = 1;
				220	return entry;
				221	}
				222
				223	static void zswap_entry_cache_free(struct zswap_entry *entry)
				224	{
				225	kmem_cache_free(zswap_entry_cache, entry);
				226	}
				227
				228	/* caller must hold the tree lock */
				229	static void zswap_entry_get(struct zswap_entry *entry)
				230	{
				231	entry->refcount++;
				232	}
				233
				234	/* caller must hold the tree lock */
				235	static int zswap_entry_put(struct zswap_entry *entry)
				236	{
				237	entry->refcount--;
				238	return entry->refcount;
				239	}
				240
				241	/*********************************
				242	* rbtree functions
				243	**********************************/
				244	static struct zswap_entry zswap_rb_search(struct rb_root root, pgoff_t offset)
				245	{
				246	struct rb_node *node = root->rb_node;
				247	struct zswap_entry *entry;
				248
				249	while (node) {
				250	entry = rb_entry(node, struct zswap_entry, rbnode);
				251	if (entry->offset > offset)
				252	node = node->rb_left;
				253	else if (entry->offset < offset)
				254	node = node->rb_right;
				255	else
				256	return entry;
				257	}
				258	return NULL;
				259	}
				260
				261	/*
				262	* In the case that a entry with the same offset is found, a pointer to
				263	* the existing entry is stored in dupentry and the function returns -EEXIST
				264	*/
				265	static int zswap_rb_insert(struct rb_root root, struct zswap_entry entry,
				266	struct zswap_entry **dupentry)
				267	{
				268	struct rb_node *link = &root->rb_node, parent = NULL;
				269	struct zswap_entry *myentry;
				270
				271	while (*link) {
				272	parent = *link;
				273	myentry = rb_entry(parent, struct zswap_entry, rbnode);
				274	if (myentry->offset > entry->offset)
				275	link = &(*link)->rb_left;
				276	else if (myentry->offset < entry->offset)
				277	link = &(*link)->rb_right;
				278	else {
				279	*dupentry = myentry;
				280	return -EEXIST;
				281	}
				282	}
				283	rb_link_node(&entry->rbnode, parent, link);
				284	rb_insert_color(&entry->rbnode, root);
				285	return 0;
				286	}
				287
				288	/*********************************
				289	* per-cpu code
				290	**********************************/
				291	static DEFINE_PER_CPU(u8 *, zswap_dstmem);
				292
				293	static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
				294	{
				295	struct crypto_comp *tfm;
				296	u8 *dst;
				297
				298	switch (action) {
				299	case CPU_UP_PREPARE:
				300	tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
				301	if (IS_ERR(tfm)) {
				302	pr_err("can't allocate compressor transform\n");
				303	return NOTIFY_BAD;
				304	}
				305	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
				306	dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
				307	if (!dst) {
				308	pr_err("can't allocate compressor buffer\n");
				309	crypto_free_comp(tfm);
				310	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
				311	return NOTIFY_BAD;
				312	}
				313	per_cpu(zswap_dstmem, cpu) = dst;
				314	break;
				315	case CPU_DEAD:
				316	case CPU_UP_CANCELED:
				317	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
				318	if (tfm) {
				319	crypto_free_comp(tfm);
				320	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
				321	}
				322	dst = per_cpu(zswap_dstmem, cpu);
				323	kfree(dst);
				324	per_cpu(zswap_dstmem, cpu) = NULL;
				325	break;
				326	default:
				327	break;
				328	}
				329	return NOTIFY_OK;
				330	}
				331
				/*
				 * zswap_cpu_notifier - notifier_block callback for CPU hotplug
				 * @nb:     notifier block (unused)
				 * @action: hotplug action
				 * @pcpu:   CPU number carried in a pointer-sized value
				 *
				 * Unpacks the CPU number and defers to __zswap_cpu_notifier().
				 */
				static int zswap_cpu_notifier(struct notifier_block *nb,
								unsigned long action, void *pcpu)
				{
					return __zswap_cpu_notifier(action, (unsigned long)pcpu);
				}
				338
				339	static struct notifier_block zswap_cpu_notifier_block = {
				340	.notifier_call = zswap_cpu_notifier
				341	};
				342
				343	static int zswap_cpu_init(void)
				344	{
				345	unsigned long cpu;
				346
				347	get_online_cpus();
				348	for_each_online_cpu(cpu)
				349	if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
				350	goto cleanup;
				351	register_cpu_notifier(&zswap_cpu_notifier_block);
				352	put_online_cpus();
				353	return 0;
				354
				355	cleanup:
				356	for_each_online_cpu(cpu)
				357	__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
				358	put_online_cpus();
				359	return -ENOMEM;
				360	}
				361
				362	/*********************************
				363	* helpers
				364	**********************************/
				365	static bool zswap_is_full(void)
				366	{
				367	return (totalram_pages * zswap_max_pool_percent / 100 <
				368	zswap_pool_pages);
				369	}
				370
				371	/*
				372	* Carries out the common pattern of freeing and entry's zsmalloc allocation,
				373	* freeing the entry itself, and decrementing the number of stored pages.
				374	*/
				375	static void zswap_free_entry(struct zswap_tree tree, struct zswap_entry entry)
				376	{
				377	zbud_free(tree->pool, entry->handle);
				378	zswap_entry_cache_free(entry);
				379	atomic_dec(&zswap_stored_pages);
				380	zswap_pool_pages = zbud_get_pool_size(tree->pool);
				381	}
				382
				383	/*********************************
				384	* writeback code
				385	**********************************/
				/* Outcomes reported by zswap_get_swap_cache_page() */
				enum zswap_get_swap_ret {
					/* a new page was added to the swap cache */
					ZSWAP_SWAPCACHE_NEW,
					/* the page was already present in the swap cache */
					ZSWAP_SWAPCACHE_EXIST,
					/* no page could be provided */
					ZSWAP_SWAPCACHE_NOMEM,
				};
				392
				393	/*
				394	* zswap_get_swap_cache_page
				395	*
				396	* This is an adaption of read_swap_cache_async()
				397	*
				398	* This function tries to find a page with the given swap entry
				399	* in the swapper_space address space (the swap cache). If the page
				400	* is found, it is returned in retpage. Otherwise, a page is allocated,
				401	* added to the swap cache, and returned in retpage.
				402	*
				403	* If success, the swap cache page is returned in retpage
				404	* Returns 0 if page was already in the swap cache, page is not locked
				405	* Returns 1 if the new page needs to be populated, page is locked
				406	* Returns <0 on error
				407	*/
				408	static int zswap_get_swap_cache_page(swp_entry_t entry,
				409	struct page **retpage)
				410	{
				411	struct page found_page, new_page = NULL;
				412	struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
				413	int err;
				414
				415	*retpage = NULL;
				416	do {
				417	/*
				418	* First check the swap cache. Since this is normally
				419	* called after lookup_swap_cache() failed, re-calling
				420	* that would confuse statistics.
				421	*/
				422	found_page = find_get_page(swapper_space, entry.val);
				423	if (found_page)
				424	break;
				425
				426	/*
				427	* Get a new page to read into from swap.
				428	*/
				429	if (!new_page) {
				430	new_page = alloc_page(GFP_KERNEL);
				431	if (!new_page)
				432	break; /* Out of memory */
				433	}
				434
				435	/*
				436	* call radix_tree_preload() while we can wait.
				437	*/
				438	err = radix_tree_preload(GFP_KERNEL);
				439	if (err)
				440	break;
				441
				442	/*
				443	* Swap entry may have been freed since our caller observed it.
				444	*/
				445	err = swapcache_prepare(entry);
				446	if (err == -EEXIST) { /* seems racy */
				447	radix_tree_preload_end();
				448	continue;
				449	}
				450	if (err) { /* swp entry is obsolete ? */
				451	radix_tree_preload_end();
				452	break;
				453	}
				454
				455	/* May fail (-ENOMEM) if radix-tree node allocation failed. */
				456	__set_page_locked(new_page);
				457	SetPageSwapBacked(new_page);
				458	err = __add_to_swap_cache(new_page, entry);
				459	if (likely(!err)) {
				460	radix_tree_preload_end();
				461	lru_cache_add_anon(new_page);
				462	*retpage = new_page;
				463	return ZSWAP_SWAPCACHE_NEW;
				464	}
				465	radix_tree_preload_end();
				466	ClearPageSwapBacked(new_page);
				467	__clear_page_locked(new_page);
				468	/*
				469	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
				470	* clear SWAP_HAS_CACHE flag.
				471	*/
				472	swapcache_free(entry, NULL);
				473	} while (err != -ENOMEM);
				474
				475	if (new_page)
				476	page_cache_release(new_page);
				477	if (!found_page)
				478	return ZSWAP_SWAPCACHE_NOMEM;
				479	*retpage = found_page;
				480	return ZSWAP_SWAPCACHE_EXIST;
				481	}
				482
				483	/*
				484	* Attempts to free an entry by adding a page to the swap cache,
				485	* decompressing the entry data into the page, and issuing a
				486	* bio write to write the page back to the swap device.
				487	*
				488	* This can be thought of as a "resumed writeback" of the page
				489	* to the swap device. We are basically resuming the same swap
				490	* writeback path that was intercepted with the frontswap_store()
				491	* in the first place. After the page has been decompressed into
				492	* the swap cache, the compressed version stored by zswap can be
				493	* freed.
				494	*/
				495	static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
				496	{
				497	struct zswap_header *zhdr;
				498	swp_entry_t swpentry;
				499	struct zswap_tree *tree;
				500	pgoff_t offset;
				501	struct zswap_entry *entry;
				502	struct page *page;
				503	u8 src, dst;
				504	unsigned int dlen;
				505	int ret, refcount;
				506	struct writeback_control wbc = {
				507	.sync_mode = WB_SYNC_NONE,
				508	};
				509
				510	/* extract swpentry from data */
				511	zhdr = zbud_map(pool, handle);
				512	swpentry = zhdr->swpentry; /* here */
				513	zbud_unmap(pool, handle);
				514	tree = zswap_trees[swp_type(swpentry)];
				515	offset = swp_offset(swpentry);
				516	BUG_ON(pool != tree->pool);
				517
				518	/* find and ref zswap entry */
				519	spin_lock(&tree->lock);
				520	entry = zswap_rb_search(&tree->rbroot, offset);
				521	if (!entry) {
				522	/* entry was invalidated */
				523	spin_unlock(&tree->lock);
				524	return 0;
				525	}
				526	zswap_entry_get(entry);
				527	spin_unlock(&tree->lock);
				528	BUG_ON(offset != entry->offset);
				529
				530	/* try to allocate swap cache page */
				531	switch (zswap_get_swap_cache_page(swpentry, &page)) {
				532	case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
				533	ret = -ENOMEM;
				534	goto fail;
				535
				536	case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
				537	/* page is already in the swap cache, ignore for now */
				538	page_cache_release(page);
				539	ret = -EEXIST;
				540	goto fail;
				541
				542	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
				543	/* decompress */
				544	dlen = PAGE_SIZE;
				545	src = (u8 *)zbud_map(tree->pool, entry->handle) +
				546	sizeof(struct zswap_header);
				547	dst = kmap_atomic(page);
				548	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
				549	entry->length, dst, &dlen);
				550	kunmap_atomic(dst);
				551	zbud_unmap(tree->pool, entry->handle);
				552	BUG_ON(ret);
				553	BUG_ON(dlen != PAGE_SIZE);
				554
				555	/* page is up to date */
				556	SetPageUptodate(page);
				557	}
				558
				559	/* start writeback */
				560	__swap_writepage(page, &wbc, end_swap_bio_write);
				561	page_cache_release(page);
				562	zswap_written_back_pages++;
				563
				564	spin_lock(&tree->lock);
				565
				566	/* drop local reference */
				567	zswap_entry_put(entry);
				568	/* drop the initial reference from entry creation */
				569	refcount = zswap_entry_put(entry);
				570
				571	/*
				572	* There are three possible values for refcount here:
				573	* (1) refcount is 1, load is in progress, unlink from rbtree,
				574	* load will free
				575	* (2) refcount is 0, (normal case) entry is valid,
				576	* remove from rbtree and free entry
				577	* (3) refcount is -1, invalidate happened during writeback,
				578	* free entry
				579	*/
				580	if (refcount >= 0) {
				581	/* no invalidate yet, remove from rbtree */
				582	rb_erase(&entry->rbnode, &tree->rbroot);
				583	}
				584	spin_unlock(&tree->lock);
				585	if (refcount <= 0) {
				586	/* free the entry */
				587	zswap_free_entry(tree, entry);
				588	return 0;
				589	}
				590	return -EAGAIN;
				591
				592	fail:
				593	spin_lock(&tree->lock);
				594	zswap_entry_put(entry);
				595	spin_unlock(&tree->lock);
				596	return ret;
				597	}
				598
				599	/*********************************
				600	* frontswap hooks
				601	**********************************/
				602	/* attempts to compress and store an single page */
				603	static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				604	struct page *page)
				605	{
				606	struct zswap_tree *tree = zswap_trees[type];
				607	struct zswap_entry entry, dupentry;
				608	int ret;
				609	unsigned int dlen = PAGE_SIZE, len;
				610	unsigned long handle;
				611	char *buf;
				612	u8 src, dst;
				613	struct zswap_header *zhdr;
				614
				615	if (!tree) {
				616	ret = -ENODEV;
				617	goto reject;
				618	}
				619
				620	/* reclaim space if needed */
				621	if (zswap_is_full()) {
				622	zswap_pool_limit_hit++;
				623	if (zbud_reclaim_page(tree->pool, 8)) {
				624	zswap_reject_reclaim_fail++;
				625	ret = -ENOMEM;
				626	goto reject;
				627	}
				628	}
				629
				630	/* allocate entry */
				631	entry = zswap_entry_cache_alloc(GFP_KERNEL);
				632	if (!entry) {
				633	zswap_reject_kmemcache_fail++;
				634	ret = -ENOMEM;
				635	goto reject;
				636	}
				637
				638	/* compress */
				639	dst = get_cpu_var(zswap_dstmem);
				640	src = kmap_atomic(page);
				641	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
				642	kunmap_atomic(src);
				643	if (ret) {
				644	ret = -EINVAL;
				645	goto freepage;
				646	}
				647
				648	/* store */
				649	len = dlen + sizeof(struct zswap_header);
				650	ret = zbud_alloc(tree->pool, len, __GFP_NORETRY \| __GFP_NOWARN,
				651	&handle);
				652	if (ret == -ENOSPC) {
				653	zswap_reject_compress_poor++;
				654	goto freepage;
				655	}
				656	if (ret) {
				657	zswap_reject_alloc_fail++;
				658	goto freepage;
				659	}
				660	zhdr = zbud_map(tree->pool, handle);
				661	zhdr->swpentry = swp_entry(type, offset);
				662	buf = (u8 *)(zhdr + 1);
				663	memcpy(buf, dst, dlen);
				664	zbud_unmap(tree->pool, handle);
				665	put_cpu_var(zswap_dstmem);
				666
				667	/* populate entry */
				668	entry->offset = offset;
				669	entry->handle = handle;
				670	entry->length = dlen;
				671
				672	/* map */
				673	spin_lock(&tree->lock);
				674	do {
				675	ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
				676	if (ret == -EEXIST) {
				677	zswap_duplicate_entry++;
				678	/* remove from rbtree */
				679	rb_erase(&dupentry->rbnode, &tree->rbroot);
				680	if (!zswap_entry_put(dupentry)) {
				681	/* free */
				682	zswap_free_entry(tree, dupentry);
				683	}
				684	}
				685	} while (ret == -EEXIST);
				686	spin_unlock(&tree->lock);
				687
				688	/* update stats */
				689	atomic_inc(&zswap_stored_pages);
				690	zswap_pool_pages = zbud_get_pool_size(tree->pool);
				691
				692	return 0;
				693
				694	freepage:
				695	put_cpu_var(zswap_dstmem);
				696	zswap_entry_cache_free(entry);
				697	reject:
				698	return ret;
				699	}
				700
				701	/*
				702	* returns 0 if the page was successfully decompressed
				703	* return -1 on entry not found or error
				704	*/
				705	static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				706	struct page *page)
				707	{
				708	struct zswap_tree *tree = zswap_trees[type];
				709	struct zswap_entry *entry;
				710	u8 src, dst;
				711	unsigned int dlen;
				712	int refcount, ret;
				713
				714	/* find */
				715	spin_lock(&tree->lock);
				716	entry = zswap_rb_search(&tree->rbroot, offset);
				717	if (!entry) {
				718	/* entry was written back */
				719	spin_unlock(&tree->lock);
				720	return -1;
				721	}
				722	zswap_entry_get(entry);
				723	spin_unlock(&tree->lock);
				724
				725	/* decompress */
				726	dlen = PAGE_SIZE;
				727	src = (u8 *)zbud_map(tree->pool, entry->handle) +
				728	sizeof(struct zswap_header);
				729	dst = kmap_atomic(page);
				730	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
				731	dst, &dlen);
				732	kunmap_atomic(dst);
				733	zbud_unmap(tree->pool, entry->handle);
				734	BUG_ON(ret);
				735
				736	spin_lock(&tree->lock);
				737	refcount = zswap_entry_put(entry);
				738	if (likely(refcount)) {
				739	spin_unlock(&tree->lock);
				740	return 0;
				741	}
				742	spin_unlock(&tree->lock);
				743
				744	/*
				745	* We don't have to unlink from the rbtree because
				746	* zswap_writeback_entry() or zswap_frontswap_invalidate page()
				747	* has already done this for us if we are the last reference.
				748	*/
				749	/* free */
				750
				751	zswap_free_entry(tree, entry);
				752
				753	return 0;
				754	}
				755
				756	/* frees an entry in zswap */
				757	static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
				758	{
				759	struct zswap_tree *tree = zswap_trees[type];
				760	struct zswap_entry *entry;
				761	int refcount;
				762
				763	/* find */
				764	spin_lock(&tree->lock);
				765	entry = zswap_rb_search(&tree->rbroot, offset);
				766	if (!entry) {
				767	/* entry was written back */
				768	spin_unlock(&tree->lock);
				769	return;
				770	}
				771
				772	/* remove from rbtree */
				773	rb_erase(&entry->rbnode, &tree->rbroot);
				774
				775	/* drop the initial reference from entry creation */
				776	refcount = zswap_entry_put(entry);
				777
				778	spin_unlock(&tree->lock);
				779
				780	if (refcount) {
				781	/* writeback in progress, writeback will free */
				782	return;
				783	}
				784
				785	/* free */
				786	zswap_free_entry(tree, entry);
				787	}
				788
				789	/* frees all zswap entries for the given swap type */
				790	static void zswap_frontswap_invalidate_area(unsigned type)
				791	{
				792	struct zswap_tree *tree = zswap_trees[type];
				793	struct rb_node *node;
				794	struct zswap_entry *entry;
				795
				796	if (!tree)
				797	return;
				798
				799	/* walk the tree and free everything */
				800	spin_lock(&tree->lock);
				801	/*
				802	* TODO: Even though this code should not be executed because
				803	* the try_to_unuse() in swapoff should have emptied the tree,
				804	* it is very wasteful to rebalance the tree after every
				805	* removal when we are freeing the whole tree.
				806	*
				807	* If post-order traversal code is ever added to the rbtree
				808	* implementation, it should be used here.
				809	*/
				810	while ((node = rb_first(&tree->rbroot))) {
				811	entry = rb_entry(node, struct zswap_entry, rbnode);
				812	rb_erase(&entry->rbnode, &tree->rbroot);
				813	zbud_free(tree->pool, entry->handle);
				814	zswap_entry_cache_free(entry);
				815	atomic_dec(&zswap_stored_pages);
				816	}
				817	tree->rbroot = RB_ROOT;
				818	spin_unlock(&tree->lock);
				819	}
				820
				821	static struct zbud_ops zswap_zbud_ops = {
				822	.evict = zswap_writeback_entry
				823	};
				824
				825	static void zswap_frontswap_init(unsigned type)
				826	{
				827	struct zswap_tree *tree;
				828
				829	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
				830	if (!tree)
				831	goto err;
				832	tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
				833	if (!tree->pool)
				834	goto freetree;
				835	tree->rbroot = RB_ROOT;
				836	spin_lock_init(&tree->lock);
				837	zswap_trees[type] = tree;
				838	return;
				839
				840	freetree:
				841	kfree(tree);
				842	err:
				843	pr_err("alloc failed, zswap disabled for swap type %d\n", type);
				844	}
				845
				846	static struct frontswap_ops zswap_frontswap_ops = {
				847	.store = zswap_frontswap_store,
				848	.load = zswap_frontswap_load,
				849	.invalidate_page = zswap_frontswap_invalidate_page,
				850	.invalidate_area = zswap_frontswap_invalidate_area,
				851	.init = zswap_frontswap_init
				852	};
				853
				854	/*********************************
				855	* debugfs functions
				856	**********************************/
				857	#ifdef CONFIG_DEBUG_FS
				858	#include <linux/debugfs.h>
				859
				860	static struct dentry *zswap_debugfs_root;
				861
				862	static int __init zswap_debugfs_init(void)
				863	{
				864	if (!debugfs_initialized())
				865	return -ENODEV;
				866
				867	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
				868	if (!zswap_debugfs_root)
				869	return -ENOMEM;
				870
				871	debugfs_create_u64("pool_limit_hit", S_IRUGO,
				872	zswap_debugfs_root, &zswap_pool_limit_hit);
				873	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
				874	zswap_debugfs_root, &zswap_reject_reclaim_fail);
				875	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
				876	zswap_debugfs_root, &zswap_reject_alloc_fail);
				877	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
				878	zswap_debugfs_root, &zswap_reject_kmemcache_fail);
				879	debugfs_create_u64("reject_compress_poor", S_IRUGO,
				880	zswap_debugfs_root, &zswap_reject_compress_poor);
				881	debugfs_create_u64("written_back_pages", S_IRUGO,
				882	zswap_debugfs_root, &zswap_written_back_pages);
				883	debugfs_create_u64("duplicate_entry", S_IRUGO,
				884	zswap_debugfs_root, &zswap_duplicate_entry);
				885	debugfs_create_u64("pool_pages", S_IRUGO,
				886	zswap_debugfs_root, &zswap_pool_pages);
				887	debugfs_create_atomic_t("stored_pages", S_IRUGO,
				888	zswap_debugfs_root, &zswap_stored_pages);
				889
				890	return 0;
				891	}
				892
				893	static void __exit zswap_debugfs_exit(void)
				894	{
				895	debugfs_remove_recursive(zswap_debugfs_root);
				896	}
				897	#else
				898	static int __init zswap_debugfs_init(void)
				899	{
				900	return 0;
				901	}
				902
				903	static void __exit zswap_debugfs_exit(void) { }
				904	#endif
				905
				906	/*********************************
				907	* module init and exit
				908	**********************************/
				909	static int __init init_zswap(void)
				910	{
				911	if (!zswap_enabled)
				912	return 0;
				913
				914	pr_info("loading zswap\n");
				915	if (zswap_entry_cache_create()) {
				916	pr_err("entry cache creation failed\n");
				917	goto error;
				918	}
				919	if (zswap_comp_init()) {
				920	pr_err("compressor initialization failed\n");
				921	goto compfail;
				922	}
				923	if (zswap_cpu_init()) {
				924	pr_err("per-cpu initialization failed\n");
				925	goto pcpufail;
				926	}
				927	frontswap_register_ops(&zswap_frontswap_ops);
				928	if (zswap_debugfs_init())
				929	pr_warn("debugfs initialization failed\n");
				930	return 0;
				931	pcpufail:
				932	zswap_comp_exit();
				933	compfail:
				934	zswap_entry_cache_destory();
				935	error:
				936	return -ENOMEM;
				937	}
				938	/* must be late so crypto has time to come up */
				939	late_initcall(init_zswap);
				940
				941	MODULE_LICENSE("GPL");
				942	MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
				943	MODULE_DESCRIPTION("Compressed cache for swap pages");