/*
 * Compressed RAM based swap device
 *
 * Copyright (C) 2008, 2009  Nitin Gupta
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the licence that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 * Project home: http://compcache.googlecode.com
 */

#define KMSG_COMPONENT "ramzswap"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/lzo.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/vmalloc.h>
#include <linux/version.h>

#include "ramzswap_drv.h"

/* Globals */
static int ramzswap_major;
static struct ramzswap *devices;

/*
 * Pages that compress to larger than this size are
 * forwarded to backing swap, if present, or stored
 * uncompressed in memory otherwise.
 */
static unsigned int max_zpage_size;

/* Module params (documentation at end) */
static unsigned int num_devices;
49
50static int rzs_test_flag(struct ramzswap *rzs, u32 index,
51 enum rzs_pageflags flag)
52{
53 return rzs->table[index].flags & BIT(flag);
54}
55
56static void rzs_set_flag(struct ramzswap *rzs, u32 index,
57 enum rzs_pageflags flag)
58{
59 rzs->table[index].flags |= BIT(flag);
60}
61
62static void rzs_clear_flag(struct ramzswap *rzs, u32 index,
63 enum rzs_pageflags flag)
64{
65 rzs->table[index].flags &= ~BIT(flag);
66}
67
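/* Return 1 if the page at 'ptr' contains only zero bytes, 0 otherwise. */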
static int page_zero_filled(void *ptr)
{
        unsigned int pos;
        unsigned long *page;

        page = (unsigned long *)ptr;

        for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
                if (page[pos])
                        return 0;
        }

        return 1;
}

/*
 * memlimit cannot be greater than backing disk size.
 */
static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
{
        int memlimit_valid = 1;

        if (!rzs->memlimit) {
                pr_info("Memory limit not set.\n");
                memlimit_valid = 0;
        }

        if (rzs->memlimit > rzs->disksize) {
                pr_info("Memory limit cannot be greater than "
                        "disksize: limit=%zu, disksize=%zu\n",
                        rzs->memlimit, rzs->disksize);
                memlimit_valid = 0;
        }

        if (!memlimit_valid) {
                size_t mempart, disksize;
                pr_info("Using default: smaller of (%u%% of RAM) and "
                        "(backing disk size).\n",
                        default_memlimit_perc_ram);
                mempart = default_memlimit_perc_ram * (totalram_bytes / 100);
                disksize = rzs->disksize;
                rzs->memlimit = mempart > disksize ? disksize : mempart;
        }

        if (rzs->memlimit > totalram_bytes / 2) {
                pr_info(
                "It is not advisable to set the limit to more than half "
                "the size of memory since we expect a 2:1 compression ratio. "
                "The limit represents the amount of *compressed* data we can "
                "keep in memory!\n"
                "\tMemory Size: %zu kB\n"
                "\tLimit you selected: %zu kB\n"
                "Continuing anyway ...\n",
                totalram_bytes >> 10, rzs->memlimit >> 10
                );
        }

        rzs->memlimit &= PAGE_MASK;
        BUG_ON(!rzs->memlimit);
}

static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
{
        if (!rzs->disksize) {
                pr_info(
                "disk size not provided. You can use disksize_kb module "
                "param to specify size.\nUsing default: (%u%% of RAM).\n",
                default_disksize_perc_ram
                );
                rzs->disksize = default_disksize_perc_ram *
                                        (totalram_bytes / 100);
        }

        if (rzs->disksize > 2 * (totalram_bytes)) {
                pr_info(
                "There is little point creating a ramzswap of greater than "
                "twice the size of memory since we expect a 2:1 compression "
                "ratio. Note that ramzswap uses about 0.1%% of the size of "
                "the swap device when not in use so a huge ramzswap is "
                "wasteful.\n"
                "\tMemory Size: %zu kB\n"
                "\tSize you selected: %zu kB\n"
                "Continuing anyway ...\n",
                totalram_bytes >> 10, rzs->disksize >> 10
                );
        }

        rzs->disksize &= PAGE_MASK;
}

/*
 * Swap header (1st page of swap device) contains information
 * to identify it as a swap partition. Prepare such a header
 * for ramzswap device (ramzswap0) so that swapon can identify
 * it as a swap partition. In case a backing swap device is
 * provided, copy its swap header.
 */
static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
{
        int ret = 0;
        struct page *page;
        struct address_space *mapping;
        union swap_header *backing_swap_header;

        /*
         * There is no backing swap device. Create a swap header
         * that is acceptable by swapon.
         */
        if (!rzs->backing_swap) {
                s->info.version = 1;
                s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
                s->info.nr_badpages = 0;
                memcpy(s->magic.magic, "SWAPSPACE2", 10);
                return 0;
        }

        /*
         * We have a backing swap device. Copy its swap header
         * to ramzswap device header. If this header contains
         * invalid information (backing device not a swap
         * partition, etc.), swapon will fail for ramzswap
         * which is correct behavior - we don't want to swap
         * over filesystem partition!
         */

        /* Read the backing swap header (code from sys_swapon) */
        mapping = rzs->swap_file->f_mapping;
        if (!mapping->a_ops->readpage) {
                ret = -EINVAL;
                goto out;
        }

        page = read_mapping_page(mapping, 0, rzs->swap_file);
        if (IS_ERR(page)) {
                ret = PTR_ERR(page);
                goto out;
        }

        backing_swap_header = kmap(page);
        memcpy(s, backing_swap_header, sizeof(*s));
        if (s->info.nr_badpages) {
                pr_info("Cannot use backing swap with bad pages (%u)\n",
                        s->info.nr_badpages);
                ret = -EINVAL;
        }
        /*
         * ramzswap disksize equals number of usable pages in backing
         * swap. Set last_page in swap header to match this disksize
         * ('last_page' means 0-based index of last usable swap page).
         */
        s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
        kunmap(page);

out:
        return ret;
}

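/*
 * Fill in the stats structure returned by the RZSIO_GET_STATS ioctl.
 * Detailed counters are compiled in only with CONFIG_RAMZSWAP_STATS.
 */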
void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
                        struct ramzswap_ioctl_stats *s)
{
        strncpy(s->backing_swap_name, rzs->backing_swap_name,
                MAX_SWAP_NAME_LEN - 1);
        s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';

        s->disksize = rzs->disksize;
        s->memlimit = rzs->memlimit;

#if defined(CONFIG_RAMZSWAP_STATS)
        {
        struct ramzswap_stats *rs = &rzs->stats;
        size_t succ_writes, mem_used;
        unsigned int good_compress_perc = 0, no_compress_perc = 0;

        mem_used = xv_get_total_size_bytes(rzs->mem_pool)
                        + (rs->pages_expand << PAGE_SHIFT);
        succ_writes = rs->num_writes - rs->failed_writes;

        if (succ_writes && rs->pages_stored) {
                good_compress_perc = rs->good_compress * 100
                                        / rs->pages_stored;
                no_compress_perc = rs->pages_expand * 100
                                        / rs->pages_stored;
        }

        s->num_reads = rs->num_reads;
        s->num_writes = rs->num_writes;
        s->failed_reads = rs->failed_reads;
        s->failed_writes = rs->failed_writes;
        s->invalid_io = rs->invalid_io;
        s->pages_zero = rs->pages_zero;

        s->good_compress_pct = good_compress_perc;
        s->pages_expand_pct = no_compress_perc;

        s->pages_stored = rs->pages_stored;
        s->pages_used = mem_used >> PAGE_SHIFT;
        s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
        s->compr_data_size = rs->compr_size;
        s->mem_used_total = mem_used;

        s->bdev_num_reads = rs->bdev_num_reads;
        s->bdev_num_writes = rs->bdev_num_writes;
        }
#endif /* CONFIG_RAMZSWAP_STATS */
}

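/*
 * Append one extent (a physically contiguous run of backing swap
 * pages) to the device's extent list. Extent descriptors are packed
 * into pages chained together through their page->lru links.
 */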
static int add_backing_swap_extent(struct ramzswap *rzs,
                                pgoff_t phy_pagenum,
                                pgoff_t num_pages)
{
        unsigned int idx;
        struct list_head *head;
        struct page *curr_page, *new_page;
        unsigned int extents_per_page = PAGE_SIZE /
                                sizeof(struct ramzswap_backing_extent);

        idx = rzs->num_extents % extents_per_page;
        if (!idx) {
                new_page = alloc_page(__GFP_ZERO);
                if (!new_page)
                        return -ENOMEM;

                if (rzs->num_extents) {
                        curr_page = virt_to_page(rzs->curr_extent);
                        head = &curr_page->lru;
                } else {
                        head = &rzs->backing_swap_extent_list;
                }

                list_add(&new_page->lru, head);
                rzs->curr_extent = page_address(new_page);
        }

        rzs->curr_extent->phy_pagenum = phy_pagenum;
        rzs->curr_extent->num_pages = num_pages;

        pr_debug("add_extent: idx=%u, phy_pgnum=%lu, num_pgs=%lu, "
                "pg_last=%lu, curr_ext=%p\n", idx, phy_pagenum, num_pages,
                phy_pagenum + num_pages - 1, rzs->curr_extent);

        if (idx != extents_per_page - 1)
                rzs->curr_extent++;

        return 0;
}

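/*
 * Scan the backing swap file with bmap() and build the list of
 * PAGE_SIZE-aligned, physically contiguous extents. Fails if the file
 * has holes. On success, *num_pages holds the number of usable pages.
 */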
static int setup_backing_swap_extents(struct ramzswap *rzs,
                struct inode *inode, unsigned long *num_pages)
{
        int ret = 0;
        unsigned blkbits;
        unsigned blocks_per_page;
        pgoff_t contig_pages = 0, total_pages = 0;
        pgoff_t pagenum = 0, prev_pagenum = 0;
        sector_t probe_block = 0;
        sector_t last_block;

        blkbits = inode->i_blkbits;
        blocks_per_page = PAGE_SIZE >> blkbits;

        last_block = i_size_read(inode) >> blkbits;
        while (probe_block + blocks_per_page <= last_block) {
                unsigned block_in_page;
                sector_t first_block;

                first_block = bmap(inode, probe_block);
                if (first_block == 0)
                        goto bad_bmap;

                /* It must be PAGE_SIZE aligned on-disk */
                if (first_block & (blocks_per_page - 1)) {
                        probe_block++;
                        goto probe_next;
                }

                /* All blocks within this page must be contiguous on disk */
                for (block_in_page = 1; block_in_page < blocks_per_page;
                                        block_in_page++) {
                        sector_t block;

                        block = bmap(inode, probe_block + block_in_page);
                        if (block == 0)
                                goto bad_bmap;
                        if (block != first_block + block_in_page) {
                                /* Discontiguity */
                                probe_block++;
                                goto probe_next;
                        }
                }

                /*
                 * We found a PAGE_SIZE length, PAGE_SIZE aligned
                 * run of blocks.
                 */
                pagenum = first_block >> (PAGE_SHIFT - blkbits);

                if (total_pages && (pagenum != prev_pagenum + 1)) {
                        ret = add_backing_swap_extent(rzs, prev_pagenum -
                                        (contig_pages - 1), contig_pages);
                        if (ret < 0)
                                goto out;
                        rzs->num_extents++;
                        contig_pages = 0;
                }
                total_pages++;
                contig_pages++;
                prev_pagenum = pagenum;
                probe_block += blocks_per_page;

probe_next:
                continue;
        }

        if (contig_pages) {
                pr_debug("adding last extent: pagenum=%lu, "
                        "contig_pages=%lu\n", pagenum, contig_pages);
                ret = add_backing_swap_extent(rzs,
                        prev_pagenum - (contig_pages - 1), contig_pages);
                if (ret < 0)
                        goto out;
                rzs->num_extents++;
        }
        if (!rzs->num_extents) {
                pr_err("No swap extents found!\n");
                ret = -EINVAL;
        }

        if (!ret) {
                *num_pages = total_pages;
                pr_info("Found %lu extents containing %luk\n",
                        rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
        }
        goto out;

bad_bmap:
        pr_err("Backing swapfile has holes\n");
        ret = -EINVAL;
out:
        while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
                struct page *page;
                struct list_head *entry = rzs->backing_swap_extent_list.next;
                page = list_entry(entry, struct page, lru);
                list_del(entry);
                __free_page(page);
        }
        return ret;
}

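/*
 * For each page of the rzs->table[] array, cache the extent covering
 * its first entry: page->mapping points at the extent descriptor and
 * page->private records how many pages of that extent precede the
 * entry. map_backing_swap_page() uses this to avoid walking the whole
 * extent list on every lookup.
 */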
static void map_backing_swap_extents(struct ramzswap *rzs)
{
        struct ramzswap_backing_extent *se;
        struct page *table_page, *se_page;
        unsigned long num_pages, num_table_pages, entry;
        unsigned long se_idx, span;
        unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
        unsigned extents_per_page = PAGE_SIZE / sizeof(*se);

        /* True for block device */
        if (!rzs->num_extents)
                return;

        se_page = list_entry(rzs->backing_swap_extent_list.next,
                        struct page, lru);
        se = page_address(se_page);
        span = se->num_pages;
        num_pages = rzs->disksize >> PAGE_SHIFT;
        num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
                                        PAGE_SIZE);

        entry = 0;
        se_idx = 0;
        while (num_table_pages--) {
                table_page = vmalloc_to_page(&rzs->table[entry]);
                while (span <= entry) {
                        se_idx++;
                        if (se_idx == rzs->num_extents)
                                BUG();

                        if (!(se_idx % extents_per_page)) {
                                se_page = list_entry(se_page->lru.next,
                                                struct page, lru);
                                se = page_address(se_page);
                        } else
                                se++;

                        span += se->num_pages;
                }
                table_page->mapping = (struct address_space *)se;
                table_page->private = se->num_pages - (span - entry);
                pr_debug("map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
                        entry, span, table_page->mapping, table_page->private);
                entry += entries_per_page;
        }
}

/*
 * Check if the value of the backing_swap module param is sane.
 * Claim this device and set the ramzswap size equal to the size
 * of this block device (or of the usable extents of a swap file).
 */
static int setup_backing_swap(struct ramzswap *rzs)
{
        int ret = 0;
        size_t disksize;
        unsigned long num_pages = 0;
        struct inode *inode;
        struct file *swap_file;
        struct address_space *mapping;
        struct block_device *bdev = NULL;

        if (!rzs->backing_swap_name[0]) {
                pr_debug("backing_swap param not given\n");
                goto out;
        }

        pr_info("Using backing swap device: %s\n", rzs->backing_swap_name);

        swap_file = filp_open(rzs->backing_swap_name,
                                O_RDWR | O_LARGEFILE, 0);
        if (IS_ERR(swap_file)) {
                pr_err("Error opening backing device: %s\n",
                        rzs->backing_swap_name);
                ret = -EINVAL;
                goto out;
        }

        mapping = swap_file->f_mapping;
        inode = mapping->host;

        if (S_ISBLK(inode->i_mode)) {
                bdev = I_BDEV(inode);
                ret = bd_claim(bdev, setup_backing_swap);
                if (ret < 0) {
                        bdev = NULL;
                        goto bad_param;
                }
                disksize = i_size_read(inode);
        } else if (S_ISREG(inode->i_mode)) {
                bdev = inode->i_sb->s_bdev;
                if (IS_SWAPFILE(inode)) {
                        ret = -EBUSY;
                        goto bad_param;
                }
                ret = setup_backing_swap_extents(rzs, inode, &num_pages);
                if (ret < 0)
                        goto bad_param;
                disksize = num_pages << PAGE_SHIFT;
        } else {
                /* Neither a block device nor a regular file */
                ret = -EINVAL;
                goto bad_param;
        }

        rzs->swap_file = swap_file;
        rzs->backing_swap = bdev;
        rzs->disksize = disksize;
        BUG_ON(!rzs->disksize);

        return 0;

bad_param:
        if (bdev)
                bd_release(bdev);
        filp_close(swap_file, NULL);

out:
        rzs->backing_swap = NULL;
        return ret;
}

/*
 * Map logical page number 'pagenum' to physical page number
 * on backing swap device. For block device, this is a nop.
 */
u32 map_backing_swap_page(struct ramzswap *rzs, u32 pagenum)
{
        u32 skip_pages, entries_per_page;
        size_t delta, se_offset, skipped;
        struct page *table_page, *se_page;
        struct ramzswap_backing_extent *se;

        if (!rzs->num_extents)
                return pagenum;

        entries_per_page = PAGE_SIZE / sizeof(*rzs->table);

        table_page = vmalloc_to_page(&rzs->table[pagenum]);
        se = (struct ramzswap_backing_extent *)table_page->mapping;
        se_page = virt_to_page(se);

        skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
        se_offset = table_page->private + skip_pages;

        if (se_offset < se->num_pages)
                return se->phy_pagenum + se_offset;

        skipped = se->num_pages - table_page->private;
        do {
                struct ramzswap_backing_extent *se_base;
                u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);

                /* Get next swap extent */
                se_base = (struct ramzswap_backing_extent *)
                                page_address(se_page);
                if (se - se_base == se_entries_per_page - 1) {
                        se_page = list_entry(se_page->lru.next,
                                        struct page, lru);
                        se = page_address(se_page);
                } else {
                        se++;
                }

                skipped += se->num_pages;
        } while (skipped < skip_pages);

        delta = skipped - skip_pages;
        se_offset = se->num_pages - delta;

        return se->phy_pagenum + se_offset;
}

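/*
 * Release whatever backs table entry 'index': the full page for an
 * uncompressed entry, the xvmalloc object for a compressed one, or
 * just the flag for a zero-filled page. Stats are updated to match.
 */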
static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
{
        u32 clen;
        void *obj;

        struct page *page = rzs->table[index].page;
        u32 offset = rzs->table[index].offset;

        if (unlikely(!page)) {
                if (rzs_test_flag(rzs, index, RZS_ZERO)) {
                        rzs_clear_flag(rzs, index, RZS_ZERO);
                        stat_dec(rzs->stats.pages_zero);
                }
                return;
        }

        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
                clen = PAGE_SIZE;
                __free_page(page);
                rzs_clear_flag(rzs, index, RZS_UNCOMPRESSED);
                stat_dec(rzs->stats.pages_expand);
                goto out;
        }

        obj = kmap_atomic(page, KM_USER0) + offset;
        clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
        kunmap_atomic(obj, KM_USER0);

        xv_free(rzs->mem_pool, page, offset);
        if (clen <= PAGE_SIZE / 2)
                stat_dec(rzs->stats.good_compress);

out:
        rzs->stats.compr_size -= clen;
        stat_dec(rzs->stats.pages_stored);

        rzs->table[index].page = NULL;
        rzs->table[index].offset = 0;
}

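/* Complete a read of a zero-filled page by clearing the bio's page. */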
static int handle_zero_page(struct bio *bio)
{
        void *user_mem;
        struct page *page = bio->bi_io_vec[0].bv_page;

        user_mem = kmap_atomic(page, KM_USER0);
        memset(user_mem, 0, PAGE_SIZE);
        kunmap_atomic(user_mem, KM_USER0);

        flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

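/* Serve a read for a page that was stored uncompressed (plain copy). */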
static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
{
        u32 index;
        struct page *page;
        unsigned char *user_mem, *cmem;

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        user_mem = kmap_atomic(page, KM_USER0);
        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

        memcpy(user_mem, cmem, PAGE_SIZE);
        kunmap_atomic(user_mem, KM_USER0);
        kunmap_atomic(cmem, KM_USER1);

        flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

/*
 * Called when the requested page is not present in ramzswap.
 * It is either in the backing swap device (if present), or this
 * is an attempt to read before any previous write to this
 * location - this happens due to readahead when the swap device
 * is read from user-space (e.g. during swapon).
 */
static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
{
        /*
         * Always forward such requests to backing swap
         * device (if present)
         */
        if (rzs->backing_swap) {
                u32 pagenum;
                stat_dec(rzs->stats.num_reads);
                stat_inc(rzs->stats.bdev_num_reads);
                bio->bi_bdev = rzs->backing_swap;

                /*
                 * In case backing swap is a file, find the right offset
                 * within the file corresponding to logical position
                 * 'index'. For block device, this is a nop.
                 */
                pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
                bio->bi_sector = map_backing_swap_page(rzs, pagenum)
                                        << SECTORS_PER_PAGE_SHIFT;
                return 1;
        }

        /*
         * This is an unlikely event when a backing device
         * is not present
         */
        pr_debug("Read before write on swap device: "
                "sector=%lu, size=%u, offset=%u\n",
                (ulong)(bio->bi_sector), bio->bi_size,
                bio->bi_io_vec[0].bv_offset);

        /* Do nothing. Just return success */
        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;
}

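/*
 * Read handler: serves zero-filled and uncompressed pages via their
 * helpers, forwards missing pages to handle_ramzswap_fault(), and
 * LZO-decompresses everything else into the bio's page.
 */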
static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
{
        int ret;
        u32 index;
        size_t clen;
        struct page *page;
        struct zobj_header *zheader;
        unsigned char *user_mem, *cmem;

        stat_inc(rzs->stats.num_reads);

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        if (rzs_test_flag(rzs, index, RZS_ZERO))
                return handle_zero_page(bio);

        /* Requested page is not present in compressed area */
        if (!rzs->table[index].page)
                return handle_ramzswap_fault(rzs, bio);

        /* Page is stored uncompressed since it is incompressible */
        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                return handle_uncompressed_page(rzs, bio);

        user_mem = kmap_atomic(page, KM_USER0);
        clen = PAGE_SIZE;

        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

        ret = lzo1x_decompress_safe(
                cmem + sizeof(*zheader),
                xv_get_object_size(cmem) - sizeof(*zheader),
                user_mem, &clen);

        kunmap_atomic(user_mem, KM_USER0);
        kunmap_atomic(cmem, KM_USER1);

        /* This should NEVER happen */
        if (unlikely(ret != LZO_E_OK)) {
                pr_err("Decompression failed! err=%d, page=%u\n",
                        ret, index);
                stat_inc(rzs->stats.failed_reads);
                goto out;
        }

        flush_dcache_page(page);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;

out:
        bio_io_error(bio);
        return 0;
}

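/*
 * Write handler: frees any previous object at this index, detects
 * zero-filled pages, then LZO-compresses the page into the xvmalloc
 * pool. Incompressible pages are forwarded to the backing swap when
 * available, or stored uncompressed otherwise. Returns 1 when the bio
 * should be resubmitted to the backing device.
 */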
static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
{
        int ret, fwd_write_request = 0;
        u32 offset, index;
        size_t clen;
        struct zobj_header *zheader;
        struct page *page, *page_store;
        unsigned char *user_mem, *cmem, *src;

        stat_inc(rzs->stats.num_writes);

        page = bio->bi_io_vec[0].bv_page;
        index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;

        src = rzs->compress_buffer;

        /*
         * The system swaps to the same sector again when the stored page
         * is no longer referenced by any process. So, it is now safe
         * to free the memory that was allocated for this page.
         */
        if (rzs->table[index].page)
                ramzswap_free_page(rzs, index);

        /*
         * No memory is allocated for zero filled pages.
         * Simply clear the zero page flag.
         */
        if (rzs_test_flag(rzs, index, RZS_ZERO)) {
                stat_dec(rzs->stats.pages_zero);
                rzs_clear_flag(rzs, index, RZS_ZERO);
        }

        mutex_lock(&rzs->lock);

        user_mem = kmap_atomic(page, KM_USER0);
        if (page_zero_filled(user_mem)) {
                kunmap_atomic(user_mem, KM_USER0);
                mutex_unlock(&rzs->lock);
                stat_inc(rzs->stats.pages_zero);
                rzs_set_flag(rzs, index, RZS_ZERO);

                set_bit(BIO_UPTODATE, &bio->bi_flags);
                bio_endio(bio, 0);
                return 0;
        }

        if (rzs->backing_swap &&
                (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
                kunmap_atomic(user_mem, KM_USER0);
                mutex_unlock(&rzs->lock);
                fwd_write_request = 1;
                goto out;
        }

        ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
                                rzs->compress_workmem);

        kunmap_atomic(user_mem, KM_USER0);

        if (unlikely(ret != LZO_E_OK)) {
                mutex_unlock(&rzs->lock);
                pr_err("Compression failed! err=%d\n", ret);
                stat_inc(rzs->stats.failed_writes);
                goto out;
        }

        /*
         * Page is incompressible. Forward it to backing swap
         * if present. Otherwise, store it as-is (uncompressed)
         * since we do not want to return too many swap write
         * errors, which have the side effect of hanging the system.
         */
        if (unlikely(clen > max_zpage_size)) {
                if (rzs->backing_swap) {
                        mutex_unlock(&rzs->lock);
                        fwd_write_request = 1;
                        goto out;
                }

                clen = PAGE_SIZE;
                page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
                if (unlikely(!page_store)) {
                        mutex_unlock(&rzs->lock);
                        pr_info("Error allocating memory for incompressible "
                                "page: %u\n", index);
                        stat_inc(rzs->stats.failed_writes);
                        goto out;
                }

                offset = 0;
                rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
                stat_inc(rzs->stats.pages_expand);
                rzs->table[index].page = page_store;
                src = kmap_atomic(page, KM_USER0);
                goto memstore;
        }

        if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
                        &rzs->table[index].page, &offset,
                        GFP_NOIO | __GFP_HIGHMEM)) {
                mutex_unlock(&rzs->lock);
                pr_info("Error allocating memory for compressed "
                        "page: %u, size=%zu\n", index, clen);
                stat_inc(rzs->stats.failed_writes);
                if (rzs->backing_swap)
                        fwd_write_request = 1;
                goto out;
        }

memstore:
        rzs->table[index].offset = offset;

        cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
                rzs->table[index].offset;

#if 0
        /* Back-reference needed for memory defragmentation */
        if (!rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)) {
                zheader = (struct zobj_header *)cmem;
                zheader->table_idx = index;
                cmem += sizeof(*zheader);
        }
#endif

        memcpy(cmem, src, clen);

        kunmap_atomic(cmem, KM_USER1);
        if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                kunmap_atomic(src, KM_USER0);

        /* Update stats */
        rzs->stats.compr_size += clen;
        stat_inc(rzs->stats.pages_stored);
        if (clen <= PAGE_SIZE / 2)
                stat_inc(rzs->stats.good_compress);

        mutex_unlock(&rzs->lock);

        set_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_endio(bio, 0);
        return 0;

out:
        if (fwd_write_request) {
                stat_inc(rzs->stats.bdev_num_writes);
                bio->bi_bdev = rzs->backing_swap;
#if 0
                /*
                 * TODO: We currently have a linear mapping of ramzswap and
                 * backing swap sectors. This is not desired since we want
                 * to optimize writes to backing swap to minimize disk seeks
                 * or have effective wear leveling (for SSDs). Also, a
                 * non-linear mapping is required to implement compressed
                 * on-disk swapping.
                 */
                bio->bi_sector = get_backing_swap_page()
                                        << SECTORS_PER_PAGE_SHIFT;
#endif
                /*
                 * In case backing swap is a file, find the right offset
                 * within the file corresponding to logical position
                 * 'index'. For block device, this is a nop.
                 */
                bio->bi_sector = map_backing_swap_page(rzs, index)
                                        << SECTORS_PER_PAGE_SHIFT;
                return 1;
        }

        bio_io_error(bio);
        return 0;
}

/*
 * Check if request is within bounds and page aligned.
 */
static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
{
        if (unlikely(
                (bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
                (bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
                (bio->bi_vcnt != 1) ||
                (bio->bi_size != PAGE_SIZE) ||
                (bio->bi_io_vec[0].bv_offset != 0))) {

                return 0;
        }

        /* swap request is valid */
        return 1;
}

/*
 * Handler function for all ramzswap I/O requests.
 */
static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
{
        int ret = 0;
        struct ramzswap *rzs = queue->queuedata;

        if (unlikely(!rzs->init_done)) {
                bio_io_error(bio);
                return 0;
        }

        if (!valid_swap_request(rzs, bio)) {
                stat_inc(rzs->stats.invalid_io);
                bio_io_error(bio);
                return 0;
        }

        switch (bio_data_dir(bio)) {
        case READ:
                ret = ramzswap_read(rzs, bio);
                break;

        case WRITE:
                ret = ramzswap_write(rzs, bio);
                break;
        }

        return ret;
}

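/*
 * Free everything the device holds - stored pages, address table,
 * xvmalloc pool, extent list - release the backing swap, and return
 * the device to its uninitialized state.
 */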
static void reset_device(struct ramzswap *rzs)
{
        int is_backing_blkdev = 0;
        size_t index, num_pages;
        unsigned entries_per_page;
        unsigned long num_table_pages, entry = 0;

        if (rzs->backing_swap && !rzs->num_extents)
                is_backing_blkdev = 1;

        num_pages = rzs->disksize >> PAGE_SHIFT;

        /* Free various per-device buffers */
        kfree(rzs->compress_workmem);
        free_pages((unsigned long)rzs->compress_buffer, 1);

        rzs->compress_workmem = NULL;
        rzs->compress_buffer = NULL;

        /* Free all pages that are still in this ramzswap device */
        for (index = 0; index < num_pages; index++) {
                struct page *page;
                u16 offset;

                page = rzs->table[index].page;
                offset = rzs->table[index].offset;

                if (!page)
                        continue;

                if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
                        __free_page(page);
                else
                        xv_free(rzs->mem_pool, page, offset);
        }

        entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
        num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
                                        PAGE_SIZE);
        /*
         * Set page->mapping to NULL for every table page.
         * Otherwise, we will hit bad_page() during free.
         */
        while (rzs->num_extents && num_table_pages--) {
                struct page *page;
                page = vmalloc_to_page(&rzs->table[entry]);
                page->mapping = NULL;
                entry += entries_per_page;
        }
        vfree(rzs->table);
        rzs->table = NULL;

        xv_destroy_pool(rzs->mem_pool);
        rzs->mem_pool = NULL;

        /* Free all swap extent pages */
        while (!list_empty(&rzs->backing_swap_extent_list)) {
                struct page *page;
                struct list_head *entry;
                entry = rzs->backing_swap_extent_list.next;
                page = list_entry(entry, struct page, lru);
                list_del(entry);
                __free_page(page);
        }
        INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
        rzs->num_extents = 0;

        /* Close backing swap device, if present */
        if (rzs->backing_swap) {
                if (is_backing_blkdev)
                        bd_release(rzs->backing_swap);
                filp_close(rzs->swap_file, NULL);
                rzs->backing_swap = NULL;
        }

        /* Reset stats */
        memset(&rzs->stats, 0, sizeof(rzs->stats));

        rzs->disksize = 0;
        rzs->memlimit = 0;

        /* Back to uninitialized state */
        rzs->init_done = 0;
}

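/*
 * Device initialization (RZSIO_INIT): set up the backing swap and
 * device size, allocate the compression workspace and address table,
 * prepare the swap header, and create the xvmalloc pool.
 */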
static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
{
        int ret;
        size_t num_pages;
        struct page *page;
        union swap_header *swap_header;

        if (rzs->init_done) {
                pr_info("Device already initialized!\n");
                return -EBUSY;
        }

        ret = setup_backing_swap(rzs);
        if (ret)
                goto fail;

        if (rzs->backing_swap)
                ramzswap_set_memlimit(rzs, totalram_pages << PAGE_SHIFT);
        else
                ramzswap_set_disksize(rzs, totalram_pages << PAGE_SHIFT);

        rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
        if (!rzs->compress_workmem) {
                pr_err("Error allocating compressor working memory!\n");
                ret = -ENOMEM;
                goto fail;
        }

        rzs->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
        if (!rzs->compress_buffer) {
                pr_err("Error allocating compressor buffer space\n");
                ret = -ENOMEM;
                goto fail;
        }

        num_pages = rzs->disksize >> PAGE_SHIFT;
        rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
        if (!rzs->table) {
                pr_err("Error allocating ramzswap address table\n");
                /* To prevent accessing table entries during cleanup */
                rzs->disksize = 0;
                ret = -ENOMEM;
                goto fail;
        }
        memset(rzs->table, 0, num_pages * sizeof(*rzs->table));

        map_backing_swap_extents(rzs);

        page = alloc_page(__GFP_ZERO);
        if (!page) {
                pr_err("Error allocating swap header page\n");
                ret = -ENOMEM;
                goto fail;
        }
        rzs->table[0].page = page;
        rzs_set_flag(rzs, 0, RZS_UNCOMPRESSED);

        swap_header = kmap(page);
        ret = setup_swap_header(rzs, swap_header);
        kunmap(page);
        if (ret) {
                pr_err("Error setting swap header\n");
                goto fail;
        }

        set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);

        /*
         * We have an identity mapping of sectors between ramzswap
         * and the backing swap device. So, this queue flag should
         * be set according to the backing dev.
         */
        if (!rzs->backing_swap ||
                        blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);

        rzs->mem_pool = xv_create_pool();
        if (!rzs->mem_pool) {
                pr_err("Error creating memory pool\n");
                ret = -ENOMEM;
                goto fail;
        }

        /*
         * Pages that compress to a size greater than this are forwarded
         * to the physical swap disk (if a backing dev is provided).
         * TODO: make this configurable
         */
        if (rzs->backing_swap)
                max_zpage_size = max_zpage_size_bdev;
        else
                max_zpage_size = max_zpage_size_nobdev;
        pr_debug("Max compressed page size: %u bytes\n", max_zpage_size);

        rzs->init_done = 1;

        pr_debug("Initialization done!\n");
        return 0;

fail:
        reset_device(rzs);

        pr_err("Initialization failed: err=%d\n", ret);
        return ret;
}

static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
{
        if (rzs->init_done)
                reset_device(rzs);

        return 0;
}

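/*
 * ioctl handler: device configuration (disk size, memory limit,
 * backing swap) is only allowed before init; stats, init and reset
 * commands are handled here as well.
 */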
static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        int ret = 0;
        size_t disksize_kb, memlimit_kb;

        struct ramzswap *rzs = bdev->bd_disk->private_data;

        switch (cmd) {
        case RZSIO_SET_DISKSIZE_KB:
                if (rzs->init_done) {
                        ret = -EBUSY;
                        goto out;
                }
                if (copy_from_user(&disksize_kb, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->disksize = disksize_kb << 10;
                pr_info("Disk size set to %zu kB\n", disksize_kb);
                break;

        case RZSIO_SET_MEMLIMIT_KB:
                if (rzs->init_done) {
                        /* TODO: allow changing memlimit */
                        ret = -EBUSY;
                        goto out;
                }
                if (copy_from_user(&memlimit_kb, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->memlimit = memlimit_kb << 10;
                pr_info("Memory limit set to %zu kB\n", memlimit_kb);
                break;

        case RZSIO_SET_BACKING_SWAP:
                if (rzs->init_done) {
                        ret = -EBUSY;
                        goto out;
                }

                if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
                                                _IOC_SIZE(cmd))) {
                        ret = -EFAULT;
                        goto out;
                }
                rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
                pr_info("Backing swap set to %s\n", rzs->backing_swap_name);
                break;

        case RZSIO_GET_STATS:
        {
                struct ramzswap_ioctl_stats *stats;
                if (!rzs->init_done) {
                        ret = -ENOTTY;
                        goto out;
                }
                stats = kzalloc(sizeof(*stats), GFP_KERNEL);
                if (!stats) {
                        ret = -ENOMEM;
                        goto out;
                }
                ramzswap_ioctl_get_stats(rzs, stats);
                if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
                        kfree(stats);
                        ret = -EFAULT;
                        goto out;
                }
                kfree(stats);
                break;
        }
        case RZSIO_INIT:
                ret = ramzswap_ioctl_init_device(rzs);
                break;

        case RZSIO_RESET:
                /* Do not reset an active device! */
                if (bdev->bd_holders) {
                        ret = -EBUSY;
                        goto out;
                }
                ret = ramzswap_ioctl_reset_device(rzs);
                break;

        default:
                pr_info("Invalid ioctl %u\n", cmd);
                ret = -ENOTTY;
        }

out:
        return ret;
}

static struct block_device_operations ramzswap_devops = {
        .ioctl = ramzswap_ioctl,
        .owner = THIS_MODULE,
};

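/*
 * Allocate and register the request queue and gendisk for one device.
 * Capacity is left at 0 until the device is initialized via ioctl.
 */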
static void create_device(struct ramzswap *rzs, int device_id)
{
        mutex_init(&rzs->lock);
        INIT_LIST_HEAD(&rzs->backing_swap_extent_list);

        rzs->queue = blk_alloc_queue(GFP_KERNEL);
        if (!rzs->queue) {
                pr_err("Error allocating disk queue for device %d\n",
                        device_id);
                return;
        }

        blk_queue_make_request(rzs->queue, ramzswap_make_request);
        rzs->queue->queuedata = rzs;

        /* gendisk structure */
        rzs->disk = alloc_disk(1);
        if (!rzs->disk) {
                blk_cleanup_queue(rzs->queue);
                pr_warning("Error allocating disk structure for device %d\n",
                        device_id);
                return;
        }

        rzs->disk->major = ramzswap_major;
        rzs->disk->first_minor = device_id;
        rzs->disk->fops = &ramzswap_devops;
        rzs->disk->queue = rzs->queue;
        rzs->disk->private_data = rzs;
        snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);

        /*
         * Actual capacity is set using the RZSIO_SET_DISKSIZE_KB ioctl
         * or set equal to the backing swap device (if provided)
         */
        set_capacity(rzs->disk, 0);
        add_disk(rzs->disk);

        rzs->init_done = 0;
}

static void destroy_device(struct ramzswap *rzs)
{
        if (rzs->disk) {
                del_gendisk(rzs->disk);
                put_disk(rzs->disk);
        }

        if (rzs->queue)
                blk_cleanup_queue(rzs->queue);
}

static int __init ramzswap_init(void)
{
        int i, ret;

        if (num_devices > max_num_devices) {
                pr_warning("Invalid value for num_devices: %u\n",
                        num_devices);
                return -EINVAL;
        }

        ramzswap_major = register_blkdev(0, "ramzswap");
        if (ramzswap_major <= 0) {
                pr_warning("Unable to get major number\n");
                return -EBUSY;
        }

        if (!num_devices) {
                pr_info("num_devices not specified. Using default: 1\n");
                num_devices = 1;
        }

        /* Allocate the device array and initialize each one */
        pr_info("Creating %u devices ...\n", num_devices);
        devices = kzalloc(num_devices * sizeof(struct ramzswap), GFP_KERNEL);
        if (!devices) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < num_devices; i++)
                create_device(&devices[i], i);

        return 0;
out:
        unregister_blkdev(ramzswap_major, "ramzswap");
        return ret;
}

static void __exit ramzswap_exit(void)
{
        int i;
        struct ramzswap *rzs;

        for (i = 0; i < num_devices; i++) {
                rzs = &devices[i];

                destroy_device(rzs);
                if (rzs->init_done)
                        reset_device(rzs);
        }

        unregister_blkdev(ramzswap_major, "ramzswap");

        kfree(devices);
        pr_debug("Cleanup done!\n");
}

module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of ramzswap devices");

module_init(ramzswap_init);
module_exit(ramzswap_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Based Swap Device");