Blame - mm/swapfile.c - kernel/msm

blob: 62e0da8f7e6eeb8aa0f7c99475b61dae8619eda9 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/swapfile.c
				3	*
				4	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				5	* Swap reorganised 29.12.95, Stephen Tweedie
				6	*/
				7
				8	#include <linux/config.h>
				9	#include <linux/mm.h>
				10	#include <linux/hugetlb.h>
				11	#include <linux/mman.h>
				12	#include <linux/slab.h>
				13	#include <linux/kernel_stat.h>
				14	#include <linux/swap.h>
				15	#include <linux/vmalloc.h>
				16	#include <linux/pagemap.h>
				17	#include <linux/namei.h>
				18	#include <linux/shm.h>
				19	#include <linux/blkdev.h>
				20	#include <linux/writeback.h>
				21	#include <linux/proc_fs.h>
				22	#include <linux/seq_file.h>
				23	#include <linux/init.h>
				24	#include <linux/module.h>
				25	#include <linux/rmap.h>
				26	#include <linux/security.h>
				27	#include <linux/backing-dev.h>
				28	#include <linux/syscalls.h>
				29
				30	#include <asm/pgtable.h>
				31	#include <asm/tlbflush.h>
				32	#include <linux/swapops.h>
				33
				34	DEFINE_SPINLOCK(swaplock);
				35	unsigned int nr_swapfiles;
				36	long total_swap_pages;
				37	static int swap_overflow;
				38
				39	EXPORT_SYMBOL(total_swap_pages);
				40
				41	static const char Bad_file[] = "Bad swap file entry ";
				42	static const char Unused_file[] = "Unused swap file entry ";
				43	static const char Bad_offset[] = "Bad swap offset entry ";
				44	static const char Unused_offset[] = "Unused swap offset entry ";
				45
				46	struct swap_list_t swap_list = {-1, -1};
				47
				48	struct swap_info_struct swap_info[MAX_SWAPFILES];
				49
				50	static DECLARE_MUTEX(swapon_sem);
				51
				52	/*
				53	* We need this because the bdev->unplug_fn can sleep and we cannot
				54	* hold swap_list_lock while calling the unplug_fn. And swap_list_lock
				55	* cannot be turned into a semaphore.
				56	*/
				57	static DECLARE_RWSEM(swap_unplug_sem);
				58
				59	#define SWAPFILE_CLUSTER 256
				60
				61	void swap_unplug_io_fn(struct backing_dev_info unused_bdi, struct page page)
				62	{
				63	swp_entry_t entry;
				64
				65	down_read(&swap_unplug_sem);
				66	entry.val = page->private;
				67	if (PageSwapCache(page)) {
				68	struct block_device *bdev = swap_info[swp_type(entry)].bdev;
				69	struct backing_dev_info *bdi;
				70
				71	/*
				72	* If the page is removed from swapcache from under us (with a
				73	* racy try_to_unuse/swapoff) we need an additional reference
				74	* count to avoid reading garbage from page->private above. If
				75	* the WARN_ON triggers during a swapoff it maybe the race
				76	* condition and it's harmless. However if it triggers without
				77	* swapoff it signals a problem.
				78	*/
				79	WARN_ON(page_count(page) <= 1);
				80
				81	bdi = bdev->bd_inode->i_mapping->backing_dev_info;
McMullan, Jason	ba32311	2005-05-16 21:53:40 -0700	[diff] [blame]	82	blk_run_backing_dev(bdi, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	83	}
				84	up_read(&swap_unplug_sem);
				85	}
				86
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	87	static inline unsigned long scan_swap_map(struct swap_info_struct *si)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	88	{
				89	unsigned long offset;
				90	/*
				91	* We try to cluster swap pages by allocating them
				92	* sequentially in swap. Once we've allocated
				93	* SWAPFILE_CLUSTER pages this way, however, we resort to
				94	* first-free allocation, starting a new cluster. This
				95	* prevents us from scattering swap pages all over the entire
				96	* swap partition, so that we reduce overall disk seek times
				97	* between swap pages. -- sct */
				98	if (si->cluster_nr) {
				99	while (si->cluster_next <= si->highest_bit) {
				100	offset = si->cluster_next++;
				101	if (si->swap_map[offset])
				102	continue;
				103	si->cluster_nr--;
				104	goto got_page;
				105	}
				106	}
				107	si->cluster_nr = SWAPFILE_CLUSTER;
				108
				109	/* try to find an empty (even not aligned) cluster. */
				110	offset = si->lowest_bit;
				111	check_next_cluster:
				112	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
				113	{
				114	unsigned long nr;
				115	for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
				116	if (si->swap_map[nr])
				117	{
				118	offset = nr+1;
				119	goto check_next_cluster;
				120	}
				121	/* We found a completly empty cluster, so start
				122	* using it.
				123	*/
				124	goto got_page;
				125	}
				126	/* No luck, so now go finegrined as usual. -Andrea */
				127	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
				128	if (si->swap_map[offset])
				129	continue;
				130	si->lowest_bit = offset+1;
				131	got_page:
				132	if (offset == si->lowest_bit)
				133	si->lowest_bit++;
				134	if (offset == si->highest_bit)
				135	si->highest_bit--;
				136	if (si->lowest_bit > si->highest_bit) {
				137	si->lowest_bit = si->max;
				138	si->highest_bit = 0;
				139	}
				140	si->swap_map[offset] = 1;
				141	si->inuse_pages++;
				142	nr_swap_pages--;
				143	si->cluster_next = offset+1;
				144	return offset;
				145	}
				146	si->lowest_bit = si->max;
				147	si->highest_bit = 0;
				148	return 0;
				149	}
				150
				151	swp_entry_t get_swap_page(void)
				152	{
				153	struct swap_info_struct * p;
				154	unsigned long offset;
				155	swp_entry_t entry;
				156	int type, wrapped = 0;
				157
				158	entry.val = 0; /* Out of memory */
				159	swap_list_lock();
				160	type = swap_list.next;
				161	if (type < 0)
				162	goto out;
				163	if (nr_swap_pages <= 0)
				164	goto out;
				165
				166	while (1) {
				167	p = &swap_info[type];
				168	if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
				169	swap_device_lock(p);
				170	offset = scan_swap_map(p);
				171	swap_device_unlock(p);
				172	if (offset) {
				173	entry = swp_entry(type,offset);
				174	type = swap_info[type].next;
				175	if (type < 0 \|\|
				176	p->prio != swap_info[type].prio) {
				177	swap_list.next = swap_list.head;
				178	} else {
				179	swap_list.next = type;
				180	}
				181	goto out;
				182	}
				183	}
				184	type = p->next;
				185	if (!wrapped) {
				186	if (type < 0 \|\| p->prio != swap_info[type].prio) {
				187	type = swap_list.head;
				188	wrapped = 1;
				189	}
				190	} else
				191	if (type < 0)
				192	goto out; /* out of swap space */
				193	}
				194	out:
				195	swap_list_unlock();
				196	return entry;
				197	}
				198
				199	static struct swap_info_struct * swap_info_get(swp_entry_t entry)
				200	{
				201	struct swap_info_struct * p;
				202	unsigned long offset, type;
				203
				204	if (!entry.val)
				205	goto out;
				206	type = swp_type(entry);
				207	if (type >= nr_swapfiles)
				208	goto bad_nofile;
				209	p = & swap_info[type];
				210	if (!(p->flags & SWP_USED))
				211	goto bad_device;
				212	offset = swp_offset(entry);
				213	if (offset >= p->max)
				214	goto bad_offset;
				215	if (!p->swap_map[offset])
				216	goto bad_free;
				217	swap_list_lock();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	218	swap_device_lock(p);
				219	return p;
				220
				221	bad_free:
				222	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
				223	goto out;
				224	bad_offset:
				225	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
				226	goto out;
				227	bad_device:
				228	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
				229	goto out;
				230	bad_nofile:
				231	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
				232	out:
				233	return NULL;
				234	}
				235
				236	static void swap_info_put(struct swap_info_struct * p)
				237	{
				238	swap_device_unlock(p);
				239	swap_list_unlock();
				240	}
				241
				242	static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
				243	{
				244	int count = p->swap_map[offset];
				245
				246	if (count < SWAP_MAP_MAX) {
				247	count--;
				248	p->swap_map[offset] = count;
				249	if (!count) {
				250	if (offset < p->lowest_bit)
				251	p->lowest_bit = offset;
				252	if (offset > p->highest_bit)
				253	p->highest_bit = offset;
Hugh Dickins	89d09a2	2005-09-03 15:54:36 -0700	[diff] [blame^]	254	if (p->prio > swap_info[swap_list.next].prio)
				255	swap_list.next = p - swap_info;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	256	nr_swap_pages++;
				257	p->inuse_pages--;
				258	}
				259	}
				260	return count;
				261	}
				262
				263	/*
				264	* Caller has made sure that the swapdevice corresponding to entry
				265	* is still around or has not been recycled.
				266	*/
				267	void swap_free(swp_entry_t entry)
				268	{
				269	struct swap_info_struct * p;
				270
				271	p = swap_info_get(entry);
				272	if (p) {
				273	swap_entry_free(p, swp_offset(entry));
				274	swap_info_put(p);
				275	}
				276	}
				277
				278	/*
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	279	* How many references to page are currently swapped out?
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	280	*/
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	281	static inline int page_swapcount(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	282	{
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	283	int count = 0;
				284	struct swap_info_struct *p;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	285	swp_entry_t entry;
				286
				287	entry.val = page->private;
				288	p = swap_info_get(entry);
				289	if (p) {
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	290	/* Subtract the 1 for the swap cache itself */
				291	count = p->swap_map[swp_offset(entry)] - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	292	swap_info_put(p);
				293	}
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	294	return count;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	295	}
				296
				297	/*
				298	* We can use this swap cache entry directly
				299	* if there are no other references to it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	300	*/
				301	int can_share_swap_page(struct page *page)
				302	{
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	303	int count;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	304
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	305	BUG_ON(!PageLocked(page));
				306	count = page_mapcount(page);
				307	if (count <= 1 && PageSwapCache(page))
				308	count += page_swapcount(page);
				309	return count == 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	310	}
				311
				312	/*
				313	* Work out if there are any other processes sharing this
				314	* swap cache page. Free it if you can. Return success.
				315	*/
				316	int remove_exclusive_swap_page(struct page *page)
				317	{
				318	int retval;
				319	struct swap_info_struct * p;
				320	swp_entry_t entry;
				321
				322	BUG_ON(PagePrivate(page));
				323	BUG_ON(!PageLocked(page));
				324
				325	if (!PageSwapCache(page))
				326	return 0;
				327	if (PageWriteback(page))
				328	return 0;
				329	if (page_count(page) != 2) /* 2: us + cache */
				330	return 0;
				331
				332	entry.val = page->private;
				333	p = swap_info_get(entry);
				334	if (!p)
				335	return 0;
				336
				337	/* Is the only swap cache user the cache itself? */
				338	retval = 0;
				339	if (p->swap_map[swp_offset(entry)] == 1) {
				340	/* Recheck the page count with the swapcache lock held.. */
				341	write_lock_irq(&swapper_space.tree_lock);
				342	if ((page_count(page) == 2) && !PageWriteback(page)) {
				343	__delete_from_swap_cache(page);
				344	SetPageDirty(page);
				345	retval = 1;
				346	}
				347	write_unlock_irq(&swapper_space.tree_lock);
				348	}
				349	swap_info_put(p);
				350
				351	if (retval) {
				352	swap_free(entry);
				353	page_cache_release(page);
				354	}
				355
				356	return retval;
				357	}
				358
				359	/*
				360	* Free the swap entry like above, but also try to
				361	* free the page cache entry if it is the last user.
				362	*/
				363	void free_swap_and_cache(swp_entry_t entry)
				364	{
				365	struct swap_info_struct * p;
				366	struct page *page = NULL;
				367
				368	p = swap_info_get(entry);
				369	if (p) {
				370	if (swap_entry_free(p, swp_offset(entry)) == 1)
				371	page = find_trylock_page(&swapper_space, entry.val);
				372	swap_info_put(p);
				373	}
				374	if (page) {
				375	int one_user;
				376
				377	BUG_ON(PagePrivate(page));
				378	page_cache_get(page);
				379	one_user = (page_count(page) == 2);
				380	/* Only cache user (+us), or swap space full? Free it! */
				381	if (!PageWriteback(page) && (one_user \|\| vm_swap_full())) {
				382	delete_from_swap_cache(page);
				383	SetPageDirty(page);
				384	}
				385	unlock_page(page);
				386	page_cache_release(page);
				387	}
				388	}
				389
				390	/*
				391	* Always set the resulting pte to be nowrite (the same as COW pages
				392	* after one process has exited). We don't know just how many PTEs will
				393	* share this swap entry, so be cautious and let do_wp_page work out
				394	* what to do if a write is requested later.
				395	*
				396	* vma->vm_mm->page_table_lock is held.
				397	*/
				398	static void unuse_pte(struct vm_area_struct vma, pte_t pte,
				399	unsigned long addr, swp_entry_t entry, struct page *page)
				400	{
				401	inc_mm_counter(vma->vm_mm, rss);
				402	get_page(page);
				403	set_pte_at(vma->vm_mm, addr, pte,
				404	pte_mkold(mk_pte(page, vma->vm_page_prot)));
				405	page_add_anon_rmap(page, vma, addr);
				406	swap_free(entry);
				407	/*
				408	* Move the page to the active list so it is not
				409	* immediately swapped out again after swapon.
				410	*/
				411	activate_page(page);
				412	}
				413
				414	static int unuse_pte_range(struct vm_area_struct vma, pmd_t pmd,
				415	unsigned long addr, unsigned long end,
				416	swp_entry_t entry, struct page *page)
				417	{
				418	pte_t *pte;
				419	pte_t swp_pte = swp_entry_to_pte(entry);
				420
				421	pte = pte_offset_map(pmd, addr);
				422	do {
				423	/*
				424	* swapoff spends a _lot_ of time in this loop!
				425	* Test inline before going to call unuse_pte.
				426	*/
				427	if (unlikely(pte_same(*pte, swp_pte))) {
				428	unuse_pte(vma, pte, addr, entry, page);
				429	pte_unmap(pte);
				430	return 1;
				431	}
				432	} while (pte++, addr += PAGE_SIZE, addr != end);
				433	pte_unmap(pte - 1);
				434	return 0;
				435	}
				436
				437	static inline int unuse_pmd_range(struct vm_area_struct vma, pud_t pud,
				438	unsigned long addr, unsigned long end,
				439	swp_entry_t entry, struct page *page)
				440	{
				441	pmd_t *pmd;
				442	unsigned long next;
				443
				444	pmd = pmd_offset(pud, addr);
				445	do {
				446	next = pmd_addr_end(addr, end);
				447	if (pmd_none_or_clear_bad(pmd))
				448	continue;
				449	if (unuse_pte_range(vma, pmd, addr, next, entry, page))
				450	return 1;
				451	} while (pmd++, addr = next, addr != end);
				452	return 0;
				453	}
				454
				455	static inline int unuse_pud_range(struct vm_area_struct vma, pgd_t pgd,
				456	unsigned long addr, unsigned long end,
				457	swp_entry_t entry, struct page *page)
				458	{
				459	pud_t *pud;
				460	unsigned long next;
				461
				462	pud = pud_offset(pgd, addr);
				463	do {
				464	next = pud_addr_end(addr, end);
				465	if (pud_none_or_clear_bad(pud))
				466	continue;
				467	if (unuse_pmd_range(vma, pud, addr, next, entry, page))
				468	return 1;
				469	} while (pud++, addr = next, addr != end);
				470	return 0;
				471	}
				472
				473	static int unuse_vma(struct vm_area_struct *vma,
				474	swp_entry_t entry, struct page *page)
				475	{
				476	pgd_t *pgd;
				477	unsigned long addr, end, next;
				478
				479	if (page->mapping) {
				480	addr = page_address_in_vma(page, vma);
				481	if (addr == -EFAULT)
				482	return 0;
				483	else
				484	end = addr + PAGE_SIZE;
				485	} else {
				486	addr = vma->vm_start;
				487	end = vma->vm_end;
				488	}
				489
				490	pgd = pgd_offset(vma->vm_mm, addr);
				491	do {
				492	next = pgd_addr_end(addr, end);
				493	if (pgd_none_or_clear_bad(pgd))
				494	continue;
				495	if (unuse_pud_range(vma, pgd, addr, next, entry, page))
				496	return 1;
				497	} while (pgd++, addr = next, addr != end);
				498	return 0;
				499	}
				500
				501	static int unuse_mm(struct mm_struct *mm,
				502	swp_entry_t entry, struct page *page)
				503	{
				504	struct vm_area_struct *vma;
				505
				506	if (!down_read_trylock(&mm->mmap_sem)) {
				507	/*
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	508	* Activate page so shrink_cache is unlikely to unmap its
				509	* ptes while lock is dropped, so swapoff can make progress.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	510	*/
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	511	activate_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	512	unlock_page(page);
				513	down_read(&mm->mmap_sem);
				514	lock_page(page);
				515	}
				516	spin_lock(&mm->page_table_lock);
				517	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				518	if (vma->anon_vma && unuse_vma(vma, entry, page))
				519	break;
				520	}
				521	spin_unlock(&mm->page_table_lock);
				522	up_read(&mm->mmap_sem);
				523	/*
				524	* Currently unuse_mm cannot fail, but leave error handling
				525	* at call sites for now, since we change it from time to time.
				526	*/
				527	return 0;
				528	}
				529
				530	/*
				531	* Scan swap_map from current position to next entry still in use.
				532	* Recycle to start on reaching the end, returning 0 when empty.
				533	*/
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	534	static unsigned int find_next_to_unuse(struct swap_info_struct *si,
				535	unsigned int prev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	536	{
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	537	unsigned int max = si->max;
				538	unsigned int i = prev;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	539	int count;
				540
				541	/*
				542	* No need for swap_device_lock(si) here: we're just looking
				543	* for whether an entry is in use, not modifying it; false
				544	* hits are okay, and sys_swapoff() has already prevented new
				545	* allocations from this area (while holding swap_list_lock()).
				546	*/
				547	for (;;) {
				548	if (++i >= max) {
				549	if (!prev) {
				550	i = 0;
				551	break;
				552	}
				553	/*
				554	* No entries in use at top of swap_map,
				555	* loop back to start and recheck there.
				556	*/
				557	max = prev + 1;
				558	prev = 0;
				559	i = 1;
				560	}
				561	count = si->swap_map[i];
				562	if (count && count != SWAP_MAP_BAD)
				563	break;
				564	}
				565	return i;
				566	}
				567
				568	/*
				569	* We completely avoid races by reading each swap page in advance,
				570	* and then search for the process using it. All the necessary
				571	* page table adjustments can then be made atomically.
				572	*/
				573	static int try_to_unuse(unsigned int type)
				574	{
				575	struct swap_info_struct * si = &swap_info[type];
				576	struct mm_struct *start_mm;
				577	unsigned short *swap_map;
				578	unsigned short swcount;
				579	struct page *page;
				580	swp_entry_t entry;
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	581	unsigned int i = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	582	int retval = 0;
				583	int reset_overflow = 0;
				584	int shmem;
				585
				586	/*
				587	* When searching mms for an entry, a good strategy is to
				588	* start at the first mm we freed the previous entry from
				589	* (though actually we don't notice whether we or coincidence
				590	* freed the entry). Initialize this start_mm with a hold.
				591	*
				592	* A simpler strategy would be to start at the last mm we
				593	* freed the previous entry from; but that would take less
				594	* advantage of mmlist ordering, which clusters forked mms
				595	* together, child after parent. If we race with dup_mmap(), we
				596	* prefer to resolve parent before child, lest we miss entries
				597	* duplicated after we scanned child: using last mm would invert
				598	* that. Though it's only a serious concern when an overflowed
				599	* swap count is reset from SWAP_MAP_MAX, preventing a rescan.
				600	*/
				601	start_mm = &init_mm;
				602	atomic_inc(&init_mm.mm_users);
				603
				604	/*
				605	* Keep on scanning until all entries have gone. Usually,
				606	* one pass through swap_map is enough, but not necessarily:
				607	* there are races when an instance of an entry might be missed.
				608	*/
				609	while ((i = find_next_to_unuse(si, i)) != 0) {
				610	if (signal_pending(current)) {
				611	retval = -EINTR;
				612	break;
				613	}
				614
				615	/*
				616	* Get a page for the entry, using the existing swap
				617	* cache page if there is one. Otherwise, get a clean
				618	* page and read the swap into it.
				619	*/
				620	swap_map = &si->swap_map[i];
				621	entry = swp_entry(type, i);
				622	page = read_swap_cache_async(entry, NULL, 0);
				623	if (!page) {
				624	/*
				625	* Either swap_duplicate() failed because entry
				626	* has been freed independently, and will not be
				627	* reused since sys_swapoff() already disabled
				628	* allocation from here, or alloc_page() failed.
				629	*/
				630	if (!*swap_map)
				631	continue;
				632	retval = -ENOMEM;
				633	break;
				634	}
				635
				636	/*
				637	* Don't hold on to start_mm if it looks like exiting.
				638	*/
				639	if (atomic_read(&start_mm->mm_users) == 1) {
				640	mmput(start_mm);
				641	start_mm = &init_mm;
				642	atomic_inc(&init_mm.mm_users);
				643	}
				644
				645	/*
				646	* Wait for and lock page. When do_swap_page races with
				647	* try_to_unuse, do_swap_page can handle the fault much
				648	* faster than try_to_unuse can locate the entry. This
				649	* apparently redundant "wait_on_page_locked" lets try_to_unuse
				650	* defer to do_swap_page in such a case - in some tests,
				651	* do_swap_page and try_to_unuse repeatedly compete.
				652	*/
				653	wait_on_page_locked(page);
				654	wait_on_page_writeback(page);
				655	lock_page(page);
				656	wait_on_page_writeback(page);
				657
				658	/*
				659	* Remove all references to entry.
				660	* Whenever we reach init_mm, there's no address space
				661	* to search, but use it as a reminder to search shmem.
				662	*/
				663	shmem = 0;
				664	swcount = *swap_map;
				665	if (swcount > 1) {
				666	if (start_mm == &init_mm)
				667	shmem = shmem_unuse(entry, page);
				668	else
				669	retval = unuse_mm(start_mm, entry, page);
				670	}
				671	if (*swap_map > 1) {
				672	int set_start_mm = (*swap_map >= swcount);
				673	struct list_head *p = &start_mm->mmlist;
				674	struct mm_struct *new_start_mm = start_mm;
				675	struct mm_struct *prev_mm = start_mm;
				676	struct mm_struct *mm;
				677
				678	atomic_inc(&new_start_mm->mm_users);
				679	atomic_inc(&prev_mm->mm_users);
				680	spin_lock(&mmlist_lock);
				681	while (*swap_map > 1 && !retval &&
				682	(p = p->next) != &start_mm->mmlist) {
				683	mm = list_entry(p, struct mm_struct, mmlist);
				684	if (atomic_inc_return(&mm->mm_users) == 1) {
				685	atomic_dec(&mm->mm_users);
				686	continue;
				687	}
				688	spin_unlock(&mmlist_lock);
				689	mmput(prev_mm);
				690	prev_mm = mm;
				691
				692	cond_resched();
				693
				694	swcount = *swap_map;
				695	if (swcount <= 1)
				696	;
				697	else if (mm == &init_mm) {
				698	set_start_mm = 1;
				699	shmem = shmem_unuse(entry, page);
				700	} else
				701	retval = unuse_mm(mm, entry, page);
				702	if (set_start_mm && *swap_map < swcount) {
				703	mmput(new_start_mm);
				704	atomic_inc(&mm->mm_users);
				705	new_start_mm = mm;
				706	set_start_mm = 0;
				707	}
				708	spin_lock(&mmlist_lock);
				709	}
				710	spin_unlock(&mmlist_lock);
				711	mmput(prev_mm);
				712	mmput(start_mm);
				713	start_mm = new_start_mm;
				714	}
				715	if (retval) {
				716	unlock_page(page);
				717	page_cache_release(page);
				718	break;
				719	}
				720
				721	/*
				722	* How could swap count reach 0x7fff when the maximum
				723	* pid is 0x7fff, and there's no way to repeat a swap
				724	* page within an mm (except in shmem, where it's the
				725	* shared object which takes the reference count)?
				726	* We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
				727	*
				728	* If that's wrong, then we should worry more about
				729	* exit_mmap() and do_munmap() cases described above:
				730	* we might be resetting SWAP_MAP_MAX too early here.
				731	* We know "Undead"s can happen, they're okay, so don't
				732	* report them; but do report if we reset SWAP_MAP_MAX.
				733	*/
				734	if (*swap_map == SWAP_MAP_MAX) {
				735	swap_device_lock(si);
				736	*swap_map = 1;
				737	swap_device_unlock(si);
				738	reset_overflow = 1;
				739	}
				740
				741	/*
				742	* If a reference remains (rare), we would like to leave
				743	* the page in the swap cache; but try_to_unmap could
				744	* then re-duplicate the entry once we drop page lock,
				745	* so we might loop indefinitely; also, that page could
				746	* not be swapped out to other storage meanwhile. So:
				747	* delete from cache even if there's another reference,
				748	* after ensuring that the data has been saved to disk -
				749	* since if the reference remains (rarer), it will be
				750	* read from disk into another page. Splitting into two
				751	* pages would be incorrect if swap supported "shared
				752	* private" pages, but they are handled by tmpfs files.
				753	*
				754	* Note shmem_unuse already deleted a swappage from
				755	* the swap cache, unless the move to filepage failed:
				756	* in which case it left swappage in cache, lowered its
				757	* swap count to pass quickly through the loops above,
				758	* and now we must reincrement count to try again later.
				759	*/
				760	if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
				761	struct writeback_control wbc = {
				762	.sync_mode = WB_SYNC_NONE,
				763	};
				764
				765	swap_writepage(page, &wbc);
				766	lock_page(page);
				767	wait_on_page_writeback(page);
				768	}
				769	if (PageSwapCache(page)) {
				770	if (shmem)
				771	swap_duplicate(entry);
				772	else
				773	delete_from_swap_cache(page);
				774	}
				775
				776	/*
				777	* So we could skip searching mms once swap count went
				778	* to 1, we did not mark any present ptes as dirty: must
				779	* mark page dirty so shrink_list will preserve it.
				780	*/
				781	SetPageDirty(page);
				782	unlock_page(page);
				783	page_cache_release(page);
				784
				785	/*
				786	* Make sure that we aren't completely killing
				787	* interactive performance.
				788	*/
				789	cond_resched();
				790	}
				791
				792	mmput(start_mm);
				793	if (reset_overflow) {
				794	printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
				795	swap_overflow = 0;
				796	}
				797	return retval;
				798	}
				799
				800	/*
				801	* After a successful try_to_unuse, if no swap is now in use, we know we
				802	* can empty the mmlist. swap_list_lock must be held on entry and exit.
				803	* Note that mmlist_lock nests inside swap_list_lock, and an mm must be
				804	* added to the mmlist just after page_duplicate - before would be racy.
				805	*/
				806	static void drain_mmlist(void)
				807	{
				808	struct list_head p, next;
				809	unsigned int i;
				810
				811	for (i = 0; i < nr_swapfiles; i++)
				812	if (swap_info[i].inuse_pages)
				813	return;
				814	spin_lock(&mmlist_lock);
				815	list_for_each_safe(p, next, &init_mm.mmlist)
				816	list_del_init(p);
				817	spin_unlock(&mmlist_lock);
				818	}
				819
				820	/*
				821	* Use this swapdev's extent info to locate the (PAGE_SIZE) block which
				822	* corresponds to page offset `offset'.
				823	*/
				824	sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
				825	{
				826	struct swap_extent *se = sis->curr_swap_extent;
				827	struct swap_extent *start_se = se;
				828
				829	for ( ; ; ) {
				830	struct list_head *lh;
				831
				832	if (se->start_page <= offset &&
				833	offset < (se->start_page + se->nr_pages)) {
				834	return se->start_block + (offset - se->start_page);
				835	}
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	836	lh = se->list.next;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	837	if (lh == &sis->extent_list)
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	838	lh = lh->next;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	839	se = list_entry(lh, struct swap_extent, list);
				840	sis->curr_swap_extent = se;
				841	BUG_ON(se == start_se); /* It must be present */
				842	}
				843	}
				844
				845	/*
				846	* Free all of a swapdev's extent information
				847	*/
				848	static void destroy_swap_extents(struct swap_info_struct *sis)
				849	{
				850	while (!list_empty(&sis->extent_list)) {
				851	struct swap_extent *se;
				852
				853	se = list_entry(sis->extent_list.next,
				854	struct swap_extent, list);
				855	list_del(&se->list);
				856	kfree(se);
				857	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	858	}
				859
				860	/*
				861	* Add a block range (and the corresponding page range) into this swapdev's
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	862	* extent list. The extent list is kept sorted in page order.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	863	*
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	864	* This function rather assumes that it is called in ascending page order.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	865	*/
				866	static int
				867	add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
				868	unsigned long nr_pages, sector_t start_block)
				869	{
				870	struct swap_extent *se;
				871	struct swap_extent *new_se;
				872	struct list_head *lh;
				873
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	874	lh = sis->extent_list.prev; /* The highest page extent */
				875	if (lh != &sis->extent_list) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	876	se = list_entry(lh, struct swap_extent, list);
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	877	BUG_ON(se->start_page + se->nr_pages != start_page);
				878	if (se->start_block + se->nr_pages == start_block) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	879	/* Merge it */
				880	se->nr_pages += nr_pages;
				881	return 0;
				882	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	883	}
				884
				885	/*
				886	* No merge. Insert a new extent, preserving ordering.
				887	*/
				888	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
				889	if (new_se == NULL)
				890	return -ENOMEM;
				891	new_se->start_page = start_page;
				892	new_se->nr_pages = nr_pages;
				893	new_se->start_block = start_block;
				894
Hugh Dickins	11d3188	2005-09-03 15:54:34 -0700	[diff] [blame]	895	list_add_tail(&new_se->list, &sis->extent_list);
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	896	return 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	897	}
				898
				899	/*
				900	* A `swap extent' is a simple thing which maps a contiguous range of pages
				901	* onto a contiguous range of disk blocks. An ordered list of swap extents
				902	* is built at swapon time and is then used at swap_writepage/swap_readpage
				903	* time for locating where on disk a page belongs.
				904	*
				905	* If the swapfile is an S_ISBLK block device, a single extent is installed.
				906	* This is done so that the main operating code can treat S_ISBLK and S_ISREG
				907	* swap files identically.
				908	*
				909	* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
				910	* extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
				911	* swapfiles are handled identically after swapon time.
				912	*
				913	* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
				914	* and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
				915	* some stray blocks are found which do not fall within the PAGE_SIZE alignment
				916	* requirements, they are simply tossed out - we will never use those blocks
				917	* for swapping.
				918	*
Hugh Dickins	b0d9bcd	2005-09-03 15:54:31 -0700	[diff] [blame]	919	* For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	920	* prevents root from shooting her foot off by ftruncating an in-use swapfile,
				921	* which will scribble on the fs.
				922	*
				923	* The amount of disk space which a single swap extent represents varies.
				924	* Typically it is in the 1-4 megabyte range. So we can have hundreds of
				925	* extents in the list. To avoid much list walking, we cache the previous
				926	* search location in `curr_swap_extent', and start new searches from there.
				927	* This is extremely effective. The average number of iterations in
				928	* map_swap_page() has been measured at about 0.3 per page. - akpm.
				929	*/
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	930	static int setup_swap_extents(struct swap_info_struct sis, sector_t span)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	931	{
				932	struct inode *inode;
				933	unsigned blocks_per_page;
				934	unsigned long page_no;
				935	unsigned blkbits;
				936	sector_t probe_block;
				937	sector_t last_block;
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	938	sector_t lowest_block = -1;
				939	sector_t highest_block = 0;
				940	int nr_extents = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	941	int ret;
				942
				943	inode = sis->swap_file->f_mapping->host;
				944	if (S_ISBLK(inode->i_mode)) {
				945	ret = add_swap_extent(sis, 0, sis->max, 0);
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	946	*span = sis->pages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	947	goto done;
				948	}
				949
				950	blkbits = inode->i_blkbits;
				951	blocks_per_page = PAGE_SIZE >> blkbits;
				952
				953	/*
				954	* Map all the blocks into the extent list. This code doesn't try
				955	* to be very smart.
				956	*/
				957	probe_block = 0;
				958	page_no = 0;
				959	last_block = i_size_read(inode) >> blkbits;
				960	while ((probe_block + blocks_per_page) <= last_block &&
				961	page_no < sis->max) {
				962	unsigned block_in_page;
				963	sector_t first_block;
				964
				965	first_block = bmap(inode, probe_block);
				966	if (first_block == 0)
				967	goto bad_bmap;
				968
				969	/*
				970	* It must be PAGE_SIZE aligned on-disk
				971	*/
				972	if (first_block & (blocks_per_page - 1)) {
				973	probe_block++;
				974	goto reprobe;
				975	}
				976
				977	for (block_in_page = 1; block_in_page < blocks_per_page;
				978	block_in_page++) {
				979	sector_t block;
				980
				981	block = bmap(inode, probe_block + block_in_page);
				982	if (block == 0)
				983	goto bad_bmap;
				984	if (block != first_block + block_in_page) {
				985	/* Discontiguity */
				986	probe_block++;
				987	goto reprobe;
				988	}
				989	}
				990
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	991	first_block >>= (PAGE_SHIFT - blkbits);
				992	if (page_no) { /* exclude the header page */
				993	if (first_block < lowest_block)
				994	lowest_block = first_block;
				995	if (first_block > highest_block)
				996	highest_block = first_block;
				997	}
				998
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	999	/*
				1000	* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
				1001	*/
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1002	ret = add_swap_extent(sis, page_no, 1, first_block);
				1003	if (ret < 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1004	goto out;
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1005	nr_extents += ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1006	page_no++;
				1007	probe_block += blocks_per_page;
				1008	reprobe:
				1009	continue;
				1010	}
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1011	ret = nr_extents;
				1012	*span = 1 + highest_block - lowest_block;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1013	if (page_no == 0)
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1014	page_no = 1; /* force Empty message */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1015	sis->max = page_no;
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1016	sis->pages = page_no - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1017	sis->highest_bit = page_no - 1;
				1018	done:
				1019	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
				1020	struct swap_extent, list);
				1021	goto out;
				1022	bad_bmap:
				1023	printk(KERN_ERR "swapon: swapfile has holes\n");
				1024	ret = -EINVAL;
				1025	out:
				1026	return ret;
				1027	}
				1028
				1029	#if 0 /* We don't need this yet */
				1030	#include <linux/backing-dev.h>
				1031	int page_queue_congested(struct page *page)
				1032	{
				1033	struct backing_dev_info *bdi;
				1034
				1035	BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
				1036
				1037	if (PageSwapCache(page)) {
				1038	swp_entry_t entry = { .val = page->private };
				1039	struct swap_info_struct *sis;
				1040
				1041	sis = get_swap_info_struct(swp_type(entry));
				1042	bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
				1043	} else
				1044	bdi = page->mapping->backing_dev_info;
				1045	return bdi_write_congested(bdi);
				1046	}
				1047	#endif
				1048
				1049	asmlinkage long sys_swapoff(const char __user * specialfile)
				1050	{
				1051	struct swap_info_struct * p = NULL;
				1052	unsigned short *swap_map;
				1053	struct file swap_file, victim;
				1054	struct address_space *mapping;
				1055	struct inode *inode;
				1056	char * pathname;
				1057	int i, type, prev;
				1058	int err;
				1059
				1060	if (!capable(CAP_SYS_ADMIN))
				1061	return -EPERM;
				1062
				1063	pathname = getname(specialfile);
				1064	err = PTR_ERR(pathname);
				1065	if (IS_ERR(pathname))
				1066	goto out;
				1067
				1068	victim = filp_open(pathname, O_RDWR\|O_LARGEFILE, 0);
				1069	putname(pathname);
				1070	err = PTR_ERR(victim);
				1071	if (IS_ERR(victim))
				1072	goto out;
				1073
				1074	mapping = victim->f_mapping;
				1075	prev = -1;
				1076	swap_list_lock();
				1077	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
				1078	p = swap_info + type;
				1079	if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
				1080	if (p->swap_file->f_mapping == mapping)
				1081	break;
				1082	}
				1083	prev = type;
				1084	}
				1085	if (type < 0) {
				1086	err = -EINVAL;
				1087	swap_list_unlock();
				1088	goto out_dput;
				1089	}
				1090	if (!security_vm_enough_memory(p->pages))
				1091	vm_unacct_memory(p->pages);
				1092	else {
				1093	err = -ENOMEM;
				1094	swap_list_unlock();
				1095	goto out_dput;
				1096	}
				1097	if (prev < 0) {
				1098	swap_list.head = p->next;
				1099	} else {
				1100	swap_info[prev].next = p->next;
				1101	}
				1102	if (type == swap_list.next) {
				1103	/* just pick something that's safe... */
				1104	swap_list.next = swap_list.head;
				1105	}
				1106	nr_swap_pages -= p->pages;
				1107	total_swap_pages -= p->pages;
				1108	p->flags &= ~SWP_WRITEOK;
				1109	swap_list_unlock();
				1110	current->flags \|= PF_SWAPOFF;
				1111	err = try_to_unuse(type);
				1112	current->flags &= ~PF_SWAPOFF;
				1113
				1114	/* wait for any unplug function to finish */
				1115	down_write(&swap_unplug_sem);
				1116	up_write(&swap_unplug_sem);
				1117
				1118	if (err) {
				1119	/* re-insert swap space back into swap_list */
				1120	swap_list_lock();
				1121	for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
				1122	if (p->prio >= swap_info[i].prio)
				1123	break;
				1124	p->next = i;
				1125	if (prev < 0)
				1126	swap_list.head = swap_list.next = p - swap_info;
				1127	else
				1128	swap_info[prev].next = p - swap_info;
				1129	nr_swap_pages += p->pages;
				1130	total_swap_pages += p->pages;
				1131	p->flags \|= SWP_WRITEOK;
				1132	swap_list_unlock();
				1133	goto out_dput;
				1134	}
Hugh Dickins	4cd3bb1	2005-09-03 15:54:33 -0700	[diff] [blame]	1135	destroy_swap_extents(p);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1136	down(&swapon_sem);
				1137	swap_list_lock();
				1138	drain_mmlist();
				1139	swap_device_lock(p);
				1140	swap_file = p->swap_file;
				1141	p->swap_file = NULL;
				1142	p->max = 0;
				1143	swap_map = p->swap_map;
				1144	p->swap_map = NULL;
				1145	p->flags = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1146	swap_device_unlock(p);
				1147	swap_list_unlock();
				1148	up(&swapon_sem);
				1149	vfree(swap_map);
				1150	inode = mapping->host;
				1151	if (S_ISBLK(inode->i_mode)) {
				1152	struct block_device *bdev = I_BDEV(inode);
				1153	set_blocksize(bdev, p->old_block_size);
				1154	bd_release(bdev);
				1155	} else {
				1156	down(&inode->i_sem);
				1157	inode->i_flags &= ~S_SWAPFILE;
				1158	up(&inode->i_sem);
				1159	}
				1160	filp_close(swap_file, NULL);
				1161	err = 0;
				1162
				1163	out_dput:
				1164	filp_close(victim, NULL);
				1165	out:
				1166	return err;
				1167	}
				1168
				1169	#ifdef CONFIG_PROC_FS
				1170	/* iterator */
				1171	static void swap_start(struct seq_file swap, loff_t *pos)
				1172	{
				1173	struct swap_info_struct *ptr = swap_info;
				1174	int i;
				1175	loff_t l = *pos;
				1176
				1177	down(&swapon_sem);
				1178
				1179	for (i = 0; i < nr_swapfiles; i++, ptr++) {
				1180	if (!(ptr->flags & SWP_USED) \|\| !ptr->swap_map)
				1181	continue;
				1182	if (!l--)
				1183	return ptr;
				1184	}
				1185
				1186	return NULL;
				1187	}
				1188
				1189	static void swap_next(struct seq_file swap, void v, loff_t pos)
				1190	{
				1191	struct swap_info_struct *ptr = v;
				1192	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
				1193
				1194	for (++ptr; ptr < endptr; ptr++) {
				1195	if (!(ptr->flags & SWP_USED) \|\| !ptr->swap_map)
				1196	continue;
				1197	++*pos;
				1198	return ptr;
				1199	}
				1200
				1201	return NULL;
				1202	}
				1203
				1204	static void swap_stop(struct seq_file swap, void v)
				1205	{
				1206	up(&swapon_sem);
				1207	}
				1208
				1209	static int swap_show(struct seq_file swap, void v)
				1210	{
				1211	struct swap_info_struct *ptr = v;
				1212	struct file *file;
				1213	int len;
				1214
				1215	if (v == swap_info)
				1216	seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
				1217
				1218	file = ptr->swap_file;
				1219	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	1220	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1221	len < 40 ? 40 - len : 1, " ",
				1222	S_ISBLK(file->f_dentry->d_inode->i_mode) ?
				1223	"partition" : "file\t",
				1224	ptr->pages << (PAGE_SHIFT - 10),
				1225	ptr->inuse_pages << (PAGE_SHIFT - 10),
				1226	ptr->prio);
				1227	return 0;
				1228	}
				1229
				1230	static struct seq_operations swaps_op = {
				1231	.start = swap_start,
				1232	.next = swap_next,
				1233	.stop = swap_stop,
				1234	.show = swap_show
				1235	};
				1236
				1237	static int swaps_open(struct inode inode, struct file file)
				1238	{
				1239	return seq_open(file, &swaps_op);
				1240	}
				1241
				1242	static struct file_operations proc_swaps_operations = {
				1243	.open = swaps_open,
				1244	.read = seq_read,
				1245	.llseek = seq_lseek,
				1246	.release = seq_release,
				1247	};
				1248
				1249	static int __init procswaps_init(void)
				1250	{
				1251	struct proc_dir_entry *entry;
				1252
				1253	entry = create_proc_entry("swaps", 0, NULL);
				1254	if (entry)
				1255	entry->proc_fops = &proc_swaps_operations;
				1256	return 0;
				1257	}
				1258	__initcall(procswaps_init);
				1259	#endif /* CONFIG_PROC_FS */
				1260
				1261	/*
				1262	* Written 01/25/92 by Simmule Turner, heavily changed by Linus.
				1263	*
				1264	* The swapon system call
				1265	*/
				1266	asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
				1267	{
				1268	struct swap_info_struct * p;
				1269	char *name = NULL;
				1270	struct block_device *bdev = NULL;
				1271	struct file *swap_file = NULL;
				1272	struct address_space *mapping;
				1273	unsigned int type;
				1274	int i, prev;
				1275	int error;
				1276	static int least_priority;
				1277	union swap_header *swap_header = NULL;
				1278	int swap_header_version;
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	1279	unsigned int nr_good_pages = 0;
				1280	int nr_extents = 0;
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1281	sector_t span;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1282	unsigned long maxpages = 1;
				1283	int swapfilesize;
				1284	unsigned short *swap_map;
				1285	struct page *page = NULL;
				1286	struct inode *inode = NULL;
				1287	int did_down = 0;
				1288
				1289	if (!capable(CAP_SYS_ADMIN))
				1290	return -EPERM;
				1291	swap_list_lock();
				1292	p = swap_info;
				1293	for (type = 0 ; type < nr_swapfiles ; type++,p++)
				1294	if (!(p->flags & SWP_USED))
				1295	break;
				1296	error = -EPERM;
				1297	/*
				1298	* Test if adding another swap device is possible. There are
				1299	* two limiting factors: 1) the number of bits for the swap
				1300	* type swp_entry_t definition and 2) the number of bits for
				1301	* the swap type in the swap ptes as defined by the different
				1302	* architectures. To honor both limitations a swap entry
				1303	* with swap offset 0 and swap type ~0UL is created, encoded
				1304	* to a swap pte, decoded to a swp_entry_t again and finally
				1305	* the swap type part is extracted. This will mask all bits
				1306	* from the initial ~0UL that can't be encoded in either the
				1307	* swp_entry_t or the architecture definition of a swap pte.
				1308	*/
				1309	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
				1310	swap_list_unlock();
				1311	goto out;
				1312	}
				1313	if (type >= nr_swapfiles)
				1314	nr_swapfiles = type+1;
				1315	INIT_LIST_HEAD(&p->extent_list);
				1316	p->flags = SWP_USED;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1317	p->swap_file = NULL;
				1318	p->old_block_size = 0;
				1319	p->swap_map = NULL;
				1320	p->lowest_bit = 0;
				1321	p->highest_bit = 0;
				1322	p->cluster_nr = 0;
				1323	p->inuse_pages = 0;
				1324	spin_lock_init(&p->sdev_lock);
				1325	p->next = -1;
				1326	if (swap_flags & SWAP_FLAG_PREFER) {
				1327	p->prio =
				1328	(swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
				1329	} else {
				1330	p->prio = --least_priority;
				1331	}
				1332	swap_list_unlock();
				1333	name = getname(specialfile);
				1334	error = PTR_ERR(name);
				1335	if (IS_ERR(name)) {
				1336	name = NULL;
				1337	goto bad_swap_2;
				1338	}
				1339	swap_file = filp_open(name, O_RDWR\|O_LARGEFILE, 0);
				1340	error = PTR_ERR(swap_file);
				1341	if (IS_ERR(swap_file)) {
				1342	swap_file = NULL;
				1343	goto bad_swap_2;
				1344	}
				1345
				1346	p->swap_file = swap_file;
				1347	mapping = swap_file->f_mapping;
				1348	inode = mapping->host;
				1349
				1350	error = -EBUSY;
				1351	for (i = 0; i < nr_swapfiles; i++) {
				1352	struct swap_info_struct *q = &swap_info[i];
				1353
				1354	if (i == type \|\| !q->swap_file)
				1355	continue;
				1356	if (mapping == q->swap_file->f_mapping)
				1357	goto bad_swap;
				1358	}
				1359
				1360	error = -EINVAL;
				1361	if (S_ISBLK(inode->i_mode)) {
				1362	bdev = I_BDEV(inode);
				1363	error = bd_claim(bdev, sys_swapon);
				1364	if (error < 0) {
				1365	bdev = NULL;
				1366	goto bad_swap;
				1367	}
				1368	p->old_block_size = block_size(bdev);
				1369	error = set_blocksize(bdev, PAGE_SIZE);
				1370	if (error < 0)
				1371	goto bad_swap;
				1372	p->bdev = bdev;
				1373	} else if (S_ISREG(inode->i_mode)) {
				1374	p->bdev = inode->i_sb->s_bdev;
				1375	down(&inode->i_sem);
				1376	did_down = 1;
				1377	if (IS_SWAPFILE(inode)) {
				1378	error = -EBUSY;
				1379	goto bad_swap;
				1380	}
				1381	} else {
				1382	goto bad_swap;
				1383	}
				1384
				1385	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
				1386
				1387	/*
				1388	* Read the swap header.
				1389	*/
				1390	if (!mapping->a_ops->readpage) {
				1391	error = -EINVAL;
				1392	goto bad_swap;
				1393	}
				1394	page = read_cache_page(mapping, 0,
				1395	(filler_t *)mapping->a_ops->readpage, swap_file);
				1396	if (IS_ERR(page)) {
				1397	error = PTR_ERR(page);
				1398	goto bad_swap;
				1399	}
				1400	wait_on_page_locked(page);
				1401	if (!PageUptodate(page))
				1402	goto bad_swap;
				1403	kmap(page);
				1404	swap_header = page_address(page);
				1405
				1406	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
				1407	swap_header_version = 1;
				1408	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
				1409	swap_header_version = 2;
				1410	else {
				1411	printk("Unable to find swap-space signature\n");
				1412	error = -EINVAL;
				1413	goto bad_swap;
				1414	}
				1415
				1416	switch (swap_header_version) {
				1417	case 1:
				1418	printk(KERN_ERR "version 0 swap is no longer supported. "
				1419	"Use mkswap -v1 %s\n", name);
				1420	error = -EINVAL;
				1421	goto bad_swap;
				1422	case 2:
				1423	/* Check the swap header's sub-version and the size of
				1424	the swap file and bad block lists */
				1425	if (swap_header->info.version != 1) {
				1426	printk(KERN_WARNING
				1427	"Unable to handle swap header version %d\n",
				1428	swap_header->info.version);
				1429	error = -EINVAL;
				1430	goto bad_swap;
				1431	}
				1432
				1433	p->lowest_bit = 1;
				1434	/*
				1435	* Find out how many pages are allowed for a single swap
				1436	* device. There are two limiting factors: 1) the number of
				1437	* bits for the swap offset in the swp_entry_t type and
				1438	* 2) the number of bits in the a swap pte as defined by
				1439	* the different architectures. In order to find the
				1440	* largest possible bit mask a swap entry with swap type 0
				1441	* and swap offset ~0UL is created, encoded to a swap pte,
				1442	* decoded to a swp_entry_t again and finally the swap
				1443	* offset is extracted. This will mask all the bits from
				1444	* the initial ~0UL mask that can't be encoded in either
				1445	* the swp_entry_t or the architecture definition of a
				1446	* swap pte.
				1447	*/
				1448	maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
				1449	if (maxpages > swap_header->info.last_page)
				1450	maxpages = swap_header->info.last_page;
				1451	p->highest_bit = maxpages - 1;
				1452
				1453	error = -EINVAL;
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1454	if (!maxpages)
				1455	goto bad_swap;
				1456	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
				1457	goto bad_swap;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1458	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
				1459	goto bad_swap;
				1460
				1461	/* OK, set up the swap map and apply the bad block list */
				1462	if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
				1463	error = -ENOMEM;
				1464	goto bad_swap;
				1465	}
				1466
				1467	error = 0;
				1468	memset(p->swap_map, 0, maxpages * sizeof(short));
				1469	for (i=0; i<swap_header->info.nr_badpages; i++) {
				1470	int page = swap_header->info.badpages[i];
				1471	if (page <= 0 \|\| page >= swap_header->info.last_page)
				1472	error = -EINVAL;
				1473	else
				1474	p->swap_map[page] = SWAP_MAP_BAD;
				1475	}
				1476	nr_good_pages = swap_header->info.last_page -
				1477	swap_header->info.nr_badpages -
				1478	1 /* header page */;
				1479	if (error)
				1480	goto bad_swap;
				1481	}
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1482
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1483	if (swapfilesize && maxpages > swapfilesize) {
				1484	printk(KERN_WARNING
				1485	"Swap area shorter than signature indicates\n");
				1486	error = -EINVAL;
				1487	goto bad_swap;
				1488	}
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1489	if (nr_good_pages) {
				1490	p->swap_map[0] = SWAP_MAP_BAD;
				1491	p->max = maxpages;
				1492	p->pages = nr_good_pages;
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1493	nr_extents = setup_swap_extents(p, &span);
				1494	if (nr_extents < 0) {
				1495	error = nr_extents;
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1496	goto bad_swap;
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1497	}
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1498	nr_good_pages = p->pages;
				1499	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1500	if (!nr_good_pages) {
				1501	printk(KERN_WARNING "Empty swap-file\n");
				1502	error = -EINVAL;
				1503	goto bad_swap;
				1504	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1505
				1506	down(&swapon_sem);
				1507	swap_list_lock();
				1508	swap_device_lock(p);
				1509	p->flags = SWP_ACTIVE;
				1510	nr_swap_pages += nr_good_pages;
				1511	total_swap_pages += nr_good_pages;
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1512
Hugh Dickins	6eb396d	2005-09-03 15:54:35 -0700	[diff] [blame]	1513	printk(KERN_INFO "Adding %uk swap on %s. "
Hugh Dickins	53092a7	2005-09-03 15:54:34 -0700	[diff] [blame]	1514	"Priority:%d extents:%d across:%lluk\n",
				1515	nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
				1516	nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1517
				1518	/* insert swap space into swap_list: */
				1519	prev = -1;
				1520	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
				1521	if (p->prio >= swap_info[i].prio) {
				1522	break;
				1523	}
				1524	prev = i;
				1525	}
				1526	p->next = i;
				1527	if (prev < 0) {
				1528	swap_list.head = swap_list.next = p - swap_info;
				1529	} else {
				1530	swap_info[prev].next = p - swap_info;
				1531	}
				1532	swap_device_unlock(p);
				1533	swap_list_unlock();
				1534	up(&swapon_sem);
				1535	error = 0;
				1536	goto out;
				1537	bad_swap:
				1538	if (bdev) {
				1539	set_blocksize(bdev, p->old_block_size);
				1540	bd_release(bdev);
				1541	}
Hugh Dickins	4cd3bb1	2005-09-03 15:54:33 -0700	[diff] [blame]	1542	destroy_swap_extents(p);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1543	bad_swap_2:
				1544	swap_list_lock();
				1545	swap_map = p->swap_map;
				1546	p->swap_file = NULL;
				1547	p->swap_map = NULL;
				1548	p->flags = 0;
				1549	if (!(swap_flags & SWAP_FLAG_PREFER))
				1550	++least_priority;
				1551	swap_list_unlock();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1552	vfree(swap_map);
				1553	if (swap_file)
				1554	filp_close(swap_file, NULL);
				1555	out:
				1556	if (page && !IS_ERR(page)) {
				1557	kunmap(page);
				1558	page_cache_release(page);
				1559	}
				1560	if (name)
				1561	putname(name);
				1562	if (did_down) {
				1563	if (!error)
				1564	inode->i_flags \|= S_SWAPFILE;
				1565	up(&inode->i_sem);
				1566	}
				1567	return error;
				1568	}
				1569
				1570	void si_swapinfo(struct sysinfo *val)
				1571	{
				1572	unsigned int i;
				1573	unsigned long nr_to_be_unused = 0;
				1574
				1575	swap_list_lock();
				1576	for (i = 0; i < nr_swapfiles; i++) {
				1577	if (!(swap_info[i].flags & SWP_USED) \|\|
				1578	(swap_info[i].flags & SWP_WRITEOK))
				1579	continue;
				1580	nr_to_be_unused += swap_info[i].inuse_pages;
				1581	}
				1582	val->freeswap = nr_swap_pages + nr_to_be_unused;
				1583	val->totalswap = total_swap_pages + nr_to_be_unused;
				1584	swap_list_unlock();
				1585	}
				1586
				1587	/*
				1588	* Verify that a swap entry is valid and increment its swap map count.
				1589	*
				1590	* Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
				1591	* "permanent", but will be reclaimed by the next swapoff.
				1592	*/
				1593	int swap_duplicate(swp_entry_t entry)
				1594	{
				1595	struct swap_info_struct * p;
				1596	unsigned long offset, type;
				1597	int result = 0;
				1598
				1599	type = swp_type(entry);
				1600	if (type >= nr_swapfiles)
				1601	goto bad_file;
				1602	p = type + swap_info;
				1603	offset = swp_offset(entry);
				1604
				1605	swap_device_lock(p);
				1606	if (offset < p->max && p->swap_map[offset]) {
				1607	if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
				1608	p->swap_map[offset]++;
				1609	result = 1;
				1610	} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
				1611	if (swap_overflow++ < 5)
				1612	printk(KERN_WARNING "swap_dup: swap entry overflow\n");
				1613	p->swap_map[offset] = SWAP_MAP_MAX;
				1614	result = 1;
				1615	}
				1616	}
				1617	swap_device_unlock(p);
				1618	out:
				1619	return result;
				1620
				1621	bad_file:
				1622	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
				1623	goto out;
				1624	}
				1625
				1626	struct swap_info_struct *
				1627	get_swap_info_struct(unsigned type)
				1628	{
				1629	return &swap_info[type];
				1630	}
				1631
				1632	/*
				1633	* swap_device_lock prevents swap_map being freed. Don't grab an extra
				1634	* reference on the swaphandle, it doesn't matter if it becomes unused.
				1635	*/
				1636	int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
				1637	{
				1638	int ret = 0, i = 1 << page_cluster;
				1639	unsigned long toff;
				1640	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
				1641
				1642	if (!page_cluster) /* no readahead */
				1643	return 0;
				1644	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
				1645	if (!toff) /* first page is swap header */
				1646	toff++, i--;
				1647	*offset = toff;
				1648
				1649	swap_device_lock(swapdev);
				1650	do {
				1651	/* Don't read-ahead past the end of the swap area */
				1652	if (toff >= swapdev->max)
				1653	break;
				1654	/* Don't read in free or bad pages */
				1655	if (!swapdev->swap_map[toff])
				1656	break;
				1657	if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
				1658	break;
				1659	toff++;
				1660	ret++;
				1661	} while (--i);
				1662	swap_device_unlock(swapdev);
				1663	return ret;
				1664	}