[PATCH] swap: get_swap_page drop swap_list_lock

Rewrite get_swap_page to allocate in just the same sequence as before, but
without holding swap_list_lock across its scan_swap_map.  Decrement
nr_swap_pages and update swap_list.next in advance, while still holding
swap_list_lock.  Skip full devices by testing highest_bit.  Swapoff hold
swap_device_lock as well as swap_list_lock to clear SWP_WRITEOK.  Reduces lock
contention when there are parallel swap devices of the same priority.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62e0da8..e54d60a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,6 @@
 		}
 		si->swap_map[offset] = 1;
 		si->inuse_pages++;
-		nr_swap_pages--;
 		si->cluster_next = offset+1;
 		return offset;
 	}
@@ -150,50 +149,45 @@
 
 swp_entry_t get_swap_page(void)
 {
-	struct swap_info_struct * p;
-	unsigned long offset;
-	swp_entry_t entry;
-	int type, wrapped = 0;
+	struct swap_info_struct *si;
+	pgoff_t offset;
+	int type, next;
+	int wrapped = 0;
 
-	entry.val = 0;	/* Out of memory */
 	swap_list_lock();
-	type = swap_list.next;
-	if (type < 0)
-		goto out;
 	if (nr_swap_pages <= 0)
-		goto out;
+		goto noswap;
+	nr_swap_pages--;
 
-	while (1) {
-		p = &swap_info[type];
-		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
-			swap_device_lock(p);
-			offset = scan_swap_map(p);
-			swap_device_unlock(p);
-			if (offset) {
-				entry = swp_entry(type,offset);
-				type = swap_info[type].next;
-				if (type < 0 ||
-					p->prio != swap_info[type].prio) {
-						swap_list.next = swap_list.head;
-				} else {
-					swap_list.next = type;
-				}
-				goto out;
-			}
+	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+		si = swap_info + type;
+		next = si->next;
+		if (next < 0 ||
+		    (!wrapped && si->prio != swap_info[next].prio)) {
+			next = swap_list.head;
+			wrapped++;
 		}
-		type = p->next;
-		if (!wrapped) {
-			if (type < 0 || p->prio != swap_info[type].prio) {
-				type = swap_list.head;
-				wrapped = 1;
-			}
-		} else
-			if (type < 0)
-				goto out;	/* out of swap space */
+
+		if (!si->highest_bit)
+			continue;
+		if (!(si->flags & SWP_WRITEOK))
+			continue;
+
+		swap_list.next = next;
+		swap_device_lock(si);
+		swap_list_unlock();
+		offset = scan_swap_map(si);
+		swap_device_unlock(si);
+		if (offset)
+			return swp_entry(type, offset);
+		swap_list_lock();
+		next = swap_list.next;
 	}
-out:
+
+	nr_swap_pages++;
+noswap:
 	swap_list_unlock();
-	return entry;
+	return (swp_entry_t) {0};
 }
 
 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
@@ -1105,8 +1099,11 @@
 	}
 	nr_swap_pages -= p->pages;
 	total_swap_pages -= p->pages;
+	swap_device_lock(p);
 	p->flags &= ~SWP_WRITEOK;
+	swap_device_unlock(p);
 	swap_list_unlock();
+
 	current->flags |= PF_SWAPOFF;
 	err = try_to_unuse(type);
 	current->flags &= ~PF_SWAPOFF;