Blame - mm/slub.c - kernel/msm-4.9

blob: 347e44821bcb8b7e5b10a77a1c79c7debed1caa9 [file] [log] [blame]

Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1	/*
				2	* SLUB: A slab allocator that limits cache line use instead of queuing
				3	* objects in per cpu and per node lists.
				4	*
				5	* The allocator synchronizes using per slab locks and only
				6	* uses a centralized lock to manage a pool of partial slabs.
				7	*
				8	* (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
				9	*/
				10
				11	#include <linux/mm.h>
				12	#include <linux/module.h>
				13	#include <linux/bit_spinlock.h>
				14	#include <linux/interrupt.h>
				15	#include <linux/bitops.h>
				16	#include <linux/slab.h>
				17	#include <linux/seq_file.h>
				18	#include <linux/cpu.h>
				19	#include <linux/cpuset.h>
				20	#include <linux/mempolicy.h>
				21	#include <linux/ctype.h>
				22	#include <linux/kallsyms.h>
				23
				24	/*
				25	* Lock order:
				26	* 1. slab_lock(page)
				27	* 2. slab->list_lock
				28	*
				29	* The slab_lock protects operations on the object of a particular
				30	* slab and its metadata in the page struct. If the slab lock
				31	* has been taken then no allocations nor frees can be performed
				32	* on the objects in the slab nor can the slab be added or removed
				33	* from the partial or full lists since this would mean modifying
				34	* the page_struct of the slab.
				35	*
				36	* The list_lock protects the partial and full list on each node and
				37	* the partial slab counter. If taken then no new slabs may be added or
				38	* removed from the lists nor make the number of partial slabs be modified.
				39	* (Note that the total number of slabs is an atomic value that may be
				40	* modified without taking the list lock).
				41	*
				42	* The list_lock is a centralized lock and thus we avoid taking it as
				43	* much as possible. As long as SLUB does not have to handle partial
				44	* slabs, operations can continue without any centralized lock. F.e.
				45	* allocating a long series of objects that fill up slabs does not require
				46	* the list lock.
				47	*
				48	* The lock order is sometimes inverted when we are trying to get a slab
				49	* off a list. We take the list_lock and then look for a page on the list
				50	* to use. While we do that objects in the slabs may be freed. We can
				51	* only operate on the slab if we have also taken the slab_lock. So we use
				52	* a slab_trylock() on the slab. If trylock was successful then no frees
				53	* can occur anymore and we can use the slab for allocations etc. If the
				54	* slab_trylock() does not succeed then frees are in progress in the slab and
				55	* we must stay away from it for a while since we may cause a bouncing
				56	* cacheline if we try to acquire the lock. So go onto the next slab.
				57	* If all pages are busy then we may allocate a new slab instead of reusing
				58	* a partial slab. A new slab has no one operating on it and thus there is
				59	* no danger of cacheline contention.
				60	*
				61	* Interrupts are disabled during allocation and deallocation in order to
				62	* make the slab allocator safe to use in the context of an irq. In addition
				63	* interrupts are disabled to ensure that the processor does not change
				64	* while handling per_cpu slabs, due to kernel preemption.
				65	*
				66	* SLUB assigns one slab for allocation to each processor.
				67	* Allocations only occur from these slabs called cpu slabs.
				68	*
				69	* Slabs with free elements are kept on a partial list.
				70	* There is no list for full slabs. If an object in a full slab is
				71	* freed then the slab will show up again on the partial lists.
				72	* Otherwise there is no need to track full slabs unless we have to
				73	* track full slabs for debugging purposes.
				74	*
				75	* Slabs are freed when they become empty. Teardown and setup is
				76	* minimal so we rely on the page allocators per cpu caches for
				77	* fast frees and allocs.
				78	*
				79	* Overloading of page flags that are otherwise used for LRU management.
				80	*
				81	* PageActive The slab is used as a cpu cache. Allocations
				82	* may be performed from the slab. The slab is not
				83	* on any slab list and cannot be moved onto one.
				84	*
				85	* PageError Slab requires special handling due to debug
				86	* options set. This moves slab handling out of
				87	* the fast path.
				88	*/
				89
				90	/*
				91	* Issues still to be resolved:
				92	*
				93	* - The per cpu array is updated for each new slab and is a remote
				94	* cacheline for most nodes. This could become a bouncing cacheline given
				95	* enough frequent updates. There are 16 pointers in a cacheline, so at
				96	* max 16 cpus could compete. Likely okay.
				97	*
				98	* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
				99	*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	100	* - Variable sizing of the per node arrays
				101	*/
				102
				103	/* Enable to test recovery from slab corruption on boot */
				104	#undef SLUB_RESILIENCY_TEST
				105
				106	#if PAGE_SHIFT <= 12
				107
				108	/*
				109	* Small page size. Make sure that we do not fragment memory
				110	*/
				111	#define DEFAULT_MAX_ORDER 1
				112	#define DEFAULT_MIN_OBJECTS 4
				113
				114	#else
				115
				116	/*
				117	* Large page machines are customarily able to handle larger
				118	* page orders.
				119	*/
				120	#define DEFAULT_MAX_ORDER 2
				121	#define DEFAULT_MIN_OBJECTS 8
				122
				123	#endif
				124
				125	/*
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	126	* Mininum number of partial slabs. These will be left on the partial
				127	* lists even if they are empty. kmem_cache_shrink may reclaim them.
				128	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	129	#define MIN_PARTIAL 2
				130
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	131	/*
				132	* Maximum number of desirable partial slabs.
				133	* The existence of more partial slabs makes kmem_cache_shrink
				134	* sort the partial list by the number of objects in the.
				135	*/
				136	#define MAX_PARTIAL 10
				137
/*
 * Default combination of debugging flags
 */
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU)

/*
 * Flag mask named for merge compatibility.
 * NOTE(review): presumably caches must agree on these bits to be merged;
 * the user of this mask is outside this section - confirm.
 */
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA)
				148
				149	#ifndef ARCH_KMALLOC_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	150	#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	151	#endif
				152
				153	#ifndef ARCH_SLAB_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	154	#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	155	#endif
				156
				157	/* Internal SLUB flags */
				158	#define __OBJECT_POISON 0x80000000 /* Poison object */
				159
				160	static int kmem_size = sizeof(struct kmem_cache);
				161
				162	#ifdef CONFIG_SMP
				163	static struct notifier_block slab_notifier;
				164	#endif
				165
				166	static enum {
				167	DOWN, /* No slab functionality available */
				168	PARTIAL, /* kmem_cache_open() works but kmalloc does not */
				169	UP, /* Everything works */
				170	SYSFS /* Sysfs up */
				171	} slab_state = DOWN;
				172
				173	/* A list of all slab caches on the system */
				174	static DECLARE_RWSEM(slub_lock);
				175	LIST_HEAD(slab_caches);
				176
				177	#ifdef CONFIG_SYSFS
				178	static int sysfs_slab_add(struct kmem_cache *);
				179	static int sysfs_slab_alias(struct kmem_cache , const char );
				180	static void sysfs_slab_remove(struct kmem_cache *);
				181	#else
				182	static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
				183	static int sysfs_slab_alias(struct kmem_cache s, const char p) { return 0; }
				184	static void sysfs_slab_remove(struct kmem_cache *s) {}
				185	#endif
				186
				187	/********************************************************************
				188	* Core slab cache functions
				189	*******************************************************************/
				190
				191	int slab_is_available(void)
				192	{
				193	return slab_state >= UP;
				194	}
				195
				196	static inline struct kmem_cache_node get_node(struct kmem_cache s, int node)
				197	{
				198	#ifdef CONFIG_NUMA
				199	return s->node[node];
				200	#else
				201	return &s->local_node;
				202	#endif
				203	}
				204
				205	/*
				206	* Object debugging
				207	*/
				208	static void print_section(char text, u8 addr, unsigned int length)
				209	{
				210	int i, offset;
				211	int newline = 1;
				212	char ascii[17];
				213
				214	ascii[16] = 0;
				215
				216	for (i = 0; i < length; i++) {
				217	if (newline) {
				218	printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
				219	newline = 0;
				220	}
				221	printk(" %02x", addr[i]);
				222	offset = i % 16;
				223	ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
				224	if (offset == 15) {
				225	printk(" %s\n",ascii);
				226	newline = 1;
				227	}
				228	}
				229	if (!newline) {
				230	i %= 16;
				231	while (i < 16) {
				232	printk(" ");
				233	ascii[i] = ' ';
				234	i++;
				235	}
				236	printk(" %s\n", ascii);
				237	}
				238	}
				239
				240	/*
				241	* Slow version of get and set free pointer.
				242	*
				243	* This requires touching the cache lines of kmem_cache.
				244	* The offset can also be obtained from the page. In that
				245	* case it is in the cacheline that we already need to touch.
				246	*/
				247	static void get_freepointer(struct kmem_cache s, void *object)
				248	{
				249	return (void *)(object + s->offset);
				250	}
				251
				252	static void set_freepointer(struct kmem_cache s, void object, void *fp)
				253	{
				254	(void *)(object + s->offset) = fp;
				255	}
				256
				257	/*
				258	* Tracking user of a slab.
				259	*/
				260	struct track {
				261	void addr; / Called from address */
				262	int cpu; /* Was running on cpu */
				263	int pid; /* Pid context */
				264	unsigned long when; /* When did the operation occur */
				265	};
				266
				267	enum track_item { TRACK_ALLOC, TRACK_FREE };
				268
				269	static struct track get_track(struct kmem_cache s, void *object,
				270	enum track_item alloc)
				271	{
				272	struct track *p;
				273
				274	if (s->offset)
				275	p = object + s->offset + sizeof(void *);
				276	else
				277	p = object + s->inuse;
				278
				279	return p + alloc;
				280	}
				281
				282	static void set_track(struct kmem_cache s, void object,
				283	enum track_item alloc, void *addr)
				284	{
				285	struct track *p;
				286
				287	if (s->offset)
				288	p = object + s->offset + sizeof(void *);
				289	else
				290	p = object + s->inuse;
				291
				292	p += alloc;
				293	if (addr) {
				294	p->addr = addr;
				295	p->cpu = smp_processor_id();
				296	p->pid = current ? current->pid : -1;
				297	p->when = jiffies;
				298	} else
				299	memset(p, 0, sizeof(struct track));
				300	}
				301
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	302	static void init_tracking(struct kmem_cache s, void object)
				303	{
				304	if (s->flags & SLAB_STORE_USER) {
				305	set_track(s, object, TRACK_FREE, NULL);
				306	set_track(s, object, TRACK_ALLOC, NULL);
				307	}
				308	}
				309
				310	static void print_track(const char s, struct track t)
				311	{
				312	if (!t->addr)
				313	return;
				314
				315	printk(KERN_ERR "%s: ", s);
				316	__print_symbol("%s", (unsigned long)t->addr);
				317	printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
				318	}
				319
				320	static void print_trailer(struct kmem_cache s, u8 p)
				321	{
				322	unsigned int off; /* Offset of last byte */
				323
				324	if (s->flags & SLAB_RED_ZONE)
				325	print_section("Redzone", p + s->objsize,
				326	s->inuse - s->objsize);
				327
				328	printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
				329	p + s->offset,
				330	get_freepointer(s, p));
				331
				332	if (s->offset)
				333	off = s->offset + sizeof(void *);
				334	else
				335	off = s->inuse;
				336
				337	if (s->flags & SLAB_STORE_USER) {
				338	print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
				339	print_track("Last free ", get_track(s, p, TRACK_FREE));
				340	off += 2 * sizeof(struct track);
				341	}
				342
				343	if (off != s->size)
				344	/* Beginning of the filler is the free pointer */
				345	print_section("Filler", p + off, s->size - off);
				346	}
				347
				348	static void object_err(struct kmem_cache s, struct page page,
				349	u8 object, char reason)
				350	{
				351	u8 *addr = page_address(page);
				352
				353	printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
				354	s->name, reason, object, page);
				355	printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
				356	object - addr, page->flags, page->inuse, page->freelist);
				357	if (object > addr + 16)
				358	print_section("Bytes b4", object - 16, 16);
				359	print_section("Object", object, min(s->objsize, 128));
				360	print_trailer(s, object);
				361	dump_stack();
				362	}
				363
				364	static void slab_err(struct kmem_cache s, struct page page, char *reason, ...)
				365	{
				366	va_list args;
				367	char buf[100];
				368
				369	va_start(args, reason);
				370	vsnprintf(buf, sizeof(buf), reason, args);
				371	va_end(args);
				372	printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
				373	page);
				374	dump_stack();
				375	}
				376
				377	static void init_object(struct kmem_cache s, void object, int active)
				378	{
				379	u8 *p = object;
				380
				381	if (s->flags & __OBJECT_POISON) {
				382	memset(p, POISON_FREE, s->objsize - 1);
				383	p[s->objsize -1] = POISON_END;
				384	}
				385
				386	if (s->flags & SLAB_RED_ZONE)
				387	memset(p + s->objsize,
				388	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
				389	s->inuse - s->objsize);
				390	}
				391
				392	static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
				393	{
				394	while (bytes) {
				395	if (*start != (u8)value)
				396	return 0;
				397	start++;
				398	bytes--;
				399	}
				400	return 1;
				401	}
				402
				403
				404	static int check_valid_pointer(struct kmem_cache s, struct page page,
				405	void *object)
				406	{
				407	void *base;
				408
				409	if (!object)
				410	return 1;
				411
				412	base = page_address(page);
				413	if (object < base \|\| object >= base + s->objects * s->size \|\|
				414	(object - base) % s->size) {
				415	return 0;
				416	}
				417
				418	return 1;
				419	}
				420
				421	/*
				422	* Object layout:
				423	*
				424	* object address
				425	* Bytes of the object to be managed.
				426	* If the freepointer may overlay the object then the free
				427	* pointer is the first word of the object.
				428	* Poisoning uses 0x6b (POISON_FREE) and the last byte is
				429	* 0xa5 (POISON_END)
				430	*
				431	* object + s->objsize
				432	* Padding to reach word boundary. This is also used for Redzoning.
				433	* Padding is extended to word size if Redzoning is enabled
				434	* and objsize == inuse.
				435	* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
				436	* 0xcc (RED_ACTIVE) for objects in use.
				437	*
				438	* object + s->inuse
				439	* A. Free pointer (if we cannot overwrite object on free)
				440	* B. Tracking data for SLAB_STORE_USER
				441	* C. Padding to reach required alignment boundary
				442	* Padding is done using 0x5a (POISON_INUSE)
				443	*
				444	* object + s->size
				445	*
				446	* If slabcaches are merged then the objsize and inuse boundaries are to
				447	* be ignored. And therefore no slab options that rely on these boundaries
				448	* may be used with merged slabcaches.
				449	*/
				450
				451	static void restore_bytes(struct kmem_cache s, char message, u8 data,
				452	void from, void to)
				453	{
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	454	printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	455	s->name, message, data, from, to - 1);
				456	memset(from, data, to - from);
				457	}
				458
				459	static int check_pad_bytes(struct kmem_cache s, struct page page, u8 *p)
				460	{
				461	unsigned long off = s->inuse; /* The end of info */
				462
				463	if (s->offset)
				464	/* Freepointer is placed after the object. */
				465	off += sizeof(void *);
				466
				467	if (s->flags & SLAB_STORE_USER)
				468	/* We also have user information there */
				469	off += 2 * sizeof(struct track);
				470
				471	if (s->size == off)
				472	return 1;
				473
				474	if (check_bytes(p + off, POISON_INUSE, s->size - off))
				475	return 1;
				476
				477	object_err(s, page, p, "Object padding check fails");
				478
				479	/*
				480	* Restore padding
				481	*/
				482	restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
				483	return 0;
				484	}
				485
				486	static int slab_pad_check(struct kmem_cache s, struct page page)
				487	{
				488	u8 *p;
				489	int length, remainder;
				490
				491	if (!(s->flags & SLAB_POISON))
				492	return 1;
				493
				494	p = page_address(page);
				495	length = s->objects * s->size;
				496	remainder = (PAGE_SIZE << s->order) - length;
				497	if (!remainder)
				498	return 1;
				499
				500	if (!check_bytes(p + length, POISON_INUSE, remainder)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	501	slab_err(s, page, "Padding check failed");
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	502	restore_bytes(s, "slab padding", POISON_INUSE, p + length,
				503	p + length + remainder);
				504	return 0;
				505	}
				506	return 1;
				507	}
				508
				509	static int check_object(struct kmem_cache s, struct page page,
				510	void *object, int active)
				511	{
				512	u8 *p = object;
				513	u8 *endobject = object + s->objsize;
				514
				515	if (s->flags & SLAB_RED_ZONE) {
				516	unsigned int red =
				517	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
				518
				519	if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
				520	object_err(s, page, object,
				521	active ? "Redzone Active" : "Redzone Inactive");
				522	restore_bytes(s, "redzone", red,
				523	endobject, object + s->inuse);
				524	return 0;
				525	}
				526	} else {
				527	if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
				528	!check_bytes(endobject, POISON_INUSE,
				529	s->inuse - s->objsize)) {
				530	object_err(s, page, p, "Alignment padding check fails");
				531	/*
				532	* Fix it so that there will not be another report.
				533	*
				534	* Hmmm... We may be corrupting an object that now expects
				535	* to be longer than allowed.
				536	*/
				537	restore_bytes(s, "alignment padding", POISON_INUSE,
				538	endobject, object + s->inuse);
				539	}
				540	}
				541
				542	if (s->flags & SLAB_POISON) {
				543	if (!active && (s->flags & __OBJECT_POISON) &&
				544	(!check_bytes(p, POISON_FREE, s->objsize - 1) \|\|
				545	p[s->objsize - 1] != POISON_END)) {
				546
				547	object_err(s, page, p, "Poison check failed");
				548	restore_bytes(s, "Poison", POISON_FREE,
				549	p, p + s->objsize -1);
				550	restore_bytes(s, "Poison", POISON_END,
				551	p + s->objsize - 1, p + s->objsize);
				552	return 0;
				553	}
				554	/*
				555	* check_pad_bytes cleans up on its own.
				556	*/
				557	check_pad_bytes(s, page, p);
				558	}
				559
				560	if (!s->offset && active)
				561	/*
				562	* Object and freepointer overlap. Cannot check
				563	* freepointer while object is allocated.
				564	*/
				565	return 1;
				566
				567	/* Check free pointer validity */
				568	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
				569	object_err(s, page, p, "Freepointer corrupt");
				570	/*
				571	* No choice but to zap it and thus loose the remainder
				572	* of the free objects in this slab. May cause
				573	* another error because the object count maybe
				574	* wrong now.
				575	*/
				576	set_freepointer(s, p, NULL);
				577	return 0;
				578	}
				579	return 1;
				580	}
				581
				582	static int check_slab(struct kmem_cache s, struct page page)
				583	{
				584	VM_BUG_ON(!irqs_disabled());
				585
				586	if (!PageSlab(page)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	587	slab_err(s, page, "Not a valid slab page flags=%lx "
				588	"mapping=0x%p count=%d", page->flags, page->mapping,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	589	page_count(page));
				590	return 0;
				591	}
				592	if (page->offset * sizeof(void *) != s->offset) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	593	slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
				594	"mapping=0x%p count=%d",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	595	(unsigned long)(page->offset * sizeof(void *)),
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	596	page->flags,
				597	page->mapping,
				598	page_count(page));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	599	return 0;
				600	}
				601	if (page->inuse > s->objects) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	602	slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
				603	"mapping=0x%p count=%d",
				604	s->name, page->inuse, s->objects, page->flags,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	605	page->mapping, page_count(page));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	606	return 0;
				607	}
				608	/* Slab_pad_check fixes things up after itself */
				609	slab_pad_check(s, page);
				610	return 1;
				611	}
				612
				613	/*
				614	* Determine if a certain object on a page is on the freelist and
				615	* therefore free. Must hold the slab lock for cpu slabs to
				616	* guarantee that the chains are consistent.
				617	*/
				618	static int on_freelist(struct kmem_cache s, struct page page, void *search)
				619	{
				620	int nr = 0;
				621	void *fp = page->freelist;
				622	void *object = NULL;
				623
				624	while (fp && nr <= s->objects) {
				625	if (fp == search)
				626	return 1;
				627	if (!check_valid_pointer(s, page, fp)) {
				628	if (object) {
				629	object_err(s, page, object,
				630	"Freechain corrupt");
				631	set_freepointer(s, object, NULL);
				632	break;
				633	} else {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	634	slab_err(s, page, "Freepointer 0x%p corrupt",
				635	fp);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	636	page->freelist = NULL;
				637	page->inuse = s->objects;
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	638	printk(KERN_ERR "@@@ SLUB %s: Freelist "
				639	"cleared. Slab 0x%p\n",
				640	s->name, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	641	return 0;
				642	}
				643	break;
				644	}
				645	object = fp;
				646	fp = get_freepointer(s, object);
				647	nr++;
				648	}
				649
				650	if (page->inuse != s->objects - nr) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	651	slab_err(s, page, "Wrong object count. Counter is %d but "
				652	"counted were %d", s, page, page->inuse,
				653	s->objects - nr);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	654	page->inuse = s->objects - nr;
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	655	printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
				656	"Slab @0x%p\n", s->name, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	657	}
				658	return search == NULL;
				659	}
				660
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	661	/*
				662	* Tracking of fully allocated slabs for debugging
				663	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	664	static void add_full(struct kmem_cache_node n, struct page page)
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	665	{
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	666	spin_lock(&n->list_lock);
				667	list_add(&page->lru, &n->full);
				668	spin_unlock(&n->list_lock);
				669	}
				670
				671	static void remove_full(struct kmem_cache s, struct page page)
				672	{
				673	struct kmem_cache_node *n;
				674
				675	if (!(s->flags & SLAB_STORE_USER))
				676	return;
				677
				678	n = get_node(s, page_to_nid(page));
				679
				680	spin_lock(&n->list_lock);
				681	list_del(&page->lru);
				682	spin_unlock(&n->list_lock);
				683	}
				684
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	685	static int alloc_object_checks(struct kmem_cache s, struct page page,
				686	void *object)
				687	{
				688	if (!check_slab(s, page))
				689	goto bad;
				690
				691	if (object && !on_freelist(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	692	slab_err(s, page, "Object 0x%p already allocated", object);
				693	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	694	}
				695
				696	if (!check_valid_pointer(s, page, object)) {
				697	object_err(s, page, object, "Freelist Pointer check fails");
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	698	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	699	}
				700
				701	if (!object)
				702	return 1;
				703
				704	if (!check_object(s, page, object, 0))
				705	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	706
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	707	return 1;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	708	bad:
				709	if (PageSlab(page)) {
				710	/*
				711	* If this is a slab page then lets do the best we can
				712	* to avoid issues in the future. Marking all objects
				713	* as used avoids touching the remainder.
				714	*/
				715	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
				716	s->name, page);
				717	page->inuse = s->objects;
				718	page->freelist = NULL;
				719	/* Fix up fields that may be corrupted */
				720	page->offset = s->offset / sizeof(void *);
				721	}
				722	return 0;
				723	}
				724
				725	static int free_object_checks(struct kmem_cache s, struct page page,
				726	void *object)
				727	{
				728	if (!check_slab(s, page))
				729	goto fail;
				730
				731	if (!check_valid_pointer(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	732	slab_err(s, page, "Invalid object pointer 0x%p", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	733	goto fail;
				734	}
				735
				736	if (on_freelist(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	737	slab_err(s, page, "Object 0x%p already free", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	738	goto fail;
				739	}
				740
				741	if (!check_object(s, page, object, 1))
				742	return 0;
				743
				744	if (unlikely(s != page->slab)) {
				745	if (!PageSlab(page))
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	746	slab_err(s, page, "Attempt to free object(0x%p) "
				747	"outside of slab", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	748	else
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	749	if (!page->slab) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	750	printk(KERN_ERR
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	751	"SLUB <none>: no slab for object 0x%p.\n",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	752	object);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	753	dump_stack();
				754	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	755	else
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	756	slab_err(s, page, "object at 0x%p belongs "
				757	"to slab %s", object, page->slab->name);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	758	goto fail;
				759	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	760	return 1;
				761	fail:
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	762	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
				763	s->name, page, object);
				764	return 0;
				765	}
				766
				767	/*
				768	* Slab allocation and freeing
				769	*/
				770	static struct page allocate_slab(struct kmem_cache s, gfp_t flags, int node)
				771	{
				772	struct page * page;
				773	int pages = 1 << s->order;
				774
				775	if (s->order)
				776	flags \|= __GFP_COMP;
				777
				778	if (s->flags & SLAB_CACHE_DMA)
				779	flags \|= SLUB_DMA;
				780
				781	if (node == -1)
				782	page = alloc_pages(flags, s->order);
				783	else
				784	page = alloc_pages_node(node, flags, s->order);
				785
				786	if (!page)
				787	return NULL;
				788
				789	mod_zone_page_state(page_zone(page),
				790	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				791	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				792	pages);
				793
				794	return page;
				795	}
				796
				797	static void setup_object(struct kmem_cache s, struct page page,
				798	void *object)
				799	{
				800	if (PageError(page)) {
				801	init_object(s, object, 0);
				802	init_tracking(s, object);
				803	}
				804
Christoph Lameter	4f10493	2007-05-06 14:50:17 -0700	[diff] [blame^]	805	if (unlikely(s->ctor))
				806	s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	807	}
				808
				809	static struct page new_slab(struct kmem_cache s, gfp_t flags, int node)
				810	{
				811	struct page *page;
				812	struct kmem_cache_node *n;
				813	void *start;
				814	void *end;
				815	void *last;
				816	void *p;
				817
				818	if (flags & __GFP_NO_GROW)
				819	return NULL;
				820
				821	BUG_ON(flags & ~(GFP_DMA \| GFP_LEVEL_MASK));
				822
				823	if (flags & __GFP_WAIT)
				824	local_irq_enable();
				825
				826	page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
				827	if (!page)
				828	goto out;
				829
				830	n = get_node(s, page_to_nid(page));
				831	if (n)
				832	atomic_long_inc(&n->nr_slabs);
				833	page->offset = s->offset / sizeof(void *);
				834	page->slab = s;
				835	page->flags \|= 1 << PG_slab;
				836	if (s->flags & (SLAB_DEBUG_FREE \| SLAB_RED_ZONE \| SLAB_POISON \|
				837	SLAB_STORE_USER \| SLAB_TRACE))
				838	page->flags \|= 1 << PG_error;
				839
				840	start = page_address(page);
				841	end = start + s->objects * s->size;
				842
				843	if (unlikely(s->flags & SLAB_POISON))
				844	memset(start, POISON_INUSE, PAGE_SIZE << s->order);
				845
				846	last = start;
				847	for (p = start + s->size; p < end; p += s->size) {
				848	setup_object(s, page, last);
				849	set_freepointer(s, last, p);
				850	last = p;
				851	}
				852	setup_object(s, page, last);
				853	set_freepointer(s, last, NULL);
				854
				855	page->freelist = start;
				856	page->inuse = 0;
				857	out:
				858	if (flags & __GFP_WAIT)
				859	local_irq_disable();
				860	return page;
				861	}
				862
				863	static void __free_slab(struct kmem_cache s, struct page page)
				864	{
				865	int pages = 1 << s->order;
				866
				867	if (unlikely(PageError(page) \|\| s->dtor)) {
				868	void *start = page_address(page);
				869	void *end = start + (pages << PAGE_SHIFT);
				870	void *p;
				871
				872	slab_pad_check(s, page);
				873	for (p = start; p <= end - s->size; p += s->size) {
				874	if (s->dtor)
				875	s->dtor(p, s, 0);
				876	check_object(s, page, p, 0);
				877	}
				878	}
				879
				880	mod_zone_page_state(page_zone(page),
				881	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				882	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				883	- pages);
				884
				885	page->mapping = NULL;
				886	__free_pages(page, s->order);
				887	}
				888
				889	static void rcu_free_slab(struct rcu_head *h)
				890	{
				891	struct page *page;
				892
				893	page = container_of((struct list_head *)h, struct page, lru);
				894	__free_slab(page->slab, page);
				895	}
				896
				897	static void free_slab(struct kmem_cache s, struct page page)
				898	{
				899	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
				900	/*
				901	* RCU free overloads the RCU head over the LRU
				902	*/
				903	struct rcu_head head = (void )&page->lru;
				904
				905	call_rcu(head, rcu_free_slab);
				906	} else
				907	__free_slab(s, page);
				908	}
				909
				910	static void discard_slab(struct kmem_cache s, struct page page)
				911	{
				912	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				913
				914	atomic_long_dec(&n->nr_slabs);
				915	reset_page_mapcount(page);
				916	page->flags &= ~(1 << PG_slab \| 1 << PG_error);
				917	free_slab(s, page);
				918	}
				919
				920	/*
				921	* Per slab locking using the pagelock
				922	*/
				923	static __always_inline void slab_lock(struct page *page)
				924	{
				925	bit_spin_lock(PG_locked, &page->flags);
				926	}
				927
				928	static __always_inline void slab_unlock(struct page *page)
				929	{
				930	bit_spin_unlock(PG_locked, &page->flags);
				931	}
				932
				933	static __always_inline int slab_trylock(struct page *page)
				934	{
				935	int rc = 1;
				936
				937	rc = bit_spin_trylock(PG_locked, &page->flags);
				938	return rc;
				939	}
				940
				941	/*
				942	* Management of partially allocated slabs
				943	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	944	static void add_partial_tail(struct kmem_cache_node n, struct page page)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	945	{
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	946	spin_lock(&n->list_lock);
				947	n->nr_partial++;
				948	list_add_tail(&page->lru, &n->partial);
				949	spin_unlock(&n->list_lock);
				950	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	951
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	952	static void add_partial(struct kmem_cache_node n, struct page page)
				953	{
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	954	spin_lock(&n->list_lock);
				955	n->nr_partial++;
				956	list_add(&page->lru, &n->partial);
				957	spin_unlock(&n->list_lock);
				958	}
				959
				960	static void remove_partial(struct kmem_cache *s,
				961	struct page *page)
				962	{
				963	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				964
				965	spin_lock(&n->list_lock);
				966	list_del(&page->lru);
				967	n->nr_partial--;
				968	spin_unlock(&n->list_lock);
				969	}
				970
				971	/*
				972	* Lock page and remove it from the partial list
				973	*
				974	* Must hold list_lock
				975	*/
				976	static int lock_and_del_slab(struct kmem_cache_node n, struct page page)
				977	{
				978	if (slab_trylock(page)) {
				979	list_del(&page->lru);
				980	n->nr_partial--;
				981	return 1;
				982	}
				983	return 0;
				984	}
				985
				986	/*
				987	* Try to get a partial slab from a specific node
				988	*/
				989	static struct page get_partial_node(struct kmem_cache_node n)
				990	{
				991	struct page *page;
				992
				993	/*
				994	* Racy check. If we mistakenly see no partial slabs then we
				995	* just allocate an empty slab. If we mistakenly try to get a
				996	* partial slab then get_partials() will return NULL.
				997	*/
				998	if (!n \|\| !n->nr_partial)
				999	return NULL;
				1000
				1001	spin_lock(&n->list_lock);
				1002	list_for_each_entry(page, &n->partial, lru)
				1003	if (lock_and_del_slab(n, page))
				1004	goto out;
				1005	page = NULL;
				1006	out:
				1007	spin_unlock(&n->list_lock);
				1008	return page;
				1009	}
				1010
				1011	/*
				1012	* Get a page from somewhere. Search in increasing NUMA
				1013	* distances.
				1014	*/
				1015	static struct page get_any_partial(struct kmem_cache s, gfp_t flags)
				1016	{
				1017	#ifdef CONFIG_NUMA
				1018	struct zonelist *zonelist;
				1019	struct zone **z;
				1020	struct page *page;
				1021
				1022	/*
				1023	* The defrag ratio allows to configure the tradeoffs between
				1024	* inter node defragmentation and node local allocations.
				1025	* A lower defrag_ratio increases the tendency to do local
				1026	* allocations instead of scanning throught the partial
				1027	* lists on other nodes.
				1028	*
				1029	* If defrag_ratio is set to 0 then kmalloc() always
				1030	* returns node local objects. If its higher then kmalloc()
				1031	* may return off node objects in order to avoid fragmentation.
				1032	*
				1033	* A higher ratio means slabs may be taken from other nodes
				1034	* thus reducing the number of partial slabs on those nodes.
				1035	*
				1036	* If /sys/slab/xx/defrag_ratio is set to 100 (which makes
				1037	* defrag_ratio = 1000) then every (well almost) allocation
				1038	* will first attempt to defrag slab caches on other nodes. This
				1039	* means scanning over all nodes to look for partial slabs which
				1040	* may be a bit expensive to do on every slab allocation.
				1041	*/
				1042	if (!s->defrag_ratio \|\| get_cycles() % 1024 > s->defrag_ratio)
				1043	return NULL;
				1044
				1045	zonelist = &NODE_DATA(slab_node(current->mempolicy))
				1046	->node_zonelists[gfp_zone(flags)];
				1047	for (z = zonelist->zones; *z; z++) {
				1048	struct kmem_cache_node *n;
				1049
				1050	n = get_node(s, zone_to_nid(*z));
				1051
				1052	if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1053	n->nr_partial > MIN_PARTIAL) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1054	page = get_partial_node(n);
				1055	if (page)
				1056	return page;
				1057	}
				1058	}
				1059	#endif
				1060	return NULL;
				1061	}
				1062
				1063	/*
				1064	* Get a partial page, lock it and return it.
				1065	*/
				1066	static struct page get_partial(struct kmem_cache s, gfp_t flags, int node)
				1067	{
				1068	struct page *page;
				1069	int searchnode = (node == -1) ? numa_node_id() : node;
				1070
				1071	page = get_partial_node(get_node(s, searchnode));
				1072	if (page \|\| (flags & __GFP_THISNODE))
				1073	return page;
				1074
				1075	return get_any_partial(s, flags);
				1076	}
				1077
				1078	/*
				1079	* Move a page back to the lists.
				1080	*
				1081	* Must be called with the slab lock held.
				1082	*
				1083	* On exit the slab lock will have been dropped.
				1084	*/
				1085	static void putback_slab(struct kmem_cache s, struct page page)
				1086	{
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1087	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				1088
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1089	if (page->inuse) {
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1090
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1091	if (page->freelist)
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1092	add_partial(n, page);
				1093	else if (PageError(page) && (s->flags & SLAB_STORE_USER))
				1094	add_full(n, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1095	slab_unlock(page);
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1096
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1097	} else {
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1098	if (n->nr_partial < MIN_PARTIAL) {
				1099	/*
				1100	* Adding an empty page to the partial slabs in order
				1101	* to avoid page allocator overhead. This page needs to
				1102	* come after all the others that are not fully empty
				1103	* in order to make sure that we do maximum
				1104	* defragmentation.
				1105	*/
				1106	add_partial_tail(n, page);
				1107	slab_unlock(page);
				1108	} else {
				1109	slab_unlock(page);
				1110	discard_slab(s, page);
				1111	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1112	}
				1113	}
				1114
				1115	/*
				1116	* Remove the cpu slab
				1117	*/
				1118	static void deactivate_slab(struct kmem_cache s, struct page page, int cpu)
				1119	{
				1120	s->cpu_slab[cpu] = NULL;
				1121	ClearPageActive(page);
				1122
				1123	putback_slab(s, page);
				1124	}
				1125
				1126	static void flush_slab(struct kmem_cache s, struct page page, int cpu)
				1127	{
				1128	slab_lock(page);
				1129	deactivate_slab(s, page, cpu);
				1130	}
				1131
				1132	/*
				1133	* Flush cpu slab.
				1134	* Called from IPI handler with interrupts disabled.
				1135	*/
				1136	static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
				1137	{
				1138	struct page *page = s->cpu_slab[cpu];
				1139
				1140	if (likely(page))
				1141	flush_slab(s, page, cpu);
				1142	}
				1143
				1144	static void flush_cpu_slab(void *d)
				1145	{
				1146	struct kmem_cache *s = d;
				1147	int cpu = smp_processor_id();
				1148
				1149	__flush_cpu_slab(s, cpu);
				1150	}
				1151
				1152	static void flush_all(struct kmem_cache *s)
				1153	{
				1154	#ifdef CONFIG_SMP
				1155	on_each_cpu(flush_cpu_slab, s, 1, 1);
				1156	#else
				1157	unsigned long flags;
				1158
				1159	local_irq_save(flags);
				1160	flush_cpu_slab(s);
				1161	local_irq_restore(flags);
				1162	#endif
				1163	}
				1164
				1165	/*
				1166	* slab_alloc is optimized to only modify two cachelines on the fast path
				1167	* (aside from the stack):
				1168	*
				1169	* 1. The page struct
				1170	* 2. The first cacheline of the object to be allocated.
				1171	*
				1172	* The only cache lines that are read (apart from code) is the
				1173	* per cpu array in the kmem_cache struct.
				1174	*
				1175	* Fastpath is not possible if we need to get a new slab or have
				1176	* debugging enabled (which means all slabs are marked with PageError)
				1177	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1178	static void slab_alloc(struct kmem_cache s,
				1179	gfp_t gfpflags, int node, void *addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1180	{
				1181	struct page *page;
				1182	void **object;
				1183	unsigned long flags;
				1184	int cpu;
				1185
				1186	local_irq_save(flags);
				1187	cpu = smp_processor_id();
				1188	page = s->cpu_slab[cpu];
				1189	if (!page)
				1190	goto new_slab;
				1191
				1192	slab_lock(page);
				1193	if (unlikely(node != -1 && page_to_nid(page) != node))
				1194	goto another_slab;
				1195	redo:
				1196	object = page->freelist;
				1197	if (unlikely(!object))
				1198	goto another_slab;
				1199	if (unlikely(PageError(page)))
				1200	goto debug;
				1201
				1202	have_object:
				1203	page->inuse++;
				1204	page->freelist = object[page->offset];
				1205	slab_unlock(page);
				1206	local_irq_restore(flags);
				1207	return object;
				1208
				1209	another_slab:
				1210	deactivate_slab(s, page, cpu);
				1211
				1212	new_slab:
				1213	page = get_partial(s, gfpflags, node);
				1214	if (likely(page)) {
				1215	have_slab:
				1216	s->cpu_slab[cpu] = page;
				1217	SetPageActive(page);
				1218	goto redo;
				1219	}
				1220
				1221	page = new_slab(s, gfpflags, node);
				1222	if (page) {
				1223	cpu = smp_processor_id();
				1224	if (s->cpu_slab[cpu]) {
				1225	/*
				1226	* Someone else populated the cpu_slab while we enabled
				1227	* interrupts, or we have got scheduled on another cpu.
				1228	* The page may not be on the requested node.
				1229	*/
				1230	if (node == -1 \|\|
				1231	page_to_nid(s->cpu_slab[cpu]) == node) {
				1232	/*
				1233	* Current cpuslab is acceptable and we
				1234	* want the current one since its cache hot
				1235	*/
				1236	discard_slab(s, page);
				1237	page = s->cpu_slab[cpu];
				1238	slab_lock(page);
				1239	goto redo;
				1240	}
				1241	/* Dump the current slab */
				1242	flush_slab(s, s->cpu_slab[cpu], cpu);
				1243	}
				1244	slab_lock(page);
				1245	goto have_slab;
				1246	}
				1247	local_irq_restore(flags);
				1248	return NULL;
				1249	debug:
				1250	if (!alloc_object_checks(s, page, object))
				1251	goto another_slab;
				1252	if (s->flags & SLAB_STORE_USER)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1253	set_track(s, object, TRACK_ALLOC, addr);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	1254	if (s->flags & SLAB_TRACE) {
				1255	printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
				1256	s->name, object, page->inuse,
				1257	page->freelist);
				1258	dump_stack();
				1259	}
				1260	init_object(s, object, 1);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1261	goto have_object;
				1262	}
				1263
				1264	void kmem_cache_alloc(struct kmem_cache s, gfp_t gfpflags)
				1265	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1266	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1267	}
				1268	EXPORT_SYMBOL(kmem_cache_alloc);
				1269
				1270	#ifdef CONFIG_NUMA
				1271	void kmem_cache_alloc_node(struct kmem_cache s, gfp_t gfpflags, int node)
				1272	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1273	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1274	}
				1275	EXPORT_SYMBOL(kmem_cache_alloc_node);
				1276	#endif
				1277
				1278	/*
				1279	* The fastpath only writes the cacheline of the page struct and the first
				1280	* cacheline of the object.
				1281	*
				1282	* No special cachelines need to be read
				1283	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1284	static void slab_free(struct kmem_cache s, struct page page,
				1285	void x, void addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1286	{
				1287	void *prior;
				1288	void *object = (void )x;
				1289	unsigned long flags;
				1290
				1291	local_irq_save(flags);
				1292	slab_lock(page);
				1293
				1294	if (unlikely(PageError(page)))
				1295	goto debug;
				1296	checks_ok:
				1297	prior = object[page->offset] = page->freelist;
				1298	page->freelist = object;
				1299	page->inuse--;
				1300
				1301	if (unlikely(PageActive(page)))
				1302	/*
				1303	* Cpu slabs are never on partial lists and are
				1304	* never freed.
				1305	*/
				1306	goto out_unlock;
				1307
				1308	if (unlikely(!page->inuse))
				1309	goto slab_empty;
				1310
				1311	/*
				1312	* Objects left in the slab. If it
				1313	* was not on the partial list before
				1314	* then add it.
				1315	*/
				1316	if (unlikely(!prior))
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1317	add_partial(get_node(s, page_to_nid(page)), page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1318
				1319	out_unlock:
				1320	slab_unlock(page);
				1321	local_irq_restore(flags);
				1322	return;
				1323
				1324	slab_empty:
				1325	if (prior)
				1326	/*
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1327	* Slab on the partial list.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1328	*/
				1329	remove_partial(s, page);
				1330
				1331	slab_unlock(page);
				1332	discard_slab(s, page);
				1333	local_irq_restore(flags);
				1334	return;
				1335
				1336	debug:
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1337	if (!free_object_checks(s, page, x))
				1338	goto out_unlock;
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1339	if (!PageActive(page) && !page->freelist)
				1340	remove_full(s, page);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1341	if (s->flags & SLAB_STORE_USER)
				1342	set_track(s, x, TRACK_FREE, addr);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	1343	if (s->flags & SLAB_TRACE) {
				1344	printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
				1345	s->name, object, page->inuse,
				1346	page->freelist);
				1347	print_section("Object", (void *)object, s->objsize);
				1348	dump_stack();
				1349	}
				1350	init_object(s, object, 0);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1351	goto checks_ok;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1352	}
				1353
				1354	void kmem_cache_free(struct kmem_cache s, void x)
				1355	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1356	struct page *page;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1357
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1358	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1359
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1360	slab_free(s, page, x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1361	}
				1362	EXPORT_SYMBOL(kmem_cache_free);
				1363
				1364	/* Figure out on which slab object the object resides */
				1365	static struct page get_object_page(const void x)
				1366	{
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1367	struct page *page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1368
				1369	if (!PageSlab(page))
				1370	return NULL;
				1371
				1372	return page;
				1373	}
				1374
				1375	/*
				1376	* kmem_cache_open produces objects aligned at "size" and the first object
				1377	* is placed at offset 0 in the slab (We have no metainformation on the
				1378	* slab, all slabs are in essence "off slab").
				1379	*
				1380	* In order to get the desired alignment one just needs to align the
				1381	* size.
				1382	*
				1383	* Notice that the allocation order determines the sizes of the per cpu
				1384	* caches. Each processor has always one slab available for allocations.
				1385	* Increasing the allocation order reduces the number of times that slabs
				1386	* must be moved on and off the partial lists and therefore may influence
				1387	* locking overhead.
				1388	*
				1389	* The offset is used to relocate the free list link in each object. It is
				1390	* therefore possible to move the free list link behind the object. This
				1391	* is necessary for RCU to work properly and also useful for debugging.
				1392	*/
				1393
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
static int slub_min_order;
static int slub_max_order = DEFAULT_MAX_ORDER;

/*
 * Minimum number of objects per slab. This is necessary in order to
 * reduce locking overhead. Similar to the queue size in SLAB.
 */
static int slub_min_objects = DEFAULT_MIN_OBJECTS;

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 */
static int slub_nomerge;

/*
 * Debug settings: flag bits that kmem_cache_open() ORs into the flags of
 * a new cache.
 */
static int slub_debug;

/*
 * If set, slub_debug is only applied to caches whose name starts with
 * this string (compared with strncmp() in kmem_cache_open()).
 */
static char *slub_debug_slabs;
				1420
				1421	/*
				1422	* Calculate the order of allocation given an slab object size.
				1423	*
				1424	* The order of allocation has significant impact on other elements
				1425	* of the system. Generally order 0 allocations should be preferred
				1426	* since they do not cause fragmentation in the page allocator. Larger
				1427	* objects may have problems with order 0 because there may be too much
				1428	* space left unused in a slab. We go to a higher order if more than 1/8th
				1429	* of the slab would be wasted.
				1430	*
				1431	* In order to reach satisfactory performance we must ensure that
				1432	* a minimum number of objects is in one slab. Otherwise we may
				1433	* generate too much activity on the partial lists. This is less a
				1434	* concern for large slabs though. slub_max_order specifies the order
				1435	* where we begin to stop considering the number of objects in a slab.
				1436	*
				1437	* Higher order allocations also allow the placement of more objects
				1438	* in a slab and thereby reduce object handling overhead. If the user
				1439	* has requested a higher mininum order then we start with that one
				1440	* instead of zero.
				1441	*/
				1442	static int calculate_order(int size)
				1443	{
				1444	int order;
				1445	int rem;
				1446
				1447	for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
				1448	order < MAX_ORDER; order++) {
				1449	unsigned long slab_size = PAGE_SIZE << order;
				1450
				1451	if (slub_max_order > order &&
				1452	slab_size < slub_min_objects * size)
				1453	continue;
				1454
				1455	if (slab_size < size)
				1456	continue;
				1457
				1458	rem = slab_size % size;
				1459
				1460	if (rem <= (PAGE_SIZE << order) / 8)
				1461	break;
				1462
				1463	}
				1464	if (order >= MAX_ORDER)
				1465	return -E2BIG;
				1466	return order;
				1467	}
				1468
				1469	/*
				1470	* Function to figure out which alignment to use from the
				1471	* various ways of specifying it.
				1472	*/
				1473	static unsigned long calculate_alignment(unsigned long flags,
				1474	unsigned long align, unsigned long size)
				1475	{
				1476	/*
				1477	* If the user wants hardware cache aligned objects then
				1478	* follow that suggestion if the object is sufficiently
				1479	* large.
				1480	*
				1481	* The hardware cache alignment cannot override the
				1482	* specified alignment though. If that is greater
				1483	* then use it.
				1484	*/
Christoph Lameter	5af6083	2007-05-06 14:49:56 -0700	[diff] [blame]	1485	if ((flags & SLAB_HWCACHE_ALIGN) &&
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1486	size > L1_CACHE_BYTES / 2)
				1487	return max_t(unsigned long, align, L1_CACHE_BYTES);
				1488
				1489	if (align < ARCH_SLAB_MINALIGN)
				1490	return ARCH_SLAB_MINALIGN;
				1491
				1492	return ALIGN(align, sizeof(void *));
				1493	}
				1494
				1495	static void init_kmem_cache_node(struct kmem_cache_node *n)
				1496	{
				1497	n->nr_partial = 0;
				1498	atomic_long_set(&n->nr_slabs, 0);
				1499	spin_lock_init(&n->list_lock);
				1500	INIT_LIST_HEAD(&n->partial);
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1501	INIT_LIST_HEAD(&n->full);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1502	}
				1503
				1504	#ifdef CONFIG_NUMA
				1505	/*
				1506	* No kmalloc_node yet so do it by hand. We know that this is the first
				1507	* slab on the node for this slabcache. There are no concurrent accesses
				1508	* possible.
				1509	*
				1510	* Note that this function only works on the kmalloc_node_cache
				1511	* when allocating for the kmalloc_node_cache.
				1512	*/
				1513	static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
				1514	int node)
				1515	{
				1516	struct page *page;
				1517	struct kmem_cache_node *n;
				1518
				1519	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
				1520
				1521	page = new_slab(kmalloc_caches, gfpflags \| GFP_THISNODE, node);
				1522	/* new_slab() disables interupts */
				1523	local_irq_enable();
				1524
				1525	BUG_ON(!page);
				1526	n = page->freelist;
				1527	BUG_ON(!n);
				1528	page->freelist = get_freepointer(kmalloc_caches, n);
				1529	page->inuse++;
				1530	kmalloc_caches->node[node] = n;
				1531	init_object(kmalloc_caches, n, 1);
				1532	init_kmem_cache_node(n);
				1533	atomic_long_inc(&n->nr_slabs);
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1534	add_partial(n, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1535	return n;
				1536	}
				1537
				1538	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1539	{
				1540	int node;
				1541
				1542	for_each_online_node(node) {
				1543	struct kmem_cache_node *n = s->node[node];
				1544	if (n && n != &s->local_node)
				1545	kmem_cache_free(kmalloc_caches, n);
				1546	s->node[node] = NULL;
				1547	}
				1548	}
				1549
				1550	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1551	{
				1552	int node;
				1553	int local_node;
				1554
				1555	if (slab_state >= UP)
				1556	local_node = page_to_nid(virt_to_page(s));
				1557	else
				1558	local_node = 0;
				1559
				1560	for_each_online_node(node) {
				1561	struct kmem_cache_node *n;
				1562
				1563	if (local_node == node)
				1564	n = &s->local_node;
				1565	else {
				1566	if (slab_state == DOWN) {
				1567	n = early_kmem_cache_node_alloc(gfpflags,
				1568	node);
				1569	continue;
				1570	}
				1571	n = kmem_cache_alloc_node(kmalloc_caches,
				1572	gfpflags, node);
				1573
				1574	if (!n) {
				1575	free_kmem_cache_nodes(s);
				1576	return 0;
				1577	}
				1578
				1579	}
				1580	s->node[node] = n;
				1581	init_kmem_cache_node(n);
				1582	}
				1583	return 1;
				1584	}
				1585	#else
				1586	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1587	{
				1588	}
				1589
				1590	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1591	{
				1592	init_kmem_cache_node(&s->local_node);
				1593	return 1;
				1594	}
				1595	#endif
				1596
				1597	/*
				1598	* calculate_sizes() determines the order and the distribution of data within
				1599	* a slab object.
				1600	*/
				1601	static int calculate_sizes(struct kmem_cache *s)
				1602	{
				1603	unsigned long flags = s->flags;
				1604	unsigned long size = s->objsize;
				1605	unsigned long align = s->align;
				1606
				1607	/*
				1608	* Determine if we can poison the object itself. If the user of
				1609	* the slab may touch the object after free or before allocation
				1610	* then we should never poison the object itself.
				1611	*/
				1612	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
				1613	!s->ctor && !s->dtor)
				1614	s->flags \|= __OBJECT_POISON;
				1615	else
				1616	s->flags &= ~__OBJECT_POISON;
				1617
				1618	/*
				1619	* Round up object size to the next word boundary. We can only
				1620	* place the free pointer at word boundaries and this determines
				1621	* the possible location of the free pointer.
				1622	*/
				1623	size = ALIGN(size, sizeof(void *));
				1624
				1625	/*
				1626	* If we are redzoning then check if there is some space between the
				1627	* end of the object and the free pointer. If not then add an
				1628	* additional word, so that we can establish a redzone between
				1629	* the object and the freepointer to be able to check for overwrites.
				1630	*/
				1631	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
				1632	size += sizeof(void *);
				1633
				1634	/*
				1635	* With that we have determined how much of the slab is in actual
				1636	* use by the object. This is the potential offset to the free
				1637	* pointer.
				1638	*/
				1639	s->inuse = size;
				1640
				1641	if (((flags & (SLAB_DESTROY_BY_RCU \| SLAB_POISON)) \|\|
				1642	s->ctor \|\| s->dtor)) {
				1643	/*
				1644	* Relocate free pointer after the object if it is not
				1645	* permitted to overwrite the first word of the object on
				1646	* kmem_cache_free.
				1647	*
				1648	* This is the case if we do RCU, have a constructor or
				1649	* destructor or are poisoning the objects.
				1650	*/
				1651	s->offset = size;
				1652	size += sizeof(void *);
				1653	}
				1654
				1655	if (flags & SLAB_STORE_USER)
				1656	/*
				1657	* Need to store information about allocs and frees after
				1658	* the object.
				1659	*/
				1660	size += 2 * sizeof(struct track);
				1661
				1662	if (flags & DEBUG_DEFAULT_FLAGS)
				1663	/*
				1664	* Add some empty padding so that we can catch
				1665	* overwrites from earlier objects rather than let
				1666	* tracking information or the free pointer be
				1667	* corrupted if an user writes before the start
				1668	* of the object.
				1669	*/
				1670	size += sizeof(void *);
				1671	/*
				1672	* Determine the alignment based on various parameters that the
				1673	* user specified (this is unecessarily complex due to the attempt
				1674	* to be compatible with SLAB. Should be cleaned up some day).
				1675	*/
				1676	align = calculate_alignment(flags, align, s->objsize);
				1677
				1678	/*
				1679	* SLUB stores one object immediately after another beginning from
				1680	* offset 0. In order to align the objects we have to simply size
				1681	* each object to conform to the alignment.
				1682	*/
				1683	size = ALIGN(size, align);
				1684	s->size = size;
				1685
				1686	s->order = calculate_order(size);
				1687	if (s->order < 0)
				1688	return 0;
				1689
				1690	/*
				1691	* Determine the number of objects per slab
				1692	*/
				1693	s->objects = (PAGE_SIZE << s->order) / size;
				1694
				1695	/*
				1696	* Verify that the number of objects is within permitted limits.
				1697	* The page->inuse field is only 16 bit wide! So we cannot have
				1698	* more than 64k objects per slab.
				1699	*/
				1700	if (!s->objects \|\| s->objects > 65535)
				1701	return 0;
				1702	return 1;
				1703
				1704	}
				1705
				1706	static int __init finish_bootstrap(void)
				1707	{
				1708	struct list_head *h;
				1709	int err;
				1710
				1711	slab_state = SYSFS;
				1712
				1713	list_for_each(h, &slab_caches) {
				1714	struct kmem_cache *s =
				1715	container_of(h, struct kmem_cache, list);
				1716
				1717	err = sysfs_slab_add(s);
				1718	BUG_ON(err);
				1719	}
				1720	return 0;
				1721	}
				1722
				1723	static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
				1724	const char *name, size_t size,
				1725	size_t align, unsigned long flags,
				1726	void (ctor)(void , struct kmem_cache *, unsigned long),
				1727	void (dtor)(void , struct kmem_cache *, unsigned long))
				1728	{
				1729	memset(s, 0, kmem_size);
				1730	s->name = name;
				1731	s->ctor = ctor;
				1732	s->dtor = dtor;
				1733	s->objsize = size;
				1734	s->flags = flags;
				1735	s->align = align;
				1736
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1737	/*
				1738	* The page->offset field is only 16 bit wide. This is an offset
				1739	* in units of words from the beginning of an object. If the slab
				1740	* size is bigger then we cannot move the free pointer behind the
				1741	* object anymore.
				1742	*
				1743	* On 32 bit platforms the limit is 256k. On 64bit platforms
				1744	* the limit is 512k.
				1745	*
				1746	* Debugging or ctor/dtors may create a need to move the free
				1747	* pointer. Fail if this happens.
				1748	*/
				1749	if (s->size >= 65535 * sizeof(void *)) {
				1750	BUG_ON(flags & (SLAB_RED_ZONE \| SLAB_POISON \|
				1751	SLAB_STORE_USER \| SLAB_DESTROY_BY_RCU));
				1752	BUG_ON(ctor \|\| dtor);
				1753	}
				1754	else
				1755	/*
				1756	* Enable debugging if selected on the kernel commandline.
				1757	*/
				1758	if (slub_debug && (!slub_debug_slabs \|\|
				1759	strncmp(slub_debug_slabs, name,
				1760	strlen(slub_debug_slabs)) == 0))
				1761	s->flags \|= slub_debug;
				1762
				1763	if (!calculate_sizes(s))
				1764	goto error;
				1765
				1766	s->refcount = 1;
				1767	#ifdef CONFIG_NUMA
				1768	s->defrag_ratio = 100;
				1769	#endif
				1770
				1771	if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
				1772	return 1;
				1773	error:
				1774	if (flags & SLAB_PANIC)
				1775	panic("Cannot create slab %s size=%lu realsize=%u "
				1776	"order=%u offset=%u flags=%lx\n",
				1777	s->name, (unsigned long)size, s->size, s->order,
				1778	s->offset, flags);
				1779	return 0;
				1780	}
				1781	EXPORT_SYMBOL(kmem_cache_open);
				1782
				1783	/*
				1784	* Check if a given pointer is valid
				1785	*/
				1786	int kmem_ptr_validate(struct kmem_cache s, const void object)
				1787	{
				1788	struct page * page;
				1789	void *addr;
				1790
				1791	page = get_object_page(object);
				1792
				1793	if (!page \|\| s != page->slab)
				1794	/* No slab or wrong slab */
				1795	return 0;
				1796
				1797	addr = page_address(page);
				1798	if (object < addr \|\| object >= addr + s->objects * s->size)
				1799	/* Out of bounds */
				1800	return 0;
				1801
				1802	if ((object - addr) % s->size)
				1803	/* Improperly aligned */
				1804	return 0;
				1805
				1806	/*
				1807	* We could also check if the object is on the slabs freelist.
				1808	* But this would be too expensive and it seems that the main
				1809	* purpose of kmem_ptr_valid is to check if the object belongs
				1810	* to a certain slab.
				1811	*/
				1812	return 1;
				1813	}
				1814	EXPORT_SYMBOL(kmem_ptr_validate);
				1815
				1816	/*
				1817	* Determine the size of a slab object
				1818	*/
				1819	unsigned int kmem_cache_size(struct kmem_cache *s)
				1820	{
				1821	return s->objsize;
				1822	}
				1823	EXPORT_SYMBOL(kmem_cache_size);
				1824
				1825	const char kmem_cache_name(struct kmem_cache s)
				1826	{
				1827	return s->name;
				1828	}
				1829	EXPORT_SYMBOL(kmem_cache_name);
				1830
				1831	/*
				1832	* Attempt to free all slabs on a node
				1833	*/
				1834	static int free_list(struct kmem_cache s, struct kmem_cache_node n,
				1835	struct list_head *list)
				1836	{
				1837	int slabs_inuse = 0;
				1838	unsigned long flags;
				1839	struct page page, h;
				1840
				1841	spin_lock_irqsave(&n->list_lock, flags);
				1842	list_for_each_entry_safe(page, h, list, lru)
				1843	if (!page->inuse) {
				1844	list_del(&page->lru);
				1845	discard_slab(s, page);
				1846	} else
				1847	slabs_inuse++;
				1848	spin_unlock_irqrestore(&n->list_lock, flags);
				1849	return slabs_inuse;
				1850	}
				1851
				1852	/*
				1853	* Release all resources used by slab cache
				1854	*/
				1855	static int kmem_cache_close(struct kmem_cache *s)
				1856	{
				1857	int node;
				1858
				1859	flush_all(s);
				1860
				1861	/* Attempt to free all objects */
				1862	for_each_online_node(node) {
				1863	struct kmem_cache_node *n = get_node(s, node);
				1864
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	1865	n->nr_partial -= free_list(s, n, &n->partial);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1866	if (atomic_long_read(&n->nr_slabs))
				1867	return 1;
				1868	}
				1869	free_kmem_cache_nodes(s);
				1870	return 0;
				1871	}
				1872
				1873	/*
				1874	* Close a cache and release the kmem_cache structure
				1875	* (must be used for caches created using kmem_cache_create)
				1876	*/
				1877	void kmem_cache_destroy(struct kmem_cache *s)
				1878	{
				1879	down_write(&slub_lock);
				1880	s->refcount--;
				1881	if (!s->refcount) {
				1882	list_del(&s->list);
				1883	if (kmem_cache_close(s))
				1884	WARN_ON(1);
				1885	sysfs_slab_remove(s);
				1886	kfree(s);
				1887	}
				1888	up_write(&slub_lock);
				1889	}
				1890	EXPORT_SYMBOL(kmem_cache_destroy);
				1891
				1892	/********************************************************************
				1893	* Kmalloc subsystem
				1894	*******************************************************************/
				1895
/*
 * Statically allocated caches backing kmalloc(). get_slab() indexes this
 * array with the value kmalloc_index() returns for the requested size.
 */
struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_ZONE_DMA
/*
 * DMA counterparts of kmalloc_caches. Entries start out NULL and are
 * created by get_slab() the first time a DMA allocation needs them.
 */
static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
#endif
				1902
				1903	static int __init setup_slub_min_order(char *str)
				1904	{
				1905	get_option (&str, &slub_min_order);
				1906
				1907	return 1;
				1908	}
				1909
				1910	__setup("slub_min_order=", setup_slub_min_order);
				1911
				1912	static int __init setup_slub_max_order(char *str)
				1913	{
				1914	get_option (&str, &slub_max_order);
				1915
				1916	return 1;
				1917	}
				1918
				1919	__setup("slub_max_order=", setup_slub_max_order);
				1920
				1921	static int __init setup_slub_min_objects(char *str)
				1922	{
				1923	get_option (&str, &slub_min_objects);
				1924
				1925	return 1;
				1926	}
				1927
				1928	__setup("slub_min_objects=", setup_slub_min_objects);
				1929
				1930	static int __init setup_slub_nomerge(char *str)
				1931	{
				1932	slub_nomerge = 1;
				1933	return 1;
				1934	}
				1935
				1936	__setup("slub_nomerge", setup_slub_nomerge);
				1937
				1938	static int __init setup_slub_debug(char *str)
				1939	{
				1940	if (!str \|\| *str != '=')
				1941	slub_debug = DEBUG_DEFAULT_FLAGS;
				1942	else {
				1943	str++;
				1944	if (str == 0 \|\| str == ',')
				1945	slub_debug = DEBUG_DEFAULT_FLAGS;
				1946	else
				1947	for( ;str && str != ','; str++)
				1948	switch (*str) {
				1949	case 'f' : case 'F' :
				1950	slub_debug \|= SLAB_DEBUG_FREE;
				1951	break;
				1952	case 'z' : case 'Z' :
				1953	slub_debug \|= SLAB_RED_ZONE;
				1954	break;
				1955	case 'p' : case 'P' :
				1956	slub_debug \|= SLAB_POISON;
				1957	break;
				1958	case 'u' : case 'U' :
				1959	slub_debug \|= SLAB_STORE_USER;
				1960	break;
				1961	case 't' : case 'T' :
				1962	slub_debug \|= SLAB_TRACE;
				1963	break;
				1964	default:
				1965	printk(KERN_ERR "slub_debug option '%c' "
				1966	"unknown. skipped\n",*str);
				1967	}
				1968	}
				1969
				1970	if (*str == ',')
				1971	slub_debug_slabs = str + 1;
				1972	return 1;
				1973	}
				1974
				1975	__setup("slub_debug", setup_slub_debug);
				1976
				1977	static struct kmem_cache create_kmalloc_cache(struct kmem_cache s,
				1978	const char *name, int size, gfp_t gfp_flags)
				1979	{
				1980	unsigned int flags = 0;
				1981
				1982	if (gfp_flags & SLUB_DMA)
				1983	flags = SLAB_CACHE_DMA;
				1984
				1985	down_write(&slub_lock);
				1986	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
				1987	flags, NULL, NULL))
				1988	goto panic;
				1989
				1990	list_add(&s->list, &slab_caches);
				1991	up_write(&slub_lock);
				1992	if (sysfs_slab_add(s))
				1993	goto panic;
				1994	return s;
				1995
				1996	panic:
				1997	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
				1998	}
				1999
				2000	static struct kmem_cache *get_slab(size_t size, gfp_t flags)
				2001	{
				2002	int index = kmalloc_index(size);
				2003
Christoph Lameter	614410d	2007-05-06 14:49:38 -0700	[diff] [blame]	2004	if (!index)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2005	return NULL;
				2006
				2007	/* Allocation too large? */
				2008	BUG_ON(index < 0);
				2009
				2010	#ifdef CONFIG_ZONE_DMA
				2011	if ((flags & SLUB_DMA)) {
				2012	struct kmem_cache *s;
				2013	struct kmem_cache *x;
				2014	char *text;
				2015	size_t realsize;
				2016
				2017	s = kmalloc_caches_dma[index];
				2018	if (s)
				2019	return s;
				2020
				2021	/* Dynamically create dma cache */
				2022	x = kmalloc(kmem_size, flags & ~SLUB_DMA);
				2023	if (!x)
				2024	panic("Unable to allocate memory for dma cache\n");
				2025
				2026	if (index <= KMALLOC_SHIFT_HIGH)
				2027	realsize = 1 << index;
				2028	else {
				2029	if (index == 1)
				2030	realsize = 96;
				2031	else
				2032	realsize = 192;
				2033	}
				2034
				2035	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
				2036	(unsigned int)realsize);
				2037	s = create_kmalloc_cache(x, text, realsize, flags);
				2038	kmalloc_caches_dma[index] = s;
				2039	return s;
				2040	}
				2041	#endif
				2042	return &kmalloc_caches[index];
				2043	}
				2044
				2045	void *__kmalloc(size_t size, gfp_t flags)
				2046	{
				2047	struct kmem_cache *s = get_slab(size, flags);
				2048
				2049	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2050	return slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2051	return NULL;
				2052	}
				2053	EXPORT_SYMBOL(__kmalloc);
				2054
				2055	#ifdef CONFIG_NUMA
				2056	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				2057	{
				2058	struct kmem_cache *s = get_slab(size, flags);
				2059
				2060	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2061	return slab_alloc(s, flags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2062	return NULL;
				2063	}
				2064	EXPORT_SYMBOL(__kmalloc_node);
				2065	#endif
				2066
				2067	size_t ksize(const void *object)
				2068	{
				2069	struct page *page = get_object_page(object);
				2070	struct kmem_cache *s;
				2071
				2072	BUG_ON(!page);
				2073	s = page->slab;
				2074	BUG_ON(!s);
				2075
				2076	/*
				2077	* Debugging requires use of the padding between object
				2078	* and whatever may come after it.
				2079	*/
				2080	if (s->flags & (SLAB_RED_ZONE \| SLAB_POISON))
				2081	return s->objsize;
				2082
				2083	/*
				2084	* If we have the need to store the freelist pointer
				2085	* back there or track user information then we can
				2086	* only use the space before that information.
				2087	*/
				2088	if (s->flags & (SLAB_DESTROY_BY_RCU \| SLAB_STORE_USER))
				2089	return s->inuse;
				2090
				2091	/*
				2092	* Else we can use all the padding etc for the allocation
				2093	*/
				2094	return s->size;
				2095	}
				2096	EXPORT_SYMBOL(ksize);
				2097
				2098	void kfree(const void *x)
				2099	{
				2100	struct kmem_cache *s;
				2101	struct page *page;
				2102
				2103	if (!x)
				2104	return;
				2105
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2106	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2107	s = page->slab;
				2108
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2109	slab_free(s, page, (void *)x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2110	}
				2111	EXPORT_SYMBOL(kfree);
				2112
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2113	/*
				2114	* kmem_cache_shrink removes empty slabs from the partial lists
				2115	* and then sorts the partially allocated slabs by the number
				2116	* of items in use. The slabs with the most items in use
				2117	* come first. New allocations will remove these from the
				2118	* partial list because they are full. The slabs with the
				2119	* least items are placed last. If it happens that the objects
				2120	* are freed then the page can be returned to the page allocator.
				2121	*/
				2122	int kmem_cache_shrink(struct kmem_cache *s)
				2123	{
				2124	int node;
				2125	int i;
				2126	struct kmem_cache_node *n;
				2127	struct page *page;
				2128	struct page *t;
				2129	struct list_head *slabs_by_inuse =
				2130	kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
				2131	unsigned long flags;
				2132
				2133	if (!slabs_by_inuse)
				2134	return -ENOMEM;
				2135
				2136	flush_all(s);
				2137	for_each_online_node(node) {
				2138	n = get_node(s, node);
				2139
				2140	if (!n->nr_partial)
				2141	continue;
				2142
				2143	for (i = 0; i < s->objects; i++)
				2144	INIT_LIST_HEAD(slabs_by_inuse + i);
				2145
				2146	spin_lock_irqsave(&n->list_lock, flags);
				2147
				2148	/*
				2149	* Build lists indexed by the items in use in
				2150	* each slab or free slabs if empty.
				2151	*
				2152	* Note that concurrent frees may occur while
				2153	* we hold the list_lock. page->inuse here is
				2154	* the upper limit.
				2155	*/
				2156	list_for_each_entry_safe(page, t, &n->partial, lru) {
				2157	if (!page->inuse && slab_trylock(page)) {
				2158	/*
				2159	* Must hold slab lock here because slab_free
				2160	* may have freed the last object and be
				2161	* waiting to release the slab.
				2162	*/
				2163	list_del(&page->lru);
				2164	n->nr_partial--;
				2165	slab_unlock(page);
				2166	discard_slab(s, page);
				2167	} else {
				2168	if (n->nr_partial > MAX_PARTIAL)
				2169	list_move(&page->lru,
				2170	slabs_by_inuse + page->inuse);
				2171	}
				2172	}
				2173
				2174	if (n->nr_partial <= MAX_PARTIAL)
				2175	goto out;
				2176
				2177	/*
				2178	* Rebuild the partial list with the slabs filled up
				2179	* most first and the least used slabs at the end.
				2180	*/
				2181	for (i = s->objects - 1; i >= 0; i--)
				2182	list_splice(slabs_by_inuse + i, n->partial.prev);
				2183
				2184	out:
				2185	spin_unlock_irqrestore(&n->list_lock, flags);
				2186	}
				2187
				2188	kfree(slabs_by_inuse);
				2189	return 0;
				2190	}
				2191	EXPORT_SYMBOL(kmem_cache_shrink);
				2192
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2193	/**
				2194	* krealloc - reallocate memory. The contents will remain unchanged.
				2195	*
				2196	* @p: object to reallocate memory for.
				2197	* @new_size: how many bytes of memory are required.
				2198	* @flags: the type of memory to allocate.
				2199	*
				2200	* The contents of the object pointed to are preserved up to the
				2201	* lesser of the new and old sizes. If @p is %NULL, krealloc()
				2202	* behaves exactly like kmalloc(). If @size is 0 and @p is not a
				2203	* %NULL pointer, the object pointed to is freed.
				2204	*/
				2205	void krealloc(const void p, size_t new_size, gfp_t flags)
				2206	{
				2207	struct kmem_cache *new_cache;
				2208	void *ret;
				2209	struct page *page;
				2210
				2211	if (unlikely(!p))
				2212	return kmalloc(new_size, flags);
				2213
				2214	if (unlikely(!new_size)) {
				2215	kfree(p);
				2216	return NULL;
				2217	}
				2218
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2219	page = virt_to_head_page(p);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2220
				2221	new_cache = get_slab(new_size, flags);
				2222
				2223	/*
				2224	* If new size fits in the current cache, bail out.
				2225	*/
				2226	if (likely(page->slab == new_cache))
				2227	return (void *)p;
				2228
				2229	ret = kmalloc(new_size, flags);
				2230	if (ret) {
				2231	memcpy(ret, p, min(new_size, ksize(p)));
				2232	kfree(p);
				2233	}
				2234	return ret;
				2235	}
				2236	EXPORT_SYMBOL(krealloc);
				2237
				2238	/********************************************************************
				2239	* Basic setup of slabs
				2240	*******************************************************************/
				2241
				2242	void __init kmem_cache_init(void)
				2243	{
				2244	int i;
				2245
				2246	#ifdef CONFIG_NUMA
				2247	/*
				2248	* Must first have the slab cache available for the allocations of the
				2249	* struct kmalloc_cache_node's. There is special bootstrap code in
				2250	* kmem_cache_open for slab_state == DOWN.
				2251	*/
				2252	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
				2253	sizeof(struct kmem_cache_node), GFP_KERNEL);
				2254	#endif
				2255
				2256	/* Able to allocate the per node structures */
				2257	slab_state = PARTIAL;
				2258
				2259	/* Caches that are not of the two-to-the-power-of size */
				2260	create_kmalloc_cache(&kmalloc_caches[1],
				2261	"kmalloc-96", 96, GFP_KERNEL);
				2262	create_kmalloc_cache(&kmalloc_caches[2],
				2263	"kmalloc-192", 192, GFP_KERNEL);
				2264
				2265	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2266	create_kmalloc_cache(&kmalloc_caches[i],
				2267	"kmalloc", 1 << i, GFP_KERNEL);
				2268
				2269	slab_state = UP;
				2270
				2271	/* Provide the correct kmalloc names now that the caches are up */
				2272	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2273	kmalloc_caches[i]. name =
				2274	kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
				2275
				2276	#ifdef CONFIG_SMP
				2277	register_cpu_notifier(&slab_notifier);
				2278	#endif
				2279
				2280	if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
				2281	kmem_size = offsetof(struct kmem_cache, cpu_slab)
				2282	+ nr_cpu_ids * sizeof(struct page *);
				2283
				2284	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
				2285	" Processors=%d, Nodes=%d\n",
				2286	KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES,
				2287	slub_min_order, slub_max_order, slub_min_objects,
				2288	nr_cpu_ids, nr_node_ids);
				2289	}
				2290
				2291	/*
				2292	* Find a mergeable slab cache
				2293	*/
				2294	static int slab_unmergeable(struct kmem_cache *s)
				2295	{
				2296	if (slub_nomerge \|\| (s->flags & SLUB_NEVER_MERGE))
				2297	return 1;
				2298
				2299	if (s->ctor \|\| s->dtor)
				2300	return 1;
				2301
				2302	return 0;
				2303	}
				2304
				2305	static struct kmem_cache *find_mergeable(size_t size,
				2306	size_t align, unsigned long flags,
				2307	void (ctor)(void , struct kmem_cache *, unsigned long),
				2308	void (dtor)(void , struct kmem_cache *, unsigned long))
				2309	{
				2310	struct list_head *h;
				2311
				2312	if (slub_nomerge \|\| (flags & SLUB_NEVER_MERGE))
				2313	return NULL;
				2314
				2315	if (ctor \|\| dtor)
				2316	return NULL;
				2317
				2318	size = ALIGN(size, sizeof(void *));
				2319	align = calculate_alignment(flags, align, size);
				2320	size = ALIGN(size, align);
				2321
				2322	list_for_each(h, &slab_caches) {
				2323	struct kmem_cache *s =
				2324	container_of(h, struct kmem_cache, list);
				2325
				2326	if (slab_unmergeable(s))
				2327	continue;
				2328
				2329	if (size > s->size)
				2330	continue;
				2331
				2332	if (((flags \| slub_debug) & SLUB_MERGE_SAME) !=
				2333	(s->flags & SLUB_MERGE_SAME))
				2334	continue;
				2335	/*
				2336	* Check if alignment is compatible.
				2337	* Courtesy of Adrian Drzewiecki
				2338	*/
				2339	if ((s->size & ~(align -1)) != s->size)
				2340	continue;
				2341
				2342	if (s->size - size >= sizeof(void *))
				2343	continue;
				2344
				2345	return s;
				2346	}
				2347	return NULL;
				2348	}
				2349
				2350	struct kmem_cache kmem_cache_create(const char name, size_t size,
				2351	size_t align, unsigned long flags,
				2352	void (ctor)(void , struct kmem_cache *, unsigned long),
				2353	void (dtor)(void , struct kmem_cache *, unsigned long))
				2354	{
				2355	struct kmem_cache *s;
				2356
				2357	down_write(&slub_lock);
				2358	s = find_mergeable(size, align, flags, dtor, ctor);
				2359	if (s) {
				2360	s->refcount++;
				2361	/*
				2362	* Adjust the object sizes so that we clear
				2363	* the complete object on kzalloc.
				2364	*/
				2365	s->objsize = max(s->objsize, (int)size);
				2366	s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
				2367	if (sysfs_slab_alias(s, name))
				2368	goto err;
				2369	} else {
				2370	s = kmalloc(kmem_size, GFP_KERNEL);
				2371	if (s && kmem_cache_open(s, GFP_KERNEL, name,
				2372	size, align, flags, ctor, dtor)) {
				2373	if (sysfs_slab_add(s)) {
				2374	kfree(s);
				2375	goto err;
				2376	}
				2377	list_add(&s->list, &slab_caches);
				2378	} else
				2379	kfree(s);
				2380	}
				2381	up_write(&slub_lock);
				2382	return s;
				2383
				2384	err:
				2385	up_write(&slub_lock);
				2386	if (flags & SLAB_PANIC)
				2387	panic("Cannot create slabcache %s\n", name);
				2388	else
				2389	s = NULL;
				2390	return s;
				2391	}
				2392	EXPORT_SYMBOL(kmem_cache_create);
				2393
				2394	void kmem_cache_zalloc(struct kmem_cache s, gfp_t flags)
				2395	{
				2396	void *x;
				2397
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2398	x = slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2399	if (x)
				2400	memset(x, 0, s->objsize);
				2401	return x;
				2402	}
				2403	EXPORT_SYMBOL(kmem_cache_zalloc);
				2404
				2405	#ifdef CONFIG_SMP
				2406	static void for_all_slabs(void (func)(struct kmem_cache , int), int cpu)
				2407	{
				2408	struct list_head *h;
				2409
				2410	down_read(&slub_lock);
				2411	list_for_each(h, &slab_caches) {
				2412	struct kmem_cache *s =
				2413	container_of(h, struct kmem_cache, list);
				2414
				2415	func(s, cpu);
				2416	}
				2417	up_read(&slub_lock);
				2418	}
				2419
				2420	/*
				2421	* Use the cpu notifier to insure that the slab are flushed
				2422	* when necessary.
				2423	*/
				2424	static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
				2425	unsigned long action, void *hcpu)
				2426	{
				2427	long cpu = (long)hcpu;
				2428
				2429	switch (action) {
				2430	case CPU_UP_CANCELED:
				2431	case CPU_DEAD:
				2432	for_all_slabs(__flush_cpu_slab, cpu);
				2433	break;
				2434	default:
				2435	break;
				2436	}
				2437	return NOTIFY_OK;
				2438	}
				2439
/*
 * Notifier block handed to register_cpu_notifier() in kmem_cache_init();
 * its first member is the callback slab_cpuup_callback().
 */
static struct notifier_block __cpuinitdata slab_notifier =
	{ &slab_cpuup_callback, NULL, 0 };
				2442
				2443	#endif
				2444
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2445	#ifdef CONFIG_NUMA
				2446
				2447	/*****************************************************************
				2448	* Generic reaper used to support the page allocator
				2449	* (the cpu slabs are reaped by a per slab workqueue).
				2450	*
				2451	* Maybe move this to the page allocator?
				2452	****************************************************************/
				2453
				2454	static DEFINE_PER_CPU(unsigned long, reap_node);
				2455
				2456	static void init_reap_node(int cpu)
				2457	{
				2458	int node;
				2459
				2460	node = next_node(cpu_to_node(cpu), node_online_map);
				2461	if (node == MAX_NUMNODES)
				2462	node = first_node(node_online_map);
				2463
				2464	__get_cpu_var(reap_node) = node;
				2465	}
				2466
				2467	static void next_reap_node(void)
				2468	{
				2469	int node = __get_cpu_var(reap_node);
				2470
				2471	/*
				2472	* Also drain per cpu pages on remote zones
				2473	*/
				2474	if (node != numa_node_id())
				2475	drain_node_pages(node);
				2476
				2477	node = next_node(node, node_online_map);
				2478	if (unlikely(node >= MAX_NUMNODES))
				2479	node = first_node(node_online_map);
				2480	__get_cpu_var(reap_node) = node;
				2481	}
				2482	#else
				2483	#define init_reap_node(cpu) do { } while (0)
				2484	#define next_reap_node(void) do { } while (0)
				2485	#endif
				2486
				2487	#define REAPTIMEOUT_CPUC (2*HZ)
				2488
				2489	#ifdef CONFIG_SMP
				2490	static DEFINE_PER_CPU(struct delayed_work, reap_work);
				2491
				2492	static void cache_reap(struct work_struct *unused)
				2493	{
				2494	next_reap_node();
				2495	refresh_cpu_vm_stats(smp_processor_id());
				2496	schedule_delayed_work(&__get_cpu_var(reap_work),
				2497	REAPTIMEOUT_CPUC);
				2498	}
				2499
				2500	static void __devinit start_cpu_timer(int cpu)
				2501	{
				2502	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
				2503
				2504	/*
				2505	* When this gets called from do_initcalls via cpucache_init(),
				2506	* init_workqueues() has already run, so keventd will be setup
				2507	* at that time.
				2508	*/
				2509	if (keventd_up() && reap_work->work.func == NULL) {
				2510	init_reap_node(cpu);
				2511	INIT_DELAYED_WORK(reap_work, cache_reap);
				2512	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
				2513	}
				2514	}
				2515
				2516	static int __init cpucache_init(void)
				2517	{
				2518	int cpu;
				2519
				2520	/*
				2521	* Register the timers that drain pcp pages and update vm statistics
				2522	*/
				2523	for_each_online_cpu(cpu)
				2524	start_cpu_timer(cpu);
				2525	return 0;
				2526	}
				2527	__initcall(cpucache_init);
				2528	#endif
				2529
				2530	#ifdef SLUB_RESILIENCY_TEST
				2531	static unsigned long validate_slab_cache(struct kmem_cache *s);
				2532
/*
 * Self test that deliberately corrupts kmalloc objects and then runs
 * validate_slab_cache() on the kmalloc cache involved (kmalloc_caches + N
 * is the 2^N byte cache). Part A writes past live objects, part B writes
 * to objects after they were freed. The writes are intentional; none of
 * the allocations is checked for failure.
 */
static void resiliency_test(void)
{
	u8 *p;

	printk(KERN_ERR "SLUB resiliency testing\n");
	printk(KERN_ERR "-----------------------\n");
	printk(KERN_ERR "A. Corruption after allocation\n");

	/* A1: one byte directly behind a 16 byte object */
	p = kzalloc(16, GFP_KERNEL);
	p[16] = 0x12;
	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
			" 0x12->0x%p\n\n", p + 16);

	validate_slab_cache(kmalloc_caches + 4);

	/* Hmmm... The next two are dangerous */
	/* A2: one byte a pointer width behind the end of a 32 byte object */
	p = kzalloc(32, GFP_KERNEL);
	p[32 + sizeof(void *)] = 0x34;
	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
			" 0x34 -> -0x%p\n", p);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");

	validate_slab_cache(kmalloc_caches + 5);
	/* A3: a byte at a cycle-counter dependent offset past a 64 byte object */
	p = kzalloc(64, GFP_KERNEL);
	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
	*p = 0x56;
	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
									p);
	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
	validate_slab_cache(kmalloc_caches + 6);

	printk(KERN_ERR "\nB. Corruption after free\n");
	/* B1: first byte of a freed 128 byte object */
	p = kzalloc(128, GFP_KERNEL);
	kfree(p);
	*p = 0x78;
	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches + 7);

	/* B2: byte 50 of a freed 256 byte object */
	p = kzalloc(256, GFP_KERNEL);
	kfree(p);
	p[50] = 0x9a;
	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches + 8);

	/* B3: the byte directly behind a freed 512 byte object */
	p = kzalloc(512, GFP_KERNEL);
	kfree(p);
	p[512] = 0xab;
	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
	validate_slab_cache(kmalloc_caches + 9);
}
				2583	#else
				2584	static void resiliency_test(void) {};
				2585	#endif
				2586
				2587	/*
				2588	* These are not as efficient as kmalloc for the non debug case.
				2589	* We do not have the page struct available so we have to touch one
				2590	* cacheline in struct kmem_cache to check slab flags.
				2591	*/
				2592	void __kmalloc_track_caller(size_t size, gfp_t gfpflags, void caller)
				2593	{
				2594	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2595
				2596	if (!s)
				2597	return NULL;
				2598
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2599	return slab_alloc(s, gfpflags, -1, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2600	}
				2601
				2602	void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
				2603	int node, void *caller)
				2604	{
				2605	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2606
				2607	if (!s)
				2608	return NULL;
				2609
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2610	return slab_alloc(s, gfpflags, node, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2611	}
				2612
				2613	#ifdef CONFIG_SYSFS
				2614
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2615	static int validate_slab(struct kmem_cache s, struct page page)
				2616	{
				2617	void *p;
				2618	void *addr = page_address(page);
				2619	unsigned long map[BITS_TO_LONGS(s->objects)];
				2620
				2621	if (!check_slab(s, page) \|\|
				2622	!on_freelist(s, page, NULL))
				2623	return 0;
				2624
				2625	/* Now we know that a valid freelist exists */
				2626	bitmap_zero(map, s->objects);
				2627
				2628	for(p = page->freelist; p; p = get_freepointer(s, p)) {
				2629	set_bit((p - addr) / s->size, map);
				2630	if (!check_object(s, page, p, 0))
				2631	return 0;
				2632	}
				2633
				2634	for(p = addr; p < addr + s->objects * s->size; p += s->size)
				2635	if (!test_bit((p - addr) / s->size, map))
				2636	if (!check_object(s, page, p, 1))
				2637	return 0;
				2638	return 1;
				2639	}
				2640
				2641	static void validate_slab_slab(struct kmem_cache s, struct page page)
				2642	{
				2643	if (slab_trylock(page)) {
				2644	validate_slab(s, page);
				2645	slab_unlock(page);
				2646	} else
				2647	printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
				2648	s->name, page);
				2649
				2650	if (s->flags & DEBUG_DEFAULT_FLAGS) {
				2651	if (!PageError(page))
				2652	printk(KERN_ERR "SLUB %s: PageError not set "
				2653	"on slab 0x%p\n", s->name, page);
				2654	} else {
				2655	if (PageError(page))
				2656	printk(KERN_ERR "SLUB %s: PageError set on "
				2657	"slab 0x%p\n", s->name, page);
				2658	}
				2659	}
				2660
				2661	static int validate_slab_node(struct kmem_cache s, struct kmem_cache_node n)
				2662	{
				2663	unsigned long count = 0;
				2664	struct page *page;
				2665	unsigned long flags;
				2666
				2667	spin_lock_irqsave(&n->list_lock, flags);
				2668
				2669	list_for_each_entry(page, &n->partial, lru) {
				2670	validate_slab_slab(s, page);
				2671	count++;
				2672	}
				2673	if (count != n->nr_partial)
				2674	printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
				2675	"counter=%ld\n", s->name, count, n->nr_partial);
				2676
				2677	if (!(s->flags & SLAB_STORE_USER))
				2678	goto out;
				2679
				2680	list_for_each_entry(page, &n->full, lru) {
				2681	validate_slab_slab(s, page);
				2682	count++;
				2683	}
				2684	if (count != atomic_long_read(&n->nr_slabs))
				2685	printk(KERN_ERR "SLUB: %s %ld slabs counted but "
				2686	"counter=%ld\n", s->name, count,
				2687	atomic_long_read(&n->nr_slabs));
				2688
				2689	out:
				2690	spin_unlock_irqrestore(&n->list_lock, flags);
				2691	return count;
				2692	}
				2693
				2694	static unsigned long validate_slab_cache(struct kmem_cache *s)
				2695	{
				2696	int node;
				2697	unsigned long count = 0;
				2698
				2699	flush_all(s);
				2700	for_each_online_node(node) {
				2701	struct kmem_cache_node *n = get_node(s, node);
				2702
				2703	count += validate_slab_node(s, n);
				2704	}
				2705	return count;
				2706	}
				2707
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2708	/*
				2709	* Generate lists of locations where slabcache objects are allocated
				2710	* and freed.
				2711	*/
				2712
/* One tracked code address and the number of times it was recorded. */
struct location {
	unsigned long count;	/* times add_location() saw this address */
	void *addr;		/* the recorded address */
};

/* Growable array of locations; add_location() keeps it sorted by addr. */
struct loc_track {
	unsigned long max;	/* entries the loc array has room for */
	unsigned long count;	/* entries currently in use */
	struct location *loc;	/* array obtained from __get_free_pages() */
};
				2723
				2724	static void free_loc_track(struct loc_track *t)
				2725	{
				2726	if (t->max)
				2727	free_pages((unsigned long)t->loc,
				2728	get_order(sizeof(struct location) * t->max));
				2729	}
				2730
				2731	static int alloc_loc_track(struct loc_track *t, unsigned long max)
				2732	{
				2733	struct location *l;
				2734	int order;
				2735
				2736	if (!max)
				2737	max = PAGE_SIZE / sizeof(struct location);
				2738
				2739	order = get_order(sizeof(struct location) * max);
				2740
				2741	l = (void *)__get_free_pages(GFP_KERNEL, order);
				2742
				2743	if (!l)
				2744	return 0;
				2745
				2746	if (t->count) {
				2747	memcpy(l, t->loc, sizeof(struct location) * t->count);
				2748	free_loc_track(t);
				2749	}
				2750	t->max = max;
				2751	t->loc = l;
				2752	return 1;
				2753	}
				2754
				2755	static int add_location(struct loc_track t, struct kmem_cache s,
				2756	void *addr)
				2757	{
				2758	long start, end, pos;
				2759	struct location *l;
				2760	void *caddr;
				2761
				2762	start = -1;
				2763	end = t->count;
				2764
				2765	for ( ; ; ) {
				2766	pos = start + (end - start + 1) / 2;
				2767
				2768	/*
				2769	* There is nothing at "end". If we end up there
				2770	* we need to add something to before end.
				2771	*/
				2772	if (pos == end)
				2773	break;
				2774
				2775	caddr = t->loc[pos].addr;
				2776	if (addr == caddr) {
				2777	t->loc[pos].count++;
				2778	return 1;
				2779	}
				2780
				2781	if (addr < caddr)
				2782	end = pos;
				2783	else
				2784	start = pos;
				2785	}
				2786
				2787	/*
				2788	* Not found. Insert new tracking element
				2789	*/
				2790	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
				2791	return 0;
				2792
				2793	l = t->loc + pos;
				2794	if (pos < t->count)
				2795	memmove(l + 1, l,
				2796	(t->count - pos) * sizeof(struct location));
				2797	t->count++;
				2798	l->count = 1;
				2799	l->addr = addr;
				2800	return 1;
				2801	}
				2802
				2803	static void process_slab(struct loc_track t, struct kmem_cache s,
				2804	struct page *page, enum track_item alloc)
				2805	{
				2806	void *addr = page_address(page);
				2807	unsigned long map[BITS_TO_LONGS(s->objects)];
				2808	void *p;
				2809
				2810	bitmap_zero(map, s->objects);
				2811	for (p = page->freelist; p; p = get_freepointer(s, p))
				2812	set_bit((p - addr) / s->size, map);
				2813
				2814	for (p = addr; p < addr + s->objects * s->size; p += s->size)
				2815	if (!test_bit((p - addr) / s->size, map)) {
				2816	void *addr = get_track(s, p, alloc)->addr;
				2817
				2818	add_location(t, s, addr);
				2819	}
				2820	}
				2821
				2822	static int list_locations(struct kmem_cache s, char buf,
				2823	enum track_item alloc)
				2824	{
				2825	int n = 0;
				2826	unsigned long i;
				2827	struct loc_track t;
				2828	int node;
				2829
				2830	t.count = 0;
				2831	t.max = 0;
				2832
				2833	/* Push back cpu slabs */
				2834	flush_all(s);
				2835
				2836	for_each_online_node(node) {
				2837	struct kmem_cache_node *n = get_node(s, node);
				2838	unsigned long flags;
				2839	struct page *page;
				2840
				2841	if (!atomic_read(&n->nr_slabs))
				2842	continue;
				2843
				2844	spin_lock_irqsave(&n->list_lock, flags);
				2845	list_for_each_entry(page, &n->partial, lru)
				2846	process_slab(&t, s, page, alloc);
				2847	list_for_each_entry(page, &n->full, lru)
				2848	process_slab(&t, s, page, alloc);
				2849	spin_unlock_irqrestore(&n->list_lock, flags);
				2850	}
				2851
				2852	for (i = 0; i < t.count; i++) {
				2853	void *addr = t.loc[i].addr;
				2854
				2855	if (n > PAGE_SIZE - 100)
				2856	break;
				2857	n += sprintf(buf + n, "%7ld ", t.loc[i].count);
				2858	if (addr)
				2859	n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
				2860	else
				2861	n += sprintf(buf + n, "<not-available>");
				2862	n += sprintf(buf + n, "\n");
				2863	}
				2864
				2865	free_loc_track(&t);
				2866	if (!t.count)
				2867	n += sprintf(buf, "No data\n");
				2868	return n;
				2869	}
				2870
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2871	static unsigned long count_partial(struct kmem_cache_node *n)
				2872	{
				2873	unsigned long flags;
				2874	unsigned long x = 0;
				2875	struct page *page;
				2876
				2877	spin_lock_irqsave(&n->list_lock, flags);
				2878	list_for_each_entry(page, &n->partial, lru)
				2879	x += page->inuse;
				2880	spin_unlock_irqrestore(&n->list_lock, flags);
				2881	return x;
				2882	}
				2883
/* Bit positions for the SO_* selection flags passed to slab_objects(). */
enum slab_stat_type {
	SL_FULL,	/* include full slabs */
	SL_PARTIAL,	/* include slabs on the partial lists */
	SL_CPU,		/* include per cpu slabs */
	SL_OBJECTS	/* report objects instead of slabs */
};
				2890
/* Flag masks derived from enum slab_stat_type; may be or-ed together. */
#define SO_FULL (1 << SL_FULL)
#define SO_PARTIAL (1 << SL_PARTIAL)
#define SO_CPU (1 << SL_CPU)
#define SO_OBJECTS (1 << SL_OBJECTS)
				2895
				2896	static unsigned long slab_objects(struct kmem_cache *s,
				2897	char *buf, unsigned long flags)
				2898	{
				2899	unsigned long total = 0;
				2900	int cpu;
				2901	int node;
				2902	int x;
				2903	unsigned long *nodes;
				2904	unsigned long *per_cpu;
				2905
				2906	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
				2907	per_cpu = nodes + nr_node_ids;
				2908
				2909	for_each_possible_cpu(cpu) {
				2910	struct page *page = s->cpu_slab[cpu];
				2911	int node;
				2912
				2913	if (page) {
				2914	node = page_to_nid(page);
				2915	if (flags & SO_CPU) {
				2916	int x = 0;
				2917
				2918	if (flags & SO_OBJECTS)
				2919	x = page->inuse;
				2920	else
				2921	x = 1;
				2922	total += x;
				2923	nodes[node] += x;
				2924	}
				2925	per_cpu[node]++;
				2926	}
				2927	}
				2928
				2929	for_each_online_node(node) {
				2930	struct kmem_cache_node *n = get_node(s, node);
				2931
				2932	if (flags & SO_PARTIAL) {
				2933	if (flags & SO_OBJECTS)
				2934	x = count_partial(n);
				2935	else
				2936	x = n->nr_partial;
				2937	total += x;
				2938	nodes[node] += x;
				2939	}
				2940
				2941	if (flags & SO_FULL) {
				2942	int full_slabs = atomic_read(&n->nr_slabs)
				2943	- per_cpu[node]
				2944	- n->nr_partial;
				2945
				2946	if (flags & SO_OBJECTS)
				2947	x = full_slabs * s->objects;
				2948	else
				2949	x = full_slabs;
				2950	total += x;
				2951	nodes[node] += x;
				2952	}
				2953	}
				2954
				2955	x = sprintf(buf, "%lu", total);
				2956	#ifdef CONFIG_NUMA
				2957	for_each_online_node(node)
				2958	if (nodes[node])
				2959	x += sprintf(buf + x, " N%d=%lu",
				2960	node, nodes[node]);
				2961	#endif
				2962	kfree(nodes);
				2963	return x + sprintf(buf + x, "\n");
				2964	}
				2965
				2966	static int any_slab_objects(struct kmem_cache *s)
				2967	{
				2968	int node;
				2969	int cpu;
				2970
				2971	for_each_possible_cpu(cpu)
				2972	if (s->cpu_slab[cpu])
				2973	return 1;
				2974
				2975	for_each_node(node) {
				2976	struct kmem_cache_node *n = get_node(s, node);
				2977
				2978	if (n->nr_partial \|\| atomic_read(&n->nr_slabs))
				2979	return 1;
				2980	}
				2981	return 0;
				2982	}
				2983
				2984	#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
				2985	#define to_slab(n) container_of(n, struct kmem_cache, kobj);
				2986
				2987	struct slab_attribute {
				2988	struct attribute attr;
				2989	ssize_t (show)(struct kmem_cache s, char *buf);
				2990	ssize_t (store)(struct kmem_cache s, const char *x, size_t count);
				2991	};
				2992
/* Define a slab_attribute named <_name>_attr using __ATTR_RO(). */
#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO(_name)

/* Define a mode 0644 slab_attribute using <_name>_show and <_name>_store. */
#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr = \
	__ATTR(_name, 0644, _name##_show, _name##_store)
				2999
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3000	static ssize_t slab_size_show(struct kmem_cache s, char buf)
				3001	{
				3002	return sprintf(buf, "%d\n", s->size);
				3003	}
				3004	SLAB_ATTR_RO(slab_size);
				3005
				3006	static ssize_t align_show(struct kmem_cache s, char buf)
				3007	{
				3008	return sprintf(buf, "%d\n", s->align);
				3009	}
				3010	SLAB_ATTR_RO(align);
				3011
				3012	static ssize_t object_size_show(struct kmem_cache s, char buf)
				3013	{
				3014	return sprintf(buf, "%d\n", s->objsize);
				3015	}
				3016	SLAB_ATTR_RO(object_size);
				3017
				3018	static ssize_t objs_per_slab_show(struct kmem_cache s, char buf)
				3019	{
				3020	return sprintf(buf, "%d\n", s->objects);
				3021	}
				3022	SLAB_ATTR_RO(objs_per_slab);
				3023
				3024	static ssize_t order_show(struct kmem_cache s, char buf)
				3025	{
				3026	return sprintf(buf, "%d\n", s->order);
				3027	}
				3028	SLAB_ATTR_RO(order);
				3029
				3030	static ssize_t ctor_show(struct kmem_cache s, char buf)
				3031	{
				3032	if (s->ctor) {
				3033	int n = sprint_symbol(buf, (unsigned long)s->ctor);
				3034
				3035	return n + sprintf(buf + n, "\n");
				3036	}
				3037	return 0;
				3038	}
				3039	SLAB_ATTR_RO(ctor);
				3040
				3041	static ssize_t dtor_show(struct kmem_cache s, char buf)
				3042	{
				3043	if (s->dtor) {
				3044	int n = sprint_symbol(buf, (unsigned long)s->dtor);
				3045
				3046	return n + sprintf(buf + n, "\n");
				3047	}
				3048	return 0;
				3049	}
				3050	SLAB_ATTR_RO(dtor);
				3051
				3052	static ssize_t aliases_show(struct kmem_cache s, char buf)
				3053	{
				3054	return sprintf(buf, "%d\n", s->refcount - 1);
				3055	}
				3056	SLAB_ATTR_RO(aliases);
				3057
				3058	static ssize_t slabs_show(struct kmem_cache s, char buf)
				3059	{
				3060	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU);
				3061	}
				3062	SLAB_ATTR_RO(slabs);
				3063
				3064	static ssize_t partial_show(struct kmem_cache s, char buf)
				3065	{
				3066	return slab_objects(s, buf, SO_PARTIAL);
				3067	}
				3068	SLAB_ATTR_RO(partial);
				3069
				3070	static ssize_t cpu_slabs_show(struct kmem_cache s, char buf)
				3071	{
				3072	return slab_objects(s, buf, SO_CPU);
				3073	}
				3074	SLAB_ATTR_RO(cpu_slabs);
				3075
				3076	static ssize_t objects_show(struct kmem_cache s, char buf)
				3077	{
				3078	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU\|SO_OBJECTS);
				3079	}
				3080	SLAB_ATTR_RO(objects);
				3081
				3082	static ssize_t sanity_checks_show(struct kmem_cache s, char buf)
				3083	{
				3084	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
				3085	}
				3086
				3087	static ssize_t sanity_checks_store(struct kmem_cache *s,
				3088	const char *buf, size_t length)
				3089	{
				3090	s->flags &= ~SLAB_DEBUG_FREE;
				3091	if (buf[0] == '1')
				3092	s->flags \|= SLAB_DEBUG_FREE;
				3093	return length;
				3094	}
				3095	SLAB_ATTR(sanity_checks);
				3096
				3097	static ssize_t trace_show(struct kmem_cache s, char buf)
				3098	{
				3099	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
				3100	}
				3101
				3102	static ssize_t trace_store(struct kmem_cache s, const char buf,
				3103	size_t length)
				3104	{
				3105	s->flags &= ~SLAB_TRACE;
				3106	if (buf[0] == '1')
				3107	s->flags \|= SLAB_TRACE;
				3108	return length;
				3109	}
				3110	SLAB_ATTR(trace);
				3111
				3112	static ssize_t reclaim_account_show(struct kmem_cache s, char buf)
				3113	{
				3114	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
				3115	}
				3116
				3117	static ssize_t reclaim_account_store(struct kmem_cache *s,
				3118	const char *buf, size_t length)
				3119	{
				3120	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
				3121	if (buf[0] == '1')
				3122	s->flags \|= SLAB_RECLAIM_ACCOUNT;
				3123	return length;
				3124	}
				3125	SLAB_ATTR(reclaim_account);
				3126
				3127	static ssize_t hwcache_align_show(struct kmem_cache s, char buf)
				3128	{
Christoph Lameter	5af6083	2007-05-06 14:49:56 -0700	[diff] [blame]	3129	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3130	}
				3131	SLAB_ATTR_RO(hwcache_align);
				3132
				3133	#ifdef CONFIG_ZONE_DMA
				3134	static ssize_t cache_dma_show(struct kmem_cache s, char buf)
				3135	{
				3136	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
				3137	}
				3138	SLAB_ATTR_RO(cache_dma);
				3139	#endif
				3140
				3141	static ssize_t destroy_by_rcu_show(struct kmem_cache s, char buf)
				3142	{
				3143	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
				3144	}
				3145	SLAB_ATTR_RO(destroy_by_rcu);
				3146
				3147	static ssize_t red_zone_show(struct kmem_cache s, char buf)
				3148	{
				3149	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
				3150	}
				3151
				3152	static ssize_t red_zone_store(struct kmem_cache *s,
				3153	const char *buf, size_t length)
				3154	{
				3155	if (any_slab_objects(s))
				3156	return -EBUSY;
				3157
				3158	s->flags &= ~SLAB_RED_ZONE;
				3159	if (buf[0] == '1')
				3160	s->flags \|= SLAB_RED_ZONE;
				3161	calculate_sizes(s);
				3162	return length;
				3163	}
				3164	SLAB_ATTR(red_zone);
				3165
				3166	static ssize_t poison_show(struct kmem_cache s, char buf)
				3167	{
				3168	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
				3169	}
				3170
				3171	static ssize_t poison_store(struct kmem_cache *s,
				3172	const char *buf, size_t length)
				3173	{
				3174	if (any_slab_objects(s))
				3175	return -EBUSY;
				3176
				3177	s->flags &= ~SLAB_POISON;
				3178	if (buf[0] == '1')
				3179	s->flags \|= SLAB_POISON;
				3180	calculate_sizes(s);
				3181	return length;
				3182	}
				3183	SLAB_ATTR(poison);
				3184
				3185	static ssize_t store_user_show(struct kmem_cache s, char buf)
				3186	{
				3187	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
				3188	}
				3189
				3190	static ssize_t store_user_store(struct kmem_cache *s,
				3191	const char *buf, size_t length)
				3192	{
				3193	if (any_slab_objects(s))
				3194	return -EBUSY;
				3195
				3196	s->flags &= ~SLAB_STORE_USER;
				3197	if (buf[0] == '1')
				3198	s->flags \|= SLAB_STORE_USER;
				3199	calculate_sizes(s);
				3200	return length;
				3201	}
				3202	SLAB_ATTR(store_user);
				3203
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	3204	static ssize_t validate_show(struct kmem_cache s, char buf)
				3205	{
				3206	return 0;
				3207	}
				3208
				3209	static ssize_t validate_store(struct kmem_cache *s,
				3210	const char *buf, size_t length)
				3211	{
				3212	if (buf[0] == '1')
				3213	validate_slab_cache(s);
				3214	else
				3215	return -EINVAL;
				3216	return length;
				3217	}
				3218	SLAB_ATTR(validate);
				3219
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	3220	static ssize_t shrink_show(struct kmem_cache s, char buf)
				3221	{
				3222	return 0;
				3223	}
				3224
				3225	static ssize_t shrink_store(struct kmem_cache *s,
				3226	const char *buf, size_t length)
				3227	{
				3228	if (buf[0] == '1') {
				3229	int rc = kmem_cache_shrink(s);
				3230
				3231	if (rc)
				3232	return rc;
				3233	} else
				3234	return -EINVAL;
				3235	return length;
				3236	}
				3237	SLAB_ATTR(shrink);
				3238
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	3239	static ssize_t alloc_calls_show(struct kmem_cache s, char buf)
				3240	{
				3241	if (!(s->flags & SLAB_STORE_USER))
				3242	return -ENOSYS;
				3243	return list_locations(s, buf, TRACK_ALLOC);
				3244	}
				3245	SLAB_ATTR_RO(alloc_calls);
				3246
				3247	static ssize_t free_calls_show(struct kmem_cache s, char buf)
				3248	{
				3249	if (!(s->flags & SLAB_STORE_USER))
				3250	return -ENOSYS;
				3251	return list_locations(s, buf, TRACK_FREE);
				3252	}
				3253	SLAB_ATTR_RO(free_calls);
				3254
#ifdef CONFIG_NUMA
/* sysfs "defrag_ratio": print s->defrag_ratio divided by 10. */
static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->defrag_ratio / 10);
}

/*
 * Parse a decimal number and, if it is below 100, store ten times its
 * value in s->defrag_ratio. Other values are silently ignored. The whole
 * write is always consumed.
 *
 * The parsed value is kept unsigned so that very large inputs cannot
 * wrap to a negative number and slip past the range check.
 */
static ssize_t defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	unsigned long ratio = simple_strtoul(buf, NULL, 10);

	if (ratio < 100)
		s->defrag_ratio = ratio * 10;
	return length;
}
SLAB_ATTR(defrag_ratio);
#endif
				3272
/*
 * NULL-terminated list of the attributes defined above; referenced by
 * slab_attr_group. cache_dma and defrag_ratio are only present when the
 * corresponding config option is enabled.
 */
static struct attribute * slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&dtor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
	&shrink_attr.attr,
	&alloc_calls_attr.attr,
	&free_calls_attr.attr,
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&defrag_ratio_attr.attr,
#endif
	NULL
};
				3306
/* Attribute group that sysfs_slab_add() creates on each cache's kobject. */
static struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};
				3310
				3311	static ssize_t slab_attr_show(struct kobject *kobj,
				3312	struct attribute *attr,
				3313	char *buf)
				3314	{
				3315	struct slab_attribute *attribute;
				3316	struct kmem_cache *s;
				3317	int err;
				3318
				3319	attribute = to_slab_attr(attr);
				3320	s = to_slab(kobj);
				3321
				3322	if (!attribute->show)
				3323	return -EIO;
				3324
				3325	err = attribute->show(s, buf);
				3326
				3327	return err;
				3328	}
				3329
				3330	static ssize_t slab_attr_store(struct kobject *kobj,
				3331	struct attribute *attr,
				3332	const char *buf, size_t len)
				3333	{
				3334	struct slab_attribute *attribute;
				3335	struct kmem_cache *s;
				3336	int err;
				3337
				3338	attribute = to_slab_attr(attr);
				3339	s = to_slab(kobj);
				3340
				3341	if (!attribute->store)
				3342	return -EIO;
				3343
				3344	err = attribute->store(s, buf, len);
				3345
				3346	return err;
				3347	}
				3348
/* Route sysfs reads and writes through the slab attribute dispatchers. */
static struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};
				3353
/* kobject type of slab caches; uevent_filter() matches against it. */
static struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
};
				3357
				3358	static int uevent_filter(struct kset kset, struct kobject kobj)
				3359	{
				3360	struct kobj_type *ktype = get_ktype(kobj);
				3361
				3362	if (ktype == &slab_ktype)
				3363	return 1;
				3364	return 0;
				3365	}
				3366
/* uevent operations of the slab subsystem; only a filter is provided. */
static struct kset_uevent_ops slab_uevent_ops = {
	.filter = uevent_filter,
};

/* Declares slab_subsys, registered by slab_sysfs_init(). */
decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
				3372
				3373	#define ID_STR_LENGTH 64
				3374
				3375	/* Create a unique string id for a slab cache:
				3376	* format
				3377	* :[flags-]size:[memory address of kmemcache]
				3378	*/
				3379	static char create_unique_id(struct kmem_cache s)
				3380	{
				3381	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
				3382	char *p = name;
				3383
				3384	BUG_ON(!name);
				3385
				3386	*p++ = ':';
				3387	/*
				3388	* First flags affecting slabcache operations. We will only
				3389	* get here for aliasable slabs so we do not need to support
				3390	* too many flags. The flags here must cover all flags that
				3391	* are matched during merging to guarantee that the id is
				3392	* unique.
				3393	*/
				3394	if (s->flags & SLAB_CACHE_DMA)
				3395	*p++ = 'd';
				3396	if (s->flags & SLAB_RECLAIM_ACCOUNT)
				3397	*p++ = 'a';
				3398	if (s->flags & SLAB_DEBUG_FREE)
				3399	*p++ = 'F';
				3400	if (p != name + 1)
				3401	*p++ = '-';
				3402	p += sprintf(p, "%07d", s->size);
				3403	BUG_ON(p > name + ID_STR_LENGTH - 1);
				3404	return name;
				3405	}
				3406
				3407	static int sysfs_slab_add(struct kmem_cache *s)
				3408	{
				3409	int err;
				3410	const char *name;
				3411	int unmergeable;
				3412
				3413	if (slab_state < SYSFS)
				3414	/* Defer until later */
				3415	return 0;
				3416
				3417	unmergeable = slab_unmergeable(s);
				3418	if (unmergeable) {
				3419	/*
				3420	* Slabcache can never be merged so we can use the name proper.
				3421	* This is typically the case for debug situations. In that
				3422	* case we can catch duplicate names easily.
				3423	*/
				3424	sysfs_remove_link(&slab_subsys.kset.kobj, s->name);
				3425	name = s->name;
				3426	} else {
				3427	/*
				3428	* Create a unique name for the slab as a target
				3429	* for the symlinks.
				3430	*/
				3431	name = create_unique_id(s);
				3432	}
				3433
				3434	kobj_set_kset_s(s, slab_subsys);
				3435	kobject_set_name(&s->kobj, name);
				3436	kobject_init(&s->kobj);
				3437	err = kobject_add(&s->kobj);
				3438	if (err)
				3439	return err;
				3440
				3441	err = sysfs_create_group(&s->kobj, &slab_attr_group);
				3442	if (err)
				3443	return err;
				3444	kobject_uevent(&s->kobj, KOBJ_ADD);
				3445	if (!unmergeable) {
				3446	/* Setup first alias */
				3447	sysfs_slab_alias(s, s->name);
				3448	kfree(name);
				3449	}
				3450	return 0;
				3451	}
				3452
				3453	static void sysfs_slab_remove(struct kmem_cache *s)
				3454	{
				3455	kobject_uevent(&s->kobj, KOBJ_REMOVE);
				3456	kobject_del(&s->kobj);
				3457	}
				3458
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;		/* cache the alias refers to */
	const char *name;		/* alias name; the pointer is stored, not copied */
	struct saved_alias *next;	/* next entry of the singly linked list */
};

/* Head of the list of buffered aliases, drained by slab_sysfs_init(). */
struct saved_alias *alias_list;
				3470
				3471	static int sysfs_slab_alias(struct kmem_cache s, const char name)
				3472	{
				3473	struct saved_alias *al;
				3474
				3475	if (slab_state == SYSFS) {
				3476	/*
				3477	* If we have a leftover link then remove it.
				3478	*/
				3479	sysfs_remove_link(&slab_subsys.kset.kobj, name);
				3480	return sysfs_create_link(&slab_subsys.kset.kobj,
				3481	&s->kobj, name);
				3482	}
				3483
				3484	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
				3485	if (!al)
				3486	return -ENOMEM;
				3487
				3488	al->s = s;
				3489	al->name = name;
				3490	al->next = alias_list;
				3491	alias_list = al;
				3492	return 0;
				3493	}
				3494
				3495	static int __init slab_sysfs_init(void)
				3496	{
				3497	int err;
				3498
				3499	err = subsystem_register(&slab_subsys);
				3500	if (err) {
				3501	printk(KERN_ERR "Cannot register slab subsystem.\n");
				3502	return -ENOSYS;
				3503	}
				3504
				3505	finish_bootstrap();
				3506
				3507	while (alias_list) {
				3508	struct saved_alias *al = alias_list;
				3509
				3510	alias_list = alias_list->next;
				3511	err = sysfs_slab_alias(al->s, al->name);
				3512	BUG_ON(err);
				3513	kfree(al);
				3514	}
				3515
				3516	resiliency_test();
				3517	return 0;
				3518	}
				3519
				3520	__initcall(slab_sysfs_init);
				3521	#else
				3522	__initcall(finish_bootstrap);
				3523	#endif