Blame - mm/slub.c - kernel/msm

blob: c58a974d15acec2a1c7f1899b2e93abe11b231ad [file] [log] [blame]

Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1	/*
				2	* SLUB: A slab allocator that limits cache line use instead of queuing
				3	* objects in per cpu and per node lists.
				4	*
				5	* The allocator synchronizes using per slab locks and only
				6	* uses a centralized lock to manage a pool of partial slabs.
				7	*
				8	* (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
				9	*/
				10
				11	#include <linux/mm.h>
				12	#include <linux/module.h>
				13	#include <linux/bit_spinlock.h>
				14	#include <linux/interrupt.h>
				15	#include <linux/bitops.h>
				16	#include <linux/slab.h>
				17	#include <linux/seq_file.h>
				18	#include <linux/cpu.h>
				19	#include <linux/cpuset.h>
				20	#include <linux/mempolicy.h>
				21	#include <linux/ctype.h>
				22	#include <linux/kallsyms.h>
				23
				24	/*
				25	* Lock order:
				26	* 1. slab_lock(page)
				27	* 2. slab->list_lock
				28	*
				29	* The slab_lock protects operations on the object of a particular
				30	* slab and its metadata in the page struct. If the slab lock
				31	* has been taken then no allocations nor frees can be performed
				32	* on the objects in the slab nor can the slab be added or removed
				33	* from the partial or full lists since this would mean modifying
				34	* the page_struct of the slab.
				35	*
				36	* The list_lock protects the partial and full list on each node and
				37	* the partial slab counter. If taken then no new slabs may be added or
				38	* removed from the lists nor make the number of partial slabs be modified.
				39	* (Note that the total number of slabs is an atomic value that may be
				40	* modified without taking the list lock).
				41	*
				42	* The list_lock is a centralized lock and thus we avoid taking it as
				43	* much as possible. As long as SLUB does not have to handle partial
				44	* slabs, operations can continue without any centralized lock. F.e.
				45	* allocating a long series of objects that fill up slabs does not require
				46	* the list lock.
				47	*
				48	* The lock order is sometimes inverted when we are trying to get a slab
				49	* off a list. We take the list_lock and then look for a page on the list
				50	* to use. While we do that objects in the slabs may be freed. We can
				51	* only operate on the slab if we have also taken the slab_lock. So we use
				52	* a slab_trylock() on the slab. If trylock was successful then no frees
				53	* can occur anymore and we can use the slab for allocations etc. If the
				54	* slab_trylock() does not succeed then frees are in progress in the slab and
				55	* we must stay away from it for a while since we may cause a bouncing
				56	* cacheline if we try to acquire the lock. So go onto the next slab.
				57	* If all pages are busy then we may allocate a new slab instead of reusing
				58	* a partial slab. A new slab has no one operating on it and thus there is
				59	* no danger of cacheline contention.
				60	*
				61	* Interrupts are disabled during allocation and deallocation in order to
				62	* make the slab allocator safe to use in the context of an irq. In addition
				63	* interrupts are disabled to ensure that the processor does not change
				64	* while handling per_cpu slabs, due to kernel preemption.
				65	*
				66	* SLUB assigns one slab for allocation to each processor.
				67	* Allocations only occur from these slabs called cpu slabs.
				68	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	69	* Slabs with free elements are kept on a partial list and during regular
				70	* operations no list for full slabs is used. If an object in a full slab is
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	71	* freed then the slab will show up again on the partial lists.
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	72	* We track full slabs for debugging purposes though because otherwise we
				73	* cannot scan all objects.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	74	*
				75	* Slabs are freed when they become empty. Teardown and setup is
				76	* minimal so we rely on the page allocators per cpu caches for
				77	* fast frees and allocs.
				78	*
				79	* Overloading of page flags that are otherwise used for LRU management.
				80	*
				81	* PageActive The slab is used as a cpu cache. Allocations
				82	* may be performed from the slab. The slab is not
				83	* on any slab list and cannot be moved onto one.
				84	*
				85	* PageError Slab requires special handling due to debug
				86	* options set. This moves slab handling out of
				87	* the fast path.
				88	*/
				89
/*
 * Report whether a slab page is flagged for debug handling.
 * The flag lives in the overloaded PageError page bit.
 */
static inline int SlabDebug(struct page *page)
{
	return PageError(page);
}
				94
				95	static inline void SetSlabDebug(struct page *page)
				96	{
				97	SetPageError(page);
				98	}
				99
				100	static inline void ClearSlabDebug(struct page *page)
				101	{
				102	ClearPageError(page);
				103	}
				104
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	105	/*
				106	* Issues still to be resolved:
				107	*
				108	* - The per cpu array is updated for each new slab and is a remote
				109	* cacheline for most nodes. This could become a bouncing cacheline given
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	110	* enough frequent updates. There are 16 pointers in a cacheline, so at
				111	* max 16 cpus could compete for the cacheline which may be okay.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	112	*
				113	* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
				114	*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	115	* - Variable sizing of the per node arrays
				116	*/
				117
				118	/* Enable to test recovery from slab corruption on boot */
				119	#undef SLUB_RESILIENCY_TEST
				120
				121	#if PAGE_SHIFT <= 12
				122
				123	/*
				124	* Small page size. Make sure that we do not fragment memory
				125	*/
				126	#define DEFAULT_MAX_ORDER 1
				127	#define DEFAULT_MIN_OBJECTS 4
				128
				129	#else
				130
				131	/*
				132	* Large page machines are customarily able to handle larger
				133	* page orders.
				134	*/
				135	#define DEFAULT_MAX_ORDER 2
				136	#define DEFAULT_MIN_OBJECTS 8
				137
				138	#endif
				139
				140	/*
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	141	* Mininum number of partial slabs. These will be left on the partial
				142	* lists even if they are empty. kmem_cache_shrink may reclaim them.
				143	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	144	#define MIN_PARTIAL 2
				145
/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10
				152
/* Debug flags grouped under one name for use as a default set */
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * Set of flags that will prevent slab merging
 */
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_DESTROY_BY_RCU)

#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
		SLAB_CACHE_DMA)
				164
				165	#ifndef ARCH_KMALLOC_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	166	#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	167	#endif
				168
				169	#ifndef ARCH_SLAB_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	170	#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	171	#endif
				172
				173	/* Internal SLUB flags */
				174	#define __OBJECT_POISON 0x80000000 /* Poison object */
				175
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	176	/* Not all arches define cache_line_size */
				177	#ifndef cache_line_size
				178	#define cache_line_size() L1_CACHE_BYTES
				179	#endif
				180
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	181	static int kmem_size = sizeof(struct kmem_cache);
				182
				183	#ifdef CONFIG_SMP
				184	static struct notifier_block slab_notifier;
				185	#endif
				186
				187	static enum {
				188	DOWN, /* No slab functionality available */
				189	PARTIAL, /* kmem_cache_open() works but kmalloc does not */
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	190	UP, /* Everything works but does not show up in sysfs */
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	191	SYSFS /* Sysfs up */
				192	} slab_state = DOWN;
				193
				194	/* A list of all slab caches on the system */
				195	static DECLARE_RWSEM(slub_lock);
				196	LIST_HEAD(slab_caches);
				197
				198	#ifdef CONFIG_SYSFS
				199	static int sysfs_slab_add(struct kmem_cache *);
				200	static int sysfs_slab_alias(struct kmem_cache , const char );
				201	static void sysfs_slab_remove(struct kmem_cache *);
				202	#else
				203	static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
				204	static int sysfs_slab_alias(struct kmem_cache s, const char p) { return 0; }
				205	static void sysfs_slab_remove(struct kmem_cache *s) {}
				206	#endif
				207
				208	/********************************************************************
				209	* Core slab cache functions
				210	*******************************************************************/
				211
				212	int slab_is_available(void)
				213	{
				214	return slab_state >= UP;
				215	}
				216
				217	static inline struct kmem_cache_node get_node(struct kmem_cache s, int node)
				218	{
				219	#ifdef CONFIG_NUMA
				220	return s->node[node];
				221	#else
				222	return &s->local_node;
				223	#endif
				224	}
				225
				226	/*
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	227	* Slow version of get and set free pointer.
				228	*
				229	* This version requires touching the cache lines of kmem_cache which
				230	* we avoid to do in the fast alloc free paths. There we obtain the offset
				231	* from the page struct.
				232	*/
				233	static inline void get_freepointer(struct kmem_cache s, void *object)
				234	{
				235	return (void *)(object + s->offset);
				236	}
				237
				238	static inline void set_freepointer(struct kmem_cache s, void object, void *fp)
				239	{
				240	(void *)(object + s->offset) = fp;
				241	}
				242
				243	/* Loop over all objects in a slab */
				244	#define for_each_object(__p, __s, __addr) \
				245	for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
				246	__p += (__s)->size)
				247
				248	/* Scan freelist */
				249	#define for_each_free_object(__p, __s, __free) \
				250	for (__p = (__free); __p; __p = get_freepointer((__s), __p))
				251
				252	/* Determine object index from a given position */
				253	static inline int slab_index(void p, struct kmem_cache s, void *addr)
				254	{
				255	return (p - addr) / s->size;
				256	}
				257
				258	/*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	259	* Object debugging
				260	*/
				261	static void print_section(char text, u8 addr, unsigned int length)
				262	{
				263	int i, offset;
				264	int newline = 1;
				265	char ascii[17];
				266
				267	ascii[16] = 0;
				268
				269	for (i = 0; i < length; i++) {
				270	if (newline) {
				271	printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
				272	newline = 0;
				273	}
				274	printk(" %02x", addr[i]);
				275	offset = i % 16;
				276	ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
				277	if (offset == 15) {
				278	printk(" %s\n",ascii);
				279	newline = 1;
				280	}
				281	}
				282	if (!newline) {
				283	i %= 16;
				284	while (i < 16) {
				285	printk(" ");
				286	ascii[i] = ' ';
				287	i++;
				288	}
				289	printk(" %s\n", ascii);
				290	}
				291	}
				292
				293	/*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	294	* Tracking user of a slab.
				295	*/
				296	struct track {
				297	void addr; / Called from address */
				298	int cpu; /* Was running on cpu */
				299	int pid; /* Pid context */
				300	unsigned long when; /* When did the operation occur */
				301	};
				302
				303	enum track_item { TRACK_ALLOC, TRACK_FREE };
				304
				305	static struct track get_track(struct kmem_cache s, void *object,
				306	enum track_item alloc)
				307	{
				308	struct track *p;
				309
				310	if (s->offset)
				311	p = object + s->offset + sizeof(void *);
				312	else
				313	p = object + s->inuse;
				314
				315	return p + alloc;
				316	}
				317
				318	static void set_track(struct kmem_cache s, void object,
				319	enum track_item alloc, void *addr)
				320	{
				321	struct track *p;
				322
				323	if (s->offset)
				324	p = object + s->offset + sizeof(void *);
				325	else
				326	p = object + s->inuse;
				327
				328	p += alloc;
				329	if (addr) {
				330	p->addr = addr;
				331	p->cpu = smp_processor_id();
				332	p->pid = current ? current->pid : -1;
				333	p->when = jiffies;
				334	} else
				335	memset(p, 0, sizeof(struct track));
				336	}
				337
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	338	static void init_tracking(struct kmem_cache s, void object)
				339	{
				340	if (s->flags & SLAB_STORE_USER) {
				341	set_track(s, object, TRACK_FREE, NULL);
				342	set_track(s, object, TRACK_ALLOC, NULL);
				343	}
				344	}
				345
				346	static void print_track(const char s, struct track t)
				347	{
				348	if (!t->addr)
				349	return;
				350
				351	printk(KERN_ERR "%s: ", s);
				352	__print_symbol("%s", (unsigned long)t->addr);
				353	printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
				354	}
				355
				356	static void print_trailer(struct kmem_cache s, u8 p)
				357	{
				358	unsigned int off; /* Offset of last byte */
				359
				360	if (s->flags & SLAB_RED_ZONE)
				361	print_section("Redzone", p + s->objsize,
				362	s->inuse - s->objsize);
				363
				364	printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
				365	p + s->offset,
				366	get_freepointer(s, p));
				367
				368	if (s->offset)
				369	off = s->offset + sizeof(void *);
				370	else
				371	off = s->inuse;
				372
				373	if (s->flags & SLAB_STORE_USER) {
				374	print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
				375	print_track("Last free ", get_track(s, p, TRACK_FREE));
				376	off += 2 * sizeof(struct track);
				377	}
				378
				379	if (off != s->size)
				380	/* Beginning of the filler is the free pointer */
				381	print_section("Filler", p + off, s->size - off);
				382	}
				383
				384	static void object_err(struct kmem_cache s, struct page page,
				385	u8 object, char reason)
				386	{
				387	u8 *addr = page_address(page);
				388
				389	printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
				390	s->name, reason, object, page);
				391	printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
				392	object - addr, page->flags, page->inuse, page->freelist);
				393	if (object > addr + 16)
				394	print_section("Bytes b4", object - 16, 16);
				395	print_section("Object", object, min(s->objsize, 128));
				396	print_trailer(s, object);
				397	dump_stack();
				398	}
				399
				400	static void slab_err(struct kmem_cache s, struct page page, char *reason, ...)
				401	{
				402	va_list args;
				403	char buf[100];
				404
				405	va_start(args, reason);
				406	vsnprintf(buf, sizeof(buf), reason, args);
				407	va_end(args);
				408	printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
				409	page);
				410	dump_stack();
				411	}
				412
				413	static void init_object(struct kmem_cache s, void object, int active)
				414	{
				415	u8 *p = object;
				416
				417	if (s->flags & __OBJECT_POISON) {
				418	memset(p, POISON_FREE, s->objsize - 1);
				419	p[s->objsize -1] = POISON_END;
				420	}
				421
				422	if (s->flags & SLAB_RED_ZONE)
				423	memset(p + s->objsize,
				424	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
				425	s->inuse - s->objsize);
				426	}
				427
				428	static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
				429	{
				430	while (bytes) {
				431	if (*start != (u8)value)
				432	return 0;
				433	start++;
				434	bytes--;
				435	}
				436	return 1;
				437	}
				438
Christoph Lameter	abcd08a	2007-05-09 02:32:37 -0700	[diff] [blame]	439	static inline int check_valid_pointer(struct kmem_cache *s,
				440	struct page page, const void object)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	441	{
				442	void *base;
				443
				444	if (!object)
				445	return 1;
				446
				447	base = page_address(page);
				448	if (object < base \|\| object >= base + s->objects * s->size \|\|
				449	(object - base) % s->size) {
				450	return 0;
				451	}
				452
				453	return 1;
				454	}
				455
				456	/*
				457	* Object layout:
				458	*
				459	* object address
				460	* Bytes of the object to be managed.
				461	* If the freepointer may overlay the object then the free
				462	* pointer is the first word of the object.
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	463	*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	464	* Poisoning uses 0x6b (POISON_FREE) and the last byte is
				465	* 0xa5 (POISON_END)
				466	*
				467	* object + s->objsize
				468	* Padding to reach word boundary. This is also used for Redzoning.
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	469	* Padding is extended by another word if Redzoning is enabled and
				470	* objsize == inuse.
				471	*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	472	* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
				473	* 0xcc (RED_ACTIVE) for objects in use.
				474	*
				475	* object + s->inuse
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	476	* Meta data starts here.
				477	*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	478	* A. Free pointer (if we cannot overwrite object on free)
				479	* B. Tracking data for SLAB_STORE_USER
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	480	* C. Padding to reach required alignment boundary or at minimum
				481	* one word if debugging is on to be able to detect writes
				482	* before the word boundary.
				483	*
				484	* Padding is done using 0x5a (POISON_INUSE)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	485	*
				486	* object + s->size
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	487	* Nothing is used beyond s->size.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	488	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	489	* If slabcaches are merged then the objsize and inuse boundaries are mostly
				490	* ignored. And therefore no slab options that rely on these boundaries
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	491	* may be used with merged slabcaches.
				492	*/
				493
				494	static void restore_bytes(struct kmem_cache s, char message, u8 data,
				495	void from, void to)
				496	{
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	497	printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	498	s->name, message, data, from, to - 1);
				499	memset(from, data, to - from);
				500	}
				501
				502	static int check_pad_bytes(struct kmem_cache s, struct page page, u8 *p)
				503	{
				504	unsigned long off = s->inuse; /* The end of info */
				505
				506	if (s->offset)
				507	/* Freepointer is placed after the object. */
				508	off += sizeof(void *);
				509
				510	if (s->flags & SLAB_STORE_USER)
				511	/* We also have user information there */
				512	off += 2 * sizeof(struct track);
				513
				514	if (s->size == off)
				515	return 1;
				516
				517	if (check_bytes(p + off, POISON_INUSE, s->size - off))
				518	return 1;
				519
				520	object_err(s, page, p, "Object padding check fails");
				521
				522	/*
				523	* Restore padding
				524	*/
				525	restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
				526	return 0;
				527	}
				528
				529	static int slab_pad_check(struct kmem_cache s, struct page page)
				530	{
				531	u8 *p;
				532	int length, remainder;
				533
				534	if (!(s->flags & SLAB_POISON))
				535	return 1;
				536
				537	p = page_address(page);
				538	length = s->objects * s->size;
				539	remainder = (PAGE_SIZE << s->order) - length;
				540	if (!remainder)
				541	return 1;
				542
				543	if (!check_bytes(p + length, POISON_INUSE, remainder)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	544	slab_err(s, page, "Padding check failed");
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	545	restore_bytes(s, "slab padding", POISON_INUSE, p + length,
				546	p + length + remainder);
				547	return 0;
				548	}
				549	return 1;
				550	}
				551
				552	static int check_object(struct kmem_cache s, struct page page,
				553	void *object, int active)
				554	{
				555	u8 *p = object;
				556	u8 *endobject = object + s->objsize;
				557
				558	if (s->flags & SLAB_RED_ZONE) {
				559	unsigned int red =
				560	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
				561
				562	if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
				563	object_err(s, page, object,
				564	active ? "Redzone Active" : "Redzone Inactive");
				565	restore_bytes(s, "redzone", red,
				566	endobject, object + s->inuse);
				567	return 0;
				568	}
				569	} else {
				570	if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
				571	!check_bytes(endobject, POISON_INUSE,
				572	s->inuse - s->objsize)) {
				573	object_err(s, page, p, "Alignment padding check fails");
				574	/*
				575	* Fix it so that there will not be another report.
				576	*
				577	* Hmmm... We may be corrupting an object that now expects
				578	* to be longer than allowed.
				579	*/
				580	restore_bytes(s, "alignment padding", POISON_INUSE,
				581	endobject, object + s->inuse);
				582	}
				583	}
				584
				585	if (s->flags & SLAB_POISON) {
				586	if (!active && (s->flags & __OBJECT_POISON) &&
				587	(!check_bytes(p, POISON_FREE, s->objsize - 1) \|\|
				588	p[s->objsize - 1] != POISON_END)) {
				589
				590	object_err(s, page, p, "Poison check failed");
				591	restore_bytes(s, "Poison", POISON_FREE,
				592	p, p + s->objsize -1);
				593	restore_bytes(s, "Poison", POISON_END,
				594	p + s->objsize - 1, p + s->objsize);
				595	return 0;
				596	}
				597	/*
				598	* check_pad_bytes cleans up on its own.
				599	*/
				600	check_pad_bytes(s, page, p);
				601	}
				602
				603	if (!s->offset && active)
				604	/*
				605	* Object and freepointer overlap. Cannot check
				606	* freepointer while object is allocated.
				607	*/
				608	return 1;
				609
				610	/* Check free pointer validity */
				611	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
				612	object_err(s, page, p, "Freepointer corrupt");
				613	/*
				614	* No choice but to zap it and thus loose the remainder
				615	* of the free objects in this slab. May cause
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	616	* another error because the object count is now wrong.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	617	*/
				618	set_freepointer(s, p, NULL);
				619	return 0;
				620	}
				621	return 1;
				622	}
				623
				624	static int check_slab(struct kmem_cache s, struct page page)
				625	{
				626	VM_BUG_ON(!irqs_disabled());
				627
				628	if (!PageSlab(page)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	629	slab_err(s, page, "Not a valid slab page flags=%lx "
				630	"mapping=0x%p count=%d", page->flags, page->mapping,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	631	page_count(page));
				632	return 0;
				633	}
				634	if (page->offset * sizeof(void *) != s->offset) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	635	slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
				636	"mapping=0x%p count=%d",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	637	(unsigned long)(page->offset * sizeof(void *)),
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	638	page->flags,
				639	page->mapping,
				640	page_count(page));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	641	return 0;
				642	}
				643	if (page->inuse > s->objects) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	644	slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
				645	"mapping=0x%p count=%d",
				646	s->name, page->inuse, s->objects, page->flags,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	647	page->mapping, page_count(page));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	648	return 0;
				649	}
				650	/* Slab_pad_check fixes things up after itself */
				651	slab_pad_check(s, page);
				652	return 1;
				653	}
				654
				655	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	656	* Determine if a certain object on a page is on the freelist. Must hold the
				657	* slab lock to guarantee that the chains are in a consistent state.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	658	*/
				659	static int on_freelist(struct kmem_cache s, struct page page, void *search)
				660	{
				661	int nr = 0;
				662	void *fp = page->freelist;
				663	void *object = NULL;
				664
				665	while (fp && nr <= s->objects) {
				666	if (fp == search)
				667	return 1;
				668	if (!check_valid_pointer(s, page, fp)) {
				669	if (object) {
				670	object_err(s, page, object,
				671	"Freechain corrupt");
				672	set_freepointer(s, object, NULL);
				673	break;
				674	} else {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	675	slab_err(s, page, "Freepointer 0x%p corrupt",
				676	fp);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	677	page->freelist = NULL;
				678	page->inuse = s->objects;
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	679	printk(KERN_ERR "@@@ SLUB %s: Freelist "
				680	"cleared. Slab 0x%p\n",
				681	s->name, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	682	return 0;
				683	}
				684	break;
				685	}
				686	object = fp;
				687	fp = get_freepointer(s, object);
				688	nr++;
				689	}
				690
				691	if (page->inuse != s->objects - nr) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	692	slab_err(s, page, "Wrong object count. Counter is %d but "
				693	"counted were %d", s, page, page->inuse,
				694	s->objects - nr);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	695	page->inuse = s->objects - nr;
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	696	printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
				697	"Slab @0x%p\n", s->name, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	698	}
				699	return search == NULL;
				700	}
				701
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	702	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	703	* Tracking of fully allocated slabs for debugging purposes.
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	704	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	705	static void add_full(struct kmem_cache_node n, struct page page)
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	706	{
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	707	spin_lock(&n->list_lock);
				708	list_add(&page->lru, &n->full);
				709	spin_unlock(&n->list_lock);
				710	}
				711
				712	static void remove_full(struct kmem_cache s, struct page page)
				713	{
				714	struct kmem_cache_node *n;
				715
				716	if (!(s->flags & SLAB_STORE_USER))
				717	return;
				718
				719	n = get_node(s, page_to_nid(page));
				720
				721	spin_lock(&n->list_lock);
				722	list_del(&page->lru);
				723	spin_unlock(&n->list_lock);
				724	}
				725
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	726	static int alloc_object_checks(struct kmem_cache s, struct page page,
				727	void *object)
				728	{
				729	if (!check_slab(s, page))
				730	goto bad;
				731
				732	if (object && !on_freelist(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	733	slab_err(s, page, "Object 0x%p already allocated", object);
				734	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	735	}
				736
				737	if (!check_valid_pointer(s, page, object)) {
				738	object_err(s, page, object, "Freelist Pointer check fails");
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	739	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	740	}
				741
				742	if (!object)
				743	return 1;
				744
				745	if (!check_object(s, page, object, 0))
				746	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	747
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	748	return 1;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	749	bad:
				750	if (PageSlab(page)) {
				751	/*
				752	* If this is a slab page then lets do the best we can
				753	* to avoid issues in the future. Marking all objects
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	754	* as used avoids touching the remaining objects.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	755	*/
				756	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
				757	s->name, page);
				758	page->inuse = s->objects;
				759	page->freelist = NULL;
				760	/* Fix up fields that may be corrupted */
				761	page->offset = s->offset / sizeof(void *);
				762	}
				763	return 0;
				764	}
				765
				766	static int free_object_checks(struct kmem_cache s, struct page page,
				767	void *object)
				768	{
				769	if (!check_slab(s, page))
				770	goto fail;
				771
				772	if (!check_valid_pointer(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	773	slab_err(s, page, "Invalid object pointer 0x%p", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	774	goto fail;
				775	}
				776
				777	if (on_freelist(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	778	slab_err(s, page, "Object 0x%p already free", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	779	goto fail;
				780	}
				781
				782	if (!check_object(s, page, object, 1))
				783	return 0;
				784
				785	if (unlikely(s != page->slab)) {
				786	if (!PageSlab(page))
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	787	slab_err(s, page, "Attempt to free object(0x%p) "
				788	"outside of slab", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	789	else
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	790	if (!page->slab) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	791	printk(KERN_ERR
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	792	"SLUB <none>: no slab for object 0x%p.\n",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	793	object);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	794	dump_stack();
				795	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	796	else
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	797	slab_err(s, page, "object at 0x%p belongs "
				798	"to slab %s", object, page->slab->name);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	799	goto fail;
				800	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	801	return 1;
				802	fail:
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	803	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
				804	s->name, page, object);
				805	return 0;
				806	}
				807
				808	/*
				809	* Slab allocation and freeing
				810	*/
				811	static struct page allocate_slab(struct kmem_cache s, gfp_t flags, int node)
				812	{
				813	struct page * page;
				814	int pages = 1 << s->order;
				815
				816	if (s->order)
				817	flags \|= __GFP_COMP;
				818
				819	if (s->flags & SLAB_CACHE_DMA)
				820	flags \|= SLUB_DMA;
				821
				822	if (node == -1)
				823	page = alloc_pages(flags, s->order);
				824	else
				825	page = alloc_pages_node(node, flags, s->order);
				826
				827	if (!page)
				828	return NULL;
				829
				830	mod_zone_page_state(page_zone(page),
				831	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				832	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				833	pages);
				834
				835	return page;
				836	}
				837
				838	static void setup_object(struct kmem_cache s, struct page page,
				839	void *object)
				840	{
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	841	if (SlabDebug(page)) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	842	init_object(s, object, 0);
				843	init_tracking(s, object);
				844	}
				845
Christoph Lameter	4f10493	2007-05-06 14:50:17 -0700	[diff] [blame]	846	if (unlikely(s->ctor))
				847	s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	848	}
				849
				850	static struct page new_slab(struct kmem_cache s, gfp_t flags, int node)
				851	{
				852	struct page *page;
				853	struct kmem_cache_node *n;
				854	void *start;
				855	void *end;
				856	void *last;
				857	void *p;
				858
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	859	BUG_ON(flags & ~(GFP_DMA \| GFP_LEVEL_MASK));
				860
				861	if (flags & __GFP_WAIT)
				862	local_irq_enable();
				863
				864	page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
				865	if (!page)
				866	goto out;
				867
				868	n = get_node(s, page_to_nid(page));
				869	if (n)
				870	atomic_long_inc(&n->nr_slabs);
				871	page->offset = s->offset / sizeof(void *);
				872	page->slab = s;
				873	page->flags \|= 1 << PG_slab;
				874	if (s->flags & (SLAB_DEBUG_FREE \| SLAB_RED_ZONE \| SLAB_POISON \|
				875	SLAB_STORE_USER \| SLAB_TRACE))
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	876	SetSlabDebug(page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	877
				878	start = page_address(page);
				879	end = start + s->objects * s->size;
				880
				881	if (unlikely(s->flags & SLAB_POISON))
				882	memset(start, POISON_INUSE, PAGE_SIZE << s->order);
				883
				884	last = start;
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	885	for_each_object(p, s, start) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	886	setup_object(s, page, last);
				887	set_freepointer(s, last, p);
				888	last = p;
				889	}
				890	setup_object(s, page, last);
				891	set_freepointer(s, last, NULL);
				892
				893	page->freelist = start;
				894	page->inuse = 0;
				895	out:
				896	if (flags & __GFP_WAIT)
				897	local_irq_disable();
				898	return page;
				899	}
				900
				901	static void __free_slab(struct kmem_cache s, struct page page)
				902	{
				903	int pages = 1 << s->order;
				904
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	905	if (unlikely(SlabDebug(page) \|\| s->dtor)) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	906	void *p;
				907
				908	slab_pad_check(s, page);
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	909	for_each_object(p, s, page_address(page)) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	910	if (s->dtor)
				911	s->dtor(p, s, 0);
				912	check_object(s, page, p, 0);
				913	}
				914	}
				915
				916	mod_zone_page_state(page_zone(page),
				917	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				918	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				919	- pages);
				920
				921	page->mapping = NULL;
				922	__free_pages(page, s->order);
				923	}
				924
				925	static void rcu_free_slab(struct rcu_head *h)
				926	{
				927	struct page *page;
				928
				929	page = container_of((struct list_head *)h, struct page, lru);
				930	__free_slab(page->slab, page);
				931	}
				932
				933	static void free_slab(struct kmem_cache s, struct page page)
				934	{
				935	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
				936	/*
				937	* RCU free overloads the RCU head over the LRU
				938	*/
				939	struct rcu_head head = (void )&page->lru;
				940
				941	call_rcu(head, rcu_free_slab);
				942	} else
				943	__free_slab(s, page);
				944	}
				945
				946	static void discard_slab(struct kmem_cache s, struct page page)
				947	{
				948	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				949
				950	atomic_long_dec(&n->nr_slabs);
				951	reset_page_mapcount(page);
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	952	ClearSlabDebug(page);
				953	__ClearPageSlab(page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	954	free_slab(s, page);
				955	}
				956
				957	/*
				958	* Per slab locking using the pagelock
				959	*/
				960	static __always_inline void slab_lock(struct page *page)
				961	{
				962	bit_spin_lock(PG_locked, &page->flags);
				963	}
				964
				965	static __always_inline void slab_unlock(struct page *page)
				966	{
				967	bit_spin_unlock(PG_locked, &page->flags);
				968	}
				969
				970	static __always_inline int slab_trylock(struct page *page)
				971	{
				972	int rc = 1;
				973
				974	rc = bit_spin_trylock(PG_locked, &page->flags);
				975	return rc;
				976	}
				977
				978	/*
				979	* Management of partially allocated slabs
				980	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	981	static void add_partial_tail(struct kmem_cache_node n, struct page page)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	982	{
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	983	spin_lock(&n->list_lock);
				984	n->nr_partial++;
				985	list_add_tail(&page->lru, &n->partial);
				986	spin_unlock(&n->list_lock);
				987	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	988
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	989	static void add_partial(struct kmem_cache_node n, struct page page)
				990	{
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	991	spin_lock(&n->list_lock);
				992	n->nr_partial++;
				993	list_add(&page->lru, &n->partial);
				994	spin_unlock(&n->list_lock);
				995	}
				996
				997	static void remove_partial(struct kmem_cache *s,
				998	struct page *page)
				999	{
				1000	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				1001
				1002	spin_lock(&n->list_lock);
				1003	list_del(&page->lru);
				1004	n->nr_partial--;
				1005	spin_unlock(&n->list_lock);
				1006	}
				1007
				1008	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1009	* Lock slab and remove from the partial list.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1010	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1011	* Must hold list_lock.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1012	*/
				1013	static int lock_and_del_slab(struct kmem_cache_node n, struct page page)
				1014	{
				1015	if (slab_trylock(page)) {
				1016	list_del(&page->lru);
				1017	n->nr_partial--;
				1018	return 1;
				1019	}
				1020	return 0;
				1021	}
				1022
				1023	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1024	* Try to allocate a partial slab from a specific node.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1025	*/
				1026	static struct page get_partial_node(struct kmem_cache_node n)
				1027	{
				1028	struct page *page;
				1029
				1030	/*
				1031	* Racy check. If we mistakenly see no partial slabs then we
				1032	* just allocate an empty slab. If we mistakenly try to get a
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1033	* partial slab and there is none available then get_partials()
				1034	* will return NULL.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1035	*/
				1036	if (!n \|\| !n->nr_partial)
				1037	return NULL;
				1038
				1039	spin_lock(&n->list_lock);
				1040	list_for_each_entry(page, &n->partial, lru)
				1041	if (lock_and_del_slab(n, page))
				1042	goto out;
				1043	page = NULL;
				1044	out:
				1045	spin_unlock(&n->list_lock);
				1046	return page;
				1047	}
				1048
				1049	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1050	* Get a page from somewhere. Search in increasing NUMA distances.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1051	*/
				1052	static struct page get_any_partial(struct kmem_cache s, gfp_t flags)
				1053	{
				1054	#ifdef CONFIG_NUMA
				1055	struct zonelist *zonelist;
				1056	struct zone **z;
				1057	struct page *page;
				1058
				1059	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1060	* The defrag ratio allows a configuration of the tradeoffs between
				1061	* inter node defragmentation and node local allocations. A lower
				1062	* defrag_ratio increases the tendency to do local allocations
				1063	* instead of attempting to obtain partial slabs from other nodes.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1064	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1065	* If the defrag_ratio is set to 0 then kmalloc() always
				1066	* returns node local objects. If the ratio is higher then kmalloc()
				1067	* may return off node objects because partial slabs are obtained
				1068	* from other nodes and filled up.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1069	*
				1070	* If /sys/slab/xx/defrag_ratio is set to 100 (which makes
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1071	* defrag_ratio = 1000) then every (well almost) allocation will
				1072	* first attempt to defrag slab caches on other nodes. This means
				1073	* scanning over all nodes to look for partial slabs which may be
				1074	* expensive if we do it every time we are trying to find a slab
				1075	* with available objects.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1076	*/
				1077	if (!s->defrag_ratio \|\| get_cycles() % 1024 > s->defrag_ratio)
				1078	return NULL;
				1079
				1080	zonelist = &NODE_DATA(slab_node(current->mempolicy))
				1081	->node_zonelists[gfp_zone(flags)];
				1082	for (z = zonelist->zones; *z; z++) {
				1083	struct kmem_cache_node *n;
				1084
				1085	n = get_node(s, zone_to_nid(*z));
				1086
				1087	if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1088	n->nr_partial > MIN_PARTIAL) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1089	page = get_partial_node(n);
				1090	if (page)
				1091	return page;
				1092	}
				1093	}
				1094	#endif
				1095	return NULL;
				1096	}
				1097
				1098	/*
				1099	* Get a partial page, lock it and return it.
				1100	*/
				1101	static struct page get_partial(struct kmem_cache s, gfp_t flags, int node)
				1102	{
				1103	struct page *page;
				1104	int searchnode = (node == -1) ? numa_node_id() : node;
				1105
				1106	page = get_partial_node(get_node(s, searchnode));
				1107	if (page \|\| (flags & __GFP_THISNODE))
				1108	return page;
				1109
				1110	return get_any_partial(s, flags);
				1111	}
				1112
				1113	/*
				1114	* Move a page back to the lists.
				1115	*
				1116	* Must be called with the slab lock held.
				1117	*
				1118	* On exit the slab lock will have been dropped.
				1119	*/
				1120	static void putback_slab(struct kmem_cache s, struct page page)
				1121	{
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1122	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				1123
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1124	if (page->inuse) {
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1125
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1126	if (page->freelist)
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1127	add_partial(n, page);
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	1128	else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1129	add_full(n, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1130	slab_unlock(page);
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1131
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1132	} else {
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1133	if (n->nr_partial < MIN_PARTIAL) {
				1134	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1135	* Adding an empty slab to the partial slabs in order
				1136	* to avoid page allocator overhead. This slab needs
				1137	* to come after the other slabs with objects in
				1138	* order to fill them up. That way the size of the
				1139	* partial list stays small. kmem_cache_shrink can
				1140	* reclaim empty slabs from the partial list.
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1141	*/
				1142	add_partial_tail(n, page);
				1143	slab_unlock(page);
				1144	} else {
				1145	slab_unlock(page);
				1146	discard_slab(s, page);
				1147	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1148	}
				1149	}
				1150
				1151	/*
				1152	* Remove the cpu slab
				1153	*/
				1154	static void deactivate_slab(struct kmem_cache s, struct page page, int cpu)
				1155	{
				1156	s->cpu_slab[cpu] = NULL;
				1157	ClearPageActive(page);
				1158
				1159	putback_slab(s, page);
				1160	}
				1161
				/*
				 * Lock the given cpu slab and deactivate it. The slab lock taken
				 * here is released inside deactivate_slab() via putback_slab().
				 */
				static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
				{
					slab_lock(page);
					deactivate_slab(s, page, cpu);
				}
				1167
				1168	/*
				1169	* Flush cpu slab.
				1170	* Called from IPI handler with interrupts disabled.
				1171	*/
				1172	static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
				1173	{
				1174	struct page *page = s->cpu_slab[cpu];
				1175
				1176	if (likely(page))
				1177	flush_slab(s, page, cpu);
				1178	}
				1179
				/*
				 * Flush the cpu slab of the executing processor. The argument is the
				 * kmem_cache passed as an opaque pointer, the form used by flush_all()
				 * when handing this function to on_each_cpu().
				 */
				static void flush_cpu_slab(void *d)
				{
					struct kmem_cache *cache = d;

					__flush_cpu_slab(cache, smp_processor_id());
				}
				1187
				/*
				 * Flush the cpu slabs of a cache. On SMP flush_cpu_slab() is run on
				 * every cpu through on_each_cpu(); otherwise it is called directly
				 * with interrupts disabled.
				 */
				static void flush_all(struct kmem_cache *s)
				{
				#ifdef CONFIG_SMP
					on_each_cpu(flush_cpu_slab, s, 1, 1);
				#else
					unsigned long irq_state;

					local_irq_save(irq_state);
					flush_cpu_slab(s);
					local_irq_restore(irq_state);
				#endif
				}
				1200
				1201	/*
				1202	* slab_alloc is optimized to only modify two cachelines on the fast path
				1203	* (aside from the stack):
				1204	*
				1205	* 1. The page struct
				1206	* 2. The first cacheline of the object to be allocated.
				1207	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1208	* The only other cache lines that are read (apart from code) is the
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1209	* per cpu array in the kmem_cache struct.
				1210	*
				1211	* Fastpath is not possible if we need to get a new slab or have
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	1212	* debugging enabled (which means all slabs are marked with SlabDebug)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1213	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1214	static void slab_alloc(struct kmem_cache s,
				1215	gfp_t gfpflags, int node, void *addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1216	{
				1217	struct page *page;
				1218	void **object;
				1219	unsigned long flags;
				1220	int cpu;
				1221
				1222	local_irq_save(flags);
				1223	cpu = smp_processor_id();
				1224	page = s->cpu_slab[cpu];
				1225	if (!page)
				1226	goto new_slab;
				1227
				1228	slab_lock(page);
				1229	if (unlikely(node != -1 && page_to_nid(page) != node))
				1230	goto another_slab;
				1231	redo:
				1232	object = page->freelist;
				1233	if (unlikely(!object))
				1234	goto another_slab;
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	1235	if (unlikely(SlabDebug(page)))
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1236	goto debug;
				1237
				1238	have_object:
				1239	page->inuse++;
				1240	page->freelist = object[page->offset];
				1241	slab_unlock(page);
				1242	local_irq_restore(flags);
				1243	return object;
				1244
				1245	another_slab:
				1246	deactivate_slab(s, page, cpu);
				1247
				1248	new_slab:
				1249	page = get_partial(s, gfpflags, node);
				1250	if (likely(page)) {
				1251	have_slab:
				1252	s->cpu_slab[cpu] = page;
				1253	SetPageActive(page);
				1254	goto redo;
				1255	}
				1256
				1257	page = new_slab(s, gfpflags, node);
				1258	if (page) {
				1259	cpu = smp_processor_id();
				1260	if (s->cpu_slab[cpu]) {
				1261	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1262	* Someone else populated the cpu_slab while we
				1263	* enabled interrupts, or we have gotten scheduled
				1264	* on another cpu. The page may not be on the
				1265	* requested node even if __GFP_THISNODE was
				1266	* specified. So we need to recheck.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1267	*/
				1268	if (node == -1 \|\|
				1269	page_to_nid(s->cpu_slab[cpu]) == node) {
				1270	/*
				1271	* Current cpuslab is acceptable and we
				1272	* want the current one since its cache hot
				1273	*/
				1274	discard_slab(s, page);
				1275	page = s->cpu_slab[cpu];
				1276	slab_lock(page);
				1277	goto redo;
				1278	}
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1279	/* New slab does not fit our expectations */
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1280	flush_slab(s, s->cpu_slab[cpu], cpu);
				1281	}
				1282	slab_lock(page);
				1283	goto have_slab;
				1284	}
				1285	local_irq_restore(flags);
				1286	return NULL;
				1287	debug:
				1288	if (!alloc_object_checks(s, page, object))
				1289	goto another_slab;
				1290	if (s->flags & SLAB_STORE_USER)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1291	set_track(s, object, TRACK_ALLOC, addr);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	1292	if (s->flags & SLAB_TRACE) {
				1293	printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
				1294	s->name, object, page->inuse,
				1295	page->freelist);
				1296	dump_stack();
				1297	}
				1298	init_object(s, object, 1);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1299	goto have_object;
				1300	}
				1301
				1302	void kmem_cache_alloc(struct kmem_cache s, gfp_t gfpflags)
				1303	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1304	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1305	}
				1306	EXPORT_SYMBOL(kmem_cache_alloc);
				1307
				1308	#ifdef CONFIG_NUMA
				1309	void kmem_cache_alloc_node(struct kmem_cache s, gfp_t gfpflags, int node)
				1310	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1311	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1312	}
				1313	EXPORT_SYMBOL(kmem_cache_alloc_node);
				1314	#endif
				1315
				1316	/*
				1317	* The fastpath only writes the cacheline of the page struct and the first
				1318	* cacheline of the object.
				1319	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1320	* We read the cpu_slab cacheline to check if the slab is the per cpu
				1321	* slab for this processor.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1322	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1323	static void slab_free(struct kmem_cache s, struct page page,
				1324	void x, void addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1325	{
				1326	void *prior;
				1327	void *object = (void )x;
				1328	unsigned long flags;
				1329
				1330	local_irq_save(flags);
				1331	slab_lock(page);
				1332
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	1333	if (unlikely(SlabDebug(page)))
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1334	goto debug;
				1335	checks_ok:
				1336	prior = object[page->offset] = page->freelist;
				1337	page->freelist = object;
				1338	page->inuse--;
				1339
				1340	if (unlikely(PageActive(page)))
				1341	/*
				1342	* Cpu slabs are never on partial lists and are
				1343	* never freed.
				1344	*/
				1345	goto out_unlock;
				1346
				1347	if (unlikely(!page->inuse))
				1348	goto slab_empty;
				1349
				1350	/*
				1351	* Objects left in the slab. If it
				1352	* was not on the partial list before
				1353	* then add it.
				1354	*/
				1355	if (unlikely(!prior))
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1356	add_partial(get_node(s, page_to_nid(page)), page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1357
				1358	out_unlock:
				1359	slab_unlock(page);
				1360	local_irq_restore(flags);
				1361	return;
				1362
				1363	slab_empty:
				1364	if (prior)
				1365	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1366	* Slab still on the partial list.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1367	*/
				1368	remove_partial(s, page);
				1369
				1370	slab_unlock(page);
				1371	discard_slab(s, page);
				1372	local_irq_restore(flags);
				1373	return;
				1374
				1375	debug:
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1376	if (!free_object_checks(s, page, x))
				1377	goto out_unlock;
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1378	if (!PageActive(page) && !page->freelist)
				1379	remove_full(s, page);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1380	if (s->flags & SLAB_STORE_USER)
				1381	set_track(s, x, TRACK_FREE, addr);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	1382	if (s->flags & SLAB_TRACE) {
				1383	printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
				1384	s->name, object, page->inuse,
				1385	page->freelist);
				1386	print_section("Object", (void *)object, s->objsize);
				1387	dump_stack();
				1388	}
				1389	init_object(s, object, 0);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1390	goto checks_ok;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1391	}
				1392
				/*
				 * Free object x back to cache s. The slab page is looked up from the
				 * object address with virt_to_head_page() and the caller's return
				 * address is passed on to slab_free() as addr.
				 */
				void kmem_cache_free(struct kmem_cache *s, void *x)
				{
					struct page *page;

					page = virt_to_head_page(x);

					slab_free(s, page, x, __builtin_return_address(0));
				}
				EXPORT_SYMBOL(kmem_cache_free);
				1402
				1403	/* Figure out on which slab object the object resides */
				1404	static struct page get_object_page(const void x)
				1405	{
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1406	struct page *page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1407
				1408	if (!PageSlab(page))
				1409	return NULL;
				1410
				1411	return page;
				1412	}
				1413
				1414	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1415	* Object placement in a slab is made very easy because we always start at
				1416	* offset 0. If we tune the size of the object to the alignment then we can
				1417	* get the required alignment by putting one properly sized object after
				1418	* another.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1419	*
				1420	* Notice that the allocation order determines the sizes of the per cpu
				1421	* caches. Each processor has always one slab available for allocations.
				1422	* Increasing the allocation order reduces the number of times that slabs
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1423	* must be moved on and off the partial lists and is therefore a factor in
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1424	* locking overhead.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1425	*/
				1426
				1427	/*
				1428	* Mininum / Maximum order of slab pages. This influences locking overhead
				1429	* and slab fragmentation. A higher order reduces the number of partial slabs
				1430	* and increases the number of allocations possible without having to
				1431	* take the list_lock.
				1432	*/
				1433	static int slub_min_order;
				1434	static int slub_max_order = DEFAULT_MAX_ORDER;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1435	static int slub_min_objects = DEFAULT_MIN_OBJECTS;
				1436
				1437	/*
				1438	* Merge control. If this is set then no merging of slab caches will occur.
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1439	* (Could be removed. This was introduced to pacify the merge skeptics.)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1440	*/
				1441	static int slub_nomerge;
				1442
				1443	/*
				1444	* Debug settings:
				1445	*/
				1446	static int slub_debug;
				1447
				1448	static char *slub_debug_slabs;
				1449
				1450	/*
				1451	* Calculate the order of allocation given an slab object size.
				1452	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1453	* The order of allocation has significant impact on performance and other
				1454	* system components. Generally order 0 allocations should be preferred since
				1455	* order 0 does not cause fragmentation in the page allocator. Larger objects
				1456	* be problematic to put into order 0 slabs because there may be too much
				1457	* unused space left. We go to a higher order if more than 1/8th of the slab
				1458	* would be wasted.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1459	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1460	* In order to reach satisfactory performance we must ensure that a minimum
				1461	* number of objects is in one slab. Otherwise we may generate too much
				1462	* activity on the partial lists which requires taking the list_lock. This is
				1463	* less a concern for large slabs though which are rarely used.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1464	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1465	* slub_max_order specifies the order where we begin to stop considering the
				1466	* number of objects in a slab as critical. If we reach slub_max_order then
				1467	* we try to keep the page order as low as possible. So we accept more waste
				1468	* of space in favor of a small page order.
				1469	*
				1470	* Higher order allocations also allow the placement of more objects in a
				1471	* slab and thereby reduce object handling overhead. If the user has
				1472	* requested a higher mininum order then we start with that one instead of
				1473	* the smallest order which will fit the object.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1474	*/
				1475	static int calculate_order(int size)
				1476	{
				1477	int order;
				1478	int rem;
				1479
				1480	for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
				1481	order < MAX_ORDER; order++) {
				1482	unsigned long slab_size = PAGE_SIZE << order;
				1483
				1484	if (slub_max_order > order &&
				1485	slab_size < slub_min_objects * size)
				1486	continue;
				1487
				1488	if (slab_size < size)
				1489	continue;
				1490
				1491	rem = slab_size % size;
				1492
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1493	if (rem <= slab_size / 8)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1494	break;
				1495
				1496	}
				1497	if (order >= MAX_ORDER)
				1498	return -E2BIG;
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1499
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1500	return order;
				1501	}
				1502
				1503	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1504	* Figure out what the alignment of the objects will be.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1505	*/
				1506	static unsigned long calculate_alignment(unsigned long flags,
				1507	unsigned long align, unsigned long size)
				1508	{
				1509	/*
				1510	* If the user wants hardware cache aligned objects then
				1511	* follow that suggestion if the object is sufficiently
				1512	* large.
				1513	*
				1514	* The hardware cache alignment cannot override the
				1515	* specified alignment though. If that is greater
				1516	* then use it.
				1517	*/
Christoph Lameter	5af6083	2007-05-06 14:49:56 -0700	[diff] [blame]	1518	if ((flags & SLAB_HWCACHE_ALIGN) &&
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	1519	size > cache_line_size() / 2)
				1520	return max_t(unsigned long, align, cache_line_size());
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1521
				1522	if (align < ARCH_SLAB_MINALIGN)
				1523	return ARCH_SLAB_MINALIGN;
				1524
				1525	return ALIGN(align, sizeof(void *));
				1526	}
				1527
				1528	static void init_kmem_cache_node(struct kmem_cache_node *n)
				1529	{
				1530	n->nr_partial = 0;
				1531	atomic_long_set(&n->nr_slabs, 0);
				1532	spin_lock_init(&n->list_lock);
				1533	INIT_LIST_HEAD(&n->partial);
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1534	INIT_LIST_HEAD(&n->full);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1535	}
				1536
				1537	#ifdef CONFIG_NUMA
				1538	/*
				1539	* No kmalloc_node yet so do it by hand. We know that this is the first
				1540	* slab on the node for this slabcache. There are no concurrent accesses
				1541	* possible.
				1542	*
				1543	* Note that this function only works on the kmalloc_node_cache
				1544	* when allocating for the kmalloc_node_cache.
				1545	*/
				1546	static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
				1547	int node)
				1548	{
				1549	struct page *page;
				1550	struct kmem_cache_node *n;
				1551
				1552	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
				1553
				1554	page = new_slab(kmalloc_caches, gfpflags \| GFP_THISNODE, node);
				1555	/* new_slab() disables interupts */
				1556	local_irq_enable();
				1557
				1558	BUG_ON(!page);
				1559	n = page->freelist;
				1560	BUG_ON(!n);
				1561	page->freelist = get_freepointer(kmalloc_caches, n);
				1562	page->inuse++;
				1563	kmalloc_caches->node[node] = n;
				1564	init_object(kmalloc_caches, n, 1);
				1565	init_kmem_cache_node(n);
				1566	atomic_long_inc(&n->nr_slabs);
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1567	add_partial(n, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1568	return n;
				1569	}
				1570
				1571	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1572	{
				1573	int node;
				1574
				1575	for_each_online_node(node) {
				1576	struct kmem_cache_node *n = s->node[node];
				1577	if (n && n != &s->local_node)
				1578	kmem_cache_free(kmalloc_caches, n);
				1579	s->node[node] = NULL;
				1580	}
				1581	}
				1582
				1583	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1584	{
				1585	int node;
				1586	int local_node;
				1587
				1588	if (slab_state >= UP)
				1589	local_node = page_to_nid(virt_to_page(s));
				1590	else
				1591	local_node = 0;
				1592
				1593	for_each_online_node(node) {
				1594	struct kmem_cache_node *n;
				1595
				1596	if (local_node == node)
				1597	n = &s->local_node;
				1598	else {
				1599	if (slab_state == DOWN) {
				1600	n = early_kmem_cache_node_alloc(gfpflags,
				1601	node);
				1602	continue;
				1603	}
				1604	n = kmem_cache_alloc_node(kmalloc_caches,
				1605	gfpflags, node);
				1606
				1607	if (!n) {
				1608	free_kmem_cache_nodes(s);
				1609	return 0;
				1610	}
				1611
				1612	}
				1613	s->node[node] = n;
				1614	init_kmem_cache_node(n);
				1615	}
				1616	return 1;
				1617	}
				1618	#else
				1619	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1620	{
				1621	}
				1622
				1623	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1624	{
				1625	init_kmem_cache_node(&s->local_node);
				1626	return 1;
				1627	}
				1628	#endif
				1629
				1630	/*
				1631	* calculate_sizes() determines the order and the distribution of data within
				1632	* a slab object.
				1633	*/
				1634	static int calculate_sizes(struct kmem_cache *s)
				1635	{
				1636	unsigned long flags = s->flags;
				1637	unsigned long size = s->objsize;
				1638	unsigned long align = s->align;
				1639
				1640	/*
				1641	* Determine if we can poison the object itself. If the user of
				1642	* the slab may touch the object after free or before allocation
				1643	* then we should never poison the object itself.
				1644	*/
				1645	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
				1646	!s->ctor && !s->dtor)
				1647	s->flags \|= __OBJECT_POISON;
				1648	else
				1649	s->flags &= ~__OBJECT_POISON;
				1650
				1651	/*
				1652	* Round up object size to the next word boundary. We can only
				1653	* place the free pointer at word boundaries and this determines
				1654	* the possible location of the free pointer.
				1655	*/
				1656	size = ALIGN(size, sizeof(void *));
				1657
				1658	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1659	* If we are Redzoning then check if there is some space between the
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1660	* end of the object and the free pointer. If not then add an
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1661	* additional word to have some bytes to store Redzone information.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1662	*/
				1663	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
				1664	size += sizeof(void *);
				1665
				1666	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1667	* With that we have determined the number of bytes in actual use
				1668	* by the object. This is the potential offset to the free pointer.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1669	*/
				1670	s->inuse = size;
				1671
				1672	if (((flags & (SLAB_DESTROY_BY_RCU \| SLAB_POISON)) \|\|
				1673	s->ctor \|\| s->dtor)) {
				1674	/*
				1675	* Relocate free pointer after the object if it is not
				1676	* permitted to overwrite the first word of the object on
				1677	* kmem_cache_free.
				1678	*
				1679	* This is the case if we do RCU, have a constructor or
				1680	* destructor or are poisoning the objects.
				1681	*/
				1682	s->offset = size;
				1683	size += sizeof(void *);
				1684	}
				1685
				1686	if (flags & SLAB_STORE_USER)
				1687	/*
				1688	* Need to store information about allocs and frees after
				1689	* the object.
				1690	*/
				1691	size += 2 * sizeof(struct track);
				1692
Christoph Lameter	be7b3fb	2007-05-09 02:32:36 -0700	[diff] [blame]	1693	if (flags & SLAB_RED_ZONE)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1694	/*
				1695	* Add some empty padding so that we can catch
				1696	* overwrites from earlier objects rather than let
				1697	* tracking information or the free pointer be
				1698	* corrupted if an user writes before the start
				1699	* of the object.
				1700	*/
				1701	size += sizeof(void *);
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1702
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1703	/*
				1704	* Determine the alignment based on various parameters that the
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	1705	* user specified and the dynamic determination of cache line size
				1706	* on bootup.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1707	*/
				1708	align = calculate_alignment(flags, align, s->objsize);
				1709
				1710	/*
				1711	* SLUB stores one object immediately after another beginning from
				1712	* offset 0. In order to align the objects we have to simply size
				1713	* each object to conform to the alignment.
				1714	*/
				1715	size = ALIGN(size, align);
				1716	s->size = size;
				1717
				1718	s->order = calculate_order(size);
				1719	if (s->order < 0)
				1720	return 0;
				1721
				1722	/*
				1723	* Determine the number of objects per slab
				1724	*/
				1725	s->objects = (PAGE_SIZE << s->order) / size;
				1726
				1727	/*
				1728	* Verify that the number of objects is within permitted limits.
				1729	* The page->inuse field is only 16 bit wide! So we cannot have
				1730	* more than 64k objects per slab.
				1731	*/
				1732	if (!s->objects \|\| s->objects > 65535)
				1733	return 0;
				1734	return 1;
				1735
				1736	}
				1737
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1738	static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
				1739	const char *name, size_t size,
				1740	size_t align, unsigned long flags,
				1741	void (ctor)(void , struct kmem_cache *, unsigned long),
				1742	void (dtor)(void , struct kmem_cache *, unsigned long))
				1743	{
				1744	memset(s, 0, kmem_size);
				1745	s->name = name;
				1746	s->ctor = ctor;
				1747	s->dtor = dtor;
				1748	s->objsize = size;
				1749	s->flags = flags;
				1750	s->align = align;
				1751
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1752	/*
				1753	* The page->offset field is only 16 bit wide. This is an offset
				1754	* in units of words from the beginning of an object. If the slab
				1755	* size is bigger then we cannot move the free pointer behind the
				1756	* object anymore.
				1757	*
				1758	* On 32 bit platforms the limit is 256k. On 64bit platforms
				1759	* the limit is 512k.
				1760	*
				1761	* Debugging or ctor/dtors may create a need to move the free
				1762	* pointer. Fail if this happens.
				1763	*/
				1764	if (s->size >= 65535 * sizeof(void *)) {
				1765	BUG_ON(flags & (SLAB_RED_ZONE \| SLAB_POISON \|
				1766	SLAB_STORE_USER \| SLAB_DESTROY_BY_RCU));
				1767	BUG_ON(ctor \|\| dtor);
				1768	}
				1769	else
				1770	/*
				1771	* Enable debugging if selected on the kernel commandline.
				1772	*/
				1773	if (slub_debug && (!slub_debug_slabs \|\|
				1774	strncmp(slub_debug_slabs, name,
				1775	strlen(slub_debug_slabs)) == 0))
				1776	s->flags \|= slub_debug;
				1777
				1778	if (!calculate_sizes(s))
				1779	goto error;
				1780
				1781	s->refcount = 1;
				1782	#ifdef CONFIG_NUMA
				1783	s->defrag_ratio = 100;
				1784	#endif
				1785
				1786	if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
				1787	return 1;
				1788	error:
				1789	if (flags & SLAB_PANIC)
				1790	panic("Cannot create slab %s size=%lu realsize=%u "
				1791	"order=%u offset=%u flags=%lx\n",
				1792	s->name, (unsigned long)size, s->size, s->order,
				1793	s->offset, flags);
				1794	return 0;
				1795	}
				1796	EXPORT_SYMBOL(kmem_cache_open);
				1797
				1798	/*
				1799	* Check if a given pointer is valid
				1800	*/
				1801	int kmem_ptr_validate(struct kmem_cache s, const void object)
				1802	{
				1803	struct page * page;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1804
				1805	page = get_object_page(object);
				1806
				1807	if (!page \|\| s != page->slab)
				1808	/* No slab or wrong slab */
				1809	return 0;
				1810
Christoph Lameter	abcd08a	2007-05-09 02:32:37 -0700	[diff] [blame]	1811	if (!check_valid_pointer(s, page, object))
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1812	return 0;
				1813
				1814	/*
				1815	* We could also check if the object is on the slabs freelist.
				1816	* But this would be too expensive and it seems that the main
				1817	* purpose of kmem_ptr_valid is to check if the object belongs
				1818	* to a certain slab.
				1819	*/
				1820	return 1;
				1821	}
				1822	EXPORT_SYMBOL(kmem_ptr_validate);
				1823
				1824	/*
				1825	* Determine the size of a slab object
				1826	*/
				1827	unsigned int kmem_cache_size(struct kmem_cache *s)
				1828	{
				1829	return s->objsize;
				1830	}
				1831	EXPORT_SYMBOL(kmem_cache_size);
				1832
				1833	const char kmem_cache_name(struct kmem_cache s)
				1834	{
				1835	return s->name;
				1836	}
				1837	EXPORT_SYMBOL(kmem_cache_name);
				1838
				1839	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1840	* Attempt to free all slabs on a node. Return the number of slabs we
				1841	* were unable to free.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1842	*/
				1843	static int free_list(struct kmem_cache s, struct kmem_cache_node n,
				1844	struct list_head *list)
				1845	{
				1846	int slabs_inuse = 0;
				1847	unsigned long flags;
				1848	struct page page, h;
				1849
				1850	spin_lock_irqsave(&n->list_lock, flags);
				1851	list_for_each_entry_safe(page, h, list, lru)
				1852	if (!page->inuse) {
				1853	list_del(&page->lru);
				1854	discard_slab(s, page);
				1855	} else
				1856	slabs_inuse++;
				1857	spin_unlock_irqrestore(&n->list_lock, flags);
				1858	return slabs_inuse;
				1859	}
				1860
				1861	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	1862	* Release all resources used by a slab cache.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1863	*/
				1864	static int kmem_cache_close(struct kmem_cache *s)
				1865	{
				1866	int node;
				1867
				1868	flush_all(s);
				1869
				1870	/* Attempt to free all objects */
				1871	for_each_online_node(node) {
				1872	struct kmem_cache_node *n = get_node(s, node);
				1873
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	1874	n->nr_partial -= free_list(s, n, &n->partial);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1875	if (atomic_long_read(&n->nr_slabs))
				1876	return 1;
				1877	}
				1878	free_kmem_cache_nodes(s);
				1879	return 0;
				1880	}
				1881
				1882	/*
				1883	* Close a cache and release the kmem_cache structure
				1884	* (must be used for caches created using kmem_cache_create)
				1885	*/
				1886	void kmem_cache_destroy(struct kmem_cache *s)
				1887	{
				1888	down_write(&slub_lock);
				1889	s->refcount--;
				1890	if (!s->refcount) {
				1891	list_del(&s->list);
				1892	if (kmem_cache_close(s))
				1893	WARN_ON(1);
				1894	sysfs_slab_remove(s);
				1895	kfree(s);
				1896	}
				1897	up_write(&slub_lock);
				1898	}
				1899	EXPORT_SYMBOL(kmem_cache_destroy);
				1900
				1901	/********************************************************************
				1902	* Kmalloc subsystem
				1903	*******************************************************************/
				1904
				/*
				 * General purpose caches backing kmalloc, indexed by kmalloc_index().
				 * kmem_cache_init() puts the 96 and 192 byte caches in slots 1 and 2,
				 * the power-of-two caches in slots KMALLOC_SHIFT_LOW..KMALLOC_SHIFT_HIGH
				 * and, under CONFIG_NUMA, the kmem_cache_node cache in slot 0.
				 */
				1905	struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
				1906	EXPORT_SYMBOL(kmalloc_caches);
				1907
				1908	#ifdef CONFIG_ZONE_DMA
				/* DMA counterparts of the caches above; get_slab() fills slots on first use. */
				1909	static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
				1910	#endif
				1911
				1912	static int __init setup_slub_min_order(char *str)
				1913	{
				1914	get_option (&str, &slub_min_order);
				1915
				1916	return 1;
				1917	}
				1918
				1919	__setup("slub_min_order=", setup_slub_min_order);
				1920
				1921	static int __init setup_slub_max_order(char *str)
				1922	{
				1923	get_option (&str, &slub_max_order);
				1924
				1925	return 1;
				1926	}
				1927
				1928	__setup("slub_max_order=", setup_slub_max_order);
				1929
				1930	static int __init setup_slub_min_objects(char *str)
				1931	{
				1932	get_option (&str, &slub_min_objects);
				1933
				1934	return 1;
				1935	}
				1936
				1937	__setup("slub_min_objects=", setup_slub_min_objects);
				1938
				1939	static int __init setup_slub_nomerge(char *str)
				1940	{
				1941	slub_nomerge = 1;
				1942	return 1;
				1943	}
				1944
				1945	__setup("slub_nomerge", setup_slub_nomerge);
				1946
				1947	static int __init setup_slub_debug(char *str)
				1948	{
				1949	if (!str \|\| *str != '=')
				1950	slub_debug = DEBUG_DEFAULT_FLAGS;
				1951	else {
				1952	str++;
				1953	if (str == 0 \|\| str == ',')
				1954	slub_debug = DEBUG_DEFAULT_FLAGS;
				1955	else
				1956	for( ;str && str != ','; str++)
				1957	switch (*str) {
				1958	case 'f' : case 'F' :
				1959	slub_debug \|= SLAB_DEBUG_FREE;
				1960	break;
				1961	case 'z' : case 'Z' :
				1962	slub_debug \|= SLAB_RED_ZONE;
				1963	break;
				1964	case 'p' : case 'P' :
				1965	slub_debug \|= SLAB_POISON;
				1966	break;
				1967	case 'u' : case 'U' :
				1968	slub_debug \|= SLAB_STORE_USER;
				1969	break;
				1970	case 't' : case 'T' :
				1971	slub_debug \|= SLAB_TRACE;
				1972	break;
				1973	default:
				1974	printk(KERN_ERR "slub_debug option '%c' "
				1975	"unknown. skipped\n",*str);
				1976	}
				1977	}
				1978
				1979	if (*str == ',')
				1980	slub_debug_slabs = str + 1;
				1981	return 1;
				1982	}
				1983
				1984	__setup("slub_debug", setup_slub_debug);
				1985
				1986	static struct kmem_cache create_kmalloc_cache(struct kmem_cache s,
				1987	const char *name, int size, gfp_t gfp_flags)
				1988	{
				1989	unsigned int flags = 0;
				1990
				1991	if (gfp_flags & SLUB_DMA)
				1992	flags = SLAB_CACHE_DMA;
				1993
				1994	down_write(&slub_lock);
				1995	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
				1996	flags, NULL, NULL))
				1997	goto panic;
				1998
				1999	list_add(&s->list, &slab_caches);
				2000	up_write(&slub_lock);
				2001	if (sysfs_slab_add(s))
				2002	goto panic;
				2003	return s;
				2004
				2005	panic:
				2006	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
				2007	}
				2008
				2009	static struct kmem_cache *get_slab(size_t size, gfp_t flags)
				2010	{
				2011	int index = kmalloc_index(size);
				2012
Christoph Lameter	614410d	2007-05-06 14:49:38 -0700	[diff] [blame]	2013	if (!index)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2014	return NULL;
				2015
				2016	/* Allocation too large? */
				2017	BUG_ON(index < 0);
				2018
				2019	#ifdef CONFIG_ZONE_DMA
				2020	if ((flags & SLUB_DMA)) {
				2021	struct kmem_cache *s;
				2022	struct kmem_cache *x;
				2023	char *text;
				2024	size_t realsize;
				2025
				2026	s = kmalloc_caches_dma[index];
				2027	if (s)
				2028	return s;
				2029
				2030	/* Dynamically create dma cache */
				2031	x = kmalloc(kmem_size, flags & ~SLUB_DMA);
				2032	if (!x)
				2033	panic("Unable to allocate memory for dma cache\n");
				2034
				2035	if (index <= KMALLOC_SHIFT_HIGH)
				2036	realsize = 1 << index;
				2037	else {
				2038	if (index == 1)
				2039	realsize = 96;
				2040	else
				2041	realsize = 192;
				2042	}
				2043
				2044	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
				2045	(unsigned int)realsize);
				2046	s = create_kmalloc_cache(x, text, realsize, flags);
				2047	kmalloc_caches_dma[index] = s;
				2048	return s;
				2049	}
				2050	#endif
				2051	return &kmalloc_caches[index];
				2052	}
				2053
				2054	void *__kmalloc(size_t size, gfp_t flags)
				2055	{
				2056	struct kmem_cache *s = get_slab(size, flags);
				2057
				2058	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2059	return slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2060	return NULL;
				2061	}
				2062	EXPORT_SYMBOL(__kmalloc);
				2063
				2064	#ifdef CONFIG_NUMA
				2065	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				2066	{
				2067	struct kmem_cache *s = get_slab(size, flags);
				2068
				2069	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2070	return slab_alloc(s, flags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2071	return NULL;
				2072	}
				2073	EXPORT_SYMBOL(__kmalloc_node);
				2074	#endif
				2075
				2076	size_t ksize(const void *object)
				2077	{
				2078	struct page *page = get_object_page(object);
				2079	struct kmem_cache *s;
				2080
				2081	BUG_ON(!page);
				2082	s = page->slab;
				2083	BUG_ON(!s);
				2084
				2085	/*
				2086	* Debugging requires use of the padding between object
				2087	* and whatever may come after it.
				2088	*/
				2089	if (s->flags & (SLAB_RED_ZONE \| SLAB_POISON))
				2090	return s->objsize;
				2091
				2092	/*
				2093	* If we have the need to store the freelist pointer
				2094	* back there or track user information then we can
				2095	* only use the space before that information.
				2096	*/
				2097	if (s->flags & (SLAB_DESTROY_BY_RCU \| SLAB_STORE_USER))
				2098	return s->inuse;
				2099
				2100	/*
				2101	* Else we can use all the padding etc for the allocation
				2102	*/
				2103	return s->size;
				2104	}
				2105	EXPORT_SYMBOL(ksize);
				2106
				2107	void kfree(const void *x)
				2108	{
				2109	struct kmem_cache *s;
				2110	struct page *page;
				2111
				2112	if (!x)
				2113	return;
				2114
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2115	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2116	s = page->slab;
				2117
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2118	slab_free(s, page, (void *)x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2119	}
				2120	EXPORT_SYMBOL(kfree);
				2121
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2122	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2123	* kmem_cache_shrink removes empty slabs from the partial lists and sorts
				2124	* the remaining slabs by the number of items in use. The slabs with the
				2125	* most items in use come first. New allocations will then fill those up
				2126	* and thus they can be removed from the partial lists.
				2127	*
				2128	* The slabs with the least items are placed last. This results in them
				2129	* being allocated from last increasing the chance that the last objects
				2130	* are freed in them.
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2131	*/
				2132	int kmem_cache_shrink(struct kmem_cache *s)
				2133	{
				2134	int node;
				2135	int i;
				2136	struct kmem_cache_node *n;
				2137	struct page *page;
				2138	struct page *t;
				2139	struct list_head *slabs_by_inuse =
				2140	kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
				2141	unsigned long flags;
				2142
				2143	if (!slabs_by_inuse)
				2144	return -ENOMEM;
				2145
				2146	flush_all(s);
				2147	for_each_online_node(node) {
				2148	n = get_node(s, node);
				2149
				2150	if (!n->nr_partial)
				2151	continue;
				2152
				2153	for (i = 0; i < s->objects; i++)
				2154	INIT_LIST_HEAD(slabs_by_inuse + i);
				2155
				2156	spin_lock_irqsave(&n->list_lock, flags);
				2157
				2158	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2159	* Build lists indexed by the items in use in each slab.
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2160	*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2161	* Note that concurrent frees may occur while we hold the
				2162	* list_lock. page->inuse here is the upper limit.
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2163	*/
				2164	list_for_each_entry_safe(page, t, &n->partial, lru) {
				2165	if (!page->inuse && slab_trylock(page)) {
				2166	/*
				2167	* Must hold slab lock here because slab_free
				2168	* may have freed the last object and be
				2169	* waiting to release the slab.
				2170	*/
				2171	list_del(&page->lru);
				2172	n->nr_partial--;
				2173	slab_unlock(page);
				2174	discard_slab(s, page);
				2175	} else {
				2176	if (n->nr_partial > MAX_PARTIAL)
				2177	list_move(&page->lru,
				2178	slabs_by_inuse + page->inuse);
				2179	}
				2180	}
				2181
				2182	if (n->nr_partial <= MAX_PARTIAL)
				2183	goto out;
				2184
				2185	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2186	* Rebuild the partial list with the slabs filled up most
				2187	* first and the least used slabs at the end.
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2188	*/
				2189	for (i = s->objects - 1; i >= 0; i--)
				2190	list_splice(slabs_by_inuse + i, n->partial.prev);
				2191
				2192	out:
				2193	spin_unlock_irqrestore(&n->list_lock, flags);
				2194	}
				2195
				2196	kfree(slabs_by_inuse);
				2197	return 0;
				2198	}
				2199	EXPORT_SYMBOL(kmem_cache_shrink);
				2200
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2201	/**
				2202	* krealloc - reallocate memory. The contents will remain unchanged.
				2203	*
				2204	* @p: object to reallocate memory for.
				2205	* @new_size: how many bytes of memory are required.
				2206	* @flags: the type of memory to allocate.
				2207	*
				2208	* The contents of the object pointed to are preserved up to the
				2209	* lesser of the new and old sizes. If @p is %NULL, krealloc()
				2210	* behaves exactly like kmalloc(). If @size is 0 and @p is not a
				2211	* %NULL pointer, the object pointed to is freed.
				2212	*/
				2213	void krealloc(const void p, size_t new_size, gfp_t flags)
				2214	{
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2215	void *ret;
Christoph Lameter	1f99a28	2007-05-09 02:32:38 -0700	[diff] [blame]	2216	size_t ks;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2217
				2218	if (unlikely(!p))
				2219	return kmalloc(new_size, flags);
				2220
				2221	if (unlikely(!new_size)) {
				2222	kfree(p);
				2223	return NULL;
				2224	}
				2225
Christoph Lameter	1f99a28	2007-05-09 02:32:38 -0700	[diff] [blame]	2226	ks = ksize(p);
				2227	if (ks >= new_size)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2228	return (void *)p;
				2229
				2230	ret = kmalloc(new_size, flags);
				2231	if (ret) {
Christoph Lameter	1f99a28	2007-05-09 02:32:38 -0700	[diff] [blame]	2232	memcpy(ret, p, min(new_size, ks));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2233	kfree(p);
				2234	}
				2235	return ret;
				2236	}
				2237	EXPORT_SYMBOL(krealloc);
				2238
				2239	/********************************************************************
				2240	* Basic setup of slabs
				2241	*******************************************************************/
				2242
				2243	void __init kmem_cache_init(void)
				2244	{
				2245	int i;
				2246
				2247	#ifdef CONFIG_NUMA
				2248	/*
				2249	* Must first have the slab cache available for the allocations of the
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2250	* struct kmem_cache_node's. There is special bootstrap code in
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2251	* kmem_cache_open for slab_state == DOWN.
				2252	*/
				2253	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
				2254	sizeof(struct kmem_cache_node), GFP_KERNEL);
				2255	#endif
				2256
				2257	/* Able to allocate the per node structures */
				2258	slab_state = PARTIAL;
				2259
				2260	/* Caches that are not of the two-to-the-power-of size */
				2261	create_kmalloc_cache(&kmalloc_caches[1],
				2262	"kmalloc-96", 96, GFP_KERNEL);
				2263	create_kmalloc_cache(&kmalloc_caches[2],
				2264	"kmalloc-192", 192, GFP_KERNEL);
				2265
				2266	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2267	create_kmalloc_cache(&kmalloc_caches[i],
				2268	"kmalloc", 1 << i, GFP_KERNEL);
				2269
				2270	slab_state = UP;
				2271
				2272	/* Provide the correct kmalloc names now that the caches are up */
				2273	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2274	kmalloc_caches[i]. name =
				2275	kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
				2276
				2277	#ifdef CONFIG_SMP
				2278	register_cpu_notifier(&slab_notifier);
				2279	#endif
				2280
				2281	if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
				2282	kmem_size = offsetof(struct kmem_cache, cpu_slab)
				2283	+ nr_cpu_ids * sizeof(struct page *);
				2284
				2285	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
				2286	" Processors=%d, Nodes=%d\n",
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	2287	KMALLOC_SHIFT_HIGH, cache_line_size(),
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2288	slub_min_order, slub_max_order, slub_min_objects,
				2289	nr_cpu_ids, nr_node_ids);
				2290	}
				2291
				2292	/*
				2293	* Find a mergeable slab cache
				2294	*/
				2295	static int slab_unmergeable(struct kmem_cache *s)
				2296	{
				2297	if (slub_nomerge \|\| (s->flags & SLUB_NEVER_MERGE))
				2298	return 1;
				2299
				2300	if (s->ctor \|\| s->dtor)
				2301	return 1;
				2302
				2303	return 0;
				2304	}
				2305
				2306	static struct kmem_cache *find_mergeable(size_t size,
				2307	size_t align, unsigned long flags,
				2308	void (ctor)(void , struct kmem_cache *, unsigned long),
				2309	void (dtor)(void , struct kmem_cache *, unsigned long))
				2310	{
				2311	struct list_head *h;
				2312
				2313	if (slub_nomerge \|\| (flags & SLUB_NEVER_MERGE))
				2314	return NULL;
				2315
				2316	if (ctor \|\| dtor)
				2317	return NULL;
				2318
				2319	size = ALIGN(size, sizeof(void *));
				2320	align = calculate_alignment(flags, align, size);
				2321	size = ALIGN(size, align);
				2322
				2323	list_for_each(h, &slab_caches) {
				2324	struct kmem_cache *s =
				2325	container_of(h, struct kmem_cache, list);
				2326
				2327	if (slab_unmergeable(s))
				2328	continue;
				2329
				2330	if (size > s->size)
				2331	continue;
				2332
				2333	if (((flags \| slub_debug) & SLUB_MERGE_SAME) !=
				2334	(s->flags & SLUB_MERGE_SAME))
				2335	continue;
				2336	/*
				2337	* Check if alignment is compatible.
				2338	* Courtesy of Adrian Drzewiecki
				2339	*/
				2340	if ((s->size & ~(align -1)) != s->size)
				2341	continue;
				2342
				2343	if (s->size - size >= sizeof(void *))
				2344	continue;
				2345
				2346	return s;
				2347	}
				2348	return NULL;
				2349	}
				2350
				2351	struct kmem_cache kmem_cache_create(const char name, size_t size,
				2352	size_t align, unsigned long flags,
				2353	void (ctor)(void , struct kmem_cache *, unsigned long),
				2354	void (dtor)(void , struct kmem_cache *, unsigned long))
				2355	{
				2356	struct kmem_cache *s;
				2357
				2358	down_write(&slub_lock);
				2359	s = find_mergeable(size, align, flags, dtor, ctor);
				2360	if (s) {
				2361	s->refcount++;
				2362	/*
				2363	* Adjust the object sizes so that we clear
				2364	* the complete object on kzalloc.
				2365	*/
				2366	s->objsize = max(s->objsize, (int)size);
				2367	s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
				2368	if (sysfs_slab_alias(s, name))
				2369	goto err;
				2370	} else {
				2371	s = kmalloc(kmem_size, GFP_KERNEL);
				2372	if (s && kmem_cache_open(s, GFP_KERNEL, name,
				2373	size, align, flags, ctor, dtor)) {
				2374	if (sysfs_slab_add(s)) {
				2375	kfree(s);
				2376	goto err;
				2377	}
				2378	list_add(&s->list, &slab_caches);
				2379	} else
				2380	kfree(s);
				2381	}
				2382	up_write(&slub_lock);
				2383	return s;
				2384
				2385	err:
				2386	up_write(&slub_lock);
				2387	if (flags & SLAB_PANIC)
				2388	panic("Cannot create slabcache %s\n", name);
				2389	else
				2390	s = NULL;
				2391	return s;
				2392	}
				2393	EXPORT_SYMBOL(kmem_cache_create);
				2394
				2395	void kmem_cache_zalloc(struct kmem_cache s, gfp_t flags)
				2396	{
				2397	void *x;
				2398
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2399	x = slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2400	if (x)
				2401	memset(x, 0, s->objsize);
				2402	return x;
				2403	}
				2404	EXPORT_SYMBOL(kmem_cache_zalloc);
				2405
				2406	#ifdef CONFIG_SMP
				2407	static void for_all_slabs(void (func)(struct kmem_cache , int), int cpu)
				2408	{
				2409	struct list_head *h;
				2410
				2411	down_read(&slub_lock);
				2412	list_for_each(h, &slab_caches) {
				2413	struct kmem_cache *s =
				2414	container_of(h, struct kmem_cache, list);
				2415
				2416	func(s, cpu);
				2417	}
				2418	up_read(&slub_lock);
				2419	}
				2420
				2421	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2422	* Use the cpu notifier to insure that the cpu slabs are flushed when
				2423	* necessary.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2424	*/
				2425	static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
				2426	unsigned long action, void *hcpu)
				2427	{
				2428	long cpu = (long)hcpu;
				2429
				2430	switch (action) {
				2431	case CPU_UP_CANCELED:
				2432	case CPU_DEAD:
				2433	for_all_slabs(__flush_cpu_slab, cpu);
				2434	break;
				2435	default:
				2436	break;
				2437	}
				2438	return NOTIFY_OK;
				2439	}
				2440
				2441	static struct notifier_block __cpuinitdata slab_notifier =
				2442	{ &slab_cpuup_callback, NULL, 0 };
				2443
				2444	#endif
				2445
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2446	#ifdef CONFIG_NUMA
				2447
				2448	/*****************************************************************
				2449	* Generic reaper used to support the page allocator
				2450	* (the cpu slabs are reaped by a per slab workqueue).
				2451	*
				2452	* Maybe move this to the page allocator?
				2453	****************************************************************/
				2454
				2455	static DEFINE_PER_CPU(unsigned long, reap_node);
				2456
				2457	static void init_reap_node(int cpu)
				2458	{
				2459	int node;
				2460
				2461	node = next_node(cpu_to_node(cpu), node_online_map);
				2462	if (node == MAX_NUMNODES)
				2463	node = first_node(node_online_map);
				2464
				2465	__get_cpu_var(reap_node) = node;
				2466	}
				2467
				2468	static void next_reap_node(void)
				2469	{
				2470	int node = __get_cpu_var(reap_node);
				2471
				2472	/*
				2473	* Also drain per cpu pages on remote zones
				2474	*/
				2475	if (node != numa_node_id())
				2476	drain_node_pages(node);
				2477
				2478	node = next_node(node, node_online_map);
				2479	if (unlikely(node >= MAX_NUMNODES))
				2480	node = first_node(node_online_map);
				2481	__get_cpu_var(reap_node) = node;
				2482	}
				2483	#else
				2484	#define init_reap_node(cpu) do { } while (0)
				2485	#define next_reap_node(void) do { } while (0)
				2486	#endif
				2487
				2488	#define REAPTIMEOUT_CPUC (2*HZ)
				2489
				2490	#ifdef CONFIG_SMP
				2491	static DEFINE_PER_CPU(struct delayed_work, reap_work);
				2492
				2493	static void cache_reap(struct work_struct *unused)
				2494	{
				2495	next_reap_node();
				2496	refresh_cpu_vm_stats(smp_processor_id());
				2497	schedule_delayed_work(&__get_cpu_var(reap_work),
				2498	REAPTIMEOUT_CPUC);
				2499	}
				2500
				2501	static void __devinit start_cpu_timer(int cpu)
				2502	{
				2503	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
				2504
				2505	/*
				2506	* When this gets called from do_initcalls via cpucache_init(),
				2507	* init_workqueues() has already run, so keventd will be setup
				2508	* at that time.
				2509	*/
				2510	if (keventd_up() && reap_work->work.func == NULL) {
				2511	init_reap_node(cpu);
				2512	INIT_DELAYED_WORK(reap_work, cache_reap);
				2513	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
				2514	}
				2515	}
				2516
				2517	static int __init cpucache_init(void)
				2518	{
				2519	int cpu;
				2520
				2521	/*
				2522	* Register the timers that drain pcp pages and update vm statistics
				2523	*/
				2524	for_each_online_cpu(cpu)
				2525	start_cpu_timer(cpu);
				2526	return 0;
				2527	}
				2528	__initcall(cpucache_init);
				2529	#endif
				2530
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2531	void __kmalloc_track_caller(size_t size, gfp_t gfpflags, void caller)
				2532	{
				2533	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2534
				2535	if (!s)
				2536	return NULL;
				2537
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2538	return slab_alloc(s, gfpflags, -1, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2539	}
				2540
				2541	void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
				2542	int node, void *caller)
				2543	{
				2544	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2545
				2546	if (!s)
				2547	return NULL;
				2548
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2549	return slab_alloc(s, gfpflags, node, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2550	}
				2551
				2552	#ifdef CONFIG_SYSFS
				2553
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2554	static int validate_slab(struct kmem_cache s, struct page page)
				2555	{
				2556	void *p;
				2557	void *addr = page_address(page);
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	2558	DECLARE_BITMAP(map, s->objects);
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2559
				2560	if (!check_slab(s, page) \|\|
				2561	!on_freelist(s, page, NULL))
				2562	return 0;
				2563
				2564	/* Now we know that a valid freelist exists */
				2565	bitmap_zero(map, s->objects);
				2566
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	2567	for_each_free_object(p, s, page->freelist) {
				2568	set_bit(slab_index(p, s, addr), map);
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2569	if (!check_object(s, page, p, 0))
				2570	return 0;
				2571	}
				2572
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	2573	for_each_object(p, s, addr)
				2574	if (!test_bit(slab_index(p, s, addr), map))
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2575	if (!check_object(s, page, p, 1))
				2576	return 0;
				2577	return 1;
				2578	}
				2579
				2580	static void validate_slab_slab(struct kmem_cache s, struct page page)
				2581	{
				2582	if (slab_trylock(page)) {
				2583	validate_slab(s, page);
				2584	slab_unlock(page);
				2585	} else
				2586	printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
				2587	s->name, page);
				2588
				2589	if (s->flags & DEBUG_DEFAULT_FLAGS) {
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	2590	if (!SlabDebug(page))
				2591	printk(KERN_ERR "SLUB %s: SlabDebug not set "
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2592	"on slab 0x%p\n", s->name, page);
				2593	} else {
Christoph Lameter	35e5d7e	2007-05-09 02:32:42 -0700	[diff] [blame^]	2594	if (SlabDebug(page))
				2595	printk(KERN_ERR "SLUB %s: SlabDebug set on "
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2596	"slab 0x%p\n", s->name, page);
				2597	}
				2598	}
				2599
				2600	static int validate_slab_node(struct kmem_cache s, struct kmem_cache_node n)
				2601	{
				2602	unsigned long count = 0;
				2603	struct page *page;
				2604	unsigned long flags;
				2605
				2606	spin_lock_irqsave(&n->list_lock, flags);
				2607
				2608	list_for_each_entry(page, &n->partial, lru) {
				2609	validate_slab_slab(s, page);
				2610	count++;
				2611	}
				2612	if (count != n->nr_partial)
				2613	printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
				2614	"counter=%ld\n", s->name, count, n->nr_partial);
				2615
				2616	if (!(s->flags & SLAB_STORE_USER))
				2617	goto out;
				2618
				2619	list_for_each_entry(page, &n->full, lru) {
				2620	validate_slab_slab(s, page);
				2621	count++;
				2622	}
				2623	if (count != atomic_long_read(&n->nr_slabs))
				2624	printk(KERN_ERR "SLUB: %s %ld slabs counted but "
				2625	"counter=%ld\n", s->name, count,
				2626	atomic_long_read(&n->nr_slabs));
				2627
				2628	out:
				2629	spin_unlock_irqrestore(&n->list_lock, flags);
				2630	return count;
				2631	}
				2632
				2633	static unsigned long validate_slab_cache(struct kmem_cache *s)
				2634	{
				2635	int node;
				2636	unsigned long count = 0;
				2637
				2638	flush_all(s);
				2639	for_each_online_node(node) {
				2640	struct kmem_cache_node *n = get_node(s, node);
				2641
				2642	count += validate_slab_node(s, n);
				2643	}
				2644	return count;
				2645	}
				2646
Christoph Lameter	b345970	2007-05-09 02:32:41 -0700	[diff] [blame]	2647	#ifdef SLUB_RESILIENCY_TEST
				2648	static void resiliency_test(void)
				2649	{
				2650	u8 *p;
				2651
				2652	printk(KERN_ERR "SLUB resiliency testing\n");
				2653	printk(KERN_ERR "-----------------------\n");
				2654	printk(KERN_ERR "A. Corruption after allocation\n");
				2655
				2656	p = kzalloc(16, GFP_KERNEL);
				2657	p[16] = 0x12;
				2658	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
				2659	" 0x12->0x%p\n\n", p + 16);
				2660
				2661	validate_slab_cache(kmalloc_caches + 4);
				2662
				2663	/* Hmmm... The next two are dangerous */
				2664	p = kzalloc(32, GFP_KERNEL);
				2665	p[32 + sizeof(void *)] = 0x34;
				2666	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
				2667	" 0x34 -> -0x%p\n", p);
				2668	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
				2669
				2670	validate_slab_cache(kmalloc_caches + 5);
				2671	p = kzalloc(64, GFP_KERNEL);
				2672	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
				2673	*p = 0x56;
				2674	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
				2675	p);
				2676	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
				2677	validate_slab_cache(kmalloc_caches + 6);
				2678
				2679	printk(KERN_ERR "\nB. Corruption after free\n");
				2680	p = kzalloc(128, GFP_KERNEL);
				2681	kfree(p);
				2682	*p = 0x78;
				2683	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
				2684	validate_slab_cache(kmalloc_caches + 7);
				2685
				2686	p = kzalloc(256, GFP_KERNEL);
				2687	kfree(p);
				2688	p[50] = 0x9a;
				2689	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
				2690	validate_slab_cache(kmalloc_caches + 8);
				2691
				2692	p = kzalloc(512, GFP_KERNEL);
				2693	kfree(p);
				2694	p[512] = 0xab;
				2695	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
				2696	validate_slab_cache(kmalloc_caches + 9);
				2697	}
				2698	#else
				2699	static void resiliency_test(void) {};
				2700	#endif
				2701
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2702	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2703	* Generate lists of code addresses where slabcache objects are allocated
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2704	* and freed.
				2705	*/
				2706
				2707	struct location {
				2708	unsigned long count;
				2709	void *addr;
				2710	};
				2711
				2712	struct loc_track {
				2713	unsigned long max;
				2714	unsigned long count;
				2715	struct location *loc;
				2716	};
				2717
				2718	static void free_loc_track(struct loc_track *t)
				2719	{
				2720	if (t->max)
				2721	free_pages((unsigned long)t->loc,
				2722	get_order(sizeof(struct location) * t->max));
				2723	}
				2724
				2725	static int alloc_loc_track(struct loc_track *t, unsigned long max)
				2726	{
				2727	struct location *l;
				2728	int order;
				2729
				2730	if (!max)
				2731	max = PAGE_SIZE / sizeof(struct location);
				2732
				2733	order = get_order(sizeof(struct location) * max);
				2734
				2735	l = (void *)__get_free_pages(GFP_KERNEL, order);
				2736
				2737	if (!l)
				2738	return 0;
				2739
				2740	if (t->count) {
				2741	memcpy(l, t->loc, sizeof(struct location) * t->count);
				2742	free_loc_track(t);
				2743	}
				2744	t->max = max;
				2745	t->loc = l;
				2746	return 1;
				2747	}
				2748
				2749	static int add_location(struct loc_track t, struct kmem_cache s,
				2750	void *addr)
				2751	{
				2752	long start, end, pos;
				2753	struct location *l;
				2754	void *caddr;
				2755
				2756	start = -1;
				2757	end = t->count;
				2758
				2759	for ( ; ; ) {
				2760	pos = start + (end - start + 1) / 2;
				2761
				2762	/*
				2763	* There is nothing at "end". If we end up there
				2764	* we need to add something to before end.
				2765	*/
				2766	if (pos == end)
				2767	break;
				2768
				2769	caddr = t->loc[pos].addr;
				2770	if (addr == caddr) {
				2771	t->loc[pos].count++;
				2772	return 1;
				2773	}
				2774
				2775	if (addr < caddr)
				2776	end = pos;
				2777	else
				2778	start = pos;
				2779	}
				2780
				2781	/*
Christoph Lameter	672bba3	2007-05-09 02:32:39 -0700	[diff] [blame]	2782	* Not found. Insert new tracking element.
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2783	*/
				2784	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
				2785	return 0;
				2786
				2787	l = t->loc + pos;
				2788	if (pos < t->count)
				2789	memmove(l + 1, l,
				2790	(t->count - pos) * sizeof(struct location));
				2791	t->count++;
				2792	l->count = 1;
				2793	l->addr = addr;
				2794	return 1;
				2795	}
				2796
				2797	static void process_slab(struct loc_track t, struct kmem_cache s,
				2798	struct page *page, enum track_item alloc)
				2799	{
				2800	void *addr = page_address(page);
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	2801	DECLARE_BITMAP(map, s->objects);
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2802	void *p;
				2803
				2804	bitmap_zero(map, s->objects);
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	2805	for_each_free_object(p, s, page->freelist)
				2806	set_bit(slab_index(p, s, addr), map);
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2807
Christoph Lameter	7656c72	2007-05-09 02:32:40 -0700	[diff] [blame]	2808	for_each_object(p, s, addr)
				2809	if (!test_bit(slab_index(p, s, addr), map)) {
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	2810	void *addr = get_track(s, p, alloc)->addr;
				2811
				2812	add_location(t, s, addr);
				2813	}
				2814	}
				2815
				2816	static int list_locations(struct kmem_cache s, char buf,
				2817	enum track_item alloc)
				2818	{
				2819	int n = 0;
				2820	unsigned long i;
				2821	struct loc_track t;
				2822	int node;
				2823
				2824	t.count = 0;
				2825	t.max = 0;
				2826
				2827	/* Push back cpu slabs */
				2828	flush_all(s);
				2829
				2830	for_each_online_node(node) {
				2831	struct kmem_cache_node *n = get_node(s, node);
				2832	unsigned long flags;
				2833	struct page *page;
				2834
				2835	if (!atomic_read(&n->nr_slabs))
				2836	continue;
				2837
				2838	spin_lock_irqsave(&n->list_lock, flags);
				2839	list_for_each_entry(page, &n->partial, lru)
				2840	process_slab(&t, s, page, alloc);
				2841	list_for_each_entry(page, &n->full, lru)
				2842	process_slab(&t, s, page, alloc);
				2843	spin_unlock_irqrestore(&n->list_lock, flags);
				2844	}
				2845
				2846	for (i = 0; i < t.count; i++) {
				2847	void *addr = t.loc[i].addr;
				2848
				2849	if (n > PAGE_SIZE - 100)
				2850	break;
				2851	n += sprintf(buf + n, "%7ld ", t.loc[i].count);
				2852	if (addr)
				2853	n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
				2854	else
				2855	n += sprintf(buf + n, "<not-available>");
				2856	n += sprintf(buf + n, "\n");
				2857	}
				2858
				2859	free_loc_track(&t);
				2860	if (!t.count)
				2861	n += sprintf(buf, "No data\n");
				2862	return n;
				2863	}
				2864
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2865	static unsigned long count_partial(struct kmem_cache_node *n)
				2866	{
				2867	unsigned long flags;
				2868	unsigned long x = 0;
				2869	struct page *page;
				2870
				2871	spin_lock_irqsave(&n->list_lock, flags);
				2872	list_for_each_entry(page, &n->partial, lru)
				2873	x += page->inuse;
				2874	spin_unlock_irqrestore(&n->list_lock, flags);
				2875	return x;
				2876	}
				2877
				2878	enum slab_stat_type {
				2879	SL_FULL,
				2880	SL_PARTIAL,
				2881	SL_CPU,
				2882	SL_OBJECTS
				2883	};
				2884
				2885	#define SO_FULL (1 << SL_FULL)
				2886	#define SO_PARTIAL (1 << SL_PARTIAL)
				2887	#define SO_CPU (1 << SL_CPU)
				2888	#define SO_OBJECTS (1 << SL_OBJECTS)
				2889
				2890	static unsigned long slab_objects(struct kmem_cache *s,
				2891	char *buf, unsigned long flags)
				2892	{
				2893	unsigned long total = 0;
				2894	int cpu;
				2895	int node;
				2896	int x;
				2897	unsigned long *nodes;
				2898	unsigned long *per_cpu;
				2899
				2900	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
				2901	per_cpu = nodes + nr_node_ids;
				2902
				2903	for_each_possible_cpu(cpu) {
				2904	struct page *page = s->cpu_slab[cpu];
				2905	int node;
				2906
				2907	if (page) {
				2908	node = page_to_nid(page);
				2909	if (flags & SO_CPU) {
				2910	int x = 0;
				2911
				2912	if (flags & SO_OBJECTS)
				2913	x = page->inuse;
				2914	else
				2915	x = 1;
				2916	total += x;
				2917	nodes[node] += x;
				2918	}
				2919	per_cpu[node]++;
				2920	}
				2921	}
				2922
				2923	for_each_online_node(node) {
				2924	struct kmem_cache_node *n = get_node(s, node);
				2925
				2926	if (flags & SO_PARTIAL) {
				2927	if (flags & SO_OBJECTS)
				2928	x = count_partial(n);
				2929	else
				2930	x = n->nr_partial;
				2931	total += x;
				2932	nodes[node] += x;
				2933	}
				2934
				2935	if (flags & SO_FULL) {
				2936	int full_slabs = atomic_read(&n->nr_slabs)
				2937	- per_cpu[node]
				2938	- n->nr_partial;
				2939
				2940	if (flags & SO_OBJECTS)
				2941	x = full_slabs * s->objects;
				2942	else
				2943	x = full_slabs;
				2944	total += x;
				2945	nodes[node] += x;
				2946	}
				2947	}
				2948
				2949	x = sprintf(buf, "%lu", total);
				2950	#ifdef CONFIG_NUMA
				2951	for_each_online_node(node)
				2952	if (nodes[node])
				2953	x += sprintf(buf + x, " N%d=%lu",
				2954	node, nodes[node]);
				2955	#endif
				2956	kfree(nodes);
				2957	return x + sprintf(buf + x, "\n");
				2958	}
				2959
				2960	static int any_slab_objects(struct kmem_cache *s)
				2961	{
				2962	int node;
				2963	int cpu;
				2964
				2965	for_each_possible_cpu(cpu)
				2966	if (s->cpu_slab[cpu])
				2967	return 1;
				2968
				2969	for_each_node(node) {
				2970	struct kmem_cache_node *n = get_node(s, node);
				2971
				2972	if (n->nr_partial \|\| atomic_read(&n->nr_slabs))
				2973	return 1;
				2974	}
				2975	return 0;
				2976	}
				2977
				2978	#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
				2979	#define to_slab(n) container_of(n, struct kmem_cache, kobj);
				2980
				2981	struct slab_attribute {
				2982	struct attribute attr;
				2983	ssize_t (show)(struct kmem_cache s, char *buf);
				2984	ssize_t (store)(struct kmem_cache s, const char *x, size_t count);
				2985	};
				2986
				2987	#define SLAB_ATTR_RO(_name) \
				2988	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
				2989
				2990	#define SLAB_ATTR(_name) \
				2991	static struct slab_attribute _name##_attr = \
				2992	__ATTR(_name, 0644, _name##_show, _name##_store)
				2993
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2994	static ssize_t slab_size_show(struct kmem_cache s, char buf)
				2995	{
				2996	return sprintf(buf, "%d\n", s->size);
				2997	}
				2998	SLAB_ATTR_RO(slab_size);
				2999
				3000	static ssize_t align_show(struct kmem_cache s, char buf)
				3001	{
				3002	return sprintf(buf, "%d\n", s->align);
				3003	}
				3004	SLAB_ATTR_RO(align);
				3005
				3006	static ssize_t object_size_show(struct kmem_cache s, char buf)
				3007	{
				3008	return sprintf(buf, "%d\n", s->objsize);
				3009	}
				3010	SLAB_ATTR_RO(object_size);
				3011
				3012	static ssize_t objs_per_slab_show(struct kmem_cache s, char buf)
				3013	{
				3014	return sprintf(buf, "%d\n", s->objects);
				3015	}
				3016	SLAB_ATTR_RO(objs_per_slab);
				3017
				3018	static ssize_t order_show(struct kmem_cache s, char buf)
				3019	{
				3020	return sprintf(buf, "%d\n", s->order);
				3021	}
				3022	SLAB_ATTR_RO(order);
				3023
				3024	static ssize_t ctor_show(struct kmem_cache s, char buf)
				3025	{
				3026	if (s->ctor) {
				3027	int n = sprint_symbol(buf, (unsigned long)s->ctor);
				3028
				3029	return n + sprintf(buf + n, "\n");
				3030	}
				3031	return 0;
				3032	}
				3033	SLAB_ATTR_RO(ctor);
				3034
				3035	static ssize_t dtor_show(struct kmem_cache s, char buf)
				3036	{
				3037	if (s->dtor) {
				3038	int n = sprint_symbol(buf, (unsigned long)s->dtor);
				3039
				3040	return n + sprintf(buf + n, "\n");
				3041	}
				3042	return 0;
				3043	}
				3044	SLAB_ATTR_RO(dtor);
				3045
				3046	static ssize_t aliases_show(struct kmem_cache s, char buf)
				3047	{
				3048	return sprintf(buf, "%d\n", s->refcount - 1);
				3049	}
				3050	SLAB_ATTR_RO(aliases);
				3051
				3052	static ssize_t slabs_show(struct kmem_cache s, char buf)
				3053	{
				3054	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU);
				3055	}
				3056	SLAB_ATTR_RO(slabs);
				3057
				3058	static ssize_t partial_show(struct kmem_cache s, char buf)
				3059	{
				3060	return slab_objects(s, buf, SO_PARTIAL);
				3061	}
				3062	SLAB_ATTR_RO(partial);
				3063
				3064	static ssize_t cpu_slabs_show(struct kmem_cache s, char buf)
				3065	{
				3066	return slab_objects(s, buf, SO_CPU);
				3067	}
				3068	SLAB_ATTR_RO(cpu_slabs);
				3069
				3070	static ssize_t objects_show(struct kmem_cache s, char buf)
				3071	{
				3072	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU\|SO_OBJECTS);
				3073	}
				3074	SLAB_ATTR_RO(objects);
				3075
				3076	static ssize_t sanity_checks_show(struct kmem_cache s, char buf)
				3077	{
				3078	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
				3079	}
				3080
				3081	static ssize_t sanity_checks_store(struct kmem_cache *s,
				3082	const char *buf, size_t length)
				3083	{
				3084	s->flags &= ~SLAB_DEBUG_FREE;
				3085	if (buf[0] == '1')
				3086	s->flags \|= SLAB_DEBUG_FREE;
				3087	return length;
				3088	}
				3089	SLAB_ATTR(sanity_checks);
				3090
				3091	static ssize_t trace_show(struct kmem_cache s, char buf)
				3092	{
				3093	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
				3094	}
				3095
				3096	static ssize_t trace_store(struct kmem_cache s, const char buf,
				3097	size_t length)
				3098	{
				3099	s->flags &= ~SLAB_TRACE;
				3100	if (buf[0] == '1')
				3101	s->flags \|= SLAB_TRACE;
				3102	return length;
				3103	}
				3104	SLAB_ATTR(trace);
				3105
				3106	static ssize_t reclaim_account_show(struct kmem_cache s, char buf)
				3107	{
				3108	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
				3109	}
				3110
				3111	static ssize_t reclaim_account_store(struct kmem_cache *s,
				3112	const char *buf, size_t length)
				3113	{
				3114	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
				3115	if (buf[0] == '1')
				3116	s->flags \|= SLAB_RECLAIM_ACCOUNT;
				3117	return length;
				3118	}
				3119	SLAB_ATTR(reclaim_account);
				3120
				3121	static ssize_t hwcache_align_show(struct kmem_cache s, char buf)
				3122	{
Christoph Lameter	5af6083	2007-05-06 14:49:56 -0700	[diff] [blame]	3123	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3124	}
				3125	SLAB_ATTR_RO(hwcache_align);
				3126
				3127	#ifdef CONFIG_ZONE_DMA
				3128	static ssize_t cache_dma_show(struct kmem_cache s, char buf)
				3129	{
				3130	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
				3131	}
				3132	SLAB_ATTR_RO(cache_dma);
				3133	#endif
				3134
				3135	static ssize_t destroy_by_rcu_show(struct kmem_cache s, char buf)
				3136	{
				3137	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
				3138	}
				3139	SLAB_ATTR_RO(destroy_by_rcu);
				3140
				3141	static ssize_t red_zone_show(struct kmem_cache s, char buf)
				3142	{
				3143	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
				3144	}
				3145
				3146	static ssize_t red_zone_store(struct kmem_cache *s,
				3147	const char *buf, size_t length)
				3148	{
				3149	if (any_slab_objects(s))
				3150	return -EBUSY;
				3151
				3152	s->flags &= ~SLAB_RED_ZONE;
				3153	if (buf[0] == '1')
				3154	s->flags \|= SLAB_RED_ZONE;
				3155	calculate_sizes(s);
				3156	return length;
				3157	}
				3158	SLAB_ATTR(red_zone);
				3159
				3160	static ssize_t poison_show(struct kmem_cache s, char buf)
				3161	{
				3162	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
				3163	}
				3164
				3165	static ssize_t poison_store(struct kmem_cache *s,
				3166	const char *buf, size_t length)
				3167	{
				3168	if (any_slab_objects(s))
				3169	return -EBUSY;
				3170
				3171	s->flags &= ~SLAB_POISON;
				3172	if (buf[0] == '1')
				3173	s->flags \|= SLAB_POISON;
				3174	calculate_sizes(s);
				3175	return length;
				3176	}
				3177	SLAB_ATTR(poison);
				3178
				3179	static ssize_t store_user_show(struct kmem_cache s, char buf)
				3180	{
				3181	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
				3182	}
				3183
				3184	static ssize_t store_user_store(struct kmem_cache *s,
				3185	const char *buf, size_t length)
				3186	{
				3187	if (any_slab_objects(s))
				3188	return -EBUSY;
				3189
				3190	s->flags &= ~SLAB_STORE_USER;
				3191	if (buf[0] == '1')
				3192	s->flags \|= SLAB_STORE_USER;
				3193	calculate_sizes(s);
				3194	return length;
				3195	}
				3196	SLAB_ATTR(store_user);
				3197
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	3198	static ssize_t validate_show(struct kmem_cache s, char buf)
				3199	{
				3200	return 0;
				3201	}
				3202
				3203	static ssize_t validate_store(struct kmem_cache *s,
				3204	const char *buf, size_t length)
				3205	{
				3206	if (buf[0] == '1')
				3207	validate_slab_cache(s);
				3208	else
				3209	return -EINVAL;
				3210	return length;
				3211	}
				3212	SLAB_ATTR(validate);
				3213
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	3214	static ssize_t shrink_show(struct kmem_cache s, char buf)
				3215	{
				3216	return 0;
				3217	}
				3218
				3219	static ssize_t shrink_store(struct kmem_cache *s,
				3220	const char *buf, size_t length)
				3221	{
				3222	if (buf[0] == '1') {
				3223	int rc = kmem_cache_shrink(s);
				3224
				3225	if (rc)
				3226	return rc;
				3227	} else
				3228	return -EINVAL;
				3229	return length;
				3230	}
				3231	SLAB_ATTR(shrink);
				3232
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	3233	static ssize_t alloc_calls_show(struct kmem_cache s, char buf)
				3234	{
				3235	if (!(s->flags & SLAB_STORE_USER))
				3236	return -ENOSYS;
				3237	return list_locations(s, buf, TRACK_ALLOC);
				3238	}
				3239	SLAB_ATTR_RO(alloc_calls);
				3240
				3241	static ssize_t free_calls_show(struct kmem_cache s, char buf)
				3242	{
				3243	if (!(s->flags & SLAB_STORE_USER))
				3244	return -ENOSYS;
				3245	return list_locations(s, buf, TRACK_FREE);
				3246	}
				3247	SLAB_ATTR_RO(free_calls);
				3248
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3249	#ifdef CONFIG_NUMA
				3250	static ssize_t defrag_ratio_show(struct kmem_cache s, char buf)
				3251	{
				3252	return sprintf(buf, "%d\n", s->defrag_ratio / 10);
				3253	}
				3254
				3255	static ssize_t defrag_ratio_store(struct kmem_cache *s,
				3256	const char *buf, size_t length)
				3257	{
				3258	int n = simple_strtoul(buf, NULL, 10);
				3259
				3260	if (n < 100)
				3261	s->defrag_ratio = n * 10;
				3262	return length;
				3263	}
				3264	SLAB_ATTR(defrag_ratio);
				3265	#endif
				3266
				3267	static struct attribute * slab_attrs[] = {
				3268	&slab_size_attr.attr,
				3269	&object_size_attr.attr,
				3270	&objs_per_slab_attr.attr,
				3271	&order_attr.attr,
				3272	&objects_attr.attr,
				3273	&slabs_attr.attr,
				3274	&partial_attr.attr,
				3275	&cpu_slabs_attr.attr,
				3276	&ctor_attr.attr,
				3277	&dtor_attr.attr,
				3278	&aliases_attr.attr,
				3279	&align_attr.attr,
				3280	&sanity_checks_attr.attr,
				3281	&trace_attr.attr,
				3282	&hwcache_align_attr.attr,
				3283	&reclaim_account_attr.attr,
				3284	&destroy_by_rcu_attr.attr,
				3285	&red_zone_attr.attr,
				3286	&poison_attr.attr,
				3287	&store_user_attr.attr,
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	3288	&validate_attr.attr,
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	3289	&shrink_attr.attr,
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	3290	&alloc_calls_attr.attr,
				3291	&free_calls_attr.attr,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3292	#ifdef CONFIG_ZONE_DMA
				3293	&cache_dma_attr.attr,
				3294	#endif
				3295	#ifdef CONFIG_NUMA
				3296	&defrag_ratio_attr.attr,
				3297	#endif
				3298	NULL
				3299	};
				3300
				3301	static struct attribute_group slab_attr_group = {
				3302	.attrs = slab_attrs,
				3303	};
				3304
				3305	static ssize_t slab_attr_show(struct kobject *kobj,
				3306	struct attribute *attr,
				3307	char *buf)
				3308	{
				3309	struct slab_attribute *attribute;
				3310	struct kmem_cache *s;
				3311	int err;
				3312
				3313	attribute = to_slab_attr(attr);
				3314	s = to_slab(kobj);
				3315
				3316	if (!attribute->show)
				3317	return -EIO;
				3318
				3319	err = attribute->show(s, buf);
				3320
				3321	return err;
				3322	}
				3323
				3324	static ssize_t slab_attr_store(struct kobject *kobj,
				3325	struct attribute *attr,
				3326	const char *buf, size_t len)
				3327	{
				3328	struct slab_attribute *attribute;
				3329	struct kmem_cache *s;
				3330	int err;
				3331
				3332	attribute = to_slab_attr(attr);
				3333	s = to_slab(kobj);
				3334
				3335	if (!attribute->store)
				3336	return -EIO;
				3337
				3338	err = attribute->store(s, buf, len);
				3339
				3340	return err;
				3341	}
				3342
				3343	static struct sysfs_ops slab_sysfs_ops = {
				3344	.show = slab_attr_show,
				3345	.store = slab_attr_store,
				3346	};
				3347
				3348	static struct kobj_type slab_ktype = {
				3349	.sysfs_ops = &slab_sysfs_ops,
				3350	};
				3351
				3352	static int uevent_filter(struct kset kset, struct kobject kobj)
				3353	{
				3354	struct kobj_type *ktype = get_ktype(kobj);
				3355
				3356	if (ktype == &slab_ktype)
				3357	return 1;
				3358	return 0;
				3359	}
				3360
				3361	static struct kset_uevent_ops slab_uevent_ops = {
				3362	.filter = uevent_filter,
				3363	};
				3364
				3365	decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
				3366
				3367	#define ID_STR_LENGTH 64
				3368
				3369	/* Create a unique string id for a slab cache:
				3370	* format
				3371	* :[flags-]size:[memory address of kmemcache]
				3372	*/
				3373	static char create_unique_id(struct kmem_cache s)
				3374	{
				3375	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
				3376	char *p = name;
				3377
				3378	BUG_ON(!name);
				3379
				3380	*p++ = ':';
				3381	/*
				3382	* First flags affecting slabcache operations. We will only
				3383	* get here for aliasable slabs so we do not need to support
				3384	* too many flags. The flags here must cover all flags that
				3385	* are matched during merging to guarantee that the id is
				3386	* unique.
				3387	*/
				3388	if (s->flags & SLAB_CACHE_DMA)
				3389	*p++ = 'd';
				3390	if (s->flags & SLAB_RECLAIM_ACCOUNT)
				3391	*p++ = 'a';
				3392	if (s->flags & SLAB_DEBUG_FREE)
				3393	*p++ = 'F';
				3394	if (p != name + 1)
				3395	*p++ = '-';
				3396	p += sprintf(p, "%07d", s->size);
				3397	BUG_ON(p > name + ID_STR_LENGTH - 1);
				3398	return name;
				3399	}
				3400
				3401	static int sysfs_slab_add(struct kmem_cache *s)
				3402	{
				3403	int err;
				3404	const char *name;
				3405	int unmergeable;
				3406
				3407	if (slab_state < SYSFS)
				3408	/* Defer until later */
				3409	return 0;
				3410
				3411	unmergeable = slab_unmergeable(s);
				3412	if (unmergeable) {
				3413	/*
				3414	* Slabcache can never be merged so we can use the name proper.
				3415	* This is typically the case for debug situations. In that
				3416	* case we can catch duplicate names easily.
				3417	*/
Linus Torvalds	0f9008e	2007-05-07 12:31:58 -0700	[diff] [blame]	3418	sysfs_remove_link(&slab_subsys.kobj, s->name);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3419	name = s->name;
				3420	} else {
				3421	/*
				3422	* Create a unique name for the slab as a target
				3423	* for the symlinks.
				3424	*/
				3425	name = create_unique_id(s);
				3426	}
				3427
				3428	kobj_set_kset_s(s, slab_subsys);
				3429	kobject_set_name(&s->kobj, name);
				3430	kobject_init(&s->kobj);
				3431	err = kobject_add(&s->kobj);
				3432	if (err)
				3433	return err;
				3434
				3435	err = sysfs_create_group(&s->kobj, &slab_attr_group);
				3436	if (err)
				3437	return err;
				3438	kobject_uevent(&s->kobj, KOBJ_ADD);
				3439	if (!unmergeable) {
				3440	/* Setup first alias */
				3441	sysfs_slab_alias(s, s->name);
				3442	kfree(name);
				3443	}
				3444	return 0;
				3445	}
				3446
				3447	static void sysfs_slab_remove(struct kmem_cache *s)
				3448	{
				3449	kobject_uevent(&s->kobj, KOBJ_REMOVE);
				3450	kobject_del(&s->kobj);
				3451	}
				3452
				3453	/*
				3454	* Need to buffer aliases during bootup until sysfs becomes
				3455	* available lest we loose that information.
				3456	*/
				3457	struct saved_alias {
				3458	struct kmem_cache *s;
				3459	const char *name;
				3460	struct saved_alias *next;
				3461	};
				3462
				3463	struct saved_alias *alias_list;
				3464
				3465	static int sysfs_slab_alias(struct kmem_cache s, const char name)
				3466	{
				3467	struct saved_alias *al;
				3468
				3469	if (slab_state == SYSFS) {
				3470	/*
				3471	* If we have a leftover link then remove it.
				3472	*/
Linus Torvalds	0f9008e	2007-05-07 12:31:58 -0700	[diff] [blame]	3473	sysfs_remove_link(&slab_subsys.kobj, name);
				3474	return sysfs_create_link(&slab_subsys.kobj,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3475	&s->kobj, name);
				3476	}
				3477
				3478	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
				3479	if (!al)
				3480	return -ENOMEM;
				3481
				3482	al->s = s;
				3483	al->name = name;
				3484	al->next = alias_list;
				3485	alias_list = al;
				3486	return 0;
				3487	}
				3488
				3489	static int __init slab_sysfs_init(void)
				3490	{
Christoph Lameter	26a7bd0	2007-05-09 02:32:39 -0700	[diff] [blame]	3491	struct list_head *h;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3492	int err;
				3493
				3494	err = subsystem_register(&slab_subsys);
				3495	if (err) {
				3496	printk(KERN_ERR "Cannot register slab subsystem.\n");
				3497	return -ENOSYS;
				3498	}
				3499
Christoph Lameter	26a7bd0	2007-05-09 02:32:39 -0700	[diff] [blame]	3500	slab_state = SYSFS;
				3501
				3502	list_for_each(h, &slab_caches) {
				3503	struct kmem_cache *s =
				3504	container_of(h, struct kmem_cache, list);
				3505
				3506	err = sysfs_slab_add(s);
				3507	BUG_ON(err);
				3508	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3509
				3510	while (alias_list) {
				3511	struct saved_alias *al = alias_list;
				3512
				3513	alias_list = alias_list->next;
				3514	err = sysfs_slab_alias(al->s, al->name);
				3515	BUG_ON(err);
				3516	kfree(al);
				3517	}
				3518
				3519	resiliency_test();
				3520	return 0;
				3521	}
				3522
				3523	__initcall(slab_sysfs_init);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3524	#endif