Blame - mm/slub.c - kernel/msm

blob: 69ee7f807e84595a395eccd142d54105292c7dd6 [file] [log] [blame]

Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1	/*
				2	* SLUB: A slab allocator that limits cache line use instead of queuing
				3	* objects in per cpu and per node lists.
				4	*
				5	* The allocator synchronizes using per slab locks and only
				6	* uses a centralized lock to manage a pool of partial slabs.
				7	*
				8	* (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
				9	*/
				10
				11	#include <linux/mm.h>
				12	#include <linux/module.h>
				13	#include <linux/bit_spinlock.h>
				14	#include <linux/interrupt.h>
				15	#include <linux/bitops.h>
				16	#include <linux/slab.h>
				17	#include <linux/seq_file.h>
				18	#include <linux/cpu.h>
				19	#include <linux/cpuset.h>
				20	#include <linux/mempolicy.h>
				21	#include <linux/ctype.h>
				22	#include <linux/kallsyms.h>
				23
				24	/*
				25	* Lock order:
				26	* 1. slab_lock(page)
				27	* 2. slab->list_lock
				28	*
				29	* The slab_lock protects operations on the object of a particular
				30	* slab and its metadata in the page struct. If the slab lock
				31	* has been taken then no allocations nor frees can be performed
				32	* on the objects in the slab nor can the slab be added or removed
				33	* from the partial or full lists since this would mean modifying
				34	* the page_struct of the slab.
				35	*
				36	* The list_lock protects the partial and full list on each node and
				37	* the partial slab counter. If taken then no new slabs may be added or
				38	* removed from the lists nor make the number of partial slabs be modified.
				39	* (Note that the total number of slabs is an atomic value that may be
				40	* modified without taking the list lock).
				41	*
				42	* The list_lock is a centralized lock and thus we avoid taking it as
				43	* much as possible. As long as SLUB does not have to handle partial
				44	* slabs, operations can continue without any centralized lock. F.e.
				45	* allocating a long series of objects that fill up slabs does not require
				46	* the list lock.
				47	*
				48	* The lock order is sometimes inverted when we are trying to get a slab
				49	* off a list. We take the list_lock and then look for a page on the list
				50	* to use. While we do that objects in the slabs may be freed. We can
				51	* only operate on the slab if we have also taken the slab_lock. So we use
				52	* a slab_trylock() on the slab. If trylock was successful then no frees
				53	* can occur anymore and we can use the slab for allocations etc. If the
				54	* slab_trylock() does not succeed then frees are in progress in the slab and
				55	* we must stay away from it for a while since we may cause a bouncing
				56	* cacheline if we try to acquire the lock. So go onto the next slab.
				57	* If all pages are busy then we may allocate a new slab instead of reusing
				58	* a partial slab. A new slab has no one operating on it and thus there is
				59	* no danger of cacheline contention.
				60	*
				61	* Interrupts are disabled during allocation and deallocation in order to
				62	* make the slab allocator safe to use in the context of an irq. In addition
				63	* interrupts are disabled to ensure that the processor does not change
				64	* while handling per_cpu slabs, due to kernel preemption.
				65	*
				66	* SLUB assigns one slab for allocation to each processor.
				67	* Allocations only occur from these slabs called cpu slabs.
				68	*
				69	* Slabs with free elements are kept on a partial list.
				70	* There is no list for full slabs. If an object in a full slab is
				71	* freed then the slab will show up again on the partial lists.
				72	* Otherwise there is no need to track full slabs unless we have to
				73	* track full slabs for debugging purposes.
				74	*
				75	* Slabs are freed when they become empty. Teardown and setup is
				76	* minimal so we rely on the page allocators per cpu caches for
				77	* fast frees and allocs.
				78	*
				79	* Overloading of page flags that are otherwise used for LRU management.
				80	*
				81	* PageActive The slab is used as a cpu cache. Allocations
				82	* may be performed from the slab. The slab is not
				83	* on any slab list and cannot be moved onto one.
				84	*
				85	* PageError Slab requires special handling due to debug
				86	* options set. This moves slab handling out of
				87	* the fast path.
				88	*/
				89
				90	/*
				91	* Issues still to be resolved:
				92	*
				93	* - The per cpu array is updated for each new slab and is a remote
				94	* cacheline for most nodes. This could become a bouncing cacheline given
				95	* enough frequent updates. There are 16 pointers in a cacheline, so at
				96	* max 16 cpus could compete. Likely okay.
				97	*
				98	* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
				99	*
				100	* - Support DEBUG_SLAB_LEAK. Trouble is we do not know where the full
				101	* slabs are in SLUB.
				102	*
				103	* - SLAB_DEBUG_INITIAL is not supported but I have never seen a use of
				104	* it.
				105	*
				106	* - Variable sizing of the per node arrays
				107	*/
				108
				109	/* Enable to test recovery from slab corruption on boot */
				110	#undef SLUB_RESILIENCY_TEST
				111
				112	#if PAGE_SHIFT <= 12
				113
				114	/*
				115	* Small page size. Make sure that we do not fragment memory
				116	*/
				117	#define DEFAULT_MAX_ORDER 1
				118	#define DEFAULT_MIN_OBJECTS 4
				119
				120	#else
				121
				122	/*
				123	* Large page machines are customarily able to handle larger
				124	* page orders.
				125	*/
				126	#define DEFAULT_MAX_ORDER 2
				127	#define DEFAULT_MIN_OBJECTS 8
				128
				129	#endif
				130
				131	/*
				132	* Flags from the regular SLAB that SLUB does not support:
				133	*/
				134	#define SLUB_UNIMPLEMENTED (SLAB_DEBUG_INITIAL)
				135
				136	#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE \| SLAB_RED_ZONE \| \
				137	SLAB_POISON \| SLAB_STORE_USER)
				138	/*
				139	* Set of flags that will prevent slab merging
				140	*/
				141	#define SLUB_NEVER_MERGE (SLAB_RED_ZONE \| SLAB_POISON \| SLAB_STORE_USER \| \
				142	SLAB_TRACE \| SLAB_DESTROY_BY_RCU)
				143
				144	#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE \| SLAB_RECLAIM_ACCOUNT \| \
				145	SLAB_CACHE_DMA)
				146
				147	#ifndef ARCH_KMALLOC_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	148	#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	149	#endif
				150
				151	#ifndef ARCH_SLAB_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	152	#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	153	#endif
				154
				155	/* Internal SLUB flags */
				156	#define __OBJECT_POISON 0x80000000 /* Poison object */
				157
				158	static int kmem_size = sizeof(struct kmem_cache);
				159
				160	#ifdef CONFIG_SMP
				161	static struct notifier_block slab_notifier;
				162	#endif
				163
				164	static enum {
				165	DOWN, /* No slab functionality available */
				166	PARTIAL, /* kmem_cache_open() works but kmalloc does not */
				167	UP, /* Everything works */
				168	SYSFS /* Sysfs up */
				169	} slab_state = DOWN;
				170
				171	/* A list of all slab caches on the system */
				172	static DECLARE_RWSEM(slub_lock);
				173	LIST_HEAD(slab_caches);
				174
				175	#ifdef CONFIG_SYSFS
				176	static int sysfs_slab_add(struct kmem_cache *);
				177	static int sysfs_slab_alias(struct kmem_cache , const char );
				178	static void sysfs_slab_remove(struct kmem_cache *);
				179	#else
				180	static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
				181	static int sysfs_slab_alias(struct kmem_cache s, const char p) { return 0; }
				182	static void sysfs_slab_remove(struct kmem_cache *s) {}
				183	#endif
				184
				185	/********************************************************************
				186	* Core slab cache functions
				187	*******************************************************************/
				188
				189	int slab_is_available(void)
				190	{
				191	return slab_state >= UP;
				192	}
				193
				194	static inline struct kmem_cache_node get_node(struct kmem_cache s, int node)
				195	{
				196	#ifdef CONFIG_NUMA
				197	return s->node[node];
				198	#else
				199	return &s->local_node;
				200	#endif
				201	}
				202
				203	/*
				204	* Object debugging
				205	*/
				206	static void print_section(char text, u8 addr, unsigned int length)
				207	{
				208	int i, offset;
				209	int newline = 1;
				210	char ascii[17];
				211
				212	ascii[16] = 0;
				213
				214	for (i = 0; i < length; i++) {
				215	if (newline) {
				216	printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
				217	newline = 0;
				218	}
				219	printk(" %02x", addr[i]);
				220	offset = i % 16;
				221	ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
				222	if (offset == 15) {
				223	printk(" %s\n",ascii);
				224	newline = 1;
				225	}
				226	}
				227	if (!newline) {
				228	i %= 16;
				229	while (i < 16) {
				230	printk(" ");
				231	ascii[i] = ' ';
				232	i++;
				233	}
				234	printk(" %s\n", ascii);
				235	}
				236	}
				237
				238	/*
				239	* Slow version of get and set free pointer.
				240	*
				241	* This requires touching the cache lines of kmem_cache.
				242	* The offset can also be obtained from the page. In that
				243	* case it is in the cacheline that we already need to touch.
				244	*/
				245	static void get_freepointer(struct kmem_cache s, void *object)
				246	{
				247	return (void *)(object + s->offset);
				248	}
				249
				250	static void set_freepointer(struct kmem_cache s, void object, void *fp)
				251	{
				252	(void *)(object + s->offset) = fp;
				253	}
				254
				255	/*
				256	* Tracking user of a slab.
				257	*/
				258	struct track {
				259	void addr; / Called from address */
				260	int cpu; /* Was running on cpu */
				261	int pid; /* Pid context */
				262	unsigned long when; /* When did the operation occur */
				263	};
				264
				265	enum track_item { TRACK_ALLOC, TRACK_FREE };
				266
				267	static struct track get_track(struct kmem_cache s, void *object,
				268	enum track_item alloc)
				269	{
				270	struct track *p;
				271
				272	if (s->offset)
				273	p = object + s->offset + sizeof(void *);
				274	else
				275	p = object + s->inuse;
				276
				277	return p + alloc;
				278	}
				279
				280	static void set_track(struct kmem_cache s, void object,
				281	enum track_item alloc, void *addr)
				282	{
				283	struct track *p;
				284
				285	if (s->offset)
				286	p = object + s->offset + sizeof(void *);
				287	else
				288	p = object + s->inuse;
				289
				290	p += alloc;
				291	if (addr) {
				292	p->addr = addr;
				293	p->cpu = smp_processor_id();
				294	p->pid = current ? current->pid : -1;
				295	p->when = jiffies;
				296	} else
				297	memset(p, 0, sizeof(struct track));
				298	}
				299
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	300	static void init_tracking(struct kmem_cache s, void object)
				301	{
				302	if (s->flags & SLAB_STORE_USER) {
				303	set_track(s, object, TRACK_FREE, NULL);
				304	set_track(s, object, TRACK_ALLOC, NULL);
				305	}
				306	}
				307
				308	static void print_track(const char s, struct track t)
				309	{
				310	if (!t->addr)
				311	return;
				312
				313	printk(KERN_ERR "%s: ", s);
				314	__print_symbol("%s", (unsigned long)t->addr);
				315	printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
				316	}
				317
				318	static void print_trailer(struct kmem_cache s, u8 p)
				319	{
				320	unsigned int off; /* Offset of last byte */
				321
				322	if (s->flags & SLAB_RED_ZONE)
				323	print_section("Redzone", p + s->objsize,
				324	s->inuse - s->objsize);
				325
				326	printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
				327	p + s->offset,
				328	get_freepointer(s, p));
				329
				330	if (s->offset)
				331	off = s->offset + sizeof(void *);
				332	else
				333	off = s->inuse;
				334
				335	if (s->flags & SLAB_STORE_USER) {
				336	print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
				337	print_track("Last free ", get_track(s, p, TRACK_FREE));
				338	off += 2 * sizeof(struct track);
				339	}
				340
				341	if (off != s->size)
				342	/* Beginning of the filler is the free pointer */
				343	print_section("Filler", p + off, s->size - off);
				344	}
				345
				346	static void object_err(struct kmem_cache s, struct page page,
				347	u8 object, char reason)
				348	{
				349	u8 *addr = page_address(page);
				350
				351	printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
				352	s->name, reason, object, page);
				353	printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
				354	object - addr, page->flags, page->inuse, page->freelist);
				355	if (object > addr + 16)
				356	print_section("Bytes b4", object - 16, 16);
				357	print_section("Object", object, min(s->objsize, 128));
				358	print_trailer(s, object);
				359	dump_stack();
				360	}
				361
				362	static void slab_err(struct kmem_cache s, struct page page, char *reason, ...)
				363	{
				364	va_list args;
				365	char buf[100];
				366
				367	va_start(args, reason);
				368	vsnprintf(buf, sizeof(buf), reason, args);
				369	va_end(args);
				370	printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
				371	page);
				372	dump_stack();
				373	}
				374
				375	static void init_object(struct kmem_cache s, void object, int active)
				376	{
				377	u8 *p = object;
				378
				379	if (s->flags & __OBJECT_POISON) {
				380	memset(p, POISON_FREE, s->objsize - 1);
				381	p[s->objsize -1] = POISON_END;
				382	}
				383
				384	if (s->flags & SLAB_RED_ZONE)
				385	memset(p + s->objsize,
				386	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
				387	s->inuse - s->objsize);
				388	}
				389
				390	static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
				391	{
				392	while (bytes) {
				393	if (*start != (u8)value)
				394	return 0;
				395	start++;
				396	bytes--;
				397	}
				398	return 1;
				399	}
				400
				401
				402	static int check_valid_pointer(struct kmem_cache s, struct page page,
				403	void *object)
				404	{
				405	void *base;
				406
				407	if (!object)
				408	return 1;
				409
				410	base = page_address(page);
				411	if (object < base \|\| object >= base + s->objects * s->size \|\|
				412	(object - base) % s->size) {
				413	return 0;
				414	}
				415
				416	return 1;
				417	}
				418
				419	/*
				420	* Object layout:
				421	*
				422	* object address
				423	* Bytes of the object to be managed.
				424	* If the freepointer may overlay the object then the free
				425	* pointer is the first word of the object.
				426	* Poisoning uses 0x6b (POISON_FREE) and the last byte is
				427	* 0xa5 (POISON_END)
				428	*
				429	* object + s->objsize
				430	* Padding to reach word boundary. This is also used for Redzoning.
				431	* Padding is extended to word size if Redzoning is enabled
				432	* and objsize == inuse.
				433	* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
				434	* 0xcc (RED_ACTIVE) for objects in use.
				435	*
				436	* object + s->inuse
				437	* A. Free pointer (if we cannot overwrite object on free)
				438	* B. Tracking data for SLAB_STORE_USER
				439	* C. Padding to reach required alignment boundary
				440	* Padding is done using 0x5a (POISON_INUSE)
				441	*
				442	* object + s->size
				443	*
				444	* If slabcaches are merged then the objsize and inuse boundaries are to
				445	* be ignored. And therefore no slab options that rely on these boundaries
				446	* may be used with merged slabcaches.
				447	*/
				448
				449	static void restore_bytes(struct kmem_cache s, char message, u8 data,
				450	void from, void to)
				451	{
				452	printk(KERN_ERR "@@@ SLUB: %s Restoring %s (0x%x) from 0x%p-0x%p\n",
				453	s->name, message, data, from, to - 1);
				454	memset(from, data, to - from);
				455	}
				456
				457	static int check_pad_bytes(struct kmem_cache s, struct page page, u8 *p)
				458	{
				459	unsigned long off = s->inuse; /* The end of info */
				460
				461	if (s->offset)
				462	/* Freepointer is placed after the object. */
				463	off += sizeof(void *);
				464
				465	if (s->flags & SLAB_STORE_USER)
				466	/* We also have user information there */
				467	off += 2 * sizeof(struct track);
				468
				469	if (s->size == off)
				470	return 1;
				471
				472	if (check_bytes(p + off, POISON_INUSE, s->size - off))
				473	return 1;
				474
				475	object_err(s, page, p, "Object padding check fails");
				476
				477	/*
				478	* Restore padding
				479	*/
				480	restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
				481	return 0;
				482	}
				483
				484	static int slab_pad_check(struct kmem_cache s, struct page page)
				485	{
				486	u8 *p;
				487	int length, remainder;
				488
				489	if (!(s->flags & SLAB_POISON))
				490	return 1;
				491
				492	p = page_address(page);
				493	length = s->objects * s->size;
				494	remainder = (PAGE_SIZE << s->order) - length;
				495	if (!remainder)
				496	return 1;
				497
				498	if (!check_bytes(p + length, POISON_INUSE, remainder)) {
				499	printk(KERN_ERR "SLUB: %s slab 0x%p: Padding fails check\n",
				500	s->name, p);
				501	dump_stack();
				502	restore_bytes(s, "slab padding", POISON_INUSE, p + length,
				503	p + length + remainder);
				504	return 0;
				505	}
				506	return 1;
				507	}
				508
				509	static int check_object(struct kmem_cache s, struct page page,
				510	void *object, int active)
				511	{
				512	u8 *p = object;
				513	u8 *endobject = object + s->objsize;
				514
				515	if (s->flags & SLAB_RED_ZONE) {
				516	unsigned int red =
				517	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
				518
				519	if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
				520	object_err(s, page, object,
				521	active ? "Redzone Active" : "Redzone Inactive");
				522	restore_bytes(s, "redzone", red,
				523	endobject, object + s->inuse);
				524	return 0;
				525	}
				526	} else {
				527	if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
				528	!check_bytes(endobject, POISON_INUSE,
				529	s->inuse - s->objsize)) {
				530	object_err(s, page, p, "Alignment padding check fails");
				531	/*
				532	* Fix it so that there will not be another report.
				533	*
				534	* Hmmm... We may be corrupting an object that now expects
				535	* to be longer than allowed.
				536	*/
				537	restore_bytes(s, "alignment padding", POISON_INUSE,
				538	endobject, object + s->inuse);
				539	}
				540	}
				541
				542	if (s->flags & SLAB_POISON) {
				543	if (!active && (s->flags & __OBJECT_POISON) &&
				544	(!check_bytes(p, POISON_FREE, s->objsize - 1) \|\|
				545	p[s->objsize - 1] != POISON_END)) {
				546
				547	object_err(s, page, p, "Poison check failed");
				548	restore_bytes(s, "Poison", POISON_FREE,
				549	p, p + s->objsize -1);
				550	restore_bytes(s, "Poison", POISON_END,
				551	p + s->objsize - 1, p + s->objsize);
				552	return 0;
				553	}
				554	/*
				555	* check_pad_bytes cleans up on its own.
				556	*/
				557	check_pad_bytes(s, page, p);
				558	}
				559
				560	if (!s->offset && active)
				561	/*
				562	* Object and freepointer overlap. Cannot check
				563	* freepointer while object is allocated.
				564	*/
				565	return 1;
				566
				567	/* Check free pointer validity */
				568	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
				569	object_err(s, page, p, "Freepointer corrupt");
				570	/*
				571	* No choice but to zap it and thus loose the remainder
				572	* of the free objects in this slab. May cause
				573	* another error because the object count maybe
				574	* wrong now.
				575	*/
				576	set_freepointer(s, p, NULL);
				577	return 0;
				578	}
				579	return 1;
				580	}
				581
				582	static int check_slab(struct kmem_cache s, struct page page)
				583	{
				584	VM_BUG_ON(!irqs_disabled());
				585
				586	if (!PageSlab(page)) {
				587	printk(KERN_ERR "SLUB: %s Not a valid slab page @0x%p "
				588	"flags=%lx mapping=0x%p count=%d \n",
				589	s->name, page, page->flags, page->mapping,
				590	page_count(page));
				591	return 0;
				592	}
				593	if (page->offset * sizeof(void *) != s->offset) {
				594	printk(KERN_ERR "SLUB: %s Corrupted offset %lu in slab @0x%p"
				595	" flags=0x%lx mapping=0x%p count=%d\n",
				596	s->name,
				597	(unsigned long)(page->offset * sizeof(void *)),
				598	page,
				599	page->flags,
				600	page->mapping,
				601	page_count(page));
				602	dump_stack();
				603	return 0;
				604	}
				605	if (page->inuse > s->objects) {
				606	printk(KERN_ERR "SLUB: %s Inuse %u > max %u in slab "
				607	"page @0x%p flags=%lx mapping=0x%p count=%d\n",
				608	s->name, page->inuse, s->objects, page, page->flags,
				609	page->mapping, page_count(page));
				610	dump_stack();
				611	return 0;
				612	}
				613	/* Slab_pad_check fixes things up after itself */
				614	slab_pad_check(s, page);
				615	return 1;
				616	}
				617
				618	/*
				619	* Determine if a certain object on a page is on the freelist and
				620	* therefore free. Must hold the slab lock for cpu slabs to
				621	* guarantee that the chains are consistent.
				622	*/
				623	static int on_freelist(struct kmem_cache s, struct page page, void *search)
				624	{
				625	int nr = 0;
				626	void *fp = page->freelist;
				627	void *object = NULL;
				628
				629	while (fp && nr <= s->objects) {
				630	if (fp == search)
				631	return 1;
				632	if (!check_valid_pointer(s, page, fp)) {
				633	if (object) {
				634	object_err(s, page, object,
				635	"Freechain corrupt");
				636	set_freepointer(s, object, NULL);
				637	break;
				638	} else {
				639	printk(KERN_ERR "SLUB: %s slab 0x%p "
				640	"freepointer 0x%p corrupted.\n",
				641	s->name, page, fp);
				642	dump_stack();
				643	page->freelist = NULL;
				644	page->inuse = s->objects;
				645	return 0;
				646	}
				647	break;
				648	}
				649	object = fp;
				650	fp = get_freepointer(s, object);
				651	nr++;
				652	}
				653
				654	if (page->inuse != s->objects - nr) {
				655	printk(KERN_ERR "slab %s: page 0x%p wrong object count."
				656	" counter is %d but counted were %d\n",
				657	s->name, page, page->inuse,
				658	s->objects - nr);
				659	page->inuse = s->objects - nr;
				660	}
				661	return search == NULL;
				662	}
				663
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	664	/*
				665	* Tracking of fully allocated slabs for debugging
				666	*/
				667	static void add_full(struct kmem_cache s, struct page page)
				668	{
				669	struct kmem_cache_node *n;
				670
				671	VM_BUG_ON(!irqs_disabled());
				672
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	673	if (!(s->flags & SLAB_STORE_USER))
				674	return;
				675
				676	n = get_node(s, page_to_nid(page));
				677	spin_lock(&n->list_lock);
				678	list_add(&page->lru, &n->full);
				679	spin_unlock(&n->list_lock);
				680	}
				681
				682	static void remove_full(struct kmem_cache s, struct page page)
				683	{
				684	struct kmem_cache_node *n;
				685
				686	if (!(s->flags & SLAB_STORE_USER))
				687	return;
				688
				689	n = get_node(s, page_to_nid(page));
				690
				691	spin_lock(&n->list_lock);
				692	list_del(&page->lru);
				693	spin_unlock(&n->list_lock);
				694	}
				695
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	696	static int alloc_object_checks(struct kmem_cache s, struct page page,
				697	void *object)
				698	{
				699	if (!check_slab(s, page))
				700	goto bad;
				701
				702	if (object && !on_freelist(s, page, object)) {
				703	printk(KERN_ERR "SLUB: %s Object 0x%p@0x%p "
				704	"already allocated.\n",
				705	s->name, object, page);
				706	goto dump;
				707	}
				708
				709	if (!check_valid_pointer(s, page, object)) {
				710	object_err(s, page, object, "Freelist Pointer check fails");
				711	goto dump;
				712	}
				713
				714	if (!object)
				715	return 1;
				716
				717	if (!check_object(s, page, object, 0))
				718	goto bad;
				719	init_object(s, object, 1);
				720
				721	if (s->flags & SLAB_TRACE) {
				722	printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
				723	s->name, object, page->inuse,
				724	page->freelist);
				725	dump_stack();
				726	}
				727	return 1;
				728	dump:
				729	dump_stack();
				730	bad:
				731	if (PageSlab(page)) {
				732	/*
				733	* If this is a slab page then lets do the best we can
				734	* to avoid issues in the future. Marking all objects
				735	* as used avoids touching the remainder.
				736	*/
				737	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
				738	s->name, page);
				739	page->inuse = s->objects;
				740	page->freelist = NULL;
				741	/* Fix up fields that may be corrupted */
				742	page->offset = s->offset / sizeof(void *);
				743	}
				744	return 0;
				745	}
				746
				747	static int free_object_checks(struct kmem_cache s, struct page page,
				748	void *object)
				749	{
				750	if (!check_slab(s, page))
				751	goto fail;
				752
				753	if (!check_valid_pointer(s, page, object)) {
				754	printk(KERN_ERR "SLUB: %s slab 0x%p invalid "
				755	"object pointer 0x%p\n",
				756	s->name, page, object);
				757	goto fail;
				758	}
				759
				760	if (on_freelist(s, page, object)) {
				761	printk(KERN_ERR "SLUB: %s slab 0x%p object "
				762	"0x%p already free.\n", s->name, page, object);
				763	goto fail;
				764	}
				765
				766	if (!check_object(s, page, object, 1))
				767	return 0;
				768
				769	if (unlikely(s != page->slab)) {
				770	if (!PageSlab(page))
				771	printk(KERN_ERR "slab_free %s size %d: attempt to"
				772	"free object(0x%p) outside of slab.\n",
				773	s->name, s->size, object);
				774	else
				775	if (!page->slab)
				776	printk(KERN_ERR
				777	"slab_free : no slab(NULL) for object 0x%p.\n",
				778	object);
				779	else
				780	printk(KERN_ERR "slab_free %s(%d): object at 0x%p"
				781	" belongs to slab %s(%d)\n",
				782	s->name, s->size, object,
				783	page->slab->name, page->slab->size);
				784	goto fail;
				785	}
				786	if (s->flags & SLAB_TRACE) {
				787	printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
				788	s->name, object, page->inuse,
				789	page->freelist);
				790	print_section("Object", object, s->objsize);
				791	dump_stack();
				792	}
				793	init_object(s, object, 0);
				794	return 1;
				795	fail:
				796	dump_stack();
				797	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
				798	s->name, page, object);
				799	return 0;
				800	}
				801
				802	/*
				803	* Slab allocation and freeing
				804	*/
				805	static struct page allocate_slab(struct kmem_cache s, gfp_t flags, int node)
				806	{
				807	struct page * page;
				808	int pages = 1 << s->order;
				809
				810	if (s->order)
				811	flags \|= __GFP_COMP;
				812
				813	if (s->flags & SLAB_CACHE_DMA)
				814	flags \|= SLUB_DMA;
				815
				816	if (node == -1)
				817	page = alloc_pages(flags, s->order);
				818	else
				819	page = alloc_pages_node(node, flags, s->order);
				820
				821	if (!page)
				822	return NULL;
				823
				824	mod_zone_page_state(page_zone(page),
				825	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				826	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				827	pages);
				828
				829	return page;
				830	}
				831
				832	static void setup_object(struct kmem_cache s, struct page page,
				833	void *object)
				834	{
				835	if (PageError(page)) {
				836	init_object(s, object, 0);
				837	init_tracking(s, object);
				838	}
				839
				840	if (unlikely(s->ctor)) {
				841	int mode = SLAB_CTOR_CONSTRUCTOR;
				842
				843	if (!(s->flags & __GFP_WAIT))
				844	mode \|= SLAB_CTOR_ATOMIC;
				845
				846	s->ctor(object, s, mode);
				847	}
				848	}
				849
				850	static struct page new_slab(struct kmem_cache s, gfp_t flags, int node)
				851	{
				852	struct page *page;
				853	struct kmem_cache_node *n;
				854	void *start;
				855	void *end;
				856	void *last;
				857	void *p;
				858
				859	if (flags & __GFP_NO_GROW)
				860	return NULL;
				861
				862	BUG_ON(flags & ~(GFP_DMA \| GFP_LEVEL_MASK));
				863
				864	if (flags & __GFP_WAIT)
				865	local_irq_enable();
				866
				867	page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
				868	if (!page)
				869	goto out;
				870
				871	n = get_node(s, page_to_nid(page));
				872	if (n)
				873	atomic_long_inc(&n->nr_slabs);
				874	page->offset = s->offset / sizeof(void *);
				875	page->slab = s;
				876	page->flags \|= 1 << PG_slab;
				877	if (s->flags & (SLAB_DEBUG_FREE \| SLAB_RED_ZONE \| SLAB_POISON \|
				878	SLAB_STORE_USER \| SLAB_TRACE))
				879	page->flags \|= 1 << PG_error;
				880
				881	start = page_address(page);
				882	end = start + s->objects * s->size;
				883
				884	if (unlikely(s->flags & SLAB_POISON))
				885	memset(start, POISON_INUSE, PAGE_SIZE << s->order);
				886
				887	last = start;
				888	for (p = start + s->size; p < end; p += s->size) {
				889	setup_object(s, page, last);
				890	set_freepointer(s, last, p);
				891	last = p;
				892	}
				893	setup_object(s, page, last);
				894	set_freepointer(s, last, NULL);
				895
				896	page->freelist = start;
				897	page->inuse = 0;
				898	out:
				899	if (flags & __GFP_WAIT)
				900	local_irq_disable();
				901	return page;
				902	}
				903
				904	static void __free_slab(struct kmem_cache s, struct page page)
				905	{
				906	int pages = 1 << s->order;
				907
				908	if (unlikely(PageError(page) \|\| s->dtor)) {
				909	void *start = page_address(page);
				910	void *end = start + (pages << PAGE_SHIFT);
				911	void *p;
				912
				913	slab_pad_check(s, page);
				914	for (p = start; p <= end - s->size; p += s->size) {
				915	if (s->dtor)
				916	s->dtor(p, s, 0);
				917	check_object(s, page, p, 0);
				918	}
				919	}
				920
				921	mod_zone_page_state(page_zone(page),
				922	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				923	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				924	- pages);
				925
				926	page->mapping = NULL;
				927	__free_pages(page, s->order);
				928	}
				929
				930	static void rcu_free_slab(struct rcu_head *h)
				931	{
				932	struct page *page;
				933
				934	page = container_of((struct list_head *)h, struct page, lru);
				935	__free_slab(page->slab, page);
				936	}
				937
				938	static void free_slab(struct kmem_cache s, struct page page)
				939	{
				940	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
				941	/*
				942	* RCU free overloads the RCU head over the LRU
				943	*/
				944	struct rcu_head head = (void )&page->lru;
				945
				946	call_rcu(head, rcu_free_slab);
				947	} else
				948	__free_slab(s, page);
				949	}
				950
				951	static void discard_slab(struct kmem_cache s, struct page page)
				952	{
				953	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				954
				955	atomic_long_dec(&n->nr_slabs);
				956	reset_page_mapcount(page);
				957	page->flags &= ~(1 << PG_slab \| 1 << PG_error);
				958	free_slab(s, page);
				959	}
				960
				961	/*
				962	* Per slab locking using the pagelock
				963	*/
				964	static __always_inline void slab_lock(struct page *page)
				965	{
				966	bit_spin_lock(PG_locked, &page->flags);
				967	}
				968
				969	static __always_inline void slab_unlock(struct page *page)
				970	{
				971	bit_spin_unlock(PG_locked, &page->flags);
				972	}
				973
				974	static __always_inline int slab_trylock(struct page *page)
				975	{
				976	int rc = 1;
				977
				978	rc = bit_spin_trylock(PG_locked, &page->flags);
				979	return rc;
				980	}
				981
				982	/*
				983	* Management of partially allocated slabs
				984	*/
				985	static void add_partial(struct kmem_cache s, struct page page)
				986	{
				987	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				988
				989	spin_lock(&n->list_lock);
				990	n->nr_partial++;
				991	list_add(&page->lru, &n->partial);
				992	spin_unlock(&n->list_lock);
				993	}
				994
				995	static void remove_partial(struct kmem_cache *s,
				996	struct page *page)
				997	{
				998	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				999
				1000	spin_lock(&n->list_lock);
				1001	list_del(&page->lru);
				1002	n->nr_partial--;
				1003	spin_unlock(&n->list_lock);
				1004	}
				1005
				1006	/*
				1007	* Lock page and remove it from the partial list
				1008	*
				1009	* Must hold list_lock
				1010	*/
				1011	static int lock_and_del_slab(struct kmem_cache_node n, struct page page)
				1012	{
				1013	if (slab_trylock(page)) {
				1014	list_del(&page->lru);
				1015	n->nr_partial--;
				1016	return 1;
				1017	}
				1018	return 0;
				1019	}
				1020
				1021	/*
				1022	* Try to get a partial slab from a specific node
				1023	*/
				1024	static struct page get_partial_node(struct kmem_cache_node n)
				1025	{
				1026	struct page *page;
				1027
				1028	/*
				1029	* Racy check. If we mistakenly see no partial slabs then we
				1030	* just allocate an empty slab. If we mistakenly try to get a
				1031	* partial slab then get_partials() will return NULL.
				1032	*/
				1033	if (!n \|\| !n->nr_partial)
				1034	return NULL;
				1035
				1036	spin_lock(&n->list_lock);
				1037	list_for_each_entry(page, &n->partial, lru)
				1038	if (lock_and_del_slab(n, page))
				1039	goto out;
				1040	page = NULL;
				1041	out:
				1042	spin_unlock(&n->list_lock);
				1043	return page;
				1044	}
				1045
				1046	/*
				1047	* Get a page from somewhere. Search in increasing NUMA
				1048	* distances.
				1049	*/
				1050	static struct page get_any_partial(struct kmem_cache s, gfp_t flags)
				1051	{
				1052	#ifdef CONFIG_NUMA
				1053	struct zonelist *zonelist;
				1054	struct zone **z;
				1055	struct page *page;
				1056
				1057	/*
				1058	* The defrag ratio allows to configure the tradeoffs between
				1059	* inter node defragmentation and node local allocations.
				1060	* A lower defrag_ratio increases the tendency to do local
				1061	* allocations instead of scanning throught the partial
				1062	* lists on other nodes.
				1063	*
				1064	* If defrag_ratio is set to 0 then kmalloc() always
				1065	* returns node local objects. If its higher then kmalloc()
				1066	* may return off node objects in order to avoid fragmentation.
				1067	*
				1068	* A higher ratio means slabs may be taken from other nodes
				1069	* thus reducing the number of partial slabs on those nodes.
				1070	*
				1071	* If /sys/slab/xx/defrag_ratio is set to 100 (which makes
				1072	* defrag_ratio = 1000) then every (well almost) allocation
				1073	* will first attempt to defrag slab caches on other nodes. This
				1074	* means scanning over all nodes to look for partial slabs which
				1075	* may be a bit expensive to do on every slab allocation.
				1076	*/
				1077	if (!s->defrag_ratio \|\| get_cycles() % 1024 > s->defrag_ratio)
				1078	return NULL;
				1079
				1080	zonelist = &NODE_DATA(slab_node(current->mempolicy))
				1081	->node_zonelists[gfp_zone(flags)];
				1082	for (z = zonelist->zones; *z; z++) {
				1083	struct kmem_cache_node *n;
				1084
				1085	n = get_node(s, zone_to_nid(*z));
				1086
				1087	if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
				1088	n->nr_partial > 2) {
				1089	page = get_partial_node(n);
				1090	if (page)
				1091	return page;
				1092	}
				1093	}
				1094	#endif
				1095	return NULL;
				1096	}
				1097
				1098	/*
				1099	* Get a partial page, lock it and return it.
				1100	*/
				1101	static struct page get_partial(struct kmem_cache s, gfp_t flags, int node)
				1102	{
				1103	struct page *page;
				1104	int searchnode = (node == -1) ? numa_node_id() : node;
				1105
				1106	page = get_partial_node(get_node(s, searchnode));
				1107	if (page \|\| (flags & __GFP_THISNODE))
				1108	return page;
				1109
				1110	return get_any_partial(s, flags);
				1111	}
				1112
				1113	/*
				1114	* Move a page back to the lists.
				1115	*
				1116	* Must be called with the slab lock held.
				1117	*
				1118	* On exit the slab lock will have been dropped.
				1119	*/
				1120	static void putback_slab(struct kmem_cache s, struct page page)
				1121	{
				1122	if (page->inuse) {
				1123	if (page->freelist)
				1124	add_partial(s, page);
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1125	else if (PageError(page))
				1126	add_full(s, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1127	slab_unlock(page);
				1128	} else {
				1129	slab_unlock(page);
				1130	discard_slab(s, page);
				1131	}
				1132	}
				1133
				1134	/*
				1135	* Remove the cpu slab
				1136	*/
				1137	static void deactivate_slab(struct kmem_cache s, struct page page, int cpu)
				1138	{
				1139	s->cpu_slab[cpu] = NULL;
				1140	ClearPageActive(page);
				1141
				1142	putback_slab(s, page);
				1143	}
				1144
				1145	static void flush_slab(struct kmem_cache s, struct page page, int cpu)
				1146	{
				1147	slab_lock(page);
				1148	deactivate_slab(s, page, cpu);
				1149	}
				1150
				1151	/*
				1152	* Flush cpu slab.
				1153	* Called from IPI handler with interrupts disabled.
				1154	*/
				1155	static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
				1156	{
				1157	struct page *page = s->cpu_slab[cpu];
				1158
				1159	if (likely(page))
				1160	flush_slab(s, page, cpu);
				1161	}
				1162
				1163	static void flush_cpu_slab(void *d)
				1164	{
				1165	struct kmem_cache *s = d;
				1166	int cpu = smp_processor_id();
				1167
				1168	__flush_cpu_slab(s, cpu);
				1169	}
				1170
				1171	static void flush_all(struct kmem_cache *s)
				1172	{
				1173	#ifdef CONFIG_SMP
				1174	on_each_cpu(flush_cpu_slab, s, 1, 1);
				1175	#else
				1176	unsigned long flags;
				1177
				1178	local_irq_save(flags);
				1179	flush_cpu_slab(s);
				1180	local_irq_restore(flags);
				1181	#endif
				1182	}
				1183
				1184	/*
				1185	* slab_alloc is optimized to only modify two cachelines on the fast path
				1186	* (aside from the stack):
				1187	*
				1188	* 1. The page struct
				1189	* 2. The first cacheline of the object to be allocated.
				1190	*
				1191	* The only cache lines that are read (apart from code) is the
				1192	* per cpu array in the kmem_cache struct.
				1193	*
				1194	* Fastpath is not possible if we need to get a new slab or have
				1195	* debugging enabled (which means all slabs are marked with PageError)
				1196	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1197	static void slab_alloc(struct kmem_cache s,
				1198	gfp_t gfpflags, int node, void *addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1199	{
				1200	struct page *page;
				1201	void **object;
				1202	unsigned long flags;
				1203	int cpu;
				1204
				1205	local_irq_save(flags);
				1206	cpu = smp_processor_id();
				1207	page = s->cpu_slab[cpu];
				1208	if (!page)
				1209	goto new_slab;
				1210
				1211	slab_lock(page);
				1212	if (unlikely(node != -1 && page_to_nid(page) != node))
				1213	goto another_slab;
				1214	redo:
				1215	object = page->freelist;
				1216	if (unlikely(!object))
				1217	goto another_slab;
				1218	if (unlikely(PageError(page)))
				1219	goto debug;
				1220
				1221	have_object:
				1222	page->inuse++;
				1223	page->freelist = object[page->offset];
				1224	slab_unlock(page);
				1225	local_irq_restore(flags);
				1226	return object;
				1227
				1228	another_slab:
				1229	deactivate_slab(s, page, cpu);
				1230
				1231	new_slab:
				1232	page = get_partial(s, gfpflags, node);
				1233	if (likely(page)) {
				1234	have_slab:
				1235	s->cpu_slab[cpu] = page;
				1236	SetPageActive(page);
				1237	goto redo;
				1238	}
				1239
				1240	page = new_slab(s, gfpflags, node);
				1241	if (page) {
				1242	cpu = smp_processor_id();
				1243	if (s->cpu_slab[cpu]) {
				1244	/*
				1245	* Someone else populated the cpu_slab while we enabled
				1246	* interrupts, or we have got scheduled on another cpu.
				1247	* The page may not be on the requested node.
				1248	*/
				1249	if (node == -1 \|\|
				1250	page_to_nid(s->cpu_slab[cpu]) == node) {
				1251	/*
				1252	* Current cpuslab is acceptable and we
				1253	* want the current one since its cache hot
				1254	*/
				1255	discard_slab(s, page);
				1256	page = s->cpu_slab[cpu];
				1257	slab_lock(page);
				1258	goto redo;
				1259	}
				1260	/* Dump the current slab */
				1261	flush_slab(s, s->cpu_slab[cpu], cpu);
				1262	}
				1263	slab_lock(page);
				1264	goto have_slab;
				1265	}
				1266	local_irq_restore(flags);
				1267	return NULL;
				1268	debug:
				1269	if (!alloc_object_checks(s, page, object))
				1270	goto another_slab;
				1271	if (s->flags & SLAB_STORE_USER)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1272	set_track(s, object, TRACK_ALLOC, addr);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1273	goto have_object;
				1274	}
				1275
				1276	void kmem_cache_alloc(struct kmem_cache s, gfp_t gfpflags)
				1277	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1278	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1279	}
				1280	EXPORT_SYMBOL(kmem_cache_alloc);
				1281
				1282	#ifdef CONFIG_NUMA
				1283	void kmem_cache_alloc_node(struct kmem_cache s, gfp_t gfpflags, int node)
				1284	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1285	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1286	}
				1287	EXPORT_SYMBOL(kmem_cache_alloc_node);
				1288	#endif
				1289
				1290	/*
				1291	* The fastpath only writes the cacheline of the page struct and the first
				1292	* cacheline of the object.
				1293	*
				1294	* No special cachelines need to be read
				1295	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1296	static void slab_free(struct kmem_cache s, struct page page,
				1297	void x, void addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1298	{
				1299	void *prior;
				1300	void *object = (void )x;
				1301	unsigned long flags;
				1302
				1303	local_irq_save(flags);
				1304	slab_lock(page);
				1305
				1306	if (unlikely(PageError(page)))
				1307	goto debug;
				1308	checks_ok:
				1309	prior = object[page->offset] = page->freelist;
				1310	page->freelist = object;
				1311	page->inuse--;
				1312
				1313	if (unlikely(PageActive(page)))
				1314	/*
				1315	* Cpu slabs are never on partial lists and are
				1316	* never freed.
				1317	*/
				1318	goto out_unlock;
				1319
				1320	if (unlikely(!page->inuse))
				1321	goto slab_empty;
				1322
				1323	/*
				1324	* Objects left in the slab. If it
				1325	* was not on the partial list before
				1326	* then add it.
				1327	*/
				1328	if (unlikely(!prior))
				1329	add_partial(s, page);
				1330
				1331	out_unlock:
				1332	slab_unlock(page);
				1333	local_irq_restore(flags);
				1334	return;
				1335
				1336	slab_empty:
				1337	if (prior)
				1338	/*
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1339	* Slab on the partial list.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1340	*/
				1341	remove_partial(s, page);
				1342
				1343	slab_unlock(page);
				1344	discard_slab(s, page);
				1345	local_irq_restore(flags);
				1346	return;
				1347
				1348	debug:
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1349	if (!free_object_checks(s, page, x))
				1350	goto out_unlock;
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1351	if (!PageActive(page) && !page->freelist)
				1352	remove_full(s, page);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1353	if (s->flags & SLAB_STORE_USER)
				1354	set_track(s, x, TRACK_FREE, addr);
				1355	goto checks_ok;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1356	}
				1357
				1358	void kmem_cache_free(struct kmem_cache s, void x)
				1359	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1360	struct page *page;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1361
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1362	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1363
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1364	slab_free(s, page, x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1365	}
				1366	EXPORT_SYMBOL(kmem_cache_free);
				1367
				1368	/* Figure out on which slab object the object resides */
				1369	static struct page get_object_page(const void x)
				1370	{
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1371	struct page *page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1372
				1373	if (!PageSlab(page))
				1374	return NULL;
				1375
				1376	return page;
				1377	}
				1378
				1379	/*
				1380	* kmem_cache_open produces objects aligned at "size" and the first object
				1381	* is placed at offset 0 in the slab (We have no metainformation on the
				1382	* slab, all slabs are in essence "off slab").
				1383	*
				1384	* In order to get the desired alignment one just needs to align the
				1385	* size.
				1386	*
				1387	* Notice that the allocation order determines the sizes of the per cpu
				1388	* caches. Each processor has always one slab available for allocations.
				1389	* Increasing the allocation order reduces the number of times that slabs
				1390	* must be moved on and off the partial lists and therefore may influence
				1391	* locking overhead.
				1392	*
				1393	* The offset is used to relocate the free list link in each object. It is
				1394	* therefore possible to move the free list link behind the object. This
				1395	* is necessary for RCU to work properly and also useful for debugging.
				1396	*/
				1397
				1398	/*
				1399	* Minimum / Maximum order of slab pages. This influences locking overhead
				1400	* and slab fragmentation. A higher order reduces the number of partial slabs
				1401	* and increases the number of allocations possible without having to
				1402	* take the list_lock.
				1403	*/
				1404	static int slub_min_order;
				1405	static int slub_max_order = DEFAULT_MAX_ORDER;
				1406
				1407	/*
				1408	* Minimum number of objects per slab. This is necessary in order to
				1409	* reduce locking overhead. Similar to the queue size in SLAB.
				1410	*/
				1411	static int slub_min_objects = DEFAULT_MIN_OBJECTS;
				1412
				1413	/*
				1414	* Merge control. If this is set then no merging of slab caches will occur.
				1415	*/
				1416	static int slub_nomerge;
				1417
				1418	/*
				1419	* Debug settings:
				1420	*/
				1421	static int slub_debug;
				1422
				1423	static char *slub_debug_slabs;
				1424
				1425	/*
				1426	* Calculate the order of allocation given a slab object size.
				1427	*
				1428	* The order of allocation has significant impact on other elements
				1429	* of the system. Generally order 0 allocations should be preferred
				1430	* since they do not cause fragmentation in the page allocator. Larger
				1431	* objects may have problems with order 0 because there may be too much
				1432	* space left unused in a slab. We go to a higher order if more than 1/8th
				1433	* of the slab would be wasted.
				1434	*
				1435	* In order to reach satisfactory performance we must ensure that
				1436	* a minimum number of objects is in one slab. Otherwise we may
				1437	* generate too much activity on the partial lists. This is less a
				1438	* concern for large slabs though. slub_max_order specifies the order
				1439	* where we begin to stop considering the number of objects in a slab.
				1440	*
				1441	* Higher order allocations also allow the placement of more objects
				1442	* in a slab and thereby reduce object handling overhead. If the user
				1443	* has requested a higher minimum order then we start with that one
				1444	* instead of zero.
				1445	*/
				1446	static int calculate_order(int size)
				1447	{
				1448	int order;
				1449	int rem;
				1450
				1451	for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
				1452	order < MAX_ORDER; order++) {
				1453	unsigned long slab_size = PAGE_SIZE << order;
				1454
				1455	if (slub_max_order > order &&
				1456	slab_size < slub_min_objects * size)
				1457	continue;
				1458
				1459	if (slab_size < size)
				1460	continue;
				1461
				1462	rem = slab_size % size;
				1463
				1464	if (rem <= (PAGE_SIZE << order) / 8)
				1465	break;
				1466
				1467	}
				1468	if (order >= MAX_ORDER)
				1469	return -E2BIG;
				1470	return order;
				1471	}
				1472
				1473	/*
				1474	* Function to figure out which alignment to use from the
				1475	* various ways of specifying it.
				1476	*/
				1477	static unsigned long calculate_alignment(unsigned long flags,
				1478	unsigned long align, unsigned long size)
				1479	{
				1480	/*
				1481	* If the user wants hardware cache aligned objects then
				1482	* follow that suggestion if the object is sufficiently
				1483	* large.
				1484	*
				1485	* The hardware cache alignment cannot override the
				1486	* specified alignment though. If that is greater
				1487	* then use it.
				1488	*/
				1489	if ((flags & (SLAB_MUST_HWCACHE_ALIGN \| SLAB_HWCACHE_ALIGN)) &&
				1490	size > L1_CACHE_BYTES / 2)
				1491	return max_t(unsigned long, align, L1_CACHE_BYTES);
				1492
				1493	if (align < ARCH_SLAB_MINALIGN)
				1494	return ARCH_SLAB_MINALIGN;
				1495
				1496	return ALIGN(align, sizeof(void *));
				1497	}
				1498
				1499	static void init_kmem_cache_node(struct kmem_cache_node *n)
				1500	{
				1501	n->nr_partial = 0;
				1502	atomic_long_set(&n->nr_slabs, 0);
				1503	spin_lock_init(&n->list_lock);
				1504	INIT_LIST_HEAD(&n->partial);
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1505	INIT_LIST_HEAD(&n->full);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1506	}
				1507
				1508	#ifdef CONFIG_NUMA
				1509	/*
				1510	* No kmalloc_node yet so do it by hand. We know that this is the first
				1511	* slab on the node for this slabcache. There are no concurrent accesses
				1512	* possible.
				1513	*
				1514	* Note that this function only works on the kmalloc_node_cache
				1515	* when allocating for the kmalloc_node_cache.
				1516	*/
				1517	static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
				1518	int node)
				1519	{
				1520	struct page *page;
				1521	struct kmem_cache_node *n;
				1522
				1523	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
				1524
				1525	page = new_slab(kmalloc_caches, gfpflags \| GFP_THISNODE, node);
				1526	/* new_slab() disables interupts */
				1527	local_irq_enable();
				1528
				1529	BUG_ON(!page);
				1530	n = page->freelist;
				1531	BUG_ON(!n);
				1532	page->freelist = get_freepointer(kmalloc_caches, n);
				1533	page->inuse++;
				1534	kmalloc_caches->node[node] = n;
				1535	init_object(kmalloc_caches, n, 1);
				1536	init_kmem_cache_node(n);
				1537	atomic_long_inc(&n->nr_slabs);
				1538	add_partial(kmalloc_caches, page);
				1539	return n;
				1540	}
				1541
				1542	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1543	{
				1544	int node;
				1545
				1546	for_each_online_node(node) {
				1547	struct kmem_cache_node *n = s->node[node];
				1548	if (n && n != &s->local_node)
				1549	kmem_cache_free(kmalloc_caches, n);
				1550	s->node[node] = NULL;
				1551	}
				1552	}
				1553
				1554	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1555	{
				1556	int node;
				1557	int local_node;
				1558
				1559	if (slab_state >= UP)
				1560	local_node = page_to_nid(virt_to_page(s));
				1561	else
				1562	local_node = 0;
				1563
				1564	for_each_online_node(node) {
				1565	struct kmem_cache_node *n;
				1566
				1567	if (local_node == node)
				1568	n = &s->local_node;
				1569	else {
				1570	if (slab_state == DOWN) {
				1571	n = early_kmem_cache_node_alloc(gfpflags,
				1572	node);
				1573	continue;
				1574	}
				1575	n = kmem_cache_alloc_node(kmalloc_caches,
				1576	gfpflags, node);
				1577
				1578	if (!n) {
				1579	free_kmem_cache_nodes(s);
				1580	return 0;
				1581	}
				1582
				1583	}
				1584	s->node[node] = n;
				1585	init_kmem_cache_node(n);
				1586	}
				1587	return 1;
				1588	}
				1589	#else
				1590	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1591	{
				1592	}
				1593
				1594	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1595	{
				1596	init_kmem_cache_node(&s->local_node);
				1597	return 1;
				1598	}
				1599	#endif
				1600
				1601	/*
				1602	* calculate_sizes() determines the order and the distribution of data within
				1603	* a slab object.
				1604	*/
				1605	static int calculate_sizes(struct kmem_cache *s)
				1606	{
				1607	unsigned long flags = s->flags;
				1608	unsigned long size = s->objsize;
				1609	unsigned long align = s->align;
				1610
				1611	/*
				1612	* Determine if we can poison the object itself. If the user of
				1613	* the slab may touch the object after free or before allocation
				1614	* then we should never poison the object itself.
				1615	*/
				1616	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
				1617	!s->ctor && !s->dtor)
				1618	s->flags \|= __OBJECT_POISON;
				1619	else
				1620	s->flags &= ~__OBJECT_POISON;
				1621
				1622	/*
				1623	* Round up object size to the next word boundary. We can only
				1624	* place the free pointer at word boundaries and this determines
				1625	* the possible location of the free pointer.
				1626	*/
				1627	size = ALIGN(size, sizeof(void *));
				1628
				1629	/*
				1630	* If we are redzoning then check if there is some space between the
				1631	* end of the object and the free pointer. If not then add an
				1632	* additional word, so that we can establish a redzone between
				1633	* the object and the freepointer to be able to check for overwrites.
				1634	*/
				1635	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
				1636	size += sizeof(void *);
				1637
				1638	/*
				1639	* With that we have determined how much of the slab is in actual
				1640	* use by the object. This is the potential offset to the free
				1641	* pointer.
				1642	*/
				1643	s->inuse = size;
				1644
				1645	if (((flags & (SLAB_DESTROY_BY_RCU \| SLAB_POISON)) \|\|
				1646	s->ctor \|\| s->dtor)) {
				1647	/*
				1648	* Relocate free pointer after the object if it is not
				1649	* permitted to overwrite the first word of the object on
				1650	* kmem_cache_free.
				1651	*
				1652	* This is the case if we do RCU, have a constructor or
				1653	* destructor or are poisoning the objects.
				1654	*/
				1655	s->offset = size;
				1656	size += sizeof(void *);
				1657	}
				1658
				1659	if (flags & SLAB_STORE_USER)
				1660	/*
				1661	* Need to store information about allocs and frees after
				1662	* the object.
				1663	*/
				1664	size += 2 * sizeof(struct track);
				1665
				1666	if (flags & DEBUG_DEFAULT_FLAGS)
				1667	/*
				1668	* Add some empty padding so that we can catch
				1669	* overwrites from earlier objects rather than let
				1670	* tracking information or the free pointer be
				1671	* corrupted if an user writes before the start
				1672	* of the object.
				1673	*/
				1674	size += sizeof(void *);
				1675	/*
				1676	* Determine the alignment based on various parameters that the
				1677	* user specified (this is unecessarily complex due to the attempt
				1678	* to be compatible with SLAB. Should be cleaned up some day).
				1679	*/
				1680	align = calculate_alignment(flags, align, s->objsize);
				1681
				1682	/*
				1683	* SLUB stores one object immediately after another beginning from
				1684	* offset 0. In order to align the objects we have to simply size
				1685	* each object to conform to the alignment.
				1686	*/
				1687	size = ALIGN(size, align);
				1688	s->size = size;
				1689
				1690	s->order = calculate_order(size);
				1691	if (s->order < 0)
				1692	return 0;
				1693
				1694	/*
				1695	* Determine the number of objects per slab
				1696	*/
				1697	s->objects = (PAGE_SIZE << s->order) / size;
				1698
				1699	/*
				1700	* Verify that the number of objects is within permitted limits.
				1701	* The page->inuse field is only 16 bit wide! So we cannot have
				1702	* more than 64k objects per slab.
				1703	*/
				1704	if (!s->objects \|\| s->objects > 65535)
				1705	return 0;
				1706	return 1;
				1707
				1708	}
				1709
				1710	static int __init finish_bootstrap(void)
				1711	{
				1712	struct list_head *h;
				1713	int err;
				1714
				1715	slab_state = SYSFS;
				1716
				1717	list_for_each(h, &slab_caches) {
				1718	struct kmem_cache *s =
				1719	container_of(h, struct kmem_cache, list);
				1720
				1721	err = sysfs_slab_add(s);
				1722	BUG_ON(err);
				1723	}
				1724	return 0;
				1725	}
				1726
				1727	static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
				1728	const char *name, size_t size,
				1729	size_t align, unsigned long flags,
				1730	void (ctor)(void , struct kmem_cache *, unsigned long),
				1731	void (dtor)(void , struct kmem_cache *, unsigned long))
				1732	{
				1733	memset(s, 0, kmem_size);
				1734	s->name = name;
				1735	s->ctor = ctor;
				1736	s->dtor = dtor;
				1737	s->objsize = size;
				1738	s->flags = flags;
				1739	s->align = align;
				1740
				1741	BUG_ON(flags & SLUB_UNIMPLEMENTED);
				1742
				1743	/*
				1744	* The page->offset field is only 16 bit wide. This is an offset
				1745	* in units of words from the beginning of an object. If the slab
				1746	* size is bigger then we cannot move the free pointer behind the
				1747	* object anymore.
				1748	*
				1749	* On 32 bit platforms the limit is 256k. On 64bit platforms
				1750	* the limit is 512k.
				1751	*
				1752	* Debugging or ctor/dtors may create a need to move the free
				1753	* pointer. Fail if this happens.
				1754	*/
				1755	if (s->size >= 65535 * sizeof(void *)) {
				1756	BUG_ON(flags & (SLAB_RED_ZONE \| SLAB_POISON \|
				1757	SLAB_STORE_USER \| SLAB_DESTROY_BY_RCU));
				1758	BUG_ON(ctor \|\| dtor);
				1759	}
				1760	else
				1761	/*
				1762	* Enable debugging if selected on the kernel commandline.
				1763	*/
				1764	if (slub_debug && (!slub_debug_slabs \|\|
				1765	strncmp(slub_debug_slabs, name,
				1766	strlen(slub_debug_slabs)) == 0))
				1767	s->flags \|= slub_debug;
				1768
				1769	if (!calculate_sizes(s))
				1770	goto error;
				1771
				1772	s->refcount = 1;
				1773	#ifdef CONFIG_NUMA
				1774	s->defrag_ratio = 100;
				1775	#endif
				1776
				1777	if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
				1778	return 1;
				1779	error:
				1780	if (flags & SLAB_PANIC)
				1781	panic("Cannot create slab %s size=%lu realsize=%u "
				1782	"order=%u offset=%u flags=%lx\n",
				1783	s->name, (unsigned long)size, s->size, s->order,
				1784	s->offset, flags);
				1785	return 0;
				1786	}
				1787	EXPORT_SYMBOL(kmem_cache_open);
				1788
				1789	/*
				1790	* Check if a given pointer is valid
				1791	*/
				1792	int kmem_ptr_validate(struct kmem_cache s, const void object)
				1793	{
				1794	struct page * page;
				1795	void *addr;
				1796
				1797	page = get_object_page(object);
				1798
				1799	if (!page \|\| s != page->slab)
				1800	/* No slab or wrong slab */
				1801	return 0;
				1802
				1803	addr = page_address(page);
				1804	if (object < addr \|\| object >= addr + s->objects * s->size)
				1805	/* Out of bounds */
				1806	return 0;
				1807
				1808	if ((object - addr) % s->size)
				1809	/* Improperly aligned */
				1810	return 0;
				1811
				1812	/*
				1813	* We could also check if the object is on the slabs freelist.
				1814	* But this would be too expensive and it seems that the main
				1815	* purpose of kmem_ptr_valid is to check if the object belongs
				1816	* to a certain slab.
				1817	*/
				1818	return 1;
				1819	}
				1820	EXPORT_SYMBOL(kmem_ptr_validate);
				1821
				1822	/*
				1823	* Determine the size of a slab object
				1824	*/
				1825	unsigned int kmem_cache_size(struct kmem_cache *s)
				1826	{
				1827	return s->objsize;
				1828	}
				1829	EXPORT_SYMBOL(kmem_cache_size);
				1830
				1831	const char kmem_cache_name(struct kmem_cache s)
				1832	{
				1833	return s->name;
				1834	}
				1835	EXPORT_SYMBOL(kmem_cache_name);
				1836
				1837	/*
				1838	* Attempt to free all slabs on a node
				1839	*/
				1840	static int free_list(struct kmem_cache s, struct kmem_cache_node n,
				1841	struct list_head *list)
				1842	{
				1843	int slabs_inuse = 0;
				1844	unsigned long flags;
				1845	struct page page, h;
				1846
				1847	spin_lock_irqsave(&n->list_lock, flags);
				1848	list_for_each_entry_safe(page, h, list, lru)
				1849	if (!page->inuse) {
				1850	list_del(&page->lru);
				1851	discard_slab(s, page);
				1852	} else
				1853	slabs_inuse++;
				1854	spin_unlock_irqrestore(&n->list_lock, flags);
				1855	return slabs_inuse;
				1856	}
				1857
				1858	/*
				1859	* Release all resources used by slab cache
				1860	*/
				1861	static int kmem_cache_close(struct kmem_cache *s)
				1862	{
				1863	int node;
				1864
				1865	flush_all(s);
				1866
				1867	/* Attempt to free all objects */
				1868	for_each_online_node(node) {
				1869	struct kmem_cache_node *n = get_node(s, node);
				1870
				1871	free_list(s, n, &n->partial);
				1872	if (atomic_long_read(&n->nr_slabs))
				1873	return 1;
				1874	}
				1875	free_kmem_cache_nodes(s);
				1876	return 0;
				1877	}
				1878
				1879	/*
				1880	* Close a cache and release the kmem_cache structure
				1881	* (must be used for caches created using kmem_cache_create)
				1882	*/
				1883	void kmem_cache_destroy(struct kmem_cache *s)
				1884	{
				1885	down_write(&slub_lock);
				1886	s->refcount--;
				1887	if (!s->refcount) {
				1888	list_del(&s->list);
				1889	if (kmem_cache_close(s))
				1890	WARN_ON(1);
				1891	sysfs_slab_remove(s);
				1892	kfree(s);
				1893	}
				1894	up_write(&slub_lock);
				1895	}
				1896	EXPORT_SYMBOL(kmem_cache_destroy);
				1897
				1898	/********************************************************************
				1899	* Kmalloc subsystem
				1900	*******************************************************************/
				1901
				1902	struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
				1903	EXPORT_SYMBOL(kmalloc_caches);
				1904
				1905	#ifdef CONFIG_ZONE_DMA
				1906	static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
				1907	#endif
				1908
				1909	static int __init setup_slub_min_order(char *str)
				1910	{
				1911	get_option (&str, &slub_min_order);
				1912
				1913	return 1;
				1914	}
				1915
				1916	__setup("slub_min_order=", setup_slub_min_order);
				1917
				1918	static int __init setup_slub_max_order(char *str)
				1919	{
				1920	get_option (&str, &slub_max_order);
				1921
				1922	return 1;
				1923	}
				1924
				1925	__setup("slub_max_order=", setup_slub_max_order);
				1926
				1927	static int __init setup_slub_min_objects(char *str)
				1928	{
				1929	get_option (&str, &slub_min_objects);
				1930
				1931	return 1;
				1932	}
				1933
				1934	__setup("slub_min_objects=", setup_slub_min_objects);
				1935
				1936	static int __init setup_slub_nomerge(char *str)
				1937	{
				1938	slub_nomerge = 1;
				1939	return 1;
				1940	}
				1941
				1942	__setup("slub_nomerge", setup_slub_nomerge);
				1943
				1944	static int __init setup_slub_debug(char *str)
				1945	{
				1946	if (!str \|\| *str != '=')
				1947	slub_debug = DEBUG_DEFAULT_FLAGS;
				1948	else {
				1949	str++;
				1950	if (str == 0 \|\| str == ',')
				1951	slub_debug = DEBUG_DEFAULT_FLAGS;
				1952	else
				1953	for( ;str && str != ','; str++)
				1954	switch (*str) {
				1955	case 'f' : case 'F' :
				1956	slub_debug \|= SLAB_DEBUG_FREE;
				1957	break;
				1958	case 'z' : case 'Z' :
				1959	slub_debug \|= SLAB_RED_ZONE;
				1960	break;
				1961	case 'p' : case 'P' :
				1962	slub_debug \|= SLAB_POISON;
				1963	break;
				1964	case 'u' : case 'U' :
				1965	slub_debug \|= SLAB_STORE_USER;
				1966	break;
				1967	case 't' : case 'T' :
				1968	slub_debug \|= SLAB_TRACE;
				1969	break;
				1970	default:
				1971	printk(KERN_ERR "slub_debug option '%c' "
				1972	"unknown. skipped\n",*str);
				1973	}
				1974	}
				1975
				1976	if (*str == ',')
				1977	slub_debug_slabs = str + 1;
				1978	return 1;
				1979	}
				1980
				1981	__setup("slub_debug", setup_slub_debug);
				1982
				1983	static struct kmem_cache create_kmalloc_cache(struct kmem_cache s,
				1984	const char *name, int size, gfp_t gfp_flags)
				1985	{
				1986	unsigned int flags = 0;
				1987
				1988	if (gfp_flags & SLUB_DMA)
				1989	flags = SLAB_CACHE_DMA;
				1990
				1991	down_write(&slub_lock);
				1992	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
				1993	flags, NULL, NULL))
				1994	goto panic;
				1995
				1996	list_add(&s->list, &slab_caches);
				1997	up_write(&slub_lock);
				1998	if (sysfs_slab_add(s))
				1999	goto panic;
				2000	return s;
				2001
				2002	panic:
				2003	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
				2004	}
				2005
				2006	static struct kmem_cache *get_slab(size_t size, gfp_t flags)
				2007	{
				2008	int index = kmalloc_index(size);
				2009
Christoph Lameter	614410d	2007-05-06 14:49:38 -0700	[diff] [blame]	2010	if (!index)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2011	return NULL;
				2012
				2013	/* Allocation too large? */
				2014	BUG_ON(index < 0);
				2015
				2016	#ifdef CONFIG_ZONE_DMA
				2017	if ((flags & SLUB_DMA)) {
				2018	struct kmem_cache *s;
				2019	struct kmem_cache *x;
				2020	char *text;
				2021	size_t realsize;
				2022
				2023	s = kmalloc_caches_dma[index];
				2024	if (s)
				2025	return s;
				2026
				2027	/* Dynamically create dma cache */
				2028	x = kmalloc(kmem_size, flags & ~SLUB_DMA);
				2029	if (!x)
				2030	panic("Unable to allocate memory for dma cache\n");
				2031
				2032	if (index <= KMALLOC_SHIFT_HIGH)
				2033	realsize = 1 << index;
				2034	else {
				2035	if (index == 1)
				2036	realsize = 96;
				2037	else
				2038	realsize = 192;
				2039	}
				2040
				2041	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
				2042	(unsigned int)realsize);
				2043	s = create_kmalloc_cache(x, text, realsize, flags);
				2044	kmalloc_caches_dma[index] = s;
				2045	return s;
				2046	}
				2047	#endif
				2048	return &kmalloc_caches[index];
				2049	}
				2050
				2051	void *__kmalloc(size_t size, gfp_t flags)
				2052	{
				2053	struct kmem_cache *s = get_slab(size, flags);
				2054
				2055	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2056	return slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2057	return NULL;
				2058	}
				2059	EXPORT_SYMBOL(__kmalloc);
				2060
				2061	#ifdef CONFIG_NUMA
				2062	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				2063	{
				2064	struct kmem_cache *s = get_slab(size, flags);
				2065
				2066	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2067	return slab_alloc(s, flags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2068	return NULL;
				2069	}
				2070	EXPORT_SYMBOL(__kmalloc_node);
				2071	#endif
				2072
				2073	size_t ksize(const void *object)
				2074	{
				2075	struct page *page = get_object_page(object);
				2076	struct kmem_cache *s;
				2077
				2078	BUG_ON(!page);
				2079	s = page->slab;
				2080	BUG_ON(!s);
				2081
				2082	/*
				2083	* Debugging requires use of the padding between object
				2084	* and whatever may come after it.
				2085	*/
				2086	if (s->flags & (SLAB_RED_ZONE \| SLAB_POISON))
				2087	return s->objsize;
				2088
				2089	/*
				2090	* If we have the need to store the freelist pointer
				2091	* back there or track user information then we can
				2092	* only use the space before that information.
				2093	*/
				2094	if (s->flags & (SLAB_DESTROY_BY_RCU \| SLAB_STORE_USER))
				2095	return s->inuse;
				2096
				2097	/*
				2098	* Else we can use all the padding etc for the allocation
				2099	*/
				2100	return s->size;
				2101	}
				2102	EXPORT_SYMBOL(ksize);
				2103
				2104	void kfree(const void *x)
				2105	{
				2106	struct kmem_cache *s;
				2107	struct page *page;
				2108
				2109	if (!x)
				2110	return;
				2111
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2112	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2113	s = page->slab;
				2114
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2115	slab_free(s, page, (void *)x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2116	}
				2117	EXPORT_SYMBOL(kfree);
				2118
				2119	/**
				2120	* krealloc - reallocate memory. The contents will remain unchanged.
				2121	*
				2122	* @p: object to reallocate memory for.
				2123	* @new_size: how many bytes of memory are required.
				2124	* @flags: the type of memory to allocate.
				2125	*
				2126	* The contents of the object pointed to are preserved up to the
				2127	* lesser of the new and old sizes. If @p is %NULL, krealloc()
				2128	* behaves exactly like kmalloc(). If @size is 0 and @p is not a
				2129	* %NULL pointer, the object pointed to is freed.
				2130	*/
				2131	void krealloc(const void p, size_t new_size, gfp_t flags)
				2132	{
				2133	struct kmem_cache *new_cache;
				2134	void *ret;
				2135	struct page *page;
				2136
				2137	if (unlikely(!p))
				2138	return kmalloc(new_size, flags);
				2139
				2140	if (unlikely(!new_size)) {
				2141	kfree(p);
				2142	return NULL;
				2143	}
				2144
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2145	page = virt_to_head_page(p);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2146
				2147	new_cache = get_slab(new_size, flags);
				2148
				2149	/*
				2150	* If new size fits in the current cache, bail out.
				2151	*/
				2152	if (likely(page->slab == new_cache))
				2153	return (void *)p;
				2154
				2155	ret = kmalloc(new_size, flags);
				2156	if (ret) {
				2157	memcpy(ret, p, min(new_size, ksize(p)));
				2158	kfree(p);
				2159	}
				2160	return ret;
				2161	}
				2162	EXPORT_SYMBOL(krealloc);
				2163
				2164	/********************************************************************
				2165	* Basic setup of slabs
				2166	*******************************************************************/
				2167
				2168	void __init kmem_cache_init(void)
				2169	{
				2170	int i;
				2171
				2172	#ifdef CONFIG_NUMA
				2173	/*
				2174	* Must first have the slab cache available for the allocations of the
				2175	* struct kmalloc_cache_node's. There is special bootstrap code in
				2176	* kmem_cache_open for slab_state == DOWN.
				2177	*/
				2178	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
				2179	sizeof(struct kmem_cache_node), GFP_KERNEL);
				2180	#endif
				2181
				2182	/* Able to allocate the per node structures */
				2183	slab_state = PARTIAL;
				2184
				2185	/* Caches that are not of the two-to-the-power-of size */
				2186	create_kmalloc_cache(&kmalloc_caches[1],
				2187	"kmalloc-96", 96, GFP_KERNEL);
				2188	create_kmalloc_cache(&kmalloc_caches[2],
				2189	"kmalloc-192", 192, GFP_KERNEL);
				2190
				2191	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2192	create_kmalloc_cache(&kmalloc_caches[i],
				2193	"kmalloc", 1 << i, GFP_KERNEL);
				2194
				2195	slab_state = UP;
				2196
				2197	/* Provide the correct kmalloc names now that the caches are up */
				2198	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2199	kmalloc_caches[i]. name =
				2200	kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
				2201
				2202	#ifdef CONFIG_SMP
				2203	register_cpu_notifier(&slab_notifier);
				2204	#endif
				2205
				2206	if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
				2207	kmem_size = offsetof(struct kmem_cache, cpu_slab)
				2208	+ nr_cpu_ids * sizeof(struct page *);
				2209
				2210	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
				2211	" Processors=%d, Nodes=%d\n",
				2212	KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES,
				2213	slub_min_order, slub_max_order, slub_min_objects,
				2214	nr_cpu_ids, nr_node_ids);
				2215	}
				2216
				2217	/*
				2218	* Find a mergeable slab cache
				2219	*/
				2220	static int slab_unmergeable(struct kmem_cache *s)
				2221	{
				2222	if (slub_nomerge \|\| (s->flags & SLUB_NEVER_MERGE))
				2223	return 1;
				2224
				2225	if (s->ctor \|\| s->dtor)
				2226	return 1;
				2227
				2228	return 0;
				2229	}
				2230
				2231	static struct kmem_cache *find_mergeable(size_t size,
				2232	size_t align, unsigned long flags,
				2233	void (ctor)(void , struct kmem_cache *, unsigned long),
				2234	void (dtor)(void , struct kmem_cache *, unsigned long))
				2235	{
				2236	struct list_head *h;
				2237
				2238	if (slub_nomerge \|\| (flags & SLUB_NEVER_MERGE))
				2239	return NULL;
				2240
				2241	if (ctor \|\| dtor)
				2242	return NULL;
				2243
				2244	size = ALIGN(size, sizeof(void *));
				2245	align = calculate_alignment(flags, align, size);
				2246	size = ALIGN(size, align);
				2247
				2248	list_for_each(h, &slab_caches) {
				2249	struct kmem_cache *s =
				2250	container_of(h, struct kmem_cache, list);
				2251
				2252	if (slab_unmergeable(s))
				2253	continue;
				2254
				2255	if (size > s->size)
				2256	continue;
				2257
				2258	if (((flags \| slub_debug) & SLUB_MERGE_SAME) !=
				2259	(s->flags & SLUB_MERGE_SAME))
				2260	continue;
				2261	/*
				2262	* Check if alignment is compatible.
				2263	* Courtesy of Adrian Drzewiecki
				2264	*/
				2265	if ((s->size & ~(align -1)) != s->size)
				2266	continue;
				2267
				2268	if (s->size - size >= sizeof(void *))
				2269	continue;
				2270
				2271	return s;
				2272	}
				2273	return NULL;
				2274	}
				2275
				2276	struct kmem_cache kmem_cache_create(const char name, size_t size,
				2277	size_t align, unsigned long flags,
				2278	void (ctor)(void , struct kmem_cache *, unsigned long),
				2279	void (dtor)(void , struct kmem_cache *, unsigned long))
				2280	{
				2281	struct kmem_cache *s;
				2282
				2283	down_write(&slub_lock);
				2284	s = find_mergeable(size, align, flags, dtor, ctor);
				2285	if (s) {
				2286	s->refcount++;
				2287	/*
				2288	* Adjust the object sizes so that we clear
				2289	* the complete object on kzalloc.
				2290	*/
				2291	s->objsize = max(s->objsize, (int)size);
				2292	s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
				2293	if (sysfs_slab_alias(s, name))
				2294	goto err;
				2295	} else {
				2296	s = kmalloc(kmem_size, GFP_KERNEL);
				2297	if (s && kmem_cache_open(s, GFP_KERNEL, name,
				2298	size, align, flags, ctor, dtor)) {
				2299	if (sysfs_slab_add(s)) {
				2300	kfree(s);
				2301	goto err;
				2302	}
				2303	list_add(&s->list, &slab_caches);
				2304	} else
				2305	kfree(s);
				2306	}
				2307	up_write(&slub_lock);
				2308	return s;
				2309
				2310	err:
				2311	up_write(&slub_lock);
				2312	if (flags & SLAB_PANIC)
				2313	panic("Cannot create slabcache %s\n", name);
				2314	else
				2315	s = NULL;
				2316	return s;
				2317	}
				2318	EXPORT_SYMBOL(kmem_cache_create);
				2319
				2320	void kmem_cache_zalloc(struct kmem_cache s, gfp_t flags)
				2321	{
				2322	void *x;
				2323
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2324	x = slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2325	if (x)
				2326	memset(x, 0, s->objsize);
				2327	return x;
				2328	}
				2329	EXPORT_SYMBOL(kmem_cache_zalloc);
				2330
				2331	#ifdef CONFIG_SMP
				2332	static void for_all_slabs(void (func)(struct kmem_cache , int), int cpu)
				2333	{
				2334	struct list_head *h;
				2335
				2336	down_read(&slub_lock);
				2337	list_for_each(h, &slab_caches) {
				2338	struct kmem_cache *s =
				2339	container_of(h, struct kmem_cache, list);
				2340
				2341	func(s, cpu);
				2342	}
				2343	up_read(&slub_lock);
				2344	}
				2345
				2346	/*
				2347	* Use the cpu notifier to insure that the slab are flushed
				2348	* when necessary.
				2349	*/
				2350	static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
				2351	unsigned long action, void *hcpu)
				2352	{
				2353	long cpu = (long)hcpu;
				2354
				2355	switch (action) {
				2356	case CPU_UP_CANCELED:
				2357	case CPU_DEAD:
				2358	for_all_slabs(__flush_cpu_slab, cpu);
				2359	break;
				2360	default:
				2361	break;
				2362	}
				2363	return NOTIFY_OK;
				2364	}
				2365
				2366	static struct notifier_block __cpuinitdata slab_notifier =
				2367	{ &slab_cpuup_callback, NULL, 0 };
				2368
				2369	#endif
				2370
				2371	/***************************************************************
				2372	* Compatibility definitions
				2373	**************************************************************/
				2374
				2375	int kmem_cache_shrink(struct kmem_cache *s)
				2376	{
				2377	flush_all(s);
				2378	return 0;
				2379	}
				2380	EXPORT_SYMBOL(kmem_cache_shrink);
				2381
				2382	#ifdef CONFIG_NUMA
				2383
				2384	/*****************************************************************
				2385	* Generic reaper used to support the page allocator
				2386	* (the cpu slabs are reaped by a per slab workqueue).
				2387	*
				2388	* Maybe move this to the page allocator?
				2389	****************************************************************/
				2390
				2391	static DEFINE_PER_CPU(unsigned long, reap_node);
				2392
				2393	static void init_reap_node(int cpu)
				2394	{
				2395	int node;
				2396
				2397	node = next_node(cpu_to_node(cpu), node_online_map);
				2398	if (node == MAX_NUMNODES)
				2399	node = first_node(node_online_map);
				2400
				2401	__get_cpu_var(reap_node) = node;
				2402	}
				2403
				2404	static void next_reap_node(void)
				2405	{
				2406	int node = __get_cpu_var(reap_node);
				2407
				2408	/*
				2409	* Also drain per cpu pages on remote zones
				2410	*/
				2411	if (node != numa_node_id())
				2412	drain_node_pages(node);
				2413
				2414	node = next_node(node, node_online_map);
				2415	if (unlikely(node >= MAX_NUMNODES))
				2416	node = first_node(node_online_map);
				2417	__get_cpu_var(reap_node) = node;
				2418	}
				2419	#else
				2420	#define init_reap_node(cpu) do { } while (0)
				2421	#define next_reap_node(void) do { } while (0)
				2422	#endif
				2423
				2424	#define REAPTIMEOUT_CPUC (2*HZ)
				2425
				2426	#ifdef CONFIG_SMP
				2427	static DEFINE_PER_CPU(struct delayed_work, reap_work);
				2428
				2429	static void cache_reap(struct work_struct *unused)
				2430	{
				2431	next_reap_node();
				2432	refresh_cpu_vm_stats(smp_processor_id());
				2433	schedule_delayed_work(&__get_cpu_var(reap_work),
				2434	REAPTIMEOUT_CPUC);
				2435	}
				2436
				2437	static void __devinit start_cpu_timer(int cpu)
				2438	{
				2439	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
				2440
				2441	/*
				2442	* When this gets called from do_initcalls via cpucache_init(),
				2443	* init_workqueues() has already run, so keventd will be setup
				2444	* at that time.
				2445	*/
				2446	if (keventd_up() && reap_work->work.func == NULL) {
				2447	init_reap_node(cpu);
				2448	INIT_DELAYED_WORK(reap_work, cache_reap);
				2449	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
				2450	}
				2451	}
				2452
				2453	static int __init cpucache_init(void)
				2454	{
				2455	int cpu;
				2456
				2457	/*
				2458	* Register the timers that drain pcp pages and update vm statistics
				2459	*/
				2460	for_each_online_cpu(cpu)
				2461	start_cpu_timer(cpu);
				2462	return 0;
				2463	}
				2464	__initcall(cpucache_init);
				2465	#endif
				2466
				2467	#ifdef SLUB_RESILIENCY_TEST
				2468	static unsigned long validate_slab_cache(struct kmem_cache *s);
				2469
				2470	static void resiliency_test(void)
				2471	{
				2472	u8 *p;
				2473
				2474	printk(KERN_ERR "SLUB resiliency testing\n");
				2475	printk(KERN_ERR "-----------------------\n");
				2476	printk(KERN_ERR "A. Corruption after allocation\n");
				2477
				2478	p = kzalloc(16, GFP_KERNEL);
				2479	p[16] = 0x12;
				2480	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
				2481	" 0x12->0x%p\n\n", p + 16);
				2482
				2483	validate_slab_cache(kmalloc_caches + 4);
				2484
				2485	/* Hmmm... The next two are dangerous */
				2486	p = kzalloc(32, GFP_KERNEL);
				2487	p[32 + sizeof(void *)] = 0x34;
				2488	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
				2489	" 0x34 -> -0x%p\n", p);
				2490	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
				2491
				2492	validate_slab_cache(kmalloc_caches + 5);
				2493	p = kzalloc(64, GFP_KERNEL);
				2494	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
				2495	*p = 0x56;
				2496	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
				2497	p);
				2498	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
				2499	validate_slab_cache(kmalloc_caches + 6);
				2500
				2501	printk(KERN_ERR "\nB. Corruption after free\n");
				2502	p = kzalloc(128, GFP_KERNEL);
				2503	kfree(p);
				2504	*p = 0x78;
				2505	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
				2506	validate_slab_cache(kmalloc_caches + 7);
				2507
				2508	p = kzalloc(256, GFP_KERNEL);
				2509	kfree(p);
				2510	p[50] = 0x9a;
				2511	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
				2512	validate_slab_cache(kmalloc_caches + 8);
				2513
				2514	p = kzalloc(512, GFP_KERNEL);
				2515	kfree(p);
				2516	p[512] = 0xab;
				2517	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
				2518	validate_slab_cache(kmalloc_caches + 9);
				2519	}
				2520	#else
				2521	static void resiliency_test(void) {};
				2522	#endif
				2523
				2524	/*
				2525	* These are not as efficient as kmalloc for the non debug case.
				2526	* We do not have the page struct available so we have to touch one
				2527	* cacheline in struct kmem_cache to check slab flags.
				2528	*/
				2529	void __kmalloc_track_caller(size_t size, gfp_t gfpflags, void caller)
				2530	{
				2531	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2532
				2533	if (!s)
				2534	return NULL;
				2535
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2536	return slab_alloc(s, gfpflags, -1, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2537	}
				2538
				2539	void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
				2540	int node, void *caller)
				2541	{
				2542	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2543
				2544	if (!s)
				2545	return NULL;
				2546
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2547	return slab_alloc(s, gfpflags, node, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2548	}
				2549
				2550	#ifdef CONFIG_SYSFS
				2551
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame^]	2552	static int validate_slab(struct kmem_cache s, struct page page)
				2553	{
				2554	void *p;
				2555	void *addr = page_address(page);
				2556	unsigned long map[BITS_TO_LONGS(s->objects)];
				2557
				2558	if (!check_slab(s, page) \|\|
				2559	!on_freelist(s, page, NULL))
				2560	return 0;
				2561
				2562	/* Now we know that a valid freelist exists */
				2563	bitmap_zero(map, s->objects);
				2564
				2565	for(p = page->freelist; p; p = get_freepointer(s, p)) {
				2566	set_bit((p - addr) / s->size, map);
				2567	if (!check_object(s, page, p, 0))
				2568	return 0;
				2569	}
				2570
				2571	for(p = addr; p < addr + s->objects * s->size; p += s->size)
				2572	if (!test_bit((p - addr) / s->size, map))
				2573	if (!check_object(s, page, p, 1))
				2574	return 0;
				2575	return 1;
				2576	}
				2577
				2578	static void validate_slab_slab(struct kmem_cache s, struct page page)
				2579	{
				2580	if (slab_trylock(page)) {
				2581	validate_slab(s, page);
				2582	slab_unlock(page);
				2583	} else
				2584	printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
				2585	s->name, page);
				2586
				2587	if (s->flags & DEBUG_DEFAULT_FLAGS) {
				2588	if (!PageError(page))
				2589	printk(KERN_ERR "SLUB %s: PageError not set "
				2590	"on slab 0x%p\n", s->name, page);
				2591	} else {
				2592	if (PageError(page))
				2593	printk(KERN_ERR "SLUB %s: PageError set on "
				2594	"slab 0x%p\n", s->name, page);
				2595	}
				2596	}
				2597
				2598	static int validate_slab_node(struct kmem_cache s, struct kmem_cache_node n)
				2599	{
				2600	unsigned long count = 0;
				2601	struct page *page;
				2602	unsigned long flags;
				2603
				2604	spin_lock_irqsave(&n->list_lock, flags);
				2605
				2606	list_for_each_entry(page, &n->partial, lru) {
				2607	validate_slab_slab(s, page);
				2608	count++;
				2609	}
				2610	if (count != n->nr_partial)
				2611	printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
				2612	"counter=%ld\n", s->name, count, n->nr_partial);
				2613
				2614	if (!(s->flags & SLAB_STORE_USER))
				2615	goto out;
				2616
				2617	list_for_each_entry(page, &n->full, lru) {
				2618	validate_slab_slab(s, page);
				2619	count++;
				2620	}
				2621	if (count != atomic_long_read(&n->nr_slabs))
				2622	printk(KERN_ERR "SLUB: %s %ld slabs counted but "
				2623	"counter=%ld\n", s->name, count,
				2624	atomic_long_read(&n->nr_slabs));
				2625
				2626	out:
				2627	spin_unlock_irqrestore(&n->list_lock, flags);
				2628	return count;
				2629	}
				2630
				2631	static unsigned long validate_slab_cache(struct kmem_cache *s)
				2632	{
				2633	int node;
				2634	unsigned long count = 0;
				2635
				2636	flush_all(s);
				2637	for_each_online_node(node) {
				2638	struct kmem_cache_node *n = get_node(s, node);
				2639
				2640	count += validate_slab_node(s, n);
				2641	}
				2642	return count;
				2643	}
				2644
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2645	static unsigned long count_partial(struct kmem_cache_node *n)
				2646	{
				2647	unsigned long flags;
				2648	unsigned long x = 0;
				2649	struct page *page;
				2650
				2651	spin_lock_irqsave(&n->list_lock, flags);
				2652	list_for_each_entry(page, &n->partial, lru)
				2653	x += page->inuse;
				2654	spin_unlock_irqrestore(&n->list_lock, flags);
				2655	return x;
				2656	}
				2657
				2658	enum slab_stat_type {
				2659	SL_FULL,
				2660	SL_PARTIAL,
				2661	SL_CPU,
				2662	SL_OBJECTS
				2663	};
				2664
				2665	#define SO_FULL (1 << SL_FULL)
				2666	#define SO_PARTIAL (1 << SL_PARTIAL)
				2667	#define SO_CPU (1 << SL_CPU)
				2668	#define SO_OBJECTS (1 << SL_OBJECTS)
				2669
				2670	static unsigned long slab_objects(struct kmem_cache *s,
				2671	char *buf, unsigned long flags)
				2672	{
				2673	unsigned long total = 0;
				2674	int cpu;
				2675	int node;
				2676	int x;
				2677	unsigned long *nodes;
				2678	unsigned long *per_cpu;
				2679
				2680	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
				2681	per_cpu = nodes + nr_node_ids;
				2682
				2683	for_each_possible_cpu(cpu) {
				2684	struct page *page = s->cpu_slab[cpu];
				2685	int node;
				2686
				2687	if (page) {
				2688	node = page_to_nid(page);
				2689	if (flags & SO_CPU) {
				2690	int x = 0;
				2691
				2692	if (flags & SO_OBJECTS)
				2693	x = page->inuse;
				2694	else
				2695	x = 1;
				2696	total += x;
				2697	nodes[node] += x;
				2698	}
				2699	per_cpu[node]++;
				2700	}
				2701	}
				2702
				2703	for_each_online_node(node) {
				2704	struct kmem_cache_node *n = get_node(s, node);
				2705
				2706	if (flags & SO_PARTIAL) {
				2707	if (flags & SO_OBJECTS)
				2708	x = count_partial(n);
				2709	else
				2710	x = n->nr_partial;
				2711	total += x;
				2712	nodes[node] += x;
				2713	}
				2714
				2715	if (flags & SO_FULL) {
				2716	int full_slabs = atomic_read(&n->nr_slabs)
				2717	- per_cpu[node]
				2718	- n->nr_partial;
				2719
				2720	if (flags & SO_OBJECTS)
				2721	x = full_slabs * s->objects;
				2722	else
				2723	x = full_slabs;
				2724	total += x;
				2725	nodes[node] += x;
				2726	}
				2727	}
				2728
				2729	x = sprintf(buf, "%lu", total);
				2730	#ifdef CONFIG_NUMA
				2731	for_each_online_node(node)
				2732	if (nodes[node])
				2733	x += sprintf(buf + x, " N%d=%lu",
				2734	node, nodes[node]);
				2735	#endif
				2736	kfree(nodes);
				2737	return x + sprintf(buf + x, "\n");
				2738	}
				2739
				2740	static int any_slab_objects(struct kmem_cache *s)
				2741	{
				2742	int node;
				2743	int cpu;
				2744
				2745	for_each_possible_cpu(cpu)
				2746	if (s->cpu_slab[cpu])
				2747	return 1;
				2748
				2749	for_each_node(node) {
				2750	struct kmem_cache_node *n = get_node(s, node);
				2751
				2752	if (n->nr_partial \|\| atomic_read(&n->nr_slabs))
				2753	return 1;
				2754	}
				2755	return 0;
				2756	}
				2757
				2758	#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
				2759	#define to_slab(n) container_of(n, struct kmem_cache, kobj);
				2760
				2761	struct slab_attribute {
				2762	struct attribute attr;
				2763	ssize_t (show)(struct kmem_cache s, char *buf);
				2764	ssize_t (store)(struct kmem_cache s, const char *x, size_t count);
				2765	};
				2766
				2767	#define SLAB_ATTR_RO(_name) \
				2768	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
				2769
				2770	#define SLAB_ATTR(_name) \
				2771	static struct slab_attribute _name##_attr = \
				2772	__ATTR(_name, 0644, _name##_show, _name##_store)
				2773
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2774	static ssize_t slab_size_show(struct kmem_cache s, char buf)
				2775	{
				2776	return sprintf(buf, "%d\n", s->size);
				2777	}
				2778	SLAB_ATTR_RO(slab_size);
				2779
				2780	static ssize_t align_show(struct kmem_cache s, char buf)
				2781	{
				2782	return sprintf(buf, "%d\n", s->align);
				2783	}
				2784	SLAB_ATTR_RO(align);
				2785
				2786	static ssize_t object_size_show(struct kmem_cache s, char buf)
				2787	{
				2788	return sprintf(buf, "%d\n", s->objsize);
				2789	}
				2790	SLAB_ATTR_RO(object_size);
				2791
				2792	static ssize_t objs_per_slab_show(struct kmem_cache s, char buf)
				2793	{
				2794	return sprintf(buf, "%d\n", s->objects);
				2795	}
				2796	SLAB_ATTR_RO(objs_per_slab);
				2797
				2798	static ssize_t order_show(struct kmem_cache s, char buf)
				2799	{
				2800	return sprintf(buf, "%d\n", s->order);
				2801	}
				2802	SLAB_ATTR_RO(order);
				2803
				2804	static ssize_t ctor_show(struct kmem_cache s, char buf)
				2805	{
				2806	if (s->ctor) {
				2807	int n = sprint_symbol(buf, (unsigned long)s->ctor);
				2808
				2809	return n + sprintf(buf + n, "\n");
				2810	}
				2811	return 0;
				2812	}
				2813	SLAB_ATTR_RO(ctor);
				2814
				2815	static ssize_t dtor_show(struct kmem_cache s, char buf)
				2816	{
				2817	if (s->dtor) {
				2818	int n = sprint_symbol(buf, (unsigned long)s->dtor);
				2819
				2820	return n + sprintf(buf + n, "\n");
				2821	}
				2822	return 0;
				2823	}
				2824	SLAB_ATTR_RO(dtor);
				2825
				2826	static ssize_t aliases_show(struct kmem_cache s, char buf)
				2827	{
				2828	return sprintf(buf, "%d\n", s->refcount - 1);
				2829	}
				2830	SLAB_ATTR_RO(aliases);
				2831
				2832	static ssize_t slabs_show(struct kmem_cache s, char buf)
				2833	{
				2834	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU);
				2835	}
				2836	SLAB_ATTR_RO(slabs);
				2837
				2838	static ssize_t partial_show(struct kmem_cache s, char buf)
				2839	{
				2840	return slab_objects(s, buf, SO_PARTIAL);
				2841	}
				2842	SLAB_ATTR_RO(partial);
				2843
				2844	static ssize_t cpu_slabs_show(struct kmem_cache s, char buf)
				2845	{
				2846	return slab_objects(s, buf, SO_CPU);
				2847	}
				2848	SLAB_ATTR_RO(cpu_slabs);
				2849
				2850	static ssize_t objects_show(struct kmem_cache s, char buf)
				2851	{
				2852	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU\|SO_OBJECTS);
				2853	}
				2854	SLAB_ATTR_RO(objects);
				2855
				2856	static ssize_t sanity_checks_show(struct kmem_cache s, char buf)
				2857	{
				2858	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
				2859	}
				2860
				2861	static ssize_t sanity_checks_store(struct kmem_cache *s,
				2862	const char *buf, size_t length)
				2863	{
				2864	s->flags &= ~SLAB_DEBUG_FREE;
				2865	if (buf[0] == '1')
				2866	s->flags \|= SLAB_DEBUG_FREE;
				2867	return length;
				2868	}
				2869	SLAB_ATTR(sanity_checks);
				2870
				2871	static ssize_t trace_show(struct kmem_cache s, char buf)
				2872	{
				2873	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
				2874	}
				2875
				2876	static ssize_t trace_store(struct kmem_cache s, const char buf,
				2877	size_t length)
				2878	{
				2879	s->flags &= ~SLAB_TRACE;
				2880	if (buf[0] == '1')
				2881	s->flags \|= SLAB_TRACE;
				2882	return length;
				2883	}
				2884	SLAB_ATTR(trace);
				2885
				2886	static ssize_t reclaim_account_show(struct kmem_cache s, char buf)
				2887	{
				2888	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
				2889	}
				2890
				2891	static ssize_t reclaim_account_store(struct kmem_cache *s,
				2892	const char *buf, size_t length)
				2893	{
				2894	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
				2895	if (buf[0] == '1')
				2896	s->flags \|= SLAB_RECLAIM_ACCOUNT;
				2897	return length;
				2898	}
				2899	SLAB_ATTR(reclaim_account);
				2900
				2901	static ssize_t hwcache_align_show(struct kmem_cache s, char buf)
				2902	{
				2903	return sprintf(buf, "%d\n", !!(s->flags &
				2904	(SLAB_HWCACHE_ALIGN\|SLAB_MUST_HWCACHE_ALIGN)));
				2905	}
				2906	SLAB_ATTR_RO(hwcache_align);
				2907
				2908	#ifdef CONFIG_ZONE_DMA
				2909	static ssize_t cache_dma_show(struct kmem_cache s, char buf)
				2910	{
				2911	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
				2912	}
				2913	SLAB_ATTR_RO(cache_dma);
				2914	#endif
				2915
				2916	static ssize_t destroy_by_rcu_show(struct kmem_cache s, char buf)
				2917	{
				2918	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
				2919	}
				2920	SLAB_ATTR_RO(destroy_by_rcu);
				2921
				2922	static ssize_t red_zone_show(struct kmem_cache s, char buf)
				2923	{
				2924	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
				2925	}
				2926
				2927	static ssize_t red_zone_store(struct kmem_cache *s,
				2928	const char *buf, size_t length)
				2929	{
				2930	if (any_slab_objects(s))
				2931	return -EBUSY;
				2932
				2933	s->flags &= ~SLAB_RED_ZONE;
				2934	if (buf[0] == '1')
				2935	s->flags \|= SLAB_RED_ZONE;
				2936	calculate_sizes(s);
				2937	return length;
				2938	}
				2939	SLAB_ATTR(red_zone);
				2940
				2941	static ssize_t poison_show(struct kmem_cache s, char buf)
				2942	{
				2943	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
				2944	}
				2945
				2946	static ssize_t poison_store(struct kmem_cache *s,
				2947	const char *buf, size_t length)
				2948	{
				2949	if (any_slab_objects(s))
				2950	return -EBUSY;
				2951
				2952	s->flags &= ~SLAB_POISON;
				2953	if (buf[0] == '1')
				2954	s->flags \|= SLAB_POISON;
				2955	calculate_sizes(s);
				2956	return length;
				2957	}
				2958	SLAB_ATTR(poison);
				2959
				2960	static ssize_t store_user_show(struct kmem_cache s, char buf)
				2961	{
				2962	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
				2963	}
				2964
				2965	static ssize_t store_user_store(struct kmem_cache *s,
				2966	const char *buf, size_t length)
				2967	{
				2968	if (any_slab_objects(s))
				2969	return -EBUSY;
				2970
				2971	s->flags &= ~SLAB_STORE_USER;
				2972	if (buf[0] == '1')
				2973	s->flags \|= SLAB_STORE_USER;
				2974	calculate_sizes(s);
				2975	return length;
				2976	}
				2977	SLAB_ATTR(store_user);
				2978
/*
 * sysfs read handler for "validate".
 *
 * The attribute only acts on writes; reading it produces no output, so
 * buf is left untouched and 0 is returned.
 */
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}
				2983
				2984	static ssize_t validate_store(struct kmem_cache *s,
				2985	const char *buf, size_t length)
				2986	{
				2987	if (buf[0] == '1')
				2988	validate_slab_cache(s);
				2989	else
				2990	return -EINVAL;
				2991	return length;
				2992	}
				2993	SLAB_ATTR(validate);
				2994
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2995	#ifdef CONFIG_NUMA
				2996	static ssize_t defrag_ratio_show(struct kmem_cache s, char buf)
				2997	{
				2998	return sprintf(buf, "%d\n", s->defrag_ratio / 10);
				2999	}
				3000
				3001	static ssize_t defrag_ratio_store(struct kmem_cache *s,
				3002	const char *buf, size_t length)
				3003	{
				3004	int n = simple_strtoul(buf, NULL, 10);
				3005
				3006	if (n < 100)
				3007	s->defrag_ratio = n * 10;
				3008	return length;
				3009	}
				3010	SLAB_ATTR(defrag_ratio);
				3011	#endif
				3012
				3013	static struct attribute * slab_attrs[] = {
				3014	&slab_size_attr.attr,
				3015	&object_size_attr.attr,
				3016	&objs_per_slab_attr.attr,
				3017	&order_attr.attr,
				3018	&objects_attr.attr,
				3019	&slabs_attr.attr,
				3020	&partial_attr.attr,
				3021	&cpu_slabs_attr.attr,
				3022	&ctor_attr.attr,
				3023	&dtor_attr.attr,
				3024	&aliases_attr.attr,
				3025	&align_attr.attr,
				3026	&sanity_checks_attr.attr,
				3027	&trace_attr.attr,
				3028	&hwcache_align_attr.attr,
				3029	&reclaim_account_attr.attr,
				3030	&destroy_by_rcu_attr.attr,
				3031	&red_zone_attr.attr,
				3032	&poison_attr.attr,
				3033	&store_user_attr.attr,
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame^]	3034	&validate_attr.attr,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3035	#ifdef CONFIG_ZONE_DMA
				3036	&cache_dma_attr.attr,
				3037	#endif
				3038	#ifdef CONFIG_NUMA
				3039	&defrag_ratio_attr.attr,
				3040	#endif
				3041	NULL
				3042	};
				3043
				3044	static struct attribute_group slab_attr_group = {
				3045	.attrs = slab_attrs,
				3046	};
				3047
				3048	static ssize_t slab_attr_show(struct kobject *kobj,
				3049	struct attribute *attr,
				3050	char *buf)
				3051	{
				3052	struct slab_attribute *attribute;
				3053	struct kmem_cache *s;
				3054	int err;
				3055
				3056	attribute = to_slab_attr(attr);
				3057	s = to_slab(kobj);
				3058
				3059	if (!attribute->show)
				3060	return -EIO;
				3061
				3062	err = attribute->show(s, buf);
				3063
				3064	return err;
				3065	}
				3066
				3067	static ssize_t slab_attr_store(struct kobject *kobj,
				3068	struct attribute *attr,
				3069	const char *buf, size_t len)
				3070	{
				3071	struct slab_attribute *attribute;
				3072	struct kmem_cache *s;
				3073	int err;
				3074
				3075	attribute = to_slab_attr(attr);
				3076	s = to_slab(kobj);
				3077
				3078	if (!attribute->store)
				3079	return -EIO;
				3080
				3081	err = attribute->store(s, buf, len);
				3082
				3083	return err;
				3084	}
				3085
				3086	static struct sysfs_ops slab_sysfs_ops = {
				3087	.show = slab_attr_show,
				3088	.store = slab_attr_store,
				3089	};
				3090
				3091	static struct kobj_type slab_ktype = {
				3092	.sysfs_ops = &slab_sysfs_ops,
				3093	};
				3094
				3095	static int uevent_filter(struct kset kset, struct kobject kobj)
				3096	{
				3097	struct kobj_type *ktype = get_ktype(kobj);
				3098
				3099	if (ktype == &slab_ktype)
				3100	return 1;
				3101	return 0;
				3102	}
				3103
				3104	static struct kset_uevent_ops slab_uevent_ops = {
				3105	.filter = uevent_filter,
				3106	};
				3107
				3108	decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
				3109
				3110	#define ID_STR_LENGTH 64
				3111
				3112	/* Create a unique string id for a slab cache:
				3113	* format
				3114	* :[flags-]size:[memory address of kmemcache]
				3115	*/
				3116	static char create_unique_id(struct kmem_cache s)
				3117	{
				3118	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
				3119	char *p = name;
				3120
				3121	BUG_ON(!name);
				3122
				3123	*p++ = ':';
				3124	/*
				3125	* First flags affecting slabcache operations. We will only
				3126	* get here for aliasable slabs so we do not need to support
				3127	* too many flags. The flags here must cover all flags that
				3128	* are matched during merging to guarantee that the id is
				3129	* unique.
				3130	*/
				3131	if (s->flags & SLAB_CACHE_DMA)
				3132	*p++ = 'd';
				3133	if (s->flags & SLAB_RECLAIM_ACCOUNT)
				3134	*p++ = 'a';
				3135	if (s->flags & SLAB_DEBUG_FREE)
				3136	*p++ = 'F';
				3137	if (p != name + 1)
				3138	*p++ = '-';
				3139	p += sprintf(p, "%07d", s->size);
				3140	BUG_ON(p > name + ID_STR_LENGTH - 1);
				3141	return name;
				3142	}
				3143
				3144	static int sysfs_slab_add(struct kmem_cache *s)
				3145	{
				3146	int err;
				3147	const char *name;
				3148	int unmergeable;
				3149
				3150	if (slab_state < SYSFS)
				3151	/* Defer until later */
				3152	return 0;
				3153
				3154	unmergeable = slab_unmergeable(s);
				3155	if (unmergeable) {
				3156	/*
				3157	* Slabcache can never be merged so we can use the name proper.
				3158	* This is typically the case for debug situations. In that
				3159	* case we can catch duplicate names easily.
				3160	*/
				3161	sysfs_remove_link(&slab_subsys.kset.kobj, s->name);
				3162	name = s->name;
				3163	} else {
				3164	/*
				3165	* Create a unique name for the slab as a target
				3166	* for the symlinks.
				3167	*/
				3168	name = create_unique_id(s);
				3169	}
				3170
				3171	kobj_set_kset_s(s, slab_subsys);
				3172	kobject_set_name(&s->kobj, name);
				3173	kobject_init(&s->kobj);
				3174	err = kobject_add(&s->kobj);
				3175	if (err)
				3176	return err;
				3177
				3178	err = sysfs_create_group(&s->kobj, &slab_attr_group);
				3179	if (err)
				3180	return err;
				3181	kobject_uevent(&s->kobj, KOBJ_ADD);
				3182	if (!unmergeable) {
				3183	/* Setup first alias */
				3184	sysfs_slab_alias(s, s->name);
				3185	kfree(name);
				3186	}
				3187	return 0;
				3188	}
				3189
				3190	static void sysfs_slab_remove(struct kmem_cache *s)
				3191	{
				3192	kobject_uevent(&s->kobj, KOBJ_REMOVE);
				3193	kobject_del(&s->kobj);
				3194	}
				3195
				/*
				 * Need to buffer aliases during bootup until sysfs becomes
				 * available lest we lose that information.
				 */
				struct saved_alias {
					struct kmem_cache *s;		/* cache the alias points to */
					const char *name;		/* alias name (pointer is not copied) */
					struct saved_alias *next;	/* next buffered alias, or NULL */
				};

				/* Head of the singly linked list of buffered aliases; file-local. */
				static struct saved_alias *alias_list;
				3207
				3208	static int sysfs_slab_alias(struct kmem_cache s, const char name)
				3209	{
				3210	struct saved_alias *al;
				3211
				3212	if (slab_state == SYSFS) {
				3213	/*
				3214	* If we have a leftover link then remove it.
				3215	*/
				3216	sysfs_remove_link(&slab_subsys.kset.kobj, name);
				3217	return sysfs_create_link(&slab_subsys.kset.kobj,
				3218	&s->kobj, name);
				3219	}
				3220
				3221	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
				3222	if (!al)
				3223	return -ENOMEM;
				3224
				3225	al->s = s;
				3226	al->name = name;
				3227	al->next = alias_list;
				3228	alias_list = al;
				3229	return 0;
				3230	}
				3231
				3232	static int __init slab_sysfs_init(void)
				3233	{
				3234	int err;
				3235
				3236	err = subsystem_register(&slab_subsys);
				3237	if (err) {
				3238	printk(KERN_ERR "Cannot register slab subsystem.\n");
				3239	return -ENOSYS;
				3240	}
				3241
				3242	finish_bootstrap();
				3243
				3244	while (alias_list) {
				3245	struct saved_alias *al = alias_list;
				3246
				3247	alias_list = alias_list->next;
				3248	err = sysfs_slab_alias(al->s, al->name);
				3249	BUG_ON(err);
				3250	kfree(al);
				3251	}
				3252
				3253	resiliency_test();
				3254	return 0;
				3255	}
				3256
				3257	__initcall(slab_sysfs_init);
				3258	#else
				3259	__initcall(finish_bootstrap);
				3260	#endif