1/*
2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists.
4 *
5 * The allocator synchronizes using per slab locks and only
6 * uses a centralized lock to manage a pool of partial slabs.
7 *
8 * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
9 */
10
11#include <linux/mm.h>
12#include <linux/module.h>
13#include <linux/bit_spinlock.h>
14#include <linux/interrupt.h>
15#include <linux/bitops.h>
16#include <linux/slab.h>
17#include <linux/seq_file.h>
18#include <linux/cpu.h>
19#include <linux/cpuset.h>
20#include <linux/mempolicy.h>
21#include <linux/ctype.h>
22#include <linux/kallsyms.h>
23
24/*
25 * Lock order:
26 * 1. slab_lock(page)
27 * 2. slab->list_lock
28 *
29 * The slab_lock protects operations on the objects of a particular
30 * slab and its metadata in the page struct. If the slab lock
31 * has been taken then no allocations nor frees can be performed
32 * on the objects in the slab nor can the slab be added or removed
33 * from the partial or full lists since this would mean modifying
34 * the page struct of the slab.
35 *
36 * The list_lock protects the partial and full list on each node and
37 * the partial slab counter. If taken then no new slabs may be added or
38 * removed from the lists nor can the number of partial slabs be modified.
39 * (Note that the total number of slabs is an atomic value that may be
40 * modified without taking the list lock).
41 *
42 * The list_lock is a centralized lock and thus we avoid taking it as
43 * much as possible. As long as SLUB does not have to handle partial
44 * slabs, operations can continue without any centralized lock. F.e.
45 * allocating a long series of objects that fill up slabs does not require
46 * the list lock.
47 *
48 * The lock order is sometimes inverted when we are trying to get a slab
49 * off a list. We take the list_lock and then look for a page on the list
50 * to use. While we do that objects in the slabs may be freed. We can
51 * only operate on the slab if we have also taken the slab_lock. So we use
52 * a slab_trylock() on the slab. If trylock was successful then no frees
53 * can occur anymore and we can use the slab for allocations etc. If the
54 * slab_trylock() does not succeed then frees are in progress in the slab and
55 * we must stay away from it for a while since we may cause a bouncing
56 * cacheline if we try to acquire the lock. So go onto the next slab.
57 * If all pages are busy then we may allocate a new slab instead of reusing
58 * a partial slab. A new slab has no one operating on it and thus there is
59 * no danger of cacheline contention.
60 *
61 * Interrupts are disabled during allocation and deallocation in order to
62 * make the slab allocator safe to use in the context of an irq. In addition
63 * interrupts are disabled to ensure that the processor does not change
64 * while handling per_cpu slabs, due to kernel preemption.
65 *
66 * SLUB assigns one slab for allocation to each processor.
67 * Allocations only occur from these slabs called cpu slabs.
68 *
69 * Slabs with free elements are kept on a partial list.
70 * There is no list for full slabs. If an object in a full slab is
71 * freed then the slab will show up again on the partial lists.
72 * Otherwise there is no need to track full slabs unless we have to
73 * track full slabs for debugging purposes.
74 *
75 * Slabs are freed when they become empty. Teardown and setup is
76 * minimal so we rely on the page allocators per cpu caches for
77 * fast frees and allocs.
78 *
79 * Overloading of page flags that are otherwise used for LRU management.
80 *
81 * PageActive The slab is used as a cpu cache. Allocations
82 * may be performed from the slab. The slab is not
83 * on any slab list and cannot be moved onto one.
84 *
85 * PageError Slab requires special handling due to debug
86 * options set. This moves slab handling out of
87 * the fast path.
88 */
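/*
 * Illustrative example of the inverted lock order described above:
 * a free takes slab_lock(page) first and only takes list_lock
 * afterwards if the slab has to move between lists. The partial list
 * scan works the other way around: it already holds list_lock and
 * therefore may only slab_trylock() each candidate page, skipping
 * the page if the trylock fails.
 */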
89
90/*
91 * Issues still to be resolved:
92 *
93 * - The per cpu array is updated for each new slab and is a remote
94 * cacheline for most nodes. This could become a bouncing cacheline given
95 * frequent enough updates. There are 16 pointers in a cacheline, so at
96 * most 16 cpus could compete. Likely okay.
97 *
98 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
99 *
100 * - SLAB_DEBUG_INITIAL is not supported but I have never seen a use of
101 * it.
102 *
103 * - Variable sizing of the per node arrays
104 */
105
106/* Enable to test recovery from slab corruption on boot */
107#undef SLUB_RESILIENCY_TEST
108
109#if PAGE_SHIFT <= 12
110
111/*
112 * Small page size. Make sure that we do not fragment memory
113 */
114#define DEFAULT_MAX_ORDER 1
115#define DEFAULT_MIN_OBJECTS 4
116
117#else
118
119/*
120 * Large page machines are customarily able to handle larger
121 * page orders.
122 */
123#define DEFAULT_MAX_ORDER 2
124#define DEFAULT_MIN_OBJECTS 8
125
126#endif
127
128/*
129 * Flags from the regular SLAB that SLUB does not support:
130 */
131#define SLUB_UNIMPLEMENTED (SLAB_DEBUG_INITIAL)
132
133/*
134 * Minimum number of partial slabs. These will be left on the partial
135 * lists even if they are empty. kmem_cache_shrink may reclaim them.
136 */
137#define MIN_PARTIAL 2
138
139/*
140 * Maximum number of desirable partial slabs.
141 * The existence of more partial slabs makes kmem_cache_shrink
142 * sort the partial list by the number of objects in them.
143 */
144#define MAX_PARTIAL 10
145
146#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
147 SLAB_POISON | SLAB_STORE_USER)
148/*
149 * Set of flags that will prevent slab merging
150 */
151#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
152 SLAB_TRACE | SLAB_DESTROY_BY_RCU)
153
154#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
155 SLAB_CACHE_DMA)
156
157#ifndef ARCH_KMALLOC_MINALIGN
158#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
159#endif
160
161#ifndef ARCH_SLAB_MINALIGN
162#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
163#endif
164
165/* Internal SLUB flags */
166#define __OBJECT_POISON 0x80000000 /* Poison object */
167
168static int kmem_size = sizeof(struct kmem_cache);
169
170#ifdef CONFIG_SMP
171static struct notifier_block slab_notifier;
172#endif
173
174static enum {
175 DOWN, /* No slab functionality available */
176 PARTIAL, /* kmem_cache_open() works but kmalloc does not */
177 UP, /* Everything works */
178 SYSFS /* Sysfs up */
179} slab_state = DOWN;
180
181/* A list of all slab caches on the system */
182static DECLARE_RWSEM(slub_lock);
183LIST_HEAD(slab_caches);
184
185#ifdef CONFIG_SYSFS
186static int sysfs_slab_add(struct kmem_cache *);
187static int sysfs_slab_alias(struct kmem_cache *, const char *);
188static void sysfs_slab_remove(struct kmem_cache *);
189#else
190static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
191static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; }
192static void sysfs_slab_remove(struct kmem_cache *s) {}
193#endif
194
195/********************************************************************
196 * Core slab cache functions
197 *******************************************************************/
198
199int slab_is_available(void)
200{
201 return slab_state >= UP;
202}
203
204static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
205{
206#ifdef CONFIG_NUMA
207 return s->node[node];
208#else
209 return &s->local_node;
210#endif
211}
212
213/*
214 * Object debugging
215 */
216static void print_section(char *text, u8 *addr, unsigned int length)
217{
218 int i, offset;
219 int newline = 1;
220 char ascii[17];
221
222 ascii[16] = 0;
223
224 for (i = 0; i < length; i++) {
225 if (newline) {
226 printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
227 newline = 0;
228 }
229 printk(" %02x", addr[i]);
230 offset = i % 16;
231 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
232 if (offset == 15) {
233 printk(" %s\n",ascii);
234 newline = 1;
235 }
236 }
237 if (!newline) {
238 i %= 16;
239 while (i < 16) {
240 printk(" ");
241 ascii[i] = ' ';
242 i++;
243 }
244 printk(" %s\n", ascii);
245 }
246}
247
248/*
249 * Slow version of get and set free pointer.
250 *
251 * This requires touching the cache lines of kmem_cache.
252 * The offset can also be obtained from the page. In that
253 * case it is in the cacheline that we already need to touch.
254 */
255static void *get_freepointer(struct kmem_cache *s, void *object)
256{
257 return *(void **)(object + s->offset);
258}
259
260static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
261{
262 *(void **)(object + s->offset) = fp;
263}
264
265/*
266 * Tracking user of a slab.
267 */
268struct track {
269 void *addr; /* Called from address */
270 int cpu; /* Was running on cpu */
271 int pid; /* Pid context */
272 unsigned long when; /* When did the operation occur */
273};
274
275enum track_item { TRACK_ALLOC, TRACK_FREE };
276
277static struct track *get_track(struct kmem_cache *s, void *object,
278 enum track_item alloc)
279{
280 struct track *p;
281
282 if (s->offset)
283 p = object + s->offset + sizeof(void *);
284 else
285 p = object + s->inuse;
286
287 return p + alloc;
288}
289
290static void set_track(struct kmem_cache *s, void *object,
291 enum track_item alloc, void *addr)
292{
293 struct track *p;
294
295 if (s->offset)
296 p = object + s->offset + sizeof(void *);
297 else
298 p = object + s->inuse;
299
300 p += alloc;
301 if (addr) {
302 p->addr = addr;
303 p->cpu = smp_processor_id();
304 p->pid = current ? current->pid : -1;
305 p->when = jiffies;
306 } else
307 memset(p, 0, sizeof(struct track));
308}
309
310static void init_tracking(struct kmem_cache *s, void *object)
311{
312 if (s->flags & SLAB_STORE_USER) {
313 set_track(s, object, TRACK_FREE, NULL);
314 set_track(s, object, TRACK_ALLOC, NULL);
315 }
316}
317
318static void print_track(const char *s, struct track *t)
319{
320 if (!t->addr)
321 return;
322
323 printk(KERN_ERR "%s: ", s);
324 __print_symbol("%s", (unsigned long)t->addr);
325 printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
326}
327
328static void print_trailer(struct kmem_cache *s, u8 *p)
329{
330 unsigned int off; /* Offset of last byte */
331
332 if (s->flags & SLAB_RED_ZONE)
333 print_section("Redzone", p + s->objsize,
334 s->inuse - s->objsize);
335
336 printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
337 p + s->offset,
338 get_freepointer(s, p));
339
340 if (s->offset)
341 off = s->offset + sizeof(void *);
342 else
343 off = s->inuse;
344
345 if (s->flags & SLAB_STORE_USER) {
346 print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
347 print_track("Last free ", get_track(s, p, TRACK_FREE));
348 off += 2 * sizeof(struct track);
349 }
350
351 if (off != s->size)
352 /* Beginning of the filler is the free pointer */
353 print_section("Filler", p + off, s->size - off);
354}
355
356static void object_err(struct kmem_cache *s, struct page *page,
357 u8 *object, char *reason)
358{
359 u8 *addr = page_address(page);
360
361 printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
362 s->name, reason, object, page);
363 printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
364 object - addr, page->flags, page->inuse, page->freelist);
365 if (object > addr + 16)
366 print_section("Bytes b4", object - 16, 16);
367 print_section("Object", object, min(s->objsize, 128));
368 print_trailer(s, object);
369 dump_stack();
370}
371
372static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...)
373{
374 va_list args;
375 char buf[100];
376
377 va_start(args, reason);
378 vsnprintf(buf, sizeof(buf), reason, args);
379 va_end(args);
380 printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
381 page);
382 dump_stack();
383}
384
385static void init_object(struct kmem_cache *s, void *object, int active)
386{
387 u8 *p = object;
388
389 if (s->flags & __OBJECT_POISON) {
390 memset(p, POISON_FREE, s->objsize - 1);
391 p[s->objsize -1] = POISON_END;
392 }
393
394 if (s->flags & SLAB_RED_ZONE)
395 memset(p + s->objsize,
396 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
397 s->inuse - s->objsize);
398}
399
400static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
401{
402 while (bytes) {
403 if (*start != (u8)value)
404 return 0;
405 start++;
406 bytes--;
407 }
408 return 1;
409}
410
411
412static int check_valid_pointer(struct kmem_cache *s, struct page *page,
413 void *object)
414{
415 void *base;
416
417 if (!object)
418 return 1;
419
420 base = page_address(page);
421 if (object < base || object >= base + s->objects * s->size ||
422 (object - base) % s->size) {
423 return 0;
424 }
425
426 return 1;
427}
428
429/*
430 * Object layout:
431 *
432 * object address
433 * Bytes of the object to be managed.
434 * If the freepointer may overlay the object then the free
435 * pointer is the first word of the object.
436 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
437 * 0xa5 (POISON_END)
438 *
439 * object + s->objsize
440 * Padding to reach word boundary. This is also used for Redzoning.
441 * Padding is extended to word size if Redzoning is enabled
442 * and objsize == inuse.
443 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
444 * 0xcc (RED_ACTIVE) for objects in use.
445 *
446 * object + s->inuse
447 * A. Free pointer (if we cannot overwrite object on free)
448 * B. Tracking data for SLAB_STORE_USER
449 * C. Padding to reach required alignment boundary
450 * Padding is done using 0x5a (POISON_INUSE)
451 *
452 * object + s->size
453 *
454 * If slabcaches are merged then the objsize and inuse boundaries are to
455 * be ignored. And therefore no slab options that rely on these boundaries
456 * may be used with merged slabcaches.
457 */
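/*
 * Worked example of the layout above (an illustrative sketch only,
 * assuming a 64 bit machine where struct track is 24 bytes): a cache
 * with objsize == 20 and SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER
 * ends up with inuse == 24 and roughly this per object layout:
 *
 * bytes  0..19  object payload (0x6b poison, last byte 0xa5, when free)
 * bytes 20..23  redzone fill up to the word boundary (0xbb or 0xcc)
 * bytes 24..31  free pointer, relocated behind the object (s->offset == 24)
 * bytes 32..79  two struct track entries (alloc and free)
 * bytes 80..87  extra debug padding filled with 0x5a, giving s->size == 88
 */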
458
459static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
460 void *from, void *to)
461{
462 printk(KERN_ERR "@@@ SLUB: %s Restoring %s (0x%x) from 0x%p-0x%p\n",
463 s->name, message, data, from, to - 1);
464 memset(from, data, to - from);
465}
466
467static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
468{
469 unsigned long off = s->inuse; /* The end of info */
470
471 if (s->offset)
472 /* Freepointer is placed after the object. */
473 off += sizeof(void *);
474
475 if (s->flags & SLAB_STORE_USER)
476 /* We also have user information there */
477 off += 2 * sizeof(struct track);
478
479 if (s->size == off)
480 return 1;
481
482 if (check_bytes(p + off, POISON_INUSE, s->size - off))
483 return 1;
484
485 object_err(s, page, p, "Object padding check fails");
486
487 /*
488 * Restore padding
489 */
490 restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
491 return 0;
492}
493
494static int slab_pad_check(struct kmem_cache *s, struct page *page)
495{
496 u8 *p;
497 int length, remainder;
498
499 if (!(s->flags & SLAB_POISON))
500 return 1;
501
502 p = page_address(page);
503 length = s->objects * s->size;
504 remainder = (PAGE_SIZE << s->order) - length;
505 if (!remainder)
506 return 1;
507
508 if (!check_bytes(p + length, POISON_INUSE, remainder)) {
509 printk(KERN_ERR "SLUB: %s slab 0x%p: Padding fails check\n",
510 s->name, p);
511 dump_stack();
512 restore_bytes(s, "slab padding", POISON_INUSE, p + length,
513 p + length + remainder);
514 return 0;
515 }
516 return 1;
517}
518
519static int check_object(struct kmem_cache *s, struct page *page,
520 void *object, int active)
521{
522 u8 *p = object;
523 u8 *endobject = object + s->objsize;
524
525 if (s->flags & SLAB_RED_ZONE) {
526 unsigned int red =
527 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
528
529 if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
530 object_err(s, page, object,
531 active ? "Redzone Active" : "Redzone Inactive");
532 restore_bytes(s, "redzone", red,
533 endobject, object + s->inuse);
534 return 0;
535 }
536 } else {
537 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
538 !check_bytes(endobject, POISON_INUSE,
539 s->inuse - s->objsize)) {
540 object_err(s, page, p, "Alignment padding check fails");
541 /*
542 * Fix it so that there will not be another report.
543 *
544 * Hmmm... We may be corrupting an object that now expects
545 * to be longer than allowed.
546 */
547 restore_bytes(s, "alignment padding", POISON_INUSE,
548 endobject, object + s->inuse);
549 }
550 }
551
552 if (s->flags & SLAB_POISON) {
553 if (!active && (s->flags & __OBJECT_POISON) &&
554 (!check_bytes(p, POISON_FREE, s->objsize - 1) ||
555 p[s->objsize - 1] != POISON_END)) {
556
557 object_err(s, page, p, "Poison check failed");
558 restore_bytes(s, "Poison", POISON_FREE,
559 p, p + s->objsize -1);
560 restore_bytes(s, "Poison", POISON_END,
561 p + s->objsize - 1, p + s->objsize);
562 return 0;
563 }
564 /*
565 * check_pad_bytes cleans up on its own.
566 */
567 check_pad_bytes(s, page, p);
568 }
569
570 if (!s->offset && active)
571 /*
572 * Object and freepointer overlap. Cannot check
573 * freepointer while object is allocated.
574 */
575 return 1;
576
577 /* Check free pointer validity */
578 if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
579 object_err(s, page, p, "Freepointer corrupt");
580 /*
581 * No choice but to zap it and thus lose the remainder
582 * of the free objects in this slab. May cause
583 * another error because the object count may be
584 * wrong now.
585 */
586 set_freepointer(s, p, NULL);
587 return 0;
588 }
589 return 1;
590}
591
592static int check_slab(struct kmem_cache *s, struct page *page)
593{
594 VM_BUG_ON(!irqs_disabled());
595
596 if (!PageSlab(page)) {
597 printk(KERN_ERR "SLUB: %s Not a valid slab page @0x%p "
598 "flags=%lx mapping=0x%p count=%d \n",
599 s->name, page, page->flags, page->mapping,
600 page_count(page));
601 return 0;
602 }
603 if (page->offset * sizeof(void *) != s->offset) {
604 printk(KERN_ERR "SLUB: %s Corrupted offset %lu in slab @0x%p"
605 " flags=0x%lx mapping=0x%p count=%d\n",
606 s->name,
607 (unsigned long)(page->offset * sizeof(void *)),
608 page,
609 page->flags,
610 page->mapping,
611 page_count(page));
612 dump_stack();
613 return 0;
614 }
615 if (page->inuse > s->objects) {
616 printk(KERN_ERR "SLUB: %s Inuse %u > max %u in slab "
617 "page @0x%p flags=%lx mapping=0x%p count=%d\n",
618 s->name, page->inuse, s->objects, page, page->flags,
619 page->mapping, page_count(page));
620 dump_stack();
621 return 0;
622 }
623 /* Slab_pad_check fixes things up after itself */
624 slab_pad_check(s, page);
625 return 1;
626}
627
628/*
629 * Determine if a certain object on a page is on the freelist and
630 * therefore free. Must hold the slab lock for cpu slabs to
631 * guarantee that the chains are consistent.
632 */
633static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
634{
635 int nr = 0;
636 void *fp = page->freelist;
637 void *object = NULL;
638
639 while (fp && nr <= s->objects) {
640 if (fp == search)
641 return 1;
642 if (!check_valid_pointer(s, page, fp)) {
643 if (object) {
644 object_err(s, page, object,
645 "Freechain corrupt");
646 set_freepointer(s, object, NULL);
647 break;
648 } else {
649 printk(KERN_ERR "SLUB: %s slab 0x%p "
650 "freepointer 0x%p corrupted.\n",
651 s->name, page, fp);
652 dump_stack();
653 page->freelist = NULL;
654 page->inuse = s->objects;
655 return 0;
656 }
657 break;
658 }
659 object = fp;
660 fp = get_freepointer(s, object);
661 nr++;
662 }
663
664 if (page->inuse != s->objects - nr) {
665 printk(KERN_ERR "slab %s: page 0x%p wrong object count."
666 " counter is %d but counted were %d\n",
667 s->name, page, page->inuse,
668 s->objects - nr);
669 page->inuse = s->objects - nr;
670 }
671 return search == NULL;
672}
673
674/*
675 * Tracking of fully allocated slabs for debugging
676 */
677static void add_full(struct kmem_cache_node *n, struct page *page)
678{
679 spin_lock(&n->list_lock);
680 list_add(&page->lru, &n->full);
681 spin_unlock(&n->list_lock);
682}
683
684static void remove_full(struct kmem_cache *s, struct page *page)
685{
686 struct kmem_cache_node *n;
687
688 if (!(s->flags & SLAB_STORE_USER))
689 return;
690
691 n = get_node(s, page_to_nid(page));
692
693 spin_lock(&n->list_lock);
694 list_del(&page->lru);
695 spin_unlock(&n->list_lock);
696}
697
698static int alloc_object_checks(struct kmem_cache *s, struct page *page,
699 void *object)
700{
701 if (!check_slab(s, page))
702 goto bad;
703
704 if (object && !on_freelist(s, page, object)) {
705 printk(KERN_ERR "SLUB: %s Object 0x%p@0x%p "
706 "already allocated.\n",
707 s->name, object, page);
708 goto dump;
709 }
710
711 if (!check_valid_pointer(s, page, object)) {
712 object_err(s, page, object, "Freelist Pointer check fails");
713 goto dump;
714 }
715
716 if (!object)
717 return 1;
718
719 if (!check_object(s, page, object, 0))
720 goto bad;
721 init_object(s, object, 1);
722
723 if (s->flags & SLAB_TRACE) {
724 printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
725 s->name, object, page->inuse,
726 page->freelist);
727 dump_stack();
728 }
729 return 1;
730dump:
731 dump_stack();
732bad:
733 if (PageSlab(page)) {
734 /*
735 * If this is a slab page then let's do the best we can
736 * to avoid issues in the future. Marking all objects
737 * as used avoids touching the remainder.
738 */
739 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
740 s->name, page);
741 page->inuse = s->objects;
742 page->freelist = NULL;
743 /* Fix up fields that may be corrupted */
744 page->offset = s->offset / sizeof(void *);
745 }
746 return 0;
747}
748
749static int free_object_checks(struct kmem_cache *s, struct page *page,
750 void *object)
751{
752 if (!check_slab(s, page))
753 goto fail;
754
755 if (!check_valid_pointer(s, page, object)) {
756 printk(KERN_ERR "SLUB: %s slab 0x%p invalid "
757 "object pointer 0x%p\n",
758 s->name, page, object);
759 goto fail;
760 }
761
762 if (on_freelist(s, page, object)) {
763 printk(KERN_ERR "SLUB: %s slab 0x%p object "
764 "0x%p already free.\n", s->name, page, object);
765 goto fail;
766 }
767
768 if (!check_object(s, page, object, 1))
769 return 0;
770
771 if (unlikely(s != page->slab)) {
772 if (!PageSlab(page))
773 printk(KERN_ERR "slab_free %s size %d: attempt to"
774 "free object(0x%p) outside of slab.\n",
775 s->name, s->size, object);
776 else
777 if (!page->slab)
778 printk(KERN_ERR
779 "slab_free : no slab(NULL) for object 0x%p.\n",
780 object);
781 else
782 printk(KERN_ERR "slab_free %s(%d): object at 0x%p"
783 " belongs to slab %s(%d)\n",
784 s->name, s->size, object,
785 page->slab->name, page->slab->size);
786 goto fail;
787 }
788 if (s->flags & SLAB_TRACE) {
789 printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
790 s->name, object, page->inuse,
791 page->freelist);
792 print_section("Object", object, s->objsize);
793 dump_stack();
794 }
795 init_object(s, object, 0);
796 return 1;
797fail:
798 dump_stack();
799 printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
800 s->name, page, object);
801 return 0;
802}
803
804/*
805 * Slab allocation and freeing
806 */
807static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
808{
809 struct page * page;
810 int pages = 1 << s->order;
811
812 if (s->order)
813 flags |= __GFP_COMP;
814
815 if (s->flags & SLAB_CACHE_DMA)
816 flags |= SLUB_DMA;
817
818 if (node == -1)
819 page = alloc_pages(flags, s->order);
820 else
821 page = alloc_pages_node(node, flags, s->order);
822
823 if (!page)
824 return NULL;
825
826 mod_zone_page_state(page_zone(page),
827 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
828 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
829 pages);
830
831 return page;
832}
833
834static void setup_object(struct kmem_cache *s, struct page *page,
835 void *object)
836{
837 if (PageError(page)) {
838 init_object(s, object, 0);
839 init_tracking(s, object);
840 }
841
842 if (unlikely(s->ctor)) {
843 int mode = SLAB_CTOR_CONSTRUCTOR;
844
845 if (!(s->flags & __GFP_WAIT))
846 mode |= SLAB_CTOR_ATOMIC;
847
848 s->ctor(object, s, mode);
849 }
850}
851
852static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
853{
854 struct page *page;
855 struct kmem_cache_node *n;
856 void *start;
857 void *end;
858 void *last;
859 void *p;
860
861 if (flags & __GFP_NO_GROW)
862 return NULL;
863
864 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK));
865
866 if (flags & __GFP_WAIT)
867 local_irq_enable();
868
869 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
870 if (!page)
871 goto out;
872
873 n = get_node(s, page_to_nid(page));
874 if (n)
875 atomic_long_inc(&n->nr_slabs);
876 page->offset = s->offset / sizeof(void *);
877 page->slab = s;
878 page->flags |= 1 << PG_slab;
879 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
880 SLAB_STORE_USER | SLAB_TRACE))
881 page->flags |= 1 << PG_error;
882
883 start = page_address(page);
884 end = start + s->objects * s->size;
885
886 if (unlikely(s->flags & SLAB_POISON))
887 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
888
889 last = start;
890 for (p = start + s->size; p < end; p += s->size) {
891 setup_object(s, page, last);
892 set_freepointer(s, last, p);
893 last = p;
894 }
895 setup_object(s, page, last);
896 set_freepointer(s, last, NULL);
897
898 page->freelist = start;
899 page->inuse = 0;
900out:
901 if (flags & __GFP_WAIT)
902 local_irq_disable();
903 return page;
904}
905
906static void __free_slab(struct kmem_cache *s, struct page *page)
907{
908 int pages = 1 << s->order;
909
910 if (unlikely(PageError(page) || s->dtor)) {
911 void *start = page_address(page);
912 void *end = start + (pages << PAGE_SHIFT);
913 void *p;
914
915 slab_pad_check(s, page);
916 for (p = start; p <= end - s->size; p += s->size) {
917 if (s->dtor)
918 s->dtor(p, s, 0);
919 check_object(s, page, p, 0);
920 }
921 }
922
923 mod_zone_page_state(page_zone(page),
924 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
925 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
926 - pages);
927
928 page->mapping = NULL;
929 __free_pages(page, s->order);
930}
931
932static void rcu_free_slab(struct rcu_head *h)
933{
934 struct page *page;
935
936 page = container_of((struct list_head *)h, struct page, lru);
937 __free_slab(page->slab, page);
938}
939
940static void free_slab(struct kmem_cache *s, struct page *page)
941{
942 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
943 /*
944 * RCU free overloads the RCU head over the LRU
945 */
946 struct rcu_head *head = (void *)&page->lru;
947
948 call_rcu(head, rcu_free_slab);
949 } else
950 __free_slab(s, page);
951}
952
953static void discard_slab(struct kmem_cache *s, struct page *page)
954{
955 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
956
957 atomic_long_dec(&n->nr_slabs);
958 reset_page_mapcount(page);
959 page->flags &= ~(1 << PG_slab | 1 << PG_error);
960 free_slab(s, page);
961}
962
963/*
964 * Per slab locking using the pagelock
965 */
966static __always_inline void slab_lock(struct page *page)
967{
968 bit_spin_lock(PG_locked, &page->flags);
969}
970
971static __always_inline void slab_unlock(struct page *page)
972{
973 bit_spin_unlock(PG_locked, &page->flags);
974}
975
976static __always_inline int slab_trylock(struct page *page)
977{
978 int rc = 1;
979
980 rc = bit_spin_trylock(PG_locked, &page->flags);
981 return rc;
982}
983
984/*
985 * Management of partially allocated slabs
986 */
987static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
988{
989 spin_lock(&n->list_lock);
990 n->nr_partial++;
991 list_add_tail(&page->lru, &n->partial);
992 spin_unlock(&n->list_lock);
993}
994
995static void add_partial(struct kmem_cache_node *n, struct page *page)
996{
997 spin_lock(&n->list_lock);
998 n->nr_partial++;
999 list_add(&page->lru, &n->partial);
1000 spin_unlock(&n->list_lock);
1001}
1002
1003static void remove_partial(struct kmem_cache *s,
1004 struct page *page)
1005{
1006 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1007
1008 spin_lock(&n->list_lock);
1009 list_del(&page->lru);
1010 n->nr_partial--;
1011 spin_unlock(&n->list_lock);
1012}
1013
1014/*
1015 * Lock page and remove it from the partial list
1016 *
1017 * Must hold list_lock
1018 */
1019static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page)
1020{
1021 if (slab_trylock(page)) {
1022 list_del(&page->lru);
1023 n->nr_partial--;
1024 return 1;
1025 }
1026 return 0;
1027}
1028
1029/*
1030 * Try to get a partial slab from a specific node
1031 */
1032static struct page *get_partial_node(struct kmem_cache_node *n)
1033{
1034 struct page *page;
1035
1036 /*
1037 * Racy check. If we mistakenly see no partial slabs then we
1038 * just allocate an empty slab. If we mistakenly try to get a
1039 * partial slab then get_partial_node() will return NULL.
1040 */
1041 if (!n || !n->nr_partial)
1042 return NULL;
1043
1044 spin_lock(&n->list_lock);
1045 list_for_each_entry(page, &n->partial, lru)
1046 if (lock_and_del_slab(n, page))
1047 goto out;
1048 page = NULL;
1049out:
1050 spin_unlock(&n->list_lock);
1051 return page;
1052}
1053
1054/*
1055 * Get a page from somewhere. Search in increasing NUMA
1056 * distances.
1057 */
1058static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1059{
1060#ifdef CONFIG_NUMA
1061 struct zonelist *zonelist;
1062 struct zone **z;
1063 struct page *page;
1064
1065 /*
1066 * The defrag ratio allows configuring the tradeoffs between
1067 * inter node defragmentation and node local allocations.
1068 * A lower defrag_ratio increases the tendency to do local
1069 * allocations instead of scanning through the partial
1070 * lists on other nodes.
1071 *
1072 * If defrag_ratio is set to 0 then kmalloc() always
1073 * returns node local objects. If it is higher then kmalloc()
1074 * may return off node objects in order to avoid fragmentation.
1075 *
1076 * A higher ratio means slabs may be taken from other nodes
1077 * thus reducing the number of partial slabs on those nodes.
1078 *
1079 * If /sys/slab/xx/defrag_ratio is set to 100 (which makes
1080 * defrag_ratio = 1000) then every (well almost) allocation
1081 * will first attempt to defrag slab caches on other nodes. This
1082 * means scanning over all nodes to look for partial slabs which
1083 * may be a bit expensive to do on every slab allocation.
1084 */
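/*
 * For example (illustrative numbers): with the default defrag_ratio
 * of 100 set in kmem_cache_open(), get_cycles() % 1024 exceeds the
 * ratio most of the time, so the scan of remote nodes below runs for
 * only roughly one in ten of the allocations that reach this point.
 */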
1085 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
1086 return NULL;
1087
1088 zonelist = &NODE_DATA(slab_node(current->mempolicy))
1089 ->node_zonelists[gfp_zone(flags)];
1090 for (z = zonelist->zones; *z; z++) {
1091 struct kmem_cache_node *n;
1092
1093 n = get_node(s, zone_to_nid(*z));
1094
1095 if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
1096 n->nr_partial > MIN_PARTIAL) {
1097 page = get_partial_node(n);
1098 if (page)
1099 return page;
1100 }
1101 }
1102#endif
1103 return NULL;
1104}
1105
1106/*
1107 * Get a partial page, lock it and return it.
1108 */
1109static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1110{
1111 struct page *page;
1112 int searchnode = (node == -1) ? numa_node_id() : node;
1113
1114 page = get_partial_node(get_node(s, searchnode));
1115 if (page || (flags & __GFP_THISNODE))
1116 return page;
1117
1118 return get_any_partial(s, flags);
1119}
1120
1121/*
1122 * Move a page back to the lists.
1123 *
1124 * Must be called with the slab lock held.
1125 *
1126 * On exit the slab lock will have been dropped.
1127 */
1128static void putback_slab(struct kmem_cache *s, struct page *page)
1129{
1130 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1131
1132 if (page->inuse) {
1133
1134 if (page->freelist)
1135 add_partial(n, page);
1136 else if (PageError(page) && (s->flags & SLAB_STORE_USER))
1137 add_full(n, page);
1138 slab_unlock(page);
1139
1140 } else {
1141 if (n->nr_partial < MIN_PARTIAL) {
1142 /*
1143 * Adding an empty page to the partial slabs in order
1144 * to avoid page allocator overhead. This page needs to
1145 * come after all the others that are not fully empty
1146 * in order to make sure that we do maximum
1147 * defragmentation.
1148 */
1149 add_partial_tail(n, page);
1150 slab_unlock(page);
1151 } else {
1152 slab_unlock(page);
1153 discard_slab(s, page);
1154 }
1155 }
1156}
1157
1158/*
1159 * Remove the cpu slab
1160 */
1161static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
1162{
1163 s->cpu_slab[cpu] = NULL;
1164 ClearPageActive(page);
1165
1166 putback_slab(s, page);
1167}
1168
1169static void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
1170{
1171 slab_lock(page);
1172 deactivate_slab(s, page, cpu);
1173}
1174
1175/*
1176 * Flush cpu slab.
1177 * Called from IPI handler with interrupts disabled.
1178 */
1179static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1180{
1181 struct page *page = s->cpu_slab[cpu];
1182
1183 if (likely(page))
1184 flush_slab(s, page, cpu);
1185}
1186
1187static void flush_cpu_slab(void *d)
1188{
1189 struct kmem_cache *s = d;
1190 int cpu = smp_processor_id();
1191
1192 __flush_cpu_slab(s, cpu);
1193}
1194
1195static void flush_all(struct kmem_cache *s)
1196{
1197#ifdef CONFIG_SMP
1198 on_each_cpu(flush_cpu_slab, s, 1, 1);
1199#else
1200 unsigned long flags;
1201
1202 local_irq_save(flags);
1203 flush_cpu_slab(s);
1204 local_irq_restore(flags);
1205#endif
1206}
1207
1208/*
1209 * slab_alloc is optimized to only modify two cachelines on the fast path
1210 * (aside from the stack):
1211 *
1212 * 1. The page struct
1213 * 2. The first cacheline of the object to be allocated.
1214 *
1215 * The only cache lines that are read (apart from code) are the
1216 * per cpu array in the kmem_cache struct.
1217 *
1218 * Fastpath is not possible if we need to get a new slab or have
1219 * debugging enabled (which means all slabs are marked with PageError)
1220 */
1221static void *slab_alloc(struct kmem_cache *s,
1222 gfp_t gfpflags, int node, void *addr)
1223{
1224 struct page *page;
1225 void **object;
1226 unsigned long flags;
1227 int cpu;
1228
1229 local_irq_save(flags);
1230 cpu = smp_processor_id();
1231 page = s->cpu_slab[cpu];
1232 if (!page)
1233 goto new_slab;
1234
1235 slab_lock(page);
1236 if (unlikely(node != -1 && page_to_nid(page) != node))
1237 goto another_slab;
1238redo:
1239 object = page->freelist;
1240 if (unlikely(!object))
1241 goto another_slab;
1242 if (unlikely(PageError(page)))
1243 goto debug;
1244
1245have_object:
1246 page->inuse++;
1247 page->freelist = object[page->offset];
1248 slab_unlock(page);
1249 local_irq_restore(flags);
1250 return object;
1251
1252another_slab:
1253 deactivate_slab(s, page, cpu);
1254
1255new_slab:
1256 page = get_partial(s, gfpflags, node);
1257 if (likely(page)) {
1258have_slab:
1259 s->cpu_slab[cpu] = page;
1260 SetPageActive(page);
1261 goto redo;
1262 }
1263
1264 page = new_slab(s, gfpflags, node);
1265 if (page) {
1266 cpu = smp_processor_id();
1267 if (s->cpu_slab[cpu]) {
1268 /*
1269 * Someone else populated the cpu_slab while we enabled
1270 * interrupts, or we have got scheduled on another cpu.
1271 * The page may not be on the requested node.
1272 */
1273 if (node == -1 ||
1274 page_to_nid(s->cpu_slab[cpu]) == node) {
1275 /*
1276 * Current cpuslab is acceptable and we
1277 * want the current one since its cache hot
1278 */
1279 discard_slab(s, page);
1280 page = s->cpu_slab[cpu];
1281 slab_lock(page);
1282 goto redo;
1283 }
1284 /* Dump the current slab */
1285 flush_slab(s, s->cpu_slab[cpu], cpu);
1286 }
1287 slab_lock(page);
1288 goto have_slab;
1289 }
1290 local_irq_restore(flags);
1291 return NULL;
1292debug:
1293 if (!alloc_object_checks(s, page, object))
1294 goto another_slab;
1295 if (s->flags & SLAB_STORE_USER)
1296 set_track(s, object, TRACK_ALLOC, addr);
1297 goto have_object;
1298}
1299
1300void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1301{
1302 return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
1303}
1304EXPORT_SYMBOL(kmem_cache_alloc);
1305
1306#ifdef CONFIG_NUMA
1307void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1308{
1309 return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
1310}
1311EXPORT_SYMBOL(kmem_cache_alloc_node);
1312#endif
1313
1314/*
1315 * The fastpath only writes the cacheline of the page struct and the first
1316 * cacheline of the object.
1317 *
1318 * No special cachelines need to be read
1319 */
1320static void slab_free(struct kmem_cache *s, struct page *page,
1321 void *x, void *addr)
1322{
1323 void *prior;
1324 void **object = (void *)x;
1325 unsigned long flags;
1326
1327 local_irq_save(flags);
1328 slab_lock(page);
1329
1330 if (unlikely(PageError(page)))
1331 goto debug;
1332checks_ok:
1333 prior = object[page->offset] = page->freelist;
1334 page->freelist = object;
1335 page->inuse--;
1336
1337 if (unlikely(PageActive(page)))
1338 /*
1339 * Cpu slabs are never on partial lists and are
1340 * never freed.
1341 */
1342 goto out_unlock;
1343
1344 if (unlikely(!page->inuse))
1345 goto slab_empty;
1346
1347 /*
1348 * Objects left in the slab. If it
1349 * was not on the partial list before
1350 * then add it.
1351 */
1352 if (unlikely(!prior))
1353 add_partial(get_node(s, page_to_nid(page)), page);
1354
1355out_unlock:
1356 slab_unlock(page);
1357 local_irq_restore(flags);
1358 return;
1359
1360slab_empty:
1361 if (prior)
1362 /*
1363 * Slab on the partial list.
1364 */
1365 remove_partial(s, page);
1366
1367 slab_unlock(page);
1368 discard_slab(s, page);
1369 local_irq_restore(flags);
1370 return;
1371
1372debug:
1373 if (!free_object_checks(s, page, x))
1374 goto out_unlock;
1375 if (!PageActive(page) && !page->freelist)
1376 remove_full(s, page);
1377 if (s->flags & SLAB_STORE_USER)
1378 set_track(s, x, TRACK_FREE, addr);
1379 goto checks_ok;
1380}
1381
1382void kmem_cache_free(struct kmem_cache *s, void *x)
1383{
1384 struct page *page;
1385
1386 page = virt_to_head_page(x);
1387
1388 slab_free(s, page, x, __builtin_return_address(0));
1389}
1390EXPORT_SYMBOL(kmem_cache_free);
1391
1392/* Figure out on which slab object the object resides */
1393static struct page *get_object_page(const void *x)
1394{
1395 struct page *page = virt_to_head_page(x);
1396
1397 if (!PageSlab(page))
1398 return NULL;
1399
1400 return page;
1401}
1402
1403/*
1404 * kmem_cache_open produces objects aligned at "size" and the first object
1405 * is placed at offset 0 in the slab (We have no metainformation on the
1406 * slab, all slabs are in essence "off slab").
1407 *
1408 * In order to get the desired alignment one just needs to align the
1409 * size.
1410 *
1411 * Notice that the allocation order determines the sizes of the per cpu
1412 * caches. Each processor has always one slab available for allocations.
1413 * Increasing the allocation order reduces the number of times that slabs
1414 * must be moved on and off the partial lists and therefore may influence
1415 * locking overhead.
1416 *
1417 * The offset is used to relocate the free list link in each object. It is
1418 * therefore possible to move the free list link behind the object. This
1419 * is necessary for RCU to work properly and also useful for debugging.
1420 */
1421
1422/*
1423 * Minimum / Maximum order of slab pages. This influences locking overhead
1424 * and slab fragmentation. A higher order reduces the number of partial slabs
1425 * and increases the number of allocations possible without having to
1426 * take the list_lock.
1427 */
1428static int slub_min_order;
1429static int slub_max_order = DEFAULT_MAX_ORDER;
1430
1431/*
1432 * Minimum number of objects per slab. This is necessary in order to
1433 * reduce locking overhead. Similar to the queue size in SLAB.
1434 */
1435static int slub_min_objects = DEFAULT_MIN_OBJECTS;
1436
1437/*
1438 * Merge control. If this is set then no merging of slab caches will occur.
1439 */
1440static int slub_nomerge;
1441
1442/*
1443 * Debug settings:
1444 */
1445static int slub_debug;
1446
1447static char *slub_debug_slabs;
1448
1449/*
1450 * Calculate the order of allocation given an slab object size.
1451 *
1452 * The order of allocation has significant impact on other elements
1453 * of the system. Generally order 0 allocations should be preferred
1454 * since they do not cause fragmentation in the page allocator. Larger
1455 * objects may have problems with order 0 because there may be too much
1456 * space left unused in a slab. We go to a higher order if more than 1/8th
1457 * of the slab would be wasted.
1458 *
1459 * In order to reach satisfactory performance we must ensure that
1460 * a minimum number of objects is in one slab. Otherwise we may
1461 * generate too much activity on the partial lists. This is less a
1462 * concern for large slabs though. slub_max_order specifies the order
1463 * where we begin to stop considering the number of objects in a slab.
1464 *
1465 * Higher order allocations also allow the placement of more objects
1466 * in a slab and thereby reduce object handling overhead. If the user
1467 * has requested a higher minimum order then we start with that one
1468 * instead of zero.
1469 */
1470static int calculate_order(int size)
1471{
1472 int order;
1473 int rem;
1474
1475 for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
1476 order < MAX_ORDER; order++) {
1477 unsigned long slab_size = PAGE_SIZE << order;
1478
1479 if (slub_max_order > order &&
1480 slab_size < slub_min_objects * size)
1481 continue;
1482
1483 if (slab_size < size)
1484 continue;
1485
1486 rem = slab_size % size;
1487
1488 if (rem <= (PAGE_SIZE << order) / 8)
1489 break;
1490
1491 }
1492 if (order >= MAX_ORDER)
1493 return -E2BIG;
1494 return order;
1495}
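/*
 * Worked example (a sketch, assuming 4K pages and the small page
 * defaults slub_min_objects == 4, slub_max_order == 1): for
 * size == 700 the loop starts at order 0. An order 0 slab holds
 * 5 objects and wastes 4096 % 700 == 596 bytes, more than 4096 / 8,
 * so it is rejected. An order 1 slab holds 11 objects and wastes
 * only 492 bytes (below 8192 / 8), so calculate_order() returns 1.
 */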
1496
1497/*
1498 * Function to figure out which alignment to use from the
1499 * various ways of specifying it.
1500 */
1501static unsigned long calculate_alignment(unsigned long flags,
1502 unsigned long align, unsigned long size)
1503{
1504 /*
1505 * If the user wants hardware cache aligned objects then
1506 * follow that suggestion if the object is sufficiently
1507 * large.
1508 *
1509 * The hardware cache alignment cannot override the
1510 * specified alignment though. If that is greater
1511 * then use it.
1512 */
1513 if ((flags & (SLAB_MUST_HWCACHE_ALIGN | SLAB_HWCACHE_ALIGN)) &&
1514 size > L1_CACHE_BYTES / 2)
1515 return max_t(unsigned long, align, L1_CACHE_BYTES);
1516
1517 if (align < ARCH_SLAB_MINALIGN)
1518 return ARCH_SLAB_MINALIGN;
1519
1520 return ALIGN(align, sizeof(void *));
1521}
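/*
 * Example (illustrative, assuming L1_CACHE_BYTES == 64 and
 * ARCH_SLAB_MINALIGN == 8): a SLAB_HWCACHE_ALIGN cache with
 * align == 8 and size == 100 is larger than half a cacheline, so
 * calculate_alignment() returns 64. The same request with size == 24
 * falls through and keeps the 8 byte alignment, rounded to a
 * multiple of sizeof(void *).
 */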
1522
1523static void init_kmem_cache_node(struct kmem_cache_node *n)
1524{
1525 n->nr_partial = 0;
1526 atomic_long_set(&n->nr_slabs, 0);
1527 spin_lock_init(&n->list_lock);
1528 INIT_LIST_HEAD(&n->partial);
1529 INIT_LIST_HEAD(&n->full);
1530}
1531
1532#ifdef CONFIG_NUMA
1533/*
1534 * No kmalloc_node yet so do it by hand. We know that this is the first
1535 * slab on the node for this slabcache. There are no concurrent accesses
1536 * possible.
1537 *
1538 * Note that this function only works on the kmalloc_node_cache
1539 * when allocating for the kmalloc_node_cache.
1540 */
1541static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
1542 int node)
1543{
1544 struct page *page;
1545 struct kmem_cache_node *n;
1546
1547 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
1548
1549 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
1550 /* new_slab() disables interrupts */
1551 local_irq_enable();
1552
1553 BUG_ON(!page);
1554 n = page->freelist;
1555 BUG_ON(!n);
1556 page->freelist = get_freepointer(kmalloc_caches, n);
1557 page->inuse++;
1558 kmalloc_caches->node[node] = n;
1559 init_object(kmalloc_caches, n, 1);
1560 init_kmem_cache_node(n);
1561 atomic_long_inc(&n->nr_slabs);
1562 add_partial(n, page);
1563 return n;
1564}
1565
1566static void free_kmem_cache_nodes(struct kmem_cache *s)
1567{
1568 int node;
1569
1570 for_each_online_node(node) {
1571 struct kmem_cache_node *n = s->node[node];
1572 if (n && n != &s->local_node)
1573 kmem_cache_free(kmalloc_caches, n);
1574 s->node[node] = NULL;
1575 }
1576}
1577
1578static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1579{
1580 int node;
1581 int local_node;
1582
1583 if (slab_state >= UP)
1584 local_node = page_to_nid(virt_to_page(s));
1585 else
1586 local_node = 0;
1587
1588 for_each_online_node(node) {
1589 struct kmem_cache_node *n;
1590
1591 if (local_node == node)
1592 n = &s->local_node;
1593 else {
1594 if (slab_state == DOWN) {
1595 n = early_kmem_cache_node_alloc(gfpflags,
1596 node);
1597 continue;
1598 }
1599 n = kmem_cache_alloc_node(kmalloc_caches,
1600 gfpflags, node);
1601
1602 if (!n) {
1603 free_kmem_cache_nodes(s);
1604 return 0;
1605 }
1606
1607 }
1608 s->node[node] = n;
1609 init_kmem_cache_node(n);
1610 }
1611 return 1;
1612}
1613#else
1614static void free_kmem_cache_nodes(struct kmem_cache *s)
1615{
1616}
1617
1618static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1619{
1620 init_kmem_cache_node(&s->local_node);
1621 return 1;
1622}
1623#endif
1624
1625/*
1626 * calculate_sizes() determines the order and the distribution of data within
1627 * a slab object.
1628 */
1629static int calculate_sizes(struct kmem_cache *s)
1630{
1631 unsigned long flags = s->flags;
1632 unsigned long size = s->objsize;
1633 unsigned long align = s->align;
1634
1635 /*
1636 * Determine if we can poison the object itself. If the user of
1637 * the slab may touch the object after free or before allocation
1638 * then we should never poison the object itself.
1639 */
1640 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
1641 !s->ctor && !s->dtor)
1642 s->flags |= __OBJECT_POISON;
1643 else
1644 s->flags &= ~__OBJECT_POISON;
1645
1646 /*
1647 * Round up object size to the next word boundary. We can only
1648 * place the free pointer at word boundaries and this determines
1649 * the possible location of the free pointer.
1650 */
1651 size = ALIGN(size, sizeof(void *));
1652
1653 /*
1654 * If we are redzoning then check if there is some space between the
1655 * end of the object and the free pointer. If not then add an
1656 * additional word, so that we can establish a redzone between
1657 * the object and the freepointer to be able to check for overwrites.
1658 */
1659 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
1660 size += sizeof(void *);
1661
1662 /*
1663 * With that we have determined how much of the slab is in actual
1664 * use by the object. This is the potential offset to the free
1665 * pointer.
1666 */
1667 s->inuse = size;
1668
1669 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
1670 s->ctor || s->dtor)) {
1671 /*
1672 * Relocate free pointer after the object if it is not
1673 * permitted to overwrite the first word of the object on
1674 * kmem_cache_free.
1675 *
1676 * This is the case if we do RCU, have a constructor or
1677 * destructor or are poisoning the objects.
1678 */
1679 s->offset = size;
1680 size += sizeof(void *);
1681 }
1682
1683 if (flags & SLAB_STORE_USER)
1684 /*
1685 * Need to store information about allocs and frees after
1686 * the object.
1687 */
1688 size += 2 * sizeof(struct track);
1689
1690 if (flags & DEBUG_DEFAULT_FLAGS)
1691 /*
1692 * Add some empty padding so that we can catch
1693 * overwrites from earlier objects rather than let
1694 * tracking information or the free pointer be
1695 * corrupted if a user writes before the start
1696 * of the object.
1697 */
1698 size += sizeof(void *);
1699 /*
1700 * Determine the alignment based on various parameters that the
1701 * user specified (this is unnecessarily complex due to the attempt
1702 * to be compatible with SLAB. Should be cleaned up some day).
1703 */
1704 align = calculate_alignment(flags, align, s->objsize);
1705
1706 /*
1707 * SLUB stores one object immediately after another beginning from
1708 * offset 0. In order to align the objects we have to simply size
1709 * each object to conform to the alignment.
1710 */
1711 size = ALIGN(size, align);
1712 s->size = size;
1713
1714 s->order = calculate_order(size);
1715 if (s->order < 0)
1716 return 0;
1717
1718 /*
1719 * Determine the number of objects per slab
1720 */
1721 s->objects = (PAGE_SIZE << s->order) / size;
1722
1723 /*
1724 * Verify that the number of objects is within permitted limits.
1725 * The page->inuse field is only 16 bit wide! So we cannot have
1726 * more than 64k objects per slab.
1727 */
1728 if (!s->objects || s->objects > 65535)
1729 return 0;
1730 return 1;
1731
1732}
1733
1734static int __init finish_bootstrap(void)
1735{
1736 struct list_head *h;
1737 int err;
1738
1739 slab_state = SYSFS;
1740
1741 list_for_each(h, &slab_caches) {
1742 struct kmem_cache *s =
1743 container_of(h, struct kmem_cache, list);
1744
1745 err = sysfs_slab_add(s);
1746 BUG_ON(err);
1747 }
1748 return 0;
1749}
1750
1751static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
1752 const char *name, size_t size,
1753 size_t align, unsigned long flags,
1754 void (*ctor)(void *, struct kmem_cache *, unsigned long),
1755 void (*dtor)(void *, struct kmem_cache *, unsigned long))
1756{
1757 memset(s, 0, kmem_size);
1758 s->name = name;
1759 s->ctor = ctor;
1760 s->dtor = dtor;
1761 s->objsize = size;
1762 s->flags = flags;
1763 s->align = align;
1764
1765 BUG_ON(flags & SLUB_UNIMPLEMENTED);
1766
1767 /*
1768 * The page->offset field is only 16 bit wide. This is an offset
1769 * in units of words from the beginning of an object. If the slab
1770 * size is bigger then we cannot move the free pointer behind the
1771 * object anymore.
1772 *
1773 * On 32 bit platforms the limit is 256k. On 64bit platforms
1774 * the limit is 512k.
1775 *
1776 * Debugging or ctor/dtors may create a need to move the free
1777 * pointer. Fail if this happens.
1778 */
1779 if (s->size >= 65535 * sizeof(void *)) {
1780 BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON |
1781 SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
1782 BUG_ON(ctor || dtor);
1783 }
1784 else
1785 /*
1786 * Enable debugging if selected on the kernel commandline.
1787 */
1788 if (slub_debug && (!slub_debug_slabs ||
1789 strncmp(slub_debug_slabs, name,
1790 strlen(slub_debug_slabs)) == 0))
1791 s->flags |= slub_debug;
1792
1793 if (!calculate_sizes(s))
1794 goto error;
1795
1796 s->refcount = 1;
1797#ifdef CONFIG_NUMA
1798 s->defrag_ratio = 100;
1799#endif
1800
1801 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
1802 return 1;
1803error:
1804 if (flags & SLAB_PANIC)
1805 panic("Cannot create slab %s size=%lu realsize=%u "
1806 "order=%u offset=%u flags=%lx\n",
1807 s->name, (unsigned long)size, s->size, s->order,
1808 s->offset, flags);
1809 return 0;
1810}
1811EXPORT_SYMBOL(kmem_cache_open);
1812
1813/*
1814 * Check if a given pointer is valid
1815 */
1816int kmem_ptr_validate(struct kmem_cache *s, const void *object)
1817{
1818 struct page * page;
1819 void *addr;
1820
1821 page = get_object_page(object);
1822
1823 if (!page || s != page->slab)
1824 /* No slab or wrong slab */
1825 return 0;
1826
1827 addr = page_address(page);
1828 if (object < addr || object >= addr + s->objects * s->size)
1829 /* Out of bounds */
1830 return 0;
1831
1832 if ((object - addr) % s->size)
1833 /* Improperly aligned */
1834 return 0;
1835
1836 /*
1837 * We could also check if the object is on the slabs freelist.
1838 * But this would be too expensive and it seems that the main
1839 * purpose of kmem_ptr_validate is to check if the object belongs
1840 * to a certain slab.
1841 */
1842 return 1;
1843}
1844EXPORT_SYMBOL(kmem_ptr_validate);
1845
1846/*
1847 * Determine the size of a slab object
1848 */
1849unsigned int kmem_cache_size(struct kmem_cache *s)
1850{
1851 return s->objsize;
1852}
1853EXPORT_SYMBOL(kmem_cache_size);
1854
1855const char *kmem_cache_name(struct kmem_cache *s)
1856{
1857 return s->name;
1858}
1859EXPORT_SYMBOL(kmem_cache_name);
1860
1861/*
1862 * Attempt to free all slabs on a node
1863 */
1864static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
1865 struct list_head *list)
1866{
1867 int slabs_inuse = 0;
1868 unsigned long flags;
1869 struct page *page, *h;
1870
1871 spin_lock_irqsave(&n->list_lock, flags);
1872 list_for_each_entry_safe(page, h, list, lru)
1873 if (!page->inuse) {
1874 list_del(&page->lru);
1875 discard_slab(s, page);
1876 } else
1877 slabs_inuse++;
1878 spin_unlock_irqrestore(&n->list_lock, flags);
1879 return slabs_inuse;
1880}
1881
1882/*
1883 * Release all resources used by slab cache
1884 */
1885static int kmem_cache_close(struct kmem_cache *s)
1886{
1887 int node;
1888
1889 flush_all(s);
1890
1891 /* Attempt to free all objects */
1892 for_each_online_node(node) {
1893 struct kmem_cache_node *n = get_node(s, node);
1894
1895 n->nr_partial -= free_list(s, n, &n->partial);
1896 if (atomic_long_read(&n->nr_slabs))
1897 return 1;
1898 }
1899 free_kmem_cache_nodes(s);
1900 return 0;
1901}
1902
1903/*
1904 * Close a cache and release the kmem_cache structure
1905 * (must be used for caches created using kmem_cache_create)
1906 */
1907void kmem_cache_destroy(struct kmem_cache *s)
1908{
1909 down_write(&slub_lock);
1910 s->refcount--;
1911 if (!s->refcount) {
1912 list_del(&s->list);
1913 if (kmem_cache_close(s))
1914 WARN_ON(1);
1915 sysfs_slab_remove(s);
1916 kfree(s);
1917 }
1918 up_write(&slub_lock);
1919}
1920EXPORT_SYMBOL(kmem_cache_destroy);
1921
1922/********************************************************************
1923 * Kmalloc subsystem
1924 *******************************************************************/
1925
1926struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
1927EXPORT_SYMBOL(kmalloc_caches);
1928
1929#ifdef CONFIG_ZONE_DMA
1930static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
1931#endif
1932
1933static int __init setup_slub_min_order(char *str)
1934{
1935 get_option (&str, &slub_min_order);
1936
1937 return 1;
1938}
1939
1940__setup("slub_min_order=", setup_slub_min_order);
1941
1942static int __init setup_slub_max_order(char *str)
1943{
1944 get_option (&str, &slub_max_order);
1945
1946 return 1;
1947}
1948
1949__setup("slub_max_order=", setup_slub_max_order);
1950
1951static int __init setup_slub_min_objects(char *str)
1952{
1953 get_option (&str, &slub_min_objects);
1954
1955 return 1;
1956}
1957
1958__setup("slub_min_objects=", setup_slub_min_objects);
1959
1960static int __init setup_slub_nomerge(char *str)
1961{
1962 slub_nomerge = 1;
1963 return 1;
1964}
1965
1966__setup("slub_nomerge", setup_slub_nomerge);
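/*
 * Editor's note: an illustrative sketch (values are examples only) of how
 * the boot parameters above might be combined on the kernel command line:
 *
 *	slub_min_order=1 slub_max_order=4 slub_min_objects=16 slub_nomerge
 *
 * slub_min_order and slub_max_order bound the page order used for slabs,
 * slub_min_objects requests a minimum number of objects per slab and
 * slub_nomerge disables merging of compatible slab caches.
 */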
1967
1968static int __init setup_slub_debug(char *str)
1969{
1970 if (!str || *str != '=')
1971 slub_debug = DEBUG_DEFAULT_FLAGS;
1972 else {
1973 str++;
1974 if (*str == 0 || *str == ',')
1975 slub_debug = DEBUG_DEFAULT_FLAGS;
1976 else
1977 for ( ; *str && *str != ','; str++)
1978 switch (*str) {
1979 case 'f' : case 'F' :
1980 slub_debug |= SLAB_DEBUG_FREE;
1981 break;
1982 case 'z' : case 'Z' :
1983 slub_debug |= SLAB_RED_ZONE;
1984 break;
1985 case 'p' : case 'P' :
1986 slub_debug |= SLAB_POISON;
1987 break;
1988 case 'u' : case 'U' :
1989 slub_debug |= SLAB_STORE_USER;
1990 break;
1991 case 't' : case 'T' :
1992 slub_debug |= SLAB_TRACE;
1993 break;
1994 default:
1995 printk(KERN_ERR "slub_debug option '%c' "
1996 "unknown. skipped\n",*str);
1997 }
1998 }
1999
2000 if (*str == ',')
2001 slub_debug_slabs = str + 1;
2002 return 1;
2003}
2004
2005__setup("slub_debug", setup_slub_debug);
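/*
 * Editor's note: example command lines derived from the parser above,
 * shown for illustration only:
 *
 *	slub_debug			all DEBUG_DEFAULT_FLAGS for all slabs
 *	slub_debug=FZ			sanity checks plus red zoning everywhere
 *	slub_debug=,dentry		default debug flags, restricted to caches
 *					whose names match "dentry"
 *	slub_debug=U,kmalloc-64		track alloc/free callers for kmalloc-64
 *
 * Each letter selects one flag (F, Z, P, U or T); anything after a comma
 * is kept in slub_debug_slabs and compared against cache names when the
 * caches are created.
 */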
2006
2007static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2008 const char *name, int size, gfp_t gfp_flags)
2009{
2010 unsigned int flags = 0;
2011
2012 if (gfp_flags & SLUB_DMA)
2013 flags = SLAB_CACHE_DMA;
2014
2015 down_write(&slub_lock);
2016 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2017 flags, NULL, NULL))
2018 goto panic;
2019
2020 list_add(&s->list, &slab_caches);
2021 up_write(&slub_lock);
2022 if (sysfs_slab_add(s))
2023 goto panic;
2024 return s;
2025
2026panic:
2027 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2028}
2029
2030static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2031{
2032 int index = kmalloc_index(size);
2033
2034 if (!index)
2035 return NULL;
2036
2037 /* Allocation too large? */
2038 BUG_ON(index < 0);
2039
2040#ifdef CONFIG_ZONE_DMA
2041 if ((flags & SLUB_DMA)) {
2042 struct kmem_cache *s;
2043 struct kmem_cache *x;
2044 char *text;
2045 size_t realsize;
2046
2047 s = kmalloc_caches_dma[index];
2048 if (s)
2049 return s;
2050
2051 /* Dynamically create dma cache */
2052 x = kmalloc(kmem_size, flags & ~SLUB_DMA);
2053 if (!x)
2054 panic("Unable to allocate memory for dma cache\n");
2055
2056 /* Indices 1 and 2 are the odd sized 96 and 192 byte caches */
2057 if (index == 1)
2058 realsize = 96;
2059 else if (index == 2)
2060 realsize = 192;
2061 else
2062 realsize = 1 << index;
2063
2064
2065 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2066 (unsigned int)realsize);
2067 s = create_kmalloc_cache(x, text, realsize, flags);
2068 kmalloc_caches_dma[index] = s;
2069 return s;
2070 }
2071#endif
2072 return &kmalloc_caches[index];
2073}
2074
2075void *__kmalloc(size_t size, gfp_t flags)
2076{
2077 struct kmem_cache *s = get_slab(size, flags);
2078
2079 if (s)
2080 return slab_alloc(s, flags, -1, __builtin_return_address(0));
2081 return NULL;
2082}
2083EXPORT_SYMBOL(__kmalloc);
2084
2085#ifdef CONFIG_NUMA
2086void *__kmalloc_node(size_t size, gfp_t flags, int node)
2087{
2088 struct kmem_cache *s = get_slab(size, flags);
2089
2090 if (s)
2091 return slab_alloc(s, flags, node, __builtin_return_address(0));
2092 return NULL;
2093}
2094EXPORT_SYMBOL(__kmalloc_node);
2095#endif
2096
2097size_t ksize(const void *object)
2098{
2099 struct page *page = get_object_page(object);
2100 struct kmem_cache *s;
2101
2102 BUG_ON(!page);
2103 s = page->slab;
2104 BUG_ON(!s);
2105
2106 /*
2107 * Debugging requires use of the padding between object
2108 * and whatever may come after it.
2109 */
2110 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2111 return s->objsize;
2112
2113 /*
2114 * If we have the need to store the freelist pointer
2115 * back there or track user information then we can
2116 * only use the space before that information.
2117 */
2118 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2119 return s->inuse;
2120
2121 /*
2122 * Else we can use all the padding etc for the allocation
2123 */
2124 return s->size;
2125}
2126EXPORT_SYMBOL(ksize);
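/*
 * Editor's note: a small illustrative sketch, not part of SLUB, of why
 * ksize() may report more than the requested size:
 *
 *	char *buf = kmalloc(100, GFP_KERNEL);
 *
 *	if (buf)
 *		printk(KERN_DEBUG "usable size: %zu\n", ksize(buf));
 *
 * The request would typically be served from the kmalloc-128 cache, so
 * without debugging active ksize() reports the full object size rather
 * than the 100 bytes that were asked for.
 */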
2127
2128void kfree(const void *x)
2129{
2130 struct kmem_cache *s;
2131 struct page *page;
2132
2133 if (!x)
2134 return;
2135
2136 page = virt_to_head_page(x);
2137 s = page->slab;
2138
2139 slab_free(s, page, (void *)x, __builtin_return_address(0));
2140}
2141EXPORT_SYMBOL(kfree);
2142
2143/*
2144 * kmem_cache_shrink removes empty slabs from the partial lists
2145 * and then sorts the partially allocated slabs by the number
2146 * of items in use. The slabs with the most items in use
2147 * come first. New allocations will remove these from the
2148 * partial list because they are full. The slabs with the
2149 * fewest items in use are placed last. If the remaining objects
2150 * in those slabs are freed, the pages can be returned to the page allocator.
2151 */
2152int kmem_cache_shrink(struct kmem_cache *s)
2153{
2154 int node;
2155 int i;
2156 struct kmem_cache_node *n;
2157 struct page *page;
2158 struct page *t;
2159 struct list_head *slabs_by_inuse =
2160 kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
2161 unsigned long flags;
2162
2163 if (!slabs_by_inuse)
2164 return -ENOMEM;
2165
2166 flush_all(s);
2167 for_each_online_node(node) {
2168 n = get_node(s, node);
2169
2170 if (!n->nr_partial)
2171 continue;
2172
2173 for (i = 0; i < s->objects; i++)
2174 INIT_LIST_HEAD(slabs_by_inuse + i);
2175
2176 spin_lock_irqsave(&n->list_lock, flags);
2177
2178 /*
2179 * Build lists indexed by the items in use in
2180 * each slab or free slabs if empty.
2181 *
2182 * Note that concurrent frees may occur while
2183 * we hold the list_lock. page->inuse here is
2184 * the upper limit.
2185 */
2186 list_for_each_entry_safe(page, t, &n->partial, lru) {
2187 if (!page->inuse && slab_trylock(page)) {
2188 /*
2189 * Must hold slab lock here because slab_free
2190 * may have freed the last object and be
2191 * waiting to release the slab.
2192 */
2193 list_del(&page->lru);
2194 n->nr_partial--;
2195 slab_unlock(page);
2196 discard_slab(s, page);
2197 } else {
2198 if (n->nr_partial > MAX_PARTIAL)
2199 list_move(&page->lru,
2200 slabs_by_inuse + page->inuse);
2201 }
2202 }
2203
2204 if (n->nr_partial <= MAX_PARTIAL)
2205 goto out;
2206
2207 /*
2208 * Rebuild the partial list with the slabs filled up
2209 * most first and the least used slabs at the end.
2210 */
2211 for (i = s->objects - 1; i >= 0; i--)
2212 list_splice(slabs_by_inuse + i, n->partial.prev);
2213
2214 out:
2215 spin_unlock_irqrestore(&n->list_lock, flags);
2216 }
2217
2218 kfree(slabs_by_inuse);
2219 return 0;
2220}
2221EXPORT_SYMBOL(kmem_cache_shrink);
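/*
 * Editor's note: a hedged usage sketch, not part of SLUB; the cache name
 * is hypothetical. A subsystem that has just released many objects can
 * ask the allocator to give empty slabs back to the page allocator:
 *
 *	if (kmem_cache_shrink(my_item_cache))
 *		printk(KERN_WARNING "my_item: cache shrink failed\n");
 *
 * The return value is 0 on success or -ENOMEM if the temporary list
 * array could not be allocated.
 */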
2222
2223/**
2224 * krealloc - reallocate memory. The contents will remain unchanged.
2225 *
2226 * @p: object to reallocate memory for.
2227 * @new_size: how many bytes of memory are required.
2228 * @flags: the type of memory to allocate.
2229 *
2230 * The contents of the object pointed to are preserved up to the
2231 * lesser of the new and old sizes. If @p is %NULL, krealloc()
2232 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
2233 * %NULL pointer, the object pointed to is freed.
2234 */
2235void *krealloc(const void *p, size_t new_size, gfp_t flags)
2236{
2237 struct kmem_cache *new_cache;
2238 void *ret;
2239 struct page *page;
2240
2241 if (unlikely(!p))
2242 return kmalloc(new_size, flags);
2243
2244 if (unlikely(!new_size)) {
2245 kfree(p);
2246 return NULL;
2247 }
2248
2249 page = virt_to_head_page(p);
2250
2251 new_cache = get_slab(new_size, flags);
2252
2253 /*
2254 * If new size fits in the current cache, bail out.
2255 */
2256 if (likely(page->slab == new_cache))
2257 return (void *)p;
2258
2259 ret = kmalloc(new_size, flags);
2260 if (ret) {
2261 memcpy(ret, p, min(new_size, ksize(p)));
2262 kfree(p);
2263 }
2264 return ret;
2265}
2266EXPORT_SYMBOL(krealloc);
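/*
 * Editor's note: a minimal usage sketch with hypothetical names, not part
 * of SLUB. On failure krealloc() returns NULL and leaves the old buffer
 * untouched, so the result must not overwrite the only copy of the
 * original pointer:
 *
 *	int *new_tbl = krealloc(tbl, new_count * sizeof(*tbl), GFP_KERNEL);
 *
 *	if (!new_tbl)
 *		return -ENOMEM;
 *	tbl = new_tbl;
 *
 * On the error path the original tbl remains valid and is still owned by
 * the caller.
 */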
2267
2268/********************************************************************
2269 * Basic setup of slabs
2270 *******************************************************************/
2271
2272void __init kmem_cache_init(void)
2273{
2274 int i;
2275
2276#ifdef CONFIG_NUMA
2277 /*
2278 * Must first have the slab cache available for the allocations of the
2279 * struct kmem_cache_node's. There is special bootstrap code in
2280 * kmem_cache_open for slab_state == DOWN.
2281 */
2282 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2283 sizeof(struct kmem_cache_node), GFP_KERNEL);
2284#endif
2285
2286 /* Able to allocate the per node structures */
2287 slab_state = PARTIAL;
2288
2289 /* Caches that are not of the two-to-the-power-of size */
2290 create_kmalloc_cache(&kmalloc_caches[1],
2291 "kmalloc-96", 96, GFP_KERNEL);
2292 create_kmalloc_cache(&kmalloc_caches[2],
2293 "kmalloc-192", 192, GFP_KERNEL);
2294
2295 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2296 create_kmalloc_cache(&kmalloc_caches[i],
2297 "kmalloc", 1 << i, GFP_KERNEL);
2298
2299 slab_state = UP;
2300
2301 /* Provide the correct kmalloc names now that the caches are up */
2302 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
2303 kmalloc_caches[i].name =
2304 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2305
2306#ifdef CONFIG_SMP
2307 register_cpu_notifier(&slab_notifier);
2308#endif
2309
2310 if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
2311 kmem_size = offsetof(struct kmem_cache, cpu_slab)
2312 + nr_cpu_ids * sizeof(struct page *);
2313
2314 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2315 " Processors=%d, Nodes=%d\n",
2316 KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES,
2317 slub_min_order, slub_max_order, slub_min_objects,
2318 nr_cpu_ids, nr_node_ids);
2319}
2320
2321/*
2322 * Find a mergeable slab cache
2323 */
2324static int slab_unmergeable(struct kmem_cache *s)
2325{
2326 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
2327 return 1;
2328
2329 if (s->ctor || s->dtor)
2330 return 1;
2331
2332 return 0;
2333}
2334
2335static struct kmem_cache *find_mergeable(size_t size,
2336 size_t align, unsigned long flags,
2337 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2338 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2339{
2340 struct list_head *h;
2341
2342 if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
2343 return NULL;
2344
2345 if (ctor || dtor)
2346 return NULL;
2347
2348 size = ALIGN(size, sizeof(void *));
2349 align = calculate_alignment(flags, align, size);
2350 size = ALIGN(size, align);
2351
2352 list_for_each(h, &slab_caches) {
2353 struct kmem_cache *s =
2354 container_of(h, struct kmem_cache, list);
2355
2356 if (slab_unmergeable(s))
2357 continue;
2358
2359 if (size > s->size)
2360 continue;
2361
2362 if (((flags | slub_debug) & SLUB_MERGE_SAME) !=
2363 (s->flags & SLUB_MERGE_SAME))
2364 continue;
2365 /*
2366 * Check if alignment is compatible.
2367 * Courtesy of Adrian Drzewiecki
2368 */
2369 if ((s->size & ~(align - 1)) != s->size)
2370 continue;
2371
2372 if (s->size - size >= sizeof(void *))
2373 continue;
2374
2375 return s;
2376 }
2377 return NULL;
2378}
2379
2380struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2381 size_t align, unsigned long flags,
2382 void (*ctor)(void *, struct kmem_cache *, unsigned long),
2383 void (*dtor)(void *, struct kmem_cache *, unsigned long))
2384{
2385 struct kmem_cache *s;
2386
2387 down_write(&slub_lock);
2388 s = find_mergeable(size, align, flags, ctor, dtor);
2389 if (s) {
2390 s->refcount++;
2391 /*
2392 * Adjust the object sizes so that we clear
2393 * the complete object on kzalloc.
2394 */
2395 s->objsize = max(s->objsize, (int)size);
2396 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2397 if (sysfs_slab_alias(s, name))
2398 goto err;
2399 } else {
2400 s = kmalloc(kmem_size, GFP_KERNEL);
2401 if (s && kmem_cache_open(s, GFP_KERNEL, name,
2402 size, align, flags, ctor, dtor)) {
2403 if (sysfs_slab_add(s)) {
2404 kfree(s);
2405 goto err;
2406 }
2407 list_add(&s->list, &slab_caches);
2408 } else
2409 kfree(s);
2410 }
2411 up_write(&slub_lock);
2412 return s;
2413
2414err:
2415 up_write(&slub_lock);
2416 if (flags & SLAB_PANIC)
2417 panic("Cannot create slabcache %s\n", name);
2418 else
2419 s = NULL;
2420 return s;
2421}
2422EXPORT_SYMBOL(kmem_cache_create);
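/*
 * Editor's note: an illustrative sketch of cache creation using the
 * signature above; the structure and names are hypothetical and not part
 * of SLUB.
 *
 *	struct my_item {
 *		struct list_head list;
 *		int value;
 *	};
 *
 *	static struct kmem_cache *my_item_cache;
 *
 *	my_item_cache = kmem_cache_create("my_item", sizeof(struct my_item),
 *					0, SLAB_PANIC, NULL, NULL);
 *
 * Because no constructor, destructor or debug flags are given, such a
 * cache is a candidate for merging with an existing cache of compatible
 * size and flags (see find_mergeable() above), in which case only a sysfs
 * alias is created.
 */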
2423
2424void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
2425{
2426 void *x;
2427
2428 x = slab_alloc(s, flags, -1, __builtin_return_address(0));
2429 if (x)
2430 memset(x, 0, s->objsize);
2431 return x;
2432}
2433EXPORT_SYMBOL(kmem_cache_zalloc);
2434
2435#ifdef CONFIG_SMP
2436static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
2437{
2438 struct list_head *h;
2439
2440 down_read(&slub_lock);
2441 list_for_each(h, &slab_caches) {
2442 struct kmem_cache *s =
2443 container_of(h, struct kmem_cache, list);
2444
2445 func(s, cpu);
2446 }
2447 up_read(&slub_lock);
2448}
2449
2450/*
2451 * Use the cpu notifier to ensure that the cpu slabs are flushed
2452 * when necessary.
2453 */
2454static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2455 unsigned long action, void *hcpu)
2456{
2457 long cpu = (long)hcpu;
2458
2459 switch (action) {
2460 case CPU_UP_CANCELED:
2461 case CPU_DEAD:
2462 for_all_slabs(__flush_cpu_slab, cpu);
2463 break;
2464 default:
2465 break;
2466 }
2467 return NOTIFY_OK;
2468}
2469
2470static struct notifier_block __cpuinitdata slab_notifier =
2471 { &slab_cpuup_callback, NULL, 0 };
2472
2473#endif
2474
2475#ifdef CONFIG_NUMA
2476
2477/*****************************************************************
2478 * Generic reaper used to support the page allocator
2479 * (the cpu slabs are reaped by a per slab workqueue).
2480 *
2481 * Maybe move this to the page allocator?
2482 ****************************************************************/
2483
2484static DEFINE_PER_CPU(unsigned long, reap_node);
2485
2486static void init_reap_node(int cpu)
2487{
2488 int node;
2489
2490 node = next_node(cpu_to_node(cpu), node_online_map);
2491 if (node == MAX_NUMNODES)
2492 node = first_node(node_online_map);
2493
2494 __get_cpu_var(reap_node) = node;
2495}
2496
2497static void next_reap_node(void)
2498{
2499 int node = __get_cpu_var(reap_node);
2500
2501 /*
2502 * Also drain per cpu pages on remote zones
2503 */
2504 if (node != numa_node_id())
2505 drain_node_pages(node);
2506
2507 node = next_node(node, node_online_map);
2508 if (unlikely(node >= MAX_NUMNODES))
2509 node = first_node(node_online_map);
2510 __get_cpu_var(reap_node) = node;
2511}
2512#else
2513#define init_reap_node(cpu) do { } while (0)
2514#define next_reap_node(void) do { } while (0)
2515#endif
2516
2517#define REAPTIMEOUT_CPUC (2*HZ)
2518
2519#ifdef CONFIG_SMP
2520static DEFINE_PER_CPU(struct delayed_work, reap_work);
2521
2522static void cache_reap(struct work_struct *unused)
2523{
2524 next_reap_node();
2525 refresh_cpu_vm_stats(smp_processor_id());
2526 schedule_delayed_work(&__get_cpu_var(reap_work),
2527 REAPTIMEOUT_CPUC);
2528}
2529
2530static void __devinit start_cpu_timer(int cpu)
2531{
2532 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2533
2534 /*
2535 * When this gets called from do_initcalls via cpucache_init(),
2536 * init_workqueues() has already run, so keventd will be setup
2537 * at that time.
2538 */
2539 if (keventd_up() && reap_work->work.func == NULL) {
2540 init_reap_node(cpu);
2541 INIT_DELAYED_WORK(reap_work, cache_reap);
2542 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2543 }
2544}
2545
2546static int __init cpucache_init(void)
2547{
2548 int cpu;
2549
2550 /*
2551 * Register the timers that drain pcp pages and update vm statistics
2552 */
2553 for_each_online_cpu(cpu)
2554 start_cpu_timer(cpu);
2555 return 0;
2556}
2557__initcall(cpucache_init);
2558#endif
2559
2560#ifdef SLUB_RESILIENCY_TEST
2561static unsigned long validate_slab_cache(struct kmem_cache *s);
2562
2563static void resiliency_test(void)
2564{
2565 u8 *p;
2566
2567 printk(KERN_ERR "SLUB resiliency testing\n");
2568 printk(KERN_ERR "-----------------------\n");
2569 printk(KERN_ERR "A. Corruption after allocation\n");
2570
2571 p = kzalloc(16, GFP_KERNEL);
2572 p[16] = 0x12;
2573 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
2574 " 0x12->0x%p\n\n", p + 16);
2575
2576 validate_slab_cache(kmalloc_caches + 4);
2577
2578 /* Hmmm... The next two are dangerous */
2579 p = kzalloc(32, GFP_KERNEL);
2580 p[32 + sizeof(void *)] = 0x34;
2581 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
2582 " 0x34 -> 0x%p\n", p);
2583 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2584
2585 validate_slab_cache(kmalloc_caches + 5);
2586 p = kzalloc(64, GFP_KERNEL);
2587 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
2588 *p = 0x56;
2589 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
2590 p);
2591 printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
2592 validate_slab_cache(kmalloc_caches + 6);
2593
2594 printk(KERN_ERR "\nB. Corruption after free\n");
2595 p = kzalloc(128, GFP_KERNEL);
2596 kfree(p);
2597 *p = 0x78;
2598 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
2599 validate_slab_cache(kmalloc_caches + 7);
2600
2601 p = kzalloc(256, GFP_KERNEL);
2602 kfree(p);
2603 p[50] = 0x9a;
2604 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
2605 validate_slab_cache(kmalloc_caches + 8);
2606
2607 p = kzalloc(512, GFP_KERNEL);
2608 kfree(p);
2609 p[512] = 0xab;
2610 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
2611 validate_slab_cache(kmalloc_caches + 9);
2612}
2613#else
2614static void resiliency_test(void) {};
2615#endif
2616
2617/*
2618 * These are not as efficient as kmalloc for the non debug case.
2619 * We do not have the page struct available so we have to touch one
2620 * cacheline in struct kmem_cache to check slab flags.
2621 */
2622void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2623{
2624 struct kmem_cache *s = get_slab(size, gfpflags);
2625
2626 if (!s)
2627 return NULL;
2628
2629 return slab_alloc(s, gfpflags, -1, caller);
2630}
2631
2632void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2633 int node, void *caller)
2634{
2635 struct kmem_cache *s = get_slab(size, gfpflags);
2636
2637 if (!s)
2638 return NULL;
2639
2640 return slab_alloc(s, gfpflags, node, caller);
2641}
2642
2643#ifdef CONFIG_SYSFS
2644
2645static int validate_slab(struct kmem_cache *s, struct page *page)
2646{
2647 void *p;
2648 void *addr = page_address(page);
2649 unsigned long map[BITS_TO_LONGS(s->objects)];
2650
2651 if (!check_slab(s, page) ||
2652 !on_freelist(s, page, NULL))
2653 return 0;
2654
2655 /* Now we know that a valid freelist exists */
2656 bitmap_zero(map, s->objects);
2657
2658 for(p = page->freelist; p; p = get_freepointer(s, p)) {
2659 set_bit((p - addr) / s->size, map);
2660 if (!check_object(s, page, p, 0))
2661 return 0;
2662 }
2663
2664 for(p = addr; p < addr + s->objects * s->size; p += s->size)
2665 if (!test_bit((p - addr) / s->size, map))
2666 if (!check_object(s, page, p, 1))
2667 return 0;
2668 return 1;
2669}
2670
2671static void validate_slab_slab(struct kmem_cache *s, struct page *page)
2672{
2673 if (slab_trylock(page)) {
2674 validate_slab(s, page);
2675 slab_unlock(page);
2676 } else
2677 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
2678 s->name, page);
2679
2680 if (s->flags & DEBUG_DEFAULT_FLAGS) {
2681 if (!PageError(page))
2682 printk(KERN_ERR "SLUB %s: PageError not set "
2683 "on slab 0x%p\n", s->name, page);
2684 } else {
2685 if (PageError(page))
2686 printk(KERN_ERR "SLUB %s: PageError set on "
2687 "slab 0x%p\n", s->name, page);
2688 }
2689}
2690
2691static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n)
2692{
2693 unsigned long count = 0;
2694 struct page *page;
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&n->list_lock, flags);
2698
2699 list_for_each_entry(page, &n->partial, lru) {
2700 validate_slab_slab(s, page);
2701 count++;
2702 }
2703 if (count != n->nr_partial)
2704 printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
2705 "counter=%ld\n", s->name, count, n->nr_partial);
2706
2707 if (!(s->flags & SLAB_STORE_USER))
2708 goto out;
2709
2710 list_for_each_entry(page, &n->full, lru) {
2711 validate_slab_slab(s, page);
2712 count++;
2713 }
2714 if (count != atomic_long_read(&n->nr_slabs))
2715 printk(KERN_ERR "SLUB: %s %ld slabs counted but "
2716 "counter=%ld\n", s->name, count,
2717 atomic_long_read(&n->nr_slabs));
2718
2719out:
2720 spin_unlock_irqrestore(&n->list_lock, flags);
2721 return count;
2722}
2723
2724static unsigned long validate_slab_cache(struct kmem_cache *s)
2725{
2726 int node;
2727 unsigned long count = 0;
2728
2729 flush_all(s);
2730 for_each_online_node(node) {
2731 struct kmem_cache_node *n = get_node(s, node);
2732
2733 count += validate_slab_node(s, n);
2734 }
2735 return count;
2736}
2737
2738/*
2739 * Generate lists of locations where slabcache objects are allocated
2740 * and freed.
2741 */
2742
2743struct location {
2744 unsigned long count;
2745 void *addr;
2746};
2747
2748struct loc_track {
2749 unsigned long max;
2750 unsigned long count;
2751 struct location *loc;
2752};
2753
2754static void free_loc_track(struct loc_track *t)
2755{
2756 if (t->max)
2757 free_pages((unsigned long)t->loc,
2758 get_order(sizeof(struct location) * t->max));
2759}
2760
2761static int alloc_loc_track(struct loc_track *t, unsigned long max)
2762{
2763 struct location *l;
2764 int order;
2765
2766 if (!max)
2767 max = PAGE_SIZE / sizeof(struct location);
2768
2769 order = get_order(sizeof(struct location) * max);
2770
2771 l = (void *)__get_free_pages(GFP_KERNEL, order);
2772
2773 if (!l)
2774 return 0;
2775
2776 if (t->count) {
2777 memcpy(l, t->loc, sizeof(struct location) * t->count);
2778 free_loc_track(t);
2779 }
2780 t->max = max;
2781 t->loc = l;
2782 return 1;
2783}
2784
2785static int add_location(struct loc_track *t, struct kmem_cache *s,
2786 void *addr)
2787{
2788 long start, end, pos;
2789 struct location *l;
2790 void *caddr;
2791
2792 start = -1;
2793 end = t->count;
2794
2795 for ( ; ; ) {
2796 pos = start + (end - start + 1) / 2;
2797
2798 /*
2799 * There is nothing at "end". If we end up there
2800 * we need to insert the new element before end.
2801 */
2802 if (pos == end)
2803 break;
2804
2805 caddr = t->loc[pos].addr;
2806 if (addr == caddr) {
2807 t->loc[pos].count++;
2808 return 1;
2809 }
2810
2811 if (addr < caddr)
2812 end = pos;
2813 else
2814 start = pos;
2815 }
2816
2817 /*
2818 * Not found. Insert new tracking element
2819 */
2820 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
2821 return 0;
2822
2823 l = t->loc + pos;
2824 if (pos < t->count)
2825 memmove(l + 1, l,
2826 (t->count - pos) * sizeof(struct location));
2827 t->count++;
2828 l->count = 1;
2829 l->addr = addr;
2830 return 1;
2831}
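/*
 * Editor's note: a short worked example of the insertion above with
 * made-up addresses. Suppose t->loc holds the sorted addresses
 * { 0x1000, 0x3000 } with counts { 2, 1 }. Then
 *
 *	add_location(t, s, (void *)0x3000)
 *
 * bumps the second counter to { 2, 2 }, while a subsequent
 *
 *	add_location(t, s, (void *)0x2000)
 *
 * inserts a new element in the middle: addresses { 0x1000, 0x2000, 0x3000 }
 * with counts { 2, 1, 2 }. Keeping the array sorted by address lets
 * repeated call sites collapse into a single counter.
 */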
2832
2833static void process_slab(struct loc_track *t, struct kmem_cache *s,
2834 struct page *page, enum track_item alloc)
2835{
2836 void *addr = page_address(page);
2837 unsigned long map[BITS_TO_LONGS(s->objects)];
2838 void *p;
2839
2840 bitmap_zero(map, s->objects);
2841 for (p = page->freelist; p; p = get_freepointer(s, p))
2842 set_bit((p - addr) / s->size, map);
2843
2844 for (p = addr; p < addr + s->objects * s->size; p += s->size)
2845 if (!test_bit((p - addr) / s->size, map)) {
2846 void *addr = get_track(s, p, alloc)->addr;
2847
2848 add_location(t, s, addr);
2849 }
2850}
2851
2852static int list_locations(struct kmem_cache *s, char *buf,
2853 enum track_item alloc)
2854{
2855 int n = 0;
2856 unsigned long i;
2857 struct loc_track t;
2858 int node;
2859
2860 t.count = 0;
2861 t.max = 0;
2862
2863 /* Push back cpu slabs */
2864 flush_all(s);
2865
2866 for_each_online_node(node) {
2867 struct kmem_cache_node *n = get_node(s, node);
2868 unsigned long flags;
2869 struct page *page;
2870
2871 if (!atomic_long_read(&n->nr_slabs))
2872 continue;
2873
2874 spin_lock_irqsave(&n->list_lock, flags);
2875 list_for_each_entry(page, &n->partial, lru)
2876 process_slab(&t, s, page, alloc);
2877 list_for_each_entry(page, &n->full, lru)
2878 process_slab(&t, s, page, alloc);
2879 spin_unlock_irqrestore(&n->list_lock, flags);
2880 }
2881
2882 for (i = 0; i < t.count; i++) {
2883 void *addr = t.loc[i].addr;
2884
2885 if (n > PAGE_SIZE - 100)
2886 break;
2887 n += sprintf(buf + n, "%7ld ", t.loc[i].count);
2888 if (addr)
2889 n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
2890 else
2891 n += sprintf(buf + n, "<not-available>");
2892 n += sprintf(buf + n, "\n");
2893 }
2894
2895 free_loc_track(&t);
2896 if (!t.count)
2897 n += sprintf(buf, "No data\n");
2898 return n;
2899}
2900
2901static unsigned long count_partial(struct kmem_cache_node *n)
2902{
2903 unsigned long flags;
2904 unsigned long x = 0;
2905 struct page *page;
2906
2907 spin_lock_irqsave(&n->list_lock, flags);
2908 list_for_each_entry(page, &n->partial, lru)
2909 x += page->inuse;
2910 spin_unlock_irqrestore(&n->list_lock, flags);
2911 return x;
2912}
2913
2914enum slab_stat_type {
2915 SL_FULL,
2916 SL_PARTIAL,
2917 SL_CPU,
2918 SL_OBJECTS
2919};
2920
2921#define SO_FULL (1 << SL_FULL)
2922#define SO_PARTIAL (1 << SL_PARTIAL)
2923#define SO_CPU (1 << SL_CPU)
2924#define SO_OBJECTS (1 << SL_OBJECTS)
2925
2926static unsigned long slab_objects(struct kmem_cache *s,
2927 char *buf, unsigned long flags)
2928{
2929 unsigned long total = 0;
2930 int cpu;
2931 int node;
2932 int x;
2933 unsigned long *nodes;
2934 unsigned long *per_cpu;
2935
2936 nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
2937 per_cpu = nodes + nr_node_ids;
2938
2939 for_each_possible_cpu(cpu) {
2940 struct page *page = s->cpu_slab[cpu];
2941 int node;
2942
2943 if (page) {
2944 node = page_to_nid(page);
2945 if (flags & SO_CPU) {
2946 int x = 0;
2947
2948 if (flags & SO_OBJECTS)
2949 x = page->inuse;
2950 else
2951 x = 1;
2952 total += x;
2953 nodes[node] += x;
2954 }
2955 per_cpu[node]++;
2956 }
2957 }
2958
2959 for_each_online_node(node) {
2960 struct kmem_cache_node *n = get_node(s, node);
2961
2962 if (flags & SO_PARTIAL) {
2963 if (flags & SO_OBJECTS)
2964 x = count_partial(n);
2965 else
2966 x = n->nr_partial;
2967 total += x;
2968 nodes[node] += x;
2969 }
2970
2971 if (flags & SO_FULL) {
2972 int full_slabs = atomic_long_read(&n->nr_slabs)
2973 - per_cpu[node]
2974 - n->nr_partial;
2975
2976 if (flags & SO_OBJECTS)
2977 x = full_slabs * s->objects;
2978 else
2979 x = full_slabs;
2980 total += x;
2981 nodes[node] += x;
2982 }
2983 }
2984
2985 x = sprintf(buf, "%lu", total);
2986#ifdef CONFIG_NUMA
2987 for_each_online_node(node)
2988 if (nodes[node])
2989 x += sprintf(buf + x, " N%d=%lu",
2990 node, nodes[node]);
2991#endif
2992 kfree(nodes);
2993 return x + sprintf(buf + x, "\n");
2994}
2995
2996static int any_slab_objects(struct kmem_cache *s)
2997{
2998 int node;
2999 int cpu;
3000
3001 for_each_possible_cpu(cpu)
3002 if (s->cpu_slab[cpu])
3003 return 1;
3004
3005 for_each_node(node) {
3006 struct kmem_cache_node *n = get_node(s, node);
3007
3008 if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3009 return 1;
3010 }
3011 return 0;
3012}
3013
3014#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3015#define to_slab(n) container_of(n, struct kmem_cache, kobj)
3016
3017struct slab_attribute {
3018 struct attribute attr;
3019 ssize_t (*show)(struct kmem_cache *s, char *buf);
3020 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3021};
3022
3023#define SLAB_ATTR_RO(_name) \
3024 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3025
3026#define SLAB_ATTR(_name) \
3027 static struct slab_attribute _name##_attr = \
3028 __ATTR(_name, 0644, _name##_show, _name##_store)
3029
3030static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3031{
3032 return sprintf(buf, "%d\n", s->size);
3033}
3034SLAB_ATTR_RO(slab_size);
3035
3036static ssize_t align_show(struct kmem_cache *s, char *buf)
3037{
3038 return sprintf(buf, "%d\n", s->align);
3039}
3040SLAB_ATTR_RO(align);
3041
3042static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3043{
3044 return sprintf(buf, "%d\n", s->objsize);
3045}
3046SLAB_ATTR_RO(object_size);
3047
3048static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3049{
3050 return sprintf(buf, "%d\n", s->objects);
3051}
3052SLAB_ATTR_RO(objs_per_slab);
3053
3054static ssize_t order_show(struct kmem_cache *s, char *buf)
3055{
3056 return sprintf(buf, "%d\n", s->order);
3057}
3058SLAB_ATTR_RO(order);
3059
3060static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3061{
3062 if (s->ctor) {
3063 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3064
3065 return n + sprintf(buf + n, "\n");
3066 }
3067 return 0;
3068}
3069SLAB_ATTR_RO(ctor);
3070
3071static ssize_t dtor_show(struct kmem_cache *s, char *buf)
3072{
3073 if (s->dtor) {
3074 int n = sprint_symbol(buf, (unsigned long)s->dtor);
3075
3076 return n + sprintf(buf + n, "\n");
3077 }
3078 return 0;
3079}
3080SLAB_ATTR_RO(dtor);
3081
3082static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3083{
3084 return sprintf(buf, "%d\n", s->refcount - 1);
3085}
3086SLAB_ATTR_RO(aliases);
3087
3088static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3089{
3090 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
3091}
3092SLAB_ATTR_RO(slabs);
3093
3094static ssize_t partial_show(struct kmem_cache *s, char *buf)
3095{
3096 return slab_objects(s, buf, SO_PARTIAL);
3097}
3098SLAB_ATTR_RO(partial);
3099
3100static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
3101{
3102 return slab_objects(s, buf, SO_CPU);
3103}
3104SLAB_ATTR_RO(cpu_slabs);
3105
3106static ssize_t objects_show(struct kmem_cache *s, char *buf)
3107{
3108 return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
3109}
3110SLAB_ATTR_RO(objects);
3111
3112static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3113{
3114 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
3115}
3116
3117static ssize_t sanity_checks_store(struct kmem_cache *s,
3118 const char *buf, size_t length)
3119{
3120 s->flags &= ~SLAB_DEBUG_FREE;
3121 if (buf[0] == '1')
3122 s->flags |= SLAB_DEBUG_FREE;
3123 return length;
3124}
3125SLAB_ATTR(sanity_checks);
3126
3127static ssize_t trace_show(struct kmem_cache *s, char *buf)
3128{
3129 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
3130}
3131
3132static ssize_t trace_store(struct kmem_cache *s, const char *buf,
3133 size_t length)
3134{
3135 s->flags &= ~SLAB_TRACE;
3136 if (buf[0] == '1')
3137 s->flags |= SLAB_TRACE;
3138 return length;
3139}
3140SLAB_ATTR(trace);
3141
3142static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3143{
3144 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3145}
3146
3147static ssize_t reclaim_account_store(struct kmem_cache *s,
3148 const char *buf, size_t length)
3149{
3150 s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3151 if (buf[0] == '1')
3152 s->flags |= SLAB_RECLAIM_ACCOUNT;
3153 return length;
3154}
3155SLAB_ATTR(reclaim_account);
3156
3157static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3158{
3159 return sprintf(buf, "%d\n", !!(s->flags &
3160 (SLAB_HWCACHE_ALIGN|SLAB_MUST_HWCACHE_ALIGN)));
3161}
3162SLAB_ATTR_RO(hwcache_align);
3163
3164#ifdef CONFIG_ZONE_DMA
3165static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3166{
3167 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3168}
3169SLAB_ATTR_RO(cache_dma);
3170#endif
3171
3172static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3173{
3174 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3175}
3176SLAB_ATTR_RO(destroy_by_rcu);
3177
3178static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3179{
3180 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3181}
3182
3183static ssize_t red_zone_store(struct kmem_cache *s,
3184 const char *buf, size_t length)
3185{
3186 if (any_slab_objects(s))
3187 return -EBUSY;
3188
3189 s->flags &= ~SLAB_RED_ZONE;
3190 if (buf[0] == '1')
3191 s->flags |= SLAB_RED_ZONE;
3192 calculate_sizes(s);
3193 return length;
3194}
3195SLAB_ATTR(red_zone);
3196
3197static ssize_t poison_show(struct kmem_cache *s, char *buf)
3198{
3199 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3200}
3201
3202static ssize_t poison_store(struct kmem_cache *s,
3203 const char *buf, size_t length)
3204{
3205 if (any_slab_objects(s))
3206 return -EBUSY;
3207
3208 s->flags &= ~SLAB_POISON;
3209 if (buf[0] == '1')
3210 s->flags |= SLAB_POISON;
3211 calculate_sizes(s);
3212 return length;
3213}
3214SLAB_ATTR(poison);
3215
3216static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3217{
3218 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3219}
3220
3221static ssize_t store_user_store(struct kmem_cache *s,
3222 const char *buf, size_t length)
3223{
3224 if (any_slab_objects(s))
3225 return -EBUSY;
3226
3227 s->flags &= ~SLAB_STORE_USER;
3228 if (buf[0] == '1')
3229 s->flags |= SLAB_STORE_USER;
3230 calculate_sizes(s);
3231 return length;
3232}
3233SLAB_ATTR(store_user);
3234
3235static ssize_t validate_show(struct kmem_cache *s, char *buf)
3236{
3237 return 0;
3238}
3239
3240static ssize_t validate_store(struct kmem_cache *s,
3241 const char *buf, size_t length)
3242{
3243 if (buf[0] == '1')
3244 validate_slab_cache(s);
3245 else
3246 return -EINVAL;
3247 return length;
3248}
3249SLAB_ATTR(validate);
3250
3251static ssize_t shrink_show(struct kmem_cache *s, char *buf)
3252{
3253 return 0;
3254}
3255
3256static ssize_t shrink_store(struct kmem_cache *s,
3257 const char *buf, size_t length)
3258{
3259 if (buf[0] == '1') {
3260 int rc = kmem_cache_shrink(s);
3261
3262 if (rc)
3263 return rc;
3264 } else
3265 return -EINVAL;
3266 return length;
3267}
3268SLAB_ATTR(shrink);
3269
3270static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
3271{
3272 if (!(s->flags & SLAB_STORE_USER))
3273 return -ENOSYS;
3274 return list_locations(s, buf, TRACK_ALLOC);
3275}
3276SLAB_ATTR_RO(alloc_calls);
3277
3278static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3279{
3280 if (!(s->flags & SLAB_STORE_USER))
3281 return -ENOSYS;
3282 return list_locations(s, buf, TRACK_FREE);
3283}
3284SLAB_ATTR_RO(free_calls);
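/*
 * Editor's note: an illustrative sketch of how these attributes appear
 * from userspace. With the kset declared below, the files live under
 * /sys/slab/<cache>/; the cache name here is only an example:
 *
 *	cat /sys/slab/kmalloc-64/alloc_calls		list allocation sites
 *	echo 1 > /sys/slab/kmalloc-64/validate		walk and check all slabs
 *	echo 1 > /sys/slab/kmalloc-64/shrink		discard empty slabs
 *
 * alloc_calls and free_calls only return data if the cache was created
 * with SLAB_STORE_USER (for example via slub_debug=U).
 */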
3285
3286#ifdef CONFIG_NUMA
3287static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
3288{
3289 return sprintf(buf, "%d\n", s->defrag_ratio / 10);
3290}
3291
3292static ssize_t defrag_ratio_store(struct kmem_cache *s,
3293 const char *buf, size_t length)
3294{
3295 int n = simple_strtoul(buf, NULL, 10);
3296
3297 if (n < 100)
3298 s->defrag_ratio = n * 10;
3299 return length;
3300}
3301SLAB_ATTR(defrag_ratio);
3302#endif
3303
3304static struct attribute * slab_attrs[] = {
3305 &slab_size_attr.attr,
3306 &object_size_attr.attr,
3307 &objs_per_slab_attr.attr,
3308 &order_attr.attr,
3309 &objects_attr.attr,
3310 &slabs_attr.attr,
3311 &partial_attr.attr,
3312 &cpu_slabs_attr.attr,
3313 &ctor_attr.attr,
3314 &dtor_attr.attr,
3315 &aliases_attr.attr,
3316 &align_attr.attr,
3317 &sanity_checks_attr.attr,
3318 &trace_attr.attr,
3319 &hwcache_align_attr.attr,
3320 &reclaim_account_attr.attr,
3321 &destroy_by_rcu_attr.attr,
3322 &red_zone_attr.attr,
3323 &poison_attr.attr,
3324 &store_user_attr.attr,
3325 &validate_attr.attr,
3326 &shrink_attr.attr,
3327 &alloc_calls_attr.attr,
3328 &free_calls_attr.attr,
3329#ifdef CONFIG_ZONE_DMA
3330 &cache_dma_attr.attr,
3331#endif
3332#ifdef CONFIG_NUMA
3333 &defrag_ratio_attr.attr,
3334#endif
3335 NULL
3336};
3337
3338static struct attribute_group slab_attr_group = {
3339 .attrs = slab_attrs,
3340};
3341
3342static ssize_t slab_attr_show(struct kobject *kobj,
3343 struct attribute *attr,
3344 char *buf)
3345{
3346 struct slab_attribute *attribute;
3347 struct kmem_cache *s;
3348 int err;
3349
3350 attribute = to_slab_attr(attr);
3351 s = to_slab(kobj);
3352
3353 if (!attribute->show)
3354 return -EIO;
3355
3356 err = attribute->show(s, buf);
3357
3358 return err;
3359}
3360
3361static ssize_t slab_attr_store(struct kobject *kobj,
3362 struct attribute *attr,
3363 const char *buf, size_t len)
3364{
3365 struct slab_attribute *attribute;
3366 struct kmem_cache *s;
3367 int err;
3368
3369 attribute = to_slab_attr(attr);
3370 s = to_slab(kobj);
3371
3372 if (!attribute->store)
3373 return -EIO;
3374
3375 err = attribute->store(s, buf, len);
3376
3377 return err;
3378}
3379
3380static struct sysfs_ops slab_sysfs_ops = {
3381 .show = slab_attr_show,
3382 .store = slab_attr_store,
3383};
3384
3385static struct kobj_type slab_ktype = {
3386 .sysfs_ops = &slab_sysfs_ops,
3387};
3388
3389static int uevent_filter(struct kset *kset, struct kobject *kobj)
3390{
3391 struct kobj_type *ktype = get_ktype(kobj);
3392
3393 if (ktype == &slab_ktype)
3394 return 1;
3395 return 0;
3396}
3397
3398static struct kset_uevent_ops slab_uevent_ops = {
3399 .filter = uevent_filter,
3400};
3401
3402decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
3403
3404#define ID_STR_LENGTH 64
3405
3406/* Create a unique string id for a slab cache:
3407 * format
3408 * :[flags-]size:[memory address of kmemcache]
3409 */
3410static char *create_unique_id(struct kmem_cache *s)
3411{
3412 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
3413 char *p = name;
3414
3415 BUG_ON(!name);
3416
3417 *p++ = ':';
3418 /*
3419 * First flags affecting slabcache operations. We will only
3420 * get here for aliasable slabs so we do not need to support
3421 * too many flags. The flags here must cover all flags that
3422 * are matched during merging to guarantee that the id is
3423 * unique.
3424 */
3425 if (s->flags & SLAB_CACHE_DMA)
3426 *p++ = 'd';
3427 if (s->flags & SLAB_RECLAIM_ACCOUNT)
3428 *p++ = 'a';
3429 if (s->flags & SLAB_DEBUG_FREE)
3430 *p++ = 'F';
3431 if (p != name + 1)
3432 *p++ = '-';
3433 p += sprintf(p, "%07d", s->size);
3434 BUG_ON(p > name + ID_STR_LENGTH - 1);
3435 return name;
3436}
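/*
 * Editor's note: example ids produced by the code above (sizes made up).
 * A plain 192 byte cache yields ":0000192", while a DMA cache that also
 * has SLAB_DEBUG_FREE set yields ":dF-0000192". The leading ':' keeps
 * these generated names from clashing with real cache names used as
 * sysfs directory entries.
 */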
3437
3438static int sysfs_slab_add(struct kmem_cache *s)
3439{
3440 int err;
3441 const char *name;
3442 int unmergeable;
3443
3444 if (slab_state < SYSFS)
3445 /* Defer until later */
3446 return 0;
3447
3448 unmergeable = slab_unmergeable(s);
3449 if (unmergeable) {
3450 /*
3451 * Slabcache can never be merged so we can use the name proper.
3452 * This is typically the case for debug situations. In that
3453 * case we can catch duplicate names easily.
3454 */
3455 sysfs_remove_link(&slab_subsys.kset.kobj, s->name);
3456 name = s->name;
3457 } else {
3458 /*
3459 * Create a unique name for the slab as a target
3460 * for the symlinks.
3461 */
3462 name = create_unique_id(s);
3463 }
3464
3465 kobj_set_kset_s(s, slab_subsys);
3466 kobject_set_name(&s->kobj, name);
3467 kobject_init(&s->kobj);
3468 err = kobject_add(&s->kobj);
3469 if (err)
3470 return err;
3471
3472 err = sysfs_create_group(&s->kobj, &slab_attr_group);
3473 if (err)
3474 return err;
3475 kobject_uevent(&s->kobj, KOBJ_ADD);
3476 if (!unmergeable) {
3477 /* Setup first alias */
3478 sysfs_slab_alias(s, s->name);
3479 kfree(name);
3480 }
3481 return 0;
3482}
3483
3484static void sysfs_slab_remove(struct kmem_cache *s)
3485{
3486 kobject_uevent(&s->kobj, KOBJ_REMOVE);
3487 kobject_del(&s->kobj);
3488}
3489
3490/*
3491 * Need to buffer aliases during bootup until sysfs becomes
3492 * available lest we lose that information.
3493 */
3494struct saved_alias {
3495 struct kmem_cache *s;
3496 const char *name;
3497 struct saved_alias *next;
3498};
3499
3500struct saved_alias *alias_list;
3501
3502static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
3503{
3504 struct saved_alias *al;
3505
3506 if (slab_state == SYSFS) {
3507 /*
3508 * If we have a leftover link then remove it.
3509 */
3510 sysfs_remove_link(&slab_subsys.kset.kobj, name);
3511 return sysfs_create_link(&slab_subsys.kset.kobj,
3512 &s->kobj, name);
3513 }
3514
3515 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
3516 if (!al)
3517 return -ENOMEM;
3518
3519 al->s = s;
3520 al->name = name;
3521 al->next = alias_list;
3522 alias_list = al;
3523 return 0;
3524}
3525
3526static int __init slab_sysfs_init(void)
3527{
3528 int err;
3529
3530 err = subsystem_register(&slab_subsys);
3531 if (err) {
3532 printk(KERN_ERR "Cannot register slab subsystem.\n");
3533 return -ENOSYS;
3534 }
3535
3536 finish_bootstrap();
3537
3538 while (alias_list) {
3539 struct saved_alias *al = alias_list;
3540
3541 alias_list = alias_list->next;
3542 err = sysfs_slab_alias(al->s, al->name);
3543 BUG_ON(err);
3544 kfree(al);
3545 }
3546
3547 resiliency_test();
3548 return 0;
3549}
3550
3551__initcall(slab_sysfs_init);
3552#else
3553__initcall(finish_bootstrap);
3554#endif