Blame - mm/slub.c - kernel/msm-4.9

blob: 1832ae1ea5366ed62a5ad80c8b667633f6472d06 [file] [log] [blame]

Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1	/*
				2	* SLUB: A slab allocator that limits cache line use instead of queuing
				3	* objects in per cpu and per node lists.
				4	*
				5	* The allocator synchronizes using per slab locks and only
				6	* uses a centralized lock to manage a pool of partial slabs.
				7	*
				8	* (C) 2007 SGI, Christoph Lameter <clameter@sgi.com>
				9	*/
				10
				11	#include <linux/mm.h>
				12	#include <linux/module.h>
				13	#include <linux/bit_spinlock.h>
				14	#include <linux/interrupt.h>
				15	#include <linux/bitops.h>
				16	#include <linux/slab.h>
				17	#include <linux/seq_file.h>
				18	#include <linux/cpu.h>
				19	#include <linux/cpuset.h>
				20	#include <linux/mempolicy.h>
				21	#include <linux/ctype.h>
				22	#include <linux/kallsyms.h>
				23
				24	/*
				25	* Lock order:
				26	* 1. slab_lock(page)
				27	* 2. slab->list_lock
				28	*
				29	* The slab_lock protects operations on the object of a particular
				30	* slab and its metadata in the page struct. If the slab lock
				31	* has been taken then no allocations nor frees can be performed
				32	* on the objects in the slab nor can the slab be added or removed
				33	* from the partial or full lists since this would mean modifying
				34	* the page_struct of the slab.
				35	*
				36	* The list_lock protects the partial and full list on each node and
				37	* the partial slab counter. If taken then no new slabs may be added or
				38	* removed from the lists, nor may the number of partial slabs be modified.
				39	* (Note that the total number of slabs is an atomic value that may be
				40	* modified without taking the list lock).
				41	*
				42	* The list_lock is a centralized lock and thus we avoid taking it as
				43	* much as possible. As long as SLUB does not have to handle partial
				44	* slabs, operations can continue without any centralized lock. F.e.
				45	* allocating a long series of objects that fill up slabs does not require
				46	* the list lock.
				47	*
				48	* The lock order is sometimes inverted when we are trying to get a slab
				49	* off a list. We take the list_lock and then look for a page on the list
				50	* to use. While we do that objects in the slabs may be freed. We can
				51	* only operate on the slab if we have also taken the slab_lock. So we use
				52	* a slab_trylock() on the slab. If trylock was successful then no frees
				53	* can occur anymore and we can use the slab for allocations etc. If the
				54	* slab_trylock() does not succeed then frees are in progress in the slab and
				55	* we must stay away from it for a while since we may cause a bouncing
				56	* cacheline if we try to acquire the lock. So go onto the next slab.
				57	* If all pages are busy then we may allocate a new slab instead of reusing
				58	* a partial slab. A new slab has no one operating on it and thus there is
				59	* no danger of cacheline contention.
				60	*
				61	* Interrupts are disabled during allocation and deallocation in order to
				62	* make the slab allocator safe to use in the context of an irq. In addition
				63	* interrupts are disabled to ensure that the processor does not change
				64	* while handling per_cpu slabs, due to kernel preemption.
				65	*
				66	* SLUB assigns one slab for allocation to each processor.
				67	* Allocations only occur from these slabs called cpu slabs.
				68	*
				69	* Slabs with free elements are kept on a partial list.
				70	* There is no list for full slabs. If an object in a full slab is
				71	* freed then the slab will show up again on the partial lists.
				72	* Otherwise there is no need to track full slabs unless we have to
				73	* track full slabs for debugging purposes.
				74	*
				75	* Slabs are freed when they become empty. Teardown and setup is
				76	* minimal so we rely on the page allocators per cpu caches for
				77	* fast frees and allocs.
				78	*
				79	* Overloading of page flags that are otherwise used for LRU management.
				80	*
				81	* PageActive The slab is used as a cpu cache. Allocations
				82	* may be performed from the slab. The slab is not
				83	* on any slab list and cannot be moved onto one.
				84	*
				85	* PageError Slab requires special handling due to debug
				86	* options set. This moves slab handling out of
				87	* the fast path.
				88	*/
				89
				90	/*
				91	* Issues still to be resolved:
				92	*
				93	* - The per cpu array is updated for each new slab and is a remote
				94	* cacheline for most nodes. This could become a bouncing cacheline given
				95	* enough frequent updates. There are 16 pointers in a cacheline, so at
				96	* max 16 cpus could compete. Likely okay.
				97	*
				98	* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
				99	*
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	100	* - Variable sizing of the per node arrays
				101	*/
				102
				103	/* Enable to test recovery from slab corruption on boot */
				104	#undef SLUB_RESILIENCY_TEST
				105
				106	#if PAGE_SHIFT <= 12
				107
				108	/*
				109	* Small page size. Make sure that we do not fragment memory
				110	*/
				111	#define DEFAULT_MAX_ORDER 1
				112	#define DEFAULT_MIN_OBJECTS 4
				113
				114	#else
				115
				116	/*
				117	* Large page machines are customarily able to handle larger
				118	* page orders.
				119	*/
				120	#define DEFAULT_MAX_ORDER 2
				121	#define DEFAULT_MIN_OBJECTS 8
				122
				123	#endif
				124
				125	/*
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	126	* Minimum number of partial slabs. These will be left on the partial
				127	* lists even if they are empty. kmem_cache_shrink may reclaim them.
				128	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	129	#define MIN_PARTIAL 2
				130
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	131	/*
				132	* Maximum number of desirable partial slabs.
				133	* The existence of more partial slabs makes kmem_cache_shrink
				134	* sort the partial list by the number of objects in use.
				135	*/
				136	#define MAX_PARTIAL 10
				137
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	138	#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE \| SLAB_RED_ZONE \| \
				139	SLAB_POISON \| SLAB_STORE_USER)
				140	/*
				141	* Set of flags that will prevent slab merging
				142	*/
				143	#define SLUB_NEVER_MERGE (SLAB_RED_ZONE \| SLAB_POISON \| SLAB_STORE_USER \| \
				144	SLAB_TRACE \| SLAB_DESTROY_BY_RCU)
				145
				146	#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE \| SLAB_RECLAIM_ACCOUNT \| \
				147	SLAB_CACHE_DMA)
				148
				149	#ifndef ARCH_KMALLOC_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	150	#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	151	#endif
				152
				153	#ifndef ARCH_SLAB_MINALIGN
Christoph Lameter	47bfdc0	2007-05-06 14:49:37 -0700	[diff] [blame]	154	#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	155	#endif
				156
				157	/* Internal SLUB flags */
				158	#define __OBJECT_POISON 0x80000000 /* Poison object */
				159
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	160	/* Not all arches define cache_line_size */
				161	#ifndef cache_line_size
				162	#define cache_line_size() L1_CACHE_BYTES
				163	#endif
				164
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	165	static int kmem_size = sizeof(struct kmem_cache);
				166
				167	#ifdef CONFIG_SMP
				168	static struct notifier_block slab_notifier;
				169	#endif
				170
				171	static enum {
				172	DOWN, /* No slab functionality available */
				173	PARTIAL, /* kmem_cache_open() works but kmalloc does not */
				174	UP, /* Everything works */
				175	SYSFS /* Sysfs up */
				176	} slab_state = DOWN;
				177
				178	/* A list of all slab caches on the system */
				179	static DECLARE_RWSEM(slub_lock);
				180	LIST_HEAD(slab_caches);
				181
				182	#ifdef CONFIG_SYSFS
				183	static int sysfs_slab_add(struct kmem_cache *);
				184	static int sysfs_slab_alias(struct kmem_cache , const char );
				185	static void sysfs_slab_remove(struct kmem_cache *);
				186	#else
				187	static int sysfs_slab_add(struct kmem_cache *s) { return 0; }
				188	static int sysfs_slab_alias(struct kmem_cache s, const char p) { return 0; }
				189	static void sysfs_slab_remove(struct kmem_cache *s) {}
				190	#endif
				191
				192	/********************************************************************
				193	* Core slab cache functions
				194	*******************************************************************/
				195
				196	int slab_is_available(void)
				197	{
				198	return slab_state >= UP;
				199	}
				200
				201	static inline struct kmem_cache_node get_node(struct kmem_cache s, int node)
				202	{
				203	#ifdef CONFIG_NUMA
				204	return s->node[node];
				205	#else
				206	return &s->local_node;
				207	#endif
				208	}
				209
				210	/*
				211	* Object debugging
				212	*/
				213	static void print_section(char text, u8 addr, unsigned int length)
				214	{
				215	int i, offset;
				216	int newline = 1;
				217	char ascii[17];
				218
				219	ascii[16] = 0;
				220
				221	for (i = 0; i < length; i++) {
				222	if (newline) {
				223	printk(KERN_ERR "%10s 0x%p: ", text, addr + i);
				224	newline = 0;
				225	}
				226	printk(" %02x", addr[i]);
				227	offset = i % 16;
				228	ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
				229	if (offset == 15) {
				230	printk(" %s\n",ascii);
				231	newline = 1;
				232	}
				233	}
				234	if (!newline) {
				235	i %= 16;
				236	while (i < 16) {
				237	printk(" ");
				238	ascii[i] = ' ';
				239	i++;
				240	}
				241	printk(" %s\n", ascii);
				242	}
				243	}
				244
				245	/*
				246	* Slow version of get and set free pointer.
				247	*
				248	* This requires touching the cache lines of kmem_cache.
				249	* The offset can also be obtained from the page. In that
				250	* case it is in the cacheline that we already need to touch.
				251	*/
				252	static void get_freepointer(struct kmem_cache s, void *object)
				253	{
				254	return (void *)(object + s->offset);
				255	}
				256
				257	static void set_freepointer(struct kmem_cache s, void object, void *fp)
				258	{
				259	(void *)(object + s->offset) = fp;
				260	}
				261
				262	/*
				263	* Tracking user of a slab.
				264	*/
				265	struct track {
				266	void addr; / Called from address */
				267	int cpu; /* Was running on cpu */
				268	int pid; /* Pid context */
				269	unsigned long when; /* When did the operation occur */
				270	};
				271
				272	enum track_item { TRACK_ALLOC, TRACK_FREE };
				273
				274	static struct track get_track(struct kmem_cache s, void *object,
				275	enum track_item alloc)
				276	{
				277	struct track *p;
				278
				279	if (s->offset)
				280	p = object + s->offset + sizeof(void *);
				281	else
				282	p = object + s->inuse;
				283
				284	return p + alloc;
				285	}
				286
				287	static void set_track(struct kmem_cache s, void object,
				288	enum track_item alloc, void *addr)
				289	{
				290	struct track *p;
				291
				292	if (s->offset)
				293	p = object + s->offset + sizeof(void *);
				294	else
				295	p = object + s->inuse;
				296
				297	p += alloc;
				298	if (addr) {
				299	p->addr = addr;
				300	p->cpu = smp_processor_id();
				301	p->pid = current ? current->pid : -1;
				302	p->when = jiffies;
				303	} else
				304	memset(p, 0, sizeof(struct track));
				305	}
				306
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	307	static void init_tracking(struct kmem_cache s, void object)
				308	{
				309	if (s->flags & SLAB_STORE_USER) {
				310	set_track(s, object, TRACK_FREE, NULL);
				311	set_track(s, object, TRACK_ALLOC, NULL);
				312	}
				313	}
				314
				315	static void print_track(const char s, struct track t)
				316	{
				317	if (!t->addr)
				318	return;
				319
				320	printk(KERN_ERR "%s: ", s);
				321	__print_symbol("%s", (unsigned long)t->addr);
				322	printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
				323	}
				324
				325	static void print_trailer(struct kmem_cache s, u8 p)
				326	{
				327	unsigned int off; /* Offset of last byte */
				328
				329	if (s->flags & SLAB_RED_ZONE)
				330	print_section("Redzone", p + s->objsize,
				331	s->inuse - s->objsize);
				332
				333	printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n",
				334	p + s->offset,
				335	get_freepointer(s, p));
				336
				337	if (s->offset)
				338	off = s->offset + sizeof(void *);
				339	else
				340	off = s->inuse;
				341
				342	if (s->flags & SLAB_STORE_USER) {
				343	print_track("Last alloc", get_track(s, p, TRACK_ALLOC));
				344	print_track("Last free ", get_track(s, p, TRACK_FREE));
				345	off += 2 * sizeof(struct track);
				346	}
				347
				348	if (off != s->size)
				349	/* Beginning of the filler is the free pointer */
				350	print_section("Filler", p + off, s->size - off);
				351	}
				352
				353	static void object_err(struct kmem_cache s, struct page page,
				354	u8 object, char reason)
				355	{
				356	u8 *addr = page_address(page);
				357
				358	printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n",
				359	s->name, reason, object, page);
				360	printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n",
				361	object - addr, page->flags, page->inuse, page->freelist);
				362	if (object > addr + 16)
				363	print_section("Bytes b4", object - 16, 16);
				364	print_section("Object", object, min(s->objsize, 128));
				365	print_trailer(s, object);
				366	dump_stack();
				367	}
				368
				369	static void slab_err(struct kmem_cache s, struct page page, char *reason, ...)
				370	{
				371	va_list args;
				372	char buf[100];
				373
				374	va_start(args, reason);
				375	vsnprintf(buf, sizeof(buf), reason, args);
				376	va_end(args);
				377	printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf,
				378	page);
				379	dump_stack();
				380	}
				381
				382	static void init_object(struct kmem_cache s, void object, int active)
				383	{
				384	u8 *p = object;
				385
				386	if (s->flags & __OBJECT_POISON) {
				387	memset(p, POISON_FREE, s->objsize - 1);
				388	p[s->objsize -1] = POISON_END;
				389	}
				390
				391	if (s->flags & SLAB_RED_ZONE)
				392	memset(p + s->objsize,
				393	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
				394	s->inuse - s->objsize);
				395	}
				396
				397	static int check_bytes(u8 *start, unsigned int value, unsigned int bytes)
				398	{
				399	while (bytes) {
				400	if (*start != (u8)value)
				401	return 0;
				402	start++;
				403	bytes--;
				404	}
				405	return 1;
				406	}
				407
Christoph Lameter	abcd08a	2007-05-09 02:32:37 -0700	[diff] [blame^]	408	static inline int check_valid_pointer(struct kmem_cache *s,
				409	struct page page, const void object)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	410	{
				411	void *base;
				412
				413	if (!object)
				414	return 1;
				415
				416	base = page_address(page);
				417	if (object < base \|\| object >= base + s->objects * s->size \|\|
				418	(object - base) % s->size) {
				419	return 0;
				420	}
				421
				422	return 1;
				423	}
				424
				425	/*
				426	* Object layout:
				427	*
				428	* object address
				429	* Bytes of the object to be managed.
				430	* If the freepointer may overlay the object then the free
				431	* pointer is the first word of the object.
				432	* Poisoning uses 0x6b (POISON_FREE) and the last byte is
				433	* 0xa5 (POISON_END)
				434	*
				435	* object + s->objsize
				436	* Padding to reach word boundary. This is also used for Redzoning.
				437	* Padding is extended to word size if Redzoning is enabled
				438	* and objsize == inuse.
				439	* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
				440	* 0xcc (RED_ACTIVE) for objects in use.
				441	*
				442	* object + s->inuse
				443	* A. Free pointer (if we cannot overwrite object on free)
				444	* B. Tracking data for SLAB_STORE_USER
				445	* C. Padding to reach required alignment boundary
				446	* Padding is done using 0x5a (POISON_INUSE)
				447	*
				448	* object + s->size
				449	*
				450	* If slabcaches are merged then the objsize and inuse boundaries are to
				451	* be ignored. And therefore no slab options that rely on these boundaries
				452	* may be used with merged slabcaches.
				453	*/
				454
				455	static void restore_bytes(struct kmem_cache s, char message, u8 data,
				456	void from, void to)
				457	{
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	458	printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	459	s->name, message, data, from, to - 1);
				460	memset(from, data, to - from);
				461	}
				462
				463	static int check_pad_bytes(struct kmem_cache s, struct page page, u8 *p)
				464	{
				465	unsigned long off = s->inuse; /* The end of info */
				466
				467	if (s->offset)
				468	/* Freepointer is placed after the object. */
				469	off += sizeof(void *);
				470
				471	if (s->flags & SLAB_STORE_USER)
				472	/* We also have user information there */
				473	off += 2 * sizeof(struct track);
				474
				475	if (s->size == off)
				476	return 1;
				477
				478	if (check_bytes(p + off, POISON_INUSE, s->size - off))
				479	return 1;
				480
				481	object_err(s, page, p, "Object padding check fails");
				482
				483	/*
				484	* Restore padding
				485	*/
				486	restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size);
				487	return 0;
				488	}
				489
				490	static int slab_pad_check(struct kmem_cache s, struct page page)
				491	{
				492	u8 *p;
				493	int length, remainder;
				494
				495	if (!(s->flags & SLAB_POISON))
				496	return 1;
				497
				498	p = page_address(page);
				499	length = s->objects * s->size;
				500	remainder = (PAGE_SIZE << s->order) - length;
				501	if (!remainder)
				502	return 1;
				503
				504	if (!check_bytes(p + length, POISON_INUSE, remainder)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	505	slab_err(s, page, "Padding check failed");
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	506	restore_bytes(s, "slab padding", POISON_INUSE, p + length,
				507	p + length + remainder);
				508	return 0;
				509	}
				510	return 1;
				511	}
				512
				513	static int check_object(struct kmem_cache s, struct page page,
				514	void *object, int active)
				515	{
				516	u8 *p = object;
				517	u8 *endobject = object + s->objsize;
				518
				519	if (s->flags & SLAB_RED_ZONE) {
				520	unsigned int red =
				521	active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
				522
				523	if (!check_bytes(endobject, red, s->inuse - s->objsize)) {
				524	object_err(s, page, object,
				525	active ? "Redzone Active" : "Redzone Inactive");
				526	restore_bytes(s, "redzone", red,
				527	endobject, object + s->inuse);
				528	return 0;
				529	}
				530	} else {
				531	if ((s->flags & SLAB_POISON) && s->objsize < s->inuse &&
				532	!check_bytes(endobject, POISON_INUSE,
				533	s->inuse - s->objsize)) {
				534	object_err(s, page, p, "Alignment padding check fails");
				535	/*
				536	* Fix it so that there will not be another report.
				537	*
				538	* Hmmm... We may be corrupting an object that now expects
				539	* to be longer than allowed.
				540	*/
				541	restore_bytes(s, "alignment padding", POISON_INUSE,
				542	endobject, object + s->inuse);
				543	}
				544	}
				545
				546	if (s->flags & SLAB_POISON) {
				547	if (!active && (s->flags & __OBJECT_POISON) &&
				548	(!check_bytes(p, POISON_FREE, s->objsize - 1) \|\|
				549	p[s->objsize - 1] != POISON_END)) {
				550
				551	object_err(s, page, p, "Poison check failed");
				552	restore_bytes(s, "Poison", POISON_FREE,
				553	p, p + s->objsize -1);
				554	restore_bytes(s, "Poison", POISON_END,
				555	p + s->objsize - 1, p + s->objsize);
				556	return 0;
				557	}
				558	/*
				559	* check_pad_bytes cleans up on its own.
				560	*/
				561	check_pad_bytes(s, page, p);
				562	}
				563
				564	if (!s->offset && active)
				565	/*
				566	* Object and freepointer overlap. Cannot check
				567	* freepointer while object is allocated.
				568	*/
				569	return 1;
				570
				571	/* Check free pointer validity */
				572	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
				573	object_err(s, page, p, "Freepointer corrupt");
				574	/*
				575	* No choice but to zap it and thus lose the remainder
				576	* of the free objects in this slab. May cause
				577	* another error because the object count may be
				578	* wrong now.
				579	*/
				580	set_freepointer(s, p, NULL);
				581	return 0;
				582	}
				583	return 1;
				584	}
				585
				586	static int check_slab(struct kmem_cache s, struct page page)
				587	{
				588	VM_BUG_ON(!irqs_disabled());
				589
				590	if (!PageSlab(page)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	591	slab_err(s, page, "Not a valid slab page flags=%lx "
				592	"mapping=0x%p count=%d", page->flags, page->mapping,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	593	page_count(page));
				594	return 0;
				595	}
				596	if (page->offset * sizeof(void *) != s->offset) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	597	slab_err(s, page, "Corrupted offset %lu flags=0x%lx "
				598	"mapping=0x%p count=%d",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	599	(unsigned long)(page->offset * sizeof(void *)),
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	600	page->flags,
				601	page->mapping,
				602	page_count(page));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	603	return 0;
				604	}
				605	if (page->inuse > s->objects) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	606	slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx "
				607	"mapping=0x%p count=%d",
				608	s->name, page->inuse, s->objects, page->flags,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	609	page->mapping, page_count(page));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	610	return 0;
				611	}
				612	/* Slab_pad_check fixes things up after itself */
				613	slab_pad_check(s, page);
				614	return 1;
				615	}
				616
				617	/*
				618	* Determine if a certain object on a page is on the freelist and
				619	* therefore free. Must hold the slab lock for cpu slabs to
				620	* guarantee that the chains are consistent.
				621	*/
				622	static int on_freelist(struct kmem_cache s, struct page page, void *search)
				623	{
				624	int nr = 0;
				625	void *fp = page->freelist;
				626	void *object = NULL;
				627
				628	while (fp && nr <= s->objects) {
				629	if (fp == search)
				630	return 1;
				631	if (!check_valid_pointer(s, page, fp)) {
				632	if (object) {
				633	object_err(s, page, object,
				634	"Freechain corrupt");
				635	set_freepointer(s, object, NULL);
				636	break;
				637	} else {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	638	slab_err(s, page, "Freepointer 0x%p corrupt",
				639	fp);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	640	page->freelist = NULL;
				641	page->inuse = s->objects;
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	642	printk(KERN_ERR "@@@ SLUB %s: Freelist "
				643	"cleared. Slab 0x%p\n",
				644	s->name, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	645	return 0;
				646	}
				647	break;
				648	}
				649	object = fp;
				650	fp = get_freepointer(s, object);
				651	nr++;
				652	}
				653
				654	if (page->inuse != s->objects - nr) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	655	slab_err(s, page, "Wrong object count. Counter is %d but "
				656	"counted were %d", s, page, page->inuse,
				657	s->objects - nr);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	658	page->inuse = s->objects - nr;
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	659	printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. "
				660	"Slab @0x%p\n", s->name, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	661	}
				662	return search == NULL;
				663	}
				664
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	665	/*
				666	* Tracking of fully allocated slabs for debugging
				667	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	668	static void add_full(struct kmem_cache_node n, struct page page)
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	669	{
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	670	spin_lock(&n->list_lock);
				671	list_add(&page->lru, &n->full);
				672	spin_unlock(&n->list_lock);
				673	}
				674
				675	static void remove_full(struct kmem_cache s, struct page page)
				676	{
				677	struct kmem_cache_node *n;
				678
				679	if (!(s->flags & SLAB_STORE_USER))
				680	return;
				681
				682	n = get_node(s, page_to_nid(page));
				683
				684	spin_lock(&n->list_lock);
				685	list_del(&page->lru);
				686	spin_unlock(&n->list_lock);
				687	}
				688
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	689	static int alloc_object_checks(struct kmem_cache s, struct page page,
				690	void *object)
				691	{
				692	if (!check_slab(s, page))
				693	goto bad;
				694
				695	if (object && !on_freelist(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	696	slab_err(s, page, "Object 0x%p already allocated", object);
				697	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	698	}
				699
				700	if (!check_valid_pointer(s, page, object)) {
				701	object_err(s, page, object, "Freelist Pointer check fails");
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	702	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	703	}
				704
				705	if (!object)
				706	return 1;
				707
				708	if (!check_object(s, page, object, 0))
				709	goto bad;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	710
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	711	return 1;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	712	bad:
				713	if (PageSlab(page)) {
				714	/*
				715	* If this is a slab page then let's do the best we can
				716	* to avoid issues in the future. Marking all objects
				717	* as used avoids touching the remainder.
				718	*/
				719	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
				720	s->name, page);
				721	page->inuse = s->objects;
				722	page->freelist = NULL;
				723	/* Fix up fields that may be corrupted */
				724	page->offset = s->offset / sizeof(void *);
				725	}
				726	return 0;
				727	}
				728
				729	static int free_object_checks(struct kmem_cache s, struct page page,
				730	void *object)
				731	{
				732	if (!check_slab(s, page))
				733	goto fail;
				734
				735	if (!check_valid_pointer(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	736	slab_err(s, page, "Invalid object pointer 0x%p", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	737	goto fail;
				738	}
				739
				740	if (on_freelist(s, page, object)) {
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	741	slab_err(s, page, "Object 0x%p already free", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	742	goto fail;
				743	}
				744
				745	if (!check_object(s, page, object, 1))
				746	return 0;
				747
				748	if (unlikely(s != page->slab)) {
				749	if (!PageSlab(page))
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	750	slab_err(s, page, "Attempt to free object(0x%p) "
				751	"outside of slab", object);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	752	else
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	753	if (!page->slab) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	754	printk(KERN_ERR
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	755	"SLUB <none>: no slab for object 0x%p.\n",
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	756	object);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	757	dump_stack();
				758	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	759	else
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	760	slab_err(s, page, "object at 0x%p belongs "
				761	"to slab %s", object, page->slab->name);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	762	goto fail;
				763	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	764	return 1;
				765	fail:
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	766	printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
				767	s->name, page, object);
				768	return 0;
				769	}
				770
				771	/*
				772	* Slab allocation and freeing
				773	*/
				774	static struct page allocate_slab(struct kmem_cache s, gfp_t flags, int node)
				775	{
				776	struct page * page;
				777	int pages = 1 << s->order;
				778
				779	if (s->order)
				780	flags \|= __GFP_COMP;
				781
				782	if (s->flags & SLAB_CACHE_DMA)
				783	flags \|= SLUB_DMA;
				784
				785	if (node == -1)
				786	page = alloc_pages(flags, s->order);
				787	else
				788	page = alloc_pages_node(node, flags, s->order);
				789
				790	if (!page)
				791	return NULL;
				792
				793	mod_zone_page_state(page_zone(page),
				794	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				795	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				796	pages);
				797
				798	return page;
				799	}
				800
				801	static void setup_object(struct kmem_cache s, struct page page,
				802	void *object)
				803	{
				804	if (PageError(page)) {
				805	init_object(s, object, 0);
				806	init_tracking(s, object);
				807	}
				808
Christoph Lameter	4f10493	2007-05-06 14:50:17 -0700	[diff] [blame]	809	if (unlikely(s->ctor))
				810	s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	811	}
				812
				813	static struct page new_slab(struct kmem_cache s, gfp_t flags, int node)
				814	{
				815	struct page *page;
				816	struct kmem_cache_node *n;
				817	void *start;
				818	void *end;
				819	void *last;
				820	void *p;
				821
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	822	BUG_ON(flags & ~(GFP_DMA \| GFP_LEVEL_MASK));
				823
				824	if (flags & __GFP_WAIT)
				825	local_irq_enable();
				826
				827	page = allocate_slab(s, flags & GFP_LEVEL_MASK, node);
				828	if (!page)
				829	goto out;
				830
				831	n = get_node(s, page_to_nid(page));
				832	if (n)
				833	atomic_long_inc(&n->nr_slabs);
				834	page->offset = s->offset / sizeof(void *);
				835	page->slab = s;
				836	page->flags \|= 1 << PG_slab;
				837	if (s->flags & (SLAB_DEBUG_FREE \| SLAB_RED_ZONE \| SLAB_POISON \|
				838	SLAB_STORE_USER \| SLAB_TRACE))
				839	page->flags \|= 1 << PG_error;
				840
				841	start = page_address(page);
				842	end = start + s->objects * s->size;
				843
				844	if (unlikely(s->flags & SLAB_POISON))
				845	memset(start, POISON_INUSE, PAGE_SIZE << s->order);
				846
				847	last = start;
				848	for (p = start + s->size; p < end; p += s->size) {
				849	setup_object(s, page, last);
				850	set_freepointer(s, last, p);
				851	last = p;
				852	}
				853	setup_object(s, page, last);
				854	set_freepointer(s, last, NULL);
				855
				856	page->freelist = start;
				857	page->inuse = 0;
				858	out:
				859	if (flags & __GFP_WAIT)
				860	local_irq_disable();
				861	return page;
				862	}
				863
				864	static void __free_slab(struct kmem_cache s, struct page page)
				865	{
				866	int pages = 1 << s->order;
				867
				868	if (unlikely(PageError(page) \|\| s->dtor)) {
				869	void *start = page_address(page);
				870	void *end = start + (pages << PAGE_SHIFT);
				871	void *p;
				872
				873	slab_pad_check(s, page);
				874	for (p = start; p <= end - s->size; p += s->size) {
				875	if (s->dtor)
				876	s->dtor(p, s, 0);
				877	check_object(s, page, p, 0);
				878	}
				879	}
				880
				881	mod_zone_page_state(page_zone(page),
				882	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				883	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				884	- pages);
				885
				886	page->mapping = NULL;
				887	__free_pages(page, s->order);
				888	}
				889
				890	static void rcu_free_slab(struct rcu_head *h)
				891	{
				892	struct page *page;
				893
				894	page = container_of((struct list_head *)h, struct page, lru);
				895	__free_slab(page->slab, page);
				896	}
				897
				898	static void free_slab(struct kmem_cache s, struct page page)
				899	{
				900	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
				901	/*
				902	* RCU free overloads the RCU head over the LRU
				903	*/
				904	struct rcu_head head = (void )&page->lru;
				905
				906	call_rcu(head, rcu_free_slab);
				907	} else
				908	__free_slab(s, page);
				909	}
				910
				911	static void discard_slab(struct kmem_cache s, struct page page)
				912	{
				913	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				914
				915	atomic_long_dec(&n->nr_slabs);
				916	reset_page_mapcount(page);
				917	page->flags &= ~(1 << PG_slab \| 1 << PG_error);
				918	free_slab(s, page);
				919	}
				920
				921	/*
				922	* Per slab locking using the pagelock
				923	*/
				924	static __always_inline void slab_lock(struct page *page)
				925	{
				926	bit_spin_lock(PG_locked, &page->flags);
				927	}
				928
				929	static __always_inline void slab_unlock(struct page *page)
				930	{
				931	bit_spin_unlock(PG_locked, &page->flags);
				932	}
				933
				934	static __always_inline int slab_trylock(struct page *page)
				935	{
				936	int rc = 1;
				937
				938	rc = bit_spin_trylock(PG_locked, &page->flags);
				939	return rc;
				940	}
				941
				942	/*
				943	* Management of partially allocated slabs
				944	*/
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	945	static void add_partial_tail(struct kmem_cache_node n, struct page page)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	946	{
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	947	spin_lock(&n->list_lock);
				948	n->nr_partial++;
				949	list_add_tail(&page->lru, &n->partial);
				950	spin_unlock(&n->list_lock);
				951	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	952
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	953	static void add_partial(struct kmem_cache_node n, struct page page)
				954	{
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	955	spin_lock(&n->list_lock);
				956	n->nr_partial++;
				957	list_add(&page->lru, &n->partial);
				958	spin_unlock(&n->list_lock);
				959	}
				960
				961	static void remove_partial(struct kmem_cache *s,
				962	struct page *page)
				963	{
				964	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				965
				966	spin_lock(&n->list_lock);
				967	list_del(&page->lru);
				968	n->nr_partial--;
				969	spin_unlock(&n->list_lock);
				970	}
				971
				972	/*
				973	* Lock page and remove it from the partial list
				974	*
				975	* Must hold list_lock
				976	*/
				977	static int lock_and_del_slab(struct kmem_cache_node n, struct page page)
				978	{
				979	if (slab_trylock(page)) {
				980	list_del(&page->lru);
				981	n->nr_partial--;
				982	return 1;
				983	}
				984	return 0;
				985	}
				986
				987	/*
				988	* Try to get a partial slab from a specific node
				989	*/
				990	static struct page get_partial_node(struct kmem_cache_node n)
				991	{
				992	struct page *page;
				993
				994	/*
				995	* Racy check. If we mistakenly see no partial slabs then we
				996	* just allocate an empty slab. If we mistakenly try to get a
				997	* partial slab then get_partials() will return NULL.
				998	*/
				999	if (!n \|\| !n->nr_partial)
				1000	return NULL;
				1001
				1002	spin_lock(&n->list_lock);
				1003	list_for_each_entry(page, &n->partial, lru)
				1004	if (lock_and_del_slab(n, page))
				1005	goto out;
				1006	page = NULL;
				1007	out:
				1008	spin_unlock(&n->list_lock);
				1009	return page;
				1010	}
				1011
				1012	/*
				1013	* Get a page from somewhere. Search in increasing NUMA
				1014	* distances.
				1015	*/
				1016	static struct page get_any_partial(struct kmem_cache s, gfp_t flags)
				1017	{
				1018	#ifdef CONFIG_NUMA
				1019	struct zonelist *zonelist;
				1020	struct zone **z;
				1021	struct page *page;
				1022
				1023	/*
				1024	* The defrag ratio allows to configure the tradeoffs between
				1025	* inter node defragmentation and node local allocations.
				1026	* A lower defrag_ratio increases the tendency to do local
				1027	* allocations instead of scanning throught the partial
				1028	* lists on other nodes.
				1029	*
				1030	* If defrag_ratio is set to 0 then kmalloc() always
				1031	* returns node local objects. If its higher then kmalloc()
				1032	* may return off node objects in order to avoid fragmentation.
				1033	*
				1034	* A higher ratio means slabs may be taken from other nodes
				1035	* thus reducing the number of partial slabs on those nodes.
				1036	*
				1037	* If /sys/slab/xx/defrag_ratio is set to 100 (which makes
				1038	* defrag_ratio = 1000) then every (well almost) allocation
				1039	* will first attempt to defrag slab caches on other nodes. This
				1040	* means scanning over all nodes to look for partial slabs which
				1041	* may be a bit expensive to do on every slab allocation.
				1042	*/
				1043	if (!s->defrag_ratio \|\| get_cycles() % 1024 > s->defrag_ratio)
				1044	return NULL;
				1045
				1046	zonelist = &NODE_DATA(slab_node(current->mempolicy))
				1047	->node_zonelists[gfp_zone(flags)];
				1048	for (z = zonelist->zones; *z; z++) {
				1049	struct kmem_cache_node *n;
				1050
				1051	n = get_node(s, zone_to_nid(*z));
				1052
				1053	if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1054	n->nr_partial > MIN_PARTIAL) {
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1055	page = get_partial_node(n);
				1056	if (page)
				1057	return page;
				1058	}
				1059	}
				1060	#endif
				1061	return NULL;
				1062	}
				1063
				1064	/*
				1065	* Get a partial page, lock it and return it.
				1066	*/
				1067	static struct page get_partial(struct kmem_cache s, gfp_t flags, int node)
				1068	{
				1069	struct page *page;
				1070	int searchnode = (node == -1) ? numa_node_id() : node;
				1071
				1072	page = get_partial_node(get_node(s, searchnode));
				1073	if (page \|\| (flags & __GFP_THISNODE))
				1074	return page;
				1075
				1076	return get_any_partial(s, flags);
				1077	}
				1078
				1079	/*
				1080	* Move a page back to the lists.
				1081	*
				1082	* Must be called with the slab lock held.
				1083	*
				1084	* On exit the slab lock will have been dropped.
				1085	*/
				1086	static void putback_slab(struct kmem_cache s, struct page page)
				1087	{
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1088	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				1089
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1090	if (page->inuse) {
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1091
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1092	if (page->freelist)
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1093	add_partial(n, page);
				1094	else if (PageError(page) && (s->flags & SLAB_STORE_USER))
				1095	add_full(n, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1096	slab_unlock(page);
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1097
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1098	} else {
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1099	if (n->nr_partial < MIN_PARTIAL) {
				1100	/*
				1101	* Adding an empty page to the partial slabs in order
				1102	* to avoid page allocator overhead. This page needs to
				1103	* come after all the others that are not fully empty
				1104	* in order to make sure that we do maximum
				1105	* defragmentation.
				1106	*/
				1107	add_partial_tail(n, page);
				1108	slab_unlock(page);
				1109	} else {
				1110	slab_unlock(page);
				1111	discard_slab(s, page);
				1112	}
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1113	}
				1114	}
				1115
				1116	/*
				1117	* Remove the cpu slab
				1118	*/
				1119	static void deactivate_slab(struct kmem_cache s, struct page page, int cpu)
				1120	{
				1121	s->cpu_slab[cpu] = NULL;
				1122	ClearPageActive(page);
				1123
				1124	putback_slab(s, page);
				1125	}
				1126
				1127	static void flush_slab(struct kmem_cache s, struct page page, int cpu)
				1128	{
				1129	slab_lock(page);
				1130	deactivate_slab(s, page, cpu);
				1131	}
				1132
				1133	/*
				1134	* Flush cpu slab.
				1135	* Called from IPI handler with interrupts disabled.
				1136	*/
				1137	static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
				1138	{
				1139	struct page *page = s->cpu_slab[cpu];
				1140
				1141	if (likely(page))
				1142	flush_slab(s, page, cpu);
				1143	}
				1144
				1145	static void flush_cpu_slab(void *d)
				1146	{
				1147	struct kmem_cache *s = d;
				1148	int cpu = smp_processor_id();
				1149
				1150	__flush_cpu_slab(s, cpu);
				1151	}
				1152
				1153	static void flush_all(struct kmem_cache *s)
				1154	{
				1155	#ifdef CONFIG_SMP
				1156	on_each_cpu(flush_cpu_slab, s, 1, 1);
				1157	#else
				1158	unsigned long flags;
				1159
				1160	local_irq_save(flags);
				1161	flush_cpu_slab(s);
				1162	local_irq_restore(flags);
				1163	#endif
				1164	}
				1165
				1166	/*
				1167	* slab_alloc is optimized to only modify two cachelines on the fast path
				1168	* (aside from the stack):
				1169	*
				1170	* 1. The page struct
				1171	* 2. The first cacheline of the object to be allocated.
				1172	*
				1173	* The only cache lines that are read (apart from code) is the
				1174	* per cpu array in the kmem_cache struct.
				1175	*
				1176	* Fastpath is not possible if we need to get a new slab or have
				1177	* debugging enabled (which means all slabs are marked with PageError)
				1178	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1179	static void slab_alloc(struct kmem_cache s,
				1180	gfp_t gfpflags, int node, void *addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1181	{
				1182	struct page *page;
				1183	void **object;
				1184	unsigned long flags;
				1185	int cpu;
				1186
				1187	local_irq_save(flags);
				1188	cpu = smp_processor_id();
				1189	page = s->cpu_slab[cpu];
				1190	if (!page)
				1191	goto new_slab;
				1192
				1193	slab_lock(page);
				1194	if (unlikely(node != -1 && page_to_nid(page) != node))
				1195	goto another_slab;
				1196	redo:
				1197	object = page->freelist;
				1198	if (unlikely(!object))
				1199	goto another_slab;
				1200	if (unlikely(PageError(page)))
				1201	goto debug;
				1202
				1203	have_object:
				1204	page->inuse++;
				1205	page->freelist = object[page->offset];
				1206	slab_unlock(page);
				1207	local_irq_restore(flags);
				1208	return object;
				1209
				1210	another_slab:
				1211	deactivate_slab(s, page, cpu);
				1212
				1213	new_slab:
				1214	page = get_partial(s, gfpflags, node);
				1215	if (likely(page)) {
				1216	have_slab:
				1217	s->cpu_slab[cpu] = page;
				1218	SetPageActive(page);
				1219	goto redo;
				1220	}
				1221
				1222	page = new_slab(s, gfpflags, node);
				1223	if (page) {
				1224	cpu = smp_processor_id();
				1225	if (s->cpu_slab[cpu]) {
				1226	/*
				1227	* Someone else populated the cpu_slab while we enabled
				1228	* interrupts, or we have got scheduled on another cpu.
				1229	* The page may not be on the requested node.
				1230	*/
				1231	if (node == -1 \|\|
				1232	page_to_nid(s->cpu_slab[cpu]) == node) {
				1233	/*
				1234	* Current cpuslab is acceptable and we
				1235	* want the current one since its cache hot
				1236	*/
				1237	discard_slab(s, page);
				1238	page = s->cpu_slab[cpu];
				1239	slab_lock(page);
				1240	goto redo;
				1241	}
				1242	/* Dump the current slab */
				1243	flush_slab(s, s->cpu_slab[cpu], cpu);
				1244	}
				1245	slab_lock(page);
				1246	goto have_slab;
				1247	}
				1248	local_irq_restore(flags);
				1249	return NULL;
				1250	debug:
				1251	if (!alloc_object_checks(s, page, object))
				1252	goto another_slab;
				1253	if (s->flags & SLAB_STORE_USER)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1254	set_track(s, object, TRACK_ALLOC, addr);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	1255	if (s->flags & SLAB_TRACE) {
				1256	printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n",
				1257	s->name, object, page->inuse,
				1258	page->freelist);
				1259	dump_stack();
				1260	}
				1261	init_object(s, object, 1);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1262	goto have_object;
				1263	}
				1264
				1265	void kmem_cache_alloc(struct kmem_cache s, gfp_t gfpflags)
				1266	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1267	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1268	}
				1269	EXPORT_SYMBOL(kmem_cache_alloc);
				1270
				1271	#ifdef CONFIG_NUMA
				1272	void kmem_cache_alloc_node(struct kmem_cache s, gfp_t gfpflags, int node)
				1273	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1274	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1275	}
				1276	EXPORT_SYMBOL(kmem_cache_alloc_node);
				1277	#endif
				1278
				1279	/*
				1280	* The fastpath only writes the cacheline of the page struct and the first
				1281	* cacheline of the object.
				1282	*
				1283	* No special cachelines need to be read
				1284	*/
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1285	static void slab_free(struct kmem_cache s, struct page page,
				1286	void x, void addr)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1287	{
				1288	void *prior;
				1289	void *object = (void )x;
				1290	unsigned long flags;
				1291
				1292	local_irq_save(flags);
				1293	slab_lock(page);
				1294
				1295	if (unlikely(PageError(page)))
				1296	goto debug;
				1297	checks_ok:
				1298	prior = object[page->offset] = page->freelist;
				1299	page->freelist = object;
				1300	page->inuse--;
				1301
				1302	if (unlikely(PageActive(page)))
				1303	/*
				1304	* Cpu slabs are never on partial lists and are
				1305	* never freed.
				1306	*/
				1307	goto out_unlock;
				1308
				1309	if (unlikely(!page->inuse))
				1310	goto slab_empty;
				1311
				1312	/*
				1313	* Objects left in the slab. If it
				1314	* was not on the partial list before
				1315	* then add it.
				1316	*/
				1317	if (unlikely(!prior))
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1318	add_partial(get_node(s, page_to_nid(page)), page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1319
				1320	out_unlock:
				1321	slab_unlock(page);
				1322	local_irq_restore(flags);
				1323	return;
				1324
				1325	slab_empty:
				1326	if (prior)
				1327	/*
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1328	* Slab on the partial list.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1329	*/
				1330	remove_partial(s, page);
				1331
				1332	slab_unlock(page);
				1333	discard_slab(s, page);
				1334	local_irq_restore(flags);
				1335	return;
				1336
				1337	debug:
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1338	if (!free_object_checks(s, page, x))
				1339	goto out_unlock;
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1340	if (!PageActive(page) && !page->freelist)
				1341	remove_full(s, page);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1342	if (s->flags & SLAB_STORE_USER)
				1343	set_track(s, x, TRACK_FREE, addr);
Christoph Lameter	70d7122	2007-05-06 14:49:47 -0700	[diff] [blame]	1344	if (s->flags & SLAB_TRACE) {
				1345	printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n",
				1346	s->name, object, page->inuse,
				1347	page->freelist);
				1348	print_section("Object", (void *)object, s->objsize);
				1349	dump_stack();
				1350	}
				1351	init_object(s, object, 0);
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1352	goto checks_ok;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1353	}
				1354
				1355	void kmem_cache_free(struct kmem_cache s, void x)
				1356	{
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1357	struct page *page;
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1358
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1359	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1360
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	1361	slab_free(s, page, x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1362	}
				1363	EXPORT_SYMBOL(kmem_cache_free);
				1364
				1365	/* Figure out on which slab object the object resides */
				1366	static struct page get_object_page(const void x)
				1367	{
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	1368	struct page *page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1369
				1370	if (!PageSlab(page))
				1371	return NULL;
				1372
				1373	return page;
				1374	}
				1375
				1376	/*
				1377	* kmem_cache_open produces objects aligned at "size" and the first object
				1378	* is placed at offset 0 in the slab (We have no metainformation on the
				1379	* slab, all slabs are in essence "off slab").
				1380	*
				1381	* In order to get the desired alignment one just needs to align the
				1382	* size.
				1383	*
				1384	* Notice that the allocation order determines the sizes of the per cpu
				1385	* caches. Each processor has always one slab available for allocations.
				1386	* Increasing the allocation order reduces the number of times that slabs
				1387	* must be moved on and off the partial lists and therefore may influence
				1388	* locking overhead.
				1389	*
				1390	* The offset is used to relocate the free list link in each object. It is
				1391	* therefore possible to move the free list link behind the object. This
				1392	* is necessary for RCU to work properly and also useful for debugging.
				1393	*/
				1394
				1395	/*
				1396	* Mininum / Maximum order of slab pages. This influences locking overhead
				1397	* and slab fragmentation. A higher order reduces the number of partial slabs
				1398	* and increases the number of allocations possible without having to
				1399	* take the list_lock.
				1400	*/
				1401	static int slub_min_order;
				1402	static int slub_max_order = DEFAULT_MAX_ORDER;
				1403
				1404	/*
				1405	* Minimum number of objects per slab. This is necessary in order to
				1406	* reduce locking overhead. Similar to the queue size in SLAB.
				1407	*/
				1408	static int slub_min_objects = DEFAULT_MIN_OBJECTS;
				1409
				1410	/*
				1411	* Merge control. If this is set then no merging of slab caches will occur.
				1412	*/
				1413	static int slub_nomerge;
				1414
				1415	/*
				1416	* Debug settings:
				1417	*/
				1418	static int slub_debug;
				1419
				1420	static char *slub_debug_slabs;
				1421
				1422	/*
				1423	* Calculate the order of allocation given an slab object size.
				1424	*
				1425	* The order of allocation has significant impact on other elements
				1426	* of the system. Generally order 0 allocations should be preferred
				1427	* since they do not cause fragmentation in the page allocator. Larger
				1428	* objects may have problems with order 0 because there may be too much
				1429	* space left unused in a slab. We go to a higher order if more than 1/8th
				1430	* of the slab would be wasted.
				1431	*
				1432	* In order to reach satisfactory performance we must ensure that
				1433	* a minimum number of objects is in one slab. Otherwise we may
				1434	* generate too much activity on the partial lists. This is less a
				1435	* concern for large slabs though. slub_max_order specifies the order
				1436	* where we begin to stop considering the number of objects in a slab.
				1437	*
				1438	* Higher order allocations also allow the placement of more objects
				1439	* in a slab and thereby reduce object handling overhead. If the user
				1440	* has requested a higher mininum order then we start with that one
				1441	* instead of zero.
				1442	*/
				1443	static int calculate_order(int size)
				1444	{
				1445	int order;
				1446	int rem;
				1447
				1448	for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT);
				1449	order < MAX_ORDER; order++) {
				1450	unsigned long slab_size = PAGE_SIZE << order;
				1451
				1452	if (slub_max_order > order &&
				1453	slab_size < slub_min_objects * size)
				1454	continue;
				1455
				1456	if (slab_size < size)
				1457	continue;
				1458
				1459	rem = slab_size % size;
				1460
				1461	if (rem <= (PAGE_SIZE << order) / 8)
				1462	break;
				1463
				1464	}
				1465	if (order >= MAX_ORDER)
				1466	return -E2BIG;
				1467	return order;
				1468	}
				1469
				1470	/*
				1471	* Function to figure out which alignment to use from the
				1472	* various ways of specifying it.
				1473	*/
				1474	static unsigned long calculate_alignment(unsigned long flags,
				1475	unsigned long align, unsigned long size)
				1476	{
				1477	/*
				1478	* If the user wants hardware cache aligned objects then
				1479	* follow that suggestion if the object is sufficiently
				1480	* large.
				1481	*
				1482	* The hardware cache alignment cannot override the
				1483	* specified alignment though. If that is greater
				1484	* then use it.
				1485	*/
Christoph Lameter	5af6083	2007-05-06 14:49:56 -0700	[diff] [blame]	1486	if ((flags & SLAB_HWCACHE_ALIGN) &&
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	1487	size > cache_line_size() / 2)
				1488	return max_t(unsigned long, align, cache_line_size());
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1489
				1490	if (align < ARCH_SLAB_MINALIGN)
				1491	return ARCH_SLAB_MINALIGN;
				1492
				1493	return ALIGN(align, sizeof(void *));
				1494	}
				1495
				1496	static void init_kmem_cache_node(struct kmem_cache_node *n)
				1497	{
				1498	n->nr_partial = 0;
				1499	atomic_long_set(&n->nr_slabs, 0);
				1500	spin_lock_init(&n->list_lock);
				1501	INIT_LIST_HEAD(&n->partial);
Christoph Lameter	643b113	2007-05-06 14:49:42 -0700	[diff] [blame]	1502	INIT_LIST_HEAD(&n->full);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1503	}
				1504
				1505	#ifdef CONFIG_NUMA
				1506	/*
				1507	* No kmalloc_node yet so do it by hand. We know that this is the first
				1508	* slab on the node for this slabcache. There are no concurrent accesses
				1509	* possible.
				1510	*
				1511	* Note that this function only works on the kmalloc_node_cache
				1512	* when allocating for the kmalloc_node_cache.
				1513	*/
				1514	static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags,
				1515	int node)
				1516	{
				1517	struct page *page;
				1518	struct kmem_cache_node *n;
				1519
				1520	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
				1521
				1522	page = new_slab(kmalloc_caches, gfpflags \| GFP_THISNODE, node);
				1523	/* new_slab() disables interupts */
				1524	local_irq_enable();
				1525
				1526	BUG_ON(!page);
				1527	n = page->freelist;
				1528	BUG_ON(!n);
				1529	page->freelist = get_freepointer(kmalloc_caches, n);
				1530	page->inuse++;
				1531	kmalloc_caches->node[node] = n;
				1532	init_object(kmalloc_caches, n, 1);
				1533	init_kmem_cache_node(n);
				1534	atomic_long_inc(&n->nr_slabs);
Christoph Lameter	e95eed5	2007-05-06 14:49:44 -0700	[diff] [blame]	1535	add_partial(n, page);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1536	return n;
				1537	}
				1538
				1539	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1540	{
				1541	int node;
				1542
				1543	for_each_online_node(node) {
				1544	struct kmem_cache_node *n = s->node[node];
				1545	if (n && n != &s->local_node)
				1546	kmem_cache_free(kmalloc_caches, n);
				1547	s->node[node] = NULL;
				1548	}
				1549	}
				1550
				1551	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1552	{
				1553	int node;
				1554	int local_node;
				1555
				1556	if (slab_state >= UP)
				1557	local_node = page_to_nid(virt_to_page(s));
				1558	else
				1559	local_node = 0;
				1560
				1561	for_each_online_node(node) {
				1562	struct kmem_cache_node *n;
				1563
				1564	if (local_node == node)
				1565	n = &s->local_node;
				1566	else {
				1567	if (slab_state == DOWN) {
				1568	n = early_kmem_cache_node_alloc(gfpflags,
				1569	node);
				1570	continue;
				1571	}
				1572	n = kmem_cache_alloc_node(kmalloc_caches,
				1573	gfpflags, node);
				1574
				1575	if (!n) {
				1576	free_kmem_cache_nodes(s);
				1577	return 0;
				1578	}
				1579
				1580	}
				1581	s->node[node] = n;
				1582	init_kmem_cache_node(n);
				1583	}
				1584	return 1;
				1585	}
				1586	#else
				1587	static void free_kmem_cache_nodes(struct kmem_cache *s)
				1588	{
				1589	}
				1590
				1591	static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
				1592	{
				1593	init_kmem_cache_node(&s->local_node);
				1594	return 1;
				1595	}
				1596	#endif
				1597
				1598	/*
				1599	* calculate_sizes() determines the order and the distribution of data within
				1600	* a slab object.
				1601	*/
				1602	static int calculate_sizes(struct kmem_cache *s)
				1603	{
				1604	unsigned long flags = s->flags;
				1605	unsigned long size = s->objsize;
				1606	unsigned long align = s->align;
				1607
				1608	/*
				1609	* Determine if we can poison the object itself. If the user of
				1610	* the slab may touch the object after free or before allocation
				1611	* then we should never poison the object itself.
				1612	*/
				1613	if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
				1614	!s->ctor && !s->dtor)
				1615	s->flags \|= __OBJECT_POISON;
				1616	else
				1617	s->flags &= ~__OBJECT_POISON;
				1618
				1619	/*
				1620	* Round up object size to the next word boundary. We can only
				1621	* place the free pointer at word boundaries and this determines
				1622	* the possible location of the free pointer.
				1623	*/
				1624	size = ALIGN(size, sizeof(void *));
				1625
				1626	/*
				1627	* If we are redzoning then check if there is some space between the
				1628	* end of the object and the free pointer. If not then add an
				1629	* additional word, so that we can establish a redzone between
				1630	* the object and the freepointer to be able to check for overwrites.
				1631	*/
				1632	if ((flags & SLAB_RED_ZONE) && size == s->objsize)
				1633	size += sizeof(void *);
				1634
				1635	/*
				1636	* With that we have determined how much of the slab is in actual
				1637	* use by the object. This is the potential offset to the free
				1638	* pointer.
				1639	*/
				1640	s->inuse = size;
				1641
				1642	if (((flags & (SLAB_DESTROY_BY_RCU \| SLAB_POISON)) \|\|
				1643	s->ctor \|\| s->dtor)) {
				1644	/*
				1645	* Relocate free pointer after the object if it is not
				1646	* permitted to overwrite the first word of the object on
				1647	* kmem_cache_free.
				1648	*
				1649	* This is the case if we do RCU, have a constructor or
				1650	* destructor or are poisoning the objects.
				1651	*/
				1652	s->offset = size;
				1653	size += sizeof(void *);
				1654	}
				1655
				1656	if (flags & SLAB_STORE_USER)
				1657	/*
				1658	* Need to store information about allocs and frees after
				1659	* the object.
				1660	*/
				1661	size += 2 * sizeof(struct track);
				1662
Christoph Lameter	be7b3fb	2007-05-09 02:32:36 -0700	[diff] [blame]	1663	if (flags & SLAB_RED_ZONE)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1664	/*
				1665	* Add some empty padding so that we can catch
				1666	* overwrites from earlier objects rather than let
				1667	* tracking information or the free pointer be
				1668	* corrupted if an user writes before the start
				1669	* of the object.
				1670	*/
				1671	size += sizeof(void *);
				1672	/*
				1673	* Determine the alignment based on various parameters that the
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	1674	* user specified and the dynamic determination of cache line size
				1675	* on bootup.
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1676	*/
				1677	align = calculate_alignment(flags, align, s->objsize);
				1678
				1679	/*
				1680	* SLUB stores one object immediately after another beginning from
				1681	* offset 0. In order to align the objects we have to simply size
				1682	* each object to conform to the alignment.
				1683	*/
				1684	size = ALIGN(size, align);
				1685	s->size = size;
				1686
				1687	s->order = calculate_order(size);
				1688	if (s->order < 0)
				1689	return 0;
				1690
				1691	/*
				1692	* Determine the number of objects per slab
				1693	*/
				1694	s->objects = (PAGE_SIZE << s->order) / size;
				1695
				1696	/*
				1697	* Verify that the number of objects is within permitted limits.
				1698	* The page->inuse field is only 16 bit wide! So we cannot have
				1699	* more than 64k objects per slab.
				1700	*/
				1701	if (!s->objects \|\| s->objects > 65535)
				1702	return 0;
				1703	return 1;
				1704
				1705	}
				1706
				1707	static int __init finish_bootstrap(void)
				1708	{
				1709	struct list_head *h;
				1710	int err;
				1711
				1712	slab_state = SYSFS;
				1713
				1714	list_for_each(h, &slab_caches) {
				1715	struct kmem_cache *s =
				1716	container_of(h, struct kmem_cache, list);
				1717
				1718	err = sysfs_slab_add(s);
				1719	BUG_ON(err);
				1720	}
				1721	return 0;
				1722	}
				1723
				1724	static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
				1725	const char *name, size_t size,
				1726	size_t align, unsigned long flags,
				1727	void (ctor)(void , struct kmem_cache *, unsigned long),
				1728	void (dtor)(void , struct kmem_cache *, unsigned long))
				1729	{
				1730	memset(s, 0, kmem_size);
				1731	s->name = name;
				1732	s->ctor = ctor;
				1733	s->dtor = dtor;
				1734	s->objsize = size;
				1735	s->flags = flags;
				1736	s->align = align;
				1737
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1738	/*
				1739	* The page->offset field is only 16 bit wide. This is an offset
				1740	* in units of words from the beginning of an object. If the slab
				1741	* size is bigger then we cannot move the free pointer behind the
				1742	* object anymore.
				1743	*
				1744	* On 32 bit platforms the limit is 256k. On 64bit platforms
				1745	* the limit is 512k.
				1746	*
				1747	* Debugging or ctor/dtors may create a need to move the free
				1748	* pointer. Fail if this happens.
				1749	*/
				1750	if (s->size >= 65535 * sizeof(void *)) {
				1751	BUG_ON(flags & (SLAB_RED_ZONE \| SLAB_POISON \|
				1752	SLAB_STORE_USER \| SLAB_DESTROY_BY_RCU));
				1753	BUG_ON(ctor \|\| dtor);
				1754	}
				1755	else
				1756	/*
				1757	* Enable debugging if selected on the kernel commandline.
				1758	*/
				1759	if (slub_debug && (!slub_debug_slabs \|\|
				1760	strncmp(slub_debug_slabs, name,
				1761	strlen(slub_debug_slabs)) == 0))
				1762	s->flags \|= slub_debug;
				1763
				1764	if (!calculate_sizes(s))
				1765	goto error;
				1766
				1767	s->refcount = 1;
				1768	#ifdef CONFIG_NUMA
				1769	s->defrag_ratio = 100;
				1770	#endif
				1771
				1772	if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
				1773	return 1;
				1774	error:
				1775	if (flags & SLAB_PANIC)
				1776	panic("Cannot create slab %s size=%lu realsize=%u "
				1777	"order=%u offset=%u flags=%lx\n",
				1778	s->name, (unsigned long)size, s->size, s->order,
				1779	s->offset, flags);
				1780	return 0;
				1781	}
				1782	EXPORT_SYMBOL(kmem_cache_open);
				1783
				1784	/*
				1785	* Check if a given pointer is valid
				1786	*/
				1787	int kmem_ptr_validate(struct kmem_cache s, const void object)
				1788	{
				1789	struct page * page;
				1790	void *addr;
				1791
				1792	page = get_object_page(object);
				1793
				1794	if (!page \|\| s != page->slab)
				1795	/* No slab or wrong slab */
				1796	return 0;
				1797
Christoph Lameter	abcd08a	2007-05-09 02:32:37 -0700	[diff] [blame^]	1798	if (!check_valid_pointer(s, page, object))
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1799	return 0;
				1800
				1801	/*
				1802	* We could also check if the object is on the slabs freelist.
				1803	* But this would be too expensive and it seems that the main
				1804	* purpose of kmem_ptr_valid is to check if the object belongs
				1805	* to a certain slab.
				1806	*/
				1807	return 1;
				1808	}
				1809	EXPORT_SYMBOL(kmem_ptr_validate);
				1810
				1811	/*
				1812	* Determine the size of a slab object
				1813	*/
				1814	unsigned int kmem_cache_size(struct kmem_cache *s)
				1815	{
				1816	return s->objsize;
				1817	}
				1818	EXPORT_SYMBOL(kmem_cache_size);
				1819
				1820	const char kmem_cache_name(struct kmem_cache s)
				1821	{
				1822	return s->name;
				1823	}
				1824	EXPORT_SYMBOL(kmem_cache_name);
				1825
				1826	/*
				1827	* Attempt to free all slabs on a node
				1828	*/
				1829	static int free_list(struct kmem_cache s, struct kmem_cache_node n,
				1830	struct list_head *list)
				1831	{
				1832	int slabs_inuse = 0;
				1833	unsigned long flags;
				1834	struct page page, h;
				1835
				1836	spin_lock_irqsave(&n->list_lock, flags);
				1837	list_for_each_entry_safe(page, h, list, lru)
				1838	if (!page->inuse) {
				1839	list_del(&page->lru);
				1840	discard_slab(s, page);
				1841	} else
				1842	slabs_inuse++;
				1843	spin_unlock_irqrestore(&n->list_lock, flags);
				1844	return slabs_inuse;
				1845	}
				1846
				1847	/*
				1848	* Release all resources used by slab cache
				1849	*/
				1850	static int kmem_cache_close(struct kmem_cache *s)
				1851	{
				1852	int node;
				1853
				1854	flush_all(s);
				1855
				1856	/* Attempt to free all objects */
				1857	for_each_online_node(node) {
				1858	struct kmem_cache_node *n = get_node(s, node);
				1859
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	1860	n->nr_partial -= free_list(s, n, &n->partial);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	1861	if (atomic_long_read(&n->nr_slabs))
				1862	return 1;
				1863	}
				1864	free_kmem_cache_nodes(s);
				1865	return 0;
				1866	}
				1867
				1868	/*
				1869	* Close a cache and release the kmem_cache structure
				1870	* (must be used for caches created using kmem_cache_create)
				1871	*/
				1872	void kmem_cache_destroy(struct kmem_cache *s)
				1873	{
				1874	down_write(&slub_lock);
				1875	s->refcount--;
				1876	if (!s->refcount) {
				1877	list_del(&s->list);
				1878	if (kmem_cache_close(s))
				1879	WARN_ON(1);
				1880	sysfs_slab_remove(s);
				1881	kfree(s);
				1882	}
				1883	up_write(&slub_lock);
				1884	}
				1885	EXPORT_SYMBOL(kmem_cache_destroy);
				1886
				1887	/********************************************************************
				1888	* Kmalloc subsystem
				1889	*******************************************************************/
				1890
				1891	struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
				1892	EXPORT_SYMBOL(kmalloc_caches);
				1893
				1894	#ifdef CONFIG_ZONE_DMA
				1895	static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1];
				1896	#endif
				1897
				1898	static int __init setup_slub_min_order(char *str)
				1899	{
				1900	get_option (&str, &slub_min_order);
				1901
				1902	return 1;
				1903	}
				1904
				1905	__setup("slub_min_order=", setup_slub_min_order);
				1906
				1907	static int __init setup_slub_max_order(char *str)
				1908	{
				1909	get_option (&str, &slub_max_order);
				1910
				1911	return 1;
				1912	}
				1913
				1914	__setup("slub_max_order=", setup_slub_max_order);
				1915
				1916	static int __init setup_slub_min_objects(char *str)
				1917	{
				1918	get_option (&str, &slub_min_objects);
				1919
				1920	return 1;
				1921	}
				1922
				1923	__setup("slub_min_objects=", setup_slub_min_objects);
				1924
				1925	static int __init setup_slub_nomerge(char *str)
				1926	{
				1927	slub_nomerge = 1;
				1928	return 1;
				1929	}
				1930
				1931	__setup("slub_nomerge", setup_slub_nomerge);
				1932
				1933	static int __init setup_slub_debug(char *str)
				1934	{
				1935	if (!str \|\| *str != '=')
				1936	slub_debug = DEBUG_DEFAULT_FLAGS;
				1937	else {
				1938	str++;
				1939	if (str == 0 \|\| str == ',')
				1940	slub_debug = DEBUG_DEFAULT_FLAGS;
				1941	else
				1942	for( ;str && str != ','; str++)
				1943	switch (*str) {
				1944	case 'f' : case 'F' :
				1945	slub_debug \|= SLAB_DEBUG_FREE;
				1946	break;
				1947	case 'z' : case 'Z' :
				1948	slub_debug \|= SLAB_RED_ZONE;
				1949	break;
				1950	case 'p' : case 'P' :
				1951	slub_debug \|= SLAB_POISON;
				1952	break;
				1953	case 'u' : case 'U' :
				1954	slub_debug \|= SLAB_STORE_USER;
				1955	break;
				1956	case 't' : case 'T' :
				1957	slub_debug \|= SLAB_TRACE;
				1958	break;
				1959	default:
				1960	printk(KERN_ERR "slub_debug option '%c' "
				1961	"unknown. skipped\n",*str);
				1962	}
				1963	}
				1964
				1965	if (*str == ',')
				1966	slub_debug_slabs = str + 1;
				1967	return 1;
				1968	}
				1969
				1970	__setup("slub_debug", setup_slub_debug);
				1971
				1972	static struct kmem_cache create_kmalloc_cache(struct kmem_cache s,
				1973	const char *name, int size, gfp_t gfp_flags)
				1974	{
				1975	unsigned int flags = 0;
				1976
				1977	if (gfp_flags & SLUB_DMA)
				1978	flags = SLAB_CACHE_DMA;
				1979
				1980	down_write(&slub_lock);
				1981	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
				1982	flags, NULL, NULL))
				1983	goto panic;
				1984
				1985	list_add(&s->list, &slab_caches);
				1986	up_write(&slub_lock);
				1987	if (sysfs_slab_add(s))
				1988	goto panic;
				1989	return s;
				1990
				1991	panic:
				1992	panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
				1993	}
				1994
				1995	static struct kmem_cache *get_slab(size_t size, gfp_t flags)
				1996	{
				1997	int index = kmalloc_index(size);
				1998
Christoph Lameter	614410d	2007-05-06 14:49:38 -0700	[diff] [blame]	1999	if (!index)
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2000	return NULL;
				2001
				2002	/* Allocation too large? */
				2003	BUG_ON(index < 0);
				2004
				2005	#ifdef CONFIG_ZONE_DMA
				2006	if ((flags & SLUB_DMA)) {
				2007	struct kmem_cache *s;
				2008	struct kmem_cache *x;
				2009	char *text;
				2010	size_t realsize;
				2011
				2012	s = kmalloc_caches_dma[index];
				2013	if (s)
				2014	return s;
				2015
				2016	/* Dynamically create dma cache */
				2017	x = kmalloc(kmem_size, flags & ~SLUB_DMA);
				2018	if (!x)
				2019	panic("Unable to allocate memory for dma cache\n");
				2020
				2021	if (index <= KMALLOC_SHIFT_HIGH)
				2022	realsize = 1 << index;
				2023	else {
				2024	if (index == 1)
				2025	realsize = 96;
				2026	else
				2027	realsize = 192;
				2028	}
				2029
				2030	text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
				2031	(unsigned int)realsize);
				2032	s = create_kmalloc_cache(x, text, realsize, flags);
				2033	kmalloc_caches_dma[index] = s;
				2034	return s;
				2035	}
				2036	#endif
				2037	return &kmalloc_caches[index];
				2038	}
				2039
				2040	void *__kmalloc(size_t size, gfp_t flags)
				2041	{
				2042	struct kmem_cache *s = get_slab(size, flags);
				2043
				2044	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2045	return slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2046	return NULL;
				2047	}
				2048	EXPORT_SYMBOL(__kmalloc);
				2049
				2050	#ifdef CONFIG_NUMA
				2051	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				2052	{
				2053	struct kmem_cache *s = get_slab(size, flags);
				2054
				2055	if (s)
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2056	return slab_alloc(s, flags, node, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2057	return NULL;
				2058	}
				2059	EXPORT_SYMBOL(__kmalloc_node);
				2060	#endif
				2061
				2062	size_t ksize(const void *object)
				2063	{
				2064	struct page *page = get_object_page(object);
				2065	struct kmem_cache *s;
				2066
				2067	BUG_ON(!page);
				2068	s = page->slab;
				2069	BUG_ON(!s);
				2070
				2071	/*
				2072	* Debugging requires use of the padding between object
				2073	* and whatever may come after it.
				2074	*/
				2075	if (s->flags & (SLAB_RED_ZONE \| SLAB_POISON))
				2076	return s->objsize;
				2077
				2078	/*
				2079	* If we have the need to store the freelist pointer
				2080	* back there or track user information then we can
				2081	* only use the space before that information.
				2082	*/
				2083	if (s->flags & (SLAB_DESTROY_BY_RCU \| SLAB_STORE_USER))
				2084	return s->inuse;
				2085
				2086	/*
				2087	* Else we can use all the padding etc for the allocation
				2088	*/
				2089	return s->size;
				2090	}
				2091	EXPORT_SYMBOL(ksize);
				2092
				2093	void kfree(const void *x)
				2094	{
				2095	struct kmem_cache *s;
				2096	struct page *page;
				2097
				2098	if (!x)
				2099	return;
				2100
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2101	page = virt_to_head_page(x);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2102	s = page->slab;
				2103
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2104	slab_free(s, page, (void *)x, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2105	}
				2106	EXPORT_SYMBOL(kfree);
				2107
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	2108	/*
				2109	* kmem_cache_shrink removes empty slabs from the partial lists
				2110	* and then sorts the partially allocated slabs by the number
				2111	* of items in use. The slabs with the most items in use
				2112	* come first. New allocations will remove these from the
				2113	* partial list because they are full. The slabs with the
				2114	* least items are placed last. If it happens that the objects
				2115	* are freed then the page can be returned to the page allocator.
				2116	*/
				2117	int kmem_cache_shrink(struct kmem_cache *s)
				2118	{
				2119	int node;
				2120	int i;
				2121	struct kmem_cache_node *n;
				2122	struct page *page;
				2123	struct page *t;
				2124	struct list_head *slabs_by_inuse =
				2125	kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
				2126	unsigned long flags;
				2127
				2128	if (!slabs_by_inuse)
				2129	return -ENOMEM;
				2130
				2131	flush_all(s);
				2132	for_each_online_node(node) {
				2133	n = get_node(s, node);
				2134
				2135	if (!n->nr_partial)
				2136	continue;
				2137
				2138	for (i = 0; i < s->objects; i++)
				2139	INIT_LIST_HEAD(slabs_by_inuse + i);
				2140
				2141	spin_lock_irqsave(&n->list_lock, flags);
				2142
				2143	/*
				2144	* Build lists indexed by the items in use in
				2145	* each slab or free slabs if empty.
				2146	*
				2147	* Note that concurrent frees may occur while
				2148	* we hold the list_lock. page->inuse here is
				2149	* the upper limit.
				2150	*/
				2151	list_for_each_entry_safe(page, t, &n->partial, lru) {
				2152	if (!page->inuse && slab_trylock(page)) {
				2153	/*
				2154	* Must hold slab lock here because slab_free
				2155	* may have freed the last object and be
				2156	* waiting to release the slab.
				2157	*/
				2158	list_del(&page->lru);
				2159	n->nr_partial--;
				2160	slab_unlock(page);
				2161	discard_slab(s, page);
				2162	} else {
				2163	if (n->nr_partial > MAX_PARTIAL)
				2164	list_move(&page->lru,
				2165	slabs_by_inuse + page->inuse);
				2166	}
				2167	}
				2168
				2169	if (n->nr_partial <= MAX_PARTIAL)
				2170	goto out;
				2171
				2172	/*
				2173	* Rebuild the partial list with the slabs filled up
				2174	* most first and the least used slabs at the end.
				2175	*/
				2176	for (i = s->objects - 1; i >= 0; i--)
				2177	list_splice(slabs_by_inuse + i, n->partial.prev);
				2178
				2179	out:
				2180	spin_unlock_irqrestore(&n->list_lock, flags);
				2181	}
				2182
				2183	kfree(slabs_by_inuse);
				2184	return 0;
				2185	}
				2186	EXPORT_SYMBOL(kmem_cache_shrink);
				2187
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2188	/**
				2189	* krealloc - reallocate memory. The contents will remain unchanged.
				2190	*
				2191	* @p: object to reallocate memory for.
				2192	* @new_size: how many bytes of memory are required.
				2193	* @flags: the type of memory to allocate.
				2194	*
				2195	* The contents of the object pointed to are preserved up to the
				2196	* lesser of the new and old sizes. If @p is %NULL, krealloc()
				2197	* behaves exactly like kmalloc(). If @size is 0 and @p is not a
				2198	* %NULL pointer, the object pointed to is freed.
				2199	*/
				2200	void krealloc(const void p, size_t new_size, gfp_t flags)
				2201	{
				2202	struct kmem_cache *new_cache;
				2203	void *ret;
				2204	struct page *page;
				2205
				2206	if (unlikely(!p))
				2207	return kmalloc(new_size, flags);
				2208
				2209	if (unlikely(!new_size)) {
				2210	kfree(p);
				2211	return NULL;
				2212	}
				2213
Christoph Lameter	b49af68	2007-05-06 14:49:41 -0700	[diff] [blame]	2214	page = virt_to_head_page(p);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2215
				2216	new_cache = get_slab(new_size, flags);
				2217
				2218	/*
				2219	* If new size fits in the current cache, bail out.
				2220	*/
				2221	if (likely(page->slab == new_cache))
				2222	return (void *)p;
				2223
				2224	ret = kmalloc(new_size, flags);
				2225	if (ret) {
				2226	memcpy(ret, p, min(new_size, ksize(p)));
				2227	kfree(p);
				2228	}
				2229	return ret;
				2230	}
				2231	EXPORT_SYMBOL(krealloc);
				2232
				2233	/********************************************************************
				2234	* Basic setup of slabs
				2235	*******************************************************************/
				2236
				2237	void __init kmem_cache_init(void)
				2238	{
				2239	int i;
				2240
				2241	#ifdef CONFIG_NUMA
				2242	/*
				2243	* Must first have the slab cache available for the allocations of the
				2244	* struct kmalloc_cache_node's. There is special bootstrap code in
				2245	* kmem_cache_open for slab_state == DOWN.
				2246	*/
				2247	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
				2248	sizeof(struct kmem_cache_node), GFP_KERNEL);
				2249	#endif
				2250
				2251	/* Able to allocate the per node structures */
				2252	slab_state = PARTIAL;
				2253
				2254	/* Caches that are not of the two-to-the-power-of size */
				2255	create_kmalloc_cache(&kmalloc_caches[1],
				2256	"kmalloc-96", 96, GFP_KERNEL);
				2257	create_kmalloc_cache(&kmalloc_caches[2],
				2258	"kmalloc-192", 192, GFP_KERNEL);
				2259
				2260	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2261	create_kmalloc_cache(&kmalloc_caches[i],
				2262	"kmalloc", 1 << i, GFP_KERNEL);
				2263
				2264	slab_state = UP;
				2265
				2266	/* Provide the correct kmalloc names now that the caches are up */
				2267	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
				2268	kmalloc_caches[i]. name =
				2269	kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
				2270
				2271	#ifdef CONFIG_SMP
				2272	register_cpu_notifier(&slab_notifier);
				2273	#endif
				2274
				2275	if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */
				2276	kmem_size = offsetof(struct kmem_cache, cpu_slab)
				2277	+ nr_cpu_ids * sizeof(struct page *);
				2278
				2279	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
				2280	" Processors=%d, Nodes=%d\n",
Christoph Lameter	65c02d4	2007-05-09 02:32:35 -0700	[diff] [blame]	2281	KMALLOC_SHIFT_HIGH, cache_line_size(),
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2282	slub_min_order, slub_max_order, slub_min_objects,
				2283	nr_cpu_ids, nr_node_ids);
				2284	}
				2285
				2286	/*
				2287	* Find a mergeable slab cache
				2288	*/
				2289	static int slab_unmergeable(struct kmem_cache *s)
				2290	{
				2291	if (slub_nomerge \|\| (s->flags & SLUB_NEVER_MERGE))
				2292	return 1;
				2293
				2294	if (s->ctor \|\| s->dtor)
				2295	return 1;
				2296
				2297	return 0;
				2298	}
				2299
				2300	static struct kmem_cache *find_mergeable(size_t size,
				2301	size_t align, unsigned long flags,
				2302	void (ctor)(void , struct kmem_cache *, unsigned long),
				2303	void (dtor)(void , struct kmem_cache *, unsigned long))
				2304	{
				2305	struct list_head *h;
				2306
				2307	if (slub_nomerge \|\| (flags & SLUB_NEVER_MERGE))
				2308	return NULL;
				2309
				2310	if (ctor \|\| dtor)
				2311	return NULL;
				2312
				2313	size = ALIGN(size, sizeof(void *));
				2314	align = calculate_alignment(flags, align, size);
				2315	size = ALIGN(size, align);
				2316
				2317	list_for_each(h, &slab_caches) {
				2318	struct kmem_cache *s =
				2319	container_of(h, struct kmem_cache, list);
				2320
				2321	if (slab_unmergeable(s))
				2322	continue;
				2323
				2324	if (size > s->size)
				2325	continue;
				2326
				2327	if (((flags \| slub_debug) & SLUB_MERGE_SAME) !=
				2328	(s->flags & SLUB_MERGE_SAME))
				2329	continue;
				2330	/*
				2331	* Check if alignment is compatible.
				2332	* Courtesy of Adrian Drzewiecki
				2333	*/
				2334	if ((s->size & ~(align -1)) != s->size)
				2335	continue;
				2336
				2337	if (s->size - size >= sizeof(void *))
				2338	continue;
				2339
				2340	return s;
				2341	}
				2342	return NULL;
				2343	}
				2344
				2345	struct kmem_cache kmem_cache_create(const char name, size_t size,
				2346	size_t align, unsigned long flags,
				2347	void (ctor)(void , struct kmem_cache *, unsigned long),
				2348	void (dtor)(void , struct kmem_cache *, unsigned long))
				2349	{
				2350	struct kmem_cache *s;
				2351
				2352	down_write(&slub_lock);
				2353	s = find_mergeable(size, align, flags, dtor, ctor);
				2354	if (s) {
				2355	s->refcount++;
				2356	/*
				2357	* Adjust the object sizes so that we clear
				2358	* the complete object on kzalloc.
				2359	*/
				2360	s->objsize = max(s->objsize, (int)size);
				2361	s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
				2362	if (sysfs_slab_alias(s, name))
				2363	goto err;
				2364	} else {
				2365	s = kmalloc(kmem_size, GFP_KERNEL);
				2366	if (s && kmem_cache_open(s, GFP_KERNEL, name,
				2367	size, align, flags, ctor, dtor)) {
				2368	if (sysfs_slab_add(s)) {
				2369	kfree(s);
				2370	goto err;
				2371	}
				2372	list_add(&s->list, &slab_caches);
				2373	} else
				2374	kfree(s);
				2375	}
				2376	up_write(&slub_lock);
				2377	return s;
				2378
				2379	err:
				2380	up_write(&slub_lock);
				2381	if (flags & SLAB_PANIC)
				2382	panic("Cannot create slabcache %s\n", name);
				2383	else
				2384	s = NULL;
				2385	return s;
				2386	}
				2387	EXPORT_SYMBOL(kmem_cache_create);
				2388
				2389	void kmem_cache_zalloc(struct kmem_cache s, gfp_t flags)
				2390	{
				2391	void *x;
				2392
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2393	x = slab_alloc(s, flags, -1, __builtin_return_address(0));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2394	if (x)
				2395	memset(x, 0, s->objsize);
				2396	return x;
				2397	}
				2398	EXPORT_SYMBOL(kmem_cache_zalloc);
				2399
				2400	#ifdef CONFIG_SMP
				2401	static void for_all_slabs(void (func)(struct kmem_cache , int), int cpu)
				2402	{
				2403	struct list_head *h;
				2404
				2405	down_read(&slub_lock);
				2406	list_for_each(h, &slab_caches) {
				2407	struct kmem_cache *s =
				2408	container_of(h, struct kmem_cache, list);
				2409
				2410	func(s, cpu);
				2411	}
				2412	up_read(&slub_lock);
				2413	}
				2414
				2415	/*
				2416	* Use the cpu notifier to insure that the slab are flushed
				2417	* when necessary.
				2418	*/
				2419	static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
				2420	unsigned long action, void *hcpu)
				2421	{
				2422	long cpu = (long)hcpu;
				2423
				2424	switch (action) {
				2425	case CPU_UP_CANCELED:
				2426	case CPU_DEAD:
				2427	for_all_slabs(__flush_cpu_slab, cpu);
				2428	break;
				2429	default:
				2430	break;
				2431	}
				2432	return NOTIFY_OK;
				2433	}
				2434
				2435	static struct notifier_block __cpuinitdata slab_notifier =
				2436	{ &slab_cpuup_callback, NULL, 0 };
				2437
				2438	#endif
				2439
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2440	#ifdef CONFIG_NUMA
				2441
				2442	/*****************************************************************
				2443	* Generic reaper used to support the page allocator
				2444	* (the cpu slabs are reaped by a per slab workqueue).
				2445	*
				2446	* Maybe move this to the page allocator?
				2447	****************************************************************/
				2448
				2449	static DEFINE_PER_CPU(unsigned long, reap_node);
				2450
				2451	static void init_reap_node(int cpu)
				2452	{
				2453	int node;
				2454
				2455	node = next_node(cpu_to_node(cpu), node_online_map);
				2456	if (node == MAX_NUMNODES)
				2457	node = first_node(node_online_map);
				2458
				2459	__get_cpu_var(reap_node) = node;
				2460	}
				2461
				2462	static void next_reap_node(void)
				2463	{
				2464	int node = __get_cpu_var(reap_node);
				2465
				2466	/*
				2467	* Also drain per cpu pages on remote zones
				2468	*/
				2469	if (node != numa_node_id())
				2470	drain_node_pages(node);
				2471
				2472	node = next_node(node, node_online_map);
				2473	if (unlikely(node >= MAX_NUMNODES))
				2474	node = first_node(node_online_map);
				2475	__get_cpu_var(reap_node) = node;
				2476	}
				2477	#else
				2478	#define init_reap_node(cpu) do { } while (0)
				2479	#define next_reap_node(void) do { } while (0)
				2480	#endif
				2481
				2482	#define REAPTIMEOUT_CPUC (2*HZ)
				2483
				2484	#ifdef CONFIG_SMP
				2485	static DEFINE_PER_CPU(struct delayed_work, reap_work);
				2486
				2487	static void cache_reap(struct work_struct *unused)
				2488	{
				2489	next_reap_node();
				2490	refresh_cpu_vm_stats(smp_processor_id());
				2491	schedule_delayed_work(&__get_cpu_var(reap_work),
				2492	REAPTIMEOUT_CPUC);
				2493	}
				2494
				2495	static void __devinit start_cpu_timer(int cpu)
				2496	{
				2497	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
				2498
				2499	/*
				2500	* When this gets called from do_initcalls via cpucache_init(),
				2501	* init_workqueues() has already run, so keventd will be setup
				2502	* at that time.
				2503	*/
				2504	if (keventd_up() && reap_work->work.func == NULL) {
				2505	init_reap_node(cpu);
				2506	INIT_DELAYED_WORK(reap_work, cache_reap);
				2507	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
				2508	}
				2509	}
				2510
				2511	static int __init cpucache_init(void)
				2512	{
				2513	int cpu;
				2514
				2515	/*
				2516	* Register the timers that drain pcp pages and update vm statistics
				2517	*/
				2518	for_each_online_cpu(cpu)
				2519	start_cpu_timer(cpu);
				2520	return 0;
				2521	}
				2522	__initcall(cpucache_init);
				2523	#endif
				2524
				2525	#ifdef SLUB_RESILIENCY_TEST
				2526	static unsigned long validate_slab_cache(struct kmem_cache *s);
				2527
				2528	static void resiliency_test(void)
				2529	{
				2530	u8 *p;
				2531
				2532	printk(KERN_ERR "SLUB resiliency testing\n");
				2533	printk(KERN_ERR "-----------------------\n");
				2534	printk(KERN_ERR "A. Corruption after allocation\n");
				2535
				2536	p = kzalloc(16, GFP_KERNEL);
				2537	p[16] = 0x12;
				2538	printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
				2539	" 0x12->0x%p\n\n", p + 16);
				2540
				2541	validate_slab_cache(kmalloc_caches + 4);
				2542
				2543	/* Hmmm... The next two are dangerous */
				2544	p = kzalloc(32, GFP_KERNEL);
				2545	p[32 + sizeof(void *)] = 0x34;
				2546	printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
				2547	" 0x34 -> -0x%p\n", p);
				2548	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
				2549
				2550	validate_slab_cache(kmalloc_caches + 5);
				2551	p = kzalloc(64, GFP_KERNEL);
				2552	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
				2553	*p = 0x56;
				2554	printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
				2555	p);
				2556	printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
				2557	validate_slab_cache(kmalloc_caches + 6);
				2558
				2559	printk(KERN_ERR "\nB. Corruption after free\n");
				2560	p = kzalloc(128, GFP_KERNEL);
				2561	kfree(p);
				2562	*p = 0x78;
				2563	printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
				2564	validate_slab_cache(kmalloc_caches + 7);
				2565
				2566	p = kzalloc(256, GFP_KERNEL);
				2567	kfree(p);
				2568	p[50] = 0x9a;
				2569	printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
				2570	validate_slab_cache(kmalloc_caches + 8);
				2571
				2572	p = kzalloc(512, GFP_KERNEL);
				2573	kfree(p);
				2574	p[512] = 0xab;
				2575	printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
				2576	validate_slab_cache(kmalloc_caches + 9);
				2577	}
				2578	#else
				2579	static void resiliency_test(void) {};
				2580	#endif
				2581
				2582	/*
				2583	* These are not as efficient as kmalloc for the non debug case.
				2584	* We do not have the page struct available so we have to touch one
				2585	* cacheline in struct kmem_cache to check slab flags.
				2586	*/
				2587	void __kmalloc_track_caller(size_t size, gfp_t gfpflags, void caller)
				2588	{
				2589	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2590
				2591	if (!s)
				2592	return NULL;
				2593
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2594	return slab_alloc(s, gfpflags, -1, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2595	}
				2596
				2597	void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
				2598	int node, void *caller)
				2599	{
				2600	struct kmem_cache *s = get_slab(size, gfpflags);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2601
				2602	if (!s)
				2603	return NULL;
				2604
Christoph Lameter	77c5e2d	2007-05-06 14:49:42 -0700	[diff] [blame]	2605	return slab_alloc(s, gfpflags, node, caller);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2606	}
				2607
				2608	#ifdef CONFIG_SYSFS
				2609
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	2610	static int validate_slab(struct kmem_cache s, struct page page)
				2611	{
				2612	void *p;
				2613	void *addr = page_address(page);
				2614	unsigned long map[BITS_TO_LONGS(s->objects)];
				2615
				2616	if (!check_slab(s, page) \|\|
				2617	!on_freelist(s, page, NULL))
				2618	return 0;
				2619
				2620	/* Now we know that a valid freelist exists */
				2621	bitmap_zero(map, s->objects);
				2622
				2623	for(p = page->freelist; p; p = get_freepointer(s, p)) {
				2624	set_bit((p - addr) / s->size, map);
				2625	if (!check_object(s, page, p, 0))
				2626	return 0;
				2627	}
				2628
				2629	for(p = addr; p < addr + s->objects * s->size; p += s->size)
				2630	if (!test_bit((p - addr) / s->size, map))
				2631	if (!check_object(s, page, p, 1))
				2632	return 0;
				2633	return 1;
				2634	}
				2635
				2636	static void validate_slab_slab(struct kmem_cache s, struct page page)
				2637	{
				2638	if (slab_trylock(page)) {
				2639	validate_slab(s, page);
				2640	slab_unlock(page);
				2641	} else
				2642	printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
				2643	s->name, page);
				2644
				2645	if (s->flags & DEBUG_DEFAULT_FLAGS) {
				2646	if (!PageError(page))
				2647	printk(KERN_ERR "SLUB %s: PageError not set "
				2648	"on slab 0x%p\n", s->name, page);
				2649	} else {
				2650	if (PageError(page))
				2651	printk(KERN_ERR "SLUB %s: PageError set on "
				2652	"slab 0x%p\n", s->name, page);
				2653	}
				2654	}
				2655
				2656	static int validate_slab_node(struct kmem_cache s, struct kmem_cache_node n)
				2657	{
				2658	unsigned long count = 0;
				2659	struct page *page;
				2660	unsigned long flags;
				2661
				2662	spin_lock_irqsave(&n->list_lock, flags);
				2663
				2664	list_for_each_entry(page, &n->partial, lru) {
				2665	validate_slab_slab(s, page);
				2666	count++;
				2667	}
				2668	if (count != n->nr_partial)
				2669	printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
				2670	"counter=%ld\n", s->name, count, n->nr_partial);
				2671
				2672	if (!(s->flags & SLAB_STORE_USER))
				2673	goto out;
				2674
				2675	list_for_each_entry(page, &n->full, lru) {
				2676	validate_slab_slab(s, page);
				2677	count++;
				2678	}
				2679	if (count != atomic_long_read(&n->nr_slabs))
				2680	printk(KERN_ERR "SLUB: %s %ld slabs counted but "
				2681	"counter=%ld\n", s->name, count,
				2682	atomic_long_read(&n->nr_slabs));
				2683
				2684	out:
				2685	spin_unlock_irqrestore(&n->list_lock, flags);
				2686	return count;
				2687	}
				2688
				2689	static unsigned long validate_slab_cache(struct kmem_cache *s)
				2690	{
				2691	int node;
				2692	unsigned long count = 0;
				2693
				2694	flush_all(s);
				2695	for_each_online_node(node) {
				2696	struct kmem_cache_node *n = get_node(s, node);
				2697
				2698	count += validate_slab_node(s, n);
				2699	}
				2700	return count;
				2701	}
				2702
/*
 * Generate lists of locations where slabcache objects are allocated
 * and freed.
 */

/* One tracked code address and how often it was seen. */
struct location {
	unsigned long count;
	void *addr;
};

/* Growable array of locations, kept sorted by address. */
struct loc_track {
	unsigned long max;	/* entries the loc array has room for */
	unsigned long count;	/* entries currently in use */
	struct location *loc;
};
				2718
				2719	static void free_loc_track(struct loc_track *t)
				2720	{
				2721	if (t->max)
				2722	free_pages((unsigned long)t->loc,
				2723	get_order(sizeof(struct location) * t->max));
				2724	}
				2725
				2726	static int alloc_loc_track(struct loc_track *t, unsigned long max)
				2727	{
				2728	struct location *l;
				2729	int order;
				2730
				2731	if (!max)
				2732	max = PAGE_SIZE / sizeof(struct location);
				2733
				2734	order = get_order(sizeof(struct location) * max);
				2735
				2736	l = (void *)__get_free_pages(GFP_KERNEL, order);
				2737
				2738	if (!l)
				2739	return 0;
				2740
				2741	if (t->count) {
				2742	memcpy(l, t->loc, sizeof(struct location) * t->count);
				2743	free_loc_track(t);
				2744	}
				2745	t->max = max;
				2746	t->loc = l;
				2747	return 1;
				2748	}
				2749
				2750	static int add_location(struct loc_track t, struct kmem_cache s,
				2751	void *addr)
				2752	{
				2753	long start, end, pos;
				2754	struct location *l;
				2755	void *caddr;
				2756
				2757	start = -1;
				2758	end = t->count;
				2759
				2760	for ( ; ; ) {
				2761	pos = start + (end - start + 1) / 2;
				2762
				2763	/*
				2764	* There is nothing at "end". If we end up there
				2765	* we need to add something to before end.
				2766	*/
				2767	if (pos == end)
				2768	break;
				2769
				2770	caddr = t->loc[pos].addr;
				2771	if (addr == caddr) {
				2772	t->loc[pos].count++;
				2773	return 1;
				2774	}
				2775
				2776	if (addr < caddr)
				2777	end = pos;
				2778	else
				2779	start = pos;
				2780	}
				2781
				2782	/*
				2783	* Not found. Insert new tracking element
				2784	*/
				2785	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max))
				2786	return 0;
				2787
				2788	l = t->loc + pos;
				2789	if (pos < t->count)
				2790	memmove(l + 1, l,
				2791	(t->count - pos) * sizeof(struct location));
				2792	t->count++;
				2793	l->count = 1;
				2794	l->addr = addr;
				2795	return 1;
				2796	}
				2797
				2798	static void process_slab(struct loc_track t, struct kmem_cache s,
				2799	struct page *page, enum track_item alloc)
				2800	{
				2801	void *addr = page_address(page);
				2802	unsigned long map[BITS_TO_LONGS(s->objects)];
				2803	void *p;
				2804
				2805	bitmap_zero(map, s->objects);
				2806	for (p = page->freelist; p; p = get_freepointer(s, p))
				2807	set_bit((p - addr) / s->size, map);
				2808
				2809	for (p = addr; p < addr + s->objects * s->size; p += s->size)
				2810	if (!test_bit((p - addr) / s->size, map)) {
				2811	void *addr = get_track(s, p, alloc)->addr;
				2812
				2813	add_location(t, s, addr);
				2814	}
				2815	}
				2816
				2817	static int list_locations(struct kmem_cache s, char buf,
				2818	enum track_item alloc)
				2819	{
				2820	int n = 0;
				2821	unsigned long i;
				2822	struct loc_track t;
				2823	int node;
				2824
				2825	t.count = 0;
				2826	t.max = 0;
				2827
				2828	/* Push back cpu slabs */
				2829	flush_all(s);
				2830
				2831	for_each_online_node(node) {
				2832	struct kmem_cache_node *n = get_node(s, node);
				2833	unsigned long flags;
				2834	struct page *page;
				2835
				2836	if (!atomic_read(&n->nr_slabs))
				2837	continue;
				2838
				2839	spin_lock_irqsave(&n->list_lock, flags);
				2840	list_for_each_entry(page, &n->partial, lru)
				2841	process_slab(&t, s, page, alloc);
				2842	list_for_each_entry(page, &n->full, lru)
				2843	process_slab(&t, s, page, alloc);
				2844	spin_unlock_irqrestore(&n->list_lock, flags);
				2845	}
				2846
				2847	for (i = 0; i < t.count; i++) {
				2848	void *addr = t.loc[i].addr;
				2849
				2850	if (n > PAGE_SIZE - 100)
				2851	break;
				2852	n += sprintf(buf + n, "%7ld ", t.loc[i].count);
				2853	if (addr)
				2854	n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr);
				2855	else
				2856	n += sprintf(buf + n, "<not-available>");
				2857	n += sprintf(buf + n, "\n");
				2858	}
				2859
				2860	free_loc_track(&t);
				2861	if (!t.count)
				2862	n += sprintf(buf, "No data\n");
				2863	return n;
				2864	}
				2865
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2866	static unsigned long count_partial(struct kmem_cache_node *n)
				2867	{
				2868	unsigned long flags;
				2869	unsigned long x = 0;
				2870	struct page *page;
				2871
				2872	spin_lock_irqsave(&n->list_lock, flags);
				2873	list_for_each_entry(page, &n->partial, lru)
				2874	x += page->inuse;
				2875	spin_unlock_irqrestore(&n->list_lock, flags);
				2876	return x;
				2877	}
				2878
				2879	enum slab_stat_type {
				2880	SL_FULL,
				2881	SL_PARTIAL,
				2882	SL_CPU,
				2883	SL_OBJECTS
				2884	};
				2885
				2886	#define SO_FULL (1 << SL_FULL)
				2887	#define SO_PARTIAL (1 << SL_PARTIAL)
				2888	#define SO_CPU (1 << SL_CPU)
				2889	#define SO_OBJECTS (1 << SL_OBJECTS)
				2890
				2891	static unsigned long slab_objects(struct kmem_cache *s,
				2892	char *buf, unsigned long flags)
				2893	{
				2894	unsigned long total = 0;
				2895	int cpu;
				2896	int node;
				2897	int x;
				2898	unsigned long *nodes;
				2899	unsigned long *per_cpu;
				2900
				2901	nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
				2902	per_cpu = nodes + nr_node_ids;
				2903
				2904	for_each_possible_cpu(cpu) {
				2905	struct page *page = s->cpu_slab[cpu];
				2906	int node;
				2907
				2908	if (page) {
				2909	node = page_to_nid(page);
				2910	if (flags & SO_CPU) {
				2911	int x = 0;
				2912
				2913	if (flags & SO_OBJECTS)
				2914	x = page->inuse;
				2915	else
				2916	x = 1;
				2917	total += x;
				2918	nodes[node] += x;
				2919	}
				2920	per_cpu[node]++;
				2921	}
				2922	}
				2923
				2924	for_each_online_node(node) {
				2925	struct kmem_cache_node *n = get_node(s, node);
				2926
				2927	if (flags & SO_PARTIAL) {
				2928	if (flags & SO_OBJECTS)
				2929	x = count_partial(n);
				2930	else
				2931	x = n->nr_partial;
				2932	total += x;
				2933	nodes[node] += x;
				2934	}
				2935
				2936	if (flags & SO_FULL) {
				2937	int full_slabs = atomic_read(&n->nr_slabs)
				2938	- per_cpu[node]
				2939	- n->nr_partial;
				2940
				2941	if (flags & SO_OBJECTS)
				2942	x = full_slabs * s->objects;
				2943	else
				2944	x = full_slabs;
				2945	total += x;
				2946	nodes[node] += x;
				2947	}
				2948	}
				2949
				2950	x = sprintf(buf, "%lu", total);
				2951	#ifdef CONFIG_NUMA
				2952	for_each_online_node(node)
				2953	if (nodes[node])
				2954	x += sprintf(buf + x, " N%d=%lu",
				2955	node, nodes[node]);
				2956	#endif
				2957	kfree(nodes);
				2958	return x + sprintf(buf + x, "\n");
				2959	}
				2960
				2961	static int any_slab_objects(struct kmem_cache *s)
				2962	{
				2963	int node;
				2964	int cpu;
				2965
				2966	for_each_possible_cpu(cpu)
				2967	if (s->cpu_slab[cpu])
				2968	return 1;
				2969
				2970	for_each_node(node) {
				2971	struct kmem_cache_node *n = get_node(s, node);
				2972
				2973	if (n->nr_partial \|\| atomic_read(&n->nr_slabs))
				2974	return 1;
				2975	}
				2976	return 0;
				2977	}
				2978
				2979	#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
				2980	#define to_slab(n) container_of(n, struct kmem_cache, kobj);
				2981
				2982	struct slab_attribute {
				2983	struct attribute attr;
				2984	ssize_t (show)(struct kmem_cache s, char *buf);
				2985	ssize_t (store)(struct kmem_cache s, const char *x, size_t count);
				2986	};
				2987
				2988	#define SLAB_ATTR_RO(_name) \
				2989	static struct slab_attribute _name##_attr = __ATTR_RO(_name)
				2990
				2991	#define SLAB_ATTR(_name) \
				2992	static struct slab_attribute _name##_attr = \
				2993	__ATTR(_name, 0644, _name##_show, _name##_store)
				2994
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	2995	static ssize_t slab_size_show(struct kmem_cache s, char buf)
				2996	{
				2997	return sprintf(buf, "%d\n", s->size);
				2998	}
				2999	SLAB_ATTR_RO(slab_size);
				3000
				3001	static ssize_t align_show(struct kmem_cache s, char buf)
				3002	{
				3003	return sprintf(buf, "%d\n", s->align);
				3004	}
				3005	SLAB_ATTR_RO(align);
				3006
				3007	static ssize_t object_size_show(struct kmem_cache s, char buf)
				3008	{
				3009	return sprintf(buf, "%d\n", s->objsize);
				3010	}
				3011	SLAB_ATTR_RO(object_size);
				3012
				3013	static ssize_t objs_per_slab_show(struct kmem_cache s, char buf)
				3014	{
				3015	return sprintf(buf, "%d\n", s->objects);
				3016	}
				3017	SLAB_ATTR_RO(objs_per_slab);
				3018
				3019	static ssize_t order_show(struct kmem_cache s, char buf)
				3020	{
				3021	return sprintf(buf, "%d\n", s->order);
				3022	}
				3023	SLAB_ATTR_RO(order);
				3024
				3025	static ssize_t ctor_show(struct kmem_cache s, char buf)
				3026	{
				3027	if (s->ctor) {
				3028	int n = sprint_symbol(buf, (unsigned long)s->ctor);
				3029
				3030	return n + sprintf(buf + n, "\n");
				3031	}
				3032	return 0;
				3033	}
				3034	SLAB_ATTR_RO(ctor);
				3035
				3036	static ssize_t dtor_show(struct kmem_cache s, char buf)
				3037	{
				3038	if (s->dtor) {
				3039	int n = sprint_symbol(buf, (unsigned long)s->dtor);
				3040
				3041	return n + sprintf(buf + n, "\n");
				3042	}
				3043	return 0;
				3044	}
				3045	SLAB_ATTR_RO(dtor);
				3046
				3047	static ssize_t aliases_show(struct kmem_cache s, char buf)
				3048	{
				3049	return sprintf(buf, "%d\n", s->refcount - 1);
				3050	}
				3051	SLAB_ATTR_RO(aliases);
				3052
				3053	static ssize_t slabs_show(struct kmem_cache s, char buf)
				3054	{
				3055	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU);
				3056	}
				3057	SLAB_ATTR_RO(slabs);
				3058
				3059	static ssize_t partial_show(struct kmem_cache s, char buf)
				3060	{
				3061	return slab_objects(s, buf, SO_PARTIAL);
				3062	}
				3063	SLAB_ATTR_RO(partial);
				3064
				3065	static ssize_t cpu_slabs_show(struct kmem_cache s, char buf)
				3066	{
				3067	return slab_objects(s, buf, SO_CPU);
				3068	}
				3069	SLAB_ATTR_RO(cpu_slabs);
				3070
				3071	static ssize_t objects_show(struct kmem_cache s, char buf)
				3072	{
				3073	return slab_objects(s, buf, SO_FULL\|SO_PARTIAL\|SO_CPU\|SO_OBJECTS);
				3074	}
				3075	SLAB_ATTR_RO(objects);
				3076
				3077	static ssize_t sanity_checks_show(struct kmem_cache s, char buf)
				3078	{
				3079	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
				3080	}
				3081
				3082	static ssize_t sanity_checks_store(struct kmem_cache *s,
				3083	const char *buf, size_t length)
				3084	{
				3085	s->flags &= ~SLAB_DEBUG_FREE;
				3086	if (buf[0] == '1')
				3087	s->flags \|= SLAB_DEBUG_FREE;
				3088	return length;
				3089	}
				3090	SLAB_ATTR(sanity_checks);
				3091
				3092	static ssize_t trace_show(struct kmem_cache s, char buf)
				3093	{
				3094	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
				3095	}
				3096
				3097	static ssize_t trace_store(struct kmem_cache s, const char buf,
				3098	size_t length)
				3099	{
				3100	s->flags &= ~SLAB_TRACE;
				3101	if (buf[0] == '1')
				3102	s->flags \|= SLAB_TRACE;
				3103	return length;
				3104	}
				3105	SLAB_ATTR(trace);
				3106
				3107	static ssize_t reclaim_account_show(struct kmem_cache s, char buf)
				3108	{
				3109	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
				3110	}
				3111
				3112	static ssize_t reclaim_account_store(struct kmem_cache *s,
				3113	const char *buf, size_t length)
				3114	{
				3115	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
				3116	if (buf[0] == '1')
				3117	s->flags \|= SLAB_RECLAIM_ACCOUNT;
				3118	return length;
				3119	}
				3120	SLAB_ATTR(reclaim_account);
				3121
				3122	static ssize_t hwcache_align_show(struct kmem_cache s, char buf)
				3123	{
Christoph Lameter	5af6083	2007-05-06 14:49:56 -0700	[diff] [blame]	3124	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3125	}
				3126	SLAB_ATTR_RO(hwcache_align);
				3127
				3128	#ifdef CONFIG_ZONE_DMA
				3129	static ssize_t cache_dma_show(struct kmem_cache s, char buf)
				3130	{
				3131	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
				3132	}
				3133	SLAB_ATTR_RO(cache_dma);
				3134	#endif
				3135
				3136	static ssize_t destroy_by_rcu_show(struct kmem_cache s, char buf)
				3137	{
				3138	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
				3139	}
				3140	SLAB_ATTR_RO(destroy_by_rcu);
				3141
				3142	static ssize_t red_zone_show(struct kmem_cache s, char buf)
				3143	{
				3144	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
				3145	}
				3146
				3147	static ssize_t red_zone_store(struct kmem_cache *s,
				3148	const char *buf, size_t length)
				3149	{
				3150	if (any_slab_objects(s))
				3151	return -EBUSY;
				3152
				3153	s->flags &= ~SLAB_RED_ZONE;
				3154	if (buf[0] == '1')
				3155	s->flags \|= SLAB_RED_ZONE;
				3156	calculate_sizes(s);
				3157	return length;
				3158	}
				3159	SLAB_ATTR(red_zone);
				3160
				3161	static ssize_t poison_show(struct kmem_cache s, char buf)
				3162	{
				3163	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
				3164	}
				3165
				3166	static ssize_t poison_store(struct kmem_cache *s,
				3167	const char *buf, size_t length)
				3168	{
				3169	if (any_slab_objects(s))
				3170	return -EBUSY;
				3171
				3172	s->flags &= ~SLAB_POISON;
				3173	if (buf[0] == '1')
				3174	s->flags \|= SLAB_POISON;
				3175	calculate_sizes(s);
				3176	return length;
				3177	}
				3178	SLAB_ATTR(poison);
				3179
				3180	static ssize_t store_user_show(struct kmem_cache s, char buf)
				3181	{
				3182	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
				3183	}
				3184
				3185	static ssize_t store_user_store(struct kmem_cache *s,
				3186	const char *buf, size_t length)
				3187	{
				3188	if (any_slab_objects(s))
				3189	return -EBUSY;
				3190
				3191	s->flags &= ~SLAB_STORE_USER;
				3192	if (buf[0] == '1')
				3193	s->flags \|= SLAB_STORE_USER;
				3194	calculate_sizes(s);
				3195	return length;
				3196	}
				3197	SLAB_ATTR(store_user);
				3198
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	3199	static ssize_t validate_show(struct kmem_cache s, char buf)
				3200	{
				3201	return 0;
				3202	}
				3203
				3204	static ssize_t validate_store(struct kmem_cache *s,
				3205	const char *buf, size_t length)
				3206	{
				3207	if (buf[0] == '1')
				3208	validate_slab_cache(s);
				3209	else
				3210	return -EINVAL;
				3211	return length;
				3212	}
				3213	SLAB_ATTR(validate);
				3214
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	3215	static ssize_t shrink_show(struct kmem_cache s, char buf)
				3216	{
				3217	return 0;
				3218	}
				3219
				3220	static ssize_t shrink_store(struct kmem_cache *s,
				3221	const char *buf, size_t length)
				3222	{
				3223	if (buf[0] == '1') {
				3224	int rc = kmem_cache_shrink(s);
				3225
				3226	if (rc)
				3227	return rc;
				3228	} else
				3229	return -EINVAL;
				3230	return length;
				3231	}
				3232	SLAB_ATTR(shrink);
				3233
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	3234	static ssize_t alloc_calls_show(struct kmem_cache s, char buf)
				3235	{
				3236	if (!(s->flags & SLAB_STORE_USER))
				3237	return -ENOSYS;
				3238	return list_locations(s, buf, TRACK_ALLOC);
				3239	}
				3240	SLAB_ATTR_RO(alloc_calls);
				3241
				3242	static ssize_t free_calls_show(struct kmem_cache s, char buf)
				3243	{
				3244	if (!(s->flags & SLAB_STORE_USER))
				3245	return -ENOSYS;
				3246	return list_locations(s, buf, TRACK_FREE);
				3247	}
				3248	SLAB_ATTR_RO(free_calls);
				3249
#ifdef CONFIG_NUMA
/* sysfs "defrag_ratio": print s->defrag_ratio divided by 10. */
static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sprintf(buf, "%d\n", s->defrag_ratio / 10);
}

/*
 * Parse a decimal number and, if it is below 100, store ten times the
 * value in s->defrag_ratio. Other values are silently ignored. Always
 * consumes the whole input.
 */
static ssize_t defrag_ratio_store(struct kmem_cache *s,
				const char *buf, size_t length)
{
	/*
	 * Kept unsigned: stored in an int, a huge input would wrap
	 * negative and slip through the range check below.
	 */
	unsigned long ratio = simple_strtoul(buf, NULL, 10);

	if (ratio < 100)
		s->defrag_ratio = ratio * 10;
	return length;
}
SLAB_ATTR(defrag_ratio);
#endif
				3267
				3268	static struct attribute * slab_attrs[] = {
				3269	&slab_size_attr.attr,
				3270	&object_size_attr.attr,
				3271	&objs_per_slab_attr.attr,
				3272	&order_attr.attr,
				3273	&objects_attr.attr,
				3274	&slabs_attr.attr,
				3275	&partial_attr.attr,
				3276	&cpu_slabs_attr.attr,
				3277	&ctor_attr.attr,
				3278	&dtor_attr.attr,
				3279	&aliases_attr.attr,
				3280	&align_attr.attr,
				3281	&sanity_checks_attr.attr,
				3282	&trace_attr.attr,
				3283	&hwcache_align_attr.attr,
				3284	&reclaim_account_attr.attr,
				3285	&destroy_by_rcu_attr.attr,
				3286	&red_zone_attr.attr,
				3287	&poison_attr.attr,
				3288	&store_user_attr.attr,
Christoph Lameter	53e15af	2007-05-06 14:49:43 -0700	[diff] [blame]	3289	&validate_attr.attr,
Christoph Lameter	2086d26	2007-05-06 14:49:46 -0700	[diff] [blame]	3290	&shrink_attr.attr,
Christoph Lameter	88a420e	2007-05-06 14:49:45 -0700	[diff] [blame]	3291	&alloc_calls_attr.attr,
				3292	&free_calls_attr.attr,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3293	#ifdef CONFIG_ZONE_DMA
				3294	&cache_dma_attr.attr,
				3295	#endif
				3296	#ifdef CONFIG_NUMA
				3297	&defrag_ratio_attr.attr,
				3298	#endif
				3299	NULL
				3300	};
				3301
				3302	static struct attribute_group slab_attr_group = {
				3303	.attrs = slab_attrs,
				3304	};
				3305
				3306	static ssize_t slab_attr_show(struct kobject *kobj,
				3307	struct attribute *attr,
				3308	char *buf)
				3309	{
				3310	struct slab_attribute *attribute;
				3311	struct kmem_cache *s;
				3312	int err;
				3313
				3314	attribute = to_slab_attr(attr);
				3315	s = to_slab(kobj);
				3316
				3317	if (!attribute->show)
				3318	return -EIO;
				3319
				3320	err = attribute->show(s, buf);
				3321
				3322	return err;
				3323	}
				3324
				3325	static ssize_t slab_attr_store(struct kobject *kobj,
				3326	struct attribute *attr,
				3327	const char *buf, size_t len)
				3328	{
				3329	struct slab_attribute *attribute;
				3330	struct kmem_cache *s;
				3331	int err;
				3332
				3333	attribute = to_slab_attr(attr);
				3334	s = to_slab(kobj);
				3335
				3336	if (!attribute->store)
				3337	return -EIO;
				3338
				3339	err = attribute->store(s, buf, len);
				3340
				3341	return err;
				3342	}
				3343
				3344	static struct sysfs_ops slab_sysfs_ops = {
				3345	.show = slab_attr_show,
				3346	.store = slab_attr_store,
				3347	};
				3348
				3349	static struct kobj_type slab_ktype = {
				3350	.sysfs_ops = &slab_sysfs_ops,
				3351	};
				3352
				3353	static int uevent_filter(struct kset kset, struct kobject kobj)
				3354	{
				3355	struct kobj_type *ktype = get_ktype(kobj);
				3356
				3357	if (ktype == &slab_ktype)
				3358	return 1;
				3359	return 0;
				3360	}
				3361
				3362	static struct kset_uevent_ops slab_uevent_ops = {
				3363	.filter = uevent_filter,
				3364	};
				3365
				3366	decl_subsys(slab, &slab_ktype, &slab_uevent_ops);
				3367
				3368	#define ID_STR_LENGTH 64
				3369
				3370	/* Create a unique string id for a slab cache:
				3371	* format
				3372	* :[flags-]size:[memory address of kmemcache]
				3373	*/
				3374	static char create_unique_id(struct kmem_cache s)
				3375	{
				3376	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
				3377	char *p = name;
				3378
				3379	BUG_ON(!name);
				3380
				3381	*p++ = ':';
				3382	/*
				3383	* First flags affecting slabcache operations. We will only
				3384	* get here for aliasable slabs so we do not need to support
				3385	* too many flags. The flags here must cover all flags that
				3386	* are matched during merging to guarantee that the id is
				3387	* unique.
				3388	*/
				3389	if (s->flags & SLAB_CACHE_DMA)
				3390	*p++ = 'd';
				3391	if (s->flags & SLAB_RECLAIM_ACCOUNT)
				3392	*p++ = 'a';
				3393	if (s->flags & SLAB_DEBUG_FREE)
				3394	*p++ = 'F';
				3395	if (p != name + 1)
				3396	*p++ = '-';
				3397	p += sprintf(p, "%07d", s->size);
				3398	BUG_ON(p > name + ID_STR_LENGTH - 1);
				3399	return name;
				3400	}
				3401
				3402	static int sysfs_slab_add(struct kmem_cache *s)
				3403	{
				3404	int err;
				3405	const char *name;
				3406	int unmergeable;
				3407
				3408	if (slab_state < SYSFS)
				3409	/* Defer until later */
				3410	return 0;
				3411
				3412	unmergeable = slab_unmergeable(s);
				3413	if (unmergeable) {
				3414	/*
				3415	* Slabcache can never be merged so we can use the name proper.
				3416	* This is typically the case for debug situations. In that
				3417	* case we can catch duplicate names easily.
				3418	*/
Linus Torvalds	0f9008e	2007-05-07 12:31:58 -0700	[diff] [blame]	3419	sysfs_remove_link(&slab_subsys.kobj, s->name);
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3420	name = s->name;
				3421	} else {
				3422	/*
				3423	* Create a unique name for the slab as a target
				3424	* for the symlinks.
				3425	*/
				3426	name = create_unique_id(s);
				3427	}
				3428
				3429	kobj_set_kset_s(s, slab_subsys);
				3430	kobject_set_name(&s->kobj, name);
				3431	kobject_init(&s->kobj);
				3432	err = kobject_add(&s->kobj);
				3433	if (err)
				3434	return err;
				3435
				3436	err = sysfs_create_group(&s->kobj, &slab_attr_group);
				3437	if (err)
				3438	return err;
				3439	kobject_uevent(&s->kobj, KOBJ_ADD);
				3440	if (!unmergeable) {
				3441	/* Setup first alias */
				3442	sysfs_slab_alias(s, s->name);
				3443	kfree(name);
				3444	}
				3445	return 0;
				3446	}
				3447
				3448	static void sysfs_slab_remove(struct kmem_cache *s)
				3449	{
				3450	kobject_uevent(&s->kobj, KOBJ_REMOVE);
				3451	kobject_del(&s->kobj);
				3452	}
				3453
				3454	/*
				3455	* Need to buffer aliases during bootup until sysfs becomes
				3456	* available lest we loose that information.
				3457	*/
				3458	struct saved_alias {
				3459	struct kmem_cache *s;
				3460	const char *name;
				3461	struct saved_alias *next;
				3462	};
				3463
				3464	struct saved_alias *alias_list;
				3465
				3466	static int sysfs_slab_alias(struct kmem_cache s, const char name)
				3467	{
				3468	struct saved_alias *al;
				3469
				3470	if (slab_state == SYSFS) {
				3471	/*
				3472	* If we have a leftover link then remove it.
				3473	*/
Linus Torvalds	0f9008e	2007-05-07 12:31:58 -0700	[diff] [blame]	3474	sysfs_remove_link(&slab_subsys.kobj, name);
				3475	return sysfs_create_link(&slab_subsys.kobj,
Christoph Lameter	81819f0	2007-05-06 14:49:36 -0700	[diff] [blame]	3476	&s->kobj, name);
				3477	}
				3478
				3479	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
				3480	if (!al)
				3481	return -ENOMEM;
				3482
				3483	al->s = s;
				3484	al->name = name;
				3485	al->next = alias_list;
				3486	alias_list = al;
				3487	return 0;
				3488	}
				3489
				3490	static int __init slab_sysfs_init(void)
				3491	{
				3492	int err;
				3493
				3494	err = subsystem_register(&slab_subsys);
				3495	if (err) {
				3496	printk(KERN_ERR "Cannot register slab subsystem.\n");
				3497	return -ENOSYS;
				3498	}
				3499
				3500	finish_bootstrap();
				3501
				3502	while (alias_list) {
				3503	struct saved_alias *al = alias_list;
				3504
				3505	alias_list = alias_list->next;
				3506	err = sysfs_slab_alias(al->s, al->name);
				3507	BUG_ON(err);
				3508	kfree(al);
				3509	}
				3510
				3511	resiliency_test();
				3512	return 0;
				3513	}
				3514
				3515	__initcall(slab_sysfs_init);
				3516	#else
				3517	__initcall(finish_bootstrap);
				3518	#endif