pstore: new filesystem interface to platform persistent storage

Some platforms have a small amount of non-volatile storage that
can be used to store information useful to diagnose the cause of
a system crash.  This is the generic part of a file system interface
that presents information from the crash as a series of files in
/dev/pstore.  Once the information has been seen, the underlying
storage is freed by deleting the files.

Signed-off-by: Tony Luck <tony.luck@intel.com>
diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore
new file mode 100644
index 0000000..f1fb2a0
--- /dev/null
+++ b/Documentation/ABI/testing/pstore
@@ -0,0 +1,35 @@
+Where:		/dev/pstore/...
+Date:		January 2011
+Kernel Version: 2.6.38
+Contact:	tony.luck@intel.com
+Description:	Generic interface to platform dependent persistent storage.
+
+		Platforms that provide a mechanism to preserve some data
+		across system reboots can register with this driver to
+		provide a generic interface to show records captured in
+		the dying moments.  In the case of a panic the last part
+		of the console log is captured, but other interesting
+		data can also be saved.
+
+		# mount -t pstore - /dev/pstore
+
+		$ ls -l /dev/pstore
+		total 0
+		-r--r--r-- 1 root root 7896 Nov 30 15:38 dmesg-erst-1
+
+		Different users of this interface will result in different
+		filename prefixes.  Currently two are defined:
+
+		"dmesg"	- saved console log
+		"mce"	- architecture dependent data from fatal h/w error
+
+		Once the information in a file has been read, removing
+		the file will signal to the underlying persistent storage
+		device that it can reclaim the space for later re-use.
+
+		$ rm /dev/pstore/dmesg-erst-1
+
+		The expectation is that all files in /dev/pstore
+		will be saved elsewhere and erased from persistent store
+		soon after boot to free up space ready for the next
+		catastrophe.
diff --git a/Documentation/ABI/testing/sysfs-fs-pstore b/Documentation/ABI/testing/sysfs-fs-pstore
new file mode 100644
index 0000000..8e659d8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-pstore
@@ -0,0 +1,7 @@
+What:		/sys/fs/pstore/kmsg_bytes
+Date:		January 2011
+Kernel Version: 2.6.38
+Contact:	"Tony Luck" <tony.luck@intel.com>
+Description:
+		Controls amount of console log that will be saved
+		to persistent store on oops/panic.
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457..2bbe47f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -188,6 +188,7 @@
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
+source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef..db71a5b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,3 +121,4 @@
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
+obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 0000000..867d0ac
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
+config PSTORE
+	bool "Persistent store support"
+	default n
+	help
+	   This option enables generic access to platform level
+	   persistent storage via "pstore" filesystem that can
+	   be mounted as /dev/pstore.  Only useful if you have
+	   a platform level driver that registers with pstore to
+	   provide the data, so you probably should just go say "Y"
+	   (or "M") to a platform specific persistent store driver
+	   (e.g. ACPI_APEI on X86) which will select this for you.
+	   If you don't have a platform persistent store driver,
+	   say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 0000000..760f4bc
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux pstorefs routines.
+#
+
+obj-y += pstore.o
+
+pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 0000000..0e806aa
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,280 @@
+/*
+ * Persistent Storage - ramfs parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/mount.h>
+#include <linux/ramfs.h>
+#include <linux/sched.h>
+#include <linux/magic.h>
+#include <linux/pstore.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#define	PSTORE_NAMELEN	64
+
+struct pstore_private {
+	u64	id;		/* record id handed to ->erase() on unlink */
+	int	(*erase)(u64);	/* platform callback that erases record 'id' */
+};
+
+#define pstore_get_inode ramfs_get_inode
+
+/*
+ * unlink handler for the root directory: erase the backing record via the
+ * platform callback, free the per-file private data, then simple_unlink().
+ */
+static int pstore_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct pstore_private *p = dentry->d_inode->i_private;
+
+	p->erase(p->id);	/* return value is not checked */
+	kfree(p);
+
+	return simple_unlink(dir, dentry);
+}
+
+static const struct inode_operations pstore_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.unlink		= pstore_unlink,	/* rm also erases the record */
+};
+
+static const struct super_operations pstore_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.show_options	= generic_show_options,
+};
+
+static struct super_block *pstore_sb;	/* set in fill_super, reset in kill_sb */
+static struct vfsmount *pstore_mnt;	/* set in get_sb, reset in kill_sb */
+
+int pstore_is_mounted(void)
+{
+	return pstore_mnt != NULL;	/* non-zero while pstore_mnt is set */
+}
+
+/*
+ * Fake up an open file on "inode" and write "size" bytes of the kernel
+ * buffer "data" to it. Returns non-zero only if every byte was written.
+ */
+static int pstore_writefile(struct inode *inode, struct dentry *dentry,
+	char *data, size_t size)
+{
+	struct file f;
+	ssize_t n;
+	mm_segment_t old_fs = get_fs();
+
+	memset(&f, 0, sizeof f);	/* zero bytes, not the character '0' */
+	f.f_mapping = inode->i_mapping;
+	f.f_path.dentry = dentry;
+	f.f_path.mnt = pstore_mnt;
+	f.f_pos = 0;
+	f.f_op = inode->i_fop;
+	set_fs(KERNEL_DS);	/* "data" is a kernel pointer */
+	n = do_sync_write(&f, data, size, &f.f_pos);
+	set_fs(old_fs);
+
+	fsnotify_modify(&f);
+
+	return n == size;
+}
+
+/*
+ * Make a read-only regular file "<type>-<psname>-<id>" in the root directory
+ * of our file system and load it up with "size" bytes of data from "data".
+ * Set the mtime & ctime to the date that this record was originally stored.
+ * Returns 0, else -ENOMEM/-ENOSPC. "erase" runs with "id" on later unlink.
+ */
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+			      char *data, size_t size,
+			      struct timespec time, int (*erase)(u64))
+{
+	struct dentry		*root = pstore_sb->s_root;
+	struct dentry		*dentry;
+	struct inode		*inode;
+	int			rc;
+	char			name[PSTORE_NAMELEN];
+	struct pstore_private	*private;
+
+	rc = -ENOMEM;
+	inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
+	if (!inode)
+		goto fail;
+	inode->i_uid = inode->i_gid = 0;
+	private = kmalloc(sizeof *private, GFP_KERNEL);
+	if (!private)
+		goto fail_alloc;
+	private->id = id;
+	private->erase = erase;
+
+	switch (type) {	/* bounded: psname must not overflow name[] */
+	case PSTORE_TYPE_DMESG:
+		snprintf(name, sizeof name, "dmesg-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_MCE:
+		snprintf(name, sizeof name, "mce-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_UNKNOWN:
+		snprintf(name, sizeof name, "unknown-%s-%lld", psname, id);
+		break;
+	default:
+		snprintf(name, sizeof name, "type%d-%s-%lld",
+			 type, psname, id);
+		break;
+	}
+
+	mutex_lock(&root->d_inode->i_mutex);
+	rc = -ENOSPC;
+	dentry = d_alloc_name(root, name);
+	if (!dentry)	/* d_alloc_name() returns NULL, not ERR_PTR */
+		goto fail_lockedalloc;
+
+	d_add(dentry, inode);
+	mutex_unlock(&root->d_inode->i_mutex);
+
+	if (!pstore_writefile(inode, dentry, data, size))
+		goto fail_write;
+
+	inode->i_private = private;
+
+	if (time.tv_sec)
+		inode->i_mtime = inode->i_ctime = time;
+
+	return 0;
+
+fail_write:
+	kfree(private);
+	inode->i_nlink--;
+	mutex_lock(&root->d_inode->i_mutex);
+	d_delete(dentry);
+	dput(dentry);
+	mutex_unlock(&root->d_inode->i_mutex);
+	goto fail;
+
+fail_lockedalloc:
+	mutex_unlock(&root->d_inode->i_mutex);
+	kfree(private);
+fail_alloc:
+	iput(inode);
+
+fail:
+	return rc;
+}
+
+int pstore_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	save_mount_options(sb, data);
+
+	pstore_sb = sb;		/* used later by pstore_mkfile() */
+
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_blocksize		= PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_magic		= PSTOREFS_MAGIC;
+	sb->s_op		= &pstore_ops;
+	sb->s_time_gran		= 1;
+
+	inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+	if (!inode) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	/* override ramfs "dir" options so we catch unlink(2) */
+	inode->i_op = &pstore_dir_inode_operations;
+
+	root = d_alloc_root(inode);
+	sb->s_root = root;
+	if (!root) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	pstore_get_records();	/* create files for records already stored */
+
+	return 0;
+fail:
+	iput(inode);	/* inode is NULL here if pstore_get_inode() failed */
+	return err;
+}
+
+static int pstore_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	struct dentry *root;
+
+	root = mount_nodev(fs_type, flags, data, pstore_fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);	/* pass on the real error code */
+
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	pstore_mnt = mnt;	/* makes pstore_is_mounted() report true */
+
+	return 0;
+}
+
+static void pstore_kill_sb(struct super_block *sb)
+{
+	kill_litter_super(sb);
+	pstore_sb = NULL;
+	pstore_mnt = NULL;	/* pstore_is_mounted() reports false again */
+}
+
+static struct file_system_type pstore_fs_type = {
+	.name		= "pstore",	/* as in: mount -t pstore */
+	.get_sb		= pstore_get_sb,
+	.kill_sb	= pstore_kill_sb,
+};
+
+/* Create /sys/fs/pstore/kmsg_bytes, then register the "pstore" filesystem. */
+static int __init init_pstore_fs(void)
+{
+	int ret;
+	struct kobject *pstorefs_kobj;
+
+	pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj);
+	if (!pstorefs_kobj)
+		return -ENOMEM;
+	ret = sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+	if (ret)	/* previously ignored */
+		goto out_put;
+	ret = register_filesystem(&pstore_fs_type);
+	if (!ret)
+		return 0;
+	sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+out_put:
+	kobject_put(pstorefs_kobj);
+	return ret;
+}
+module_init(init_pstore_fs)
+
+MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 0000000..76c26d2
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,7 @@
+extern void	pstore_get_records(void);
+extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
+			      char *data, size_t size,
+			      struct timespec time, int (*erase)(u64));
+extern int	pstore_is_mounted(void);
+
+extern struct kobj_attribute pstore_kmsg_bytes_attr;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 0000000..705fdf8
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,202 @@
+/*
+ * Persistent Storage - platform driver interface parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kmsg_dump.h>
+#include <linux/module.h>
+#include <linux/pstore.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * pstore_lock just protects "psinfo" during
+ * calls to pstore_register()
+ */
+static DEFINE_SPINLOCK(pstore_lock);
+static struct pstore_info *psinfo;
+
+/* How much of the console log to snapshot. /sys/fs/pstore/kmsg_bytes */
+static unsigned long kmsg_bytes = 10240;
+
+static ssize_t b_show(struct kobject *kobj,
+		      struct kobj_attribute *attr, char *buf)
+{	/* sysfs show: print the current kmsg_bytes value */
+	return snprintf(buf, PAGE_SIZE, "%lu\n", kmsg_bytes);
+}
+
+static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
+		       const char *buf, size_t count)
+{	/* sysfs store: returning 0 makes write(2) retry forever, so fail */
+	return (sscanf(buf, "%lu", &kmsg_bytes) > 0) ? count : -EINVAL;
+}
+
+struct kobj_attribute pstore_kmsg_bytes_attr =	/* /sys/fs/pstore/kmsg_bytes */
+	__ATTR(kmsg_bytes, S_IRUGO | S_IWUSR, b_show, b_store);
+
+/* Tag each group of saved records with a sequence number */
+static int	oopscount;
+
+/*
+ * callback from kmsg_dump. (s2,l2) has the most recently
+ * written bytes, older bytes are in (s1,l1). Save up to kmsg_bytes from
+ * the end of the log, one "Oops#N PartM" headed record per pass, newest first.
+ */
+static void pstore_dump(struct kmsg_dumper *dumper,
+	    enum kmsg_dump_reason reason,
+	    const char *s1, unsigned long l1,
+	    const char *s2, unsigned long l2)
+{
+	unsigned long	s1_start, s2_start;
+	unsigned long	l1_cpy, l2_cpy;
+	unsigned long	size, total = 0;
+	char		*dst;
+	u64		id;
+	int		hsize, part = 1;
+
+	mutex_lock(&psinfo->buf_mutex);
+	oopscount++;
+	while (total < kmsg_bytes) {
+		dst = psinfo->buf;
+		hsize = sprintf(dst, "Oops#%d Part%d\n", oopscount, part++);
+		size = psinfo->bufsize - hsize;	/* room left after header */
+		dst += hsize;
+
+		l2_cpy = min(l2, size);	/* newest bytes get first claim */
+		l1_cpy = min(l1, size - l2_cpy);
+
+		if (l1_cpy + l2_cpy == 0)	/* nothing left to save */
+			break;
+
+		s2_start = l2 - l2_cpy;	/* take the tail of each segment */
+		s1_start = l1 - l1_cpy;
+
+		memcpy(dst, s1 + s1_start, l1_cpy);
+		memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
+
+		id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
+		if (pstore_is_mounted())
+			pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
+				      psinfo->buf, hsize + l1_cpy + l2_cpy,
+				      CURRENT_TIME, psinfo->erase);
+		l1 -= l1_cpy;	/* what remains is the older part */
+		l2 -= l2_cpy;
+		total += l1_cpy + l2_cpy;
+	}
+	mutex_unlock(&psinfo->buf_mutex);
+}
+
+static struct kmsg_dumper pstore_dumper = {
+	.dump = pstore_dump,	/* registered in pstore_register() */
+};
+
+/*
+ * platform specific persistent storage driver registers with
+ * us here. If pstore is already mounted, call the platform
+ * read function right away to populate the file system. If not
+ * then the pstore mount code will call us later to fill out
+ * the file system.
+ *
+ * Register with kmsg_dump to save last part of console log on panic.
+ * Returns 0, -EBUSY if one is registered, -EINVAL if owner can't be pinned.
+ */
+int pstore_register(struct pstore_info *psi)
+{
+	struct module *owner = psi->owner;
+
+	if (owner && !try_module_get(owner))	/* pin before publishing */
+		return -EINVAL;
+
+	spin_lock(&pstore_lock);
+	if (psinfo) {
+		spin_unlock(&pstore_lock);
+		module_put(owner);	/* a NULL owner is accepted */
+		return -EBUSY;
+	}
+	psinfo = psi;
+	spin_unlock(&pstore_lock);
+
+	if (pstore_is_mounted())
+		pstore_get_records();
+
+	kmsg_dump_register(&pstore_dumper);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_register);
+
+/*
+ * Read all the records from the persistent store and create a file for
+ * each in our filesystem. Logs a warning if any file could not be created.
+ */
+void pstore_get_records(void)
+{
+	struct pstore_info *psi = psinfo;
+	size_t			size;
+	u64			id;
+	enum pstore_type_id	type;
+	struct timespec		time;
+	int			failed = 0;
+
+	if (!psi)
+		return;
+
+	mutex_lock(&psi->buf_mutex);	/* use the snapshot, not the global */
+	while ((size = psi->read(&id, &type, &time)) > 0) {
+		if (pstore_mkfile(type, psi->name, id, psi->buf, size,
+				  time, psi->erase))
+			failed++;
+	}
+	mutex_unlock(&psi->buf_mutex);
+
+	if (failed)
+		printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
+		       failed, psi->name);
+}
+
+/*
+ * Call platform driver to write a record to the persistent store.
+ * Returns 0, -ENODEV with no backend, -EFBIG if size exceeds its buffer.
+ */
+int pstore_write(enum pstore_type_id type, char *buf, size_t size)
+{
+	u64	id;
+
+	if (!psinfo)
+		return -ENODEV;
+
+	if (size > psinfo->bufsize)
+		return -EFBIG;
+
+	mutex_lock(&psinfo->buf_mutex);
+	memcpy(psinfo->buf, buf, size);
+	id = psinfo->write(type, size);
+	if (pstore_is_mounted())	/* name the file after the real type */
+		pstore_mkfile(type, psinfo->name, id, psinfo->buf,
+			      size, CURRENT_TIME, psinfo->erase);
+	mutex_unlock(&psinfo->buf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_write);
diff --git a/include/linux/magic.h b/include/linux/magic.h
index ff690d0..e87fd5a 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -26,6 +26,7 @@
 #define ISOFS_SUPER_MAGIC	0x9660
 #define JFFS2_SUPER_MAGIC	0x72b6
 #define ANON_INODE_FS_MAGIC	0x09041934
+#define PSTOREFS_MAGIC		0x6165676C
 
 #define MINIX_SUPER_MAGIC	0x137F		/* original minix fs */
 #define MINIX_SUPER_MAGIC2	0x138F		/* minix fs, 30 char names */
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
new file mode 100644
index 0000000..4197773
--- /dev/null
+++ b/include/linux/pstore.h
@@ -0,0 +1,60 @@
+/*
+ * Persistent Storage - pstore.h
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ * This code is the generic layer to export data records from platform
+ * level persistent storage via a file system.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef _LINUX_PSTORE_H
+#define _LINUX_PSTORE_H
+
+/* record types; each one selects the file name prefix used by pstore_mkfile */
+enum pstore_type_id {
+	PSTORE_TYPE_DMESG	= 0,	/* "dmesg-..." */
+	PSTORE_TYPE_MCE		= 1,	/* "mce-..." */
+	PSTORE_TYPE_UNKNOWN	= 255	/* "unknown-..." */
+};
+
+struct pstore_info {
+	struct module	*owner;		/* pinned by pstore_register() */
+	char		*name;		/* appears in the file names */
+	struct mutex	buf_mutex;	/* serialize access to 'buf' */
+	char		*buf;		/* record data is exchanged here */
+	size_t		bufsize;	/* largest record 'buf' can hold */
+	size_t		(*read)(u64 *id, enum pstore_type_id *type,
+			struct timespec *time);	/* 0 ends the read loop */
+	u64		(*write)(enum pstore_type_id type, size_t size);
+	int		(*erase)(u64 id);	/* called when file is unlinked */
+};
+
+#ifdef CONFIG_PSTORE
+extern int pstore_register(struct pstore_info *);
+extern int pstore_write(enum pstore_type_id type, char *buf, size_t size);
+#else	/* !CONFIG_PSTORE: stubs that always fail with -ENODEV */
+static inline int
+pstore_register(struct pstore_info *psi)
+{
+	return -ENODEV;
+}
+static inline int
+pstore_write(enum pstore_type_id type, char *buf, size_t size)
+{
+	return -ENODEV;
+}
+#endif	/* CONFIG_PSTORE */
+
+#endif /*_LINUX_PSTORE_H*/