/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/genhd.h>
#include <linux/cdev.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
static struct hlist_head dax_host_list[DAX_HASH_SIZE];
static DEFINE_SPINLOCK(dax_host_lock);

int dax_read_lock(void)
{
        return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
        srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);

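/*
 * Illustrative usage (not part of this file): callers bracket any use
 * of dax_operations with the SRCU read lock so that kill_dax() can use
 * synchronize_srcu() to wait out in-flight operations:
 *
 *      id = dax_read_lock();
 *      len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 *      dax_read_unlock(id);
 */
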
#ifdef CONFIG_BLOCK
int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
                pgoff_t *pgoff)
{
        phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;

        if (pgoff)
                *pgoff = PHYS_PFN(phys_off);
        if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
                return -EINVAL;
        return 0;
}
EXPORT_SYMBOL(bdev_dax_pgoff);

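/*
 * Worked example (assuming 4096-byte pages): a partition that starts
 * at sector 2048 puts its sector 0 at byte 2048 * 512 = 1048576, so
 * *pgoff becomes PHYS_PFN(1048576) = 256. A partition starting at
 * sector 2047 lands at byte 1048064, which is not a multiple of
 * PAGE_SIZE, so bdev_dax_pgoff() returns -EINVAL for it.
 */
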
/**
 * __bdev_dax_supported() - Check if the device supports dax for filesystem
 * @sb: The superblock of the device
 * @blocksize: The block size of the device
 *
 * This is a library function for filesystems to check if the block device
 * can be mounted with dax option.
 *
 * Return: negative errno if unsupported, 0 if supported.
 */
int __bdev_dax_supported(struct super_block *sb, int blocksize)
{
        struct block_device *bdev = sb->s_bdev;
        struct dax_device *dax_dev;
        pgoff_t pgoff;
        int err, id;
        void *kaddr;
        pfn_t pfn;
        long len;

        if (blocksize != PAGE_SIZE) {
                pr_err("VFS (%s): error: unsupported blocksize for dax\n",
                                sb->s_id);
                return -EINVAL;
        }

        err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
        if (err) {
                pr_err("VFS (%s): error: unaligned partition for dax\n",
                                sb->s_id);
                return err;
        }

        dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
        if (!dax_dev) {
                pr_err("VFS (%s): error: device does not support dax\n",
                                sb->s_id);
                return -EOPNOTSUPP;
        }

        id = dax_read_lock();
        len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
        dax_read_unlock(id);

        put_dax(dax_dev);

        if (len < 1) {
                pr_err("VFS (%s): error: dax access failed (%ld)\n",
                                sb->s_id, len);
                return len < 0 ? len : -EIO;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__bdev_dax_supported);
#endif

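/*
 * Illustrative caller (a sketch, not from this file): a filesystem
 * validating its "dax" mount option would call the bdev_dax_supported()
 * wrapper from <linux/dax.h> early in its mount path:
 *
 *      err = bdev_dax_supported(sb, PAGE_SIZE);
 *      if (err)
 *              return err;
 */
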
/**
 * struct dax_device - anchor object for dax services
 * @list: node in dax_host_list for dax_get_by_host() lookups
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @host: optional name for lookups where the device path is not available
 * @private: dax driver private data
 * @alive: !alive + rcu grace period == no new operations / mappings
 * @ops: dax_operations provided by the driver publishing this device
 */
struct dax_device {
        struct hlist_node list;
        struct inode inode;
        struct cdev cdev;
        const char *host;
        void *private;
        bool alive;
        const struct dax_operations *ops;
};

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device-relative @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                void **kaddr, pfn_t *pfn)
{
        long avail;

        /*
         * The device driver is allowed to sleep, in order to make the
         * memory directly accessible.
         */
        might_sleep();

        if (!dax_dev)
                return -EOPNOTSUPP;

        if (!dax_alive(dax_dev))
                return -ENXIO;

        if (nr_pages < 0)
                return nr_pages;

        avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
                        kaddr, pfn);
        if (!avail)
                return -ERANGE;
        return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);

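/*
 * Example usage (illustrative): zero the first page of a device. The
 * return value is capped at @nr_pages but may be smaller, so a caller
 * requesting multiple pages must check how many were actually granted:
 *
 *      id = dax_read_lock();
 *      nr = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 *      if (nr == 1)
 *              memset(kaddr, 0, PAGE_SIZE);
 *      dax_read_unlock(id);
 */
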
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i)
{
        if (!dax_alive(dax_dev))
                return 0;

        if (!dax_dev->ops->copy_from_iter)
                return copy_from_iter(addr, bytes, i);
        return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);

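/*
 * A driver supplies ->copy_from_iter() when it needs a specialized
 * copy, e.g. cache-bypassing stores to persistent memory; otherwise
 * the plain copy_from_iter() fallback above applies. A minimal
 * driver-side sketch (hypothetical function name, modeled on the pmem
 * driver):
 *
 *      static size_t pmem_copy_from_iter(struct dax_device *dax_dev,
 *                      pgoff_t pgoff, void *addr, size_t bytes,
 *                      struct iov_iter *i)
 *      {
 *              return copy_from_iter_nocache(addr, bytes, i);
 *      }
 */
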
bool dax_alive(struct dax_device *dax_dev)
{
        lockdep_assert_held(&dax_srcu);
        return dax_dev->alive;
}
EXPORT_SYMBOL_GPL(dax_alive);

static int dax_host_hash(const char *host)
{
        return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
}

/*
 * Note, rcu is not protecting the liveness of dax_dev; rcu is ensuring
 * that any fault handlers or operations that might have seen
 * dax_alive() have completed. Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;

        dax_dev->alive = false;

        synchronize_srcu(&dax_srcu);

        spin_lock(&dax_host_lock);
        hlist_del_init(&dax_dev->list);
        spin_unlock(&dax_host_lock);

        dax_dev->private = NULL;
}
EXPORT_SYMBOL_GPL(kill_dax);

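/*
 * Illustrative teardown ordering (a sketch, not from this file): a
 * driver that published a dax_device revokes new operations first and
 * only then drops its reference:
 *
 *      kill_dax(dax_dev);
 *      put_dax(dax_dev);
 */
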
static struct inode *dax_alloc_inode(struct super_block *sb)
{
        struct dax_device *dax_dev;

        dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
        if (!dax_dev)
                return NULL;
        return &dax_dev->inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
        return container_of(inode, struct dax_device, inode);
}

static void dax_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct dax_device *dax_dev = to_dax_dev(inode);

        kfree(dax_dev->host);
        dax_dev->host = NULL;
        ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
        kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
        struct dax_device *dax_dev = to_dax_dev(inode);

        WARN_ONCE(dax_dev->alive,
                        "kill_dax() must be called before final iput()\n");
        call_rcu(&inode->i_rcu, dax_i_callback);
}

static const struct super_operations dax_sops = {
        .statfs = simple_statfs,
        .alloc_inode = dax_alloc_inode,
        .destroy_inode = dax_destroy_inode,
        .drop_inode = generic_delete_inode,
};

static struct dentry *dax_mount(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *data)
{
        return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}

static struct file_system_type dax_fs_type = {
        .name = "dax",
        .mount = dax_mount,
        .kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        inode->i_rdev = devt;
        return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
                        dax_test, dax_set, &devt);

        if (!inode)
                return NULL;

        dax_dev = to_dax_dev(inode);
        if (inode->i_state & I_NEW) {
                dax_dev->alive = true;
                inode->i_cdev = &dax_dev->cdev;
                inode->i_mode = S_IFCHR;
                inode->i_flags = S_DAX;
                mapping_set_gfp_mask(&inode->i_data, GFP_USER);
                unlock_new_inode(inode);
        }

        return dax_dev;
}

static void dax_add_host(struct dax_device *dax_dev, const char *host)
{
        int hash;

        /*
         * Unconditionally init dax_dev since it's coming from a
         * non-zeroed slab cache
         */
        INIT_HLIST_NODE(&dax_dev->list);
        dax_dev->host = host;
        if (!host)
                return;

        hash = dax_host_hash(host);
        spin_lock(&dax_host_lock);
        hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
        spin_unlock(&dax_host_lock);
}

struct dax_device *alloc_dax(void *private, const char *__host,
                const struct dax_operations *ops)
{
        struct dax_device *dax_dev;
        const char *host;
        dev_t devt;
        int minor;

        host = kstrdup(__host, GFP_KERNEL);
        if (__host && !host)
                return NULL;

        minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
        if (minor < 0)
                goto err_minor;

        devt = MKDEV(MAJOR(dax_devt), minor);
        dax_dev = dax_dev_get(devt);
        if (!dax_dev)
                goto err_dev;

        dax_add_host(dax_dev, host);
        dax_dev->ops = ops;
        dax_dev->private = private;
        return dax_dev;

 err_dev:
        ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
        kfree(host);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_dax);

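/*
 * Illustrative registration (a sketch with hypothetical driver names,
 * modeled on how a pmem-style driver publishes dax operations):
 *
 *      static const struct dax_operations pmem_dax_ops = {
 *              .direct_access = pmem_dax_direct_access,
 *              .copy_from_iter = pmem_copy_from_iter,
 *      };
 *
 *      dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
 *      if (!dax_dev)
 *              return -ENOMEM;
 *
 * Passing the disk name as @__host is what makes the device findable
 * via dax_get_by_host() below.
 */
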
void put_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;
        iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
 * @host: alternate name for the device registered by a dax driver
 */
struct dax_device *dax_get_by_host(const char *host)
{
        struct dax_device *dax_dev, *found = NULL;
        int hash, id;

        if (!host)
                return NULL;

        hash = dax_host_hash(host);

        id = dax_read_lock();
        spin_lock(&dax_host_lock);
        hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
                if (!dax_alive(dax_dev)
                                || strcmp(host, dax_dev->host) != 0)
                        continue;

                if (igrab(&dax_dev->inode))
                        found = dax_dev;
                break;
        }
        spin_unlock(&dax_host_lock);
        dax_read_unlock(id);

        return found;
}
EXPORT_SYMBOL_GPL(dax_get_by_host);

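/*
 * Example lookup (illustrative): __bdev_dax_supported() above resolves
 * a block device to its dax_device by disk name. A successful lookup
 * takes an inode reference that the caller must drop with put_dax():
 *
 *      dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 *      if (dax_dev) {
 *              ...use dax_dev under dax_read_lock()...
 *              put_dax(dax_dev);
 *      }
 */
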
/**
 * inode_dax() - convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev() which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
        struct cdev *cdev = inode->i_cdev;

        return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

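/*
 * Illustrative use (a sketch modeled on the device-dax character
 * driver): an open() handler recovers the dax_device from the inode
 * that the chardev core hands it:
 *
 *      static int dax_open(struct inode *inode, struct file *filp)
 *      {
 *              struct dax_device *dax_dev = inode_dax(inode);
 *
 *              filp->private_data = dax_get_private(dax_dev);
 *              return 0;
 *      }
 */
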
struct inode *dax_inode(struct dax_device *dax_dev)
{
        return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
        return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
        struct dax_device *dax_dev = _dax_dev;
        struct inode *inode = &dax_dev->inode;

        inode_init_once(inode);
}

static int __dax_fs_init(void)
{
        int rc;

        dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                        init_once);
        if (!dax_cache)
                return -ENOMEM;

        rc = register_filesystem(&dax_fs_type);
        if (rc)
                goto err_register_fs;

        dax_mnt = kern_mount(&dax_fs_type);
        if (IS_ERR(dax_mnt)) {
                rc = PTR_ERR(dax_mnt);
                goto err_mount;
        }
        dax_superblock = dax_mnt->mnt_sb;

        return 0;

 err_mount:
        unregister_filesystem(&dax_fs_type);
 err_register_fs:
        kmem_cache_destroy(dax_cache);

        return rc;
}

static void __dax_fs_exit(void)
{
        kern_unmount(dax_mnt);
        unregister_filesystem(&dax_fs_type);
        kmem_cache_destroy(dax_cache);
}

static int __init dax_fs_init(void)
{
        int rc;

        rc = __dax_fs_init();
        if (rc)
                return rc;

        rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
        if (rc)
                __dax_fs_exit();
        return rc;
}

static void __exit dax_fs_exit(void)
{
        unregister_chrdev_region(dax_devt, MINORMASK+1);
        ida_destroy(&dax_minor_ida);
        __dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);