/*
 * Copyright(c) 2016 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax.h"

static int dax_major;
static struct class *dax_class;
static DEFINE_IDA(dax_minor_ida);

/**
 * struct dax_region - mapping infrastructure for dax devices
 * @id: kernel-wide unique region id for a memory range
 * @ida: instance id allocator for child dax devices
 * @base: linear address corresponding to @res
 * @kref: to pin while other agents have a need to do lookups
 * @dev: parent device backing this region
 * @align: allocation and mapping alignment for child dax devices
 * @res: physical address range of the region
 * @pfn_flags: identify whether the pfns are paged back or not
 */
struct dax_region {
	int id;
	struct ida ida;
	void *base;
	struct kref kref;
	struct device *dev;
	unsigned int align;
	struct resource res;
	unsigned long pfn_flags;
};
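
/*
 * Note on @pfn_flags: it shapes what mappings a child device allows.
 * With PFN_DEV alone (no devmap / struct page backing), check_vma()
 * below insists on VM_DONTCOPY (MADV_DONTFORK) mappings, and pmd
 * faults are refused because they require pfn_t_devmap() pfns.
 */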

/**
 * struct dax_dev - subdivision of a dax region
 * @region: parent region
 * @dev: device backing the character device
 * @kref: enable this data to be tracked in filp->private_data
 * @alive: !alive + rcu grace period == no new mappings can be established
 * @id: child id in the region
 * @num_resources: number of physical address extents in this device
 * @res: array of physical address ranges
 */
struct dax_dev {
	struct dax_region *region;
	struct device *dev;
	struct kref kref;
	bool alive;
	int id;
	int num_resources;
	struct resource res[0];
};

static void dax_region_free(struct kref *kref)
{
	struct dax_region *dax_region;

	dax_region = container_of(kref, struct dax_region, kref);
	kfree(dax_region);
}

void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);

static void dax_dev_free(struct kref *kref)
{
	struct dax_dev *dax_dev;

	dax_dev = container_of(kref, struct dax_dev, kref);
	dax_region_put(dax_dev->region);
	kfree(dax_dev);
}

static void dax_dev_put(struct dax_dev *dax_dev)
{
	kref_put(&dax_dev->kref, dax_dev_free);
}

struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;

	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);

	if (!dax_region)
		return NULL;

	memcpy(&dax_region->res, res, sizeof(*res));
	dax_region->pfn_flags = pfn_flags;
	kref_init(&dax_region->kref);
	dax_region->id = region_id;
	ida_init(&dax_region->ida);
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->base = addr;

	return dax_region;
}
EXPORT_SYMBOL_GPL(alloc_dax_region);

static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_dev *dax_dev = dev_get_drvdata(dev);
	unsigned long long size = 0;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++)
		size += resource_size(&dax_dev->res[i]);

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);

static struct attribute *dax_device_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dax_device_attribute_group = {
	.attrs = dax_device_attributes,
};

static const struct attribute_group *dax_attribute_groups[] = {
	&dax_device_attribute_group,
	NULL,
};

static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dax_dev->region;
	struct device *dev = dax_dev->dev;
	unsigned long mask;

	if (!dax_dev->alive)
		return -ENXIO;

	/* prevent private / writable mappings from being established */
	if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}

static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	phys_addr_t phys;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++) {
		res = &dax_dev->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dax_dev->num_resources) {
		res = &dax_dev->res[i];
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}
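
/*
 * Worked example for pgoff_to_phys() (hypothetical layout): consider a
 * dax_dev with two 1G resources, res[0] = [0x100000000, 0x13fffffff]
 * and res[1] = [0x200000000, 0x23fffffff].  A fault at pgoff 0x40000
 * (device offset 1G) first tries res[0]: 0x40000 * PAGE_SIZE +
 * 0x100000000 = 0x140000000, which is past res[0]->end, so pgoff is
 * decremented by the 0x40000 pfns of res[0] and the walk continues.
 * Against res[1] the recomputed phys is 0x200000000, which is in
 * range, and the final check confirms [phys, phys + size - 1] does not
 * spill past res[1]->end.
 */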

static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
		struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long) vmf->virtual_address;
	struct device *dev = dax_dev->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;

	if (check_vma(dax_dev, vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vma, vaddr, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int rc;
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			? "write" : "read", vma->vm_start, vma->vm_end);
	rcu_read_lock();
	rc = __dax_dev_fault(dax_dev, vma, vmf);
	rcu_read_unlock();

	return rc;
}

static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
		unsigned int flags)
{
	unsigned long pmd_addr = addr & PMD_MASK;
	struct device *dev = dax_dev->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;

	if (check_vma(dax_dev, vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	/* the whole pmd extent must fit within one resource */
	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
			flags & FAULT_FLAG_WRITE);
}

static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, unsigned int flags)
{
	int rc;
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (flags & FAULT_FLAG_WRITE)
			? "write" : "read", vma->vm_start, vma->vm_end);

	rcu_read_lock();
	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
	rcu_read_unlock();

	return rc;
}

static void dax_dev_vm_open(struct vm_area_struct *vma)
{
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s\n", __func__);
	kref_get(&dax_dev->kref);
}

static void dax_dev_vm_close(struct vm_area_struct *vma)
{
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(dax_dev->dev, "%s\n", __func__);
	dax_dev_put(dax_dev);
}

static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.pmd_fault = dax_dev_pmd_fault,
	.open = dax_dev_vm_open,
	.close = dax_dev_vm_close,
};

static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dax_dev *dax_dev = filp->private_data;
	int rc;

	dev_dbg(dax_dev->dev, "%s\n", __func__);

	rc = check_vma(dax_dev, vma, __func__);
	if (rc)
		return rc;

	kref_get(&dax_dev->kref);
	vma->vm_ops = &dax_dev_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
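
/*
 * Example (userspace sketch; "/dev/dax0.0" is a hypothetical instance
 * of the "dax%d.%d" names created below): check_vma() refuses private
 * writable mappings and unaligned vmas, so a consumer typically maps
 * the device MAP_SHARED at the region alignment:
 *
 *	int fd = open("/dev/dax0.0", O_RDWR);
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			fd, 0);
 *
 * where len is a multiple of the region alignment.
 */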

/* return an unmapped area aligned to the dax region's specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dax_dev || addr)
		goto out;

	dax_region = dax_dev->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
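
/*
 * Worked example (hypothetical numbers) for the arithmetic above: with
 * align = 2M, pgoff = 0 (so off = 0) and len = 4M, the search asks for
 * len + align = 6M of address space.  If the mm returns 0x7f0000100000,
 * then (off - addr_align) & (align - 1) = 0x100000 is added, yielding
 * 0x7f0000200000: a 2M-aligned address whose offset into the device is
 * still congruent with off, so pmd faults can map whole 2M extents.
 */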

static int __match_devt(struct device *dev, const void *data)
{
	const dev_t *devt = data;

	return dev->devt == *devt;
}

static struct device *dax_dev_find(dev_t dev_t)
{
	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
}

static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev = NULL;
	struct device *dev;

	dev = dax_dev_find(inode->i_rdev);
	if (!dev)
		return -ENXIO;

	device_lock(dev);
	dax_dev = dev_get_drvdata(dev);
	if (dax_dev) {
		dev_dbg(dev, "%s\n", __func__);
		filp->private_data = dax_dev;
		kref_get(&dax_dev->kref);
		inode->i_flags = S_DAX;
	}
	device_unlock(dev);

	if (!dax_dev) {
		put_device(dev);
		return -ENXIO;
	}
	return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev = filp->private_data;
	struct device *dev = dax_dev->dev;

	dev_dbg(dax_dev->dev, "%s\n", __func__);
	dax_dev_put(dax_dev);
	put_device(dev);

	return 0;
}

static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
};

static void unregister_dax_dev(void *_dev)
{
	struct device *dev = _dev;
	struct dax_dev *dax_dev = dev_get_drvdata(dev);
	struct dax_region *dax_region = dax_dev->region;

	dev_dbg(dev, "%s\n", __func__);

	/*
	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
	 * ensuring that any fault handlers that might have seen
	 * dax_dev->alive == true, have completed. Any fault handlers
	 * that start after synchronize_rcu() has started will abort
	 * upon seeing dax_dev->alive == false.
	 */
	dax_dev->alive = false;
	synchronize_rcu();

	get_device(dev);
	device_unregister(dev);
	ida_simple_remove(&dax_region->ida, dax_dev->id);
	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
	put_device(dev);
	dax_dev_put(dax_dev);
}

int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
		int count)
{
	struct device *parent = dax_region->dev;
	struct dax_dev *dax_dev;
	struct device *dev;
	int rc, minor;
	dev_t dev_t;

	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
	if (!dax_dev)
		return -ENOMEM;
	memcpy(dax_dev->res, res, sizeof(*res) * count);
	dax_dev->num_resources = count;
	kref_init(&dax_dev->kref);
	dax_dev->alive = true;
	dax_dev->region = dax_region;
	kref_get(&dax_region->kref);

	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
	if (dax_dev->id < 0) {
		rc = dax_dev->id;
		goto err_id;
	}

	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
	if (minor < 0) {
		rc = minor;
		goto err_minor;
	}

	dev_t = MKDEV(dax_major, minor);
	dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev,
			dax_attribute_groups, "dax%d.%d", dax_region->id,
			dax_dev->id);
	if (IS_ERR(dev)) {
		rc = PTR_ERR(dev);
		goto err_create;
	}
	dax_dev->dev = dev;

	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
	if (rc)
		return rc;

	return 0;

 err_create:
	ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
	ida_simple_remove(&dax_region->ida, dax_dev->id);
 err_id:
	dax_dev_put(dax_dev);

	return rc;
}
EXPORT_SYMBOL_GPL(devm_create_dax_dev);
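
/*
 * Example (hypothetical driver sketch): a region provider such as
 * dax_pmem would pair the two exported entry points above; "my_pdev",
 * "res" and "addr" are illustrative placeholders, not part of this
 * file:
 *
 *	struct dax_region *dax_region;
 *
 *	dax_region = alloc_dax_region(&my_pdev->dev, region_id, &res,
 *			PMD_SIZE, addr, PFN_DEV | PFN_MAP);
 *	if (!dax_region)
 *		return -ENOMEM;
 *	rc = devm_create_dax_dev(dax_region, &res, 1);
 *	dax_region_put(dax_region);
 *	return rc;
 *
 * devm_create_dax_dev() takes its own reference on the region, so the
 * caller drops its allocation reference once the device is created.
 */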

static int __init dax_init(void)
{
	int rc;

	rc = register_chrdev(0, "dax", &dax_fops);
	if (rc < 0)
		return rc;
	dax_major = rc;

	dax_class = class_create(THIS_MODULE, "dax");
	if (IS_ERR(dax_class)) {
		unregister_chrdev(dax_major, "dax");
		return PTR_ERR(dax_class);
	}

	return 0;
}

static void __exit dax_exit(void)
{
	class_destroy(dax_class);
	unregister_chrdev(dax_major, "dax");
	ida_destroy(&dax_minor_ida);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_init);
module_exit(dax_exit);