Merge tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm updates from Dan Williams:
- Replace pcommit with ADR / directed-flushing.
The pcommit instruction, which has not shipped on any product, is
deprecated. Instead, the requirement is that platforms implement
either ADR, or provide one or more flush addresses per nvdimm.
ADR (Asynchronous DRAM Refresh) flushes data in posted write buffers
to the memory controller on a power-fail event.
Flush addresses are defined in ACPI 6.x as an NVDIMM Firmware
Interface Table (NFIT) sub-structure: "Flush Hint Address Structure".
A flush hint is an mmio address that when written and fenced assures
that all previous posted writes targeting a given dimm have been
flushed to media.
- On-demand ARS (address range scrub).
Linux uses the results of the ACPI ARS commands to track bad blocks
in pmem devices. When latent errors are detected we re-scrub the
media to refresh the bad block list, userspace can also request a
re-scrub at any time.
- Support for the Microsoft DSM (device specific method) command
format.
- Support for EDK2/OVMF virtual disk device memory ranges.
- Various fixes and cleanups across the subsystem.
* tag 'libnvdimm-for-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (41 commits)
libnvdimm-btt: Delete an unnecessary check before the function call "__nd_device_register"
nfit: do an ARS scrub on hitting a latent media error
nfit: move to nfit/ sub-directory
nfit, libnvdimm: allow an ARS scrub to be triggered on demand
libnvdimm: register nvdimm_bus devices with an nd_bus driver
pmem: clarify a debug print in pmem_clear_poison
x86/insn: remove pcommit
Revert "KVM: x86: add pcommit support"
nfit, tools/testing/nvdimm/: unify shutdown paths
libnvdimm: move ->module to struct nvdimm_bus_descriptor
nfit: cleanup acpi_nfit_init calling convention
nfit: fix _FIT evaluation memory leak + use after free
tools/testing/nvdimm: add manufacturing_{date|location} dimm properties
tools/testing/nvdimm: add virtual ramdisk range
acpi, nfit: treat virtual ramdisk SPA as pmem region
pmem: kill __pmem address space
pmem: kill wmb_pmem()
libnvdimm, pmem: use nvdimm_flush() for namespace I/O writes
fs/dax: remove wmb_pmem()
libnvdimm, pmem: flush posted-write queues on shutdown
...
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index acad70a..aebd944 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -454,32 +454,7 @@
If you are unsure what to do, do not enable this option.
-config ACPI_NFIT
- tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
- depends on PHYS_ADDR_T_64BIT
- depends on BLK_DEV
- depends on ARCH_HAS_MMIO_FLUSH
- select LIBNVDIMM
- help
- Infrastructure to probe ACPI 6 compliant platforms for
- NVDIMMs (NFIT) and register a libnvdimm device tree. In
- addition to storage devices this also enables libnvdimm to pass
- ACPI._DSM messages for platform/dimm configuration.
-
- To compile this driver as a module, choose M here:
- the module will be called nfit.
-
-config ACPI_NFIT_DEBUG
- bool "NFIT DSM debug"
- depends on ACPI_NFIT
- depends on DYNAMIC_DEBUG
- default n
- help
- Enabling this option causes the nfit driver to dump the
- input and output buffers of _DSM operations on the ACPI0012
- device and its children. This can be very verbose, so leave
- it disabled unless you are debugging a hardware / firmware
- issue.
+source "drivers/acpi/nfit/Kconfig"
source "drivers/acpi/apei/Kconfig"
source "drivers/acpi/dptf/Kconfig"
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 88f54f0..35a6ccb 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -69,7 +69,7 @@
obj-$(CONFIG_ACPI_PROCESSOR) += processor.o
obj-$(CONFIG_ACPI) += container.o
obj-$(CONFIG_ACPI_THERMAL) += thermal.o
-obj-$(CONFIG_ACPI_NFIT) += nfit.o
+obj-$(CONFIG_ACPI_NFIT) += nfit/
obj-$(CONFIG_ACPI) += acpi_memhotplug.o
obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
obj-$(CONFIG_ACPI_BATTERY) += battery.o
diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
new file mode 100644
index 0000000..dd0d53c
--- /dev/null
+++ b/drivers/acpi/nfit/Kconfig
@@ -0,0 +1,26 @@
+config ACPI_NFIT
+ tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ depends on ARCH_HAS_MMIO_FLUSH
+ select LIBNVDIMM
+ help
+ Infrastructure to probe ACPI 6 compliant platforms for
+ NVDIMMs (NFIT) and register a libnvdimm device tree. In
+ addition to storage devices this also enables libnvdimm to pass
+ ACPI._DSM messages for platform/dimm configuration.
+
+ To compile this driver as a module, choose M here:
+ the module will be called nfit.
+
+config ACPI_NFIT_DEBUG
+ bool "NFIT DSM debug"
+ depends on ACPI_NFIT
+ depends on DYNAMIC_DEBUG
+ default n
+ help
+ Enabling this option causes the nfit driver to dump the
+ input and output buffers of _DSM operations on the ACPI0012
+ device and its children. This can be very verbose, so leave
+ it disabled unless you are debugging a hardware / firmware
+ issue.
diff --git a/drivers/acpi/nfit/Makefile b/drivers/acpi/nfit/Makefile
new file mode 100644
index 0000000..a407e76
--- /dev/null
+++ b/drivers/acpi/nfit/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ACPI_NFIT) := nfit.o
+nfit-y := core.o
+nfit-$(CONFIG_X86_MCE) += mce.o
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit/core.c
similarity index 86%
rename from drivers/acpi/nfit.c
rename to drivers/acpi/nfit/core.c
index 1f0e060..8c234dd 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit/core.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ndctl.h>
+#include <linux/sysfs.h>
#include <linux/delay.h>
#include <linux/list.h>
#include <linux/acpi.h>
@@ -50,6 +51,9 @@
MODULE_PARM_DESC(disable_vendor_specific,
"Limit commands to the publicly specified set\n");
+LIST_HEAD(acpi_descs);
+DEFINE_MUTEX(acpi_desc_lock);
+
static struct workqueue_struct *nfit_wq;
struct nfit_table_prev {
@@ -360,7 +364,7 @@
return to_name[type];
}
-static int nfit_spa_type(struct acpi_nfit_system_address *spa)
+int nfit_spa_type(struct acpi_nfit_system_address *spa)
{
int i;
@@ -374,22 +378,25 @@
struct nfit_table_prev *prev,
struct acpi_nfit_system_address *spa)
{
- size_t length = min_t(size_t, sizeof(*spa), spa->header.length);
struct device *dev = acpi_desc->dev;
struct nfit_spa *nfit_spa;
+ if (spa->header.length != sizeof(*spa))
+ return false;
+
list_for_each_entry(nfit_spa, &prev->spas, list) {
- if (memcmp(nfit_spa->spa, spa, length) == 0) {
+ if (memcmp(nfit_spa->spa, spa, sizeof(*spa)) == 0) {
list_move_tail(&nfit_spa->list, &acpi_desc->spas);
return true;
}
}
- nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa), GFP_KERNEL);
+ nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa) + sizeof(*spa),
+ GFP_KERNEL);
if (!nfit_spa)
return false;
INIT_LIST_HEAD(&nfit_spa->list);
- nfit_spa->spa = spa;
+ memcpy(nfit_spa->spa, spa, sizeof(*spa));
list_add_tail(&nfit_spa->list, &acpi_desc->spas);
dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__,
spa->range_index,
@@ -401,21 +408,24 @@
struct nfit_table_prev *prev,
struct acpi_nfit_memory_map *memdev)
{
- size_t length = min_t(size_t, sizeof(*memdev), memdev->header.length);
struct device *dev = acpi_desc->dev;
struct nfit_memdev *nfit_memdev;
+ if (memdev->header.length != sizeof(*memdev))
+ return false;
+
list_for_each_entry(nfit_memdev, &prev->memdevs, list)
- if (memcmp(nfit_memdev->memdev, memdev, length) == 0) {
+ if (memcmp(nfit_memdev->memdev, memdev, sizeof(*memdev)) == 0) {
list_move_tail(&nfit_memdev->list, &acpi_desc->memdevs);
return true;
}
- nfit_memdev = devm_kzalloc(dev, sizeof(*nfit_memdev), GFP_KERNEL);
+ nfit_memdev = devm_kzalloc(dev, sizeof(*nfit_memdev) + sizeof(*memdev),
+ GFP_KERNEL);
if (!nfit_memdev)
return false;
INIT_LIST_HEAD(&nfit_memdev->list);
- nfit_memdev->memdev = memdev;
+ memcpy(nfit_memdev->memdev, memdev, sizeof(*memdev));
list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
__func__, memdev->device_handle, memdev->range_index,
@@ -423,25 +433,42 @@
return true;
}
+/*
+ * An implementation may provide a truncated control region if no block windows
+ * are defined.
+ */
+static size_t sizeof_dcr(struct acpi_nfit_control_region *dcr)
+{
+ if (dcr->header.length < offsetof(struct acpi_nfit_control_region,
+ window_size))
+ return 0;
+ if (dcr->windows)
+ return sizeof(*dcr);
+ return offsetof(struct acpi_nfit_control_region, window_size);
+}
+
static bool add_dcr(struct acpi_nfit_desc *acpi_desc,
struct nfit_table_prev *prev,
struct acpi_nfit_control_region *dcr)
{
- size_t length = min_t(size_t, sizeof(*dcr), dcr->header.length);
struct device *dev = acpi_desc->dev;
struct nfit_dcr *nfit_dcr;
+ if (!sizeof_dcr(dcr))
+ return false;
+
list_for_each_entry(nfit_dcr, &prev->dcrs, list)
- if (memcmp(nfit_dcr->dcr, dcr, length) == 0) {
+ if (memcmp(nfit_dcr->dcr, dcr, sizeof_dcr(dcr)) == 0) {
list_move_tail(&nfit_dcr->list, &acpi_desc->dcrs);
return true;
}
- nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr), GFP_KERNEL);
+ nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr) + sizeof(*dcr),
+ GFP_KERNEL);
if (!nfit_dcr)
return false;
INIT_LIST_HEAD(&nfit_dcr->list);
- nfit_dcr->dcr = dcr;
+ memcpy(nfit_dcr->dcr, dcr, sizeof_dcr(dcr));
list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs);
dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__,
dcr->region_index, dcr->windows);
@@ -452,71 +479,102 @@
struct nfit_table_prev *prev,
struct acpi_nfit_data_region *bdw)
{
- size_t length = min_t(size_t, sizeof(*bdw), bdw->header.length);
struct device *dev = acpi_desc->dev;
struct nfit_bdw *nfit_bdw;
+ if (bdw->header.length != sizeof(*bdw))
+ return false;
list_for_each_entry(nfit_bdw, &prev->bdws, list)
- if (memcmp(nfit_bdw->bdw, bdw, length) == 0) {
+ if (memcmp(nfit_bdw->bdw, bdw, sizeof(*bdw)) == 0) {
list_move_tail(&nfit_bdw->list, &acpi_desc->bdws);
return true;
}
- nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw), GFP_KERNEL);
+ nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw) + sizeof(*bdw),
+ GFP_KERNEL);
if (!nfit_bdw)
return false;
INIT_LIST_HEAD(&nfit_bdw->list);
- nfit_bdw->bdw = bdw;
+ memcpy(nfit_bdw->bdw, bdw, sizeof(*bdw));
list_add_tail(&nfit_bdw->list, &acpi_desc->bdws);
dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__,
bdw->region_index, bdw->windows);
return true;
}
+static size_t sizeof_idt(struct acpi_nfit_interleave *idt)
+{
+ if (idt->header.length < sizeof(*idt))
+ return 0;
+ return sizeof(*idt) + sizeof(u32) * (idt->line_count - 1);
+}
+
static bool add_idt(struct acpi_nfit_desc *acpi_desc,
struct nfit_table_prev *prev,
struct acpi_nfit_interleave *idt)
{
- size_t length = min_t(size_t, sizeof(*idt), idt->header.length);
struct device *dev = acpi_desc->dev;
struct nfit_idt *nfit_idt;
- list_for_each_entry(nfit_idt, &prev->idts, list)
- if (memcmp(nfit_idt->idt, idt, length) == 0) {
+ if (!sizeof_idt(idt))
+ return false;
+
+ list_for_each_entry(nfit_idt, &prev->idts, list) {
+ if (sizeof_idt(nfit_idt->idt) != sizeof_idt(idt))
+ continue;
+
+ if (memcmp(nfit_idt->idt, idt, sizeof_idt(idt)) == 0) {
list_move_tail(&nfit_idt->list, &acpi_desc->idts);
return true;
}
+ }
- nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt), GFP_KERNEL);
+ nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt) + sizeof_idt(idt),
+ GFP_KERNEL);
if (!nfit_idt)
return false;
INIT_LIST_HEAD(&nfit_idt->list);
- nfit_idt->idt = idt;
+ memcpy(nfit_idt->idt, idt, sizeof_idt(idt));
list_add_tail(&nfit_idt->list, &acpi_desc->idts);
dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__,
idt->interleave_index, idt->line_count);
return true;
}
+static size_t sizeof_flush(struct acpi_nfit_flush_address *flush)
+{
+ if (flush->header.length < sizeof(*flush))
+ return 0;
+ return sizeof(*flush) + sizeof(u64) * (flush->hint_count - 1);
+}
+
static bool add_flush(struct acpi_nfit_desc *acpi_desc,
struct nfit_table_prev *prev,
struct acpi_nfit_flush_address *flush)
{
- size_t length = min_t(size_t, sizeof(*flush), flush->header.length);
struct device *dev = acpi_desc->dev;
struct nfit_flush *nfit_flush;
- list_for_each_entry(nfit_flush, &prev->flushes, list)
- if (memcmp(nfit_flush->flush, flush, length) == 0) {
+ if (!sizeof_flush(flush))
+ return false;
+
+ list_for_each_entry(nfit_flush, &prev->flushes, list) {
+ if (sizeof_flush(nfit_flush->flush) != sizeof_flush(flush))
+ continue;
+
+ if (memcmp(nfit_flush->flush, flush,
+ sizeof_flush(flush)) == 0) {
list_move_tail(&nfit_flush->list, &acpi_desc->flushes);
return true;
}
+ }
- nfit_flush = devm_kzalloc(dev, sizeof(*nfit_flush), GFP_KERNEL);
+ nfit_flush = devm_kzalloc(dev, sizeof(*nfit_flush)
+ + sizeof_flush(flush), GFP_KERNEL);
if (!nfit_flush)
return false;
INIT_LIST_HEAD(&nfit_flush->list);
- nfit_flush->flush = flush;
+ memcpy(nfit_flush->flush, flush, sizeof_flush(flush));
list_add_tail(&nfit_flush->list, &acpi_desc->flushes);
dev_dbg(dev, "%s: nfit_flush handle: %d hint_count: %d\n", __func__,
flush->device_handle, flush->hint_count);
@@ -614,7 +672,6 @@
{
u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
struct nfit_memdev *nfit_memdev;
- struct nfit_flush *nfit_flush;
struct nfit_bdw *nfit_bdw;
struct nfit_idt *nfit_idt;
u16 idt_idx, range_index;
@@ -647,14 +704,6 @@
nfit_mem->idt_bdw = nfit_idt->idt;
break;
}
-
- list_for_each_entry(nfit_flush, &acpi_desc->flushes, list) {
- if (nfit_flush->flush->device_handle !=
- nfit_memdev->memdev->device_handle)
- continue;
- nfit_mem->nfit_flush = nfit_flush;
- break;
- }
break;
}
}
@@ -675,6 +724,7 @@
}
list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+ struct nfit_flush *nfit_flush;
struct nfit_dcr *nfit_dcr;
u32 device_handle;
u16 dcr;
@@ -721,6 +771,28 @@
break;
}
+ list_for_each_entry(nfit_flush, &acpi_desc->flushes, list) {
+ struct acpi_nfit_flush_address *flush;
+ u16 i;
+
+ if (nfit_flush->flush->device_handle != device_handle)
+ continue;
+ nfit_mem->nfit_flush = nfit_flush;
+ flush = nfit_flush->flush;
+ nfit_mem->flush_wpq = devm_kzalloc(acpi_desc->dev,
+ flush->hint_count
+ * sizeof(struct resource), GFP_KERNEL);
+ if (!nfit_mem->flush_wpq)
+ return -ENOMEM;
+ for (i = 0; i < flush->hint_count; i++) {
+ struct resource *res = &nfit_mem->flush_wpq[i];
+
+ res->start = flush->hint_address[i];
+ res->end = res->start + 8 - 1;
+ }
+ break;
+ }
+
if (dcr && !nfit_mem->dcr) {
dev_err(acpi_desc->dev, "SPA %d missing DCR %d\n",
spa->range_index, dcr);
@@ -806,14 +878,85 @@
}
static DEVICE_ATTR_RO(revision);
+/*
+ * This shows the number of full Address Range Scrubs that have been
+ * completed since driver load time. Userspace can wait on this using
+ * select/poll etc. A '+' at the end indicates an ARS is in progress
+ */
+static ssize_t scrub_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvdimm_bus_descriptor *nd_desc;
+ ssize_t rc = -ENXIO;
+
+ device_lock(dev);
+ nd_desc = dev_get_drvdata(dev);
+ if (nd_desc) {
+ struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+ rc = sprintf(buf, "%d%s", acpi_desc->scrub_count,
+ (work_busy(&acpi_desc->work)) ? "+\n" : "\n");
+ }
+ device_unlock(dev);
+ return rc;
+}
+
+static ssize_t scrub_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ struct nvdimm_bus_descriptor *nd_desc;
+ ssize_t rc;
+ long val;
+
+ rc = kstrtol(buf, 0, &val);
+ if (rc)
+ return rc;
+ if (val != 1)
+ return -EINVAL;
+
+ device_lock(dev);
+ nd_desc = dev_get_drvdata(dev);
+ if (nd_desc) {
+ struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+ rc = acpi_nfit_ars_rescan(acpi_desc);
+ }
+ device_unlock(dev);
+ if (rc)
+ return rc;
+ return size;
+}
+static DEVICE_ATTR_RW(scrub);
+
+static bool ars_supported(struct nvdimm_bus *nvdimm_bus)
+{
+ struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+ const unsigned long mask = 1 << ND_CMD_ARS_CAP | 1 << ND_CMD_ARS_START
+ | 1 << ND_CMD_ARS_STATUS;
+
+ return (nd_desc->cmd_mask & mask) == mask;
+}
+
+static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+ if (a == &dev_attr_scrub.attr && !ars_supported(nvdimm_bus))
+ return 0;
+ return a->mode;
+}
+
static struct attribute *acpi_nfit_attributes[] = {
&dev_attr_revision.attr,
+ &dev_attr_scrub.attr,
NULL,
};
static struct attribute_group acpi_nfit_attribute_group = {
.name = "nfit",
.attrs = acpi_nfit_attributes,
+ .is_visible = nfit_visible,
};
static const struct attribute_group *acpi_nfit_attribute_groups[] = {
@@ -1130,11 +1273,11 @@
}
/*
- * Until standardization materializes we need to consider up to 3
+ * Until standardization materializes we need to consider 4
* different command sets. Note, that checking for function0 (bit0)
* tells us if any commands are reachable through this uuid.
*/
- for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_HPE2; i++)
+ for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++)
if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
break;
@@ -1144,12 +1287,14 @@
dsm_mask = 0x3fe;
if (disable_vendor_specific)
dsm_mask &= ~(1 << ND_CMD_VENDOR);
- } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1)
+ } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) {
dsm_mask = 0x1c3c76;
- else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) {
+ } else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) {
dsm_mask = 0x1fe;
if (disable_vendor_specific)
dsm_mask &= ~(1 << 8);
+ } else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) {
+ dsm_mask = 0xffffffff;
} else {
dev_dbg(dev, "unknown dimm command family\n");
nfit_mem->family = -1;
@@ -1171,6 +1316,7 @@
int dimm_count = 0;
list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
+ struct acpi_nfit_flush_address *flush;
unsigned long flags = 0, cmd_mask;
struct nvdimm *nvdimm;
u32 device_handle;
@@ -1204,9 +1350,12 @@
if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
cmd_mask |= nfit_mem->dsm_mask;
+ flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
+ : NULL;
nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
acpi_nfit_dimm_attribute_groups,
- flags, cmd_mask);
+ flags, cmd_mask, flush ? flush->hint_count : 0,
+ nfit_mem->flush_wpq);
if (!nvdimm)
return -ENOMEM;
@@ -1374,24 +1523,6 @@
return mmio->base_offset + line_offset + table_offset + sub_line_offset;
}
-static void wmb_blk(struct nfit_blk *nfit_blk)
-{
-
- if (nfit_blk->nvdimm_flush) {
- /*
- * The first wmb() is needed to 'sfence' all previous writes
- * such that they are architecturally visible for the platform
- * buffer flush. Note that we've already arranged for pmem
- * writes to avoid the cache via arch_memcpy_to_pmem(). The
- * final wmb() ensures ordering for the NVDIMM flush write.
- */
- wmb();
- writeq(1, nfit_blk->nvdimm_flush);
- wmb();
- } else
- wmb_pmem();
-}
-
static u32 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
{
struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
@@ -1426,7 +1557,7 @@
offset = to_interleave_offset(offset, mmio);
writeq(cmd, mmio->addr.base + offset);
- wmb_blk(nfit_blk);
+ nvdimm_flush(nfit_blk->nd_region);
if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
readq(mmio->addr.base + offset);
@@ -1477,7 +1608,7 @@
}
if (rw)
- wmb_blk(nfit_blk);
+ nvdimm_flush(nfit_blk->nd_region);
rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
return rc;
@@ -1509,125 +1640,6 @@
return rc;
}
-static void nfit_spa_mapping_release(struct kref *kref)
-{
- struct nfit_spa_mapping *spa_map = to_spa_map(kref);
- struct acpi_nfit_system_address *spa = spa_map->spa;
- struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc;
-
- WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
- dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index);
- if (spa_map->type == SPA_MAP_APERTURE)
- memunmap((void __force *)spa_map->addr.aperture);
- else
- iounmap(spa_map->addr.base);
- release_mem_region(spa->address, spa->length);
- list_del(&spa_map->list);
- kfree(spa_map);
-}
-
-static struct nfit_spa_mapping *find_spa_mapping(
- struct acpi_nfit_desc *acpi_desc,
- struct acpi_nfit_system_address *spa)
-{
- struct nfit_spa_mapping *spa_map;
-
- WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
- list_for_each_entry(spa_map, &acpi_desc->spa_maps, list)
- if (spa_map->spa == spa)
- return spa_map;
-
- return NULL;
-}
-
-static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc,
- struct acpi_nfit_system_address *spa)
-{
- struct nfit_spa_mapping *spa_map;
-
- mutex_lock(&acpi_desc->spa_map_mutex);
- spa_map = find_spa_mapping(acpi_desc, spa);
-
- if (spa_map)
- kref_put(&spa_map->kref, nfit_spa_mapping_release);
- mutex_unlock(&acpi_desc->spa_map_mutex);
-}
-
-static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
- struct acpi_nfit_system_address *spa, enum spa_map_type type)
-{
- resource_size_t start = spa->address;
- resource_size_t n = spa->length;
- struct nfit_spa_mapping *spa_map;
- struct resource *res;
-
- WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
-
- spa_map = find_spa_mapping(acpi_desc, spa);
- if (spa_map) {
- kref_get(&spa_map->kref);
- return spa_map->addr.base;
- }
-
- spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL);
- if (!spa_map)
- return NULL;
-
- INIT_LIST_HEAD(&spa_map->list);
- spa_map->spa = spa;
- kref_init(&spa_map->kref);
- spa_map->acpi_desc = acpi_desc;
-
- res = request_mem_region(start, n, dev_name(acpi_desc->dev));
- if (!res)
- goto err_mem;
-
- spa_map->type = type;
- if (type == SPA_MAP_APERTURE)
- spa_map->addr.aperture = (void __pmem *)memremap(start, n,
- ARCH_MEMREMAP_PMEM);
- else
- spa_map->addr.base = ioremap_nocache(start, n);
-
-
- if (!spa_map->addr.base)
- goto err_map;
-
- list_add_tail(&spa_map->list, &acpi_desc->spa_maps);
- return spa_map->addr.base;
-
- err_map:
- release_mem_region(start, n);
- err_mem:
- kfree(spa_map);
- return NULL;
-}
-
-/**
- * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges
- * @nvdimm_bus: NFIT-bus that provided the spa table entry
- * @nfit_spa: spa table to map
- * @type: aperture or control region
- *
- * In the case where block-data-window apertures and
- * dimm-control-regions are interleaved they will end up sharing a
- * single request_mem_region() + ioremap() for the address range. In
- * the style of devm nfit_spa_map() mappings are automatically dropped
- * when all region devices referencing the same mapping are disabled /
- * unbound.
- */
-static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
- struct acpi_nfit_system_address *spa, enum spa_map_type type)
-{
- void __iomem *iomem;
-
- mutex_lock(&acpi_desc->spa_map_mutex);
- iomem = __nfit_spa_map(acpi_desc, spa, type);
- mutex_unlock(&acpi_desc->spa_map_mutex);
-
- return iomem;
-}
-
static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio,
struct acpi_nfit_interleave *idt, u16 interleave_ways)
{
@@ -1669,9 +1681,7 @@
struct device *dev)
{
struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
- struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
struct nd_blk_region *ndbr = to_nd_blk_region(dev);
- struct nfit_flush *nfit_flush;
struct nfit_blk_mmio *mmio;
struct nfit_blk *nfit_blk;
struct nfit_mem *nfit_mem;
@@ -1697,8 +1707,8 @@
/* map block aperture memory */
nfit_blk->bdw_offset = nfit_mem->bdw->offset;
mmio = &nfit_blk->mmio[BDW];
- mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw,
- SPA_MAP_APERTURE);
+ mmio->addr.base = devm_nvdimm_memremap(dev, nfit_mem->spa_bdw->address,
+ nfit_mem->spa_bdw->length, ARCH_MEMREMAP_PMEM);
if (!mmio->addr.base) {
dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
nvdimm_name(nvdimm));
@@ -1720,8 +1730,8 @@
nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
mmio = &nfit_blk->mmio[DCR];
- mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr,
- SPA_MAP_CONTROL);
+ mmio->addr.base = devm_nvdimm_ioremap(dev, nfit_mem->spa_dcr->address,
+ nfit_mem->spa_dcr->length);
if (!mmio->addr.base) {
dev_dbg(dev, "%s: %s failed to map dcr\n", __func__,
nvdimm_name(nvdimm));
@@ -1746,15 +1756,7 @@
return rc;
}
- nfit_flush = nfit_mem->nfit_flush;
- if (nfit_flush && nfit_flush->flush->hint_count != 0) {
- nfit_blk->nvdimm_flush = devm_ioremap_nocache(dev,
- nfit_flush->flush->hint_address[0], 8);
- if (!nfit_blk->nvdimm_flush)
- return -ENOMEM;
- }
-
- if (!arch_has_wmb_pmem() && !nfit_blk->nvdimm_flush)
+ if (nvdimm_has_flush(nfit_blk->nd_region) < 0)
dev_warn(dev, "unable to guarantee persistence of writes\n");
if (mmio->line_size == 0)
@@ -1773,29 +1775,6 @@
return 0;
}
-static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
- struct device *dev)
-{
- struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
- struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
- struct nd_blk_region *ndbr = to_nd_blk_region(dev);
- struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
- int i;
-
- if (!nfit_blk)
- return; /* never enabled */
-
- /* auto-free BLK spa mappings */
- for (i = 0; i < 2; i++) {
- struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i];
-
- if (mmio->addr.base)
- nfit_spa_unmap(acpi_desc, mmio->spa);
- }
- nd_blk_region_set_provider_data(ndbr, NULL);
- /* devm will free nfit_blk */
-}
-
static int ars_get_cap(struct acpi_nfit_desc *acpi_desc,
struct nd_cmd_ars_cap *cmd, struct nfit_spa *nfit_spa)
{
@@ -1919,11 +1898,11 @@
if (ret)
return ret;
- ret = devm_add_action(acpi_desc->dev, acpi_nfit_remove_resource, res);
- if (ret) {
- remove_resource(res);
+ ret = devm_add_action_or_reset(acpi_desc->dev,
+ acpi_nfit_remove_resource,
+ res);
+ if (ret)
return ret;
- }
return 0;
}
@@ -1969,7 +1948,6 @@
ndr_desc->num_mappings = blk_valid;
ndbr_desc = to_blk_region_desc(ndr_desc);
ndbr_desc->enable = acpi_nfit_blk_region_enable;
- ndbr_desc->disable = acpi_nfit_blk_region_disable;
ndbr_desc->do_io = acpi_desc->blk_do_io;
nfit_spa->nd_region = nvdimm_blk_region_create(acpi_desc->nvdimm_bus,
ndr_desc);
@@ -1981,6 +1959,14 @@
return 0;
}
+static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa)
+{
+ return (nfit_spa_type(spa) == NFIT_SPA_VDISK ||
+ nfit_spa_type(spa) == NFIT_SPA_VCD ||
+ nfit_spa_type(spa) == NFIT_SPA_PDISK ||
+ nfit_spa_type(spa) == NFIT_SPA_PCD);
+}
+
static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
struct nfit_spa *nfit_spa)
{
@@ -1996,7 +1982,7 @@
if (nfit_spa->nd_region)
return 0;
- if (spa->range_index == 0) {
+ if (spa->range_index == 0 && !nfit_spa_is_virtual(spa)) {
dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n",
__func__);
return 0;
@@ -2060,6 +2046,11 @@
ndr_desc);
if (!nfit_spa->nd_region)
rc = -ENOMEM;
+ } else if (nfit_spa_is_virtual(spa)) {
+ nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus,
+ ndr_desc);
+ if (!nfit_spa->nd_region)
+ rc = -ENOMEM;
}
out:
@@ -2139,7 +2130,7 @@
unsigned int tmo = scrub_timeout;
int rc;
- if (nfit_spa->ars_done || !nfit_spa->nd_region)
+ if (!nfit_spa->ars_required || !nfit_spa->nd_region)
return;
rc = ars_start(acpi_desc, nfit_spa);
@@ -2228,7 +2219,9 @@
* firmware initiated scrubs to complete and then we go search for the
* affected spa regions to mark them scanned. In the second phase we
* initiate a directed scrub for every range that was not scrubbed in
- * phase 1.
+ * phase 1. If we're called for a 'rescan', we harmlessly pass through
+ * the first phase, but really only care about running phase 2, where
+ * regions can be notified of new poison.
*/
/* process platform firmware initiated scrubs */
@@ -2331,14 +2324,17 @@
* Flag all the ranges that still need scrubbing, but
* register them now to make data available.
*/
- if (nfit_spa->nd_region)
- nfit_spa->ars_done = 1;
- else
+ if (!nfit_spa->nd_region) {
+ nfit_spa->ars_required = 1;
acpi_nfit_register_region(acpi_desc, nfit_spa);
+ }
}
list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
acpi_nfit_async_scrub(acpi_desc, nfit_spa);
+ acpi_desc->scrub_count++;
+ if (acpi_desc->scrub_count_state)
+ sysfs_notify_dirent(acpi_desc->scrub_count_state);
mutex_unlock(&acpi_desc->init_mutex);
}
@@ -2376,14 +2372,89 @@
return 0;
}
-int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
+static int acpi_nfit_desc_init_scrub_attr(struct acpi_nfit_desc *acpi_desc)
+{
+ struct device *dev = acpi_desc->dev;
+ struct kernfs_node *nfit;
+ struct device *bus_dev;
+
+ if (!ars_supported(acpi_desc->nvdimm_bus))
+ return 0;
+
+ bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
+ nfit = sysfs_get_dirent(bus_dev->kobj.sd, "nfit");
+ if (!nfit) {
+ dev_err(dev, "sysfs_get_dirent 'nfit' failed\n");
+ return -ENODEV;
+ }
+ acpi_desc->scrub_count_state = sysfs_get_dirent(nfit, "scrub");
+ sysfs_put(nfit);
+ if (!acpi_desc->scrub_count_state) {
+ dev_err(dev, "sysfs_get_dirent 'scrub' failed\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void acpi_nfit_destruct(void *data)
+{
+ struct acpi_nfit_desc *acpi_desc = data;
+ struct device *bus_dev = to_nvdimm_bus_dev(acpi_desc->nvdimm_bus);
+
+ /*
+ * Destruct under acpi_desc_lock so that nfit_handle_mce does not
+ * race teardown
+ */
+ mutex_lock(&acpi_desc_lock);
+ acpi_desc->cancel = 1;
+ /*
+ * Bounce the nvdimm bus lock to make sure any in-flight
+ * acpi_nfit_ars_rescan() submissions have had a chance to
+ * either submit or see ->cancel set.
+ */
+ device_lock(bus_dev);
+ device_unlock(bus_dev);
+
+ flush_workqueue(nfit_wq);
+ if (acpi_desc->scrub_count_state)
+ sysfs_put(acpi_desc->scrub_count_state);
+ nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+ acpi_desc->nvdimm_bus = NULL;
+ list_del(&acpi_desc->list);
+ mutex_unlock(&acpi_desc_lock);
+}
+
+int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *data, acpi_size sz)
{
struct device *dev = acpi_desc->dev;
struct nfit_table_prev prev;
const void *end;
- u8 *data;
int rc;
+ if (!acpi_desc->nvdimm_bus) {
+ acpi_nfit_init_dsms(acpi_desc);
+
+ acpi_desc->nvdimm_bus = nvdimm_bus_register(dev,
+ &acpi_desc->nd_desc);
+ if (!acpi_desc->nvdimm_bus)
+ return -ENOMEM;
+
+ rc = devm_add_action_or_reset(dev, acpi_nfit_destruct,
+ acpi_desc);
+ if (rc)
+ return rc;
+
+ rc = acpi_nfit_desc_init_scrub_attr(acpi_desc);
+ if (rc)
+ return rc;
+
+ /* register this acpi_desc for mce notifications */
+ mutex_lock(&acpi_desc_lock);
+ list_add_tail(&acpi_desc->list, &acpi_descs);
+ mutex_unlock(&acpi_desc_lock);
+ }
+
mutex_lock(&acpi_desc->init_mutex);
INIT_LIST_HEAD(&prev.spas);
@@ -2406,7 +2477,6 @@
list_cut_position(&prev.flushes, &acpi_desc->flushes,
acpi_desc->flushes.prev);
- data = (u8 *) acpi_desc->nfit;
end = data + sz;
while (!IS_ERR_OR_NULL(data))
data = add_table(acpi_desc, &prev, data, end);
@@ -2422,12 +2492,9 @@
if (rc)
goto out_unlock;
- if (nfit_mem_init(acpi_desc) != 0) {
- rc = -ENOMEM;
+ rc = nfit_mem_init(acpi_desc);
+ if (rc)
goto out_unlock;
- }
-
- acpi_nfit_init_dsms(acpi_desc);
rc = acpi_nfit_register_dimms(acpi_desc);
if (rc)
@@ -2496,6 +2563,33 @@
return 0;
}
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc)
+{
+ struct device *dev = acpi_desc->dev;
+ struct nfit_spa *nfit_spa;
+
+ if (work_busy(&acpi_desc->work))
+ return -EBUSY;
+
+ if (acpi_desc->cancel)
+ return 0;
+
+ mutex_lock(&acpi_desc->init_mutex);
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+ struct acpi_nfit_system_address *spa = nfit_spa->spa;
+
+ if (nfit_spa_type(spa) != NFIT_SPA_PM)
+ continue;
+
+ nfit_spa->ars_required = 1;
+ }
+ queue_work(nfit_wq, &acpi_desc->work);
+ dev_dbg(dev, "%s: ars_scan triggered\n", __func__);
+ mutex_unlock(&acpi_desc->init_mutex);
+
+ return 0;
+}
+
void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
{
struct nvdimm_bus_descriptor *nd_desc;
@@ -2505,12 +2599,12 @@
acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io;
nd_desc = &acpi_desc->nd_desc;
nd_desc->provider_name = "ACPI.NFIT";
+ nd_desc->module = THIS_MODULE;
nd_desc->ndctl = acpi_nfit_ctl;
nd_desc->flush_probe = acpi_nfit_flush_probe;
nd_desc->clear_to_send = acpi_nfit_clear_to_send;
nd_desc->attr_groups = acpi_nfit_attribute_groups;
- INIT_LIST_HEAD(&acpi_desc->spa_maps);
INIT_LIST_HEAD(&acpi_desc->spas);
INIT_LIST_HEAD(&acpi_desc->dcrs);
INIT_LIST_HEAD(&acpi_desc->bdws);
@@ -2518,7 +2612,7 @@
INIT_LIST_HEAD(&acpi_desc->flushes);
INIT_LIST_HEAD(&acpi_desc->memdevs);
INIT_LIST_HEAD(&acpi_desc->dimms);
- mutex_init(&acpi_desc->spa_map_mutex);
+ INIT_LIST_HEAD(&acpi_desc->list);
mutex_init(&acpi_desc->init_mutex);
INIT_WORK(&acpi_desc->work, acpi_nfit_scrub);
}
@@ -2532,7 +2626,7 @@
struct acpi_table_header *tbl;
acpi_status status = AE_OK;
acpi_size sz;
- int rc;
+ int rc = 0;
status = acpi_get_table_with_size(ACPI_SIG_NFIT, 0, &tbl, &sz);
if (ACPI_FAILURE(status)) {
@@ -2545,50 +2639,33 @@
if (!acpi_desc)
return -ENOMEM;
acpi_nfit_desc_init(acpi_desc, &adev->dev);
- acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
- if (!acpi_desc->nvdimm_bus)
- return -ENOMEM;
- /*
- * Save the acpi header for later and then skip it,
- * making nfit point to the first nfit table header.
- */
+ /* Save the acpi header for exporting the revision via sysfs */
acpi_desc->acpi_header = *tbl;
- acpi_desc->nfit = (void *) tbl + sizeof(struct acpi_table_nfit);
- sz -= sizeof(struct acpi_table_nfit);
/* Evaluate _FIT and override with that if present */
status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf);
if (ACPI_SUCCESS(status) && buf.length > 0) {
- union acpi_object *obj;
- /*
- * Adjust for the acpi_object header of the _FIT
- */
- obj = buf.pointer;
- if (obj->type == ACPI_TYPE_BUFFER) {
- acpi_desc->nfit =
- (struct acpi_nfit_header *)obj->buffer.pointer;
- sz = obj->buffer.length;
- } else
+ union acpi_object *obj = buf.pointer;
+
+ if (obj->type == ACPI_TYPE_BUFFER)
+ rc = acpi_nfit_init(acpi_desc, obj->buffer.pointer,
+ obj->buffer.length);
+ else
dev_dbg(dev, "%s invalid type %d, ignoring _FIT\n",
__func__, (int) obj->type);
- }
-
- rc = acpi_nfit_init(acpi_desc, sz);
- if (rc) {
- nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
- return rc;
- }
- return 0;
+ kfree(buf.pointer);
+ } else
+ /* skip over the lead-in header table */
+ rc = acpi_nfit_init(acpi_desc, (void *) tbl
+ + sizeof(struct acpi_table_nfit),
+ sz - sizeof(struct acpi_table_nfit));
+ return rc;
}
static int acpi_nfit_remove(struct acpi_device *adev)
{
- struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
-
- acpi_desc->cancel = 1;
- flush_workqueue(nfit_wq);
- nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+ /* see acpi_nfit_destruct */
return 0;
}
@@ -2596,9 +2673,8 @@
{
struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
- struct acpi_nfit_header *nfit_saved;
- union acpi_object *obj;
struct device *dev = &adev->dev;
+ union acpi_object *obj;
acpi_status status;
int ret;
@@ -2616,9 +2692,6 @@
if (!acpi_desc)
goto out_unlock;
acpi_nfit_desc_init(acpi_desc, &adev->dev);
- acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
- if (!acpi_desc->nvdimm_bus)
- goto out_unlock;
} else {
/*
* Finish previous registration before considering new
@@ -2634,21 +2707,14 @@
goto out_unlock;
}
- nfit_saved = acpi_desc->nfit;
obj = buf.pointer;
if (obj->type == ACPI_TYPE_BUFFER) {
- acpi_desc->nfit =
- (struct acpi_nfit_header *)obj->buffer.pointer;
- ret = acpi_nfit_init(acpi_desc, obj->buffer.length);
- if (ret) {
- /* Merge failed, restore old nfit, and exit */
- acpi_desc->nfit = nfit_saved;
+ ret = acpi_nfit_init(acpi_desc, obj->buffer.pointer,
+ obj->buffer.length);
+ if (ret)
dev_err(dev, "failed to merge updated NFIT\n");
- }
- } else {
- /* Bad _FIT, restore old nfit */
+ } else
dev_err(dev, "Invalid _FIT\n");
- }
kfree(buf.pointer);
out_unlock:
@@ -2693,18 +2759,23 @@
acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);
acpi_str_to_uuid(UUID_NFIT_DIMM_N_HPE1, nfit_uuid[NFIT_DEV_DIMM_N_HPE1]);
acpi_str_to_uuid(UUID_NFIT_DIMM_N_HPE2, nfit_uuid[NFIT_DEV_DIMM_N_HPE2]);
+ acpi_str_to_uuid(UUID_NFIT_DIMM_N_MSFT, nfit_uuid[NFIT_DEV_DIMM_N_MSFT]);
nfit_wq = create_singlethread_workqueue("nfit");
if (!nfit_wq)
return -ENOMEM;
+ nfit_mce_register();
+
return acpi_bus_register_driver(&acpi_nfit_driver);
}
static __exit void nfit_exit(void)
{
+ nfit_mce_unregister();
acpi_bus_unregister_driver(&acpi_nfit_driver);
destroy_workqueue(nfit_wq);
+ WARN_ON(!list_empty(&acpi_descs));
}
module_init(nfit_init);
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
new file mode 100644
index 0000000..4c745bf
--- /dev/null
+++ b/drivers/acpi/nfit/mce.c
@@ -0,0 +1,89 @@
+/*
+ * NFIT - Machine Check Handler
+ *
+ * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/notifier.h>
+#include <linux/acpi.h>
+#include <asm/mce.h>
+#include "nfit.h"
+
+static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ struct acpi_nfit_desc *acpi_desc;
+ struct nfit_spa *nfit_spa;
+
+ /* We only care about memory errors */
+ if (!(mce->status & MCACOD))
+ return NOTIFY_DONE;
+
+ /*
+ * mce->addr contains the physical addr accessed that caused the
+ * machine check. We need to walk through the list of NFITs, and see
+ * if any of them matches that address, and only then start a scrub.
+ */
+ mutex_lock(&acpi_desc_lock);
+ list_for_each_entry(acpi_desc, &acpi_descs, list) {
+ struct device *dev = acpi_desc->dev;
+ int found_match = 0;
+
+ mutex_lock(&acpi_desc->init_mutex);
+ list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+ struct acpi_nfit_system_address *spa = nfit_spa->spa;
+
+ if (nfit_spa_type(spa) == NFIT_SPA_PM)
+ continue;
+ /* find the spa that covers the mce addr */
+ if (spa->address > mce->addr)
+ continue;
+ if ((spa->address + spa->length - 1) < mce->addr)
+ continue;
+ found_match = 1;
+ dev_dbg(dev, "%s: addr in SPA %d (0x%llx, 0x%llx)\n",
+ __func__, spa->range_index, spa->address,
+ spa->length);
+ /*
+ * We can break at the first match because we're going
+ * to rescan all the SPA ranges. There shouldn't be any
+ * aliasing anyway.
+ */
+ break;
+ }
+ mutex_unlock(&acpi_desc->init_mutex);
+
+ /*
+ * We can ignore an -EBUSY here because if an ARS is already
+ * in progress, just let that be the last authoritative one
+ */
+ if (found_match)
+ acpi_nfit_ars_rescan(acpi_desc);
+ }
+
+ mutex_unlock(&acpi_desc_lock);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfit_mce_dec = {
+ .notifier_call = nfit_handle_mce,
+};
+
+void nfit_mce_register(void)
+{
+ mce_register_decode_chain(&nfit_mce_dec);
+}
+
+void nfit_mce_unregister(void)
+{
+ mce_unregister_decode_chain(&nfit_mce_dec);
+}
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit/nfit.h
similarity index 82%
rename from drivers/acpi/nfit.h
rename to drivers/acpi/nfit/nfit.h
index 02b9ea1..e894ded 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -16,6 +16,7 @@
#define __NFIT_H__
#include <linux/workqueue.h>
#include <linux/libnvdimm.h>
+#include <linux/ndctl.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/acpi.h>
@@ -31,6 +32,9 @@
#define UUID_NFIT_DIMM_N_HPE1 "9002c334-acf3-4c0e-9642-a235f0d53bc6"
#define UUID_NFIT_DIMM_N_HPE2 "5008664b-b758-41a0-a03c-27c2f2d04f7e"
+/* https://msdn.microsoft.com/library/windows/hardware/mt604741 */
+#define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05"
+
#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED)
@@ -40,6 +44,7 @@
NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1,
NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2,
+ NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT,
NFIT_SPA_VOLATILE,
NFIT_SPA_PM,
NFIT_SPA_DCR,
@@ -74,37 +79,37 @@
};
struct nfit_spa {
- struct acpi_nfit_system_address *spa;
struct list_head list;
struct nd_region *nd_region;
- unsigned int ars_done:1;
+ unsigned int ars_required:1;
u32 clear_err_unit;
u32 max_ars;
+ struct acpi_nfit_system_address spa[0];
};
struct nfit_dcr {
- struct acpi_nfit_control_region *dcr;
struct list_head list;
+ struct acpi_nfit_control_region dcr[0];
};
struct nfit_bdw {
- struct acpi_nfit_data_region *bdw;
struct list_head list;
+ struct acpi_nfit_data_region bdw[0];
};
struct nfit_idt {
- struct acpi_nfit_interleave *idt;
struct list_head list;
+ struct acpi_nfit_interleave idt[0];
};
struct nfit_flush {
- struct acpi_nfit_flush_address *flush;
struct list_head list;
+ struct acpi_nfit_flush_address flush[0];
};
struct nfit_memdev {
- struct acpi_nfit_memory_map *memdev;
struct list_head list;
+ struct acpi_nfit_memory_map memdev[0];
};
/* assembled tables for a given dimm/memory-device */
@@ -123,6 +128,7 @@
struct list_head list;
struct acpi_device *adev;
struct acpi_nfit_desc *acpi_desc;
+ struct resource *flush_wpq;
unsigned long dsm_mask;
int family;
};
@@ -130,10 +136,7 @@
struct acpi_nfit_desc {
struct nvdimm_bus_descriptor nd_desc;
struct acpi_table_header acpi_header;
- struct acpi_nfit_header *nfit;
- struct mutex spa_map_mutex;
struct mutex init_mutex;
- struct list_head spa_maps;
struct list_head memdevs;
struct list_head flushes;
struct list_head dimms;
@@ -146,6 +149,9 @@
struct nd_cmd_ars_status *ars_status;
size_t ars_status_size;
struct work_struct work;
+ struct list_head list;
+ struct kernfs_node *scrub_count_state;
+ unsigned int scrub_count;
unsigned int cancel:1;
unsigned long dimm_cmd_force_en;
unsigned long bus_cmd_force_en;
@@ -161,7 +167,7 @@
struct nd_blk_addr {
union {
void __iomem *base;
- void __pmem *aperture;
+ void *aperture;
};
};
@@ -180,28 +186,26 @@
u64 bdw_offset; /* post interleave offset */
u64 stat_offset;
u64 cmd_offset;
- void __iomem *nvdimm_flush;
u32 dimm_flags;
};
-enum spa_map_type {
- SPA_MAP_CONTROL,
- SPA_MAP_APERTURE,
-};
+extern struct list_head acpi_descs;
+extern struct mutex acpi_desc_lock;
+int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc);
-struct nfit_spa_mapping {
- struct acpi_nfit_desc *acpi_desc;
- struct acpi_nfit_system_address *spa;
- struct list_head list;
- struct kref kref;
- enum spa_map_type type;
- struct nd_blk_addr addr;
-};
-
-static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
+#ifdef CONFIG_X86_MCE
+void nfit_mce_register(void);
+void nfit_mce_unregister(void);
+#else
+static inline void nfit_mce_register(void)
{
- return container_of(kref, struct nfit_spa_mapping, kref);
}
+static inline void nfit_mce_unregister(void)
+{
+}
+#endif
+
+int nfit_spa_type(struct acpi_nfit_system_address *spa);
static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
struct nfit_mem *nfit_mem)
@@ -218,6 +222,6 @@
}
const u8 *to_nfit_uuid(enum nfit_uuids id);
-int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz);
+int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz);
void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev);
#endif /* __NFIT_H__ */
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index ba5145d..3022dad 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -379,7 +379,7 @@
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
@@ -389,7 +389,7 @@
page = brd_insert_page(brd, sector);
if (!page)
return -ENOSPC;
- *kaddr = (void __pmem *)page_address(page);
+ *kaddr = page_address(page);
*pfn = page_to_pfn_t(page);
return PAGE_SIZE;
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index b891a12..803f395 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -211,11 +211,9 @@
}
dax_dev->dev = dev;
- rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev);
- if (rc) {
- unregister_dax_dev(dev);
+ rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
+ if (rc)
return rc;
- }
return 0;
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 55d510e..dfb1685 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -102,21 +102,19 @@
if (rc)
return rc;
- rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
- if (rc) {
- dax_pmem_percpu_exit(&dax_pmem->ref);
+ rc = devm_add_action_or_reset(dev, dax_pmem_percpu_exit,
+ &dax_pmem->ref);
+ if (rc)
return rc;
- }
addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap);
if (IS_ERR(addr))
return PTR_ERR(addr);
- rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref);
- if (rc) {
- dax_pmem_percpu_kill(&dax_pmem->ref);
+ rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill,
+ &dax_pmem->ref);
+ if (rc)
return rc;
- }
nd_region = to_nd_region(dev->parent);
dax_region = alloc_dax_region(dev, nd_region->id, &res,
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 6d35dd4..4788b0b 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,7 +142,7 @@
}
static long linear_direct_access(struct dm_target *ti, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
struct linear_c *lc = ti->private;
struct block_device *bdev = lc->dev->bdev;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 731e1f5..ce2a910 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2303,7 +2303,7 @@
}
static long origin_direct_access(struct dm_target *ti, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
DMWARN("device does not support dax.");
return -EIO;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 01bb9cf..83f1d46 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -309,7 +309,7 @@
}
static long stripe_direct_access(struct dm_target *ti, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
struct stripe_c *sc = ti->private;
uint32_t stripe;
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 6eecd6b..710ae28 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -149,7 +149,7 @@
}
static long io_err_direct_access(struct dm_target *ti, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
return -EIO;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ceb69fc..25d1d97 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -906,7 +906,7 @@
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
struct mapped_device *md = bdev->bd_disk->private_data;
struct dm_table *map;
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 7c8a3bf..124c243 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -1,6 +1,7 @@
menuconfig LIBNVDIMM
tristate "NVDIMM (Non-Volatile Memory Device) Support"
depends on PHYS_ADDR_T_64BIT
+ depends on HAS_IOMEM
depends on BLK_DEV
help
Generic support for non-volatile memory devices including
@@ -19,7 +20,6 @@
config BLK_DEV_PMEM
tristate "PMEM: Persistent memory block device support"
default LIBNVDIMM
- depends on HAS_IOMEM
select ND_BTT if BTT
select ND_PFN if NVDIMM_PFN
help
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 7e262ef..9faaa96 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -267,10 +267,8 @@
q = blk_alloc_queue(GFP_KERNEL);
if (!q)
return -ENOMEM;
- if (devm_add_action(dev, nd_blk_release_queue, q)) {
- blk_cleanup_queue(q);
+ if (devm_add_action_or_reset(dev, nd_blk_release_queue, q))
return -ENOMEM;
- }
blk_queue_make_request(q, nd_blk_make_request);
blk_queue_max_hw_sectors(q, UINT_MAX);
@@ -282,10 +280,6 @@
disk = alloc_disk(0);
if (!disk)
return -ENOMEM;
- if (devm_add_action(dev, nd_blk_release_disk, disk)) {
- put_disk(disk);
- return -ENOMEM;
- }
disk->first_minor = 0;
disk->fops = &nd_blk_fops;
@@ -295,6 +289,9 @@
set_capacity(disk, 0);
device_add_disk(dev, disk);
+ if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
+ return -ENOMEM;
+
if (nsblk_meta_size(nsblk)) {
int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 816d0da..3fa7919 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -198,8 +198,7 @@
{
struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
- if (dev)
- __nd_device_register(dev);
+ __nd_device_register(dev);
return dev;
}
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5e4e5c7..458daf9 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -31,6 +31,7 @@
int nvdimm_major;
static int nvdimm_bus_major;
static struct class *nd_class;
+static DEFINE_IDA(nd_ida);
static int to_nd_device_type(struct device *dev)
{
@@ -60,20 +61,13 @@
to_nd_device_type(dev));
}
-static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
-{
- struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
-
- return !!test_bit(to_nd_device_type(dev), &nd_drv->type);
-}
-
static struct module *to_bus_provider(struct device *dev)
{
/* pin bus providers while regions are enabled */
if (is_nd_pmem(dev) || is_nd_blk(dev)) {
struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
- return nvdimm_bus->module;
+ return nvdimm_bus->nd_desc->module;
}
return NULL;
}
@@ -136,6 +130,21 @@
return rc;
}
+static void nvdimm_bus_shutdown(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nd_device_driver *nd_drv = NULL;
+
+ if (dev->driver)
+ nd_drv = to_nd_device_driver(dev->driver);
+
+ if (nd_drv && nd_drv->shutdown) {
+ nd_drv->shutdown(dev);
+ dev_dbg(&nvdimm_bus->dev, "%s.shutdown(%s)\n",
+ dev->driver->name, dev_name(dev));
+ }
+}
+
void nd_device_notify(struct device *dev, enum nvdimm_event event)
{
device_lock(dev);
@@ -208,14 +217,187 @@
}
EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv);
+
static struct bus_type nvdimm_bus_type = {
.name = "nd",
.uevent = nvdimm_bus_uevent,
.match = nvdimm_bus_match,
.probe = nvdimm_bus_probe,
.remove = nvdimm_bus_remove,
+ .shutdown = nvdimm_bus_shutdown,
};
+static void nvdimm_bus_release(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+ ida_simple_remove(&nd_ida, nvdimm_bus->id);
+ kfree(nvdimm_bus);
+}
+
+static bool is_nvdimm_bus(struct device *dev)
+{
+ return dev->release == nvdimm_bus_release;
+}
+
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+{
+ struct device *dev;
+
+ for (dev = nd_dev; dev; dev = dev->parent)
+ if (is_nvdimm_bus(dev))
+ break;
+ dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
+ if (dev)
+ return to_nvdimm_bus(dev);
+ return NULL;
+}
+
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus;
+
+ nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+ WARN_ON(!is_nvdimm_bus(dev));
+ return nvdimm_bus;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus);
+
+struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+ struct nvdimm_bus_descriptor *nd_desc)
+{
+ struct nvdimm_bus *nvdimm_bus;
+ int rc;
+
+ nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
+ if (!nvdimm_bus)
+ return NULL;
+ INIT_LIST_HEAD(&nvdimm_bus->list);
+ INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
+ INIT_LIST_HEAD(&nvdimm_bus->poison_list);
+ init_waitqueue_head(&nvdimm_bus->probe_wait);
+ nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+ mutex_init(&nvdimm_bus->reconfig_mutex);
+ if (nvdimm_bus->id < 0) {
+ kfree(nvdimm_bus);
+ return NULL;
+ }
+ nvdimm_bus->nd_desc = nd_desc;
+ nvdimm_bus->dev.parent = parent;
+ nvdimm_bus->dev.release = nvdimm_bus_release;
+ nvdimm_bus->dev.groups = nd_desc->attr_groups;
+ nvdimm_bus->dev.bus = &nvdimm_bus_type;
+ dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
+ rc = device_register(&nvdimm_bus->dev);
+ if (rc) {
+ dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
+ goto err;
+ }
+
+ return nvdimm_bus;
+ err:
+ put_device(&nvdimm_bus->dev);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_register);
+
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
+{
+ if (!nvdimm_bus)
+ return;
+ device_unregister(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
+
+static int child_unregister(struct device *dev, void *data)
+{
+ /*
+ * the singular ndctl class device per bus needs to be
+ * "device_destroy"ed, so skip it here
+ *
+ * i.e. remove classless children
+ */
+ if (dev->class)
+ /* pass */;
+ else
+ nd_device_unregister(dev, ND_SYNC);
+ return 0;
+}
+
+static void free_poison_list(struct list_head *poison_list)
+{
+ struct nd_poison *pl, *next;
+
+ list_for_each_entry_safe(pl, next, poison_list, list) {
+ list_del(&pl->list);
+ kfree(pl);
+ }
+ list_del_init(poison_list);
+}
+
+static int nd_bus_remove(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_del_init(&nvdimm_bus->list);
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ nd_synchronize();
+ device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ free_poison_list(&nvdimm_bus->poison_list);
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+
+ nvdimm_bus_destroy_ndctl(nvdimm_bus);
+
+ return 0;
+}
+
+static int nd_bus_probe(struct device *dev)
+{
+ struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+ int rc;
+
+ rc = nvdimm_bus_create_ndctl(nvdimm_bus);
+ if (rc)
+ return rc;
+
+ mutex_lock(&nvdimm_bus_list_mutex);
+ list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
+ mutex_unlock(&nvdimm_bus_list_mutex);
+
+ /* enable bus provider attributes to look up their local context */
+ dev_set_drvdata(dev, nvdimm_bus->nd_desc);
+
+ return 0;
+}
+
+static struct nd_device_driver nd_bus_driver = {
+ .probe = nd_bus_probe,
+ .remove = nd_bus_remove,
+ .drv = {
+ .name = "nd_bus",
+ .suppress_bind_attrs = true,
+ .bus = &nvdimm_bus_type,
+ .owner = THIS_MODULE,
+ .mod_name = KBUILD_MODNAME,
+ },
+};
+
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
+{
+ struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
+
+ if (is_nvdimm_bus(dev) && nd_drv == &nd_bus_driver)
+ return true;
+
+ return !!test_bit(to_nd_device_type(dev), &nd_drv->type);
+}
+
static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain);
void nd_synchronize(void)
@@ -395,12 +577,10 @@
dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus,
"ndctl%d", nvdimm_bus->id);
- if (IS_ERR(dev)) {
+ if (IS_ERR(dev))
dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n",
nvdimm_bus->id, PTR_ERR(dev));
- return PTR_ERR(dev);
- }
- return 0;
+ return PTR_ERR_OR_ZERO(dev);
}
void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
@@ -850,8 +1030,14 @@
goto err_class;
}
+ rc = driver_register(&nd_bus_driver.drv);
+ if (rc)
+ goto err_nd_bus;
+
return 0;
+ err_nd_bus:
+ class_destroy(nd_class);
err_class:
unregister_chrdev(nvdimm_major, "dimmctl");
err_dimm_chrdev:
@@ -864,8 +1050,10 @@
void nvdimm_bus_exit(void)
{
+ driver_unregister(&nd_bus_driver.drv);
class_destroy(nd_class);
unregister_chrdev(nvdimm_bus_major, "ndctl");
unregister_chrdev(nvdimm_major, "dimmctl");
bus_unregister(&nvdimm_bus_type);
+ ida_destroy(&nd_ida);
}
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 8b2e3c4..d5dc80c 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -240,7 +240,7 @@
return memcpy_from_pmem(buf, nsio->addr + offset, size);
} else {
memcpy_to_pmem(nsio->addr + offset, buf, size);
- wmb_pmem();
+ nvdimm_flush(to_nd_region(ndns->dev.parent));
}
return 0;
@@ -266,9 +266,8 @@
nsio->addr = devm_memremap(dev, res->start, resource_size(res),
ARCH_MEMREMAP_PMEM);
- if (IS_ERR(nsio->addr))
- return PTR_ERR(nsio->addr);
- return 0;
+
+ return PTR_ERR_OR_ZERO(nsio->addr);
}
EXPORT_SYMBOL_GPL(devm_nsio_enable);
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index be89764..715583f 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -20,12 +20,12 @@
#include <linux/ndctl.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/io.h>
#include "nd-core.h"
#include "nd.h"
LIST_HEAD(nvdimm_bus_list);
DEFINE_MUTEX(nvdimm_bus_list_mutex);
-static DEFINE_IDA(nd_ida);
void nvdimm_bus_lock(struct device *dev)
{
@@ -57,6 +57,127 @@
}
EXPORT_SYMBOL(is_nvdimm_bus_locked);
+struct nvdimm_map {
+ struct nvdimm_bus *nvdimm_bus;
+ struct list_head list;
+ resource_size_t offset;
+ unsigned long flags;
+ size_t size;
+ union {
+ void *mem;
+ void __iomem *iomem;
+ };
+ struct kref kref;
+};
+
+static struct nvdimm_map *find_nvdimm_map(struct device *dev,
+ resource_size_t offset)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nvdimm_map *nvdimm_map;
+
+ list_for_each_entry(nvdimm_map, &nvdimm_bus->mapping_list, list)
+ if (nvdimm_map->offset == offset)
+ return nvdimm_map;
+ return NULL;
+}
+
+static struct nvdimm_map *alloc_nvdimm_map(struct device *dev,
+ resource_size_t offset, size_t size, unsigned long flags)
+{
+ struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+ struct nvdimm_map *nvdimm_map;
+
+ nvdimm_map = kzalloc(sizeof(*nvdimm_map), GFP_KERNEL);
+ if (!nvdimm_map)
+ return NULL;
+
+ INIT_LIST_HEAD(&nvdimm_map->list);
+ nvdimm_map->nvdimm_bus = nvdimm_bus;
+ nvdimm_map->offset = offset;
+ nvdimm_map->flags = flags;
+ nvdimm_map->size = size;
+ kref_init(&nvdimm_map->kref);
+
+ if (!request_mem_region(offset, size, dev_name(&nvdimm_bus->dev)))
+ goto err_request_region;
+
+ if (flags)
+ nvdimm_map->mem = memremap(offset, size, flags);
+ else
+ nvdimm_map->iomem = ioremap(offset, size);
+
+ if (!nvdimm_map->mem)
+ goto err_map;
+
+ dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev), "%s: bus unlocked!",
+ __func__);
+ list_add(&nvdimm_map->list, &nvdimm_bus->mapping_list);
+
+ return nvdimm_map;
+
+ err_map:
+ release_mem_region(offset, size);
+ err_request_region:
+ kfree(nvdimm_map);
+ return NULL;
+}
+
+static void nvdimm_map_release(struct kref *kref)
+{
+ struct nvdimm_bus *nvdimm_bus;
+ struct nvdimm_map *nvdimm_map;
+
+ nvdimm_map = container_of(kref, struct nvdimm_map, kref);
+ nvdimm_bus = nvdimm_map->nvdimm_bus;
+
+ dev_dbg(&nvdimm_bus->dev, "%s: %pa\n", __func__, &nvdimm_map->offset);
+ list_del(&nvdimm_map->list);
+ if (nvdimm_map->flags)
+ memunmap(nvdimm_map->mem);
+ else
+ iounmap(nvdimm_map->iomem);
+ release_mem_region(nvdimm_map->offset, nvdimm_map->size);
+ kfree(nvdimm_map);
+}
+
+static void nvdimm_map_put(void *data)
+{
+ struct nvdimm_map *nvdimm_map = data;
+ struct nvdimm_bus *nvdimm_bus = nvdimm_map->nvdimm_bus;
+
+ nvdimm_bus_lock(&nvdimm_bus->dev);
+ kref_put(&nvdimm_map->kref, nvdimm_map_release);
+ nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+/**
+ * devm_nvdimm_memremap - map a resource that is shared across regions
+ * @dev: device that will own a reference to the shared mapping
+ * @offset: physical base address of the mapping
+ * @size: mapping size
+ * @flags: memremap flags, or, if zero, perform an ioremap instead
+ */
+void *devm_nvdimm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ struct nvdimm_map *nvdimm_map;
+
+ nvdimm_bus_lock(dev);
+ nvdimm_map = find_nvdimm_map(dev, offset);
+ if (!nvdimm_map)
+ nvdimm_map = alloc_nvdimm_map(dev, offset, size, flags);
+ else
+ kref_get(&nvdimm_map->kref);
+ nvdimm_bus_unlock(dev);
+
+ if (devm_add_action_or_reset(dev, nvdimm_map_put, nvdimm_map))
+ return NULL;
+
+ return nvdimm_map->mem;
+}
+EXPORT_SYMBOL_GPL(devm_nvdimm_memremap);
+
u64 nd_fletcher64(void *addr, size_t len, bool le)
{
u32 *buf = addr;
@@ -73,25 +194,6 @@
}
EXPORT_SYMBOL_GPL(nd_fletcher64);
-static void nvdimm_bus_release(struct device *dev)
-{
- struct nvdimm_bus *nvdimm_bus;
-
- nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
- ida_simple_remove(&nd_ida, nvdimm_bus->id);
- kfree(nvdimm_bus);
-}
-
-struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
-{
- struct nvdimm_bus *nvdimm_bus;
-
- nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
- WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release);
- return nvdimm_bus;
-}
-EXPORT_SYMBOL_GPL(to_nvdimm_bus);
-
struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
{
/* struct nvdimm_bus definition is private to libnvdimm */
@@ -99,18 +201,12 @@
}
EXPORT_SYMBOL_GPL(to_nd_desc);
-struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus)
{
- struct device *dev;
-
- for (dev = nd_dev; dev; dev = dev->parent)
- if (dev->release == nvdimm_bus_release)
- break;
- dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
- if (dev)
- return to_nvdimm_bus(dev);
- return NULL;
+ /* struct nvdimm_bus definition is private to libnvdimm */
+ return &nvdimm_bus->dev;
}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus_dev);
static bool is_uuid_sep(char sep)
{
@@ -325,51 +421,6 @@
};
EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
-struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
- struct nvdimm_bus_descriptor *nd_desc, struct module *module)
-{
- struct nvdimm_bus *nvdimm_bus;
- int rc;
-
- nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
- if (!nvdimm_bus)
- return NULL;
- INIT_LIST_HEAD(&nvdimm_bus->list);
- INIT_LIST_HEAD(&nvdimm_bus->poison_list);
- init_waitqueue_head(&nvdimm_bus->probe_wait);
- nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
- mutex_init(&nvdimm_bus->reconfig_mutex);
- if (nvdimm_bus->id < 0) {
- kfree(nvdimm_bus);
- return NULL;
- }
- nvdimm_bus->nd_desc = nd_desc;
- nvdimm_bus->module = module;
- nvdimm_bus->dev.parent = parent;
- nvdimm_bus->dev.release = nvdimm_bus_release;
- nvdimm_bus->dev.groups = nd_desc->attr_groups;
- dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
- rc = device_register(&nvdimm_bus->dev);
- if (rc) {
- dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
- goto err;
- }
-
- rc = nvdimm_bus_create_ndctl(nvdimm_bus);
- if (rc)
- goto err;
-
- mutex_lock(&nvdimm_bus_list_mutex);
- list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
- mutex_unlock(&nvdimm_bus_list_mutex);
-
- return nvdimm_bus;
- err:
- put_device(&nvdimm_bus->dev);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
-
static void set_badblock(struct badblocks *bb, sector_t s, int num)
{
dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
@@ -545,54 +596,6 @@
}
EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
-static void free_poison_list(struct list_head *poison_list)
-{
- struct nd_poison *pl, *next;
-
- list_for_each_entry_safe(pl, next, poison_list, list) {
- list_del(&pl->list);
- kfree(pl);
- }
- list_del_init(poison_list);
-}
-
-static int child_unregister(struct device *dev, void *data)
-{
- /*
- * the singular ndctl class device per bus needs to be
- * "device_destroy"ed, so skip it here
- *
- * i.e. remove classless children
- */
- if (dev->class)
- /* pass */;
- else
- nd_device_unregister(dev, ND_SYNC);
- return 0;
-}
-
-void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
-{
- if (!nvdimm_bus)
- return;
-
- mutex_lock(&nvdimm_bus_list_mutex);
- list_del_init(&nvdimm_bus->list);
- mutex_unlock(&nvdimm_bus_list_mutex);
-
- nd_synchronize();
- device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
-
- nvdimm_bus_lock(&nvdimm_bus->dev);
- free_poison_list(&nvdimm_bus->poison_list);
- nvdimm_bus_unlock(&nvdimm_bus->dev);
-
- nvdimm_bus_destroy_ndctl(nvdimm_bus);
-
- device_unregister(&nvdimm_bus->dev);
-}
-EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
-
#ifdef CONFIG_BLK_DEV_INTEGRITY
int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
{
@@ -601,7 +604,8 @@
if (meta_size == 0)
return 0;
- bi.profile = NULL;
+ memset(&bi, 0, sizeof(bi));
+
bi.tuple_size = meta_size;
bi.tag_size = meta_size;
@@ -650,7 +654,6 @@
nvdimm_bus_exit();
nd_region_devs_exit();
nvdimm_devs_exit();
- ida_destroy(&nd_ida);
}
MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index bbde28d..d9bba5e 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -346,7 +346,8 @@
struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
const struct attribute_group **groups, unsigned long flags,
- unsigned long cmd_mask)
+ unsigned long cmd_mask, int num_flush,
+ struct resource *flush_wpq)
{
struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
struct device *dev;
@@ -362,6 +363,8 @@
nvdimm->provider_data = provider_data;
nvdimm->flags = flags;
nvdimm->cmd_mask = cmd_mask;
+ nvdimm->num_flush = num_flush;
+ nvdimm->flush_wpq = flush_wpq;
atomic_set(&nvdimm->busy, 0);
dev = &nvdimm->dev;
dev_set_name(dev, "nmem%d", nvdimm->id);
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 95825b3..11ea901 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -47,6 +47,7 @@
nd_desc.attr_groups = e820_pmem_attribute_groups;
nd_desc.provider_name = "e820";
+ nd_desc.module = THIS_MODULE;
nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
if (!nvdimm_bus)
goto err;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 284cdaa..38ce6bb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -26,11 +26,11 @@
struct nvdimm_bus {
struct nvdimm_bus_descriptor *nd_desc;
wait_queue_head_t probe_wait;
- struct module *module;
struct list_head list;
struct device dev;
int id, probe_active;
struct list_head poison_list;
+ struct list_head mapping_list;
struct mutex reconfig_mutex;
};
@@ -40,7 +40,8 @@
unsigned long cmd_mask;
struct device dev;
atomic_t busy;
- int id;
+ int id, num_flush;
+ struct resource *flush_wpq;
};
bool is_nvdimm(struct device *dev);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d0ac93c..4047639 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -49,9 +49,11 @@
struct kref kref;
};
-struct nd_region_namespaces {
- int count;
- int active;
+struct nd_region_data {
+ int ns_count;
+ int ns_active;
+ unsigned int flush_mask;
+ void __iomem *flush_wpq[0][0];
};
static inline struct nd_namespace_index *to_namespace_index(
@@ -119,7 +121,6 @@
struct nd_blk_region {
int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
- void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
void *iobuf, u64 len, int rw);
void *blk_provider_data;
@@ -325,6 +326,7 @@
}
#endif
int nd_blk_region_init(struct nd_region *nd_region);
+int nd_region_activate(struct nd_region *nd_region);
void __nd_iostat_start(struct bio *bio, unsigned long *start);
static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
{
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 36cb390..b511099 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -29,27 +29,28 @@
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
+#include "pmem.h"
#include "pfn.h"
#include "nd.h"
-struct pmem_device {
- /* One contiguous memory region per device */
- phys_addr_t phys_addr;
- /* when non-zero this device is hosting a 'pfn' instance */
- phys_addr_t data_offset;
- u64 pfn_flags;
- void __pmem *virt_addr;
- /* immutable base size of the namespace */
- size_t size;
- /* trim size when namespace capacity has been section aligned */
- u32 pfn_pad;
- struct badblocks bb;
-};
+static struct device *to_dev(struct pmem_device *pmem)
+{
+ /*
+ * nvdimm bus services need a 'dev' parameter, and we record the device
+ * at init in bb.dev.
+ */
+ return pmem->bb.dev;
+}
+
+static struct nd_region *to_region(struct pmem_device *pmem)
+{
+ return to_nd_region(to_dev(pmem)->parent);
+}
static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
unsigned int len)
{
- struct device *dev = pmem->bb.dev;
+ struct device *dev = to_dev(pmem);
sector_t sector;
long cleared;
@@ -57,7 +58,7 @@
cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
if (cleared > 0 && cleared / 512) {
- dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
+ dev_dbg(dev, "%s: %#llx clear %ld sector%s\n",
__func__, (unsigned long long) sector,
cleared / 512, cleared / 512 > 1 ? "s" : "");
badblocks_clear(&pmem->bb, sector, cleared / 512);
@@ -73,7 +74,7 @@
bool bad_pmem = false;
void *mem = kmap_atomic(page);
phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
- void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
+ void *pmem_addr = pmem->virt_addr + pmem_off;
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
bad_pmem = true;
@@ -112,6 +113,11 @@
return rc;
}
+/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
+#ifndef REQ_FLUSH
+#define REQ_FLUSH REQ_PREFLUSH
+#endif
+
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
int rc = 0;
@@ -120,6 +126,10 @@
struct bio_vec bvec;
struct bvec_iter iter;
struct pmem_device *pmem = q->queuedata;
+ struct nd_region *nd_region = to_region(pmem);
+
+ if (bio->bi_rw & REQ_FLUSH)
+ nvdimm_flush(nd_region);
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
@@ -134,8 +144,8 @@
if (do_acct)
nd_iostat_end(bio, start);
- if (bio_data_dir(bio))
- wmb_pmem();
+ if (bio->bi_rw & REQ_FUA)
+ nvdimm_flush(nd_region);
bio_endio(bio);
return BLK_QC_T_NONE;
@@ -148,8 +158,6 @@
int rc;
rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
- if (rw & WRITE)
- wmb_pmem();
/*
* The ->rw_page interface is subtle and tricky. The core
@@ -163,8 +171,9 @@
return rc;
}
-static long pmem_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
+__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, pfn_t *pfn, long size)
{
struct pmem_device *pmem = bdev->bd_queue->queuedata;
resource_size_t offset = sector * 512 + pmem->data_offset;
@@ -195,7 +204,7 @@
blk_cleanup_queue(q);
}
-void pmem_release_disk(void *disk)
+static void pmem_release_disk(void *disk)
{
del_gendisk(disk);
put_disk(disk);
@@ -205,6 +214,7 @@
struct nd_namespace_common *ndns)
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+ struct nd_region *nd_region = to_nd_region(dev->parent);
struct vmem_altmap __altmap, *altmap = NULL;
struct resource *res = &nsio->res;
struct nd_pfn *nd_pfn = NULL;
@@ -234,7 +244,7 @@
dev_set_drvdata(dev, pmem);
pmem->phys_addr = res->start;
pmem->size = resource_size(res);
- if (!arch_has_wmb_pmem())
+ if (nvdimm_has_flush(nd_region) < 0)
dev_warn(dev, "unable to guarantee persistence of writes\n");
if (!devm_request_mem_region(dev, res->start, resource_size(res),
@@ -269,15 +279,14 @@
* At release time the queue must be dead before
* devm_memremap_pages is unwound
*/
- if (devm_add_action(dev, pmem_release_queue, q)) {
- blk_cleanup_queue(q);
+ if (devm_add_action_or_reset(dev, pmem_release_queue, q))
return -ENOMEM;
- }
if (IS_ERR(addr))
return PTR_ERR(addr);
- pmem->virt_addr = (void __pmem *) addr;
+ pmem->virt_addr = addr;
+ blk_queue_write_cache(q, true, true);
blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_max_hw_sectors(q, UINT_MAX);
@@ -289,10 +298,6 @@
disk = alloc_disk_node(0, nid);
if (!disk)
return -ENOMEM;
- if (devm_add_action(dev, pmem_release_disk, disk)) {
- put_disk(disk);
- return -ENOMEM;
- }
disk->fops = &pmem_fops;
disk->queue = q;
@@ -302,9 +307,13 @@
/ 512);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
- nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
+ nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
disk->bb = &pmem->bb;
device_add_disk(dev, disk);
+
+ if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
+ return -ENOMEM;
+
revalidate_disk(disk);
return 0;
@@ -340,13 +349,20 @@
{
if (is_nd_btt(dev))
nvdimm_namespace_detach_btt(to_nd_btt(dev));
+ nvdimm_flush(to_nd_region(dev->parent));
+
return 0;
}
+static void nd_pmem_shutdown(struct device *dev)
+{
+ nvdimm_flush(to_nd_region(dev->parent));
+}
+
static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
- struct nd_region *nd_region = to_nd_region(dev->parent);
struct pmem_device *pmem = dev_get_drvdata(dev);
+ struct nd_region *nd_region = to_region(pmem);
resource_size_t offset = 0, end_trunc = 0;
struct nd_namespace_common *ndns;
struct nd_namespace_io *nsio;
@@ -382,6 +398,7 @@
.probe = nd_pmem_probe,
.remove = nd_pmem_remove,
.notify = nd_pmem_notify,
+ .shutdown = nd_pmem_shutdown,
.drv = {
.name = "nd_pmem",
},
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
new file mode 100644
index 0000000..b4ee4f71
--- /dev/null
+++ b/drivers/nvdimm/pmem.h
@@ -0,0 +1,24 @@
+#ifndef __NVDIMM_PMEM_H__
+#define __NVDIMM_PMEM_H__
+#include <linux/badblocks.h>
+#include <linux/types.h>
+#include <linux/pfn_t.h>
+#include <linux/fs.h>
+
+long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, pfn_t *pfn, long size);
+/* this definition is in it's own header for tools/testing/nvdimm to consume */
+struct pmem_device {
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ /* when non-zero this device is hosting a 'pfn' instance */
+ phys_addr_t data_offset;
+ u64 pfn_flags;
+ void *virt_addr;
+ /* immutable base size of the namespace */
+ size_t size;
+ /* trim size when namespace capacity has been section aligned */
+ u32 pfn_pad;
+ struct badblocks bb;
+};
+#endif /* __NVDIMM_PMEM_H__ */
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 05a9123..8f24177 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -20,7 +20,7 @@
{
int err, rc;
static unsigned long once;
- struct nd_region_namespaces *num_ns;
+ struct nd_region_data *ndrd;
struct nd_region *nd_region = to_nd_region(dev);
if (nd_region->num_lanes > num_online_cpus()
@@ -33,21 +33,21 @@
nd_region->num_lanes);
}
+ rc = nd_region_activate(nd_region);
+ if (rc)
+ return rc;
+
rc = nd_blk_region_init(nd_region);
if (rc)
return rc;
rc = nd_region_register_namespaces(nd_region, &err);
- num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
- if (!num_ns)
- return -ENOMEM;
-
if (rc < 0)
return rc;
- num_ns->active = rc;
- num_ns->count = rc + err;
- dev_set_drvdata(dev, num_ns);
+ ndrd = dev_get_drvdata(dev);
+ ndrd->ns_active = rc;
+ ndrd->ns_count = rc + err;
if (rc && err && rc == err)
return -ENODEV;
@@ -82,6 +82,8 @@
{
struct nd_region *nd_region = to_nd_region(dev);
+ device_for_each_child(dev, NULL, child_unregister);
+
/* flush attribute readers and disable */
nvdimm_bus_lock(dev);
nd_region->ns_seed = NULL;
@@ -91,7 +93,6 @@
dev_set_drvdata(dev, NULL);
nvdimm_bus_unlock(dev);
- device_for_each_child(dev, NULL, child_unregister);
return 0;
}
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 40fcfea..e8d5ba7 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -14,13 +14,97 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/pmem.h>
#include <linux/sort.h>
#include <linux/io.h>
#include <linux/nd.h>
#include "nd-core.h"
#include "nd.h"
+/*
+ * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
+ * irrelevant.
+ */
+#include <linux/io-64-nonatomic-hi-lo.h>
+
static DEFINE_IDA(region_ida);
+static DEFINE_PER_CPU(int, flush_idx);
+
+static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm,
+ struct nd_region_data *ndrd)
+{
+ int i, j;
+
+ dev_dbg(dev, "%s: map %d flush address%s\n", nvdimm_name(nvdimm),
+ nvdimm->num_flush, nvdimm->num_flush == 1 ? "" : "es");
+ for (i = 0; i < nvdimm->num_flush; i++) {
+ struct resource *res = &nvdimm->flush_wpq[i];
+ unsigned long pfn = PHYS_PFN(res->start);
+ void __iomem *flush_page;
+
+ /* check if flush hints share a page */
+ for (j = 0; j < i; j++) {
+ struct resource *res_j = &nvdimm->flush_wpq[j];
+ unsigned long pfn_j = PHYS_PFN(res_j->start);
+
+ if (pfn == pfn_j)
+ break;
+ }
+
+ if (j < i)
+ flush_page = (void __iomem *) ((unsigned long)
+ ndrd->flush_wpq[dimm][j] & PAGE_MASK);
+ else
+ flush_page = devm_nvdimm_ioremap(dev,
+ PHYS_PFN(pfn), PAGE_SIZE);
+ if (!flush_page)
+ return -ENXIO;
+ ndrd->flush_wpq[dimm][i] = flush_page
+ + (res->start & ~PAGE_MASK);
+ }
+
+ return 0;
+}
+
+int nd_region_activate(struct nd_region *nd_region)
+{
+ int i, num_flush = 0;
+ struct nd_region_data *ndrd;
+ struct device *dev = &nd_region->dev;
+ size_t flush_data_size = sizeof(void *);
+
+ nvdimm_bus_lock(&nd_region->dev);
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+ /* at least one null hint slot per-dimm for the "no-hint" case */
+ flush_data_size += sizeof(void *);
+ num_flush = min_not_zero(num_flush, nvdimm->num_flush);
+ if (!nvdimm->num_flush)
+ continue;
+ flush_data_size += nvdimm->num_flush * sizeof(void *);
+ }
+ nvdimm_bus_unlock(&nd_region->dev);
+
+ ndrd = devm_kzalloc(dev, sizeof(*ndrd) + flush_data_size, GFP_KERNEL);
+ if (!ndrd)
+ return -ENOMEM;
+ dev_set_drvdata(dev, ndrd);
+
+ ndrd->flush_mask = (1 << ilog2(num_flush)) - 1;
+ for (i = 0; i < nd_region->ndr_mappings; i++) {
+ struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+ struct nvdimm *nvdimm = nd_mapping->nvdimm;
+ int rc = nvdimm_map_flush(&nd_region->dev, nvdimm, i, ndrd);
+
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
static void nd_region_release(struct device *dev)
{
@@ -242,12 +326,12 @@
static ssize_t init_namespaces_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct nd_region_namespaces *num_ns = dev_get_drvdata(dev);
+ struct nd_region_data *ndrd = dev_get_drvdata(dev);
ssize_t rc;
nvdimm_bus_lock(dev);
- if (num_ns)
- rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count);
+ if (ndrd)
+ rc = sprintf(buf, "%d/%d\n", ndrd->ns_active, ndrd->ns_count);
else
rc = -ENXIO;
nvdimm_bus_unlock(dev);
@@ -433,8 +517,6 @@
if (is_nd_pmem(dev))
return;
-
- to_nd_blk_region(dev)->disable(nvdimm_bus, dev);
}
if (dev->parent && is_nd_blk(dev->parent) && probe) {
nd_region = to_nd_region(dev->parent);
@@ -698,7 +780,6 @@
if (ndbr) {
nd_region = &ndbr->nd_region;
ndbr->enable = ndbr_desc->enable;
- ndbr->disable = ndbr_desc->disable;
ndbr->do_io = ndbr_desc->do_io;
}
region_buf = ndbr;
@@ -794,6 +875,67 @@
}
EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
+/**
+ * nvdimm_flush - flush any posted write queues between the cpu and pmem media
+ * @nd_region: blk or interleaved pmem region
+ */
+void nvdimm_flush(struct nd_region *nd_region)
+{
+ struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
+ int i, idx;
+
+ /*
+ * Try to encourage some diversity in flush hint addresses
+ * across cpus assuming a limited number of flush hints.
+ */
+ idx = this_cpu_read(flush_idx);
+ idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
+
+ /*
+ * The first wmb() is needed to 'sfence' all previous writes
+ * such that they are architecturally visible for the platform
+ * buffer flush. Note that we've already arranged for pmem
+ * writes to avoid the cache via arch_memcpy_to_pmem(). The
+ * final wmb() ensures ordering for the NVDIMM flush write.
+ */
+ wmb();
+ for (i = 0; i < nd_region->ndr_mappings; i++)
+ if (ndrd->flush_wpq[i][0])
+ writeq(1, ndrd->flush_wpq[i][idx & ndrd->flush_mask]);
+ wmb();
+}
+EXPORT_SYMBOL_GPL(nvdimm_flush);
+
+/**
+ * nvdimm_has_flush - determine write flushing requirements
+ * @nd_region: blk or interleaved pmem region
+ *
+ * Returns 1 if writes require flushing
+ * Returns 0 if writes do not require flushing
+ * Returns -ENXIO if flushing capability can not be determined
+ */
+int nvdimm_has_flush(struct nd_region *nd_region)
+{
+ struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
+ int i;
+
+ /* no nvdimm == flushing capability unknown */
+ if (nd_region->ndr_mappings == 0)
+ return -ENXIO;
+
+ for (i = 0; i < nd_region->ndr_mappings; i++)
+ /* flush hints present, flushing required */
+ if (ndrd->flush_wpq[i][0])
+ return 1;
+
+ /*
+ * The platform defines dimm devices without hints, assume
+ * platform persistence mechanism like ADR
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_has_flush);
+
void __exit nd_region_devs_exit(void)
{
ida_destroy(®ion_ida);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index fac1b51..9d66b4f 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -31,7 +31,7 @@
static blk_qc_t dcssblk_make_request(struct request_queue *q,
struct bio *bio);
static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
- void __pmem **kaddr, pfn_t *pfn, long size);
+ void **kaddr, pfn_t *pfn, long size);
static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
@@ -884,7 +884,7 @@
static long
dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
struct dcssblk_dev_info *dev_info;
unsigned long offset, dev_sz;
@@ -894,7 +894,7 @@
return -ENODEV;
dev_sz = dev_info->end - dev_info->start;
offset = secnum * 512;
- *kaddr = (void __pmem *) (dev_info->start + offset);
+ *kaddr = (void *) dev_info->start + offset;
*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV);
return dev_sz - offset;