Merge branch 'for-4.3/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"On top of the 4.3 core block IO changes, here are the driver related
changes for 4.3. Basically just NVMe and nbd this time around:
- NVMe:
- PRACT PI improvement from Alok Pandey.
- Cleanups and improvements on submission queue doorbell and
writing, using CMB if available. From Jon Derrick.
- From Keith, support for setting queue maximum segments, and
reset support.
- Also from Jon, fixup of u64 division issue on 32-bit archs and
wiring up of the reset support through and ioctl.
- Two small cleanups from Matias and Sunad
- Various code cleanups and fixes from Markus Pargmann"
* 'for-4.3/drivers' of git://git.kernel.dk/linux-block:
NVMe: Using PRACT bit to generate and verify PI by controller
NVMe:Remove unreachable code in nvme_abort_req
NVMe: Add nvme subsystem reset IOCTL
NVMe: Add nvme subsystem reset support
NVMe: removed unused nn var from nvme_dev_add
NVMe: Set queue max segments
nbd: flags is a u32 variable
nbd: Rename functions for clearness of recv/send path
nbd: Change 'disconnect' to be boolean
nbd: Add debugfs entries
nbd: Remove variable 'pid'
nbd: Move clear queue debug message
nbd: Remove 'harderror' and propagate error properly
nbd: restructure sock_shutdown
nbd: sock_shutdown, remove conditional lock
nbd: Fix timeout detection
nvme: Fixes u64 division which breaks i386 builds
NVMe: Use CMB for the IO SQes if available
NVMe: Unify SQ entry writing and doorbell ringing
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index f169faf..293495a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -33,6 +33,7 @@
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
+#include <linux/debugfs.h>
#include <asm/uaccess.h>
#include <asm/types.h>
@@ -40,8 +41,7 @@
#include <linux/nbd.h>
struct nbd_device {
- int flags;
- int harderror; /* Code of hard error */
+ u32 flags;
struct socket * sock; /* If == NULL, device is not ready, yet */
int magic;
@@ -56,11 +56,24 @@
struct gendisk *disk;
int blksize;
loff_t bytesize;
- pid_t pid; /* pid of nbd-client, if attached */
int xmit_timeout;
- int disconnect; /* a disconnect has been requested by user */
+ bool disconnect; /* a disconnect has been requested by user */
+
+ struct timer_list timeout_timer;
+ struct task_struct *task_recv;
+ struct task_struct *task_send;
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+ struct dentry *dbg_dir;
+#endif
};
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+static struct dentry *nbd_dbg_dir;
+#endif
+
+#define nbd_name(nbd) ((nbd)->disk->disk_name)
+
#define NBD_MAGIC 0x68797548
static unsigned int nbds_max = 16;
@@ -113,26 +126,36 @@
/*
* Forcibly shutdown the socket causing all listeners to error
*/
-static void sock_shutdown(struct nbd_device *nbd, int lock)
+static void sock_shutdown(struct nbd_device *nbd)
{
- if (lock)
- mutex_lock(&nbd->tx_lock);
- if (nbd->sock) {
- dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
- kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
- nbd->sock = NULL;
- }
- if (lock)
- mutex_unlock(&nbd->tx_lock);
+ if (!nbd->sock)
+ return;
+
+ dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+ kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+ nbd->sock = NULL;
+ del_timer_sync(&nbd->timeout_timer);
}
static void nbd_xmit_timeout(unsigned long arg)
{
- struct task_struct *task = (struct task_struct *)arg;
+ struct nbd_device *nbd = (struct nbd_device *)arg;
+ struct task_struct *task;
- printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
- task->comm, task->pid);
- force_sig(SIGKILL, task);
+ if (list_empty(&nbd->queue_head))
+ return;
+
+ nbd->disconnect = true;
+
+ task = READ_ONCE(nbd->task_recv);
+ if (task)
+ force_sig(SIGKILL, task);
+
+ task = READ_ONCE(nbd->task_send);
+ if (task)
+ force_sig(SIGKILL, nbd->task_send);
+
+ dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
}
/*
@@ -171,33 +194,12 @@
msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
- if (send) {
- struct timer_list ti;
-
- if (nbd->xmit_timeout) {
- init_timer(&ti);
- ti.function = nbd_xmit_timeout;
- ti.data = (unsigned long)current;
- ti.expires = jiffies + nbd->xmit_timeout;
- add_timer(&ti);
- }
+ if (send)
result = kernel_sendmsg(sock, &msg, &iov, 1, size);
- if (nbd->xmit_timeout)
- del_timer_sync(&ti);
- } else
+ else
result = kernel_recvmsg(sock, &msg, &iov, 1, size,
msg.msg_flags);
- if (signal_pending(current)) {
- siginfo_t info;
- printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
- task_pid_nr(current), current->comm,
- dequeue_signal_lock(current, ¤t->blocked, &info));
- result = -EINTR;
- sock_shutdown(nbd, !send);
- break;
- }
-
if (result <= 0) {
if (result == 0)
result = -EPIPE; /* short read */
@@ -210,6 +212,9 @@
sigprocmask(SIG_SETMASK, &oldset, NULL);
tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ if (!send && nbd->xmit_timeout)
+ mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
+
return result;
}
@@ -333,26 +338,24 @@
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk),
"Receive control failed (result %d)\n", result);
- goto harderror;
+ return ERR_PTR(result);
}
if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
(unsigned long)ntohl(reply.magic));
- result = -EPROTO;
- goto harderror;
+ return ERR_PTR(-EPROTO);
}
req = nbd_find_request(nbd, *(struct request **)reply.handle);
if (IS_ERR(req)) {
result = PTR_ERR(req);
if (result != -ENOENT)
- goto harderror;
+ return ERR_PTR(result);
dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
reply.handle);
- result = -EBADR;
- goto harderror;
+ return ERR_PTR(-EBADR);
}
if (ntohl(reply.error)) {
@@ -380,18 +383,15 @@
}
}
return req;
-harderror:
- nbd->harderror = result;
- return NULL;
}
static ssize_t pid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
+ struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
- return sprintf(buf, "%ld\n",
- (long) ((struct nbd_device *)disk->private_data)->pid);
+ return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}
static struct device_attribute pid_attr = {
@@ -399,7 +399,7 @@
.show = pid_show,
};
-static int nbd_do_it(struct nbd_device *nbd)
+static int nbd_thread_recv(struct nbd_device *nbd)
{
struct request *req;
int ret;
@@ -407,20 +407,43 @@
BUG_ON(nbd->magic != NBD_MAGIC);
sk_set_memalloc(nbd->sock->sk);
- nbd->pid = task_pid_nr(current);
+
+ nbd->task_recv = current;
+
ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (ret) {
dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
- nbd->pid = 0;
+ nbd->task_recv = NULL;
return ret;
}
- while ((req = nbd_read_stat(nbd)) != NULL)
+ while (1) {
+ req = nbd_read_stat(nbd);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
nbd_end_request(nbd, req);
+ }
device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
- nbd->pid = 0;
- return 0;
+
+ nbd->task_recv = NULL;
+
+ if (signal_pending(current)) {
+ siginfo_t info;
+
+ ret = dequeue_signal_lock(current, ¤t->blocked, &info);
+ dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
+ task_pid_nr(current), current->comm, ret);
+ mutex_lock(&nbd->tx_lock);
+ sock_shutdown(nbd);
+ mutex_unlock(&nbd->tx_lock);
+ ret = -ETIMEDOUT;
+ }
+
+ return ret;
}
static void nbd_clear_que(struct nbd_device *nbd)
@@ -455,6 +478,7 @@
req->errors++;
nbd_end_request(nbd, req);
}
+ dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
@@ -482,6 +506,9 @@
nbd->active_req = req;
+ if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
+ mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
+
if (nbd_send_req(nbd, req) != 0) {
dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
req->errors++;
@@ -503,11 +530,13 @@
nbd_end_request(nbd, req);
}
-static int nbd_thread(void *data)
+static int nbd_thread_send(void *data)
{
struct nbd_device *nbd = data;
struct request *req;
+ nbd->task_send = current;
+
set_user_nice(current, MIN_NICE);
while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
/* wait for something to do */
@@ -515,6 +544,20 @@
kthread_should_stop() ||
!list_empty(&nbd->waiting_queue));
+ if (signal_pending(current)) {
+ siginfo_t info;
+ int ret;
+
+ ret = dequeue_signal_lock(current, ¤t->blocked,
+ &info);
+ dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
+ task_pid_nr(current), current->comm, ret);
+ mutex_lock(&nbd->tx_lock);
+ sock_shutdown(nbd);
+ mutex_unlock(&nbd->tx_lock);
+ break;
+ }
+
/* extract request */
if (list_empty(&nbd->waiting_queue))
continue;
@@ -528,6 +571,9 @@
/* handle request */
nbd_handle_req(nbd, req);
}
+
+ nbd->task_send = NULL;
+
return 0;
}
@@ -538,7 +584,7 @@
* { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
*/
-static void do_nbd_request(struct request_queue *q)
+static void nbd_request_handler(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock)
{
struct request *req;
@@ -574,6 +620,9 @@
}
}
+static int nbd_dev_dbg_init(struct nbd_device *nbd);
+static void nbd_dev_dbg_close(struct nbd_device *nbd);
+
/* Must be called with tx_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
@@ -597,7 +646,7 @@
if (!nbd->sock)
return -EINVAL;
- nbd->disconnect = 1;
+ nbd->disconnect = true;
nbd_send_req(nbd, &sreq);
return 0;
@@ -625,7 +674,7 @@
nbd->sock = sock;
if (max_part > 0)
bdev->bd_invalidated = 1;
- nbd->disconnect = 0; /* we're connected now */
+ nbd->disconnect = false; /* we're connected now */
return 0;
}
return -EINVAL;
@@ -648,6 +697,12 @@
case NBD_SET_TIMEOUT:
nbd->xmit_timeout = arg * HZ;
+ if (arg)
+ mod_timer(&nbd->timeout_timer,
+ jiffies + nbd->xmit_timeout);
+ else
+ del_timer_sync(&nbd->timeout_timer);
+
return 0;
case NBD_SET_FLAGS:
@@ -666,7 +721,7 @@
struct socket *sock;
int error;
- if (nbd->pid)
+ if (nbd->task_recv)
return -EBUSY;
if (!nbd->sock)
return -EINVAL;
@@ -683,24 +738,24 @@
else
blk_queue_flush(nbd->disk->queue, 0);
- thread = kthread_run(nbd_thread, nbd, "%s",
- nbd->disk->disk_name);
+ thread = kthread_run(nbd_thread_send, nbd, "%s",
+ nbd_name(nbd));
if (IS_ERR(thread)) {
mutex_lock(&nbd->tx_lock);
return PTR_ERR(thread);
}
- error = nbd_do_it(nbd);
+ nbd_dev_dbg_init(nbd);
+ error = nbd_thread_recv(nbd);
+ nbd_dev_dbg_close(nbd);
kthread_stop(thread);
mutex_lock(&nbd->tx_lock);
- if (error)
- return error;
- sock_shutdown(nbd, 0);
+
+ sock_shutdown(nbd);
sock = nbd->sock;
nbd->sock = NULL;
nbd_clear_que(nbd);
- dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
kill_bdev(bdev);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
set_device_ro(bdev, false);
@@ -714,7 +769,7 @@
blkdev_reread_part(bdev);
if (nbd->disconnect) /* user requested, ignore socket errors */
return 0;
- return nbd->harderror;
+ return error;
}
case NBD_CLEAR_QUE:
@@ -758,6 +813,161 @@
.ioctl = nbd_ioctl,
};
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+
+static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
+{
+ struct nbd_device *nbd = s->private;
+
+ if (nbd->task_recv)
+ seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
+ if (nbd->task_send)
+ seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));
+
+ return 0;
+}
+
+static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nbd_dbg_tasks_show, inode->i_private);
+}
+
+static const struct file_operations nbd_dbg_tasks_ops = {
+ .open = nbd_dbg_tasks_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
+{
+ struct nbd_device *nbd = s->private;
+ u32 flags = nbd->flags;
+
+ seq_printf(s, "Hex: 0x%08x\n\n", flags);
+
+ seq_puts(s, "Known flags:\n");
+
+ if (flags & NBD_FLAG_HAS_FLAGS)
+ seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
+ if (flags & NBD_FLAG_READ_ONLY)
+ seq_puts(s, "NBD_FLAG_READ_ONLY\n");
+ if (flags & NBD_FLAG_SEND_FLUSH)
+ seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
+ if (flags & NBD_FLAG_SEND_TRIM)
+ seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
+
+ return 0;
+}
+
+static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nbd_dbg_flags_show, inode->i_private);
+}
+
+static const struct file_operations nbd_dbg_flags_ops = {
+ .open = nbd_dbg_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int nbd_dev_dbg_init(struct nbd_device *nbd)
+{
+ struct dentry *dir;
+ struct dentry *f;
+
+ dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
+ if (IS_ERR_OR_NULL(dir)) {
+ dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s' (%ld)\n",
+ nbd_name(nbd), PTR_ERR(dir));
+ return PTR_ERR(dir);
+ }
+ nbd->dbg_dir = dir;
+
+ f = debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
+ if (IS_ERR_OR_NULL(f)) {
+ dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'tasks', %ld\n",
+ PTR_ERR(f));
+ return PTR_ERR(f);
+ }
+
+ f = debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
+ if (IS_ERR_OR_NULL(f)) {
+ dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'size_bytes', %ld\n",
+ PTR_ERR(f));
+ return PTR_ERR(f);
+ }
+
+ f = debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
+ if (IS_ERR_OR_NULL(f)) {
+ dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'timeout', %ld\n",
+ PTR_ERR(f));
+ return PTR_ERR(f);
+ }
+
+ f = debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
+ if (IS_ERR_OR_NULL(f)) {
+ dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'blocksize', %ld\n",
+ PTR_ERR(f));
+ return PTR_ERR(f);
+ }
+
+ f = debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops);
+ if (IS_ERR_OR_NULL(f)) {
+ dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'flags', %ld\n",
+ PTR_ERR(f));
+ return PTR_ERR(f);
+ }
+
+ return 0;
+}
+
+static void nbd_dev_dbg_close(struct nbd_device *nbd)
+{
+ debugfs_remove_recursive(nbd->dbg_dir);
+}
+
+static int nbd_dbg_init(void)
+{
+ struct dentry *dbg_dir;
+
+ dbg_dir = debugfs_create_dir("nbd", NULL);
+ if (IS_ERR(dbg_dir))
+ return PTR_ERR(dbg_dir);
+
+ nbd_dbg_dir = dbg_dir;
+
+ return 0;
+}
+
+static void nbd_dbg_close(void)
+{
+ debugfs_remove_recursive(nbd_dbg_dir);
+}
+
+#else /* IS_ENABLED(CONFIG_DEBUG_FS) */
+
+static int nbd_dev_dbg_init(struct nbd_device *nbd)
+{
+ return 0;
+}
+
+static void nbd_dev_dbg_close(struct nbd_device *nbd)
+{
+}
+
+static int nbd_dbg_init(void)
+{
+ return 0;
+}
+
+static void nbd_dbg_close(void)
+{
+}
+
+#endif
+
/*
* And here should be modules and kernel interface
* (Just smiley confuses emacs :-)
@@ -811,7 +1021,7 @@
* every gendisk to have its very own request_queue struct.
* These structs are big so we dynamically allocate them.
*/
- disk->queue = blk_init_queue(do_nbd_request, &nbd_lock);
+ disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
if (!disk->queue) {
put_disk(disk);
goto out;
@@ -835,6 +1045,8 @@
printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
+ nbd_dbg_init();
+
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].magic = NBD_MAGIC;
@@ -842,6 +1054,9 @@
spin_lock_init(&nbd_dev[i].queue_lock);
INIT_LIST_HEAD(&nbd_dev[i].queue_head);
mutex_init(&nbd_dev[i].tx_lock);
+ init_timer(&nbd_dev[i].timeout_timer);
+ nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
+ nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
init_waitqueue_head(&nbd_dev[i].active_wq);
init_waitqueue_head(&nbd_dev[i].waiting_wq);
nbd_dev[i].blksize = 1024;
@@ -868,6 +1083,9 @@
static void __exit nbd_cleanup(void)
{
int i;
+
+ nbd_dbg_close();
+
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].magic = 0;
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 2f694d7..b97fc3f 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -72,6 +72,10 @@
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);
+static bool use_cmb_sqes = true;
+module_param(use_cmb_sqes, bool, 0644);
+MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
+
static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
@@ -103,6 +107,7 @@
char irqname[24]; /* nvme4294967295-65535\0 */
spinlock_t q_lock;
struct nvme_command *sq_cmds;
+ struct nvme_command __iomem *sq_cmds_io;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
@@ -379,27 +384,28 @@
*
* Safe to use from interrupt context
*/
-static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
+ struct nvme_command *cmd)
{
u16 tail = nvmeq->sq_tail;
- memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+ if (nvmeq->sq_cmds_io)
+ memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
+ else
+ memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+
if (++tail == nvmeq->q_depth)
tail = 0;
writel(tail, nvmeq->q_db);
nvmeq->sq_tail = tail;
-
- return 0;
}
-static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
unsigned long flags;
- int ret;
spin_lock_irqsave(&nvmeq->q_lock, flags);
- ret = __nvme_submit_cmd(nvmeq, cmd);
+ __nvme_submit_cmd(nvmeq, cmd);
spin_unlock_irqrestore(&nvmeq->q_lock, flags);
- return ret;
}
static __le64 **iod_list(struct nvme_iod *iod)
@@ -730,18 +736,16 @@
static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
struct nvme_iod *iod)
{
- struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+ struct nvme_command cmnd;
- memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
- cmnd->rw.command_id = req->tag;
+ memcpy(&cmnd, req->cmd, sizeof(cmnd));
+ cmnd.rw.command_id = req->tag;
if (req->nr_phys_segments) {
- cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
- cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+ cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
}
- if (++nvmeq->sq_tail == nvmeq->q_depth)
- nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ __nvme_submit_cmd(nvmeq, &cmnd);
}
/*
@@ -754,45 +758,41 @@
{
struct nvme_dsm_range *range =
(struct nvme_dsm_range *)iod_list(iod)[0];
- struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+ struct nvme_command cmnd;
range->cattr = cpu_to_le32(0);
range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
- memset(cmnd, 0, sizeof(*cmnd));
- cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.command_id = req->tag;
- cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
- cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
- cmnd->dsm.nr = 0;
- cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+ memset(&cmnd, 0, sizeof(cmnd));
+ cmnd.dsm.opcode = nvme_cmd_dsm;
+ cmnd.dsm.command_id = req->tag;
+ cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
+ cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
+ cmnd.dsm.nr = 0;
+ cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
- if (++nvmeq->sq_tail == nvmeq->q_depth)
- nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ __nvme_submit_cmd(nvmeq, &cmnd);
}
static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
int cmdid)
{
- struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+ struct nvme_command cmnd;
- memset(cmnd, 0, sizeof(*cmnd));
- cmnd->common.opcode = nvme_cmd_flush;
- cmnd->common.command_id = cmdid;
- cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+ memset(&cmnd, 0, sizeof(cmnd));
+ cmnd.common.opcode = nvme_cmd_flush;
+ cmnd.common.command_id = cmdid;
+ cmnd.common.nsid = cpu_to_le32(ns->ns_id);
- if (++nvmeq->sq_tail == nvmeq->q_depth)
- nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ __nvme_submit_cmd(nvmeq, &cmnd);
}
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
struct request *req = iod_get_private(iod);
- struct nvme_command *cmnd;
+ struct nvme_command cmnd;
u16 control = 0;
u32 dsmgmt = 0;
@@ -804,19 +804,16 @@
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
- cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
- memset(cmnd, 0, sizeof(*cmnd));
+ memset(&cmnd, 0, sizeof(cmnd));
+ cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd.rw.command_id = req->tag;
+ cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
+ cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
+ cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
- cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
- cmnd->rw.command_id = req->tag;
- cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
- cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
- cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
- cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
-
- if (blk_integrity_rq(req)) {
- cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
+ if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
@@ -825,19 +822,21 @@
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
- cmnd->rw.reftag = cpu_to_le32(
+ cmnd.rw.reftag = cpu_to_le32(
nvme_block_nr(ns, blk_rq_pos(req)));
break;
}
- } else if (ns->ms)
- control |= NVME_RW_PRINFO_PRACT;
+ if (blk_integrity_rq(req))
+ cmnd.rw.metadata =
+ cpu_to_le64(sg_dma_address(iod->meta_sg));
+ else
+ control |= NVME_RW_PRINFO_PRACT;
+ }
- cmnd->rw.control = cpu_to_le16(control);
- cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+ cmnd.rw.control = cpu_to_le16(control);
+ cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);
- if (++nvmeq->sq_tail == nvmeq->q_depth)
- nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ __nvme_submit_cmd(nvmeq, &cmnd);
return 0;
}
@@ -1080,7 +1079,8 @@
c.common.command_id = req->tag;
blk_mq_free_request(req);
- return __nvme_submit_cmd(nvmeq, &c);
+ __nvme_submit_cmd(nvmeq, &c);
+ return 0;
}
static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
@@ -1103,7 +1103,8 @@
cmd->common.command_id = req->tag;
- return nvme_submit_cmd(nvmeq, cmd);
+ nvme_submit_cmd(nvmeq, cmd);
+ return 0;
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1315,12 +1316,7 @@
dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
nvmeq->qid);
- if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
- dev_warn(nvmeq->q_dmadev,
- "Could not abort I/O %d QID %d",
- req->tag, nvmeq->qid);
- blk_mq_free_request(abort_req);
- }
+ nvme_submit_cmd(dev->queues[0], &cmd);
}
static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
@@ -1374,7 +1370,8 @@
{
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
- dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ if (nvmeq->sq_cmds)
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
kfree(nvmeq);
}
@@ -1447,6 +1444,47 @@
spin_unlock_irq(&nvmeq->q_lock);
}
+static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
+ int entry_size)
+{
+ int q_depth = dev->q_depth;
+ unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);
+
+ if (q_size_aligned * nr_io_queues > dev->cmb_size) {
+ u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
+ mem_per_q = round_down(mem_per_q, dev->page_size);
+ q_depth = div_u64(mem_per_q, entry_size);
+
+ /*
+ * Ensure the reduced q_depth is above some threshold where it
+ * would be better to map queues in system memory with the
+ * original depth
+ */
+ if (q_depth < 64)
+ return -ENOMEM;
+ }
+
+ return q_depth;
+}
+
+static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ int qid, int depth)
+{
+ if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+ unsigned offset = (qid - 1) *
+ roundup(SQ_SIZE(depth), dev->page_size);
+ nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
+ nvmeq->sq_cmds_io = dev->cmb + offset;
+ } else {
+ nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+ &nvmeq->sq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->sq_cmds)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth)
{
@@ -1459,9 +1497,7 @@
if (!nvmeq->cqes)
goto free_nvmeq;
- nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
- &nvmeq->sq_dma_addr, GFP_KERNEL);
- if (!nvmeq->sq_cmds)
+ if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
goto free_cqdma;
nvmeq->q_dmadev = dev->dev;
@@ -1696,6 +1732,12 @@
page_shift = dev_page_max;
}
+ dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
+ NVME_CAP_NSSRC(cap) : 0;
+
+ if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
+ writel(NVME_CSTS_NSSRO, &dev->bar->csts);
+
result = nvme_disable_ctrl(dev, cap);
if (result < 0)
return result;
@@ -1856,6 +1898,15 @@
return status;
}
+static int nvme_subsys_reset(struct nvme_dev *dev)
+{
+ if (!dev->subsystem)
+ return -ENOTTY;
+
+ writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
+ return 0;
+}
+
static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
unsigned long arg)
{
@@ -1989,7 +2040,7 @@
!ns->ext)
nvme_init_integrity(ns);
- if (ns->ms && !blk_get_integrity(disk))
+ if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
set_capacity(disk, 0);
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
@@ -2020,7 +2071,10 @@
spin_lock(&dev_list_lock);
list_for_each_entry_safe(dev, next, &dev_list, node) {
int i;
- if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
+ u32 csts = readl(&dev->bar->csts);
+
+ if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
+ csts & NVME_CSTS_CFS) {
if (work_busy(&dev->reset_work))
continue;
list_del_init(&dev->node);
@@ -2080,8 +2134,11 @@
list_add_tail(&ns->list, &dev->namespaces);
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- if (dev->max_hw_sectors)
+ if (dev->max_hw_sectors) {
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+ blk_queue_max_segments(ns->queue,
+ ((dev->max_hw_sectors << 9) / dev->page_size) + 1);
+ }
if (dev->stripe_size)
blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
if (dev->vwc & NVME_CTRL_VWC_PRESENT)
@@ -2159,6 +2216,58 @@
return min(result & 0xffff, result >> 16) + 1;
}
+static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
+{
+ u64 szu, size, offset;
+ u32 cmbloc;
+ resource_size_t bar_size;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+ void __iomem *cmb;
+ dma_addr_t dma_addr;
+
+ if (!use_cmb_sqes)
+ return NULL;
+
+ dev->cmbsz = readl(&dev->bar->cmbsz);
+ if (!(NVME_CMB_SZ(dev->cmbsz)))
+ return NULL;
+
+ cmbloc = readl(&dev->bar->cmbloc);
+
+ szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
+ size = szu * NVME_CMB_SZ(dev->cmbsz);
+ offset = szu * NVME_CMB_OFST(cmbloc);
+ bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));
+
+ if (offset > bar_size)
+ return NULL;
+
+ /*
+ * Controllers may support a CMB size larger than their BAR,
+ * for example, due to being behind a bridge. Reduce the CMB to
+ * the reported size of the BAR
+ */
+ if (size > bar_size - offset)
+ size = bar_size - offset;
+
+ dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
+ cmb = ioremap_wc(dma_addr, size);
+ if (!cmb)
+ return NULL;
+
+ dev->cmb_dma_addr = dma_addr;
+ dev->cmb_size = size;
+ return cmb;
+}
+
+static inline void nvme_release_cmb(struct nvme_dev *dev)
+{
+ if (dev->cmb) {
+ iounmap(dev->cmb);
+ dev->cmb = NULL;
+ }
+}
+
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
@@ -2177,6 +2286,15 @@
if (result < nr_io_queues)
nr_io_queues = result;
+ if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
+ result = nvme_cmb_qdepth(dev, nr_io_queues,
+ sizeof(struct nvme_command));
+ if (result > 0)
+ dev->q_depth = result;
+ else
+ nvme_release_cmb(dev);
+ }
+
size = db_bar_size(dev, nr_io_queues);
if (size > 8192) {
iounmap(dev->bar);
@@ -2344,7 +2462,6 @@
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
int res;
- unsigned nn;
struct nvme_id_ctrl *ctrl;
int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
@@ -2354,7 +2471,6 @@
return -EIO;
}
- nn = le32_to_cpup(&ctrl->nn);
dev->oncs = le16_to_cpup(&ctrl->oncs);
dev->abort_limit = ctrl->acl + 1;
dev->vwc = ctrl->vwc;
@@ -2440,6 +2556,8 @@
dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
dev->dbs = ((void __iomem *)dev->bar) + 4096;
+ if (readl(&dev->bar->vs) >= NVME_VS(1, 2))
+ dev->cmb = nvme_map_cmb(dev);
return 0;
@@ -2820,6 +2938,8 @@
case NVME_IOCTL_RESET:
dev_warn(dev->dev, "resetting controller\n");
return nvme_reset(dev);
+ case NVME_IOCTL_SUBSYS_RESET:
+ return nvme_subsys_reset(dev);
default:
return -ENOTTY;
}
@@ -3145,6 +3265,7 @@
nvme_dev_remove_admin(dev);
device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
nvme_free_queues(dev, 0);
+ nvme_release_cmb(dev);
nvme_release_prp_pools(dev);
kref_put(&dev->kref, nvme_free_dev);
}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c0d94ed..b5812c3 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -28,18 +28,32 @@
__u32 cc; /* Controller Configuration */
__u32 rsvd1; /* Reserved */
__u32 csts; /* Controller Status */
- __u32 rsvd2; /* Reserved */
+ __u32 nssr; /* Subsystem Reset */
__u32 aqa; /* Admin Queue Attributes */
__u64 asq; /* Admin SQ Base Address */
__u64 acq; /* Admin CQ Base Address */
+ __u32 cmbloc; /* Controller Memory Buffer Location */
+ __u32 cmbsz; /* Controller Memory Buffer Size */
};
#define NVME_CAP_MQES(cap) ((cap) & 0xffff)
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf)
+#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1)
#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf)
#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf)
+#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7)
+#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff)
+#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff)
+#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf)
+
+#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10)
+#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8)
+#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4)
+#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2)
+#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1)
+
enum {
NVME_CC_ENABLE = 1 << 0,
NVME_CC_CSS_NVM = 0 << 4,
@@ -55,6 +69,7 @@
NVME_CC_IOCQES = 4 << 20,
NVME_CSTS_RDY = 1 << 0,
NVME_CSTS_CFS = 1 << 1,
+ NVME_CSTS_NSSRO = 1 << 4,
NVME_CSTS_SHST_NORMAL = 0 << 2,
NVME_CSTS_SHST_OCCUR = 1 << 2,
NVME_CSTS_SHST_CMPLT = 2 << 2,
@@ -97,9 +112,14 @@
char serial[20];
char model[40];
char firmware_rev[8];
+ bool subsystem;
u32 max_hw_sectors;
u32 stripe_size;
u32 page_size;
+ void __iomem *cmb;
+ dma_addr_t cmb_dma_addr;
+ u64 cmb_size;
+ u32 cmbsz;
u16 oncs;
u16 abort_limit;
u8 event_limit;
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index 732b32e..8864194 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -584,5 +584,6 @@
#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io)
#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd)
#define NVME_IOCTL_RESET _IO('N', 0x44)
+#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45)
#endif /* _UAPI_LINUX_NVME_H */