/*
* drivers/block/vs_block_server.c
*
* Copyright (c) 2012-2018 General Dynamics
* Copyright (c) 2014 Open Kernel Labs, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* block vservice server driver
*
*/
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/genhd.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/version.h>
#include <vservices/types.h>
#include <vservices/buffer.h>
#include <vservices/protocol/block/types.h>
#include <vservices/protocol/block/common.h>
#include <vservices/protocol/block/server.h>
#include <vservices/protocol/block/client.h>
#include <vservices/service.h>
#include <vservices/wait.h>
#define VS_BLOCK_BLKDEV_DEFAULT_MODE FMODE_READ
/* Must match the Linux bio sector size (512 bytes) */
#define VS_BLOCK_BLK_DEF_SECTOR_SIZE 512
/* XXX should look up the block device's physical_block_size */
#define VS_BLOCK_BLK_DEF_MIN_SECTORS 8
/*
* Metadata for a request. Note that the bio must be embedded at the end of
* this structure, because it is allocated from a bioset.
*/
struct block_server_request {
struct block_server *server;
u32 tagid;
u32 size;
int op_err;
struct list_head list;
struct vs_pbuf pbuf;
struct vs_mbuf *mbuf;
bool bounced;
bool submitted;
struct bio bio;
};
struct block_server {
struct vs_server_block_state server;
struct vs_service_device *service;
struct block_device *bdev;
struct bio_set *bioset;
unsigned int sector_size;
bool started;
/* Bounced writes are deferred to keep the memcpy off the service queue */
struct list_head bounce_req_queue;
struct work_struct bounce_req_work;
spinlock_t bounce_req_lock;
/* Count of outstanding requests submitted to block layer */
atomic_t submitted_req_count;
wait_queue_head_t submitted_req_wq;
/* Completions are deferred because end_io may be called in atomic context */
struct list_head completed_req_queue;
struct work_struct completed_req_work;
spinlock_t completed_req_lock;
};
#define state_to_block_server(state) \
container_of(state, struct block_server, server)
#define dev_to_block_server(dev) \
state_to_block_server(dev_get_drvdata(dev))
static inline vservice_block_block_io_error_t
block_server_linux_to_vs_error(int err)
{
/*
* This list is not exhaustive. For all other errors, we return
* unsupported_command.
*/
switch (err) {
case -ECOMM:
case -EIO:
case -ENOMEM:
return VSERVICE_BLOCK_MEDIA_FAILURE;
case -ETIME:
case -ETIMEDOUT:
return VSERVICE_BLOCK_MEDIA_TIMEOUT;
case -EILSEQ:
return VSERVICE_BLOCK_INVALID_INDEX;
default:
if (err)
return VSERVICE_BLOCK_UNSUPPORTED_COMMAND;
return 0;
}
return 0;
}
static inline u32 vs_req_num_sectors(struct block_server *server,
struct block_server_request *req)
{
return req->size / server->sector_size;
}
static inline u64 vs_req_sector_index(struct block_server_request *req)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
return req->bio.bi_iter.bi_sector;
#else
return req->bio.bi_sector;
#endif
}
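/*
 * Called when the service is closed. Fails any requests still waiting for
 * bounce buffers, waits for bios already submitted to the block layer to
 * complete, and then discards the completed requests that were never sent
 * back to the client.
 */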
static void vs_block_server_closed(struct vs_server_block_state *state)
{
struct block_server *server = state_to_block_server(state);
struct block_server_request *req;
/*
* Fail all requests that haven't been sent to the block layer yet.
*/
spin_lock(&server->bounce_req_lock);
while (!list_empty(&server->bounce_req_queue)) {
req = list_first_entry(&server->bounce_req_queue,
struct block_server_request, list);
list_del(&req->list);
spin_unlock(&server->bounce_req_lock);
bio_io_error(&req->bio);
spin_lock(&server->bounce_req_lock);
}
spin_unlock(&server->bounce_req_lock);
/*
* Wait until all outstanding requests to the block layer are
* complete.
*/
wait_event(server->submitted_req_wq,
!atomic_read(&server->submitted_req_count));
/*
* Discard all the completed requests.
*/
spin_lock_irq(&server->completed_req_lock);
while (!list_empty(&server->completed_req_queue)) {
req = list_first_entry(&server->completed_req_queue,
struct block_server_request, list);
list_del(&req->list);
if (req->mbuf) {
spin_unlock_irq(&server->completed_req_lock);
if (bio_data_dir(&req->bio) == WRITE)
vs_server_block_io_free_req_write(state,
&req->pbuf, req->mbuf);
else
vs_server_block_io_free_ack_read(state,
&req->pbuf, req->mbuf);
spin_lock_irq(&server->completed_req_lock);
}
bio_put(&req->bio);
}
spin_unlock_irq(&server->completed_req_lock);
}
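/*
 * sysfs "readonly" attribute. The flag can only be changed before the
 * service is started, and cannot be cleared if the underlying block
 * device is itself read-only.
 */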
static ssize_t
vs_block_server_readonly_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct block_server *server = dev_to_block_server(dev);
int err;
unsigned long val;
vs_service_state_lock(server->service);
if (server->started) {
err = -EBUSY;
goto unlock;
}
err = kstrtoul(buf, 0, &val);
if (err)
goto unlock;
if (bdev_read_only(server->bdev) && !val) {
dev_info(dev,
"Cannot set %s to read/write: read-only device\n",
server->service->name);
err = -EINVAL;
goto unlock;
}
server->server.readonly = val;
err = count;
unlock:
vs_service_state_unlock(server->service);
return err;
}
static ssize_t
vs_block_server_readonly_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_server *server = dev_to_block_server(dev);
int cnt;
vs_service_state_lock(server->service);
cnt = scnprintf(buf, PAGE_SIZE, "%d\n", server->server.readonly);
vs_service_state_unlock(server->service);
return cnt;
}
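/*
 * sysfs "start" attribute. Writing a non-zero value marks the server as
 * started and, if a client open request is already pending, completes it.
 * The server cannot be stopped again once started.
 */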
static ssize_t
vs_block_server_start_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct block_server *server = dev_to_block_server(dev);
int err;
unsigned long val;
vs_service_state_lock(server->service);
err = kstrtoul(buf, 0, &val);
if (err)
goto unlock;
if (!val && server->started) {
err = -EBUSY;
goto unlock;
}
if (val && !server->started) {
server->started = true;
if (server->server.state.base.statenum ==
VSERVICE_BASE_STATE_CLOSED__OPEN)
vs_server_block_open_complete(&server->server,
VS_SERVER_RESP_SUCCESS);
}
err = count;
unlock:
vs_service_state_unlock(server->service);
return err;
}
static ssize_t
vs_block_server_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_server *server = dev_to_block_server(dev);
int cnt;
vs_service_state_lock(server->service);
cnt = scnprintf(buf, PAGE_SIZE, "%d\n", server->started);
vs_service_state_unlock(server->service);
return cnt;
}
static DEVICE_ATTR(start, S_IWUSR | S_IRUSR, vs_block_server_start_show,
vs_block_server_start_store);
static DEVICE_ATTR(readonly, S_IWUSR | S_IRUSR, vs_block_server_readonly_show,
vs_block_server_readonly_store);
static struct attribute *vs_block_server_dev_attrs[] = {
&dev_attr_start.attr,
&dev_attr_readonly.attr,
NULL,
};
static const struct attribute_group vs_block_server_attr_group = {
.attrs = vs_block_server_dev_attrs
};
/*
 * Invoked by vs_server_block_handle_req_open() after an open request is
 * received, to perform server-specific initialisation.
 *
 * The "delayed start" feature is enforced here: the open only completes
 * immediately if the server has already been started via sysfs.
 */
static vs_server_response_type_t
vs_block_server_open(struct vs_server_block_state *_state)
{
struct block_server *server = state_to_block_server(_state);
return (server->started) ? VS_SERVER_RESP_SUCCESS :
VS_SERVER_RESP_EXPLICIT_COMPLETE;
}
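/*
 * Send the reply for a completed read. On error a nack is sent; otherwise
 * the data is returned in the ack message, copying it out of the bounce
 * pages first if the bio could not be mapped directly onto the message
 * buffer. Returns -ENOBUFS if the reply must be retried once quota is
 * available again.
 */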
static int
vs_block_server_complete_req_read(struct block_server_request *req)
{
struct block_server *server = req->server;
struct vs_server_block_state *state = &server->server;
int err = -EIO;
if (req->op_err) {
err = req->op_err;
dev_dbg(&server->service->dev,
"read nack, err %d sector 0x%llx num 0x%x\n",
err, vs_req_sector_index(req),
vs_req_num_sectors(server, req));
if (req->mbuf)
vs_server_block_io_free_ack_read(state, &req->pbuf,
req->mbuf);
err = vs_server_block_io_send_nack_read(state, req->tagid,
block_server_linux_to_vs_error(err),
GFP_KERNEL);
} else {
if (req->bounced && !req->mbuf) {
req->mbuf = vs_server_block_io_alloc_ack_read(
&server->server, &req->pbuf,
GFP_KERNEL);
if (IS_ERR(req->mbuf)) {
err = PTR_ERR(req->mbuf);
req->mbuf = NULL;
}
}
if (req->bounced && req->mbuf) {
int i;
struct bio_vec *bv;
void *data = req->pbuf.data;
if (vs_pbuf_resize(&req->pbuf, req->size) < 0) {
bio_io_error(&req->bio);
return 0;
}
bio_for_each_segment_all(bv, &req->bio, i) {
memcpy(data, page_address(bv->bv_page) +
bv->bv_offset, bv->bv_len);
data += bv->bv_len;
__free_page(bv->bv_page);
}
req->bounced = false;
}
if (req->mbuf) {
dev_vdbg(&server->service->dev,
"read ack, sector 0x%llx num 0x%x\n",
vs_req_sector_index(req),
vs_req_num_sectors(server, req));
err = vs_server_block_io_send_ack_read(state,
req->tagid, req->pbuf, req->mbuf);
if (err && (err != -ENOBUFS)) {
vs_server_block_io_free_ack_read(state,
&req->pbuf, req->mbuf);
req->mbuf = NULL;
}
} else {
WARN_ON(!err || !req->bounced);
}
}
if (err && (err != -ENOBUFS))
dev_dbg(&server->service->dev,
"error %d sending read reply\n", err);
else if (err == -ENOBUFS)
dev_vdbg(&server->service->dev, "out of quota, will retry\n");
return err;
}
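/*
 * Send the reply for a completed write: an ack on success, or a nack
 * carrying the translated error. Returns -ENOBUFS if the reply must be
 * retried once quota is available again.
 */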
static int
vs_block_server_complete_req_write(struct block_server_request *req)
{
struct block_server *server = req->server;
struct vs_server_block_state *state = &server->server;
int err;
WARN_ON(req->mbuf);
if (req->op_err) {
dev_dbg(&server->service->dev,
"write nack, err %d sector 0x%llx num 0x%x\n",
req->op_err, vs_req_sector_index(req),
vs_req_num_sectors(server, req));
err = vs_server_block_io_send_nack_write(state, req->tagid,
block_server_linux_to_vs_error(req->op_err),
GFP_KERNEL);
} else {
dev_vdbg(&server->service->dev,
"write ack, sector 0x%llx num 0x%x\n",
vs_req_sector_index(req),
vs_req_num_sectors(server, req));
err = vs_server_block_io_send_ack_write(state, req->tagid,
GFP_KERNEL);
}
if (err && (err != -ENOBUFS))
dev_dbg(&server->service->dev,
"error %d sending write reply\n", err);
else if (err == -ENOBUFS)
dev_vdbg(&server->service->dev, "out of quota, will retry\n");
return err;
}
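/*
 * Send the reply for a completed request, taking the state lock and
 * dispatching to the read or write completion handler as appropriate.
 */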
static int vs_block_server_complete_req(struct block_server *server,
struct block_server_request *req)
{
int err;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
req->bio.bi_iter.bi_idx = 0;
#else
req->bio.bi_idx = 0;
#endif
if (!vs_state_lock_safe(&server->server))
return -ENOLINK;
if (bio_data_dir(&req->bio) == WRITE)
err = vs_block_server_complete_req_write(req);
else
err = vs_block_server_complete_req_read(req);
vs_state_unlock(&server->server);
if (err == -ENOBUFS)
dev_vdbg(&server->service->dev, "bio %pK response out of quota, will retry\n", &req->bio);
return err;
}
static void vs_block_server_complete_requests_work(struct work_struct *work)
{
struct block_server *server = container_of(work, struct block_server,
completed_req_work);
struct block_server_request *req;
vs_service_send_batch_start(server->service, false);
/*
* Send ack/nack responses for each completed request. If a request
* cannot be sent because we are over-quota then this function will
* return with a non-empty list, and the tx_ready handler will
* reschedule us when we are back under quota. In all other cases
* this function will return with an empty list.
*/
spin_lock_irq(&server->completed_req_lock);
while (!list_empty(&server->completed_req_queue)) {
int err;
req = list_first_entry(&server->completed_req_queue,
struct block_server_request, list);
dev_vdbg(&server->service->dev, "complete bio %pK\n", &req->bio);
list_del(&req->list);
spin_unlock_irq(&server->completed_req_lock);
err = vs_block_server_complete_req(server, req);
if (err == -ENOBUFS) {
dev_vdbg(&server->service->dev, "defer bio %pK\n", &req->bio);
/*
* Couldn't send the completion; re-queue the request
* and exit. We'll start again when more quota becomes
* available.
*/
spin_lock_irq(&server->completed_req_lock);
list_add_tail(&req->list,
&server->completed_req_queue);
break;
}
dev_vdbg(&server->service->dev, "free bio %pK err %d\n", &req->bio, err);
bio_put(&req->bio);
spin_lock_irq(&server->completed_req_lock);
}
spin_unlock_irq(&server->completed_req_lock);
vs_service_send_batch_end(server->service, true);
}
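/*
 * tx_ready handler: quota has become available again, so reschedule the
 * completion work to retry any replies that were deferred with -ENOBUFS.
 */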
static int vs_block_server_tx_ready(struct vs_server_block_state *state)
{
struct block_server *server = state_to_block_server(state);
schedule_work(&server->completed_req_work);
return 0;
}
static bool vs_block_can_map_pbuf(struct request_queue *q,
struct vs_pbuf *pbuf, size_t size)
{
/* The pbuf must satisfy the driver's alignment requirements. */
if (!blk_rq_aligned(q, (unsigned long)pbuf->data, size))
return false;
/*
 * bios can only contain pages. Sometimes the pbuf is in an IO region
 * that has no struct page (e.g. a channel primary buffer), in which
 * case we can't map it into a bio.
 */
/* FIXME: Redmine issue #930 - philip. */
if (!pfn_valid(__pa(pbuf->data) >> PAGE_SHIFT))
return false;
return true;
}
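/*
 * Map a pbuf's backing pages directly into a bio, avoiding a bounce
 * buffer. Callers check vs_block_can_map_pbuf() first.
 */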
static int vs_block_bio_map_pbuf(struct bio *bio, struct vs_pbuf *pbuf)
{
int offset = offset_in_page((unsigned long)pbuf->data);
void *ptr = pbuf->data;
int size = pbuf->size;
while (size > 0) {
unsigned bytes = min_t(unsigned, PAGE_SIZE - offset, size);
if (bio_add_page(bio, virt_to_page(ptr), bytes,
offset) < bytes)
return -EIO;
ptr += bytes;
size -= bytes;
offset = 0;
}
return 0;
}
/* Read request handling */
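/*
 * bio completion callback for reads. Records the result, queues the
 * request for the completion work (requests that already hold an ack
 * message buffer go to the head of the queue, bounced requests to the
 * tail), and wakes anyone waiting for submitted requests to drain.
 */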
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)
static void vs_block_server_read_done(struct bio *bio, int err)
#else
static void vs_block_server_read_done(struct bio *bio)
#endif
{
unsigned long flags;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)
int err = bio->bi_error;
#endif
struct block_server_request *req = container_of(bio,
struct block_server_request, bio);
struct block_server *server = req->server;
req->op_err = err;
spin_lock_irqsave(&server->completed_req_lock, flags);
if (req->mbuf)
list_add(&req->list, &server->completed_req_queue);
else
list_add_tail(&req->list, &server->completed_req_queue);
spin_unlock_irqrestore(&server->completed_req_lock, flags);
if (req->submitted && atomic_dec_and_test(&server->submitted_req_count))
wake_up_all(&server->submitted_req_wq);
schedule_work(&server->completed_req_work);
}
/*
 * TODO: this may need to split and chain the bio if it exceeds the
 * device's physical segment limit. It is not clear whose responsibility
 * that is; the request queue (if there is one) might do it for us.
 */
#define vs_block_make_request(bio) generic_make_request(bio)
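/*
 * Build and submit the bio for a read request. If the ack message buffer
 * was allocated and can be mapped directly, the data is read straight
 * into it; otherwise bounce pages are allocated and the data is copied
 * into the ack message at completion time. Setup failures complete the
 * bio with an error rather than returning one.
 */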
static int vs_block_submit_read(struct block_server *server,
struct block_server_request *req, gfp_t gfp)
{
struct request_queue *q = bdev_get_queue(server->bdev);
struct bio *bio = &req->bio;
int size = req->size;
int err = 0;
if (req->mbuf && vs_block_can_map_pbuf(q, &req->pbuf, size)) {
/*
* The mbuf is valid and the driver can directly access the
* pbuf, so we don't need a bounce buffer. Map the pbuf
* directly into the bio.
*/
if (vs_pbuf_resize(&req->pbuf, size) < 0)
err = -EIO;
if (!err)
err = vs_block_bio_map_pbuf(bio, &req->pbuf);
} else {
/* We need a bounce buffer. First set up the bvecs. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
bio->bi_iter.bi_size = size;
#else
bio->bi_size = size;
#endif
while (size > 0) {
struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt];
BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
bvec->bv_page = NULL; /* Allocated below */
bvec->bv_len = min_t(unsigned, PAGE_SIZE, size);
bvec->bv_offset = 0;
bio->bi_vcnt++;
size -= bvec->bv_len;
}
err = bio_alloc_pages(bio, gfp);
if (!err) {
blk_recount_segments(q, bio);
req->bounced = true;
}
}
if (err) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)
bio->bi_error = err;
bio_endio(bio);
#else
bio_endio(bio, err);
#endif
} else {
dev_vdbg(&server->service->dev,
"submit read req sector %#llx count %#x\n",
vs_req_sector_index(req),
vs_req_num_sectors(server, req));
req->submitted = true;
atomic_inc(&server->submitted_req_count);
vs_block_make_request(bio);
}
return 0;
}
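/*
 * Handler for an incoming read request. Allocates a bio from the
 * pre-allocated bioset and tries to allocate the ack message up front so
 * the read can go directly into the reply buffer, falling back to a
 * bounce buffer if message quota is exhausted.
 */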
static int vs_block_server_io_req_read(struct vs_server_block_state *state,
u32 tagid, u64 sector_index, u32 num_sects, bool nodelay,
bool flush)
{
struct block_server *server = state_to_block_server(state);
struct bio *bio;
struct block_server_request *req;
unsigned size = num_sects * server->sector_size;
unsigned op_flags = 0;
/*
* This nr_pages calculation assumes that the pbuf data is offset from
* the start of the size-aligned message buffer by more than 0 but
* less than one sector, which is always true for the current message
* layout generated by mill when we assume 512-byte sectors.
*/
unsigned nr_pages = 1 + (size >> PAGE_SHIFT);
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, server->bioset);
if (!bio)
return -ENOMEM;
dev_vdbg(&server->service->dev, "alloc r bio %pK\n", bio);
req = container_of(bio, struct block_server_request, bio);
req->server = server;
req->tagid = tagid;
req->op_err = 0;
req->mbuf = NULL;
req->size = size;
req->bounced = false;
req->submitted = false;
if (flush) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
op_flags |= REQ_PREFLUSH;
#else
op_flags |= REQ_FLUSH;
#endif
}
if (nodelay) {
op_flags |= REQ_SYNC;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
bio->bi_iter.bi_sector = (sector_t)sector_index;
#else
bio->bi_sector = (sector_t)sector_index;
#endif
bio->bi_bdev = server->bdev;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
bio_set_op_attrs(bio, REQ_OP_READ, op_flags);
#else
bio->bi_rw = READ | op_flags;
#endif
bio->bi_end_io = vs_block_server_read_done;
req->mbuf = vs_server_block_io_alloc_ack_read(state, &req->pbuf,
GFP_KERNEL);
if (IS_ERR(req->mbuf) && (PTR_ERR(req->mbuf) == -ENOBUFS)) {
/* Fall back to a bounce buffer */
req->mbuf = NULL;
} else if (IS_ERR(req->mbuf)) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)
bio->bi_error = PTR_ERR(req->mbuf);
bio_endio(bio);
#else
bio_endio(bio, PTR_ERR(req->mbuf));
#endif
return 0;
}
return vs_block_submit_read(server, req, GFP_KERNEL);
}
/* Write request handling */
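/*
 * Allocate bounce pages for a deferred write, copy the data out of the
 * request message, free the message, and submit the bio. Returns -ENOMEM
 * if page allocation fails so the caller can retry later.
 */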
static int vs_block_submit_bounced_write(struct block_server *server,
struct block_server_request *req, gfp_t gfp)
{
struct bio *bio = &req->bio;
void *data = req->pbuf.data;
struct bio_vec *bv;
int i;
if (bio_alloc_pages(bio, gfp | __GFP_NOWARN) < 0)
return -ENOMEM;
blk_recount_segments(bdev_get_queue(server->bdev), bio);
req->bounced = true;
/* Copy all the data into the bounce buffer */
bio_for_each_segment_all(bv, bio, i) {
memcpy(page_address(bv->bv_page) + bv->bv_offset, data,
bv->bv_len);
data += bv->bv_len;
}
vs_server_block_io_free_req_write(&server->server, &req->pbuf,
req->mbuf);
req->mbuf = NULL;
dev_vdbg(&server->service->dev,
"submit bounced write req sector %#llx count %#x\n",
vs_req_sector_index(req),
vs_req_num_sectors(server, req));
req->submitted = true;
atomic_inc(&server->submitted_req_count);
vs_block_make_request(bio);
return 0;
}
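/*
 * Work function that submits bounced writes queued by the request
 * handler. If page allocation fails, the request is re-queued and the
 * work reschedules itself to try again later.
 */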
static void vs_block_server_write_bounce_work(struct work_struct *work)
{
struct block_server *server = container_of(work, struct block_server,
bounce_req_work);
struct block_server_request *req;
spin_lock(&server->bounce_req_lock);
while (!list_empty(&server->bounce_req_queue)) {
req = list_first_entry(&server->bounce_req_queue,
struct block_server_request, list);
dev_vdbg(&server->service->dev, "write bio %pK\n", &req->bio);
list_del(&req->list);
spin_unlock(&server->bounce_req_lock);
if (vs_block_submit_bounced_write(server, req,
GFP_KERNEL) == -ENOMEM) {
spin_lock(&server->bounce_req_lock);
list_add(&req->list, &server->bounce_req_queue);
spin_unlock(&server->bounce_req_lock);
schedule_work(work);
return;
}
spin_lock(&server->bounce_req_lock);
}
spin_unlock(&server->bounce_req_lock);
}
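/*
 * bio completion callback for writes. Frees the bounce pages or the
 * request message buffer, records the result, and queues the request for
 * the completion work.
 */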
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)
static void vs_block_server_write_done(struct bio *bio, int err)
#else
static void vs_block_server_write_done(struct bio *bio)
#endif
{
unsigned long flags;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)
int err = bio->bi_error;
#endif
struct block_server_request *req = container_of(bio,
struct block_server_request, bio);
struct block_server *server = req->server;
if (req->bounced) {
int i;
struct bio_vec *bv;
bio_for_each_segment_all(bv, bio, i)
__free_page(bv->bv_page);
} else if (req->mbuf) {
vs_server_block_io_free_req_write(&server->server, &req->pbuf,
req->mbuf);
req->mbuf = NULL;
}
if (req->submitted && atomic_dec_and_test(&server->submitted_req_count))
wake_up_all(&server->submitted_req_wq);
req->op_err = err;
spin_lock_irqsave(&server->completed_req_lock, flags);
list_add_tail(&req->list, &server->completed_req_queue);
spin_unlock_irqrestore(&server->completed_req_lock, flags);
schedule_work(&server->completed_req_work);
}
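/*
 * Handler for an incoming write request. If the message payload can be
 * mapped directly into a bio it is submitted immediately; otherwise the
 * request is deferred to the bounce work so that the page allocation and
 * memcpy do not happen under the service state lock.
 */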
static int vs_block_server_io_req_write(struct vs_server_block_state *state,
u32 tagid, u64 sector_index, u32 num_sects, bool nodelay,
bool flush, bool commit, struct vs_pbuf pbuf, struct vs_mbuf *mbuf)
{
struct block_server *server = state_to_block_server(state);
struct request_queue *q = bdev_get_queue(server->bdev);
struct bio *bio;
struct block_server_request *req;
unsigned long data = (unsigned long)pbuf.data;
unsigned long start = data >> PAGE_SHIFT;
unsigned long end = (data + pbuf.size + PAGE_SIZE - 1) >> PAGE_SHIFT;
int err;
unsigned op_flags = 0;
bio = bio_alloc_bioset(GFP_KERNEL, end - start, server->bioset);
if (!bio)
return -ENOMEM;
dev_vdbg(&server->service->dev, "alloc w bio %pK\n", bio);
req = container_of(bio, struct block_server_request, bio);
req->server = server;
req->tagid = tagid;
req->op_err = 0;
req->mbuf = mbuf;
req->pbuf = pbuf;
req->size = server->sector_size * num_sects;
req->bounced = false;
req->submitted = false;
if (flush) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
op_flags |= REQ_PREFLUSH;
#else
op_flags |= REQ_FLUSH;
#endif
}
if (commit) {
op_flags |= REQ_FUA;
}
if (nodelay) {
op_flags |= REQ_SYNC;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
bio->bi_iter.bi_sector = (sector_t)sector_index;
#else
bio->bi_sector = (sector_t)sector_index;
#endif
bio->bi_bdev = server->bdev;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
#else
bio->bi_rw = WRITE | op_flags;
#endif
bio->bi_end_io = vs_block_server_write_done;
if (pbuf.size < req->size) {
err = -EINVAL;
goto fail_bio;
}
if (WARN_ON(pbuf.size > req->size))
pbuf.size = req->size;
if (state->readonly) {
err = -EROFS;
goto fail_bio;
}
if (!vs_block_can_map_pbuf(q, &req->pbuf, req->pbuf.size)) {
/* We need a bounce buffer. First set up the bvecs. */
int size = pbuf.size;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)
bio->bi_iter.bi_size = size;
#else
bio->bi_size = size;
#endif
while (size > 0) {
struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt];
BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs);
bvec->bv_page = NULL; /* Allocated later */
bvec->bv_len = min_t(unsigned, PAGE_SIZE, size);
bvec->bv_offset = 0;
bio->bi_vcnt++;
size -= bvec->bv_len;
}
/*
* Defer the rest so we don't have to hold the state lock
* during alloc_page & memcpy
*/
spin_lock(&server->bounce_req_lock);
list_add_tail(&req->list, &server->bounce_req_queue);
spin_unlock(&server->bounce_req_lock);
schedule_work(&server->bounce_req_work);
return 0;
}
/* No bounce needed; map the pbuf directly. */
err = vs_block_bio_map_pbuf(bio, &pbuf);
if (err < 0)
goto fail_bio;
dev_vdbg(&server->service->dev,
"submit direct write req sector %#llx count %#x\n",
vs_req_sector_index(req),
vs_req_num_sectors(server, req));
req->submitted = true;
atomic_inc(&server->submitted_req_count);
vs_block_make_request(bio);
return 0;
fail_bio:
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)
bio->bi_error = err;
bio_endio(bio);
#else
bio_endio(bio, err);
#endif
return 0;
}
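/*
 * Look up the backing block device by matching the service name against
 * the kernel device names in the block class, then open it by dev_t.
 */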
static struct block_device *
vs_block_server_find_by_name(struct block_server *server)
{
struct block_device *bdev = NULL;
struct class_dev_iter iter;
struct device *dev;
class_dev_iter_init(&iter, &block_class, NULL, NULL);
while (1) {
dev = class_dev_iter_next(&iter);
if (!dev)
break;
if (strcmp(dev_name(dev), server->service->name) == 0) {
bdev = blkdev_get_by_dev(dev->devt,
VS_BLOCK_BLKDEV_DEFAULT_MODE, NULL);
if (!IS_ERR_OR_NULL(bdev))
break;
}
}
class_dev_iter_exit(&iter);
if (!dev || IS_ERR_OR_NULL(bdev))
return ERR_PTR(-ENODEV);
dev_dbg(&server->service->dev, "Attached to block device %s (%d:%d)\n",
dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
return bdev;
}
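/*
 * Look up the backing block device by path, using the service name as
 * the final path component under the given base directory.
 */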
static struct block_device *
vs_block_server_find_by_path(struct block_server *server, const char *base_path)
{
struct block_device *bdev;
char *bdev_path;
bdev_path = kasprintf(GFP_KERNEL, "%s/%s", base_path,
server->service->name);
if (!bdev_path)
return ERR_PTR(-ENOMEM);
bdev = blkdev_get_by_path(bdev_path, VS_BLOCK_BLKDEV_DEFAULT_MODE,
NULL);
if (IS_ERR_OR_NULL(bdev)) {
kfree(bdev_path);
return bdev ? bdev : ERR_PTR(-ENODEV);
}
dev_dbg(&server->service->dev, "Attached to block device %s\n",
bdev_path);
kfree(bdev_path);
return bdev;
}
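/*
 * Find and open the block device backing this service, then derive the
 * parameters advertised to the client: segment size, exported sector
 * size, device size, and the read-only/flushable/committable flags.
 */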
static struct block_device *
vs_block_server_attach_block_device(struct block_server *server)
{
const char *paths[] = {
"/dev",
"/dev/block",
"/dev/mapper",
"/dev/disk/by-partlabel",
"/dev/disk/by-label",
"/dev/disk/by-partuuid",
"/dev/disk/by-uuid"
};
struct block_device *bdev;
int i;
/*
* Try first to look the block device up by path. This is done because
* the name exposed to user-space in /dev/ is not necessarily the name
* being used inside the kernel for the device.
*/
for (i = 0; i < ARRAY_SIZE(paths); i++) {
bdev = vs_block_server_find_by_path(server, paths[i]);
if (!IS_ERR(bdev))
break;
}
if (i == ARRAY_SIZE(paths)) {
/*
* Couldn't find the block device in any of the usual places.
* Try to match it against the kernel's device name. If the
* name of the service and the name of a device in the block
* class match then attempt to look the block device up by the
* dev_t (major/minor) value.
*/
bdev = vs_block_server_find_by_name(server);
}
if (IS_ERR(bdev))
return bdev;
// XXX get block device physical block size
server->sector_size = VS_BLOCK_BLK_DEF_SECTOR_SIZE;
server->server.segment_size = round_down(
vs_service_max_mbuf_size(server->service) -
sizeof(vs_message_id_t), server->sector_size);
server->server.sector_size = server->sector_size *
VS_BLOCK_BLK_DEF_MIN_SECTORS;
server->server.device_sectors = bdev->bd_part->nr_sects /
VS_BLOCK_BLK_DEF_MIN_SECTORS;
if (bdev_read_only(bdev))
server->server.readonly = true;
server->server.flushable = true;
server->server.committable = true;
return bdev;
}
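/*
 * Allocate and initialise the server state for a new service: attach the
 * backing block device, create the sysfs attributes, and pre-allocate
 * the bioset used for request bios.
 */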
static struct vs_server_block_state *
vs_block_server_alloc(struct vs_service_device *service)
{
struct block_server *server;
int err;
server = kzalloc(sizeof(*server), GFP_KERNEL);
if (!server)
return NULL;
server->service = service;
server->started = false;
INIT_LIST_HEAD(&server->bounce_req_queue);
INIT_WORK(&server->bounce_req_work, vs_block_server_write_bounce_work);
spin_lock_init(&server->bounce_req_lock);
atomic_set(&server->submitted_req_count, 0);
init_waitqueue_head(&server->submitted_req_wq);
INIT_LIST_HEAD(&server->completed_req_queue);
INIT_WORK(&server->completed_req_work,
vs_block_server_complete_requests_work);
spin_lock_init(&server->completed_req_lock);
server->bdev = vs_block_server_attach_block_device(server);
if (IS_ERR(server->bdev)) {
dev_err(&server->service->dev,
"No appropriate block device was found to satisfy the service name %s - error %ld\n",
server->service->name, PTR_ERR(server->bdev));
goto fail_attach_device;
}
dev_set_drvdata(&service->dev, &server->server);
err = sysfs_create_group(&service->dev.kobj,
&vs_block_server_attr_group);
if (err) {
dev_err(&service->dev,
"Failed to create attribute group for service %s\n",
service->name);
goto fail_create_group;
}
/*
 * We know the upper bound on the number of simultaneously active bios
 * (i.e. the smaller of the in-quota and the sum of the read and write
 * command tag limits), so we can pre-allocate that many and hopefully
 * never fail to allocate one in a request handler.
 *
 * However, allocation may still fail if the number of pages (and thus
 * bvecs) in a request exceeds BIO_INLINE_VECS (which is hard-coded to
 * 4 in all mainline kernels). That possibility is the only reason we
 * can't enable rx_atomic for this driver.
 */
server->bioset = bioset_create(min_t(unsigned, service->recv_quota,
VSERVICE_BLOCK_IO_READ_MAX_PENDING +
VSERVICE_BLOCK_IO_WRITE_MAX_PENDING),
offsetof(struct block_server_request, bio));
if (!server->bioset) {
dev_err(&service->dev,
"Failed to allocate bioset for service %s\n",
service->name);
goto fail_create_bioset;
}
dev_dbg(&service->dev, "New block server %pK\n", server);
return &server->server;
fail_create_bioset:
sysfs_remove_group(&server->service->dev.kobj,
&vs_block_server_attr_group);
fail_create_group:
dev_set_drvdata(&service->dev, NULL);
blkdev_put(server->bdev, VS_BLOCK_BLKDEV_DEFAULT_MODE);
fail_attach_device:
kfree(server);
return NULL;
}
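/*
 * Tear down a server: flush the work queues, release the block device,
 * remove the sysfs attributes, and free the bioset.
 */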
static void vs_block_server_release(struct vs_server_block_state *state)
{
struct block_server *server = state_to_block_server(state);
cancel_work_sync(&server->bounce_req_work);
cancel_work_sync(&server->completed_req_work);
blkdev_put(server->bdev, VS_BLOCK_BLKDEV_DEFAULT_MODE);
sysfs_remove_group(&server->service->dev.kobj,
&vs_block_server_attr_group);
bioset_free(server->bioset);
kfree(server);
}
static struct vs_server_block block_server_driver = {
.alloc = vs_block_server_alloc,
.release = vs_block_server_release,
.open = vs_block_server_open,
.closed = vs_block_server_closed,
.tx_ready = vs_block_server_tx_ready,
.io = {
.req_read = vs_block_server_io_req_read,
.req_write = vs_block_server_io_req_write,
},
/* Large default quota for batching read/write commands */
.in_quota_best = 32,
.out_quota_best = 32,
};
static int __init vs_block_server_init(void)
{
return vservice_block_server_register(&block_server_driver,
"block_server_driver");
}
static void __exit vs_block_server_exit(void)
{
vservice_block_server_unregister(&block_server_driver);
}
module_init(vs_block_server_init);
module_exit(vs_block_server_exit);
MODULE_DESCRIPTION("OKL4 Virtual Services Block Server Driver");
MODULE_AUTHOR("Open Kernel Labs, Inc");