| /* |
| * drivers/block/vs_block_server.c |
| * |
| * Copyright (c) 2012-2018 General Dynamics |
| * Copyright (c) 2014 Open Kernel Labs, Inc. |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License version 2 as |
| * published by the Free Software Foundation. |
| * |
| * block vservice server driver |
| * |
| */ |
| #include <linux/device.h> |
| #include <linux/slab.h> |
| #include <linux/init.h> |
| #include <linux/genhd.h> |
| #include <linux/fs.h> |
| #include <linux/bio.h> |
| #include <linux/blkdev.h> |
| #include <linux/spinlock.h> |
| #include <linux/module.h> |
| |
| #include <vservices/types.h> |
| #include <vservices/buffer.h> |
| #include <vservices/protocol/block/types.h> |
| #include <vservices/protocol/block/common.h> |
| #include <vservices/protocol/block/server.h> |
| #include <vservices/protocol/block/client.h> |
| #include <vservices/service.h> |
| #include <vservices/wait.h> |
| |
| #define VS_BLOCK_BLKDEV_DEFAULT_MODE FMODE_READ |
| #define VS_BLOCK_BLK_DEF_SECTOR_SIZE 512 |
| |
| /* |
| * Metadata for a request. Note that the bio must be embedded at the end of |
| * this structure, because it is allocated from a bioset. |
| */ |
| struct block_server_request { |
| struct block_server *server; |
| u32 tagid; |
| u32 size; |
| int op_err; |
| struct list_head list; |
| struct vs_pbuf pbuf; |
| struct vs_mbuf *mbuf; |
| bool bounced; |
| bool submitted; |
| |
| struct bio bio; |
| }; |
| |
| struct block_server { |
| struct vs_server_block_state server; |
| struct vs_service_device *service; |
| |
| struct block_device *bdev; |
| struct bio_set *bioset; |
| |
| unsigned int sector_size; |
| bool started; |
| |
| /* Bounced writes are deferred to keep memcpy off service queue */ |
| struct list_head bounce_req_queue; |
| struct work_struct bounce_req_work; |
| spinlock_t bounce_req_lock; |
| |
| /* Count of outstanding requests submitted to block layer */ |
| atomic_t submitted_req_count; |
| wait_queue_head_t submitted_req_wq; |
| |
| /* Completions are deferred because end_io may be in atomic context */ |
| struct list_head completed_req_queue; |
| struct work_struct completed_req_work; |
| spinlock_t completed_req_lock; |
| }; |
| |
| #define state_to_block_server(state) \ |
| container_of(state, struct block_server, server) |
| |
| #define dev_to_block_server(dev) \ |
| state_to_block_server(dev_get_drvdata(dev)) |
| |
| static inline vservice_block_block_io_error_t |
| block_server_linux_to_vs_error(int err) |
| { |
| /* |
| * This list is not exhaustive. For all other errors, we return |
| * unsupported_command. |
| */ |
| switch (err) { |
| case -ECOMM: |
| case -EIO: |
| case -ENOMEM: |
| return VSERVICE_BLOCK_MEDIA_FAILURE; |
| case -ETIME: |
| case -ETIMEDOUT: |
| return VSERVICE_BLOCK_MEDIA_TIMEOUT; |
| case -EILSEQ: |
| return VSERVICE_BLOCK_INVALID_INDEX; |
| default: |
| if (err) |
| return VSERVICE_BLOCK_UNSUPPORTED_COMMAND; |
| return 0; |
| } |
| |
| return 0; |
| } |
| |
| static inline u32 vs_req_num_sectors(struct block_server *server, |
| struct block_server_request *req) |
| { |
| return req->size / server->sector_size; |
| } |
| |
| static inline u64 vs_req_sector_index(struct block_server_request *req) |
| { |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) |
| return req->bio.bi_iter.bi_sector; |
| #else |
| return req->bio.bi_sector; |
| #endif |
| } |
| |
| static void vs_block_server_closed(struct vs_server_block_state *state) |
| { |
| struct block_server *server = state_to_block_server(state); |
| struct block_server_request *req; |
| |
| /* |
| * Fail all requests that haven't been sent to the block layer yet. |
| */ |
| spin_lock(&server->bounce_req_lock); |
| while (!list_empty(&server->bounce_req_queue)) { |
| req = list_first_entry(&server->bounce_req_queue, |
| struct block_server_request, list); |
| list_del(&req->list); |
| spin_unlock(&server->bounce_req_lock); |
| bio_io_error(&req->bio); |
| spin_lock(&server->bounce_req_lock); |
| } |
| spin_unlock(&server->bounce_req_lock); |
| |
| /* |
| * Wait until all outstanding requests to the block layer are |
| * complete. |
| */ |
| wait_event(server->submitted_req_wq, |
| !atomic_read(&server->submitted_req_count)); |
| |
| /* |
| * Discard all the completed requests. |
| */ |
| spin_lock_irq(&server->completed_req_lock); |
| while (!list_empty(&server->completed_req_queue)) { |
| req = list_first_entry(&server->completed_req_queue, |
| struct block_server_request, list); |
| list_del(&req->list); |
| if (req->mbuf) { |
| spin_unlock_irq(&server->completed_req_lock); |
| if (bio_data_dir(&req->bio) == WRITE) |
| vs_server_block_io_free_req_write(state, |
| &req->pbuf, req->mbuf); |
| else |
| vs_server_block_io_free_ack_read(state, |
| &req->pbuf, req->mbuf); |
| spin_lock_irq(&server->completed_req_lock); |
| } |
| bio_put(&req->bio); |
| } |
| spin_unlock_irq(&server->completed_req_lock); |
| } |
| |
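/*
 * sysfs "readonly" attribute. Writing a non-zero value exports the service
 * read-only; writing zero re-enables writes, unless the underlying block
 * device is itself read-only. Changes are rejected once the service has
 * been started.
 */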
| static ssize_t |
| vs_block_server_readonly_store(struct device *dev, |
| struct device_attribute *attr, const char *buf, size_t count) |
| { |
| struct block_server *server = dev_to_block_server(dev); |
| int err; |
| unsigned long val; |
| |
| vs_service_state_lock(server->service); |
| if (server->started) { |
| err = -EBUSY; |
| goto unlock; |
| } |
| |
| err = kstrtoul(buf, 0, &val); |
| if (err) |
| goto unlock; |
| |
| if (bdev_read_only(server->bdev) && !val) { |
| dev_info(dev, |
| "Cannot set %s to read/write: read-only device\n", |
| server->service->name); |
| err = -EINVAL; |
| goto unlock; |
| } |
| |
| server->server.readonly = val; |
| err = count; |
| |
| unlock: |
| vs_service_state_unlock(server->service); |
| |
| return err; |
| } |
| |
| static ssize_t |
| vs_block_server_readonly_show(struct device *dev, |
| struct device_attribute *attr, char *buf) |
| { |
| struct block_server *server = dev_to_block_server(dev); |
| int cnt; |
| |
| vs_service_state_lock(server->service); |
| cnt = scnprintf(buf, PAGE_SIZE, "%d\n", server->server.readonly); |
| vs_service_state_unlock(server->service); |
| |
| return cnt; |
| } |
| |
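/*
 * sysfs "start" attribute. Writing a non-zero value marks the server as
 * started and, if a client open request is pending explicit completion,
 * acknowledges it. Stopping a started server is not supported.
 */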
| static ssize_t |
| vs_block_server_start_store(struct device *dev, |
| struct device_attribute *attr, const char *buf, size_t count) |
| { |
| struct block_server *server = dev_to_block_server(dev); |
| int err; |
| unsigned long val; |
| |
| vs_service_state_lock(server->service); |
| |
| err = kstrtoul(buf, 0, &val); |
| if (err) |
| goto unlock; |
| |
| if (!val && server->started) { |
| err = -EBUSY; |
| goto unlock; |
| } |
| |
| if (val && !server->started) { |
| server->started = true; |
| |
| if (server->server.state.base.statenum == |
| VSERVICE_BASE_STATE_CLOSED__OPEN) |
| vs_server_block_open_complete(&server->server, |
| VS_SERVER_RESP_SUCCESS); |
| } |
| |
| err = count; |
| unlock: |
| vs_service_state_unlock(server->service); |
| |
| return err; |
| } |
| |
| static ssize_t |
| vs_block_server_start_show(struct device *dev, |
| struct device_attribute *attr, char *buf) |
| { |
| struct block_server *server = dev_to_block_server(dev); |
| int cnt; |
| |
| vs_service_state_lock(server->service); |
| cnt = scnprintf(buf, PAGE_SIZE, "%d\n", server->started); |
| vs_service_state_unlock(server->service); |
| |
| return cnt; |
| } |
| |
| static DEVICE_ATTR(start, S_IWUSR | S_IRUSR, vs_block_server_start_show, |
| vs_block_server_start_store); |
| static DEVICE_ATTR(readonly, S_IWUSR | S_IRUSR, vs_block_server_readonly_show, |
| vs_block_server_readonly_store); |
| |
| static struct attribute *vs_block_server_dev_attrs[] = { |
| &dev_attr_start.attr, |
| &dev_attr_readonly.attr, |
| NULL, |
| }; |
| |
| static const struct attribute_group vs_block_server_attr_group = { |
| .attrs = vs_block_server_dev_attrs |
| }; |
| |
| /* |
| * Invoked by vs_server_block_handle_req_open() after receiving open |
| * requests to perform server specific initialisations |
| * |
| * The "delayed start" feature can be enforced here |
| */ |
| static vs_server_response_type_t |
vs_block_server_open(struct vs_server_block_state *_state)
| { |
| struct block_server *server = state_to_block_server(_state); |
| |
| return (server->started) ? VS_SERVER_RESP_SUCCESS : |
| VS_SERVER_RESP_EXPLICIT_COMPLETE; |
| } |
| |
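/*
 * Send the reply for a completed read bio. On I/O error a nack is sent.
 * Otherwise, if the read used a bounce buffer, an ack_read mbuf is allocated
 * here (one may not have been available at submission time) and the bounced
 * pages are copied into its pbuf before the ack is sent. Returns -ENOBUFS if
 * the reply could not be sent because the server is over quota; the caller
 * requeues the request and retries from the tx_ready handler.
 */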
| static int |
| vs_block_server_complete_req_read(struct block_server_request *req) |
| { |
| struct block_server *server = req->server; |
| struct vs_server_block_state *state = &server->server; |
| int err = -EIO; |
| |
| if (req->op_err) { |
| err = req->op_err; |
| dev_dbg(&server->service->dev, |
| "read nack, err %d sector 0x%llx num 0x%x\n", |
| err, vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| |
| if (req->mbuf) |
| vs_server_block_io_free_ack_read(state, &req->pbuf, |
| req->mbuf); |
| |
| err = vs_server_block_io_send_nack_read(state, req->tagid, |
| block_server_linux_to_vs_error(err), |
| GFP_KERNEL); |
| } else { |
| if (req->bounced && !req->mbuf) { |
| req->mbuf = vs_server_block_io_alloc_ack_read( |
| &server->server, &req->pbuf, |
| GFP_KERNEL); |
| if (IS_ERR(req->mbuf)) { |
| err = PTR_ERR(req->mbuf); |
| req->mbuf = NULL; |
| } |
| } |
| |
| if (req->bounced && req->mbuf) { |
| int i; |
| struct bio_vec *bv; |
| void *data = req->pbuf.data; |
| |
| if (vs_pbuf_resize(&req->pbuf, req->size) < 0) { |
| bio_io_error(&req->bio); |
| return 0; |
| } |
| |
| bio_for_each_segment_all(bv, &req->bio, i) { |
| memcpy(data, page_address(bv->bv_page) + |
| bv->bv_offset, bv->bv_len); |
| data += bv->bv_len; |
| __free_page(bv->bv_page); |
| } |
| req->bounced = false; |
| } |
| |
| if (req->mbuf) { |
| dev_vdbg(&server->service->dev, |
| "read ack, sector 0x%llx num 0x%x\n", |
| vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| |
| err = vs_server_block_io_send_ack_read(state, |
| req->tagid, req->pbuf, req->mbuf); |
| |
| if (err && (err != -ENOBUFS)) { |
| vs_server_block_io_free_ack_read(state, |
| &req->pbuf, req->mbuf); |
| req->mbuf = NULL; |
| } |
| } else { |
| WARN_ON(!err || !req->bounced); |
| } |
| } |
| |
| if (err && (err != -ENOBUFS)) |
| dev_dbg(&server->service->dev, |
| "error %d sending read reply\n", err); |
| else if (err == -ENOBUFS) |
| dev_vdbg(&server->service->dev, "out of quota, will retry\n"); |
| |
| return err; |
| } |
| |
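/* Send the reply (ack or nack) for a completed write bio. */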
| static int |
| vs_block_server_complete_req_write(struct block_server_request *req) |
| { |
| struct block_server *server = req->server; |
| struct vs_server_block_state *state = &server->server; |
| int err; |
| |
| WARN_ON(req->mbuf); |
| |
| if (req->op_err) { |
| dev_dbg(&server->service->dev, |
| "write nack, err %d sector 0x%llx num 0x%x\n", |
| req->op_err, vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| |
| err = vs_server_block_io_send_nack_write(state, req->tagid, |
| block_server_linux_to_vs_error(req->op_err), |
| GFP_KERNEL); |
| } else { |
| dev_vdbg(&server->service->dev, |
| "write ack, sector 0x%llx num 0x%x\n", |
| vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| |
| err = vs_server_block_io_send_ack_write(state, req->tagid, |
| GFP_KERNEL); |
| } |
| |
| if (err && (err != -ENOBUFS)) |
| dev_dbg(&server->service->dev, |
| "error %d sending write reply\n", err); |
| else if (err == -ENOBUFS) |
| dev_vdbg(&server->service->dev, "out of quota, will retry\n"); |
| |
| return err; |
| } |
| |
| static int vs_block_server_complete_req(struct block_server *server, |
| struct block_server_request *req) |
| { |
| int err; |
| |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) |
| req->bio.bi_iter.bi_idx = 0; |
| #else |
| req->bio.bi_idx = 0; |
| #endif |
| if (!vs_state_lock_safe(&server->server)) |
| return -ENOLINK; |
| |
| if (bio_data_dir(&req->bio) == WRITE) |
| err = vs_block_server_complete_req_write(req); |
| else |
| err = vs_block_server_complete_req_read(req); |
| |
| vs_state_unlock(&server->server); |
| |
| if (err == -ENOBUFS) |
		dev_vdbg(&server->service->dev,
			"bio %pK response out of quota, will retry\n",
			&req->bio);
| |
| return err; |
| } |
| |
| static void vs_block_server_complete_requests_work(struct work_struct *work) |
| { |
| struct block_server *server = container_of(work, struct block_server, |
| completed_req_work); |
| struct block_server_request *req; |
| |
| vs_service_send_batch_start(server->service, false); |
| |
| /* |
| * Send ack/nack responses for each completed request. If a request |
| * cannot be sent because we are over-quota then this function will |
| * return with a non-empty list, and the tx_ready handler will |
| * reschedule us when we are back under quota. In all other cases |
| * this function will return with an empty list. |
| */ |
| spin_lock_irq(&server->completed_req_lock); |
| while (!list_empty(&server->completed_req_queue)) { |
| int err; |
| req = list_first_entry(&server->completed_req_queue, |
| struct block_server_request, list); |
| dev_vdbg(&server->service->dev, "complete bio %pK\n", &req->bio); |
| list_del(&req->list); |
| spin_unlock_irq(&server->completed_req_lock); |
| |
| err = vs_block_server_complete_req(server, req); |
| if (err == -ENOBUFS) { |
| dev_vdbg(&server->service->dev, "defer bio %pK\n", &req->bio); |
| /* |
| * Couldn't send the completion; re-queue the request |
| * and exit. We'll start again when more quota becomes |
| * available. |
| */ |
| spin_lock_irq(&server->completed_req_lock); |
| list_add_tail(&req->list, |
| &server->completed_req_queue); |
| break; |
| } |
| |
| dev_vdbg(&server->service->dev, "free bio %pK err %d\n", &req->bio, err); |
| bio_put(&req->bio); |
| |
| spin_lock_irq(&server->completed_req_lock); |
| } |
| spin_unlock_irq(&server->completed_req_lock); |
| |
| vs_service_send_batch_end(server->service, true); |
| } |
| |
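/*
 * Called when more send quota becomes available; kick the completion work
 * so that any replies deferred with -ENOBUFS are retried.
 */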
| static int vs_block_server_tx_ready(struct vs_server_block_state *state) |
| { |
| struct block_server *server = state_to_block_server(state); |
| |
| schedule_work(&server->completed_req_work); |
| |
| return 0; |
| } |
| |
| static bool vs_block_can_map_pbuf(struct request_queue *q, |
| struct vs_pbuf *pbuf, size_t size) |
| { |
| /* The pbuf must satisfy the driver's alignment requirements. */ |
| if (!blk_rq_aligned(q, (unsigned long)pbuf->data, size)) |
| return false; |
| |
| /* |
 * bios can only contain pages. Sometimes the pbuf is in an IO region
| * that has no struct page (e.g. a channel primary buffer), in which |
| * case we can't map it into a bio. |
| */ |
| /* FIXME: Redmine issue #930 - philip. */ |
| if (!pfn_valid(__pa(pbuf->data) >> PAGE_SHIFT)) |
| return false; |
| |
| return true; |
| } |
| |
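/*
 * Map a virtually contiguous pbuf into a bio, one page at a time. Assumes
 * vs_block_can_map_pbuf() has already confirmed that every page backing the
 * pbuf has a valid struct page.
 */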
| static int vs_block_bio_map_pbuf(struct bio *bio, struct vs_pbuf *pbuf) |
| { |
| int offset = offset_in_page((unsigned long)pbuf->data); |
| void *ptr = pbuf->data; |
| int size = pbuf->size; |
| |
| while (size > 0) { |
| unsigned bytes = min_t(unsigned, PAGE_SIZE - offset, size); |
| |
| if (bio_add_page(bio, virt_to_page(ptr), bytes, |
| offset) < bytes) |
| return -EIO; |
| |
| ptr += bytes; |
| size -= bytes; |
| offset = 0; |
| } |
| |
| return 0; |
| } |
| |
| /* Read request handling */ |
| #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) |
| static void vs_block_server_read_done(struct bio *bio, int err) |
| #else |
| static void vs_block_server_read_done(struct bio *bio) |
| #endif |
| { |
| unsigned long flags; |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) |
| int err = bio->bi_error; |
| #endif |
| struct block_server_request *req = container_of(bio, |
| struct block_server_request, bio); |
| struct block_server *server = req->server; |

	req->op_err = err;
| |
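	/*
	 * Requests that already hold a reply mbuf are queued at the head of
	 * the completion list, presumably so that their message buffers (and
	 * the quota they consume) are released as early as possible; bounced
	 * reads still need to allocate an mbuf at completion time and so go
	 * to the tail.
	 */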
| spin_lock_irqsave(&server->completed_req_lock, flags); |
| if (req->mbuf) |
| list_add(&req->list, &server->completed_req_queue); |
| else |
| list_add_tail(&req->list, &server->completed_req_queue); |
| spin_unlock_irqrestore(&server->completed_req_lock, flags); |
| |
| if (req->submitted && atomic_dec_and_test(&server->submitted_req_count)) |
| wake_up_all(&server->submitted_req_wq); |
| |
| schedule_work(&server->completed_req_work); |
| } |
| |
| /* |
| * TODO: this may need to split and chain the bio if it exceeds the physical |
 * segment limit of the device. It is not clear whose responsibility that
 * is; the request queue might do it for us (if there is one).
| */ |
| #define vs_block_make_request(bio) generic_make_request(bio) |
| |
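/*
 * Build and submit the bio for a read request. If the reply pbuf can be
 * mapped directly it is used as the data buffer; otherwise anonymous bounce
 * pages are allocated here and copied into the pbuf on completion. Errors
 * are delivered through the bio's end_io handler, so this always returns 0.
 */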
| static int vs_block_submit_read(struct block_server *server, |
| struct block_server_request *req, gfp_t gfp) |
| { |
| struct request_queue *q = bdev_get_queue(server->bdev); |
| struct bio *bio = &req->bio; |
| int size = req->size; |
| int err = 0; |
| |
| if (req->mbuf && vs_block_can_map_pbuf(q, &req->pbuf, size)) { |
| /* |
| * The mbuf is valid and the driver can directly access the |
| * pbuf, so we don't need a bounce buffer. Map the pbuf |
| * directly into the bio. |
| */ |
| if (vs_pbuf_resize(&req->pbuf, size) < 0) |
| err = -EIO; |
| if (!err) |
| err = vs_block_bio_map_pbuf(bio, &req->pbuf); |
| } else { |
| /* We need a bounce buffer. First set up the bvecs. */ |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) |
| bio->bi_iter.bi_size = size; |
| #else |
| bio->bi_size = size; |
| #endif |
| |
| while (size > 0) { |
| struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt]; |
| |
| BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); |
| |
| bvec->bv_page = NULL; /* Allocated below */ |
| bvec->bv_len = min_t(unsigned, PAGE_SIZE, size); |
| bvec->bv_offset = 0; |
| |
| bio->bi_vcnt++; |
| size -= bvec->bv_len; |
| } |
| |
| err = bio_alloc_pages(bio, gfp); |
| if (!err) { |
| blk_recount_segments(q, bio); |
| req->bounced = true; |
| } |
| } |
| |
| if (err) { |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) |
| bio->bi_error = err; |
| bio_endio(bio); |
| #else |
| bio_endio(bio, err); |
| #endif |
| } else { |
| dev_vdbg(&server->service->dev, |
| "submit read req sector %#llx count %#x\n", |
| vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| req->submitted = true; |
| atomic_inc(&server->submitted_req_count); |
| vs_block_make_request(bio); |
| } |
| |
| return 0; |
| } |
| |
| static int vs_block_server_io_req_read(struct vs_server_block_state *state, |
| u32 tagid, u64 sector_index, u32 num_sects, bool nodelay, |
| bool flush) |
| { |
| struct block_server *server = state_to_block_server(state); |
| struct bio *bio; |
| struct block_server_request *req; |
| unsigned size = num_sects * server->sector_size; |
| unsigned op_flags = 0; |
| |
| /* |
| * This nr_pages calculation assumes that the pbuf data is offset from |
| * the start of the size-aligned message buffer by more than 0 but |
| * less than one sector, which is always true for the current message |
| * layout generated by mill when we assume 512-byte sectors. |
| */ |
| unsigned nr_pages = 1 + (size >> PAGE_SHIFT); |
| |
| bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, server->bioset); |
| if (!bio) |
| return -ENOMEM; |
| dev_vdbg(&server->service->dev, "alloc r bio %pK\n", bio); |
| req = container_of(bio, struct block_server_request, bio); |
| |
| req->server = server; |
| req->tagid = tagid; |
| req->op_err = 0; |
| req->mbuf = NULL; |
| req->size = size; |
| req->bounced = false; |
| req->submitted = false; |
| |
| if (flush) { |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0) |
| op_flags |= REQ_PREFLUSH; |
| #else |
| op_flags |= REQ_FLUSH; |
| #endif |
| } |
| if (nodelay) { |
| op_flags |= REQ_SYNC; |
| } |
| |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) |
| bio->bi_iter.bi_sector = (sector_t)sector_index; |
| #else |
| bio->bi_sector = (sector_t)sector_index; |
| #endif |
| bio->bi_bdev = server->bdev; |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) |
| bio_set_op_attrs(bio, REQ_OP_READ, op_flags); |
| #else |
| bio->bi_rw = READ | op_flags; |
| #endif |
| bio->bi_end_io = vs_block_server_read_done; |
| |
| req->mbuf = vs_server_block_io_alloc_ack_read(state, &req->pbuf, |
| GFP_KERNEL); |
| if (IS_ERR(req->mbuf) && (PTR_ERR(req->mbuf) == -ENOBUFS)) { |
| /* Fall back to a bounce buffer */ |
| req->mbuf = NULL; |
| } else if (IS_ERR(req->mbuf)) { |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) |
| bio->bi_error = PTR_ERR(req->mbuf); |
| bio_endio(bio); |
| #else |
| bio_endio(bio, PTR_ERR(req->mbuf)); |
| #endif |
| return 0; |
| } |
| |
| return vs_block_submit_read(server, req, GFP_KERNEL); |
| } |
| |
| /* Write request handling */ |
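/*
 * Allocate bounce pages for a deferred write, copy the pbuf contents into
 * them, release the request's mbuf, and submit the bio. This runs from the
 * bounce work queue so that the page allocation and memcpy happen outside
 * the service state lock. Returns -ENOMEM if page allocation fails so the
 * caller can requeue the request and retry.
 */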
| static int vs_block_submit_bounced_write(struct block_server *server, |
| struct block_server_request *req, gfp_t gfp) |
| { |
| struct bio *bio = &req->bio; |
| void *data = req->pbuf.data; |
| struct bio_vec *bv; |
| int i; |
| |
| if (bio_alloc_pages(bio, gfp | __GFP_NOWARN) < 0) |
| return -ENOMEM; |
| blk_recount_segments(bdev_get_queue(server->bdev), bio); |
| req->bounced = true; |
| |
| /* Copy all the data into the bounce buffer */ |
| bio_for_each_segment_all(bv, bio, i) { |
| memcpy(page_address(bv->bv_page) + bv->bv_offset, data, |
| bv->bv_len); |
| data += bv->bv_len; |
| } |
| |
| vs_server_block_io_free_req_write(&server->server, &req->pbuf, |
| req->mbuf); |
| req->mbuf = NULL; |
| |
| dev_vdbg(&server->service->dev, |
| "submit bounced write req sector %#llx count %#x\n", |
| vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| req->submitted = true; |
| atomic_inc(&server->submitted_req_count); |
| vs_block_make_request(bio); |
| |
| return 0; |
| } |
| |
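/*
 * Work function that drains the bounce queue for writes. If page allocation
 * fails, the request is put back at the head of the queue and the work is
 * rescheduled to retry once memory is available.
 */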
| static void vs_block_server_write_bounce_work(struct work_struct *work) |
| { |
| struct block_server *server = container_of(work, struct block_server, |
| bounce_req_work); |
| struct block_server_request *req; |
| |
| spin_lock(&server->bounce_req_lock); |
| while (!list_empty(&server->bounce_req_queue)) { |
| req = list_first_entry(&server->bounce_req_queue, |
| struct block_server_request, list); |
| dev_vdbg(&server->service->dev, "write bio %pK\n", &req->bio); |
| list_del(&req->list); |
| spin_unlock(&server->bounce_req_lock); |
| |
| if (vs_block_submit_bounced_write(server, req, |
| GFP_KERNEL) == -ENOMEM) { |
| spin_lock(&server->bounce_req_lock); |
| list_add(&req->list, &server->bounce_req_queue); |
| spin_unlock(&server->bounce_req_lock); |
| schedule_work(work); |
| return; |
| } |
| |
| spin_lock(&server->bounce_req_lock); |
| } |
| spin_unlock(&server->bounce_req_lock); |
| } |
| |
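/*
 * bio completion handler for writes: free the bounce pages or the request
 * mbuf, then queue the request on the completion list. The reply itself is
 * sent from the completion work function, since end_io may be called in
 * atomic context.
 */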
| #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) |
| static void vs_block_server_write_done(struct bio *bio, int err) |
| #else |
| static void vs_block_server_write_done(struct bio *bio) |
| #endif |
| { |
| unsigned long flags; |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) |
| int err = bio->bi_error; |
| #endif |
| struct block_server_request *req = container_of(bio, |
| struct block_server_request, bio); |
| struct block_server *server = req->server; |
| |
| if (req->bounced) { |
| int i; |
| struct bio_vec *bv; |
| bio_for_each_segment_all(bv, bio, i) |
| __free_page(bv->bv_page); |
| } else if (req->mbuf) { |
| vs_server_block_io_free_req_write(&server->server, &req->pbuf, |
| req->mbuf); |
| req->mbuf = NULL; |
| } |
| |
| if (req->submitted && atomic_dec_and_test(&server->submitted_req_count)) |
| wake_up_all(&server->submitted_req_wq); |
| |
| req->op_err = err; |
| |
| spin_lock_irqsave(&server->completed_req_lock, flags); |
| list_add_tail(&req->list, &server->completed_req_queue); |
| spin_unlock_irqrestore(&server->completed_req_lock, flags); |
| |
| schedule_work(&server->completed_req_work); |
| } |
| |
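/*
 * Handler for an io_write request. The client's pbuf is mapped directly
 * into the bio when the driver's alignment and struct-page requirements
 * allow it; otherwise only the bvecs are set up here and the request is
 * deferred to the bounce work queue, which allocates pages and copies the
 * data before submission.
 */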
| static int vs_block_server_io_req_write(struct vs_server_block_state *state, |
| u32 tagid, u64 sector_index, u32 num_sects, bool nodelay, |
| bool flush, bool commit, struct vs_pbuf pbuf, struct vs_mbuf *mbuf) |
| { |
| struct block_server *server = state_to_block_server(state); |
| struct request_queue *q = bdev_get_queue(server->bdev); |
| struct bio *bio; |
| struct block_server_request *req; |
| unsigned long data = (unsigned long)pbuf.data; |
| unsigned long start = data >> PAGE_SHIFT; |
| unsigned long end = (data + pbuf.size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| int err; |
| unsigned op_flags = 0; |
| |
| bio = bio_alloc_bioset(GFP_KERNEL, end - start, server->bioset); |
| if (!bio) |
| return -ENOMEM; |
| dev_vdbg(&server->service->dev, "alloc w bio %pK\n", bio); |
| req = container_of(bio, struct block_server_request, bio); |
| |
| req->server = server; |
| req->tagid = tagid; |
| req->op_err = 0; |
| req->mbuf = mbuf; |
| req->pbuf = pbuf; |
| req->size = server->sector_size * num_sects; |
| req->bounced = false; |
| req->submitted = false; |
| |
| if (flush) { |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0) |
| op_flags |= REQ_PREFLUSH; |
| #else |
| op_flags |= REQ_FLUSH; |
| #endif |
| } |
| if (commit) { |
| op_flags |= REQ_FUA; |
| } |
| if (nodelay) { |
| op_flags |= REQ_SYNC; |
| } |
| |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) |
| bio->bi_iter.bi_sector = (sector_t)sector_index; |
| #else |
| bio->bi_sector = (sector_t)sector_index; |
| #endif |
| bio->bi_bdev = server->bdev; |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) |
| bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags); |
| #else |
| bio->bi_rw = WRITE | op_flags; |
| #endif |
| bio->bi_end_io = vs_block_server_write_done; |
| |
| if (pbuf.size < req->size) { |
| err = -EINVAL; |
| goto fail_bio; |
| } |
| if (WARN_ON(pbuf.size > req->size)) |
| pbuf.size = req->size; |
| |
| if (state->readonly) { |
| err = -EROFS; |
| goto fail_bio; |
| } |
| |
| if (!vs_block_can_map_pbuf(q, &req->pbuf, req->pbuf.size)) { |
| /* We need a bounce buffer. First set up the bvecs. */ |
| int size = pbuf.size; |
| |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) |
| bio->bi_iter.bi_size = size; |
| #else |
| bio->bi_size = size; |
| #endif |
| |
| while (size > 0) { |
| struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt]; |
| |
| BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); |
| |
| bvec->bv_page = NULL; /* Allocated later */ |
| bvec->bv_len = min_t(unsigned, PAGE_SIZE, size); |
| bvec->bv_offset = 0; |
| |
| bio->bi_vcnt++; |
| size -= bvec->bv_len; |
| } |
| |
| /* |
| * Defer the rest so we don't have to hold the state lock |
| * during alloc_page & memcpy |
| */ |
| spin_lock(&server->bounce_req_lock); |
| list_add_tail(&req->list, &server->bounce_req_queue); |
| spin_unlock(&server->bounce_req_lock); |
| schedule_work(&server->bounce_req_work); |
| |
| return 0; |
| } |
| |
| /* No bounce needed; map the pbuf directly. */ |
| err = vs_block_bio_map_pbuf(bio, &pbuf); |
| if (err < 0) |
| goto fail_bio; |
| |
| dev_vdbg(&server->service->dev, |
| "submit direct write req sector %#llx count %#x\n", |
| vs_req_sector_index(req), |
| vs_req_num_sectors(server, req)); |
| req->submitted = true; |
| atomic_inc(&server->submitted_req_count); |
| vs_block_make_request(bio); |
| |
| return 0; |
| |
| fail_bio: |
| #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0) |
| bio->bi_error = err; |
| bio_endio(bio); |
| #else |
| bio_endio(bio, err); |
| #endif |
| return 0; |
| } |
| |
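/*
 * Fall-back lookup: scan the block class for a device whose kernel name
 * matches the service name, and open it by dev_t (major/minor).
 */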
| static struct block_device * |
| vs_block_server_find_by_name(struct block_server *server) |
| { |
| struct block_device *bdev = NULL; |
| struct class_dev_iter iter; |
| struct device *dev; |
| |
| class_dev_iter_init(&iter, &block_class, NULL, NULL); |
	while ((dev = class_dev_iter_next(&iter)) != NULL) {
| |
| if (strcmp(dev_name(dev), server->service->name) == 0) { |
| bdev = blkdev_get_by_dev(dev->devt, |
| VS_BLOCK_BLKDEV_DEFAULT_MODE, NULL); |
| if (!IS_ERR_OR_NULL(bdev)) |
| break; |
| } |
| } |
| class_dev_iter_exit(&iter); |
| |
| if (!dev || IS_ERR_OR_NULL(bdev)) |
| return ERR_PTR(-ENODEV); |
| |
| dev_dbg(&server->service->dev, "Attached to block device %s (%d:%d)\n", |
| dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); |
| return bdev; |
| } |
| |
| static struct block_device * |
| vs_block_server_find_by_path(struct block_server *server, const char *base_path) |
| { |
| struct block_device *bdev; |
| char *bdev_path; |
| |
| bdev_path = kasprintf(GFP_KERNEL, "%s/%s", base_path, |
| server->service->name); |
| if (!bdev_path) |
| return ERR_PTR(-ENOMEM); |
| |
	bdev = blkdev_get_by_path(bdev_path, VS_BLOCK_BLKDEV_DEFAULT_MODE,
			NULL);
	if (!IS_ERR_OR_NULL(bdev))
		dev_dbg(&server->service->dev,
				"Attached to block device %s\n", bdev_path);

	kfree(bdev_path);

	if (!bdev)
		return ERR_PTR(-ENODEV);
	return bdev;
| } |
| |
| static struct block_device * |
| vs_block_server_attach_block_device(struct block_server *server) |
| { |
| const char *paths[] = { |
| "/dev", |
| "/dev/block", |
| "/dev/mapper", |
| "/dev/disk/by-partlabel", |
| "/dev/disk/by-label", |
| "/dev/disk/by-partuuid", |
| "/dev/disk/by-uuid" |
| }; |
| struct block_device *bdev; |
| int i; |
| |
| /* |
| * Try first to look the block device up by path. This is done because |
| * the name exposed to user-space in /dev/ is not necessarily the name |
| * being used inside the kernel for the device. |
| */ |
| for (i = 0; i < ARRAY_SIZE(paths); i++) { |
| bdev = vs_block_server_find_by_path(server, paths[i]); |
| if (!IS_ERR(bdev)) |
| break; |
| } |
| if (i == ARRAY_SIZE(paths)) { |
| /* |
| * Couldn't find the block device in any of the usual places. |
| * Try to match it against the kernel's device name. If the |
| * name of the service and the name of a device in the block |
| * class match then attempt to look the block device up by the |
| * dev_t (major/minor) value. |
| */ |
| bdev = vs_block_server_find_by_name(server); |
| } |
| if (IS_ERR(bdev)) |
| return bdev; |
| |
| server->sector_size = VS_BLOCK_BLK_DEF_SECTOR_SIZE; |
| server->server.segment_size = round_down( |
| vs_service_max_mbuf_size(server->service) - |
| sizeof(vs_message_id_t), server->sector_size); |
| server->server.sector_size = server->sector_size; |
| server->server.device_sectors = bdev->bd_part->nr_sects; |
| if (bdev_read_only(bdev)) |
| server->server.readonly = true; |
| server->server.flushable = true; |
| server->server.committable = true; |
| |
| return bdev; |
| } |
| |
| static struct vs_server_block_state * |
| vs_block_server_alloc(struct vs_service_device *service) |
| { |
| struct block_server *server; |
| int err; |
| |
| server = kzalloc(sizeof(*server), GFP_KERNEL); |
| if (!server) |
| return NULL; |
| |
| server->service = service; |
| server->started = false; |
| INIT_LIST_HEAD(&server->bounce_req_queue); |
| INIT_WORK(&server->bounce_req_work, vs_block_server_write_bounce_work); |
| spin_lock_init(&server->bounce_req_lock); |
| atomic_set(&server->submitted_req_count, 0); |
| init_waitqueue_head(&server->submitted_req_wq); |
| INIT_LIST_HEAD(&server->completed_req_queue); |
| INIT_WORK(&server->completed_req_work, |
| vs_block_server_complete_requests_work); |
| spin_lock_init(&server->completed_req_lock); |
| |
| server->bdev = vs_block_server_attach_block_device(server); |
| if (IS_ERR(server->bdev)) { |
| dev_err(&server->service->dev, |
| "No appropriate block device was found to satisfy the service name %s - error %ld\n", |
| server->service->name, PTR_ERR(server->bdev)); |
| goto fail_attach_device; |
| } |
| |
| dev_set_drvdata(&service->dev, &server->server); |
| |
| err = sysfs_create_group(&service->dev.kobj, |
| &vs_block_server_attr_group); |
| if (err) { |
| dev_err(&service->dev, |
| "Failed to create attribute group for service %s\n", |
| service->name); |
| goto fail_create_group; |
| } |
| |
| /* |
| * We know the upper bound on simultaneously active bios (i.e. the |
| * smaller of the in quota, and the sum of the read and write command |
| * tag limits), so we can pre-allocate that many, and hopefully never |
| * fail to allocate one in a request handler. |
| * |
| * However, allocation may fail if the number of pages (and thus |
| * bvecs) in a request exceeds BIO_INLINE_VECS (which is hard-coded to |
| * 4 in all mainline kernels). That possibility is the only reason we |
| * can't enable rx_atomic for this driver. |
| */ |
| server->bioset = bioset_create(min_t(unsigned, service->recv_quota, |
| VSERVICE_BLOCK_IO_READ_MAX_PENDING + |
| VSERVICE_BLOCK_IO_WRITE_MAX_PENDING), |
| offsetof(struct block_server_request, bio)); |
| if (!server->bioset) { |
| dev_err(&service->dev, |
| "Failed to allocate bioset for service %s\n", |
| service->name); |
| goto fail_create_bioset; |
| } |
| |
| dev_dbg(&service->dev, "New block server %pK\n", server); |
| |
| return &server->server; |
| |
| fail_create_bioset: |
| sysfs_remove_group(&server->service->dev.kobj, |
| &vs_block_server_attr_group); |
| fail_create_group: |
| dev_set_drvdata(&service->dev, NULL); |
| blkdev_put(server->bdev, VS_BLOCK_BLKDEV_DEFAULT_MODE); |
| fail_attach_device: |
| kfree(server); |
| |
| return NULL; |
| } |
| |
| static void vs_block_server_release(struct vs_server_block_state *state) |
| { |
| struct block_server *server = state_to_block_server(state); |
| |
| cancel_work_sync(&server->bounce_req_work); |
| cancel_work_sync(&server->completed_req_work); |
| |
| blkdev_put(server->bdev, VS_BLOCK_BLKDEV_DEFAULT_MODE); |
| |
| sysfs_remove_group(&server->service->dev.kobj, |
| &vs_block_server_attr_group); |
| |
| bioset_free(server->bioset); |
| |
| kfree(server); |
| } |
| |
| static struct vs_server_block block_server_driver = { |
| .alloc = vs_block_server_alloc, |
| .release = vs_block_server_release, |
| .open = vs_block_server_open, |
| .closed = vs_block_server_closed, |
| .tx_ready = vs_block_server_tx_ready, |
| .io = { |
| .req_read = vs_block_server_io_req_read, |
| .req_write = vs_block_server_io_req_write, |
| }, |
| |
| /* Large default quota for batching read/write commands */ |
| .in_quota_best = 32, |
| .out_quota_best = 32, |
| }; |
| |
| static int __init vs_block_server_init(void) |
| { |
| return vservice_block_server_register(&block_server_driver, |
| "block_server_driver"); |
| } |
| |
| static void __exit vs_block_server_exit(void) |
| { |
| vservice_block_server_unregister(&block_server_driver); |
| } |
| |
| module_init(vs_block_server_init); |
| module_exit(vs_block_server_exit); |
| |
| MODULE_DESCRIPTION("OKL4 Virtual Services Block Server Driver"); |
| MODULE_AUTHOR("Open Kernel Labs, Inc"); |