blob: 160cf14431ac55118646018d8eb184e0ab697129 [file] [log] [blame]
Rusty Russelle2c97842007-07-26 10:41:03 -07001/*D:400
2 * The Guest block driver
Rusty Russellb7544162007-07-19 01:49:29 -07003 *
Rusty Russelle2c97842007-07-26 10:41:03 -07004 * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
5 * The mechanism is simple: we place the information about the request in the
6 * device page, then use SEND_DMA (containing the data for a write, or an empty
7 * "ping" DMA for a read).
8 :*/
9/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
Rusty Russellb7544162007-07-19 01:49:29 -070010 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25//#define DEBUG
26#include <linux/init.h>
27#include <linux/types.h>
28#include <linux/blkdev.h>
29#include <linux/interrupt.h>
30#include <linux/lguest_bus.h>
31
/* Suffix letter for the next disk to be named: lguestblk_probe() formats it
 * into "lgb%c" and post-increments it, so the first disk is "lgba". */
static char next_block_index = 'a';
33
Rusty Russelle2c97842007-07-26 10:41:03 -070034/*D:420 Here is the structure which holds all the information we need about
35 * each Guest block device.
36 *
37 * I'm sure at this stage, you're wondering "hey, where was the adventure I was
38 * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
39 * my blog". I think Real adventures have boring bits, too, and you're in the
40 * middle of one. But it gets better. Just not quite yet. */
Rusty Russellb7544162007-07-19 01:49:29 -070041struct blockdev
42{
Rusty Russelle2c97842007-07-26 10:41:03 -070043 /* The block queue infrastructure wants a spinlock: it is held while it
44 * calls our block request function. We grab it in our interrupt
45 * handler so the responses don't mess with new requests. */
Rusty Russellb7544162007-07-19 01:49:29 -070046 spinlock_t lock;
47
Rusty Russelle2c97842007-07-26 10:41:03 -070048 /* The disk structure registered with kernel. */
Rusty Russellb7544162007-07-19 01:49:29 -070049 struct gendisk *disk;
50
Rusty Russelle2c97842007-07-26 10:41:03 -070051 /* The major device number for this disk, and the interrupt. We only
52 * really keep them here for completeness; we'd need them if we
53 * supported device unplugging. */
Rusty Russellb7544162007-07-19 01:49:29 -070054 int major;
55 int irq;
56
Rusty Russelle2c97842007-07-26 10:41:03 -070057 /* The physical address of this device's memory page */
Rusty Russellb7544162007-07-19 01:49:29 -070058 unsigned long phys_addr;
Rusty Russelle2c97842007-07-26 10:41:03 -070059 /* The mapped memory page for convenient acces. */
Rusty Russellb7544162007-07-19 01:49:29 -070060 struct lguest_block_page *lb_page;
61
Rusty Russelle2c97842007-07-26 10:41:03 -070062 /* We only have a single request outstanding at a time: this is it. */
Rusty Russellb7544162007-07-19 01:49:29 -070063 struct lguest_dma dma;
64 struct request *req;
65};
66
Rusty Russelle2c97842007-07-26 10:41:03 -070067/*D:495 We originally used end_request() throughout the driver, but it turns
68 * out that end_request() is deprecated, and doesn't actually end the request
69 * (which seems like a good reason to deprecate it!). It simply ends the first
70 * bio. So if we had 3 bios in a "struct request" we would do all 3,
71 * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
72 * work as we needed to do.
73 *
74 * This reinforced to me that I do not understand the block layer.
75 *
76 * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
77 * request. This improved disk speed by 130%. */
Rusty Russellb7544162007-07-19 01:49:29 -070078static void end_entire_request(struct request *req, int uptodate)
79{
80 if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
81 BUG();
82 add_disk_randomness(req->rq_disk);
83 blkdev_dequeue_request(req);
84 end_that_request_last(req, uptodate);
85}
86
Rusty Russelle2c97842007-07-26 10:41:03 -070087/* I'm told there are only two stories in the world worth telling: love and
88 * hate. So there used to be a love scene here like this:
89 *
90 * Launcher: We could make beautiful I/O together, you and I.
91 * Guest: My, that's a big disk!
92 *
93 * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
94
95/*D:490 This is the interrupt handler, called when a block read or write has
96 * been completed for us. */
Rusty Russellb7544162007-07-19 01:49:29 -070097static irqreturn_t lgb_irq(int irq, void *_bd)
98{
Rusty Russelle2c97842007-07-26 10:41:03 -070099 /* We handed our "struct blockdev" as the argument to request_irq(), so
100 * it is passed through to us here. This tells us which device we're
101 * dealing with in case we have more than one. */
Rusty Russellb7544162007-07-19 01:49:29 -0700102 struct blockdev *bd = _bd;
103 unsigned long flags;
104
Rusty Russelle2c97842007-07-26 10:41:03 -0700105 /* We weren't doing anything? Strange, but could happen if we shared
106 * interrupts (we don't!). */
Rusty Russellb7544162007-07-19 01:49:29 -0700107 if (!bd->req) {
108 pr_debug("No work!\n");
109 return IRQ_NONE;
110 }
111
Rusty Russelle2c97842007-07-26 10:41:03 -0700112 /* Not done yet? That's equally strange. */
Rusty Russellb7544162007-07-19 01:49:29 -0700113 if (!bd->lb_page->result) {
114 pr_debug("No result!\n");
115 return IRQ_NONE;
116 }
117
Rusty Russelle2c97842007-07-26 10:41:03 -0700118 /* We have to grab the lock before ending the request. */
Rusty Russellb7544162007-07-19 01:49:29 -0700119 spin_lock_irqsave(&bd->lock, flags);
Rusty Russelle2c97842007-07-26 10:41:03 -0700120 /* "result" is 1 for success, 2 for failure: end_entire_request() wants
121 * to know whether this succeeded or not. */
Rusty Russellb7544162007-07-19 01:49:29 -0700122 end_entire_request(bd->req, bd->lb_page->result == 1);
Rusty Russelle2c97842007-07-26 10:41:03 -0700123 /* Clear out request, it's done. */
Rusty Russellb7544162007-07-19 01:49:29 -0700124 bd->req = NULL;
Rusty Russelle2c97842007-07-26 10:41:03 -0700125 /* Reset incoming DMA for next time. */
Rusty Russellb7544162007-07-19 01:49:29 -0700126 bd->dma.used_len = 0;
Rusty Russelle2c97842007-07-26 10:41:03 -0700127 /* Ready for more reads or writes */
Rusty Russellb7544162007-07-19 01:49:29 -0700128 blk_start_queue(bd->disk->queue);
129 spin_unlock_irqrestore(&bd->lock, flags);
Rusty Russelle2c97842007-07-26 10:41:03 -0700130
131 /* The interrupt was for us, we dealt with it. */
Rusty Russellb7544162007-07-19 01:49:29 -0700132 return IRQ_HANDLED;
133}
134
Rusty Russelle2c97842007-07-26 10:41:03 -0700135/*D:480 The block layer's "struct request" contains a number of "struct bio"s,
136 * each of which contains "struct bio_vec"s, each of which contains a page, an
137 * offset and a length.
138 *
139 * Fortunately there are iterators to help us walk through the "struct
140 * request". Even more fortunately, there were plenty of places to steal the
141 * code from. We pack the "struct request" into our "struct lguest_dma" and
142 * return the total length. */
Rusty Russellb7544162007-07-19 01:49:29 -0700143static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
144{
145 unsigned int i = 0, idx, len = 0;
146 struct bio *bio;
147
148 rq_for_each_bio(bio, req) {
149 struct bio_vec *bvec;
150 bio_for_each_segment(bvec, bio, idx) {
Rusty Russelle2c97842007-07-26 10:41:03 -0700151 /* We told the block layer not to give us too many. */
Rusty Russellb7544162007-07-19 01:49:29 -0700152 BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
Rusty Russelle2c97842007-07-26 10:41:03 -0700153 /* If we had a zero-length segment, it would look like
154 * the end of the data referred to by the "struct
155 * lguest_dma", so make sure that doesn't happen. */
Rusty Russellb7544162007-07-19 01:49:29 -0700156 BUG_ON(!bvec->bv_len);
Rusty Russelle2c97842007-07-26 10:41:03 -0700157 /* Convert page & offset to a physical address */
Rusty Russellb7544162007-07-19 01:49:29 -0700158 dma->addr[i] = page_to_phys(bvec->bv_page)
159 + bvec->bv_offset;
160 dma->len[i] = bvec->bv_len;
161 len += bvec->bv_len;
162 i++;
163 }
164 }
Rusty Russelle2c97842007-07-26 10:41:03 -0700165 /* If the array isn't full, we mark the end with a 0 length */
Rusty Russellb7544162007-07-19 01:49:29 -0700166 if (i < LGUEST_MAX_DMA_SECTIONS)
167 dma->len[i] = 0;
168 return len;
169}
170
Rusty Russelle2c97842007-07-26 10:41:03 -0700171/* This creates an empty DMA, useful for prodding the Host without sending data
172 * (ie. when we want to do a read) */
/* Make @dma describe no data at all: a zero length in the first slot is the
 * same end-of-list marker that req_to_dma() writes after the last segment. */
static void empty_dma(struct lguest_dma *dma)
{
	dma->len[0] = 0;
}
177
Rusty Russelle2c97842007-07-26 10:41:03 -0700178/*D:470 Setting up a request is fairly easy: */
Rusty Russellb7544162007-07-19 01:49:29 -0700179static void setup_req(struct blockdev *bd,
180 int type, struct request *req, struct lguest_dma *dma)
181{
Rusty Russelle2c97842007-07-26 10:41:03 -0700182 /* The type is 1 (write) or 0 (read). */
Rusty Russellb7544162007-07-19 01:49:29 -0700183 bd->lb_page->type = type;
Rusty Russelle2c97842007-07-26 10:41:03 -0700184 /* The sector on disk where the read or write starts. */
Rusty Russellb7544162007-07-19 01:49:29 -0700185 bd->lb_page->sector = req->sector;
Rusty Russelle2c97842007-07-26 10:41:03 -0700186 /* The result is initialized to 0 (unfinished). */
Rusty Russellb7544162007-07-19 01:49:29 -0700187 bd->lb_page->result = 0;
Rusty Russelle2c97842007-07-26 10:41:03 -0700188 /* The current request (so we can end it in the interrupt handler). */
Rusty Russellb7544162007-07-19 01:49:29 -0700189 bd->req = req;
Rusty Russelle2c97842007-07-26 10:41:03 -0700190 /* The number of bytes: returned as a side-effect of req_to_dma(),
191 * which packs the block layer's "struct request" into our "struct
192 * lguest_dma" */
Rusty Russellb7544162007-07-19 01:49:29 -0700193 bd->lb_page->bytes = req_to_dma(req, dma);
194}
195
Rusty Russelle2c97842007-07-26 10:41:03 -0700196/*D:450 Write is pretty straightforward: we pack the request into a "struct
197 * lguest_dma", then use SEND_DMA to send the request. */
Rusty Russellb7544162007-07-19 01:49:29 -0700198static void do_write(struct blockdev *bd, struct request *req)
199{
200 struct lguest_dma send;
201
202 pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
203 setup_req(bd, 1, req, &send);
204
205 lguest_send_dma(bd->phys_addr, &send);
206}
207
Rusty Russelle2c97842007-07-26 10:41:03 -0700208/* Read is similar to write, except we pack the request into our receive
209 * "struct lguest_dma" and send through an empty DMA just to tell the Host that
210 * there's a request pending. */
Rusty Russellb7544162007-07-19 01:49:29 -0700211static void do_read(struct blockdev *bd, struct request *req)
212{
213 struct lguest_dma ping;
214
215 pr_debug("lgb: READ sector %li\n", (long)req->sector);
216 setup_req(bd, 0, req, &bd->dma);
217
218 empty_dma(&ping);
219 lguest_send_dma(bd->phys_addr, &ping);
220}
221
/*D:440 This is where requests come in: we get handed the request queue and are
 * expected to pull a "struct request" off it until we've finished them or
 * we're waiting for a reply: */
Jens Axboe165125e2007-07-24 09:28:11 +0200225static void do_lgb_request(struct request_queue *q)
Rusty Russellb7544162007-07-19 01:49:29 -0700226{
227 struct blockdev *bd;
228 struct request *req;
229
230again:
Rusty Russelle2c97842007-07-26 10:41:03 -0700231 /* This sometimes returns NULL even on the very first time around. I
232 * wonder if it's something to do with letting elves handle the request
233 * queue... */
Rusty Russellb7544162007-07-19 01:49:29 -0700234 req = elv_next_request(q);
235 if (!req)
236 return;
237
Rusty Russelle2c97842007-07-26 10:41:03 -0700238 /* We attached the struct blockdev to the disk: get it back */
Rusty Russellb7544162007-07-19 01:49:29 -0700239 bd = req->rq_disk->private_data;
Rusty Russelle2c97842007-07-26 10:41:03 -0700240 /* Sometimes we get repeated requests after blk_stop_queue(), but we
241 * can only handle one at a time. */
Rusty Russellb7544162007-07-19 01:49:29 -0700242 if (bd->req)
243 return;
244
Rusty Russelle2c97842007-07-26 10:41:03 -0700245 /* We only do reads and writes: no tricky business! */
Rusty Russellb7544162007-07-19 01:49:29 -0700246 if (!blk_fs_request(req)) {
247 pr_debug("Got non-command 0x%08x\n", req->cmd_type);
248 req->errors++;
249 end_entire_request(req, 0);
250 goto again;
251 }
252
253 if (rq_data_dir(req) == WRITE)
254 do_write(bd, req);
255 else
256 do_read(bd, req);
257
Rusty Russelle2c97842007-07-26 10:41:03 -0700258 /* We've put out the request, so stop any more coming in until we get
259 * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
Rusty Russellb7544162007-07-19 01:49:29 -0700260 blk_stop_queue(q);
261}
262
Rusty Russelle2c97842007-07-26 10:41:03 -0700263/*D:430 This is the "struct block_device_operations" we attach to the disk at
264 * the end of lguestblk_probe(). It doesn't seem to want much. */
/* Block device operations installed as disk->fops in lguestblk_probe();
 * only the owning module is filled in, no operation callbacks are set. */
static struct block_device_operations lguestblk_fops = {
	.owner = THIS_MODULE,
};
268
Rusty Russelle2c97842007-07-26 10:41:03 -0700269/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure
270 * quite why. I do know that the IDE code sent two or three of the maintainers
271 * insane, perhaps this is the fringe of the same disease?
272 *
273 * As in the console code, the probe function gets handed the generic
274 * lguest_device from lguest_bus.c: */
Rusty Russellb7544162007-07-19 01:49:29 -0700275static int lguestblk_probe(struct lguest_device *lgdev)
276{
277 struct blockdev *bd;
278 int err;
279 int irqflags = IRQF_SHARED;
280
Rusty Russelle2c97842007-07-26 10:41:03 -0700281 /* First we allocate our own "struct blockdev" and initialize the easy
282 * fields. */
Rusty Russellb7544162007-07-19 01:49:29 -0700283 bd = kmalloc(sizeof(*bd), GFP_KERNEL);
284 if (!bd)
285 return -ENOMEM;
286
287 spin_lock_init(&bd->lock);
288 bd->irq = lgdev_irq(lgdev);
289 bd->req = NULL;
290 bd->dma.used_len = 0;
291 bd->dma.len[0] = 0;
Rusty Russelle2c97842007-07-26 10:41:03 -0700292 /* The descriptor in the lguest_devices array provided by the Host
293 * gives the Guest the physical page number of the device's page. */
Rusty Russellb7544162007-07-19 01:49:29 -0700294 bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
295
Rusty Russelle2c97842007-07-26 10:41:03 -0700296 /* We use lguest_map() to get a pointer to the device page */
Rusty Russellb7544162007-07-19 01:49:29 -0700297 bd->lb_page = lguest_map(bd->phys_addr, 1);
298 if (!bd->lb_page) {
299 err = -ENOMEM;
300 goto out_free_bd;
301 }
302
Rusty Russelle2c97842007-07-26 10:41:03 -0700303 /* We need a major device number: 0 means "assign one dynamically". */
Rusty Russellb7544162007-07-19 01:49:29 -0700304 bd->major = register_blkdev(0, "lguestblk");
305 if (bd->major < 0) {
306 err = bd->major;
307 goto out_unmap;
308 }
309
Rusty Russelle2c97842007-07-26 10:41:03 -0700310 /* This allocates a "struct gendisk" where we pack all the information
Rusty Russell9ef7ad22007-08-17 14:05:27 +1000311 * about the disk which the rest of Linux sees. The argument is the
312 * number of minor devices desired: we need one minor for the main
313 * disk, and one for each partition. Of course, we can't possibly know
314 * how many partitions are on the disk (add_disk does that).
315 */
316 bd->disk = alloc_disk(16);
Rusty Russellb7544162007-07-19 01:49:29 -0700317 if (!bd->disk) {
318 err = -ENOMEM;
319 goto out_unregister_blkdev;
320 }
321
Rusty Russelle2c97842007-07-26 10:41:03 -0700322 /* Every disk needs a queue for requests to come in: we set up the
323 * queue with a callback function (the core of our driver) and the lock
324 * to use. */
Rusty Russellb7544162007-07-19 01:49:29 -0700325 bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
326 if (!bd->disk->queue) {
327 err = -ENOMEM;
328 goto out_put_disk;
329 }
330
Rusty Russelle2c97842007-07-26 10:41:03 -0700331 /* We can only handle a certain number of pointers in our SEND_DMA
332 * call, so we set that with blk_queue_max_hw_segments(). This is not
333 * to be confused with blk_queue_max_phys_segments() of course! I
334 * know, who could possibly confuse the two?
335 *
336 * Well, it's simple to tell them apart: this one seems to work and the
337 * other one didn't. */
Rusty Russellb7544162007-07-19 01:49:29 -0700338 blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
Rusty Russelle2c97842007-07-26 10:41:03 -0700339
340 /* Due to technical limitations of our Host (and simple coding) we
341 * can't have a single buffer which crosses a page boundary. Tell it
342 * here. This means that our maximum request size is 16
343 * (LGUEST_MAX_DMA_SECTIONS) pages. */
Rusty Russellb7544162007-07-19 01:49:29 -0700344 blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
345
Rusty Russelle2c97842007-07-26 10:41:03 -0700346 /* We name our disk: this becomes the device name when udev does its
347 * magic thing and creates the device node, such as /dev/lgba.
348 * next_block_index is a global which starts at 'a'. Unfortunately
349 * this simple increment logic means that the 27th disk will be called
350 * "/dev/lgb{". In that case, I recommend having at least 29 disks, so
351 * your /dev directory will be balanced. */
Rusty Russellb7544162007-07-19 01:49:29 -0700352 sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
Rusty Russelle2c97842007-07-26 10:41:03 -0700353
354 /* We look to the device descriptor again to see if this device's
355 * interrupts are expected to be random. If they are, we tell the irq
356 * subsystem. At the moment this bit is always set. */
Rusty Russellb7544162007-07-19 01:49:29 -0700357 if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
358 irqflags |= IRQF_SAMPLE_RANDOM;
Rusty Russelle2c97842007-07-26 10:41:03 -0700359
360 /* Now we have the name and irqflags, we can request the interrupt; we
361 * give it the "struct blockdev" we have set up to pass to lgb_irq()
362 * when there is an interrupt. */
Rusty Russellb7544162007-07-19 01:49:29 -0700363 err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
364 if (err)
365 goto out_cleanup_queue;
366
Rusty Russelle2c97842007-07-26 10:41:03 -0700367 /* We bind our one-entry DMA pool to the key for this block device so
368 * the Host can reply to our requests. The key is equal to the
369 * physical address of the device's page, which is conveniently
370 * unique. */
Rusty Russellb7544162007-07-19 01:49:29 -0700371 err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
372 if (err)
373 goto out_free_irq;
374
Rusty Russelle2c97842007-07-26 10:41:03 -0700375 /* We finish our disk initialization and add the disk to the system. */
Rusty Russellb7544162007-07-19 01:49:29 -0700376 bd->disk->major = bd->major;
377 bd->disk->first_minor = 0;
378 bd->disk->private_data = bd;
379 bd->disk->fops = &lguestblk_fops;
Rusty Russelle2c97842007-07-26 10:41:03 -0700380 /* This is initialized to the disk size by the Launcher. */
Rusty Russellb7544162007-07-19 01:49:29 -0700381 set_capacity(bd->disk, bd->lb_page->num_sectors);
382 add_disk(bd->disk);
383
384 printk(KERN_INFO "%s: device %i at major %d\n",
385 bd->disk->disk_name, lgdev->index, bd->major);
386
Rusty Russelle2c97842007-07-26 10:41:03 -0700387 /* We don't need to keep the "struct blockdev" around, but if we ever
388 * implemented device removal, we'd need this. */
Rusty Russellb7544162007-07-19 01:49:29 -0700389 lgdev->private = bd;
390 return 0;
391
392out_free_irq:
393 free_irq(bd->irq, bd);
394out_cleanup_queue:
395 blk_cleanup_queue(bd->disk->queue);
396out_put_disk:
397 put_disk(bd->disk);
398out_unregister_blkdev:
399 unregister_blkdev(bd->major, "lguestblk");
400out_unmap:
401 lguest_unmap(bd->lb_page);
402out_free_bd:
403 kfree(bd);
404 return err;
405}
406
Rusty Russelle2c97842007-07-26 10:41:03 -0700407/*D:410 The boilerplate code for registering the lguest block driver is just
408 * like the console: */
Rusty Russellb7544162007-07-19 01:49:29 -0700409static struct lguest_driver lguestblk_drv = {
410 .name = "lguestblk",
411 .owner = THIS_MODULE,
412 .device_type = LGUEST_DEVICE_T_BLOCK,
413 .probe = lguestblk_probe,
414};
415
416static __init int lguestblk_init(void)
417{
418 return register_lguest_driver(&lguestblk_drv);
419}
420module_init(lguestblk_init);
421
422MODULE_DESCRIPTION("Lguest block driver");
423MODULE_LICENSE("GPL");