/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern that would allow for a
 * compact mapping representation such as that used by dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries, each of which is <region_table_entry_bits> bits in size.
 */
typedef unsigned long region_table_slot_t;
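
/*
 * Worked example (assuming a 64-bit unsigned long): with 4 paths, each
 * region table entry needs 2 bits, so one slot packs 64 / 2 = 32
 * entries and region_entries_per_slot_bits is log2(32) = 5.
 */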

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned nr_paths;		/* Number of paths in path_list. */

	unsigned region_size;		/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[0];
};

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
					   unsigned region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
		       GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	/* Use shifts instead of division if region_size is a power of 2. */
	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	/* Each region table entry must be wide enough to hold any path number. */
	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;

	/* Round the number of regions up to cover the whole target length. */
	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	sctx->nr_regions = nr_regions;
	if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}
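
/*
 * Sizing sketch with illustrative numbers: a 2097152-sector target with
 * region_size 1024 has 2048 regions; with 4 paths (2-bit entries, 32 per
 * 64-bit slot) that is 2048 / 32 = 64 slots, i.e. a 512-byte region table.
 */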

/*
 * Locate the slot and the bit offset within it that hold the table
 * entry for a given region.
 */
static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}
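
/*
 * E.g. with 32 entries per slot and 2-bit entries, region 100 lives in
 * slot 100 / 32 = 3 at bit offset (100 % 32) * 2 = 8.
 */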

/*
 * Find which path to use at a given offset.
 */
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned long region_index;
	unsigned bit, path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	switch_get_position(sctx, p, &region_index, &bit);
	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
		((1 << sctx->region_table_entry_bits) - 1);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}

/*
 * Store the path number for one region in the region table.
 */
static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned value)
{
	unsigned long region_index;
	unsigned bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}
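
/*
 * E.g. with 2-bit entries, writing value 3 at bit offset 4 clears bits
 * 5:4 of the slot and ORs in 3 << 4.  The read-modify-write is not
 * atomic, so writers are serialised (see switch_message).
 */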

/*
 * Fill the region table with an initial round-robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}
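
/* E.g. with 3 paths, regions initially map to paths 0, 1, 2, 0, 1, 2, ... */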

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *    <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *    [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
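/*
 * Illustrative table line (hypothetical device names): a 65536-sector
 * target split into 512-sector regions across two paths, with no
 * optional arguments:
 *
 *    0 65536 switch 2 512 0 /dev/mapper/pathA 0 /dev/mapper/pathB 0
 */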
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	static struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned path_nr = switch_get_path_nr(sctx, offset);

	bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance.
 * It reduces the time needed to load 1000000 entries compared to a
 * condition-based parser:
 *			table-based parser	condition-based parser
 * PA-RISC		0.29s			0.31s
 * Opteron		0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

/*
 * Parse a run of hex digits; stops at the first non-hex character and
 * leaves *string pointing at it.
 */
static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}
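
/*
 * E.g. given "1a:3", parse_hex() returns 0x1a and leaves *string at the
 * ':' so the caller can check the separator.
 */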

static int process_set_region_mappings(struct switch_ctx *sctx,
					unsigned argc, char **argv)
{
	unsigned i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}

		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
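/*
 * Illustrative usage (hypothetical device name and region numbers):
 * each argument is [<region_hex>]:<path_hex>, and a leading ':' means
 * "the region after the previous one", so
 *
 *    dmsetup message sw0 0 set_region_mappings 0:0 :1 :2 fc00:1
 *
 * maps regions 0, 1 and 2 to paths 0, 1 and 2, and region 0xfc00 to path 1.
 */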
static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned status_flags, char *result, unsigned maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;
	}
}

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_ioctl(struct dm_target *ti, unsigned cmd,
			unsigned long arg)
{
	struct switch_ctx *sctx = ti->private;
	struct block_device *bdev;
	fmode_t mode;
	unsigned path_nr;
	int r = 0;

	path_nr = switch_get_path_nr(sctx, 0);

	bdev = sctx->path_list[path_nr].dmdev->bdev;
	mode = sctx->path_list[path_nr].dmdev->mode;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
			sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.ioctl = switch_ioctl,
	.iterate_devices = switch_iterate_devices,
};

static int __init dm_switch_init(void)
{
	int r;

	r = dm_register_target(&switch_target);
	if (r < 0)
		DMERR("dm_register_target() failed %d", r);

	return r;
}

static void __exit dm_switch_exit(void)
{
	dm_unregister_target(&switch_target);
}

module_init(dm_switch_init);
module_exit(dm_switch_exit);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");