/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */
12
13#include <linux/device-mapper.h>
14
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/vmalloc.h>
18
19#define DM_MSG_PREFIX "switch"
20
/*
 * Storage unit of the region table.  Each slot packs
 * <region_entries_per_slot> region table entries, and every entry is
 * <region_table_entry_bits> wide.
 */
typedef unsigned long region_table_slot_t;
26
27/*
28 * A device with the offset to its start sector.
29 */
30struct switch_path {
31 struct dm_dev *dmdev;
32 sector_t start;
33};
34
35/*
36 * Context block for a dm switch device.
37 */
38struct switch_ctx {
39 struct dm_target *ti;
40
41 unsigned nr_paths; /* Number of paths in path_list. */
42
43 unsigned region_size; /* Region size in 512-byte sectors */
44 unsigned long nr_regions; /* Number of regions making up the device */
45 signed char region_size_bits; /* log2 of region_size or -1 */
46
47 unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
48 unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
49 signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
50
51 region_table_slot_t *region_table; /* Region table */
52
53 /*
54 * Array of dm devices to switch between.
55 */
56 struct switch_path path_list[0];
57};
58
59static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
60 unsigned region_size)
61{
62 struct switch_ctx *sctx;
63
64 sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
65 GFP_KERNEL);
66 if (!sctx)
67 return NULL;
68
69 sctx->ti = ti;
70 sctx->region_size = region_size;
71
72 ti->private = sctx;
73
74 return sctx;
75}
76
77static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
78{
79 struct switch_ctx *sctx = ti->private;
80 sector_t nr_regions = ti->len;
81 sector_t nr_slots;
82
83 if (!(sctx->region_size & (sctx->region_size - 1)))
84 sctx->region_size_bits = __ffs(sctx->region_size);
85 else
86 sctx->region_size_bits = -1;
87
88 sctx->region_table_entry_bits = 1;
89 while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
90 (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
91 sctx->region_table_entry_bits++;
92
93 sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
94 if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
95 sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
96 else
97 sctx->region_entries_per_slot_bits = -1;
98
99 if (sector_div(nr_regions, sctx->region_size))
100 nr_regions++;
101
Tomohiro Kusumiaad9ae42015-10-29 03:54:21 +0900102 if (nr_regions >= ULONG_MAX) {
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100103 ti->error = "Region table too large";
104 return -EINVAL;
105 }
Tomohiro Kusumiaad9ae42015-10-29 03:54:21 +0900106 sctx->nr_regions = nr_regions;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100107
108 nr_slots = nr_regions;
109 if (sector_div(nr_slots, sctx->region_entries_per_slot))
110 nr_slots++;
111
112 if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
113 ti->error = "Region table too large";
114 return -EINVAL;
115 }
116
Kees Cook42bc47b2018-06-12 14:27:11 -0700117 sctx->region_table = vmalloc(array_size(nr_slots,
118 sizeof(region_table_slot_t)));
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100119 if (!sctx->region_table) {
120 ti->error = "Cannot allocate region table";
121 return -ENOMEM;
122 }
123
124 return 0;
125}
126
127static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
128 unsigned long *region_index, unsigned *bit)
129{
130 if (sctx->region_entries_per_slot_bits >= 0) {
131 *region_index = region_nr >> sctx->region_entries_per_slot_bits;
132 *bit = region_nr & (sctx->region_entries_per_slot - 1);
133 } else {
134 *region_index = region_nr / sctx->region_entries_per_slot;
135 *bit = region_nr % sctx->region_entries_per_slot;
136 }
137
138 *bit *= sctx->region_table_entry_bits;
139}
140
Mikulas Patocka99eb1902014-07-28 17:49:41 -0400141static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
142{
143 unsigned long region_index;
144 unsigned bit;
145
146 switch_get_position(sctx, region_nr, &region_index, &bit);
147
Mark Rutland6aa7de02017-10-23 14:07:29 -0700148 return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
Mikulas Patocka99eb1902014-07-28 17:49:41 -0400149 ((1 << sctx->region_table_entry_bits) - 1);
150}
151
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100152/*
153 * Find which path to use at given offset.
154 */
155static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
156{
Mikulas Patocka99eb1902014-07-28 17:49:41 -0400157 unsigned path_nr;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100158 sector_t p;
159
160 p = offset;
161 if (sctx->region_size_bits >= 0)
162 p >>= sctx->region_size_bits;
163 else
164 sector_div(p, sctx->region_size);
165
Mikulas Patocka99eb1902014-07-28 17:49:41 -0400166 path_nr = switch_region_table_read(sctx, p);
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100167
168 /* This can only happen if the processor uses non-atomic stores. */
169 if (unlikely(path_nr >= sctx->nr_paths))
170 path_nr = 0;
171
172 return path_nr;
173}
174
175static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
176 unsigned value)
177{
178 unsigned long region_index;
179 unsigned bit;
180 region_table_slot_t pte;
181
182 switch_get_position(sctx, region_nr, &region_index, &bit);
183
184 pte = sctx->region_table[region_index];
185 pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
186 pte |= (region_table_slot_t)value << bit;
187 sctx->region_table[region_index] = pte;
188}
189
190/*
191 * Fill the region table with an initial round robin pattern.
192 */
193static void initialise_region_table(struct switch_ctx *sctx)
194{
195 unsigned path_nr = 0;
196 unsigned long region_nr;
197
198 for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
199 switch_region_table_write(sctx, region_nr, path_nr);
200 if (++path_nr >= sctx->nr_paths)
201 path_nr = 0;
202 }
203}
204
205static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
206{
207 struct switch_ctx *sctx = ti->private;
208 unsigned long long start;
209 int r;
210
211 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
212 &sctx->path_list[sctx->nr_paths].dmdev);
213 if (r) {
214 ti->error = "Device lookup failed";
215 return r;
216 }
217
218 if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
219 ti->error = "Invalid device starting offset";
220 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
221 return -EINVAL;
222 }
223
224 sctx->path_list[sctx->nr_paths].start = start;
225
226 sctx->nr_paths++;
227
228 return 0;
229}
230
231/*
232 * Destructor: Don't free the dm_target, just the ti->private data (if any).
233 */
234static void switch_dtr(struct dm_target *ti)
235{
236 struct switch_ctx *sctx = ti->private;
237
238 while (sctx->nr_paths--)
239 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
240
241 vfree(sctx->region_table);
242 kfree(sctx);
243}
244
245/*
246 * Constructor arguments:
247 * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
248 * [<dev_path> <offset>]+
249 *
250 * Optional args are to allow for future extension: currently this
251 * parameter must be 0.
252 */
253static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
254{
Eric Biggers5916a222017-06-22 11:32:45 -0700255 static const struct dm_arg _args[] = {
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100256 {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
257 {1, UINT_MAX, "Invalid region size"},
258 {0, 0, "Invalid number of optional args"},
259 };
260
261 struct switch_ctx *sctx;
262 struct dm_arg_set as;
263 unsigned nr_paths, region_size, nr_optional_args;
264 int r;
265
266 as.argc = argc;
267 as.argv = argv;
268
269 r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
270 if (r)
271 return -EINVAL;
272
273 r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
274 if (r)
275 return r;
276
277 r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
278 if (r)
279 return r;
280 /* parse optional arguments here, if we add any */
281
282 if (as.argc != nr_paths * 2) {
283 ti->error = "Incorrect number of path arguments";
284 return -EINVAL;
285 }
286
287 sctx = alloc_switch_ctx(ti, nr_paths, region_size);
288 if (!sctx) {
289 ti->error = "Cannot allocate redirection context";
290 return -ENOMEM;
291 }
292
293 r = dm_set_target_max_io_len(ti, region_size);
294 if (r)
295 goto error;
296
297 while (as.argc) {
298 r = parse_path(&as, ti);
299 if (r)
300 goto error;
301 }
302
303 r = alloc_region_table(ti, nr_paths);
304 if (r)
305 goto error;
306
307 initialise_region_table(sctx);
308
309 /* For UNMAP, sending the request down any path is sufficient */
310 ti->num_discard_bios = 1;
311
312 return 0;
313
314error:
315 switch_dtr(ti);
316
317 return r;
318}
319
320static int switch_map(struct dm_target *ti, struct bio *bio)
321{
322 struct switch_ctx *sctx = ti->private;
Kent Overstreet4f024f32013-10-11 15:44:27 -0700323 sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100324 unsigned path_nr = switch_get_path_nr(sctx, offset);
325
Christoph Hellwig74d46992017-08-23 19:10:32 +0200326 bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
Kent Overstreet4f024f32013-10-11 15:44:27 -0700327 bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100328
329 return DM_MAPIO_REMAPPED;
330}
331
/*
 * Hex numbers in messages have to be parsed as quickly as possible, so the
 * digit values come from a lookup table instead of a chain of comparisons.
 *
 * Time to load 1000000 entries:
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 *
 * hex_table maps a byte to its hex digit value (0-15), or to 255 when the
 * byte is not a hex digit.  Both upper and lower case letters are accepted.
 */
static const unsigned char hex_table[256] = {
	/* 0x00 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x10 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x20 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* '0'.. */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
	/* '@'.. */ 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x50 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* '`'.. */ 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x70 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x80 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x90 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xa0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xb0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xc0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xd0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xe0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xf0 */ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

/*
 * Parse an unsigned hex number starting at *string and advance *string to
 * the first character that is not a hex digit.
 *
 * Returns the parsed value, or 0 if there were no digits at all (in which
 * case *string is left where it was).  No overflow check is made: digits
 * beyond the width of unsigned long are shifted out of the result.
 */
static __always_inline unsigned long parse_hex(const char **string)
{
	const char *p = *string;
	unsigned long value = 0;

	for (;;) {
		unsigned char digit = hex_table[(unsigned char)*p];

		if (digit >= 16)
			break;

		value = (value << 4) | digit;
		p++;
	}

	*string = p;

	return value;
}
373
374static int process_set_region_mappings(struct switch_ctx *sctx,
Mikulas Patocka56b1ebf2014-07-28 18:11:25 -0400375 unsigned argc, char **argv)
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100376{
377 unsigned i;
378 unsigned long region_index = 0;
379
380 for (i = 1; i < argc; i++) {
381 unsigned long path_nr;
382 const char *string = argv[i];
383
Mikulas Patocka56b1ebf2014-07-28 18:11:25 -0400384 if ((*string & 0xdf) == 'R') {
385 unsigned long cycle_length, num_write;
386
387 string++;
388 if (unlikely(*string == ',')) {
389 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
390 return -EINVAL;
391 }
392 cycle_length = parse_hex(&string);
393 if (unlikely(*string != ',')) {
394 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
395 return -EINVAL;
396 }
397 string++;
398 if (unlikely(!*string)) {
399 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
400 return -EINVAL;
401 }
402 num_write = parse_hex(&string);
403 if (unlikely(*string)) {
404 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
405 return -EINVAL;
406 }
407
408 if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
409 DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
410 cycle_length - 1, region_index);
411 return -EINVAL;
412 }
413 if (unlikely(region_index + num_write < region_index) ||
414 unlikely(region_index + num_write >= sctx->nr_regions)) {
415 DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
416 region_index, num_write, sctx->nr_regions);
417 return -EINVAL;
418 }
419
420 while (num_write--) {
421 region_index++;
422 path_nr = switch_region_table_read(sctx, region_index - cycle_length);
423 switch_region_table_write(sctx, region_index, path_nr);
424 }
425
426 continue;
427 }
428
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100429 if (*string == ':')
430 region_index++;
431 else {
432 region_index = parse_hex(&string);
433 if (unlikely(*string != ':')) {
434 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
435 return -EINVAL;
436 }
437 }
438
439 string++;
440 if (unlikely(!*string)) {
441 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
442 return -EINVAL;
443 }
444
445 path_nr = parse_hex(&string);
446 if (unlikely(*string)) {
447 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
448 return -EINVAL;
449 }
450 if (unlikely(region_index >= sctx->nr_regions)) {
451 DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
452 return -EINVAL;
453 }
454 if (unlikely(path_nr >= sctx->nr_paths)) {
455 DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
456 return -EINVAL;
457 }
458
459 switch_region_table_write(sctx, region_index, path_nr);
460 }
461
462 return 0;
463}
464
465/*
466 * Messages are processed one-at-a-time.
467 *
468 * Only set_region_mappings is supported.
469 */
Mike Snitzer1eb5fa82018-02-28 15:59:59 -0500470static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
471 char *result, unsigned maxlen)
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100472{
473 static DEFINE_MUTEX(message_mutex);
474
475 struct switch_ctx *sctx = ti->private;
476 int r = -EINVAL;
477
478 mutex_lock(&message_mutex);
479
480 if (!strcasecmp(argv[0], "set_region_mappings"))
481 r = process_set_region_mappings(sctx, argc, argv);
482 else
483 DMWARN("Unrecognised message received.");
484
485 mutex_unlock(&message_mutex);
486
487 return r;
488}
489
490static void switch_status(struct dm_target *ti, status_type_t type,
491 unsigned status_flags, char *result, unsigned maxlen)
492{
493 struct switch_ctx *sctx = ti->private;
494 unsigned sz = 0;
495 int path_nr;
496
497 switch (type) {
498 case STATUSTYPE_INFO:
499 result[0] = '\0';
500 break;
501
502 case STATUSTYPE_TABLE:
503 DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
504 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
505 DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
506 (unsigned long long)sctx->path_list[path_nr].start);
507 break;
508 }
509}
510
511/*
512 * Switch ioctl:
513 *
514 * Passthrough all ioctls to the path for sector 0
515 */
Mike Snitzer5bd5e8d2018-04-03 16:54:10 -0400516static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100517{
518 struct switch_ctx *sctx = ti->private;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100519 unsigned path_nr;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100520
521 path_nr = switch_get_path_nr(sctx, 0);
522
Christoph Hellwige56f81e2015-10-15 14:10:50 +0200523 *bdev = sctx->path_list[path_nr].dmdev->bdev;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100524
525 /*
526 * Only pass ioctls through if the device sizes match exactly.
527 */
Christoph Hellwige56f81e2015-10-15 14:10:50 +0200528 if (ti->len + sctx->path_list[path_nr].start !=
529 i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
530 return 1;
531 return 0;
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100532}
533
534static int switch_iterate_devices(struct dm_target *ti,
535 iterate_devices_callout_fn fn, void *data)
536{
537 struct switch_ctx *sctx = ti->private;
538 int path_nr;
539 int r;
540
541 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
542 r = fn(ti, sctx->path_list[path_nr].dmdev,
543 sctx->path_list[path_nr].start, ti->len, data);
544 if (r)
545 return r;
546 }
547
548 return 0;
549}
550
551static struct target_type switch_target = {
552 .name = "switch",
Mikulas Patocka56b1ebf2014-07-28 18:11:25 -0400553 .version = {1, 1, 0},
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100554 .module = THIS_MODULE,
555 .ctr = switch_ctr,
556 .dtr = switch_dtr,
557 .map = switch_map,
558 .message = switch_message,
559 .status = switch_status,
Christoph Hellwige56f81e2015-10-15 14:10:50 +0200560 .prepare_ioctl = switch_prepare_ioctl,
Jim Ramsay9d0eb0a2013-07-10 23:41:19 +0100561 .iterate_devices = switch_iterate_devices,
562};
563
564static int __init dm_switch_init(void)
565{
566 int r;
567
568 r = dm_register_target(&switch_target);
569 if (r < 0)
570 DMERR("dm_register_target() failed %d", r);
571
572 return r;
573}
574
575static void __exit dm_switch_exit(void)
576{
577 dm_unregister_target(&switch_target);
578}
579
580module_init(dm_switch_init);
581module_exit(dm_switch_exit);
582
583MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
584MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
585MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
586MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
587MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
588MODULE_LICENSE("GPL");