Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 1 | /* |
| 2 | * linux/fs/nfs/blocklayout/blocklayoutdev.c |
| 3 | * |
| 4 | * Device operations for the pnfs nfs4 file layout driver. |
| 5 | * |
| 6 | * Copyright (c) 2006 The Regents of the University of Michigan. |
| 7 | * All rights reserved. |
| 8 | * |
| 9 | * Andy Adamson <andros@citi.umich.edu> |
| 10 | * Fred Isaman <iisaman@umich.edu> |
| 11 | * |
| 12 | * permission is granted to use, copy, create derivative works and |
| 13 | * redistribute this software and such derivative works for any purpose, |
| 14 | * so long as the name of the university of michigan is not used in |
| 15 | * any advertising or publicity pertaining to the use or distribution |
| 16 | * of this software without specific, written prior authorization. if |
| 17 | * the above copyright notice or any other identification of the |
| 18 | * university of michigan is included in any copy of any portion of |
| 19 | * this software, then the disclaimer below must also be included. |
| 20 | * |
| 21 | * this software is provided as is, without representation from the |
| 22 | * university of michigan as to its fitness for any purpose, and without |
| 23 | * warranty by the university of michigan of any kind, either express |
| 24 | * or implied, including without limitation the implied warranties of |
| 25 | * merchantability and fitness for a particular purpose. the regents |
| 26 | * of the university of michigan shall not be liable for any damages, |
| 27 | * including special, indirect, incidental, or consequential damages, |
| 28 | * with respect to any claim arising out or in connection with the use |
| 29 | * of the software, even if it has been or is hereafter advised of the |
| 30 | * possibility of such damages. |
| 31 | */ |
| 32 | #include <linux/module.h> |
| 33 | #include <linux/buffer_head.h> /* __bread */ |
| 34 | |
| 35 | #include <linux/genhd.h> |
| 36 | #include <linux/blkdev.h> |
| 37 | #include <linux/hash.h> |
| 38 | |
| 39 | #include "blocklayout.h" |
| 40 | |
| 41 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD |
| 42 | |
Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 43 | static int decode_sector_number(__be32 **rp, sector_t *sp) |
| 44 | { |
| 45 | uint64_t s; |
| 46 | |
| 47 | *rp = xdr_decode_hyper(*rp, &s); |
| 48 | if (s & 0x1ff) { |
Weston Andros Adamson | a030889 | 2012-01-26 13:32:23 -0500 | [diff] [blame] | 49 | printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); |
Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 50 | return -1; |
| 51 | } |
| 52 | *sp = s >> SECTOR_SHIFT; |
| 53 | return 0; |
| 54 | } |
| 55 | |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 56 | /* Open a block_device by device number. */ |
| 57 | struct block_device *nfs4_blkdev_get(dev_t dev) |
| 58 | { |
| 59 | struct block_device *bd; |
| 60 | |
| 61 | dprintk("%s enter\n", __func__); |
| 62 | bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); |
| 63 | if (IS_ERR(bd)) |
| 64 | goto fail; |
| 65 | return bd; |
| 66 | fail: |
| 67 | dprintk("%s failed to open device : %ld\n", |
| 68 | __func__, PTR_ERR(bd)); |
| 69 | return NULL; |
| 70 | } |
| 71 | |
| 72 | /* |
| 73 | * Release the block device |
| 74 | */ |
| 75 | int nfs4_blkdev_put(struct block_device *bdev) |
| 76 | { |
| 77 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), |
| 78 | MINOR(bdev->bd_dev)); |
| 79 | return blkdev_put(bdev, FMODE_READ); |
| 80 | } |
| 81 | |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 82 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, |
| 83 | size_t mlen) |
| 84 | { |
Stanislav Kinsbursky | cb9c1c4 | 2012-03-11 18:20:23 +0400 | [diff] [blame] | 85 | struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, |
| 86 | nfs_net_id); |
| 87 | |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 88 | if (mlen != sizeof (struct bl_dev_msg)) |
| 89 | return -EINVAL; |
| 90 | |
Stanislav Kinsbursky | cb9c1c4 | 2012-03-11 18:20:23 +0400 | [diff] [blame] | 91 | if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 92 | return -EFAULT; |
| 93 | |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 94 | wake_up(&nn->bl_wq); |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 95 | |
| 96 | return mlen; |
| 97 | } |
| 98 | |
| 99 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) |
| 100 | { |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 101 | struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); |
| 102 | |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 103 | if (msg->errno >= 0) |
| 104 | return; |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 105 | wake_up(bl_pipe_msg->bl_wq); |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 106 | } |
| 107 | |
| 108 | /* |
| 109 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. |
| 110 | */ |
| 111 | struct pnfs_block_dev * |
| 112 | nfs4_blk_decode_device(struct nfs_server *server, |
Fred Isaman | 2f9fd18 | 2011-07-30 20:52:46 -0400 | [diff] [blame] | 113 | struct pnfs_device *dev) |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 114 | { |
Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 115 | struct pnfs_block_dev *rv; |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 116 | struct block_device *bd = NULL; |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 117 | struct bl_pipe_msg bl_pipe_msg; |
| 118 | struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 119 | struct bl_msg_hdr bl_msg = { |
| 120 | .type = BL_DEVICE_MOUNT, |
| 121 | .totallen = dev->mincount, |
| 122 | }; |
| 123 | uint8_t *dataptr; |
| 124 | DECLARE_WAITQUEUE(wq, current); |
Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 125 | int offset, len, i, rc; |
Stanislav Kinsbursky | 9e2e74d | 2012-01-10 17:04:24 +0400 | [diff] [blame] | 126 | struct net *net = server->nfs_client->net; |
| 127 | struct nfs_net *nn = net_generic(net, nfs_net_id); |
Stanislav Kinsbursky | cb9c1c4 | 2012-03-11 18:20:23 +0400 | [diff] [blame] | 128 | struct bl_dev_msg *reply = &nn->bl_mount_reply; |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 129 | |
| 130 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); |
| 131 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, |
| 132 | dev->mincount); |
| 133 | |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 134 | bl_pipe_msg.bl_wq = &nn->bl_wq; |
| 135 | memset(msg, 0, sizeof(*msg)); |
| 136 | msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); |
| 137 | if (!msg->data) { |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 138 | rv = ERR_PTR(-ENOMEM); |
| 139 | goto out; |
| 140 | } |
| 141 | |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 142 | memcpy(msg->data, &bl_msg, sizeof(bl_msg)); |
| 143 | dataptr = (uint8_t *) msg->data; |
Fred Isaman | 2f9fd18 | 2011-07-30 20:52:46 -0400 | [diff] [blame] | 144 | len = dev->mincount; |
| 145 | offset = sizeof(bl_msg); |
| 146 | for (i = 0; len > 0; i++) { |
| 147 | memcpy(&dataptr[offset], page_address(dev->pages[i]), |
| 148 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); |
| 149 | len -= PAGE_CACHE_SIZE; |
| 150 | offset += PAGE_CACHE_SIZE; |
| 151 | } |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 152 | msg->len = sizeof(bl_msg) + dev->mincount; |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 153 | |
| 154 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 155 | add_wait_queue(&nn->bl_wq, &wq); |
| 156 | rc = rpc_queue_upcall(nn->bl_device_pipe, msg); |
Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 157 | if (rc < 0) { |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 158 | remove_wait_queue(&nn->bl_wq, &wq); |
Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 159 | rv = ERR_PTR(rc); |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 160 | goto out; |
| 161 | } |
| 162 | |
| 163 | set_current_state(TASK_UNINTERRUPTIBLE); |
| 164 | schedule(); |
| 165 | __set_current_state(TASK_RUNNING); |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 166 | remove_wait_queue(&nn->bl_wq, &wq); |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 167 | |
| 168 | if (reply->status != BL_DEVICE_REQUEST_PROC) { |
| 169 | dprintk("%s failed to open device: %d\n", |
| 170 | __func__, reply->status); |
| 171 | rv = ERR_PTR(-EINVAL); |
| 172 | goto out; |
| 173 | } |
| 174 | |
| 175 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); |
| 176 | if (IS_ERR(bd)) { |
Jim Rees | 516f2e2 | 2011-09-22 21:50:08 -0400 | [diff] [blame] | 177 | rc = PTR_ERR(bd); |
| 178 | dprintk("%s failed to open device : %d\n", __func__, rc); |
| 179 | rv = ERR_PTR(rc); |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 180 | goto out; |
| 181 | } |
| 182 | |
| 183 | rv = kzalloc(sizeof(*rv), GFP_NOFS); |
| 184 | if (!rv) { |
| 185 | rv = ERR_PTR(-ENOMEM); |
| 186 | goto out; |
| 187 | } |
| 188 | |
| 189 | rv->bm_mdev = bd; |
| 190 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); |
Stanislav Kinsbursky | 9e2e74d | 2012-01-10 17:04:24 +0400 | [diff] [blame] | 191 | rv->net = net; |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 192 | dprintk("%s Created device %s with bd_block_size %u\n", |
| 193 | __func__, |
| 194 | bd->bd_disk->disk_name, |
| 195 | bd->bd_block_size); |
| 196 | |
| 197 | out: |
Stanislav Kinsbursky | 5ffaf85 | 2012-03-11 18:20:31 +0400 | [diff] [blame] | 198 | kfree(msg->data); |
Jim Rees | fe0a9b7 | 2011-07-30 20:52:42 -0400 | [diff] [blame] | 199 | return rv; |
| 200 | } |
Fred Isaman | a60d2eb | 2011-07-30 20:52:44 -0400 | [diff] [blame] | 201 | |
Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 202 | /* Map deviceid returned by the server to constructed block_device */ |
| 203 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, |
| 204 | struct nfs4_deviceid *id) |
| 205 | { |
| 206 | struct block_device *rv = NULL; |
| 207 | struct block_mount_id *mid; |
| 208 | struct pnfs_block_dev *dev; |
| 209 | |
| 210 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); |
| 211 | mid = BLK_ID(lo); |
| 212 | spin_lock(&mid->bm_lock); |
| 213 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { |
| 214 | if (memcmp(id->data, dev->bm_mdevid.data, |
| 215 | NFS4_DEVICEID4_SIZE) == 0) { |
| 216 | rv = dev->bm_mdev; |
| 217 | goto out; |
| 218 | } |
| 219 | } |
| 220 | out: |
| 221 | spin_unlock(&mid->bm_lock); |
| 222 | dprintk("%s returning %p\n", __func__, rv); |
| 223 | return rv; |
| 224 | } |
| 225 | |
| 226 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ |
| 227 | struct layout_verification { |
| 228 | u32 mode; /* R or RW */ |
| 229 | u64 start; /* Expected start of next non-COW extent */ |
| 230 | u64 inval; /* Start of INVAL coverage */ |
| 231 | u64 cowread; /* End of COW read coverage */ |
| 232 | }; |
| 233 | |
| 234 | /* Verify the extent meets the layout requirements of the pnfs-block draft, |
| 235 | * section 2.3.1. |
| 236 | */ |
| 237 | static int verify_extent(struct pnfs_block_extent *be, |
| 238 | struct layout_verification *lv) |
| 239 | { |
| 240 | if (lv->mode == IOMODE_READ) { |
| 241 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || |
| 242 | be->be_state == PNFS_BLOCK_INVALID_DATA) |
| 243 | return -EIO; |
| 244 | if (be->be_f_offset != lv->start) |
| 245 | return -EIO; |
| 246 | lv->start += be->be_length; |
| 247 | return 0; |
| 248 | } |
| 249 | /* lv->mode == IOMODE_RW */ |
| 250 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { |
| 251 | if (be->be_f_offset != lv->start) |
| 252 | return -EIO; |
| 253 | if (lv->cowread > lv->start) |
| 254 | return -EIO; |
| 255 | lv->start += be->be_length; |
| 256 | lv->inval = lv->start; |
| 257 | return 0; |
| 258 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { |
| 259 | if (be->be_f_offset != lv->start) |
| 260 | return -EIO; |
| 261 | lv->start += be->be_length; |
| 262 | return 0; |
| 263 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { |
| 264 | if (be->be_f_offset > lv->start) |
| 265 | return -EIO; |
| 266 | if (be->be_f_offset < lv->inval) |
| 267 | return -EIO; |
| 268 | if (be->be_f_offset < lv->cowread) |
| 269 | return -EIO; |
| 270 | /* It looks like you might want to min this with lv->start, |
| 271 | * but you really don't. |
| 272 | */ |
| 273 | lv->inval = lv->inval + be->be_length; |
| 274 | lv->cowread = be->be_f_offset + be->be_length; |
| 275 | return 0; |
| 276 | } else |
| 277 | return -EIO; |
| 278 | } |
| 279 | |
| 280 | /* XDR decode pnfs_block_layout4 structure */ |
Fred Isaman | a60d2eb | 2011-07-30 20:52:44 -0400 | [diff] [blame] | 281 | int |
| 282 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, |
| 283 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) |
| 284 | { |
Fred Isaman | e9437cc | 2011-07-30 20:52:47 -0400 | [diff] [blame] | 285 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); |
| 286 | int i, status = -EIO; |
| 287 | uint32_t count; |
| 288 | struct pnfs_block_extent *be = NULL, *save; |
| 289 | struct xdr_stream stream; |
| 290 | struct xdr_buf buf; |
| 291 | struct page *scratch; |
| 292 | __be32 *p; |
| 293 | struct layout_verification lv = { |
| 294 | .mode = lgr->range.iomode, |
| 295 | .start = lgr->range.offset >> SECTOR_SHIFT, |
| 296 | .inval = lgr->range.offset >> SECTOR_SHIFT, |
| 297 | .cowread = lgr->range.offset >> SECTOR_SHIFT, |
| 298 | }; |
| 299 | LIST_HEAD(extents); |
| 300 | |
| 301 | dprintk("---> %s\n", __func__); |
| 302 | |
| 303 | scratch = alloc_page(gfp_flags); |
| 304 | if (!scratch) |
| 305 | return -ENOMEM; |
| 306 | |
| 307 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); |
| 308 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); |
| 309 | |
| 310 | p = xdr_inline_decode(&stream, 4); |
| 311 | if (unlikely(!p)) |
| 312 | goto out_err; |
| 313 | |
| 314 | count = be32_to_cpup(p++); |
| 315 | |
| 316 | dprintk("%s enter, number of extents %i\n", __func__, count); |
| 317 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); |
| 318 | if (unlikely(!p)) |
| 319 | goto out_err; |
| 320 | |
| 321 | /* Decode individual extents, putting them in temporary |
| 322 | * staging area until whole layout is decoded to make error |
| 323 | * recovery easier. |
| 324 | */ |
| 325 | for (i = 0; i < count; i++) { |
| 326 | be = bl_alloc_extent(); |
| 327 | if (!be) { |
| 328 | status = -ENOMEM; |
| 329 | goto out_err; |
| 330 | } |
| 331 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); |
| 332 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); |
| 333 | be->be_mdev = translate_devid(lo, &be->be_devid); |
| 334 | if (!be->be_mdev) |
| 335 | goto out_err; |
| 336 | |
| 337 | /* The next three values are read in as bytes, |
| 338 | * but stored as 512-byte sector lengths |
| 339 | */ |
| 340 | if (decode_sector_number(&p, &be->be_f_offset) < 0) |
| 341 | goto out_err; |
| 342 | if (decode_sector_number(&p, &be->be_length) < 0) |
| 343 | goto out_err; |
| 344 | if (decode_sector_number(&p, &be->be_v_offset) < 0) |
| 345 | goto out_err; |
| 346 | be->be_state = be32_to_cpup(p++); |
| 347 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) |
| 348 | be->be_inval = &bl->bl_inval; |
| 349 | if (verify_extent(be, &lv)) { |
| 350 | dprintk("%s verify failed\n", __func__); |
| 351 | goto out_err; |
| 352 | } |
| 353 | list_add_tail(&be->be_node, &extents); |
| 354 | } |
| 355 | if (lgr->range.offset + lgr->range.length != |
| 356 | lv.start << SECTOR_SHIFT) { |
| 357 | dprintk("%s Final length mismatch\n", __func__); |
| 358 | be = NULL; |
| 359 | goto out_err; |
| 360 | } |
| 361 | if (lv.start < lv.cowread) { |
| 362 | dprintk("%s Final uncovered COW extent\n", __func__); |
| 363 | be = NULL; |
| 364 | goto out_err; |
| 365 | } |
| 366 | /* Extents decoded properly, now try to merge them in to |
| 367 | * existing layout extents. |
| 368 | */ |
| 369 | spin_lock(&bl->bl_ext_lock); |
| 370 | list_for_each_entry_safe(be, save, &extents, be_node) { |
| 371 | list_del(&be->be_node); |
| 372 | status = bl_add_merge_extent(bl, be); |
| 373 | if (status) { |
| 374 | spin_unlock(&bl->bl_ext_lock); |
| 375 | /* This is a fairly catastrophic error, as the |
| 376 | * entire layout extent lists are now corrupted. |
| 377 | * We should have some way to distinguish this. |
| 378 | */ |
| 379 | be = NULL; |
| 380 | goto out_err; |
| 381 | } |
| 382 | } |
| 383 | spin_unlock(&bl->bl_ext_lock); |
| 384 | status = 0; |
| 385 | out: |
| 386 | __free_page(scratch); |
| 387 | dprintk("%s returns %i\n", __func__, status); |
| 388 | return status; |
| 389 | |
| 390 | out_err: |
| 391 | bl_put_extent(be); |
| 392 | while (!list_empty(&extents)) { |
| 393 | be = list_first_entry(&extents, struct pnfs_block_extent, |
| 394 | be_node); |
| 395 | list_del(&be->be_node); |
| 396 | bl_put_extent(be); |
| 397 | } |
| 398 | goto out; |
Fred Isaman | a60d2eb | 2011-07-30 20:52:44 -0400 | [diff] [blame] | 399 | } |