blob: 5157ef6d00417e18c7ce51f87fe1f2270f65afa1 [file] [log] [blame]
/*
 * pNFS Objects layout driver high level definitions
 *
 * Copyright (C) 2007 Panasas Inc. [year of first publication]
 * All rights reserved.
 *
 * Benny Halevy <bhalevy@panasas.com>
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2
 * See the file COPYING included with this distribution for more details.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. Neither the name of the Panasas company nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <scsi/osd_initiator.h>
#include "objlayout.h"

#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44/*
Benny Halevye51b8412011-05-22 19:51:48 +030045 * Create a objlayout layout structure for the given inode and return it.
46 */
47struct pnfs_layout_hdr *
48objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
49{
50 struct objlayout *objlay;
51
52 objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
53 dprintk("%s: Return %p\n", __func__, objlay);
54 return &objlay->pnfs_layout;
55}
56
/*
 * Release an objlayout layout header previously returned by
 * objlayout_alloc_layout_hdr().
 */
void
objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct objlayout *objlay = OBJLAYOUT(lo);

	dprintk("%s: objlay %p\n", __func__, objlay);
	kfree(objlay);
}
69
70/*
Boaz Harrosh09f5bf42011-05-22 19:50:20 +030071 * Unmarshall layout and store it in pnfslay.
72 */
73struct pnfs_layout_segment *
74objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
75 struct nfs4_layoutget_res *lgr,
76 gfp_t gfp_flags)
77{
78 int status = -ENOMEM;
79 struct xdr_stream stream;
80 struct xdr_buf buf = {
81 .pages = lgr->layoutp->pages,
82 .page_len = lgr->layoutp->len,
83 .buflen = lgr->layoutp->len,
84 .len = lgr->layoutp->len,
85 };
86 struct page *scratch;
87 struct pnfs_layout_segment *lseg;
88
89 dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
90
91 scratch = alloc_page(gfp_flags);
92 if (!scratch)
93 goto err_nofree;
94
95 xdr_init_decode(&stream, &buf, NULL);
96 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
97
98 status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
99 if (unlikely(status)) {
100 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
101 status);
102 goto err;
103 }
104
105 __free_page(scratch);
106
107 dprintk("%s: Return %p\n", __func__, lseg);
108 return lseg;
109
110err:
111 __free_page(scratch);
112err_nofree:
113 dprintk("%s: Err Return=>%d\n", __func__, status);
114 return ERR_PTR(status);
115}
116
/*
 * Free a layout segment allocated by objlayout_alloc_lseg().
 * A NULL segment is tolerated and ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (lseg)
		objio_free_lseg(lseg);
}
130
/*
 * I/O Operations
 */
134static inline u64
135end_offset(u64 start, u64 len)
136{
137 u64 end;
138
139 end = start + len;
140 return end >= start ? end : NFS4_MAX_UINT64;
141}
142
143/* last octet in a range */
144static inline u64
145last_byte_offset(u64 start, u64 len)
146{
147 u64 end;
148
149 BUG_ON(!len);
150 end = start + len;
151 return end > start ? end - 1 : NFS4_MAX_UINT64;
152}
153
154static struct objlayout_io_state *
155objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
156 struct page **pages,
157 unsigned pgbase,
158 loff_t offset,
159 size_t count,
160 struct pnfs_layout_segment *lseg,
161 void *rpcdata,
162 gfp_t gfp_flags)
163{
164 struct objlayout_io_state *state;
165 u64 lseg_end_offset;
166
167 dprintk("%s: allocating io_state\n", __func__);
168 if (objio_alloc_io_state(lseg, &state, gfp_flags))
169 return NULL;
170
171 BUG_ON(offset < lseg->pls_range.offset);
172 lseg_end_offset = end_offset(lseg->pls_range.offset,
173 lseg->pls_range.length);
174 BUG_ON(offset >= lseg_end_offset);
175 if (offset + count > lseg_end_offset) {
176 count = lseg->pls_range.length -
177 (offset - lseg->pls_range.offset);
178 dprintk("%s: truncated count %Zd\n", __func__, count);
179 }
180
181 if (pgbase > PAGE_SIZE) {
182 pages += pgbase >> PAGE_SHIFT;
183 pgbase &= ~PAGE_MASK;
184 }
185
186 state->lseg = lseg;
187 state->rpcdata = rpcdata;
188 state->pages = pages;
189 state->pgbase = pgbase;
190 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
191 state->offset = offset;
192 state->count = count;
193 state->sync = 0;
194
195 return state;
196}
197
/*
 * Release an I/O state allocated by objlayout_alloc_io_state().
 * A NULL state is tolerated and ignored.
 */
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
	dprintk("%s: freeing io_state\n", __func__);
	if (state)
		objio_free_io_state(state);
}
207
/*
 * I/O done common code: releases the per-I/O state.  Called from both
 * the read and write completion paths once the results have been
 * copied out of @state.
 */
static void
objlayout_iodone(struct objlayout_io_state *state)
{
	/* NOTE(review): message says "status" but prints no status value */
	dprintk("%s: state %p status\n", __func__, state);

	objlayout_free_io_state(state);
}
218
219/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
220 * This is because the osd completion is called with ints-off from
221 * the block layer
222 */
223static void _rpc_read_complete(struct work_struct *work)
224{
225 struct rpc_task *task;
226 struct nfs_read_data *rdata;
227
228 dprintk("%s enter\n", __func__);
229 task = container_of(work, struct rpc_task, u.tk_work);
230 rdata = container_of(task, struct nfs_read_data, task);
231
232 pnfs_ld_read_done(rdata);
233}
234
235void
236objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
237{
238 int eof = state->eof;
239 struct nfs_read_data *rdata;
240
241 state->status = status;
242 dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
243 rdata = state->rpcdata;
244 rdata->task.tk_status = status;
245 if (status >= 0) {
246 rdata->res.count = status;
247 rdata->res.eof = eof;
248 }
249 objlayout_iodone(state);
250 /* must not use state after this point */
251
252 if (sync)
253 pnfs_ld_read_done(rdata);
254 else {
255 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
256 schedule_work(&rdata->task.u.tk_work);
257 }
258}
259
260/*
261 * Perform sync or async reads.
262 */
263enum pnfs_try_status
264objlayout_read_pagelist(struct nfs_read_data *rdata)
265{
266 loff_t offset = rdata->args.offset;
267 size_t count = rdata->args.count;
268 struct objlayout_io_state *state;
269 ssize_t status = 0;
270 loff_t eof;
271
272 dprintk("%s: Begin inode %p offset %llu count %d\n",
273 __func__, rdata->inode, offset, (int)count);
274
275 eof = i_size_read(rdata->inode);
276 if (unlikely(offset + count > eof)) {
277 if (offset >= eof) {
278 status = 0;
279 rdata->res.count = 0;
280 rdata->res.eof = 1;
281 goto out;
282 }
283 count = eof - offset;
284 }
285
286 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
287 rdata->args.pages, rdata->args.pgbase,
288 offset, count,
289 rdata->lseg, rdata,
290 GFP_KERNEL);
291 if (unlikely(!state)) {
292 status = -ENOMEM;
293 goto out;
294 }
295
296 state->eof = state->offset + state->count >= eof;
297
298 status = objio_read_pagelist(state);
299 out:
300 dprintk("%s: Return status %Zd\n", __func__, status);
301 rdata->pnfs_error = status;
302 return PNFS_ATTEMPTED;
303}
304
305/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
306 * This is because the osd completion is called with ints-off from
307 * the block layer
308 */
309static void _rpc_write_complete(struct work_struct *work)
310{
311 struct rpc_task *task;
312 struct nfs_write_data *wdata;
313
314 dprintk("%s enter\n", __func__);
315 task = container_of(work, struct rpc_task, u.tk_work);
316 wdata = container_of(task, struct nfs_write_data, task);
317
318 pnfs_ld_write_done(wdata);
319}
320
/*
 * Write completion callback from the objio engine.
 *
 * Records @status and the commit level in the nfs_write_data, frees the
 * I/O state, and completes the write either directly (@sync) or via the
 * rpc workqueue, since we may be called with interrupts disabled.
 */
void
objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
		     bool sync)
{
	struct nfs_write_data *wdata;

	dprintk("%s: Begin\n", __func__);
	wdata = state->rpcdata;
	state->status = status;
	wdata->task.tk_status = status;
	if (status >= 0) {
		/* non-negative status is the number of bytes written */
		wdata->res.count = status;
		/* pass the server's commit level back to the generic layer */
		wdata->verf.committed = state->committed;
		dprintk("%s: Return status %d committed %d\n",
			__func__, wdata->task.tk_status,
			wdata->verf.committed);
	} else
		dprintk("%s: Return status %d\n",
			__func__, wdata->task.tk_status);
	objlayout_iodone(state);
	/* must not use state after this point */

	if (sync)
		pnfs_ld_write_done(wdata);
	else {
		/* defer completion to process context */
		INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
		schedule_work(&wdata->task.u.tk_work);
	}
}
350
351/*
352 * Perform sync or async writes.
353 */
354enum pnfs_try_status
355objlayout_write_pagelist(struct nfs_write_data *wdata,
356 int how)
357{
358 struct objlayout_io_state *state;
359 ssize_t status;
360
361 dprintk("%s: Begin inode %p offset %llu count %u\n",
362 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
363
364 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
365 wdata->args.pages,
366 wdata->args.pgbase,
367 wdata->args.offset,
368 wdata->args.count,
369 wdata->lseg, wdata,
370 GFP_NOFS);
371 if (unlikely(!state)) {
372 status = -ENOMEM;
373 goto out;
374 }
375
376 state->sync = how & FLUSH_SYNC;
377
378 status = objio_write_pagelist(state, how & FLUSH_STABLE);
379 out:
380 dprintk("%s: Return status %Zd\n", __func__, status);
381 wdata->pnfs_error = status;
382 return PNFS_ATTEMPTED;
383}
384
/*
 * Get Device Info API for io engines
 */
struct objlayout_deviceinfo {
	struct page *page;	/* backing page the deviceaddr was decoded from */
	struct pnfs_osd_deviceaddr da; /* This must be last */
};
392
393/* Initialize and call nfs_getdeviceinfo, then decode and return a
394 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
395 * should be called.
396 */
397int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
398 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
399 gfp_t gfp_flags)
400{
401 struct objlayout_deviceinfo *odi;
402 struct pnfs_device pd;
403 struct super_block *sb;
404 struct page *page, **pages;
405 u32 *p;
406 int err;
407
408 page = alloc_page(gfp_flags);
409 if (!page)
410 return -ENOMEM;
411
412 pages = &page;
413 pd.pages = pages;
414
415 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
416 pd.layout_type = LAYOUT_OSD2_OBJECTS;
417 pd.pages = &page;
418 pd.pgbase = 0;
419 pd.pglen = PAGE_SIZE;
420 pd.mincount = 0;
421
422 sb = pnfslay->plh_inode->i_sb;
423 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
424 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
425 if (err)
426 goto err_out;
427
428 p = page_address(page);
429 odi = kzalloc(sizeof(*odi), gfp_flags);
430 if (!odi) {
431 err = -ENOMEM;
432 goto err_out;
433 }
434 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
435 odi->page = page;
436 *deviceaddr = &odi->da;
437 return 0;
438
439err_out:
440 __free_page(page);
441 return err;
442}
443
444void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
445{
446 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
447 struct objlayout_deviceinfo,
448 da);
449
450 __free_page(odi->page);
451 kfree(odi);
452}