blob: 3bb2b74cf600414be7d70a3c8a345a669ee04dcc [file] [log] [blame]
Tom Haynesf54bcf22014-12-11 15:34:59 -05001/*
2 * Common NFS I/O operations for the pnfs file based
3 * layout drivers.
4 *
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
6 *
7 * Tom Haynes <loghyr@primarydata.com>
8 */
9
10#include <linux/nfs_fs.h>
11#include <linux/nfs_page.h>
12
13#include "internal.h"
14#include "pnfs.h"
15
Peng Tao875ae062014-05-29 21:06:57 +080016#define NFSDBG_FACILITY NFSDBG_PNFS
17
Tom Haynesf54bcf22014-12-11 15:34:59 -050018static void pnfs_generic_fenceme(struct inode *inode,
19 struct pnfs_layout_hdr *lo)
20{
21 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
22 return;
23 pnfs_return_layout(inode);
24}
25
26void pnfs_generic_rw_release(void *data)
27{
28 struct nfs_pgio_header *hdr = data;
29 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
30
31 pnfs_generic_fenceme(lo->plh_inode, lo);
32 nfs_put_client(hdr->ds_clp);
33 hdr->mds_ops->rpc_release(data);
34}
35EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
36
/* Fake up some data that will cause nfs_commit_release to retry the writes. */
void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
{
	struct nfs_page *first = nfs_list_entry(data->pages.next);

	/* Clear the stale RPC status so the failure is not treated as final. */
	data->task.tk_status = 0;
	/* Copy the verifier from the first request, then corrupt one byte
	 * so the verifier comparison is guaranteed to fail and the commit
	 * path resends the writes. */
	memcpy(&data->verf.verifier, &first->wb_verf,
	       sizeof(data->verf.verifier));
	data->verf.verifier.data[0]++; /* ensure verifier mismatch */
}
EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
48
49void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
50{
51 struct nfs_commit_data *wdata = data;
52
53 /* Note this may cause RPC to be resent */
54 wdata->mds_ops->rpc_call_done(task, data);
55}
56EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
57
/*
 * Release a pNFS commit: run the generic completion callback, then drop
 * the layout segment and data-server client references held by this
 * commit, and finally free the commit data itself.
 */
void pnfs_generic_commit_release(void *calldata)
{
	struct nfs_commit_data *data = calldata;

	data->completion_ops->completion(data);
	pnfs_put_lseg(data->lseg);
	nfs_put_client(data->ds_clp);
	nfs_commitdata_release(data);
}
EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
68
/* The generic layer is about to remove the req from the commit list.
 * If this will make the bucket empty, it will need to put the lseg reference.
 * Note this must be called holding the inode (/cinfo) lock
 */
void
pnfs_generic_clear_request_commit(struct nfs_page *req,
				  struct nfs_commit_info *cinfo)
{
	struct pnfs_layout_segment *freeme = NULL;

	/* Nothing extra to undo unless the req was marked for a DS commit. */
	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
		goto out;
	cinfo->ds->nwritten--;
	if (list_is_singular(&req->wb_list)) {
		struct pnfs_commit_bucket *bucket;

		/* req is the only entry on the list, so req->wb_list.next
		 * is the list head embedded in the bucket; recover the
		 * bucket from it and steal its wlseg reference, since the
		 * bucket is about to go empty. */
		bucket = list_first_entry(&req->wb_list,
					  struct pnfs_commit_bucket,
					  written);
		freeme = bucket->wlseg;
		bucket->wlseg = NULL;
	}
out:
	nfs_request_remove_commit_list(req, cinfo);
	/* _locked variant: the caller already holds the cinfo lock. */
	pnfs_put_lseg_locked(freeme);
}
EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
96
/*
 * Move up to @max lockable requests from @src to @dst, taking an extra
 * reference on each.  Called with cinfo->lock held; cond_resched_lock()
 * may drop and retake it, in which case list_safe_reset_next()
 * revalidates the precomputed next pointer.
 *
 * Returns the number of requests moved.  Requests that cannot be locked
 * are skipped.  The @max limit is ignored for O_DIRECT commits
 * (cinfo->dreq set), which always transfer everything.
 */
static int
pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
				  struct nfs_commit_info *cinfo, int max)
{
	struct nfs_page *req, *tmp;
	int ret = 0;

	list_for_each_entry_safe(req, tmp, src, wb_list) {
		if (!nfs_lock_request(req))
			continue;
		kref_get(&req->wb_kref);
		if (cond_resched_lock(cinfo->lock))
			list_safe_reset_next(req, tmp, wb_list);
		nfs_request_remove_commit_list(req, cinfo);
		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
		nfs_list_add_request(req, dst);
		ret++;
		if ((ret == max) && !cinfo->dreq)
			break;
	}
	return ret;
}
119
/*
 * Move up to @max requests from this bucket's "written" list to its
 * "committing" list, updating the nwritten/ncommitting counters and the
 * bucket's lseg references: if the written list drained completely the
 * wlseg reference simply transfers to clseg, otherwise clseg needs its
 * own additional reference.  Called with cinfo->lock held.
 */
static int
pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
				 struct nfs_commit_info *cinfo,
				 int max)
{
	struct list_head *src = &bucket->written;
	struct list_head *dst = &bucket->committing;
	int ret;

	lockdep_assert_held(cinfo->lock);
	ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
	if (ret) {
		cinfo->ds->nwritten -= ret;
		cinfo->ds->ncommitting += ret;
		bucket->clseg = bucket->wlseg;
		if (list_empty(src))
			bucket->wlseg = NULL;	/* reference moved to clseg */
		else
			pnfs_get_lseg(bucket->clseg);
	}
	return ret;
}
142
Tom Haynes085d1e32014-12-11 13:04:55 -0500143/* Move reqs from written to committing lists, returning count
144 * of number moved.
Tom Haynesf54bcf22014-12-11 15:34:59 -0500145 */
146int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
147 int max)
148{
149 int i, rv = 0, cnt;
150
Tom Haynes085d1e32014-12-11 13:04:55 -0500151 lockdep_assert_held(cinfo->lock);
Tom Haynesf54bcf22014-12-11 15:34:59 -0500152 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
153 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
154 cinfo, max);
155 max -= cnt;
156 rv += cnt;
157 }
158 return rv;
159}
160EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
161
/* Pull everything off the committing lists and dump into @dst. */
void pnfs_generic_recover_commit_reqs(struct list_head *dst,
				      struct nfs_commit_info *cinfo)
{
	struct pnfs_commit_bucket *b;
	struct pnfs_layout_segment *freeme;
	int i;

	lockdep_assert_held(cinfo->lock);
restart:
	for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
		/* max == 0: transfer every request in the bucket */
		if (pnfs_generic_transfer_commit_list(&b->written, dst,
						      cinfo, 0)) {
			freeme = b->wlseg;
			b->wlseg = NULL;
			/* cinfo->lock cannot be held across pnfs_put_lseg()
			 * (note clear_request_commit uses the _locked
			 * variant instead); after retaking the lock the
			 * buckets may have changed, so restart the scan. */
			spin_unlock(cinfo->lock);
			pnfs_put_lseg(freeme);
			spin_lock(cinfo->lock);
			goto restart;
		}
	}
	cinfo->ds->nwritten = 0;
}
EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
186
/*
 * Requeue for a later commit every request still sitting on the
 * committing lists of buckets idx..nbuckets-1, dropping each bucket's
 * clseg reference.  Used to unwind when commit-data allocation fails.
 */
static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
{
	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
	struct pnfs_commit_bucket *bucket;
	struct pnfs_layout_segment *freeme;
	int i;

	for (i = idx; i < fl_cinfo->nbuckets; i++) {
		bucket = &fl_cinfo->buckets[i];
		if (list_empty(&bucket->committing))
			continue;
		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
		/* clseg is shared state: detach it under the lock, drop
		 * the reference only after releasing the lock. */
		spin_lock(cinfo->lock);
		freeme = bucket->clseg;
		bucket->clseg = NULL;
		spin_unlock(cinfo->lock);
		pnfs_put_lseg(freeme);
	}
}
206
/*
 * Allocate one nfs_commit_data per bucket with a non-empty committing
 * list and queue it on @list, moving the bucket's clseg reference into
 * data->lseg.  Returns the number of commits allocated.  If allocation
 * fails part-way, the remaining buckets (from the failing index onward)
 * are handed to pnfs_generic_retry_commit() so their requests get
 * requeued.
 */
static unsigned int
pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
			      struct list_head *list)
{
	struct pnfs_ds_commit_info *fl_cinfo;
	struct pnfs_commit_bucket *bucket;
	struct nfs_commit_data *data;
	int i;
	unsigned int nreq = 0;

	fl_cinfo = cinfo->ds;
	bucket = fl_cinfo->buckets;
	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
		if (list_empty(&bucket->committing))
			continue;
		data = nfs_commitdata_alloc();
		if (!data)
			break;
		data->ds_commit_index = i;
		/* Transfer the bucket's lseg reference to the commit. */
		spin_lock(cinfo->lock);
		data->lseg = bucket->clseg;
		bucket->clseg = NULL;
		spin_unlock(cinfo->lock);
		list_add(&data->pages, list);
		nreq++;
	}

	/* Clean up on error (no-op when the loop completed: i == nbuckets) */
	pnfs_generic_retry_commit(cinfo, i);
	return nreq;
}
238
/* This follows nfs_commit_list pretty closely */
/*
 * Initiate commits both for @mds_pages (sent through the MDS) and for
 * every bucket holding requests that need a data-server commit.
 *
 * @inode:           inode being committed
 * @mds_pages:       requests to commit through the MDS
 * @how:             commit flags passed through to the RPC layer
 * @cinfo:           commit bookkeeping for this inode
 * @initiate_commit: layout-driver callback that sends a DS commit
 *
 * Returns PNFS_ATTEMPTED, or -ENOMEM if the MDS commit data could not
 * be allocated (in which case all requests are requeued and the error
 * cleanup callback has already run).
 */
int
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
			     int how, struct nfs_commit_info *cinfo,
			     int (*initiate_commit)(struct nfs_commit_data *data,
						    int how))
{
	struct nfs_commit_data *data, *tmp;
	LIST_HEAD(list);
	unsigned int nreq = 0;

	if (!list_empty(mds_pages)) {
		data = nfs_commitdata_alloc();
		if (data != NULL) {
			/* lseg == NULL marks this entry as the MDS commit. */
			data->lseg = NULL;
			list_add(&data->pages, &list);
			nreq++;
		} else {
			/* Out of memory: requeue the MDS pages and every
			 * bucketed DS request for a later attempt. */
			nfs_retry_commit(mds_pages, NULL, cinfo);
			pnfs_generic_retry_commit(cinfo, 0);
			cinfo->completion_ops->error_cleanup(NFS_I(inode));
			return -ENOMEM;
		}
	}

	nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);

	if (nreq == 0) {
		cinfo->completion_ops->error_cleanup(NFS_I(inode));
		goto out;
	}

	atomic_add(nreq, &cinfo->mds->rpcs_out);

	/* Dispatch each queued commit: through the MDS when lseg is NULL,
	 * otherwise through the layout driver's initiate_commit. */
	list_for_each_entry_safe(data, tmp, &list, pages) {
		list_del_init(&data->pages);
		if (!data->lseg) {
			nfs_init_commit(data, mds_pages, NULL, cinfo);
			nfs_initiate_commit(NFS_CLIENT(inode), data,
					    data->mds_ops, how, 0);
		} else {
			struct pnfs_commit_bucket *buckets;

			buckets = cinfo->ds->buckets;
			nfs_init_commit(data,
					&buckets[data->ds_commit_index].committing,
					data->lseg,
					cinfo);
			initiate_commit(data, how);
		}
	}
out:
	cinfo->ds->ncommitting = 0;
	return PNFS_ATTEMPTED;
}
EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
Peng Tao875ae062014-05-29 21:06:57 +0800295
/*
 * Data server cache
 *
 * Data servers can be mapped to different device ids.
 * nfs4_pnfs_ds reference counting
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache.
 *   - decremented when deviceid is removed from the cache.
 */
/* Protects nfs4_data_server_cache and the final ds_count drop (see
 * nfs4_pnfs_ds_put()). */
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
static LIST_HEAD(nfs4_data_server_cache);
307
308/* Debug routines */
309static void
310print_ds(struct nfs4_pnfs_ds *ds)
311{
312 if (ds == NULL) {
313 printk(KERN_WARNING "%s NULL device\n", __func__);
314 return;
315 }
316 printk(KERN_WARNING " ds %s\n"
317 " ref count %d\n"
318 " client %p\n"
319 " cl_exchange_flags %x\n",
320 ds->ds_remotestr,
321 atomic_read(&ds->ds_count), ds->ds_clp,
322 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
323}
324
325static bool
326same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
327{
328 struct sockaddr_in *a, *b;
329 struct sockaddr_in6 *a6, *b6;
330
331 if (addr1->sa_family != addr2->sa_family)
332 return false;
333
334 switch (addr1->sa_family) {
335 case AF_INET:
336 a = (struct sockaddr_in *)addr1;
337 b = (struct sockaddr_in *)addr2;
338
339 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
340 a->sin_port == b->sin_port)
341 return true;
342 break;
343
344 case AF_INET6:
345 a6 = (struct sockaddr_in6 *)addr1;
346 b6 = (struct sockaddr_in6 *)addr2;
347
348 /* LINKLOCAL addresses must have matching scope_id */
349 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
350 IPV6_ADDR_SCOPE_LINKLOCAL &&
351 a6->sin6_scope_id != b6->sin6_scope_id)
352 return false;
353
354 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
355 a6->sin6_port == b6->sin6_port)
356 return true;
357 break;
358
359 default:
360 dprintk("%s: unhandled address family: %u\n",
361 __func__, addr1->sa_family);
362 return false;
363 }
364
365 return false;
366}
367
/*
 * Compare two multipath address lists entry by entry; false on the
 * first mismatching sockaddr.
 */
static bool
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
			       const struct list_head *dsaddrs2)
{
	struct nfs4_pnfs_ds_addr *da1, *da2;

	/* step through both lists, comparing as we go */
	/* NOTE(review): kernel lists are circular, so da1/da2 never become
	 * NULL via list_entry(); termination appears to rely on
	 * same_sockaddr() mismatching once an iterator wraps onto the
	 * container of the list head -- confirm this is intended. */
	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
	     da1 != NULL && da2 != NULL;
	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
				   (struct sockaddr *)&da2->da_addr))
			return false;
	}
	if (da1 == NULL && da2 == NULL)
		return true;

	return false;
}
389
390/*
391 * Lookup DS by addresses. nfs4_ds_cache_lock is held
392 */
393static struct nfs4_pnfs_ds *
394_data_server_lookup_locked(const struct list_head *dsaddrs)
395{
396 struct nfs4_pnfs_ds *ds;
397
398 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
399 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
400 return ds;
401 return NULL;
402}
403
/*
 * Free a data-server entry: drop its nfs_client reference, free every
 * address on its multipath list, then the entry itself.  Called only
 * when the last reference is dropped (see nfs4_pnfs_ds_put()).
 */
static void destroy_ds(struct nfs4_pnfs_ds *ds)
{
	struct nfs4_pnfs_ds_addr *da;

	dprintk("--> %s\n", __func__);
	ifdebug(FACILITY)
		print_ds(ds);

	nfs_put_client(ds->ds_clp);

	while (!list_empty(&ds->ds_addrs)) {
		da = list_first_entry(&ds->ds_addrs,
				      struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}

	kfree(ds->ds_remotestr);
	kfree(ds);
}
426
/*
 * Drop a reference to a cached data server.  atomic_dec_and_lock()
 * takes nfs4_ds_cache_lock only when the count reaches zero, so the
 * final decrement and the cache unlink happen atomically; the entry is
 * then destroyed outside the lock.
 */
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
{
	if (atomic_dec_and_lock(&ds->ds_count,
				&nfs4_ds_cache_lock)) {
		list_del_init(&ds->ds_node);
		spin_unlock(&nfs4_ds_cache_lock);
		destroy_ds(ds);
	}
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
437
/*
 * Create a string with a human readable address and port to avoid
 * complicated setup around many dprinks.
 */
/*
 * The result has the form "{addr1,addr2,}" -- note each address,
 * including the last, is followed by a comma.  Returns NULL on
 * allocation failure or if the precomputed length is ever exceeded.
 * The caller owns (and must kfree) the returned string.
 */
static char *
nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
{
	struct nfs4_pnfs_ds_addr *da;
	char *remotestr;
	size_t len;
	char *p;

	len = 3;        /* '{', '}' and eol */
	list_for_each_entry(da, dsaddrs, da_node) {
		len += strlen(da->da_remotestr) + 1;	/* string plus comma */
	}

	remotestr = kzalloc(len, gfp_flags);
	if (!remotestr)
		return NULL;

	/* len now tracks the space remaining in the buffer. */
	p = remotestr;
	*(p++) = '{';
	len--;
	list_for_each_entry(da, dsaddrs, da_node) {
		size_t ll = strlen(da->da_remotestr);

		if (ll > len)
			goto out_err;

		memcpy(p, da->da_remotestr, ll);
		p += ll;
		len -= ll;

		if (len < 1)
			goto out_err;
		(*p++) = ',';
		len--;
	}
	/* need room for the closing '}' and the NUL terminator */
	if (len < 2)
		goto out_err;
	*(p++) = '}';
	*p = '\0';
	return remotestr;
out_err:
	kfree(remotestr);
	return NULL;
}
486
/*
 * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
 * uncached and return cached struct nfs4_pnfs_ds.
 */
/*
 * Cache miss: the caller's @dsaddrs entries are spliced into the new
 * entry (ownership transfers) and it is inserted with refcount 1.
 * Cache hit: @dsaddrs is left untouched, the freshly allocated entry
 * and debug string are discarded, and the cached entry's refcount is
 * bumped.  Returns NULL if @dsaddrs is empty or allocation fails.
 */
struct nfs4_pnfs_ds *
nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
{
	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
	char *remotestr;

	if (list_empty(dsaddrs)) {
		dprintk("%s: no addresses defined\n", __func__);
		goto out;
	}

	/* Allocate before taking the spinlock (kzalloc may sleep). */
	ds = kzalloc(sizeof(*ds), gfp_flags);
	if (!ds)
		goto out;

	/* this is only used for debugging, so it's ok if its NULL */
	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);

	spin_lock(&nfs4_ds_cache_lock);
	tmp_ds = _data_server_lookup_locked(dsaddrs);
	if (tmp_ds == NULL) {
		INIT_LIST_HEAD(&ds->ds_addrs);
		list_splice_init(dsaddrs, &ds->ds_addrs);
		ds->ds_remotestr = remotestr;
		atomic_set(&ds->ds_count, 1);
		INIT_LIST_HEAD(&ds->ds_node);
		ds->ds_clp = NULL;
		list_add(&ds->ds_node, &nfs4_data_server_cache);
		dprintk("%s add new data server %s\n", __func__,
			ds->ds_remotestr);
	} else {
		/* Another entry already covers these addresses: discard
		 * ours and reuse the cached one. */
		kfree(remotestr);
		kfree(ds);
		atomic_inc(&tmp_ds->ds_count);
		dprintk("%s data server %s found, inc'ed ds_count to %d\n",
			__func__, tmp_ds->ds_remotestr,
			atomic_read(&tmp_ds->ds_count));
		ds = tmp_ds;
	}
	spin_unlock(&nfs4_ds_cache_lock);
out:
	return ds;
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
534EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);