blob: c169bdda66a307d9baf3b333f2790587954d346a [file] [log] [blame]
Mike Marshall5db11c22015-07-17 10:38:12 -04001/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "pvfs2-kernel.h"
13#include "pvfs2-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
/*
 * Mark the op's I/O complete and wake the client-core daemon waiting
 * on it.  Multi-statement macro wrapped in do/while(0); the argument
 * is parenthesized so callers may pass any expression (the old form
 * expanded `op' bare, breaking for non-trivial arguments).
 */
#define wake_up_daemon_for_return(op)			\
do {							\
	spin_lock(&(op)->lock);				\
	(op)->io_completed = 1;				\
	spin_unlock(&(op)->lock);			\
	wake_up_interruptible(&(op)->io_completion_waitq);\
} while (0)
24
25/*
26 * Copy to client-core's address space from the buffers specified
27 * by the iovec upto total_size bytes.
28 * NOTE: the iovector can either contain addresses which
29 * can futher be kernel-space or user-space addresses.
30 * or it can pointers to struct page's
31 */
32static int precopy_buffers(struct pvfs2_bufmap *bufmap,
33 int buffer_index,
34 const struct iovec *vec,
35 unsigned long nr_segs,
Mike Marshall4d1c4402015-09-04 10:31:16 -040036 size_t total_size)
Mike Marshall5db11c22015-07-17 10:38:12 -040037{
38 int ret = 0;
Mike Marshall4d1c4402015-09-04 10:31:16 -040039 struct iov_iter iter;
Mike Marshall5db11c22015-07-17 10:38:12 -040040
41 /*
42 * copy data from application/kernel by pulling it out
43 * of the iovec.
44 */
Mike Marshall4d1c4402015-09-04 10:31:16 -040045
46
47 if (total_size) {
48 iov_iter_init(&iter, WRITE, vec, nr_segs, total_size);
49 ret = pvfs_bufmap_copy_from_iovec(bufmap,
50 &iter,
51 buffer_index,
52 total_size);
53 if (ret < 0)
54 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
55 __func__,
56 (long)ret);
Mike Marshall4d1c4402015-09-04 10:31:16 -040057 }
58
Mike Marshall5db11c22015-07-17 10:38:12 -040059 if (ret < 0)
60 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
61 __func__,
62 (long)ret);
63 return ret;
64}
65
66/*
67 * Copy from client-core's address space to the buffers specified
68 * by the iovec upto total_size bytes.
69 * NOTE: the iovector can either contain addresses which
70 * can futher be kernel-space or user-space addresses.
71 * or it can pointers to struct page's
72 */
73static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
74 int buffer_index,
Al Viro5f0e3c92015-10-08 17:52:44 -040075 struct iov_iter *iter,
Mike Marshall4d1c4402015-09-04 10:31:16 -040076 size_t total_size)
Mike Marshall5db11c22015-07-17 10:38:12 -040077{
78 int ret = 0;
Mike Marshall5db11c22015-07-17 10:38:12 -040079 /*
80 * copy data to application/kernel by pushing it out to
81 * the iovec. NOTE; target buffers can be addresses or
82 * struct page pointers.
83 */
84 if (total_size) {
Mike Marshall4d1c4402015-09-04 10:31:16 -040085 ret = pvfs_bufmap_copy_to_iovec(bufmap,
Al Viro5f0e3c92015-10-08 17:52:44 -040086 iter,
Al Viro5c278222015-10-08 17:43:58 -040087 buffer_index,
88 total_size);
Mike Marshall5db11c22015-07-17 10:38:12 -040089 if (ret < 0)
Mike Marshall4d1c4402015-09-04 10:31:16 -040090 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
Mike Marshall5db11c22015-07-17 10:38:12 -040091 __func__,
92 (long)ret);
93 }
94 return ret;
95}
96
97/*
98 * Post and wait for the I/O upcall to finish
99 */
static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
		loff_t *offset, struct iovec *vec, unsigned long nr_segs,
		size_t total_size, loff_t readahead_size)
{
	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
	struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
	struct pvfs2_bufmap *bufmap = NULL;
	struct pvfs2_kernel_op_s *new_op = NULL;
	int buffer_index = -1;	/* -1 means "no shared buffer held" (see out:) */
	ssize_t ret;

	new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
	if (!new_op) {
		ret = -ENOMEM;
		goto out;
	}
	/* synchronous I/O */
	new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
	new_op->upcall.req.io.readahead_size = readahead_size;
	new_op->upcall.req.io.io_type = type;
	new_op->upcall.req.io.refn = pvfs2_inode->refn;

populate_shared_memory:
	/* get a shared buffer index */
	ret = pvfs_bufmap_get(&bufmap, &buffer_index);
	if (ret < 0) {
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s: pvfs_bufmap_get failure (%ld)\n",
			     __func__, (long)ret);
		goto out;
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): GET op %p -> buffer_index %d\n",
		     __func__,
		     handle,
		     new_op,
		     buffer_index);

	new_op->uses_shared_memory = 1;
	new_op->upcall.req.io.buf_index = buffer_index;
	new_op->upcall.req.io.count = total_size;
	new_op->upcall.req.io.offset = *offset;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): nr_segs %lu, offset: %llu total_size: %zd\n",
		     __func__,
		     handle,
		     nr_segs,
		     llu(*offset),
		     total_size);
	/*
	 * Stage 1: copy the buffers into client-core's address space
	 * precopy_buffers only pertains to writes.
	 */
	if (type == PVFS_IO_WRITE) {
		ret = precopy_buffers(bufmap,
				      buffer_index,
				      vec,
				      nr_segs,
				      total_size);
		if (ret < 0)
			goto out;
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Calling post_io_request with tag (%llu)\n",
		     __func__,
		     handle,
		     llu(new_op->tag));

	/* Stage 2: Service the I/O operation */
	ret = service_operation(new_op,
				type == PVFS_IO_WRITE ?
					"file_write" :
					"file_read",
				get_interruptible_flag(inode));

	/*
	 * If service_operation() returns -EAGAIN #and# the operation was
	 * purged from pvfs2_request_list or htable_ops_in_progress, then
	 * we know that the client was restarted, causing the shared memory
	 * area to be wiped clean. To restart a write operation in this
	 * case, we must re-copy the data from the user's iovec to a NEW
	 * shared memory location. To restart a read operation, we must get
	 * a new shared memory location.
	 */
	if (ret == -EAGAIN && op_state_purged(new_op)) {
		/* drop the stale buffer before grabbing a fresh one */
		pvfs_bufmap_put(bufmap, buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s:going to repopulate_shared_memory.\n",
			     __func__);
		goto populate_shared_memory;
	}

	if (ret < 0) {
		handle_io_error();	/* defined in pvfs2-kernel.h */
		/*
		 * don't write an error to syslog on signaled operation
		 * termination unless we've got debugging turned on, as
		 * this can happen regularly (i.e. ctrl-c)
		 */
		if (ret == -EINTR)
			gossip_debug(GOSSIP_FILE_DEBUG,
				     "%s: returning error %ld\n", __func__,
				     (long)ret);
		else
			gossip_err("%s: error in %s handle %pU, returning %zd\n",
				   __func__,
				   type == PVFS_IO_READ ?
					"read from" : "write to",
				   handle, ret);
		goto out;
	}

	/*
	 * Stage 3: Post copy buffers from client-core's address space
	 * postcopy_buffers only pertains to reads.
	 */
	if (type == PVFS_IO_READ) {
		struct iov_iter iter;
		/* only copy back what the server actually completed */
		iov_iter_init(&iter, READ, vec, nr_segs, new_op->downcall.resp.io.amt_complete);
		ret = postcopy_buffers(bufmap,
				       buffer_index,
				       &iter,
				       new_op->downcall.resp.io.amt_complete);
		if (ret < 0) {
			/*
			 * put error codes in downcall so that handle_io_error()
			 * preserves it properly
			 */
			new_op->downcall.status = ret;
			handle_io_error();
			goto out;
		}
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Amount written as returned by the sys-io call:%d\n",
		     __func__,
		     handle,
		     (int)new_op->downcall.resp.io.amt_complete);

	ret = new_op->downcall.resp.io.amt_complete;

	/*
	 * tell the device file owner waiting on I/O that this read has
	 * completed and it can return now. in this exact case, on
	 * wakeup the daemon will free the op, so we *cannot* touch it
	 * after this.
	 */
	wake_up_daemon_for_return(new_op);
	new_op = NULL;	/* ownership transferred to the daemon */

out:
	/* release the shared buffer, if one was successfully acquired */
	if (buffer_index >= 0) {
		pvfs_bufmap_put(bufmap, buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): PUT buffer_index %d\n",
			     __func__, handle, buffer_index);
		buffer_index = -1;
	}
	/* non-NULL only on paths where the daemon did not take the op */
	if (new_op) {
		op_release(new_op);
		new_op = NULL;
	}
	return ret;
}
266
267/*
268 * The reason we need to do this is to be able to support readv and writev
269 * that are larger than (pvfs_bufmap_size_query()) Default is
270 * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
271 * create a new io vec descriptor for those memory addresses that
272 * go beyond the limit. Return value for this routine is negative in case
273 * of errors and 0 in case of success.
274 *
275 * Further, the new_nr_segs pointer is updated to hold the new value
276 * of number of iovecs, the new_vec pointer is updated to hold the pointer
277 * to the new split iovec, and the size array is an array of integers holding
278 * the number of iovecs that straddle pvfs_bufmap_size_query().
279 * The max_new_nr_segs value is computed by the caller and returned.
280 * (It will be (count of all iov_len/ block_size) + 1).
281 */
static int split_iovecs(unsigned long max_new_nr_segs,		/* IN */
			unsigned long nr_segs,			/* IN */
			const struct iovec *original_iovec,	/* IN */
			unsigned long *new_nr_segs,		/* OUT */
			struct iovec **new_vec,			/* OUT */
			unsigned long *seg_count,		/* OUT */
			unsigned long **seg_array)		/* OUT */
{
	unsigned long seg;
	unsigned long count = 0;		/* bytes in the current block */
	unsigned long begin_seg;
	unsigned long tmpnew_nr_segs = 0;	/* split segments emitted so far */
	struct iovec *new_iovec = NULL;
	struct iovec *orig_iovec;		/* scratch copy, mutated at split points */
	unsigned long *sizes = NULL;		/* segments per bufmap-sized block */
	unsigned long sizes_count = 0;

	/* reject NULL out-params and zero bounds up front */
	if (nr_segs <= 0 ||
	    original_iovec == NULL ||
	    new_nr_segs == NULL ||
	    new_vec == NULL ||
	    seg_count == NULL ||
	    seg_array == NULL ||
	    max_new_nr_segs <= 0) {
		gossip_err("Invalid parameters to split_iovecs\n");
		return -EINVAL;
	}
	*new_nr_segs = 0;
	*new_vec = NULL;
	*seg_count = 0;
	*seg_array = NULL;
	/* copy the passed in iovec descriptor to a temp structure */
	orig_iovec = kmalloc_array(nr_segs,
				   sizeof(*orig_iovec),
				   PVFS2_BUFMAP_GFP_FLAGS);
	if (orig_iovec == NULL) {
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(nr_segs * sizeof(*orig_iovec)));
		return -ENOMEM;
	}
	new_iovec = kcalloc(max_new_nr_segs,
			    sizeof(*new_iovec),
			    PVFS2_BUFMAP_GFP_FLAGS);
	if (new_iovec == NULL) {
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
		return -ENOMEM;
	}
	sizes = kcalloc(max_new_nr_segs,
			sizeof(*sizes),
			PVFS2_BUFMAP_GFP_FLAGS);
	if (sizes == NULL) {
		kfree(new_iovec);
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
		return -ENOMEM;
	}
	/* copy the passed in iovec to a temp structure */
	memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
	begin_seg = 0;
repeat:
	/*
	 * Walk the remaining source segments.  A segment that would push
	 * the current block to pvfs_bufmap_size_query() bytes is split:
	 * its head fills out the current block, the block's segment tally
	 * (sizes[sizes_count]) is closed, and the loop restarts from the
	 * same source segment with the head bytes trimmed off and the
	 * byte count reset.
	 */
	for (seg = begin_seg; seg < nr_segs; seg++) {
		/* guard: never write past the caller-computed bound */
		if (tmpnew_nr_segs >= max_new_nr_segs ||
		    sizes_count >= max_new_nr_segs) {
			kfree(sizes);
			kfree(orig_iovec);
			kfree(new_iovec);
			gossip_err
			    ("split_iovecs: exceeded the index limit (%lu)\n",
			    tmpnew_nr_segs);
			return -EINVAL;
		}
		if (count + orig_iovec[seg].iov_len <
		    pvfs_bufmap_size_query()) {
			/* whole segment fits in the current block */
			count += orig_iovec[seg].iov_len;
			memcpy(&new_iovec[tmpnew_nr_segs],
			       &orig_iovec[seg],
			       sizeof(*new_iovec));
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
		} else {
			/* split: emit only the part that fills the block */
			new_iovec[tmpnew_nr_segs].iov_base =
			    orig_iovec[seg].iov_base;
			new_iovec[tmpnew_nr_segs].iov_len =
			    (pvfs_bufmap_size_query() - count);
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
			sizes_count++;
			begin_seg = seg;
			/* trim the consumed head off the scratch copy */
			orig_iovec[seg].iov_base +=
			    (pvfs_bufmap_size_query() - count);
			orig_iovec[seg].iov_len -=
			    (pvfs_bufmap_size_query() - count);
			count = 0;
			break;
		}
	}
	/* loop broke early at a split point: resume from that segment */
	if (seg != nr_segs)
		goto repeat;
	else
		sizes_count++;	/* close the final (possibly partial) block */

	*new_nr_segs = tmpnew_nr_segs;
	/* new_iovec is freed by the caller */
	*new_vec = new_iovec;
	*seg_count = sizes_count;
	/* seg_array is also freed by the caller */
	*seg_array = sizes;
	kfree(orig_iovec);
	return 0;
}
398
399static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
400 ssize_t *total_count)
401{
402 unsigned long i;
403 long max_nr_iovecs;
404 ssize_t total;
405 ssize_t count;
406
407 total = 0;
408 count = 0;
409 max_nr_iovecs = 0;
410 for (i = 0; i < nr_segs; i++) {
411 const struct iovec *iv = &curr[i];
412
413 count += iv->iov_len;
414 if (unlikely((ssize_t) (count | iv->iov_len) < 0))
415 return -EINVAL;
416 if (total + iv->iov_len < pvfs_bufmap_size_query()) {
417 total += iv->iov_len;
418 max_nr_iovecs++;
419 } else {
420 total =
421 (total + iv->iov_len - pvfs_bufmap_size_query());
422 max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
423 }
424 }
425 *total_count = count;
426 return max_nr_iovecs;
427}
428
429/*
430 * Common entry point for read/write/readv/writev
431 * This function will dispatch it to either the direct I/O
432 * or buffered I/O path depending on the mount options and/or
433 * augmented/extended metadata attached to the file.
434 * Note: File extended attributes override any mount options.
435 */
436static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
437 loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
438{
439 struct inode *inode = file->f_mapping->host;
440 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
441 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
442 ssize_t ret;
443 ssize_t total_count;
444 unsigned int to_free;
445 size_t count;
446 unsigned long seg;
Mike Marshalleeaa3d42015-07-29 13:36:37 -0400447 unsigned long new_nr_segs;
448 unsigned long max_new_nr_segs;
449 unsigned long seg_count;
450 unsigned long *seg_array;
451 struct iovec *iovecptr;
452 struct iovec *ptr;
Mike Marshall5db11c22015-07-17 10:38:12 -0400453
454 total_count = 0;
455 ret = -EINVAL;
456 count = 0;
457 to_free = 0;
458
459 /* Compute total and max number of segments after split */
460 max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
Mike Marshall5db11c22015-07-17 10:38:12 -0400461
462 gossip_debug(GOSSIP_FILE_DEBUG,
463 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
464 __func__,
465 handle,
466 (int)count);
467
468 if (type == PVFS_IO_WRITE) {
469 gossip_debug(GOSSIP_FILE_DEBUG,
470 "%s(%pU): proceeding with offset : %llu, "
471 "size %d\n",
472 __func__,
473 handle,
474 llu(*offset),
475 (int)count);
476 }
477
478 if (count == 0) {
479 ret = 0;
480 goto out;
481 }
482
483 /*
484 * if the total size of data transfer requested is greater than
485 * the kernel-set blocksize of PVFS2, then we split the iovecs
486 * such that no iovec description straddles a block size limit
487 */
488
489 gossip_debug(GOSSIP_FILE_DEBUG,
490 "%s: pvfs_bufmap_size:%d\n",
491 __func__,
492 pvfs_bufmap_size_query());
493
494 if (count > pvfs_bufmap_size_query()) {
495 /*
496 * Split up the given iovec description such that
497 * no iovec descriptor straddles over the block-size limitation.
498 * This makes us our job easier to stage the I/O.
499 * In addition, this function will also compute an array
500 * with seg_count entries that will store the number of
501 * segments that straddle the block-size boundaries.
502 */
503 ret = split_iovecs(max_new_nr_segs, /* IN */
504 nr_segs, /* IN */
505 iov, /* IN */
506 &new_nr_segs, /* OUT */
507 &iovecptr, /* OUT */
508 &seg_count, /* OUT */
509 &seg_array); /* OUT */
510 if (ret < 0) {
511 gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
512 __func__,
513 ret);
514 goto out;
515 }
516 gossip_debug(GOSSIP_FILE_DEBUG,
517 "%s: Splitting iovecs from %lu to %lu"
518 " [max_new %lu]\n",
519 __func__,
520 nr_segs,
521 new_nr_segs,
522 max_new_nr_segs);
523 /* We must free seg_array and iovecptr */
524 to_free = 1;
525 } else {
526 new_nr_segs = nr_segs;
527 /* use the given iovec description */
528 iovecptr = (struct iovec *)iov;
529 /* There is only 1 element in the seg_array */
530 seg_count = 1;
531 /* and its value is the number of segments passed in */
532 seg_array = &nr_segs;
533 /* We dont have to free up anything */
534 to_free = 0;
535 }
536 ptr = iovecptr;
537
538 gossip_debug(GOSSIP_FILE_DEBUG,
539 "%s(%pU) %zd@%llu\n",
540 __func__,
541 handle,
542 count,
543 llu(*offset));
544 gossip_debug(GOSSIP_FILE_DEBUG,
545 "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
546 __func__,
547 handle,
548 new_nr_segs, seg_count);
549
550/* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
551#ifdef PVFS2_KERNEL_DEBUG
552 for (seg = 0; seg < new_nr_segs; seg++)
553 gossip_debug(GOSSIP_FILE_DEBUG,
554 "%s: %d) %p to %p [%d bytes]\n",
555 __func__,
556 (int)seg + 1,
557 iovecptr[seg].iov_base,
558 iovecptr[seg].iov_base + iovecptr[seg].iov_len,
559 (int)iovecptr[seg].iov_len);
560 for (seg = 0; seg < seg_count; seg++)
561 gossip_debug(GOSSIP_FILE_DEBUG,
562 "%s: %zd) %lu\n",
563 __func__,
564 seg + 1,
565 seg_array[seg]);
566#endif
567 seg = 0;
568 while (total_count < count) {
569 size_t each_count;
570 size_t amt_complete;
571
572 /* how much to transfer in this loop iteration */
573 each_count =
574 (((count - total_count) > pvfs_bufmap_size_query()) ?
575 pvfs_bufmap_size_query() :
576 (count - total_count));
577
578 gossip_debug(GOSSIP_FILE_DEBUG,
579 "%s(%pU): size of each_count(%d)\n",
580 __func__,
581 handle,
582 (int)each_count);
583 gossip_debug(GOSSIP_FILE_DEBUG,
584 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
585 __func__,
586 handle,
587 (int)*offset);
588
589 ret = wait_for_direct_io(type, inode, offset, ptr,
Mike Marshall4d1c4402015-09-04 10:31:16 -0400590 seg_array[seg], each_count, 0);
Mike Marshall5db11c22015-07-17 10:38:12 -0400591 gossip_debug(GOSSIP_FILE_DEBUG,
592 "%s(%pU): return from wait_for_io:%d\n",
593 __func__,
594 handle,
595 (int)ret);
596
597 if (ret < 0)
598 goto out;
599
600 /* advance the iovec pointer */
601 ptr += seg_array[seg];
602 seg++;
603 *offset += ret;
604 total_count += ret;
605 amt_complete = ret;
606
607 gossip_debug(GOSSIP_FILE_DEBUG,
608 "%s(%pU): AFTER wait_for_io: offset is %d\n",
609 __func__,
610 handle,
611 (int)*offset);
612
613 /*
614 * if we got a short I/O operations,
615 * fall out and return what we got so far
616 */
617 if (amt_complete < each_count)
618 break;
619 } /*end while */
620
621 if (total_count > 0)
622 ret = total_count;
623out:
624 if (to_free) {
625 kfree(iovecptr);
626 kfree(seg_array);
627 }
628 if (ret > 0) {
629 if (type == PVFS_IO_READ) {
630 file_accessed(file);
631 } else {
632 SetMtimeFlag(pvfs2_inode);
633 inode->i_mtime = CURRENT_TIME;
634 mark_inode_dirty_sync(inode);
635 }
636 }
637
638 gossip_debug(GOSSIP_FILE_DEBUG,
639 "%s(%pU): Value(%d) returned.\n",
640 __func__,
641 handle,
642 (int)ret);
643
644 return ret;
645}
646
647/*
648 * Read data from a specified offset in a file (referenced by inode).
649 * Data may be placed either in a user or kernel buffer.
650 */
651ssize_t pvfs2_inode_read(struct inode *inode,
652 char __user *buf,
653 size_t count,
654 loff_t *offset,
655 loff_t readahead_size)
656{
657 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
658 size_t bufmap_size;
659 struct iovec vec;
660 ssize_t ret = -EINVAL;
661
662 g_pvfs2_stats.reads++;
663
664 vec.iov_base = buf;
665 vec.iov_len = count;
666
667 bufmap_size = pvfs_bufmap_size_query();
668 if (count > bufmap_size) {
669 gossip_debug(GOSSIP_FILE_DEBUG,
670 "%s: count is too large (%zd/%zd)!\n",
671 __func__, count, bufmap_size);
672 return -EINVAL;
673 }
674
675 gossip_debug(GOSSIP_FILE_DEBUG,
676 "%s(%pU) %zd@%llu\n",
677 __func__,
678 &pvfs2_inode->refn.khandle,
679 count,
680 llu(*offset));
681
682 ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
Mike Marshall4d1c4402015-09-04 10:31:16 -0400683 count, readahead_size);
Mike Marshall5db11c22015-07-17 10:38:12 -0400684 if (ret > 0)
685 *offset += ret;
686
687 gossip_debug(GOSSIP_FILE_DEBUG,
688 "%s(%pU): Value(%zd) returned.\n",
689 __func__,
690 &pvfs2_inode->refn.khandle,
691 ret);
692
693 return ret;
694}
695
696static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
697{
698 struct file *file = iocb->ki_filp;
699 loff_t pos = *(&iocb->ki_pos);
700 ssize_t rc = 0;
701 unsigned long nr_segs = iter->nr_segs;
702
703 BUG_ON(iocb->private);
704
705 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
706
707 g_pvfs2_stats.reads++;
708
709 rc = do_readv_writev(PVFS_IO_READ,
710 file,
711 &pos,
712 iter->iov,
713 nr_segs);
714 iocb->ki_pos = pos;
715
716 return rc;
717}
718
719static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
720{
721 struct file *file = iocb->ki_filp;
722 loff_t pos = *(&iocb->ki_pos);
723 unsigned long nr_segs = iter->nr_segs;
724 ssize_t rc;
725
726 BUG_ON(iocb->private);
727
728 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
729
730 mutex_lock(&file->f_mapping->host->i_mutex);
731
732 /* Make sure generic_write_checks sees an up to date inode size. */
733 if (file->f_flags & O_APPEND) {
734 rc = pvfs2_inode_getattr(file->f_mapping->host,
735 PVFS_ATTR_SYS_SIZE);
736 if (rc) {
737 gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
738 __func__, rc);
739 goto out;
740 }
741 }
742
743 if (file->f_pos > i_size_read(file->f_mapping->host))
744 pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
745
746 rc = generic_write_checks(iocb, iter);
747
748 if (rc <= 0) {
749 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
750 __func__, rc);
751 goto out;
752 }
753
754 rc = do_readv_writev(PVFS_IO_WRITE,
755 file,
756 &pos,
757 iter->iov,
758 nr_segs);
759 if (rc < 0) {
760 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
761 __func__, rc);
762 goto out;
763 }
764
765 iocb->ki_pos = pos;
766 g_pvfs2_stats.writes++;
767
768out:
769
770 mutex_unlock(&file->f_mapping->host->i_mutex);
771 return rc;
772}
773
774/*
775 * Perform a miscellaneous operation on a file.
776 */
Mike Marshall84d02152015-07-28 13:27:51 -0400777static long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
Mike Marshall5db11c22015-07-17 10:38:12 -0400778{
779 int ret = -ENOTTY;
780 __u64 val = 0;
781 unsigned long uval;
782
783 gossip_debug(GOSSIP_FILE_DEBUG,
784 "pvfs2_ioctl: called with cmd %d\n",
785 cmd);
786
787 /*
788 * we understand some general ioctls on files, such as the immutable
789 * and append flags
790 */
791 if (cmd == FS_IOC_GETFLAGS) {
792 val = 0;
793 ret = pvfs2_xattr_get_default(file->f_path.dentry,
794 "user.pvfs2.meta_hint",
795 &val,
796 sizeof(val),
797 0);
798 if (ret < 0 && ret != -ENODATA)
799 return ret;
800 else if (ret == -ENODATA)
801 val = 0;
802 uval = val;
803 gossip_debug(GOSSIP_FILE_DEBUG,
804 "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
805 (unsigned long long)uval);
806 return put_user(uval, (int __user *)arg);
807 } else if (cmd == FS_IOC_SETFLAGS) {
808 ret = 0;
809 if (get_user(uval, (int __user *)arg))
810 return -EFAULT;
811 /*
812 * PVFS_MIRROR_FL is set internally when the mirroring mode
813 * is turned on for a file. The user is not allowed to turn
814 * on this bit, but the bit is present if the user first gets
815 * the flags and then updates the flags with some new
816 * settings. So, we ignore it in the following edit. bligon.
817 */
818 if ((uval & ~PVFS_MIRROR_FL) &
819 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
820 gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
821 return -EINVAL;
822 }
823 val = uval;
824 gossip_debug(GOSSIP_FILE_DEBUG,
825 "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
826 (unsigned long long)val);
827 ret = pvfs2_xattr_set_default(file->f_path.dentry,
828 "user.pvfs2.meta_hint",
829 &val,
830 sizeof(val),
831 0,
832 0);
833 }
834
835 return ret;
836}
837
838/*
839 * Memory map a region of a file.
840 */
841static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
842{
843 gossip_debug(GOSSIP_FILE_DEBUG,
844 "pvfs2_file_mmap: called on %s\n",
845 (file ?
846 (char *)file->f_path.dentry->d_name.name :
847 (char *)"Unknown"));
848
849 /* set the sequential readahead hint */
850 vma->vm_flags |= VM_SEQ_READ;
851 vma->vm_flags &= ~VM_RAND_READ;
Martin Brandenburg35390802015-09-30 13:11:54 -0400852
853 /* Use readonly mmap since we cannot support writable maps. */
854 return generic_file_readonly_mmap(file, vma);
Mike Marshall5db11c22015-07-17 10:38:12 -0400855}
856
/* number of pages currently held in this address_space's page cache */
#define mapping_nrpages(idata) ((idata)->nrpages)
858
859/*
860 * Called to notify the module that there are no more references to
861 * this file (i.e. no processes have it open).
862 *
863 * \note Not called when each file is closed.
864 */
Mike Marshall84d02152015-07-28 13:27:51 -0400865static int pvfs2_file_release(struct inode *inode, struct file *file)
Mike Marshall5db11c22015-07-17 10:38:12 -0400866{
867 gossip_debug(GOSSIP_FILE_DEBUG,
868 "pvfs2_file_release: called on %s\n",
869 file->f_path.dentry->d_name.name);
870
871 pvfs2_flush_inode(inode);
872
873 /*
Mike Marshall54804942015-10-05 13:44:24 -0400874 * remove all associated inode pages from the page cache and mmap
875 * readahead cache (if any); this forces an expensive refresh of
876 * data for the next caller of mmap (or 'get_block' accesses)
Mike Marshall5db11c22015-07-17 10:38:12 -0400877 */
878 if (file->f_path.dentry->d_inode &&
879 file->f_path.dentry->d_inode->i_mapping &&
880 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
881 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
882 0);
883 return 0;
884}
885
886/*
887 * Push all data for a specific file onto permanent storage.
888 */
Mike Marshall84d02152015-07-28 13:27:51 -0400889static int pvfs2_fsync(struct file *file,
890 loff_t start,
891 loff_t end,
892 int datasync)
Mike Marshall5db11c22015-07-17 10:38:12 -0400893{
894 int ret = -EINVAL;
895 struct pvfs2_inode_s *pvfs2_inode =
896 PVFS2_I(file->f_path.dentry->d_inode);
897 struct pvfs2_kernel_op_s *new_op = NULL;
898
899 /* required call */
900 filemap_write_and_wait_range(file->f_mapping, start, end);
901
902 new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
903 if (!new_op)
904 return -ENOMEM;
905 new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
906
907 ret = service_operation(new_op,
908 "pvfs2_fsync",
909 get_interruptible_flag(file->f_path.dentry->d_inode));
910
911 gossip_debug(GOSSIP_FILE_DEBUG,
912 "pvfs2_fsync got return value of %d\n",
913 ret);
914
915 op_release(new_op);
916
917 pvfs2_flush_inode(file->f_path.dentry->d_inode);
918 return ret;
919}
920
921/*
922 * Change the file pointer position for an instance of an open file.
923 *
924 * \note If .llseek is overriden, we must acquire lock as described in
925 * Documentation/filesystems/Locking.
926 *
927 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
928 * require much changes to the FS
929 */
Mike Marshall84d02152015-07-28 13:27:51 -0400930static loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
Mike Marshall5db11c22015-07-17 10:38:12 -0400931{
932 int ret = -EINVAL;
933 struct inode *inode = file->f_path.dentry->d_inode;
934
935 if (!inode) {
936 gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
937 return ret;
938 }
939
940 if (origin == PVFS2_SEEK_END) {
941 /*
942 * revalidate the inode's file size.
943 * NOTE: We are only interested in file size here,
944 * so we set mask accordingly.
945 */
946 ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
947 if (ret) {
948 gossip_debug(GOSSIP_FILE_DEBUG,
949 "%s:%s:%d calling make bad inode\n",
950 __FILE__,
951 __func__,
952 __LINE__);
953 pvfs2_make_bad_inode(inode);
954 return ret;
955 }
956 }
957
958 gossip_debug(GOSSIP_FILE_DEBUG,
Mike Marshall54804942015-10-05 13:44:24 -0400959 "pvfs2_file_llseek: offset is %ld | origin is %d"
960 " | inode size is %lu\n",
Mike Marshall5db11c22015-07-17 10:38:12 -0400961 (long)offset,
962 origin,
963 (unsigned long)file->f_path.dentry->d_inode->i_size);
964
965 return generic_file_llseek(file, offset, origin);
966}
967
968/*
969 * Support local locks (locks that only this kernel knows about)
970 * if Orangefs was mounted -o local_lock.
971 */
Mike Marshall84d02152015-07-28 13:27:51 -0400972static int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
Mike Marshall5db11c22015-07-17 10:38:12 -0400973{
Mike Marshallf957ae22015-09-24 12:53:05 -0400974 int rc = -EINVAL;
Mike Marshall5db11c22015-07-17 10:38:12 -0400975
976 if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
977 if (cmd == F_GETLK) {
978 rc = 0;
979 posix_test_lock(filp, fl);
980 } else {
981 rc = posix_lock_file(filp, fl, NULL);
982 }
983 }
984
985 return rc;
986}
987
/** PVFS2 implementation of VFS file operations */
const struct file_operations pvfs2_file_operations = {
	.llseek = pvfs2_file_llseek,
	.read_iter = pvfs2_file_read_iter,
	.write_iter = pvfs2_file_write_iter,
	.lock = pvfs2_lock,		/* only honored with -o local_lock */
	.unlocked_ioctl = pvfs2_ioctl,
	.mmap = pvfs2_file_mmap,	/* read-only mappings only */
	.open = generic_file_open,
	.release = pvfs2_file_release,
	.fsync = pvfs2_fsync,
};