blob: 013a07c8bdfd69e71e158eac4e20197ac37aca0f [file] [log] [blame]
Mike Marshall5db11c22015-07-17 10:38:12 -04001/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "pvfs2-kernel.h"
13#include "pvfs2-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
/*
 * Tell the client-core device daemon waiting on @op that the I/O has
 * completed and it may return.  io_completed is set under op->lock so
 * the waiter on io_completion_waitq cannot miss the wakeup.
 */
#define wake_up_daemon_for_return(op)			\
do {							\
	spin_lock(&op->lock);				\
	op->io_completed = 1;				\
	spin_unlock(&op->lock);				\
	wake_up_interruptible(&op->io_completion_waitq);\
} while (0)
24
25/*
26 * Copy to client-core's address space from the buffers specified
27 * by the iovec upto total_size bytes.
28 * NOTE: the iovector can either contain addresses which
29 * can futher be kernel-space or user-space addresses.
30 * or it can pointers to struct page's
31 */
32static int precopy_buffers(struct pvfs2_bufmap *bufmap,
33 int buffer_index,
34 const struct iovec *vec,
35 unsigned long nr_segs,
36 size_t total_size,
37 int from_user)
38{
39 int ret = 0;
40
41 /*
42 * copy data from application/kernel by pulling it out
43 * of the iovec.
44 */
45 /* Are we copying from User Virtual Addresses? */
46 if (from_user)
47 ret = pvfs_bufmap_copy_iovec_from_user(
48 bufmap,
49 buffer_index,
50 vec,
51 nr_segs,
52 total_size);
53 /* Are we copying from Kernel Virtual Addresses? */
54 else
55 ret = pvfs_bufmap_copy_iovec_from_kernel(
56 bufmap,
57 buffer_index,
58 vec,
59 nr_segs,
60 total_size);
61 if (ret < 0)
62 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
63 __func__,
64 (long)ret);
65 return ret;
66}
67
68/*
69 * Copy from client-core's address space to the buffers specified
70 * by the iovec upto total_size bytes.
71 * NOTE: the iovector can either contain addresses which
72 * can futher be kernel-space or user-space addresses.
73 * or it can pointers to struct page's
74 */
75static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
76 int buffer_index,
77 const struct iovec *vec,
78 int nr_segs,
79 size_t total_size,
80 int to_user)
81{
82 int ret = 0;
83
84 /*
85 * copy data to application/kernel by pushing it out to
86 * the iovec. NOTE; target buffers can be addresses or
87 * struct page pointers.
88 */
89 if (total_size) {
90 /* Are we copying to User Virtual Addresses? */
91 if (to_user)
92 ret = pvfs_bufmap_copy_to_user_iovec(
93 bufmap,
94 buffer_index,
95 vec,
96 nr_segs,
97 total_size);
98 /* Are we copying to Kern Virtual Addresses? */
99 else
100 ret = pvfs_bufmap_copy_to_kernel_iovec(
101 bufmap,
102 buffer_index,
103 vec,
104 nr_segs,
105 total_size);
106 if (ret < 0)
107 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
108 __func__,
109 (long)ret);
110 }
111 return ret;
112}
113
114/*
115 * Post and wait for the I/O upcall to finish
116 */
117static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
118 loff_t *offset, struct iovec *vec, unsigned long nr_segs,
119 size_t total_size, loff_t readahead_size, int to_user)
120{
121 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
122 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
123 struct pvfs2_bufmap *bufmap = NULL;
124 struct pvfs2_kernel_op_s *new_op = NULL;
125 int buffer_index = -1;
126 ssize_t ret;
127
128 new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
129 if (!new_op) {
130 ret = -ENOMEM;
131 goto out;
132 }
133 /* synchronous I/O */
134 new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
135 new_op->upcall.req.io.readahead_size = readahead_size;
136 new_op->upcall.req.io.io_type = type;
137 new_op->upcall.req.io.refn = pvfs2_inode->refn;
138
139populate_shared_memory:
140 /* get a shared buffer index */
141 ret = pvfs_bufmap_get(&bufmap, &buffer_index);
142 if (ret < 0) {
143 gossip_debug(GOSSIP_FILE_DEBUG,
144 "%s: pvfs_bufmap_get failure (%ld)\n",
145 __func__, (long)ret);
146 goto out;
147 }
148 gossip_debug(GOSSIP_FILE_DEBUG,
149 "%s(%pU): GET op %p -> buffer_index %d\n",
150 __func__,
151 handle,
152 new_op,
153 buffer_index);
154
155 new_op->uses_shared_memory = 1;
156 new_op->upcall.req.io.buf_index = buffer_index;
157 new_op->upcall.req.io.count = total_size;
158 new_op->upcall.req.io.offset = *offset;
159
160 gossip_debug(GOSSIP_FILE_DEBUG,
161 "%s(%pU): copy_to_user %d nr_segs %lu, offset: %llu total_size: %zd\n",
162 __func__,
163 handle,
164 to_user,
165 nr_segs,
166 llu(*offset),
167 total_size);
168 /*
169 * Stage 1: copy the buffers into client-core's address space
170 * precopy_buffers only pertains to writes.
171 */
172 if (type == PVFS_IO_WRITE) {
173 ret = precopy_buffers(bufmap,
174 buffer_index,
175 vec,
176 nr_segs,
177 total_size,
178 to_user);
179 if (ret < 0)
180 goto out;
181 }
182
183 gossip_debug(GOSSIP_FILE_DEBUG,
184 "%s(%pU): Calling post_io_request with tag (%llu)\n",
185 __func__,
186 handle,
187 llu(new_op->tag));
188
189 /* Stage 2: Service the I/O operation */
190 ret = service_operation(new_op,
191 type == PVFS_IO_WRITE ?
192 "file_write" :
193 "file_read",
194 get_interruptible_flag(inode));
195
196 /*
197 * If service_operation() returns -EAGAIN #and# the operation was
198 * purged from pvfs2_request_list or htable_ops_in_progress, then
199 * we know that the client was restarted, causing the shared memory
200 * area to be wiped clean. To restart a write operation in this
201 * case, we must re-copy the data from the user's iovec to a NEW
202 * shared memory location. To restart a read operation, we must get
203 * a new shared memory location.
204 */
205 if (ret == -EAGAIN && op_state_purged(new_op)) {
206 pvfs_bufmap_put(bufmap, buffer_index);
207 gossip_debug(GOSSIP_FILE_DEBUG,
208 "%s:going to repopulate_shared_memory.\n",
209 __func__);
210 goto populate_shared_memory;
211 }
212
213 if (ret < 0) {
214 handle_io_error(); /* defined in pvfs2-kernel.h */
215 /*
216 don't write an error to syslog on signaled operation
217 termination unless we've got debugging turned on, as
218 this can happen regularly (i.e. ctrl-c)
219 */
220 if (ret == -EINTR)
221 gossip_debug(GOSSIP_FILE_DEBUG,
222 "%s: returning error %ld\n", __func__,
223 (long)ret);
224 else
225 gossip_err("%s: error in %s handle %pU, returning %zd\n",
226 __func__,
227 type == PVFS_IO_READ ?
228 "read from" : "write to",
229 handle, ret);
230 goto out;
231 }
232
233 /*
234 * Stage 3: Post copy buffers from client-core's address space
235 * postcopy_buffers only pertains to reads.
236 */
237 if (type == PVFS_IO_READ) {
238 ret = postcopy_buffers(bufmap,
239 buffer_index,
240 vec,
241 nr_segs,
242 new_op->downcall.resp.io.amt_complete,
243 to_user);
244 if (ret < 0) {
245 /*
246 * put error codes in downcall so that handle_io_error()
247 * preserves it properly
248 */
249 new_op->downcall.status = ret;
250 handle_io_error();
251 goto out;
252 }
253 }
254 gossip_debug(GOSSIP_FILE_DEBUG,
255 "%s(%pU): Amount written as returned by the sys-io call:%d\n",
256 __func__,
257 handle,
258 (int)new_op->downcall.resp.io.amt_complete);
259
260 ret = new_op->downcall.resp.io.amt_complete;
261
262 /*
263 tell the device file owner waiting on I/O that this read has
264 completed and it can return now. in this exact case, on
265 wakeup the daemon will free the op, so we *cannot* touch it
266 after this.
267 */
268 wake_up_daemon_for_return(new_op);
269 new_op = NULL;
270
271out:
272 if (buffer_index >= 0) {
273 pvfs_bufmap_put(bufmap, buffer_index);
274 gossip_debug(GOSSIP_FILE_DEBUG,
275 "%s(%pU): PUT buffer_index %d\n",
276 __func__, handle, buffer_index);
277 buffer_index = -1;
278 }
279 if (new_op) {
280 op_release(new_op);
281 new_op = NULL;
282 }
283 return ret;
284}
285
/*
 * The reason we need to do this is to be able to support readv and writev
 * that are larger than (pvfs_bufmap_size_query()). Default is
 * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
 * create a new io vec descriptor for those memory addresses that
 * go beyond the limit. Return value for this routine is negative in case
 * of errors and 0 in case of success.
 *
 * Further, the new_nr_segs pointer is updated to hold the new value
 * of number of iovecs, the new_vec pointer is updated to hold the pointer
 * to the new split iovec, and the size array is an array of integers holding
 * the number of iovecs that straddle pvfs_bufmap_size_query().
 * The max_new_nr_segs value is computed by the caller and returned.
 * (It will be (count of all iov_len/ block_size) + 1).
 *
 * On success the caller owns *new_vec and *seg_array and must kfree()
 * both when done.
 */
static int split_iovecs(unsigned long max_new_nr_segs,		/* IN */
			unsigned long nr_segs,			/* IN */
			const struct iovec *original_iovec,	/* IN */
			unsigned long *new_nr_segs,		/* OUT */
			struct iovec **new_vec,			/* OUT */
			unsigned long *seg_count,		/* OUT */
			unsigned long **seg_array)		/* OUT */
{
	unsigned long seg;
	unsigned long count = 0;	/* bytes accumulated in current block */
	unsigned long begin_seg;
	unsigned long tmpnew_nr_segs = 0;	/* split segments emitted so far */
	struct iovec *new_iovec = NULL;
	struct iovec *orig_iovec;	/* mutable working copy of the input */
	unsigned long *sizes = NULL;	/* segments-per-block counters */
	unsigned long sizes_count = 0;	/* index of current block in sizes[] */

	/* Reject null out-pointers and degenerate segment counts. */
	if (nr_segs <= 0 ||
	    original_iovec == NULL ||
	    new_nr_segs == NULL ||
	    new_vec == NULL ||
	    seg_count == NULL ||
	    seg_array == NULL ||
	    max_new_nr_segs <= 0) {
		gossip_err("Invalid parameters to split_iovecs\n");
		return -EINVAL;
	}
	*new_nr_segs = 0;
	*new_vec = NULL;
	*seg_count = 0;
	*seg_array = NULL;
	/* copy the passed in iovec descriptor to a temp structure */
	orig_iovec = kmalloc_array(nr_segs,
				   sizeof(*orig_iovec),
				   PVFS2_BUFMAP_GFP_FLAGS);
	if (orig_iovec == NULL) {
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(nr_segs * sizeof(*orig_iovec)));
		return -ENOMEM;
	}
	new_iovec = kcalloc(max_new_nr_segs,
			    sizeof(*new_iovec),
			    PVFS2_BUFMAP_GFP_FLAGS);
	if (new_iovec == NULL) {
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
		return -ENOMEM;
	}
	sizes = kcalloc(max_new_nr_segs,
			sizeof(*sizes),
			PVFS2_BUFMAP_GFP_FLAGS);
	if (sizes == NULL) {
		kfree(new_iovec);
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
		return -ENOMEM;
	}
	/* copy the passed in iovec to a temp structure */
	memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
	begin_seg = 0;
repeat:
	/*
	 * Walk the remaining segments, packing them into the current
	 * block.  A segment that would cross the bufmap-size boundary is
	 * split in two: the first part finishes this block, and the loop
	 * restarts (via "repeat") at the same segment with its base/len
	 * advanced past the split point.
	 */
	for (seg = begin_seg; seg < nr_segs; seg++) {
		/* Defensive: caller's max_new_nr_segs bound was too small. */
		if (tmpnew_nr_segs >= max_new_nr_segs ||
		    sizes_count >= max_new_nr_segs) {
			kfree(sizes);
			kfree(orig_iovec);
			kfree(new_iovec);
			gossip_err
			    ("split_iovecs: exceeded the index limit (%lu)\n",
			    tmpnew_nr_segs);
			return -EINVAL;
		}
		if (count + orig_iovec[seg].iov_len <
		    pvfs_bufmap_size_query()) {
			/* Segment fits in the current block: copy as-is. */
			count += orig_iovec[seg].iov_len;
			memcpy(&new_iovec[tmpnew_nr_segs],
			       &orig_iovec[seg],
			       sizeof(*new_iovec));
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
		} else {
			/*
			 * Segment straddles the block boundary: emit only the
			 * part that fills the block, close the block, and
			 * shrink the working segment to the leftover.
			 */
			new_iovec[tmpnew_nr_segs].iov_base =
			    orig_iovec[seg].iov_base;
			new_iovec[tmpnew_nr_segs].iov_len =
			    (pvfs_bufmap_size_query() - count);
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
			sizes_count++;
			begin_seg = seg;
			orig_iovec[seg].iov_base +=
			    (pvfs_bufmap_size_query() - count);
			orig_iovec[seg].iov_len -=
			    (pvfs_bufmap_size_query() - count);
			count = 0;
			break;
		}
	}
	/* Early break above means there is leftover input to process. */
	if (seg != nr_segs)
		goto repeat;
	else
		sizes_count++;	/* account for the final (open) block */

	*new_nr_segs = tmpnew_nr_segs;
	/* new_iovec is freed by the caller */
	*new_vec = new_iovec;
	*seg_count = sizes_count;
	/* seg_array is also freed by the caller */
	*seg_array = sizes;
	kfree(orig_iovec);
	return 0;
}
417
418static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
419 ssize_t *total_count)
420{
421 unsigned long i;
422 long max_nr_iovecs;
423 ssize_t total;
424 ssize_t count;
425
426 total = 0;
427 count = 0;
428 max_nr_iovecs = 0;
429 for (i = 0; i < nr_segs; i++) {
430 const struct iovec *iv = &curr[i];
431
432 count += iv->iov_len;
433 if (unlikely((ssize_t) (count | iv->iov_len) < 0))
434 return -EINVAL;
435 if (total + iv->iov_len < pvfs_bufmap_size_query()) {
436 total += iv->iov_len;
437 max_nr_iovecs++;
438 } else {
439 total =
440 (total + iv->iov_len - pvfs_bufmap_size_query());
441 max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
442 }
443 }
444 *total_count = count;
445 return max_nr_iovecs;
446}
447
448/*
449 * Common entry point for read/write/readv/writev
450 * This function will dispatch it to either the direct I/O
451 * or buffered I/O path depending on the mount options and/or
452 * augmented/extended metadata attached to the file.
453 * Note: File extended attributes override any mount options.
454 */
455static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
456 loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
457{
458 struct inode *inode = file->f_mapping->host;
459 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
460 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
461 ssize_t ret;
462 ssize_t total_count;
463 unsigned int to_free;
464 size_t count;
465 unsigned long seg;
Mike Marshalleeaa3d42015-07-29 13:36:37 -0400466 unsigned long new_nr_segs;
467 unsigned long max_new_nr_segs;
468 unsigned long seg_count;
469 unsigned long *seg_array;
470 struct iovec *iovecptr;
471 struct iovec *ptr;
Mike Marshall5db11c22015-07-17 10:38:12 -0400472
473 total_count = 0;
474 ret = -EINVAL;
475 count = 0;
476 to_free = 0;
477
478 /* Compute total and max number of segments after split */
479 max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
Mike Marshall5db11c22015-07-17 10:38:12 -0400480
481 gossip_debug(GOSSIP_FILE_DEBUG,
482 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
483 __func__,
484 handle,
485 (int)count);
486
487 if (type == PVFS_IO_WRITE) {
488 gossip_debug(GOSSIP_FILE_DEBUG,
489 "%s(%pU): proceeding with offset : %llu, "
490 "size %d\n",
491 __func__,
492 handle,
493 llu(*offset),
494 (int)count);
495 }
496
497 if (count == 0) {
498 ret = 0;
499 goto out;
500 }
501
502 /*
503 * if the total size of data transfer requested is greater than
504 * the kernel-set blocksize of PVFS2, then we split the iovecs
505 * such that no iovec description straddles a block size limit
506 */
507
508 gossip_debug(GOSSIP_FILE_DEBUG,
509 "%s: pvfs_bufmap_size:%d\n",
510 __func__,
511 pvfs_bufmap_size_query());
512
513 if (count > pvfs_bufmap_size_query()) {
514 /*
515 * Split up the given iovec description such that
516 * no iovec descriptor straddles over the block-size limitation.
517 * This makes us our job easier to stage the I/O.
518 * In addition, this function will also compute an array
519 * with seg_count entries that will store the number of
520 * segments that straddle the block-size boundaries.
521 */
522 ret = split_iovecs(max_new_nr_segs, /* IN */
523 nr_segs, /* IN */
524 iov, /* IN */
525 &new_nr_segs, /* OUT */
526 &iovecptr, /* OUT */
527 &seg_count, /* OUT */
528 &seg_array); /* OUT */
529 if (ret < 0) {
530 gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
531 __func__,
532 ret);
533 goto out;
534 }
535 gossip_debug(GOSSIP_FILE_DEBUG,
536 "%s: Splitting iovecs from %lu to %lu"
537 " [max_new %lu]\n",
538 __func__,
539 nr_segs,
540 new_nr_segs,
541 max_new_nr_segs);
542 /* We must free seg_array and iovecptr */
543 to_free = 1;
544 } else {
545 new_nr_segs = nr_segs;
546 /* use the given iovec description */
547 iovecptr = (struct iovec *)iov;
548 /* There is only 1 element in the seg_array */
549 seg_count = 1;
550 /* and its value is the number of segments passed in */
551 seg_array = &nr_segs;
552 /* We dont have to free up anything */
553 to_free = 0;
554 }
555 ptr = iovecptr;
556
557 gossip_debug(GOSSIP_FILE_DEBUG,
558 "%s(%pU) %zd@%llu\n",
559 __func__,
560 handle,
561 count,
562 llu(*offset));
563 gossip_debug(GOSSIP_FILE_DEBUG,
564 "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
565 __func__,
566 handle,
567 new_nr_segs, seg_count);
568
569/* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
570#ifdef PVFS2_KERNEL_DEBUG
571 for (seg = 0; seg < new_nr_segs; seg++)
572 gossip_debug(GOSSIP_FILE_DEBUG,
573 "%s: %d) %p to %p [%d bytes]\n",
574 __func__,
575 (int)seg + 1,
576 iovecptr[seg].iov_base,
577 iovecptr[seg].iov_base + iovecptr[seg].iov_len,
578 (int)iovecptr[seg].iov_len);
579 for (seg = 0; seg < seg_count; seg++)
580 gossip_debug(GOSSIP_FILE_DEBUG,
581 "%s: %zd) %lu\n",
582 __func__,
583 seg + 1,
584 seg_array[seg]);
585#endif
586 seg = 0;
587 while (total_count < count) {
588 size_t each_count;
589 size_t amt_complete;
590
591 /* how much to transfer in this loop iteration */
592 each_count =
593 (((count - total_count) > pvfs_bufmap_size_query()) ?
594 pvfs_bufmap_size_query() :
595 (count - total_count));
596
597 gossip_debug(GOSSIP_FILE_DEBUG,
598 "%s(%pU): size of each_count(%d)\n",
599 __func__,
600 handle,
601 (int)each_count);
602 gossip_debug(GOSSIP_FILE_DEBUG,
603 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
604 __func__,
605 handle,
606 (int)*offset);
607
608 ret = wait_for_direct_io(type, inode, offset, ptr,
609 seg_array[seg], each_count, 0, 1);
610 gossip_debug(GOSSIP_FILE_DEBUG,
611 "%s(%pU): return from wait_for_io:%d\n",
612 __func__,
613 handle,
614 (int)ret);
615
616 if (ret < 0)
617 goto out;
618
619 /* advance the iovec pointer */
620 ptr += seg_array[seg];
621 seg++;
622 *offset += ret;
623 total_count += ret;
624 amt_complete = ret;
625
626 gossip_debug(GOSSIP_FILE_DEBUG,
627 "%s(%pU): AFTER wait_for_io: offset is %d\n",
628 __func__,
629 handle,
630 (int)*offset);
631
632 /*
633 * if we got a short I/O operations,
634 * fall out and return what we got so far
635 */
636 if (amt_complete < each_count)
637 break;
638 } /*end while */
639
640 if (total_count > 0)
641 ret = total_count;
642out:
643 if (to_free) {
644 kfree(iovecptr);
645 kfree(seg_array);
646 }
647 if (ret > 0) {
648 if (type == PVFS_IO_READ) {
649 file_accessed(file);
650 } else {
651 SetMtimeFlag(pvfs2_inode);
652 inode->i_mtime = CURRENT_TIME;
653 mark_inode_dirty_sync(inode);
654 }
655 }
656
657 gossip_debug(GOSSIP_FILE_DEBUG,
658 "%s(%pU): Value(%d) returned.\n",
659 __func__,
660 handle,
661 (int)ret);
662
663 return ret;
664}
665
666/*
667 * Read data from a specified offset in a file (referenced by inode).
668 * Data may be placed either in a user or kernel buffer.
669 */
670ssize_t pvfs2_inode_read(struct inode *inode,
671 char __user *buf,
672 size_t count,
673 loff_t *offset,
674 loff_t readahead_size)
675{
676 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
677 size_t bufmap_size;
678 struct iovec vec;
679 ssize_t ret = -EINVAL;
680
681 g_pvfs2_stats.reads++;
682
683 vec.iov_base = buf;
684 vec.iov_len = count;
685
686 bufmap_size = pvfs_bufmap_size_query();
687 if (count > bufmap_size) {
688 gossip_debug(GOSSIP_FILE_DEBUG,
689 "%s: count is too large (%zd/%zd)!\n",
690 __func__, count, bufmap_size);
691 return -EINVAL;
692 }
693
694 gossip_debug(GOSSIP_FILE_DEBUG,
695 "%s(%pU) %zd@%llu\n",
696 __func__,
697 &pvfs2_inode->refn.khandle,
698 count,
699 llu(*offset));
700
701 ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
702 count, readahead_size, 0);
703 if (ret > 0)
704 *offset += ret;
705
706 gossip_debug(GOSSIP_FILE_DEBUG,
707 "%s(%pU): Value(%zd) returned.\n",
708 __func__,
709 &pvfs2_inode->refn.khandle,
710 ret);
711
712 return ret;
713}
714
715static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
716{
717 struct file *file = iocb->ki_filp;
718 loff_t pos = *(&iocb->ki_pos);
719 ssize_t rc = 0;
720 unsigned long nr_segs = iter->nr_segs;
721
722 BUG_ON(iocb->private);
723
724 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
725
726 g_pvfs2_stats.reads++;
727
728 rc = do_readv_writev(PVFS_IO_READ,
729 file,
730 &pos,
731 iter->iov,
732 nr_segs);
733 iocb->ki_pos = pos;
734
735 return rc;
736}
737
738static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
739{
740 struct file *file = iocb->ki_filp;
741 loff_t pos = *(&iocb->ki_pos);
742 unsigned long nr_segs = iter->nr_segs;
743 ssize_t rc;
744
745 BUG_ON(iocb->private);
746
747 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
748
749 mutex_lock(&file->f_mapping->host->i_mutex);
750
751 /* Make sure generic_write_checks sees an up to date inode size. */
752 if (file->f_flags & O_APPEND) {
753 rc = pvfs2_inode_getattr(file->f_mapping->host,
754 PVFS_ATTR_SYS_SIZE);
755 if (rc) {
756 gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
757 __func__, rc);
758 goto out;
759 }
760 }
761
762 if (file->f_pos > i_size_read(file->f_mapping->host))
763 pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
764
765 rc = generic_write_checks(iocb, iter);
766
767 if (rc <= 0) {
768 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
769 __func__, rc);
770 goto out;
771 }
772
773 rc = do_readv_writev(PVFS_IO_WRITE,
774 file,
775 &pos,
776 iter->iov,
777 nr_segs);
778 if (rc < 0) {
779 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
780 __func__, rc);
781 goto out;
782 }
783
784 iocb->ki_pos = pos;
785 g_pvfs2_stats.writes++;
786
787out:
788
789 mutex_unlock(&file->f_mapping->host->i_mutex);
790 return rc;
791}
792
/*
 * Perform a miscellaneous operation on a file.
 *
 * Only FS_IOC_GETFLAGS and FS_IOC_SETFLAGS are handled; the flag word
 * is persisted server-side in the "user.pvfs2.meta_hint" extended
 * attribute rather than in the inode.  Everything else returns -ENOTTY.
 */
static long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int ret = -ENOTTY;
	__u64 val = 0;
	unsigned long uval;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "pvfs2_ioctl: called with cmd %d\n",
		     cmd);

	/*
	 * we understand some general ioctls on files, such as the immutable
	 * and append flags
	 */
	if (cmd == FS_IOC_GETFLAGS) {
		val = 0;
		/* fetch the stored flag word; missing xattr means no flags */
		ret = pvfs2_xattr_get_default(file->f_path.dentry,
					      "user.pvfs2.meta_hint",
					      &val,
					      sizeof(val),
					      0);
		if (ret < 0 && ret != -ENODATA)
			return ret;
		else if (ret == -ENODATA)
			val = 0;
		uval = val;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
			     (unsigned long long)uval);
		/*
		 * NOTE(review): uval is unsigned long but is stored through
		 * an int __user pointer, matching FS_IOC_GETFLAGS's int-sized
		 * argument -- confirm high bits are never meaningful here.
		 */
		return put_user(uval, (int __user *)arg);
	} else if (cmd == FS_IOC_SETFLAGS) {
		ret = 0;
		if (get_user(uval, (int __user *)arg))
			return -EFAULT;
		/*
		 * PVFS_MIRROR_FL is set internally when the mirroring mode
		 * is turned on for a file. The user is not allowed to turn
		 * on this bit, but the bit is present if the user first gets
		 * the flags and then updates the flags with some new
		 * settings. So, we ignore it in the following edit. bligon.
		 */
		if ((uval & ~PVFS_MIRROR_FL) &
		    (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
			gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
			return -EINVAL;
		}
		val = uval;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
			     (unsigned long long)val);
		/* persist the new flag word in the meta_hint xattr */
		ret = pvfs2_xattr_set_default(file->f_path.dentry,
					      "user.pvfs2.meta_hint",
					      &val,
					      sizeof(val),
					      0,
					      0);
	}

	return ret;
}
856
857/*
858 * Memory map a region of a file.
859 */
860static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
861{
862 gossip_debug(GOSSIP_FILE_DEBUG,
863 "pvfs2_file_mmap: called on %s\n",
864 (file ?
865 (char *)file->f_path.dentry->d_name.name :
866 (char *)"Unknown"));
867
868 /* set the sequential readahead hint */
869 vma->vm_flags |= VM_SEQ_READ;
870 vma->vm_flags &= ~VM_RAND_READ;
871 return generic_file_mmap(file, vma);
872}
873
/* Number of pages currently held in an address_space's page cache. */
#define mapping_nrpages(idata) ((idata)->nrpages)
875
876/*
877 * Called to notify the module that there are no more references to
878 * this file (i.e. no processes have it open).
879 *
880 * \note Not called when each file is closed.
881 */
Mike Marshall84d02152015-07-28 13:27:51 -0400882static int pvfs2_file_release(struct inode *inode, struct file *file)
Mike Marshall5db11c22015-07-17 10:38:12 -0400883{
884 gossip_debug(GOSSIP_FILE_DEBUG,
885 "pvfs2_file_release: called on %s\n",
886 file->f_path.dentry->d_name.name);
887
888 pvfs2_flush_inode(inode);
889
890 /*
891 remove all associated inode pages from the page cache and mmap
892 readahead cache (if any); this forces an expensive refresh of
893 data for the next caller of mmap (or 'get_block' accesses)
894 */
895 if (file->f_path.dentry->d_inode &&
896 file->f_path.dentry->d_inode->i_mapping &&
897 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
898 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
899 0);
900 return 0;
901}
902
903/*
904 * Push all data for a specific file onto permanent storage.
905 */
Mike Marshall84d02152015-07-28 13:27:51 -0400906static int pvfs2_fsync(struct file *file,
907 loff_t start,
908 loff_t end,
909 int datasync)
Mike Marshall5db11c22015-07-17 10:38:12 -0400910{
911 int ret = -EINVAL;
912 struct pvfs2_inode_s *pvfs2_inode =
913 PVFS2_I(file->f_path.dentry->d_inode);
914 struct pvfs2_kernel_op_s *new_op = NULL;
915
916 /* required call */
917 filemap_write_and_wait_range(file->f_mapping, start, end);
918
919 new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
920 if (!new_op)
921 return -ENOMEM;
922 new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
923
924 ret = service_operation(new_op,
925 "pvfs2_fsync",
926 get_interruptible_flag(file->f_path.dentry->d_inode));
927
928 gossip_debug(GOSSIP_FILE_DEBUG,
929 "pvfs2_fsync got return value of %d\n",
930 ret);
931
932 op_release(new_op);
933
934 pvfs2_flush_inode(file->f_path.dentry->d_inode);
935 return ret;
936}
937
938/*
939 * Change the file pointer position for an instance of an open file.
940 *
941 * \note If .llseek is overriden, we must acquire lock as described in
942 * Documentation/filesystems/Locking.
943 *
944 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
945 * require much changes to the FS
946 */
Mike Marshall84d02152015-07-28 13:27:51 -0400947static loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
Mike Marshall5db11c22015-07-17 10:38:12 -0400948{
949 int ret = -EINVAL;
950 struct inode *inode = file->f_path.dentry->d_inode;
951
952 if (!inode) {
953 gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
954 return ret;
955 }
956
957 if (origin == PVFS2_SEEK_END) {
958 /*
959 * revalidate the inode's file size.
960 * NOTE: We are only interested in file size here,
961 * so we set mask accordingly.
962 */
963 ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
964 if (ret) {
965 gossip_debug(GOSSIP_FILE_DEBUG,
966 "%s:%s:%d calling make bad inode\n",
967 __FILE__,
968 __func__,
969 __LINE__);
970 pvfs2_make_bad_inode(inode);
971 return ret;
972 }
973 }
974
975 gossip_debug(GOSSIP_FILE_DEBUG,
976 "pvfs2_file_llseek: offset is %ld | origin is %d | "
977 "inode size is %lu\n",
978 (long)offset,
979 origin,
980 (unsigned long)file->f_path.dentry->d_inode->i_size);
981
982 return generic_file_llseek(file, offset, origin);
983}
984
985/*
986 * Support local locks (locks that only this kernel knows about)
987 * if Orangefs was mounted -o local_lock.
988 */
Mike Marshall84d02152015-07-28 13:27:51 -0400989static int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
Mike Marshall5db11c22015-07-17 10:38:12 -0400990{
991 int rc = -ENOLCK;
992
993 if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
994 if (cmd == F_GETLK) {
995 rc = 0;
996 posix_test_lock(filp, fl);
997 } else {
998 rc = posix_lock_file(filp, fl, NULL);
999 }
1000 }
1001
1002 return rc;
1003}
1004
/*
 * PVFS2 implementation of VFS file operations.
 * All reads and writes go through the iter entry points (no plain
 * ->read/->write); ->lock is only effective with -o local_lock.
 */
const struct file_operations pvfs2_file_operations = {
	.llseek = pvfs2_file_llseek,
	.read_iter = pvfs2_file_read_iter,
	.write_iter = pvfs2_file_write_iter,
	.lock = pvfs2_lock,
	.unlocked_ioctl = pvfs2_ioctl,
	.mmap = pvfs2_file_mmap,
	.open = generic_file_open,	/* no fs-specific open work needed */
	.release = pvfs2_file_release,
	.fsync = pvfs2_fsync,
};