blob: 8e26f9fac289df8f1cc43577756fbaafd62c47d9 [file] [log] [blame]
Mike Marshall5db11c22015-07-17 10:38:12 -04001/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "pvfs2-kernel.h"
13#include "pvfs2-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
/*
 * Flag an I/O op as completed and wake any sleeper on its
 * io_completion_waitq.  io_completed is set while holding op->lock;
 * the wakeup is issued after the lock has been dropped.
 *
 * The argument is parenthesized at every use so that expressions such
 * as "&some_op" or "ptr + 1" expand correctly.  It is still evaluated
 * several times, so it must not have side effects.
 */
#define wake_up_daemon_for_return(op)				\
do {								\
	spin_lock(&(op)->lock);					\
	(op)->io_completed = 1;					\
	spin_unlock(&(op)->lock);				\
	wake_up_interruptible(&(op)->io_completion_waitq);	\
} while (0)
24
25/*
26 * Copy to client-core's address space from the buffers specified
27 * by the iovec upto total_size bytes.
28 * NOTE: the iovector can either contain addresses which
29 * can futher be kernel-space or user-space addresses.
30 * or it can pointers to struct page's
31 */
32static int precopy_buffers(struct pvfs2_bufmap *bufmap,
33 int buffer_index,
34 const struct iovec *vec,
35 unsigned long nr_segs,
36 size_t total_size,
37 int from_user)
38{
39 int ret = 0;
40
41 /*
42 * copy data from application/kernel by pulling it out
43 * of the iovec.
44 */
45 /* Are we copying from User Virtual Addresses? */
46 if (from_user)
47 ret = pvfs_bufmap_copy_iovec_from_user(
48 bufmap,
49 buffer_index,
50 vec,
51 nr_segs,
52 total_size);
53 /* Are we copying from Kernel Virtual Addresses? */
54 else
55 ret = pvfs_bufmap_copy_iovec_from_kernel(
56 bufmap,
57 buffer_index,
58 vec,
59 nr_segs,
60 total_size);
61 if (ret < 0)
62 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
63 __func__,
64 (long)ret);
65 return ret;
66}
67
68/*
69 * Copy from client-core's address space to the buffers specified
70 * by the iovec upto total_size bytes.
71 * NOTE: the iovector can either contain addresses which
72 * can futher be kernel-space or user-space addresses.
73 * or it can pointers to struct page's
74 */
75static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
76 int buffer_index,
77 const struct iovec *vec,
78 int nr_segs,
79 size_t total_size,
80 int to_user)
81{
82 int ret = 0;
83
84 /*
85 * copy data to application/kernel by pushing it out to
86 * the iovec. NOTE; target buffers can be addresses or
87 * struct page pointers.
88 */
89 if (total_size) {
90 /* Are we copying to User Virtual Addresses? */
91 if (to_user)
92 ret = pvfs_bufmap_copy_to_user_iovec(
93 bufmap,
94 buffer_index,
95 vec,
96 nr_segs,
97 total_size);
98 /* Are we copying to Kern Virtual Addresses? */
99 else
100 ret = pvfs_bufmap_copy_to_kernel_iovec(
101 bufmap,
102 buffer_index,
103 vec,
104 nr_segs,
105 total_size);
106 if (ret < 0)
107 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
108 __func__,
109 (long)ret);
110 }
111 return ret;
112}
113
114/*
115 * Post and wait for the I/O upcall to finish
116 */
117static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
118 loff_t *offset, struct iovec *vec, unsigned long nr_segs,
119 size_t total_size, loff_t readahead_size, int to_user)
120{
121 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
122 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
123 struct pvfs2_bufmap *bufmap = NULL;
124 struct pvfs2_kernel_op_s *new_op = NULL;
125 int buffer_index = -1;
126 ssize_t ret;
127
128 new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
129 if (!new_op) {
130 ret = -ENOMEM;
131 goto out;
132 }
133 /* synchronous I/O */
134 new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
135 new_op->upcall.req.io.readahead_size = readahead_size;
136 new_op->upcall.req.io.io_type = type;
137 new_op->upcall.req.io.refn = pvfs2_inode->refn;
138
139populate_shared_memory:
140 /* get a shared buffer index */
141 ret = pvfs_bufmap_get(&bufmap, &buffer_index);
142 if (ret < 0) {
143 gossip_debug(GOSSIP_FILE_DEBUG,
144 "%s: pvfs_bufmap_get failure (%ld)\n",
145 __func__, (long)ret);
146 goto out;
147 }
148 gossip_debug(GOSSIP_FILE_DEBUG,
149 "%s(%pU): GET op %p -> buffer_index %d\n",
150 __func__,
151 handle,
152 new_op,
153 buffer_index);
154
155 new_op->uses_shared_memory = 1;
156 new_op->upcall.req.io.buf_index = buffer_index;
157 new_op->upcall.req.io.count = total_size;
158 new_op->upcall.req.io.offset = *offset;
159
160 gossip_debug(GOSSIP_FILE_DEBUG,
161 "%s(%pU): copy_to_user %d nr_segs %lu, offset: %llu total_size: %zd\n",
162 __func__,
163 handle,
164 to_user,
165 nr_segs,
166 llu(*offset),
167 total_size);
168 /*
169 * Stage 1: copy the buffers into client-core's address space
170 * precopy_buffers only pertains to writes.
171 */
172 if (type == PVFS_IO_WRITE) {
173 ret = precopy_buffers(bufmap,
174 buffer_index,
175 vec,
176 nr_segs,
177 total_size,
178 to_user);
179 if (ret < 0)
180 goto out;
181 }
182
183 gossip_debug(GOSSIP_FILE_DEBUG,
184 "%s(%pU): Calling post_io_request with tag (%llu)\n",
185 __func__,
186 handle,
187 llu(new_op->tag));
188
189 /* Stage 2: Service the I/O operation */
190 ret = service_operation(new_op,
191 type == PVFS_IO_WRITE ?
192 "file_write" :
193 "file_read",
194 get_interruptible_flag(inode));
195
196 /*
197 * If service_operation() returns -EAGAIN #and# the operation was
198 * purged from pvfs2_request_list or htable_ops_in_progress, then
199 * we know that the client was restarted, causing the shared memory
200 * area to be wiped clean. To restart a write operation in this
201 * case, we must re-copy the data from the user's iovec to a NEW
202 * shared memory location. To restart a read operation, we must get
203 * a new shared memory location.
204 */
205 if (ret == -EAGAIN && op_state_purged(new_op)) {
206 pvfs_bufmap_put(bufmap, buffer_index);
207 gossip_debug(GOSSIP_FILE_DEBUG,
208 "%s:going to repopulate_shared_memory.\n",
209 __func__);
210 goto populate_shared_memory;
211 }
212
213 if (ret < 0) {
214 handle_io_error(); /* defined in pvfs2-kernel.h */
215 /*
216 don't write an error to syslog on signaled operation
217 termination unless we've got debugging turned on, as
218 this can happen regularly (i.e. ctrl-c)
219 */
220 if (ret == -EINTR)
221 gossip_debug(GOSSIP_FILE_DEBUG,
222 "%s: returning error %ld\n", __func__,
223 (long)ret);
224 else
225 gossip_err("%s: error in %s handle %pU, returning %zd\n",
226 __func__,
227 type == PVFS_IO_READ ?
228 "read from" : "write to",
229 handle, ret);
230 goto out;
231 }
232
233 /*
234 * Stage 3: Post copy buffers from client-core's address space
235 * postcopy_buffers only pertains to reads.
236 */
237 if (type == PVFS_IO_READ) {
238 ret = postcopy_buffers(bufmap,
239 buffer_index,
240 vec,
241 nr_segs,
242 new_op->downcall.resp.io.amt_complete,
243 to_user);
244 if (ret < 0) {
245 /*
246 * put error codes in downcall so that handle_io_error()
247 * preserves it properly
248 */
249 new_op->downcall.status = ret;
250 handle_io_error();
251 goto out;
252 }
253 }
254 gossip_debug(GOSSIP_FILE_DEBUG,
255 "%s(%pU): Amount written as returned by the sys-io call:%d\n",
256 __func__,
257 handle,
258 (int)new_op->downcall.resp.io.amt_complete);
259
260 ret = new_op->downcall.resp.io.amt_complete;
261
262 /*
263 tell the device file owner waiting on I/O that this read has
264 completed and it can return now. in this exact case, on
265 wakeup the daemon will free the op, so we *cannot* touch it
266 after this.
267 */
268 wake_up_daemon_for_return(new_op);
269 new_op = NULL;
270
271out:
272 if (buffer_index >= 0) {
273 pvfs_bufmap_put(bufmap, buffer_index);
274 gossip_debug(GOSSIP_FILE_DEBUG,
275 "%s(%pU): PUT buffer_index %d\n",
276 __func__, handle, buffer_index);
277 buffer_index = -1;
278 }
279 if (new_op) {
280 op_release(new_op);
281 new_op = NULL;
282 }
283 return ret;
284}
285
286/*
287 * The reason we need to do this is to be able to support readv and writev
288 * that are larger than (pvfs_bufmap_size_query()) Default is
289 * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
290 * create a new io vec descriptor for those memory addresses that
291 * go beyond the limit. Return value for this routine is negative in case
292 * of errors and 0 in case of success.
293 *
294 * Further, the new_nr_segs pointer is updated to hold the new value
295 * of number of iovecs, the new_vec pointer is updated to hold the pointer
296 * to the new split iovec, and the size array is an array of integers holding
297 * the number of iovecs that straddle pvfs_bufmap_size_query().
298 * The max_new_nr_segs value is computed by the caller and returned.
299 * (It will be (count of all iov_len/ block_size) + 1).
300 */
301static int split_iovecs(unsigned long max_new_nr_segs, /* IN */
302 unsigned long nr_segs, /* IN */
303 const struct iovec *original_iovec, /* IN */
304 unsigned long *new_nr_segs, /* OUT */
305 struct iovec **new_vec, /* OUT */
306 unsigned long *seg_count, /* OUT */
307 unsigned long **seg_array) /* OUT */
308{
309 unsigned long seg;
310 unsigned long count = 0;
311 unsigned long begin_seg;
312 unsigned long tmpnew_nr_segs = 0;
313 struct iovec *new_iovec = NULL;
314 struct iovec *orig_iovec;
315 unsigned long *sizes = NULL;
316 unsigned long sizes_count = 0;
317
318 if (nr_segs <= 0 ||
319 original_iovec == NULL ||
320 new_nr_segs == NULL ||
321 new_vec == NULL ||
322 seg_count == NULL ||
323 seg_array == NULL ||
324 max_new_nr_segs <= 0) {
325 gossip_err("Invalid parameters to split_iovecs\n");
326 return -EINVAL;
327 }
328 *new_nr_segs = 0;
329 *new_vec = NULL;
330 *seg_count = 0;
331 *seg_array = NULL;
332 /* copy the passed in iovec descriptor to a temp structure */
333 orig_iovec = kmalloc_array(nr_segs,
334 sizeof(*orig_iovec),
335 PVFS2_BUFMAP_GFP_FLAGS);
336 if (orig_iovec == NULL) {
337 gossip_err(
338 "split_iovecs: Could not allocate memory for %lu bytes!\n",
339 (unsigned long)(nr_segs * sizeof(*orig_iovec)));
340 return -ENOMEM;
341 }
342 new_iovec = kcalloc(max_new_nr_segs,
343 sizeof(*new_iovec),
344 PVFS2_BUFMAP_GFP_FLAGS);
345 if (new_iovec == NULL) {
346 kfree(orig_iovec);
347 gossip_err(
348 "split_iovecs: Could not allocate memory for %lu bytes!\n",
349 (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
350 return -ENOMEM;
351 }
352 sizes = kcalloc(max_new_nr_segs,
353 sizeof(*sizes),
354 PVFS2_BUFMAP_GFP_FLAGS);
355 if (sizes == NULL) {
356 kfree(new_iovec);
357 kfree(orig_iovec);
358 gossip_err(
359 "split_iovecs: Could not allocate memory for %lu bytes!\n",
360 (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
361 return -ENOMEM;
362 }
363 /* copy the passed in iovec to a temp structure */
364 memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
365 begin_seg = 0;
366repeat:
367 for (seg = begin_seg; seg < nr_segs; seg++) {
368 if (tmpnew_nr_segs >= max_new_nr_segs ||
369 sizes_count >= max_new_nr_segs) {
370 kfree(sizes);
371 kfree(orig_iovec);
372 kfree(new_iovec);
373 gossip_err
374 ("split_iovecs: exceeded the index limit (%lu)\n",
375 tmpnew_nr_segs);
376 return -EINVAL;
377 }
378 if (count + orig_iovec[seg].iov_len <
379 pvfs_bufmap_size_query()) {
380 count += orig_iovec[seg].iov_len;
381 memcpy(&new_iovec[tmpnew_nr_segs],
382 &orig_iovec[seg],
383 sizeof(*new_iovec));
384 tmpnew_nr_segs++;
385 sizes[sizes_count]++;
386 } else {
387 new_iovec[tmpnew_nr_segs].iov_base =
388 orig_iovec[seg].iov_base;
389 new_iovec[tmpnew_nr_segs].iov_len =
390 (pvfs_bufmap_size_query() - count);
391 tmpnew_nr_segs++;
392 sizes[sizes_count]++;
393 sizes_count++;
394 begin_seg = seg;
395 orig_iovec[seg].iov_base +=
396 (pvfs_bufmap_size_query() - count);
397 orig_iovec[seg].iov_len -=
398 (pvfs_bufmap_size_query() - count);
399 count = 0;
400 break;
401 }
402 }
403 if (seg != nr_segs)
404 goto repeat;
405 else
406 sizes_count++;
407
408 *new_nr_segs = tmpnew_nr_segs;
409 /* new_iovec is freed by the caller */
410 *new_vec = new_iovec;
411 *seg_count = sizes_count;
412 /* seg_array is also freed by the caller */
413 *seg_array = sizes;
414 kfree(orig_iovec);
415 return 0;
416}
417
418static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
419 ssize_t *total_count)
420{
421 unsigned long i;
422 long max_nr_iovecs;
423 ssize_t total;
424 ssize_t count;
425
426 total = 0;
427 count = 0;
428 max_nr_iovecs = 0;
429 for (i = 0; i < nr_segs; i++) {
430 const struct iovec *iv = &curr[i];
431
432 count += iv->iov_len;
433 if (unlikely((ssize_t) (count | iv->iov_len) < 0))
434 return -EINVAL;
435 if (total + iv->iov_len < pvfs_bufmap_size_query()) {
436 total += iv->iov_len;
437 max_nr_iovecs++;
438 } else {
439 total =
440 (total + iv->iov_len - pvfs_bufmap_size_query());
441 max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
442 }
443 }
444 *total_count = count;
445 return max_nr_iovecs;
446}
447
448/*
449 * Common entry point for read/write/readv/writev
450 * This function will dispatch it to either the direct I/O
451 * or buffered I/O path depending on the mount options and/or
452 * augmented/extended metadata attached to the file.
453 * Note: File extended attributes override any mount options.
454 */
455static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
456 loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
457{
458 struct inode *inode = file->f_mapping->host;
459 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
460 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
461 ssize_t ret;
462 ssize_t total_count;
463 unsigned int to_free;
464 size_t count;
465 unsigned long seg;
466 unsigned long new_nr_segs = 0;
467 unsigned long max_new_nr_segs = 0;
468 unsigned long seg_count = 0;
469 unsigned long *seg_array = NULL;
470 struct iovec *iovecptr = NULL;
471 struct iovec *ptr = NULL;
472
473 total_count = 0;
474 ret = -EINVAL;
475 count = 0;
476 to_free = 0;
477
478 /* Compute total and max number of segments after split */
479 max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
480 if (max_new_nr_segs < 0) {
481 gossip_lerr("%s: could not bound iovec %lu\n",
482 __func__,
483 max_new_nr_segs);
484 goto out;
485 }
486
487 gossip_debug(GOSSIP_FILE_DEBUG,
488 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
489 __func__,
490 handle,
491 (int)count);
492
493 if (type == PVFS_IO_WRITE) {
494 gossip_debug(GOSSIP_FILE_DEBUG,
495 "%s(%pU): proceeding with offset : %llu, "
496 "size %d\n",
497 __func__,
498 handle,
499 llu(*offset),
500 (int)count);
501 }
502
503 if (count == 0) {
504 ret = 0;
505 goto out;
506 }
507
508 /*
509 * if the total size of data transfer requested is greater than
510 * the kernel-set blocksize of PVFS2, then we split the iovecs
511 * such that no iovec description straddles a block size limit
512 */
513
514 gossip_debug(GOSSIP_FILE_DEBUG,
515 "%s: pvfs_bufmap_size:%d\n",
516 __func__,
517 pvfs_bufmap_size_query());
518
519 if (count > pvfs_bufmap_size_query()) {
520 /*
521 * Split up the given iovec description such that
522 * no iovec descriptor straddles over the block-size limitation.
523 * This makes us our job easier to stage the I/O.
524 * In addition, this function will also compute an array
525 * with seg_count entries that will store the number of
526 * segments that straddle the block-size boundaries.
527 */
528 ret = split_iovecs(max_new_nr_segs, /* IN */
529 nr_segs, /* IN */
530 iov, /* IN */
531 &new_nr_segs, /* OUT */
532 &iovecptr, /* OUT */
533 &seg_count, /* OUT */
534 &seg_array); /* OUT */
535 if (ret < 0) {
536 gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
537 __func__,
538 ret);
539 goto out;
540 }
541 gossip_debug(GOSSIP_FILE_DEBUG,
542 "%s: Splitting iovecs from %lu to %lu"
543 " [max_new %lu]\n",
544 __func__,
545 nr_segs,
546 new_nr_segs,
547 max_new_nr_segs);
548 /* We must free seg_array and iovecptr */
549 to_free = 1;
550 } else {
551 new_nr_segs = nr_segs;
552 /* use the given iovec description */
553 iovecptr = (struct iovec *)iov;
554 /* There is only 1 element in the seg_array */
555 seg_count = 1;
556 /* and its value is the number of segments passed in */
557 seg_array = &nr_segs;
558 /* We dont have to free up anything */
559 to_free = 0;
560 }
561 ptr = iovecptr;
562
563 gossip_debug(GOSSIP_FILE_DEBUG,
564 "%s(%pU) %zd@%llu\n",
565 __func__,
566 handle,
567 count,
568 llu(*offset));
569 gossip_debug(GOSSIP_FILE_DEBUG,
570 "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
571 __func__,
572 handle,
573 new_nr_segs, seg_count);
574
575/* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
576#ifdef PVFS2_KERNEL_DEBUG
577 for (seg = 0; seg < new_nr_segs; seg++)
578 gossip_debug(GOSSIP_FILE_DEBUG,
579 "%s: %d) %p to %p [%d bytes]\n",
580 __func__,
581 (int)seg + 1,
582 iovecptr[seg].iov_base,
583 iovecptr[seg].iov_base + iovecptr[seg].iov_len,
584 (int)iovecptr[seg].iov_len);
585 for (seg = 0; seg < seg_count; seg++)
586 gossip_debug(GOSSIP_FILE_DEBUG,
587 "%s: %zd) %lu\n",
588 __func__,
589 seg + 1,
590 seg_array[seg]);
591#endif
592 seg = 0;
593 while (total_count < count) {
594 size_t each_count;
595 size_t amt_complete;
596
597 /* how much to transfer in this loop iteration */
598 each_count =
599 (((count - total_count) > pvfs_bufmap_size_query()) ?
600 pvfs_bufmap_size_query() :
601 (count - total_count));
602
603 gossip_debug(GOSSIP_FILE_DEBUG,
604 "%s(%pU): size of each_count(%d)\n",
605 __func__,
606 handle,
607 (int)each_count);
608 gossip_debug(GOSSIP_FILE_DEBUG,
609 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
610 __func__,
611 handle,
612 (int)*offset);
613
614 ret = wait_for_direct_io(type, inode, offset, ptr,
615 seg_array[seg], each_count, 0, 1);
616 gossip_debug(GOSSIP_FILE_DEBUG,
617 "%s(%pU): return from wait_for_io:%d\n",
618 __func__,
619 handle,
620 (int)ret);
621
622 if (ret < 0)
623 goto out;
624
625 /* advance the iovec pointer */
626 ptr += seg_array[seg];
627 seg++;
628 *offset += ret;
629 total_count += ret;
630 amt_complete = ret;
631
632 gossip_debug(GOSSIP_FILE_DEBUG,
633 "%s(%pU): AFTER wait_for_io: offset is %d\n",
634 __func__,
635 handle,
636 (int)*offset);
637
638 /*
639 * if we got a short I/O operations,
640 * fall out and return what we got so far
641 */
642 if (amt_complete < each_count)
643 break;
644 } /*end while */
645
646 if (total_count > 0)
647 ret = total_count;
648out:
649 if (to_free) {
650 kfree(iovecptr);
651 kfree(seg_array);
652 }
653 if (ret > 0) {
654 if (type == PVFS_IO_READ) {
655 file_accessed(file);
656 } else {
657 SetMtimeFlag(pvfs2_inode);
658 inode->i_mtime = CURRENT_TIME;
659 mark_inode_dirty_sync(inode);
660 }
661 }
662
663 gossip_debug(GOSSIP_FILE_DEBUG,
664 "%s(%pU): Value(%d) returned.\n",
665 __func__,
666 handle,
667 (int)ret);
668
669 return ret;
670}
671
672/*
673 * Read data from a specified offset in a file (referenced by inode).
674 * Data may be placed either in a user or kernel buffer.
675 */
676ssize_t pvfs2_inode_read(struct inode *inode,
677 char __user *buf,
678 size_t count,
679 loff_t *offset,
680 loff_t readahead_size)
681{
682 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
683 size_t bufmap_size;
684 struct iovec vec;
685 ssize_t ret = -EINVAL;
686
687 g_pvfs2_stats.reads++;
688
689 vec.iov_base = buf;
690 vec.iov_len = count;
691
692 bufmap_size = pvfs_bufmap_size_query();
693 if (count > bufmap_size) {
694 gossip_debug(GOSSIP_FILE_DEBUG,
695 "%s: count is too large (%zd/%zd)!\n",
696 __func__, count, bufmap_size);
697 return -EINVAL;
698 }
699
700 gossip_debug(GOSSIP_FILE_DEBUG,
701 "%s(%pU) %zd@%llu\n",
702 __func__,
703 &pvfs2_inode->refn.khandle,
704 count,
705 llu(*offset));
706
707 ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
708 count, readahead_size, 0);
709 if (ret > 0)
710 *offset += ret;
711
712 gossip_debug(GOSSIP_FILE_DEBUG,
713 "%s(%pU): Value(%zd) returned.\n",
714 __func__,
715 &pvfs2_inode->refn.khandle,
716 ret);
717
718 return ret;
719}
720
721static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
722{
723 struct file *file = iocb->ki_filp;
724 loff_t pos = *(&iocb->ki_pos);
725 ssize_t rc = 0;
726 unsigned long nr_segs = iter->nr_segs;
727
728 BUG_ON(iocb->private);
729
730 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
731
732 g_pvfs2_stats.reads++;
733
734 rc = do_readv_writev(PVFS_IO_READ,
735 file,
736 &pos,
737 iter->iov,
738 nr_segs);
739 iocb->ki_pos = pos;
740
741 return rc;
742}
743
744static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
745{
746 struct file *file = iocb->ki_filp;
747 loff_t pos = *(&iocb->ki_pos);
748 unsigned long nr_segs = iter->nr_segs;
749 ssize_t rc;
750
751 BUG_ON(iocb->private);
752
753 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
754
755 mutex_lock(&file->f_mapping->host->i_mutex);
756
757 /* Make sure generic_write_checks sees an up to date inode size. */
758 if (file->f_flags & O_APPEND) {
759 rc = pvfs2_inode_getattr(file->f_mapping->host,
760 PVFS_ATTR_SYS_SIZE);
761 if (rc) {
762 gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
763 __func__, rc);
764 goto out;
765 }
766 }
767
768 if (file->f_pos > i_size_read(file->f_mapping->host))
769 pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
770
771 rc = generic_write_checks(iocb, iter);
772
773 if (rc <= 0) {
774 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
775 __func__, rc);
776 goto out;
777 }
778
779 rc = do_readv_writev(PVFS_IO_WRITE,
780 file,
781 &pos,
782 iter->iov,
783 nr_segs);
784 if (rc < 0) {
785 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
786 __func__, rc);
787 goto out;
788 }
789
790 iocb->ki_pos = pos;
791 g_pvfs2_stats.writes++;
792
793out:
794
795 mutex_unlock(&file->f_mapping->host->i_mutex);
796 return rc;
797}
798
799/*
800 * Perform a miscellaneous operation on a file.
801 */
802long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
803{
804 int ret = -ENOTTY;
805 __u64 val = 0;
806 unsigned long uval;
807
808 gossip_debug(GOSSIP_FILE_DEBUG,
809 "pvfs2_ioctl: called with cmd %d\n",
810 cmd);
811
812 /*
813 * we understand some general ioctls on files, such as the immutable
814 * and append flags
815 */
816 if (cmd == FS_IOC_GETFLAGS) {
817 val = 0;
818 ret = pvfs2_xattr_get_default(file->f_path.dentry,
819 "user.pvfs2.meta_hint",
820 &val,
821 sizeof(val),
822 0);
823 if (ret < 0 && ret != -ENODATA)
824 return ret;
825 else if (ret == -ENODATA)
826 val = 0;
827 uval = val;
828 gossip_debug(GOSSIP_FILE_DEBUG,
829 "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
830 (unsigned long long)uval);
831 return put_user(uval, (int __user *)arg);
832 } else if (cmd == FS_IOC_SETFLAGS) {
833 ret = 0;
834 if (get_user(uval, (int __user *)arg))
835 return -EFAULT;
836 /*
837 * PVFS_MIRROR_FL is set internally when the mirroring mode
838 * is turned on for a file. The user is not allowed to turn
839 * on this bit, but the bit is present if the user first gets
840 * the flags and then updates the flags with some new
841 * settings. So, we ignore it in the following edit. bligon.
842 */
843 if ((uval & ~PVFS_MIRROR_FL) &
844 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
845 gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
846 return -EINVAL;
847 }
848 val = uval;
849 gossip_debug(GOSSIP_FILE_DEBUG,
850 "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
851 (unsigned long long)val);
852 ret = pvfs2_xattr_set_default(file->f_path.dentry,
853 "user.pvfs2.meta_hint",
854 &val,
855 sizeof(val),
856 0,
857 0);
858 }
859
860 return ret;
861}
862
863/*
864 * Memory map a region of a file.
865 */
866static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
867{
868 gossip_debug(GOSSIP_FILE_DEBUG,
869 "pvfs2_file_mmap: called on %s\n",
870 (file ?
871 (char *)file->f_path.dentry->d_name.name :
872 (char *)"Unknown"));
873
874 /* set the sequential readahead hint */
875 vma->vm_flags |= VM_SEQ_READ;
876 vma->vm_flags &= ~VM_RAND_READ;
877 return generic_file_mmap(file, vma);
878}
879
880#define mapping_nrpages(idata) ((idata)->nrpages)
881
882/*
883 * Called to notify the module that there are no more references to
884 * this file (i.e. no processes have it open).
885 *
886 * \note Not called when each file is closed.
887 */
888int pvfs2_file_release(struct inode *inode, struct file *file)
889{
890 gossip_debug(GOSSIP_FILE_DEBUG,
891 "pvfs2_file_release: called on %s\n",
892 file->f_path.dentry->d_name.name);
893
894 pvfs2_flush_inode(inode);
895
896 /*
897 remove all associated inode pages from the page cache and mmap
898 readahead cache (if any); this forces an expensive refresh of
899 data for the next caller of mmap (or 'get_block' accesses)
900 */
901 if (file->f_path.dentry->d_inode &&
902 file->f_path.dentry->d_inode->i_mapping &&
903 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
904 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
905 0);
906 return 0;
907}
908
909/*
910 * Push all data for a specific file onto permanent storage.
911 */
912int pvfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
913{
914 int ret = -EINVAL;
915 struct pvfs2_inode_s *pvfs2_inode =
916 PVFS2_I(file->f_path.dentry->d_inode);
917 struct pvfs2_kernel_op_s *new_op = NULL;
918
919 /* required call */
920 filemap_write_and_wait_range(file->f_mapping, start, end);
921
922 new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
923 if (!new_op)
924 return -ENOMEM;
925 new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
926
927 ret = service_operation(new_op,
928 "pvfs2_fsync",
929 get_interruptible_flag(file->f_path.dentry->d_inode));
930
931 gossip_debug(GOSSIP_FILE_DEBUG,
932 "pvfs2_fsync got return value of %d\n",
933 ret);
934
935 op_release(new_op);
936
937 pvfs2_flush_inode(file->f_path.dentry->d_inode);
938 return ret;
939}
940
941/*
942 * Change the file pointer position for an instance of an open file.
943 *
944 * \note If .llseek is overriden, we must acquire lock as described in
945 * Documentation/filesystems/Locking.
946 *
947 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
948 * require much changes to the FS
949 */
950loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
951{
952 int ret = -EINVAL;
953 struct inode *inode = file->f_path.dentry->d_inode;
954
955 if (!inode) {
956 gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
957 return ret;
958 }
959
960 if (origin == PVFS2_SEEK_END) {
961 /*
962 * revalidate the inode's file size.
963 * NOTE: We are only interested in file size here,
964 * so we set mask accordingly.
965 */
966 ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
967 if (ret) {
968 gossip_debug(GOSSIP_FILE_DEBUG,
969 "%s:%s:%d calling make bad inode\n",
970 __FILE__,
971 __func__,
972 __LINE__);
973 pvfs2_make_bad_inode(inode);
974 return ret;
975 }
976 }
977
978 gossip_debug(GOSSIP_FILE_DEBUG,
979 "pvfs2_file_llseek: offset is %ld | origin is %d | "
980 "inode size is %lu\n",
981 (long)offset,
982 origin,
983 (unsigned long)file->f_path.dentry->d_inode->i_size);
984
985 return generic_file_llseek(file, offset, origin);
986}
987
988/*
989 * Support local locks (locks that only this kernel knows about)
990 * if Orangefs was mounted -o local_lock.
991 */
992int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
993{
994 int rc = -ENOLCK;
995
996 if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
997 if (cmd == F_GETLK) {
998 rc = 0;
999 posix_test_lock(filp, fl);
1000 } else {
1001 rc = posix_lock_file(filp, fl, NULL);
1002 }
1003 }
1004
1005 return rc;
1006}
1007
1008/** PVFS2 implementation of VFS file operations */
1009const struct file_operations pvfs2_file_operations = {
1010 .llseek = pvfs2_file_llseek,
1011 .read_iter = pvfs2_file_read_iter,
1012 .write_iter = pvfs2_file_write_iter,
1013 .lock = pvfs2_lock,
1014 .unlocked_ioctl = pvfs2_ioctl,
1015 .mmap = pvfs2_file_mmap,
1016 .open = generic_file_open,
1017 .release = pvfs2_file_release,
1018 .fsync = pvfs2_fsync,
1019};