blob: feb1764c2f8062148462d7704a0ae0dc7cc9f29a [file] [log] [blame]
Mike Marshall5db11c22015-07-17 10:38:12 -04001/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "pvfs2-kernel.h"
13#include "pvfs2-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
/*
 * Mark an op's I/O as completed (under op->lock) and wake whoever is
 * sleeping on op->io_completion_waitq.
 *
 * NOTE: the woken device-file owner may free the op (see
 * wait_for_direct_io), so the caller must not touch *op after
 * invoking this macro.
 */
#define wake_up_daemon_for_return(op)			\
do {							\
	spin_lock(&op->lock);                           \
	op->io_completed = 1;                           \
	spin_unlock(&op->lock);                         \
	wake_up_interruptible(&op->io_completion_waitq);\
} while (0)
24
25/*
26 * Copy to client-core's address space from the buffers specified
27 * by the iovec upto total_size bytes.
28 * NOTE: the iovector can either contain addresses which
29 * can futher be kernel-space or user-space addresses.
30 * or it can pointers to struct page's
31 */
32static int precopy_buffers(struct pvfs2_bufmap *bufmap,
33 int buffer_index,
34 const struct iovec *vec,
35 unsigned long nr_segs,
Mike Marshall4d1c4402015-09-04 10:31:16 -040036 size_t total_size)
Mike Marshall5db11c22015-07-17 10:38:12 -040037{
38 int ret = 0;
Mike Marshall4d1c4402015-09-04 10:31:16 -040039 struct iov_iter iter;
Mike Marshall5db11c22015-07-17 10:38:12 -040040
41 /*
42 * copy data from application/kernel by pulling it out
43 * of the iovec.
44 */
Mike Marshall4d1c4402015-09-04 10:31:16 -040045
46
47 if (total_size) {
48 iov_iter_init(&iter, WRITE, vec, nr_segs, total_size);
49 ret = pvfs_bufmap_copy_from_iovec(bufmap,
50 &iter,
51 buffer_index,
52 total_size);
53 if (ret < 0)
54 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
55 __func__,
56 (long)ret);
Mike Marshall4d1c4402015-09-04 10:31:16 -040057 }
58
Mike Marshall5db11c22015-07-17 10:38:12 -040059 if (ret < 0)
60 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
61 __func__,
62 (long)ret);
63 return ret;
64}
65
66/*
67 * Copy from client-core's address space to the buffers specified
68 * by the iovec upto total_size bytes.
69 * NOTE: the iovector can either contain addresses which
70 * can futher be kernel-space or user-space addresses.
71 * or it can pointers to struct page's
72 */
73static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
74 int buffer_index,
75 const struct iovec *vec,
76 int nr_segs,
Mike Marshall4d1c4402015-09-04 10:31:16 -040077 size_t total_size)
Mike Marshall5db11c22015-07-17 10:38:12 -040078{
79 int ret = 0;
80
Mike Marshall4d1c4402015-09-04 10:31:16 -040081 struct iov_iter iter;
82
Mike Marshall5db11c22015-07-17 10:38:12 -040083 /*
84 * copy data to application/kernel by pushing it out to
85 * the iovec. NOTE; target buffers can be addresses or
86 * struct page pointers.
87 */
88 if (total_size) {
Mike Marshall4d1c4402015-09-04 10:31:16 -040089 iov_iter_init(&iter, READ, vec, nr_segs, total_size);
90 ret = pvfs_bufmap_copy_to_iovec(bufmap,
91 &iter,
92 buffer_index);
Mike Marshall5db11c22015-07-17 10:38:12 -040093 if (ret < 0)
Mike Marshall4d1c4402015-09-04 10:31:16 -040094 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
Mike Marshall5db11c22015-07-17 10:38:12 -040095 __func__,
96 (long)ret);
97 }
98 return ret;
99}
100
/*
 * Post and wait for the I/O upcall to finish.
 *
 * type:           PVFS_IO_READ or PVFS_IO_WRITE.
 * inode:          target file's inode.
 * offset:         starting file offset (read-only here; callers advance it).
 * vec/nr_segs:    iovec describing the source/destination buffers.
 * total_size:     number of bytes to transfer through one shared-memory
 *                 slot (callers cap this at pvfs_bufmap_size_query()).
 * readahead_size: readahead hint forwarded to the client-core.
 *
 * Returns the number of bytes the client-core completed, or a
 * negative errno.
 */
static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
	loff_t *offset, struct iovec *vec, unsigned long nr_segs,
	size_t total_size, loff_t readahead_size)
{
	struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
	struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
	struct pvfs2_bufmap *bufmap = NULL;
	struct pvfs2_kernel_op_s *new_op = NULL;
	int buffer_index = -1;	/* shared-memory slot; -1 means none held */
	ssize_t ret;

	new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
	if (!new_op) {
		ret = -ENOMEM;
		goto out;
	}
	/* synchronous I/O */
	new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
	new_op->upcall.req.io.readahead_size = readahead_size;
	new_op->upcall.req.io.io_type = type;
	new_op->upcall.req.io.refn = pvfs2_inode->refn;

populate_shared_memory:
	/* get a shared buffer index */
	ret = pvfs_bufmap_get(&bufmap, &buffer_index);
	if (ret < 0) {
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s: pvfs_bufmap_get failure (%ld)\n",
			     __func__, (long)ret);
		goto out;
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): GET op %p -> buffer_index %d\n",
		     __func__,
		     handle,
		     new_op,
		     buffer_index);

	new_op->uses_shared_memory = 1;
	new_op->upcall.req.io.buf_index = buffer_index;
	new_op->upcall.req.io.count = total_size;
	new_op->upcall.req.io.offset = *offset;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): nr_segs %lu, offset: %llu total_size: %zd\n",
		     __func__,
		     handle,
		     nr_segs,
		     llu(*offset),
		     total_size);
	/*
	 * Stage 1: copy the buffers into client-core's address space
	 * precopy_buffers only pertains to writes.
	 */
	if (type == PVFS_IO_WRITE) {
		ret = precopy_buffers(bufmap,
				      buffer_index,
				      vec,
				      nr_segs,
				      total_size);
		if (ret < 0)
			goto out;
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Calling post_io_request with tag (%llu)\n",
		     __func__,
		     handle,
		     llu(new_op->tag));

	/* Stage 2: Service the I/O operation */
	ret = service_operation(new_op,
				type == PVFS_IO_WRITE ?
					"file_write" :
					"file_read",
				get_interruptible_flag(inode));

	/*
	 * If service_operation() returns -EAGAIN #and# the operation was
	 * purged from pvfs2_request_list or htable_ops_in_progress, then
	 * we know that the client was restarted, causing the shared memory
	 * area to be wiped clean.  To restart a write operation in this
	 * case, we must re-copy the data from the user's iovec to a NEW
	 * shared memory location.  To restart a read operation, we must get
	 * a new shared memory location.
	 *
	 * NOTE(review): buffer_index is not reset to -1 before retrying;
	 * if the subsequent pvfs_bufmap_get() fails without touching
	 * buffer_index, the "out" path would put the already-released
	 * slot again — confirm pvfs_bufmap_get()'s failure semantics.
	 */
	if (ret == -EAGAIN && op_state_purged(new_op)) {
		pvfs_bufmap_put(bufmap, buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s:going to repopulate_shared_memory.\n",
			     __func__);
		goto populate_shared_memory;
	}

	if (ret < 0) {
		handle_io_error();	/* defined in pvfs2-kernel.h */
		/*
		 * don't write an error to syslog on signaled operation
		 * termination unless we've got debugging turned on, as
		 * this can happen regularly (i.e. ctrl-c)
		 */
		if (ret == -EINTR)
			gossip_debug(GOSSIP_FILE_DEBUG,
				     "%s: returning error %ld\n", __func__,
				     (long)ret);
		else
			gossip_err("%s: error in %s handle %pU, returning %zd\n",
				   __func__,
				   type == PVFS_IO_READ ?
					"read from" : "write to",
				   handle, ret);
		goto out;
	}

	/*
	 * Stage 3: Post copy buffers from client-core's address space
	 * postcopy_buffers only pertains to reads.
	 */
	if (type == PVFS_IO_READ) {
		ret = postcopy_buffers(bufmap,
				       buffer_index,
				       vec,
				       nr_segs,
				       new_op->downcall.resp.io.amt_complete);
		if (ret < 0) {
			/*
			 * put error codes in downcall so that handle_io_error()
			 * preserves it properly
			 */
			new_op->downcall.status = ret;
			handle_io_error();
			goto out;
		}
	}
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "%s(%pU): Amount written as returned by the sys-io call:%d\n",
		     __func__,
		     handle,
		     (int)new_op->downcall.resp.io.amt_complete);

	ret = new_op->downcall.resp.io.amt_complete;

	/*
	 * tell the device file owner waiting on I/O that this read has
	 * completed and it can return now.  in this exact case, on
	 * wakeup the daemon will free the op, so we *cannot* touch it
	 * after this.
	 */
	wake_up_daemon_for_return(new_op);
	new_op = NULL;

out:
	/* release the shared-memory slot, if one is still held */
	if (buffer_index >= 0) {
		pvfs_bufmap_put(bufmap, buffer_index);
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "%s(%pU): PUT buffer_index %d\n",
			     __func__, handle, buffer_index);
		buffer_index = -1;
	}
	/* only non-NULL when the op was NOT handed off to the daemon */
	if (new_op) {
		op_release(new_op);
		new_op = NULL;
	}
	return ret;
}
269
/*
 * The reason we need to do this is to be able to support readv and writev
 * that are larger than (pvfs_bufmap_size_query()) Default is
 * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
 * create a new io vec descriptor for those memory addresses that
 * go beyond the limit. Return value for this routine is negative in case
 * of errors and 0 in case of success.
 *
 * Further, the new_nr_segs pointer is updated to hold the new value
 * of number of iovecs, the new_vec pointer is updated to hold the pointer
 * to the new split iovec, and the size array is an array of integers holding
 * the number of iovecs that straddle pvfs_bufmap_size_query().
 * The max_new_nr_segs value is computed by the caller and returned.
 * (It will be (count of all iov_len/ block_size) + 1).
 *
 * On success the caller owns (and must kfree) both *new_vec and
 * *seg_array; on failure all allocations are released here.
 */
static int split_iovecs(unsigned long max_new_nr_segs,		/* IN */
			unsigned long nr_segs,			/* IN */
			const struct iovec *original_iovec,	/* IN */
			unsigned long *new_nr_segs,		/* OUT */
			struct iovec **new_vec,			/* OUT */
			unsigned long *seg_count,		/* OUT */
			unsigned long **seg_array)		/* OUT */
{
	unsigned long seg;
	unsigned long count = 0;	/* bytes placed in the current block */
	unsigned long begin_seg;
	unsigned long tmpnew_nr_segs = 0;
	struct iovec *new_iovec = NULL;
	struct iovec *orig_iovec;	/* scratch copy; mutated while splitting */
	unsigned long *sizes = NULL;
	unsigned long sizes_count = 0;

	if (nr_segs <= 0 ||
	    original_iovec == NULL ||
	    new_nr_segs == NULL ||
	    new_vec == NULL ||
	    seg_count == NULL ||
	    seg_array == NULL ||
	    max_new_nr_segs <= 0) {
		gossip_err("Invalid parameters to split_iovecs\n");
		return -EINVAL;
	}
	*new_nr_segs = 0;
	*new_vec = NULL;
	*seg_count = 0;
	*seg_array = NULL;
	/* copy the passed in iovec descriptor to a temp structure */
	orig_iovec = kmalloc_array(nr_segs,
				   sizeof(*orig_iovec),
				   PVFS2_BUFMAP_GFP_FLAGS);
	if (orig_iovec == NULL) {
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(nr_segs * sizeof(*orig_iovec)));
		return -ENOMEM;
	}
	new_iovec = kcalloc(max_new_nr_segs,
			    sizeof(*new_iovec),
			    PVFS2_BUFMAP_GFP_FLAGS);
	if (new_iovec == NULL) {
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
		return -ENOMEM;
	}
	sizes = kcalloc(max_new_nr_segs,
			sizeof(*sizes),
			PVFS2_BUFMAP_GFP_FLAGS);
	if (sizes == NULL) {
		kfree(new_iovec);
		kfree(orig_iovec);
		gossip_err(
		    "split_iovecs: Could not allocate memory for %lu bytes!\n",
		    (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
		return -ENOMEM;
	}
	/* copy the passed in iovec to a temp structure */
	memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
	begin_seg = 0;
repeat:
	for (seg = begin_seg; seg < nr_segs; seg++) {
		/* guard against overrunning the caller-computed bound */
		if (tmpnew_nr_segs >= max_new_nr_segs ||
		    sizes_count >= max_new_nr_segs) {
			kfree(sizes);
			kfree(orig_iovec);
			kfree(new_iovec);
			gossip_err
			    ("split_iovecs: exceeded the index limit (%lu)\n",
			    tmpnew_nr_segs);
			return -EINVAL;
		}
		if (count + orig_iovec[seg].iov_len <
		    pvfs_bufmap_size_query()) {
			/* whole segment fits in the current block */
			count += orig_iovec[seg].iov_len;
			memcpy(&new_iovec[tmpnew_nr_segs],
			       &orig_iovec[seg],
			       sizeof(*new_iovec));
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
		} else {
			/*
			 * Segment reaches the block boundary: emit the part
			 * that fills the current block, shrink the scratch
			 * segment by the emitted amount, and restart the
			 * scan at this segment for the next block.
			 */
			new_iovec[tmpnew_nr_segs].iov_base =
			    orig_iovec[seg].iov_base;
			new_iovec[tmpnew_nr_segs].iov_len =
			    (pvfs_bufmap_size_query() - count);
			tmpnew_nr_segs++;
			sizes[sizes_count]++;
			sizes_count++;
			begin_seg = seg;
			orig_iovec[seg].iov_base +=
			    (pvfs_bufmap_size_query() - count);
			orig_iovec[seg].iov_len -=
			    (pvfs_bufmap_size_query() - count);
			count = 0;
			break;
		}
	}
	/* loop broke early -> another block remains to be filled */
	if (seg != nr_segs)
		goto repeat;
	else
		sizes_count++;	/* account for the final (partial) block */

	*new_nr_segs = tmpnew_nr_segs;
	/* new_iovec is freed by the caller */
	*new_vec = new_iovec;
	*seg_count = sizes_count;
	/* seg_array is also freed by the caller */
	*seg_array = sizes;
	kfree(orig_iovec);
	return 0;
}
401
402static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
403 ssize_t *total_count)
404{
405 unsigned long i;
406 long max_nr_iovecs;
407 ssize_t total;
408 ssize_t count;
409
410 total = 0;
411 count = 0;
412 max_nr_iovecs = 0;
413 for (i = 0; i < nr_segs; i++) {
414 const struct iovec *iv = &curr[i];
415
416 count += iv->iov_len;
417 if (unlikely((ssize_t) (count | iv->iov_len) < 0))
418 return -EINVAL;
419 if (total + iv->iov_len < pvfs_bufmap_size_query()) {
420 total += iv->iov_len;
421 max_nr_iovecs++;
422 } else {
423 total =
424 (total + iv->iov_len - pvfs_bufmap_size_query());
425 max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
426 }
427 }
428 *total_count = count;
429 return max_nr_iovecs;
430}
431
432/*
433 * Common entry point for read/write/readv/writev
434 * This function will dispatch it to either the direct I/O
435 * or buffered I/O path depending on the mount options and/or
436 * augmented/extended metadata attached to the file.
437 * Note: File extended attributes override any mount options.
438 */
439static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
440 loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
441{
442 struct inode *inode = file->f_mapping->host;
443 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
444 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
445 ssize_t ret;
446 ssize_t total_count;
447 unsigned int to_free;
448 size_t count;
449 unsigned long seg;
Mike Marshalleeaa3d42015-07-29 13:36:37 -0400450 unsigned long new_nr_segs;
451 unsigned long max_new_nr_segs;
452 unsigned long seg_count;
453 unsigned long *seg_array;
454 struct iovec *iovecptr;
455 struct iovec *ptr;
Mike Marshall5db11c22015-07-17 10:38:12 -0400456
457 total_count = 0;
458 ret = -EINVAL;
459 count = 0;
460 to_free = 0;
461
462 /* Compute total and max number of segments after split */
463 max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
Mike Marshall5db11c22015-07-17 10:38:12 -0400464
465 gossip_debug(GOSSIP_FILE_DEBUG,
466 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
467 __func__,
468 handle,
469 (int)count);
470
471 if (type == PVFS_IO_WRITE) {
472 gossip_debug(GOSSIP_FILE_DEBUG,
473 "%s(%pU): proceeding with offset : %llu, "
474 "size %d\n",
475 __func__,
476 handle,
477 llu(*offset),
478 (int)count);
479 }
480
481 if (count == 0) {
482 ret = 0;
483 goto out;
484 }
485
486 /*
487 * if the total size of data transfer requested is greater than
488 * the kernel-set blocksize of PVFS2, then we split the iovecs
489 * such that no iovec description straddles a block size limit
490 */
491
492 gossip_debug(GOSSIP_FILE_DEBUG,
493 "%s: pvfs_bufmap_size:%d\n",
494 __func__,
495 pvfs_bufmap_size_query());
496
497 if (count > pvfs_bufmap_size_query()) {
498 /*
499 * Split up the given iovec description such that
500 * no iovec descriptor straddles over the block-size limitation.
501 * This makes us our job easier to stage the I/O.
502 * In addition, this function will also compute an array
503 * with seg_count entries that will store the number of
504 * segments that straddle the block-size boundaries.
505 */
506 ret = split_iovecs(max_new_nr_segs, /* IN */
507 nr_segs, /* IN */
508 iov, /* IN */
509 &new_nr_segs, /* OUT */
510 &iovecptr, /* OUT */
511 &seg_count, /* OUT */
512 &seg_array); /* OUT */
513 if (ret < 0) {
514 gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
515 __func__,
516 ret);
517 goto out;
518 }
519 gossip_debug(GOSSIP_FILE_DEBUG,
520 "%s: Splitting iovecs from %lu to %lu"
521 " [max_new %lu]\n",
522 __func__,
523 nr_segs,
524 new_nr_segs,
525 max_new_nr_segs);
526 /* We must free seg_array and iovecptr */
527 to_free = 1;
528 } else {
529 new_nr_segs = nr_segs;
530 /* use the given iovec description */
531 iovecptr = (struct iovec *)iov;
532 /* There is only 1 element in the seg_array */
533 seg_count = 1;
534 /* and its value is the number of segments passed in */
535 seg_array = &nr_segs;
536 /* We dont have to free up anything */
537 to_free = 0;
538 }
539 ptr = iovecptr;
540
541 gossip_debug(GOSSIP_FILE_DEBUG,
542 "%s(%pU) %zd@%llu\n",
543 __func__,
544 handle,
545 count,
546 llu(*offset));
547 gossip_debug(GOSSIP_FILE_DEBUG,
548 "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
549 __func__,
550 handle,
551 new_nr_segs, seg_count);
552
553/* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
554#ifdef PVFS2_KERNEL_DEBUG
555 for (seg = 0; seg < new_nr_segs; seg++)
556 gossip_debug(GOSSIP_FILE_DEBUG,
557 "%s: %d) %p to %p [%d bytes]\n",
558 __func__,
559 (int)seg + 1,
560 iovecptr[seg].iov_base,
561 iovecptr[seg].iov_base + iovecptr[seg].iov_len,
562 (int)iovecptr[seg].iov_len);
563 for (seg = 0; seg < seg_count; seg++)
564 gossip_debug(GOSSIP_FILE_DEBUG,
565 "%s: %zd) %lu\n",
566 __func__,
567 seg + 1,
568 seg_array[seg]);
569#endif
570 seg = 0;
571 while (total_count < count) {
572 size_t each_count;
573 size_t amt_complete;
574
575 /* how much to transfer in this loop iteration */
576 each_count =
577 (((count - total_count) > pvfs_bufmap_size_query()) ?
578 pvfs_bufmap_size_query() :
579 (count - total_count));
580
581 gossip_debug(GOSSIP_FILE_DEBUG,
582 "%s(%pU): size of each_count(%d)\n",
583 __func__,
584 handle,
585 (int)each_count);
586 gossip_debug(GOSSIP_FILE_DEBUG,
587 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
588 __func__,
589 handle,
590 (int)*offset);
591
592 ret = wait_for_direct_io(type, inode, offset, ptr,
Mike Marshall4d1c4402015-09-04 10:31:16 -0400593 seg_array[seg], each_count, 0);
Mike Marshall5db11c22015-07-17 10:38:12 -0400594 gossip_debug(GOSSIP_FILE_DEBUG,
595 "%s(%pU): return from wait_for_io:%d\n",
596 __func__,
597 handle,
598 (int)ret);
599
600 if (ret < 0)
601 goto out;
602
603 /* advance the iovec pointer */
604 ptr += seg_array[seg];
605 seg++;
606 *offset += ret;
607 total_count += ret;
608 amt_complete = ret;
609
610 gossip_debug(GOSSIP_FILE_DEBUG,
611 "%s(%pU): AFTER wait_for_io: offset is %d\n",
612 __func__,
613 handle,
614 (int)*offset);
615
616 /*
617 * if we got a short I/O operations,
618 * fall out and return what we got so far
619 */
620 if (amt_complete < each_count)
621 break;
622 } /*end while */
623
624 if (total_count > 0)
625 ret = total_count;
626out:
627 if (to_free) {
628 kfree(iovecptr);
629 kfree(seg_array);
630 }
631 if (ret > 0) {
632 if (type == PVFS_IO_READ) {
633 file_accessed(file);
634 } else {
635 SetMtimeFlag(pvfs2_inode);
636 inode->i_mtime = CURRENT_TIME;
637 mark_inode_dirty_sync(inode);
638 }
639 }
640
641 gossip_debug(GOSSIP_FILE_DEBUG,
642 "%s(%pU): Value(%d) returned.\n",
643 __func__,
644 handle,
645 (int)ret);
646
647 return ret;
648}
649
650/*
651 * Read data from a specified offset in a file (referenced by inode).
652 * Data may be placed either in a user or kernel buffer.
653 */
654ssize_t pvfs2_inode_read(struct inode *inode,
655 char __user *buf,
656 size_t count,
657 loff_t *offset,
658 loff_t readahead_size)
659{
660 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
661 size_t bufmap_size;
662 struct iovec vec;
663 ssize_t ret = -EINVAL;
664
665 g_pvfs2_stats.reads++;
666
667 vec.iov_base = buf;
668 vec.iov_len = count;
669
670 bufmap_size = pvfs_bufmap_size_query();
671 if (count > bufmap_size) {
672 gossip_debug(GOSSIP_FILE_DEBUG,
673 "%s: count is too large (%zd/%zd)!\n",
674 __func__, count, bufmap_size);
675 return -EINVAL;
676 }
677
678 gossip_debug(GOSSIP_FILE_DEBUG,
679 "%s(%pU) %zd@%llu\n",
680 __func__,
681 &pvfs2_inode->refn.khandle,
682 count,
683 llu(*offset));
684
685 ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
Mike Marshall4d1c4402015-09-04 10:31:16 -0400686 count, readahead_size);
Mike Marshall5db11c22015-07-17 10:38:12 -0400687 if (ret > 0)
688 *offset += ret;
689
690 gossip_debug(GOSSIP_FILE_DEBUG,
691 "%s(%pU): Value(%zd) returned.\n",
692 __func__,
693 &pvfs2_inode->refn.khandle,
694 ret);
695
696 return ret;
697}
698
699static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
700{
701 struct file *file = iocb->ki_filp;
702 loff_t pos = *(&iocb->ki_pos);
703 ssize_t rc = 0;
704 unsigned long nr_segs = iter->nr_segs;
705
706 BUG_ON(iocb->private);
707
708 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
709
710 g_pvfs2_stats.reads++;
711
712 rc = do_readv_writev(PVFS_IO_READ,
713 file,
714 &pos,
715 iter->iov,
716 nr_segs);
717 iocb->ki_pos = pos;
718
719 return rc;
720}
721
722static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
723{
724 struct file *file = iocb->ki_filp;
725 loff_t pos = *(&iocb->ki_pos);
726 unsigned long nr_segs = iter->nr_segs;
727 ssize_t rc;
728
729 BUG_ON(iocb->private);
730
731 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
732
733 mutex_lock(&file->f_mapping->host->i_mutex);
734
735 /* Make sure generic_write_checks sees an up to date inode size. */
736 if (file->f_flags & O_APPEND) {
737 rc = pvfs2_inode_getattr(file->f_mapping->host,
738 PVFS_ATTR_SYS_SIZE);
739 if (rc) {
740 gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
741 __func__, rc);
742 goto out;
743 }
744 }
745
746 if (file->f_pos > i_size_read(file->f_mapping->host))
747 pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
748
749 rc = generic_write_checks(iocb, iter);
750
751 if (rc <= 0) {
752 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
753 __func__, rc);
754 goto out;
755 }
756
757 rc = do_readv_writev(PVFS_IO_WRITE,
758 file,
759 &pos,
760 iter->iov,
761 nr_segs);
762 if (rc < 0) {
763 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
764 __func__, rc);
765 goto out;
766 }
767
768 iocb->ki_pos = pos;
769 g_pvfs2_stats.writes++;
770
771out:
772
773 mutex_unlock(&file->f_mapping->host->i_mutex);
774 return rc;
775}
776
777/*
778 * Perform a miscellaneous operation on a file.
779 */
Mike Marshall84d02152015-07-28 13:27:51 -0400780static long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
Mike Marshall5db11c22015-07-17 10:38:12 -0400781{
782 int ret = -ENOTTY;
783 __u64 val = 0;
784 unsigned long uval;
785
786 gossip_debug(GOSSIP_FILE_DEBUG,
787 "pvfs2_ioctl: called with cmd %d\n",
788 cmd);
789
790 /*
791 * we understand some general ioctls on files, such as the immutable
792 * and append flags
793 */
794 if (cmd == FS_IOC_GETFLAGS) {
795 val = 0;
796 ret = pvfs2_xattr_get_default(file->f_path.dentry,
797 "user.pvfs2.meta_hint",
798 &val,
799 sizeof(val),
800 0);
801 if (ret < 0 && ret != -ENODATA)
802 return ret;
803 else if (ret == -ENODATA)
804 val = 0;
805 uval = val;
806 gossip_debug(GOSSIP_FILE_DEBUG,
807 "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
808 (unsigned long long)uval);
809 return put_user(uval, (int __user *)arg);
810 } else if (cmd == FS_IOC_SETFLAGS) {
811 ret = 0;
812 if (get_user(uval, (int __user *)arg))
813 return -EFAULT;
814 /*
815 * PVFS_MIRROR_FL is set internally when the mirroring mode
816 * is turned on for a file. The user is not allowed to turn
817 * on this bit, but the bit is present if the user first gets
818 * the flags and then updates the flags with some new
819 * settings. So, we ignore it in the following edit. bligon.
820 */
821 if ((uval & ~PVFS_MIRROR_FL) &
822 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
823 gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
824 return -EINVAL;
825 }
826 val = uval;
827 gossip_debug(GOSSIP_FILE_DEBUG,
828 "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
829 (unsigned long long)val);
830 ret = pvfs2_xattr_set_default(file->f_path.dentry,
831 "user.pvfs2.meta_hint",
832 &val,
833 sizeof(val),
834 0,
835 0);
836 }
837
838 return ret;
839}
840
841/*
842 * Memory map a region of a file.
843 */
844static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
845{
846 gossip_debug(GOSSIP_FILE_DEBUG,
847 "pvfs2_file_mmap: called on %s\n",
848 (file ?
849 (char *)file->f_path.dentry->d_name.name :
850 (char *)"Unknown"));
851
852 /* set the sequential readahead hint */
853 vma->vm_flags |= VM_SEQ_READ;
854 vma->vm_flags &= ~VM_RAND_READ;
Martin Brandenburg35390802015-09-30 13:11:54 -0400855
856 /* Use readonly mmap since we cannot support writable maps. */
857 return generic_file_readonly_mmap(file, vma);
Mike Marshall5db11c22015-07-17 10:38:12 -0400858}
859
860#define mapping_nrpages(idata) ((idata)->nrpages)
861
862/*
863 * Called to notify the module that there are no more references to
864 * this file (i.e. no processes have it open).
865 *
866 * \note Not called when each file is closed.
867 */
Mike Marshall84d02152015-07-28 13:27:51 -0400868static int pvfs2_file_release(struct inode *inode, struct file *file)
Mike Marshall5db11c22015-07-17 10:38:12 -0400869{
870 gossip_debug(GOSSIP_FILE_DEBUG,
871 "pvfs2_file_release: called on %s\n",
872 file->f_path.dentry->d_name.name);
873
874 pvfs2_flush_inode(inode);
875
876 /*
Mike Marshall54804942015-10-05 13:44:24 -0400877 * remove all associated inode pages from the page cache and mmap
878 * readahead cache (if any); this forces an expensive refresh of
879 * data for the next caller of mmap (or 'get_block' accesses)
Mike Marshall5db11c22015-07-17 10:38:12 -0400880 */
881 if (file->f_path.dentry->d_inode &&
882 file->f_path.dentry->d_inode->i_mapping &&
883 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
884 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
885 0);
886 return 0;
887}
888
889/*
890 * Push all data for a specific file onto permanent storage.
891 */
Mike Marshall84d02152015-07-28 13:27:51 -0400892static int pvfs2_fsync(struct file *file,
893 loff_t start,
894 loff_t end,
895 int datasync)
Mike Marshall5db11c22015-07-17 10:38:12 -0400896{
897 int ret = -EINVAL;
898 struct pvfs2_inode_s *pvfs2_inode =
899 PVFS2_I(file->f_path.dentry->d_inode);
900 struct pvfs2_kernel_op_s *new_op = NULL;
901
902 /* required call */
903 filemap_write_and_wait_range(file->f_mapping, start, end);
904
905 new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
906 if (!new_op)
907 return -ENOMEM;
908 new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
909
910 ret = service_operation(new_op,
911 "pvfs2_fsync",
912 get_interruptible_flag(file->f_path.dentry->d_inode));
913
914 gossip_debug(GOSSIP_FILE_DEBUG,
915 "pvfs2_fsync got return value of %d\n",
916 ret);
917
918 op_release(new_op);
919
920 pvfs2_flush_inode(file->f_path.dentry->d_inode);
921 return ret;
922}
923
924/*
925 * Change the file pointer position for an instance of an open file.
926 *
927 * \note If .llseek is overriden, we must acquire lock as described in
928 * Documentation/filesystems/Locking.
929 *
930 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
931 * require much changes to the FS
932 */
Mike Marshall84d02152015-07-28 13:27:51 -0400933static loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
Mike Marshall5db11c22015-07-17 10:38:12 -0400934{
935 int ret = -EINVAL;
936 struct inode *inode = file->f_path.dentry->d_inode;
937
938 if (!inode) {
939 gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
940 return ret;
941 }
942
943 if (origin == PVFS2_SEEK_END) {
944 /*
945 * revalidate the inode's file size.
946 * NOTE: We are only interested in file size here,
947 * so we set mask accordingly.
948 */
949 ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
950 if (ret) {
951 gossip_debug(GOSSIP_FILE_DEBUG,
952 "%s:%s:%d calling make bad inode\n",
953 __FILE__,
954 __func__,
955 __LINE__);
956 pvfs2_make_bad_inode(inode);
957 return ret;
958 }
959 }
960
961 gossip_debug(GOSSIP_FILE_DEBUG,
Mike Marshall54804942015-10-05 13:44:24 -0400962 "pvfs2_file_llseek: offset is %ld | origin is %d"
963 " | inode size is %lu\n",
Mike Marshall5db11c22015-07-17 10:38:12 -0400964 (long)offset,
965 origin,
966 (unsigned long)file->f_path.dentry->d_inode->i_size);
967
968 return generic_file_llseek(file, offset, origin);
969}
970
971/*
972 * Support local locks (locks that only this kernel knows about)
973 * if Orangefs was mounted -o local_lock.
974 */
Mike Marshall84d02152015-07-28 13:27:51 -0400975static int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
Mike Marshall5db11c22015-07-17 10:38:12 -0400976{
Mike Marshallf957ae22015-09-24 12:53:05 -0400977 int rc = -EINVAL;
Mike Marshall5db11c22015-07-17 10:38:12 -0400978
979 if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
980 if (cmd == F_GETLK) {
981 rc = 0;
982 posix_test_lock(filp, fl);
983 } else {
984 rc = posix_lock_file(filp, fl, NULL);
985 }
986 }
987
988 return rc;
989}
990
/** PVFS2 implementation of VFS file operations */
const struct file_operations pvfs2_file_operations = {
	.llseek = pvfs2_file_llseek,
	.read_iter = pvfs2_file_read_iter,
	.write_iter = pvfs2_file_write_iter,
	.lock = pvfs2_lock,		/* local locks only (-o local_lock) */
	.unlocked_ioctl = pvfs2_ioctl,	/* FS_IOC_GETFLAGS/SETFLAGS */
	.mmap = pvfs2_file_mmap,	/* read-only mappings only */
	.open = generic_file_open,
	.release = pvfs2_file_release,
	.fsync = pvfs2_fsync,
};