blob: a555d0a83fe95efa33a06248633869e89814e807 [file] [log] [blame]
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files and fixing the initial implementation
 * bugs.
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
 *
 */
19#include <linux/fs.h>
20#include <linux/file.h>
21#include <linux/pagemap.h>
22#include <linux/pipe_fs_i.h>
23#include <linux/mm_inline.h>
Jens Axboe5abc97a2006-03-30 15:16:46 +020024#include <linux/swap.h>
Jeff Garzika0f06782006-03-30 23:06:13 -050025#include <linux/module.h>
Jens Axboe5274f052006-03-30 15:15:30 +020026
/*
 * Passed to the actors: per-call state that move_from_pipe() fills in
 * and hands to its splice_actor for every pipe buffer consumed.
 */
struct splice_desc {
	unsigned int len, total_len;	/* current and remaining length */
	unsigned int flags;		/* splice flags */
	struct file *file;		/* file to read/write */
	loff_t pos;			/* file position */
};
36
/*
 * Try to take the page backing 'buf' away from the page cache so it can
 * be handed to a new owner (the SPLICE_F_MOVE path).  On success the page
 * is detached from its mapping and removed from the LRU, buf->stolen is
 * set, and 0 is returned.  Returns 1 if the page cannot be detached (it
 * still has other users) and the caller must fall back to copying.
 *
 * The page is expected to be locked and uptodate here - the buffer's
 * ->map hook takes the page lock - hence the WARN_ONs.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	WARN_ON(!PageLocked(page));
	WARN_ON(!PageUptodate(page));

	/* refuses (returns 0) when the page still has extra references */
	if (!remove_mapping(page_mapping(page), page))
		return 1;

	/* also pull the page off the LRU, under the zone's lru lock */
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);

		spin_lock_irq(&zone->lru_lock);
		BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irq(&zone->lru_lock);
	}

	buf->stolen = 1;
	return 0;
}
61
/*
 * Drop the pipe buffer's reference on its page and reset the buffer's
 * state so the slot can be reused.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->page = NULL;
	buf->stolen = 0;
}
69
/*
 * Map a page-cache backed pipe buffer for access.  Takes the page lock
 * and re-checks that the page is still uptodate and still attached to a
 * mapping (it may have been truncated/invalidated since being spliced
 * in).  Returns a kmap()ed address on success, or an ERR_PTR.  The page
 * stays locked on success; ->unmap undoes both the kmap and the lock.
 */
static void *page_cache_pipe_buf_map(struct file *file,
				     struct pipe_inode_info *info,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	lock_page(page);

	if (!PageUptodate(page)) {
		unlock_page(page);
		return ERR_PTR(-EIO);
	}

	/* mapping gone: page was truncated away, there is no data here */
	if (!page->mapping) {
		unlock_page(page);
		return ERR_PTR(-ENODATA);
	}

	return kmap(buf->page);
}
90
/*
 * Undo page_cache_pipe_buf_unmap's counterpart ->map: drop the kmap and,
 * unless the page was stolen (the new owner keeps it locked in that
 * case), release the page lock taken by ->map.
 */
static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
				      struct pipe_buffer *buf)
{
	if (!buf->stolen)
		unlock_page(buf->page);
	kunmap(buf->page);
}
98
/*
 * Buffer operations for pipe buffers whose pages come from the page
 * cache.  can_merge is 0: these pages must not be appended to in place.
 */
static struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = page_cache_pipe_buf_map,
	.unmap = page_cache_pipe_buf_unmap,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
};
106
107static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
108 int nr_pages, unsigned long offset,
Linus Torvalds29e35092006-04-02 12:46:35 -0700109 unsigned long len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200110{
111 struct pipe_inode_info *info;
112 int ret, do_wakeup, i;
113
114 ret = 0;
115 do_wakeup = 0;
116 i = 0;
117
118 mutex_lock(PIPE_MUTEX(*inode));
119
120 info = inode->i_pipe;
121 for (;;) {
122 int bufs;
123
124 if (!PIPE_READERS(*inode)) {
125 send_sig(SIGPIPE, current, 0);
126 if (!ret)
127 ret = -EPIPE;
128 break;
129 }
130
131 bufs = info->nrbufs;
132 if (bufs < PIPE_BUFFERS) {
133 int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
134 struct pipe_buffer *buf = info->bufs + newbuf;
135 struct page *page = pages[i++];
136 unsigned long this_len;
137
138 this_len = PAGE_CACHE_SIZE - offset;
139 if (this_len > len)
140 this_len = len;
141
142 buf->page = page;
143 buf->offset = offset;
144 buf->len = this_len;
145 buf->ops = &page_cache_pipe_buf_ops;
146 info->nrbufs = ++bufs;
147 do_wakeup = 1;
148
149 ret += this_len;
150 len -= this_len;
151 offset = 0;
152 if (!--nr_pages)
153 break;
154 if (!len)
155 break;
156 if (bufs < PIPE_BUFFERS)
157 continue;
158
159 break;
160 }
161
Linus Torvalds29e35092006-04-02 12:46:35 -0700162 if (flags & SPLICE_F_NONBLOCK) {
163 if (!ret)
164 ret = -EAGAIN;
165 break;
166 }
167
Jens Axboe5274f052006-03-30 15:15:30 +0200168 if (signal_pending(current)) {
169 if (!ret)
170 ret = -ERESTARTSYS;
171 break;
172 }
173
174 if (do_wakeup) {
175 wake_up_interruptible_sync(PIPE_WAIT(*inode));
176 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
177 POLL_IN);
178 do_wakeup = 0;
179 }
180
181 PIPE_WAITING_WRITERS(*inode)++;
182 pipe_wait(inode);
183 PIPE_WAITING_WRITERS(*inode)--;
184 }
185
186 mutex_unlock(PIPE_MUTEX(*inode));
187
188 if (do_wakeup) {
189 wake_up_interruptible(PIPE_WAIT(*inode));
190 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
191 }
192
193 while (i < nr_pages)
194 page_cache_release(pages[i++]);
195
196 return ret;
197}
198
199static int __generic_file_splice_read(struct file *in, struct inode *pipe,
Linus Torvalds29e35092006-04-02 12:46:35 -0700200 size_t len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200201{
202 struct address_space *mapping = in->f_mapping;
203 unsigned int offset, nr_pages;
204 struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
205 struct page *page;
206 pgoff_t index, pidx;
207 int i, j;
208
209 index = in->f_pos >> PAGE_CACHE_SHIFT;
210 offset = in->f_pos & ~PAGE_CACHE_MASK;
211 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
212
213 if (nr_pages > PIPE_BUFFERS)
214 nr_pages = PIPE_BUFFERS;
215
216 /*
217 * initiate read-ahead on this page range
218 */
219 do_page_cache_readahead(mapping, in, index, nr_pages);
220
221 /*
222 * Get as many pages from the page cache as possible..
223 * Start IO on the page cache entries we create (we
224 * can assume that any pre-existing ones we find have
225 * already had IO started on them).
226 */
227 i = find_get_pages(mapping, index, nr_pages, pages);
228
229 /*
230 * common case - we found all pages and they are contiguous,
231 * kick them off
232 */
233 if (i && (pages[i - 1]->index == index + i - 1))
234 goto splice_them;
235
236 /*
237 * fill shadow[] with pages at the right locations, so we only
238 * have to fill holes
239 */
Jens Axboe53cd9ae2006-04-02 23:04:21 +0200240 memset(shadow, 0, nr_pages * sizeof(struct page *));
241 for (j = 0; j < i; j++)
242 shadow[pages[j]->index - index] = pages[j];
Jens Axboe5274f052006-03-30 15:15:30 +0200243
244 /*
245 * now fill in the holes
246 */
247 for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
248 int error;
249
250 if (shadow[i])
251 continue;
252
253 /*
254 * no page there, look one up / create it
255 */
256 page = find_or_create_page(mapping, pidx,
257 mapping_gfp_mask(mapping));
258 if (!page)
259 break;
260
261 if (PageUptodate(page))
262 unlock_page(page);
263 else {
264 error = mapping->a_ops->readpage(in, page);
265
266 if (unlikely(error)) {
267 page_cache_release(page);
268 break;
269 }
270 }
271 shadow[i] = page;
272 }
273
274 if (!i) {
275 for (i = 0; i < nr_pages; i++) {
276 if (shadow[i])
277 page_cache_release(shadow[i]);
278 }
279 return 0;
280 }
281
282 memcpy(pages, shadow, i * sizeof(struct page *));
283
284 /*
285 * Now we splice them into the pipe..
286 */
287splice_them:
Linus Torvalds29e35092006-04-02 12:46:35 -0700288 return move_to_pipe(pipe, pages, i, offset, len, flags);
Jens Axboe5274f052006-03-30 15:15:30 +0200289}
290
291ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
292 size_t len, unsigned int flags)
293{
294 ssize_t spliced;
295 int ret;
296
297 ret = 0;
298 spliced = 0;
299 while (len) {
Linus Torvalds29e35092006-04-02 12:46:35 -0700300 ret = __generic_file_splice_read(in, pipe, len, flags);
Jens Axboe5274f052006-03-30 15:15:30 +0200301
302 if (ret <= 0)
303 break;
304
305 in->f_pos += ret;
306 len -= ret;
307 spliced += ret;
Linus Torvalds29e35092006-04-02 12:46:35 -0700308
309 if (!(flags & SPLICE_F_NONBLOCK))
310 continue;
311 ret = -EAGAIN;
312 break;
Jens Axboe5274f052006-03-30 15:15:30 +0200313 }
314
315 if (spliced)
316 return spliced;
317
318 return ret;
319}
320
/*
 * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
 * Splice actor for generic_splice_sendpage(); returns 0 when the whole
 * chunk was sent, -EIO on a short send, or the ->map error.
 */
static int pipe_to_sendpage(struct pipe_inode_info *info,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	unsigned int offset;
	ssize_t ret;
	void *ptr;

	/*
	 * sub-optimal, but we are limited by the pipe ->map. we don't
	 * need a kmap'ed buffer here, we just want to make sure we
	 * have the page pinned if the pipe page originates from the
	 * page cache
	 */
	ptr = buf->ops->map(file, info, buf);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	offset = pos & ~PAGE_CACHE_MASK;

	/* final argument: "more data follows" if this isn't the last chunk */
	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
				   sd->len < sd->total_len);

	buf->ops->unmap(info, buf);
	if (ret == sd->len)
		return 0;

	return -EIO;
}
354
355/*
356 * This is a little more tricky than the file -> pipe splicing. There are
357 * basically three cases:
358 *
359 * - Destination page already exists in the address space and there
360 * are users of it. For that case we have no other option that
361 * copying the data. Tough luck.
362 * - Destination page already exists in the address space, but there
363 * are no users of it. Make sure it's uptodate, then drop it. Fall
364 * through to last case.
365 * - Destination page does not exist, we can add the pipe page to
366 * the page cache and avoid the copy.
367 *
368 * For now we just do the slower thing and always copy pages over, it's
369 * easier than migrating pages from the pipe to the target file. For the
370 * case of doing file | file splicing, the migrate approach had some LRU
371 * nastiness...
372 */
373static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
374 struct splice_desc *sd)
375{
376 struct file *file = sd->file;
377 struct address_space *mapping = file->f_mapping;
378 unsigned int offset;
379 struct page *page;
Jens Axboe5274f052006-03-30 15:15:30 +0200380 pgoff_t index;
Jens Axboe5abc97a2006-03-30 15:16:46 +0200381 char *src;
Jens Axboe5274f052006-03-30 15:15:30 +0200382 int ret;
383
384 /*
385 * after this, page will be locked and unmapped
386 */
387 src = buf->ops->map(file, info, buf);
388 if (IS_ERR(src))
389 return PTR_ERR(src);
390
391 index = sd->pos >> PAGE_CACHE_SHIFT;
392 offset = sd->pos & ~PAGE_CACHE_MASK;
393
Jens Axboe5274f052006-03-30 15:15:30 +0200394 /*
Jens Axboe5abc97a2006-03-30 15:16:46 +0200395 * reuse buf page, if SPLICE_F_MOVE is set
Jens Axboe5274f052006-03-30 15:15:30 +0200396 */
Jens Axboe5abc97a2006-03-30 15:16:46 +0200397 if (sd->flags & SPLICE_F_MOVE) {
398 if (buf->ops->steal(info, buf))
399 goto find_page;
Jens Axboe5274f052006-03-30 15:15:30 +0200400
Jens Axboe5abc97a2006-03-30 15:16:46 +0200401 page = buf->page;
402 if (add_to_page_cache_lru(page, mapping, index,
403 mapping_gfp_mask(mapping)))
404 goto find_page;
405 } else {
406find_page:
407 ret = -ENOMEM;
408 page = find_or_create_page(mapping, index,
409 mapping_gfp_mask(mapping));
410 if (!page)
411 goto out;
Jens Axboe5274f052006-03-30 15:15:30 +0200412
Jens Axboe5abc97a2006-03-30 15:16:46 +0200413 /*
414 * If the page is uptodate, it is also locked. If it isn't
415 * uptodate, we can mark it uptodate if we are filling the
416 * full page. Otherwise we need to read it in first...
417 */
418 if (!PageUptodate(page)) {
419 if (sd->len < PAGE_CACHE_SIZE) {
420 ret = mapping->a_ops->readpage(file, page);
421 if (unlikely(ret))
422 goto out;
423
424 lock_page(page);
425
426 if (!PageUptodate(page)) {
427 /*
428 * page got invalidated, repeat
429 */
430 if (!page->mapping) {
431 unlock_page(page);
432 page_cache_release(page);
433 goto find_page;
434 }
435 ret = -EIO;
436 goto out;
Jens Axboe5274f052006-03-30 15:15:30 +0200437 }
Jens Axboe5abc97a2006-03-30 15:16:46 +0200438 } else {
439 WARN_ON(!PageLocked(page));
440 SetPageUptodate(page);
Jens Axboe5274f052006-03-30 15:15:30 +0200441 }
Jens Axboe5274f052006-03-30 15:15:30 +0200442 }
443 }
444
445 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
446 if (ret)
447 goto out;
448
Jens Axboe5abc97a2006-03-30 15:16:46 +0200449 if (!buf->stolen) {
450 char *dst = kmap_atomic(page, KM_USER0);
451
452 memcpy(dst + offset, src + buf->offset, sd->len);
453 flush_dcache_page(page);
454 kunmap_atomic(dst, KM_USER0);
455 }
Jens Axboe5274f052006-03-30 15:15:30 +0200456
457 ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
458 if (ret < 0)
459 goto out;
460
461 set_page_dirty(page);
462 ret = write_one_page(page, 0);
463out:
464 if (ret < 0)
465 unlock_page(page);
Jens Axboe5abc97a2006-03-30 15:16:46 +0200466 if (!buf->stolen)
467 page_cache_release(page);
Jens Axboe5274f052006-03-30 15:15:30 +0200468 buf->ops->unmap(info, buf);
469 return ret;
470}
471
/*
 * A splice actor consumes one pipe buffer on behalf of move_from_pipe().
 * Returns 0 on success or a negative errno; -ENODATA stops the loop
 * without being reported to the caller as an error.
 */
typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
			   struct splice_desc *);
474
/*
 * Drain the pipe on 'inode' into 'out', feeding each pipe buffer to
 * 'actor' (pipe_to_file or pipe_to_sendpage) until 'len' bytes have been
 * consumed, the pipe empties with no writers left, a signal arrives, or
 * the actor fails.  Runs under the pipe mutex; on return, out->f_pos is
 * updated (under the target inode's i_mutex) to the final position.
 * Returns bytes moved, or a negative errno if nothing was moved.
 */
static ssize_t move_from_pipe(struct inode *inode, struct file *out,
			      size_t len, unsigned int flags,
			      splice_actor *actor)
{
	struct pipe_inode_info *info;
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = out->f_pos;

	mutex_lock(PIPE_MUTEX(*inode));

	info = inode->i_pipe;
	for (;;) {
		int bufs = info->nrbufs;

		if (bufs) {
			int curbuf = info->curbuf;
			struct pipe_buffer *buf = info->bufs + curbuf;
			struct pipe_buf_operations *ops = buf->ops;

			/* never hand the actor more than remains requested */
			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(info, buf, &sd);
			if (err) {
				/* -ENODATA stops the loop but isn't an error */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += sd.len;
			buf->offset += sd.len;
			buf->len -= sd.len;
			if (!buf->len) {
				/* buffer fully consumed - retire the slot */
				buf->ops = NULL;
				ops->release(info, buf);
				curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
				info->curbuf = curbuf;
				info->nrbufs = --bufs;
				do_wakeup = 1;
			}

			sd.pos += sd.len;
			sd.total_len -= sd.len;
			if (!sd.total_len)
				break;
		}

		if (bufs)
			continue;
		if (!PIPE_WRITERS(*inode))
			break;
		if (!PIPE_WAITING_WRITERS(*inode)) {
			/* pipe empty and nobody queued to refill it */
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			/* free slots exist now - wake blocked pipe writers */
			wake_up_interruptible_sync(PIPE_WAIT(*inode));
			kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(inode);
	}

	mutex_unlock(PIPE_MUTEX(*inode));

	if (do_wakeup) {
		wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
	}

	/* publish the new file position under the inode mutex */
	mutex_lock(&out->f_mapping->host->i_mutex);
	out->f_pos = sd.pos;
	mutex_unlock(&out->f_mapping->host->i_mutex);
	return ret;

}
575
/*
 * Splice data from the pipe on 'inode' into the regular file 'out',
 * copying (or moving, with SPLICE_F_MOVE) each buffer into the page
 * cache via the pipe_to_file actor.
 */
ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
				  size_t len, unsigned int flags)
{
	return move_from_pipe(inode, out, len, flags, pipe_to_file);
}
581
/*
 * Splice data from the pipe on 'inode' to 'out' using the target's
 * ->sendpage() via the pipe_to_sendpage actor (socket destinations).
 */
ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
				size_t len, unsigned int flags)
{
	return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
}
587
/*
 * Module-visible entry points.
 * NOTE(review): generic_splice_sendpage is not exported here - confirm
 * that no modular user needs it before relying on that.
 */
EXPORT_SYMBOL(generic_file_splice_write);
EXPORT_SYMBOL(generic_file_splice_read);
590
Jens Axboe5274f052006-03-30 15:15:30 +0200591static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
592 unsigned int flags)
593{
594 loff_t pos;
595 int ret;
596
597 if (!out->f_op || !out->f_op->splice_write)
598 return -EINVAL;
599
600 if (!(out->f_mode & FMODE_WRITE))
601 return -EBADF;
602
603 pos = out->f_pos;
604 ret = rw_verify_area(WRITE, out, &pos, len);
605 if (unlikely(ret < 0))
606 return ret;
607
608 return out->f_op->splice_write(pipe, out, len, flags);
609}
610
611static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
612 unsigned int flags)
613{
614 loff_t pos, isize, left;
615 int ret;
616
617 if (!in->f_op || !in->f_op->splice_read)
618 return -EINVAL;
619
620 if (!(in->f_mode & FMODE_READ))
621 return -EBADF;
622
623 pos = in->f_pos;
624 ret = rw_verify_area(READ, in, &pos, len);
625 if (unlikely(ret < 0))
626 return ret;
627
628 isize = i_size_read(in->f_mapping->host);
629 if (unlikely(in->f_pos >= isize))
630 return 0;
631
632 left = isize - in->f_pos;
633 if (left < len)
634 len = left;
635
636 return in->f_op->splice_read(in, pipe, len, flags);
637}
638
639static long do_splice(struct file *in, struct file *out, size_t len,
640 unsigned int flags)
641{
642 struct inode *pipe;
643
644 pipe = in->f_dentry->d_inode;
645 if (pipe->i_pipe)
646 return do_splice_from(pipe, out, len, flags);
647
648 pipe = out->f_dentry->d_inode;
649 if (pipe->i_pipe)
650 return do_splice_to(in, pipe, len, flags);
651
652 return -EINVAL;
653}
654
655asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
656{
657 long error;
658 struct file *in, *out;
659 int fput_in, fput_out;
660
661 if (unlikely(!len))
662 return 0;
663
664 error = -EBADF;
665 in = fget_light(fdin, &fput_in);
666 if (in) {
667 if (in->f_mode & FMODE_READ) {
668 out = fget_light(fdout, &fput_out);
669 if (out) {
670 if (out->f_mode & FMODE_WRITE)
671 error = do_splice(in, out, len, flags);
672 fput_light(out, fput_out);
673 }
674 }
675
676 fput_light(in, fput_in);
677 }
678
679 return error;
680}