blob: 27f5e3738a7bf62709f781b9dcd6aac9dc56d4a6 [file] [log] [blame]
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
20#include <linux/fs.h>
21#include <linux/file.h>
22#include <linux/pagemap.h>
23#include <linux/pipe_fs_i.h>
24#include <linux/mm_inline.h>
Jens Axboe5abc97a2006-03-30 15:16:46 +020025#include <linux/swap.h>
Jens Axboe4f6f0bd2006-04-02 23:04:46 +020026#include <linux/writeback.h>
27#include <linux/buffer_head.h>
Jeff Garzika0f06782006-03-30 23:06:13 -050028#include <linux/module.h>
Jens Axboe4f6f0bd2006-04-02 23:04:46 +020029#include <linux/syscalls.h>
Jens Axboe912d35f2006-04-26 10:59:21 +020030#include <linux/uio.h>
Jens Axboe5274f052006-03-30 15:15:30 +020031
/*
 * Describes the part of one page that carries data: 'len' bytes starting
 * 'offset' bytes into the page. splice_to_pipe() copies these two values
 * into the pipe buffer's ->offset and ->len.
 */
struct partial_page {
	unsigned int offset;	/* byte offset of the data inside the page */
	unsigned int len;	/* number of data bytes from 'offset' */
};
36
/*
 * Passed to splice_to_pipe(): the set of pages (and the used range within
 * each of them) that should be linked into a pipe, plus the buffer
 * operations to attach to every pipe buffer created from them.
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	struct pipe_buf_operations *ops;/* ops associated with output pipe */
};
47
Jens Axboe83f91352006-04-02 23:05:09 +020048/*
49 * Attempt to steal a page from a pipe buffer. This should perhaps go into
50 * a vm helper function, it's already simplified quite a bit by the
51 * addition of remove_mapping(). If success is returned, the caller may
52 * attempt to reuse this page for another destination.
53 */
Jens Axboe5abc97a2006-03-30 15:16:46 +020054static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
55 struct pipe_buffer *buf)
56{
57 struct page *page = buf->page;
Jens Axboe4f6f0bd2006-04-02 23:04:46 +020058 struct address_space *mapping = page_mapping(page);
Jens Axboe5abc97a2006-03-30 15:16:46 +020059
Jens Axboe9e0267c2006-04-19 15:57:31 +020060 lock_page(page);
61
Jens Axboe5abc97a2006-03-30 15:16:46 +020062 WARN_ON(!PageUptodate(page));
63
Jens Axboead8d6f02006-04-02 23:10:32 +020064 /*
65 * At least for ext2 with nobh option, we need to wait on writeback
66 * completing on this page, since we'll remove it from the pagecache.
67 * Otherwise truncate wont wait on the page, allowing the disk
68 * blocks to be reused by someone else before we actually wrote our
69 * data to them. fs corruption ensues.
70 */
71 wait_on_page_writeback(page);
72
Jens Axboe4f6f0bd2006-04-02 23:04:46 +020073 if (PagePrivate(page))
74 try_to_release_page(page, mapping_gfp_mask(mapping));
75
Jens Axboe9e0267c2006-04-19 15:57:31 +020076 if (!remove_mapping(mapping, page)) {
77 unlock_page(page);
Jens Axboe5abc97a2006-03-30 15:16:46 +020078 return 1;
Jens Axboe9e0267c2006-04-19 15:57:31 +020079 }
Jens Axboe5abc97a2006-03-30 15:16:46 +020080
Jens Axboe5abc97a2006-03-30 15:16:46 +020081 return 0;
82}
83
Jens Axboe5274f052006-03-30 15:15:30 +020084static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
85 struct pipe_buffer *buf)
86{
87 page_cache_release(buf->page);
Jens Axboe5274f052006-03-30 15:15:30 +020088}
89
Jens Axboef84d7512006-05-01 19:59:03 +020090static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
91 struct pipe_buffer *buf)
Jens Axboe5274f052006-03-30 15:15:30 +020092{
93 struct page *page = buf->page;
Jens Axboe49d0b212006-04-10 09:04:41 +020094 int err;
Jens Axboe5274f052006-03-30 15:15:30 +020095
96 if (!PageUptodate(page)) {
Jens Axboe49d0b212006-04-10 09:04:41 +020097 lock_page(page);
98
99 /*
100 * Page got truncated/unhashed. This will cause a 0-byte
Ingo Molnar73d62d82006-04-11 13:57:21 +0200101 * splice, if this is the first page.
Jens Axboe49d0b212006-04-10 09:04:41 +0200102 */
103 if (!page->mapping) {
104 err = -ENODATA;
105 goto error;
106 }
107
108 /*
Ingo Molnar73d62d82006-04-11 13:57:21 +0200109 * Uh oh, read-error from disk.
Jens Axboe49d0b212006-04-10 09:04:41 +0200110 */
111 if (!PageUptodate(page)) {
112 err = -EIO;
113 goto error;
114 }
115
116 /*
Jens Axboef84d7512006-05-01 19:59:03 +0200117 * Page is ok afterall, we are done.
Jens Axboe49d0b212006-04-10 09:04:41 +0200118 */
Jens Axboe5274f052006-03-30 15:15:30 +0200119 unlock_page(page);
Jens Axboe5274f052006-03-30 15:15:30 +0200120 }
121
Jens Axboef84d7512006-05-01 19:59:03 +0200122 return 0;
Jens Axboe49d0b212006-04-10 09:04:41 +0200123error:
124 unlock_page(page);
Jens Axboef84d7512006-05-01 19:59:03 +0200125 return err;
Jens Axboe70524492006-04-11 15:51:17 +0200126}
127
/*
 * Buffer operations for pipe buffers that reference page cache pages.
 * __generic_file_splice_read() installs these via its splice_pipe_desc.
 * Pinning, releasing and stealing use the page cache aware helpers defined
 * in this file; mapping and reference grabbing use the generic pipe helpers.
 */
static struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
137
Jens Axboe912d35f2006-04-26 10:59:21 +0200138static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
139 struct pipe_buffer *buf)
140{
Jens Axboe7afa6fd2006-05-01 20:02:33 +0200141 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
142 return 1;
143
Jens Axboe330ab712006-05-02 15:29:57 +0200144 return generic_pipe_buf_steal(pipe, buf);
Jens Axboe912d35f2006-04-26 10:59:21 +0200145}
146
/*
 * Buffer operations for pipe buffers that carry user pages. They differ
 * from page_cache_pipe_buf_ops in using generic_pipe_buf_pin() for ->pin
 * and user_page_pipe_buf_steal() for ->steal.
 */
static struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
156
Jens Axboe83f91352006-04-02 23:05:09 +0200157/*
158 * Pipe output worker. This sets up our pipe format with the page cache
159 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
160 */
Jens Axboe00522fb2006-04-26 14:39:29 +0200161static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
162 struct splice_pipe_desc *spd)
Jens Axboe5274f052006-03-30 15:15:30 +0200163{
Jens Axboe912d35f2006-04-26 10:59:21 +0200164 int ret, do_wakeup, page_nr;
Jens Axboe5274f052006-03-30 15:15:30 +0200165
166 ret = 0;
167 do_wakeup = 0;
Jens Axboe912d35f2006-04-26 10:59:21 +0200168 page_nr = 0;
Jens Axboe5274f052006-03-30 15:15:30 +0200169
Ingo Molnar3a326a22006-04-10 15:18:35 +0200170 if (pipe->inode)
171 mutex_lock(&pipe->inode->i_mutex);
Jens Axboe5274f052006-03-30 15:15:30 +0200172
Jens Axboe5274f052006-03-30 15:15:30 +0200173 for (;;) {
Ingo Molnar3a326a22006-04-10 15:18:35 +0200174 if (!pipe->readers) {
Jens Axboe5274f052006-03-30 15:15:30 +0200175 send_sig(SIGPIPE, current, 0);
176 if (!ret)
177 ret = -EPIPE;
178 break;
179 }
180
Jens Axboe6f767b02006-04-11 13:53:56 +0200181 if (pipe->nrbufs < PIPE_BUFFERS) {
182 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
Ingo Molnar3a326a22006-04-10 15:18:35 +0200183 struct pipe_buffer *buf = pipe->bufs + newbuf;
Jens Axboe5274f052006-03-30 15:15:30 +0200184
Jens Axboe912d35f2006-04-26 10:59:21 +0200185 buf->page = spd->pages[page_nr];
186 buf->offset = spd->partial[page_nr].offset;
187 buf->len = spd->partial[page_nr].len;
188 buf->ops = spd->ops;
Jens Axboe7afa6fd2006-05-01 20:02:33 +0200189 if (spd->flags & SPLICE_F_GIFT)
190 buf->flags |= PIPE_BUF_FLAG_GIFT;
191
Jens Axboe6f767b02006-04-11 13:53:56 +0200192 pipe->nrbufs++;
Jens Axboe912d35f2006-04-26 10:59:21 +0200193 page_nr++;
194 ret += buf->len;
195
Jens Axboe6f767b02006-04-11 13:53:56 +0200196 if (pipe->inode)
197 do_wakeup = 1;
Jens Axboe5274f052006-03-30 15:15:30 +0200198
Jens Axboe912d35f2006-04-26 10:59:21 +0200199 if (!--spd->nr_pages)
Jens Axboe5274f052006-03-30 15:15:30 +0200200 break;
Jens Axboe6f767b02006-04-11 13:53:56 +0200201 if (pipe->nrbufs < PIPE_BUFFERS)
Jens Axboe5274f052006-03-30 15:15:30 +0200202 continue;
203
204 break;
205 }
206
Jens Axboe912d35f2006-04-26 10:59:21 +0200207 if (spd->flags & SPLICE_F_NONBLOCK) {
Linus Torvalds29e35092006-04-02 12:46:35 -0700208 if (!ret)
209 ret = -EAGAIN;
210 break;
211 }
212
Jens Axboe5274f052006-03-30 15:15:30 +0200213 if (signal_pending(current)) {
214 if (!ret)
215 ret = -ERESTARTSYS;
216 break;
217 }
218
219 if (do_wakeup) {
Jens Axboec0bd1f62006-04-10 09:03:32 +0200220 smp_mb();
Ingo Molnar3a326a22006-04-10 15:18:35 +0200221 if (waitqueue_active(&pipe->wait))
222 wake_up_interruptible_sync(&pipe->wait);
223 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
Jens Axboe5274f052006-03-30 15:15:30 +0200224 do_wakeup = 0;
225 }
226
Ingo Molnar3a326a22006-04-10 15:18:35 +0200227 pipe->waiting_writers++;
228 pipe_wait(pipe);
229 pipe->waiting_writers--;
Jens Axboe5274f052006-03-30 15:15:30 +0200230 }
231
Ingo Molnar3a326a22006-04-10 15:18:35 +0200232 if (pipe->inode)
233 mutex_unlock(&pipe->inode->i_mutex);
Jens Axboe5274f052006-03-30 15:15:30 +0200234
235 if (do_wakeup) {
Jens Axboec0bd1f62006-04-10 09:03:32 +0200236 smp_mb();
Ingo Molnar3a326a22006-04-10 15:18:35 +0200237 if (waitqueue_active(&pipe->wait))
238 wake_up_interruptible(&pipe->wait);
239 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
Jens Axboe5274f052006-03-30 15:15:30 +0200240 }
241
Jens Axboe912d35f2006-04-26 10:59:21 +0200242 while (page_nr < spd->nr_pages)
243 page_cache_release(spd->pages[page_nr++]);
Jens Axboe5274f052006-03-30 15:15:30 +0200244
245 return ret;
246}
247
/*
 * Do one round of file -> pipe splicing: collect up to PIPE_BUFFERS page
 * cache pages covering the range starting at *ppos, start read I/O on the
 * ones that need it, and hand the resulting page map to splice_to_pipe().
 *
 * @in:    file to read from
 * @ppos:  file position to start at (only read here, not advanced)
 * @pipe:  pipe to link the pages into
 * @len:   maximum number of bytes to splice
 * @flags: splice flags; SPLICE_F_NONBLOCK stops at the first page that is
 *         not uptodate instead of waiting for it
 *
 * Returns the result of splice_to_pipe() if at least one page was
 * collected, otherwise the last recorded error (0 if there was none).
 */
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	size_t total_len;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	/*
	 * Split the start position into a page index and an offset inside
	 * that page, and work out how many pages the request touches.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	/* The on-stack maps only have room for PIPE_BUFFERS entries. */
	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Initiate read-ahead on this page range. however, don't call into
	 * read-ahead if this is a non-zero offset (we are likely doing small
	 * chunk splice and the page is already there) for a single page.
	 */
	if (!loff || nr_pages > 1)
		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;
	total_len = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 *
	 * From here on 'nr_pages' is the number of pages actually held in
	 * pages[], and spd.nr_pages counts the ones accepted for the pipe.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then dont block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK)
				break;

			lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * lets just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}

			/*
			 * i_size must be checked after ->readpage().
			 */
			isize = i_size_read(mapping->host);
			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
			if (unlikely(!isize || index > end_index))
				break;

			/*
			 * if this is the last page, see if we need to shrink
			 * the length and stop
			 *
			 * NOTE(review): 'loff' is set to PAGE_CACHE_SIZE minus
			 * the in-page offset of i_size, which is then used
			 * both in a comparison against isize and as the cap
			 * for this_len. That looks like the number of bytes
			 * beyond i_size rather than the number of valid
			 * bytes -- confirm the intended EOF trimming.
			 */
			if (end_index == index) {
				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
				if (total_len + loff > isize)
					break;
				/*
				 * force quit after adding this page
				 */
				len = this_len;
				this_len = min(this_len, loff);
				loff = 0;
			}
		}
fill_it:
		/* Record which part of this page goes into the pipe. */
		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		total_len += this_len;
		/* Only the first page can start at a non-zero offset. */
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how
	 * far we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
449
Jens Axboe83f91352006-04-02 23:05:09 +0200450/**
451 * generic_file_splice_read - splice data from file to a pipe
452 * @in: file to splice from
453 * @pipe: pipe to splice to
454 * @len: number of bytes to splice
455 * @flags: splice modifier flags
456 *
457 * Will read pages from given file and fill them into a pipe.
Jens Axboe83f91352006-04-02 23:05:09 +0200458 */
Jens Axboecbb7e572006-04-11 14:57:50 +0200459ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
460 struct pipe_inode_info *pipe, size_t len,
461 unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200462{
463 ssize_t spliced;
464 int ret;
465
466 ret = 0;
467 spliced = 0;
Ingo Molnar3a326a22006-04-10 15:18:35 +0200468
Jens Axboe5274f052006-03-30 15:15:30 +0200469 while (len) {
Jens Axboecbb7e572006-04-11 14:57:50 +0200470 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
Jens Axboe5274f052006-03-30 15:15:30 +0200471
Jens Axboec4f895c2006-04-19 15:56:12 +0200472 if (ret < 0)
Jens Axboe5274f052006-03-30 15:15:30 +0200473 break;
Jens Axboec4f895c2006-04-19 15:56:12 +0200474 else if (!ret) {
475 if (spliced)
476 break;
477 if (flags & SPLICE_F_NONBLOCK) {
478 ret = -EAGAIN;
479 break;
480 }
481 }
Jens Axboe5274f052006-03-30 15:15:30 +0200482
Jens Axboecbb7e572006-04-11 14:57:50 +0200483 *ppos += ret;
Jens Axboe5274f052006-03-30 15:15:30 +0200484 len -= ret;
485 spliced += ret;
486 }
487
488 if (spliced)
489 return spliced;
490
491 return ret;
492}
493
Jens Axboe059a8f32006-04-02 23:06:05 +0200494EXPORT_SYMBOL(generic_file_splice_read);
495
Jens Axboe5274f052006-03-30 15:15:30 +0200496/*
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200497 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
Jens Axboe016b6612006-04-25 15:42:00 +0200498 * using sendpage(). Return the number of bytes sent.
Jens Axboe5274f052006-03-30 15:15:30 +0200499 */
500static int pipe_to_sendpage(struct pipe_inode_info *info,
501 struct pipe_buffer *buf, struct splice_desc *sd)
502{
503 struct file *file = sd->file;
504 loff_t pos = sd->pos;
Jens Axboef84d7512006-05-01 19:59:03 +0200505 int ret, more;
Jens Axboe5274f052006-03-30 15:15:30 +0200506
Jens Axboef84d7512006-05-01 19:59:03 +0200507 ret = buf->ops->pin(info, buf);
508 if (!ret) {
509 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
Jens Axboe5274f052006-03-30 15:15:30 +0200510
Jens Axboef84d7512006-05-01 19:59:03 +0200511 ret = file->f_op->sendpage(file, buf->page, buf->offset,
512 sd->len, &pos, more);
513 }
Jens Axboe5274f052006-03-30 15:15:30 +0200514
Jens Axboe016b6612006-04-25 15:42:00 +0200515 return ret;
Jens Axboe5274f052006-03-30 15:15:30 +0200516}
517
518/*
519 * This is a little more tricky than the file -> pipe splicing. There are
520 * basically three cases:
521 *
522 * - Destination page already exists in the address space and there
523 * are users of it. For that case we have no other option that
524 * copying the data. Tough luck.
525 * - Destination page already exists in the address space, but there
526 * are no users of it. Make sure it's uptodate, then drop it. Fall
527 * through to last case.
528 * - Destination page does not exist, we can add the pipe page to
529 * the page cache and avoid the copy.
530 *
Jens Axboe83f91352006-04-02 23:05:09 +0200531 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
532 * sd->flags), we attempt to migrate pages from the pipe to the output
533 * file address space page cache. This is possible if no one else has
534 * the pipe page referenced outside of the pipe and page cache. If
535 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
536 * a new page in the output file page cache and fill/dirty that.
Jens Axboe5274f052006-03-30 15:15:30 +0200537 */
538static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
539 struct splice_desc *sd)
540{
541 struct file *file = sd->file;
542 struct address_space *mapping = file->f_mapping;
Jens Axboe3e7ee3e2006-04-02 23:11:04 +0200543 gfp_t gfp_mask = mapping_gfp_mask(mapping);
Jens Axboe016b6612006-04-25 15:42:00 +0200544 unsigned int offset, this_len;
Jens Axboe5274f052006-03-30 15:15:30 +0200545 struct page *page;
Jens Axboe5274f052006-03-30 15:15:30 +0200546 pgoff_t index;
Jens Axboe3e7ee3e2006-04-02 23:11:04 +0200547 int ret;
Jens Axboe5274f052006-03-30 15:15:30 +0200548
549 /*
Jens Axboe49d0b212006-04-10 09:04:41 +0200550 * make sure the data in this buffer is uptodate
Jens Axboe5274f052006-03-30 15:15:30 +0200551 */
Jens Axboef84d7512006-05-01 19:59:03 +0200552 ret = buf->ops->pin(info, buf);
553 if (unlikely(ret))
554 return ret;
Jens Axboe5274f052006-03-30 15:15:30 +0200555
556 index = sd->pos >> PAGE_CACHE_SHIFT;
557 offset = sd->pos & ~PAGE_CACHE_MASK;
558
Jens Axboe016b6612006-04-25 15:42:00 +0200559 this_len = sd->len;
560 if (this_len + offset > PAGE_CACHE_SIZE)
561 this_len = PAGE_CACHE_SIZE - offset;
562
Jens Axboe5274f052006-03-30 15:15:30 +0200563 /*
Jens Axboe0568b402006-05-01 19:50:48 +0200564 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
565 * page.
Jens Axboe5274f052006-03-30 15:15:30 +0200566 */
Jens Axboe0568b402006-05-01 19:50:48 +0200567 if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
Jens Axboe83f91352006-04-02 23:05:09 +0200568 /*
569 * If steal succeeds, buf->page is now pruned from the vm
Jens Axboea893b992006-05-02 15:03:27 +0200570 * side (page cache) and we can reuse it. The page will also
571 * be locked on successful return.
Jens Axboe83f91352006-04-02 23:05:09 +0200572 */
Jens Axboe5abc97a2006-03-30 15:16:46 +0200573 if (buf->ops->steal(info, buf))
574 goto find_page;
Jens Axboe5274f052006-03-30 15:15:30 +0200575
Jens Axboe5abc97a2006-03-30 15:16:46 +0200576 page = buf->page;
Jens Axboea893b992006-05-02 15:03:27 +0200577 page_cache_get(page);
578
579 /*
580 * page must be on the LRU for adding to the pagecache.
581 * Check this without grabbing the zone lock, if it isn't
582 * the do grab the zone lock, recheck, and add if necessary.
583 */
584 if (!PageLRU(page)) {
585 struct zone *zone = page_zone(page);
586
587 spin_lock_irq(&zone->lru_lock);
588 if (!PageLRU(page)) {
589 SetPageLRU(page);
590 add_page_to_inactive_list(zone, page);
591 }
592 spin_unlock_irq(&zone->lru_lock);
593 }
594
Jens Axboe46e678c2006-04-30 16:36:32 +0200595 if (add_to_page_cache(page, mapping, index, gfp_mask)) {
Jens Axboea893b992006-05-02 15:03:27 +0200596 page_cache_release(page);
Jens Axboe46e678c2006-04-30 16:36:32 +0200597 unlock_page(page);
Jens Axboe5abc97a2006-03-30 15:16:46 +0200598 goto find_page;
Jens Axboe46e678c2006-04-30 16:36:32 +0200599 }
Jens Axboe5abc97a2006-03-30 15:16:46 +0200600 } else {
601find_page:
Jens Axboe9e0267c2006-04-19 15:57:31 +0200602 page = find_lock_page(mapping, index);
603 if (!page) {
604 ret = -ENOMEM;
605 page = page_cache_alloc_cold(mapping);
606 if (unlikely(!page))
607 goto out_nomem;
608
609 /*
610 * This will also lock the page
611 */
612 ret = add_to_page_cache_lru(page, mapping, index,
613 gfp_mask);
614 if (unlikely(ret))
615 goto out;
616 }
Jens Axboe5274f052006-03-30 15:15:30 +0200617
Jens Axboe5abc97a2006-03-30 15:16:46 +0200618 /*
Jens Axboe9e0267c2006-04-19 15:57:31 +0200619 * We get here with the page locked. If the page is also
620 * uptodate, we don't need to do more. If it isn't, we
621 * may need to bring it in if we are not going to overwrite
622 * the full page.
Jens Axboe5abc97a2006-03-30 15:16:46 +0200623 */
624 if (!PageUptodate(page)) {
Jens Axboe016b6612006-04-25 15:42:00 +0200625 if (this_len < PAGE_CACHE_SIZE) {
Jens Axboe5abc97a2006-03-30 15:16:46 +0200626 ret = mapping->a_ops->readpage(file, page);
627 if (unlikely(ret))
628 goto out;
629
630 lock_page(page);
631
632 if (!PageUptodate(page)) {
633 /*
Ingo Molnar73d62d82006-04-11 13:57:21 +0200634 * Page got invalidated, repeat.
Jens Axboe5abc97a2006-03-30 15:16:46 +0200635 */
636 if (!page->mapping) {
637 unlock_page(page);
638 page_cache_release(page);
639 goto find_page;
640 }
641 ret = -EIO;
642 goto out;
Jens Axboe5274f052006-03-30 15:15:30 +0200643 }
Jens Axboe9e0267c2006-04-19 15:57:31 +0200644 } else
Jens Axboe5abc97a2006-03-30 15:16:46 +0200645 SetPageUptodate(page);
Jens Axboe5274f052006-03-30 15:15:30 +0200646 }
647 }
648
Jens Axboe016b6612006-04-25 15:42:00 +0200649 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
Jens Axboebfc4ee32006-05-03 10:35:10 +0200650 if (unlikely(ret)) {
651 loff_t isize = i_size_read(mapping->host);
652
653 if (ret != AOP_TRUNCATED_PAGE)
654 unlock_page(page);
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200655 page_cache_release(page);
Jens Axboebfc4ee32006-05-03 10:35:10 +0200656 if (ret == AOP_TRUNCATED_PAGE)
657 goto find_page;
658
659 /*
660 * prepare_write() may have instantiated a few blocks
661 * outside i_size. Trim these off again.
662 */
663 if (sd->pos + this_len > isize)
664 vmtruncate(mapping->host, isize);
665
Jens Axboe5274f052006-03-30 15:15:30 +0200666 goto out;
Jens Axboebfc4ee32006-05-03 10:35:10 +0200667 }
Jens Axboe5274f052006-03-30 15:15:30 +0200668
Jens Axboe0568b402006-05-01 19:50:48 +0200669 if (buf->page != page) {
Jens Axboef84d7512006-05-01 19:59:03 +0200670 /*
671 * Careful, ->map() uses KM_USER0!
672 */
Jens Axboef6762b72006-05-01 20:02:05 +0200673 char *src = buf->ops->map(info, buf, 1);
Jens Axboef84d7512006-05-01 19:59:03 +0200674 char *dst = kmap_atomic(page, KM_USER1);
Jens Axboe5abc97a2006-03-30 15:16:46 +0200675
Jens Axboe016b6612006-04-25 15:42:00 +0200676 memcpy(dst + offset, src + buf->offset, this_len);
Jens Axboe5abc97a2006-03-30 15:16:46 +0200677 flush_dcache_page(page);
Jens Axboef84d7512006-05-01 19:59:03 +0200678 kunmap_atomic(dst, KM_USER1);
Jens Axboef6762b72006-05-01 20:02:05 +0200679 buf->ops->unmap(info, buf, src);
Jens Axboe5abc97a2006-03-30 15:16:46 +0200680 }
Jens Axboe5274f052006-03-30 15:15:30 +0200681
Jens Axboe016b6612006-04-25 15:42:00 +0200682 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
Jens Axboe0568b402006-05-01 19:50:48 +0200683 if (!ret) {
684 /*
685 * Return the number of bytes written and mark page as
686 * accessed, we are now done!
687 */
688 ret = this_len;
689 mark_page_accessed(page);
690 balance_dirty_pages_ratelimited(mapping);
691 } else if (ret == AOP_TRUNCATED_PAGE) {
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200692 page_cache_release(page);
693 goto find_page;
Jens Axboe0568b402006-05-01 19:50:48 +0200694 }
Jens Axboe5274f052006-03-30 15:15:30 +0200695out:
Jens Axboe0568b402006-05-01 19:50:48 +0200696 page_cache_release(page);
Jens Axboe9e0267c2006-04-19 15:57:31 +0200697 unlock_page(page);
Dave Jones9aefe432006-04-10 09:02:40 +0200698out_nomem:
Jens Axboe5274f052006-03-30 15:15:30 +0200699 return ret;
700}
701
/*
 * Pipe input worker. Most of this logic works like a regular pipe, the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 *
 * @pipe:  pipe to consume buffers from
 * @out:   destination file, handed to the actor through the splice_desc
 * @ppos:  starting position in @out (only read here, not advanced)
 * @len:   maximum number of bytes to move
 * @flags: splice flags; SPLICE_F_NONBLOCK prevents sleeping on an empty pipe
 * @actor: callback that transfers the data of one pipe buffer
 *
 * Returns the number of bytes the actor consumed. If nothing was consumed,
 * returns the actor's error (-ENODATA from the actor is not reported),
 * -EAGAIN (SPLICE_F_NONBLOCK and the pipe is empty) or -ERESTARTSYS
 * (signal pending).
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			struct pipe_buf_operations *ops = buf->ops;

			/* Offer the actor at most what is still wanted. */
			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				/* Keep a partial result; -ENODATA is not reported. */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			/* Account for the bytes the actor consumed. */
			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			/* Actor took only part of what was offered, go again. */
			if (sd.len)
				continue;

			/* Buffer fully drained: release it and advance the ring. */
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		/* No writer is waiting: return what we have, if anything. */
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Tell writers about the freed slots before going to sleep. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
809
Jens Axboe83f91352006-04-02 23:05:09 +0200810/**
811 * generic_file_splice_write - splice data from a pipe to a file
Ingo Molnar3a326a22006-04-10 15:18:35 +0200812 * @pipe: pipe info
Jens Axboe83f91352006-04-02 23:05:09 +0200813 * @out: file to write to
814 * @len: number of bytes to splice
815 * @flags: splice modifier flags
816 *
817 * Will either move or copy pages (determined by @flags options) from
818 * the given pipe inode to the given file.
819 *
820 */
Ingo Molnar3a326a22006-04-10 15:18:35 +0200821ssize_t
822generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
Jens Axboecbb7e572006-04-11 14:57:50 +0200823 loff_t *ppos, size_t len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200824{
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200825 struct address_space *mapping = out->f_mapping;
Ingo Molnar3a326a22006-04-10 15:18:35 +0200826 ssize_t ret;
827
Jens Axboe00522fb2006-04-26 14:39:29 +0200828 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
Jens Axboea4514eb2006-04-19 15:57:05 +0200829 if (ret > 0) {
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200830 struct inode *inode = mapping->host;
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200831
Jens Axboea4514eb2006-04-19 15:57:05 +0200832 *ppos += ret;
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200833
Jens Axboea4514eb2006-04-19 15:57:05 +0200834 /*
835 * If file or inode is SYNC and we actually wrote some data,
836 * sync it.
837 */
838 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
839 int err;
840
841 mutex_lock(&inode->i_mutex);
842 err = generic_osync_inode(inode, mapping,
843 OSYNC_METADATA|OSYNC_DATA);
844 mutex_unlock(&inode->i_mutex);
845
846 if (err)
847 ret = err;
848 }
Jens Axboe4f6f0bd2006-04-02 23:04:46 +0200849 }
850
851 return ret;
Jens Axboe5274f052006-03-30 15:15:30 +0200852}
853
Jens Axboe059a8f32006-04-02 23:06:05 +0200854EXPORT_SYMBOL(generic_file_splice_write);
855
Jens Axboe83f91352006-04-02 23:05:09 +0200856/**
857 * generic_splice_sendpage - splice data from a pipe to a socket
858 * @inode: pipe inode
859 * @out: socket to write to
860 * @len: number of bytes to splice
861 * @flags: splice modifier flags
862 *
863 * Will send @len bytes from the pipe to a network socket. No data copying
864 * is involved.
865 *
866 */
Ingo Molnar3a326a22006-04-10 15:18:35 +0200867ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
Jens Axboecbb7e572006-04-11 14:57:50 +0200868 loff_t *ppos, size_t len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200869{
Jens Axboe00522fb2006-04-26 14:39:29 +0200870 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
Jens Axboe5274f052006-03-30 15:15:30 +0200871}
872
Jens Axboe059a8f32006-04-02 23:06:05 +0200873EXPORT_SYMBOL(generic_splice_sendpage);
Jeff Garzika0f06782006-03-30 23:06:13 -0500874
Jens Axboe83f91352006-04-02 23:05:09 +0200875/*
876 * Attempt to initiate a splice from pipe to file.
877 */
Ingo Molnar3a326a22006-04-10 15:18:35 +0200878static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
Jens Axboecbb7e572006-04-11 14:57:50 +0200879 loff_t *ppos, size_t len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200880{
Jens Axboe5274f052006-03-30 15:15:30 +0200881 int ret;
882
Jens Axboe49570e92006-04-11 13:56:09 +0200883 if (unlikely(!out->f_op || !out->f_op->splice_write))
Jens Axboe5274f052006-03-30 15:15:30 +0200884 return -EINVAL;
885
Jens Axboe49570e92006-04-11 13:56:09 +0200886 if (unlikely(!(out->f_mode & FMODE_WRITE)))
Jens Axboe5274f052006-03-30 15:15:30 +0200887 return -EBADF;
888
Jens Axboecbb7e572006-04-11 14:57:50 +0200889 ret = rw_verify_area(WRITE, out, ppos, len);
Jens Axboe5274f052006-03-30 15:15:30 +0200890 if (unlikely(ret < 0))
891 return ret;
892
Jens Axboecbb7e572006-04-11 14:57:50 +0200893 return out->f_op->splice_write(pipe, out, ppos, len, flags);
Jens Axboe5274f052006-03-30 15:15:30 +0200894}
895
Jens Axboe83f91352006-04-02 23:05:09 +0200896/*
897 * Attempt to initiate a splice from a file to a pipe.
898 */
Jens Axboecbb7e572006-04-11 14:57:50 +0200899static long do_splice_to(struct file *in, loff_t *ppos,
900 struct pipe_inode_info *pipe, size_t len,
901 unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +0200902{
Jens Axboecbb7e572006-04-11 14:57:50 +0200903 loff_t isize, left;
Jens Axboe5274f052006-03-30 15:15:30 +0200904 int ret;
905
Jens Axboe49570e92006-04-11 13:56:09 +0200906 if (unlikely(!in->f_op || !in->f_op->splice_read))
Jens Axboe5274f052006-03-30 15:15:30 +0200907 return -EINVAL;
908
Jens Axboe49570e92006-04-11 13:56:09 +0200909 if (unlikely(!(in->f_mode & FMODE_READ)))
Jens Axboe5274f052006-03-30 15:15:30 +0200910 return -EBADF;
911
Jens Axboecbb7e572006-04-11 14:57:50 +0200912 ret = rw_verify_area(READ, in, ppos, len);
Jens Axboe5274f052006-03-30 15:15:30 +0200913 if (unlikely(ret < 0))
914 return ret;
915
916 isize = i_size_read(in->f_mapping->host);
Jens Axboecbb7e572006-04-11 14:57:50 +0200917 if (unlikely(*ppos >= isize))
Jens Axboe5274f052006-03-30 15:15:30 +0200918 return 0;
919
Jens Axboecbb7e572006-04-11 14:57:50 +0200920 left = isize - *ppos;
Jens Axboe49570e92006-04-11 13:56:09 +0200921 if (unlikely(left < len))
Jens Axboe5274f052006-03-30 15:15:30 +0200922 len = left;
923
Jens Axboecbb7e572006-04-11 14:57:50 +0200924 return in->f_op->splice_read(in, ppos, pipe, len, flags);
Jens Axboe5274f052006-03-30 15:15:30 +0200925}
926
Jens Axboecbb7e572006-04-11 14:57:50 +0200927long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
928 size_t len, unsigned int flags)
Jens Axboeb92ce552006-04-11 13:52:07 +0200929{
930 struct pipe_inode_info *pipe;
931 long ret, bytes;
Jens Axboecbb7e572006-04-11 14:57:50 +0200932 loff_t out_off;
Jens Axboeb92ce552006-04-11 13:52:07 +0200933 umode_t i_mode;
934 int i;
935
936 /*
937 * We require the input being a regular file, as we don't want to
938 * randomly drop data for eg socket -> socket splicing. Use the
939 * piped splicing for that!
940 */
941 i_mode = in->f_dentry->d_inode->i_mode;
942 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
943 return -EINVAL;
944
945 /*
946 * neither in nor out is a pipe, setup an internal pipe attached to
947 * 'out' and transfer the wanted data from 'in' to 'out' through that
948 */
949 pipe = current->splice_pipe;
Jens Axboe49570e92006-04-11 13:56:09 +0200950 if (unlikely(!pipe)) {
Jens Axboeb92ce552006-04-11 13:52:07 +0200951 pipe = alloc_pipe_info(NULL);
952 if (!pipe)
953 return -ENOMEM;
954
955 /*
956 * We don't have an immediate reader, but we'll read the stuff
Jens Axboe00522fb2006-04-26 14:39:29 +0200957 * out of the pipe right after the splice_to_pipe(). So set
Jens Axboeb92ce552006-04-11 13:52:07 +0200958 * PIPE_READERS appropriately.
959 */
960 pipe->readers = 1;
961
962 current->splice_pipe = pipe;
963 }
964
965 /*
Ingo Molnar73d62d82006-04-11 13:57:21 +0200966 * Do the splice.
Jens Axboeb92ce552006-04-11 13:52:07 +0200967 */
968 ret = 0;
969 bytes = 0;
Jens Axboecbb7e572006-04-11 14:57:50 +0200970 out_off = 0;
Jens Axboeb92ce552006-04-11 13:52:07 +0200971
972 while (len) {
973 size_t read_len, max_read_len;
974
975 /*
976 * Do at most PIPE_BUFFERS pages worth of transfer:
977 */
978 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
979
Jens Axboecbb7e572006-04-11 14:57:50 +0200980 ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
Jens Axboeb92ce552006-04-11 13:52:07 +0200981 if (unlikely(ret < 0))
982 goto out_release;
983
984 read_len = ret;
985
986 /*
987 * NOTE: nonblocking mode only applies to the input. We
988 * must not do the output in nonblocking mode as then we
989 * could get stuck data in the internal pipe:
990 */
Jens Axboecbb7e572006-04-11 14:57:50 +0200991 ret = do_splice_from(pipe, out, &out_off, read_len,
Jens Axboeb92ce552006-04-11 13:52:07 +0200992 flags & ~SPLICE_F_NONBLOCK);
993 if (unlikely(ret < 0))
994 goto out_release;
995
996 bytes += ret;
997 len -= ret;
998
999 /*
1000 * In nonblocking mode, if we got back a short read then
1001 * that was due to either an IO error or due to the
1002 * pagecache entry not being there. In the IO error case
1003 * the _next_ splice attempt will produce a clean IO error
1004 * return value (not a short read), so in both cases it's
1005 * correct to break out of the loop here:
1006 */
1007 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1008 break;
1009 }
1010
1011 pipe->nrbufs = pipe->curbuf = 0;
1012
1013 return bytes;
1014
1015out_release:
1016 /*
1017 * If we did an incomplete transfer we must release
1018 * the pipe buffers in question:
1019 */
1020 for (i = 0; i < PIPE_BUFFERS; i++) {
1021 struct pipe_buffer *buf = pipe->bufs + i;
1022
1023 if (buf->ops) {
1024 buf->ops->release(pipe, buf);
1025 buf->ops = NULL;
1026 }
1027 }
1028 pipe->nrbufs = pipe->curbuf = 0;
1029
1030 /*
1031 * If we transferred some data, return the number of bytes:
1032 */
1033 if (bytes > 0)
1034 return bytes;
1035
1036 return ret;
1037}
1038
1039EXPORT_SYMBOL(do_splice_direct);
1040
Jens Axboe83f91352006-04-02 23:05:09 +02001041/*
1042 * Determine where to splice to/from.
1043 */
Ingo Molnar529565d2006-04-10 15:18:58 +02001044static long do_splice(struct file *in, loff_t __user *off_in,
1045 struct file *out, loff_t __user *off_out,
1046 size_t len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +02001047{
Ingo Molnar3a326a22006-04-10 15:18:35 +02001048 struct pipe_inode_info *pipe;
Jens Axboecbb7e572006-04-11 14:57:50 +02001049 loff_t offset, *off;
Jens Axboea4514eb2006-04-19 15:57:05 +02001050 long ret;
Jens Axboe5274f052006-03-30 15:15:30 +02001051
Ingo Molnar3a326a22006-04-10 15:18:35 +02001052 pipe = in->f_dentry->d_inode->i_pipe;
Ingo Molnar529565d2006-04-10 15:18:58 +02001053 if (pipe) {
1054 if (off_in)
1055 return -ESPIPE;
Jens Axboeb92ce552006-04-11 13:52:07 +02001056 if (off_out) {
1057 if (out->f_op->llseek == no_llseek)
1058 return -EINVAL;
Jens Axboecbb7e572006-04-11 14:57:50 +02001059 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
Jens Axboeb92ce552006-04-11 13:52:07 +02001060 return -EFAULT;
Jens Axboecbb7e572006-04-11 14:57:50 +02001061 off = &offset;
1062 } else
1063 off = &out->f_pos;
Ingo Molnar529565d2006-04-10 15:18:58 +02001064
Jens Axboea4514eb2006-04-19 15:57:05 +02001065 ret = do_splice_from(pipe, out, off, len, flags);
1066
1067 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1068 ret = -EFAULT;
1069
1070 return ret;
Ingo Molnar529565d2006-04-10 15:18:58 +02001071 }
Jens Axboe5274f052006-03-30 15:15:30 +02001072
Ingo Molnar3a326a22006-04-10 15:18:35 +02001073 pipe = out->f_dentry->d_inode->i_pipe;
Ingo Molnar529565d2006-04-10 15:18:58 +02001074 if (pipe) {
1075 if (off_out)
1076 return -ESPIPE;
Jens Axboeb92ce552006-04-11 13:52:07 +02001077 if (off_in) {
1078 if (in->f_op->llseek == no_llseek)
1079 return -EINVAL;
Jens Axboecbb7e572006-04-11 14:57:50 +02001080 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
Jens Axboeb92ce552006-04-11 13:52:07 +02001081 return -EFAULT;
Jens Axboecbb7e572006-04-11 14:57:50 +02001082 off = &offset;
1083 } else
1084 off = &in->f_pos;
Ingo Molnar529565d2006-04-10 15:18:58 +02001085
Jens Axboea4514eb2006-04-19 15:57:05 +02001086 ret = do_splice_to(in, off, pipe, len, flags);
1087
1088 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1089 ret = -EFAULT;
1090
1091 return ret;
Ingo Molnar529565d2006-04-10 15:18:58 +02001092 }
Jens Axboe5274f052006-03-30 15:15:30 +02001093
1094 return -EINVAL;
1095}
1096
Jens Axboe912d35f2006-04-26 10:59:21 +02001097/*
1098 * Map an iov into an array of pages and offset/length tupples. With the
1099 * partial_page structure, we can map several non-contiguous ranges into
1100 * our ones pages[] map instead of splitting that operation into pieces.
1101 * Could easily be exported as a generic helper for other users, in which
1102 * case one would probably want to add a 'max_nr_pages' parameter as well.
1103 */
1104static int get_iovec_page_array(const struct iovec __user *iov,
1105 unsigned int nr_vecs, struct page **pages,
Jens Axboe7afa6fd2006-05-01 20:02:33 +02001106 struct partial_page *partial, int aligned)
Jens Axboe912d35f2006-04-26 10:59:21 +02001107{
1108 int buffers = 0, error = 0;
1109
1110 /*
1111 * It's ok to take the mmap_sem for reading, even
1112 * across a "get_user()".
1113 */
1114 down_read(&current->mm->mmap_sem);
1115
1116 while (nr_vecs) {
1117 unsigned long off, npages;
1118 void __user *base;
1119 size_t len;
1120 int i;
1121
1122 /*
1123 * Get user address base and length for this iovec.
1124 */
1125 error = get_user(base, &iov->iov_base);
1126 if (unlikely(error))
1127 break;
1128 error = get_user(len, &iov->iov_len);
1129 if (unlikely(error))
1130 break;
1131
1132 /*
1133 * Sanity check this iovec. 0 read succeeds.
1134 */
1135 if (unlikely(!len))
1136 break;
1137 error = -EFAULT;
1138 if (unlikely(!base))
1139 break;
1140
1141 /*
1142 * Get this base offset and number of pages, then map
1143 * in the user pages.
1144 */
1145 off = (unsigned long) base & ~PAGE_MASK;
Jens Axboe7afa6fd2006-05-01 20:02:33 +02001146
1147 /*
1148 * If asked for alignment, the offset must be zero and the
1149 * length a multiple of the PAGE_SIZE.
1150 */
1151 error = -EINVAL;
1152 if (aligned && (off || len & ~PAGE_MASK))
1153 break;
1154
Jens Axboe912d35f2006-04-26 10:59:21 +02001155 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1156 if (npages > PIPE_BUFFERS - buffers)
1157 npages = PIPE_BUFFERS - buffers;
1158
1159 error = get_user_pages(current, current->mm,
1160 (unsigned long) base, npages, 0, 0,
1161 &pages[buffers], NULL);
1162
1163 if (unlikely(error <= 0))
1164 break;
1165
1166 /*
1167 * Fill this contiguous range into the partial page map.
1168 */
1169 for (i = 0; i < error; i++) {
Jens Axboe75914892006-05-02 12:57:18 +02001170 const int plen = min_t(size_t, len, PAGE_SIZE - off);
Jens Axboe912d35f2006-04-26 10:59:21 +02001171
1172 partial[buffers].offset = off;
1173 partial[buffers].len = plen;
1174
1175 off = 0;
1176 len -= plen;
1177 buffers++;
1178 }
1179
1180 /*
1181 * We didn't complete this iov, stop here since it probably
1182 * means we have to move some of this into a pipe to
1183 * be able to continue.
1184 */
1185 if (len)
1186 break;
1187
1188 /*
1189 * Don't continue if we mapped fewer pages than we asked for,
1190 * or if we mapped the max number of pages that we have
1191 * room for.
1192 */
1193 if (error < npages || buffers == PIPE_BUFFERS)
1194 break;
1195
1196 nr_vecs--;
1197 iov++;
1198 }
1199
1200 up_read(&current->mm->mmap_sem);
1201
1202 if (buffers)
1203 return buffers;
1204
1205 return error;
1206}
1207
1208/*
1209 * vmsplice splices a user address range into a pipe. It can be thought of
1210 * as splice-from-memory, where the regular splice is splice-from-file (or
1211 * to file). In both cases the output is a pipe, naturally.
1212 *
1213 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
1214 * not the other way around. Splicing from user memory is a simple operation
1215 * that can be supported without any funky alignment restrictions or nasty
1216 * vm tricks. We simply map in the user memory and fill them into a pipe.
1217 * The reverse isn't quite as easy, though. There are two possible solutions
1218 * for that:
1219 *
1220 * - memcpy() the data internally, at which point we might as well just
1221 * do a regular read() on the buffer anyway.
1222 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1223 * has restriction limitations on both ends of the pipe).
1224 *
1225 * Alas, it isn't here.
1226 *
1227 */
1228static long do_vmsplice(struct file *file, const struct iovec __user *iov,
1229 unsigned long nr_segs, unsigned int flags)
1230{
1231 struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
1232 struct page *pages[PIPE_BUFFERS];
1233 struct partial_page partial[PIPE_BUFFERS];
1234 struct splice_pipe_desc spd = {
1235 .pages = pages,
1236 .partial = partial,
1237 .flags = flags,
1238 .ops = &user_page_pipe_buf_ops,
1239 };
1240
1241 if (unlikely(!pipe))
1242 return -EBADF;
1243 if (unlikely(nr_segs > UIO_MAXIOV))
1244 return -EINVAL;
1245 else if (unlikely(!nr_segs))
1246 return 0;
1247
Jens Axboe7afa6fd2006-05-01 20:02:33 +02001248 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1249 flags & SPLICE_F_GIFT);
Jens Axboe912d35f2006-04-26 10:59:21 +02001250 if (spd.nr_pages <= 0)
1251 return spd.nr_pages;
1252
Jens Axboe00522fb2006-04-26 14:39:29 +02001253 return splice_to_pipe(pipe, &spd);
Jens Axboe912d35f2006-04-26 10:59:21 +02001254}
1255
1256asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1257 unsigned long nr_segs, unsigned int flags)
1258{
1259 struct file *file;
1260 long error;
1261 int fput;
1262
1263 error = -EBADF;
1264 file = fget_light(fd, &fput);
1265 if (file) {
1266 if (file->f_mode & FMODE_WRITE)
1267 error = do_vmsplice(file, iov, nr_segs, flags);
1268
1269 fput_light(file, fput);
1270 }
1271
1272 return error;
1273}
1274
Ingo Molnar529565d2006-04-10 15:18:58 +02001275asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1276 int fd_out, loff_t __user *off_out,
1277 size_t len, unsigned int flags)
Jens Axboe5274f052006-03-30 15:15:30 +02001278{
1279 long error;
1280 struct file *in, *out;
1281 int fput_in, fput_out;
1282
1283 if (unlikely(!len))
1284 return 0;
1285
1286 error = -EBADF;
Ingo Molnar529565d2006-04-10 15:18:58 +02001287 in = fget_light(fd_in, &fput_in);
Jens Axboe5274f052006-03-30 15:15:30 +02001288 if (in) {
1289 if (in->f_mode & FMODE_READ) {
Ingo Molnar529565d2006-04-10 15:18:58 +02001290 out = fget_light(fd_out, &fput_out);
Jens Axboe5274f052006-03-30 15:15:30 +02001291 if (out) {
1292 if (out->f_mode & FMODE_WRITE)
Ingo Molnar529565d2006-04-10 15:18:58 +02001293 error = do_splice(in, off_in,
1294 out, off_out,
1295 len, flags);
Jens Axboe5274f052006-03-30 15:15:30 +02001296 fput_light(out, fput_out);
1297 }
1298 }
1299
1300 fput_light(in, fput_in);
1301 }
1302
1303 return error;
1304}
Jens Axboe70524492006-04-11 15:51:17 +02001305
1306/*
1307 * Link contents of ipipe to opipe.
1308 */
1309static int link_pipe(struct pipe_inode_info *ipipe,
1310 struct pipe_inode_info *opipe,
1311 size_t len, unsigned int flags)
1312{
1313 struct pipe_buffer *ibuf, *obuf;
Jens Axboe2a27250e2006-04-19 15:56:40 +02001314 int ret, do_wakeup, i, ipipe_first;
1315
1316 ret = do_wakeup = ipipe_first = 0;
Jens Axboe70524492006-04-11 15:51:17 +02001317
1318 /*
1319 * Potential ABBA deadlock, work around it by ordering lock
1320 * grabbing by inode address. Otherwise two different processes
1321 * could deadlock (one doing tee from A -> B, the other from B -> A).
1322 */
1323 if (ipipe->inode < opipe->inode) {
Jens Axboe2a27250e2006-04-19 15:56:40 +02001324 ipipe_first = 1;
Jens Axboe70524492006-04-11 15:51:17 +02001325 mutex_lock(&ipipe->inode->i_mutex);
1326 mutex_lock(&opipe->inode->i_mutex);
1327 } else {
1328 mutex_lock(&opipe->inode->i_mutex);
1329 mutex_lock(&ipipe->inode->i_mutex);
1330 }
1331
1332 for (i = 0;; i++) {
1333 if (!opipe->readers) {
1334 send_sig(SIGPIPE, current, 0);
1335 if (!ret)
1336 ret = -EPIPE;
1337 break;
1338 }
1339 if (ipipe->nrbufs - i) {
1340 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1341
1342 /*
1343 * If we have room, fill this buffer
1344 */
1345 if (opipe->nrbufs < PIPE_BUFFERS) {
1346 int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1347
1348 /*
1349 * Get a reference to this pipe buffer,
1350 * so we can copy the contents over.
1351 */
1352 ibuf->ops->get(ipipe, ibuf);
1353
1354 obuf = opipe->bufs + nbuf;
1355 *obuf = *ibuf;
1356
Jens Axboe7afa6fd2006-05-01 20:02:33 +02001357 /*
1358 * Don't inherit the gift flag, we need to
1359 * prevent multiple steals of this page.
1360 */
1361 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1362
Jens Axboe70524492006-04-11 15:51:17 +02001363 if (obuf->len > len)
1364 obuf->len = len;
1365
1366 opipe->nrbufs++;
1367 do_wakeup = 1;
1368 ret += obuf->len;
1369 len -= obuf->len;
1370
1371 if (!len)
1372 break;
1373 if (opipe->nrbufs < PIPE_BUFFERS)
1374 continue;
1375 }
1376
1377 /*
1378 * We have input available, but no output room.
Jens Axboe2a27250e2006-04-19 15:56:40 +02001379 * If we already copied data, return that. If we
1380 * need to drop the opipe lock, it must be ordered
1381 * last to avoid deadlocks.
Jens Axboe70524492006-04-11 15:51:17 +02001382 */
Jens Axboe2a27250e2006-04-19 15:56:40 +02001383 if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
Jens Axboe70524492006-04-11 15:51:17 +02001384 if (!ret)
1385 ret = -EAGAIN;
1386 break;
1387 }
1388 if (signal_pending(current)) {
1389 if (!ret)
1390 ret = -ERESTARTSYS;
1391 break;
1392 }
1393 if (do_wakeup) {
1394 smp_mb();
1395 if (waitqueue_active(&opipe->wait))
1396 wake_up_interruptible(&opipe->wait);
1397 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1398 do_wakeup = 0;
1399 }
1400
1401 opipe->waiting_writers++;
1402 pipe_wait(opipe);
1403 opipe->waiting_writers--;
1404 continue;
1405 }
1406
1407 /*
1408 * No input buffers, do the usual checks for available
1409 * writers and blocking and wait if necessary
1410 */
1411 if (!ipipe->writers)
1412 break;
1413 if (!ipipe->waiting_writers) {
1414 if (ret)
1415 break;
1416 }
Jens Axboe2a27250e2006-04-19 15:56:40 +02001417 /*
1418 * pipe_wait() drops the ipipe mutex. To avoid deadlocks
1419 * with another process, we can only safely do that if
1420 * the ipipe lock is ordered last.
1421 */
1422 if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
Jens Axboe70524492006-04-11 15:51:17 +02001423 if (!ret)
1424 ret = -EAGAIN;
1425 break;
1426 }
1427 if (signal_pending(current)) {
1428 if (!ret)
1429 ret = -ERESTARTSYS;
1430 break;
1431 }
1432
1433 if (waitqueue_active(&ipipe->wait))
1434 wake_up_interruptible_sync(&ipipe->wait);
1435 kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
1436
1437 pipe_wait(ipipe);
1438 }
1439
1440 mutex_unlock(&ipipe->inode->i_mutex);
1441 mutex_unlock(&opipe->inode->i_mutex);
1442
1443 if (do_wakeup) {
1444 smp_mb();
1445 if (waitqueue_active(&opipe->wait))
1446 wake_up_interruptible(&opipe->wait);
1447 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1448 }
1449
1450 return ret;
1451}
1452
1453/*
1454 * This is a tee(1) implementation that works on pipes. It doesn't copy
1455 * any data, it simply references the 'in' pages on the 'out' pipe.
1456 * The 'flags' used are the SPLICE_F_* variants, currently the only
1457 * applicable one is SPLICE_F_NONBLOCK.
1458 */
1459static long do_tee(struct file *in, struct file *out, size_t len,
1460 unsigned int flags)
1461{
1462 struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
1463 struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
1464
1465 /*
1466 * Link ipipe to the two output pipes, consuming as we go along.
1467 */
1468 if (ipipe && opipe)
1469 return link_pipe(ipipe, opipe, len, flags);
1470
1471 return -EINVAL;
1472}
1473
1474asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
1475{
1476 struct file *in;
1477 int error, fput_in;
1478
1479 if (unlikely(!len))
1480 return 0;
1481
1482 error = -EBADF;
1483 in = fget_light(fdin, &fput_in);
1484 if (in) {
1485 if (in->f_mode & FMODE_READ) {
1486 int fput_out;
1487 struct file *out = fget_light(fdout, &fput_out);
1488
1489 if (out) {
1490 if (out->f_mode & FMODE_WRITE)
1491 error = do_tee(in, out, len, flags);
1492 fput_light(out, fput_out);
1493 }
1494 }
1495 fput_light(in, fput_in);
1496 }
1497
1498 return error;
1499}