blob: df388f6a77d7c659bc3fac0579fd424b88f5a411 [file] [log] [blame]
Jens Axboee5024352020-02-11 20:34:12 -07001/* SPDX-License-Identifier: MIT */
Simon Zenidf6b9a92020-10-28 21:19:59 -04002#define _POSIX_C_SOURCE 200112L
3
Jens Axboe213d6f32019-01-17 21:40:30 -07004#include <sys/types.h>
5#include <sys/stat.h>
6#include <sys/mman.h>
7#include <unistd.h>
8#include <errno.h>
9#include <string.h>
Jens Axboe043ea222019-06-17 11:41:15 -060010#include <stdbool.h>
Jens Axboe213d6f32019-01-17 21:40:30 -070011
Stefan Hajnoczic31c7ec2019-07-24 09:24:50 +010012#include "liburing/compat.h"
13#include "liburing/io_uring.h"
Jens Axboe213d6f32019-01-17 21:40:30 -070014#include "liburing.h"
Stefan Hajnoczic31c7ec2019-07-24 09:24:50 +010015#include "liburing/barrier.h"
Jens Axboe213d6f32019-01-17 21:40:30 -070016
Jens Axboe96144ea2019-12-01 11:21:39 -070017#include "syscall.h"
18
Jens Axboe98455102019-11-27 17:21:38 -070019/*
20 * Returns true if we're not using SQ thread (thus nobody submits but us)
21 * or if IORING_SQ_NEED_WAKEUP is set, so submit thread must be explicitly
22 * awakened. For the latter case, we set the thread wakeup flag.
23 */
Jens Axboe1bafb3c2020-08-20 21:40:16 -060024static inline bool sq_ring_needs_enter(struct io_uring *ring, unsigned *flags)
Jens Axboe98455102019-11-27 17:21:38 -070025{
Jens Axboe1bafb3c2020-08-20 21:40:16 -060026 if (!(ring->flags & IORING_SETUP_SQPOLL))
Jens Axboe98455102019-11-27 17:21:38 -070027 return true;
28 if (IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_NEED_WAKEUP) {
29 *flags |= IORING_ENTER_SQ_WAKEUP;
30 return true;
31 }
32
33 return false;
34}
35
Xiaoguang Wang122eca62020-07-09 09:16:20 +080036static inline bool cq_ring_needs_flush(struct io_uring *ring)
37{
38 return IO_URING_READ_ONCE(*ring->sq.kflags) & IORING_SQ_CQ_OVERFLOW;
39}
40
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070041static int __io_uring_peek_cqe(struct io_uring *ring,
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050042 struct io_uring_cqe **cqe_ptr,
43 unsigned *nr_available)
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070044{
45 struct io_uring_cqe *cqe;
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070046 int err = 0;
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050047 unsigned available;
48 unsigned mask = *ring->cq.kring_mask;
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070049
50 do {
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050051 unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail);
52 unsigned head = *ring->cq.khead;
53
54 cqe = NULL;
55 available = tail - head;
56 if (!available)
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070057 break;
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050058
59 cqe = &ring->cq.cqes[head & mask];
60 if (cqe->user_data == LIBURING_UDATA_TIMEOUT) {
61 if (cqe->res < 0)
62 err = cqe->res;
63 io_uring_cq_advance(ring, 1);
64 if (!err)
65 continue;
66 cqe = NULL;
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070067 }
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050068
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070069 break;
70 } while (1);
71
72 *cqe_ptr = cqe;
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050073 *nr_available = available;
Bijan Mottahedeh36c05ec2020-05-19 14:52:21 -070074 return err;
75}
76
Jens Axboe898294d2020-11-04 11:44:48 -070077struct get_data {
78 unsigned submit;
79 unsigned wait_nr;
80 unsigned get_flags;
81 int sz;
82 void *arg;
83};
84
85static int _io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
86 struct get_data *data)
Jens Axboe213d6f32019-01-17 21:40:30 -070087{
Jens Axboe8ce3a072019-12-16 12:10:07 -070088 struct io_uring_cqe *cqe = NULL;
Jens Axboe898294d2020-11-04 11:44:48 -070089 const int to_wait = data->wait_nr;
Jens Axboe7ad0e4b2019-12-01 09:11:31 -070090 int ret = 0, err;
Jens Axboe213d6f32019-01-17 21:40:30 -070091
Jens Axboe213d6f32019-01-17 21:40:30 -070092 do {
Xiaoguang Wang122eca62020-07-09 09:16:20 +080093 bool cq_overflow_flush = false;
李通洲38c82de2019-12-02 22:36:04 +080094 unsigned flags = 0;
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050095 unsigned nr_available;
Jens Axboe98455102019-11-27 17:21:38 -070096
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -050097 err = __io_uring_peek_cqe(ring, &cqe, &nr_available);
Jens Axboe7ad0e4b2019-12-01 09:11:31 -070098 if (err)
Jens Axboe213d6f32019-01-17 21:40:30 -070099 break;
Jens Axboe898294d2020-11-04 11:44:48 -0700100 if (!cqe && !to_wait && !data->submit) {
Xiaoguang Wang122eca62020-07-09 09:16:20 +0800101 if (!cq_ring_needs_flush(ring)) {
102 err = -EAGAIN;
103 break;
104 }
105 cq_overflow_flush = true;
Jens Axboe76e92322019-09-20 22:15:38 -0600106 }
Jens Axboe898294d2020-11-04 11:44:48 -0700107 if (data->wait_nr && cqe)
108 data->wait_nr--;
109 if (data->wait_nr || cq_overflow_flush)
110 flags = IORING_ENTER_GETEVENTS | data->get_flags;
111 if (data->submit)
Jens Axboe1bafb3c2020-08-20 21:40:16 -0600112 sq_ring_needs_enter(ring, &flags);
Marcelo Diop-Gonzalez3bdd9832020-12-03 11:07:06 -0500113 if (data->wait_nr > nr_available || data->submit ||
114 cq_overflow_flush)
Jens Axboe898294d2020-11-04 11:44:48 -0700115 ret = __sys_io_uring_enter2(ring->ring_fd, data->submit,
116 data->wait_nr, flags, data->arg,
117 data->sz);
Jens Axboedc14e302020-03-02 08:33:17 -0700118 if (ret < 0) {
Jens Axboe20c92932019-09-28 05:35:02 -0600119 err = -errno;
Jens Axboe898294d2020-11-04 11:44:48 -0700120 } else if (ret == (int)data->submit) {
121 data->submit = 0;
Bijan Mottahedeh87bad142020-05-19 14:52:19 -0700122 /*
123 * When SETUP_IOPOLL is set, __sys_io_uring enter()
124 * must be called to reap new completions but the call
125 * won't be made if both wait_nr and submit are zero
126 * so preserve wait_nr.
127 */
128 if (!(ring->flags & IORING_SETUP_IOPOLL))
Jens Axboe898294d2020-11-04 11:44:48 -0700129 data->wait_nr = 0;
Jens Axboedc14e302020-03-02 08:33:17 -0700130 } else {
Jens Axboe898294d2020-11-04 11:44:48 -0700131 data->submit -= ret;
Jens Axboedc14e302020-03-02 08:33:17 -0700132 }
Jens Axboe8ce3a072019-12-16 12:10:07 -0700133 if (cqe)
Jens Axboe7ad0e4b2019-12-01 09:11:31 -0700134 break;
Jens Axboe20c92932019-09-28 05:35:02 -0600135 } while (!err);
Jens Axboe213d6f32019-01-17 21:40:30 -0700136
Jens Axboe8ce3a072019-12-16 12:10:07 -0700137 *cqe_ptr = cqe;
Jens Axboe76e92322019-09-20 22:15:38 -0600138 return err;
Jens Axboe213d6f32019-01-17 21:40:30 -0700139}
140
Jens Axboe898294d2020-11-04 11:44:48 -0700141int __io_uring_get_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
142 unsigned submit, unsigned wait_nr, sigset_t *sigmask)
143{
144 struct get_data data = {
145 .submit = submit,
146 .wait_nr = wait_nr,
Jens Axboe5a7c8ac2020-11-04 13:57:17 -0700147 .get_flags = 0,
Jens Axboe898294d2020-11-04 11:44:48 -0700148 .sz = _NSIG / 8,
149 .arg = sigmask,
150 };
151
152 return _io_uring_get_cqe(ring, cqe_ptr, &data);
153}
154
Jens Axboe213d6f32019-01-17 21:40:30 -0700155/*
James Rouzier0b88d722019-09-25 15:35:06 -0400156 * Fill in an array of IO completions up to count, if any are available.
157 * Returns the amount of IO completions filled.
158 */
Jens Axboe6d338022019-09-26 00:41:24 -0600159unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
160 struct io_uring_cqe **cqes, unsigned count)
James Rouzier0b88d722019-09-25 15:35:06 -0400161{
Jens Axboe6d338022019-09-26 00:41:24 -0600162 unsigned ready;
Xiaoguang Wang20a7c012020-07-09 15:33:49 +0800163 bool overflow_checked = false;
Jens Axboe6d338022019-09-26 00:41:24 -0600164
Xiaoguang Wang20a7c012020-07-09 15:33:49 +0800165again:
Jens Axboe6d338022019-09-26 00:41:24 -0600166 ready = io_uring_cq_ready(ring);
James Rouzier0b88d722019-09-25 15:35:06 -0400167 if (ready) {
James Rouzier0b88d722019-09-25 15:35:06 -0400168 unsigned head = *ring->cq.khead;
James Rouzier0b88d722019-09-25 15:35:06 -0400169 unsigned mask = *ring->cq.kring_mask;
Jens Axboe6d338022019-09-26 00:41:24 -0600170 unsigned last;
James Rouzier0b88d722019-09-25 15:35:06 -0400171 int i = 0;
Jens Axboe6d338022019-09-26 00:41:24 -0600172
173 count = count > ready ? ready : count;
174 last = head + count;
175 for (;head != last; head++, i++)
James Rouzier0b88d722019-09-25 15:35:06 -0400176 cqes[i] = &ring->cq.cqes[head & mask];
James Rouzier0b88d722019-09-25 15:35:06 -0400177
178 return count;
179 }
180
Xiaoguang Wang20a7c012020-07-09 15:33:49 +0800181 if (overflow_checked)
182 goto done;
183
184 if (cq_ring_needs_flush(ring)) {
185 __sys_io_uring_enter(ring->ring_fd, 0, 0,
186 IORING_ENTER_GETEVENTS, NULL);
187 overflow_checked = true;
188 goto again;
189 }
190
191done:
James Rouzier0b88d722019-09-25 15:35:06 -0400192 return 0;
193}
194
195/*
Jens Axboec39a0582019-12-19 10:06:28 -0700196 * Sync internal state with kernel ring state on the SQ side. Returns the
197 * number of pending items in the SQ ring, for the shared ring.
Jens Axboe8578f0d2019-09-27 04:13:42 -0600198 */
Glauber Costa59641342020-08-21 09:25:07 -0400199int __io_uring_flush_sq(struct io_uring *ring)
Jens Axboe8578f0d2019-09-27 04:13:42 -0600200{
201 struct io_uring_sq *sq = &ring->sq;
202 const unsigned mask = *sq->kring_mask;
Jens Axboe1781f0e2019-12-11 09:00:43 -0700203 unsigned ktail, to_submit;
Jens Axboe8578f0d2019-09-27 04:13:42 -0600204
Jens Axboec39a0582019-12-19 10:06:28 -0700205 if (sq->sqe_head == sq->sqe_tail) {
206 ktail = *sq->ktail;
207 goto out;
208 }
Jens Axboe8578f0d2019-09-27 04:13:42 -0600209
210 /*
211 * Fill in sqes that we have queued up, adding them to the kernel ring
212 */
Jens Axboe8578f0d2019-09-27 04:13:42 -0600213 ktail = *sq->ktail;
214 to_submit = sq->sqe_tail - sq->sqe_head;
215 while (to_submit--) {
216 sq->array[ktail & mask] = sq->sqe_head & mask;
217 ktail++;
218 sq->sqe_head++;
Jens Axboe8578f0d2019-09-27 04:13:42 -0600219 }
220
221 /*
222 * Ensure that the kernel sees the SQE updates before it sees the tail
223 * update.
224 */
Kornilios Kourtisf3897452019-10-30 13:25:13 +0100225 io_uring_smp_store_release(sq->ktail, ktail);
Jens Axboec39a0582019-12-19 10:06:28 -0700226out:
227 return ktail - *sq->khead;
Jens Axboe8578f0d2019-09-27 04:13:42 -0600228}
229
230/*
Jens Axboeba0a2e42020-11-04 14:34:03 -0700231 * If we have kernel support for IORING_ENTER_EXT_ARG, then we can use that
232 * more efficiently than queueing an internal timeout command.
Jens Axboe0de9d8c2020-11-04 12:02:28 -0700233 */
234static int io_uring_wait_cqes_new(struct io_uring *ring,
235 struct io_uring_cqe **cqe_ptr,
236 unsigned wait_nr, struct __kernel_timespec *ts,
237 sigset_t *sigmask)
238{
239 struct io_uring_getevents_arg arg = {
240 .sigmask = (unsigned long) sigmask,
241 .sigmask_sz = _NSIG / 8,
242 .ts = (unsigned long) ts
243 };
244 struct get_data data = {
245 .submit = __io_uring_flush_sq(ring),
246 .wait_nr = wait_nr,
Jens Axboeba0a2e42020-11-04 14:34:03 -0700247 .get_flags = IORING_ENTER_EXT_ARG,
Jens Axboe0de9d8c2020-11-04 12:02:28 -0700248 .sz = sizeof(arg),
249 .arg = &arg
250 };
251
252 return _io_uring_get_cqe(ring, cqe_ptr, &data);
253}
254
255/*
Jens Axboe76e92322019-09-20 22:15:38 -0600256 * Like io_uring_wait_cqe(), except it accepts a timeout value as well. Note
257 * that an sqe is used internally to handle the timeout. Applications using
258 * this function must never set sqe->user_data to LIBURING_UDATA_TIMEOUT!
Jens Axboe8b93cca2019-09-21 14:44:57 -0600259 *
Jens Axboe4f48c042020-03-06 07:03:24 -0700260 * If 'ts' is specified, the application need not call io_uring_submit() before
261 * calling this function, as we will do that on its behalf. From this it also
262 * follows that this function isn't safe to use for applications that split SQ
263 * and CQ handling between two threads and expect that to work without
264 * synchronization, as this function manipulates both the SQ and CQ side.
Jens Axboe76e92322019-09-20 22:15:38 -0600265 */
Jens Axboeac726402019-09-27 07:26:45 -0600266int io_uring_wait_cqes(struct io_uring *ring, struct io_uring_cqe **cqe_ptr,
Jens Axboee2934e12019-10-01 10:05:16 -0600267 unsigned wait_nr, struct __kernel_timespec *ts,
268 sigset_t *sigmask)
Jens Axboe76e92322019-09-20 22:15:38 -0600269{
Jens Axboee80a08c2019-12-01 17:19:16 -0700270 unsigned to_submit = 0;
Jens Axboe76e92322019-09-20 22:15:38 -0600271
Jens Axboe7ad0e4b2019-12-01 09:11:31 -0700272 if (ts) {
Jens Axboe11e18b32019-09-21 15:04:52 -0600273 struct io_uring_sqe *sqe;
Jens Axboe217756d2019-11-22 21:43:24 -0700274 int ret;
Jens Axboe11e18b32019-09-21 15:04:52 -0600275
Jens Axboeba0a2e42020-11-04 14:34:03 -0700276 if (ring->features & IORING_FEAT_EXT_ARG)
277 return io_uring_wait_cqes_new(ring, cqe_ptr, wait_nr,
278 ts, sigmask);
279
Jens Axboe11e18b32019-09-21 15:04:52 -0600280 /*
281 * If the SQ ring is full, we may need to submit IO first
282 */
Jens Axboe76e92322019-09-20 22:15:38 -0600283 sqe = io_uring_get_sqe(ring);
Jens Axboe11e18b32019-09-21 15:04:52 -0600284 if (!sqe) {
285 ret = io_uring_submit(ring);
286 if (ret < 0)
287 return ret;
288 sqe = io_uring_get_sqe(ring);
Jens Axboee80a08c2019-12-01 17:19:16 -0700289 if (!sqe)
290 return -EAGAIN;
Jens Axboe11e18b32019-09-21 15:04:52 -0600291 }
Jens Axboe11a8f2b2019-10-15 17:31:17 -0600292 io_uring_prep_timeout(sqe, ts, wait_nr, 0);
Jens Axboe11e18b32019-09-21 15:04:52 -0600293 sqe->user_data = LIBURING_UDATA_TIMEOUT;
Jens Axboec39a0582019-12-19 10:06:28 -0700294 to_submit = __io_uring_flush_sq(ring);
Jens Axboe76e92322019-09-20 22:15:38 -0600295 }
Jens Axboe11e18b32019-09-21 15:04:52 -0600296
Jens Axboee80a08c2019-12-01 17:19:16 -0700297 return __io_uring_get_cqe(ring, cqe_ptr, to_submit, wait_nr, sigmask);
Jens Axboe213d6f32019-01-17 21:40:30 -0700298}
299
/*
 * Same as io_uring_wait_cqes(), with wait_nr fixed at 1 and no signal mask.
 */
int io_uring_wait_cqe_timeout(struct io_uring *ring,
			      struct io_uring_cqe **cqe_ptr,
			      struct __kernel_timespec *ts)
{
	const unsigned wait_nr = 1;

	return io_uring_wait_cqes(ring, cqe_ptr, wait_nr, ts, NULL);
}
310
311/*
Jens Axboe40b44d22019-09-27 04:10:52 -0600312 * Submit sqes acquired from io_uring_get_sqe() to the kernel.
313 *
314 * Returns number of sqes submitted
315 */
316static int __io_uring_submit(struct io_uring *ring, unsigned submitted,
317 unsigned wait_nr)
318{
319 unsigned flags;
320 int ret;
Jens Axboe213d6f32019-01-17 21:40:30 -0700321
Jens Axboe043ea222019-06-17 11:41:15 -0600322 flags = 0;
Jens Axboe1bafb3c2020-08-20 21:40:16 -0600323 if (sq_ring_needs_enter(ring, &flags) || wait_nr) {
Glauber Costabf3aeb32019-12-19 11:15:48 -0500324 if (wait_nr || (ring->flags & IORING_SETUP_IOPOLL))
Jens Axboe91dde5c2019-06-06 10:46:13 -0600325 flags |= IORING_ENTER_GETEVENTS;
Roman Penyaevdf23d2d2019-05-27 21:05:09 +0200326
Jens Axboe96144ea2019-12-01 11:21:39 -0700327 ret = __sys_io_uring_enter(ring->ring_fd, submitted, wait_nr,
328 flags, NULL);
Roman Penyaevdf23d2d2019-05-27 21:05:09 +0200329 if (ret < 0)
330 return -errno;
331 } else
332 ret = submitted;
Jens Axboe82600292019-03-05 20:12:48 -0700333
Jens Axboea8652212019-03-13 08:48:45 -0600334 return ret;
Jens Axboe213d6f32019-01-17 21:40:30 -0700335}
336
/*
 * Flush the locally queued sqes into the shared SQ ring, then submit them
 * and wait for 'wait_nr' completions via __io_uring_submit().
 */
static int __io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
{
	const unsigned pending = __io_uring_flush_sq(ring);

	return __io_uring_submit(ring, pending, wait_nr);
}
341
/*
 * Submit sqes acquired from io_uring_get_sqe() to the kernel, without
 * waiting for any completions.
 *
 * Returns number of sqes submitted
 */
int io_uring_submit(struct io_uring *ring)
{
	const unsigned wait_nr = 0;

	return __io_uring_submit_and_wait(ring, wait_nr);
}
351
/*
 * Like io_uring_submit(), but additionally passes 'wait_nr' down so the
 * call can wait for that many events.
 *
 * Returns number of sqes submitted
 */
int io_uring_submit_and_wait(struct io_uring *ring, unsigned wait_nr)
{
	int submitted = __io_uring_submit_and_wait(ring, wait_nr);

	return submitted;
}
361
Bart Van Assche7fa184f2020-06-21 13:36:46 -0700362static inline struct io_uring_sqe *
363__io_uring_get_sqe(struct io_uring_sq *sq, unsigned int __head)
364{
365 unsigned int __next = (sq)->sqe_tail + 1;
366 struct io_uring_sqe *__sqe = NULL;
367
368 if (__next - __head <= *(sq)->kring_entries) {
369 __sqe = &(sq)->sqes[(sq)->sqe_tail & *(sq)->kring_mask];
370 (sq)->sqe_tail = __next;
371 }
372 return __sqe;
373}
Jens Axboe902e4462019-11-10 15:28:23 -0700374
Jens Axboe91dde5c2019-06-06 10:46:13 -0600375/*
Jens Axboe213d6f32019-01-17 21:40:30 -0700376 * Return an sqe to fill. Application must later call io_uring_submit()
377 * when it's ready to tell the kernel about it. The caller may call this
378 * function multiple times before calling io_uring_submit().
379 *
380 * Returns a vacant sqe, or NULL if we're full.
381 */
382struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
383{
384 struct io_uring_sq *sq = &ring->sq;
Jens Axboe213d6f32019-01-17 21:40:30 -0700385
Jens Axboe902e4462019-11-10 15:28:23 -0700386 return __io_uring_get_sqe(sq, io_uring_smp_load_acquire(sq->khead));
Jens Axboe213d6f32019-01-17 21:40:30 -0700387}
Jens Axboe29768112020-09-05 15:25:52 -0600388
389int __io_uring_sqring_wait(struct io_uring *ring)
390{
391 int ret;
392
393 ret = __sys_io_uring_enter(ring->ring_fd, 0, 0, IORING_ENTER_SQ_WAIT,
394 NULL);
395 if (ret < 0)
396 ret = -errno;
397 return ret;
398}