/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
15
16#include <linux/types.h>
17#include <linux/socket.h>
18#include <linux/stddef.h>
19#include <net/sock.h>
20
21#include "vmci_transport_notify.h"
22
/* Shorthand for a field of the packet-based notification state that lives
 * in the VMCI transport data of a vsock socket.
 */
#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)
24
/* Decide whether a peer that reported it is blocked waiting to write should
 * be sent a READ notification now.
 *
 * Returns false if the peer is not recorded as waiting.  Otherwise returns
 * true when the free space in our consume queue exceeds the notify limit.
 * When VSOCK_OPTIMIZATION_WAITING_NOTIFY is not defined, always returns true.
 */
static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if !defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	return true;
#else
	u64 notify_limit = 0;
	bool notify;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	/* A blocked sender is taken as a sign that it is faster than we are.
	 * To slow it down, the read notification is delayed by shrinking
	 * write_notify_window: the notification is held back until the
	 * number of bytes used in the queue drops below that window.
	 */
	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;

		/* Shrink by one page, or fall straight to the minimum when
		 * less than a page is left.
		 */
		if (PKT_FIELD(vsk, write_notify_window) >= PAGE_SIZE)
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
		else
			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);

		/* Never go below the configured minimum window. */
		if (PKT_FIELD(vsk, write_notify_window) <
		    PKT_FIELD(vsk, write_notify_min_window))
			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
	}

	notify_limit = vmci_trans(vsk)->consume_size -
		PKT_FIELD(vsk, write_notify_window);
#endif /* VSOCK_OPTIMIZATION_FLOW_CONTROL */

	/* The wait information sent by the peer is ignored for now; only the
	 * free space is compared against the notify limit.  Making this
	 * smarter needs no protocol change and stays compatible between
	 * endpoints running different versions of this function.
	 *
	 * With flow control the test reads, in terms of free space:
	 *   notify if free_space > consume_size - write_notify_window
	 * which is the same as, in terms of data ready in the receive queue:
	 *   notify if write_notify_window > buffer_ready
	 * because free_space == consume_size - buffer_ready.
	 */
	notify = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		notify_limit;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	/* Once the peer is going to be notified, re-arm detection so that
	 * the next wait shrinks the window again.
	 */
	if (notify)
		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
#endif

	return notify;
#endif /* VSOCK_OPTIMIZATION_WAITING_NOTIFY */
}
93
/* Decide whether a peer that reported it is blocked waiting to read should
 * be sent a WROTE notification now.
 *
 * Returns true only when the peer is recorded as waiting and our produce
 * queue holds data.  When VSOCK_OPTIMIZATION_WAITING_NOTIFY is not defined,
 * always returns true.
 */
static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* The wait information sent by the peer is ignored for now; we only
	 * look at whether there is anything for it to read.  Making this
	 * smarter needs no protocol change and stays compatible between
	 * endpoints running different versions of this function.
	 */
	return PKT_FIELD(vsk, peer_waiting_read) &&
	       vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
#else
	return true;
#endif
}
111
/* Handle a WAITING_READ control packet: remember that the peer is waiting
 * for data (together with the wait information it sent) and, if there is
 * already something for it to read, answer with a WROTE packet right away.
 *
 * @bottom_half selects the bottom-half variant of the send, addressed with
 * @dst and @src, instead of the socket-based one.
 */
static void
vmci_transport_handle_waiting_read(struct sock *sk,
				   struct vmci_transport_packet *pkt,
				   bool bottom_half,
				   struct sockaddr_vm *dst,
				   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);
	bool sent;

	PKT_FIELD(vsk, peer_waiting_read) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

	if (!vmci_transport_notify_waiting_read(vsk))
		return;

	sent = bottom_half ? (vmci_transport_send_wrote_bh(dst, src) > 0) :
			     (vmci_transport_send_wrote(sk) > 0);

	/* Only forget the waiter once the notification really went out. */
	if (sent)
		PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
}
141
/* Handle a WAITING_WRITE control packet: remember that the peer is waiting
 * for room (together with the wait information it sent) and, if the
 * notification policy allows it, answer with a READ packet right away.
 *
 * @bottom_half selects the bottom-half variant of the send, addressed with
 * @dst and @src, instead of the socket-based one.
 */
static void
vmci_transport_handle_waiting_write(struct sock *sk,
				    struct vmci_transport_packet *pkt,
				    bool bottom_half,
				    struct sockaddr_vm *dst,
				    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);
	bool sent;

	PKT_FIELD(vsk, peer_waiting_write) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	if (!vmci_transport_notify_waiting_write(vsk))
		return;

	sent = bottom_half ? (vmci_transport_send_read_bh(dst, src) > 0) :
			     (vmci_transport_send_read(sk) > 0);

	/* Only forget the waiter once the notification really went out. */
	if (sent)
		PKT_FIELD(vsk, peer_waiting_write) = false;
#endif
}
171
172static void
173vmci_transport_handle_read(struct sock *sk,
174 struct vmci_transport_packet *pkt,
175 bool bottom_half,
176 struct sockaddr_vm *dst, struct sockaddr_vm *src)
177{
178#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
179 struct vsock_sock *vsk;
180
181 vsk = vsock_sk(sk);
182 PKT_FIELD(vsk, sent_waiting_write) = false;
183#endif
184
185 sk->sk_write_space(sk);
186}
187
188static bool send_waiting_read(struct sock *sk, u64 room_needed)
189{
190#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
191 struct vsock_sock *vsk;
192 struct vmci_transport_waiting_info waiting_info;
193 u64 tail;
194 u64 head;
195 u64 room_left;
196 bool ret;
197
198 vsk = vsock_sk(sk);
199
200 if (PKT_FIELD(vsk, sent_waiting_read))
201 return true;
202
203 if (PKT_FIELD(vsk, write_notify_window) <
204 vmci_trans(vsk)->consume_size)
205 PKT_FIELD(vsk, write_notify_window) =
206 min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
207 vmci_trans(vsk)->consume_size);
208
209 vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
210 room_left = vmci_trans(vsk)->consume_size - head;
211 if (room_needed >= room_left) {
212 waiting_info.offset = room_needed - room_left;
213 waiting_info.generation =
214 PKT_FIELD(vsk, consume_q_generation) + 1;
215 } else {
216 waiting_info.offset = head + room_needed;
217 waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
218 }
219
220 ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
221 if (ret)
222 PKT_FIELD(vsk, sent_waiting_read) = true;
223
224 return ret;
225#else
226 return true;
227#endif
228}
229
230static bool send_waiting_write(struct sock *sk, u64 room_needed)
231{
232#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
233 struct vsock_sock *vsk;
234 struct vmci_transport_waiting_info waiting_info;
235 u64 tail;
236 u64 head;
237 u64 room_left;
238 bool ret;
239
240 vsk = vsock_sk(sk);
241
242 if (PKT_FIELD(vsk, sent_waiting_write))
243 return true;
244
245 vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
246 room_left = vmci_trans(vsk)->produce_size - tail;
247 if (room_needed + 1 >= room_left) {
248 /* Wraps around to current generation. */
249 waiting_info.offset = room_needed + 1 - room_left;
250 waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
251 } else {
252 waiting_info.offset = tail + room_needed + 1;
253 waiting_info.generation =
254 PKT_FIELD(vsk, produce_q_generation) - 1;
255 }
256
257 ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
258 if (ret)
259 PKT_FIELD(vsk, sent_waiting_write) = true;
260
261 return ret;
262#else
263 return true;
264#endif
265}
266
267static int vmci_transport_send_read_notification(struct sock *sk)
268{
269 struct vsock_sock *vsk;
270 bool sent_read;
271 unsigned int retries;
272 int err;
273
274 vsk = vsock_sk(sk);
275 sent_read = false;
276 retries = 0;
277 err = 0;
278
279 if (vmci_transport_notify_waiting_write(vsk)) {
280 /* Notify the peer that we have read, retrying the send on
281 * failure up to our maximum value. XXX For now we just log
282 * the failure, but later we should schedule a work item to
283 * handle the resend until it succeeds. That would require
284 * keeping track of work items in the vsk and cleaning them up
285 * upon socket close.
286 */
287 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
288 !sent_read &&
289 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
290 err = vmci_transport_send_read(sk);
291 if (err >= 0)
292 sent_read = true;
293
294 retries++;
295 }
296
297 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
298 pr_err("%p unable to send read notify to peer\n", sk);
299 else
300#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
301 PKT_FIELD(vsk, peer_waiting_write) = false;
302#endif
303
304 }
305 return err;
306}
307
308static void
309vmci_transport_handle_wrote(struct sock *sk,
310 struct vmci_transport_packet *pkt,
311 bool bottom_half,
312 struct sockaddr_vm *dst, struct sockaddr_vm *src)
313{
314#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
315 struct vsock_sock *vsk = vsock_sk(sk);
316 PKT_FIELD(vsk, sent_waiting_read) = false;
317#endif
318 sk->sk_data_ready(sk, 0);
319}
320
321static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
322{
323 struct vsock_sock *vsk = vsock_sk(sk);
324
325 PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
326 PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
327 PKT_FIELD(vsk, peer_waiting_read) = false;
328 PKT_FIELD(vsk, peer_waiting_write) = false;
329 PKT_FIELD(vsk, peer_waiting_write_detected) = false;
330 PKT_FIELD(vsk, sent_waiting_read) = false;
331 PKT_FIELD(vsk, sent_waiting_write) = false;
332 PKT_FIELD(vsk, produce_q_generation) = 0;
333 PKT_FIELD(vsk, consume_q_generation) = 0;
334
335 memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
336 sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
337 memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
338 sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
339}
340
/* Socket destruct hook.  Intentionally empty: this function releases
 * nothing.
 */
static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
	/* Nothing to do. */
}
344
345static int
346vmci_transport_notify_pkt_poll_in(struct sock *sk,
347 size_t target, bool *data_ready_now)
348{
349 struct vsock_sock *vsk = vsock_sk(sk);
350
351 if (vsock_stream_has_data(vsk)) {
352 *data_ready_now = true;
353 } else {
354 /* We can't read right now because there is nothing in the
355 * queue. Ask for notifications when there is something to
356 * read.
357 */
358 if (sk->sk_state == SS_CONNECTED) {
359 if (!send_waiting_read(sk, 1))
360 return -1;
361
362 }
363 *data_ready_now = false;
364 }
365
366 return 0;
367}
368
369static int
370vmci_transport_notify_pkt_poll_out(struct sock *sk,
371 size_t target, bool *space_avail_now)
372{
373 s64 produce_q_free_space;
374 struct vsock_sock *vsk = vsock_sk(sk);
375
376 produce_q_free_space = vsock_stream_has_space(vsk);
377 if (produce_q_free_space > 0) {
378 *space_avail_now = true;
379 return 0;
380 } else if (produce_q_free_space == 0) {
381 /* This is a connected socket but we can't currently send data.
382 * Notify the peer that we are waiting if the queue is full. We
383 * only send a waiting write if the queue is full because
384 * otherwise we end up in an infinite WAITING_WRITE, READ,
385 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
386 * notification as a socket error, passing that back through
387 * the mask.
388 */
389 if (!send_waiting_write(sk, 1))
390 return -1;
391
392 *space_avail_now = false;
393 }
394
395 return 0;
396}
397
/* Prepare the per-call receive notification state in @data.
 *
 * With flow control enabled, the minimum notify window is raised to
 * @target + 1 when it is smaller; if that pushes the minimum above the
 * current window, the window is raised too and @data->notify_on_block is
 * set so that the sender gets notified before we block.
 *
 * Always returns 0.
 */
static int
vmci_transport_notify_pkt_recv_init(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	data->notify_on_block = false;

	/* The minimum window already covers the target: nothing to adjust. */
	if (PKT_FIELD(vsk, write_notify_min_window) >= target + 1)
		return 0;

	PKT_FIELD(vsk, write_notify_min_window) = target + 1;

	/* If the current window is smaller than the new minimal window size,
	 * we need to reevaluate whether we need to notify the sender.  If
	 * the number of ready bytes is smaller than the new window, we need
	 * to send a notification to the sender before we block.
	 */
	if (PKT_FIELD(vsk, write_notify_window) <
	    PKT_FIELD(vsk, write_notify_min_window)) {
		PKT_FIELD(vsk, write_notify_window) =
		    PKT_FIELD(vsk, write_notify_min_window);
		data->notify_on_block = true;
	}
#endif /* VSOCK_OPTIMIZATION_FLOW_CONTROL */
#endif /* VSOCK_OPTIMIZATION_WAITING_NOTIFY */

	return 0;
}
433
/* Called before a receiver blocks: tell the peer we are waiting for @target
 * bytes and, with flow control enabled, send the READ notification that
 * recv_init flagged via @data->notify_on_block.
 *
 * Returns -EHOSTUNREACH if the waiting-read request could not be sent, the
 * negative error from the read notification if that failed, and otherwise
 * the (non-negative) result of the read notification, or 0 if none was sent.
 */
static int
vmci_transport_notify_pkt_recv_pre_block(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	int err = 0;

	/* Notify our peer that we are waiting for data to read. */
	if (!send_waiting_read(sk, target))
		return -EHOSTUNREACH;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (data->notify_on_block) {
		err = vmci_transport_send_read_notification(sk);

		/* Keep the flag set on failure; it is cleared only after a
		 * successful send.
		 */
		if (err >= 0)
			data->notify_on_block = false;
	}
#endif

	return err;
}
459
/* Called just before data is dequeued: snapshot the consume queue indexes
 * into @data so that the post-dequeue hook can detect a wrap-around.
 *
 * Always returns 0.
 */
static int
vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	/* Now consume up to len bytes from the queue.  Note that since we
	 * have the socket locked we should copy at least ready bytes.
	 */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}
479
480static int
481vmci_transport_notify_pkt_recv_post_dequeue(
482 struct sock *sk,
483 size_t target,
484 ssize_t copied,
485 bool data_read,
486 struct vmci_transport_recv_notify_data *data)
487{
488 struct vsock_sock *vsk;
489 int err;
490
491 vsk = vsock_sk(sk);
492 err = 0;
493
494 if (data_read) {
495#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
496 /* Detect a wrap-around to maintain queue generation. Note
497 * that this is safe since we hold the socket lock across the
498 * two queue pair operations.
499 */
500 if (copied >=
501 vmci_trans(vsk)->consume_size - data->consume_head)
502 PKT_FIELD(vsk, consume_q_generation)++;
503#endif
504
505 err = vmci_transport_send_read_notification(sk);
506 if (err < 0)
507 return err;
508
509 }
510 return err;
511}
512
/* Prepare the per-call send notification state in @data by zeroing the
 * saved queue indexes.
 *
 * Always returns 0.
 */
static int
vmci_transport_notify_pkt_send_init(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->produce_tail = 0;
	data->consume_head = 0;
#endif

	return 0;
}
525
/* Called before a sender blocks: tell the peer that we are waiting for room
 * to write.
 *
 * Returns 0 on success, -EHOSTUNREACH if the notification could not be sent.
 */
static int
vmci_transport_notify_pkt_send_pre_block(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	/* Notify our peer that we are waiting for room to write. */
	return send_waiting_write(sk, 1) ? 0 : -EHOSTUNREACH;
}
537
/* Called just before data is enqueued: snapshot the produce queue indexes
 * into @data so that the post-enqueue hook can detect a wrap-around.
 *
 * Always returns 0.
 */
static int
vmci_transport_notify_pkt_send_pre_enqueue(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}
553
554static int
555vmci_transport_notify_pkt_send_post_enqueue(
556 struct sock *sk,
557 ssize_t written,
558 struct vmci_transport_send_notify_data *data)
559{
560 int err = 0;
561 struct vsock_sock *vsk;
562 bool sent_wrote = false;
563 int retries = 0;
564
565 vsk = vsock_sk(sk);
566
567#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
568 /* Detect a wrap-around to maintain queue generation. Note that this
569 * is safe since we hold the socket lock across the two queue pair
570 * operations.
571 */
572 if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
573 PKT_FIELD(vsk, produce_q_generation)++;
574
575#endif
576
577 if (vmci_transport_notify_waiting_read(vsk)) {
578 /* Notify the peer that we have written, retrying the send on
579 * failure up to our maximum value. See the XXX comment for the
580 * corresponding piece of code in StreamRecvmsg() for potential
581 * improvements.
582 */
583 while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
584 !sent_wrote &&
585 retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
586 err = vmci_transport_send_wrote(sk);
587 if (err >= 0)
588 sent_wrote = true;
589
590 retries++;
591 }
592
593 if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
594 pr_err("%p unable to send wrote notify to peer\n", sk);
595 return err;
596 } else {
597#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
598 PKT_FIELD(vsk, peer_waiting_read) = false;
599#endif
600 }
601 }
602 return err;
603}
604
605static void
606vmci_transport_notify_pkt_handle_pkt(
607 struct sock *sk,
608 struct vmci_transport_packet *pkt,
609 bool bottom_half,
610 struct sockaddr_vm *dst,
611 struct sockaddr_vm *src, bool *pkt_processed)
612{
613 bool processed = false;
614
615 switch (pkt->type) {
616 case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
617 vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
618 processed = true;
619 break;
620 case VMCI_TRANSPORT_PACKET_TYPE_READ:
621 vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
622 processed = true;
623 break;
624 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
625 vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
626 dst, src);
627 processed = true;
628 break;
629
630 case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
631 vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
632 dst, src);
633 processed = true;
634 break;
635 }
636
637 if (pkt_processed)
638 *pkt_processed = processed;
639}
640
641static void vmci_transport_notify_pkt_process_request(struct sock *sk)
642{
643 struct vsock_sock *vsk = vsock_sk(sk);
644
645 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
646 if (vmci_trans(vsk)->consume_size <
647 PKT_FIELD(vsk, write_notify_min_window))
648 PKT_FIELD(vsk, write_notify_min_window) =
649 vmci_trans(vsk)->consume_size;
650}
651
652static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
653{
654 struct vsock_sock *vsk = vsock_sk(sk);
655
656 PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
657 if (vmci_trans(vsk)->consume_size <
658 PKT_FIELD(vsk, write_notify_min_window))
659 PKT_FIELD(vsk, write_notify_min_window) =
660 vmci_trans(vsk)->consume_size;
661}
662
663/* Socket control packet based operations. */
664struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
665 vmci_transport_notify_pkt_socket_init,
666 vmci_transport_notify_pkt_socket_destruct,
667 vmci_transport_notify_pkt_poll_in,
668 vmci_transport_notify_pkt_poll_out,
669 vmci_transport_notify_pkt_handle_pkt,
670 vmci_transport_notify_pkt_recv_init,
671 vmci_transport_notify_pkt_recv_pre_block,
672 vmci_transport_notify_pkt_recv_pre_dequeue,
673 vmci_transport_notify_pkt_recv_post_dequeue,
674 vmci_transport_notify_pkt_send_init,
675 vmci_transport_notify_pkt_send_pre_block,
676 vmci_transport_notify_pkt_send_pre_enqueue,
677 vmci_transport_notify_pkt_send_post_enqueue,
678 vmci_transport_notify_pkt_process_request,
679 vmci_transport_notify_pkt_process_negotiate,
680};