/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation version 2 and no later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

/* Implementation notes:
 *
 * - There are two kinds of sockets: those created by user action (such as
 * calling socket(2)) and those created by incoming connection request packets.
 *
 * - There are two "global" tables, one for bound sockets (sockets that have
 * specified an address that they are responsible for) and one for connected
 * sockets (sockets that have established a connection with another socket).
 * These tables are "global" in that all sockets on the system are placed
 * within them. Note, though, that the bound table contains an extra entry
 * for a list of unbound sockets, and SOCK_DGRAM sockets will always remain in
 * that list. The bound table is used solely for lookup of sockets when packets
 * are received; that lookup is not necessary for SOCK_DGRAM sockets since we
 * create a datagram handle for each of them. Keeping SOCK_DGRAM sockets out
 * of the bound hash buckets reduces the chance of collisions when looking for
 * SOCK_STREAM sockets and saves us from having to check the socket type in
 * the hash table lookups.
 *
 * - Sockets created by user action will either be "client" sockets that
 * initiate a connection or "server" sockets that listen for connections; we do
 * not support simultaneous connects (two "client" sockets connecting).
 *
 * - "Server" sockets are referred to as listener sockets throughout this
 * implementation because they are in the VSOCK_SS_LISTEN state. When a
 * connection request is received (the second kind of socket mentioned above),
 * we create a new socket and refer to it as a pending socket. These pending
 * sockets are placed on the pending connection list of the listener socket.
 * When future packets are received for the address the listener socket is
 * bound to, we check if the source of the packet is from one that has an
 * existing pending connection. If it does, we process the packet for the
 * pending socket. When that socket reaches the connected state, it is removed
 * from the listener socket's pending list and enqueued in the listener
 * socket's accept queue. Callers of accept(2) will accept connected sockets
 * from the listener socket's accept queue. If the socket cannot be accepted
 * for some reason then it is marked rejected. Once the connection is
 * accepted, it is owned by the user process and the responsibility for cleanup
 * falls with that user process.
 *
 * - It is possible that these pending sockets will never reach the connected
 * state; in fact, we may never receive another packet after the connection
 * request. Because of this, we must schedule a cleanup function to run in the
 * future, after some amount of time passes where a connection should have been
 * established. This function ensures that the socket is off all lists so it
 * cannot be retrieved, then drops all references to the socket so it is
 * cleaned up (sock_put() -> sk_free() -> our sk_destruct implementation).
 * Note that this function will also clean up rejected sockets, those that
 * reach the connected state but leave it before they have been accepted.
 *
 * - Lock ordering for pending or accept queue sockets is:
 *
 *     lock_sock(listener);
 *     lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
 *
 * Using explicit nested locking keeps lockdep happy since normally only one
 * lock of a given class may be taken at a time.
 *
 * - Sockets created by user action will be cleaned up when the user process
 * calls close(2), causing our release implementation to be called. Our release
 * implementation will perform some cleanup then drop the last reference so our
 * sk_destruct implementation is invoked. Our sk_destruct implementation will
 * perform additional cleanup that's common for both types of sockets.
 *
 * - A socket's reference count is what ensures that the structure won't be
 * freed. Each entry in a list (such as the "global" bound and connected tables
 * and the listener socket's pending list and accept queue) holds a reference.
 * When we defer work until process context and pass a socket as our argument,
 * we must ensure the reference count is increased to ensure the socket isn't
 * freed before the function is run; the deferred function will then drop the
 * reference.
 */
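
/* For orientation, a minimal userspace usage sketch of this address family.
 * It is not part of this driver; error handling is omitted and the CID/port
 * values are illustrative only:
 *
 *	#include <sys/socket.h>
 *	#include <linux/vm_sockets.h>
 *
 *	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *	struct sockaddr_vm sa = {
 *		.svm_family = AF_VSOCK,
 *		.svm_cid = VMADDR_CID_HOST,
 *		.svm_port = 1234,
 *	};
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
 */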

#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/cred.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/net.h>
#include <linux/poll.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

/* Protocol family. */
static struct proto vsock_proto = {
	.name = "AF_VSOCK",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct vsock_sock),
};

/* The default peer timeout indicates how long we will wait for a peer response
 * to a control message.
 */
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)

static const struct vsock_transport *transport;
static DEFINE_MUTEX(vsock_register_mutex);

/**** EXPORTS ****/

/* Get the ID of the local context. This is transport dependent. */

int vm_sockets_get_local_cid(void)
{
	return transport->get_local_cid();
}
EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);

/**** UTILS ****/

/* Each bound VSocket is stored in the bind hash table and each connected
 * VSocket is stored in the connected hash table.
 *
 * Unbound sockets are all put on the same list attached to the end of the hash
 * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
 * the bucket that their local address hashes to (vsock_bound_sockets(addr)
 * represents the list that addr hashes to).
 *
 * Specifically, we initialize the vsock_bind_table array to a size of
 * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
 * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
 * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
 * mods with VSOCK_HASH_SIZE to ensure this.
 */
#define VSOCK_HASH_SIZE		251
#define MAX_PORT_RETRIES	24

#define VSOCK_HASH(addr)	((addr)->svm_port % VSOCK_HASH_SIZE)
#define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
#define vsock_unbound_sockets	  (&vsock_bind_table[VSOCK_HASH_SIZE])

/* XXX This can probably be implemented in a better way. */
#define VSOCK_CONN_HASH(src, dst) \
	(((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE)
#define vsock_connected_sockets(src, dst) \
	(&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
#define vsock_connected_sockets_vsk(vsk) \
	vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)

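/* A quick worked example of the bucket selection above (illustrative values
 * only): a socket bound to port 1024 lands in vsock_bind_table[1024 % 251],
 * i.e. bucket 20, while a connection whose peer CID is 3 and whose local port
 * is 1024 lands in vsock_connected_table[(3 ^ 1024) % 251], i.e. bucket 23.
 */
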
static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
static struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
static DEFINE_SPINLOCK(vsock_table_lock);

/* Autobind this socket to the local address if necessary. */
static int vsock_auto_bind(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);
	struct sockaddr_vm local_addr;

	if (vsock_addr_bound(&vsk->local_addr))
		return 0;
	vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	return __vsock_bind(sk, &local_addr);
}

static void vsock_init_tables(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
		INIT_LIST_HEAD(&vsock_bind_table[i]);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
		INIT_LIST_HEAD(&vsock_connected_table[i]);
}

static void __vsock_insert_bound(struct list_head *list,
				 struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->bound_table, list);
}

static void __vsock_insert_connected(struct list_head *list,
				     struct vsock_sock *vsk)
{
	sock_hold(&vsk->sk);
	list_add(&vsk->connected_table, list);
}

static void __vsock_remove_bound(struct vsock_sock *vsk)
{
	list_del_init(&vsk->bound_table);
	sock_put(&vsk->sk);
}

static void __vsock_remove_connected(struct vsock_sock *vsk)
{
	list_del_init(&vsk->connected_table);
	sock_put(&vsk->sk);
}

static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
		if (addr->svm_port == vsk->local_addr.svm_port)
			return sk_vsock(vsk);

	return NULL;
}

static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
						  struct sockaddr_vm *dst)
{
	struct vsock_sock *vsk;

	list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
			    connected_table) {
		if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
		    dst->svm_port == vsk->local_addr.svm_port) {
			return sk_vsock(vsk);
		}
	}

	return NULL;
}

static bool __vsock_in_bound_table(struct vsock_sock *vsk)
{
	return !list_empty(&vsk->bound_table);
}

static bool __vsock_in_connected_table(struct vsock_sock *vsk)
{
	return !list_empty(&vsk->connected_table);
}

static void vsock_insert_unbound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_bound(vsock_unbound_sockets, vsk);
	spin_unlock_bh(&vsock_table_lock);
}

void vsock_insert_connected(struct vsock_sock *vsk)
{
	struct list_head *list = vsock_connected_sockets(
		&vsk->remote_addr, &vsk->local_addr);

	spin_lock_bh(&vsock_table_lock);
	__vsock_insert_connected(list, vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_insert_connected);

void vsock_remove_bound(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_remove_bound(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_bound);

void vsock_remove_connected(struct vsock_sock *vsk)
{
	spin_lock_bh(&vsock_table_lock);
	__vsock_remove_connected(vsk);
	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);

struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_bound_socket(addr);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_bound_socket);

struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
					 struct sockaddr_vm *dst)
{
	struct sock *sk;

	spin_lock_bh(&vsock_table_lock);
	sk = __vsock_find_connected_socket(src, dst);
	if (sk)
		sock_hold(sk);

	spin_unlock_bh(&vsock_table_lock);

	return sk;
}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);

static bool vsock_in_bound_table(struct vsock_sock *vsk)
{
	bool ret;

	spin_lock_bh(&vsock_table_lock);
	ret = __vsock_in_bound_table(vsk);
	spin_unlock_bh(&vsock_table_lock);

	return ret;
}

static bool vsock_in_connected_table(struct vsock_sock *vsk)
{
	bool ret;

	spin_lock_bh(&vsock_table_lock);
	ret = __vsock_in_connected_table(vsk);
	spin_unlock_bh(&vsock_table_lock);

	return ret;
}

void vsock_remove_sock(struct vsock_sock *vsk)
{
	if (vsock_in_bound_table(vsk))
		vsock_remove_bound(vsk);

	if (vsock_in_connected_table(vsk))
		vsock_remove_connected(vsk);
}
EXPORT_SYMBOL_GPL(vsock_remove_sock);

void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
{
	int i;

	spin_lock_bh(&vsock_table_lock);

	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
		struct vsock_sock *vsk;
		list_for_each_entry(vsk, &vsock_connected_table[i],
				    connected_table)
			fn(sk_vsock(vsk));
	}

	spin_unlock_bh(&vsock_table_lock);
}
EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);

void vsock_add_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vpending;

	vlistener = vsock_sk(listener);
	vpending = vsock_sk(pending);

	sock_hold(pending);
	sock_hold(listener);
	list_add_tail(&vpending->pending_links, &vlistener->pending_links);
}
EXPORT_SYMBOL_GPL(vsock_add_pending);

void vsock_remove_pending(struct sock *listener, struct sock *pending)
{
	struct vsock_sock *vpending = vsock_sk(pending);

	list_del_init(&vpending->pending_links);
	sock_put(listener);
	sock_put(pending);
}
EXPORT_SYMBOL_GPL(vsock_remove_pending);

void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);
	vconnected = vsock_sk(connected);

	sock_hold(connected);
	sock_hold(listener);
	list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);

static struct sock *vsock_dequeue_accept(struct sock *listener)
{
	struct vsock_sock *vlistener;
	struct vsock_sock *vconnected;

	vlistener = vsock_sk(listener);

	if (list_empty(&vlistener->accept_queue))
		return NULL;

	vconnected = list_entry(vlistener->accept_queue.next,
				struct vsock_sock, accept_queue);

	list_del_init(&vconnected->accept_queue);
	sock_put(listener);
	/* The caller will need a reference on the connected socket so we let
	 * it call sock_put().
	 */

	return sk_vsock(vconnected);
}

static bool vsock_is_accept_queue_empty(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return list_empty(&vsk->accept_queue);
}

static bool vsock_is_pending(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	return !list_empty(&vsk->pending_links);
}

static int vsock_send_shutdown(struct sock *sk, int mode)
{
	return transport->shutdown(vsock_sk(sk), mode);
}

static void vsock_pending_work(struct work_struct *work)
{
	struct sock *sk;
	struct sock *listener;
	struct vsock_sock *vsk;
	bool cleanup;

	vsk = container_of(work, struct vsock_sock, pending_work.work);
	sk = sk_vsock(vsk);
	listener = vsk->listener;
	cleanup = true;

	lock_sock(listener);
	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

	if (vsock_is_pending(sk)) {
		vsock_remove_pending(listener, sk);

		listener->sk_ack_backlog--;
	} else if (!vsk->rejected) {
		/* We are not on the pending list and accept() did not reject
		 * us, so we must have been accepted by our user process. We
		 * just need to drop our references to the sockets and be on
		 * our way.
		 */
		cleanup = false;
		goto out;
	}

	/* We need to remove ourselves from the global connected sockets list
	 * so incoming packets can't find this socket, and to reduce the
	 * reference count.
	 */
	if (vsock_in_connected_table(vsk))
		vsock_remove_connected(vsk);

	sk->sk_state = SS_FREE;

out:
	release_sock(sk);
	release_sock(listener);
	if (cleanup)
		sock_put(sk);

	sock_put(sk);
	sock_put(listener);
}

/**** SOCKET OPERATIONS ****/

static int __vsock_bind_stream(struct vsock_sock *vsk,
			       struct sockaddr_vm *addr)
{
	static u32 port = 0;
	struct sockaddr_vm new_addr;

	if (!port)
		port = LAST_RESERVED_PORT + 1 +
			prandom_u32_max(U32_MAX - LAST_RESERVED_PORT);

	vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);

	if (addr->svm_port == VMADDR_PORT_ANY) {
		bool found = false;
		unsigned int i;

		for (i = 0; i < MAX_PORT_RETRIES; i++) {
			if (port <= LAST_RESERVED_PORT)
				port = LAST_RESERVED_PORT + 1;

			new_addr.svm_port = port++;

			if (!__vsock_find_bound_socket(&new_addr)) {
				found = true;
				break;
			}
		}

		if (!found)
			return -EADDRNOTAVAIL;
	} else {
		/* If port is in reserved range, ensure caller
		 * has necessary privileges.
		 */
		if (addr->svm_port <= LAST_RESERVED_PORT &&
		    !capable(CAP_NET_BIND_SERVICE)) {
			return -EACCES;
		}

		if (__vsock_find_bound_socket(&new_addr))
			return -EADDRINUSE;
	}

	vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);

	/* Remove stream sockets from the unbound list and add them to the hash
	 * table for easy lookup by their address. The unbound list is simply
	 * an extra entry at the end of the hash table, a trick used by
	 * AF_UNIX.
	 */
	__vsock_remove_bound(vsk);
	__vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);

	return 0;
}

static int __vsock_bind_dgram(struct vsock_sock *vsk,
			      struct sockaddr_vm *addr)
{
	return transport->dgram_bind(vsk, addr);
}

static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
	struct vsock_sock *vsk = vsock_sk(sk);
	u32 cid;
	int retval;

	/* First ensure this socket isn't already bound. */
	if (vsock_addr_bound(&vsk->local_addr))
		return -EINVAL;

	/* Now bind to the provided address or select appropriate values if
	 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that,
	 * just as AF_INET prevents binding to a non-local IP address (in most
	 * cases), we only allow binding to the local CID.
	 */
	cid = transport->get_local_cid();
	if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
		return -EADDRNOTAVAIL;

	switch (sk->sk_socket->type) {
	case SOCK_STREAM:
		spin_lock_bh(&vsock_table_lock);
		retval = __vsock_bind_stream(vsk, addr);
		spin_unlock_bh(&vsock_table_lock);
		break;

	case SOCK_DGRAM:
		retval = __vsock_bind_dgram(vsk, addr);
		break;

	default:
		retval = -EINVAL;
		break;
	}

	return retval;
}

static void vsock_connect_timeout(struct work_struct *work);

struct sock *__vsock_create(struct net *net,
			    struct socket *sock,
			    struct sock *parent,
			    gfp_t priority,
			    unsigned short type,
			    int kern)
{
	struct sock *sk;
	struct vsock_sock *psk;
	struct vsock_sock *vsk;

	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk);

	/* sk->sk_type is normally set in sock_init_data, but only if sock is
	 * non-NULL. We make sure that our sockets always have a type by
	 * setting it here if needed.
	 */
	if (!sock)
		sk->sk_type = type;

	vsk = vsock_sk(sk);
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	sk->sk_destruct = vsock_sk_destruct;
	sk->sk_backlog_rcv = vsock_queue_rcv_skb;
	sk->sk_state = 0;
	sock_reset_flag(sk, SOCK_DONE);

	INIT_LIST_HEAD(&vsk->bound_table);
	INIT_LIST_HEAD(&vsk->connected_table);
	vsk->listener = NULL;
	INIT_LIST_HEAD(&vsk->pending_links);
	INIT_LIST_HEAD(&vsk->accept_queue);
	vsk->rejected = false;
	vsk->sent_request = false;
	vsk->ignore_connecting_rst = false;
	vsk->peer_shutdown = 0;
	INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
	INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);

	psk = parent ? vsock_sk(parent) : NULL;
	if (parent) {
		vsk->trusted = psk->trusted;
		vsk->owner = get_cred(psk->owner);
		vsk->connect_timeout = psk->connect_timeout;
	} else {
		vsk->trusted = capable(CAP_NET_ADMIN);
		vsk->owner = get_current_cred();
		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
	}

	if (transport->init(vsk, psk) < 0) {
		sk_free(sk);
		return NULL;
	}

	if (sock)
		vsock_insert_unbound(vsk);

	return sk;
}
EXPORT_SYMBOL_GPL(__vsock_create);

static void __vsock_release(struct sock *sk)
{
	if (sk) {
		struct sk_buff *skb;
		struct sock *pending;
		struct vsock_sock *vsk;

		vsk = vsock_sk(sk);
		pending = NULL;	/* Compiler warning. */

		transport->release(vsk);

		lock_sock(sk);
		sock_orphan(sk);
		sk->sk_shutdown = SHUTDOWN_MASK;

		while ((skb = skb_dequeue(&sk->sk_receive_queue)))
			kfree_skb(skb);

		/* Clean up any sockets that never were accepted. */
		while ((pending = vsock_dequeue_accept(sk)) != NULL) {
			__vsock_release(pending);
			sock_put(pending);
		}

		release_sock(sk);
		sock_put(sk);
	}
}

static void vsock_sk_destruct(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	transport->destruct(vsk);

	/* When clearing these addresses, there's no need to set the family and
	 * possibly register the address family with the kernel.
	 */
	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);

	put_cred(vsk->owner);
}

static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sock_queue_rcv_skb(sk, skb);
	if (err)
		kfree_skb(skb);

	return err;
}

s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
	return transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);

s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
	return transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);

static int vsock_release(struct socket *sock)
{
	__vsock_release(sock->sk);
	sock->sk = NULL;
	sock->state = SS_FREE;

	return 0;
}

static int
vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	int err;
	struct sock *sk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;

	if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
		return -EINVAL;

	lock_sock(sk);
	err = __vsock_bind(sk, vm_addr);
	release_sock(sk);

	return err;
}

static int vsock_getname(struct socket *sock,
			 struct sockaddr *addr, int *addr_len, int peer)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *vm_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (peer) {
		if (sock->state != SS_CONNECTED) {
			err = -ENOTCONN;
			goto out;
		}
		vm_addr = &vsk->remote_addr;
	} else {
		vm_addr = &vsk->local_addr;
	}

	if (!vm_addr) {
		err = -EINVAL;
		goto out;
	}

	/* sys_getsockname() and sys_getpeername() pass us a
	 * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
	 * that macro is defined in socket.c instead of .h, so we hardcode its
	 * value here.
	 */
	BUILD_BUG_ON(sizeof(*vm_addr) > 128);
	memcpy(addr, vm_addr, sizeof(*vm_addr));
	*addr_len = sizeof(*vm_addr);

out:
	release_sock(sk);
	return err;
}

static int vsock_shutdown(struct socket *sock, int mode)
{
	int err;
	struct sock *sk;

	/* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
	 * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
	 * here like the other address families do. Note also that the
	 * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
	 * which is what we want.
	 */
	mode++;

	if ((mode & ~SHUTDOWN_MASK) || !mode)
		return -EINVAL;

	/* If this is a STREAM socket and it is not connected then bail out
	 * immediately. If it is a DGRAM socket then we must first kick the
	 * socket so that it wakes up from any sleeping calls, for example
	 * recv(), and then afterwards return the error.
	 */

	sk = sock->sk;
	if (sock->state == SS_UNCONNECTED) {
		err = -ENOTCONN;
		if (sk->sk_type == SOCK_STREAM)
			return err;
	} else {
		sock->state = SS_DISCONNECTING;
		err = 0;
	}

	/* Receive and send shutdowns are treated alike. */
	mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
	if (mode) {
		lock_sock(sk);
		sk->sk_shutdown |= mode;
		sk->sk_state_change(sk);
		release_sock(sk);

		if (sk->sk_type == SOCK_STREAM) {
			sock_reset_flag(sk, SOCK_DONE);
			vsock_send_shutdown(sk, mode);
		}
	}

	return err;
}

static unsigned int vsock_poll(struct file *file, struct socket *sock,
			       poll_table *wait)
{
	struct sock *sk;
	unsigned int mask;
	struct vsock_sock *vsk;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	if (sk->sk_err)
		/* Signify that there has been an error on this socket. */
		mask |= POLLERR;

	/* INET sockets treat local write shutdown and peer write shutdown as a
	 * case of POLLHUP set.
	 */
	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
	     (vsk->peer_shutdown & SEND_SHUTDOWN))) {
		mask |= POLLHUP;
	}

	if (sk->sk_shutdown & RCV_SHUTDOWN ||
	    vsk->peer_shutdown & SEND_SHUTDOWN) {
		mask |= POLLRDHUP;
	}

	if (sock->type == SOCK_DGRAM) {
		/* For datagram sockets we can read if there is something in
		 * the queue and write as long as the socket isn't shutdown for
		 * sending.
		 */
		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
			mask |= POLLIN | POLLRDNORM;
		}

		if (!(sk->sk_shutdown & SEND_SHUTDOWN))
			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	} else if (sock->type == SOCK_STREAM) {
		lock_sock(sk);

		/* Listening sockets that have connections in their accept
		 * queue can be read.
		 */
		if (sk->sk_state == VSOCK_SS_LISTEN
		    && !vsock_is_accept_queue_empty(sk))
			mask |= POLLIN | POLLRDNORM;

		/* If there is something in the queue then we can read. */
		if (transport->stream_is_active(vsk) &&
		    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			bool data_ready_now = false;
			int ret = transport->notify_poll_in(
					vsk, 1, &data_ready_now);
			if (ret < 0) {
				mask |= POLLERR;
			} else {
				if (data_ready_now)
					mask |= POLLIN | POLLRDNORM;

			}
		}

		/* Sockets whose connections have been closed, reset, or
		 * terminated should also be considered read, and we check the
		 * shutdown flag for that.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN ||
		    vsk->peer_shutdown & SEND_SHUTDOWN) {
			mask |= POLLIN | POLLRDNORM;
		}

		/* Connected sockets that can produce data can be written. */
		if (sk->sk_state == SS_CONNECTED) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
				bool space_avail_now = false;
				int ret = transport->notify_poll_out(
						vsk, 1, &space_avail_now);
				if (ret < 0) {
					mask |= POLLERR;
				} else {
					if (space_avail_now)
						/* Remove POLLWRBAND since INET
						 * sockets are not setting it.
						 */
						mask |= POLLOUT | POLLWRNORM;

				}
			}
		}

		/* Simulate INET socket poll behaviors, which sets
		 * POLLOUT|POLLWRNORM when peer is closed and nothing to read,
		 * but local send is not shutdown.
		 */
		if (sk->sk_state == SS_UNCONNECTED) {
			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
				mask |= POLLOUT | POLLWRNORM;

		}

		release_sock(sk);
	}

	return mask;
}

static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
			       size_t len)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/* For now, MSG_DONTWAIT is always assumed... */
	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	/* If the provided message contains an address, use that. Otherwise
	 * fall back on the socket's remote handle (if it has been connected).
	 */
	if (msg->msg_name &&
	    vsock_addr_cast(msg->msg_name, msg->msg_namelen,
			    &remote_addr) == 0) {
		/* Ensure this address is of the right type and is a valid
		 * destination.
		 */

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		if (!vsock_addr_bound(remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else if (sock->state == SS_CONNECTED) {
		remote_addr = &vsk->remote_addr;

		if (remote_addr->svm_cid == VMADDR_CID_ANY)
			remote_addr->svm_cid = transport->get_local_cid();

		/* XXX Should connect() or this function ensure remote_addr is
		 * bound?
		 */
		if (!vsock_addr_bound(&vsk->remote_addr)) {
			err = -EINVAL;
			goto out;
		}
	} else {
		err = -EINVAL;
		goto out;
	}

	if (!transport->dgram_allow(remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	err = transport->dgram_enqueue(vsk, remote_addr, msg, len);

out:
	release_sock(sk);
	return err;
}

static int vsock_dgram_connect(struct socket *sock,
			       struct sockaddr *addr, int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;

	sk = sock->sk;
	vsk = vsock_sk(sk);

	err = vsock_addr_cast(addr, addr_len, &remote_addr);
	if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
		lock_sock(sk);
		vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
				VMADDR_PORT_ANY);
		sock->state = SS_UNCONNECTED;
		release_sock(sk);
		return 0;
	} else if (err != 0)
		return -EINVAL;

	lock_sock(sk);

	err = vsock_auto_bind(vsk);
	if (err)
		goto out;

	if (!transport->dgram_allow(remote_addr->svm_cid,
				    remote_addr->svm_port)) {
		err = -EINVAL;
		goto out;
	}

	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
	sock->state = SS_CONNECTED;

out:
	release_sock(sk);
	return err;
}

static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t len, int flags)
{
	return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags);
}

static const struct proto_ops vsock_dgram_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_dgram_connect,
	.socketpair = sock_no_socketpair,
	.accept = sock_no_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = sock_no_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = sock_no_setsockopt,
	.getsockopt = sock_no_getsockopt,
	.sendmsg = vsock_dgram_sendmsg,
	.recvmsg = vsock_dgram_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
};

static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
	if (!transport->cancel_pkt)
		return -EOPNOTSUPP;

	return transport->cancel_pkt(vsk);
}

static void vsock_connect_timeout(struct work_struct *work)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	int cancel = 0;

	vsk = container_of(work, struct vsock_sock, connect_work.work);
	sk = sk_vsock(vsk);

	lock_sock(sk);
	if (sk->sk_state == SS_CONNECTING &&
	    (sk->sk_shutdown != SHUTDOWN_MASK)) {
		sk->sk_state = SS_UNCONNECTED;
		sk->sk_err = ETIMEDOUT;
		sk->sk_error_report(sk);
		cancel = 1;
	}
	release_sock(sk);
	if (cancel)
		vsock_transport_cancel_pkt(vsk);

	sock_put(sk);
}

static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
				int addr_len, int flags)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	struct sockaddr_vm *remote_addr;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	/* XXX AF_UNSPEC should make us disconnect like AF_INET. */
	switch (sock->state) {
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_DISCONNECTING:
		err = -EINVAL;
		goto out;
	case SS_CONNECTING:
		/* This continues on so we can move sock into the SS_CONNECTED
		 * state once the connection has completed (at which point err
		 * will be set to zero also). Otherwise, we will either wait
		 * for the connection or return -EALREADY should this be a
		 * non-blocking call.
		 */
		err = -EALREADY;
		break;
	default:
		if ((sk->sk_state == VSOCK_SS_LISTEN) ||
		    vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
			err = -EINVAL;
			goto out;
		}

		/* The hypervisor and well-known contexts do not have socket
		 * endpoints.
		 */
		if (!transport->stream_allow(remote_addr->svm_cid,
					     remote_addr->svm_port)) {
			err = -ENETUNREACH;
			goto out;
		}

		/* Set the remote address that we are connecting to. */
		memcpy(&vsk->remote_addr, remote_addr,
		       sizeof(vsk->remote_addr));

		err = vsock_auto_bind(vsk);
		if (err)
			goto out;

		sk->sk_state = SS_CONNECTING;

		err = transport->connect(vsk);
		if (err < 0)
			goto out;

		/* Mark sock as connecting and set the error code to in
		 * progress in case this is a non-blocking connect.
		 */
		sock->state = SS_CONNECTING;
		err = -EINPROGRESS;
	}

	/* The receive path will handle all communication until we are able to
	 * enter the connected state. Here we wait for the connection to be
	 * completed or a notification of an error.
	 */
	timeout = vsk->connect_timeout;
	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) {
		if (flags & O_NONBLOCK) {
			/* If we're not going to block, we schedule a timeout
			 * function to generate a timeout on the connection
			 * attempt, in case the peer doesn't respond in a
			 * timely manner. We hold on to the socket until the
			 * timeout fires.
			 */
			sock_hold(sk);
			schedule_delayed_work(&vsk->connect_work, timeout);

			/* Skip ahead to preserve error code set above. */
			goto out_wait;
		}

		release_sock(sk);
		timeout = schedule_timeout(timeout);
		lock_sock(sk);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			sk->sk_state = SS_UNCONNECTED;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			goto out_wait;
		} else if (timeout == 0) {
			err = -ETIMEDOUT;
			sk->sk_state = SS_UNCONNECTED;
			sock->state = SS_UNCONNECTED;
			vsock_transport_cancel_pkt(vsk);
			goto out_wait;
		}

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	}

	if (sk->sk_err) {
		err = -sk->sk_err;
		sk->sk_state = SS_UNCONNECTED;
		sock->state = SS_UNCONNECTED;
	} else {
		err = 0;
	}

out_wait:
	finish_wait(sk_sleep(sk), &wait);
out:
	release_sock(sk);
	return err;
}

static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *listener;
	int err;
	struct sock *connected;
	struct vsock_sock *vconnected;
	long timeout;
	DEFINE_WAIT(wait);

	err = 0;
	listener = sock->sk;

	lock_sock(listener);

	if (sock->type != SOCK_STREAM) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (listener->sk_state != VSOCK_SS_LISTEN) {
		err = -EINVAL;
		goto out;
	}

	/* Wait for child sockets to appear; these are the new sockets created
	 * upon connection establishment.
	 */
	timeout = sock_sndtimeo(listener, flags & O_NONBLOCK);
	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);

	while ((connected = vsock_dequeue_accept(listener)) == NULL &&
	       listener->sk_err == 0) {
		release_sock(listener);
		timeout = schedule_timeout(timeout);
		finish_wait(sk_sleep(listener), &wait);
		lock_sock(listener);

		if (signal_pending(current)) {
			err = sock_intr_errno(timeout);
			goto out;
		} else if (timeout == 0) {
			err = -EAGAIN;
			goto out;
		}

		prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
	}
	finish_wait(sk_sleep(listener), &wait);

	if (listener->sk_err)
		err = -listener->sk_err;

	if (connected) {
		listener->sk_ack_backlog--;

		lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
		vconnected = vsock_sk(connected);

		/* If the listener socket has received an error, then we should
		 * reject this socket and return. Note that we simply mark the
		 * socket rejected, drop our reference, and let the cleanup
		 * function handle the cleanup; the fact that we found it in
		 * the listener's accept queue guarantees that the cleanup
		 * function hasn't run yet.
		 */
		if (err) {
			vconnected->rejected = true;
		} else {
			newsock->state = SS_CONNECTED;
			sock_graft(connected, newsock);
		}

		release_sock(connected);
		sock_put(connected);
	}

out:
	release_sock(listener);
	return err;
}

static int vsock_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;

	sk = sock->sk;

	lock_sock(sk);

	if (sock->type != SOCK_STREAM) {
		err = -EOPNOTSUPP;
		goto out;
	}

	if (sock->state != SS_UNCONNECTED) {
		err = -EINVAL;
		goto out;
	}

	vsk = vsock_sk(sk);

	if (!vsock_addr_bound(&vsk->local_addr)) {
		err = -EINVAL;
		goto out;
	}

	sk->sk_max_ack_backlog = backlog;
	sk->sk_state = VSOCK_SS_LISTEN;

	err = 0;

out:
	release_sock(sk);
	return err;
}

static int vsock_stream_setsockopt(struct socket *sock,
				   int level,
				   int optname,
				   char __user *optval,
				   unsigned int optlen)
{
	int err;
	struct sock *sk;
	struct vsock_sock *vsk;
	u64 val;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

#define COPY_IN(_v)							\
	do {								\
		if (optlen < sizeof(_v)) {				\
			err = -EINVAL;					\
			goto exit;					\
		}							\
		if (copy_from_user(&_v, optval, sizeof(_v)) != 0) {	\
			err = -EFAULT;					\
			goto exit;					\
		}							\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	lock_sock(sk);

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		COPY_IN(val);
		transport->set_buffer_size(vsk, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		COPY_IN(val);
		transport->set_max_buffer_size(vsk, val);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		COPY_IN(val);
		transport->set_min_buffer_size(vsk, val);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
		struct timeval tv;
		COPY_IN(tv);
		if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
		    tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
			vsk->connect_timeout = tv.tv_sec * HZ +
			    DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ));
			if (vsk->connect_timeout == 0)
				vsk->connect_timeout =
				    VSOCK_DEFAULT_CONNECT_TIMEOUT;

		} else {
			err = -ERANGE;
		}
		break;
	}
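
	/* For example (illustrative values only): with HZ == 250, a timeout of
	 * { .tv_sec = 2, .tv_usec = 500000 } converts above to
	 * 2 * 250 + DIV_ROUND_UP(500000, 4000) == 625 jiffies.
	 */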

	default:
		err = -ENOPROTOOPT;
		break;
	}

#undef COPY_IN

exit:
	release_sock(sk);
	return err;
}

static int vsock_stream_getsockopt(struct socket *sock,
				   int level, int optname,
				   char __user *optval,
				   int __user *optlen)
{
	int err;
	int len;
	struct sock *sk;
	struct vsock_sock *vsk;
	u64 val;

	if (level != AF_VSOCK)
		return -ENOPROTOOPT;

	err = get_user(len, optlen);
	if (err != 0)
		return err;

#define COPY_OUT(_v)						\
	do {							\
		if (len < sizeof(_v))				\
			return -EINVAL;				\
								\
		len = sizeof(_v);				\
		if (copy_to_user(optval, &_v, len) != 0)	\
			return -EFAULT;				\
								\
	} while (0)

	err = 0;
	sk = sock->sk;
	vsk = vsock_sk(sk);

	switch (optname) {
	case SO_VM_SOCKETS_BUFFER_SIZE:
		val = transport->get_buffer_size(vsk);
		COPY_OUT(val);
		break;

	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
		val = transport->get_max_buffer_size(vsk);
		COPY_OUT(val);
		break;

	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
		val = transport->get_min_buffer_size(vsk);
		COPY_OUT(val);
		break;

	case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
		struct timeval tv;
		tv.tv_sec = vsk->connect_timeout / HZ;
		tv.tv_usec =
		    (vsk->connect_timeout -
		     tv.tv_sec * HZ) * (1000000 / HZ);
		COPY_OUT(tv);
		break;
	}
	default:
		return -ENOPROTOOPT;
	}

	err = put_user(len, optlen);
	if (err != 0)
		return -EFAULT;

#undef COPY_OUT

	return 0;
}

static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
				size_t len)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	ssize_t total_written;
	long timeout;
	int err;
	struct vsock_transport_send_notify_data send_data;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	total_written = 0;
	err = 0;

	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	lock_sock(sk);

	/* Callers should not provide a destination with stream sockets. */
	if (msg->msg_namelen) {
		err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP;
		goto out;
	}

	/* Send data only if both sides are not shutdown in the direction. */
	if (sk->sk_shutdown & SEND_SHUTDOWN ||
	    vsk->peer_shutdown & RCV_SHUTDOWN) {
		err = -EPIPE;
		goto out;
	}

	if (sk->sk_state != SS_CONNECTED ||
	    !vsock_addr_bound(&vsk->local_addr)) {
		err = -ENOTCONN;
		goto out;
	}

	if (!vsock_addr_bound(&vsk->remote_addr)) {
		err = -EDESTADDRREQ;
		goto out;
	}

	/* Wait for room in the produce queue to enqueue our user's data. */
	timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

	err = transport->notify_send_init(vsk, &send_data);
	if (err < 0)
		goto out;

	while (total_written < len) {
		ssize_t written;

		add_wait_queue(sk_sleep(sk), &wait);
		while (vsock_stream_has_space(vsk) == 0 &&
		       sk->sk_err == 0 &&
		       !(sk->sk_shutdown & SEND_SHUTDOWN) &&
		       !(vsk->peer_shutdown & RCV_SHUTDOWN)) {

			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			err = transport->notify_send_pre_block(vsk, &send_data);
			if (err < 0) {
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}

			release_sock(sk);
			timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
			lock_sock(sk);
			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			} else if (timeout == 0) {
				err = -EAGAIN;
				remove_wait_queue(sk_sleep(sk), &wait);
				goto out_err;
			}
		}
		remove_wait_queue(sk_sleep(sk), &wait);

		/* These checks occur both as part of and after the loop
		 * conditional since we need to check before and after
		 * sleeping.
		 */
		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
			   (vsk->peer_shutdown & RCV_SHUTDOWN)) {
			err = -EPIPE;
			goto out_err;
		}

		err = transport->notify_send_pre_enqueue(vsk, &send_data);
		if (err < 0)
			goto out_err;

		/* Note that enqueue will only write as many bytes as are free
		 * in the produce queue, so we don't need to ensure len is
		 * smaller than the queue size. It is the caller's
		 * responsibility to check how many bytes we were able to send.
		 */

		written = transport->stream_enqueue(
				vsk, msg,
				len - total_written);
		if (written < 0) {
			err = -ENOMEM;
			goto out_err;
		}

		total_written += written;

		err = transport->notify_send_post_enqueue(
				vsk, written, &send_data);
		if (err < 0)
			goto out_err;

	}

out_err:
	if (total_written > 0)
		err = total_written;
out:
	release_sock(sk);
	return err;
}


static int
vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		     int flags)
{
	struct sock *sk;
	struct vsock_sock *vsk;
	int err;
	size_t target;
	ssize_t copied;
	long timeout;
	struct vsock_transport_recv_notify_data recv_data;

	DEFINE_WAIT(wait);

	sk = sock->sk;
	vsk = vsock_sk(sk);
	err = 0;

	lock_sock(sk);

	if (sk->sk_state != SS_CONNECTED) {
		/* Recvmsg is supposed to return 0 if a peer performs an
		 * orderly shutdown. Differentiate between that case and when a
		 * peer has not connected or a local shutdown occurred with the
		 * SOCK_DONE flag.
		 */
		if (sock_flag(sk, SOCK_DONE))
			err = 0;
		else
			err = -ENOTCONN;

		goto out;
	}

	if (flags & MSG_OOB) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* We don't check peer_shutdown flag here since peer may actually shut
	 * down, but there can be data in the queue that a local socket can
	 * receive.
	 */
	if (sk->sk_shutdown & RCV_SHUTDOWN) {
		err = 0;
		goto out;
	}

	/* It is valid on Linux to pass in a zero-length receive buffer. This
	 * is not an error. We may as well bail out now.
	 */
	if (!len) {
		err = 0;
		goto out;
	}

	/* We must not copy less than target bytes into the user's buffer
	 * before returning successfully, so we wait for the consume queue to
	 * have that much data to consume before dequeueing. Note that this
	 * makes it impossible to handle cases where target is greater than the
	 * queue size.
	 */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
	if (target >= transport->stream_rcvhiwat(vsk)) {
		err = -ENOMEM;
		goto out;
	}
	timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
	copied = 0;

	err = transport->notify_recv_init(vsk, target, &recv_data);
	if (err < 0)
		goto out;

	while (1) {
		s64 ready;

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		ready = vsock_stream_has_data(vsk);

		if (ready == 0) {
			if (sk->sk_err != 0 ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    (vsk->peer_shutdown & SEND_SHUTDOWN)) {
				finish_wait(sk_sleep(sk), &wait);
				break;
			}
			/* Don't wait for non-blocking sockets. */
			if (timeout == 0) {
				err = -EAGAIN;
				finish_wait(sk_sleep(sk), &wait);
				break;
			}

			err = transport->notify_recv_pre_block(
					vsk, target, &recv_data);
			if (err < 0) {
				finish_wait(sk_sleep(sk), &wait);
				break;
			}
			release_sock(sk);
			timeout = schedule_timeout(timeout);
			lock_sock(sk);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeout);
				finish_wait(sk_sleep(sk), &wait);
				break;
			} else if (timeout == 0) {
				err = -EAGAIN;
				finish_wait(sk_sleep(sk), &wait);
				break;
			}
		} else {
			ssize_t read;

			finish_wait(sk_sleep(sk), &wait);

			if (ready < 0) {
				/* Invalid queue pair content. XXX This should
				 * be changed to a connection reset in a later
				 * change.
				 */

				err = -ENOMEM;
				goto out;
			}

			err = transport->notify_recv_pre_dequeue(
					vsk, target, &recv_data);
			if (err < 0)
				break;

			read = transport->stream_dequeue(
					vsk, msg,
					len - copied, flags);
			if (read < 0) {
				err = -ENOMEM;
				break;
			}

			copied += read;

			err = transport->notify_recv_post_dequeue(
					vsk, target, read,
					!(flags & MSG_PEEK), &recv_data);
			if (err < 0)
				goto out;

			if (read >= target || flags & MSG_PEEK)
				break;

			target -= read;
		}
	}

	if (sk->sk_err)
		err = -sk->sk_err;
	else if (sk->sk_shutdown & RCV_SHUTDOWN)
		err = 0;

	if (copied > 0)
		err = copied;

out:
	release_sock(sk);
	return err;
}

static const struct proto_ops vsock_stream_ops = {
	.family = PF_VSOCK,
	.owner = THIS_MODULE,
	.release = vsock_release,
	.bind = vsock_bind,
	.connect = vsock_stream_connect,
	.socketpair = sock_no_socketpair,
	.accept = vsock_accept,
	.getname = vsock_getname,
	.poll = vsock_poll,
	.ioctl = sock_no_ioctl,
	.listen = vsock_listen,
	.shutdown = vsock_shutdown,
	.setsockopt = vsock_stream_setsockopt,
	.getsockopt = vsock_stream_getsockopt,
	.sendmsg = vsock_stream_sendmsg,
	.recvmsg = vsock_stream_recvmsg,
	.mmap = sock_no_mmap,
	.sendpage = sock_no_sendpage,
};

static int vsock_create(struct net *net, struct socket *sock,
			int protocol, int kern)
{
	if (!sock)
		return -EINVAL;

	if (protocol && protocol != PF_VSOCK)
		return -EPROTONOSUPPORT;

	switch (sock->type) {
	case SOCK_DGRAM:
		sock->ops = &vsock_dgram_ops;
		break;
	case SOCK_STREAM:
		sock->ops = &vsock_stream_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	sock->state = SS_UNCONNECTED;

	return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM;
}

static const struct net_proto_family vsock_family_ops = {
	.family = AF_VSOCK,
	.create = vsock_create,
	.owner = THIS_MODULE,
};

static long vsock_dev_do_ioctl(struct file *filp,
			       unsigned int cmd, void __user *ptr)
{
	u32 __user *p = ptr;
	int retval = 0;

	switch (cmd) {
	case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
		if (put_user(transport->get_local_cid(), p) != 0)
			retval = -EFAULT;
		break;

	default:
		pr_err("Unknown ioctl %d\n", cmd);
		retval = -EINVAL;
	}

	return retval;
}

static long vsock_dev_ioctl(struct file *filp,
			    unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
}

#ifdef CONFIG_COMPAT
static long vsock_dev_compat_ioctl(struct file *filp,
				   unsigned int cmd, unsigned long arg)
{
	return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg));
}
#endif

static const struct file_operations vsock_device_ops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vsock_dev_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vsock_dev_compat_ioctl,
#endif
	.open		= nonseekable_open,
};

static struct miscdevice vsock_device = {
	.name		= "vsock",
	.fops		= &vsock_device_ops,
};

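/* Userspace can retrieve the local CID through this misc device; a minimal
 * sketch (not part of this file; error handling omitted):
 *
 *	int fd = open("/dev/vsock", O_RDONLY);
 *	unsigned int cid;
 *	ioctl(fd, IOCTL_VM_SOCKETS_GET_LOCAL_CID, &cid);
 */
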
int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
{
	int err = mutex_lock_interruptible(&vsock_register_mutex);

	if (err)
		return err;

	if (transport) {
		err = -EBUSY;
		goto err_busy;
	}

	/* Transport must be the owner of the protocol so that it can't
	 * unload while there are open sockets.
	 */
	vsock_proto.owner = owner;
	transport = t;

	vsock_init_tables();

	vsock_device.minor = MISC_DYNAMIC_MINOR;
	err = misc_register(&vsock_device);
	if (err) {
		pr_err("Failed to register misc device\n");
		goto err_reset_transport;
	}

	err = proto_register(&vsock_proto, 1);	/* we want our slab */
	if (err) {
		pr_err("Cannot register vsock protocol\n");
		goto err_deregister_misc;
	}

	err = sock_register(&vsock_family_ops);
	if (err) {
		pr_err("could not register af_vsock (%d) address family: %d\n",
		       AF_VSOCK, err);
		goto err_unregister_proto;
	}

	mutex_unlock(&vsock_register_mutex);
	return 0;

err_unregister_proto:
	proto_unregister(&vsock_proto);
err_deregister_misc:
	misc_deregister(&vsock_device);
err_reset_transport:
	transport = NULL;
err_busy:
	mutex_unlock(&vsock_register_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(__vsock_core_init);

void vsock_core_exit(void)
{
	mutex_lock(&vsock_register_mutex);

	misc_deregister(&vsock_device);
	sock_unregister(AF_VSOCK);
	proto_unregister(&vsock_proto);

	/* We do not want the assignment below re-ordered. */
	mb();
	transport = NULL;

	mutex_unlock(&vsock_register_mutex);
}
EXPORT_SYMBOL_GPL(vsock_core_exit);

const struct vsock_transport *vsock_core_get_transport(void)
{
	/* vsock_register_mutex not taken since only the transport uses this
	 * function and only while registered.
	 */
	return transport;
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);

MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.2.0-k");
MODULE_LICENSE("GPL v2");