blob: 44f4f3898bbea83e95ec792d0d535705524ec3ed [file] [log] [blame]
Sudeep Dutt7df20f22015-04-29 05:32:28 -07001/*
2 * Intel MIC Platform Software Stack (MPSS)
3 *
4 * This file is provided under a dual BSD/GPLv2 license. When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * BSD LICENSE
21 *
22 * Copyright(c) 2014 Intel Corporation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 *
28 * * Redistributions of source code must retain the above copyright
29 * notice, this list of conditions and the following disclaimer.
30 * * Redistributions in binary form must reproduce the above copyright
31 * notice, this list of conditions and the following disclaimer in
32 * the documentation and/or other materials provided with the
33 * distribution.
34 * * Neither the name of Intel Corporation nor the names of its
35 * contributors may be used to endorse or promote products derived
36 * from this software without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
39 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
40 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
41 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
42 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
45 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
46 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
47 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
48 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
49 *
50 * Intel SCIF driver.
51 *
52 */
53#ifndef __SCIF_H__
54#define __SCIF_H__
55
56#include <linux/types.h>
57#include <linux/poll.h>
58#include <linux/scif_ioctl.h>
59
60#define SCIF_ACCEPT_SYNC 1 /* scif_accept(): block until a connection request is presented */
61#define SCIF_SEND_BLOCK 1 /* scif_send(): block until the entire message is sent */
62#define SCIF_RECV_BLOCK 1 /* scif_recv(): block until the entire message is received */
63
64enum { /* prot_flags values for scif_register(); OR'ed together */
65 SCIF_PROT_READ = (1 << 0), /* allow read operations from the window */
66 SCIF_PROT_WRITE = (1 << 1) /* allow write operations to the window */
67};
68
69enum { /* mapping flags; SCIF_MAP_FIXED is a map_flags value for scif_register() */
70 SCIF_MAP_FIXED = 0x10, /* place the window exactly at the caller-supplied offset */
71 SCIF_MAP_KERNEL = 0x20, /* NOTE(review): not described in this part of the header - confirm usage */
72};
73
74enum { /* NOTE(review): presumably flags for the scif_fence_*() calls, which are documented outside this part of the header - confirm */
75 SCIF_FENCE_INIT_SELF = (1 << 0),
76 SCIF_FENCE_INIT_PEER = (1 << 1),
77 SCIF_SIGNAL_LOCAL = (1 << 4),
78 SCIF_SIGNAL_REMOTE = (1 << 5)
79};
80
81enum { /* rma_flags values for the RMA copy calls such as scif_readfrom()/scif_writeto(); OR'ed together */
82 SCIF_RMA_USECPU = (1 << 0), /* perform the transfer using the CPU, otherwise use the DMA engine */
83 SCIF_RMA_USECACHE = (1 << 1), /* NOTE(review): not described in this part of the header - confirm usage */
84 SCIF_RMA_SYNC = (1 << 2), /* perform the transfer synchronously, returning after it has completed */
85 SCIF_RMA_ORDERED = (1 << 3) /* last (partial) cacheline becomes visible after all other transferred data */
86};
87
88/* End of SCIF Admin Reserved Ports; scif_bind() documents ports less than 1024 as bindable only by privileged processes */
89#define SCIF_ADMIN_PORT_END 1024
90
91/* End of SCIF Reserved Ports; scif_bind() with a port number of zero assigns a port >= SCIF_PORT_RSVD */
92#define SCIF_PORT_RSVD 1088
93
94typedef struct scif_endpt *scif_epd_t; /* endpoint descriptor, as returned by scif_open(); struct scif_endpt is not defined in this part of the header */
95
96#define SCIF_OPEN_FAILED ((scif_epd_t)-1) /* user-mode failure return of scif_open() */
97#define SCIF_REGISTER_FAILED ((off_t)-1) /* user-mode failure return of scif_register() */
98#define SCIF_MMAP_FAILED ((void *)-1) /* NOTE(review): presumably the failure return of the SCIF mmap call, which is not in this part of the header - confirm */
99
100/**
101 * scif_open() - Create an endpoint
102 *
103 * Return:
104 * Upon successful completion, scif_open() returns an endpoint descriptor to
105 * be used in subsequent SCIF function calls to refer to that endpoint;
106 * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
107 * returned and errno is set to indicate the error; in kernel mode a NULL
108 * scif_epd_t is returned.
109 *
110 * Errors:
111 * ENOMEM - Insufficient kernel memory was available
112 */
113scif_epd_t scif_open(void);
114
115/**
116 * scif_bind() - Bind an endpoint to a port
117 * @epd: endpoint descriptor
118 * @pn: port number, or zero to have a port assigned
119 *
120 * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
121 * local node. If pn is zero, a port number greater than or equal to
122 * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
123 * exactly one local port. A requested port less than 1024 can only be bound
124 * by system (or root) processes or by processes executed by privileged users.
125 *
126 * Return:
127 * Upon successful completion, scif_bind() returns the port number to which epd
128 * is bound; otherwise in user mode -1 is returned and errno is set to
129 * indicate the error; in kernel mode the negative of one of the following
130 * errors is returned.
131 *
132 * Errors:
133 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
134 * EINVAL - The endpoint or the port is already bound
135 * EISCONN - The endpoint is already connected
136 * ENOSPC - No port number available for assignment
137 * EACCES - The port requested is protected and the user is not the superuser
138 */
139int scif_bind(scif_epd_t epd, u16 pn);
140
141/**
142 * scif_listen() - Listen for connections on an endpoint
143 * @epd: endpoint descriptor
144 * @backlog: maximum number of pending connection requests
145 *
146 * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
147 * an endpoint that will be used to accept incoming connection requests. Once
148 * so marked, the endpoint is said to be in the listening state and may not be
149 * used as the endpoint of a connection.
150 *
151 * The endpoint, epd, must have been bound to a port.
152 *
153 * The backlog argument defines the maximum length to which the queue of
154 * pending connections for epd may grow. If a connection request arrives when
155 * the queue is full, the client may receive an error with an indication that
156 * the connection was refused.
157 *
158 * Return:
159 * Upon successful completion, scif_listen() returns 0; otherwise in user mode
160 * -1 is returned and errno is set to indicate the error; in kernel mode the
161 * negative of one of the following errors is returned.
162 *
163 * Errors:
164 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
165 * EINVAL - The endpoint is not bound to a port
166 * EISCONN - The endpoint is already connected or listening
167 */
168int scif_listen(scif_epd_t epd, int backlog);
169
170/**
171 * scif_connect() - Initiate a connection on a port
172 * @epd: endpoint descriptor
173 * @dst: global id of port to which to connect
174 *
175 * The scif_connect() function requests the connection of endpoint epd to remote
176 * port dst. If the connection is successful, a peer endpoint, bound to dst, is
177 * created on node dst.node. On successful return, the connection is complete.
178 *
179 * If the endpoint epd has not already been bound to a port, scif_connect()
180 * will bind it to an unused local port.
181 *
182 * A connection is terminated when an endpoint of the connection is closed,
183 * either explicitly by scif_close(), or when a process that owns one of the
184 * endpoints of the connection is terminated.
185 *
186 * In user space, scif_connect() supports an asynchronous connection mode
187 * if the application has set the O_NONBLOCK flag on the endpoint via the
188 * fcntl() system call. Setting this flag results in the calling process
189 * not waiting during scif_connect().
190 *
191 * Return:
192 * Upon successful completion, scif_connect() returns the port ID to which the
193 * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is
194 * set to indicate the error; in kernel mode the negative of one of the
195 * following errors is returned.
196 *
197 * Errors:
198 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
199 * ECONNREFUSED - The destination was not listening for connections or refused
200 * the connection request
201 * EINVAL - dst.port is not a valid port ID
202 * EISCONN - The endpoint is already connected
203 * ENOMEM - No buffer space is available
204 * ENODEV - The destination node does not exist, or the node is lost, or existed
205 * but is not currently in the network since it may have crashed
206 * ENOSPC - No port number available for assignment
207 * EOPNOTSUPP - The endpoint is listening and cannot be connected
208 */
209int scif_connect(scif_epd_t epd, struct scif_port_id *dst);
210
211/**
212 * scif_accept() - Accept a connection on an endpoint
213 * @epd: endpoint descriptor
214 * @peer: global id of port to which connected
215 * @newepd: new connected endpoint descriptor
216 * @flags: flags
217 *
218 * The scif_accept() call extracts the first connection request from the queue
219 * of pending connections for the port on which epd is listening. scif_accept()
220 * creates a new endpoint, bound to the same port as epd, and allocates a new
221 * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
222 * endpoint is connected to the endpoint through which the connection was
223 * requested. epd is unaffected by this call, and remains in the listening
224 * state.
225 *
226 * On successful return, peer holds the global port identifier (node id and
227 * local port number) of the port which requested the connection.
228 *
229 * A connection is terminated when an endpoint of the connection is closed,
230 * either explicitly by scif_close(), or when a process that owns one of the
231 * endpoints of the connection is terminated.
232 *
233 * The number of connections that can (subsequently) be accepted on epd is only
234 * limited by system resources (memory).
235 *
236 * The flags argument is formed by OR'ing together zero or more of the
237 * following values.
238 * SCIF_ACCEPT_SYNC - block until a connection request is presented. If
239 * SCIF_ACCEPT_SYNC is not in flags, and no pending
240 * connections are present on the queue, scif_accept()
241 * fails with an EAGAIN error
242 *
243 * In user mode, the select() and poll() functions can be used to determine
244 * when there is a connection request. In kernel mode, the scif_poll()
245 * function may be used for this purpose. A readable event will be delivered
246 * when a connection is requested.
247 *
248 * Return:
249 * Upon successful completion, scif_accept() returns 0; otherwise in user mode
250 * -1 is returned and errno is set to indicate the error; in kernel mode the
251 * negative of one of the following errors is returned.
252 *
253 * Errors:
254 * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be
255 * accepted, or SCIF_ACCEPT_SYNC is not set and remote node failed to complete
256 * its connection request
257 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
258 * EINTR - Interrupted function
259 * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is
260 * NULL, or newepd is NULL
261 * ENODEV - The requesting node is lost, or existed but is not currently in the
262 * network since it may have crashed
263 * ENOMEM - Not enough space
264 * ENOENT - Secondary part of epd registration failed
265 */
266int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t
267 *newepd, int flags);
268
269/**
270 * scif_close() - Close an endpoint
271 * @epd: endpoint descriptor
272 *
273 * scif_close() closes an endpoint and performs necessary teardown of
274 * facilities associated with that endpoint.
275 *
276 * If epd is a listening endpoint then it will no longer accept connection
277 * requests on the port to which it is bound. Any pending connection requests
278 * are rejected.
279 *
280 * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
281 * which are in progress through epd or its peer endpoint will complete before
282 * scif_close() returns. Registered windows of the local and peer endpoints are
283 * released as if scif_unregister() was called against each window.
284 *
285 * Closing a SCIF endpoint does not affect local registered memory mapped by
286 * a SCIF endpoint on a remote node. The local memory remains mapped by the peer
287 * SCIF endpoint until explicitly removed by the peer calling munmap().
288 *
289 * If the peer endpoint's receive queue is not empty at the time that epd is
290 * closed, then the peer endpoint can be passed as the endpoint parameter to
291 * scif_recv() until the receive queue is empty.
292 *
293 * epd is freed and may no longer be accessed.
294 *
295 * Return:
296 * Upon successful completion, scif_close() returns 0; otherwise in user mode
297 * -1 is returned and errno is set to indicate the error; in kernel mode the
298 * negative of one of the following errors is returned.
299 *
300 * Errors:
301 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
302 */
303int scif_close(scif_epd_t epd);
304
305/**
306 * scif_send() - Send a message
307 * @epd: endpoint descriptor
308 * @msg: message buffer address
309 * @len: message length
310 * @flags: blocking mode flags
311 *
312 * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
313 * are copied from memory starting at address msg. On successful execution the
314 * return value of scif_send() is the number of bytes that were sent, and is
315 * zero if no bytes were sent because len was zero. scif_send() may be called
316 * only when the endpoint is in a connected state.
317 *
318 * If a scif_send() call is non-blocking, then it sends only those bytes which
319 * can be sent without waiting, up to a maximum of len bytes.
320 *
321 * If a scif_send() call is blocking, then it normally returns after sending
322 * all len bytes. If a blocking call is interrupted or the connection is
323 * reset, the call is considered successful if some bytes were sent or len is
324 * zero, otherwise the call is considered unsuccessful.
325 *
326 * In user mode, the select() and poll() functions can be used to determine
327 * when the send queue is not full. In kernel mode, the scif_poll() function
328 * may be used for this purpose.
329 *
330 * It is recommended that scif_send()/scif_recv() only be used for short
331 * control-type message communication between SCIF endpoints. The SCIF RMA
332 * APIs are expected to provide better performance for transfer sizes of
333 * 1024 bytes or longer for the current MIC hardware and software
334 * implementation.
335 *
336 * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
337 * is passed as the flags argument.
338 *
339 * Return:
340 * Upon successful completion, scif_send() returns the number of bytes sent;
341 * otherwise in user mode -1 is returned and errno is set to indicate the
342 * error; in kernel mode the negative of one of the following errors is
343 * returned.
344 *
345 * Errors:
346 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
347 * ECONNRESET - Connection reset by peer
348 * EFAULT - An invalid address was specified for a parameter
349 * EINVAL - flags is invalid, or len is negative
350 * ENODEV - The remote node is lost, or existed but is not currently in the
351 * network since it may have crashed
352 * ENOMEM - Not enough space
353 * ENOTCONN - The endpoint is not connected
354 */
355int scif_send(scif_epd_t epd, void *msg, int len, int flags);
356
357/**
358 * scif_recv() - Receive a message
359 * @epd: endpoint descriptor
360 * @msg: message buffer address
361 * @len: message buffer length
362 * @flags: blocking mode flags
363 *
364 * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
365 * data are copied to memory starting at address msg. On successful execution
366 * the return value of scif_recv() is the number of bytes that were received,
367 * and is zero if no bytes were received because len was zero. scif_recv() may
368 * be called only when the endpoint is in a connected state.
369 *
370 * If a scif_recv() call is non-blocking, then it receives only those bytes
371 * which can be received without waiting, up to a maximum of len bytes.
372 *
373 * If a scif_recv() call is blocking, then it normally returns after receiving
374 * all len bytes. If the blocking call was interrupted due to a disconnection,
375 * subsequent calls to scif_recv() will copy all bytes received up to the point
376 * of disconnection.
377 *
378 * In user mode, the select() and poll() functions can be used to determine
379 * when data is available to be received. In kernel mode, the scif_poll()
380 * function may be used for this purpose.
381 *
382 * It is recommended that scif_send()/scif_recv() only be used for short
383 * control-type message communication between SCIF endpoints. The SCIF RMA
384 * APIs are expected to provide better performance for transfer sizes of
385 * 1024 bytes or longer for the current MIC hardware and software
386 * implementation.
387 *
388 * scif_recv() will block until the entire message is received if
389 * SCIF_RECV_BLOCK is passed as the flags argument.
390 *
391 * Return:
392 * Upon successful completion, scif_recv() returns the number of bytes
393 * received; otherwise in user mode -1 is returned and errno is set to
394 * indicate the error; in kernel mode the negative of one of the following
395 * errors is returned.
396 *
397 * Errors:
398 * EAGAIN - The destination node is returning from a low power state
399 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
400 * ECONNRESET - Connection reset by peer
401 * EFAULT - An invalid address was specified for a parameter
402 * EINVAL - flags is invalid, or len is negative
403 * ENODEV - The remote node is lost, or existed but is not currently in the
404 * network since it may have crashed
405 * ENOMEM - Not enough space
406 * ENOTCONN - The endpoint is not connected
407 */
408int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
409
410/**
411 * scif_register() - Mark a memory region for remote access.
412 * @epd: endpoint descriptor
413 * @addr: starting virtual address
414 * @len: length of range
415 * @offset: offset of window
416 * @prot_flags: read/write protection flags
417 * @map_flags: mapping flags
418 *
419 * The scif_register() function opens a window, a range of whole pages of the
420 * registered address space of the endpoint epd, starting at offset po and
421 * continuing for len bytes. The value of po, further described below, is a
422 * function of the parameters offset and len, and the value of map_flags. Each
423 * page of the window represents the physical memory page which backs the
424 * corresponding page of the range of virtual address pages starting at addr
425 * and continuing for len bytes. addr and len are constrained to be multiples
426 * of the page size. A successful scif_register() call returns po.
427 *
428 * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
429 * exactly, and offset is constrained to be a multiple of the page size. The
430 * mapping established by scif_register() will not replace any existing
431 * registration; an error is returned if any page within the range [offset,
432 * offset + len - 1] intersects an existing window.
433 *
434 * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
435 * implementation-defined manner to arrive at po. The po value so chosen will
436 * be an area of the registered address space that the implementation deems
437 * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
438 * granting the implementation complete freedom in selecting po, subject to
439 * constraints described below. A non-zero value of offset is taken to be a
440 * suggestion of an offset near which the mapping should be placed. When the
441 * implementation selects a value for po, it does not replace any extant
442 * window. In all cases, po will be a multiple of the page size.
443 *
444 * The physical pages which are so represented by a window are available for
445 * access in calls to mmap(), scif_readfrom(), scif_writeto(),
446 * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
447 * physical pages represented by the window will not be reused by the memory
448 * subsystem for any other purpose. Note that the same physical page may be
449 * represented by multiple windows.
450 *
451 * Subsequent operations which change the memory pages to which virtual
452 * addresses are mapped (such as mmap(), munmap()) have no effect on
453 * existing windows.
454 *
455 * If the process will fork(), it is recommended that the registered
456 * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
457 * problems due to copy-on-write semantics.
458 *
459 * The prot_flags argument is formed by OR'ing together one or more of the
460 * following values.
461 * SCIF_PROT_READ - allow read operations from the window
462 * SCIF_PROT_WRITE - allow write operations to the window
463 *
464 * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a
465 * fixed offset.
466 *
467 * Return:
468 * Upon successful completion, scif_register() returns the offset at which the
469 * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
470 * is ((off_t)-1)) is returned and errno is set to indicate the error; in
471 * kernel mode the negative of one of the following errors is returned.
472 *
473 * Errors:
474 * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
475 * [offset, offset + len - 1] are already registered
476 * EAGAIN - The mapping could not be performed due to lack of resources
477 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
478 * ECONNRESET - Connection reset by peer
479 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
480 * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is
481 * set in map_flags and offset is not a multiple of the page size, or addr is
482 * not a multiple of the page size, or len is not a multiple of the page size
483 * or is 0, or offset is negative
484 * ENODEV - The remote node is lost, or existed but is not currently in the
485 * network since it may have crashed
486 * ENOMEM - Not enough space
487 * ENOTCONN - The endpoint is not connected
488 */
489off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
490 int prot_flags, int map_flags);
491
492/**
493 * scif_unregister() - Release a memory region registered for remote access.
494 * @epd: endpoint descriptor
495 * @offset: start of range to unregister
496 * @len: length of range to unregister
497 *
498 * The scif_unregister() function closes those previously registered windows
499 * which are entirely within the range [offset, offset + len - 1]. It is an
500 * error to specify a range which intersects only a subrange of a window.
501 *
502 * On a successful return, pages within the window may no longer be specified
503 * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
504 * scif_vwriteto(), scif_get_pages(), and scif_fence_signal(). The window,
505 * however, continues to exist until all previous references against it are
506 * removed. A window is referenced if there is a mapping to it created by
507 * mmap(), or if scif_get_pages() was called against the window
508 * (and the pages have not been returned via scif_put_pages()). A window is
509 * also referenced while an RMA, in which some range of the window is a source
510 * or destination, is in progress. Finally a window is referenced while some
511 * offset in that window was specified to scif_fence_signal(), and the RMAs
512 * marked by that call to scif_fence_signal() have not completed. While a
513 * window is in this state, its registered address space pages are not
514 * available for use in a new registered window.
515 *
516 * When all such references to the window have been removed, its references to
517 * all the physical pages which it represents are removed. Similarly, the
518 * registered address space pages of the window become available for
519 * registration in a new window.
520 *
521 * Return:
522 * Upon successful completion, scif_unregister() returns 0; otherwise in user
523 * mode -1 is returned and errno is set to indicate the error; in kernel mode
524 * the negative of one of the following errors is returned. In the event of an
525 * error, no windows are unregistered.
526 *
527 * Errors:
528 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
529 * ECONNRESET - Connection reset by peer
530 * EINVAL - The range [offset, offset + len - 1] intersects a subrange of a
531 * window, or offset is negative
532 * ENODEV - The remote node is lost, or existed but is not currently in the
533 * network since it may have crashed
534 * ENOTCONN - The endpoint is not connected
535 * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the
536 * registered address space of epd
537 */
538int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
539
540/**
541 * scif_readfrom() - Copy from a remote address space
542 * @epd: endpoint descriptor
543 * @loffset: offset in local registered address space to
544 * which to copy
545 * @len: length of range to copy
546 * @roffset: offset in remote registered address space
547 * from which to copy
548 * @rma_flags: transfer mode flags
549 *
550 * scif_readfrom() copies len bytes from the remote registered address space of
551 * the peer of endpoint epd, starting at the offset roffset to the local
552 * registered address space of epd, starting at the offset loffset.
553 *
554 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
555 * roffset + len - 1] must be within some registered window or windows of the
556 * local and remote nodes. A range may intersect multiple registered windows,
557 * but only if those windows are contiguous in the registered address space.
558 *
559 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
560 * programmed read/writes. Otherwise the data is copied using DMA. If rma_flags
561 * includes SCIF_RMA_SYNC, then scif_readfrom() will return after the transfer
562 * is complete. Otherwise, the transfer may be performed asynchronously. The
563 * order in which any two asynchronous RMA operations complete is
564 * non-deterministic. The synchronization functions, scif_fence_mark()/
565 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
566 * the completion of asynchronous RMA operations on the same endpoint.
567 *
568 * The DMA transfer of individual bytes is not guaranteed to complete in
569 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
570 * cacheline or partial cacheline of the source range will become visible on
571 * the destination node after all other transferred data in the source
572 * range has become visible on the destination node.
573 *
574 * The optimal DMA performance will likely be realized if both
575 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
576 * performance will likely be realized if loffset and roffset are not
577 * cacheline aligned but are separated by some multiple of 64. The lowest level
578 * of performance is likely if loffset and roffset are not separated by a
579 * multiple of 64.
580 *
581 * The rma_flags argument is formed by OR'ing together zero or more of the
582 * following values.
583 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
584 * engine.
585 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
586 * transfer has completed. Passing this flag results in the
587 * current implementation busy waiting and consuming CPU cycles
588 * while the DMA transfer is in progress for best performance by
589 * avoiding the interrupt latency.
590 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
591 * the source range becomes visible on the destination node
592 * after all other transferred data in the source range has
593 * become visible on the destination
594 *
595 * Return:
596 * Upon successful completion, scif_readfrom() returns 0; otherwise in user
597 * mode -1 is returned and errno is set to indicate the error; in kernel mode
598 * the negative of one of the following errors is returned.
599 *
600 * Errors:
601 * EACCES - Attempt to write to a read-only range
602 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
603 * ECONNRESET - Connection reset by peer
604 * EINVAL - rma_flags is invalid
605 * ENODEV - The remote node is lost, or existed but is not currently in the
606 * network since it may have crashed
607 * ENOTCONN - The endpoint is not connected
608 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
609 * address space of epd, or the range [roffset, roffset + len - 1] is invalid
610 * for the registered address space of the peer of epd, or loffset or roffset
611 * is negative
612 */
613int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
614 roffset, int rma_flags);
615
616/**
617 * scif_writeto() - Copy to a remote address space
618 * @epd: endpoint descriptor
619 * @loffset: offset in local registered address space
620 * from which to copy
621 * @len: length of range to copy
622 * @roffset: offset in remote registered address space to
623 * which to copy
624 * @rma_flags: transfer mode flags
625 *
626 * scif_writeto() copies len bytes from the local registered address space of
627 * epd, starting at the offset loffset to the remote registered address space
628 * of the peer of endpoint epd, starting at the offset roffset.
629 *
630 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
631 * roffset + len - 1] must be within some registered window or windows of the
632 * local and remote nodes. A range may intersect multiple registered windows,
633 * but only if those windows are contiguous in the registered address space.
634 *
635 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
636 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
637 * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the
638 * transfer is complete. Otherwise, the transfer may be performed asynchron-
639 * ously. The order in which any two asynchronous RMA operations complete
640 * is non-deterministic. The synchronization functions, scif_fence_mark()/
641 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
642 * the completion of asynchronous RMA operations on the same endpoint.
643 *
644 * The DMA transfer of individual bytes is not guaranteed to complete in
645 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
646 * cacheline or partial cacheline of the source range will become visible on
647 * the destination node after all other transferred data in the source
648 * range has become visible on the destination node.
649 *
650 * The optimal DMA performance will likely be realized if both
651 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
652 * performance will likely be realized if loffset and roffset are not cacheline
653 * aligned but are separated by some multiple of 64. The lowest level of
654 * performance is likely if loffset and roffset are not separated by a multiple
655 * of 64.
656 *
657 * The rma_flags argument is formed by ORing together zero or more of the
658 * following values.
659 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
660 * engine.
661 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
662 * transfer has completed. Passing this flag results in the
663 * current implementation busy waiting and consuming CPU cycles
664 * while the DMA transfer is in progress for best performance by
665 * avoiding the interrupt latency.
666 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
667 * the source range becomes visible on the destination node
668 * after all other transferred data in the source range has
669 * become visible on the destination
670 *
671 * Return:
672 * Upon successful completion, scif_writeto() returns 0; otherwise in user
673 * mode -1 is returned and errno is set to indicate the error; in kernel mode
674 * the negative of one of the following errors is returned.
675 *
676 * Errors:
677 * EACCES - Attempt to write to a read-only range
678 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
679 * ECONNRESET - Connection reset by peer
680 * EINVAL - rma_flags is invalid
681 * ENODEV - The remote node is lost or existed, but is not currently in the
682 * network since it may have crashed
683 * ENOTCONN - The endpoint is not connected
684 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
685 * address space of epd, or the range [roffset, roffset + len - 1] is invalid
686 * for the registered address space of the peer of epd, or loffset or roffset
687 * is negative
688 */
689int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
690 roffset, int rma_flags);
691
692/**
693 * scif_vreadfrom() - Copy from a remote address space
694 * @epd: endpoint descriptor
695 * @addr: local address to which to copy
696 * @len: length of range to copy, in bytes
697 * @roffset: offset in remote registered address space
698 * from which to copy
699 * @rma_flags: transfer mode flags
700 *
701 * scif_vreadfrom() copies len bytes from the remote registered address
702 * space of the peer of endpoint epd, starting at the offset roffset, to local
703 * memory, starting at addr.
704 *
705 * The specified range [roffset, roffset + len - 1] must be within some
706 * registered window or windows of the remote node. The range may
707 * intersect multiple registered windows, but only if those windows are
708 * contiguous in the registered address space.
709 *
710 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
711 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
712 * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the
713 * transfer is complete. Otherwise, the transfer may be performed asynchron-
714 * ously. The order in which any two asynchronous RMA operations complete
715 * is non-deterministic. The synchronization functions, scif_fence_mark()/
716 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
717 * the completion of asynchronous RMA operations on the same endpoint.
718 *
719 * The DMA transfer of individual bytes is not guaranteed to complete in
720 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
721 * cacheline or partial cacheline of the source range will become visible on
722 * the destination node after all other transferred data in the source
723 * range has become visible on the destination node.
724 *
725 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
726 * the specified local memory range may remain in a pinned state even after
727 * the specified transfer completes. This may reduce overhead if some or all of
728 * the same virtual address range is referenced in a subsequent call of
729 * scif_vreadfrom() or scif_vwriteto().
730 *
731 * The optimal DMA performance will likely be realized if both
732 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
733 * performance will likely be realized if addr and roffset are not
734 * cacheline aligned but are separated by some multiple of 64. The lowest level
735 * of performance is likely if addr and roffset are not separated by a
736 * multiple of 64.
737 *
738 * The rma_flags argument is formed by ORing together zero or more of the
739 * following values.
740 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
741 * engine.
742 * SCIF_RMA_USECACHE - enable registration caching
743 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
744 * transfer has completed. Passing this flag results in the
745 * current implementation busy waiting and consuming CPU cycles
746 * while the DMA transfer is in progress for best performance by
747 * avoiding the interrupt latency.
748 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
749 * the source range becomes visible on the destination node
750 * after all other transferred data in the source range has
751 * become visible on the destination node
752 *
753 * Return:
754 * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user
755 * mode -1 is returned and errno is set to indicate the error; in kernel mode
756 * the negative of one of the following errors is returned.
757 *
758 * Errors:
759 * EACCES - Attempt to write to a read-only range
760 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
761 * ECONNRESET - Connection reset by peer
762 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
763 * EINVAL - rma_flags is invalid
764 * ENODEV - The remote node is lost, or existed but is not currently in the
765 * network since it may have crashed
766 * ENOTCONN - The endpoint is not connected
767 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
768 * registered address space of the peer of epd
769 */
770int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
771 int rma_flags);
772
773/**
774 * scif_vwriteto() - Copy to a remote address space
775 * @epd: endpoint descriptor
776 * @addr: local address from which to copy
777 * @len: length of range to copy, in bytes
778 * @roffset: offset in remote registered address space to
779 * which to copy
780 * @rma_flags: transfer mode flags
781 *
782 * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
783 * the remote registered address space of the peer of endpoint epd, starting at
784 * the offset roffset.
785 *
786 * The specified range [roffset, roffset + len - 1] must be within some
787 * registered window or windows of the remote node. The range may intersect
788 * multiple registered windows, but only if those windows are contiguous in the
789 * registered address space.
790 *
791 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
792 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
793 * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the
794 * transfer is complete. Otherwise, the transfer may be performed asynchron-
795 * ously. The order in which any two asynchronous RMA operations complete
796 * is non-deterministic. The synchronization functions, scif_fence_mark()/
797 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
798 * the completion of asynchronous RMA operations on the same endpoint.
799 *
800 * The DMA transfer of individual bytes is not guaranteed to complete in
801 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
802 * cacheline or partial cacheline of the source range will become visible on
803 * the destination node after all other transferred data in the source
804 * range has become visible on the destination node.
805 *
806 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
807 * the specified local memory range may remain in a pinned state even after
808 * the specified transfer completes. This may reduce overhead if some or all of
809 * the same virtual address range is referenced in a subsequent call of
810 * scif_vreadfrom() or scif_vwriteto().
811 *
812 * The optimal DMA performance will likely be realized if both
813 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
814 * performance will likely be realized if addr and roffset are not cacheline
815 * aligned but are separated by some multiple of 64. The lowest level of
816 * performance is likely if addr and roffset are not separated by a multiple of
817 * 64.
818 *
819 * The rma_flags argument is formed by ORing together zero or more of the
820 * following values.
821 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
822 * engine.
823 * SCIF_RMA_USECACHE - allow registration caching
824 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
825 * transfer has completed. Passing this flag results in the
826 * current implementation busy waiting and consuming CPU cycles
827 * while the DMA transfer is in progress for best performance by
828 * avoiding the interrupt latency.
829 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
830 * the source range becomes visible on the destination node
831 * after all other transferred data in the source range has
832 * become visible on the destination node
833 *
834 * Return:
835 * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
836 * mode -1 is returned and errno is set to indicate the error; in kernel mode
837 * the negative of one of the following errors is returned.
838 *
839 * Errors:
840 * EACCES - Attempt to write to a read-only range
841 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
842 * ECONNRESET - Connection reset by peer
843 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
844 * EINVAL - rma_flags is invalid
845 * ENODEV - The remote node is lost, or existed but is not currently in the
846 * network since it may have crashed
847 * ENOTCONN - The endpoint is not connected
848 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
849 * registered address space of the peer of epd
850 */
851int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
852 int rma_flags);
853
854/**
855 * scif_fence_mark() - Mark previously issued RMAs
856 * @epd: endpoint descriptor
857 * @flags: SCIF_FENCE_INIT_SELF or SCIF_FENCE_INIT_PEER
858 * @mark: output; the mark value to pass to scif_fence_wait()
859 *
860 * scif_fence_mark() returns after marking the current set of all uncompleted
861 * RMAs initiated through the endpoint epd or the current set of all
862 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
863 * marked with a value returned at mark. The application may subsequently call
864 * scif_fence_wait(), passing the value returned at mark, to await completion
865 * of all RMAs so marked.
866 *
867 * The flags argument must be exactly one of the following values.
868 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
869 * epd are marked
870 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
871 * of endpoint epd are marked
872 *
873 * Return:
874 * Upon successful completion, scif_fence_mark() returns 0; otherwise in user
875 * mode -1 is returned and errno is set to indicate the error; in kernel mode
876 * the negative of one of the following errors is returned.
877 *
878 * Errors:
879 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
880 * ECONNRESET - Connection reset by peer
881 * EINVAL - flags is invalid
882 * ENODEV - The remote node is lost, or existed but is not currently in the
883 * network since it may have crashed
884 * ENOTCONN - The endpoint is not connected
885 * ENOMEM - Insufficient kernel memory was available
886 */
887int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
888
889/**
890 * scif_fence_wait() - Wait for completion of marked RMAs
891 * @epd: endpoint descriptor
892 * @mark: mark value obtained from a previous call to scif_fence_mark()
893 *
894 * scif_fence_wait() returns after all RMAs marked with mark have completed.
895 * The value passed in mark must have been obtained in a previous call to
896 * scif_fence_mark().
897 *
898 * Return:
899 * Upon successful completion, scif_fence_wait() returns 0; otherwise in user
900 * mode -1 is returned and errno is set to indicate the error; in kernel mode
901 * the negative of one of the following errors is returned.
902 *
903 * Errors:
904 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
905 * ECONNRESET - Connection reset by peer
906 * ENODEV - The remote node is lost, or existed but is not currently in the
907 * network since it may have crashed
908 * ENOTCONN - The endpoint is not connected
909 * ENOMEM - Insufficient kernel memory was available
910 */
911int scif_fence_wait(scif_epd_t epd, int mark);
912
913/**
914 * scif_fence_signal() - Request a memory update on completion of RMAs
915 * @epd: endpoint descriptor
916 * @loff: local offset
917 * @lval: local value to write to loff
918 * @roff: remote offset
919 * @rval: remote value to write to roff
920 * @flags: fence initiator and signal target flags
921 *
922 * scif_fence_signal() returns after marking the current set of all uncompleted
923 * RMAs initiated through the endpoint epd or marking the current set of all
924 * uncompleted RMAs initiated through the peer of endpoint epd.
925 *
926 * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
927 * marked set, lval is written to memory at the address corresponding to offset
928 * loff in the local registered address space of epd. loff must be within a
929 * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
930 * of the RMAs in the marked set, rval is written to memory at the address
931 * corresponding to offset roff in the remote registered address space of epd.
932 * roff must be within a remote registered window of the peer of epd. Note
933 * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
934 *
935 * The flags argument is formed by OR'ing together the following.
936 * Exactly one of the following values.
937 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
938 * epd are marked
939 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
940 * of endpoint epd are marked
941 * One or more of the following values.
942 * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to
943 * memory at the address corresponding to offset loff in the local
944 * registered address space of epd.
945 * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to
946 * memory at the address corresponding to offset roff in the remote
947 * registered address space of epd.
948 *
949 * Return:
950 * Upon successful completion, scif_fence_signal() returns 0; otherwise in
951 * user mode -1 is returned and errno is set to indicate the error; in kernel
952 * mode the negative of one of the following errors is returned.
953 *
954 * Errors:
955 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
956 * ECONNRESET - Connection reset by peer
957 * EINVAL - flags is invalid, or loff or roff are not DWORD aligned
958 * ENODEV - The remote node is lost, or existed but is not currently in the
959 * network since it may have crashed
960 * ENOTCONN - The endpoint is not connected
961 * ENXIO - loff is invalid for the registered address space of epd, or roff is
962 * invalid for the registered address space of the peer of epd
963 */
964int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff,
965 u64 rval, int flags);
966
967/**
968 * scif_get_node_ids() - Return information about online nodes
969 * @nodes: array in which to return online node IDs
970 * @len: number of entries in the nodes array
971 * @self: address at which to place the node ID of the local node
972 *
973 * scif_get_node_ids() fills in the nodes array with up to len node IDs of the
974 * nodes in the SCIF network. If there is not enough space in nodes, as
975 * indicated by the len parameter, only len node IDs are returned in nodes. The
976 * return value of scif_get_node_ids() is the total number of nodes currently in
977 * the SCIF network. A return value greater than len therefore means that the
978 * nodes array was too small to hold every node ID.
979 *
980 * The node ID of the local node is returned at self.
981 *
982 * Return:
983 * Upon successful completion, scif_get_node_ids() returns the actual number of
984 * online nodes in the SCIF network including 'self'; otherwise in user mode
985 * -1 is returned and errno is set to indicate the error; in kernel mode no
986 * errors are returned.
987 *
988 * Errors:
989 * EFAULT - Bad address
990 */
991int scif_get_node_ids(u16 *nodes, int len, u16 *self);
992
993#endif /* __SCIF_H__ */