Greg Hartman | 0055e0d | 2018-04-05 17:59:11 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2017 Google, Inc. |
| 3 | * |
| 4 | * This software is licensed under the terms of the GNU General Public |
| 5 | * License version 2, as published by the Free Software Foundation, and |
| 6 | * may be copied, distributed, and modified under those terms. |
| 7 | * |
| 8 | * This program is distributed in the hope that it will be useful, |
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | * GNU General Public License for more details. |
| 12 | * |
| 13 | */ |
| 14 | |
| 15 | #ifndef _UAPI_LINUX_VSOC_SHM_H |
| 16 | #define _UAPI_LINUX_VSOC_SHM_H |
| 17 | |
| 18 | #include <linux/types.h> |
| 19 | |
| 20 | /** |
| 21 | * A permission is a token that permits a receiver to read and/or write an area |
| 22 | * of memory within a Vsoc region. |
| 23 | * |
| 24 | * An fd_scoped permission grants both read and write access, and can be |
| 25 | * attached to a file description (see open(2)). |
| 26 | * Ownership of the area can then be shared by passing a file descriptor |
| 27 | * among processes. |
| 28 | * |
| 29 | * begin_offset and end_offset define the area of memory that is controlled by |
| 30 | * the permission. owner_offset points to a word, also in shared memory, that |
| 31 | * controls ownership of the area. |
| 32 | * |
| 33 | * Ownership of the region expires when the associated file description is |
| 34 | * released. |
| 35 | * |
| 36 | * At most one permission can be attached to each file description. |
| 37 | * |
| 38 | * This is useful when implementing HALs like gralloc that scope and pass |
| 39 | * ownership of shared resources via file descriptors. |
| 40 | * |
| 41 | * The caller is responsible for doing any fencing. |
| 42 | * |
| 43 | * The calling process will normally identify a currently free area of |
| 44 | * memory. It will construct a proposed fd_scoped_permission_arg structure: |
| 45 | * |
| 46 | * begin_offset and end_offset describe the area being claimed |
| 47 | * |
| 48 | * owner_offset points to the location in shared memory that indicates the |
| 49 | * owner of the area. |
| 50 | * |
| 51 | * owned_value is the value that will be stored in owner_offset iff the |
| 52 | * permission can be granted. It must be different than VSOC_REGION_FREE. |
| 53 | * |
| 54 | * Two fd_scoped_permission structures are compatible if they vary only by |
| 55 | * their owned_value fields. |
| 56 | * |
| 57 | * The driver ensures that, for any group of simultaneous callers proposing |
| 58 | * compatible fd_scoped_permissions, it will accept exactly one of the |
| 59 | * proposals. The other callers will get a failure with errno of EAGAIN. |
| 60 | * |
| 61 | * A process receiving a file descriptor can identify the region being |
| 62 | * granted using the VSOC_GET_FD_SCOPED_PERMISSION ioctl. |
| 63 | */ |
| 64 | struct fd_scoped_permission { |
| 65 | __u32 begin_offset; /* start of the area controlled by the permission */ |
| 66 | __u32 end_offset; /* end of the area controlled by the permission */ |
| 67 | __u32 owner_offset; /* offset of the shared-memory word that controls ownership */ |
| 68 | __u32 owned_value; /* stored at owner_offset on grant; must differ from VSOC_REGION_FREE */ |
| 69 | }; |
| 70 | |
| 71 | /* |
| 72 | * This value represents a free area of memory. The driver only creates a |
| 73 | * permission if it finds this value at owner_offset, and it writes this |
| 74 | * value back to owner_offset once the permission is no longer needed. |
| 75 | */ |
| 76 | #define VSOC_REGION_FREE ((__u32)0) |
| 77 | |
| 78 | /** |
| 79 | * ioctl argument for VSOC_CREATE_FD_SCOPED_PERMISSION |
| 80 | */ |
| 81 | struct fd_scoped_permission_arg { |
| 82 | struct fd_scoped_permission perm; /* the permission being proposed */ |
| 83 | __s32 managed_region_fd; /* NOTE(review): presumably an fd opened on the managed region - confirm */ |
| 84 | }; |
| 85 | |
| 86 | #define VSOC_NODE_FREE ((__u32)0) /* NOTE(review): presumably marks an unused signal table entry - confirm */ |
| 87 | |
| 88 | /* |
| 89 | * Describes a signal table in shared memory. Each non-zero entry in the |
| 90 | * table indicates that the receiver should signal the futex at the given |
| 91 | * offset. Offsets are relative to the region, not the shared memory window. |
| 92 | * |
| 93 | * interrupt_signalled_offset is used to reliably signal interrupts across the |
| 94 | * vmm boundary. There are two roles: transmitter and receiver. For example, |
| 95 | * in the host_to_guest_signal_table the host is the transmitter and the |
| 96 | * guest is the receiver. The protocol is as follows: |
| 97 | * |
| 98 | * 1. The transmitter should convert the offset of the futex to an offset |
| 99 | * in the signal table [0, (1 << num_nodes_lg2)) |
| 100 | * The transmitter can choose any appropriate hashing algorithm, including |
| 101 | * hash = futex_offset & ((1 << num_nodes_lg2) - 1) |
| 102 | * |
| 103 | * 2. The transmitter should atomically compare and swap futex_offset with 0 |
| 104 | * at hash. There are 3 possible outcomes |
| 105 | * a. The swap fails because the futex_offset is already in the table. |
| 106 | * The transmitter should stop. |
| 107 | * b. Some other offset is in the table. This is a hash collision. The |
| 108 | * transmitter should move to another table slot and try again. One |
| 109 | * possible algorithm: |
| 110 | * hash = (hash + 1) & ((1 << num_nodes_lg2) - 1) |
| 111 | * c. The swap worked. Continue below. |
| 112 | * |
| 113 | * 3. The transmitter atomically swaps 1 with the value at the |
| 114 | * interrupt_signalled_offset. There are two outcomes: |
| 115 | * a. The prior value was 1. In this case an interrupt has already been |
| 116 | * posted. The transmitter is done. |
| 117 | * b. The prior value was 0, indicating that the receiver may be sleeping. |
| 118 | * The transmitter will issue an interrupt. |
| 119 | * |
| 120 | * 4. On waking the receiver immediately exchanges a 0 with the |
| 121 | * interrupt_signalled_offset. If it receives a 0 then this is a spurious |
| 122 | * interrupt. That may occasionally happen in the current protocol, but |
| 123 | * should be rare. |
| 124 | * |
| 125 | * 5. The receiver scans the signal table by atomically exchanging 0 at each |
| 126 | * location. If a non-zero offset is returned from the exchange the |
| 127 | * receiver wakes all sleepers at the given offset: |
| 128 | * futex((int*)(region_base + old_value), FUTEX_WAKE, MAX_INT); |
| 129 | * |
| 130 | * 6. The receiver thread then does a conditional wait, waking immediately |
| 131 | * if the value at interrupt_signalled_offset is non-zero. This catches cases |
| 132 | * where additional signals were posted while the table was being scanned. |
| 133 | * On the guest the wait is handled via the VSOC_WAIT_FOR_INCOMING_INTERRUPT |
| 134 | * ioctl. |
| 135 | */ |
| 136 | struct vsoc_signal_table_layout { |
| 137 | /* log_2(Number of signal table entries): the table has (1 << num_nodes_lg2) slots */ |
| 138 | __u32 num_nodes_lg2; |
| 139 | /* |
| 140 | * Offset to the first signal table entry relative to the start of the |
| 141 | * region |
| 142 | */ |
| 143 | __u32 futex_uaddr_table_offset; |
| 144 | /* |
| 145 | * Offset to an atomic_t / atomic uint32_t. A non-zero value indicates that |
| 146 | * one or more offsets are currently posted in the table. NOTE(review): the |
| 147 | * orphaned fragment "semi-unique access to an entry in the table" is unclear. |
| 148 | */ |
| 149 | __u32 interrupt_signalled_offset; |
| 150 | }; |
| 151 | |
| 152 | #define VSOC_REGION_WHOLE ((__s32)0) /* region is not subdivided; NOTE(review): signed cast, but managed_by is __u32 */ |
| 153 | #define VSOC_DEVICE_NAME_SZ 16 /* size of device_name, including the terminating '\0' */ |
| 154 | |
| 155 | /** |
| 156 | * Each HAL would (usually) talk to a single device region |
| 157 | * Multiple entities care about these regions: |
| 158 | * - The ivshmem_server will populate the regions in shared memory |
| 159 | * - The guest kernel will read the region, create minor device nodes, and |
| 160 | * allow interested parties to register for FUTEX_WAKE events in the region |
| 161 | * - HALs will access via the minor device nodes published by the guest kernel |
| 162 | * - Host side processes will access the region via the ivshmem_server: |
| 163 | * 1. Pass name to ivshmem_server at a UNIX socket |
| 164 | * 2. ivshmem_server will reply with 3 fds and an offset: |
| 165 | * - host->guest doorbell fd |
| 166 | * - guest->host doorbell fd |
| 167 | * - fd for the shared memory region |
| 168 | * - region offset |
| 169 | * 3. Start a futex receiver thread on the doorbell fd pointed at the |
| 170 | * signal_nodes |
| 171 | */ |
| 172 | struct vsoc_device_region { |
| 173 | __u16 current_version; /* per-region version; not consulted by the driver */ |
| 174 | __u16 min_compatible_version; /* per-region version; not consulted by the driver */ |
| 175 | __u32 region_begin_offset; |
| 176 | __u32 region_end_offset; |
| 177 | __u32 offset_of_region_data; |
| 178 | struct vsoc_signal_table_layout guest_to_host_signal_table; /* scanned by the host */ |
| 179 | struct vsoc_signal_table_layout host_to_guest_signal_table; /* host transmits, guest receives */ |
| 180 | /* Name of the device. Must always be terminated with a '\0', so |
| 181 | * the longest supported device name is 15 characters. |
| 182 | */ |
| 183 | char device_name[VSOC_DEVICE_NAME_SZ]; |
| 184 | /* There are two ways that permissions to access regions are handled: |
| 185 | * - When managed_by is VSOC_REGION_WHOLE, any process that can |
| 186 | * open the device node for the region gains complete access to it. |
| 187 | * - When managed_by is set to anything else, processes that open the |
| 188 | * region cannot access it. Access to a sub-region must be established |
| 189 | * by invoking the VSOC_CREATE_FD_SCOPED_PERMISSION ioctl on the region |
| 190 | * referenced in managed_by, providing a file instance |
| 191 | * (represented by a fd) opened on this region. |
| 192 | */ |
| 193 | __u32 managed_by; |
| 194 | }; |
| 195 | |
| 196 | /* |
| 197 | * The vsoc layout descriptor. |
| 198 | * The first 4K should be reserved for the shm header and region descriptors. |
| 199 | * The regions should be page aligned. |
| 200 | */ |
| 201 | |
| 202 | struct vsoc_shm_layout_descriptor { |
| 203 | __u16 major_version; /* should hold CURRENT_VSOC_LAYOUT_MAJOR_VERSION */ |
| 204 | __u16 minor_version; /* should hold CURRENT_VSOC_LAYOUT_MINOR_VERSION */ |
| 205 | |
| 206 | /* Size of the shm. This may be redundant but is nice to have */ |
| 207 | __u32 size; |
| 208 | |
| 209 | /* Number of shared memory regions */ |
| 210 | __u32 region_count; |
| 211 | |
| 212 | /* The offset to the start of the region descriptors */ |
| 213 | __u32 vsoc_region_desc_offset; |
| 214 | }; |
| 215 | |
| 216 | /* |
| 217 | * This specifies the current version that should be stored in |
| 218 | * vsoc_shm_layout_descriptor.major_version and |
| 219 | * vsoc_shm_layout_descriptor.minor_version. |
| 220 | * It should be updated only if the vsoc_device_region and |
| 221 | * vsoc_shm_layout_descriptor structures have changed. |
| 222 | * Versioning within each region is conveyed separately, |
| 223 | * via the min_compatible_version and current_version fields in |
| 224 | * vsoc_device_region. The driver does not consult these fields: they are left |
| 225 | * for the HALs and host processes and will change independently of the layout |
| 226 | * version. |
| 227 | */ |
| 228 | #define CURRENT_VSOC_LAYOUT_MAJOR_VERSION 2 |
| 229 | #define CURRENT_VSOC_LAYOUT_MINOR_VERSION 0 |
| 230 | |
| 231 | #define VSOC_CREATE_FD_SCOPED_PERMISSION \ |
| 232 | _IOW(0xF5, 0, struct fd_scoped_permission) /* NOTE(review): docs name struct fd_scoped_permission_arg as the argument - confirm */ |
| 233 | #define VSOC_GET_FD_SCOPED_PERMISSION _IOR(0xF5, 1, struct fd_scoped_permission) /* identifies the area granted via an fd */ |
| 234 | |
| 235 | /* |
| 236 | * This is used to signal the host to scan the guest_to_host_signal_table |
| 237 | * for new futexes to wake. This sends an interrupt if one is not already |
| 238 | * in flight. |
| 239 | */ |
| 240 | #define VSOC_MAYBE_SEND_INTERRUPT_TO_HOST _IO(0xF5, 2) |
| 241 | |
| 242 | /* |
| 243 | * When this returns the guest will scan host_to_guest_signal_table to |
| 244 | * check for new futexes to wake. |
| 245 | */ |
| 246 | /* TODO(ghartman): Consider moving this to the bottom half */ |
| 247 | #define VSOC_WAIT_FOR_INCOMING_INTERRUPT _IO(0xF5, 3) |
| 248 | |
| 249 | /* |
| 250 | * Guest HALs will use this to retrieve the region description (a |
| 251 | * struct vsoc_device_region) after opening their device node. |
| 252 | */ |
| 253 | #define VSOC_DESCRIBE_REGION _IOR(0xF5, 4, struct vsoc_device_region) |
| 254 | |
| 255 | /* |
| 256 | * Wake any threads that may be waiting for a host interrupt on this region. |
| 257 | * This is mostly used during shutdown. |
| 258 | */ |
| 259 | #define VSOC_SELF_INTERRUPT _IO(0xF5, 5) |
| 260 | |
| 261 | /* |
| 262 | * This is used to signal the host to scan the guest_to_host_signal_table |
| 263 | * for new futexes to wake. This sends an interrupt unconditionally. |
| 264 | */ |
| 265 | #define VSOC_SEND_INTERRUPT_TO_HOST _IO(0xF5, 6) |
| 266 | |
| 267 | enum wait_types { |
| 268 | VSOC_WAIT_UNDEFINED = 0, /* NOTE(review): presumably not a valid request - confirm */ |
| 269 | VSOC_WAIT_IF_EQUAL = 1, /* NOTE(review): presumably wait while the word equals value - confirm */ |
| 270 | VSOC_WAIT_IF_EQUAL_TIMEOUT = 2 /* NOTE(review): presumably as above, bounded by wake_time_* - confirm */ |
| 271 | }; |
| 272 | |
| 273 | /* |
| 274 | * Argument for the VSOC_COND_WAIT ioctl: wait for a condition to be true |
| 275 | * |
| 276 | * Note, this is sized and aligned so the 32 bit and 64 bit layouts are |
| 277 | * identical. |
| 278 | */ |
| 279 | struct vsoc_cond_wait { |
| 280 | /* Input: Offset of the 32 bit word to check */ |
| 281 | __u32 offset; |
| 282 | /* Input: Value that will be compared with the word at offset */ |
| 283 | __u32 value; |
| 284 | /* Input: Monotonic time to wake at, seconds part */ |
| 285 | __u64 wake_time_sec; |
| 286 | /* Input: Monotonic time to wake at, nanoseconds part */ |
| 287 | __u32 wake_time_nsec; |
| 288 | /* Input: Type of wait. NOTE(review): presumably an enum wait_types value */ |
| 289 | __u32 wait_type; |
| 290 | /* Output: Number of times the thread woke before returning. */ |
| 291 | __u32 wakes; |
| 292 | /* Padding: keeps the structure 8-byte aligned and a multiple of 8 bytes |
| 293 | * long (32 bytes in total) for 32/64 bit compatibility. |
| 294 | */ |
| 295 | __u32 reserved_1; |
| 296 | }; |
| 297 | |
| 298 | #define VSOC_COND_WAIT _IOWR(0xF5, 7, struct vsoc_cond_wait) /* struct carries both input and output fields */ |
| 299 | |
| 300 | /* Wake any local threads waiting at the offset given in arg. NOTE(review): _IO encodes no size, so arg is presumably the offset by value */ |
| 301 | #define VSOC_COND_WAKE _IO(0xF5, 8) |
| 302 | |
| 303 | #endif /* _UAPI_LINUX_VSOC_SHM_H */ |