/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *	Zhenyu Wang <zhenyuw@linux.intel.com>
 *	Dominik Zeromski <dominik.zeromski@intel.com>
 */

#include <intel_bufmgr.h>
#include <i915_drm.h>

#include "intel_reg.h"
#include "drmtest.h"
#include "intel_batchbuffer.h"
#include "gen7_media.h"
#include "gen8_media.h"
#include "gpgpu_fill.h"

/* shaders/gpgpu/gpgpu_fill.gxa */
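/*
 * Each row below is one 128-bit Gen EU instruction (four dwords). The three
 * arrays are the same fill program assembled for the gen7, gen8 and gen9
 * instruction encodings; as set up further down in this file, the kernel
 * reads the fill colour from the CURBE constant buffer and writes it to the
 * destination surface bound through the binding table.
 */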
static const uint32_t gen7_gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
	{ 0x05800031, 0x24001ca8, 0x00000080, 0x060a8000 },
	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
};

static const uint32_t gen8_gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20202288, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400208, 0x06000004, 0x00000010 },
	{ 0x00000001, 0x20440208, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800208, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800208, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880608, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00208, 0x00000020, 0x00000000 },
	{ 0x0c800031, 0x24000a40, 0x0e000080, 0x060a8000 },
	{ 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20000a40, 0x0e000e00, 0x82000010 },
};

static const uint32_t gen9_gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20202288, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400208, 0x06000004, 0x00000010 },
	{ 0x00000001, 0x20440208, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800208, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800208, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880608, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00208, 0x00000020, 0x00000000 },
	{ 0x0c800031, 0x24000a40, 0x06000080, 0x060a8000 },
	{ 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20000a40, 0x06000e00, 0x82000010 },
};

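/*
 * Trivial bump suballocator over the 4096-byte batch buffer: batch_alloc()
 * carves out aligned, zeroed blocks by advancing batch->ptr, and the other
 * helpers convert between pointers into the batch and byte offsets from its
 * start, which is what the state commands emitted below consume.
 */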
static uint32_t
batch_used(struct intel_batchbuffer *batch)
{
	return batch->ptr - batch->buffer;
}

static uint32_t
batch_align(struct intel_batchbuffer *batch, uint32_t align)
{
	uint32_t offset = batch_used(batch);
	offset = ALIGN(offset, align);
	batch->ptr = batch->buffer + offset;
	return offset;
}

static void *
batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t align)
{
	uint32_t offset = batch_align(batch, align);
	batch->ptr += size;
	return memset(batch->buffer + offset, 0, size);
}

static uint32_t
batch_offset(struct intel_batchbuffer *batch, void *ptr)
{
	return (uint8_t *)ptr - batch->buffer;
}

static uint32_t
batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t size,
	   uint32_t align)
{
	return batch_offset(batch, memcpy(batch_alloc(batch, size, align), ptr, size));
}

static void
gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
{
	int ret;

	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
	if (ret == 0)
		ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
					    NULL, 0, 0, 0);
	igt_assert(ret == 0);
}

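/*
 * Allocate one 256-bit (8 dword) CURBE entry, 64-byte aligned, and stash the
 * fill colour in its first byte; MEDIA_CURBE_LOAD later points the hardware
 * at this offset so the kernel can read the colour as its constant data.
 */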
static uint32_t
gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch, uint8_t color)
{
	uint8_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
	offset = batch_offset(batch, curbe_buffer);
	*curbe_buffer = color;

	return offset;
}

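/*
 * Build a 2D SURFACE_STATE for @buf in the state half of the batch. The
 * buffer's GTT address is not known yet, so a relocation is emitted against
 * the dword holding base_addr; the returned offset is what the binding table
 * entry points at.
 */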
static uint32_t
gen7_fill_surface_state(struct intel_batchbuffer *batch,
			struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen7_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = batch_alloc(batch, sizeof(*ss), 64);
	offset = batch_offset(batch, ss);

	ss->ss0.surface_type = GEN7_SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss1.base_addr = buf->bo->offset;
	ret = drm_intel_bo_emit_reloc(batch->bo,
				      batch_offset(batch, ss) + 4,
				      buf->bo, 0,
				      read_domain, write_domain);
	igt_assert(ret == 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width = igt_buf_width(buf) - 1;

	ss->ss3.pitch = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

static uint32_t
gen8_fill_surface_state(struct intel_batchbuffer *batch,
			struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen8_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = batch_alloc(batch, sizeof(*ss), 64);
	offset = batch_offset(batch, ss);

	ss->ss0.surface_type = GEN8_SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;
	ss->ss0.vertical_alignment = 1; /* align 4 */
	ss->ss0.horizontal_alignment = 1; /* align 4 */

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss8.base_addr = buf->bo->offset;

	ret = drm_intel_bo_emit_reloc(batch->bo,
				      batch_offset(batch, ss) + 8 * 4,
				      buf->bo, 0,
				      read_domain, write_domain);
	igt_assert_eq(ret, 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width = igt_buf_width(buf) - 1;
	ss->ss3.pitch = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

static uint32_t
gen7_fill_binding_table(struct intel_batchbuffer *batch,
			struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = batch_alloc(batch, 32, 64);
	offset = batch_offset(batch, binding_table);

	binding_table[0] = gen7_fill_surface_state(batch, dst, GEN7_SURFACEFORMAT_R8_UNORM, 1);

	return offset;
}

static uint32_t
gen8_fill_binding_table(struct intel_batchbuffer *batch,
			struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = batch_alloc(batch, 32, 64);
	offset = batch_offset(batch, binding_table);

	binding_table[0] = gen8_fill_surface_state(batch, dst, GEN8_SURFACEFORMAT_R8_UNORM, 1);

	return offset;
}

static uint32_t
gen7_fill_gpgpu_kernel(struct intel_batchbuffer *batch,
		       const uint32_t kernel[][4],
		       size_t size)
{
	uint32_t offset;

	offset = batch_copy(batch, kernel, size, 64);

	return offset;
}

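/*
 * The interface descriptor ties the kernel, binding table and CURBE layout
 * together; MEDIA_INTERFACE_DESCRIPTOR_LOAD later hands its offset to the
 * fixed-function hardware. kernel_start_pointer and binding_table_pointer
 * are in units of 64 and 32 bytes respectively, hence the shifts below.
 */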
static uint32_t
gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
			       const uint32_t kernel[][4], size_t size)
{
	struct gen7_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen7_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_gpgpu_kernel(batch, kernel, size);

	idd = batch_alloc(batch, sizeof(*idd), 64);
	offset = batch_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc1.single_program_flow = 1;
	idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;

	idd->desc2.sampler_count = 0; /* 0 samplers used */
	idd->desc2.sampler_state_pointer = 0;

	idd->desc3.binding_table_entry_count = 0;
	idd->desc3.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc4.constant_urb_entry_read_offset = 0;
	idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */

	return offset;
}

static uint32_t
gen8_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
			       const uint32_t kernel[][4], size_t size)
{
	struct gen8_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen8_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_gpgpu_kernel(batch, kernel, size);

	idd = batch_alloc(batch, sizeof(*idd), 64);
	offset = batch_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc2.single_program_flow = 1;
	idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;

	idd->desc3.sampler_count = 0; /* 0 samplers used */
	idd->desc3.sampler_state_pointer = 0;

	idd->desc4.binding_table_entry_count = 0;
	idd->desc4.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc5.constant_urb_entry_read_offset = 0;
	idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */

	return offset;
}

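/*
 * All of the offsets produced above are relative to the start of the batch
 * bo, so the per-gen STATE_BASE_ADDRESS variants below point the surface,
 * dynamic and instruction state bases at the batch bo via relocations.
 */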
static void
gen7_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));

	/* general */
	OUT_BATCH(0);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* general/dynamic/indirect/instruction access upper bounds */
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
}

static void
gen8_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));

	/* general */
	OUT_BATCH(0 | (0x78 << 4) | (0 << 1) | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify enable bit must be set, otherwise the GPU may hang */
	OUT_BATCH(1 << 12 | 1);
}

static void
gen9_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));

	/* general */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify enable bit must be set, otherwise the GPU may hang */
	OUT_BATCH(1 << 12 | 1);

	/* Bindless surface state base address */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0xfffff000);
}

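/*
 * MEDIA_VFE_STATE programs the thread dispatcher with the minimal
 * configuration used here: no scratch space, no URB entries and a single
 * 256-bit CURBE allocation; the gen7 variant additionally has to select
 * GPGPU mode explicitly.
 */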
static void
gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));

	/* scratch buffer */
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(1 << 16 | /* max num of threads */
		  0 << 8 |  /* num of URB entry */
		  1 << 2);  /* GPGPU mode */

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */
		  1);	    /* CURBE entry size in 256 bits unit */

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen8_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));

	/* scratch buffer */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(1 << 16 | 1 << 8);

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(0 << 16 | 1);

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
{
	OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* curbe total data length */
	OUT_BATCH(64);
	/* curbe data start address, relative to the dynamic state base address */
	OUT_BATCH(curbe_buffer);
}

static void
gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
{
	OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* interface descriptor data length */
	OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
	/* interface descriptor address, relative to the dynamic state base address */
	OUT_BATCH(interface_descriptor);
}

static void
gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
{
	OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* interface descriptor data length */
	OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
	/* interface descriptor address, relative to the dynamic state base address */
	OUT_BATCH(interface_descriptor);
}

static void
gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned x, unsigned y,
		     unsigned width, unsigned height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Use SIMD16 dispatch, so every thread uses all 16 SIMD channels.
	 *
	 * Pick a thread group size of 16x1, so each group contains exactly
	 * one SIMD16 thread and the thread width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (rounded up)
	 *      thread group Y = height
	 */
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;
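	/*
	 * Example: for width = 24, x_dim = (24 + 15) / 16 = 2 thread groups
	 * in X and right_mask = (1 << 8) - 1 = 0xff, so only the first 8
	 * SIMD channels execute in the right-most column of thread groups.
	 */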

	OUT_BATCH(GEN7_GPGPU_WALKER | 9);

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 | /* SIMD16 */
		  0 << 16 | /* depth:1 */
		  0 << 8 |  /* height:1 */
		  0);	    /* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

static void
gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned x, unsigned y,
		     unsigned width, unsigned height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Use SIMD16 dispatch, so every thread uses all 16 SIMD channels.
	 *
	 * Pick a thread group size of 16x1, so each group contains exactly
	 * one SIMD16 thread and the thread width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (rounded up)
	 *      thread group Y = height
	 */
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;
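	/*
	 * e.g. width = 32: x_dim = 2 and width & 15 == 0, so right_mask is
	 * 0xffff and all 16 channels run in every thread group.
	 */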

	OUT_BATCH(GEN7_GPGPU_WALKER | 13);

	OUT_BATCH(0); /* kernel offset */
	OUT_BATCH(0); /* indirect data length */
	OUT_BATCH(0); /* indirect data offset */

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 | /* SIMD16 */
		  0 << 16 | /* depth:1 */
		  0 << 8 |  /* height:1 */
		  0);	    /* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

/*
 * This sets up the gpgpu pipeline.
 *
 * +---------------+ <---- 4096
 * |       ^       |
 * |       |       |
 * |    various    |
 * |     state     |
 * |       |       |
 * |_______|_______| <---- 2048 + ?
 * |       ^       |
 * |       |       |
 * |     batch     |
 * |    commands   |
 * |       |       |
 * |       |       |
 * +---------------+ <---- 0 + ?
 *
 */

#define BATCH_STATE_SPLIT 2048
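/*
 * The fill functions below first build the indirect state (CURBE, surface
 * state, binding table, kernel, interface descriptor) above
 * BATCH_STATE_SPLIT, then rewind batch->ptr and emit the actual commands
 * from offset 0, so both halves live in the same 4096-byte batch bo.
 */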

void
gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* setup states */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant buffer needs to be filled for every thread, but since
	 * there is only one thread per thread group here, a single CURBE
	 * entry is enough.
	 *
	 * Each thread then just uses its thread group ID as the buffer offset.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);

	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
							       gen7_gpgpu_kernel,
							       sizeof(gen7_gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	batch->ptr = batch->buffer;

	/* GPGPU pipeline */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen7_emit_state_base_address(batch);
	gen7_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
	gen7_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}

void
gen8_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* setup states */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant buffer needs to be filled for every thread, but since
	 * there is only one thread per thread group here, a single CURBE
	 * entry is enough.
	 *
	 * Each thread then just uses its thread group ID as the buffer offset.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);

	interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
							       gen8_gpgpu_kernel,
							       sizeof(gen8_gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	batch->ptr = batch->buffer;

	/* GPGPU pipeline */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen8_emit_state_base_address(batch);
	gen8_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen8_emit_interface_descriptor_load(batch, interface_descriptor);
	gen8_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}

void
gen9_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* setup states */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant buffer needs to be filled for every thread, but since
	 * there is only one thread per thread group here, a single CURBE
	 * entry is enough.
	 *
	 * Each thread then just uses its thread group ID as the buffer offset.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);

	interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
							       gen9_gpgpu_kernel,
							       sizeof(gen9_gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	batch->ptr = batch->buffer;

	/* GPGPU pipeline */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen9_emit_state_base_address(batch);
	gen8_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
	gen8_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}
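
/*
 * Usage sketch (illustrative, not part of this file): callers normally do
 * not invoke these per-generation entry points directly but look the right
 * one up by device id, e.g. with igt_get_gpgpu_fillfunc() from
 * intel_batchbuffer.h, and then fill a rectangle of an igt_buf with a
 * single byte value:
 *
 *	igt_fillfunc_t fill = igt_get_gpgpu_fillfunc(devid);
 *
 *	if (fill)
 *		fill(batch, &dst, x, y, width, height, 0x55);
 */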