| /* |
| * Copyright © 2010 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| */ |
| |
| /** @file brw_fs.cpp |
| * |
| * This file drives the GLSL IR -> LIR translation, contains the |
| * optimizations on the LIR, and drives the generation of native code |
| * from the LIR. |
| */ |
| |
| #include <sys/types.h> |
| |
| #include "util/hash_table.h" |
| #include "main/macros.h" |
| #include "main/shaderobj.h" |
| #include "main/fbobject.h" |
| #include "program/prog_parameter.h" |
| #include "program/prog_print.h" |
| #include "util/register_allocate.h" |
| #include "program/hash_table.h" |
| #include "brw_context.h" |
| #include "brw_eu.h" |
| #include "brw_wm.h" |
| #include "brw_fs.h" |
| #include "brw_cfg.h" |
| #include "brw_dead_control_flow.h" |
| #include "main/uniforms.h" |
| #include "brw_fs_live_variables.h" |
| #include "glsl/glsl_types.h" |
| #include "program/sampler.h" |
| |
| using namespace brw; |
| |
| void |
| fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
| const fs_reg *src, unsigned sources) |
| { |
| memset(this, 0, sizeof(*this)); |
| |
| this->src = new fs_reg[MAX2(sources, 3)]; |
| for (unsigned i = 0; i < sources; i++) |
| this->src[i] = src[i]; |
| |
| this->opcode = opcode; |
| this->dst = dst; |
| this->sources = sources; |
| this->exec_size = exec_size; |
| |
| assert(dst.file != IMM && dst.file != UNIFORM); |
| |
| assert(this->exec_size != 0); |
| |
| this->conditional_mod = BRW_CONDITIONAL_NONE; |
| |
| /* This will be the case for almost all instructions. */ |
| switch (dst.file) { |
| case GRF: |
| case HW_REG: |
| case MRF: |
| case ATTR: |
| this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size), |
| REG_SIZE); |
| break; |
| case BAD_FILE: |
| this->regs_written = 0; |
| break; |
| case IMM: |
| case UNIFORM: |
| unreachable("Invalid destination register file"); |
| default: |
| unreachable("Invalid register file"); |
| } |
| |
| this->writes_accumulator = false; |
| } |
| |
| fs_inst::fs_inst() |
| { |
| init(BRW_OPCODE_NOP, 8, dst, NULL, 0); |
| } |
| |
| fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) |
| { |
| init(opcode, exec_size, reg_undef, NULL, 0); |
| } |
| |
| fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst) |
| { |
| init(opcode, exec_size, dst, NULL, 0); |
| } |
| |
| fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
| const fs_reg &src0) |
| { |
| const fs_reg src[1] = { src0 }; |
| init(opcode, exec_size, dst, src, 1); |
| } |
| |
| fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
| const fs_reg &src0, const fs_reg &src1) |
| { |
| const fs_reg src[2] = { src0, src1 }; |
| init(opcode, exec_size, dst, src, 2); |
| } |
| |
| fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
| const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) |
| { |
| const fs_reg src[3] = { src0, src1, src2 }; |
| init(opcode, exec_size, dst, src, 3); |
| } |
| |
| fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, |
| const fs_reg src[], unsigned sources) |
| { |
| init(opcode, exec_width, dst, src, sources); |
| } |
| |
| fs_inst::fs_inst(const fs_inst &that) |
| { |
| memcpy(this, &that, sizeof(that)); |
| |
| this->src = new fs_reg[MAX2(that.sources, 3)]; |
| |
| for (unsigned i = 0; i < that.sources; i++) |
| this->src[i] = that.src[i]; |
| } |
| |
| fs_inst::~fs_inst() |
| { |
| delete[] this->src; |
| } |
| |
| void |
| fs_inst::resize_sources(uint8_t num_sources) |
| { |
| if (this->sources != num_sources) { |
| fs_reg *src = new fs_reg[MAX2(num_sources, 3)]; |
| |
| for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i) |
| src[i] = this->src[i]; |
| |
| delete[] this->src; |
| this->src = src; |
| this->sources = num_sources; |
| } |
| } |
| |
| void |
| fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, |
| const fs_reg &dst, |
| const fs_reg &surf_index, |
| const fs_reg &varying_offset, |
| uint32_t const_offset) |
| { |
| /* We have our constant surface use a pitch of 4 bytes, so our index can |
| * be any component of a vector, and then we load 4 contiguous |
| * components starting from that. |
| * |
| * We break down the const_offset to a portion added to the variable |
| * offset and a portion done using reg_offset, which means that if you |
| * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = |
| * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and |
| * CSE can later notice that those loads are all the same and eliminate |
| * the redundant ones. |
| */ |
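| /* As an illustration (ignoring the gen4 scale case handled below): with |
| * const_offset == 6, the ADD computes varying_offset + 4, the load returns |
| * the vec4 covering components 4..7, and the final MOV picks component |
| * (6 & 3) == 2 of that result. |
| */ |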
| fs_reg vec4_offset = vgrf(glsl_type::int_type); |
| bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3)); |
| |
| int scale = 1; |
| if (devinfo->gen == 4 && bld.dispatch_width() == 8) { |
| /* Pre-gen5, we can either use a SIMD8 message that requires (header, |
| * u, v, r) as parameters, or we can just use the SIMD16 message |
| * consisting of (header, u). We choose the second, at the cost of a |
| * longer return length. |
| */ |
| scale = 2; |
| } |
| |
| enum opcode op; |
| if (devinfo->gen >= 7) |
| op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; |
| else |
| op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; |
| |
| int regs_written = 4 * (bld.dispatch_width() / 8) * scale; |
| fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type); |
| fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset); |
| inst->regs_written = regs_written; |
| |
| if (devinfo->gen < 7) { |
| inst->base_mrf = 13; |
| inst->header_size = 1; |
| if (devinfo->gen == 4) |
| inst->mlen = 3; |
| else |
| inst->mlen = 1 + bld.dispatch_width() / 8; |
| } |
| |
| bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale)); |
| } |
| |
| /** |
| * A helper for MOV generation for fixing up broken hardware SEND dependency |
| * handling. |
| */ |
| void |
| fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) |
| { |
| /* The caller always wants uncompressed to emit the minimal extra |
| * dependencies, and to avoid having to deal with aligning its regs to 2. |
| */ |
| const fs_builder ubld = bld.annotate("send dependency resolve") |
| .half(0); |
| |
| ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F)); |
| } |
| |
| bool |
| fs_inst::equals(fs_inst *inst) const |
| { |
| return (opcode == inst->opcode && |
| dst.equals(inst->dst) && |
| src[0].equals(inst->src[0]) && |
| src[1].equals(inst->src[1]) && |
| src[2].equals(inst->src[2]) && |
| saturate == inst->saturate && |
| predicate == inst->predicate && |
| conditional_mod == inst->conditional_mod && |
| mlen == inst->mlen && |
| base_mrf == inst->base_mrf && |
| target == inst->target && |
| eot == inst->eot && |
| header_size == inst->header_size && |
| shadow_compare == inst->shadow_compare && |
| exec_size == inst->exec_size && |
| offset == inst->offset); |
| } |
| |
| bool |
| fs_inst::overwrites_reg(const fs_reg ®) const |
| { |
| return reg.in_range(dst, regs_written); |
| } |
| |
| bool |
| fs_inst::is_send_from_grf() const |
| { |
| switch (opcode) { |
| case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: |
| case SHADER_OPCODE_SHADER_TIME_ADD: |
| case FS_OPCODE_INTERPOLATE_AT_CENTROID: |
| case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
| case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
| case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
| case SHADER_OPCODE_UNTYPED_ATOMIC: |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ: |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: |
| case SHADER_OPCODE_TYPED_ATOMIC: |
| case SHADER_OPCODE_TYPED_SURFACE_READ: |
| case SHADER_OPCODE_TYPED_SURFACE_WRITE: |
| case SHADER_OPCODE_URB_WRITE_SIMD8: |
| return true; |
| case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: |
| return src[1].file == GRF; |
| case FS_OPCODE_FB_WRITE: |
| return src[0].file == GRF; |
| default: |
| if (is_tex()) |
| return src[0].file == GRF; |
| |
| return false; |
| } |
| } |
| |
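| /* Check whether a LOAD_PAYLOAD is a pure copy: every source reads the same |
| * VGRF in order, covering it exactly, so the payload is just a |
| * whole-register copy of that VGRF. |
| */ |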
| bool |
| fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const |
| { |
| if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD) |
| return false; |
| |
| fs_reg reg = this->src[0]; |
| if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0) |
| return false; |
| |
| if (grf_alloc.sizes[reg.reg] != this->regs_written) |
| return false; |
| |
| for (int i = 0; i < this->sources; i++) { |
| reg.type = this->src[i].type; |
| if (!this->src[i].equals(reg)) |
| return false; |
| |
| if (i < this->header_size) { |
| reg.reg_offset += 1; |
| } else { |
| reg.reg_offset += this->exec_size / 8; |
| } |
| } |
| |
| return true; |
| } |
| |
| bool |
| fs_inst::can_do_source_mods(const struct brw_device_info *devinfo) |
| { |
| if (devinfo->gen == 6 && is_math()) |
| return false; |
| |
| if (is_send_from_grf()) |
| return false; |
| |
| if (!backend_instruction::can_do_source_mods()) |
| return false; |
| |
| return true; |
| } |
| |
| bool |
| fs_inst::has_side_effects() const |
| { |
| return this->eot || backend_instruction::has_side_effects(); |
| } |
| |
| void |
| fs_reg::init() |
| { |
| memset(this, 0, sizeof(*this)); |
| stride = 1; |
| } |
| |
| /** Generic unset register constructor. */ |
| fs_reg::fs_reg() |
| { |
| init(); |
| this->file = BAD_FILE; |
| } |
| |
| /** Immediate value constructor. */ |
| fs_reg::fs_reg(float f) |
| { |
| init(); |
| this->file = IMM; |
| this->type = BRW_REGISTER_TYPE_F; |
| this->stride = 0; |
| this->fixed_hw_reg.dw1.f = f; |
| } |
| |
| /** Immediate value constructor. */ |
| fs_reg::fs_reg(int32_t i) |
| { |
| init(); |
| this->file = IMM; |
| this->type = BRW_REGISTER_TYPE_D; |
| this->stride = 0; |
| this->fixed_hw_reg.dw1.d = i; |
| } |
| |
| /** Immediate value constructor. */ |
| fs_reg::fs_reg(uint32_t u) |
| { |
| init(); |
| this->file = IMM; |
| this->type = BRW_REGISTER_TYPE_UD; |
| this->stride = 0; |
| this->fixed_hw_reg.dw1.ud = u; |
| } |
| |
| /** Vector float immediate value constructor. */ |
| fs_reg::fs_reg(uint8_t vf[4]) |
| { |
| init(); |
| this->file = IMM; |
| this->type = BRW_REGISTER_TYPE_VF; |
| memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned)); |
| } |
| |
| /** Vector float immediate value constructor. */ |
| fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3) |
| { |
| init(); |
| this->file = IMM; |
| this->type = BRW_REGISTER_TYPE_VF; |
| this->fixed_hw_reg.dw1.ud = (vf0 << 0) | |
| (vf1 << 8) | |
| (vf2 << 16) | |
| (vf3 << 24); |
| } |
| |
| /** Fixed brw_reg. */ |
| fs_reg::fs_reg(struct brw_reg fixed_hw_reg) |
| { |
| init(); |
| this->file = HW_REG; |
| this->fixed_hw_reg = fixed_hw_reg; |
| this->type = fixed_hw_reg.type; |
| } |
| |
| bool |
| fs_reg::equals(const fs_reg &r) const |
| { |
| return (file == r.file && |
| reg == r.reg && |
| reg_offset == r.reg_offset && |
| subreg_offset == r.subreg_offset && |
| type == r.type && |
| negate == r.negate && |
| abs == r.abs && |
| !reladdr && !r.reladdr && |
| memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 && |
| stride == r.stride); |
| } |
| |
| fs_reg & |
| fs_reg::set_smear(unsigned subreg) |
| { |
| assert(file != HW_REG && file != IMM); |
| subreg_offset = subreg * type_sz(type); |
| stride = 0; |
| return *this; |
| } |
| |
| bool |
| fs_reg::is_contiguous() const |
| { |
| return stride == 1; |
| } |
| |
| unsigned |
| fs_reg::component_size(unsigned width) const |
| { |
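| /* A fixed HW reg encodes its horizontal stride as a shifted log2 |
| * (0 -> 0, 1 -> 1, 2 -> 2, 3 -> 4), so decode it before computing the |
| * byte footprint of one logical component. |
| */ |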
| const unsigned stride = (file != HW_REG ? this->stride : |
| fixed_hw_reg.hstride == 0 ? 0 : |
| 1 << (fixed_hw_reg.hstride - 1)); |
| return MAX2(width * stride, 1) * type_sz(type); |
| } |
| |
| int |
| fs_visitor::type_size(const struct glsl_type *type) |
| { |
| unsigned int size, i; |
| |
| switch (type->base_type) { |
| case GLSL_TYPE_UINT: |
| case GLSL_TYPE_INT: |
| case GLSL_TYPE_FLOAT: |
| case GLSL_TYPE_BOOL: |
| return type->components(); |
| case GLSL_TYPE_ARRAY: |
| return type_size(type->fields.array) * type->length; |
| case GLSL_TYPE_STRUCT: |
| size = 0; |
| for (i = 0; i < type->length; i++) { |
| size += type_size(type->fields.structure[i].type); |
| } |
| return size; |
| case GLSL_TYPE_SAMPLER: |
| /* Samplers take up no register space, since they're baked in at |
| * link time. |
| */ |
| return 0; |
| case GLSL_TYPE_ATOMIC_UINT: |
| return 0; |
| case GLSL_TYPE_IMAGE: |
| case GLSL_TYPE_VOID: |
| case GLSL_TYPE_ERROR: |
| case GLSL_TYPE_INTERFACE: |
| case GLSL_TYPE_DOUBLE: |
| unreachable("not reached"); |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * Create a MOV to read the timestamp register. |
| * |
| * The caller is responsible for emitting the MOV. The return value is |
| * the destination of the MOV, with extra parameters set. |
| */ |
| fs_reg |
| fs_visitor::get_timestamp(const fs_builder &bld) |
| { |
| assert(devinfo->gen >= 7); |
| |
| fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, |
| BRW_ARF_TIMESTAMP, |
| 0), |
| BRW_REGISTER_TYPE_UD)); |
| |
| fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); |
| |
| /* We want to read the 3 fields we care about even if it's not enabled in |
| * the dispatch. |
| */ |
| bld.group(4, 0).exec_all().MOV(dst, ts); |
| |
| /* The caller wants the low 32 bits of the timestamp. Since it's running |
| * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, |
| * which is plenty of time for our purposes. It is identical across the |
| * EUs, but since it's tracking GPU core speed it will increment at a |
| * varying rate as render P-states change. |
| * |
| * The caller could also check if render P-states have changed (or anything |
| * else that might disrupt timing) by setting smear to 2 and checking if |
| * that field is != 0. |
| */ |
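| /* Back-of-the-envelope: 2^32 cycles / ~1.2e9 Hz is roughly 3.6 seconds |
| * between rollovers at the nominal clock rate mentioned above. |
| */ |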
| dst.set_smear(0); |
| |
| return dst; |
| } |
| |
| void |
| fs_visitor::emit_shader_time_begin() |
| { |
| shader_start_time = get_timestamp(bld.annotate("shader time start")); |
| } |
| |
| void |
| fs_visitor::emit_shader_time_end() |
| { |
| /* Insert our code just before the final SEND with EOT. */ |
| exec_node *end = this->instructions.get_tail(); |
| assert(end && ((fs_inst *) end)->eot); |
| const fs_builder ibld = bld.annotate("shader time end") |
| .exec_all().at(NULL, end); |
| |
| fs_reg shader_end_time = get_timestamp(ibld); |
| |
| /* Check that there weren't any timestamp reset events (assuming these |
| * were the only two timestamp reads that happened). |
| */ |
| fs_reg reset = shader_end_time; |
| reset.set_smear(2); |
| set_condmod(BRW_CONDITIONAL_Z, |
| ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u))); |
| ibld.IF(BRW_PREDICATE_NORMAL); |
| |
| fs_reg start = shader_start_time; |
| start.negate = true; |
| fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); |
| diff.set_smear(0); |
| |
| const fs_builder cbld = ibld.group(1, 0); |
| cbld.ADD(diff, start, shader_end_time); |
| |
| /* If there were no instructions between the two timestamp gets, the diff |
| * is 2 cycles. Remove that overhead, so I can forget about that when |
| * trying to determine the time taken for single instructions. |
| */ |
| cbld.ADD(diff, diff, fs_reg(-2u)); |
| SHADER_TIME_ADD(cbld, 0, diff); |
| SHADER_TIME_ADD(cbld, 1, fs_reg(1u)); |
| ibld.emit(BRW_OPCODE_ELSE); |
| SHADER_TIME_ADD(cbld, 2, fs_reg(1u)); |
| ibld.emit(BRW_OPCODE_ENDIF); |
| } |
| |
| void |
| fs_visitor::SHADER_TIME_ADD(const fs_builder &bld, |
| int shader_time_subindex, |
| fs_reg value) |
| { |
| int index = shader_time_index * 3 + shader_time_subindex; |
| fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE); |
| |
| fs_reg payload; |
| if (dispatch_width == 8) |
| payload = vgrf(glsl_type::uvec2_type); |
| else |
| payload = vgrf(glsl_type::uint_type); |
| |
| bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value); |
| } |
| |
| void |
| fs_visitor::vfail(const char *format, va_list va) |
| { |
| char *msg; |
| |
| if (failed) |
| return; |
| |
| failed = true; |
| |
| msg = ralloc_vasprintf(mem_ctx, format, va); |
| msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); |
| |
| this->fail_msg = msg; |
| |
| if (debug_enabled) { |
| fprintf(stderr, "%s", msg); |
| } |
| } |
| |
| void |
| fs_visitor::fail(const char *format, ...) |
| { |
| va_list va; |
| |
| va_start(va, format); |
| vfail(format, va); |
| va_end(va); |
| } |
| |
| /** |
| * Mark this program as impossible to compile in SIMD16 mode. |
| * |
| * During the SIMD8 compile (which happens first), we can detect and flag |
| * things that are unsupported in SIMD16 mode, so the compiler can skip |
| * the SIMD16 compile altogether. |
| * |
| * During a SIMD16 compile (if one happens anyway), this just calls fail(). |
| */ |
| void |
| fs_visitor::no16(const char *msg) |
| { |
| if (dispatch_width == 16) { |
| fail("%s", msg); |
| } else { |
| simd16_unsupported = true; |
| |
| compiler->shader_perf_log(log_data, |
| "SIMD16 shader failed to compile: %s", msg); |
| } |
| } |
| |
| /** |
| * Returns true if the instruction has a flag that means it won't |
| * update an entire destination register. |
| * |
| * For example, dead code elimination and live variable analysis want to know |
| * when a write to a variable screens off any preceding values that were in |
| * it. |
| */ |
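| /* For example, a predicated non-SEL write, or an 8-wide write of a 16-bit |
| * type (8 * 2 == 16 bytes, half a 32-byte register), both leave part of |
| * the destination register intact. |
| */ |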
| bool |
| fs_inst::is_partial_write() const |
| { |
| return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || |
| (this->exec_size * type_sz(this->dst.type)) < 32 || |
| !this->dst.is_contiguous()); |
| } |
| |
| int |
| fs_inst::regs_read(int arg) const |
| { |
| unsigned components = 1; |
| switch (opcode) { |
| case FS_OPCODE_FB_WRITE: |
| case SHADER_OPCODE_URB_WRITE_SIMD8: |
| case SHADER_OPCODE_UNTYPED_ATOMIC: |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ: |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: |
| case SHADER_OPCODE_TYPED_ATOMIC: |
| case SHADER_OPCODE_TYPED_SURFACE_READ: |
| case SHADER_OPCODE_TYPED_SURFACE_WRITE: |
| case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
| if (arg == 0) |
| return mlen; |
| break; |
| |
| case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: |
| /* The payload is actually stored in src1 */ |
| if (arg == 1) |
| return mlen; |
| break; |
| |
| case FS_OPCODE_LINTERP: |
| if (arg == 0) |
| return exec_size / 4; |
| else |
| return 1; |
| |
| case FS_OPCODE_PIXEL_X: |
| case FS_OPCODE_PIXEL_Y: |
| if (arg == 0) |
| components = 2; |
| break; |
| |
| case SHADER_OPCODE_LOAD_PAYLOAD: |
| if (arg < this->header_size) |
| return 1; |
| break; |
| |
| case CS_OPCODE_CS_TERMINATE: |
| return 1; |
| |
| default: |
| if (is_tex() && arg == 0 && src[0].file == GRF) |
| return mlen; |
| break; |
| } |
| |
| switch (src[arg].file) { |
| case BAD_FILE: |
| case UNIFORM: |
| case IMM: |
| return 1; |
| case GRF: |
| case HW_REG: |
| return DIV_ROUND_UP(components * src[arg].component_size(exec_size), |
| REG_SIZE); |
| case MRF: |
| unreachable("MRF registers are not allowed as sources"); |
| default: |
| unreachable("Invalid register file"); |
| } |
| } |
| |
| bool |
| fs_inst::reads_flag() const |
| { |
| return predicate; |
| } |
| |
| bool |
| fs_inst::writes_flag() const |
| { |
| return (conditional_mod && (opcode != BRW_OPCODE_SEL && |
| opcode != BRW_OPCODE_IF && |
| opcode != BRW_OPCODE_WHILE)) || |
| opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS; |
| } |
| |
| /** |
| * Returns how many MRFs an FS opcode will write over. |
| * |
| * Note that this is not the 0 or 1 implied writes in an actual gen |
| * instruction -- the FS opcodes often generate MOVs in addition. |
| */ |
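| /* For example, a SIMD16 SHADER_OPCODE_POW has two operands, so it implies |
| * 2 * 16 / 8 == 4 MRF register writes. |
| */ |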
| int |
| fs_visitor::implied_mrf_writes(fs_inst *inst) |
| { |
| if (inst->mlen == 0) |
| return 0; |
| |
| if (inst->base_mrf == -1) |
| return 0; |
| |
| switch (inst->opcode) { |
| case SHADER_OPCODE_RCP: |
| case SHADER_OPCODE_RSQ: |
| case SHADER_OPCODE_SQRT: |
| case SHADER_OPCODE_EXP2: |
| case SHADER_OPCODE_LOG2: |
| case SHADER_OPCODE_SIN: |
| case SHADER_OPCODE_COS: |
| return 1 * dispatch_width / 8; |
| case SHADER_OPCODE_POW: |
| case SHADER_OPCODE_INT_QUOTIENT: |
| case SHADER_OPCODE_INT_REMAINDER: |
| return 2 * dispatch_width / 8; |
| case SHADER_OPCODE_TEX: |
| case FS_OPCODE_TXB: |
| case SHADER_OPCODE_TXD: |
| case SHADER_OPCODE_TXF: |
| case SHADER_OPCODE_TXF_CMS: |
| case SHADER_OPCODE_TXF_MCS: |
| case SHADER_OPCODE_TG4: |
| case SHADER_OPCODE_TG4_OFFSET: |
| case SHADER_OPCODE_TXL: |
| case SHADER_OPCODE_TXS: |
| case SHADER_OPCODE_LOD: |
| return 1; |
| case FS_OPCODE_FB_WRITE: |
| return 2; |
| case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: |
| case SHADER_OPCODE_GEN4_SCRATCH_READ: |
| return 1; |
| case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: |
| return inst->mlen; |
| case SHADER_OPCODE_GEN4_SCRATCH_WRITE: |
| return inst->mlen; |
| case SHADER_OPCODE_UNTYPED_ATOMIC: |
| case SHADER_OPCODE_UNTYPED_SURFACE_READ: |
| case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: |
| case SHADER_OPCODE_TYPED_ATOMIC: |
| case SHADER_OPCODE_TYPED_SURFACE_READ: |
| case SHADER_OPCODE_TYPED_SURFACE_WRITE: |
| case SHADER_OPCODE_URB_WRITE_SIMD8: |
| case FS_OPCODE_INTERPOLATE_AT_CENTROID: |
| case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
| case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
| case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
| return 0; |
| default: |
| unreachable("not reached"); |
| } |
| } |
| |
| fs_reg |
| fs_visitor::vgrf(const glsl_type *const type) |
| { |
| int reg_width = dispatch_width / 8; |
| return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width), |
| brw_type_for_base_type(type)); |
| } |
| |
| /** Fixed HW reg constructor. */ |
| fs_reg::fs_reg(enum register_file file, int reg) |
| { |
| init(); |
| this->file = file; |
| this->reg = reg; |
| this->type = BRW_REGISTER_TYPE_F; |
| this->stride = (file == UNIFORM ? 0 : 1); |
| } |
| |
| /** Fixed HW reg constructor. */ |
| fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type) |
| { |
| init(); |
| this->file = file; |
| this->reg = reg; |
| this->type = type; |
| this->stride = (file == UNIFORM ? 0 : 1); |
| } |
| |
| /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. |
| * This brings in those uniform definitions. |
| */ |
| void |
| fs_visitor::import_uniforms(fs_visitor *v) |
| { |
| this->push_constant_loc = v->push_constant_loc; |
| this->pull_constant_loc = v->pull_constant_loc; |
| this->uniforms = v->uniforms; |
| this->param_size = v->param_size; |
| } |
| |
| fs_reg * |
| fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, |
| bool origin_upper_left) |
| { |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type)); |
| fs_reg wpos = *reg; |
| bool flip = !origin_upper_left ^ key->render_to_fbo; |
| |
| /* gl_FragCoord.x */ |
| if (pixel_center_integer) { |
| bld.MOV(wpos, this->pixel_x); |
| } else { |
| bld.ADD(wpos, this->pixel_x, fs_reg(0.5f)); |
| } |
| wpos = offset(wpos, bld, 1); |
| |
| /* gl_FragCoord.y */ |
| if (!flip && pixel_center_integer) { |
| bld.MOV(wpos, this->pixel_y); |
| } else { |
| fs_reg pixel_y = this->pixel_y; |
| float offset = (pixel_center_integer ? 0.0 : 0.5); |
| |
| if (flip) { |
| pixel_y.negate = true; |
| offset += key->drawable_height - 1.0; |
| } |
| |
| bld.ADD(wpos, pixel_y, fs_reg(offset)); |
| } |
| wpos = offset(wpos, bld, 1); |
| |
| /* gl_FragCoord.z */ |
| if (devinfo->gen >= 6) { |
| bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))); |
| } else { |
| bld.emit(FS_OPCODE_LINTERP, wpos, |
| this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], |
| interp_reg(VARYING_SLOT_POS, 2)); |
| } |
| wpos = offset(wpos, bld, 1); |
| |
| /* gl_FragCoord.w: Already set up in emit_interpolation */ |
| bld.MOV(wpos, this->wpos_w); |
| |
| return reg; |
| } |
| |
| fs_inst * |
| fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, |
| glsl_interp_qualifier interpolation_mode, |
| bool is_centroid, bool is_sample) |
| { |
| brw_wm_barycentric_interp_mode barycoord_mode; |
| if (devinfo->gen >= 6) { |
| if (is_centroid) { |
| if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) |
| barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; |
| else |
| barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; |
| } else if (is_sample) { |
| if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) |
| barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC; |
| else |
| barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC; |
| } else { |
| if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) |
| barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; |
| else |
| barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; |
| } |
| } else { |
| /* On Ironlake and below, there is only one interpolation mode. |
| * Centroid interpolation doesn't mean anything on this hardware -- |
| * there is no multisampling. |
| */ |
| barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; |
| } |
| return bld.emit(FS_OPCODE_LINTERP, attr, |
| this->delta_xy[barycoord_mode], interp); |
| } |
| |
| void |
| fs_visitor::emit_general_interpolation(fs_reg attr, const char *name, |
| const glsl_type *type, |
| glsl_interp_qualifier interpolation_mode, |
| int location, bool mod_centroid, |
| bool mod_sample) |
| { |
| attr.type = brw_type_for_base_type(type->get_scalar_type()); |
| |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| |
| unsigned int array_elements; |
| |
| if (type->is_array()) { |
| array_elements = type->length; |
| if (array_elements == 0) { |
| fail("dereferenced array '%s' has length 0\n", name); |
| } |
| type = type->fields.array; |
| } else { |
| array_elements = 1; |
| } |
| |
| if (interpolation_mode == INTERP_QUALIFIER_NONE) { |
| bool is_gl_Color = |
| location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1; |
| if (key->flat_shade && is_gl_Color) { |
| interpolation_mode = INTERP_QUALIFIER_FLAT; |
| } else { |
| interpolation_mode = INTERP_QUALIFIER_SMOOTH; |
| } |
| } |
| |
| for (unsigned int i = 0; i < array_elements; i++) { |
| for (unsigned int j = 0; j < type->matrix_columns; j++) { |
| if (prog_data->urb_setup[location] == -1) { |
| /* If there's no incoming setup data for this slot, don't |
| * emit interpolation for it. |
| */ |
| attr = offset(attr, bld, type->vector_elements); |
| location++; |
| continue; |
| } |
| |
| if (interpolation_mode == INTERP_QUALIFIER_FLAT) { |
| /* Constant interpolation (flat shading) case. The SF has |
| * handed us defined values in only the constant offset |
| * field of the setup reg. |
| */ |
| for (unsigned int k = 0; k < type->vector_elements; k++) { |
| struct brw_reg interp = interp_reg(location, k); |
| interp = suboffset(interp, 3); |
| interp.type = attr.type; |
| bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp)); |
| attr = offset(attr, bld, 1); |
| } |
| } else { |
| /* Smooth/noperspective interpolation case. */ |
| for (unsigned int k = 0; k < type->vector_elements; k++) { |
| struct brw_reg interp = interp_reg(location, k); |
| if (devinfo->needs_unlit_centroid_workaround && mod_centroid) { |
| /* Get the pixel/sample mask into f0 so that we know |
| * which pixels are lit. Then, for each channel that is |
| * unlit, replace the centroid data with non-centroid |
| * data. |
| */ |
| bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); |
| |
| fs_inst *inst; |
| inst = emit_linterp(attr, fs_reg(interp), interpolation_mode, |
| false, false); |
| inst->predicate = BRW_PREDICATE_NORMAL; |
| inst->predicate_inverse = true; |
| if (devinfo->has_pln) |
| inst->no_dd_clear = true; |
| |
| inst = emit_linterp(attr, fs_reg(interp), interpolation_mode, |
| mod_centroid && !key->persample_shading, |
| mod_sample || key->persample_shading); |
| inst->predicate = BRW_PREDICATE_NORMAL; |
| inst->predicate_inverse = false; |
| if (devinfo->has_pln) |
| inst->no_dd_check = true; |
| |
| } else { |
| emit_linterp(attr, fs_reg(interp), interpolation_mode, |
| mod_centroid && !key->persample_shading, |
| mod_sample || key->persample_shading); |
| } |
| if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) { |
| bld.MUL(attr, attr, this->pixel_w); |
| } |
| attr = offset(attr, bld, 1); |
| } |
| |
| } |
| location++; |
| } |
| } |
| } |
| |
| fs_reg * |
| fs_visitor::emit_frontfacing_interpolation() |
| { |
| fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type)); |
| |
| if (devinfo->gen >= 6) { |
| /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create |
| * a boolean result from this (~0/true or 0/false). |
| * |
| * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish |
| * this task in only one instruction: |
| * - a negation source modifier will flip the bit; and |
| * - a W -> D type conversion will sign extend the bit into the high |
| * word of the destination. |
| * |
| * An ASR 15 fills the low word of the destination. |
| */ |
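| /* Illustrative walk-through: front facing means bit 15 of g0.0 is 0; |
| * the negation flips it to 1, the W -> D conversion sign-extends it |
| * through the high word, and the ASR below smears it across the low |
| * word, giving ~0 (true). A back-facing polygon yields 0 the same way. |
| */ |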
| fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); |
| g0.negate = true; |
| |
| bld.ASR(*reg, g0, fs_reg(15)); |
| } else { |
| /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create |
| * a boolean result from this (1/true or 0/false). |
| * |
| * Like in the above case, since the bit is the MSB of g1.6:UD we can use |
| * the negation source modifier to flip it. Unfortunately the SHR |
| * instruction only operates on UD (or D with an abs source modifier) |
| * sources without negation. |
| * |
| * Instead, use ASR (which will give ~0/true or 0/false). |
| */ |
| fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); |
| g1_6.negate = true; |
| |
| bld.ASR(*reg, g1_6, fs_reg(31)); |
| } |
| |
| return reg; |
| } |
| |
| void |
| fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) |
| { |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| assert(dst.type == BRW_REGISTER_TYPE_F); |
| |
| if (key->compute_pos_offset) { |
| /* Convert int_sample_pos to floating point */ |
| bld.MOV(dst, int_sample_pos); |
| /* Scale to the range [0, 1] */ |
| bld.MUL(dst, dst, fs_reg(1 / 16.0f)); |
| } |
| else { |
| /* From ARB_sample_shading specification: |
| * "When rendering to a non-multisample buffer, or if multisample |
| * rasterization is disabled, gl_SamplePosition will always be |
| * (0.5, 0.5)." |
| */ |
| bld.MOV(dst, fs_reg(0.5f)); |
| } |
| } |
| |
| fs_reg * |
| fs_visitor::emit_samplepos_setup() |
| { |
| assert(devinfo->gen >= 6); |
| |
| const fs_builder abld = bld.annotate("compute sample position"); |
| fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type)); |
| fs_reg pos = *reg; |
| fs_reg int_sample_x = vgrf(glsl_type::int_type); |
| fs_reg int_sample_y = vgrf(glsl_type::int_type); |
| |
| /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16 |
| * mode will be enabled. |
| * |
| * From the Ivy Bridge PRM, volume 2 part 1, page 344: |
| * R31.1:0 Position Offset X/Y for Slot[3:0] |
| * R31.3:2 Position Offset X/Y for Slot[7:4] |
| * ..... |
| * |
| * The X, Y sample positions come in as bytes in thread payload. So, read |
| * the positions using vstride=16, width=8, hstride=2. |
| */ |
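| /* Illustratively, with payload bytes laid out as (x0, y0, x1, y1, ...), |
| * the (16, 8, 2) region reads x0..x7 in its first row and x8..x15 in its |
| * second; the Y bytes are read separately via suboffset(..., 1). |
| */ |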
| struct brw_reg sample_pos_reg = |
| stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0), |
| BRW_REGISTER_TYPE_B), 16, 8, 2); |
| |
| if (dispatch_width == 8) { |
| abld.MOV(int_sample_x, fs_reg(sample_pos_reg)); |
| } else { |
| abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)); |
| abld.half(1).MOV(half(int_sample_x, 1), |
| fs_reg(suboffset(sample_pos_reg, 16))); |
| } |
| /* Compute gl_SamplePosition.x */ |
| compute_sample_position(pos, int_sample_x); |
| pos = offset(pos, abld, 1); |
| if (dispatch_width == 8) { |
| abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))); |
| } else { |
| abld.half(0).MOV(half(int_sample_y, 0), |
| fs_reg(suboffset(sample_pos_reg, 1))); |
| abld.half(1).MOV(half(int_sample_y, 1), |
| fs_reg(suboffset(sample_pos_reg, 17))); |
| } |
| /* Compute gl_SamplePosition.y */ |
| compute_sample_position(pos, int_sample_y); |
| return reg; |
| } |
| |
| fs_reg * |
| fs_visitor::emit_sampleid_setup() |
| { |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| assert(devinfo->gen >= 6); |
| |
| const fs_builder abld = bld.annotate("compute sample id"); |
| fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); |
| |
| if (key->compute_sample_id) { |
| fs_reg t1 = vgrf(glsl_type::int_type); |
| fs_reg t2 = vgrf(glsl_type::int_type); |
| t2.type = BRW_REGISTER_TYPE_UW; |
| |
| /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with |
| * 8x multisampling, subspan 0 will represent sample N (where N |
| * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or |
| * 7. We can find the value of N by looking at R0.0 bits 7:6 |
| * ("Starting Sample Pair Index (SSPI)") and multiplying by two |
| * (since samples are always delivered in pairs). That is, we |
| * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then |
| * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in |
| * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, |
| * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by |
| * populating a temporary variable with the sequence (0, 1, 2, 3), |
| * and then reading from it using vstride=1, width=4, hstride=0. |
| * These computations hold good for 4x multisampling as well. |
| * |
| * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1): |
| * the first four slots are sample 0 of subspan 0; the next four |
| * are sample 1 of subspan 0; the third group is sample 0 of |
| * subspan 1, and finally sample 1 of subspan 1. |
| */ |
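| /* Concretely (as an illustration): if R0.0 bits 7:6 read 0b10, then |
| * (R0.0 & 0xc0) >> 5 == 4, so subspan 0 holds sample 4 and subspan 1 |
| * holds sample 5; adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) |
| * to t1 == 4 yields per-channel sample IDs (4, 4, 4, 4, 5, 5, 5, 5). |
| */ |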
| abld.exec_all() |
| .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), |
| fs_reg(0xc0)); |
| abld.exec_all().SHR(t1, t1, fs_reg(5)); |
| |
| /* This works for both SIMD8 and SIMD16 */ |
| abld.exec_all() |
| .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)); |
| |
| /* This special instruction takes care of setting vstride=1, |
| * width=4, hstride=0 of t2 during an ADD instruction. |
| */ |
| abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2); |
| } else { |
| /* As per GL_ARB_sample_shading specification: |
| * "When rendering to a non-multisample buffer, or if multisample |
| * rasterization is disabled, gl_SampleID will always be zero." |
| */ |
| abld.MOV(*reg, fs_reg(0)); |
| } |
| |
| return reg; |
| } |
| |
| void |
| fs_visitor::resolve_source_modifiers(fs_reg *src) |
| { |
| if (!src->abs && !src->negate) |
| return; |
| |
| fs_reg temp = bld.vgrf(src->type); |
| bld.MOV(temp, *src); |
| *src = temp; |
| } |
| |
| void |
| fs_visitor::emit_discard_jump() |
| { |
| assert(((brw_wm_prog_data*) this->prog_data)->uses_kill); |
| |
| /* For performance, after a discard, jump to the end of the |
| * shader if all relevant channels have been discarded. |
| */ |
| fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP); |
| discard_jump->flag_subreg = 1; |
| |
| discard_jump->predicate = (dispatch_width == 8) |
| ? BRW_PREDICATE_ALIGN1_ANY8H |
| : BRW_PREDICATE_ALIGN1_ANY16H; |
| discard_jump->predicate_inverse = true; |
| } |
| |
| void |
| fs_visitor::assign_curb_setup() |
| { |
| if (dispatch_width == 8) { |
| prog_data->dispatch_grf_start_reg = payload.num_regs; |
| } else { |
| if (stage == MESA_SHADER_FRAGMENT) { |
| brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
| prog_data->dispatch_grf_start_reg_16 = payload.num_regs; |
| } else if (stage == MESA_SHADER_COMPUTE) { |
| brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; |
| prog_data->dispatch_grf_start_reg_16 = payload.num_regs; |
| } else { |
| unreachable("Unsupported shader type!"); |
| } |
| } |
| |
| prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8; |
| |
| /* Map the offsets in the UNIFORM file to fixed HW regs. */ |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| for (unsigned int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == UNIFORM) { |
| int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset; |
| int constant_nr; |
| if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { |
| constant_nr = push_constant_loc[uniform_nr]; |
| } else { |
| /* Section 5.11 of the OpenGL 4.1 spec says: |
| * "Out-of-bounds reads return undefined values, which include |
| * values from other variables of the active program or zero." |
| * Just return the first push constant. |
| */ |
| constant_nr = 0; |
| } |
| |
| struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs + |
| constant_nr / 8, |
| constant_nr % 8); |
| |
| assert(inst->src[i].stride == 0); |
| inst->src[i].file = HW_REG; |
| inst->src[i].fixed_hw_reg = byte_offset( |
| retype(brw_reg, inst->src[i].type), |
| inst->src[i].subreg_offset); |
| } |
| } |
| } |
| } |
| |
| void |
| fs_visitor::calculate_urb_setup() |
| { |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| |
| memset(prog_data->urb_setup, -1, |
| sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX); |
| |
| int urb_next = 0; |
| /* Figure out where each of the incoming setup attributes lands. */ |
| if (devinfo->gen >= 6) { |
| if (_mesa_bitcount_64(prog->InputsRead & |
| BRW_FS_VARYING_INPUT_MASK) <= 16) { |
| /* The SF/SBE pipeline stage can do arbitrary rearrangement of the |
| * first 16 varying inputs, so we can put them wherever we want. |
| * Just put them in order. |
| * |
| * This is useful because it means that (a) inputs not used by the |
| * fragment shader won't take up valuable register space, and (b) we |
| * won't have to recompile the fragment shader if it gets paired with |
| * a different vertex (or geometry) shader. |
| */ |
| for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { |
| if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK & |
| BITFIELD64_BIT(i)) { |
| prog_data->urb_setup[i] = urb_next++; |
| } |
| } |
| } else { |
| /* We have enough input varyings that the SF/SBE pipeline stage can't |
| * arbitrarily rearrange them to suit our whim; we have to put them |
| * in an order that matches the output of the previous pipeline stage |
| * (geometry or vertex shader). |
| */ |
| struct brw_vue_map prev_stage_vue_map; |
| brw_compute_vue_map(devinfo, &prev_stage_vue_map, |
| key->input_slots_valid); |
| int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET; |
| assert(prev_stage_vue_map.num_slots <= first_slot + 32); |
| for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; |
| slot++) { |
| int varying = prev_stage_vue_map.slot_to_varying[slot]; |
| /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is |
| * unused. |
| */ |
| if (varying != BRW_VARYING_SLOT_COUNT && |
| (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK & |
| BITFIELD64_BIT(varying))) { |
| prog_data->urb_setup[varying] = slot - first_slot; |
| } |
| } |
| urb_next = prev_stage_vue_map.num_slots - first_slot; |
| } |
| } else { |
| /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */ |
| for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { |
| /* Point size is packed into the header, not as a general attribute */ |
| if (i == VARYING_SLOT_PSIZ) |
| continue; |
| |
| if (key->input_slots_valid & BITFIELD64_BIT(i)) { |
| /* The back color slot is skipped when the front color is |
| * also written to. In addition, some slots can be |
| * written in the vertex shader and not read in the |
| * fragment shader. So the register number must always be |
| * incremented, mapped or not. |
| */ |
| if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) |
| prog_data->urb_setup[i] = urb_next; |
| urb_next++; |
| } |
| } |
| |
| /* |
| * It's an FS-only attribute, and we did interpolation for this attribute |
| * in the SF thread. So, count it here, too. |
| * |
| * See compile_sf_prog() for more info. |
| */ |
| if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC)) |
| prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; |
| } |
| |
| prog_data->num_varying_inputs = urb_next; |
| } |
| |
| void |
| fs_visitor::assign_urb_setup() |
| { |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
| |
| int urb_start = payload.num_regs + prog_data->base.curb_read_length; |
| |
| /* Offset all the urb_setup[] index by the actual position of the |
| * setup regs, now that the location of the constants has been chosen. |
| */ |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->opcode == FS_OPCODE_LINTERP) { |
| assert(inst->src[1].file == HW_REG); |
| inst->src[1].fixed_hw_reg.nr += urb_start; |
| } |
| |
| if (inst->opcode == FS_OPCODE_CINTERP) { |
| assert(inst->src[0].file == HW_REG); |
| inst->src[0].fixed_hw_reg.nr += urb_start; |
| } |
| } |
| |
| /* Each attribute is 4 setup channels, each of which is half a reg. */ |
| this->first_non_payload_grf = |
| urb_start + prog_data->num_varying_inputs * 2; |
| } |
| |
| void |
| fs_visitor::assign_vs_urb_setup() |
| { |
| brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; |
| int grf, count, slot, channel, attr; |
| |
| assert(stage == MESA_SHADER_VERTEX); |
| count = _mesa_bitcount_64(vs_prog_data->inputs_read); |
| if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) |
| count++; |
| |
| /* Each attribute is 4 regs. */ |
| this->first_non_payload_grf = |
| payload.num_regs + prog_data->curb_read_length + count * 4; |
| |
| unsigned vue_entries = |
| MAX2(count, vs_prog_data->base.vue_map.num_slots); |
| |
| vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4; |
| vs_prog_data->base.urb_read_length = (count + 1) / 2; |
| |
| assert(vs_prog_data->base.urb_read_length <= 15); |
| |
| /* Rewrite all ATTR file references to the hw grf that they land in. */ |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == ATTR) { |
| |
| if (inst->src[i].reg == VERT_ATTRIB_MAX) { |
| slot = count - 1; |
| } else { |
| /* Attributes come in in a contiguous block, ordered by their |
| * gl_vert_attrib value. That means we can compute the slot |
| * number for an attribute by masking out the enabled |
| * attributes before it and counting the bits. |
| */ |
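| /* Illustrative example: with inputs_read == 0b1011 and attr == 3, |
| * BITFIELD64_MASK(3) == 0b111, so slot == |
| * _mesa_bitcount_64(0b1011 & 0b111) == 2. |
| */ |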
| attr = inst->src[i].reg + inst->src[i].reg_offset / 4; |
| slot = _mesa_bitcount_64(vs_prog_data->inputs_read & |
| BITFIELD64_MASK(attr)); |
| } |
| |
| channel = inst->src[i].reg_offset & 3; |
| |
| grf = payload.num_regs + |
| prog_data->curb_read_length + |
| slot * 4 + channel; |
| |
| inst->src[i].file = HW_REG; |
| inst->src[i].fixed_hw_reg = |
| retype(brw_vec8_grf(grf, 0), inst->src[i].type); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Split large virtual GRFs into separate components if we can. |
| * |
| * This is mostly duplicated with what brw_fs_vector_splitting does, |
| * but that's really conservative because it's afraid of doing |
| * splitting that doesn't result in real progress after the rest of |
| * the optimization phases, which would cause infinite looping in |
| * optimization. We can do it once here, safely. This also has the |
| * opportunity to split interpolated values, or maybe even uniforms, |
| * which we don't have at the IR level. |
| * |
| * We want to split, because virtual GRFs are what we register |
| * allocate and spill (due to contiguousness requirements for some |
| * instructions), and they're what we naturally generate in the |
| * codegen process, but most virtual GRFs don't actually need to be |
| * contiguous sets of GRFs. If we split, we'll end up with reduced |
| * live intervals and better dead code elimination and coalescing. |
| */ |
| void |
| fs_visitor::split_virtual_grfs() |
| { |
| int num_vars = this->alloc.count; |
| |
| /* Count the total number of registers */ |
| int reg_count = 0; |
| int vgrf_to_reg[num_vars]; |
| for (int i = 0; i < num_vars; i++) { |
| vgrf_to_reg[i] = reg_count; |
| reg_count += alloc.sizes[i]; |
| } |
| |
| /* An array of "split points". For each register slot, this indicates |
| * if this slot can be separated from the previous slot. Every time an |
| * instruction uses multiple elements of a register (as a source or |
| * destination), we mark the used slots as inseparable. Then we go |
| * through and split the registers into the smallest pieces we can. |
| */ |
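| /* For example (illustrative): a 4-register VGRF starts with split points |
| * at slots 1, 2 and 3. If one instruction writes registers 0..1 and |
| * another reads registers 2..3, the points at slots 1 and 3 are cleared, |
| * leaving a single split at slot 2 and yielding two 2-register VGRFs. |
| */ |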
| bool split_points[reg_count]; |
| memset(split_points, 0, sizeof(split_points)); |
| |
| /* Mark all used registers as fully splittable */ |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->dst.file == GRF) { |
| int reg = vgrf_to_reg[inst->dst.reg]; |
| for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++) |
| split_points[reg + j] = true; |
| } |
| |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == GRF) { |
| int reg = vgrf_to_reg[inst->src[i].reg]; |
| for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++) |
| split_points[reg + j] = true; |
| } |
| } |
| } |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->dst.file == GRF) { |
| int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset; |
| for (int j = 1; j < inst->regs_written; j++) |
| split_points[reg + j] = false; |
| } |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == GRF) { |
| int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset; |
| for (int j = 1; j < inst->regs_read(i); j++) |
| split_points[reg + j] = false; |
| } |
| } |
| } |
| |
| int new_virtual_grf[reg_count]; |
| int new_reg_offset[reg_count]; |
| |
| int reg = 0; |
| for (int i = 0; i < num_vars; i++) { |
| /* As a quick sanity check, the first slot should never be a split point. */ |
| assert(split_points[reg] == false); |
| |
| /* j = 0 case */ |
| new_reg_offset[reg] = 0; |
| reg++; |
| int offset = 1; |
| |
| /* j > 0 case */ |
| for (unsigned j = 1; j < alloc.sizes[i]; j++) { |
| /* If this is a split point, reset the offset to 0 and allocate a |
| * new virtual GRF covering the previous `offset` registers. |
| */ |
| if (split_points[reg]) { |
| assert(offset <= MAX_VGRF_SIZE); |
| int grf = alloc.allocate(offset); |
| for (int k = reg - offset; k < reg; k++) |
| new_virtual_grf[k] = grf; |
| offset = 0; |
| } |
| new_reg_offset[reg] = offset; |
| offset++; |
| reg++; |
| } |
| |
| /* The last one gets the original register number */ |
| assert(offset <= MAX_VGRF_SIZE); |
| alloc.sizes[i] = offset; |
| for (int k = reg - offset; k < reg; k++) |
| new_virtual_grf[k] = i; |
| } |
| assert(reg == reg_count); |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->dst.file == GRF) { |
| reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset; |
| inst->dst.reg = new_virtual_grf[reg]; |
| inst->dst.reg_offset = new_reg_offset[reg]; |
| assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); |
| } |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == GRF) { |
| reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset; |
| inst->src[i].reg = new_virtual_grf[reg]; |
| inst->src[i].reg_offset = new_reg_offset[reg]; |
| assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); |
| } |
| } |
| } |
| invalidate_live_intervals(); |
| } |
| |
| /** |
| * Remove unused virtual GRFs and compact the virtual_grf_* arrays. |
| * |
| * During code generation, we create tons of temporary variables, many of |
| * which get immediately killed and are never used again. Yet, in later |
| * optimization and analysis passes, such as compute_live_intervals, we need |
| * to loop over all the virtual GRFs. Compacting them can save a lot of |
| * overhead. |
| */ |
| bool |
| fs_visitor::compact_virtual_grfs() |
| { |
| bool progress = false; |
| int remap_table[this->alloc.count]; |
| memset(remap_table, -1, sizeof(remap_table)); |
| |
| /* Mark which virtual GRFs are used. */ |
| foreach_block_and_inst(block, const fs_inst, inst, cfg) { |
| if (inst->dst.file == GRF) |
| remap_table[inst->dst.reg] = 0; |
| |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == GRF) |
| remap_table[inst->src[i].reg] = 0; |
| } |
| } |
| |
| /* Compact the GRF arrays. */ |
| int new_index = 0; |
| for (unsigned i = 0; i < this->alloc.count; i++) { |
| if (remap_table[i] == -1) { |
| /* We just found an unused register. This means that we are |
| * actually going to compact something. |
| */ |
| progress = true; |
| } else { |
| remap_table[i] = new_index; |
| alloc.sizes[new_index] = alloc.sizes[i]; |
| invalidate_live_intervals(); |
| ++new_index; |
| } |
| } |
| |
| this->alloc.count = new_index; |
| |
| /* Patch all the instructions to use the newly renumbered registers */ |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->dst.file == GRF) |
| inst->dst.reg = remap_table[inst->dst.reg]; |
| |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == GRF) |
| inst->src[i].reg = remap_table[inst->src[i].reg]; |
| } |
| } |
| |
| /* Patch all the references to delta_xy, since they're used in register |
| * allocation. If they're unused, switch them to BAD_FILE so we don't |
| * think some random VGRF is delta_xy. |
| */ |
| for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { |
| if (delta_xy[i].file == GRF) { |
| if (remap_table[delta_xy[i].reg] != -1) { |
| delta_xy[i].reg = remap_table[delta_xy[i].reg]; |
| } else { |
| delta_xy[i].file = BAD_FILE; |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| /* |
| * Implements array access of uniforms by inserting a |
| * PULL_CONSTANT_LOAD instruction. |
| * |
| * Unlike temporary GRF array access (where we don't support it due to |
| * the difficulty of doing relative addressing on instruction |
| * destinations), we could potentially do array access of uniforms |
| * that were loaded in GRF space as push constants. In real-world |
| * usage we've seen, though, the arrays being used are always larger |
| * than we could load as push constants, so just always move all |
| * uniform array access out to a pull constant buffer. |
| */ |
| void |
| fs_visitor::move_uniform_array_access_to_pull_constants() |
| { |
| if (dispatch_width != 8) |
| return; |
| |
| pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); |
| memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms); |
| |
| /* Walk through and find array access of uniforms. Put a copy of that |
| * uniform in the pull constant buffer. |
| * |
| * Note that we don't move constant-indexed accesses to arrays. No |
| * testing has been done of the performance impact of this choice. |
| */ |
| foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr) |
| continue; |
| |
| int uniform = inst->src[i].reg; |
| |
| /* If this array isn't already present in the pull constant buffer, |
| * add it. |
| */ |
| if (pull_constant_loc[uniform] == -1) { |
| const gl_constant_value **values = &stage_prog_data->param[uniform]; |
| |
| assert(param_size[uniform]); |
| |
| for (int j = 0; j < param_size[uniform]; j++) { |
| pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params; |
| |
| stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] = |
| values[j]; |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Assign UNIFORM file registers to either push constants or pull constants. |
| * |
| * We allow a fragment shader to have more than the specified minimum |
| * maximum number of fragment shader uniform components (64). If |
| * there are too many of these, they'd fill up all of the register space. |
| * So, this will push some of them out to the pull constant buffer and |
| * update the program to load them. |
| */ |
| void |
| fs_visitor::assign_constant_locations() |
| { |
| /* Only the first compile (SIMD8 mode) gets to decide on locations. */ |
| if (dispatch_width != 8) |
| return; |
| |
| /* Find which UNIFORM registers are still in use. */ |
| bool is_live[uniforms]; |
| for (unsigned int i = 0; i < uniforms; i++) { |
| is_live[i] = false; |
| } |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file != UNIFORM) |
| continue; |
| |
| int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; |
| if (constant_nr >= 0 && constant_nr < (int) uniforms) |
| is_live[constant_nr] = true; |
| } |
| } |
| |
| /* Only allow 16 registers (128 uniform components) as push constants. |
| * |
| * Just demote the end of the list. We could probably do better |
| * here, demoting things that are rarely used in the program first. |
| * |
| * If changing this value, note the limitation about total_regs in |
| * brw_curbe.c. |
| */ |
| unsigned int max_push_components = 16 * 8; |
| unsigned int num_push_constants = 0; |
| |
| push_constant_loc = ralloc_array(mem_ctx, int, uniforms); |
| |
| for (unsigned int i = 0; i < uniforms; i++) { |
| if (!is_live[i] || pull_constant_loc[i] != -1) { |
| /* This UNIFORM register is either dead, or has already been demoted |
| * to a pull const. Mark it as no longer living in the param[] array. |
| */ |
| push_constant_loc[i] = -1; |
| continue; |
| } |
| |
| if (num_push_constants < max_push_components) { |
| /* Retain as a push constant. Record the location in the params[] |
| * array. |
| */ |
| push_constant_loc[i] = num_push_constants++; |
| } else { |
| /* Demote to a pull constant. */ |
| push_constant_loc[i] = -1; |
| |
| int pull_index = stage_prog_data->nr_pull_params++; |
| stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i]; |
| pull_constant_loc[i] = pull_index; |
| } |
| } |
| |
| stage_prog_data->nr_params = num_push_constants; |
| |
| /* Up until now, the param[] array has been indexed by reg + reg_offset |
| * of UNIFORM registers. Condense it to only contain the uniforms we |
| * chose to upload as push constants. |
| */ |
| for (unsigned int i = 0; i < uniforms; i++) { |
| int remapped = push_constant_loc[i]; |
| |
| if (remapped == -1) |
| continue; |
| |
| assert(remapped <= (int)i); |
| stage_prog_data->param[remapped] = stage_prog_data->param[i]; |
| } |
| } |
| |
| /** |
| * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD |
| * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. |
| */ |
| void |
| fs_visitor::demote_pull_constants() |
| { |
| foreach_block_and_inst (block, fs_inst, inst, cfg) { |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file != UNIFORM) |
| continue; |
| |
| int pull_index; |
| unsigned location = inst->src[i].reg + inst->src[i].reg_offset; |
| if (location >= uniforms) /* Out of bounds access */ |
| pull_index = -1; |
| else |
| pull_index = pull_constant_loc[location]; |
| |
| if (pull_index == -1) |
| continue; |
| |
| /* Set up the annotation tracking for newly generated instructions. */ |
| const fs_builder ibld = bld.annotate(inst->annotation, inst->ir) |
| .at(block, inst); |
| fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start); |
| fs_reg dst = vgrf(glsl_type::float_type); |
| |
| assert(inst->src[i].stride == 0); |
| |
| /* Generate a pull load into dst. */ |
| if (inst->src[i].reladdr) { |
| VARYING_PULL_CONSTANT_LOAD(ibld, dst, |
| surf_index, |
| *inst->src[i].reladdr, |
| pull_index); |
| inst->src[i].reladdr = NULL; |
| inst->src[i].stride = 1; |
| } else { |
| fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15); |
| ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, |
| dst, surf_index, offset); |
| inst->src[i].set_smear(pull_index & 3); |
| } |
| |
| /* Rewrite the instruction to use the temporary VGRF. */ |
| inst->src[i].file = GRF; |
| inst->src[i].reg = dst.reg; |
| inst->src[i].reg_offset = 0; |
| } |
| } |
| invalidate_live_intervals(); |
| } |
| |
| bool |
| fs_visitor::opt_algebraic() |
| { |
| bool progress = false; |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| switch (inst->opcode) { |
| case BRW_OPCODE_MOV: |
| if (inst->src[0].file != IMM) |
| break; |
| |
| if (inst->saturate) { |
| if (inst->dst.type != inst->src[0].type) |
| assert(!"unimplemented: saturate mixed types"); |
| |
| if (brw_saturate_immediate(inst->dst.type, |
| &inst->src[0].fixed_hw_reg)) { |
| inst->saturate = false; |
| progress = true; |
| } |
| } |
| break; |
| |
| case BRW_OPCODE_MUL: |
| if (inst->src[1].file != IMM) |
| continue; |
| |
| /* a * 1.0 = a */ |
| if (inst->src[1].is_one()) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| |
| /* a * -1.0 = -a */ |
| if (inst->src[1].is_negative_one()) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0].negate = !inst->src[0].negate; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| |
| /* a * 0.0 = 0.0 */ |
| if (inst->src[1].is_zero()) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0] = inst->src[1]; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| |
| if (inst->src[0].file == IMM) { |
| assert(inst->src[0].type == BRW_REGISTER_TYPE_F); |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| break; |
| case BRW_OPCODE_ADD: |
| if (inst->src[1].file != IMM) |
| continue; |
| |
| /* a + 0.0 = a */ |
| if (inst->src[1].is_zero()) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| |
| if (inst->src[0].file == IMM) { |
| assert(inst->src[0].type == BRW_REGISTER_TYPE_F); |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| break; |
| case BRW_OPCODE_OR: |
| if (inst->src[0].equals(inst->src[1])) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| progress = true; |
| break; |
| } |
| break; |
| case BRW_OPCODE_LRP: |
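| /* If both interpolation endpoints are equal, the result is that value |
|  * regardless of the interpolation factor. |
|  */ |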
| if (inst->src[1].equals(inst->src[2])) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0] = inst->src[1]; |
| inst->src[1] = reg_undef; |
| inst->src[2] = reg_undef; |
| progress = true; |
| break; |
| } |
| break; |
| case BRW_OPCODE_CMP: |
| if (inst->conditional_mod == BRW_CONDITIONAL_GE && |
| inst->src[0].abs && |
| inst->src[0].negate && |
| inst->src[1].is_zero()) { |
| inst->src[0].abs = false; |
| inst->src[0].negate = false; |
| inst->conditional_mod = BRW_CONDITIONAL_Z; |
| progress = true; |
| break; |
| } |
| break; |
| case BRW_OPCODE_SEL: |
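| /* A sel between identical sources is a plain MOV. With saturate, a sel.l |
|  * against an immediate >= 1.0 (or a sel.ge against <= 0.0) is redundant |
|  * with the [0, 1] clamp, so it too reduces to a MOV. |
|  */ |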
| if (inst->src[0].equals(inst->src[1])) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| inst->predicate = BRW_PREDICATE_NONE; |
| inst->predicate_inverse = false; |
| progress = true; |
| } else if (inst->saturate && inst->src[1].file == IMM) { |
| switch (inst->conditional_mod) { |
| case BRW_CONDITIONAL_LE: |
| case BRW_CONDITIONAL_L: |
| switch (inst->src[1].type) { |
| case BRW_REGISTER_TYPE_F: |
| if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| inst->conditional_mod = BRW_CONDITIONAL_NONE; |
| progress = true; |
| } |
| break; |
| default: |
| break; |
| } |
| break; |
| case BRW_CONDITIONAL_GE: |
| case BRW_CONDITIONAL_G: |
| switch (inst->src[1].type) { |
| case BRW_REGISTER_TYPE_F: |
| if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| inst->conditional_mod = BRW_CONDITIONAL_NONE; |
| progress = true; |
| } |
| break; |
| default: |
| break; |
| } |
| default: |
| break; |
| } |
| } |
| break; |
| case BRW_OPCODE_MAD: |
| if (inst->src[1].is_zero() || inst->src[2].is_zero()) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[1] = reg_undef; |
| inst->src[2] = reg_undef; |
| progress = true; |
| } else if (inst->src[0].is_zero()) { |
| inst->opcode = BRW_OPCODE_MUL; |
| inst->src[0] = inst->src[2]; |
| inst->src[2] = reg_undef; |
| progress = true; |
| } else if (inst->src[1].is_one()) { |
| inst->opcode = BRW_OPCODE_ADD; |
| inst->src[1] = inst->src[2]; |
| inst->src[2] = reg_undef; |
| progress = true; |
| } else if (inst->src[2].is_one()) { |
| inst->opcode = BRW_OPCODE_ADD; |
| inst->src[2] = reg_undef; |
| progress = true; |
| } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) { |
| inst->opcode = BRW_OPCODE_ADD; |
| inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f; |
| inst->src[2] = reg_undef; |
| progress = true; |
| } |
| break; |
| case SHADER_OPCODE_RCP: { |
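| /* rcp(sqrt(x)) = rsq(x), provided the sqrt result feeds directly into |
|  * the rcp. |
|  */ |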
| fs_inst *prev = (fs_inst *)inst->prev; |
| if (prev->opcode == SHADER_OPCODE_SQRT) { |
| if (inst->src[0].equals(prev->dst)) { |
| inst->opcode = SHADER_OPCODE_RSQ; |
| inst->src[0] = prev->src[0]; |
| progress = true; |
| } |
| } |
| break; |
| } |
| case SHADER_OPCODE_BROADCAST: |
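| /* Broadcasting a value that is already uniform, or a channel that is |
|  * known at compile time, is just a force_writemask_all MOV. |
|  */ |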
| if (is_uniform(inst->src[0])) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->sources = 1; |
| inst->force_writemask_all = true; |
| progress = true; |
| } else if (inst->src[1].file == IMM) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0] = component(inst->src[0], |
| inst->src[1].fixed_hw_reg.dw1.ud); |
| inst->sources = 1; |
| inst->force_writemask_all = true; |
| progress = true; |
| } |
| break; |
| |
| default: |
| break; |
| } |
| |
| /* Swap operands so that any immediate ends up in src[1], where later |
|  * passes expect to find it. |
|  */ |
| if (progress && inst->is_commutative()) { |
| if (inst->src[0].file == IMM) { |
| fs_reg tmp = inst->src[1]; |
| inst->src[1] = inst->src[0]; |
| inst->src[0] = tmp; |
| } |
| } |
| } |
| return progress; |
| } |
| |
| /** |
| * Optimize sample messages that have constant zero values for the trailing |
| * texture coordinates. We can just reduce the message length for these |
| * instructions instead of reserving a register for it. Trailing parameters |
| * that aren't sent default to zero anyway. Shortening the message also |
| * lets the dead code eliminator remove the MOV instruction that set up |
| * the now-unread zero value. |
| */ |
| bool |
| fs_visitor::opt_zero_samples() |
| { |
| /* Gen4 infers the texturing opcode based on the message length so we can't |
| * change it. |
| */ |
| if (devinfo->gen < 5) |
| return false; |
| |
| bool progress = false; |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (!inst->is_tex()) |
| continue; |
| |
| fs_inst *load_payload = (fs_inst *) inst->prev; |
| |
| if (load_payload->is_head_sentinel() || |
| load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD) |
| continue; |
| |
| /* We don't want to remove the message header or the first parameter. |
| * Removing the first parameter is not allowed; see the Haswell PRM |
| * volume 7, page 149: |
| * |
| * "Parameter 0 is required except for the sampleinfo message, which |
| * has no parameter 0" |
| */ |
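| /* For example (hypothetical message): in SIMD8 with mlen == 5 and a |
|  * one-register header, the expression below indexes payload source |
|  * (5 - 1) / 1 + 1 - 1 == 4, i.e. the last parameter, and each trailing |
|  * zero found shrinks mlen by one register. |
|  */ |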
| while (inst->mlen > inst->header_size + dispatch_width / 8 && |
| load_payload->src[(inst->mlen - inst->header_size) / |
| (dispatch_width / 8) + |
| inst->header_size - 1].is_zero()) { |
| inst->mlen -= dispatch_width / 8; |
| progress = true; |
| } |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| |
| return progress; |
| } |
| |
| /** |
| * Optimize sample messages which are followed by the final RT write. |
| * |
| * CHV and Gen9+ can mark a texturing SEND instruction with EOT to have its |
| * results sent directly to the framebuffer, bypassing the EU. Recognize the |
| * final texturing results copied to the framebuffer write payload and modify |
| * them to write to the framebuffer directly. |
| */ |
| bool |
| fs_visitor::opt_sampler_eot() |
| { |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| |
| if (stage != MESA_SHADER_FRAGMENT) |
| return false; |
| |
| if (devinfo->gen < 9 && !devinfo->is_cherryview) |
| return false; |
| |
| /* FINISHME: It should be possible to implement this optimization when there |
| * are multiple drawbuffers. |
| */ |
| if (key->nr_color_regions != 1) |
| return false; |
| |
| /* Look for a texturing instruction immediately before the final FB_WRITE. */ |
| fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end(); |
| assert(fb_write->eot); |
| assert(fb_write->opcode == FS_OPCODE_FB_WRITE); |
| |
| fs_inst *tex_inst = (fs_inst *) fb_write->prev; |
| |
| /* There wasn't one; nothing to do. */ |
| if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex()) |
| return false; |
| |
| /* This optimization doesn't seem to work for textureGather for some |
| * reason. I can't find any documentation or known workarounds to indicate |
| * that this is expected, but considering that it is probably pretty |
| * unlikely that a shader would directly write out the results from |
| * textureGather we might as well just disable it. |
| */ |
| if (tex_inst->opcode == SHADER_OPCODE_TG4 || |
| tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET) |
| return false; |
| |
| /* If there's no header present, we need to munge the LOAD_PAYLOAD as well. |
| * It's very likely to be the previous instruction. |
| */ |
| fs_inst *load_payload = (fs_inst *) tex_inst->prev; |
| if (load_payload->is_head_sentinel() || |
| load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD) |
| return false; |
| |
| assert(!tex_inst->eot); /* We can't get here twice */ |
| assert((tex_inst->offset & (0xff << 24)) == 0); |
| |
| tex_inst->offset |= fb_write->target << 24; |
| tex_inst->eot = true; |
| tex_inst->dst = bld.null_reg_ud(); |
| fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); |
| |
| /* If a header is present, marking the eot is sufficient. Otherwise, we need |
| * to create a new LOAD_PAYLOAD command with the same sources and a space |
| * saved for the header. Using a new destination register not only makes |
| * sure we have enough space, but also lets the dead code eliminator kill |
| * the instruction this one replaces. |
| */ |
| if (tex_inst->header_size != 0) |
| return true; |
| |
| fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F, |
| load_payload->sources + 1); |
| fs_reg *new_sources = |
| ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1); |
| |
| new_sources[0] = fs_reg(); |
| for (int i = 0; i < load_payload->sources; i++) |
| new_sources[i+1] = load_payload->src[i]; |
| |
| /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it |
|  * requires a lot of information about the sources to figure out how many |
|  * registers it needs. At this stage of optimization (after copy |
|  * propagation), we may not have the contiguous GRFs that LOAD_PAYLOAD |
|  * expects, so we have to emit the instruction manually. |
|  */ |
| fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, |
| load_payload->exec_size, |
| send_header, |
| new_sources, |
| load_payload->sources + 1); |
| |
| new_load_payload->regs_written = load_payload->regs_written + 1; |
| new_load_payload->header_size = 1; |
| tex_inst->mlen++; |
| tex_inst->header_size = 1; |
| tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload); |
| tex_inst->src[0] = send_header; |
| |
| return true; |
| } |
| |
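| /** |
|  * Give each new top-level, whole-register definition of a virtual GRF a |
|  * fresh register number. |
|  * |
|  * Splitting such register reuses into distinct live ranges leaves the |
|  * scheduler and register allocator less constrained. Definitions inside |
|  * control flow are left alone, since the old value may still be live on |
|  * another path. |
|  */ |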
| bool |
| fs_visitor::opt_register_renaming() |
| { |
| bool progress = false; |
| int depth = 0; |
| |
| int remap[alloc.count]; |
| memset(remap, -1, sizeof(int) * alloc.count); |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) { |
| depth++; |
| } else if (inst->opcode == BRW_OPCODE_ENDIF || |
| inst->opcode == BRW_OPCODE_WHILE) { |
| depth--; |
| } |
| |
| /* Rewrite instruction sources. */ |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].file == GRF && |
| remap[inst->src[i].reg] != -1 && |
| remap[inst->src[i].reg] != inst->src[i].reg) { |
| inst->src[i].reg = remap[inst->src[i].reg]; |
| progress = true; |
| } |
| } |
| |
| const int dst = inst->dst.reg; |
| |
| if (depth == 0 && |
| inst->dst.file == GRF && |
| alloc.sizes[inst->dst.reg] == inst->exec_size / 8 && |
| !inst->is_partial_write()) { |
| if (remap[dst] == -1) { |
| remap[dst] = dst; |
| } else { |
| remap[dst] = alloc.allocate(inst->exec_size / 8); |
| inst->dst.reg = remap[dst]; |
| progress = true; |
| } |
| } else if (inst->dst.file == GRF && |
| remap[dst] != -1 && |
| remap[dst] != dst) { |
| inst->dst.reg = remap[dst]; |
| progress = true; |
| } |
| } |
| |
| if (progress) { |
| invalidate_live_intervals(); |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { |
| if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) { |
| delta_xy[i].reg = remap[delta_xy[i].reg]; |
| } |
| } |
| } |
| |
| return progress; |
| } |
| |
| /** |
| * Remove redundant or useless discard jumps. |
| * |
| * For example, we can eliminate jumps in the following sequence: |
| * |
| * discard-jump (redundant with the next jump) |
| * discard-jump (useless; jumps to the next instruction) |
| * placeholder-halt |
| */ |
| bool |
| fs_visitor::opt_redundant_discard_jumps() |
| { |
| bool progress = false; |
| |
| bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1]; |
| |
| fs_inst *placeholder_halt = NULL; |
| foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) { |
| if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) { |
| placeholder_halt = inst; |
| break; |
| } |
| } |
| |
| if (!placeholder_halt) |
| return false; |
| |
| /* Delete any discard jumps immediately before the placeholder halt. */ |
| for (fs_inst *prev = (fs_inst *) placeholder_halt->prev; |
| !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP; |
| prev = (fs_inst *) placeholder_halt->prev) { |
| prev->remove(last_bblock); |
| progress = true; |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| |
| return progress; |
| } |
| |
| bool |
| fs_visitor::compute_to_mrf() |
| { |
| bool progress = false; |
| int next_ip = 0; |
| |
| /* No MRFs on Gen >= 7. */ |
| if (devinfo->gen >= 7) |
| return false; |
| |
| calculate_live_intervals(); |
| |
| foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
| int ip = next_ip; |
| next_ip++; |
| |
| if (inst->opcode != BRW_OPCODE_MOV || |
| inst->is_partial_write() || |
| inst->dst.file != MRF || inst->src[0].file != GRF || |
| inst->dst.type != inst->src[0].type || |
| inst->src[0].abs || inst->src[0].negate || |
| !inst->src[0].is_contiguous() || |
| inst->src[0].subreg_offset) |
| continue; |
| |
| /* Work out which hardware MRF registers are written by this |
| * instruction. |
| */ |
| int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4; |
| int mrf_high; |
| if (inst->dst.reg & BRW_MRF_COMPR4) { |
| mrf_high = mrf_low + 4; |
| } else if (inst->exec_size == 16) { |
| mrf_high = mrf_low + 1; |
| } else { |
| mrf_high = mrf_low; |
| } |
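| /* E.g. a SIMD16 write to m4 covers m4..m5 (mrf_low 4, mrf_high 5), |
|  * while a COMPR4 write to m4 covers m4 and m8 (mrf_high = mrf_low + 4). |
|  */ |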
| |
| /* Can't compute-to-MRF this GRF if someone else was going to |
| * read it later. |
| */ |
| if (this->virtual_grf_end[inst->src[0].reg] > ip) |
| continue; |
| |
| /* Found a move of a GRF to a MRF. Let's see if we can go |
| * rewrite the thing that made this GRF to write into the MRF. |
| */ |
| foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) { |
| if (scan_inst->dst.file == GRF && |
| scan_inst->dst.reg == inst->src[0].reg) { |
| /* Found the last thing to write our reg we want to turn |
| * into a compute-to-MRF. |
| */ |
| |
| /* If this one instruction didn't populate all the |
| * channels, bail. We might be able to rewrite everything |
| * that writes that reg, but it would require smarter |
| * tracking to delay the rewriting until complete success. |
| */ |
| if (scan_inst->is_partial_write()) |
| break; |
| |
| /* Values spanning more than one register would require us to |
|  * coalesce more than one MOV at a time. |
|  */ |
| if (scan_inst->regs_written > scan_inst->exec_size / 8) |
| break; |
| |
| /* SEND instructions can't have MRF as a destination. */ |
| if (scan_inst->mlen) |
| break; |
| |
| if (devinfo->gen == 6) { |
| /* gen6 math instructions must have the destination be |
| * GRF, so no compute-to-MRF for them. |
| */ |
| if (scan_inst->is_math()) { |
| break; |
| } |
| } |
| |
| if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { |
| /* Found the creator of our MRF's source value. */ |
| scan_inst->dst.file = MRF; |
| scan_inst->dst.reg = inst->dst.reg; |
| scan_inst->saturate |= inst->saturate; |
| inst->remove(block); |
| progress = true; |
| } |
| break; |
| } |
| |
| /* We don't handle control flow here. Most computation of |
| * values that end up in MRFs are shortly before the MRF |
| * write anyway. |
| */ |
| if (block->start() == scan_inst) |
| break; |
| |
| /* You can't read from an MRF, so if someone else reads our |
| * MRF's source GRF that we wanted to rewrite, that stops us. |
| */ |
| bool interfered = false; |
| for (int i = 0; i < scan_inst->sources; i++) { |
| if (scan_inst->src[i].file == GRF && |
| scan_inst->src[i].reg == inst->src[0].reg && |
| scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { |
| interfered = true; |
| } |
| } |
| if (interfered) |
| break; |
| |
| if (scan_inst->dst.file == MRF) { |
| /* If somebody else writes our MRF here, we can't |
| * compute-to-MRF before that. |
| */ |
| int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4; |
| int scan_mrf_high; |
| |
| if (scan_inst->dst.reg & BRW_MRF_COMPR4) { |
| scan_mrf_high = scan_mrf_low + 4; |
| } else if (scan_inst->exec_size == 16) { |
| scan_mrf_high = scan_mrf_low + 1; |
| } else { |
| scan_mrf_high = scan_mrf_low; |
| } |
| |
| if (mrf_low == scan_mrf_low || |
| mrf_low == scan_mrf_high || |
| mrf_high == scan_mrf_low || |
| mrf_high == scan_mrf_high) { |
| break; |
| } |
| } |
| |
| if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) { |
| /* Found a SEND instruction, which means that there are |
| * live values in MRFs from base_mrf to base_mrf + |
| * scan_inst->mlen - 1. Don't go pushing our MRF write up |
| * above it. |
| */ |
| if (mrf_low >= scan_inst->base_mrf && |
| mrf_low < scan_inst->base_mrf + scan_inst->mlen) { |
| break; |
| } |
| if (mrf_high >= scan_inst->base_mrf && |
| mrf_high < scan_inst->base_mrf + scan_inst->mlen) { |
| break; |
| } |
| } |
| } |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| |
| return progress; |
| } |
| |
| /** |
| * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control |
| * flow. We could probably do better here with some form of divergence |
| * analysis. |
| */ |
| bool |
| fs_visitor::eliminate_find_live_channel() |
| { |
| bool progress = false; |
| unsigned depth = 0; |
| |
| foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
| switch (inst->opcode) { |
| case BRW_OPCODE_IF: |
| case BRW_OPCODE_DO: |
| depth++; |
| break; |
| |
| case BRW_OPCODE_ENDIF: |
| case BRW_OPCODE_WHILE: |
| depth--; |
| break; |
| |
| case FS_OPCODE_DISCARD_JUMP: |
| /* This can potentially make control flow non-uniform until the end |
| * of the program. |
| */ |
| return progress; |
| |
| case SHADER_OPCODE_FIND_LIVE_CHANNEL: |
| if (depth == 0) { |
| inst->opcode = BRW_OPCODE_MOV; |
| inst->src[0] = fs_reg(0); |
| inst->sources = 1; |
| inst->force_writemask_all = true; |
| progress = true; |
| } |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| return progress; |
| } |
| |
| /** |
|  * Emit a replicated-data clear ("repclear") shader: a single uniform color |
|  * is broadcast to every render target with FS_OPCODE_REP_FB_WRITE instead |
|  * of the normal per-channel FS_OPCODE_FB_WRITE instructions. |
|  */ |
| void |
| fs_visitor::emit_repclear_shader() |
| { |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| int base_mrf = 1; |
| int color_mrf = base_mrf + 2; |
| |
| fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)), |
| fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)); |
| |
| fs_inst *write; |
| if (key->nr_color_regions == 1) { |
| write = bld.emit(FS_OPCODE_REP_FB_WRITE); |
| write->saturate = key->clamp_fragment_color; |
| write->base_mrf = color_mrf; |
| write->target = 0; |
| write->header_size = 0; |
| write->mlen = 1; |
| } else { |
| assume(key->nr_color_regions > 0); |
| for (int i = 0; i < key->nr_color_regions; ++i) { |
| write = bld.emit(FS_OPCODE_REP_FB_WRITE); |
| write->saturate = key->clamp_fragment_color; |
| write->base_mrf = base_mrf; |
| write->target = i; |
| write->header_size = 2; |
| write->mlen = 3; |
| } |
| } |
| write->eot = true; |
| |
| calculate_cfg(); |
| |
| assign_constant_locations(); |
| assign_curb_setup(); |
| |
| /* Now that we have the uniform assigned, go ahead and force it to a vec4. */ |
| assert(mov->src[0].file == HW_REG); |
| mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0); |
| } |
| |
| /** |
| * Walks through basic blocks, looking for repeated MRF writes and |
| * removing the later ones. |
| */ |
| bool |
| fs_visitor::remove_duplicate_mrf_writes() |
| { |
| fs_inst *last_mrf_move[16]; |
| bool progress = false; |
| |
| /* Need to update the MRF tracking for compressed instructions. */ |
| if (dispatch_width == 16) |
| return false; |
| |
| memset(last_mrf_move, 0, sizeof(last_mrf_move)); |
| |
| foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { |
| if (inst->is_control_flow()) { |
| memset(last_mrf_move, 0, sizeof(last_mrf_move)); |
| } |
| |
| if (inst->opcode == BRW_OPCODE_MOV && |
| inst->dst.file == MRF) { |
| fs_inst *prev_inst = last_mrf_move[inst->dst.reg]; |
| if (prev_inst && inst->equals(prev_inst)) { |
| inst->remove(block); |
| progress = true; |
| continue; |
| } |
| } |
| |
| /* Clear out the last-write records for MRFs that were overwritten. */ |
| if (inst->dst.file == MRF) { |
| last_mrf_move[inst->dst.reg] = NULL; |
| } |
| |
| if (inst->mlen > 0 && inst->base_mrf != -1) { |
| /* Found a SEND instruction, which will include two or fewer |
| * implied MRF writes. We could do better here. |
| */ |
| for (int i = 0; i < implied_mrf_writes(inst); i++) { |
| last_mrf_move[inst->base_mrf + i] = NULL; |
| } |
| } |
| |
| /* Clear out any MRF move records whose sources got overwritten. */ |
| if (inst->dst.file == GRF) { |
| for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { |
| if (last_mrf_move[i] && |
| last_mrf_move[i]->src[0].reg == inst->dst.reg) { |
| last_mrf_move[i] = NULL; |
| } |
| } |
| } |
| |
| if (inst->opcode == BRW_OPCODE_MOV && |
| inst->dst.file == MRF && |
| inst->src[0].file == GRF && |
| !inst->is_partial_write()) { |
| last_mrf_move[inst->dst.reg] = inst; |
| } |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| |
| return progress; |
| } |
| |
| static void |
| clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) |
| { |
| /* Clear the flag for registers that actually got read (as expected). */ |
| for (int i = 0; i < inst->sources; i++) { |
| int grf; |
| if (inst->src[i].file == GRF) { |
| grf = inst->src[i].reg; |
| } else if (inst->src[i].file == HW_REG && |
| inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { |
| grf = inst->src[i].fixed_hw_reg.nr; |
| } else { |
| continue; |
| } |
| |
| if (grf >= first_grf && |
| grf < first_grf + grf_len) { |
| deps[grf - first_grf] = false; |
| if (inst->exec_size == 16) |
| deps[grf - first_grf + 1] = false; |
| } |
| } |
| } |
| |
| /** |
| * Implements this workaround for the original 965: |
| * |
| * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not |
| * check for post destination dependencies on this instruction, software |
| * must ensure that there is no destination hazard for the case of ‘write |
| * followed by a posted write’ shown in the following example. |
| * |
| * 1. mov r3 0 |
| * 2. send r3.xy <rest of send instruction> |
| * 3. mov r2 r3 |
| * |
| * Due to no post-destination dependency check on the ‘send’, the above |
| * code sequence could have two instructions (1 and 2) in flight at the |
| * same time that both consider ‘r3’ as the target of their final writes." |
| */ |
| void |
| fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, |
| fs_inst *inst) |
| { |
| int write_len = inst->regs_written; |
| int first_write_grf = inst->dst.reg; |
| bool needs_dep[BRW_MAX_MRF]; |
| assert(write_len < (int)sizeof(needs_dep) - 1); |
| |
| memset(needs_dep, false, sizeof(needs_dep)); |
| memset(needs_dep, true, write_len); |
| |
| clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len); |
| |
| /* Walk backwards looking for writes to registers we're writing which |
| * aren't read since being written. If we hit the start of the program, |
| * we assume that there are no outstanding dependencies on entry to the |
| * program. |
| */ |
| foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) { |
| /* If we hit control flow, assume that there *are* outstanding |
| * dependencies, and force their cleanup before our instruction. |
| */ |
| if (block->start() == scan_inst) { |
| for (int i = 0; i < write_len; i++) { |
| if (needs_dep[i]) |
| DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i); |
| } |
| return; |
| } |
| |
| /* We insert our reads as late as possible on the assumption that any |
| * instruction but a MOV that might have left us an outstanding |
| * dependency has more latency than a MOV. |
| */ |
| if (scan_inst->dst.file == GRF) { |
| for (int i = 0; i < scan_inst->regs_written; i++) { |
| int reg = scan_inst->dst.reg + i; |
| |
| if (reg >= first_write_grf && |
| reg < first_write_grf + write_len && |
| needs_dep[reg - first_write_grf]) { |
| DEP_RESOLVE_MOV(bld.at(block, inst), reg); |
| needs_dep[reg - first_write_grf] = false; |
| if (scan_inst->exec_size == 16) |
| needs_dep[reg - first_write_grf + 1] = false; |
| } |
| } |
| } |
| |
| /* Clear the flag for registers that actually got read (as expected). */ |
| clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); |
| |
| /* Continue the loop only if we haven't resolved all the dependencies */ |
| int i; |
| for (i = 0; i < write_len; i++) { |
| if (needs_dep[i]) |
| break; |
| } |
| if (i == write_len) |
| return; |
| } |
| } |
| |
| /** |
| * Implements this workaround for the original 965: |
| * |
| * "[DevBW, DevCL] Errata: A destination register from a send can not be |
| * used as a destination register until after it has been sourced by an |
| * instruction with a different destination register." |
| */ |
| void |
| fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) |
| { |
| int write_len = inst->regs_written; |
| int first_write_grf = inst->dst.reg; |
| bool needs_dep[BRW_MAX_MRF]; |
| assert(write_len < (int)sizeof(needs_dep) - 1); |
| |
| memset(needs_dep, false, sizeof(needs_dep)); |
| memset(needs_dep, true, write_len); |
| /* Walk forwards looking for writes to registers we're writing which aren't |
| * read before being written. |
| */ |
| foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) { |
| /* If we hit control flow, force resolve all remaining dependencies. */ |
| if (block->end() == scan_inst) { |
| for (int i = 0; i < write_len; i++) { |
| if (needs_dep[i]) |
| DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i); |
| } |
| return; |
| } |
| |
| /* Clear the flag for registers that actually got read (as expected). */ |
| clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); |
| |
| /* We insert our reads as late as possible since they're reading the |
| * result of a SEND, which has massive latency. |
| */ |
| if (scan_inst->dst.file == GRF && |
| scan_inst->dst.reg >= first_write_grf && |
| scan_inst->dst.reg < first_write_grf + write_len && |
| needs_dep[scan_inst->dst.reg - first_write_grf]) { |
| DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg); |
| needs_dep[scan_inst->dst.reg - first_write_grf] = false; |
| } |
| |
| /* Continue the loop only if we haven't resolved all the dependencies */ |
| int i; |
| for (i = 0; i < write_len; i++) { |
| if (needs_dep[i]) |
| break; |
| } |
| if (i == write_len) |
| return; |
| } |
| } |
| |
| void |
| fs_visitor::insert_gen4_send_dependency_workarounds() |
| { |
| if (devinfo->gen != 4 || devinfo->is_g4x) |
| return; |
| |
| bool progress = false; |
| |
| /* Note that we're done with register allocation, so GRF fs_regs always |
| * have a .reg_offset of 0. |
| */ |
| |
| foreach_block_and_inst(block, fs_inst, inst, cfg) { |
| if (inst->mlen != 0 && inst->dst.file == GRF) { |
| insert_gen4_pre_send_dependency_workarounds(block, inst); |
| insert_gen4_post_send_dependency_workarounds(block, inst); |
| progress = true; |
| } |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| } |
| |
| /** |
| * Turns the generic expression-style uniform pull constant load instruction |
| * into a hardware-specific series of instructions for loading a pull |
| * constant. |
| * |
| * The expression style allows the CSE pass before this to optimize out |
| * repeated loads from the same offset, and gives the pre-register-allocation |
| * scheduling full flexibility, while the conversion to native instructions |
| * allows the post-register-allocation scheduler the best information |
| * possible. |
| * |
| * Note that execution masking for setting up pull constant loads is special: |
| * the channels that need to be written are unrelated to the current execution |
| * mask, since a later instruction will use one of the result channels as a |
| * source operand for all 8 or 16 of its channels. |
| */ |
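| /* On Gen7/Gen8, for instance, a load at vec4-aligned byte offset 32 (dword |
|  * offset 8) lowers to roughly the following (register numbers and textual |
|  * form are illustrative): |
|  * |
|  *    set_simd4x2_offset(8) vgrf6:UD, 8u |
|  *    uniform_pull_const_load_gen7(8) vgrf5:F, surf_index, vgrf6:UD |
|  */ |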
| void |
| fs_visitor::lower_uniform_pull_constant_loads() |
| { |
| foreach_block_and_inst (block, fs_inst, inst, cfg) { |
| if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) |
| continue; |
| |
| if (devinfo->gen >= 7) { |
| /* Until now the offset arg has been a vec4-aligned byte offset. We need |
|  * to turn it into a dword offset. |
|  */ |
| fs_reg const_offset_reg = inst->src[1]; |
| assert(const_offset_reg.file == IMM && |
| const_offset_reg.type == BRW_REGISTER_TYPE_UD); |
| const_offset_reg.fixed_hw_reg.dw1.ud /= 4; |
| |
| fs_reg payload, offset; |
| if (devinfo->gen >= 9) { |
| /* We have to use a message header on Skylake to get SIMD4x2 |
| * mode. Reserve space for the register. |
| */ |
| offset = payload = fs_reg(GRF, alloc.allocate(2)); |
| offset.reg_offset++; |
| inst->mlen = 2; |
| } else { |
| offset = payload = fs_reg(GRF, alloc.allocate(1)); |
| inst->mlen = 1; |
| } |
| |
| /* This is actually going to be a MOV, but since only the first dword |
| * is accessed, we have a special opcode to do just that one. Note |
| * that this needs to be an operation that will be considered a def |
| * by live variable analysis, or register allocation will explode. |
| */ |
| fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET, |
| 8, offset, const_offset_reg); |
| setup->force_writemask_all = true; |
| |
| setup->ir = inst->ir; |
| setup->annotation = inst->annotation; |
| inst->insert_before(block, setup); |
| |
| /* Similarly, this will only populate the first 4 channels of the |
| * result register (since we only use smear values from 0-3), but we |
| * don't tell the optimizer. |
| */ |
| inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; |
| inst->src[1] = payload; |
| inst->base_mrf = -1; |
| |
| invalidate_live_intervals(); |
| } else { |
| /* Before register allocation, we didn't tell the scheduler about the |
| * MRF we use. We know it's safe to use this MRF because nothing |
| * else does except for register spill/unspill, which generates and |
| * uses its MRF within a single IR instruction. |
| */ |
| inst->base_mrf = 14; |
| inst->mlen = 1; |
| } |
| } |
| } |
| |
| bool |
| fs_visitor::lower_load_payload() |
| { |
| bool progress = false; |
| |
| foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { |
| if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) |
| continue; |
| |
| assert(inst->dst.file == MRF || inst->dst.file == GRF); |
| assert(inst->saturate == false); |
| fs_reg dst = inst->dst; |
| |
| /* Get rid of COMPR4. We'll add it back in if we need it. */ |
| if (dst.file == MRF) |
| dst.reg = dst.reg & ~BRW_MRF_COMPR4; |
| |
| const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst); |
| |
| for (uint8_t i = 0; i < inst->header_size; i++) { |
| if (inst->src[i].file != BAD_FILE) { |
| fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); |
| fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); |
| hbld.MOV(mov_dst, mov_src); |
| } |
| dst = offset(dst, hbld, 1); |
| } |
| |
| const fs_builder ibld = bld.exec_all(inst->force_writemask_all) |
| .group(inst->exec_size, inst->force_sechalf) |
| .at(block, inst); |
| |
| if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) && |
| inst->exec_size > 8) { |
| /* In this case, the payload portion of the LOAD_PAYLOAD isn't |
| * a straightforward copy. Instead, the result of the |
| * LOAD_PAYLOAD is treated as interleaved and the first four |
| * non-header sources are unpacked as: |
| * |
| * m + 0: r0 |
| * m + 1: g0 |
| * m + 2: b0 |
| * m + 3: a0 |
| * m + 4: r1 |
| * m + 5: g1 |
| * m + 6: b1 |
| * m + 7: a1 |
| * |
| * This is used for gen <= 5 fb writes. |
| */ |
| assert(inst->exec_size == 16); |
| assert(inst->header_size + 4 <= inst->sources); |
| for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { |
| if (inst->src[i].file != BAD_FILE) { |
| if (devinfo->has_compr4) { |
| fs_reg compr4_dst = retype(dst, inst->src[i].type); |
| compr4_dst.reg |= BRW_MRF_COMPR4; |
| ibld.MOV(compr4_dst, inst->src[i]); |
| } else { |
| /* Platform doesn't have COMPR4. We have to fake it. */ |
| fs_reg mov_dst = retype(dst, inst->src[i].type); |
| ibld.half(0).MOV(mov_dst, half(inst->src[i], 0)); |
| mov_dst.reg += 4; |
| ibld.half(1).MOV(mov_dst, half(inst->src[i], 1)); |
| } |
| } |
| |
| dst.reg++; |
| } |
| |
| /* The loop above only ever incremented us through the first set |
| * of 4 registers. However, thanks to the magic of COMPR4, we |
| * actually wrote to the first 8 registers, so we need to take |
| * that into account now. |
| */ |
| dst.reg += 4; |
| |
| /* The COMPR4 code took care of the first 4 sources. We'll let |
| * the regular path handle any remaining sources. Yes, we are |
| * modifying the instruction but we're about to delete it so |
| * this really doesn't hurt anything. |
| */ |
| inst->header_size += 4; |
| } |
| |
| for (uint8_t i = inst->header_size; i < inst->sources; i++) { |
| if (inst->src[i].file != BAD_FILE) |
| ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]); |
| dst = offset(dst, ibld, 1); |
| } |
| |
| inst->remove(block); |
| progress = true; |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| |
| return progress; |
| } |
| |
| bool |
| fs_visitor::lower_integer_multiplication() |
| { |
| bool progress = false; |
| |
| /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation |
| * directly, but Cherryview cannot. |
| */ |
| if (devinfo->gen >= 8 && !devinfo->is_cherryview) |
| return false; |
| |
| foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
| if (inst->opcode != BRW_OPCODE_MUL || |
| inst->dst.is_accumulator() || |
| (inst->dst.type != BRW_REGISTER_TYPE_D && |
| inst->dst.type != BRW_REGISTER_TYPE_UD)) |
| continue; |
| |
| const fs_builder ibld = bld.at(block, inst); |
| |
| /* The MUL instruction isn't commutative. On Gen <= 6, only the low |
| * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of |
| * src1 are used. |
| * |
| * If multiplying by an immediate value that fits in 16-bits, do a |
| * single MUL instruction with that value in the proper location. |
| */ |
| if (inst->src[1].file == IMM && |
| inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { |
| if (devinfo->gen < 7) { |
| fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), |
| inst->dst.type); |
| ibld.MOV(imm, inst->src[1]); |
| ibld.MUL(inst->dst, imm, inst->src[0]); |
| } else { |
| ibld.MUL(inst->dst, inst->src[0], inst->src[1]); |
| } |
| } else { |
| /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot |
| * do 32-bit integer multiplication in one instruction, but instead |
| * must do a sequence (which actually calculates a 64-bit result): |
| * |
| * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D |
| * mach(8) null g3<8,8,1>D g4<8,8,1>D |
| * mov(8) g2<1>D acc0<8,8,1>D |
| * |
| * But on Gen > 6, the ability to use the second accumulator register |
| * (acc1) for non-float data types was removed, preventing a simple |
| * implementation in SIMD16. A 16-channel result can be calculated by |
| * executing the three instructions twice in SIMD8, once with quarter |
| * control of 1Q for the first eight channels and again with 2Q for |
| * the second eight channels. |
| * |
| * Which accumulator register is implicitly accessed (by AccWrEnable |
| * for instance) is determined by the quarter control. Unfortunately |
| * Ivybridge (and presumably Baytrail) has a hardware bug in which an |
| * implicit accumulator access by an instruction with 2Q will access |
| * acc1 regardless of whether the data type is usable in acc1. |
| * |
| * Specifically, the 2Q mach(8) writes acc1 which does not exist for |
| * integer data types. |
| * |
| * Since we only want the low 32-bits of the result, we can do two |
| * 32-bit x 16-bit multiplies (like the mul and mach are doing), and |
| * adjust the high result and add them (like the mach is doing): |
| * |
| * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW |
| * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW |
| * shl(8) g9<1>D g8<8,8,1>D 16D |
| * add(8) g2<1>D g7<8,8,1>D g9<8,8,1>D |
| * |
| * We avoid the shl instruction by realizing that we only want to add |
| * the low 16-bits of the "high" result to the high 16-bits of the |
| * "low" result and using proper regioning on the add: |
| * |
| * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW |
| * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW |
| * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW |
| * |
| * Since it does not use the (single) accumulator register, we can |
| * schedule multi-component multiplications much better. |
| */ |
| |
| if (inst->conditional_mod && inst->dst.is_null()) { |
| inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), |
| inst->dst.type); |
| } |
| fs_reg low = inst->dst; |
| fs_reg high(GRF, alloc.allocate(dispatch_width / 8), |
| inst->dst.type); |
| |
| if (devinfo->gen >= 7) { |
| fs_reg src1_0_w = inst->src[1]; |
| fs_reg src1_1_w = inst->src[1]; |
| |
| if (inst->src[1].file == IMM) { |
| src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; |
| src1_1_w.fixed_hw_reg.dw1.ud >>= 16; |
| } else { |
| src1_0_w.type = BRW_REGISTER_TYPE_UW; |
| if (src1_0_w.stride != 0) { |
| assert(src1_0_w.stride == 1); |
| src1_0_w.stride = 2; |
| } |
| |
| src1_1_w.type = BRW_REGISTER_TYPE_UW; |
| if (src1_1_w.stride != 0) { |
| assert(src1_1_w.stride == 1); |
| src1_1_w.stride = 2; |
| } |
| src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); |
| } |
| ibld.MUL(low, inst->src[0], src1_0_w); |
| ibld.MUL(high, inst->src[0], src1_1_w); |
| } else { |
| fs_reg src0_0_w = inst->src[0]; |
| fs_reg src0_1_w = inst->src[0]; |
| |
| src0_0_w.type = BRW_REGISTER_TYPE_UW; |
| if (src0_0_w.stride != 0) { |
| assert(src0_0_w.stride == 1); |
| src0_0_w.stride = 2; |
| } |
| |
| src0_1_w.type = BRW_REGISTER_TYPE_UW; |
| if (src0_1_w.stride != 0) { |
| assert(src0_1_w.stride == 1); |
| src0_1_w.stride = 2; |
| } |
| src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); |
| |
| ibld.MUL(low, src0_0_w, inst->src[1]); |
| ibld.MUL(high, src0_1_w, inst->src[1]); |
| } |
| |
| fs_reg dst = inst->dst; |
| dst.type = BRW_REGISTER_TYPE_UW; |
| dst.subreg_offset = 2; |
| dst.stride = 2; |
| |
| high.type = BRW_REGISTER_TYPE_UW; |
| high.stride = 2; |
| |
| low.type = BRW_REGISTER_TYPE_UW; |
| low.subreg_offset = 2; |
| low.stride = 2; |
| |
| ibld.ADD(dst, low, high); |
| |
| if (inst->conditional_mod) { |
| fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); |
| set_condmod(inst->conditional_mod, |
| ibld.MOV(null, inst->dst)); |
| } |
| } |
| |
| inst->remove(block); |
| progress = true; |
| } |
| |
| if (progress) |
| invalidate_live_intervals(); |
| |
| return progress; |
| } |
| |
| void |
| fs_visitor::dump_instructions() |
| { |
| dump_instructions(NULL); |
| } |
| |
| void |
| fs_visitor::dump_instructions(const char *name) |
| { |
| FILE *file = stderr; |
| if (name && geteuid() != 0) { |
| file = fopen(name, "w"); |
| if (!file) |
| file = stderr; |
| } |
| |
| if (cfg) { |
| calculate_register_pressure(); |
| int ip = 0, max_pressure = 0; |
| foreach_block_and_inst(block, backend_instruction, inst, cfg) { |
| max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]); |
| fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip); |
| dump_instruction(inst, file); |
| ip++; |
| } |
| fprintf(file, "Maximum %3d registers live at once.\n", max_pressure); |
| } else { |
| int ip = 0; |
| foreach_in_list(backend_instruction, inst, &instructions) { |
| fprintf(file, "%4d: ", ip++); |
| dump_instruction(inst, file); |
| } |
| } |
| |
| if (file != stderr) { |
| fclose(file); |
| } |
| } |
| |
| void |
| fs_visitor::dump_instruction(backend_instruction *be_inst) |
| { |
| dump_instruction(be_inst, stderr); |
| } |
| |
| void |
| fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) |
| { |
| fs_inst *inst = (fs_inst *)be_inst; |
| |
| if (inst->predicate) { |
| fprintf(file, "(%cf0.%d) ", |
| inst->predicate_inverse ? '-' : '+', |
| inst->flag_subreg); |
| } |
| |
| fprintf(file, "%s", brw_instruction_name(inst->opcode)); |
| if (inst->saturate) |
| fprintf(file, ".sat"); |
| if (inst->conditional_mod) { |
| fprintf(file, "%s", conditional_modifier[inst->conditional_mod]); |
| if (!inst->predicate && |
| (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL && |
| inst->opcode != BRW_OPCODE_IF && |
| inst->opcode != BRW_OPCODE_WHILE))) { |
| fprintf(file, ".f0.%d", inst->flag_subreg); |
| } |
| } |
| fprintf(file, "(%d) ", inst->exec_size); |
| |
| if (inst->mlen) { |
| fprintf(file, "(mlen: %d) ", inst->mlen); |
| } |
| |
| switch (inst->dst.file) { |
| case GRF: |
| fprintf(file, "vgrf%d", inst->dst.reg); |
| if (alloc.sizes[inst->dst.reg] != inst->regs_written || |
| inst->dst.subreg_offset) |
| fprintf(file, "+%d.%d", |
| inst->dst.reg_offset, inst->dst.subreg_offset); |
| break; |
| case MRF: |
| fprintf(file, "m%d", inst->dst.reg); |
| break; |
| case BAD_FILE: |
| fprintf(file, "(null)"); |
| break; |
| case UNIFORM: |
| fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset); |
| break; |
| case ATTR: |
| fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset); |
| break; |
| case HW_REG: |
| if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { |
| switch (inst->dst.fixed_hw_reg.nr) { |
| case BRW_ARF_NULL: |
| fprintf(file, "null"); |
| break; |
| case BRW_ARF_ADDRESS: |
| fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr); |
| break; |
| case BRW_ARF_ACCUMULATOR: |
| fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr); |
| break; |
| case BRW_ARF_FLAG: |
| fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf, |
| inst->dst.fixed_hw_reg.subnr); |
| break; |
| default: |
| fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf, |
| inst->dst.fixed_hw_reg.subnr); |
| break; |
| } |
| } else { |
| fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr); |
| } |
| if (inst->dst.fixed_hw_reg.subnr) |
| fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr); |
| break; |
| default: |
| fprintf(file, "???"); |
| break; |
| } |
| fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type)); |
| |
| for (int i = 0; i < inst->sources; i++) { |
| if (inst->src[i].negate) |
| fprintf(file, "-"); |
| if (inst->src[i].abs) |
| fprintf(file, "|"); |
| switch (inst->src[i].file) { |
| case GRF: |
| fprintf(file, "vgrf%d", inst->src[i].reg); |
| if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) || |
| inst->src[i].subreg_offset) |
| fprintf(file, "+%d.%d", inst->src[i].reg_offset, |
| inst->src[i].subreg_offset); |
| break; |
| case MRF: |
| fprintf(file, "***m%d***", inst->src[i].reg); |
| break; |
| case ATTR: |
| fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset); |
| break; |
| case UNIFORM: |
| fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset); |
| if (inst->src[i].reladdr) { |
| fprintf(file, "+reladdr"); |
| } else if (inst->src[i].subreg_offset) { |
| fprintf(file, "+%d.%d", inst->src[i].reg_offset, |
| inst->src[i].subreg_offset); |
| } |
| break; |
| case BAD_FILE: |
| fprintf(file, "(null)"); |
| break; |
| case IMM: |
| switch (inst->src[i].type) { |
| case BRW_REGISTER_TYPE_F: |
| fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f); |
| break; |
| case BRW_REGISTER_TYPE_W: |
| case BRW_REGISTER_TYPE_D: |
| fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d); |
| break; |
| case BRW_REGISTER_TYPE_UW: |
| case BRW_REGISTER_TYPE_UD: |
| fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud); |
| break; |
| case BRW_REGISTER_TYPE_VF: |
| fprintf(file, "[%-gF, %-gF, %-gF, %-gF]", |
| brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff), |
| brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff), |
| brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff), |
| brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff)); |
| break; |
| default: |
| fprintf(file, "???"); |
| break; |
| } |
| break; |
| case HW_REG: |
| if (inst->src[i].fixed_hw_reg.negate) |
| fprintf(file, "-"); |
| if (inst->src[i].fixed_hw_reg.abs) |
| fprintf(file, "|"); |
| if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { |
| switch (inst->src[i].fixed_hw_reg.nr) { |
| case BRW_ARF_NULL: |
| fprintf(file, "null"); |
| break; |
| case BRW_ARF_ADDRESS: |
| fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr); |
| break; |
| case BRW_ARF_ACCUMULATOR: |
| fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr); |
| break; |
| case BRW_ARF_FLAG: |
| fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, |
| inst->src[i].fixed_hw_reg.subnr); |
| break; |
| default: |
| fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, |
| inst->src[i].fixed_hw_reg.subnr); |
| break; |
| } |
| } else { |
| fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr); |
| } |
| if (inst->src[i].fixed_hw_reg.subnr) |
| fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr); |
| if (inst->src[i].fixed_hw_reg.abs) |
| fprintf(file, "|"); |
| break; |
| default: |
| fprintf(file, "???"); |
| break; |
| } |
| if (inst->src[i].abs) |
| fprintf(file, "|"); |
| |
| if (inst->src[i].file != IMM) { |
| fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type)); |
| } |
| |
| if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE) |
| fprintf(file, ", "); |
| } |
| |
| fprintf(file, " "); |
| |
| if (dispatch_width == 16 && inst->exec_size == 8) { |
| if (inst->force_sechalf) |
| fprintf(file, "2ndhalf "); |
| else |
| fprintf(file, "1sthalf "); |
| } |
| |
| fprintf(file, "\n"); |
| } |
| |
| /** |
| * Possibly returns an instruction that set up @param reg. |
| * |
| * Sometimes we want to take the result of some expression/variable |
| * dereference tree and rewrite the instruction generating the result |
| * of the tree. When processing the tree, we know that the |
| * instructions generated are all writing temporaries that are dead |
| * outside of this tree. So, if we have some instructions that write |
| * a temporary, we're free to point that temp write somewhere else. |
| * |
| * Note that the returned instruction isn't guaranteed to have written |
| * only reg -- it might be the size=4 destination of a texture instruction. |
| */ |
| fs_inst * |
| fs_visitor::get_instruction_generating_reg(fs_inst *start, |
| fs_inst *end, |
| const fs_reg ®) |
| { |
| if (end == start || |
| end->is_partial_write() || |
| reg.reladdr || |
| !reg.equals(end->dst)) { |
| return NULL; |
| } else { |
| return end; |
| } |
| } |
| |
| void |
| fs_visitor::setup_payload_gen6() |
| { |
| bool uses_depth = |
| (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0; |
| unsigned barycentric_interp_modes = |
| (stage == MESA_SHADER_FRAGMENT) ? |
| ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0; |
| |
| assert(devinfo->gen >= 6); |
| |
| /* R0-1: masks, pixel X/Y coordinates. */ |
| payload.num_regs = 2; |
| /* R2: only for 32-pixel dispatch. */ |
| |
| /* R3-26: barycentric interpolation coordinates. These appear in the |
| * same order that they appear in the brw_wm_barycentric_interp_mode |
| * enum. Each set of coordinates occupies 2 registers if dispatch width |
| * == 8 and 4 registers if dispatch width == 16. Coordinates only |
| * appear if they were enabled using the "Barycentric Interpolation |
| * Mode" bits in WM_STATE. |
| */ |
| for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) { |
| if (barycentric_interp_modes & (1 << i)) { |
| payload.barycentric_coord_reg[i] = payload.num_regs; |
| payload.num_regs += 2; |
| if (dispatch_width == 16) { |
| payload.num_regs += 2; |
| } |
| } |
| } |
| |
| /* R27: interpolated depth if uses source depth */ |
| if (uses_depth) { |
| payload.source_depth_reg = payload.num_regs; |
| payload.num_regs++; |
| if (dispatch_width == 16) { |
| /* R28: interpolated depth if not SIMD8. */ |
| payload.num_regs++; |
| } |
| } |
| /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */ |
| if (uses_depth) { |
| payload.source_w_reg = payload.num_regs; |
| payload.num_regs++; |
| if (dispatch_width == 16) { |
| /* R30: interpolated W if not SIMD8. */ |
| payload.num_regs++; |
| } |
| } |
| |
| if (stage == MESA_SHADER_FRAGMENT) { |
| brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| prog_data->uses_pos_offset = key->compute_pos_offset; |
| /* R31: MSAA position offsets. */ |
| if (prog_data->uses_pos_offset) { |
| payload.sample_pos_reg = payload.num_regs; |
| payload.num_regs++; |
| } |
| } |
| |
| /* R32: MSAA input coverage mask */ |
| if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) { |
| assert(devinfo->gen >= 7); |
| payload.sample_mask_in_reg = payload.num_regs; |
| payload.num_regs++; |
| if (dispatch_width == 16) { |
| /* R33: input coverage mask if not SIMD8. */ |
| payload.num_regs++; |
| } |
| } |
| |
| /* R34-: bary for 32-pixel. */ |
| /* R58-59: interp W for 32-pixel. */ |
| |
| if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { |
| source_depth_to_render_target = true; |
| } |
| } |
| |
| void |
| fs_visitor::setup_vs_payload() |
| { |
| /* R0: thread header, R1: urb handles */ |
| payload.num_regs = 2; |
| } |
| |
| void |
| fs_visitor::setup_cs_payload() |
| { |
| assert(devinfo->gen >= 7); |
| |
| payload.num_regs = 1; |
| } |
| |
| void |
| fs_visitor::assign_binding_table_offsets() |
| { |
| assert(stage == MESA_SHADER_FRAGMENT); |
| brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
| brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
| uint32_t next_binding_table_offset = 0; |
| |
| /* If there are no color regions, we still perform an FB write to a null |
| * renderbuffer, which we place at surface index 0. |
| */ |
| prog_data->binding_table.render_target_start = next_binding_table_offset; |
| next_binding_table_offset += MAX2(key->nr_color_regions, 1); |
| |
| assign_common_binding_table_offsets(next_binding_table_offset); |
| } |
| |
| void |
| fs_visitor::calculate_register_pressure() |
| { |
| invalidate_live_intervals(); |
| calculate_live_intervals(); |
| |
| unsigned num_instructions = 0; |
| foreach_block(block, cfg) |
| num_instructions += block->instructions.length(); |
| |
| regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions); |
| |
| for (unsigned reg = 0; reg < alloc.count; reg++) { |
| for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++) |
| regs_live_at_ip[ip] += alloc.sizes[reg]; |
| } |
| } |
| |
| void |
| fs_visitor::optimize() |
| { |
| /* bld is the common builder object that pointed at the end of the program |
|  * while we translated it into i965 IR. For the optimization and lowering |
|  * passes coming next, any code appended at the end of the program without |
|  * an explicit fs_builder::at() call clearly points at a mistake. Ideally |
|  * optimization passes wouldn't be part of the visitor so they wouldn't |
|  * have access to bld at all, but they do, so just in case some pass |
|  * forgets to ask for a location explicitly, set it to NULL here to make |
|  * it trip. |
|  */ |
| bld = bld.at(NULL, NULL); |
| |
| split_virtual_grfs(); |
| |
| move_uniform_array_access_to_pull_constants(); |
| assign_constant_locations(); |
| demote_pull_constants(); |
| |
| #define OPT(pass, args...) ({ \ |
| pass_num++; \ |
| bool this_progress = pass(args); \ |
| \ |
| if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \ |
| char filename[64]; \ |
| snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \ |
| stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \ |
| \ |
| backend_shader::dump_instructions(filename); \ |
| } \ |
| \ |
| progress = progress || this_progress; \ |
| this_progress; \ |
| }) |
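| /* Each OPT() invocation runs one pass and accumulates its progress; with |
|  * INTEL_DEBUG=optimizer, a pass that made progress also dumps the IR to a |
|  * file named e.g. "FS8-0004-01-03-opt_cse" (stage abbreviation, dispatch |
|  * width, program name, iteration, pass number -- the example name is |
|  * illustrative). |
|  */ |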
| |
| if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) { |
| char filename[64]; |
| snprintf(filename, 64, "%s%d-%04d-00-start", |
| stage_abbrev, dispatch_width, |
| shader_prog ? shader_prog->Name : 0); |
| |
| backend_shader::dump_instructions(filename); |
| } |
| |
| bool progress; |
| int iteration = 0; |
| int pass_num = 0; |
| do { |
| progress = false; |
| pass_num = 0; |
| iteration++; |
| |
| OPT(remove_duplicate_mrf_writes); |
| |
| OPT(opt_algebraic); |
| OPT(opt_cse); |
| OPT(opt_copy_propagate); |
| OPT(opt_peephole_predicated_break); |
| OPT(opt_cmod_propagation); |
| OPT(dead_code_eliminate); |
| OPT(opt_peephole_sel); |
| OPT(dead_control_flow_eliminate, this); |
| OPT(opt_register_renaming); |
| OPT(opt_redundant_discard_jumps); |
| OPT(opt_saturate_propagation); |
| OPT(opt_zero_samples); |
| OPT(register_coalesce); |
| OPT(compute_to_mrf); |
| OPT(eliminate_find_live_channel); |
| |
| OPT(compact_virtual_grfs); |
| } while (progress); |
| |
| pass_num = 0; |
| |
| OPT(opt_sampler_eot); |
| |
| if (OPT(lower_load_payload)) { |
| split_virtual_grfs(); |
| OPT(register_coalesce); |
| OPT(compute_to_mrf); |
| OPT(dead_code_eliminate); |
| } |
| |
| OPT(opt_combine_constants); |
| OPT(lower_integer_multiplication); |
| |
| lower_uniform_pull_constant_loads(); |
| } |
| |
| /** |
| * Three-source instructions must have a GRF/MRF destination register. |
| * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. |
| */ |
| void |
| fs_visitor::fixup_3src_null_dest() |
| { |
| foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { |
| if (inst->is_3src() && inst->dst.is_null()) { |
| inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), |
| inst->dst.type); |
| } |
| } |
| } |
| |
| void |
| fs_visitor::allocate_registers() |
| { |
| bool allocated_without_spills; |
| |
| static const enum instruction_scheduler_mode pre_modes[] = { |
| SCHEDULE_PRE, |
| SCHEDULE_PRE_NON_LIFO, |
| SCHEDULE_PRE_LIFO, |
| }; |
| |
| /* Try each scheduling heuristic to see if it can successfully register |
| * allocate without spilling. They should be ordered by decreasing |
| * performance but increasing likelihood of allocating. |
| */ |
| for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { |
| schedule_instructions(pre_modes[i]); |
| |
| if (0) { |
| assign_regs_trivial(); |
| allocated_without_spills = true; |
| } else { |
| allocated_without_spills = assign_regs(false); |
| } |
| if (allocated_without_spills) |
| break; |
| } |
| |
| if (!allocated_without_spills) { |
| /* We assume that any spilling is worse than just dropping back to |
| * SIMD8. There's probably actually some intermediate point where |
| * SIMD16 with a couple of spills is still better. |
| */ |
| if (dispatch_width == 16) { |
| fail("Failure to register allocate. Reduce number of " |
| "live scalar values to avoid this."); |
| } else { |
| compiler->shader_perf_log(log_data, |
| "%s shader triggered register spilling. " |
| "Try reducing the number of live scalar " |
| "values to improve performance.\n", |
| stage_name); |
| } |
| |
| /* Since we're out of heuristics, just go spill registers until we |
| * get an allocation. |
| */ |
| while (!assign_regs(true)) { |
| if (failed) |
| break; |
| } |
| } |
| |
| /* This must come after all optimization and register allocation, since |
| * it inserts dead code that happens to have side effects, and it does |
| * so based on the actual physical registers in use. |
| */ |
| insert_gen4_send_dependency_workarounds(); |
| |
| if (failed) |
| return; |
| |
| if (!allocated_without_spills) |
| schedule_instructions(SCHEDULE_POST); |
| |
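| /* last_scratch is the high-water mark of per-thread spill space in |
| * bytes; brw_get_scratch_size() rounds it up to the allocation size the |
| * hardware requires. |
| */ |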
| if (last_scratch > 0) |
| prog_data->total_scratch = brw_get_scratch_size(last_scratch); |
| } |
| |
| bool |
| fs_visitor::run_vs(gl_clip_plane *clip_planes) |
| { |
| assert(stage == MESA_SHADER_VERTEX); |
| |
| assign_common_binding_table_offsets(0); |
| setup_vs_payload(); |
| |
| if (shader_time_index >= 0) |
| emit_shader_time_begin(); |
| |
| emit_nir_code(); |
| |
| if (failed) |
| return false; |
| |
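| /* Lower user clip planes into gl_ClipDistance writes before the URB |
| * writes below ship the outputs to the next stage. |
| */ |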
| compute_clip_distance(clip_planes); |
| |
| emit_urb_writes(); |
| |
| if (shader_time_index >= 0) |
| emit_shader_time_end(); |
| |
| calculate_cfg(); |
| |
| optimize(); |
| |
| assign_curb_setup(); |
| assign_vs_urb_setup(); |
| |
| fixup_3src_null_dest(); |
| allocate_registers(); |
| |
| return !failed; |
| } |
| |
| bool |
| fs_visitor::run_fs(bool do_rep_send) |
| { |
| brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; |
| brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key; |
| |
| assert(stage == MESA_SHADER_FRAGMENT); |
| |
| sanity_param_count = prog->Parameters->NumParameters; |
| |
| assign_binding_table_offsets(); |
| |
| if (devinfo->gen >= 6) |
| setup_payload_gen6(); |
| else |
| setup_payload_gen4(); |
| |
| if (0) { |
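| /* Debug hook: flip the condition to replace the shader with a dummy |
| * that writes a solid color, useful for bisecting miscompiles. |
| */ |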
| emit_dummy_fs(); |
| } else if (do_rep_send) { |
| assert(dispatch_width == 16); |
| emit_repclear_shader(); |
| } else { |
| if (shader_time_index >= 0) |
| emit_shader_time_begin(); |
| |
| calculate_urb_setup(); |
| if (prog->InputsRead > 0) { |
| if (devinfo->gen < 6) |
| emit_interpolation_setup_gen4(); |
| else |
| emit_interpolation_setup_gen6(); |
| } |
| |
| /* We handle discards by keeping track of the still-live pixels in f0.1. |
| * Initialize it with the dispatched pixels. |
| */ |
| if (wm_prog_data->uses_kill) { |
| fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS); |
| discard_init->flag_subreg = 1; |
| } |
| |
| /* Generate FS IR for main().  (The visitor only descends into |
| * functions called "main".) |
| */ |
| emit_nir_code(); |
| |
| if (failed) |
| return false; |
| |
| if (wm_prog_data->uses_kill) |
| bld.emit(FS_OPCODE_PLACEHOLDER_HALT); |
| |
| if (wm_key->alpha_test_func) |
| emit_alpha_test(); |
| |
| emit_fb_writes(); |
| |
| if (shader_time_index >= 0) |
| emit_shader_time_end(); |
| |
| calculate_cfg(); |
| |
| optimize(); |
| |
| assign_curb_setup(); |
| assign_urb_setup(); |
| |
| fixup_3src_null_dest(); |
| allocate_registers(); |
| |
| if (failed) |
| return false; |
| } |
| |
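| /* Record the number of register blocks each dispatch width consumed, in |
| * the hardware's encoding; state setup reads these when programming the |
| * WM unit's per-thread GRF counts. |
| */ |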
| if (dispatch_width == 8) |
| wm_prog_data->reg_blocks = brw_register_blocks(grf_used); |
| else |
| wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used); |
| |
| /* If any state parameters were appended, then ParameterValues could |
| * have been reallocated, in which case the driver uniform storage set |
| * up by _mesa_associate_uniform_storage() would point to freed memory. |
| * Make sure that didn't happen. |
| */ |
| assert(sanity_param_count == prog->Parameters->NumParameters); |
| |
| return !failed; |
| } |
| |
| bool |
| fs_visitor::run_cs() |
| { |
| assert(stage == MESA_SHADER_COMPUTE); |
| assert(shader); |
| |
| sanity_param_count = prog->Parameters->NumParameters; |
| |
| assign_common_binding_table_offsets(0); |
| |
| setup_cs_payload(); |
| |
| if (shader_time_index >= 0) |
| emit_shader_time_begin(); |
| |
| emit_nir_code(); |
| |
| if (failed) |
| return false; |
| |
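| /* Compute shaders have no fixed-function output; every thread must end |
| * by sending a terminate message to release its hardware thread. |
| */ |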
| emit_cs_terminate(); |
| |
| if (shader_time_index >= 0) |
| emit_shader_time_end(); |
| |
| calculate_cfg(); |
| |
| optimize(); |
| |
| assign_curb_setup(); |
| |
| fixup_3src_null_dest(); |
| allocate_registers(); |
| |
| if (failed) |
| return false; |
| |
| /* If any state parameters were appended, then ParameterValues could |
| * have been reallocated, in which case the driver uniform storage set |
| * up by _mesa_associate_uniform_storage() would point to freed memory. |
| * Make sure that didn't happen. |
| */ |
| assert(sanity_param_count == prog->Parameters->NumParameters); |
| |
| return !failed; |
| } |
| |
| const unsigned * |
| brw_wm_fs_emit(struct brw_context *brw, |
| void *mem_ctx, |
| const struct brw_wm_prog_key *key, |
| struct brw_wm_prog_data *prog_data, |
| struct gl_fragment_program *fp, |
| struct gl_shader_program *prog, |
| unsigned *final_assembly_size) |
| { |
| bool start_busy = false; |
| double start_time = 0; |
| |
| if (unlikely(brw->perf_debug)) { |
| start_busy = (brw->batch.last_bo && |
| drm_intel_bo_busy(brw->batch.last_bo)); |
| start_time = get_time(); |
| } |
| |
| struct brw_shader *shader = NULL; |
| if (prog) |
| shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; |
| |
| if (unlikely(INTEL_DEBUG & DEBUG_WM)) |
| brw_dump_ir("fragment", prog, shader ? &shader->base : NULL, &fp->Base); |
| |
| int st_index8 = -1, st_index16 = -1; |
| if (INTEL_DEBUG & DEBUG_SHADER_TIME) { |
| st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8); |
| st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16); |
| } |
| |
| /* Now the main event: Visit the shader IR and generate our FS IR for it. |
| */ |
| fs_visitor v(brw->intelScreen->compiler, brw, |
| mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base, |
| prog, &fp->Base, 8, st_index8); |
| if (!v.run_fs(false /* do_rep_send */)) { |
| if (prog) { |
| prog->LinkStatus = false; |
| ralloc_strcat(&prog->InfoLog, v.fail_msg); |
| } |
| |
| _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", |
| v.fail_msg); |
| |
| return NULL; |
| } |
| |
| cfg_t *simd16_cfg = NULL; |
| fs_visitor v2(brw->intelScreen->compiler, brw, |
| mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base, |
| prog, &fp->Base, 16, st_index16); |
| if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) { |
| if (!v.simd16_unsupported) { |
| /* Try a SIMD16 compile */ |
| v2.import_uniforms(&v); |
| if (!v2.run_fs(brw->use_rep_send)) { |
| perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg); |
| } else { |
| simd16_cfg = v2.cfg; |
| } |
| } |
| } |
| |
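| /* Decide which programs to ship.  SIMD8 is only omitted when a SIMD16 |
| * compile succeeded, and then only if SIMD8 was explicitly disabled or |
| * the hardware is Gen4, where SIMD16 alone is preferred. |
| */ |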
| cfg_t *simd8_cfg; |
| int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8; |
| if ((no_simd8 || brw->gen < 5) && simd16_cfg) { |
| simd8_cfg = NULL; |
| prog_data->no_8 = true; |
| } else { |
| simd8_cfg = v.cfg; |
| prog_data->no_8 = false; |
| } |
| |
| fs_generator g(brw->intelScreen->compiler, brw, |
| mem_ctx, (void *) key, &prog_data->base, |
| &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS"); |
| |
| if (unlikely(INTEL_DEBUG & DEBUG_WM)) { |
| char *name; |
| if (prog) |
| name = ralloc_asprintf(mem_ctx, "%s fragment shader %d", |
| prog->Label ? prog->Label : "unnamed", |
| prog->Name); |
| else |
| name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id); |
| |
| g.enable_debug(name); |
| } |
| |
| if (simd8_cfg) |
| g.generate_code(simd8_cfg, 8); |
| if (simd16_cfg) |
| prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); |
| |
| if (unlikely(brw->perf_debug) && shader) { |
| if (shader->compiled_once) |
| brw_wm_debug_recompile(brw, prog, key); |
| shader->compiled_once = true; |
| |
| if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { |
| perf_debug("FS compile took %.03f ms and stalled the GPU\n", |
| (get_time() - start_time) * 1000); |
| } |
| } |
| |
| return g.get_assembly(final_assembly_size); |
| } |
| |
| extern "C" bool |
| brw_fs_precompile(struct gl_context *ctx, |
| struct gl_shader_program *shader_prog, |
| struct gl_program *prog) |
| { |
| struct brw_context *brw = brw_context(ctx); |
| struct brw_wm_prog_key key; |
| |
| struct gl_fragment_program *fp = (struct gl_fragment_program *) prog; |
| struct brw_fragment_program *bfp = brw_fragment_program(fp); |
| bool program_uses_dfdy = fp->UsesDFdy; |
| |
| memset(&key, 0, sizeof(key)); |
| |
| if (brw->gen < 6) { |
| if (fp->UsesKill) |
| key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT; |
| |
| if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) |
| key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT; |
| |
| /* Just assume depth testing. */ |
| key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT; |
| key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; |
| } |
| |
| if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead & |
| BRW_FS_VARYING_INPUT_MASK) > 16) |
| key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS; |
| |
| brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base); |
| |
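| /* Reading gl_FragCoord makes the drawable height part of the key, |
| * since flipping the window-space Y coordinate depends on it. |
| */ |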
| if (fp->Base.InputsRead & VARYING_BIT_POS) { |
| key.drawable_height = ctx->DrawBuffer->Height; |
| } |
| |
| key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten & |
| ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) | |
| BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK))); |
| |
| if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) { |
| key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) || |
| key.nr_color_regions > 1; |
| } |
| |
| key.program_string_id = bfp->id; |
| |
| uint32_t old_prog_offset = brw->wm.base.prog_offset; |
| struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data; |
| |
| bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key); |
| |
| brw->wm.base.prog_offset = old_prog_offset; |
| brw->wm.prog_data = old_prog_data; |
| |
| return success; |
| } |
| |
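| /** |
| * Set up the sampler portion of a precompile key with the most likely |
| * runtime state: shadow samplers get the legacy DEPTH_TEXTURE_MODE |
| * swizzle on hardware without shader channel select, and everything |
| * else is assumed unswizzled. |
| */ |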
| void |
| brw_setup_tex_for_precompile(struct brw_context *brw, |
| struct brw_sampler_prog_key_data *tex, |
| struct gl_program *prog) |
| { |
| const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8; |
| unsigned sampler_count = _mesa_fls(prog->SamplersUsed); |
| for (unsigned i = 0; i < sampler_count; i++) { |
| if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) { |
| /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */ |
| tex->swizzles[i] = |
| MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE); |
| } else { |
| /* Color sampler: assume no swizzling. */ |
| tex->swizzles[i] = SWIZZLE_XYZW; |
| } |
| } |
| } |