Skip to content
This repository has been archived by the owner on Jan 7, 2023. It is now read-only.

Commit

Permalink
FROMLIST: SQUASH: i965: SIMD32 selection heuristics
Browse files Browse the repository at this point in the history
(cover letter https://patchwork.freedesktop.org/series/51006/)

FROMLIST: i965: SIMD32 heuristics debug flag

Added a new DEBUG_HEUR32 flag to INTEL_DEBUG flags for enabling SIMD32
selection heuristics.

(am from https://patchwork.freedesktop.org/patch/256764/)

FROMLIST: i965: SIMD32 heuristics control data

Added a new structure for holding SIMD32 heuristics control data. The
control data itself will be fetched from drirc.

(am from https://patchwork.freedesktop.org/patch/256806/)

FROMLIST: i965: SIMD32 heuristics control data from drirc

To be able to test the heuristics with different parameters, they can be
controlled via environment variables through drirc.

(am from https://patchwork.freedesktop.org/patch/256788/)

FROMLIST: mesa: Helper functions for counting set bits in a mask

(am from https://patchwork.freedesktop.org/patch/256765/)

FROMLIST: i965/fs: Save the instruction count of each dispatch width

The SIMD32 selection heuristics will use this information for deciding whether
SIMD32 shaders should be used.

(am from https://patchwork.freedesktop.org/patch/256793/)

FROMLIST: i965/fs: SIMD32 selection heuristic based on grouped texture fetches

The function goes through the compiled shader and checks how many grouped
texture fetches there are. This is a simple heuristic which gets rid of most
of the regressions when enabling SIMD32 shaders but still retains some of
the benefits.

(am from https://patchwork.freedesktop.org/patch/256798/)

FROMLIST: i965/fs: Enable all SIMD32 heuristics

There are three simple heuristics for SIMD32 shader enabling:

- How many MRTs does the shader write into?
- How many grouped texture fetches does the shader have?
- How many instructions does the SIMD32 shader have compared to the SIMD16
   shader?

For testing purposes, the heuristics can be controlled via these environment
variables:

simd32_heuristic_mrt_check
- Enables MRT write check
- Default: true

simd32_heuristic_max_mrts
- How many MRT writes the heuristic allows
- Default: 1

simd32_heuristic_grouped_check
- Enables grouped texture fetch check
- Default: true

simd32_heuristic_grouped_sends
- How many grouped texture fetches the heuristic allows
- Default: 6

simd32_heuristic_inst_check
- Enables SIMD32 vs. SIMD16 instruction count check
- Default: true

simd32_heuristic_inst_ratio
- SIMD32 vs. SIMD16 instruction count ratio the heuristic allows
- Default: 2.3

In addition, SIMD32 shaders will not be compiled when SIMD16 compilation
fails or spills.

(am from https://patchwork.freedesktop.org/patch/256766/)
  • Loading branch information
Toni Lönnberg authored and strassek committed Dec 3, 2018
1 parent c9f0060 commit 9e21d48
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 7 deletions.
1 change: 1 addition & 0 deletions src/intel/common/gen_debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ static const struct debug_control debug_control[] = {
{ "nohiz", DEBUG_NO_HIZ },
{ "color", DEBUG_COLOR },
{ "reemit", DEBUG_REEMIT },
{ "heur32", DEBUG_HEUR32 },
{ NULL, 0 }
};

Expand Down
3 changes: 2 additions & 1 deletion src/intel/common/gen_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,15 @@ extern uint64_t INTEL_DEBUG;
#define DEBUG_NO_HIZ (1ull << 39)
#define DEBUG_COLOR (1ull << 40)
#define DEBUG_REEMIT (1ull << 41)
#define DEBUG_HEUR32 (1ull << 42)

/* These flags are not compatible with the disk shader cache */
#define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME

/* These flags may affect program generation */
#define DEBUG_DISK_CACHE_MASK \
(DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32)
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_HEUR32)

#ifdef HAVE_ANDROID_PLATFORM
#define LOG_TAG "INTEL-MESA"
Expand Down
11 changes: 11 additions & 0 deletions src/intel/compiler/brw_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ struct ra_regs;
struct nir_shader;
struct brw_program;

/**
 * Tunables for the SIMD32 selection heuristics used when compiling
 * fragment shaders.
 *
 * Each heuristic is described by a pair of fields: a flag that enables the
 * check and the threshold the check compares against.
 */
struct brw_simd32_heuristics_control {
/* Enables the grouped texture fetch check. */
bool grouped_sends_check;
/* Largest number of grouped texture fetches the check accepts. */
int max_grouped_sends;
/* Enables the SIMD32 vs. SIMD16 instruction count check. */
bool inst_count_check;
/* Largest SIMD32/SIMD16 instruction count ratio the check accepts. */
float inst_count_ratio;
/* Enables the MRT write check. */
bool mrt_check;
/* Largest number of MRT writes the check accepts. */
int max_mrts;
};

struct brw_compiler {
const struct gen_device_info *devinfo;

Expand Down Expand Up @@ -118,6 +127,8 @@ struct brw_compiler {
* whether nir_opt_large_constants will be run.
*/
bool supports_shader_constants;

struct brw_simd32_heuristics_control simd32_heuristics_control;
};

/**
Expand Down
63 changes: 57 additions & 6 deletions src/intel/compiler/brw_fs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7098,6 +7098,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
char **error_str)
{
const struct gen_device_info *devinfo = compiler->devinfo;
bool simd16_failed = false;
bool simd16_spilled = false;

nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
Expand Down Expand Up @@ -7165,20 +7167,30 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
shader_time_index16);
v16.import_uniforms(&v8);
if (!v16.run_fs(allow_spilling, use_rep_send)) {
simd16_failed = true;
compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
v16.fail_msg);
} else {
simd16_spilled = v16.spilled_any_registers;
simd16_cfg = v16.cfg;
prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
}
}

/* Currently, the compiler only supports SIMD32 on SNB+ */
const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control;
uint64_t mrts = shader->info.outputs_written << FRAG_RESULT_DATA0;

if (v8.max_dispatch_width >= 32 && !use_rep_send &&
compiler->devinfo->gen >= 6 &&
unlikely(INTEL_DEBUG & DEBUG_DO32)) {
(unlikely(INTEL_DEBUG & DEBUG_DO32) ||
(unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
!simd16_failed && !simd16_spilled &&
(!ctrl->mrt_check ||
(ctrl->mrt_check &&
u_count_bits64(&mrts) <= ctrl->max_mrts))))) {
/* Try a SIMD32 compile */
fs_visitor v32(compiler, log_data, mem_ctx, key,
&prog_data->base, prog, shader, 32,
Expand All @@ -7189,9 +7201,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
"SIMD32 shader failed to compile: %s",
v32.fail_msg);
} else {
simd32_cfg = v32.cfg;
prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) ||
v32.run_heuristic(ctrl)) {
simd32_cfg = v32.cfg;
prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
}
}
}

Expand Down Expand Up @@ -7270,13 +7285,49 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
}

if (simd32_cfg) {
prog_data->dispatch_32 = true;
prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32);
uint32_t offset = g.generate_code(simd32_cfg, 32);

if (unlikely(INTEL_DEBUG & DEBUG_DO32) ||
(unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
(!simd16_cfg ||
(simd16_cfg &&
(!ctrl->inst_count_check ||
(ctrl->inst_count_check &&
(float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) {
prog_data->dispatch_32 = true;
prog_data->prog_offset_32 = offset;
}
}

return g.get_assembly();
}

/**
 * SIMD32 selection heuristic based on grouped texture fetches.
 *
 * Walks every instruction of this visitor's CFG and measures the longest run
 * of back-to-back instructions whose opcode lies in the range
 * SHADER_OPCODE_TEX .. SHADER_OPCODE_SAMPLEINFO_LOGICAL.  Runs are not reset
 * at basic block boundaries, only by an instruction outside that range.
 *
 * \param ctrl  heuristic tunables; only grouped_sends_check and
 *              max_grouped_sends are read here.
 *
 * \return true when the check is disabled or the longest run does not exceed
 *         ctrl->max_grouped_sends, false otherwise.
 */
bool
fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl)
{
   /* With the check disabled the result is always "pass", so there is no
    * point in walking the shader.
    */
   if (!ctrl->grouped_sends_check)
      return true;

   int run_length = 0;
   int longest_run = 0;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* NOTE(review): relies on the texture opcodes being contiguous in the
       * opcode enum between these two values -- confirm when adding opcodes.
       */
      const bool is_texture_op =
         inst->opcode >= SHADER_OPCODE_TEX &&
         inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL;

      if (!is_texture_op) {
         run_length = 0;
         continue;
      }

      /* Track the maximum while the run is still growing, so that a run
       * which reaches the end of the instruction stream is counted too.
       */
      ++run_length;
      if (run_length > longest_run)
         longest_run = run_length;
   }

   return longest_run <= ctrl->max_grouped_sends;
}

fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
Expand Down
4 changes: 4 additions & 0 deletions src/intel/compiler/brw_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ class fs_visitor : public backend_shader
void dump_instruction(backend_instruction *inst);
void dump_instruction(backend_instruction *inst, FILE *file);

bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl);

const void *const key;
const struct brw_sampler_prog_key_data *key_tex;

Expand Down Expand Up @@ -392,6 +394,7 @@ class fs_generator

void enable_debug(const char *shader_name);
int generate_code(const cfg_t *cfg, int dispatch_width);
int get_inst_count(int dispatch_width);
const unsigned *get_assembly();

private:
Expand Down Expand Up @@ -484,6 +487,7 @@ class fs_generator
struct brw_stage_prog_data * const prog_data;

unsigned dispatch_width; /**< 8, 16 or 32 */
int inst_count[3]; /* for 8, 16 and 32 */

exec_list discard_halt_patches;
unsigned promoted_constants;
Expand Down
12 changes: 12 additions & 0 deletions src/intel/compiler/brw_fs_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2464,6 +2464,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
fill_count, promoted_constants, before_size,
after_size);

inst_count[ffs(dispatch_width) - 4] = before_size / 16;

return start_offset;
}

Expand All @@ -2472,3 +2474,13 @@ fs_generator::get_assembly()
{
return brw_get_program(p, &prog_data->program_size);
}

/**
 * Look up the instruction count stored in inst_count[] for a dispatch width.
 *
 * \param dispatch_width  8, 16 or 32.
 *
 * \return the stored count for that width, or 0 for any other width.
 */
int
fs_generator::get_inst_count(int dispatch_width)
{
   switch (dispatch_width) {
   case 8:
   case 16:
   case 32:
      /* ffs() gives 4, 5 and 6 for 8, 16 and 32, i.e. slots 0, 1 and 2. */
      return inst_count[ffs(dispatch_width) - 4];
   default:
      return 0;
   }
}
13 changes: 13 additions & 0 deletions src/mesa/drivers/dri/i965/brw_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,19 @@ brw_process_driconf_options(struct brw_context *brw)
ctx->Const.dri_config_options_sha1 = ralloc_array(brw, unsigned char, 20);
driComputeOptionsSha1(&brw->screen->optionCache,
ctx->Const.dri_config_options_sha1);

brw->screen->compiler->simd32_heuristics_control.grouped_sends_check =
driQueryOptionb(&brw->optionCache, "simd32_heuristic_grouped_check");
brw->screen->compiler->simd32_heuristics_control.max_grouped_sends =
driQueryOptioni(&brw->optionCache, "simd32_heuristic_grouped_sends");
brw->screen->compiler->simd32_heuristics_control.inst_count_check =
driQueryOptionb(&brw->optionCache, "simd32_heuristic_inst_check");
brw->screen->compiler->simd32_heuristics_control.inst_count_ratio =
driQueryOptionf(&brw->optionCache, "simd32_heuristic_inst_ratio");
brw->screen->compiler->simd32_heuristics_control.mrt_check =
driQueryOptionb(&brw->optionCache, "simd32_heuristic_mrt_check");
brw->screen->compiler->simd32_heuristics_control.max_mrts =
driQueryOptioni(&brw->optionCache, "simd32_heuristic_max_mrts");
}

GLboolean
Expand Down
27 changes: 27 additions & 0 deletions src/mesa/drivers/dri/i965/intel_screen.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,33 @@ DRI_CONF_BEGIN
DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
DRI_CONF_DESC_END
DRI_CONF_OPT_END

DRI_CONF_OPT_BEGIN_B(simd32_heuristic_grouped_check, "true")
DRI_CONF_DESC(en, "Enable/disable grouped texture fetch "
"check in the SIMD32 selection heuristic.")
DRI_CONF_OPT_END
DRI_CONF_OPT_BEGIN_V(simd32_heuristic_grouped_sends, int, 6, "1:999")
DRI_CONF_DESC(en, "How many grouped texture fetches should "
"the SIMD32 selection heuristic allow.")
DRI_CONF_OPT_END
DRI_CONF_OPT_BEGIN_B(simd32_heuristic_inst_check, "true")
DRI_CONF_DESC(en, "Enable/disable SIMD32/SIMD16 instruction "
"count ratio check in the SIMD32 selection "
"heuristic.")
DRI_CONF_OPT_END
DRI_CONF_OPT_BEGIN_V(simd32_heuristic_inst_ratio, float, 2.3, "1:999")
DRI_CONF_DESC(en, "SIMD32/SIMD16 instruction count ratio "
"the SIMD32 selection heuristic should allow.")
DRI_CONF_OPT_END
DRI_CONF_OPT_BEGIN_B(simd32_heuristic_mrt_check, "true")
DRI_CONF_DESC(en, "Enable/disable MRT write check in the "
"SIMD32 selection heuristic.")
DRI_CONF_OPT_END
DRI_CONF_OPT_BEGIN_V(simd32_heuristic_max_mrts, int, 1, "1:8")
DRI_CONF_DESC(en, "How many MRT writes should the SIMD32 "
"selection heuristic allow.")
DRI_CONF_OPT_END

DRI_CONF_MESA_NO_ERROR("false")
DRI_CONF_SECTION_END

Expand Down
25 changes: 25 additions & 0 deletions src/util/bitscan.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,31 @@ u_bit_scan64(uint64_t *mask)
return i;
}

/**
 * Count the bits set in \p *mask.
 *
 * The mask is only read, so it is taken through a pointer to const; callers
 * holding a non-const mask are unaffected.  The computation is a parallel
 * population count whose constants assume a 32-bit unsigned.
 *
 * \return the number of set bits.
 */
static inline int
u_count_bits(const unsigned *mask)
{
   unsigned v = *mask;

   /* Fold neighbouring fields together: 1-bit -> 2-bit -> 4-bit -> 8-bit
    * partial sums.
    */
   v = v - ((v >> 1) & 0x55555555u);
   v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
   v = (v + (v >> 4)) & 0x0F0F0F0Fu;

   /* The multiply accumulates the four byte sums into the top byte. */
   return (int)((v * 0x01010101u) >> 24);
}

/**
 * Count the bits set in the 64-bit \p *mask.
 *
 * The mask is only read, so it is taken through a pointer to const; callers
 * holding a non-const mask are unaffected.
 *
 * \return the number of set bits.
 */
static inline int
u_count_bits64(const uint64_t *mask)
{
   uint64_t v = *mask;

   /* Fold neighbouring fields together: 1-bit -> 2-bit -> 4-bit -> 8-bit
    * partial sums.
    */
   v = v - ((v >> 1) & 0x5555555555555555ull);
   v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
   v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;

   /* The multiply accumulates the eight byte sums into the top byte. */
   return (int)((v * 0x0101010101010101ull) >> 56);
}

/* Determine if an unsigned value is a power of two.
*
* \note
Expand Down

0 comments on commit 9e21d48

Please sign in to comment.