mirror of OpenBSD xenocara tree github.com/openbsd/xenocara
/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_instr_set.h"

/*
 * Implements Global Code Motion.  A description of GCM can be found in
 * "Global Code Motion; Global Value Numbering" by Cliff Click.
 * Unfortunately, the algorithm presented in the paper is broken in a
 * number of ways.  The algorithm used here differs substantially from the
 * one in the paper but it is, in my opinion, much easier to read and
 * verify correctness.
 */

/* This is used to stop GCM moving instructions out of a loop if the loop
 * contains too many instructions and moving them would create excess spilling.
 *
 * TODO: Figure out a better way to decide if we should remove instructions from
 * a loop.
 */
#define MAX_LOOP_INSTRUCTIONS 100

struct gcm_block_info {
   /* Number of loops this block is inside */
   unsigned loop_depth;

   /* Number of ifs this block is inside */
   unsigned if_depth;

   unsigned loop_instr_count;

   /* The loop the block is nested inside or NULL */
   nir_loop *loop;

   /* The last instruction inserted into this block.  This is used as we
    * traverse the instructions and insert them back into the program to
    * put them in the right order.
    */
   nir_instr *last_instr;
};

struct gcm_instr_info {
   nir_block *early_block;
};

/* Flags used in the instr->pass_flags field for various instruction states */
enum {
   GCM_INSTR_PINNED = (1 << 0),
   GCM_INSTR_SCHEDULE_EARLIER_ONLY = (1 << 1),
   GCM_INSTR_SCHEDULED_EARLY = (1 << 2),
   GCM_INSTR_SCHEDULED_LATE = (1 << 3),
   GCM_INSTR_PLACED = (1 << 4),
};

struct gcm_state {
   nir_function_impl *impl;
   nir_instr *instr;

   bool progress;

   /* The list of non-pinned instructions.  As we do the late scheduling,
    * we pull non-pinned instructions out of their blocks and place them in
    * this list.  This saves us from having linked-list problems when we go
    * to put instructions back in their blocks.
    */
   struct exec_list instrs;

   struct gcm_block_info *blocks;

   unsigned num_instrs;
   struct gcm_instr_info *instr_infos;
};

static unsigned
get_loop_instr_count(struct exec_list *cf_list)
{
   unsigned loop_instr_count = 0;
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         nir_foreach_instr(instr, block) {
            loop_instr_count++;
         }
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         loop_instr_count += get_loop_instr_count(&if_stmt->then_list);
         loop_instr_count += get_loop_instr_count(&if_stmt->else_list);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         assert(!nir_loop_has_continue_construct(loop));
         loop_instr_count += get_loop_instr_count(&loop->body);
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }

   return loop_instr_count;
}

/* Recursively walks the CFG and builds the block_info structure */
static void
gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
                     nir_loop *loop, unsigned loop_depth, unsigned if_depth,
                     unsigned loop_instr_count)
{
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         state->blocks[block->index].if_depth = if_depth;
         state->blocks[block->index].loop_depth = loop_depth;
         state->blocks[block->index].loop_instr_count = loop_instr_count;
         state->blocks[block->index].loop = loop;
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         gcm_build_block_info(&if_stmt->then_list, state, loop, loop_depth,
                              if_depth + 1, ~0u);
         gcm_build_block_info(&if_stmt->else_list, state, loop, loop_depth,
                              if_depth + 1, ~0u);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         assert(!nir_loop_has_continue_construct(loop));
         gcm_build_block_info(&loop->body, state, loop, loop_depth + 1, if_depth,
                              get_loop_instr_count(&loop->body));
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }
}

static bool
is_src_scalarizable(nir_src *src)
{

   nir_instr *src_instr = src->ssa->parent_instr;
   switch (src_instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *src_alu = nir_instr_as_alu(src_instr);

      /* ALU operations with output_size == 0 should be scalarized.  We
       * will also see a bunch of vecN operations from scalarizing ALU
       * operations and, since they can easily be copy-propagated, they
       * are ok too.
       */
      return nir_op_infos[src_alu->op].output_size == 0 ||
             src_alu->op == nir_op_vec2 ||
             src_alu->op == nir_op_vec3 ||
             src_alu->op == nir_op_vec4;
   }

   case nir_instr_type_load_const:
      /* These are trivially scalarizable */
      return true;

   case nir_instr_type_undef:
      return true;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *src_intrin = nir_instr_as_intrinsic(src_instr);

      switch (src_intrin->intrinsic) {
      case nir_intrinsic_load_deref: {
         /* Don't scalarize if we see a load of a local variable because it
          * might turn into one of the things we can't scalarize.
          */
         nir_deref_instr *deref = nir_src_as_deref(src_intrin->src[0]);
         return !nir_deref_mode_may_be(deref, (nir_var_function_temp |
                                               nir_var_shader_temp));
      }

      case nir_intrinsic_interp_deref_at_centroid:
      case nir_intrinsic_interp_deref_at_sample:
      case nir_intrinsic_interp_deref_at_offset:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_per_primitive_input:
         return true;
      default:
         break;
      }

      return false;
   }

   default:
      /* We can't scalarize this type of instruction */
      return false;
   }
}

static bool
is_binding_uniform(nir_src src)
{
   nir_binding binding = nir_chase_binding(src);
   if (!binding.success)
      return false;

   for (unsigned i = 0; i < binding.num_indices; i++) {
      if (!nir_src_is_always_uniform(binding.indices[i]))
         return false;
   }

   return true;
}

static void
pin_intrinsic(nir_intrinsic_instr *intrin)
{
   nir_instr *instr = &intrin->instr;

   if (!nir_intrinsic_can_reorder(intrin)) {
      instr->pass_flags = GCM_INSTR_PINNED;
      return;
   }

   instr->pass_flags = 0;

   /* If the intrinsic requires a uniform source, we can't safely move it
    * across non-uniform control flow if it's not uniform at the point it's
    * defined.  Stores and atomics can never be re-ordered, so we don't have
    * to consider them here.
    */
   bool non_uniform = nir_intrinsic_has_access(intrin) &&
                      (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM);
   if (!non_uniform &&
       (intrin->intrinsic == nir_intrinsic_load_ubo ||
        intrin->intrinsic == nir_intrinsic_load_ssbo ||
        intrin->intrinsic == nir_intrinsic_get_ubo_size ||
        intrin->intrinsic == nir_intrinsic_get_ssbo_size ||
        nir_intrinsic_has_image_dim(intrin) ||
        ((intrin->intrinsic == nir_intrinsic_load_deref ||
          intrin->intrinsic == nir_intrinsic_deref_buffer_array_length) &&
         nir_deref_mode_may_be(nir_src_as_deref(intrin->src[0]),
                               nir_var_mem_ubo | nir_var_mem_ssbo)))) {
      if (!is_binding_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_push_constant) {
      if (!nir_src_is_always_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_deref &&
              nir_deref_mode_is(nir_src_as_deref(intrin->src[0]),
                                nir_var_mem_push_const)) {
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      while (deref->deref_type != nir_deref_type_var) {
         if ((deref->deref_type == nir_deref_type_array ||
              deref->deref_type == nir_deref_type_ptr_as_array) &&
             !nir_src_is_always_uniform(deref->arr.index)) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
         deref = nir_deref_instr_parent(deref);
         if (!deref) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
      }
   }
}
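
/* Illustrative note (not part of the original source): after pinning, each
 * instruction falls into one of three buckets.  For a hypothetical snippet:
 *
 *    ssa_0 = load_const 1.0       -> pass_flags == 0 (free to move)
 *    ssa_1 = load_ssbo ...        -> GCM_INSTR_PINNED if its binding is not
 *                                    always-uniform
 *    ssa_2 = phi ...              -> GCM_INSTR_PLACED (never touched)
 *
 * Only instructions left unpinned and unplaced are pulled onto state->instrs
 * and considered by the scheduling passes below.
 */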

/* Walks the instruction list and marks immovable instructions as pinned or
 * placed.
 *
 * This function also serves to initialize the instr->pass_flags field.
 * After this is completed, all instructions' pass_flags fields will be set
 * to either GCM_INSTR_PINNED, GCM_INSTR_PLACED or 0.
 */
static void
gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
{
   state->num_instrs = 0;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         /* Index the instructions for use in gcm_state::instrs */
         instr->index = state->num_instrs++;

         switch (instr->type) {
         case nir_instr_type_alu: {
            nir_alu_instr *alu = nir_instr_as_alu(instr);

            if (alu->op == nir_op_mov && !is_src_scalarizable(&alu->src[0].src)) {
               instr->pass_flags = GCM_INSTR_PINNED;
            } else {
               instr->pass_flags = 0;
            }
            break;
         }

         case nir_instr_type_tex: {
            nir_tex_instr *tex = nir_instr_as_tex(instr);
            if (nir_tex_instr_has_implicit_derivative(tex))
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;

            for (unsigned i = 0; i < tex->num_srcs; i++) {
               nir_tex_src *src = &tex->src[i];
               switch (src->src_type) {
               case nir_tex_src_texture_deref:
                  if (!tex->texture_non_uniform && !is_binding_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_deref:
                  if (!tex->sampler_non_uniform && !is_binding_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_texture_offset:
               case nir_tex_src_texture_handle:
                  if (!tex->texture_non_uniform && !nir_src_is_always_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_offset:
               case nir_tex_src_sampler_handle:
                  if (!tex->sampler_non_uniform && !nir_src_is_always_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               default:
                  break;
               }
            }
            break;
         }

         case nir_instr_type_deref:
         case nir_instr_type_load_const:
            instr->pass_flags = 0;
            break;

         case nir_instr_type_intrinsic:
            pin_intrinsic(nir_instr_as_intrinsic(instr));
            break;

         case nir_instr_type_call:
            instr->pass_flags = GCM_INSTR_PINNED;
            break;

         case nir_instr_type_jump:
         case nir_instr_type_undef:
         case nir_instr_type_phi:
            instr->pass_flags = GCM_INSTR_PLACED;
            break;

         default:
            unreachable("Invalid instruction type in GCM");
         }

         if (!(instr->pass_flags & GCM_INSTR_PLACED)) {
            /* If this is an unplaced instruction, go ahead and pull it out of
             * the program and put it on the instrs list.  This has a couple
             * of benefits.  First, it makes the scheduling algorithm more
             * efficient because we can avoid walking over basic blocks.
             * Second, it keeps us from causing linked list confusion when
             * we're trying to put everything in its proper place at the end
             * of the pass.
             *
             * Note that we don't use nir_instr_remove here because that also
             * cleans up uses and defs and we want to keep that information.
             */
            exec_node_remove(&instr->node);
            exec_list_push_tail(&state->instrs, &instr->node);
         }
      }
   }
}

static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state);

/** Updates an instruction's schedule for the given source
 *
 * This function is called iteratively as we walk the sources of an
 * instruction.  It ensures that the given source instruction has been
 * scheduled and then updates this instruction's block if the source
 * instruction is lower down the tree.
 */
static bool
gcm_schedule_early_src(nir_src *src, void *void_state)
{
   struct gcm_state *state = void_state;
   nir_instr *instr = state->instr;

   gcm_schedule_early_instr(src->ssa->parent_instr, void_state);

   /* While the index isn't a proper dominance depth, it does have the
    * property that if A dominates B then A->index <= B->index.  Since we
    * know that this instruction must have been dominated by all of its
    * sources at some point (even if it's gone through value-numbering),
    * all of the sources must lie on the same branch of the dominance tree.
    * Therefore, we can just go ahead and compare indices.
    */
   struct gcm_instr_info *src_info =
      &state->instr_infos[src->ssa->parent_instr->index];
   struct gcm_instr_info *info = &state->instr_infos[instr->index];
   if (info->early_block->index < src_info->early_block->index)
      info->early_block = src_info->early_block;

   /* We need to restore the state instruction because it may have been
    * changed through the gcm_schedule_early_instr call above, and we
    * may still be iterating through sources: future calls to
    * gcm_schedule_early_src for the same instruction will still need it.
    */
   state->instr = instr;

   return true;
}

/** Schedules an instruction early
 *
 * This function performs a recursive depth-first search starting at the
 * given instruction and proceeding through the sources to schedule
 * instructions as early as they can possibly go in the dominance tree.
 * The instructions are "scheduled" by updating the early_block field of
 * the corresponding gcm_instr_state entry.
 */
static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_EARLY)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_EARLY;

   /* Pinned/placed instructions always get scheduled in their original block so
    * we don't need to do anything.  Also, bailing here keeps us from ever
    * following the sources of phi nodes which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PINNED ||
       instr->pass_flags & GCM_INSTR_PLACED) {
      state->instr_infos[instr->index].early_block = instr->block;
      return;
   }

   /* Start with the instruction at the top.  As we iterate over the
    * sources, it will get moved down as needed.
    */
   state->instr_infos[instr->index].early_block = nir_start_block(state->impl);
   state->instr = instr;

   nir_foreach_src(instr, gcm_schedule_early_src, state);
}
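
/* Worked example (illustrative, not from the original source): given
 *
 *    block_0:  ssa_0 = ...
 *    block_2:  ssa_1 = ...             (block_0 dominates block_2)
 *    block_5:  ssa_2 = fadd ssa_0, ssa_1
 *
 * early scheduling starts ssa_2 at the start block and then, while walking
 * its sources, moves its early_block down to block_2, the deeper of its
 * sources' early blocks, using the index comparison described in
 * gcm_schedule_early_src() above.
 */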

static bool
set_block_for_loop_instr(struct gcm_state *state, nir_instr *instr,
                         nir_block *block)
{
   /* If the instruction wasn't in a loop to begin with we don't want to push
    * it down into one.
    */
   nir_loop *loop = state->blocks[instr->block->index].loop;
   if (loop == NULL)
      return true;

   assert(!nir_loop_has_continue_construct(loop));
   if (nir_block_dominates(instr->block, block))
      return true;

   /* If the loop only executes a single time, i.e. it's wrapped in a:
    *    do { ... break; } while (true)
    * don't move the instruction as it will not help anything.
    */
   if (loop->info->limiting_terminator == NULL && !loop->info->complex_loop &&
       nir_block_ends_in_break(nir_loop_last_block(loop)))
      return false;

   /* Being too aggressive with how we pull instructions out of loops can
    * result in extra register pressure and spilling.  For example, it's fairly
    * common for loops in compute shaders to calculate SSBO offsets using
    * the workgroup id, subgroup id and subgroup invocation; pulling all
    * these calculations outside the loop causes register pressure.
    *
    * To work around these issues for now we only allow constant and texture
    * instructions to be moved outside their original loops, or instructions
    * where the total loop instruction count is less than
    * MAX_LOOP_INSTRUCTIONS.
    *
    * TODO: figure out some more heuristics to allow more to be moved out of
    * loops.
    */
   if (state->blocks[instr->block->index].loop_instr_count < MAX_LOOP_INSTRUCTIONS)
      return true;

   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_tex ||
       (instr->type == nir_instr_type_intrinsic &&
        nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel))
      return true;

   return false;
}

static bool
set_block_to_if_block(struct gcm_state *state, nir_instr *instr,
                      nir_block *block)
{
   if (instr->type == nir_instr_type_load_const)
      return true;

   if (instr->type == nir_instr_type_intrinsic &&
       nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel)
      return true;

   /* TODO: Figure out some more heuristics to allow more to be moved into
    * if-statements.
    */

   return false;
}

static nir_block *
gcm_choose_block_for_instr(nir_instr *instr, nir_block *early_block,
                           nir_block *late_block, struct gcm_state *state)
{
   assert(nir_block_dominates(early_block, late_block));

   bool block_set = false;

   /* First see if we can push the instruction down into an if-statement's block */
   nir_block *best = late_block;
   for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
      if (state->blocks[block->index].loop_depth >
          state->blocks[instr->block->index].loop_depth)
         continue;

      if (state->blocks[block->index].if_depth >=
             state->blocks[best->index].if_depth &&
          set_block_to_if_block(state, instr, block)) {
         /* If we are pushing the instruction into an if we want it to be
          * in the earliest block not the latest to avoid creating register
          * pressure issues.  So we don't break unless we come across the
          * block the instruction was originally in.
          */
         best = block;
         block_set = true;
         if (block == instr->block)
            break;
      } else if (block == instr->block) {
         /* If we couldn't push the instruction later just put it back where it
          * was previously.
          */
         if (!block_set)
            best = block;
         break;
      }

      if (block == early_block)
         break;
   }

   /* Now see if we can evict the instruction from a loop */
   for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
      if (state->blocks[block->index].loop_depth <
          state->blocks[best->index].loop_depth) {
         if (set_block_for_loop_instr(state, instr, block)) {
            best = block;
         } else if (block == instr->block) {
            if (!block_set)
               best = block;
            break;
         }
      }

      if (block == early_block)
         break;
   }

   return best;
}
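
/* Illustrative note (not from the original source): when a def is used only
 * in the then- and else-blocks of the same if, neither use block dominates
 * the other, so nir_dominance_lca() of the two uses is the block ending with
 * the if condition.  The late scheduling below therefore places such a def
 * no later than that block.
 */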

static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state);

/** Schedules the instruction associated with the given SSA def late
 *
 * This function works by first walking all of the uses of the given SSA
 * definition, ensuring that they are scheduled, and then computing the LCA
 * (least common ancestor) of its uses.  It then schedules this instruction
 * as close to the LCA as possible while trying to stay out of loops.
 */
static bool
gcm_schedule_late_def(nir_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   nir_block *lca = NULL;

   nir_foreach_use(use_src, def) {
      nir_instr *use_instr = nir_src_parent_instr(use_src);

      gcm_schedule_late_instr(use_instr, state);

      /* Phi instructions are a bit special.  SSA definitions don't have to
       * dominate the sources of the phi nodes that use them; instead, they
       * have to dominate the predecessor block corresponding to the phi
       * source.  We handle this by looking through the sources, finding
       * any that are using this SSA def, and using those blocks instead
       * of the one the phi lives in.
       */
      if (use_instr->type == nir_instr_type_phi) {
         nir_phi_instr *phi = nir_instr_as_phi(use_instr);

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->src.ssa == def)
               lca = nir_dominance_lca(lca, phi_src->pred);
         }
      } else {
         lca = nir_dominance_lca(lca, use_instr->block);
      }
   }

   nir_foreach_if_use(use_src, def) {
      nir_if *if_stmt = nir_src_parent_if(use_src);

      /* For if statements, we consider the block to be the one immediately
       * preceding the if CF node.
       */
      nir_block *pred_block =
         nir_cf_node_as_block(nir_cf_node_prev(&if_stmt->cf_node));

      lca = nir_dominance_lca(lca, pred_block);
   }

   nir_block *early_block =
      state->instr_infos[def->parent_instr->index].early_block;

   /* Some instructions may never be used.  Flag them and the instruction
    * placement code will get rid of them for us.
    */
   if (lca == NULL) {
      def->parent_instr->block = NULL;
      return true;
   }

   if (def->parent_instr->pass_flags & GCM_INSTR_SCHEDULE_EARLIER_ONLY &&
       lca != def->parent_instr->block &&
       nir_block_dominates(def->parent_instr->block, lca)) {
      lca = def->parent_instr->block;
   }

   /* We now have the LCA of all of the uses.  If our invariants hold,
    * this is dominated by the block that we chose when scheduling early.
    * We now walk up the dominance tree and pick the lowest block that is
    * as far outside loops as we can get.
    */
   nir_block *best_block =
      gcm_choose_block_for_instr(def->parent_instr, early_block, lca, state);

   if (def->parent_instr->block != best_block)
      state->progress = true;

   def->parent_instr->block = best_block;

   return true;
}

/** Schedules an instruction late
 *
 * This function performs a depth-first search starting at the given
 * instruction and proceeding through its uses to schedule instructions as
 * late as they can reasonably go in the dominance tree.  The instructions
 * are "scheduled" by updating their instr->block field.
 *
 * The name of this function is actually a bit of a misnomer as it doesn't
 * schedule them "as late as possible" as the paper implies.  Instead, it
 * first finds the latest possible place it can schedule the instruction and
 * then possibly schedules it earlier than that.  The actual location is as
 * far down the tree as we can go while trying to stay out of loops.
 */
static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_LATE)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_LATE;

   /* Pinned/placed instructions are already scheduled so we don't need to do
    * anything.  Also, bailing here keeps us from ever following phi nodes
    * which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PLACED ||
       instr->pass_flags & GCM_INSTR_PINNED)
      return;

   nir_foreach_def(instr, gcm_schedule_late_def, state);
}

static bool
gcm_replace_def_with_undef(nir_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   if (nir_def_is_unused(def))
      return true;

   nir_undef_instr *undef =
      nir_undef_instr_create(state->impl->function->shader,
                             def->num_components, def->bit_size);
   nir_instr_insert(nir_before_impl(state->impl), &undef->instr);
   nir_def_rewrite_uses(def, &undef->def);

   return true;
}
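
/* Worked example (illustrative, not from the original source): if block_3
 * originally contained the movable instructions A, B, C in that order, they
 * were pushed onto state->instrs in program order and placement walks the
 * list from the tail.  C is placed first (at the end of block_3), then B is
 * inserted before C, then A before B, so the original relative order A, B, C
 * within the block is preserved.
 */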

/** Places an instruction back into the program
 *
 * The earlier passes of GCM simply choose blocks for each instruction and
 * otherwise leave them alone.  This pass actually places the instructions
 * into their chosen blocks.
 *
 * To do so, we simply insert instructions in the reverse order they were
 * extracted.  This places instructions that were scheduled earlier onto the
 * end of their new block and instructions that were scheduled later at the
 * start of their new block.
 */
static void
gcm_place_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_PLACED)
      return;

   instr->pass_flags |= GCM_INSTR_PLACED;

   if (instr->block == NULL) {
      nir_foreach_def(instr, gcm_replace_def_with_undef, state);
      nir_instr_remove(instr);
      return;
   }

   struct gcm_block_info *block_info = &state->blocks[instr->block->index];
   exec_node_remove(&instr->node);

   if (block_info->last_instr) {
      exec_node_insert_node_before(&block_info->last_instr->node,
                                   &instr->node);
   } else {
      /* Schedule it at the end of the block */
      nir_instr *jump_instr = nir_block_last_instr(instr->block);
      if (jump_instr && jump_instr->type == nir_instr_type_jump) {
         exec_node_insert_node_before(&jump_instr->node, &instr->node);
      } else {
         exec_list_push_tail(&instr->block->instr_list, &instr->node);
      }
   }

   block_info->last_instr = instr;
}

/**
 * Are instructions a and b both contained in the same if/else block?
 */
static bool
weak_gvn(const nir_instr *a, const nir_instr *b)
{
   const struct nir_cf_node *ap = a->block->cf_node.parent;
   const struct nir_cf_node *bp = b->block->cf_node.parent;
   return ap && ap == bp && ap->type == nir_cf_node_if;
}

static bool
opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number)
{
   nir_metadata_require(impl, nir_metadata_control_flow);
   nir_metadata_require(impl, nir_metadata_loop_analysis,
                        shader->options->force_indirect_unrolling,
                        shader->options->force_indirect_unrolling_sampler);

   /* A previous pass may have left pass_flags dirty, so clear it all out. */
   nir_foreach_block(block, impl)
      nir_foreach_instr(instr, block)
         instr->pass_flags = 0;

   struct gcm_state state;

   state.impl = impl;
   state.instr = NULL;
   state.progress = false;
   exec_list_make_empty(&state.instrs);
   state.blocks = rzalloc_array(NULL, struct gcm_block_info, impl->num_blocks);

   gcm_build_block_info(&impl->body, &state, NULL, 0, 0, ~0u);

   gcm_pin_instructions(impl, &state);

   state.instr_infos =
      rzalloc_array(NULL, struct gcm_instr_info, state.num_instrs);

   /* Perform (at least some) Global Value Numbering (GVN).
    *
    * We perform full GVN when `value_number' is true.  This can be too
    * aggressive, moving values far away and extending their live ranges,
    * so we don't always want to do it.
    *
    * Otherwise, we perform 'weaker' GVN: if identical ALU instructions appear
    * on both sides of the same if/else block, we allow them to be moved.
    * This cleans up a lot of mess without being -too- aggressive.
    */
   struct set *gvn_set = nir_instr_set_create(NULL);
   foreach_list_typed_safe(nir_instr, instr, node, &state.instrs) {
      if (instr->pass_flags & GCM_INSTR_PINNED)
         continue;

      if (nir_instr_set_add_or_rewrite(gvn_set, instr,
                                       value_number ? NULL : weak_gvn)) {
         state.progress = true;
         nir_instr_remove(instr);
      }
   }
   nir_instr_set_destroy(gvn_set);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_early_instr(instr, &state);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_late_instr(instr, &state);

   while (!exec_list_is_empty(&state.instrs)) {
      nir_instr *instr = exec_node_data(nir_instr,
                                        state.instrs.tail_sentinel.prev, node);
      gcm_place_instr(instr, &state);
   }

   ralloc_free(state.blocks);
   ralloc_free(state.instr_infos);

   if (state.progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_control_flow |
                                  nir_metadata_loop_analysis);
   }

   return state.progress;
}

bool
nir_opt_gcm(nir_shader *shader, bool value_number)
{
   bool progress = false;

   nir_foreach_function_impl(impl, shader) {
      progress |= opt_gcm_impl(shader, impl, value_number);
   }

   return progress;
}
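
/* Usage sketch (illustrative; not part of this file): a driver's optimization
 * loop would typically run this pass through the NIR_PASS macro, e.g.
 *
 *    bool progress = false;
 *    NIR_PASS(progress, shader, nir_opt_gcm, false);
 *
 * passing true for value_number to additionally perform full global value
 * numbering before code motion.
 */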