/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Connor Abbott (cwabbott0@gmail.com)
 *
 */

#include "util/set.h"
#include "util/u_math.h"
#include "nir.h"
#include "nir_builder.h"

struct lower_sysval_state {
   const nir_lower_compute_system_values_options *options;

   /* List of intrinsics that have already been lowered and shouldn't be
    * lowered again.
    */
   struct set *lower_once_list;
};

static nir_def *
sanitize_32bit_sysval(nir_builder *b, nir_intrinsic_instr *intrin)
{
   const unsigned bit_size = intrin->def.bit_size;
   if (bit_size == 32)
      return NULL;

   intrin->def.bit_size = 32;
   return nir_u2uN(b, &intrin->def, bit_size);
}

static nir_def *
build_global_group_size(nir_builder *b, unsigned bit_size)
{
   nir_def *group_size = nir_load_workgroup_size(b);
   nir_def *num_workgroups = nir_load_num_workgroups(b);
   return nir_imul(b, nir_u2uN(b, group_size, bit_size),
                   nir_u2uN(b, num_workgroups, bit_size));
}

static bool
lower_system_value_filter(const nir_instr *instr, const void *_state)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
lower_system_value_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return NULL;

   const unsigned bit_size = intrin->def.bit_size;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_vertex_id:
      if (b->shader->options->vertex_id_zero_based) {
         return nir_iadd(b, nir_load_vertex_id_zero_base(b),
                         nir_load_first_vertex(b));
      } else {
         return NULL;
      }

   case nir_intrinsic_load_base_vertex:
      /**
       * From the OpenGL 4.6 (11.1.3.9 Shader Inputs) specification:
       *
       * "gl_BaseVertex holds the integer value passed to the baseVertex
       * parameter to the command that resulted in the current shader
       * invocation. In the case where the command has no baseVertex
       * parameter, the value of gl_BaseVertex is zero."
       */
      if (b->shader->options->lower_base_vertex) {
         return nir_iand(b, nir_load_is_indexed_draw(b),
                         nir_load_first_vertex(b));
      } else {
         return NULL;
      }

   case nir_intrinsic_load_helper_invocation:
      if (b->shader->options->lower_helper_invocation) {
         return nir_build_lowered_load_helper_invocation(b);
      } else {
         return NULL;
      }

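   /* The sysvals below are defined as 32-bit values.  If a shader loads one
    * of them at a wider bit size, sanitize_32bit_sysval() rewrites the load
    * to 32 bits and zero-extends the result back to the requested size.
    */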
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_local_invocation_index:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_id:
   case nir_intrinsic_load_workgroup_size:
      return sanitize_32bit_sysval(b, intrin);

   case nir_intrinsic_interp_deref_at_centroid:
   case nir_intrinsic_interp_deref_at_sample:
   case nir_intrinsic_interp_deref_at_offset: {
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      if (!nir_deref_mode_is(deref, nir_var_system_value))
         return NULL;

      nir_variable *var = deref->var;
      enum glsl_interp_mode interp_mode;

      if (var->data.location == SYSTEM_VALUE_BARYCENTRIC_PERSP_COORD) {
         interp_mode = INTERP_MODE_SMOOTH;
      } else {
         assert(var->data.location == SYSTEM_VALUE_BARYCENTRIC_LINEAR_COORD);
         interp_mode = INTERP_MODE_NOPERSPECTIVE;
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_interp_deref_at_centroid:
         return nir_load_barycentric_coord_centroid(b, 32, .interp_mode = interp_mode);
      case nir_intrinsic_interp_deref_at_sample:
         return nir_load_barycentric_coord_at_sample(b, 32, intrin->src[1].ssa,
                                                     .interp_mode = interp_mode);
      case nir_intrinsic_interp_deref_at_offset:
         return nir_load_barycentric_coord_at_offset(b, 32, intrin->src[1].ssa,
                                                     .interp_mode = interp_mode);
      default:
         unreachable("Bogus interpolateAt() intrinsic.");
      }
   }

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_primitive_input:
      if (b->shader->options->lower_layer_fs_input_to_sysval &&
          b->shader->info.stage == MESA_SHADER_FRAGMENT &&
          nir_intrinsic_io_semantics(intrin).location == VARYING_SLOT_LAYER)
         return nir_load_layer_id(b);
      else
         return NULL;

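   /* Loads of nir_var_system_value variables are rewritten below into the
    * corresponding system-value intrinsics.  A few locations get special
    * treatment, e.g. INSTANCE_INDEX is built as instance_id + base_instance
    * and DEVICE_INDEX may be folded to zero.
    */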
   case nir_intrinsic_load_deref: {
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      if (!nir_deref_mode_is(deref, nir_var_system_value))
         return NULL;

      nir_def *column = NULL;
      if (deref->deref_type != nir_deref_type_var) {
         /* The only system values that aren't plain variables are
          * gl_SampleMask, which is always an array of one element, and a
          * couple of ray-tracing intrinsics, which are matrices.
          */
         assert(deref->deref_type == nir_deref_type_array);
         column = deref->arr.index.ssa;
         nir_deref_instr *arr_deref = deref;
         deref = nir_deref_instr_parent(deref);
         assert(deref->deref_type == nir_deref_type_var);

         switch (deref->var->data.location) {
         case SYSTEM_VALUE_TESS_LEVEL_INNER:
         case SYSTEM_VALUE_TESS_LEVEL_OUTER: {
            nir_def *sysval = (deref->var->data.location ==
                               SYSTEM_VALUE_TESS_LEVEL_INNER)
                                 ? nir_load_tess_level_inner(b)
                                 : nir_load_tess_level_outer(b);
            return nir_vector_extract(b, sysval, arr_deref->arr.index.ssa);
         }

         case SYSTEM_VALUE_SAMPLE_MASK_IN:
         case SYSTEM_VALUE_RAY_OBJECT_TO_WORLD:
         case SYSTEM_VALUE_RAY_WORLD_TO_OBJECT:
         case SYSTEM_VALUE_MESH_VIEW_INDICES:
         case SYSTEM_VALUE_RAY_TRIANGLE_VERTEX_POSITIONS:
            /* These are all single-element arrays in our implementation, and
             * the sysval load below just drops the 0 array index.
             */
            break;

         default:
            unreachable("unsupported system value array deref");
         }
      }
      nir_variable *var = deref->var;

      switch (var->data.location) {
      case SYSTEM_VALUE_INSTANCE_INDEX:
         return nir_iadd(b, nir_load_instance_id(b),
                         nir_load_base_instance(b));

      case SYSTEM_VALUE_GLOBAL_INVOCATION_ID: {
         return nir_iadd(b, nir_load_global_invocation_id(b, bit_size),
                         nir_load_base_global_invocation_id(b, bit_size));
      }

      case SYSTEM_VALUE_WORKGROUP_ID: {
         return nir_iadd(b, nir_u2uN(b, nir_load_workgroup_id(b), bit_size),
                         nir_load_base_workgroup_id(b, bit_size));
      }

      case SYSTEM_VALUE_SUBGROUP_EQ_MASK:
      case SYSTEM_VALUE_SUBGROUP_GE_MASK:
      case SYSTEM_VALUE_SUBGROUP_GT_MASK:
      case SYSTEM_VALUE_SUBGROUP_LE_MASK:
      case SYSTEM_VALUE_SUBGROUP_LT_MASK: {
         nir_intrinsic_op op =
            nir_intrinsic_from_system_value(var->data.location);
         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
         nir_def_init_for_type(&load->instr, &load->def, var->type);
         load->num_components = load->def.num_components;
         nir_builder_instr_insert(b, &load->instr);
         return &load->def;
      }

      case SYSTEM_VALUE_DEVICE_INDEX:
         if (b->shader->options->lower_device_index_to_zero)
            return nir_imm_int(b, 0);
         break;

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_pixel,
                                     INTERP_MODE_NOPERSPECTIVE);

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid,
                                     INTERP_MODE_NOPERSPECTIVE);

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
                                     INTERP_MODE_NOPERSPECTIVE);

      case SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_pixel,
                                     INTERP_MODE_SMOOTH);

      case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid,
                                     INTERP_MODE_SMOOTH);

      case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_sample,
                                     INTERP_MODE_SMOOTH);

      case SYSTEM_VALUE_BARYCENTRIC_PULL_MODEL:
         return nir_load_barycentric(b, nir_intrinsic_load_barycentric_model,
                                     INTERP_MODE_NONE);

      case SYSTEM_VALUE_BARYCENTRIC_LINEAR_COORD:
      case SYSTEM_VALUE_BARYCENTRIC_PERSP_COORD: {
         enum glsl_interp_mode interp_mode;

         if (var->data.location == SYSTEM_VALUE_BARYCENTRIC_PERSP_COORD) {
            interp_mode = INTERP_MODE_SMOOTH;
         } else {
            assert(var->data.location == SYSTEM_VALUE_BARYCENTRIC_LINEAR_COORD);
            interp_mode = INTERP_MODE_NOPERSPECTIVE;
         }

         if (var->data.sample) {
            return nir_load_barycentric_coord_sample(b, 32, .interp_mode = interp_mode);
         } else if (var->data.centroid) {
            return nir_load_barycentric_coord_centroid(b, 32, .interp_mode = interp_mode);
         } else {
            return nir_load_barycentric_coord_pixel(b, 32, .interp_mode = interp_mode);
         }
      }

      case SYSTEM_VALUE_HELPER_INVOCATION: {
         /* When the demote operation is used, reading HelperInvocation
          * needs to use Volatile memory access semantics to provide the
          * correct (dynamic) value.  See OpDemoteToHelperInvocation.
          */
         if (nir_intrinsic_access(intrin) & ACCESS_VOLATILE)
            return nir_is_helper_invocation(b, 1);
         break;
      }

      case SYSTEM_VALUE_MESH_VIEW_INDICES:
         return nir_load_mesh_view_indices(b, intrin->def.num_components,
                                           bit_size, column, .base = 0,
                                           .range = intrin->def.num_components * bit_size / 8);

      default:
         break;
      }

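      /* Generic fallback: map the variable's SYSTEM_VALUE_* location to the
       * matching sysval intrinsic.  Matrix and array system values are
       * loaded one column/element at a time, and the deref's array index
       * selects among the loaded values.
       */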
      nir_intrinsic_op sysval_op =
         nir_intrinsic_from_system_value(var->data.location);
      if (glsl_type_is_matrix(var->type)) {
         assert(nir_intrinsic_infos[sysval_op].index_map[NIR_INTRINSIC_COLUMN] > 0);
         unsigned num_cols = glsl_get_matrix_columns(var->type);
         ASSERTED unsigned num_rows = glsl_get_vector_elements(var->type);
         assert(num_rows == intrin->def.num_components);

         nir_def *cols[4];
         for (unsigned i = 0; i < num_cols; i++) {
            cols[i] = nir_load_system_value(b, sysval_op, i,
                                            intrin->def.num_components,
                                            intrin->def.bit_size);
            assert(cols[i]->num_components == num_rows);
         }
         return nir_select_from_ssa_def_array(b, cols, num_cols, column);
      } else if (glsl_type_is_array(var->type)) {
         unsigned num_elems = glsl_get_length(var->type);
         ASSERTED const struct glsl_type *elem_type = glsl_get_array_element(var->type);
         assert(glsl_get_components(elem_type) == intrin->def.num_components);

         nir_def *elems[4];
         assert(ARRAY_SIZE(elems) >= num_elems);
         for (unsigned i = 0; i < num_elems; i++) {
            elems[i] = nir_load_system_value(b, sysval_op, i,
                                             intrin->def.num_components,
                                             intrin->def.bit_size);
         }
         return nir_select_from_ssa_def_array(b, elems, num_elems, column);
      } else {
         return nir_load_system_value(b, sysval_op, 0,
                                      intrin->def.num_components,
                                      intrin->def.bit_size);
      }
   }

   default:
      return NULL;
   }
}

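/* A fragment-shader invocation is a helper invocation exactly when its bit
 * in gl_SampleMaskIn is not set, so gl_HelperInvocation can be lowered to
 * !((1 << gl_SampleID) & gl_SampleMaskIn), which is what the builder below
 * emits.
 */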
nir_def *
nir_build_lowered_load_helper_invocation(nir_builder *b)
{
   nir_def *tmp;
   tmp = nir_ishl(b, nir_imm_int(b, 1),
                  nir_load_sample_id_no_per_sample(b));
   tmp = nir_iand(b, nir_load_sample_mask_in(b), tmp);
   return nir_inot(b, nir_i2b(b, tmp));
}

bool
nir_lower_system_values(nir_shader *shader)
{
   bool progress = nir_shader_lower_instructions(shader,
                                                 lower_system_value_filter,
                                                 lower_system_value_instr,
                                                 NULL);

   /* We're going to delete the variables so we need to clean up all those
    * derefs we left lying around.
    */
   if (progress)
      nir_remove_dead_derefs(shader);

   nir_foreach_variable_with_modes_safe(var, shader, nir_var_system_value)
      exec_node_remove(&var->node);

   return progress;
}

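/* Worked example for the formula in the function below: with
 * size = (4, 3, 2) and index = 23, id.z = 23 / 12 = 1,
 * id.y = (23 - 12) / 4 = 2, and id.x = 23 - (12 + 8) = 3,
 * i.e. id = (3, 2, 1).
 */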
static nir_def *
id_to_index_no_umod_slow(nir_builder *b, nir_def *index,
                         nir_def *size_x, nir_def *size_y,
                         unsigned bit_size)
{
   /* We lower ID to Index with the following formula:
    *
    *    id.z = index / (size.x * size.y)
    *    id.y = (index - (id.z * (size.x * size.y))) / size.x
    *    id.x = index - ((id.z * (size.x * size.y)) + (id.y * size.x))
    *
    * This is more efficient on HW that doesn't have a
    * modulo division instruction and when the size is either
    * not compile time known or not a power of two.
    */

   nir_def *size_x_y = nir_imul(b, size_x, size_y);
   nir_def *id_z = nir_udiv(b, index, size_x_y);
   nir_def *z_portion = nir_imul(b, id_z, size_x_y);
   nir_def *id_y = nir_udiv(b, nir_isub(b, index, z_portion), size_x);
   nir_def *y_portion = nir_imul(b, id_y, size_x);
   nir_def *id_x = nir_isub(b, index, nir_iadd(b, z_portion, y_portion));

   return nir_u2uN(b, nir_vec3(b, id_x, id_y, id_z), bit_size);
}

static nir_def *
lower_id_to_index_no_umod(nir_builder *b, nir_def *index,
                          nir_def *size, unsigned bit_size,
                          const uint32_t *size_imm,
                          bool shortcut_1d)
{
   nir_def *size_x, *size_y;

   if (size_imm[0] > 0)
      size_x = nir_imm_int(b, size_imm[0]);
   else
      size_x = nir_channel(b, size, 0);

   if (size_imm[1] > 0)
      size_y = nir_imm_int(b, size_imm[1]);
   else
      size_y = nir_channel(b, size, 1);

   if (shortcut_1d) {
      /* if size.y + size.z == 2 (which means that both y and z are 1)
       *    id = vec3(index, 0, 0)
       * else
       *    id = id_to_index_no_umod_slow
       */

      nir_def *size_z = nir_channel(b, size, 2);
      nir_def *cond = nir_ieq(b, nir_iadd(b, size_y, size_z), nir_imm_int(b, 2));

      nir_def *val1, *val2;
      nir_if *if_opt = nir_push_if(b, cond);
      if_opt->control = nir_selection_control_dont_flatten;
      {
         nir_def *zero = nir_imm_int(b, 0);
         val1 = nir_u2uN(b, nir_vec3(b, index, zero, zero), bit_size);
      }
      nir_push_else(b, if_opt);
      {
         val2 = id_to_index_no_umod_slow(b, index, size_x, size_y, bit_size);
      }
      nir_pop_if(b, if_opt);

      return nir_if_phi(b, val1, val2);
   } else {
      return id_to_index_no_umod_slow(b, index, size_x, size_y, bit_size);
   }
}

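/* Worked example for the formula in the function below: with
 * size = (4, 3, 2) and index = 23, id.x = 23 % 4 = 3,
 * id.y = (23 / 4) % 3 = 2, and id.z = 23 / (4 * 3) = 1,
 * matching the no-umod variant above.
 */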
static nir_def *
lower_id_to_index(nir_builder *b, nir_def *index, nir_def *size,
                  unsigned bit_size)
{
   /* We lower gl_LocalInvocationID to gl_LocalInvocationIndex based
    * on this formula:
    *
    *    id.x = index % size.x;
    *    id.y = (index / size.x) % gl_WorkGroupSize.y;
    *    id.z = (index / (size.x * size.y)) % size.z;
    *
    * However, the final % size.z does nothing unless we
    * accidentally end up with an index that is too
    * large, so it can safely be omitted.
    *
    * Because no hardware supports a local workgroup size greater than
    * about 1K, this calculation can be done in 32-bit and can save some
    * 64-bit arithmetic.
    */

   nir_def *size_x = nir_channel(b, size, 0);
   nir_def *size_y = nir_channel(b, size, 1);

   nir_def *id_x = nir_umod(b, index, size_x);
   nir_def *id_y = nir_umod(b, nir_udiv(b, index, size_x), size_y);
   nir_def *id_z = nir_udiv(b, index, nir_imul(b, size_x, size_y));

   return nir_u2uN(b, nir_vec3(b, id_x, id_y, id_z), bit_size);
}

static bool
lower_compute_system_value_filter(const nir_instr *instr, const void *_state)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
try_lower_id_to_index_1d(nir_builder *b, nir_def *index, const uint32_t *size)
{
   /* size_x = 1, size_y = 1, therefore Z = local index */
   if (size[0] == 1 && size[1] == 1)
      return nir_vec3(b, nir_imm_int(b, 0), nir_imm_int(b, 0), index);

   /* size_x = 1, size_z = 1, therefore Y = local index */
   if (size[0] == 1 && size[2] == 1)
      return nir_vec3(b, nir_imm_int(b, 0), index, nir_imm_int(b, 0));

   /* size_y = 1, size_z = 1, therefore X = local index */
   if (size[1] == 1 && size[2] == 1)
      return nir_vec3(b, index, nir_imm_int(b, 0), nir_imm_int(b, 0));

   return NULL;
}

static nir_def *
lower_compute_system_value_instr(nir_builder *b,
                                 nir_instr *instr, void *_state)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   struct lower_sysval_state *state = (struct lower_sysval_state *)_state;
   const nir_lower_compute_system_values_options *options = state->options;

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return NULL;

   const unsigned bit_size = intrin->def.bit_size;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_local_invocation_id:
      /* If lower_cs_local_id_to_index is true, then we replace
       * local_invocation_id with a formula based on local_invocation_index.
       */
      if (b->shader->options->lower_cs_local_id_to_index ||
          (options && options->lower_cs_local_id_to_index)) {
         nir_def *local_index = nir_load_local_invocation_index(b);

         if (!b->shader->info.workgroup_size_variable) {
            /* Shortcut for 1 dimensional workgroups:
             * Use local_invocation_index directly, which is better than
             * lower_id_to_index + constant folding, because
             * this way we don't leave behind extra ALU instrs.
             */

            uint32_t wg_size[3] = {b->shader->info.workgroup_size[0],
                                   b->shader->info.workgroup_size[1],
                                   b->shader->info.workgroup_size[2]};
            nir_def *val = try_lower_id_to_index_1d(b, local_index, wg_size);
            if (val)
               return val;
         }

         nir_def *local_size = nir_load_workgroup_size(b);
         return lower_id_to_index(b, local_index, local_size, bit_size);
      }
      if (options && options->shuffle_local_ids_for_quad_derivatives &&
          b->shader->info.derivative_group == DERIVATIVE_GROUP_QUADS &&
          _mesa_set_search(state->lower_once_list, instr) == NULL) {
         nir_def *ids = nir_load_local_invocation_id(b);
         _mesa_set_add(state->lower_once_list, ids->parent_instr);

         nir_def *x = nir_channel(b, ids, 0);
         nir_def *y = nir_channel(b, ids, 1);
         nir_def *z = nir_channel(b, ids, 2);
         unsigned size_x = b->shader->info.workgroup_size[0];
         nir_def *size_x_imm;

         if (b->shader->info.workgroup_size_variable)
            size_x_imm = nir_channel(b, nir_load_workgroup_size(b), 0);
         else
            size_x_imm = nir_imm_int(b, size_x);

         /* Remap indices from:
          *    | 0| 1| 2| 3|
          *    | 4| 5| 6| 7|
          *    | 8| 9|10|11|
          *    |12|13|14|15|
          * to:
          *    | 0| 1| 4| 5|
          *    | 2| 3| 6| 7|
          *    | 8| 9|12|13|
          *    |10|11|14|15|
          *
          * That's the layout required by AMD hardware for derivatives to
          * work. Other hardware may work differently.
          *
          * It's a classic tiling pattern that can be implemented by inserting
          * bit y[0] between bits x[0] and x[1] like this:
          *
          *    x[0],y[0],x[1],...x[last],y[1],...,y[last]
          *
          * If the width is a power of two, use:
          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) | ((y & ~1) << logbase2(size_x))
          *
          * If the width is not a power of two or the local size is variable, use:
          *    i = ((x & 1) | ((y & 1) << 1) | ((x & ~1) << 1)) + ((y & ~1) * size_x)
          *
          * GL_NV_compute_shader_derivatives requires that the width and height
          * are a multiple of two, which is also a requirement for the second
          * expression to work.
          *
          * The 2D result is: (x,y) = (i % w, i / w)
          */
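         /* Worked example, assuming size_x = 4 as in the tables above: for
          * the invocation at (x, y) = (2, 1), (x & 1) = 0, (y & 1) << 1 = 2,
          * (x & ~1) << 1 = 4 and (y & ~1) << 2 = 0, so i = 6, which is the
          * value at column 2, row 1 of the remapped layout.
          */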
         nir_def *one = nir_imm_int(b, 1);
         nir_def *inv_one = nir_imm_int(b, ~1);
         nir_def *x_bit0 = nir_iand(b, x, one);
         nir_def *y_bit0 = nir_iand(b, y, one);
         nir_def *x_bits_1n = nir_iand(b, x, inv_one);
         nir_def *y_bits_1n = nir_iand(b, y, inv_one);
         nir_def *bits_01 = nir_ior(b, x_bit0, nir_ishl(b, y_bit0, one));
         nir_def *bits_01x = nir_ior(b, bits_01,
                                     nir_ishl(b, x_bits_1n, one));
         nir_def *i;

         if (!b->shader->info.workgroup_size_variable &&
             util_is_power_of_two_nonzero(size_x)) {
            nir_def *log2_size_x = nir_imm_int(b, util_logbase2(size_x));
            i = nir_ior(b, bits_01x, nir_ishl(b, y_bits_1n, log2_size_x));
         } else {
            i = nir_iadd(b, bits_01x, nir_imul(b, y_bits_1n, size_x_imm));
         }

         /* This should be fast if size_x is an immediate or even a power
          * of two.
          */
         x = nir_umod(b, i, size_x_imm);
         y = nir_udiv(b, i, size_x_imm);

         return nir_vec3(b, x, y, z);
      }

      /* If a workgroup size dimension is 1, then the local invocation id must be zero. */
      nir_component_mask_t is_zero = 0;
      is_zero |= b->shader->info.workgroup_size[0] == 1 ? 0x1 : 0x0;
      is_zero |= b->shader->info.workgroup_size[1] == 1 ? 0x2 : 0x0;
      is_zero |= b->shader->info.workgroup_size[2] == 1 ? 0x4 : 0x0;
      if (!b->shader->info.workgroup_size_variable && is_zero) {
         nir_scalar defs[3];
         for (unsigned i = 0; i < 3; i++) {
            defs[i] = is_zero & (1 << i) ? nir_get_scalar(nir_imm_zero(b, 1, 32), 0)
                                         : nir_get_scalar(&intrin->def, i);
         }
         return nir_vec_scalars(b, defs, 3);
      }

      return NULL;

   case nir_intrinsic_load_local_invocation_index:
      /* If lower_cs_local_index_to_id is true, then we replace
       * local_invocation_index with a formula based on local_invocation_id.
       */
      if (b->shader->options->lower_cs_local_index_to_id ||
          (options && options->lower_local_invocation_index)) {
         /* From the GLSL man page for gl_LocalInvocationIndex:
          *
          *    "The value of gl_LocalInvocationIndex is equal to
          *     gl_LocalInvocationID.z * gl_WorkGroupSize.x *
          *     gl_WorkGroupSize.y + gl_LocalInvocationID.y *
          *     gl_WorkGroupSize.x + gl_LocalInvocationID.x"
          */
         nir_def *local_id = nir_load_local_invocation_id(b);
         nir_def *local_size = nir_load_workgroup_size(b);
         nir_def *size_x = nir_channel(b, local_size, 0);
         nir_def *size_y = nir_channel(b, local_size, 1);

         /* Because no hardware supports a local workgroup size greater than
          * about 1K, this calculation can be done in 32-bit and can save some
          * 64-bit arithmetic.
          */
         nir_def *index;
         index = nir_imul(b, nir_channel(b, local_id, 2),
                          nir_imul(b, size_x, size_y));
         index = nir_iadd(b, index,
                          nir_imul(b, nir_channel(b, local_id, 1), size_x));
         index = nir_iadd(b, index, nir_channel(b, local_id, 0));
         return nir_u2uN(b, index, bit_size);
      } else {
         return NULL;
      }

   case nir_intrinsic_load_workgroup_size:
      if (b->shader->info.workgroup_size_variable) {
         /* If the local work group size is variable it can't be lowered at
          * this point.  We do, however, have to make sure that the intrinsic
          * is only 32-bit.
          */
         return NULL;
      } else {
         /* Using a 32-bit constant is safe here, as no device/driver needs
          * more than 32 bits for the local size.
          */
         nir_const_value workgroup_size_const[3];
         memset(workgroup_size_const, 0, sizeof(workgroup_size_const));
         workgroup_size_const[0].u32 = b->shader->info.workgroup_size[0];
         workgroup_size_const[1].u32 = b->shader->info.workgroup_size[1];
         workgroup_size_const[2].u32 = b->shader->info.workgroup_size[2];
         return nir_u2uN(b, nir_build_imm(b, 3, 32, workgroup_size_const), bit_size);
      }

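   /* When a base workgroup ID is in play, or the hardware can't provide a
    * global invocation ID directly, the global ID is computed as
    *
    *    global_id = (workgroup_id + base_workgroup_id) * workgroup_size
    *                + local_invocation_id
    */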
   case nir_intrinsic_load_global_invocation_id: {
      if ((options && options->has_base_workgroup_id) ||
          !b->shader->options->has_cs_global_id) {
         nir_def *group_size = nir_load_workgroup_size(b);
         nir_def *group_id = nir_load_workgroup_id(b);
         nir_def *base_group_id = nir_load_base_workgroup_id(b, bit_size);
         nir_def *local_id = nir_load_local_invocation_id(b);

         return nir_iadd(b, nir_imul(b, nir_iadd(b, nir_u2uN(b, group_id, bit_size),
                                                 base_group_id),
                                     nir_u2uN(b, group_size, bit_size)),
                         nir_u2uN(b, local_id, bit_size));
      } else if (options && options->global_id_is_32bit && bit_size > 32) {
         return nir_u2uN(b, nir_load_global_invocation_id(b, 32), bit_size);
      } else {
         return NULL;
      }
   }

   case nir_intrinsic_load_base_global_invocation_id: {
      if (options && !options->has_base_global_invocation_id)
         return nir_imm_zero(b, 3, bit_size);
      return NULL;
   }

   case nir_intrinsic_load_base_workgroup_id: {
      if (options && !options->has_base_workgroup_id)
         return nir_imm_zero(b, 3, bit_size);
      return NULL;
   }

   case nir_intrinsic_load_global_invocation_index: {
      /* OpenCL's global_linear_id explicitly ignores the global offset */
      assert(b->shader->info.stage == MESA_SHADER_KERNEL);
      nir_def *global_id = nir_load_global_invocation_id(b, bit_size);
      nir_def *global_size = nir_load_global_size(b, bit_size);

      /* index = id.x + ((id.y + (id.z * size.y)) * size.x) */
      nir_def *index;
      index = nir_imul(b, nir_channel(b, global_id, 2),
                       nir_channel(b, global_size, 1));
      index = nir_iadd(b, nir_channel(b, global_id, 1), index);
      index = nir_imul(b, nir_channel(b, global_size, 0), index);
      index = nir_iadd(b, nir_channel(b, global_id, 0), index);
      return index;
   }

   case nir_intrinsic_load_global_size: {
      if (options && !options->has_global_size)
         return build_global_group_size(b, bit_size);
      return NULL;
   }

   case nir_intrinsic_load_workgroup_id: {
      if (options && options->lower_workgroup_id_to_index) {
         nir_def *wg_idx = nir_load_workgroup_index(b);

         nir_def *val =
            try_lower_id_to_index_1d(b, wg_idx, options->num_workgroups);
         if (val)
            return val;

         nir_def *num_workgroups = nir_load_num_workgroups(b);
         return lower_id_to_index_no_umod(b, wg_idx,
                                          nir_u2uN(b, num_workgroups, bit_size),
                                          bit_size,
                                          options->num_workgroups,
                                          options->shortcut_1d_workgroup_id);
      }

      return NULL;
   }

   case nir_intrinsic_load_num_workgroups: {
      if (!options)
         return NULL;

      const uint32_t *num_wgs_imm = options->num_workgroups;

      /* Exit early when none of the num workgroups components are known at
       * compile time.
       */
      if (num_wgs_imm[0] == 0 && num_wgs_imm[1] == 0 && num_wgs_imm[2] == 0)
         return NULL;

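      /* Some components are known: keep the original load and overwrite
       * just the compile-time-known components with immediates, so that
       * uses of the intrinsic see constants wherever possible.
       */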
      b->cursor = nir_after_instr(instr);

      nir_def *num_wgs = &intrin->def;
      for (unsigned i = 0; i < 3; ++i) {
         if (num_wgs_imm[i])
            num_wgs = nir_vector_insert_imm(b, num_wgs, nir_imm_int(b, num_wgs_imm[i]), i);
      }

      return num_wgs;
   }

   case nir_intrinsic_load_shader_index:
      return nir_imm_int(b, b->shader->info.cs.shader_index);

   default:
      return NULL;
   }
}

bool
nir_lower_compute_system_values(nir_shader *shader,
                                const nir_lower_compute_system_values_options *options)
{
   if (!gl_shader_stage_uses_workgroup(shader->info.stage))
      return false;

   struct lower_sysval_state state;
   state.options = options;
   state.lower_once_list = _mesa_pointer_set_create(NULL);

   bool progress =
      nir_shader_lower_instructions(shader,
                                    lower_compute_system_value_filter,
                                    lower_compute_system_value_instr,
                                    (void *)&state);
   ralloc_free(state.lower_once_list);

   /* Update this so as not to lower it again. */
   if (options && options->shuffle_local_ids_for_quad_derivatives &&
       shader->info.derivative_group == DERIVATIVE_GROUP_QUADS)
      shader->info.derivative_group = DERIVATIVE_GROUP_LINEAR;

   return progress;
}
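
/* Typical driver usage (a sketch, not taken from this file; "csv_options"
 * is a placeholder for the driver's compute options struct, and exact pass
 * ordering varies by driver):
 *
 *    bool progress = false;
 *    NIR_PASS(progress, shader, nir_lower_system_values);
 *    NIR_PASS(progress, shader, nir_lower_compute_system_values, &csv_options);
 */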