/*
 * Copyright © 2023 Collabora, Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/u_math.h"
#include "nir.h"
#include "nir_builder.h"

/**
 * \file nir_lower_subgroups.c
 */

static unsigned
get_max_subgroup_size(const nir_lower_subgroups_options *options)
{
   return options->subgroup_size
             ? options->subgroup_size
             : options->ballot_components * options->ballot_bit_size;
}

static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   nir_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_def_init(&intr->instr, &intr->def, 1, 32);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      intr->src[1] = nir_src_for_ssa(intrin->src[1].ssa);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}

static nir_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->def, &intr_y->def);
}
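
/* To illustrate the 32-bit splitting above (a sketch, not part of the pass):
 * a 64-bit subgroup op such as
 *
 *    uint64_t r = subgroupShuffle(v, id);
 *
 * is rewritten as two independent 32-bit ops on the unpacked halves,
 *
 *    uint32_t lo = subgroupShuffle(unpack_64_2x32_split_x(v), id);
 *    uint32_t hi = subgroupShuffle(unpack_64_2x32_split_y(v), id);
 *    uint64_t r  = pack_64_2x32_split(lo, hi);
 *
 * which is valid because every invocation applies the same index to both
 * halves.
 */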

/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
static nir_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
               nir_isub_imm(b, options->ballot_bit_size,
                            subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 if the index in the vector is less than the subgroup size
    * divided by the ballot bitsize, and 0 otherwise. For example, with a
    * target ballot type of 4 x uint32 and subgroup_size = 64 we'd need to
    * return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases. The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i] = nir_const_value_for_int(i * options->ballot_bit_size, 32);
   nir_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}
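
/* A worked example of build_subgroup_mask (illustrative only): with
 * ballot_bit_size = 64, ballot_components = 1 and a run-time subgroup size
 * of 32, the shift is 64 - 32 = 32, so the single component is
 * ~0ull >> 32 = 0x00000000ffffffff. With a 4 x uint32 ballot and
 * subgroup_size = 64, the shift 32 - 64 is masked by nir_ushr to 0, so
 * "result" is ~0; comparing min_idx = { 0, 32, 64, 96 } against 64 then
 * selects { ~0, ~0, 0, 0 }, matching case (2) above.
 */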

/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 *
 * This function assumes that `val << shift` will never span a ballot_bit_size
 * word and that the high bit of val can be extended across the entire result.
 * This is trivially satisfied for 0, 1, ~0, and ~1. However, it may also be
 * fine for other values if the shift is guaranteed to be sufficiently
 * aligned. One example is 0xf when the shift is known to be a multiple of 4.
 */
static nir_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* First compute the result assuming one ballot component. */
   nir_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so in case there's
    * more than one component the component which the value would be shifted
    * into already has the right value and all we have to do is fix up the
    * other components. Components below it should always be 0, and components
    * above it must be either 0 or ~0 because of the assumption above. For
    * example, if the target ballot size is 2 x uint32, and we're shifting 1
    * by 33, then we'll feed 33 into ishl, which will mask it off to get 1, so
    * we'll compute a single-component result of 2, which is correct for the
    * second component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component
    * count.
    */
   nir_const_value min_shift[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i] = nir_const_value_for_int(i * options->ballot_bit_size, 32);
   nir_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i] = nir_const_value_for_int((i + 1) * options->ballot_bit_size, 32);
   nir_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}

static nir_def *
ballot_type_to_uint(nir_builder *b, nir_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Allow internally generated ballots to pass through */
   if (value->num_components == options->ballot_components &&
       value->bit_size == options->ballot_bit_size)
      return value;

   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}
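
/* For example (a sketch of the two conversions): on a driver with
 * ballot_bit_size = 64 and ballot_components = 1, ballot_type_to_uint
 * narrows the SPIR-V-style uvec4 ballot to one uint64 holding bits [0, 64),
 * and uint_to_ballot_type (below) goes the other way, bitcasting the uint64
 * to two uint32s and zero-padding to produce { lo, hi, 0, 0 }.
 */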

static nir_def *
uint_to_ballot_type(nir_builder *b, nir_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate. This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot. This comes up in Zink
    * for OpenGL on Vulkan. It's the job of the driver calling this lowering
    * pass to ensure that it has restricted subgroup sizes sufficiently that
    * we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_trim_vector(b, value, num_components);

   return value;
}

static nir_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->def.num_components > 1);

   nir_def *value = intrin->src[0].ssa;
   nir_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_def_init(&chan_intrin->instr, &chan_intrin->def, 1,
                   intrin->def.bit_size);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      /* invocation */
      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         chan_intrin->src[1] = nir_src_for_ssa(intrin->src[1].ssa);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      nir_builder_instr_insert(b, &chan_intrin->instr);
      reads[i] = &chan_intrin->def;
   }

   return nir_vec(b, reads, intrin->num_components);
}

static nir_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *value = intrin->src[0].ssa;

   nir_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_def *chan = nir_channel(b, value, i);

      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         chan = nir_vote_feq(b, intrin->def.bit_size, chan);
      } else {
         chan = nir_vote_ieq(b, intrin->def.bit_size, chan);
      }

      if (result) {
         result = nir_iand(b, result, chan);
      } else {
         result = chan;
      }
   }

   return result;
}

static nir_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}

static nir_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   if (mask >= 32)
      return NULL;

   return nir_masked_swizzle_amd(b, intrin->src[0].ssa,
                                 .swizzle_mask = (mask << 10) | 0x1f,
                                 .fetch_inactive = true);
}
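
/* A sketch of why the swizzle above implements shuffle_xor, assuming the
 * usual AMD masked-swizzle semantics new_id = ((id & and_mask) | or_mask) ^
 * xor_mask, with and_mask in bits [0, 5), or_mask in bits [5, 10) and
 * xor_mask in bits [10, 15): the encoding (mask << 10) | 0x1f sets
 * and_mask = 0x1f, or_mask = 0 and xor_mask = mask, so each invocation reads
 * lane (id ^ mask) within a 32-wide group, which is shuffle_xor with a
 * constant mask.
 */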

/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */

static nir_def *
lower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                 const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {

      nir_def *result = lower_shuffle_to_swizzle(b, intrin);
      if (result)
         return result;
   }

   nir_def *index = nir_load_subgroup_invocation(b);
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      index = nir_ixor(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_up:
      index = nir_isub(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_down:
      index = nir_iadd(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_broadcast:
      index = nir_ior(b, nir_iand_imm(b, index, ~0x3),
                      intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For Quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index to a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   case nir_intrinsic_rotate: {
      nir_def *delta = intrin->src[1].ssa;
      nir_def *local_id = nir_load_subgroup_invocation(b);
      const unsigned cluster_size = nir_intrinsic_cluster_size(intrin);

      nir_def *rotation_group_mask =
         cluster_size > 0 ? nir_imm_int(b, (int)(cluster_size - 1))
                          : nir_iadd_imm(b, nir_load_subgroup_size(b), -1);

      index = nir_iand(b, nir_iadd(b, local_id, delta),
                       rotation_group_mask);
      if (cluster_size > 0) {
         index = nir_iadd(b, index,
                          nir_iand(b, local_id, nir_inot(b, rotation_group_mask)));
      }
      break;
   }
   default:
      unreachable("Invalid intrinsic");
   }

   return nir_shuffle(b, intrin->src[0].ssa, index);
}
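
/* Concrete index math for the cases above (illustrative): for invocation 5,
 * shuffle_up by 2 reads lane 5 - 2 = 3, shuffle_down by 2 reads lane
 * 5 + 2 = 7, shuffle_xor by 3 reads lane 5 ^ 3 = 6, and quad_broadcast of
 * index 2 reads lane (5 & ~3) | 2 = 6, i.e. slot 2 within its own quad.
 */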

static const struct glsl_type *
glsl_type_for_ssa(nir_def *def)
{
   const struct glsl_type *comp_type =
      def->bit_size == 1 ? glsl_bool_type() : glsl_uintN_t_type(def->bit_size);
   return glsl_replace_vector_type(comp_type, def->num_components);
}

/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
 */
static nir_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *val = intrin->src[0].ssa;
   nir_def *id = intrin->src[1].ssa;

   /* The loop is something like:
    *
    * while (true) {
    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
    *    first_val = readFirstInvocation(val);
    *    first_result = readInvocation(val, readFirstInvocation(id));
    *    if (id == first_id)
    *       result = first_val;
    *    if (elect()) {
    *       if (id > gl_SubgroupInvocationID) {
    *          result = first_result;
    *       }
    *       break;
    *    }
    * }
    *
    * The idea is to guarantee, on each iteration of the loop, that anything
    * reading from first_id gets the correct value, so that we can then kill
    * it off by breaking out of the loop. Before doing that we also have to
    * ensure that the first_id invocation gets the correct value. The only
    * case where it won't already have been assigned the correct value is
    * when the invocation it's reading from hasn't been killed off yet, that
    * is, when that invocation is later than its own ID. Invocations where
    * id <= gl_SubgroupInvocationID will be assigned their result in the
    * first if, and invocations where id > gl_SubgroupInvocationID will be
    * assigned their result in the second if.
    *
    * We do this more complicated loop rather than looping over all id's
    * explicitly because at this point we don't know the "actual" subgroup
    * size and at the moment there's no way to get at it, which means we may
    * loop over always-inactive invocations.
    */

   nir_def *subgroup_id = nir_load_subgroup_invocation(b);

   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");

   nir_loop *loop = nir_push_loop(b);
   {
      nir_def *first_id = nir_read_first_invocation(b, subgroup_id);
      nir_def *first_val = nir_read_first_invocation(b, val);
      nir_def *first_result =
         nir_read_invocation(b, val, nir_read_first_invocation(b, id));

      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id));
      {
         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
      }
      nir_pop_if(b, nif);

      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1));
      {
         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id));
         {
            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
         }
         nir_pop_if(b, nif3);

         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, nif2);
   }
   nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}

static nir_def *
lower_boolean_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                      const nir_lower_subgroups_options *options)
{
   assert(options->ballot_components == 1 && options->subgroup_size);
   nir_def *ballot = nir_ballot_relaxed(b, 1, options->ballot_bit_size, intrin->src[0].ssa);

   nir_def *index = NULL;

   /* If the shuffle amount isn't constant, it might be divergent but
    * inverse_ballot requires a uniform source, so take a different path.
    * rotate allows us to assume the delta is uniform, unlike shuffle_up/down.
    */
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_up:
      if (nir_src_is_const(intrin->src[1]))
         ballot = nir_ishl(b, ballot, intrin->src[1].ssa);
      else
         index = nir_isub(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_down:
      if (nir_src_is_const(intrin->src[1]))
         ballot = nir_ushr(b, ballot, intrin->src[1].ssa);
      else
         index = nir_iadd(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_xor:
      index = nir_ixor(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_rotate: {
      nir_def *delta = nir_as_uniform(b, intrin->src[1].ssa);
      uint32_t cluster_size = nir_intrinsic_cluster_size(intrin);
      unsigned subgroup_size = get_max_subgroup_size(options);
      cluster_size = cluster_size ? cluster_size : subgroup_size;
      cluster_size = MIN2(cluster_size, subgroup_size);
      if (cluster_size == 1) {
         return intrin->src[0].ssa;
      } else if (cluster_size == 2) {
         delta = nir_iand_imm(b, delta, cluster_size - 1);
         nir_def *lo = nir_iand_imm(b, nir_ushr_imm(b, ballot, 1), 0x5555555555555555ull);
         nir_def *hi = nir_iand_imm(b, nir_ishl_imm(b, ballot, 1), 0xaaaaaaaaaaaaaaaaull);
         ballot = nir_bcsel(b, nir_ine_imm(b, delta, 0), nir_ior(b, hi, lo), ballot);
      } else if (cluster_size == ballot->bit_size) {
         ballot = nir_uror(b, ballot, delta);
      } else if (cluster_size == 32) {
         nir_def *unpacked = nir_unpack_64_2x32(b, ballot);
         unpacked = nir_uror(b, unpacked, delta);
         ballot = nir_pack_64_2x32(b, unpacked);
      } else {
         delta = nir_iand_imm(b, delta, cluster_size - 1);
         nir_def *delta_rev = nir_isub_imm(b, cluster_size, delta);
         nir_def *mask = nir_mask(b, delta_rev, ballot->bit_size);
         for (uint32_t i = cluster_size; i < ballot->bit_size; i *= 2) {
            mask = nir_ior(b, nir_ishl_imm(b, mask, i), mask);
         }
         nir_def *lo = nir_iand(b, nir_ushr(b, ballot, delta), mask);
         nir_def *hi = nir_iand(b, nir_ishl(b, ballot, delta_rev), nir_inot(b, mask));
         ballot = nir_ior(b, lo, hi);
      }
      break;
   }
   case nir_intrinsic_shuffle:
      index = intrin->src[1].ssa;
      break;
   case nir_intrinsic_read_invocation:
      index = nir_as_uniform(b, intrin->src[1].ssa);
      break;
   default:
      unreachable("not a boolean shuffle");
   }

   if (index) {
      nir_def *mask = nir_ishl(b, nir_imm_intN_t(b, 1, ballot->bit_size), index);
      return nir_ine_imm(b, nir_iand(b, ballot, mask), 0);
   } else {
      return nir_inverse_ballot(b, 1, ballot);
   }
}

static nir_def *
vec_bit_count(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_bit_count(b, value);
   nir_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}

/* Produce a bitmask of 111...000...111... alternating between "size"
 * 1's and "size" 0's (the LSB is 1).
 */
static uint64_t
reduce_mask(unsigned size, unsigned ballot_bit_size)
{
   uint64_t mask = 0;
   for (unsigned i = 0; i < ballot_bit_size; i += 2 * size) {
      mask |= ((1ull << size) - 1) << i;
   }

   return mask;
}

/* Operate on a uniform per-thread bitmask provided by ballot() to perform
 * the desired Boolean reduction. Assumes that the identity of the operation
 * is false (so, no iand).
 */
static nir_def *
lower_boolean_reduce_internal(nir_builder *b, nir_def *src,
                              unsigned cluster_size, nir_op op,
                              const nir_lower_subgroups_options *options)
{
   for (unsigned size = 1; size < cluster_size; size *= 2) {
      nir_def *shifted = nir_ushr_imm(b, src, size);
      src = nir_build_alu2(b, op, shifted, src);
      uint64_t mask = reduce_mask(size, options->ballot_bit_size);
      src = nir_iand_imm(b, src, mask);
      shifted = nir_ishl_imm(b, src, size);
      src = nir_ior(b, src, shifted);
   }

   return src;
}
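
/* A worked example of the reduction above (illustrative): with an 8-bit
 * ballot and op = ior, the first step (size = 1) ORs each bit with its
 * neighbor and uses reduce_mask(1, 8) = 0b01010101 to keep one result bit
 * per pair before copying it back to both bits; the next step (size = 2)
 * does the same for pairs of pairs with reduce_mask(2, 8) = 0b00110011, and
 * so on. After log2(cluster_size) steps every bit of a cluster holds that
 * cluster's reduction, ready for nir_inverse_ballot.
 */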

/* Operate on a uniform per-thread bitmask provided by ballot() to perform
 * the desired Boolean inclusive scan. Assumes that the identity of the
 * operation is false (so, no iand).
 */
static nir_def *
lower_boolean_scan_internal(nir_builder *b, nir_def *src,
                            nir_op op,
                            const nir_lower_subgroups_options *options)
{
   if (op == nir_op_ior) {
      /* We want to return a bitmask with all 1's starting at the first 1 in
       * src. -src is equivalent to ~src + 1. While src | ~src returns all
       * 1's, src | (~src + 1) returns all 1's except for the bits changed by
       * the increment. Any 1's before the least significant 0 of ~src are
       * turned into 0 (zeroing those bits after or'ing) and the least
       * significant 0 of ~src is turned into 1 (not doing anything). So the
       * final output is what we want.
       */
      return nir_ior(b, src, nir_ineg(b, src));
   } else {
      assert(op == nir_op_ixor);
      for (unsigned shift = 1; shift < options->ballot_bit_size; shift *= 2) {
         src = nir_ixor(b, src, nir_ishl_imm(b, src, shift));
      }
      return src;
   }
}
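
/* A worked example of the ior scan trick (illustrative): for
 * src = 0b00101000 we get -src = 0b11011000 and src | -src = 0b11111000,
 * i.e. every bit at or above the least significant 1 of src is set, which
 * is exactly the inclusive OR-scan of the ballot. The ixor loop instead
 * leaves bit i holding the parity of bits [0, i], doubling the folded span
 * on each iteration.
 */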

static nir_def *
lower_boolean_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
                     const nir_lower_subgroups_options *options)
{
   assert(intrin->num_components == 1);
   assert(options->ballot_components == 1);

   unsigned cluster_size =
      intrin->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(intrin) : 0;
   nir_op op = nir_intrinsic_reduction_op(intrin);

   /* For certain cluster sizes, reductions of iand and ior can be implemented
    * more efficiently.
    */
   if (intrin->intrinsic == nir_intrinsic_reduce) {
      if (cluster_size == 0) {
         if (op == nir_op_iand)
            return nir_vote_all(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ior)
            return nir_vote_any(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ixor)
            return nir_i2b(b, nir_iand_imm(b, vec_bit_count(b, nir_ballot(b,
                                                            options->ballot_components,
                                                            options->ballot_bit_size,
                                                            intrin->src[0].ssa)),
                                           1));
         else
            unreachable("bad boolean reduction op");
      }

      if (cluster_size == 4) {
         if (op == nir_op_iand)
            return nir_quad_vote_all(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ior)
            return nir_quad_vote_any(b, 1, intrin->src[0].ssa);
      }
   }

   nir_def *src = intrin->src[0].ssa;

   /* Apply DeMorgan's law to implement "and" reductions, since all the
    * lower_boolean_*_internal() functions assume an identity of 0 to make
    * the generated code shorter.
    */
   nir_op new_op = (op == nir_op_iand) ? nir_op_ior : op;
   if (op == nir_op_iand) {
      src = nir_inot(b, src);
   }

   nir_def *val = nir_ballot(b, options->ballot_components, options->ballot_bit_size, src);

   switch (intrin->intrinsic) {
   case nir_intrinsic_reduce:
      val = lower_boolean_reduce_internal(b, val, cluster_size, new_op, options);
      break;
   case nir_intrinsic_inclusive_scan:
      val = lower_boolean_scan_internal(b, val, new_op, options);
      break;
   case nir_intrinsic_exclusive_scan:
      val = lower_boolean_scan_internal(b, val, new_op, options);
      val = nir_ishl_imm(b, val, 1);
      break;
   default:
      unreachable("bad intrinsic");
   }

   if (op == nir_op_iand) {
      val = nir_inot(b, val);
   }

   return nir_inverse_ballot(b, 1, val);
}

static nir_def *
build_identity(nir_builder *b, unsigned bit_size, nir_op op)
{
   nir_const_value ident_const = nir_alu_binop_identity(op, bit_size);
   return nir_build_imm(b, 1, bit_size, &ident_const);
}

/* Implementation of scan/reduce that assumes a full subgroup */
static nir_def *
build_scan_full(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
                nir_def *data, unsigned cluster_size)
{
   switch (op) {
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_inclusive_scan: {
      for (unsigned i = 1; i < cluster_size; i *= 2) {
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *has_buddy = nir_ige_imm(b, idx, i);

         nir_def *buddy_data = nir_shuffle_up(b, data, nir_imm_int(b, i));
         nir_def *accum = nir_build_alu2(b, red_op, data, buddy_data);
         data = nir_bcsel(b, has_buddy, accum, data);
      }

      if (op == nir_intrinsic_exclusive_scan) {
         /* For exclusive scans, we need to shift one more time and fill in
          * the bottom channel with identity.
          */
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *has_buddy = nir_ige_imm(b, idx, 1);

         nir_def *buddy_data = nir_shuffle_up(b, data, nir_imm_int(b, 1));
         nir_def *identity = build_identity(b, data->bit_size, red_op);
         data = nir_bcsel(b, has_buddy, buddy_data, identity);
      }

      return data;
   }

   case nir_intrinsic_reduce: {
      for (unsigned i = 1; i < cluster_size; i *= 2) {
         nir_def *buddy_data = nir_shuffle_xor(b, data, nir_imm_int(b, i));
         data = nir_build_alu2(b, red_op, data, buddy_data);
      }
      return data;
   }

   default:
      unreachable("Unsupported scan/reduce op");
   }
}
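
/* The scan loop above is the classic stride-doubling (Hillis-Steele) scan,
 * shown here for four invocations with red_op = iadd (illustrative):
 *
 *    start:    { a,   b,     c,     d       }
 *    stride 1: { a,   a+b,   b+c,   c+d     }
 *    stride 2: { a,   a+b,   a+b+c, a+b+c+d }
 *
 * i.e. after log2(cluster_size) shuffle_up steps each invocation holds the
 * inclusive scan of everything at or below it.
 */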

/* Fully generic implementation of scan/reduce that takes a mask */
static nir_def *
build_scan_reduce(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
                  nir_def *data, nir_def *mask, unsigned max_mask_bits,
                  const nir_lower_subgroups_options *options)
{
   nir_def *lt_mask = nir_load_subgroup_lt_mask(b, options->ballot_components,
                                                options->ballot_bit_size);

   /* Mask of all channels whose values we need to accumulate. Our own value
    * is already in data, if inclusive, so we only need to consider lower
    * indexed invocations.
    */
   nir_def *remaining = nir_iand(b, mask, lt_mask);

   for (unsigned i = 1; i < max_mask_bits; i *= 2) {
      /* At each step, our buddy channel is the first channel we have yet to
       * take into account in the accumulator.
       */
      nir_def *has_buddy = nir_bany_inequal(b, remaining, nir_imm_int(b, 0));
      nir_def *buddy = nir_ballot_find_msb(b, 32, remaining);

      /* Accumulate with our buddy channel, if any */
      nir_def *buddy_data = nir_shuffle(b, data, buddy);
      nir_def *accum = nir_build_alu2(b, red_op, data, buddy_data);
      data = nir_bcsel(b, has_buddy, accum, data);

      /* We just took into account everything in our buddy's accumulator from
       * the previous step. The only things remaining are whatever channels
       * were remaining for our buddy.
       */
      nir_def *buddy_remaining = nir_shuffle(b, remaining, buddy);
      remaining = nir_bcsel(b, has_buddy, buddy_remaining, nir_imm_int(b, 0));
   }

   switch (op) {
   case nir_intrinsic_exclusive_scan: {
      /* For exclusive scans, we need to shift one more time and fill in the
       * bottom channel with identity.
       *
       * Some of this will get CSE'd with the first step but that's okay. The
       * code is cleaner this way.
       */
      nir_def *lower = nir_iand(b, mask, lt_mask);
      nir_def *has_buddy = nir_bany_inequal(b, lower, nir_imm_int(b, 0));
      nir_def *buddy = nir_ballot_find_msb(b, 32, lower);

      nir_def *buddy_data = nir_shuffle(b, data, buddy);
      nir_def *identity = build_identity(b, data->bit_size, red_op);
      return nir_bcsel(b, has_buddy, buddy_data, identity);
   }

   case nir_intrinsic_inclusive_scan:
      return data;

   case nir_intrinsic_reduce: {
      /* For reductions, we need to take the top value of the scan */
      nir_def *idx = nir_ballot_find_msb(b, 32, mask);
      return nir_shuffle(b, data, idx);
   }

   default:
      unreachable("Unsupported scan/reduce op");
   }
}
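
/* An example trace of the mask-driven walk above (illustrative): if the
 * active lanes are { 0, 2, 5 }, lane 5 starts with remaining = { 0, 2 },
 * picks buddy = 2 (the MSB of its remaining set), accumulates lane 2's data
 * and inherits lane 2's remaining = { 0 }; the next step accumulates lane 0
 * and leaves remaining empty. After step k every lane has either finished
 * or folded in at least 2^k active lanes, so log2(max_mask_bits) iterations
 * always suffice.
 */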

static nir_def *
build_cluster_mask(nir_builder *b, unsigned cluster_size,
                   const nir_lower_subgroups_options *options)
{
   nir_def *idx = nir_load_subgroup_invocation(b);
   nir_def *cluster = nir_iand_imm(b, idx, ~(uint64_t)(cluster_size - 1));

   if (cluster_size <= options->ballot_bit_size) {
      return build_ballot_imm_ishl(b, BITFIELD_MASK(cluster_size), cluster,
                                   options);
   }

   /* Since the cluster size and the ballot bit size are both powers of 2,
    * cluster size will be a multiple of the ballot bit size. Therefore, each
    * ballot component will be either all ones or all zeros. Build a vec for
    * which each component holds the value of `cluster` for which the mask
    * should be all ones.
    */
   nir_const_value cluster_sel_const[4];
   assert(ARRAY_SIZE(cluster_sel_const) >= options->ballot_components);

   for (unsigned i = 0; i < options->ballot_components; i++) {
      unsigned cluster_val =
         ROUND_DOWN_TO(i * options->ballot_bit_size, cluster_size);
      cluster_sel_const[i] =
         nir_const_value_for_uint(cluster_val, options->ballot_bit_size);
   }

   nir_def *cluster_sel =
      nir_build_imm(b, options->ballot_components, options->ballot_bit_size,
                    cluster_sel_const);
   nir_def *ones = nir_imm_intN_t(b, -1, options->ballot_bit_size);
   nir_def *zeros = nir_imm_intN_t(b, 0, options->ballot_bit_size);
   return nir_bcsel(b, nir_ieq(b, cluster, cluster_sel), ones, zeros);
}

static nir_def *
lower_scan_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
                  const nir_lower_subgroups_options *options)
{
   const nir_op red_op = nir_intrinsic_reduction_op(intrin);
   unsigned subgroup_size = get_max_subgroup_size(options);

   /* Grab the cluster size */
   unsigned cluster_size = subgroup_size;
   if (nir_intrinsic_has_cluster_size(intrin)) {
      cluster_size = nir_intrinsic_cluster_size(intrin);
      if (cluster_size == 0 || cluster_size > subgroup_size)
         cluster_size = subgroup_size;
   }

   /* Check if all invocations are active. If so, we use the fast path. */
   nir_def *mask = nir_ballot(b, options->ballot_components,
                              options->ballot_bit_size, nir_imm_true(b));

   nir_def *full, *partial;
   nir_push_if(b, nir_ball_iequal(b, mask, build_subgroup_mask(b, options)));
   {
      full = build_scan_full(b, intrin->intrinsic, red_op,
                             intrin->src[0].ssa, cluster_size);
   }
   nir_push_else(b, NULL);
   {
      /* Mask according to the cluster size */
      if (cluster_size < subgroup_size) {
         nir_def *cluster_mask = build_cluster_mask(b, cluster_size, options);
         mask = nir_iand(b, mask, cluster_mask);
      }

      partial = build_scan_reduce(b, intrin->intrinsic, red_op,
                                  intrin->src[0].ssa, mask, cluster_size,
                                  options);
   }
   nir_pop_if(b, NULL);
   return nir_if_phi(b, full, partial);
}

static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   if (options->filter) {
      return options->filter(instr, options->filter_data);
   }

   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}

static nir_def *
build_subgroup_quad_mask(nir_builder *b,
                         const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);
   nir_def *quad_first_idx = nir_iand_imm(b, subgroup_idx, ~0x3);

   return build_ballot_imm_ishl(b, 0xf, quad_first_idx, options);
}
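
/* Example values for the mask builders above (illustrative, single 32-bit
 * ballot component): for subgroup invocation 5, eq_mask = 1 << 5 = 0x20,
 * ge_mask = ~0 << 5 = 0xffffffe0, gt_mask = ~1 << 5 = 0xffffffc0, and
 * quad_mask = 0xf << (5 & ~3) = 0xf0. The ge/gt masks still include bits at
 * or above the run-time subgroup size, which is why their users "and" them
 * with build_subgroup_mask().
 */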

static nir_def *
build_quad_vote_any(nir_builder *b, nir_def *src,
                    const nir_lower_subgroups_options *options)
{
   nir_def *ballot = nir_ballot(b, options->ballot_components,
                                options->ballot_bit_size,
                                src);
   nir_def *mask = build_subgroup_quad_mask(b, options);

   return nir_ine_imm(b, nir_iand(b, ballot, mask), 0);
}

static nir_def *
vec_find_lsb(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_find_lsb(b, value);
   nir_def *result = nir_imm_int(b, -1);
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige_imm(b, channel, 0),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

static nir_def *
vec_find_msb(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_ufind_msb(b, value);
   nir_def *result = nir_imm_int(b, -1);
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige_imm(b, channel, 0),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

static nir_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_to_shuffle(b, intrin, options);

   nir_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_def *qbcst = nir_quad_broadcast(b, intrin->src[0].ssa,
                                          nir_imm_int(b, i));

      if (i)
         dst = nir_bcsel(b, nir_ieq_imm(b, intrin->src[1].ssa, i),
                         qbcst, dst);
      else
         dst = qbcst;
   }

   return dst;
}

static nir_def *
lower_first_invocation_to_ballot(nir_builder *b, nir_intrinsic_instr *intrin,
                                 const nir_lower_subgroups_options *options)
{
   return nir_ballot_find_lsb(b, 32, nir_ballot(b, 4, 32, nir_imm_true(b)));
}

static nir_def *
lower_read_first_invocation(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation(b, intrin->src[0].ssa, nir_first_invocation(b));
}

static nir_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->def.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}

static nir_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return intrin->src[0].ssa;
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (nir_src_bit_size(intrin->src[0]) == 1) {
         if (options->lower_vote_bool_eq)
            return lower_vote_eq(b, intrin);
      } else {
         if (options->lower_vote_eq)
            return lower_vote_eq(b, intrin);
      }

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_first_invocation:
      if (options->subgroup_size == 1)
         return nir_imm_int(b, 0);

      if (options->lower_first_invocation_to_ballot)
         return lower_first_invocation_to_ballot(b, intrin, options);

      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);

      if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);

      if (options->lower_read_first_invocation)
         return lower_read_first_invocation(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->def.num_components,
                                 intrin->def.bit_size);
   }

   case nir_intrinsic_ballot: {
      if (intrin->def.num_components == options->ballot_components &&
          intrin->def.bit_size == options->ballot_bit_size)
         return NULL;

      nir_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->def.num_components,
                                 intrin->def.bit_size);
   }

   case nir_intrinsic_inverse_ballot:
      if (options->lower_inverse_ballot) {
         return nir_ballot_bitfield_extract(b, 1, intrin->src[0].ssa,
                                            nir_load_subgroup_invocation(b));
      } else if (intrin->src[0].ssa->num_components != options->ballot_components ||
                 intrin->src[0].ssa->bit_size != options->ballot_bit_size) {
         return nir_inverse_ballot(b, 1, ballot_type_to_uint(b, intrin->src[0].ssa, options));
      }
      break;

   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      nir_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                             options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          *    "Find the most significant bit set to 1 in Value, considering
          *    only the bits in Value required to represent all bits of the
          *    group’s invocations. If none of the considered bits is set to
          *    1, the result is undefined."
          *
          * It has similar text for the other three. This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits. If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits. In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * anything we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         nir_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated
             * in the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_test_mask(b, nir_ushr(b, int_val, idx), 1);
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }
   }

   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
      nir_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                             options);
      if (options->lower_ballot_bit_count_to_mbcnt_amd) {
         nir_def *acc;
         if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_exclusive) {
            acc = nir_imm_int(b, 0);
         } else {
            acc = nir_iand_imm(b, nir_u2u32(b, int_val), 0x1);
            int_val = nir_ushr_imm(b, int_val, 1);
         }
         return nir_mbcnt_amd(b, int_val, acc);
      }

      nir_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }

   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }

   case nir_intrinsic_shuffle:
      if (options->lower_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_shuffle(b, intrin);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_relative_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      break;

   case nir_intrinsic_quad_vote_any:
      if (options->lower_quad_vote)
         return build_quad_vote_any(b, intrin->src[0].ssa, options);
      break;
   case nir_intrinsic_quad_vote_all:
      if (options->lower_quad_vote) {
         nir_def *not_src = nir_inot(b, intrin->src[0].ssa);
         nir_def *any_not = build_quad_vote_any(b, not_src, options);
         return nir_inot(b, any_not);
      }
      break;

   case nir_intrinsic_reduce: {
      nir_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation
       * defined.
       */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (nir_intrinsic_cluster_size(intrin) == 1)
         return intrin->src[0].ssa;
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      if (intrin->def.bit_size == 1 && options->ballot_components == 1 &&
          (options->lower_boolean_reduce || options->lower_reduce))
         return lower_boolean_reduce(b, intrin, options);
      if (options->lower_reduce)
         return lower_scan_reduce(b, intrin, options);
      return ret;
   }
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      if (intrin->def.bit_size == 1 && options->ballot_components == 1 &&
          (options->lower_boolean_reduce || options->lower_reduce))
         return lower_boolean_reduce(b, intrin, options);
      if (options->lower_reduce)
         return lower_scan_reduce(b, intrin, options);
      break;

   case nir_intrinsic_rotate:
      if (options->lower_rotate_to_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_rotate_clustered_to_shuffle &&
               nir_intrinsic_cluster_size(intrin) > 0 &&
               (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_masked_swizzle_amd:
      if (options->lower_to_scalar && intrin->num_components > 1) {
         return lower_subgroup_op_to_scalar(b, intrin);
      } else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64) {
         return lower_subgroup_op_to_32bit(b, intrin);
      }
      break;

   default:
      break;
   }

   return NULL;
}

bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   return nir_shader_lower_instructions(shader, lower_subgroups_filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}