mirror of OpenBSD xenocara tree
github.com/openbsd/xenocara
openbsd
1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24# Connor Abbott (cwabbott0@gmail.com)
25
26import re
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Everything we know about a single NIR ALU opcode.

   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr,
                description):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs
      - description is an optional documentation string for the opcode

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables are scalars; non-per-component ones are structs
      with fields named x, y, z, and w, all of the correct type.  Input and
      output variables can be assumed to already be of the correct type and
      need no conversion; in particular, the conversion from the C bool type
      to/from NIR_TRUE and NIR_FALSE happens automatically.

      For per-component instructions, the entire expression is executed once
      per component.  For non-per-component instructions, the expression must
      store the correct values in dst.x, dst.y, etc.  If "dst" does not
      appear anywhere in the constant expression, an assignment to dst is
      generated automatically, equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_sizes[0], int)
      assert isinstance(input_types, list)
      assert isinstance(input_types[0], str)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      # A size of 0 means per-component / width taken from the SSA def; the
      # only other representable vector sizes are 1-5, 8 and 16.
      assert output_size in (0, 1, 2, 3, 4, 5, 8, 16)
      for size in input_sizes:
         assert 0 <= size <= 5 or size in (8, 16)
         # A fixed-size output cannot be driven by per-component inputs.
         if output_size != 0:
            assert size != 0

      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
      self.description = description
95
# helper variables for strings
# Sized names (e.g. tfloat32) pin an opcode to one bit width; unsized names
# (e.g. tfloat) stand for every width listed by type_sizes() below.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
# Splits a NIR type string into its base type and optional bit size,
# e.g. "float32" -> ("float", "32"), "int" -> ("int", None).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def _match_type(type_):
   """Match *type_* against the type grammar, asserting it is well-formed."""
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   return m

def type_has_size(type_):
   """Return True when the NIR type string carries an explicit bit size."""
   return _match_type(type_).group('bits') is not None

def type_size(type_):
   """Return the bit size of an explicitly sized NIR type string."""
   bits = _match_type(type_).group('bits')
   assert bits is not None, \
       'NIR type string has no bit size: "{}"'.format(type_)
   return int(bits)

def type_sizes(type_):
   """Return the list of bit sizes an opcode of this type can have."""
   if type_has_size(type_):
      return [type_size(type_)]
   if type_ == 'bool':
      return [1, 8, 16, 32]
   if type_ == 'float':
      return [16, 32, 64]
   # Unsized (u)int: every integer width NIR supports.
   return [1, 8, 16, 32, 64]

def type_base_type(type_):
   """Strip any bit size, returning just the base type name."""
   return _match_type(type_).group('type')
144
# Operation where the first two sources are commutative.
#
# For 2-source operations, this just mathematical commutativity. Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
# NOTE: each property string keeps a trailing space so that properties can be
# concatenated, e.g. _2src_commutative + associative.
_2src_commutative = "2src_commutative "
associative = "associative "
selection = "selection "

# global dictionary of opcodes
opcodes = {}
156
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr, description = ""):
   """Build an Opcode from the arguments and register it in the global
   opcodes table.  Each opcode name may only be defined once.
   """
   assert name not in opcodes
   info = Opcode(name, output_size, output_type, input_sizes, input_types,
                 is_conversion, algebraic_properties, const_expr, description)
   opcodes[name] = info
163
def unop_convert(name, out_type, in_type, const_expr, description = ""):
   """One-source, per-component opcode whose result type differs from its
   source type."""
   opcode(name, 0, out_type, [0], [in_type], False, "", const_expr,
          description)
166
def unop(name, ty, const_expr, description = "", algebraic_properties = ""):
   """One-source, per-component opcode whose result type matches its source
   type."""
   opcode(name, 0, ty, [0], [ty], False, algebraic_properties, const_expr,
          description)
170
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr, description = ""):
   """One-source opcode with explicit ("horizontal") source and destination
   vector sizes."""
   opcode(name, output_size, output_type, [input_size], [input_type], False,
          "", const_expr, description)
175
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr, description = ""):
   """Declare the 2-, 3- and 4-component reductions of a one-source opcode.

   Each source component is run through prereduce_expr, the results are
   pairwise-combined with reduce_expr, and the whole thing is wrapped in
   final_expr to produce the constant expression for each vector width.
   """
   def prereduce(src):
      return "(" + prereduce_expr.format(src=src) + ")"
   def combine(a, b):
      return reduce_expr.format(src0=a, src1=b)
   def final(src):
      return final_expr.format(src="(" + src + ")")
   # Prereduced expressions for the four possible source components.
   x, y, z, w = [prereduce("src0." + comp) for comp in "xyzw"]
   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              final(combine(x, y)), description)
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              final(combine(combine(x, y), z)), description)
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              final(combine(combine(x, y), combine(z, w))), description)
195
def unop_numeric_convert(name, out_type, in_type, const_expr, description = ""):
   """Like unop_convert, but the opcode is flagged as a numeric type
   conversion (is_conversion=True)."""
   opcode(name, 0, out_type, [0], [in_type], True, "", const_expr,
          description)
198
unop("mov", tuint, "src0")

# Negating INT_MIN would be signed-overflow UB in C, so it is returned as-is.
unop("ineg", tint, "src0 == u_intN_min(bit_size) ? src0 : -src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0", description = "Invert every bit of the integer")

unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "(isnan(src0) ? 0.0 : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0 : -1.0 )) : " +
                       "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"),
     description = """
Roughly implements the OpenGL / Vulkan rules for ``sign(float)``.
The ``GLSL.std.450 FSign`` instruction is defined as:

   Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.

If the source is equal to zero, there is a preference for the result to have
the same sign, but this is not required (it is required by OpenCL). If the
source is not a number, there is a preference for the result to be +0.0, but
this is not required (it is required by OpenCL). If the source is not a
number, and the result is not +0.0, the result should definitely **not** be
NaN.

The values returned for constant folding match the behavior required by
OpenCL.
   """)

unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Clamp to [0.0, 1.0].
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
# Reciprocal / reciprocal-sqrt / sqrt pick the double or float C routine
# based on the run-time bit size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
234
# Generate all of the numeric conversion opcodes
# Opcode names follow the pattern <src>2<dst><bits>[<round mode>], e.g.
# f2i32, u2f16, f2f16_rtz.  Bool sources only convert to float/int/bool;
# same-base-type "conversions" exist only for size changes.
for src_t in [tint, tuint, tfloat, tbool]:
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three variants: explicit round-to-
              # nearest-even, explicit round-toward-zero, and a default that
              # consults the shader's execution mode at constant-fold time.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 32) {
                         dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                      } else if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 32) {
                         dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                      } else if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = """
                      if (bit_size > 32) {
                         if (nir_is_rounding_mode_rtz(execution_mode, 16))
                            dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                         else
                            dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                      } else if (bit_size > 16) {
                         if (nir_is_rounding_mode_rtz(execution_mode, 16))
                            dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                         else
                            dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # double -> float honors a round-toward-zero execution mode.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Everything else is a plain C conversion; conversion to bool
              # is a compare against zero.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
310
def unop_numeric_convert_mp(base, src_t, dst_t):
   """Declare the medium-precision ("mp") flavor of conversion *base*,
   reusing the constant expression of the corresponding 16-bit opcode.

   NOTE(review): judging by the callers below, ``src_t`` is actually the
   *output* type handed to unop_numeric_convert() and ``dst_t`` the input
   type -- the parameter names look swapped; confirm before renaming.
   """
   reference = base + "16"
   desc = """
Special opcode that is the same as :nir:alu-op:`{}` except that it is safe to
remove it if the result is immediately converted back to 32 bits again. This is
generated as part of the precision lowering pass. ``mp`` stands for medium
precision.
   """.format(reference)
   unop_numeric_convert(base + "mp", src_t, dst_t,
                        opcodes[reference].const_expr, description = desc)
320
unop_numeric_convert_mp("f2f", tfloat16, tfloat32)
unop_numeric_convert_mp("i2i", tint16, tint32)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert_mp("f2i", tint16, tfloat32)
unop_numeric_convert_mp("f2u", tuint16, tfloat32)
unop_numeric_convert_mp("i2f", tfloat16, tint32)
unop_numeric_convert_mp("u2f", tfloat16, tuint32)

# Unary floating-point rounding operations.
# Each picks the double or float C routine based on the run-time bit size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Round-trip through float16; values below 2^-14 (the smallest normal half)
# are flushed to a signed zero.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
349
350# Floating point pack and unpack operations.
351
def pack_2x16(fmt, in_type):
   """Declare pack_<fmt>_2x16: two components packed into one 32-bit word.

   Every "fmt" in the template (including inside pack_fmt_1x16) is replaced
   by the format name.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, in_type,
              template.replace("fmt", fmt))
357
def pack_4x8(fmt):
   """Declare pack_<fmt>_4x8: four float32 components packed into one
   32-bit word, 8 bits each."""
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32,
              template.replace("fmt", fmt))
365
def unpack_2x16(fmt):
   """Declare unpack_<fmt>_2x16: one 32-bit word unpacked into two float32
   components."""
   template = """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
"""
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32,
              template.replace("fmt", fmt))
371
def unpack_4x8(fmt):
   """Declare unpack_<fmt>_4x8: one 32-bit word unpacked into four float32
   components, 8 bits each."""
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32,
              template.replace("fmt", fmt))
379
380
# Instantiate the norm/half pack and unpack opcodes.
pack_2x16("snorm", tfloat)
pack_4x8("snorm")
pack_2x16("unorm", tfloat)
pack_4x8("unorm")
pack_2x16("half", tfloat32)
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")

unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
""", description = """
Convert two unsigned integers into a packed unsigned short (clamp is applied).
""")

unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
""", description = """
Convert two signed integers into a packed signed short (clamp is applied).
""")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact repacking between vectors and wider scalars (no clamping).
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")

unop_horiz("unpack_half_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16((uint16_t)(src0.x & 0xffff), nir_is_denorm_flush_to_zero(execution_mode, 16));
dst.y = unpack_half_1x16((uint16_t)(src0.x >> 16), nir_is_denorm_flush_to_zero(execution_mode, 16));
""")
444
# Lowered floating point unpacking operations.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff), nir_is_denorm_flush_to_zero(execution_mode, 16))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16), nir_is_denorm_flush_to_zero(execution_mode, 16))")


unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Find most-significant set bit; -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_convert("ufind_msb_rev", tint32, tuint, """
dst = -1;
for (int bit = 0; bit < bit_size; bit++) {
   if ((src0 << bit) & 0x80000000) {
      dst = bit;
      break;
   }
}
""")

# Count leading zeros; yields bit_size when the source is zero.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(bit_size - bit - 1);
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
       (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb_rev", tint32, """
dst = -1;
/* We are looking for the highest bit that's not the same as the sign bit. */
uint32_t sign = src0 & 0x80000000u;
for (int bit = 0; bit < 32; bit++) {
   if (((src0 << bit) & 0x80000000u) != sign) {
      dst = bit;
      break;
   }
}
""")

# Find least-significant set bit; -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
            description = "Sum of vector components")
543
def binop_convert(name, out_type, in_type1, alg_props, const_expr, description="", in_type2=None):
   """Two-source, per-component opcode whose result type differs from its
   source types.  The second source type defaults to the first."""
   second_type = in_type1 if in_type2 is None else in_type2
   opcode(name, 0, out_type, [0, 0], [in_type1, second_type],
          False, alg_props, const_expr, description)
549
def binop(name, ty, alg_props, const_expr, description = ""):
   """Two-source opcode whose result type matches its source type."""
   binop_convert(name, ty, ty, alg_props, const_expr, description)

def binop_compare(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Two-source comparison producing a 1-bit boolean."""
   binop_convert(name, tbool1, ty, alg_props, const_expr, description, ty2)

def binop_compare8(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Two-source comparison producing an 8-bit boolean."""
   binop_convert(name, tbool8, ty, alg_props, const_expr, description, ty2)

def binop_compare16(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Two-source comparison producing a 16-bit boolean."""
   binop_convert(name, tbool16, ty, alg_props, const_expr, description, ty2)

def binop_compare32(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Two-source comparison producing a 32-bit boolean."""
   binop_convert(name, tbool32, ty, alg_props, const_expr, description, ty2)

def binop_compare_all_sizes(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Declare every boolean-width variant of a comparison: <name> (1-bit)
   plus <name>8 / <name>16 / <name>32."""
   variants = (("", binop_compare), ("8", binop_compare8),
               ("16", binop_compare16), ("32", binop_compare32))
   for suffix, declare in variants:
      declare(name + suffix, ty, alg_props, const_expr, description, ty2)
570
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr, description = ""):
   """Two-source opcode with explicit ("horizontal") vector sizes for the
   destination and for each source."""
   opcode(name, out_size, out_type, [src1_size, src2_size],
          [src1_type, src2_type], False, "", const_expr, description)
575
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix="", description = ""):
   """Declare the 2/3/4/5/8/16-component reductions of a two-source opcode.

   Corresponding components of the two sources are combined with
   prereduce_expr, the results are folded together with reduce_expr (as a
   balanced tree for the power-of-two sizes), and the whole expression is
   wrapped in final_expr.
   """
   def final(src):
      return final_expr.format(src= "(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   def prereduce(src0, src1):
      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
   # One prereduced expression per component; components past w are named
   # e, f, g, ... to support vec8 and vec16.
   srcs = [prereduce("src0." + letter, "src1." + letter) for letter in "xyzwefghijklmnop"]
   def pairwise_reduce(start, size):
      # Balanced binary reduction tree over srcs[start : start + size].
      if (size == 1):
         return srcs[start]
      return reduce_(pairwise_reduce(start + size // 2, size // 2), pairwise_reduce(start, size // 2))
   for size in [2, 4, 8, 16]:
      opcode(name + str(size) + suffix, output_size, output_type,
             [size, size], [src_type, src_type], False, _2src_commutative,
             final(pairwise_reduce(0, size)), description)
   # vec3 and vec5 don't fit the power-of-two tree; spell them out.
   opcode(name + "3" + suffix, output_size, output_type,
          [3, 3], [src_type, src_type], False, _2src_commutative,
          final(reduce_(reduce_(srcs[2], srcs[1]), srcs[0])), description)
   opcode(name + "5" + suffix, output_size, output_type,
          [5, 5], [src_type, src_type], False, _2src_commutative,
          final(reduce_(srcs[4], reduce_(reduce_(srcs[3], srcs[2]),
                                         reduce_(srcs[1], srcs[0])))),
          description)
601
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr, description = ""):
   """Declare a reduction comparison for every boolean width: <name>
   returning bool1 plus b8/b16/b32 variants (e.g. ball_fequal ->
   b8all_fequal2, b16all_fequal2, ...).

   Bug fix: description was previously passed positionally and landed in
   binop_reduce's ``suffix`` parameter, so any non-empty description would
   have been appended to the opcode names instead of documenting them.  All
   existing callers use the default "", so behavior is unchanged for them.
   """
   binop_reduce(name, output_size, tbool1, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
612
# Floating-point add honors a round-toward-zero execution mode by computing
# in double and rounding explicitly.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
# Unsigned 64-bit math avoids C signed-overflow UB in the constant folder.
binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
         (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
         (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# |src0 - src1| computed in unsigned arithmetic to sidestep overflow.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
654
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")

binop("fmulz", tfloat32, _2src_commutative + associative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
else
   dst = src0 * src1;
""", description = """
Unlike :nir:alu-op:`fmul`, anything (even infinity or NaN) multiplied by zero is
always zero. ``fmulz(0.0, inf)`` and ``fmulz(0.0, nan)`` must be +/-0.0, even
if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If ``SIGNED_ZERO_PRESERVE`` is
used, then the result must be a positive zero if either operand is zero.
""")


binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""", description = "Low 32-bits of signed/unsigned integer multiply")

binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1",
              description = "Multiply signed 32-bit integers, 64-bit result")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1",
              description = "Multiply unsigned 32-bit integers, 64-bit result")

binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly. The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""", description = "High 32-bits of signed integer multiply")

binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""", description = "High 32-bits of unsigned integer multiply")

binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""", description = "Low 32-bits of unsigned integer multiply")

binop("imul_32x16", tint32, "", "src0 * (int16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with sign extension")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with zero extension")
743
binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero folds to 0 rather than trapping.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

binop_convert("uadd_carry", tuint, tuint, _2src_commutative,
              "src0 + src1 < src0",
              description = """
Return an integer (1 or 0) representing the carry resulting from the
addition of the two unsigned arguments.
   """)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1", description = """
Return an integer (1 or 0) representing the borrow resulting from the
subtraction of the two unsigned arguments.
   """)

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#             = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder". We follow the conventions used by LLVM and
# SPIR-V. The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation. For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
802
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
# The "u" variants are unordered: they also hold when either operand is NaN.
binop_compare_all_sizes("fltu", tfloat, "", "isnan(src0) || isnan(src1) || src0 < src1")
binop_compare_all_sizes("fgeu", tfloat, "", "isnan(src0) || isnan(src1) || src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("fequ", tfloat, _2src_commutative, "isnan(src0) || isnan(src1) || src0 == src1")
# fneo is the ordered not-equal: false when either operand is NaN.
binop_compare_all_sizes("fneo", tfloat, _2src_commutative, "!isnan(src0) && !isnan(src1) && src0 != src1")
binop_compare_all_sizes("funord", tfloat, _2src_commutative, "isnan(src0) || isnan(src1)")
binop_compare_all_sizes("ford", tfloat, _2src_commutative, "!isnan(src0) && !isnan(src1)")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# Test a single bit of src0, selected by the low bits of src1.
binop_compare_all_sizes("bitnz", tuint, "", "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x1",
                        "only uses the least significant bits like SM5 shifts", tuint32)

binop_compare_all_sizes("bitz", tuint, "", "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x0",
                        "only uses the least significant bits like SM5 shifts", tuint32)
832
# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes("ball_fequal", 1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
858
# Appended to the description of every shift opcode below; this text is part
# of the generated documentation, so it must not be reworded casually.
shift_note = """
SPIRV shifts are undefined for shift-operands >= bitsize,
but SM5 shifts are defined to use only the least significant bits.
The NIR definition is according to the SM5 specification.
"""

opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))",
       description = "Left shift." + shift_note)
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
       description = "Signed right-shift." + shift_note)
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
       description = "Unsigned right-shift." + shift_note)

opcode("udiv_aligned_4", 0, tuint, [0], [tuint], False, "",
       "src0 >> 2", description = "Divide a multiple of 4 by 4")

# Rotates: the shift amount is masked to the bit size, so rotating by 0 or by
# a multiple of the bit size is an identity.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")

# Funnel shift right: shifts the 64-bit concatenation {src0, src1} right by
# src2 (masked to 0..31) and returns the low 32 bits.
opcode("shfr", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   uint64_t src = src1 | ((uint64_t)src0 << 32);
   dst = src >> (src2 & rotate_mask);
""")
894
# Description template shared by the three bitwise opcodes below (runtime
# documentation text — do not edit casually).
bitwise_description = """
Bitwise {0}, also used as a boolean {0} for hardware supporting integers.
"""

binop("iand", tuint, _2src_commutative + associative, "src0 & src1",
      description = bitwise_description.format("AND"))
binop("ior", tuint, _2src_commutative + associative, "src0 | src1",
      description = bitwise_description.format("OR"))
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1",
      description = bitwise_description.format("XOR"))


binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot", 0, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}",
             suffix="_replicated")

# Homogeneous dot product: src0 is a vec3 implicitly extended with w = 1.0
# (note the bare "+ src1.w" term), dotted against a vec4.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
918
# The C fmin/fmax functions have implementation-defined behaviour for signed
# zeroes. However, SPIR-V requires:
#
#   fmin(-0, +0) = -0
#   fmax(+0, -0) = +0
#
# The NIR opcodes match SPIR-V. Furthermore, the NIR opcodes are commutative, so
# we must also ensure:
#
#   fmin(+0, -0) = -0
#   fmax(-0, +0) = +0
#
# To implement the constant folding, when the sources are equal, we use the
# min/max of the bit patterns which will order the signed zeroes while
# preserving all other values.
for op, macro in [("fmin", "MIN2"), ("fmax", "MAX2")]:
   binop(op, tfloat, _2src_commutative + associative,
         "bit_size == 64 ? " +
         f"(src0 == src1 ? uid({macro}((int64_t)dui(src0), (int64_t)dui(src1))) : {op}(src0, src1)) :"
         f"(src0 == src1 ? uif({macro}((int32_t)fui(src0), (int32_t)fui(src1))) : {op}f(src0, src1))")

binop("imin", tint, _2src_commutative + associative, "MIN2(src0, src1)")
binop("umin", tuint, _2src_commutative + associative, "MIN2(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "MAX2(src0, src1)")
binop("umax", tuint, _2src_commutative + associative, "MAX2(src0, src1)")

binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
946
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | ((uint32_t)(pack_half_1x16(src1.x)) << 16)")

# NOTE(review): unlike pack_half_2x16_split above, the (uint32_t) cast here is
# applied to the already-shifted value rather than to the packed half before
# shifting — presumably equivalent after C integer promotion; confirm this
# difference is intentional.
binop_horiz("pack_half_2x16_rtz_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16_rtz(src0.x) | (uint32_t)(pack_half_1x16_rtz(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
       False, "",
       "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")

binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""", description = """
Implements the behavior of the first operation of the SM5 "bfi" assembly
and that of the "bfi1" i965 instruction. That is, the bits and offset values
are from the low five bits of src0 and src1, respectively.
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""", description = """
Combines the first component of each input to make a 2-component vector.
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")

# Byte/word insertion
binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")
999
def triop(name, ty, alg_props, const_expr, description = ""):
   """Declare a three-source opcode whose sources and destination all use the
   per-instruction size 0 and share the type ``ty``.
   """
   opcode(name, 0, ty, [0] * 3, [ty] * 3, False,
          alg_props, const_expr, description)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr,
                description = ""):
   """Declare a three-source "horizontal" opcode: each source has an explicit
   component count and all operands are untyped unsigned integers.
   """
   src_sizes = [src1_size, src2_size, src3_size]
   opcode(name, output_size, tuint, src_sizes, [tuint] * 3,
          False, "", const_expr, description)
1008
# Fused multiply-add; the constant folding honors the shader's rounding mode
# (RTZ lowers to the _mesa_*_fma_rtz helpers, otherwise C fma/fmaf is used).
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

triop("ffmaz", tfloat32, _2src_commutative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0 + src2;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_float_fma_rtz(src0, src1, src2);
else
   dst = fmaf(src0, src1, src2);
""", description = """
Floating-point multiply-add with modified zero handling.

Unlike :nir:alu-op:`ffma`, anything (even infinity or NaN) multiplied by zero is
always zero. ``ffmaz(0.0, inf, src2)`` and ``ffmaz(0.0, nan, src2)`` must be
``+/-0.0 + src2``, even if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If
``SIGNED_ZERO_PRESERVE`` is used, then the result must be a positive
zero plus src2 if either src0 or src1 is zero.
""")

# Linear interpolation: src2 == 0 selects src0, src2 == 1 selects src1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

triop("iadd3", tint, _2src_commutative + associative, "src0 + src1 + src2",
      description = "Ternary addition")

triop("imad", tint, _2src_commutative + associative, "src0 * src1 + src2",
      description = "Integer multiply-add")
1049
# Description template for the conditional-select family below (runtime
# documentation text — do not edit casually).
csel_description = """
A vector conditional select instruction (like ?:, but operating per-
component on vectors). The condition is {} bool ({}).
"""

triop("fcsel", tfloat32, selection, "(src0 != 0.0f) ? src1 : src2",
      description = csel_description.format("a floating point", "0.0 vs 1.0"))
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 1-bit", "0 vs 1"))
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("an 8-bit", "0 vs ~0"))
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 16-bit", "0 vs ~0"))
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 32-bit", "0 vs ~0"))

# Selects based on a comparison of src0 against zero rather than a boolean.
triop("icsel_eqz", tint, selection, "(src0 == 0) ? src1 : src2")

triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")

triop("fcsel_gt", tfloat32, selection, "(src0 > 0.0f) ? src1 : src2")
triop("fcsel_ge", tfloat32, selection, "(src0 >= 0.0f) ? src1 : src2")
1077
# Shift the insert value up to the mask's lowest set bit, then merge under the
# mask (SM5 "bfi" second half; the mask typically comes from nir_op_bfm).
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""", description = "SM5 bfi assembly")


triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")

# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
1122
# GLSL bitfieldExtract()
# Unlike ubfe/ibfe above, offset and bits are signed and NOT masked; the
# out-of-range cases are undefined per the GLSL spec and fold to 0 here.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1148
triop("msad_4x8", tuint32, "", """
dst = msad(src0, src1, src2);
""", description = """
Masked sum of absolute differences with accumulation. Equivalent to AMD's v_msad_u8
instruction and DXIL's MSAD.

The first two sources contain packed 8-bit unsigned integers, the instruction
will calculate the absolute difference of integers when src0's is non-zero, and
then add them together. There is also a third source which is a 32-bit unsigned
integer and added to the result.
""")

# Four msad_4x8 results against successive byte offsets of the 64-bit window
# {src1.y, src1.x}, each with its own accumulator component.
opcode("mqsad_4x8", 4, tuint32, [1, 2, 4], [tuint32, tuint32, tuint32], False, "", """
uint64_t src = src1.x | ((uint64_t)src1.y << 32);
dst.x = msad(src0.x, src, src2.x);
dst.y = msad(src0.x, src >> 8, src2.y);
dst.z = msad(src0.x, src >> 16, src2.z);
dst.w = msad(src0.x, src >> 24, src2.w);
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1176
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source "horizontal" opcode: each source has an explicit
   component count and all operands are untyped unsigned integers.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   opcode(name, output_size, tuint, src_sizes, [tuint] * 4,
          False, "", const_expr)
1183
# GLSL bitfieldInsert(); like ubitfield_extract, out-of-range offset/bits are
# undefined and fold to 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# vec4..vec16 combine the first component of each input into one vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode("vec5", 5, tuint,
       [1] * 5, [tuint] * 5,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")

# An integer multiply instruction for address calculation. This is
# similar to imul, except that the results are undefined in case of
# overflow. Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (ie, from an load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. al * bh << 16 + c). It is used for lowering integer
# multiplication (imul) on Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0x0000ffff) << 16) * (src1 & 0xffff0000)) >> 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
def triop_shift_ir3(name, shift_op, bit_op):
    """Declare an ir3 fused shift-then-bitwise opcode: the masked shift of
    src0 by src1 is combined with src2 using ``bit_op``.
    """
    expr = "(src0 {0} (src1 & (sizeof(src0) * 8 - 1))) {1} src2".format(
        shift_op, bit_op)
    opcode(name, 0, tuint, [0, 0, 0], [tuint, tuint32, tuint], False, "", expr)
1278
# ir3 fused shift-and-bitwise opcodes: (src0 <shift> src1) <op> src2.
triop_shift_ir3("shrm_ir3", ">>", "&")
triop_shift_ir3("shlm_ir3", "<<", "&")
triop_shift_ir3("shrg_ir3", ">>", "|")
triop_shift_ir3("shlg_ir3", "<<", "|")
triop("andg_ir3", tuint, _2src_commutative, "(src0 & src1) | src2")
1284
# r600/gcn specific instruction that evaluates unnormalized cube texture coordinates
# and face index
# The actual texture coordinates are evaluated from this according to
#    dst.yx / abs(dst.z) + 1.5
unop_horiz("cube_amd", 4, tfloat32, 3, tfloat32, """
   dst.x = dst.y = dst.z = 0.0;
   float absX = fabsf(src0.x);
   float absY = fabsf(src0.y);
   float absZ = fabsf(src0.z);

   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }

   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
   }
   if (src0.x < 0 && absX >= absY && absX >= absZ) {
      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
   }
   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
   }
   if (src0.y < 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
   }
   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
   }
   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
   }
""")

# r600/gcn specific sin and cos
# these trigonometric functions need some lowering because the supported
# input values are expected to be normalized by dividing by (2 * pi)
unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")

# Byte-align: shifts the 64-bit concatenation {src0, src1} right by
# (src2 & 3) bytes and keeps the low 32 bits.
opcode("alignbyte_amd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
   uint64_t src = src1 | ((uint64_t)src0 << 32);
   dst = src >> ((src2 & 0x3) * 8);
""")

# Midgard specific sin and cos
# These expect their inputs to be divided by pi.
unop("fsin_mdg", tfloat, "sinf(3.141592653589793 * src0)")
unop("fcos_mdg", tfloat, "cosf(3.141592653589793 * src0)")

# AGX specific sin with input expressed in quadrants. Used in the lowering for
# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
# the angle is further decomposed by quadrant, sinc is computed, and the angle
# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
# additional ALU that NIR may be able to optimize.
unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")
1341
# AGX specific bitfield extraction from a pair of 32bit registers.
# src0,src1: the two registers
# src2: bit position of the LSB of the bitfield
# src3: number of bits in the bitfield if src3 > 0
# src3 = 0 is equivalent to src3 = 32
# NOTE: src3 is a nir constant by contract
opcode("extr_agx", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tuint32, tuint32], False, "", """
   uint32_t mask = 0xFFFFFFFF;
   uint8_t shift = src2 & 0x7F;
   if (src3 != 0) {
      mask = (1 << src3) - 1;
   }
   if (shift >= 64) {
      dst = 0;
   } else {
      dst = (((((uint64_t) src1) << 32) | (uint64_t) src0) >> shift) & mask;
   }
""");

# AGX multiply-shift-add. Corresponds to iadd/isub/imad/imsub instructions.
# The shift must be <= 4 (domain restriction). For performance, it should be
# constant.
opcode("imadshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", f"(src0 * src1) + (src2 << src3)")
opcode("imsubshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", f"(src0 * src1) - (src2 << src3)")

# Address arithmetic instructions: extend, shift, and add
# Shift must be a small constant.
# ilea sign-extends src1 before shifting; ulea zero-extends it.
opcode("ilea_agx", 0, tuint64, [0, 0, 0], [tuint64, tint32, tuint32], False,
       "", f"src0 + (((int64_t)src1) << src2)")
opcode("ulea_agx", 0, tuint64, [0, 0, 0], [tuint64, tuint32, tuint32], False,
       "", f"src0 + (((uint64_t)src1) << src2)")

# Bounds check instruction.
#
# Sources: <data, end offset, bounds>
opcode("bounds_agx", 0, tint, [0, 0, 0],
       [tint, tint, tint], False,
       "", "src1 <= src2 ? src0 : 0")
1383
binop_convert("interleave_agx", tuint32, tuint16, "", """
      dst = 0;
      for (unsigned bit = 0; bit < 16; bit++) {
          dst |= (src0 & (1 << bit)) << bit;
          dst |= (src1 & (1 << bit)) << (bit + 1);
      }""", description="""
      Interleave bits of 16-bit integers to calculate a 32-bit integer. This can
      be used as-is for Morton encoding.
      """)

# These are like fmin/fmax, but do not flush denorms on the output which is why
# they're modeled as conversions. AGX flushes fp32 denorms but preserves fp16
# denorms, so fp16 fmin/fmax work without lowering.
binop_convert("fmin_agx", tuint32, tfloat32, _2src_commutative + associative,
              "(src0 < src1 || isnan(src1)) ? src0 : src1")
binop_convert("fmax_agx", tuint32, tfloat32, _2src_commutative + associative,
              "(src0 > src1 || isnan(src1)) ? src0 : src1")

# NVIDIA PRMT
# Each nibble of src0 selects one byte of the {src2, src1} pair; bit 3 of the
# nibble replicates that byte's sign bit instead.
opcode("prmt_nv", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, "", """
   dst = 0;
   for (unsigned i = 0; i < 4; i++) {
      uint8_t byte = (src0 >> (i * 4)) & 0x7;
      uint8_t x = byte < 4 ? (src1 >> (byte * 8))
                           : (src2 >> ((byte - 4) * 8));
      if ((src0 >> (i * 4)) & 0x8)
         x = ((int8_t)x) >> 7;
      dst |= ((uint32_t)x) << i * 8;
   }""")
1414
# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
# NOTE(review): declared with tint32 even though the comment and expression
# are unsigned — confirm against nir_op_info before changing.
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# relaxed versions of the above, which assume input is in the 24bit range (no clamping)
binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")

# Floating-point classification queries.
unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")
1435
# vc4-specific opcodes

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8_vc4", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8_vc4", tuint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   uint32_t src0_chan = (src0 >> i) & 0xff;
   uint32_t src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
1482
# v3d-specific opcodes

# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
# r11g11b10 bits, rounding to nearest even, so
#   dst[10:0]  = float16_to_float11 (src0[15:0])
#   dst[21:11] = float16_to_float11 (src0[31:16])
#   dst[31:22] = float16_to_float10 (src1[15:0])
binop_convert("pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
              "pack_32_to_r11g11b10_v3d(src0, src1)")

# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
# difference with pack_32_2x16_split is that the sources are 32bit too. So it
# receives 2 32-bit integer, and packs the lower halfword as 2x16 on a 32-bit
# integer.
binop_horiz("pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
            "(src0.x & 0xffff) | (src1.x << 16)")

# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
# r10g10b10a2:
#   dst[9:0]   = src0[9:0]
#   dst[19:10] = src0[25:16]
#   dst[29:20] = src1[9:0]
#   dst[31:30] = src1[17:16]
binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
              "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")

# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
#   dst[7:0]   = src0[7:0]
#   dst[15:8]  = src0[23:16]
#   dst[23:16] = src1[7:0]
#   dst[31:24] = src1[23:16]
opcode("pack_4x16_to_4x8_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
       False, "",
       "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")

# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
unop("pack_2x16_to_unorm_2x8_v3d", tuint32,
     "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
unop("pack_2x16_to_snorm_2x8_v3d", tuint32,
     "_mesa_half_to_snorm(src0 & 0xffff, 8) | ((uint32_t)(_mesa_half_to_snorm(src0 >> 16, 8)) << 16)")

# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
unop("f2unorm_16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
unop("f2snorm_16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")

# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
unop("pack_2x16_to_unorm_2x10_v3d", tuint32, "pack_2x16_to_unorm_2x10(src0)")

# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
# and one 10 bit unorm
unop("pack_2x16_to_unorm_10_2_v3d", tuint32, "pack_2x16_to_unorm_10_2(src0)")

# These opcodes are used by Mali and V3D
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))

opcode("b32fcsel_mdg", 0, tuint, [0, 0, 0],
       [tbool32, tfloat, tfloat], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 32-bit", "0 vs ~0") + """
       This Midgard-specific variant takes floating-point sources, rather than
       integer sources. That includes support for floating point modifiers in
       the backend.
       """)
1546
# DXIL specific double [un]pack
# DXIL doesn't support generic [un]pack instructions, so we want those
# lowered to bit ops. HLSL doesn't support 64bit bitcasts to/from
# double, only [un]pack. Technically DXIL does, but considering they
# can't be generated from HLSL, we want to match what would be coming from DXC.
# This is essentially just the standard [un]pack, except that it doesn't get
# lowered so we can handle it in the backend and turn it into MakeDouble/SplitDouble
unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
1558
1559# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32. The int8
1560# components are sign-extended to 32-bits, and a dot-product is performed on
1561# the resulting vectors. src2 is added to the result of the dot-product.
1562opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1563 False, _2src_commutative, """
1564 const int32_t v0x = (int8_t)(src0 );
1565 const int32_t v0y = (int8_t)(src0 >> 8);
1566 const int32_t v0z = (int8_t)(src0 >> 16);
1567 const int32_t v0w = (int8_t)(src0 >> 24);
1568 const int32_t v1x = (int8_t)(src1 );
1569 const int32_t v1y = (int8_t)(src1 >> 8);
1570 const int32_t v1z = (int8_t)(src1 >> 16);
1571 const int32_t v1w = (int8_t)(src1 >> 24);
1572
1573 dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1574""")
1575
1576# Like sdot_4x8_iadd, but unsigned.
1577opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
1578 False, _2src_commutative, """
1579 const uint32_t v0x = (uint8_t)(src0 );
1580 const uint32_t v0y = (uint8_t)(src0 >> 8);
1581 const uint32_t v0z = (uint8_t)(src0 >> 16);
1582 const uint32_t v0w = (uint8_t)(src0 >> 24);
1583 const uint32_t v1x = (uint8_t)(src1 );
1584 const uint32_t v1y = (uint8_t)(src1 >> 8);
1585 const uint32_t v1z = (uint8_t)(src1 >> 16);
1586 const uint32_t v1w = (uint8_t)(src1 >> 24);
1587
1588 dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1589""")
1590
# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
# src2 is an int32. The 8-bit components are extended to 32-bits (src0
# sign-extended, src1 zero-extended), and a dot-product is performed on the
# resulting vectors. src2 is added to the result of the dot-product.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative
opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int32_t v0x = (int8_t)(src0 );
   const int32_t v0y = (int8_t)(src0 >> 8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1 );
   const uint32_t v1y = (uint8_t)(src1 >> 8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
1611
# Like sdot_4x8_iadd, but the result is clamped to the range
# [-0x80000000, 0x7fffffff].  The intermediate math is widened to 64 bits so
# the accumulation cannot overflow before the clamp is applied.
opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int8_t)(src0 );
   const int64_t v0y = (int8_t)(src0 >> 8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const int64_t v1x = (int8_t)(src1 );
   const int64_t v1y = (int8_t)(src1 >> 8);
   const int64_t v1z = (int8_t)(src1 >> 16);
   const int64_t v1w = (int8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
1628
# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
# The intermediate math is widened to 64 bits so the accumulation cannot
# overflow before the clamp is applied.
# NOTE(review): output and src2 are declared tint32 here while the
# non-saturating udot_4x8_uadd uses tuint32 throughout -- confirm this
# asymmetry is intentional.
opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint8_t)(src0 );
   const uint64_t v0y = (uint8_t)(src0 >> 8);
   const uint64_t v0z = (uint8_t)(src0 >> 16);
   const uint64_t v0w = (uint8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1 );
   const uint64_t v1y = (uint8_t)(src1 >> 8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1645
# Like sudot_4x8_iadd, but the result is clamped to the range
# [-0x80000000, 0x7fffffff].  The intermediate math is widened to 64 bits so
# the accumulation cannot overflow before the clamp is applied.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative
opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int64_t v0x = (int8_t)(src0 );
   const int64_t v0y = (int8_t)(src0 >> 8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1 );
   const uint64_t v1y = (uint8_t)(src1 >> 8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
1665
# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32. The int16
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors. src2 is added to the result of the dot-product.
# The accumulation is done at 32-bit precision with no saturation; see
# sdot_2x16_iadd_sat for the clamping variant.
opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int32_t v0x = (int16_t)(src0 );
   const int32_t v0y = (int16_t)(src0 >> 16);
   const int32_t v1x = (int16_t)(src1 );
   const int32_t v1y = (int16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")
1678
# Like sdot_2x16_iadd, but unsigned: the uint16 components are zero-extended
# before the dot-product, and all three sources and the result are unsigned.
opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint32_t v0x = (uint16_t)(src0 );
   const uint32_t v0y = (uint16_t)(src0 >> 16);
   const uint32_t v1x = (uint16_t)(src1 );
   const uint32_t v1y = (uint16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")
1689
# Like sdot_2x16_iadd, but the result is clamped to the range
# [-0x80000000, 0x7fffffff].  The intermediate math is widened to 64 bits so
# the accumulation cannot overflow before the clamp is applied.
opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int16_t)(src0 );
   const int64_t v0y = (int16_t)(src0 >> 16);
   const int64_t v1x = (int16_t)(src1 );
   const int64_t v1y = (int16_t)(src1 >> 16);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
1702
# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff].
# The intermediate math is widened to 64 bits so the accumulation cannot
# overflow before the clamp is applied.
# NOTE(review): output and src2 are declared tint32 here while the
# non-saturating udot_2x16_uadd uses tuint32 throughout -- confirm this
# asymmetry is intentional.
opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint16_t)(src0 );
   const uint64_t v0y = (uint16_t)(src0 >> 16);
   const uint64_t v1x = (uint16_t)(src1 );
   const uint64_t v1y = (uint16_t)(src1 >> 16);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")