mirror of OpenBSD xenocara tree github.com/openbsd/xenocara
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "nir.h"
#include "nir_builder.h"

#include <float.h>
#include <math.h>

/*
 * Lowers some unsupported double operations, using only:
 *
 * - pack/unpackDouble2x32
 * - conversion to/from single-precision
 * - double add, mul, and fma
 * - conditional select
 * - 32-bit integer and floating point arithmetic
 */

/* Creates a double with the exponent bits set to a given integer value */
static nir_def *
set_exponent(nir_builder *b, nir_def *src, nir_def *exp)
{
   /* Split into bits 0-31 and 32-63 */
   nir_def *lo = nir_unpack_64_2x32_split_x(b, src);
   nir_def *hi = nir_unpack_64_2x32_split_y(b, src);

   /* The exponent is bits 52-62 of the double, i.e. bits 20-30 of the high
    * word, so insert the new exponent there.
    */
   nir_def *new_hi = nir_bitfield_insert(b, hi, exp,
                                         nir_imm_int(b, 20),
                                         nir_imm_int(b, 11));
   /* recombine */
   return nir_pack_64_2x32_split(b, lo, new_hi);
}

static nir_def *
get_exponent(nir_builder *b, nir_def *src)
{
   /* get bits 32-63 */
   nir_def *hi = nir_unpack_64_2x32_split_y(b, src);

   /* extract bits 20-30 of the high word */
   return nir_ubitfield_extract(b, hi, nir_imm_int(b, 20), nir_imm_int(b, 11));
}

/* Return infinity with the sign of the given source, which is +/-0 */

static nir_def *
get_signed_inf(nir_builder *b, nir_def *zero)
{
   nir_def *zero_hi = nir_unpack_64_2x32_split_y(b, zero);

   /* The bit pattern for infinity is 0x7ff0000000000000, where the sign bit
    * is the highest bit. Only the sign bit can be non-zero in the passed in
    * source. So we essentially need to OR the infinity and the zero, except
    * the low 32 bits are always 0 so we can construct the correct high 32
    * bits and then pack it together with zero low 32 bits.
    */
   nir_def *inf_hi = nir_ior_imm(b, zero_hi, 0x7ff00000);
   return nir_pack_64_2x32_split(b, nir_imm_int(b, 0), inf_hi);
}
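/* Illustrative only (not part of the pass): on the host, the exponent-field
 * manipulation that set_exponent()/get_exponent() build in NIR could be
 * written with ordinary integer ops on the IEEE-754 bit pattern; "bits",
 * "d" and "new_exp" here are assumed names for the sketch:
 *
 *    uint64_t bits;
 *    memcpy(&bits, &d, sizeof(bits));
 *    unsigned exp = (bits >> 52) & 0x7ff;                // get_exponent
 *    bits = (bits & ~(0x7ffull << 52)) |
 *           ((uint64_t)new_exp << 52);                   // set_exponent
 *    memcpy(&d, &bits, sizeof(d));
 */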
/* Return a correctly signed zero based on src, if we care. */
static nir_def *
get_signed_zero(nir_builder *b, nir_def *src)
{
   uint32_t exec_mode = b->fp_fast_math;

   nir_def *zero;
   if (nir_is_float_control_signed_zero_preserve(exec_mode, 64)) {
      nir_def *hi = nir_unpack_64_2x32_split_y(b, src);
      nir_def *sign = nir_iand_imm(b, hi, 0x80000000);
      zero = nir_pack_64_2x32_split(b, nir_imm_int(b, 0), sign);
   } else {
      zero = nir_imm_double(b, 0.0f);
   }

   return zero;
}

static nir_def *
preserve_nan(nir_builder *b, nir_def *src, nir_def *res)
{
   uint32_t exec_mode = b->fp_fast_math;

   if (nir_is_float_control_nan_preserve(exec_mode, 64)) {
      nir_def *is_nan = nir_fneu(b, src, src);
      return nir_bcsel(b, is_nan, src, res);
   }

   return res;
}

/*
 * Generates the correctly-signed infinity if the source was zero, and flushes
 * the result to 0 if the source was infinity or the calculated exponent was
 * too small to be representable.
 */

static nir_def *
fix_inv_result(nir_builder *b, nir_def *res, nir_def *src,
               nir_def *exp)
{
   /* If the exponent is too small or the original input was infinity,
    * force the result to 0 (flush denorms) to avoid the work of handling
    * denorms properly. If we are asked to preserve NaN, do so; otherwise
    * NaN also gets the flushed result.
    */
   res = nir_bcsel(b, nir_ior(b, nir_ile_imm(b, exp, 0), nir_feq_imm(b, nir_fabs(b, src), INFINITY)),
                   get_signed_zero(b, src), res);
   res = preserve_nan(b, src, res);

   /* If the original input was 0, generate the correctly-signed infinity */
   res = nir_bcsel(b, nir_fneu_imm(b, src, 0.0f),
                   res, get_signed_inf(b, src));

   return res;
}

static nir_def *
lower_rcp(nir_builder *b, nir_def *src)
{
   /* normalize the input to avoid range issues */
   nir_def *src_norm = set_exponent(b, src, nir_imm_int(b, 1023));

   /* cast to float, do an rcp, and then cast back to get an approximate
    * result
    */
   nir_def *ra = nir_f2f64(b, nir_frcp(b, nir_f2f32(b, src_norm)));

   /* Fixup the exponent of the result - note that we check if this is too
    * small below.
    */
   nir_def *new_exp = nir_isub(b, get_exponent(b, ra),
                               nir_iadd_imm(b, get_exponent(b, src),
                                            -1023));

   ra = set_exponent(b, ra, new_exp);

   /* Do a few Newton-Raphson steps to improve precision.
    *
    * Each step doubles the precision, and we started off with around 24 bits,
    * so we only need to do 2 steps to get to full precision. The step is:
    *
    * x_new = x * (2 - x*src)
    *
    * But we can re-arrange this to improve precision by using another fused
    * multiply-add:
    *
    * x_new = x + x * (1 - x*src)
    *
    * See https://en.wikipedia.org/wiki/Division_algorithm for more details.
    */

   ra = nir_ffma(b, nir_fneg(b, ra), nir_ffma_imm2(b, ra, src, -1), ra);
   ra = nir_ffma(b, nir_fneg(b, ra), nir_ffma_imm2(b, ra, src, -1), ra);

   return fix_inv_result(b, ra, src, new_exp);
}
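/* A minimal host-side sketch (assumed names, illustrative only) of the
 * Newton-Raphson refinement done by lower_rcp() above, starting from a
 * single-precision estimate; each step roughly doubles the number of
 * correct bits:
 *
 *    double nr_rcp(double src)
 *    {
 *       double x = (double)(1.0f / (float)src);   // ~24-bit estimate
 *       x = x + x * (1.0 - x * src);              // ~48 bits
 *       x = x + x * (1.0 - x * src);              // full precision
 *       return x;
 *    }
 *
 * The real pass additionally normalizes the exponent first and patches it
 * back afterwards, which this sketch omits.
 */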
static nir_def *
lower_sqrt_rsq(nir_builder *b, nir_def *src, bool sqrt)
{
   /* We want to compute:
    *
    * 1/sqrt(m * 2^e)
    *
    * When the exponent is even, this is equivalent to:
    *
    * 1/sqrt(m) * 2^(-e/2)
    *
    * and when the exponent is odd, this is equal to:
    *
    * 1/sqrt(m * 2) * 2^(-(e - 1)/2)
    *
    * where the m * 2 is absorbed into the exponent. So we want the exponent
    * inside the square root to be 1 if e is odd and 0 if e is even, and we
    * want to subtract off e/2 from the final exponent, rounded to negative
    * infinity. We can do the former by first computing the unbiased exponent,
    * and then AND'ing it with 1 to get 0 or 1, and we can do the latter by
    * shifting right by 1.
    */

   nir_def *unbiased_exp = nir_iadd_imm(b, get_exponent(b, src),
                                        -1023);
   nir_def *even = nir_iand_imm(b, unbiased_exp, 1);
   nir_def *half = nir_ishr_imm(b, unbiased_exp, 1);

   nir_def *src_norm = set_exponent(b, src,
                                    nir_iadd_imm(b, even, 1023));

   nir_def *ra = nir_f2f64(b, nir_frsq(b, nir_f2f32(b, src_norm)));
   nir_def *new_exp = nir_isub(b, get_exponent(b, ra), half);
   ra = set_exponent(b, ra, new_exp);

   /*
    * The following implements an iterative algorithm that's very similar
    * between sqrt and rsqrt. We start with an iteration of Goldschmidt's
    * algorithm, which looks like:
    *
    * a = the source
    * y_0 = initial (single-precision) rsqrt estimate
    *
    * h_0 = .5 * y_0
    * g_0 = a * y_0
    * r_0 = .5 - h_0 * g_0
    * g_1 = g_0 * r_0 + g_0
    * h_1 = h_0 * r_0 + h_0
    *
    * Now g_1 ~= sqrt(a), and h_1 ~= 1/(2 * sqrt(a)). We could continue
    * applying another round of Goldschmidt, but since we would never refer
    * back to a (the original source), we would add too much rounding error.
    * So instead, we do one last round of Newton-Raphson, which has better
    * rounding characteristics, to get the final rounding correct. This is
    * split into two cases:
    *
    * 1. sqrt
    *
    * Normally, doing a round of Newton-Raphson for sqrt involves taking a
    * reciprocal of the original estimate, which is slow since it isn't
    * supported in HW. But we can take advantage of the fact that we already
    * computed a good estimate of 1/(2 * g_1) by rearranging it like so:
    *
    * g_2 = .5 * (g_1 + a / g_1)
    *     = g_1 + .5 * (a / g_1 - g_1)
    *     = g_1 + (.5 / g_1) * (a - g_1^2)
    *     = g_1 + h_1 * (a - g_1^2)
    *
    * The second term represents the error, and by splitting it out we can get
    * better precision by computing it as part of a fused multiply-add. Since
    * both Newton-Raphson and Goldschmidt approximately double the precision of
    * the result, these two steps should be enough.
    *
    * 2. rsqrt
    *
    * First off, note that the first round of the Goldschmidt algorithm is
    * really just a Newton-Raphson step in disguise:
    *
    * h_1 = h_0 * (.5 - h_0 * g_0) + h_0
    *     = h_0 * (1.5 - h_0 * g_0)
    *     = h_0 * (1.5 - .5 * a * y_0^2)
    *     = (.5 * y_0) * (1.5 - .5 * a * y_0^2)
    *
    * which is the standard formula multiplied by .5. Unlike in the sqrt case,
    * we don't need the inverse to do a Newton-Raphson step; we just need h_1,
    * so we can skip the calculation of g_1. Instead, we simply do another
    * Newton-Raphson step:
    *
    * y_1 = 2 * h_1
    * r_1 = .5 - h_1 * y_1 * a
    * y_2 = y_1 * r_1 + y_1
    *
    * Where the difference from Goldschmidt is that we calculate y_1 * a
    * instead of using g_1. Doing it this way should be as fast as computing
    * y_1 up front instead of h_1, and it lets us share the code for the
    * initial Goldschmidt step with the sqrt case.
    *
    * Putting it together, the computations are:
    *
    * h_0 = .5 * y_0
    * g_0 = a * y_0
    * r_0 = .5 - h_0 * g_0
    * h_1 = h_0 * r_0 + h_0
    * if sqrt:
    *    g_1 = g_0 * r_0 + g_0
    *    r_1 = a - g_1 * g_1
    *    g_2 = h_1 * r_1 + g_1
    * else:
    *    y_1 = 2 * h_1
    *    r_1 = .5 - y_1 * (h_1 * a)
    *    y_2 = y_1 * r_1 + y_1
    *
    * For more on the ideas behind this, see "Software Division and Square
    * Root Using Goldschmidt's Algorithms" by Markstein and the Wikipedia page
    * on square roots
    * (https://en.wikipedia.org/wiki/Methods_of_computing_square_roots).
    */

   nir_def *one_half = nir_imm_double(b, 0.5);
   nir_def *h_0 = nir_fmul(b, one_half, ra);
   nir_def *g_0 = nir_fmul(b, src, ra);
   nir_def *r_0 = nir_ffma(b, nir_fneg(b, h_0), g_0, one_half);
   nir_def *h_1 = nir_ffma(b, h_0, r_0, h_0);
   nir_def *res;
   if (sqrt) {
      nir_def *g_1 = nir_ffma(b, g_0, r_0, g_0);
      nir_def *r_1 = nir_ffma(b, nir_fneg(b, g_1), g_1, src);
      res = nir_ffma(b, h_1, r_1, g_1);
   } else {
      nir_def *y_1 = nir_fmul_imm(b, h_1, 2.0);
      nir_def *r_1 = nir_ffma(b, nir_fneg(b, y_1), nir_fmul(b, h_1, src),
                              one_half);
      res = nir_ffma(b, y_1, r_1, y_1);
   }

   uint32_t exec_mode = b->fp_fast_math;
   if (sqrt) {
      /* Here, the special cases we need to handle are
       * 0 -> 0 (sign preserving)
       * +inf -> +inf
       * -inf -> NaN
       * NaN -> NaN
       */
      /* Denorm flushing/preserving isn't part of the per-instruction bits, so
       * check the execution mode for it.
       */
      uint32_t shader_exec_mode = b->shader->info.float_controls_execution_mode;
      nir_def *src_flushed = src;
      if (!nir_is_denorm_preserve(shader_exec_mode, 64)) {
         src_flushed = nir_bcsel(b,
                                 nir_flt_imm(b, nir_fabs(b, src), DBL_MIN),
                                 get_signed_zero(b, src),
                                 src);
      }
      res = nir_bcsel(b, nir_ior(b, nir_feq_imm(b, src_flushed, 0.0), nir_feq_imm(b, src, INFINITY)),
                      src_flushed, res);
      res = preserve_nan(b, src, res);
   } else {
      res = fix_inv_result(b, res, src, new_exp);
   }

   if (nir_is_float_control_nan_preserve(exec_mode, 64))
      res = nir_bcsel(b, nir_feq_imm(b, src, -INFINITY),
                      nir_imm_double(b, NAN), res);

   return res;
}
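/* Illustrative host-side version (assumed names, not part of the pass) of
 * the Goldschmidt + Newton-Raphson scheme above, for the sqrt case:
 *
 *    double gs_sqrt(double a)
 *    {
 *       double y0 = (double)(1.0f / sqrtf((float)a)); // rsqrt estimate
 *       double h0 = 0.5 * y0;
 *       double g0 = a * y0;
 *       double r0 = 0.5 - h0 * g0;
 *       double g1 = g0 * r0 + g0;   // ~sqrt(a)
 *       double h1 = h0 * r0 + h0;   // ~1/(2*sqrt(a))
 *       double r1 = a - g1 * g1;    // residual error
 *       return g1 + h1 * r1;        // Newton-Raphson correction
 *    }
 *
 * The exponent normalization and special-case fixups done by the pass are
 * omitted here.
 */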
static nir_def *
lower_trunc(nir_builder *b, nir_def *src)
{
   nir_def *unbiased_exp = nir_iadd_imm(b, get_exponent(b, src),
                                        -1023);

   nir_def *frac_bits = nir_isub_imm(b, 52, unbiased_exp);

   /*
    * Decide the operation to apply depending on the unbiased exponent:
    *
    * if (unbiased_exp < 0)
    *    return 0
    * else if (unbiased_exp > 52)
    *    return src
    * else
    *    return src & (~0 << frac_bits)
    *
    * Notice that the else branch is a 64-bit integer operation that we need
    * to implement in terms of 32-bit integer arithmetic (at least until we
    * support 64-bit integer arithmetic).
    */

   /* Compute "~0 << frac_bits" in terms of hi/lo 32-bit integer math */
   nir_def *mask_lo =
      nir_bcsel(b,
                nir_ige_imm(b, frac_bits, 32),
                nir_imm_int(b, 0),
                nir_ishl(b, nir_imm_int(b, ~0), frac_bits));

   nir_def *mask_hi =
      nir_bcsel(b,
                nir_ilt_imm(b, frac_bits, 33),
                nir_imm_int(b, ~0),
                nir_ishl(b,
                         nir_imm_int(b, ~0),
                         nir_iadd_imm(b, frac_bits, -32)));

   nir_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
   nir_def *src_hi = nir_unpack_64_2x32_split_y(b, src);

   return nir_bcsel(b,
                    nir_ilt_imm(b, unbiased_exp, 0),
                    get_signed_zero(b, src),
                    nir_bcsel(b, nir_ige_imm(b, unbiased_exp, 53),
                              src,
                              nir_pack_64_2x32_split(b,
                                                     nir_iand(b, mask_lo, src_lo),
                                                     nir_iand(b, mask_hi, src_hi))));
}
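/* Worked example (illustrative only): for src = 10.5 the unbiased exponent
 * is 3, so frac_bits = 52 - 3 = 49. Since 49 >= 32, mask_lo = 0 and
 * mask_hi = ~0 << (49 - 32) = 0xfffe0000; AND'ing clears all 49 fraction
 * bits below the integer part, leaving exactly 10.0.
 */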
static nir_def *
lower_floor(nir_builder *b, nir_def *src)
{
   /*
    * For x >= 0, floor(x) = trunc(x)
    * For x < 0,
    *    - if x is integer, floor(x) = x
    *    - otherwise, floor(x) = trunc(x) - 1
    */
   nir_def *tr = nir_ftrunc(b, src);
   nir_def *positive = nir_fge_imm(b, src, 0.0);
   return nir_bcsel(b,
                    nir_ior(b, positive, nir_feq(b, src, tr)),
                    tr,
                    nir_fadd_imm(b, tr, -1.0));
}

static nir_def *
lower_ceil(nir_builder *b, nir_def *src)
{
   /* if x < 0,                    ceil(x) = trunc(x)
    * else if (x - trunc(x) == 0), ceil(x) = x
    * else,                        ceil(x) = trunc(x) + 1
    */
   nir_def *tr = nir_ftrunc(b, src);
   nir_def *negative = nir_flt_imm(b, src, 0.0);
   return nir_bcsel(b,
                    nir_ior(b, negative, nir_feq(b, src, tr)),
                    tr,
                    nir_fadd_imm(b, tr, 1.0));
}

static nir_def *
lower_fract(nir_builder *b, nir_def *src)
{
   return nir_fsub(b, src, nir_ffloor(b, src));
}

static nir_def *
lower_round_even(nir_builder *b, nir_def *src)
{
   /* Add and subtract 2**52 to round off any fractional bits. */
   nir_def *two52 = nir_imm_double(b, (double)(1ull << 52));
   nir_def *sign = nir_iand_imm(b, nir_unpack_64_2x32_split_y(b, src),
                                1ull << 31);

   b->exact = true;
   nir_def *res = nir_fsub(b, nir_fadd(b, nir_fabs(b, src), two52), two52);
   b->exact = false;

   return nir_bcsel(b, nir_flt(b, nir_fabs(b, src), two52),
                    nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, res),
                                           nir_ior(b, nir_unpack_64_2x32_split_y(b, res), sign)),
                    src);
}
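/* Why adding and subtracting 2^52 rounds (illustrative): once |x| + 2^52 is
 * computed, the result's exponent is at least 52, so no fraction bits remain
 * and the add itself rounds to nearest-even. E.g. |x| = 2.5 gives
 * 2^52 + 2.5, which rounds to 2^52 + 2 (ties-to-even), and subtracting 2^52
 * leaves 2.0. The b->exact guard keeps the optimizer from folding the
 * add/subtract pair away.
 */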
static nir_def *
lower_mod(nir_builder *b, nir_def *src0, nir_def *src1)
{
   /* mod(x, y) = x - y * floor(x/y)
    *
    * If the division is lowered, it could add some rounding errors that make
    * floor() return the quotient minus one when x = N * y. If this is the
    * case, we should return zero because the output of mod(x, y) is in
    * [0, y). But fortunately the Vulkan spec allows this kind of error; from
    * the Vulkan spec, appendix A (Precision and Operation of SPIR-V
    * instructions):
    *
    *    "The OpFRem and OpFMod instructions use cheap approximations of
    *     remainder, and the error can be large due to the discontinuity in
    *     trunc() and floor(). This can produce mathematically unexpected
    *     results in some cases, such as FMod(x,x) computing x rather than 0,
    *     and can also cause the result to have a different sign than the
    *     infinitely precise result."
    *
    * In practice this means the output value is actually in the interval
    * [0, y].
    *
    * While Vulkan states this behaviour explicitly, OpenGL does not, and thus
    * we need to assume that the value should be in range [0, y); but on the
    * other hand, mod(a, b) is defined as "a - b * floor(a/b)" and OpenGL
    * allows for some error in division, so a/a could actually end up being
    * 1.0 - 1ULP; in this case floor(a/a) would end up as 0, and hence
    * mod(a, a) == a.
    *
    * In summary, in practice mod(a, a) can be "a" for both OpenGL and
    * Vulkan.
    */
   nir_def *floor = nir_ffloor(b, nir_fdiv(b, src0, src1));

   return nir_fsub(b, src0, nir_fmul(b, src1, floor));
}

static nir_def *
lower_minmax(nir_builder *b, nir_op cmp, nir_def *src0, nir_def *src1)
{
   b->exact = true;
   nir_def *src1_is_nan = nir_fneu(b, src1, src1);
   nir_def *cmp_res = nir_build_alu2(b, cmp, src0, src1);
   b->exact = false;
   nir_def *take_src0 = nir_ior(b, src1_is_nan, cmp_res);

   /* IEEE-754-2019 requires that fmin/fmax compare -0 < 0, but -0 and 0 are
    * indistinguishable for flt/fge. So, we fix up signed zeroes.
    */
   if (nir_is_float_control_signed_zero_preserve(b->fp_fast_math, 64)) {
      nir_def *src0_is_negzero = nir_ieq_imm(b, src0, 1ull << 63);
      nir_def *src1_is_poszero = nir_ieq_imm(b, src1, 0x0);
      nir_def *neg_pos_zero = nir_iand(b, src0_is_negzero, src1_is_poszero);

      if (cmp == nir_op_flt) {
         take_src0 = nir_ior(b, take_src0, neg_pos_zero);
      } else {
         assert(cmp == nir_op_fge);
         take_src0 = nir_iand(b, take_src0, nir_inot(b, neg_pos_zero));
      }
   }

   return nir_bcsel(b, take_src0, src0, src1);
}

static nir_def *
lower_sat(nir_builder *b, nir_def *src)
{
   b->exact = true;
   /* This will get lowered again if nir_lower_dminmax is set */
   nir_def *sat = nir_fclamp(b, src, nir_imm_double(b, 0),
                             nir_imm_double(b, 1));
   b->exact = false;
   return sat;
}
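/* Illustrative only: why lower_minmax()'s signed-zero fixup is needed.
 * With src0 = -0.0 and src1 = +0.0, flt(src0, src1) is false, so fmin would
 * otherwise return src1 = +0.0, while IEEE-754-2019 requires
 * fmin(-0, +0) = -0.0. The integer test against 1ull << 63 (the sign bit of
 * a 64-bit zero) catches exactly this pair and forces src0 to be taken for
 * fmin (and dropped for fmax).
 */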
static nir_def *
lower_doubles_instr_to_soft(nir_builder *b, nir_alu_instr *instr,
                            const nir_shader *softfp64,
                            nir_lower_doubles_options options)
{
   if (!(options & nir_lower_fp64_full_software))
      return NULL;

   const char *name;
   const char *mangled_name;
   const struct glsl_type *return_type = glsl_uint64_t_type();

   switch (instr->op) {
   case nir_op_f2i64:
      if (instr->src[0].src.ssa->bit_size != 64)
         return NULL;
      name = "__fp64_to_int64";
      mangled_name = "__fp64_to_int64(u641;";
      return_type = glsl_int64_t_type();
      break;
   case nir_op_f2u64:
      if (instr->src[0].src.ssa->bit_size != 64)
         return NULL;
      name = "__fp64_to_uint64";
      mangled_name = "__fp64_to_uint64(u641;";
      break;
   case nir_op_f2f64:
      name = "__fp32_to_fp64";
      mangled_name = "__fp32_to_fp64(f1;";
      break;
   case nir_op_f2f32:
      name = "__fp64_to_fp32";
      mangled_name = "__fp64_to_fp32(u641;";
      return_type = glsl_float_type();
      break;
   case nir_op_f2i32:
      name = "__fp64_to_int";
      mangled_name = "__fp64_to_int(u641;";
      return_type = glsl_int_type();
      break;
   case nir_op_f2u32:
      name = "__fp64_to_uint";
      mangled_name = "__fp64_to_uint(u641;";
      return_type = glsl_uint_type();
      break;
   case nir_op_b2f64:
      name = "__bool_to_fp64";
      mangled_name = "__bool_to_fp64(b1;";
      break;
   case nir_op_i2f64:
      if (instr->src[0].src.ssa->bit_size == 64) {
         name = "__int64_to_fp64";
         mangled_name = "__int64_to_fp64(i641;";
      } else {
         name = "__int_to_fp64";
         mangled_name = "__int_to_fp64(i1;";
      }
      break;
   case nir_op_u2f64:
      if (instr->src[0].src.ssa->bit_size == 64) {
         name = "__uint64_to_fp64";
         mangled_name = "__uint64_to_fp64(u641;";
      } else {
         name = "__uint_to_fp64";
         mangled_name = "__uint_to_fp64(u1;";
      }
      break;
   case nir_op_fabs:
      name = "__fabs64";
      mangled_name = "__fabs64(u641;";
      break;
   case nir_op_fneg:
      name = "__fneg64";
      mangled_name = "__fneg64(u641;";
      break;
   case nir_op_fround_even:
      name = "__fround64";
      mangled_name = "__fround64(u641;";
      break;
   case nir_op_ftrunc:
      name = "__ftrunc64";
      mangled_name = "__ftrunc64(u641;";
      break;
   case nir_op_ffloor:
      name = "__ffloor64";
      mangled_name = "__ffloor64(u641;";
      break;
   case nir_op_ffract:
      name = "__ffract64";
      mangled_name = "__ffract64(u641;";
      break;
   case nir_op_fsign:
      name = "__fsign64";
      mangled_name = "__fsign64(u641;";
      break;
   case nir_op_feq:
      name = "__feq64";
      mangled_name = "__feq64(u641;u641;";
      return_type = glsl_bool_type();
      break;
   case nir_op_fneu:
      name = "__fneu64";
      mangled_name = "__fneu64(u641;u641;";
      return_type = glsl_bool_type();
      break;
   case nir_op_flt:
      name = "__flt64";
      mangled_name = "__flt64(u641;u641;";
      return_type = glsl_bool_type();
      break;
   case nir_op_fge:
      name = "__fge64";
      mangled_name = "__fge64(u641;u641;";
      return_type = glsl_bool_type();
      break;
   case nir_op_fmin:
      name = "__fmin64";
      mangled_name = "__fmin64(u641;u641;";
      break;
   case nir_op_fmax:
      name = "__fmax64";
      mangled_name = "__fmax64(u641;u641;";
      break;
   case nir_op_fadd:
      name = "__fadd64";
      mangled_name = "__fadd64(u641;u641;";
      break;
   case nir_op_fmul:
      name = "__fmul64";
      mangled_name = "__fmul64(u641;u641;";
      break;
   case nir_op_ffma:
      name = "__ffma64";
      mangled_name = "__ffma64(u641;u641;u641;";
      break;
   case nir_op_fsat:
      name = "__fsat64";
      mangled_name = "__fsat64(u641;";
      break;
   case nir_op_fisfinite:
      name = "__fisfinite64";
      mangled_name = "__fisfinite64(u641;";
      return_type = glsl_bool_type();
      break;
   default:
      return NULL;
   }

   assert(softfp64 != NULL);
   nir_function *func = nir_shader_get_function_for_name(softfp64, name);

   /* Try again, this time with the mangled name, in case the softfp64
    * shader was taken from SPIR-V.
    */
   if (!func)
      func = nir_shader_get_function_for_name(softfp64, mangled_name);

   if (!func || !func->impl) {
      fprintf(stderr, "Cannot find function \"%s\"\n", name);
      assert(func);
   }

   nir_def *params[4] = {
      NULL,
   };

   nir_variable *ret_tmp =
      nir_local_variable_create(b->impl, return_type, "return_tmp");
   nir_deref_instr *ret_deref = nir_build_deref_var(b, ret_tmp);
   params[0] = &ret_deref->def;

   assert(nir_op_infos[instr->op].num_inputs + 1 == func->num_params);
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      nir_alu_type n_type =
         nir_alu_type_get_base_type(nir_op_infos[instr->op].input_types[i]);
      /* Add bitsize */
      n_type = n_type | instr->src[0].src.ssa->bit_size;

      const struct glsl_type *param_type =
         glsl_scalar_type(nir_get_glsl_base_type_for_nir_type(n_type));

      nir_variable *param =
         nir_local_variable_create(b->impl, param_type, "param");
      nir_deref_instr *param_deref = nir_build_deref_var(b, param);
      nir_store_deref(b, param_deref, nir_mov_alu(b, instr->src[i], 1), ~0);

      assert(i + 1 < ARRAY_SIZE(params));
      params[i + 1] = &param_deref->def;
   }

   nir_inline_function_impl(b, func->impl, params, NULL);

   return nir_load_deref(b, ret_deref);
}
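/* Illustrative only: the calling convention built above. For, say,
 * nir_op_fadd the inlined call behaves roughly like
 *
 *    uint64_t return_tmp;
 *    __fadd64(&return_tmp, a, b);
 *
 * with params[0] holding a deref of the return temporary and params[1..]
 * holding derefs of copies of the ALU sources.
 */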
nir_lower_doubles_options
nir_lower_doubles_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_frcp:
      return nir_lower_drcp;
   case nir_op_fsqrt:
      return nir_lower_dsqrt;
   case nir_op_frsq:
      return nir_lower_drsq;
   case nir_op_ftrunc:
      return nir_lower_dtrunc;
   case nir_op_ffloor:
      return nir_lower_dfloor;
   case nir_op_fceil:
      return nir_lower_dceil;
   case nir_op_ffract:
      return nir_lower_dfract;
   case nir_op_fround_even:
      return nir_lower_dround_even;
   case nir_op_fmod:
      return nir_lower_dmod;
   case nir_op_fsub:
      return nir_lower_dsub;
   case nir_op_fdiv:
      return nir_lower_ddiv;
   case nir_op_fmin:
   case nir_op_fmax:
      return nir_lower_dminmax;
   case nir_op_fsat:
      return nir_lower_dsat;
   default:
      return 0;
   }
}

struct lower_doubles_data {
   const nir_shader *softfp64;
   nir_lower_doubles_options options;
};

static bool
should_lower_double_instr(const nir_instr *instr, const void *_data)
{
   const struct lower_doubles_data *data = _data;
   const nir_lower_doubles_options options = data->options;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   bool is_64 = alu->def.bit_size == 64;

   unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
   for (unsigned i = 0; i < num_srcs; i++) {
      is_64 |= (nir_src_bit_size(alu->src[i].src) == 64);
   }

   if (!is_64)
      return false;

   if (options & nir_lower_fp64_full_software)
      return true;

   return options & nir_lower_doubles_op_to_options_mask(alu->op);
}
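/* Example (illustrative only): with options = nir_lower_drcp |
 * nir_lower_dsqrt, a 64-bit frcp or fsqrt is lowered while a 64-bit fadd is
 * left alone; with nir_lower_fp64_full_software set, every 64-bit float ALU
 * instruction is handed to the softfp64 library instead.
 */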
static nir_def *
lower_doubles_instr(nir_builder *b, nir_instr *instr, void *_data)
{
   const struct lower_doubles_data *data = _data;
   const nir_lower_doubles_options options = data->options;
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   /* Easier to set it here than pass it around all over the place. */
   b->fp_fast_math = alu->fp_fast_math;

   nir_def *soft_def =
      lower_doubles_instr_to_soft(b, alu, data->softfp64, options);
   if (soft_def)
      return soft_def;

   if (!(options & nir_lower_doubles_op_to_options_mask(alu->op)))
      return NULL;

   nir_def *src = nir_mov_alu(b, alu->src[0],
                              alu->def.num_components);

   switch (alu->op) {
   case nir_op_frcp:
      return lower_rcp(b, src);
   case nir_op_fsqrt:
      return lower_sqrt_rsq(b, src, true);
   case nir_op_frsq:
      return lower_sqrt_rsq(b, src, false);
   case nir_op_ftrunc:
      return lower_trunc(b, src);
   case nir_op_ffloor:
      return lower_floor(b, src);
   case nir_op_fceil:
      return lower_ceil(b, src);
   case nir_op_ffract:
      return lower_fract(b, src);
   case nir_op_fround_even:
      return lower_round_even(b, src);
   case nir_op_fsat:
      return lower_sat(b, src);

   case nir_op_fdiv:
   case nir_op_fsub:
   case nir_op_fmod:
   case nir_op_fmin:
   case nir_op_fmax: {
      nir_def *src1 = nir_mov_alu(b, alu->src[1],
                                  alu->def.num_components);
      switch (alu->op) {
      case nir_op_fdiv:
         return nir_fmul(b, src, nir_frcp(b, src1));
      case nir_op_fsub:
         return nir_fadd(b, src, nir_fneg(b, src1));
      case nir_op_fmod:
         return lower_mod(b, src, src1);
      case nir_op_fmin:
         return lower_minmax(b, nir_op_flt, src, src1);
      case nir_op_fmax:
         return lower_minmax(b, nir_op_fge, src, src1);
      default:
         unreachable("unhandled opcode");
      }
   }
   default:
      unreachable("unhandled opcode");
   }
}

static bool
nir_lower_doubles_impl(nir_function_impl *impl,
                       const nir_shader *softfp64,
                       nir_lower_doubles_options options)
{
   struct lower_doubles_data data = {
      .softfp64 = softfp64,
      .options = options,
   };

   bool progress =
      nir_function_impl_lower_instructions(impl,
                                           should_lower_double_instr,
                                           lower_doubles_instr,
                                           &data);

   if (progress && (options & nir_lower_fp64_full_software)) {
      /* Indices are completely messed up now */
      nir_index_ssa_defs(impl);

      nir_metadata_preserve(impl, nir_metadata_none);

      /* And we have deref casts we need to clean up thanks to function
       * inlining.
       */
      nir_opt_deref_impl(impl);
   } else if (progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

bool
nir_lower_doubles(nir_shader *shader,
                  const nir_shader *softfp64,
                  nir_lower_doubles_options options)
{
   bool progress = false;

   nir_foreach_function_impl(impl, shader) {
      progress |= nir_lower_doubles_impl(impl, softfp64, options);
   }

   return progress;
}
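/* Typical driver usage (illustrative sketch; the option names are the ones
 * defined for this pass, the rest is assumed context):
 *
 *    nir_lower_doubles_options opts =
 *       nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq;
 *    NIR_PASS(progress, shader, nir_lower_doubles, softfp64, opts);
 *
 * where softfp64 may be NULL unless nir_lower_fp64_full_software is set.
 */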