/**
 * Computes the net fixed-point water change for this thread's cell from the
 * shared-memory tiles and stores the updated level to global memory.
 *
 * The cell loses FIXED(spill / SPILLAGE_FACTOR) when it holds water and sits
 * above at least one of its four direct neighbours. It gains, from each wet
 * direct neighbour, that neighbour's spill weighted by the share of the
 * neighbour's total downhill drop that points at this cell.
 *
 * @param water_level_read          Global buffer read at cell_index (via get_global).
 * @param water_level_write         Global buffer written at cell_index (via get_global).
 * @param height_shared             Height tile, row stride SHARED_X_COUNT. Cells up to
 *                                  two steps away along each axis and the four diagonal
 *                                  cells are read, so they must already be populated
 *                                  (presumably a 2-cell halo filled and synchronised by
 *                                  the caller -- TODO confirm against the tile loader).
 * @param water_level_read_shared   Water tile with the same layout; only the centre and
 *                                  its four direct neighbours are read.
 * @param cell_index                Index of this thread's cell in the global buffers.
 * @param max_spillage_iter_thread  In/out running maximum; raised to this cell's
 *                                  outgoing spill / SPILLAGE_FACTOR when it spills.
 *
 * NOTE(review): FIXED, SPILLAGE_FACTOR and get_global are defined outside this
 * block; FIXED presumably converts a float amount to the integer fixed-point
 * representation stored in the water buffers -- confirm.
 */
__device__ __forceinline__ void fast_path_compute(
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,
    float *max_spillage_iter_thread
) {
    constexpr int ROW = SHARED_X_COUNT;

    // Tile position of this thread's cell: offset by two in both directions.
    const int mid = (threadIdx.y + 2) * ROW + (threadIdx.x + 2);

    // Every shared value used below is fetched once, up front, ahead of any
    // branch. The original author's note records the reason: nvcc does not
    // merge duplicate shared-memory loads across the "no water, skip" branches,
    // so hoisting by hand reduces 21 height loads to the 13 unique ones (the 5
    // water loads are already unique) and keeps each value in a register.
    // Naming follows the original convention: west/east = -1/+1 in x,
    // north/south = -1/+1 row.
    const float elev_mid   = height_shared[mid];
    const float elev_west  = height_shared[mid - 1];
    const float elev_east  = height_shared[mid + 1];
    const float elev_north = height_shared[mid - ROW];
    const float elev_south = height_shared[mid + ROW];
    const float elev_ww    = height_shared[mid - 2];
    const float elev_ee    = height_shared[mid + 2];
    const float elev_nn    = height_shared[mid - 2 * ROW];
    const float elev_ss    = height_shared[mid + 2 * ROW];
    const float elev_nw    = height_shared[mid - ROW - 1];
    const float elev_ne    = height_shared[mid - ROW + 1];
    const float elev_sw    = height_shared[mid + ROW - 1];
    const float elev_se    = height_shared[mid + ROW + 1];

    const float wet_mid   = water_level_read_shared[mid];
    const float wet_west  = water_level_read_shared[mid - 1];
    const float wet_east  = water_level_read_shared[mid + 1];
    const float wet_north = water_level_read_shared[mid - ROW];
    const float wet_south = water_level_read_shared[mid + ROW];

    int net_delta = 0;

    // Source = this cell: only the outgoing amount and the running maximum are
    // affected. (Its inflow to itself would be scaled by max(0, h - h) == 0.)
    if (wet_mid > 0.0f) {
        const float drop_w = max(0.0f, elev_mid - elev_west);
        const float drop_e = max(0.0f, elev_mid - elev_east);
        const float drop_n = max(0.0f, elev_mid - elev_north);
        const float drop_s = max(0.0f, elev_mid - elev_south);
        const float drop_total = drop_w + drop_e + drop_n + drop_s;
        const float drop_peak = max(max(drop_w, drop_e), max(drop_n, drop_s));

        // Cannot release more water than is present, nor more than the
        // steepest drop.
        const float released = min(wet_mid, drop_peak);

        // Also false when drop_total == 0 (released is then 0), i.e. when no
        // neighbour is lower.
        if (released > 1e-8f * drop_total) {
            const int lost = FIXED(released / SPILLAGE_FACTOR);
            net_delta -= lost;
            *max_spillage_iter_thread =
                max(*max_spillage_iter_thread, released / SPILLAGE_FACTOR);
        }
    }

    // Source = one direct neighbour. r0..r3 are the heights of that neighbour's
    // own four direct neighbours (this cell is always one of them). Returns the
    // fixed-point amount the neighbour sends to this cell, or 0 if it sends
    // nothing.
    const auto inflow_from = [elev_mid](float wet, float elev,
                                        float r0, float r1,
                                        float r2, float r3) -> int {
        if (!(wet > 0.0f)) {
            return 0;
        }

        const float drop0 = max(0.0f, elev - r0);
        const float drop1 = max(0.0f, elev - r1);
        const float drop2 = max(0.0f, elev - r2);
        const float drop3 = max(0.0f, elev - r3);
        const float drop_total = drop0 + drop1 + drop2 + drop3;
        const float drop_peak = max(max(drop0, drop1), max(drop2, drop3));
        const float released = min(wet, drop_peak);

        // Passing this check implies drop_total > 0, so the division is safe.
        if (!(released > 1e-8f * drop_total)) {
            return 0;
        }

        const float share = released / drop_total;
        const float toward_mid = max(0.0f, elev - elev_mid);
        const int gained = FIXED(share * (toward_mid) / SPILLAGE_FACTOR);
        return gained;
    };

    // West  (-1, 0): its neighbours are {WW, centre, NW, SW}.
    net_delta += inflow_from(wet_west, elev_west,
                             elev_ww, elev_mid, elev_nw, elev_sw);
    // East  (+1, 0): its neighbours are {centre, EE, NE, SE}.
    net_delta += inflow_from(wet_east, elev_east,
                             elev_mid, elev_ee, elev_ne, elev_se);
    // North (0, -1): its neighbours are {NW, NE, NN, centre}.
    net_delta += inflow_from(wet_north, elev_north,
                             elev_nw, elev_ne, elev_nn, elev_mid);
    // South (0, +1): its neighbours are {SW, SE, centre, SS}.
    net_delta += inflow_from(wet_south, elev_south,
                             elev_sw, elev_se, elev_mid, elev_ss);

    // New level = previous level from the read buffer plus the net change.
    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + net_delta;
}