this string has no description
0
flood_cuda.cpp
/**
 * Computes one cell's next water level from values already staged in the two
 * per-block float arrays, and stores it in the global write buffer.
 *
 * The integer change applied to the cell is:
 *   - minus the amount the cell itself spills out, and
 *   - plus the share each of its four direct neighbours sends towards it.
 * Every amount is converted to an integer with FIXED(...) before being added.
 *
 * @param water_level_read         global buffer holding the current integer levels
 * @param water_level_write        global buffer receiving the updated integer level
 * @param height_shared            SHARED_COUNT terrain heights, indexed with a row
 *                                 pitch of SHARED_X_COUNT (presumably a block tile
 *                                 with a two-cell halo, given the +2 offsets below
 *                                 -- confirm against the caller)
 * @param water_level_read_shared  float water levels laid out like height_shared
 * @param cell_index               index passed to get_global() for both buffers
 * @param max_spillage_iter_thread running maximum, raised to this cell's
 *                                 spill / SPILLAGE_FACTOR when the cell spills
 */
__device__ __forceinline__ void fast_path_compute(
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,
    float *max_spillage_iter_thread
) {
    constexpr int ROW = SHARED_X_COUNT;
    const int here = (threadIdx.y + 2) * ROW + (threadIdx.x + 2);

    // Read every value the computation needs exactly once, ahead of all
    // branching: 13 distinct heights and 5 distinct water levels. Per the
    // original author's note, nvcc does not merge duplicate shared loads across
    // the water > 0 branches on its own, so the reads are hoisted by hand.
    // Naming: W/E are index -1/+1, N/S are index -ROW/+ROW.
    const float hC  = height_shared[here];
    const float hW  = height_shared[here - 1];
    const float hE  = height_shared[here + 1];
    const float hN  = height_shared[here - ROW];
    const float hS  = height_shared[here + ROW];
    const float hWW = height_shared[here - 2];
    const float hEE = height_shared[here + 2];
    const float hNN = height_shared[here - 2 * ROW];
    const float hSS = height_shared[here + 2 * ROW];
    const float hNW = height_shared[here - ROW - 1];
    const float hNE = height_shared[here - ROW + 1];
    const float hSW = height_shared[here + ROW - 1];
    const float hSE = height_shared[here + ROW + 1];

    const float wC = water_level_read_shared[here];
    const float wW = water_level_read_shared[here - 1];
    const float wE = water_level_read_shared[here + 1];
    const float wN = water_level_read_shared[here - ROW];
    const float wS = water_level_read_shared[here + ROW];

    int delta_fixed = 0;

    // The cell as a source: it only loses water (and feeds the running max).
    // It never sends anything to itself, because its drop towards itself is 0.
    if (wC > 0.0f) {
        const float dropW = max(0.0f, hC - hW);
        const float dropE = max(0.0f, hC - hE);
        const float dropN = max(0.0f, hC - hN);
        const float dropS = max(0.0f, hC - hS);
        const float drop_total = dropW + dropE + dropN + dropS;
        const float drop_max = max(max(dropW, dropE), max(dropN, dropS));
        const float spill = min(wC, drop_max);
        // drop_total == 0 forces spill == 0, so this test also rejects flat terrain.
        if (spill > 1e-8f * drop_total) {
            const int outflow = FIXED(spill / SPILLAGE_FACTOR);
            delta_fixed -= outflow;
            *max_spillage_iter_thread =
                max(*max_spillage_iter_thread, spill / SPILLAGE_FACTOR);
        }
    }

    // A neighbour as a source: returns the fixed-point amount it sends to this
    // cell, i.e. FIXED(share * drop_to_center / SPILLAGE_FACTOR), where share is
    // its spill divided by the sum of its four positive drops. h_r0..h_r3 are
    // the heights of the neighbour's own four neighbours; their order fixes the
    // floating-point summation order and must not be changed.
    const auto inflow_from = [](float w_src, float h_src,
                                float h_r0, float h_r1, float h_r2, float h_r3,
                                float drop_to_center) -> int {
        if (!(w_src > 0.0f)) {
            return 0;
        }
        const float d0 = max(0.0f, h_src - h_r0);
        const float d1 = max(0.0f, h_src - h_r1);
        const float d2 = max(0.0f, h_src - h_r2);
        const float d3 = max(0.0f, h_src - h_r3);
        const float total = d0 + d1 + d2 + d3;
        const float steepest = max(max(d0, d1), max(d2, d3));
        const float spill = min(w_src, steepest);
        if (!(spill > 1e-8f * total)) {
            return 0;
        }
        const float share = spill / total;
        const int incoming = FIXED(share * (drop_to_center) / SPILLAGE_FACTOR);
        return incoming;
    };

    // West neighbour; its neighbours in order: WW, center, NW, SW.
    delta_fixed += inflow_from(wW, hW, hWW, hC, hNW, hSW, max(0.0f, hW - hC));
    // East neighbour; its neighbours in order: center, EE, NE, SE.
    delta_fixed += inflow_from(wE, hE, hC, hEE, hNE, hSE, max(0.0f, hE - hC));
    // North neighbour; its neighbours in order: NW, NE, NN, center.
    delta_fixed += inflow_from(wN, hN, hNW, hNE, hNN, hC, max(0.0f, hN - hC));
    // South neighbour; its neighbours in order: SW, SE, center, SS.
    delta_fixed += inflow_from(wS, hS, hSW, hSE, hC, hSS, max(0.0f, hS - hC));

    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + delta_fixed;
}