flood_cuda.cpp · by storm.tngl.sh

this string has no description
flood_cuda.cpp 7w ago
90 lines 4.7 kB view raw
 // Fixed-point inflow that the center cell receives from ONE neighboring cell
 // acting as the spilling ("source") cell.
 //
 //   w_src             water level of the source cell
 //   h_src             height of the source cell
 //   h_rhs0..h_rhs3    heights of the source cell's own four neighbors; the
 //                     order is significant because the drops are summed left
 //                     to right in floating point
 //   h_center          height of the cell being updated (it is also one of
 //                     the four h_rhs* values at every call site below)
 //
 // Returns FIXED(share * drop_to_center / SPILLAGE_FACTOR), or 0 when the
 // source holds no water or its spill does not pass the 1e-8 * total-drop
 // threshold (which also rules out dividing by a zero total).
 __device__ __forceinline__ int fast_path_neighbor_inflow(
     float w_src,
     float h_src,
     float h_rhs0,
     float h_rhs1,
     float h_rhs2,
     float h_rhs3,
     float h_center
 ) {
     if (!(w_src > 0.0f)) {
         return 0;
     }

     const float drop0 = max(0.0f, h_src - h_rhs0);
     const float drop1 = max(0.0f, h_src - h_rhs1);
     const float drop2 = max(0.0f, h_src - h_rhs2);
     const float drop3 = max(0.0f, h_src - h_rhs3);
     const float drop_total = drop0 + drop1 + drop2 + drop3;
     const float drop_max = max(max(drop0, drop1), max(drop2, drop3));
     const float spill = min(w_src, drop_max);

     if (!(spill > 1e-8f * drop_total)) {
         return 0;
     }

     // Fraction of the total drop that the source actually spills.
     const float share = spill / drop_total;
     const int incoming =
         FIXED(share * (max(0.0f, h_src - h_center)) / SPILLAGE_FACTOR);
     return incoming;
 }

 // Computes one cell's water-level update for the current iteration.
 //
 // The cell handled by this thread sits at (threadIdx.x + 2, threadIdx.y + 2)
 // inside the SHARED_X_COUNT-wide tiles `height_shared` and
 // `water_level_read_shared` (presumably a 2-cell halo on each side -- confirm
 // against the code that fills the tiles). The function
 //   * subtracts the cell's own outgoing spillage and adds the spillage
 //     arriving from its W/E/N/S neighbors, all accumulated as an int
 //     produced by FIXED(...);
 //   * raises *max_spillage_iter_thread to at least the cell's own
 //     spill / SPILLAGE_FACTOR when the cell spills;
 //   * stores get_global(water_level_read, cell_index) + delta into
 //     get_global(water_level_write, cell_index).
 __device__ __forceinline__ void fast_path_compute(
     int *water_level_read,
     int *water_level_write,
     float height_shared[SHARED_COUNT],
     float water_level_read_shared[SHARED_COUNT],
     int2 cell_index,
     float *max_spillage_iter_thread
 ) {
     constexpr int ROW = SHARED_X_COUNT;
     const int ctr = (threadIdx.y + 2) * ROW + (threadIdx.x + 2);

     // Every tile value the computation can touch is read up front, ahead of
     // any branch: 13 distinct heights and 5 distinct water levels. Per the
     // original author's note, nvcc does not merge duplicate shared loads
     // across the water-level branches, so the hoisting is done by hand.
     const float h_center = height_shared[ctr];
     const float h_west   = height_shared[ctr - 1];
     const float h_east   = height_shared[ctr + 1];
     const float h_north  = height_shared[ctr - ROW];
     const float h_south  = height_shared[ctr + ROW];
     const float h_west2  = height_shared[ctr - 2];
     const float h_east2  = height_shared[ctr + 2];
     const float h_north2 = height_shared[ctr - 2 * ROW];
     const float h_south2 = height_shared[ctr + 2 * ROW];
     const float h_nw     = height_shared[ctr - ROW - 1];
     const float h_ne     = height_shared[ctr - ROW + 1];
     const float h_sw     = height_shared[ctr + ROW - 1];
     const float h_se     = height_shared[ctr + ROW + 1];

     const float w_center = water_level_read_shared[ctr];
     const float w_west   = water_level_read_shared[ctr - 1];
     const float w_east   = water_level_read_shared[ctr + 1];
     const float w_north  = water_level_read_shared[ctr - ROW];
     const float w_south  = water_level_read_shared[ctr + ROW];

     int delta_fixed = 0;

     // Center cell as the source: it only loses water here (its inflow to
     // itself would be scaled by max(0, h_center - h_center) == 0), and it is
     // the only case that updates the per-thread spillage maximum.
     if (w_center > 0.0f) {
         const float drop_w = max(0.0f, h_center - h_west);
         const float drop_e = max(0.0f, h_center - h_east);
         const float drop_n = max(0.0f, h_center - h_north);
         const float drop_s = max(0.0f, h_center - h_south);
         const float drop_total = drop_w + drop_e + drop_n + drop_s;
         const float drop_max = max(max(drop_w, drop_e), max(drop_n, drop_s));
         const float spill = min(w_center, drop_max);

         if (spill > 1e-8f * drop_total) {
             const int outgoing = FIXED(spill / SPILLAGE_FACTOR);
             delta_fixed -= outgoing;

             const float previous_max = *max_spillage_iter_thread;
             *max_spillage_iter_thread =
                 max(previous_max, spill / SPILLAGE_FACTOR);
         }
     }

     // West neighbor (-1, 0) as source; its neighbors: WW, center, NW, SW.
     delta_fixed += fast_path_neighbor_inflow(
         w_west, h_west, h_west2, h_center, h_nw, h_sw, h_center);
     // East neighbor (+1, 0) as source; its neighbors: center, EE, NE, SE.
     delta_fixed += fast_path_neighbor_inflow(
         w_east, h_east, h_center, h_east2, h_ne, h_se, h_center);
     // North neighbor (0, -1) as source; its neighbors: NW, NE, NN, center.
     delta_fixed += fast_path_neighbor_inflow(
         w_north, h_north, h_nw, h_ne, h_north2, h_center, h_center);
     // South neighbor (0, +1) as source; its neighbors: SW, SE, center, SS.
     delta_fixed += fast_path_neighbor_inflow(
         w_south, h_south, h_sw, h_se, h_center, h_south2, h_center);

     get_global(water_level_write, cell_index) =
         get_global(water_level_read, cell_index) + delta_fixed;
 }