this string has no description
0
flood_cuda.cpp
90 lines 4.7 kB view raw
/**
 * Computes the water-level change of this thread's cell for one iteration and
 * stores the updated level.
 *
 * The cell loses the amount it spills to its four axis neighbours and gains
 * whatever each of those four neighbours spills toward it. All inputs are read
 * from the two tile arrays; the result is written through `get_global`.
 *
 * @param water_level_read         Source of the current level for `cell_index`
 *                                 (read through `get_global`).
 * @param water_level_write        Destination of the updated level for
 *                                 `cell_index` (written through `get_global`).
 * @param height_shared            Tile of terrain heights, indexed at
 *                                 (threadIdx.y + 2) * SHARED_X_COUNT + (threadIdx.x + 2).
 * @param water_level_read_shared  Tile of water levels as floats, same indexing.
 * @param cell_index               Cell coordinates handed to `get_global`.
 * @param max_spillage_iter_thread In/out running maximum; raised to this cell's
 *                                 `spill / SPILLAGE_FACTOR` when the cell spills.
 *
 * NOTE(review): heights are read up to two cells away along each axis and one
 * cell away diagonally, water one cell away along each axis. The caller
 * presumably guarantees a populated 2-cell halo and any required barrier
 * before this call — confirm at the call site.
 */
__device__ __forceinline__ void fast_path_compute(
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,
    float *max_spillage_iter_thread
) {
    constexpr int kRowStride = SHARED_X_COUNT;
    const int center = (threadIdx.y + 2) * kRowStride + (threadIdx.x + 2);

    // Read every tile value up front, ahead of all branching, so that each of
    // the 13 heights and 5 water levels is loaded exactly once. Per the
    // original author's note, nvcc does not merge duplicate shared loads that
    // sit behind the water-level branches, so the hoisting is done by hand.
    // Heights, row by row from north to south:
    const float elev_nn = height_shared[center - 2 * kRowStride];

    const float elev_nw = height_shared[center - kRowStride - 1];
    const float elev_n  = height_shared[center - kRowStride];
    const float elev_ne = height_shared[center - kRowStride + 1];

    const float elev_ww = height_shared[center - 2];
    const float elev_w  = height_shared[center - 1];
    const float elev_c  = height_shared[center];
    const float elev_e  = height_shared[center + 1];
    const float elev_ee = height_shared[center + 2];

    const float elev_sw = height_shared[center + kRowStride - 1];
    const float elev_s  = height_shared[center + kRowStride];
    const float elev_se = height_shared[center + kRowStride + 1];

    const float elev_ss = height_shared[center + 2 * kRowStride];

    // Water levels of the cell and its four axis neighbours.
    const float wat_n = water_level_read_shared[center - kRowStride];
    const float wat_w = water_level_read_shared[center - 1];
    const float wat_c = water_level_read_shared[center];
    const float wat_e = water_level_read_shared[center + 1];
    const float wat_s = water_level_read_shared[center + kRowStride];

    // Amount one neighbour (the "source") sends into the center cell, as the
    // integer produced by FIXED. rhs0..rhs3 are the heights of the source's
    // own four axis neighbours, in the order the drops are summed; one of them
    // is always the center cell. Returns 0 when the source holds no water or
    // its spill does not exceed 1e-8 times the summed drop.
    const auto inflow_from = [elev_c](float src_water, float src_elev,
                                      float rhs0, float rhs1,
                                      float rhs2, float rhs3) -> int {
        // Written as !(a > b) so a NaN input takes the same path as before.
        if (!(src_water > 0.0f)) {
            return 0;
        }

        const float drop0 = max(0.0f, src_elev - rhs0);
        const float drop1 = max(0.0f, src_elev - rhs1);
        const float drop2 = max(0.0f, src_elev - rhs2);
        const float drop3 = max(0.0f, src_elev - rhs3);
        const float drop_total = drop0 + drop1 + drop2 + drop3;
        const float drop_max = max(max(drop0, drop1), max(drop2, drop3));
        const float spill = min(src_water, drop_max);

        // Also protects the division below: drop_total == 0 forces spill == 0.
        if (!(spill > 1e-8f * drop_total)) {
            return 0;
        }

        const float prop = spill / drop_total;
        const float toward_center = max(0.0f, src_elev - elev_c);
        const int incoming = FIXED(prop * (toward_center) / SPILLAGE_FACTOR);
        return incoming;
    };

    int delta_fixed = 0;

    // The cell itself as source: only its outgoing spill and the running
    // maximum are affected. Its flow into itself would scale with
    // max(0, elev_c - elev_c) == 0, so no incoming term is computed.
    if (wat_c > 0.0f) {
        const float drop_w = max(0.0f, elev_c - elev_w);
        const float drop_e = max(0.0f, elev_c - elev_e);
        const float drop_n = max(0.0f, elev_c - elev_n);
        const float drop_s = max(0.0f, elev_c - elev_s);
        const float drop_total = drop_w + drop_e + drop_n + drop_s;
        const float drop_max = max(max(drop_w, drop_e), max(drop_n, drop_s));
        const float spill = min(wat_c, drop_max);

        if (spill > 1e-8f * drop_total) {
            const int outgoing = FIXED(spill / SPILLAGE_FACTOR);
            delta_fixed -= outgoing;
            *max_spillage_iter_thread =
                max(*max_spillage_iter_thread, spill / SPILLAGE_FACTOR);
        }
    }

    // Source W (-1, 0): its neighbours are {WW, center, NW, SW}.
    delta_fixed += inflow_from(wat_w, elev_w, elev_ww, elev_c, elev_nw, elev_sw);
    // Source E (+1, 0): its neighbours are {center, EE, NE, SE}.
    delta_fixed += inflow_from(wat_e, elev_e, elev_c, elev_ee, elev_ne, elev_se);
    // Source N (0, -1): its neighbours are {NW, NE, NN, center}.
    delta_fixed += inflow_from(wat_n, elev_n, elev_nw, elev_ne, elev_nn, elev_c);
    // Source S (0, +1): its neighbours are {SW, SE, center, SS}.
    delta_fixed += inflow_from(wat_s, elev_s, elev_sw, elev_se, elev_c, elev_ss);

    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + delta_fixed;
}