// Computes one water-spillage step for the cell owned by the calling thread
// and stores the updated integer water level.
//
// Parameters:
//   water_level_read         integer water levels for the current step; read
//                            through get_global() at cell_index.
//   water_level_write        integer water levels for the next step; written
//                            through get_global() at cell_index.
//   height_shared            float terrain heights, indexed as
//                            (threadIdx.y + 2) * SHARED_X_COUNT + (threadIdx.x + 2).
//                            Cells up to two steps away along each axis and
//                            the four diagonals are read, so a 2-cell margin
//                            around the thread's cell must be valid.
//                            (Presumably a block-shared tile that the caller
//                            filled and synchronized -- verify against caller.)
//   water_level_read_shared  float water amounts with the same indexing; only
//                            the cell itself and its four axis neighbours are
//                            read.
//   cell_index               index of this thread's cell for get_global().
//   max_spillage_iter_thread running maximum, raised to
//                            spill / SPILLAGE_FACTOR whenever this cell spills.
//
// Model implemented below: a cell holding water looks at the non-negative
// height drops towards its four axis neighbours, spills
// min(water, largest drop), and splits that amount between the neighbours in
// proportion to their drops. This thread subtracts what its own cell sends
// out and adds what each of the four neighbours sends to it.
// NOTE(review): FIXED() is defined elsewhere; it presumably converts a float
// amount to the integer fixed-point representation of the water buffers --
// confirm.
__device__ __forceinline__ void fast_path_compute(
    int *water_level_read,
    int *water_level_write,
    float height_shared[SHARED_COUNT],
    float water_level_read_shared[SHARED_COUNT],
    int2 cell_index,
    float *max_spillage_iter_thread
) {
    // Rationale kept from the original author's note: every cell the
    // computation touches is read into a local up front, before any branch,
    // because nvcc does not treat ld.shared as pure and will not merge
    // duplicate loads across the `w > 0` branches. Doing it by hand turns 21
    // height loads into 13 unique ones; the 5 water loads are already unique.
    constexpr int ROW = SHARED_X_COUNT;
    const int centre = (threadIdx.y + 2) * ROW + (threadIdx.x + 2);

    // Heights, grouped by row from two rows up (N side) to two rows down.
    const float h_nn = height_shared[centre - 2 * ROW];

    const float h_nw = height_shared[centre - ROW - 1];
    const float h_n  = height_shared[centre - ROW];
    const float h_ne = height_shared[centre - ROW + 1];

    const float h_ww = height_shared[centre - 2];
    const float h_w  = height_shared[centre - 1];
    const float h_c  = height_shared[centre];
    const float h_e  = height_shared[centre + 1];
    const float h_ee = height_shared[centre + 2];

    const float h_sw = height_shared[centre + ROW - 1];
    const float h_s  = height_shared[centre + ROW];
    const float h_se = height_shared[centre + ROW + 1];

    const float h_ss = height_shared[centre + 2 * ROW];

    // Water amounts: the cell itself plus its four axis neighbours.
    const float w_n = water_level_read_shared[centre - ROW];
    const float w_w = water_level_read_shared[centre - 1];
    const float w_c = water_level_read_shared[centre];
    const float w_e = water_level_read_shared[centre + 1];
    const float w_s = water_level_read_shared[centre + ROW];

    // Net change of this cell's water level, in the integer units of FIXED().
    int delta_fixed = 0;

    // Source = this cell. It can only lose water here: its inflow to itself
    // would be proportional to max(0, h_c - h_c) == 0, so that term is omitted.
    if (w_c > 0.0f) {
        const float drop_w = max(0.0f, h_c - h_w);
        const float drop_e = max(0.0f, h_c - h_e);
        const float drop_n = max(0.0f, h_c - h_n);
        const float drop_s = max(0.0f, h_c - h_s);
        const float drop_total = drop_w + drop_e + drop_n + drop_s;
        const float drop_max = max(max(drop_w, drop_e), max(drop_n, drop_s));
        const float spill = min(w_c, drop_max);
        // Relative threshold: skips negligible spills, including the
        // all-flat case where drop_total is zero.
        if (spill > 1e-8f * drop_total) {
            const int outgoing = FIXED(spill / SPILLAGE_FACTOR);
            delta_fixed -= outgoing;
            *max_spillage_iter_thread =
                max(*max_spillage_iter_thread, spill / SPILLAGE_FACTOR);
        }
    }

    // Amount that a neighbouring source cell (water w_src, height h_src) sends
    // into this thread's cell. h_r0..h_r3 are the heights of the source's own
    // four axis neighbours; their order fixes the float summation order and
    // must not be permuted. One of them is always h_c.
    const auto inflow_from = [h_c](float w_src, float h_src,
                                   float h_r0, float h_r1,
                                   float h_r2, float h_r3) -> int {
        if (!(w_src > 0.0f)) {
            return 0;
        }
        const float drop0 = max(0.0f, h_src - h_r0);
        const float drop1 = max(0.0f, h_src - h_r1);
        const float drop2 = max(0.0f, h_src - h_r2);
        const float drop3 = max(0.0f, h_src - h_r3);
        const float drop_total = drop0 + drop1 + drop2 + drop3;
        const float drop_max = max(max(drop0, drop1), max(drop2, drop3));
        const float spill = min(w_src, drop_max);
        if (!(spill > 1e-8f * drop_total)) {
            return 0;
        }
        // Fraction of the spill per unit of drop; the share reaching this
        // cell is that fraction times the drop from the source towards it.
        const float share = spill / drop_total;
        const float toward_centre = max(0.0f, h_src - h_c);
        const int incoming = FIXED(share * (toward_centre) / SPILLAGE_FACTOR);
        return incoming;
    };

    // Source W (-1, 0): its neighbours are {WW, centre, NW, SW}.
    delta_fixed += inflow_from(w_w, h_w, h_ww, h_c, h_nw, h_sw);
    // Source E (+1, 0): its neighbours are {centre, EE, NE, SE}.
    delta_fixed += inflow_from(w_e, h_e, h_c, h_ee, h_ne, h_se);
    // Source N (0, -1): its neighbours are {NW, NE, NN, centre}.
    delta_fixed += inflow_from(w_n, h_n, h_nw, h_ne, h_nn, h_c);
    // Source S (0, +1): its neighbours are {SW, SE, centre, SS}.
    delta_fixed += inflow_from(w_s, h_s, h_sw, h_se, h_c, h_ss);

    // Publish the new level: previous integer level plus the net change.
    get_global(water_level_write, cell_index) =
        get_global(water_level_read, cell_index) + delta_fixed;
}