flood_cuda.cu · by storm.tngl.sh

this string has no description
flood_cuda.cu 1mo ago
551 lines 19 kB view raw
  1#include <cstdlib>
  2#include <cuda.h>
  3#include <float.h>
  4#include <stdio.h>
  5#include <utility>
  6
  7#include "flood.h"
  8
  9/* Example of macros for error checking in CUDA */
 10#define CCF(call)                                                                                 \
 11    {                                                                                             \
 12        cudaError_t check = call;                                                                 \
 13        if (check != cudaSuccess)                                                                 \
 14            fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \
 15    }
 16
 17#define CCK()                                                                                      \
 18    {                                                                                              \
 19        cudaError_t check = cudaGetLastError();                                                    \
 20        if (check != cudaSuccess)                                                                  \
 21            fprintf(                                                                               \
 22                stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \
 23            );                                                                                     \
 24    }
 25
 26#define get_global(buffer, index) buffer[index.y * sd.x_count + index.x]
 27#define get_shared(buffer, index) buffer[index.y * SHARED_X_COUNT + index.x]
 28
 29#define COORD_SCEN2MAT_X(x) (x * sd.x_count / SCENARIO_SIZE)
 30#define COORD_SCEN2MAT_Y(y) (y * sd.y_count / SCENARIO_SIZE)
 31#define COORD_MAT2SCEN_X(x) (x * SCENARIO_SIZE / sd.x_count)
 32#define COORD_MAT2SCEN_Y(y) (y * SCENARIO_SIZE / sd.y_count)
 33
 34typedef struct static_data {
 35    int x_count;
 36    int y_count;
 37    float ex_factor;
 38    int cloud_count;
 39} StaticData;
 40
 41typedef struct statistics {
 42    double max_spillage_iter;
 43    int max_spillage_minute;
 44    double max_spillage_scenario;
 45    double max_water_scenario;
 46
 47    int minute;
 48
 49    unsigned long long total_rain;
 50    unsigned long long total_water;
 51    unsigned long long total_water_loss;
 52} Statistics;
 53
 54__constant__ StaticData sd;
 55
 56__device__ __forceinline__ double atomicMaxDouble(double *address, double value) {
 57    return __longlong_as_double(
 58        atomicMax((unsigned long long *)address, __double_as_longlong(value))
 59    );
 60}
 61
 62__device__ __forceinline__ float warpReduceMax(float value) {
 63#pragma unroll
 64    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
 65        value = max(value, __shfl_down_sync(0xffffffff, value, offset));
 66
 67    return value;
 68}
 69
 70__device__ __forceinline__ unsigned long long warpReduceSum(unsigned long long value) {
 71#pragma unroll
 72    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
 73        value = value + __shfl_down_sync(0xffffffff, value, offset);
 74
 75    return value;
 76}
 77
 78__device__ __forceinline__ bool is_in_bounds(int2 index) {
 79    return index.x >= 0 && index.x < sd.x_count && index.y >= 0 && index.y < sd.y_count;
 80}
 81
 82__device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) {
 83    return make_int2(lhs.x + rhs.x, lhs.y + rhs.y);
 84}
 85
 86__global__ void next_iteration(Statistics *stats) {
 87    if (stats->max_spillage_iter > stats->max_spillage_scenario) {
 88        stats->max_spillage_scenario = stats->max_spillage_iter;
 89        stats->max_spillage_minute = stats->minute;
 90    }
 91
 92    stats->minute += 1;
 93}
 94
 95__global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) {
 96    int cloud_index = blockIdx.x;
 97    int thread_index = threadIdx.x;
 98
 99    unsigned long long total_rain_thread = 0;
100
101    __shared__ Cloud_t cloud;
102
103    if (thread_index == 0) {
104        // Bit ugly here but it works!
105        stats->max_spillage_iter = 0;
106
107        /* Step 1.1: Clouds movement */
108        cloud = clouds[cloud_index];
109
110        float km_minute = cloud.speed / 60;
111        cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0);
112        cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0);
113
114        clouds[cloud_index] = cloud;
115    }
116
117    __syncthreads();
118
119    // Compute the bounding box area of the cloud
120    float lhs_x_index = COORD_SCEN2MAT_X(max(0.0, cloud.x - cloud.radius));
121    float rhs_x_index = COORD_SCEN2MAT_X(min(cloud.x + cloud.radius, (float)SCENARIO_SIZE));
122    float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0, cloud.y - cloud.radius));
123    float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud.y + cloud.radius, (float)SCENARIO_SIZE));
124
125    int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index));
126    int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index));
127    int cell_count = x_count * y_count;
128
129    for (int i = thread_index; i < cell_count; i += blockDim.x) {
130        int x_offset_count = i % x_count;
131        int y_offset_count = i / x_count;
132
133        float float_x_index = lhs_x_index + x_offset_count;
134        float float_y_index = lhs_y_index + y_offset_count;
135
136        int2 int_index = make_int2((int)float_x_index, (int)float_y_index);
137        if (!is_in_bounds(int_index)) continue;
138
139        float dx = COORD_MAT2SCEN_X(float_x_index) - cloud.x;
140        float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud.y;
141
142        float distance = sqrt(dx * dx + dy * dy);
143        if (distance >= cloud.radius) continue;
144
145        float rain = sd.ex_factor *
146                     max(0.0, cloud.intensity - distance / cloud.radius * sqrt(cloud.intensity));
147
148        float meters_per_minute = rain / 1000 / 60;
149
150        // Making this conditional on >0 doesn't seem to do anything.
151        atomicAdd(&get_global(water_level, int_index), FIXED(meters_per_minute));
152
153        total_rain_thread += FIXED(meters_per_minute);
154    }
155
156    unsigned long long total_rain_warp = warpReduceSum(total_rain_thread);
157
158    int warp_lane_index = thread_index % warpSize;
159    if (warp_lane_index == 0 && total_rain_warp > 0) {
160        atomicAdd(&stats->total_rain, total_rain_warp);
161    }
162}
163
164// 2 halo cells on both sides.
165constexpr int SHARED_X_COUNT = 32 + 4;
166constexpr int SHARED_Y_COUNT = 8 + 4;
167constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT;
168
169template <bool is_cell>
170__device__ __forceinline__ void inner_kernel(
171    int2 lhs_index,
172    int2 lhs_index_shared,
173    float *height_shared,
174    float *ground_shared,
175    float *water_level_read_shared,
176    float cell_height,
177    float *max_spillage_iter_thread,
178    int *cell_water_level_delta_int,
179    unsigned long long *total_water_loss_thread
180) {
181    // int2 lhs_index = cell_index + offsets[outer_index];
182    // if (!is_in_bounds(lhs_index)) continue;
183
184    // int2 lhs_index_shared = cell_index_shared + offsets[outer_index];
185
186    float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared);
187    if (lhs_water_level <= 0.0) return;
188
189    float height_difference_sum = 0;
190    float height_difference_max = 0;
191    float height_difference_oob = 0;
192
193    float lhs_height = get_shared(height_shared, lhs_index_shared);
194    float lhs_ground = get_shared(ground_shared, lhs_index_shared);
195
196    static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}};
197
198#pragma unroll
199    for (int inner_index = 0; inner_index < 4; inner_index += 1) {
200        int2 rhs_index = lhs_index + offsets[inner_index];
201        int2 rhs_index_shared = lhs_index_shared + offsets[inner_index];
202
203        float rhs_height;
204
205        if (is_in_bounds(rhs_index)) {
206            rhs_height = get_shared(height_shared, rhs_index_shared);
207        } else {
208            rhs_height = lhs_ground;
209        }
210
211        if (rhs_height > lhs_height) continue;
212
213        // Compute level differences
214        float height_difference = lhs_height - rhs_height;
215
216        height_difference_sum += height_difference;
217        height_difference_max = max(height_difference_max, height_difference);
218
219        if (!is_in_bounds(rhs_index)) {
220            height_difference_oob += height_difference;
221        }
222    }
223
224    float lhs_spillage_level = min(lhs_water_level, height_difference_max);
225
226    if (height_difference_sum <= 0.0) return;
227
228    float proportion = lhs_spillage_level / height_difference_sum;
229    if (proportion <= 1e-8) return;
230
231    // Identical logic to: flood_seq.c:167 and flood_seq.c:218.
232    if constexpr (is_cell) {
233        int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR);
234        *cell_water_level_delta_int -= outgoing;
235
236        *total_water_loss_thread += FIXED(proportion * height_difference_oob / 2);
237
238        *max_spillage_iter_thread =
239            max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR);
240
241        return;
242    }
243
244    if (lhs_height < cell_height) return;
245
246    // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235.
247    int incoming = FIXED(proportion * (lhs_height - cell_height) / SPILLAGE_FACTOR);
248    *cell_water_level_delta_int += incoming;
249}
250
251__global__ void step2_3(
252    Statistics *stats,
253    int *water_level_read,
254    int *water_level_write,
255    float *ground
256) {
257    __shared__ float ground_shared[SHARED_COUNT];
258    __shared__ float height_shared[SHARED_COUNT];
259    __shared__ float water_level_read_shared[SHARED_COUNT];
260
261    int thread_index = blockDim.x * threadIdx.y + threadIdx.x;
262    int thread_count = blockDim.x * blockDim.y;
263    int block_count = SHARED_X_COUNT * SHARED_Y_COUNT;
264
265    for (int i = thread_index; i < block_count; i += thread_count) {
266        int2 shared_index = make_int2(i % SHARED_X_COUNT, i / SHARED_X_COUNT);
267
268        int global_x_index = blockIdx.x * blockDim.x + shared_index.x - 2;
269        int global_y_index = blockIdx.y * blockDim.y + shared_index.y - 2;
270        int2 global_index = make_int2(global_x_index, global_y_index);
271
272        if (is_in_bounds(global_index)) {
273            float ground_value = get_global(ground, global_index);
274            get_shared(ground_shared, shared_index) = ground_value;
275
276            int water_level_int_value = get_global(water_level_read, global_index);
277            float water_level_float_value = FLOATING(water_level_int_value);
278            get_shared(water_level_read_shared, shared_index) = water_level_float_value;
279
280            float height_value = ground_value + water_level_float_value;
281            get_shared(height_shared, shared_index) = height_value;
282        } else {
283            get_shared(ground_shared, shared_index) = 0.0;
284            get_shared(height_shared, shared_index) = 0.0;
285            get_shared(water_level_read_shared, shared_index) = 0.0;
286        }
287    }
288
289    __syncthreads();
290
291    int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x;
292    int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y;
293    int2 cell_index = make_int2(cell_x_index, cell_y_index);
294
295    float max_spillage_iter_thread = 0.0;
296    unsigned long long total_water_loss_thread = 0;
297
298    if (is_in_bounds(cell_index)) {
299        int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2);
300
301        int cell_water_level_delta_int = 0;
302
303        float cell_height = get_shared(height_shared, cell_index_shared);
304
305        inner_kernel<true>(
306            cell_index,
307            cell_index_shared,
308            height_shared,
309            ground_shared,
310            water_level_read_shared,
311            cell_height,
312            &max_spillage_iter_thread,
313            &cell_water_level_delta_int,
314            &total_water_loss_thread
315        );
316
317        static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}};
318
319        for (int offset_index = 0; offset_index < 4; offset_index += 1) {
320            int2 offset = offsets[offset_index];
321            if (is_in_bounds(cell_index + offset)) {
322                inner_kernel<false>(
323                    cell_index + offset,
324                    cell_index_shared + offset,
325                    height_shared,
326                    ground_shared,
327                    water_level_read_shared,
328                    cell_height,
329                    &max_spillage_iter_thread,
330                    &cell_water_level_delta_int,
331                    &total_water_loss_thread
332                );
333            }
334        }
335
336        get_global(water_level_write, cell_index) =
337            get_global(water_level_read, cell_index) + cell_water_level_delta_int;
338    }
339
340    if (total_water_loss_thread > 0) {
341        atomicAdd(&stats->total_water_loss, total_water_loss_thread);
342    }
343
344    float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread);
345
346    int warp_lane_index = thread_index % warpSize;
347    if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) {
348        atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp);
349    }
350}
351
352__global__ void calculate_total(Statistics *stats, int *water_level) {
353    int row_count = sd.y_count, column_count = sd.x_count;
354
355    int x_index = blockIdx.x * blockDim.x + threadIdx.x;
356    int y_index = blockIdx.y * blockDim.y + threadIdx.y;
357    int2 index = make_int2(x_index, y_index);
358
359    auto thread_index = threadIdx.x + threadIdx.y * blockDim.x;
360
361    __shared__ double max_water_scenario_local;
362    __shared__ unsigned long long total_water_local;
363
364    if (thread_index == 0) {
365        max_water_scenario_local = 0.0;
366        total_water_local = 0;
367    }
368
369    __syncthreads();
370
371    if (y_index >= row_count || x_index >= column_count) return;
372
373    atomicAdd(&total_water_local, get_global(water_level, index));
374    atomicMaxDouble(&max_water_scenario_local, FLOATING(get_global(water_level, index)));
375
376    __syncthreads();
377
378    if (thread_index == 0) {
379        atomicAdd(&stats->total_water, total_water_local);
380        atomicMaxDouble(&stats->max_water_scenario, max_water_scenario_local);
381    }
382}
383
384extern "C" void do_compute(struct parameters *p, struct results *r) {
385    /* 2. Start global timer */
386    CCF(cudaSetDevice(0));
387
388    /* Memory allocation */
389    size_t cell_count = (size_t)p->rows * (size_t)p->columns;
390
391    size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds;
392    size_t ground_size = sizeof(float) * cell_count;
393    size_t water_level_size = sizeof(int) * cell_count;
394
395    StaticData sd_host = {
396        .x_count = p->columns,
397        .y_count = p->rows,
398        .ex_factor = p->ex_factor,
399        .cloud_count = p->num_clouds,
400    };
401
402    Statistics stats_host = {
403        .max_spillage_iter = DBL_MAX,
404        .max_spillage_minute = 0,
405        .max_spillage_scenario = 0.0,
406        .max_water_scenario = 0.0,
407        .minute = 0,
408        .total_rain = 0,
409        .total_water = 0,
410        .total_water_loss = 0,
411    };
412
413    Cloud_t *clouds;
414    CCF(cudaMalloc(&clouds, clouds_size));
415    CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice));
416
417#ifdef DEBUG
418    Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size);
419    if (clouds_debug == NULL) {
420        printf("Failed to allocate clouds_debug\n");
421        exit(EXIT_FAILURE);
422    }
423#endif
424
425    float *ground;
426    CCF(cudaMalloc(&ground, ground_size));
427    CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice));
428
429    CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice));
430
431    struct statistics *stats_device;
432    CCF(cudaMalloc(&stats_device, sizeof(Statistics)));
433    CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice));
434
435    int *water_level_read;
436    CCF(cudaMalloc(&water_level_read, water_level_size));
437    CCF(cudaMemset(water_level_read, 0, water_level_size));
438
439    int *water_level_write;
440    CCF(cudaMalloc(&water_level_write, water_level_size));
441    CCF(cudaMemset(water_level_write, 0, water_level_size));
442
443#ifdef DEBUG
444    int *water_level_debug = (int *)malloc(water_level_size);
445    if (water_level_debug == NULL) {
446        printf("Failed to allocate water_level_debug\n");
447        exit(EXIT_FAILURE);
448    }
449#endif
450
451    /* CUDA grid. */
452    dim3 block(32, 8);
453    auto columns_ceil = (p->columns + block.x - 1) / block.x;
454    auto rows_ceil = (p->rows + block.y - 1) / block.y;
455
456    dim3 grid(columns_ceil, rows_ceil);
457
458#ifdef DEBUG
459    print_matrix(PRECISION_FLOAT, rows, columns, ground, "Ground heights");
460#ifndef ANIMATION
461    print_clouds(p->num_clouds, p->clouds);
462#endif
463#endif
464
465    /* Prepare to measure runtime */
466    r->runtime = get_time();
467
468    /* Flood simulation */
469    while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) {
470        step1<<<p->num_clouds, 256>>>(stats_device, clouds, water_level_read);
471        CCK();
472
473#ifdef DEBUG
474#ifndef ANIMATION
475        CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost));
476
477        print_clouds(p->num_clouds, clouds_debug);
478#endif
479#endif
480
481#ifdef DEBUG
482        CCF(cudaMemcpy(
483            water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost
484        ));
485
486        print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain");
487#endif
488
489        step2_3<<<grid, block>>>(stats_device, water_level_read, water_level_write, ground);
490        CCK();
491
492#ifdef DEBUG
493#ifndef ANIMATION
494        CCF(cudaMemcpy(
495            water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost
496        ));
497
498        print_matrix(
499            PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage"
500        );
501#endif
502#endif
503
504        next_iteration<<<1, 1>>>(stats_device);
505        CCK();
506
507        // Copies minute and max_spillage_iter as these are required for the loop.
508        // Also implicitly synchronises the CPU and GPU.
509        CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));
510
511        std::swap(water_level_read, water_level_write);
512    } // End of simulation.
513
514    r->runtime = get_time() - r->runtime;
515
516    /* Statistics: Total remaining water and maximum amount of water in a cell */
517    calculate_total<<<grid, block>>>(stats_device, water_level_read);
518    CCK();
519
520    // Copies max_water_scenario and total_water.
521    // Also implicitly synchronises the CPU and GPU.
522    CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));
523
524    if (p->final_matrix) {
525        print_matrix(
526            PRECISION_FIXED, p->rows, p->columns, water_level_read, "Water after spillage"
527        );
528    }
529
530    r->max_water_scenario = stats_host.max_water_scenario;
531    r->total_water = stats_host.total_water;
532
533    r->minute = stats_host.minute;
534    r->total_rain = stats_host.total_rain;
535    r->total_water_loss = stats_host.total_water_loss;
536
537    r->max_spillage_minute = stats_host.max_spillage_minute;
538    r->max_spillage_scenario = stats_host.max_spillage_scenario;
539
540    /* Free resources */
541    CCF(cudaFree(clouds));
542    CCF(cudaFree(ground));
543    CCF(cudaFree(stats_device));
544    CCF(cudaFree(water_level_read));
545    CCF(cudaFree(water_level_write));
546
547#ifdef DEBUG
548    free(clouds_debug);
549    free(water_level_debug);
550#endif
551}