this string has no description
0
flood_cuda.cu
1#include <cstdlib>
2#include <cuda.h>
3#include <float.h>
4#include <stdio.h>
5#include <utility>
6
7#include "flood.h"
8
9/* Example of macros for error checking in CUDA */
10#define CCF(call) \
11 { \
12 cudaError_t check = call; \
13 if (check != cudaSuccess) \
14 fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \
15 }
16
17#define CCK() \
18 { \
19 cudaError_t check = cudaGetLastError(); \
20 if (check != cudaSuccess) \
21 fprintf( \
22 stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \
23 ); \
24 }
25
26#define get_global(buffer, index) buffer[index.y * sd.x_count + index.x]
27#define get_shared(buffer, index) buffer[index.y * SHARED_X_COUNT + index.x]
28
29#define COORD_SCEN2MAT_X(x) (x * sd.x_count / SCENARIO_SIZE)
30#define COORD_SCEN2MAT_Y(y) (y * sd.y_count / SCENARIO_SIZE)
31#define COORD_MAT2SCEN_X(x) (x * SCENARIO_SIZE / sd.x_count)
32#define COORD_MAT2SCEN_Y(y) (y * SCENARIO_SIZE / sd.y_count)
33
34typedef struct static_data {
35 int x_count;
36 int y_count;
37 float ex_factor;
38 int cloud_count;
39} StaticData;
40
41typedef struct statistics {
42 double max_spillage_iter;
43 int max_spillage_minute;
44 double max_spillage_scenario;
45 double max_water_scenario;
46
47 int minute;
48
49 unsigned long long total_rain;
50 unsigned long long total_water;
51 unsigned long long total_water_loss;
52} Statistics;
53
54__constant__ StaticData sd;
55
56__device__ __forceinline__ double atomicMaxDouble(double *address, double value) {
57 return __longlong_as_double(
58 atomicMax((unsigned long long *)address, __double_as_longlong(value))
59 );
60}
61
62__device__ __forceinline__ float warpReduceMax(float value) {
63#pragma unroll
64 for (int offset = warpSize / 2; offset > 0; offset >>= 1)
65 value = max(value, __shfl_down_sync(0xffffffff, value, offset));
66
67 return value;
68}
69
70__device__ __forceinline__ unsigned long long warpReduceSum(unsigned long long value) {
71#pragma unroll
72 for (int offset = warpSize / 2; offset > 0; offset >>= 1)
73 value = value + __shfl_down_sync(0xffffffff, value, offset);
74
75 return value;
76}
77
78__device__ __forceinline__ bool is_in_bounds(int2 index) {
79 return index.x >= 0 && index.x < sd.x_count && index.y >= 0 && index.y < sd.y_count;
80}
81
82__device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) {
83 return make_int2(lhs.x + rhs.x, lhs.y + rhs.y);
84}
85
86__global__ void next_iteration(Statistics *stats) {
87 if (stats->max_spillage_iter > stats->max_spillage_scenario) {
88 stats->max_spillage_scenario = stats->max_spillage_iter;
89 stats->max_spillage_minute = stats->minute;
90 }
91
92 stats->minute += 1;
93}
94
95__global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) {
96 int cloud_index = blockIdx.x;
97 int thread_index = threadIdx.x;
98
99 unsigned long long total_rain_thread = 0;
100
101 __shared__ Cloud_t cloud;
102
103 if (thread_index == 0) {
104 // Bit ugly here but it works!
105 stats->max_spillage_iter = 0;
106
107 /* Step 1.1: Clouds movement */
108 cloud = clouds[cloud_index];
109
110 float km_minute = cloud.speed / 60;
111 cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0);
112 cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0);
113
114 clouds[cloud_index] = cloud;
115 }
116
117 __syncthreads();
118
119 // Compute the bounding box area of the cloud
120 float lhs_x_index = COORD_SCEN2MAT_X(max(0.0, cloud.x - cloud.radius));
121 float rhs_x_index = COORD_SCEN2MAT_X(min(cloud.x + cloud.radius, (float)SCENARIO_SIZE));
122 float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0, cloud.y - cloud.radius));
123 float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud.y + cloud.radius, (float)SCENARIO_SIZE));
124
125 int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index));
126 int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index));
127 int cell_count = x_count * y_count;
128
129 for (int i = thread_index; i < cell_count; i += blockDim.x) {
130 int x_offset_count = i % x_count;
131 int y_offset_count = i / x_count;
132
133 float float_x_index = lhs_x_index + x_offset_count;
134 float float_y_index = lhs_y_index + y_offset_count;
135
136 int2 int_index = make_int2((int)float_x_index, (int)float_y_index);
137 if (!is_in_bounds(int_index)) continue;
138
139 float dx = COORD_MAT2SCEN_X(float_x_index) - cloud.x;
140 float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud.y;
141
142 float distance = sqrt(dx * dx + dy * dy);
143 if (distance >= cloud.radius) continue;
144
145 float rain = sd.ex_factor *
146 max(0.0, cloud.intensity - distance / cloud.radius * sqrt(cloud.intensity));
147
148 float meters_per_minute = rain / 1000 / 60;
149
150 // Making this conditional on >0 doesn't seem to do anything.
151 atomicAdd(&get_global(water_level, int_index), FIXED(meters_per_minute));
152
153 total_rain_thread += FIXED(meters_per_minute);
154 }
155
156 unsigned long long total_rain_warp = warpReduceSum(total_rain_thread);
157
158 int warp_lane_index = thread_index % warpSize;
159 if (warp_lane_index == 0 && total_rain_warp > 0) {
160 atomicAdd(&stats->total_rain, total_rain_warp);
161 }
162}
163
164// 2 halo cells on both sides.
165constexpr int SHARED_X_COUNT = 32 + 4;
166constexpr int SHARED_Y_COUNT = 8 + 4;
167constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT;
168
169template <bool is_cell>
170__device__ __forceinline__ void inner_kernel(
171 int2 lhs_index,
172 int2 lhs_index_shared,
173 float *height_shared,
174 float *ground_shared,
175 float *water_level_read_shared,
176 float cell_height,
177 float *max_spillage_iter_thread,
178 int *cell_water_level_delta_int,
179 unsigned long long *total_water_loss_thread
180) {
181 // int2 lhs_index = cell_index + offsets[outer_index];
182 // if (!is_in_bounds(lhs_index)) continue;
183
184 // int2 lhs_index_shared = cell_index_shared + offsets[outer_index];
185
186 float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared);
187 if (lhs_water_level <= 0.0) return;
188
189 float height_difference_sum = 0;
190 float height_difference_max = 0;
191 float height_difference_oob = 0;
192
193 float lhs_height = get_shared(height_shared, lhs_index_shared);
194 float lhs_ground = get_shared(ground_shared, lhs_index_shared);
195
196 static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}};
197
198#pragma unroll
199 for (int inner_index = 0; inner_index < 4; inner_index += 1) {
200 int2 rhs_index = lhs_index + offsets[inner_index];
201 int2 rhs_index_shared = lhs_index_shared + offsets[inner_index];
202
203 float rhs_height;
204
205 if (is_in_bounds(rhs_index)) {
206 rhs_height = get_shared(height_shared, rhs_index_shared);
207 } else {
208 rhs_height = lhs_ground;
209 }
210
211 if (rhs_height > lhs_height) continue;
212
213 // Compute level differences
214 float height_difference = lhs_height - rhs_height;
215
216 height_difference_sum += height_difference;
217 height_difference_max = max(height_difference_max, height_difference);
218
219 if (!is_in_bounds(rhs_index)) {
220 height_difference_oob += height_difference;
221 }
222 }
223
224 float lhs_spillage_level = min(lhs_water_level, height_difference_max);
225
226 if (height_difference_sum <= 0.0) return;
227
228 float proportion = lhs_spillage_level / height_difference_sum;
229 if (proportion <= 1e-8) return;
230
231 // Identical logic to: flood_seq.c:167 and flood_seq.c:218.
232 if constexpr (is_cell) {
233 int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR);
234 *cell_water_level_delta_int -= outgoing;
235
236 *total_water_loss_thread += FIXED(proportion * height_difference_oob / 2);
237
238 *max_spillage_iter_thread =
239 max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR);
240
241 return;
242 }
243
244 if (lhs_height < cell_height) return;
245
246 // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235.
247 int incoming = FIXED(proportion * (lhs_height - cell_height) / SPILLAGE_FACTOR);
248 *cell_water_level_delta_int += incoming;
249}
250
251__global__ void step2_3(
252 Statistics *stats,
253 int *water_level_read,
254 int *water_level_write,
255 float *ground
256) {
257 __shared__ float ground_shared[SHARED_COUNT];
258 __shared__ float height_shared[SHARED_COUNT];
259 __shared__ float water_level_read_shared[SHARED_COUNT];
260
261 int thread_index = blockDim.x * threadIdx.y + threadIdx.x;
262 int thread_count = blockDim.x * blockDim.y;
263 int block_count = SHARED_X_COUNT * SHARED_Y_COUNT;
264
265 for (int i = thread_index; i < block_count; i += thread_count) {
266 int2 shared_index = make_int2(i % SHARED_X_COUNT, i / SHARED_X_COUNT);
267
268 int global_x_index = blockIdx.x * blockDim.x + shared_index.x - 2;
269 int global_y_index = blockIdx.y * blockDim.y + shared_index.y - 2;
270 int2 global_index = make_int2(global_x_index, global_y_index);
271
272 if (is_in_bounds(global_index)) {
273 float ground_value = get_global(ground, global_index);
274 get_shared(ground_shared, shared_index) = ground_value;
275
276 int water_level_int_value = get_global(water_level_read, global_index);
277 float water_level_float_value = FLOATING(water_level_int_value);
278 get_shared(water_level_read_shared, shared_index) = water_level_float_value;
279
280 float height_value = ground_value + water_level_float_value;
281 get_shared(height_shared, shared_index) = height_value;
282 } else {
283 get_shared(ground_shared, shared_index) = 0.0;
284 get_shared(height_shared, shared_index) = 0.0;
285 get_shared(water_level_read_shared, shared_index) = 0.0;
286 }
287 }
288
289 __syncthreads();
290
291 int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x;
292 int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y;
293 int2 cell_index = make_int2(cell_x_index, cell_y_index);
294
295 float max_spillage_iter_thread = 0.0;
296 unsigned long long total_water_loss_thread = 0;
297
298 if (is_in_bounds(cell_index)) {
299 int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2);
300
301 int cell_water_level_delta_int = 0;
302
303 float cell_height = get_shared(height_shared, cell_index_shared);
304
305 inner_kernel<true>(
306 cell_index,
307 cell_index_shared,
308 height_shared,
309 ground_shared,
310 water_level_read_shared,
311 cell_height,
312 &max_spillage_iter_thread,
313 &cell_water_level_delta_int,
314 &total_water_loss_thread
315 );
316
317 static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}};
318
319 for (int offset_index = 0; offset_index < 4; offset_index += 1) {
320 int2 offset = offsets[offset_index];
321 if (is_in_bounds(cell_index + offset)) {
322 inner_kernel<false>(
323 cell_index + offset,
324 cell_index_shared + offset,
325 height_shared,
326 ground_shared,
327 water_level_read_shared,
328 cell_height,
329 &max_spillage_iter_thread,
330 &cell_water_level_delta_int,
331 &total_water_loss_thread
332 );
333 }
334 }
335
336 get_global(water_level_write, cell_index) =
337 get_global(water_level_read, cell_index) + cell_water_level_delta_int;
338 }
339
340 if (total_water_loss_thread > 0) {
341 atomicAdd(&stats->total_water_loss, total_water_loss_thread);
342 }
343
344 float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread);
345
346 int warp_lane_index = thread_index % warpSize;
347 if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) {
348 atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp);
349 }
350}
351
352__global__ void calculate_total(Statistics *stats, int *water_level) {
353 int row_count = sd.y_count, column_count = sd.x_count;
354
355 int x_index = blockIdx.x * blockDim.x + threadIdx.x;
356 int y_index = blockIdx.y * blockDim.y + threadIdx.y;
357 int2 index = make_int2(x_index, y_index);
358
359 auto thread_index = threadIdx.x + threadIdx.y * blockDim.x;
360
361 __shared__ double max_water_scenario_local;
362 __shared__ unsigned long long total_water_local;
363
364 if (thread_index == 0) {
365 max_water_scenario_local = 0.0;
366 total_water_local = 0;
367 }
368
369 __syncthreads();
370
371 if (y_index >= row_count || x_index >= column_count) return;
372
373 atomicAdd(&total_water_local, get_global(water_level, index));
374 atomicMaxDouble(&max_water_scenario_local, FLOATING(get_global(water_level, index)));
375
376 __syncthreads();
377
378 if (thread_index == 0) {
379 atomicAdd(&stats->total_water, total_water_local);
380 atomicMaxDouble(&stats->max_water_scenario, max_water_scenario_local);
381 }
382}
383
384extern "C" void do_compute(struct parameters *p, struct results *r) {
385 /* 2. Start global timer */
386 CCF(cudaSetDevice(0));
387
388 /* Memory allocation */
389 size_t cell_count = (size_t)p->rows * (size_t)p->columns;
390
391 size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds;
392 size_t ground_size = sizeof(float) * cell_count;
393 size_t water_level_size = sizeof(int) * cell_count;
394
395 StaticData sd_host = {
396 .x_count = p->columns,
397 .y_count = p->rows,
398 .ex_factor = p->ex_factor,
399 .cloud_count = p->num_clouds,
400 };
401
402 Statistics stats_host = {
403 .max_spillage_iter = DBL_MAX,
404 .max_spillage_minute = 0,
405 .max_spillage_scenario = 0.0,
406 .max_water_scenario = 0.0,
407 .minute = 0,
408 .total_rain = 0,
409 .total_water = 0,
410 .total_water_loss = 0,
411 };
412
413 Cloud_t *clouds;
414 CCF(cudaMalloc(&clouds, clouds_size));
415 CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice));
416
417#ifdef DEBUG
418 Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size);
419 if (clouds_debug == NULL) {
420 printf("Failed to allocate clouds_debug\n");
421 exit(EXIT_FAILURE);
422 }
423#endif
424
425 float *ground;
426 CCF(cudaMalloc(&ground, ground_size));
427 CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice));
428
429 CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice));
430
431 struct statistics *stats_device;
432 CCF(cudaMalloc(&stats_device, sizeof(Statistics)));
433 CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice));
434
435 int *water_level_read;
436 CCF(cudaMalloc(&water_level_read, water_level_size));
437 CCF(cudaMemset(water_level_read, 0, water_level_size));
438
439 int *water_level_write;
440 CCF(cudaMalloc(&water_level_write, water_level_size));
441 CCF(cudaMemset(water_level_write, 0, water_level_size));
442
443#ifdef DEBUG
444 int *water_level_debug = (int *)malloc(water_level_size);
445 if (water_level_debug == NULL) {
446 printf("Failed to allocate water_level_debug\n");
447 exit(EXIT_FAILURE);
448 }
449#endif
450
451 /* CUDA grid. */
452 dim3 block(32, 8);
453 auto columns_ceil = (p->columns + block.x - 1) / block.x;
454 auto rows_ceil = (p->rows + block.y - 1) / block.y;
455
456 dim3 grid(columns_ceil, rows_ceil);
457
458#ifdef DEBUG
459 print_matrix(PRECISION_FLOAT, rows, columns, ground, "Ground heights");
460#ifndef ANIMATION
461 print_clouds(p->num_clouds, p->clouds);
462#endif
463#endif
464
465 /* Prepare to measure runtime */
466 r->runtime = get_time();
467
468 /* Flood simulation */
469 while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) {
470 step1<<<p->num_clouds, 256>>>(stats_device, clouds, water_level_read);
471 CCK();
472
473#ifdef DEBUG
474#ifndef ANIMATION
475 CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost));
476
477 print_clouds(p->num_clouds, clouds_debug);
478#endif
479#endif
480
481#ifdef DEBUG
482 CCF(cudaMemcpy(
483 water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost
484 ));
485
486 print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain");
487#endif
488
489 step2_3<<<grid, block>>>(stats_device, water_level_read, water_level_write, ground);
490 CCK();
491
492#ifdef DEBUG
493#ifndef ANIMATION
494 CCF(cudaMemcpy(
495 water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost
496 ));
497
498 print_matrix(
499 PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage"
500 );
501#endif
502#endif
503
504 next_iteration<<<1, 1>>>(stats_device);
505 CCK();
506
507 // Copies minute and max_spillage_iter as these are required for the loop.
508 // Also implicitly synchronises the CPU and GPU.
509 CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));
510
511 std::swap(water_level_read, water_level_write);
512 } // End of simulation.
513
514 r->runtime = get_time() - r->runtime;
515
516 /* Statistics: Total remaining water and maximum amount of water in a cell */
517 calculate_total<<<grid, block>>>(stats_device, water_level_read);
518 CCK();
519
520 // Copies max_water_scenario and total_water.
521 // Also implicitly synchronises the CPU and GPU.
522 CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost));
523
524 if (p->final_matrix) {
525 print_matrix(
526 PRECISION_FIXED, p->rows, p->columns, water_level_read, "Water after spillage"
527 );
528 }
529
530 r->max_water_scenario = stats_host.max_water_scenario;
531 r->total_water = stats_host.total_water;
532
533 r->minute = stats_host.minute;
534 r->total_rain = stats_host.total_rain;
535 r->total_water_loss = stats_host.total_water_loss;
536
537 r->max_spillage_minute = stats_host.max_spillage_minute;
538 r->max_spillage_scenario = stats_host.max_spillage_scenario;
539
540 /* Free resources */
541 CCF(cudaFree(clouds));
542 CCF(cudaFree(ground));
543 CCF(cudaFree(stats_device));
544 CCF(cudaFree(water_level_read));
545 CCF(cudaFree(water_level_write));
546
547#ifdef DEBUG
548 free(clouds_debug);
549 free(water_level_debug);
550#endif
551}