this string has no description
0
flood_cuda.cu
551 lines 19 kB view raw
1#include <cstdlib> 2#include <cuda.h> 3#include <float.h> 4#include <stdio.h> 5#include <utility> 6 7#include "flood.h" 8 9/* Example of macros for error checking in CUDA */ 10#define CCF(call) \ 11 { \ 12 cudaError_t check = call; \ 13 if (check != cudaSuccess) \ 14 fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \ 15 } 16 17#define CCK() \ 18 { \ 19 cudaError_t check = cudaGetLastError(); \ 20 if (check != cudaSuccess) \ 21 fprintf( \ 22 stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \ 23 ); \ 24 } 25 26#define get_global(buffer, index) buffer[index.y * sd.x_count + index.x] 27#define get_shared(buffer, index) buffer[index.y * SHARED_X_COUNT + index.x] 28 29#define COORD_SCEN2MAT_X(x) (x * sd.x_count / SCENARIO_SIZE) 30#define COORD_SCEN2MAT_Y(y) (y * sd.y_count / SCENARIO_SIZE) 31#define COORD_MAT2SCEN_X(x) (x * SCENARIO_SIZE / sd.x_count) 32#define COORD_MAT2SCEN_Y(y) (y * SCENARIO_SIZE / sd.y_count) 33 34typedef struct static_data { 35 int x_count; 36 int y_count; 37 float ex_factor; 38 int cloud_count; 39} StaticData; 40 41typedef struct statistics { 42 double max_spillage_iter; 43 int max_spillage_minute; 44 double max_spillage_scenario; 45 double max_water_scenario; 46 47 int minute; 48 49 unsigned long long total_rain; 50 unsigned long long total_water; 51 unsigned long long total_water_loss; 52} Statistics; 53 54__constant__ StaticData sd; 55 56__device__ __forceinline__ double atomicMaxDouble(double *address, double value) { 57 return __longlong_as_double( 58 atomicMax((unsigned long long *)address, __double_as_longlong(value)) 59 ); 60} 61 62__device__ __forceinline__ float warpReduceMax(float value) { 63#pragma unroll 64 for (int offset = warpSize / 2; offset > 0; offset >>= 1) 65 value = max(value, __shfl_down_sync(0xffffffff, value, offset)); 66 67 return value; 68} 69 70__device__ __forceinline__ unsigned long long warpReduceSum(unsigned long long value) { 71#pragma unroll 72 for (int offset = warpSize / 2; offset > 0; offset >>= 1) 73 value = value + __shfl_down_sync(0xffffffff, value, offset); 74 75 return value; 76} 77 78__device__ __forceinline__ bool is_in_bounds(int2 index) { 79 return index.x >= 0 && index.x < sd.x_count && index.y >= 0 && index.y < sd.y_count; 80} 81 82__device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) { 83 return make_int2(lhs.x + rhs.x, lhs.y + rhs.y); 84} 85 86__global__ void next_iteration(Statistics *stats) { 87 if (stats->max_spillage_iter > stats->max_spillage_scenario) { 88 stats->max_spillage_scenario = stats->max_spillage_iter; 89 stats->max_spillage_minute = stats->minute; 90 } 91 92 stats->minute += 1; 93} 94 95__global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) { 96 int cloud_index = blockIdx.x; 97 int thread_index = threadIdx.x; 98 99 unsigned long long total_rain_thread = 0; 100 101 __shared__ Cloud_t cloud; 102 103 if (thread_index == 0) { 104 // Bit ugly here but it works! 105 stats->max_spillage_iter = 0; 106 107 /* Step 1.1: Clouds movement */ 108 cloud = clouds[cloud_index]; 109 110 float km_minute = cloud.speed / 60; 111 cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0); 112 cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0); 113 114 clouds[cloud_index] = cloud; 115 } 116 117 __syncthreads(); 118 119 // Compute the bounding box area of the cloud 120 float lhs_x_index = COORD_SCEN2MAT_X(max(0.0, cloud.x - cloud.radius)); 121 float rhs_x_index = COORD_SCEN2MAT_X(min(cloud.x + cloud.radius, (float)SCENARIO_SIZE)); 122 float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0, cloud.y - cloud.radius)); 123 float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud.y + cloud.radius, (float)SCENARIO_SIZE)); 124 125 int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index)); 126 int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index)); 127 int cell_count = x_count * y_count; 128 129 for (int i = thread_index; i < cell_count; i += blockDim.x) { 130 int x_offset_count = i % x_count; 131 int y_offset_count = i / x_count; 132 133 float float_x_index = lhs_x_index + x_offset_count; 134 float float_y_index = lhs_y_index + y_offset_count; 135 136 int2 int_index = make_int2((int)float_x_index, (int)float_y_index); 137 if (!is_in_bounds(int_index)) continue; 138 139 float dx = COORD_MAT2SCEN_X(float_x_index) - cloud.x; 140 float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud.y; 141 142 float distance = sqrt(dx * dx + dy * dy); 143 if (distance >= cloud.radius) continue; 144 145 float rain = sd.ex_factor * 146 max(0.0, cloud.intensity - distance / cloud.radius * sqrt(cloud.intensity)); 147 148 float meters_per_minute = rain / 1000 / 60; 149 150 // Making this conditional on >0 doesn't seem to do anything. 151 atomicAdd(&get_global(water_level, int_index), FIXED(meters_per_minute)); 152 153 total_rain_thread += FIXED(meters_per_minute); 154 } 155 156 unsigned long long total_rain_warp = warpReduceSum(total_rain_thread); 157 158 int warp_lane_index = thread_index % warpSize; 159 if (warp_lane_index == 0 && total_rain_warp > 0) { 160 atomicAdd(&stats->total_rain, total_rain_warp); 161 } 162} 163 164// 2 halo cells on both sides. 165constexpr int SHARED_X_COUNT = 32 + 4; 166constexpr int SHARED_Y_COUNT = 8 + 4; 167constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT; 168 169template <bool is_cell> 170__device__ __forceinline__ void inner_kernel( 171 int2 lhs_index, 172 int2 lhs_index_shared, 173 float *height_shared, 174 float *ground_shared, 175 float *water_level_read_shared, 176 float cell_height, 177 float *max_spillage_iter_thread, 178 int *cell_water_level_delta_int, 179 unsigned long long *total_water_loss_thread 180) { 181 // int2 lhs_index = cell_index + offsets[outer_index]; 182 // if (!is_in_bounds(lhs_index)) continue; 183 184 // int2 lhs_index_shared = cell_index_shared + offsets[outer_index]; 185 186 float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared); 187 if (lhs_water_level <= 0.0) return; 188 189 float height_difference_sum = 0; 190 float height_difference_max = 0; 191 float height_difference_oob = 0; 192 193 float lhs_height = get_shared(height_shared, lhs_index_shared); 194 float lhs_ground = get_shared(ground_shared, lhs_index_shared); 195 196 static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}}; 197 198#pragma unroll 199 for (int inner_index = 0; inner_index < 4; inner_index += 1) { 200 int2 rhs_index = lhs_index + offsets[inner_index]; 201 int2 rhs_index_shared = lhs_index_shared + offsets[inner_index]; 202 203 float rhs_height; 204 205 if (is_in_bounds(rhs_index)) { 206 rhs_height = get_shared(height_shared, rhs_index_shared); 207 } else { 208 rhs_height = lhs_ground; 209 } 210 211 if (rhs_height > lhs_height) continue; 212 213 // Compute level differences 214 float height_difference = lhs_height - rhs_height; 215 216 height_difference_sum += height_difference; 217 height_difference_max = max(height_difference_max, height_difference); 218 219 if (!is_in_bounds(rhs_index)) { 220 height_difference_oob += height_difference; 221 } 222 } 223 224 float lhs_spillage_level = min(lhs_water_level, height_difference_max); 225 226 if (height_difference_sum <= 0.0) return; 227 228 float proportion = lhs_spillage_level / height_difference_sum; 229 if (proportion <= 1e-8) return; 230 231 // Identical logic to: flood_seq.c:167 and flood_seq.c:218. 232 if constexpr (is_cell) { 233 int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR); 234 *cell_water_level_delta_int -= outgoing; 235 236 *total_water_loss_thread += FIXED(proportion * height_difference_oob / 2); 237 238 *max_spillage_iter_thread = 239 max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR); 240 241 return; 242 } 243 244 if (lhs_height < cell_height) return; 245 246 // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235. 247 int incoming = FIXED(proportion * (lhs_height - cell_height) / SPILLAGE_FACTOR); 248 *cell_water_level_delta_int += incoming; 249} 250 251__global__ void step2_3( 252 Statistics *stats, 253 int *water_level_read, 254 int *water_level_write, 255 float *ground 256) { 257 __shared__ float ground_shared[SHARED_COUNT]; 258 __shared__ float height_shared[SHARED_COUNT]; 259 __shared__ float water_level_read_shared[SHARED_COUNT]; 260 261 int thread_index = blockDim.x * threadIdx.y + threadIdx.x; 262 int thread_count = blockDim.x * blockDim.y; 263 int block_count = SHARED_X_COUNT * SHARED_Y_COUNT; 264 265 for (int i = thread_index; i < block_count; i += thread_count) { 266 int2 shared_index = make_int2(i % SHARED_X_COUNT, i / SHARED_X_COUNT); 267 268 int global_x_index = blockIdx.x * blockDim.x + shared_index.x - 2; 269 int global_y_index = blockIdx.y * blockDim.y + shared_index.y - 2; 270 int2 global_index = make_int2(global_x_index, global_y_index); 271 272 if (is_in_bounds(global_index)) { 273 float ground_value = get_global(ground, global_index); 274 get_shared(ground_shared, shared_index) = ground_value; 275 276 int water_level_int_value = get_global(water_level_read, global_index); 277 float water_level_float_value = FLOATING(water_level_int_value); 278 get_shared(water_level_read_shared, shared_index) = water_level_float_value; 279 280 float height_value = ground_value + water_level_float_value; 281 get_shared(height_shared, shared_index) = height_value; 282 } else { 283 get_shared(ground_shared, shared_index) = 0.0; 284 get_shared(height_shared, shared_index) = 0.0; 285 get_shared(water_level_read_shared, shared_index) = 0.0; 286 } 287 } 288 289 __syncthreads(); 290 291 int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x; 292 int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y; 293 int2 cell_index = make_int2(cell_x_index, cell_y_index); 294 295 float max_spillage_iter_thread = 0.0; 296 unsigned long long total_water_loss_thread = 0; 297 298 if (is_in_bounds(cell_index)) { 299 int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2); 300 301 int cell_water_level_delta_int = 0; 302 303 float cell_height = get_shared(height_shared, cell_index_shared); 304 305 inner_kernel<true>( 306 cell_index, 307 cell_index_shared, 308 height_shared, 309 ground_shared, 310 water_level_read_shared, 311 cell_height, 312 &max_spillage_iter_thread, 313 &cell_water_level_delta_int, 314 &total_water_loss_thread 315 ); 316 317 static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}}; 318 319 for (int offset_index = 0; offset_index < 4; offset_index += 1) { 320 int2 offset = offsets[offset_index]; 321 if (is_in_bounds(cell_index + offset)) { 322 inner_kernel<false>( 323 cell_index + offset, 324 cell_index_shared + offset, 325 height_shared, 326 ground_shared, 327 water_level_read_shared, 328 cell_height, 329 &max_spillage_iter_thread, 330 &cell_water_level_delta_int, 331 &total_water_loss_thread 332 ); 333 } 334 } 335 336 get_global(water_level_write, cell_index) = 337 get_global(water_level_read, cell_index) + cell_water_level_delta_int; 338 } 339 340 if (total_water_loss_thread > 0) { 341 atomicAdd(&stats->total_water_loss, total_water_loss_thread); 342 } 343 344 float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread); 345 346 int warp_lane_index = thread_index % warpSize; 347 if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) { 348 atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp); 349 } 350} 351 352__global__ void calculate_total(Statistics *stats, int *water_level) { 353 int row_count = sd.y_count, column_count = sd.x_count; 354 355 int x_index = blockIdx.x * blockDim.x + threadIdx.x; 356 int y_index = blockIdx.y * blockDim.y + threadIdx.y; 357 int2 index = make_int2(x_index, y_index); 358 359 auto thread_index = threadIdx.x + threadIdx.y * blockDim.x; 360 361 __shared__ double max_water_scenario_local; 362 __shared__ unsigned long long total_water_local; 363 364 if (thread_index == 0) { 365 max_water_scenario_local = 0.0; 366 total_water_local = 0; 367 } 368 369 __syncthreads(); 370 371 if (y_index >= row_count || x_index >= column_count) return; 372 373 atomicAdd(&total_water_local, get_global(water_level, index)); 374 atomicMaxDouble(&max_water_scenario_local, FLOATING(get_global(water_level, index))); 375 376 __syncthreads(); 377 378 if (thread_index == 0) { 379 atomicAdd(&stats->total_water, total_water_local); 380 atomicMaxDouble(&stats->max_water_scenario, max_water_scenario_local); 381 } 382} 383 384extern "C" void do_compute(struct parameters *p, struct results *r) { 385 /* 2. Start global timer */ 386 CCF(cudaSetDevice(0)); 387 388 /* Memory allocation */ 389 size_t cell_count = (size_t)p->rows * (size_t)p->columns; 390 391 size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds; 392 size_t ground_size = sizeof(float) * cell_count; 393 size_t water_level_size = sizeof(int) * cell_count; 394 395 StaticData sd_host = { 396 .x_count = p->columns, 397 .y_count = p->rows, 398 .ex_factor = p->ex_factor, 399 .cloud_count = p->num_clouds, 400 }; 401 402 Statistics stats_host = { 403 .max_spillage_iter = DBL_MAX, 404 .max_spillage_minute = 0, 405 .max_spillage_scenario = 0.0, 406 .max_water_scenario = 0.0, 407 .minute = 0, 408 .total_rain = 0, 409 .total_water = 0, 410 .total_water_loss = 0, 411 }; 412 413 Cloud_t *clouds; 414 CCF(cudaMalloc(&clouds, clouds_size)); 415 CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice)); 416 417#ifdef DEBUG 418 Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size); 419 if (clouds_debug == NULL) { 420 printf("Failed to allocate clouds_debug\n"); 421 exit(EXIT_FAILURE); 422 } 423#endif 424 425 float *ground; 426 CCF(cudaMalloc(&ground, ground_size)); 427 CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice)); 428 429 CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice)); 430 431 struct statistics *stats_device; 432 CCF(cudaMalloc(&stats_device, sizeof(Statistics))); 433 CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice)); 434 435 int *water_level_read; 436 CCF(cudaMalloc(&water_level_read, water_level_size)); 437 CCF(cudaMemset(water_level_read, 0, water_level_size)); 438 439 int *water_level_write; 440 CCF(cudaMalloc(&water_level_write, water_level_size)); 441 CCF(cudaMemset(water_level_write, 0, water_level_size)); 442 443#ifdef DEBUG 444 int *water_level_debug = (int *)malloc(water_level_size); 445 if (water_level_debug == NULL) { 446 printf("Failed to allocate water_level_debug\n"); 447 exit(EXIT_FAILURE); 448 } 449#endif 450 451 /* CUDA grid. */ 452 dim3 block(32, 8); 453 auto columns_ceil = (p->columns + block.x - 1) / block.x; 454 auto rows_ceil = (p->rows + block.y - 1) / block.y; 455 456 dim3 grid(columns_ceil, rows_ceil); 457 458#ifdef DEBUG 459 print_matrix(PRECISION_FLOAT, rows, columns, ground, "Ground heights"); 460#ifndef ANIMATION 461 print_clouds(p->num_clouds, p->clouds); 462#endif 463#endif 464 465 /* Prepare to measure runtime */ 466 r->runtime = get_time(); 467 468 /* Flood simulation */ 469 while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) { 470 step1<<<p->num_clouds, 256>>>(stats_device, clouds, water_level_read); 471 CCK(); 472 473#ifdef DEBUG 474#ifndef ANIMATION 475 CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost)); 476 477 print_clouds(p->num_clouds, clouds_debug); 478#endif 479#endif 480 481#ifdef DEBUG 482 CCF(cudaMemcpy( 483 water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost 484 )); 485 486 print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain"); 487#endif 488 489 step2_3<<<grid, block>>>(stats_device, water_level_read, water_level_write, ground); 490 CCK(); 491 492#ifdef DEBUG 493#ifndef ANIMATION 494 CCF(cudaMemcpy( 495 water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost 496 )); 497 498 print_matrix( 499 PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage" 500 ); 501#endif 502#endif 503 504 next_iteration<<<1, 1>>>(stats_device); 505 CCK(); 506 507 // Copies minute and max_spillage_iter as these are required for the loop. 508 // Also implicitly synchronises the CPU and GPU. 509 CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost)); 510 511 std::swap(water_level_read, water_level_write); 512 } // End of simulation. 513 514 r->runtime = get_time() - r->runtime; 515 516 /* Statistics: Total remaining water and maximum amount of water in a cell */ 517 calculate_total<<<grid, block>>>(stats_device, water_level_read); 518 CCK(); 519 520 // Copies max_water_scenario and total_water. 521 // Also implicitly synchronises the CPU and GPU. 522 CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost)); 523 524 if (p->final_matrix) { 525 print_matrix( 526 PRECISION_FIXED, p->rows, p->columns, water_level_read, "Water after spillage" 527 ); 528 } 529 530 r->max_water_scenario = stats_host.max_water_scenario; 531 r->total_water = stats_host.total_water; 532 533 r->minute = stats_host.minute; 534 r->total_rain = stats_host.total_rain; 535 r->total_water_loss = stats_host.total_water_loss; 536 537 r->max_spillage_minute = stats_host.max_spillage_minute; 538 r->max_spillage_scenario = stats_host.max_spillage_scenario; 539 540 /* Free resources */ 541 CCF(cudaFree(clouds)); 542 CCF(cudaFree(ground)); 543 CCF(cudaFree(stats_device)); 544 CCF(cudaFree(water_level_read)); 545 CCF(cudaFree(water_level_write)); 546 547#ifdef DEBUG 548 free(clouds_debug); 549 free(water_level_debug); 550#endif 551}