#include #include #include #include #include #include "flood.h" /* Example of macros for error checking in CUDA */ #define CCF(call) \ { \ cudaError_t check = call; \ if (check != cudaSuccess) \ fprintf(stderr, "CUDA Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check)); \ } #define CCK() \ { \ cudaError_t check = cudaGetLastError(); \ if (check != cudaSuccess) \ fprintf( \ stderr, "CUDA Kernel Error in line: %d, %s\n", __LINE__, cudaGetErrorString(check) \ ); \ } #define get_global(buffer, index) buffer[index.y * sd.x_count + index.x] #define get_shared(buffer, index) buffer[index.y * SHARED_X_COUNT + index.x] #define COORD_SCEN2MAT_X(x) (x * sd.x_count / SCENARIO_SIZE) #define COORD_SCEN2MAT_Y(y) (y * sd.y_count / SCENARIO_SIZE) #define COORD_MAT2SCEN_X(x) (x * SCENARIO_SIZE / sd.x_count) #define COORD_MAT2SCEN_Y(y) (y * SCENARIO_SIZE / sd.y_count) typedef struct static_data { int x_count; int y_count; float ex_factor; int cloud_count; } StaticData; typedef struct statistics { double max_spillage_iter; int max_spillage_minute; double max_spillage_scenario; double max_water_scenario; int minute; unsigned long long total_rain; unsigned long long total_water; unsigned long long total_water_loss; } Statistics; __constant__ StaticData sd; __device__ __forceinline__ double atomicMaxDouble(double *address, double value) { return __longlong_as_double( atomicMax((unsigned long long *)address, __double_as_longlong(value)) ); } __device__ __forceinline__ float warpReduceMax(float value) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) value = max(value, __shfl_down_sync(0xffffffff, value, offset)); return value; } __device__ __forceinline__ unsigned long long warpReduceSum(unsigned long long value) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) value = value + __shfl_down_sync(0xffffffff, value, offset); return value; } __device__ __forceinline__ bool is_in_bounds(int2 index) { return index.x >= 0 && index.x < sd.x_count && index.y >= 0 && index.y < sd.y_count; } __device__ __forceinline__ int2 operator+(int2 lhs, int2 rhs) { return make_int2(lhs.x + rhs.x, lhs.y + rhs.y); } __global__ void next_iteration(Statistics *stats) { if (stats->max_spillage_iter > stats->max_spillage_scenario) { stats->max_spillage_scenario = stats->max_spillage_iter; stats->max_spillage_minute = stats->minute; } stats->minute += 1; } __global__ void step1(Statistics *stats, Cloud_t *clouds, int *water_level) { int cloud_index = blockIdx.x; int thread_index = threadIdx.x; unsigned long long total_rain_thread = 0; __shared__ Cloud_t cloud; if (thread_index == 0) { // Bit ugly here but it works! stats->max_spillage_iter = 0; /* Step 1.1: Clouds movement */ cloud = clouds[cloud_index]; float km_minute = cloud.speed / 60; cloud.x += km_minute * cos(cloud.angle * M_PI / 180.0); cloud.y += km_minute * sin(cloud.angle * M_PI / 180.0); clouds[cloud_index] = cloud; } __syncthreads(); // Compute the bounding box area of the cloud float lhs_x_index = COORD_SCEN2MAT_X(max(0.0, cloud.x - cloud.radius)); float rhs_x_index = COORD_SCEN2MAT_X(min(cloud.x + cloud.radius, (float)SCENARIO_SIZE)); float lhs_y_index = COORD_SCEN2MAT_Y(max(0.0, cloud.y - cloud.radius)); float rhs_y_index = COORD_SCEN2MAT_Y(min(cloud.y + cloud.radius, (float)SCENARIO_SIZE)); int x_count = max(0, (int)ceilf(rhs_x_index - lhs_x_index)); int y_count = max(0, (int)ceilf(rhs_y_index - lhs_y_index)); int cell_count = x_count * y_count; for (int i = thread_index; i < cell_count; i += blockDim.x) { int x_offset_count = i % x_count; int y_offset_count = i / x_count; float float_x_index = lhs_x_index + x_offset_count; float float_y_index = lhs_y_index + y_offset_count; int2 int_index = make_int2((int)float_x_index, (int)float_y_index); if (!is_in_bounds(int_index)) continue; float dx = COORD_MAT2SCEN_X(float_x_index) - cloud.x; float dy = COORD_MAT2SCEN_Y(float_y_index) - cloud.y; float distance = sqrt(dx * dx + dy * dy); if (distance >= cloud.radius) continue; float rain = sd.ex_factor * max(0.0, cloud.intensity - distance / cloud.radius * sqrt(cloud.intensity)); float meters_per_minute = rain / 1000 / 60; // Making this conditional on >0 doesn't seem to do anything. atomicAdd(&get_global(water_level, int_index), FIXED(meters_per_minute)); total_rain_thread += FIXED(meters_per_minute); } unsigned long long total_rain_warp = warpReduceSum(total_rain_thread); int warp_lane_index = thread_index % warpSize; if (warp_lane_index == 0 && total_rain_warp > 0) { atomicAdd(&stats->total_rain, total_rain_warp); } } // 2 halo cells on both sides. constexpr int SHARED_X_COUNT = 32 + 4; constexpr int SHARED_Y_COUNT = 8 + 4; constexpr int SHARED_COUNT = SHARED_X_COUNT * SHARED_Y_COUNT; template __device__ __forceinline__ void inner_kernel( int2 lhs_index, int2 lhs_index_shared, float *height_shared, float *ground_shared, float *water_level_read_shared, float cell_height, float *max_spillage_iter_thread, int *cell_water_level_delta_int, unsigned long long *total_water_loss_thread ) { // int2 lhs_index = cell_index + offsets[outer_index]; // if (!is_in_bounds(lhs_index)) continue; // int2 lhs_index_shared = cell_index_shared + offsets[outer_index]; float lhs_water_level = get_shared(water_level_read_shared, lhs_index_shared); if (lhs_water_level <= 0.0) return; float height_difference_sum = 0; float height_difference_max = 0; float height_difference_oob = 0; float lhs_height = get_shared(height_shared, lhs_index_shared); float lhs_ground = get_shared(ground_shared, lhs_index_shared); static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}}; #pragma unroll for (int inner_index = 0; inner_index < 4; inner_index += 1) { int2 rhs_index = lhs_index + offsets[inner_index]; int2 rhs_index_shared = lhs_index_shared + offsets[inner_index]; float rhs_height; if (is_in_bounds(rhs_index)) { rhs_height = get_shared(height_shared, rhs_index_shared); } else { rhs_height = lhs_ground; } if (rhs_height > lhs_height) continue; // Compute level differences float height_difference = lhs_height - rhs_height; height_difference_sum += height_difference; height_difference_max = max(height_difference_max, height_difference); if (!is_in_bounds(rhs_index)) { height_difference_oob += height_difference; } } float lhs_spillage_level = min(lhs_water_level, height_difference_max); if (height_difference_sum <= 0.0) return; float proportion = lhs_spillage_level / height_difference_sum; if (proportion <= 1e-8) return; // Identical logic to: flood_seq.c:167 and flood_seq.c:218. if constexpr (is_cell) { int outgoing = FIXED(lhs_spillage_level / SPILLAGE_FACTOR); *cell_water_level_delta_int -= outgoing; *total_water_loss_thread += FIXED(proportion * height_difference_oob / 2); *max_spillage_iter_thread = max(*max_spillage_iter_thread, lhs_spillage_level / SPILLAGE_FACTOR); return; } if (lhs_height < cell_height) return; // Identical logic to: flood_seq.c:196, flood_seq:199 and flood_seq:235. int incoming = FIXED(proportion * (lhs_height - cell_height) / SPILLAGE_FACTOR); *cell_water_level_delta_int += incoming; } __global__ void step2_3( Statistics *stats, int *water_level_read, int *water_level_write, float *ground ) { __shared__ float ground_shared[SHARED_COUNT]; __shared__ float height_shared[SHARED_COUNT]; __shared__ float water_level_read_shared[SHARED_COUNT]; int thread_index = blockDim.x * threadIdx.y + threadIdx.x; int thread_count = blockDim.x * blockDim.y; int block_count = SHARED_X_COUNT * SHARED_Y_COUNT; for (int i = thread_index; i < block_count; i += thread_count) { int2 shared_index = make_int2(i % SHARED_X_COUNT, i / SHARED_X_COUNT); int global_x_index = blockIdx.x * blockDim.x + shared_index.x - 2; int global_y_index = blockIdx.y * blockDim.y + shared_index.y - 2; int2 global_index = make_int2(global_x_index, global_y_index); if (is_in_bounds(global_index)) { float ground_value = get_global(ground, global_index); get_shared(ground_shared, shared_index) = ground_value; int water_level_int_value = get_global(water_level_read, global_index); float water_level_float_value = FLOATING(water_level_int_value); get_shared(water_level_read_shared, shared_index) = water_level_float_value; float height_value = ground_value + water_level_float_value; get_shared(height_shared, shared_index) = height_value; } else { get_shared(ground_shared, shared_index) = 0.0; get_shared(height_shared, shared_index) = 0.0; get_shared(water_level_read_shared, shared_index) = 0.0; } } __syncthreads(); int cell_x_index = blockIdx.x * blockDim.x + threadIdx.x; int cell_y_index = blockIdx.y * blockDim.y + threadIdx.y; int2 cell_index = make_int2(cell_x_index, cell_y_index); float max_spillage_iter_thread = 0.0; unsigned long long total_water_loss_thread = 0; if (is_in_bounds(cell_index)) { int2 cell_index_shared = make_int2(threadIdx.x + 2, threadIdx.y + 2); int cell_water_level_delta_int = 0; float cell_height = get_shared(height_shared, cell_index_shared); inner_kernel( cell_index, cell_index_shared, height_shared, ground_shared, water_level_read_shared, cell_height, &max_spillage_iter_thread, &cell_water_level_delta_int, &total_water_loss_thread ); static int2 offsets[4] = {{-1, 0}, {+1, 0}, {0, -1}, {0, +1}}; for (int offset_index = 0; offset_index < 4; offset_index += 1) { int2 offset = offsets[offset_index]; if (is_in_bounds(cell_index + offset)) { inner_kernel( cell_index + offset, cell_index_shared + offset, height_shared, ground_shared, water_level_read_shared, cell_height, &max_spillage_iter_thread, &cell_water_level_delta_int, &total_water_loss_thread ); } } get_global(water_level_write, cell_index) = get_global(water_level_read, cell_index) + cell_water_level_delta_int; } if (total_water_loss_thread > 0) { atomicAdd(&stats->total_water_loss, total_water_loss_thread); } float max_spillage_iter_warp = warpReduceMax(max_spillage_iter_thread); int warp_lane_index = thread_index % warpSize; if (warp_lane_index == 0 && max_spillage_iter_warp > 0.0) { atomicMaxDouble(&stats->max_spillage_iter, max_spillage_iter_warp); } } __global__ void calculate_total(Statistics *stats, int *water_level) { int row_count = sd.y_count, column_count = sd.x_count; int x_index = blockIdx.x * blockDim.x + threadIdx.x; int y_index = blockIdx.y * blockDim.y + threadIdx.y; int2 index = make_int2(x_index, y_index); auto thread_index = threadIdx.x + threadIdx.y * blockDim.x; __shared__ double max_water_scenario_local; __shared__ unsigned long long total_water_local; if (thread_index == 0) { max_water_scenario_local = 0.0; total_water_local = 0; } __syncthreads(); if (y_index >= row_count || x_index >= column_count) return; atomicAdd(&total_water_local, get_global(water_level, index)); atomicMaxDouble(&max_water_scenario_local, FLOATING(get_global(water_level, index))); __syncthreads(); if (thread_index == 0) { atomicAdd(&stats->total_water, total_water_local); atomicMaxDouble(&stats->max_water_scenario, max_water_scenario_local); } } extern "C" void do_compute(struct parameters *p, struct results *r) { /* 2. Start global timer */ CCF(cudaSetDevice(0)); /* Memory allocation */ size_t cell_count = (size_t)p->rows * (size_t)p->columns; size_t clouds_size = sizeof(Cloud_t) * (size_t)p->num_clouds; size_t ground_size = sizeof(float) * cell_count; size_t water_level_size = sizeof(int) * cell_count; StaticData sd_host = { .x_count = p->columns, .y_count = p->rows, .ex_factor = p->ex_factor, .cloud_count = p->num_clouds, }; Statistics stats_host = { .max_spillage_iter = DBL_MAX, .max_spillage_minute = 0, .max_spillage_scenario = 0.0, .max_water_scenario = 0.0, .minute = 0, .total_rain = 0, .total_water = 0, .total_water_loss = 0, }; Cloud_t *clouds; CCF(cudaMalloc(&clouds, clouds_size)); CCF(cudaMemcpy(clouds, p->clouds, clouds_size, cudaMemcpyHostToDevice)); #ifdef DEBUG Cloud_t *clouds_debug = (Cloud_t *)malloc(clouds_size); if (clouds_debug == NULL) { printf("Failed to allocate clouds_debug\n"); exit(EXIT_FAILURE); } #endif float *ground; CCF(cudaMalloc(&ground, ground_size)); CCF(cudaMemcpy(ground, p->ground, ground_size, cudaMemcpyHostToDevice)); CCF(cudaMemcpyToSymbol(sd, &sd_host, sizeof(StaticData), 0, cudaMemcpyHostToDevice)); struct statistics *stats_device; CCF(cudaMalloc(&stats_device, sizeof(Statistics))); CCF(cudaMemcpy(stats_device, &stats_host, sizeof(Statistics), cudaMemcpyHostToDevice)); int *water_level_read; CCF(cudaMalloc(&water_level_read, water_level_size)); CCF(cudaMemset(water_level_read, 0, water_level_size)); int *water_level_write; CCF(cudaMalloc(&water_level_write, water_level_size)); CCF(cudaMemset(water_level_write, 0, water_level_size)); #ifdef DEBUG int *water_level_debug = (int *)malloc(water_level_size); if (water_level_debug == NULL) { printf("Failed to allocate water_level_debug\n"); exit(EXIT_FAILURE); } #endif /* CUDA grid. */ dim3 block(32, 8); auto columns_ceil = (p->columns + block.x - 1) / block.x; auto rows_ceil = (p->rows + block.y - 1) / block.y; dim3 grid(columns_ceil, rows_ceil); #ifdef DEBUG print_matrix(PRECISION_FLOAT, rows, columns, ground, "Ground heights"); #ifndef ANIMATION print_clouds(p->num_clouds, p->clouds); #endif #endif /* Prepare to measure runtime */ r->runtime = get_time(); /* Flood simulation */ while (stats_host.minute < p->num_minutes && stats_host.max_spillage_iter > p->threshold) { step1<<num_clouds, 256>>>(stats_device, clouds, water_level_read); CCK(); #ifdef DEBUG #ifndef ANIMATION CCF(cudaMemcpy(clouds_debug, clouds, clouds_size, cudaMemcpyDeviceToHost)); print_clouds(p->num_clouds, clouds_debug); #endif #endif #ifdef DEBUG CCF(cudaMemcpy( water_level_debug, water_level_read, water_level_size, cudaMemcpyDeviceToHost )); print_matrix(PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after rain"); #endif step2_3<<>>(stats_device, water_level_read, water_level_write, ground); CCK(); #ifdef DEBUG #ifndef ANIMATION CCF(cudaMemcpy( water_level_debug, water_level_write, water_level_size, cudaMemcpyDeviceToHost )); print_matrix( PRECISION_FIXED, p->rows, p->columns, water_level_debug, "Water after spillage" ); #endif #endif next_iteration<<<1, 1>>>(stats_device); CCK(); // Copies minute and max_spillage_iter as these are required for the loop. // Also implicitly synchronises the CPU and GPU. CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost)); std::swap(water_level_read, water_level_write); } // End of simulation. r->runtime = get_time() - r->runtime; /* Statistics: Total remaining water and maximum amount of water in a cell */ calculate_total<<>>(stats_device, water_level_read); CCK(); // Copies max_water_scenario and total_water. // Also implicitly synchronises the CPU and GPU. CCF(cudaMemcpy(&stats_host, stats_device, sizeof(Statistics), cudaMemcpyDeviceToHost)); if (p->final_matrix) { print_matrix( PRECISION_FIXED, p->rows, p->columns, water_level_read, "Water after spillage" ); } r->max_water_scenario = stats_host.max_water_scenario; r->total_water = stats_host.total_water; r->minute = stats_host.minute; r->total_rain = stats_host.total_rain; r->total_water_loss = stats_host.total_water_loss; r->max_spillage_minute = stats_host.max_spillage_minute; r->max_spillage_scenario = stats_host.max_spillage_scenario; /* Free resources */ CCF(cudaFree(clouds)); CCF(cudaFree(ground)); CCF(cudaFree(stats_device)); CCF(cudaFree(water_level_read)); CCF(cudaFree(water_level_write)); #ifdef DEBUG free(clouds_debug); free(water_level_debug); #endif }