/* * segmentation.cu * * Created on: Oct 30, 2019 * Author: lutton */ #include #include #include // clock() and clock_t #include "segmentation.h" /** normalized 5-point smoothing filter, using binomial approximation to Gaussian * d_img input 3D image * d_convolved output 3D image * step direction of smoothing (1,C, or R * C) * R number of rows in image * C number of columns in image * N size of image */ __global__ void img_smooth_5( float *d_img, float *d_convolved, int *d_NR, int *d_NL, int *d_NR2, int *d_NL2, int R, int C, int edge_offset ){ int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; int e = p + edge_offset; d_convolved[p] = (d_img[d_NR2[e]] + 4 * d_img[d_NR[e]] + 6 * d_img[p] + 4 * d_img[d_NL[e]] + d_img[d_NL2[e]]) * 0.0625; } /** two point gradient with variable radius * d_img input 3D image * d_convolved output 3D image * step product of direction (1,R, or R*C) and step size * R number or rows in image * C number of columns in image * N size of image */ __global__ void img_diff( float *d_img, float *d_convolved, int *d_NRs, int *d_NLs, int R, int C, int edge_offset ){ int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; int e = p + edge_offset; d_convolved[p] = (d_img[d_NRs[e]] - d_img[d_NLs[e]]); } /** normalize gradient and store normal values * gradient directions have a minimum size to avoid introducing normals where no gradient exists. * d_norm_gxx gradient in x to be normalized * d_norm_gyy gradient in y to be normalized * d_norm_gzz gradient in z to be normalized * d_norm gradient size * R number of rows * C number of columns */ /* __global__ void normalize_grad( float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz, float t_norm, int R, int C ){ int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; float d_norm = norm3df(d_norm_gxx[p], d_norm_gyy[p], d_norm_gzz[p]); if(d_norm > t_norm){ d_norm_gxx[p] = d_norm_gxx[p] / d_norm; d_norm_gyy[p] = d_norm_gyy[p] / d_norm; d_norm_gzz[p] = d_norm_gzz[p] / d_norm; } else { d_norm_gxx[p] = 0.0; d_norm_gyy[p] = 0.0; d_norm_gzz[p] = 0.0; } } */ __global__ void normalize_grad( float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz, float *d_norm, int R, int C ){ int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; d_norm_gxx[p] = (0.5 + copysignf(0.5,fabsf(d_norm_gxx[p]) - 0.001)) * d_norm_gxx[p]; d_norm_gyy[p] = (0.5 + copysignf(0.5,fabsf(d_norm_gyy[p]) - 0.001)) * d_norm_gyy[p]; d_norm_gzz[p] = (0.5 + copysignf(0.5,fabsf(d_norm_gzz[p]) - 0.001)) * d_norm_gzz[p]; d_norm[p] = fmaxf(norm3df(d_norm_gxx[p], d_norm_gyy[p], d_norm_gzz[p]), 0.000001); d_norm_gxx[p] = d_norm_gxx[p] / d_norm[p]; d_norm_gyy[p] = d_norm_gyy[p] / d_norm[p]; d_norm_gzz[p] = d_norm_gzz[p] / d_norm[p]; } /** set image to zero * * d_img image to be set to zero * R number of rows * C number of columns */ __global__ void zero_image( float *d_img, int R, int C){ int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; d_img[p] = 0; } /** multiply all image values by a given scale factor * * d_img image to be scaled * scale scale factor (multiplier * R number of rows * C number of columns */ __global__ void scale_image( float *d_img, float scale, int R, int C){ int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; d_img[p] = scale * d_img[p]; } /** compute the mean curvature given normals, gradient size, and foreground probabilities * mean curvature is calculated using the divergence formula, with a threshold on the gradient * size to exclude areas where there is no change in probabilities (gradient effectively zero) * * d_norm_gxx normal in the x-direction * d_norm_gyy normal in the y-direction * d_norm_gzz normal in the z-direction * d_norm gradient magnitude * d_v foreground probability * d_curv mean curvature * t_norm gradient threshold * R number of rows * C number of columns */ /* __global__ void compute_curvature( float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz, float *d_v, float *d_curv, int R, int C ) { int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; float curv = -d_norm_gxx[p] - d_norm_gyy[p] - d_norm_gzz[p]; d_curv[p] = (d_v[p] < 0.5) ? fminf(curv, 0) : fmaxf(curv, 0); } */ __global__ void compute_curvature( float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz, float *d_norm, float *d_v, float *d_curv, float t_norm, int R, int C ) { int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; float curv = -d_norm_gxx[p] - d_norm_gyy[p] - d_norm_gzz[p]; d_curv[p] = (d_norm[p] > t_norm) ? ((d_v[p] < 0.5) ? fminf(curv, 0) : fmaxf(curv, 0)) : 0; } // TODO: finish this comment /** compute the derivative of the foreground probabilities (d_v) * differences are taken for each edge of each node in each direction * for a given direction, edges pointing forward are referred to as 'right' edges, * while edges pointing backward are referred to as 'left' edges * these differences are subsequently summed to give the weighted second derivative * * d_v foreground probabilities * d_dvdtL derivative on the 'left' * d_dvdtR derivative on the 'right' * d_NR node to the 'right' * d_w2 weight of edge */ __global__ void compute_dvdt( float *d_v, float *d_dvdtL, float *d_dvdtR, int *d_NR, float *d_w2, float Ddt, int R, int C, int edge_offset ) { int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int pL = k * R * C + j * C + i; int e = pL + edge_offset; int pR = d_NR[e]; float diff = (d_v[pR] - d_v[pL]) * d_w2[e]; d_dvdtL[pL] += diff; d_dvdtR[pR] -= diff; } /** sum the derivatives and curvature term * * d_v foreground probabilities * d_dvdtL differences from 'left' edges * d_dvdtR differences from 'right' edges * d_curv mean curvature * Ddt time discretization * kappa curvature coefficient * R number of rows * C number of columns */ __global__ void kernel_add( float *d_v, float *d_dvdtL, float *d_dvdtR, float *d_seed_indicator, float *d_curv, float Ddt, float kappa, int R, int C ) { int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = k * R * C + j * C + i; if(d_seed_indicator[p] > 0.5){ d_v[p] += Ddt * (d_dvdtL[p] + d_dvdtR[p] + kappa * d_curv[p] * d_v[p] * (1.0 - d_v[p])); } } // TODO: add full comment on this kernel //compute relative error __global__ void RE( float *d_v, float *d_dcdtL, float *d_dcdtR, float *d_seed_indicator, float *d_curv, float *d_RE, float Ddt, float kappa, int R, int C, int num_blocks_x, int num_blocks_y, int num_blocks_z, int num_threads_x, int num_threads_y, int num_threads_z ) { int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = i + j * num_threads_x + k * num_threads_x * num_threads_y; int idx = i + j * C + k * R * C; int b_x; int b_y; int b_z; int idx_n; __shared__ float local_RE; d_RE[p] = 0; // zero the local sum if (p == 0) local_RE = 0; __syncthreads(); // make sure all threads are caught up //RE calc for (b_x = 0; b_x < num_blocks_x; ++b_x) for (b_y = 0; b_y < num_blocks_y; ++b_y) for (b_z = 0; b_z < num_blocks_z; ++b_z){ idx_n = idx + b_x * num_threads_x + b_y * num_threads_y * C + b_z * num_threads_z * R * C; if (d_v[idx_n] > 0 && d_seed_indicator[idx_n] > 0.5) { d_RE[p] += Ddt *fabsf(d_dcdtL[idx_n] + d_dcdtR[idx_n] + kappa * d_curv[idx_n] * d_v[idx_n] * (1.0 - d_v[idx_n])) / d_v[idx_n]; } } __syncthreads(); atomicAdd(&local_RE, d_RE[p]); // Make sure all threads in the block have caught up __syncthreads(); // One of the threads writes to the device array d_RE[p] = local_RE; } // TODO: add full comment on this kernel //compute number of nodes with c=0, iterating over all thread numbers per block, summing over all blocks __global__ void kernel_Nz( float *d_v, float *d_seed_indicator, int *d_Nz, int R, int C, int num_blocks_x, int num_blocks_y, int num_blocks_z, int num_threads_x, int num_threads_y, int num_threads_z ) { int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; int p = i + j * num_threads_x + k * num_threads_y * num_threads_x; int idx = i + j * C + k * R * C; int b_x; int b_y; int b_z; int idx_n; __shared__ int local_Nz; d_Nz[p] = 0; // zero the local sum if (p == 0) local_Nz = 0; __syncthreads(); // make sure all threads are caught up // should be replaced by thrust routines ... for (b_x = 0; b_x < num_blocks_x; ++b_x) for (b_y = 0; b_y < num_blocks_y; ++b_y) for (b_z = 0; b_z < num_blocks_z; ++b_z){ idx_n = idx + b_x * num_threads_x + b_y * num_threads_y * C + b_z * num_threads_z * R * C; if (d_v[idx_n] == 0 || d_seed_indicator[idx_n] < 0.5) d_Nz[p] += 1; } __syncthreads(); atomicAdd(&local_Nz, d_Nz[p]); //Make sure all threads in the block have caught up __syncthreads(); d_Nz[p] = local_Nz; } int run_segmentation( float *h_v, float *h_w2, int *h_NR, int *h_NL, int *h_NR2, int *h_NL2, int *h_NRs, int *h_NLs, float *h_seed_indicator, struct segmentation_parameters *params, struct image_dimensions *dims ){ float *d_v; // diffusive variable float *d_dvdtL; //sum of differences between a node and a node on it's "left" float *d_dvdtR; //sum of differences between a node and a node on it's "right" float *d_smooth_2; // v smoothed along 2 axes float *d_smooth; // v smoothed along 1 axis float *d_norm_gxx; //x-derivative of the x-component of the unit vector in the direction grad(c) float *d_norm_gyy; //y-derivative of the y-component of the unit vector in the direction grad(c) float *d_norm_gzz; //z-derivative of the z-component of the unit vector in the direction grad(c) float *d_curv; // curvature, computed implicitly using the divergence formulation float *d_norm; // norm of grad(c), used to filter out fluctuations in small gradients float *d_w2; // edge weights int *d_NR; // array of "right neighbour" node indices, used for diffusion and curvature int *d_NL; // array of "left neighbour" node indices, used for curvature only int *d_NR2; // array of "right second neighbour" node indices, used for curvature only int *d_NL2; // array of "left second neighbour" node indices, used for curvature only int *d_NRs; // array of "right th neighbour" node indices, used for curvature only int *d_NLs; // array of "left th neighbour" node indices, used for curvature only int *d_Nz; // number of non-zero nodes in the image float *d_RE; // relative error; change in v from the previous time point as a proportion of v float *d_seed_indicator; // zero at seeds to prevent them from changing int *h_Nz; // host copy of d_Nz float *h_RE; // host copy of d_RE // get image dimensions int R = dims->R; int C = dims->C; int N = dims->N; int E = dims->E; int edge_direction_count = dims->edge_direction_count; int num_blocks_x = dims->num_blocks_x; int num_blocks_y = dims->num_blocks_y; int num_blocks_z = dims->num_blocks_z; int threads_per_block_x = dims->threads_per_block_x; int threads_per_block_y = dims->threads_per_block_y; int threads_per_block_z = dims->threads_per_block_z; // segmentation parameters int T = params->T; float Ddt = params->Ddt; int grad_step = params->grad_step; float kappa = params->kappa; int curv_interval = params->curv_interval; int curv_time_start = params->curv_time_start; int test_time_start = params->test_time_start; int test_interval = params->test_interval; float re_stop = params->re_stop; float t_norm = params->t_norm; clock_t t1, t2; setup_GPU(); // allocate memory on device cudaMalloc(&d_v, N * sizeof(float)); cudaMalloc(&d_smooth_2, N * sizeof(float)); cudaMalloc(&d_smooth, N * sizeof(float)); cudaMalloc(&d_dvdtL, N * sizeof(float)); cudaMalloc(&d_dvdtR, N * sizeof(float)); cudaMalloc(&d_norm_gxx, N * sizeof(float)); cudaMalloc(&d_norm_gyy, N * sizeof(float)); cudaMalloc(&d_norm_gzz, N * sizeof(float)); cudaMalloc(&d_w2, E * sizeof(float)); cudaMalloc(&d_NR, E * sizeof(int)); cudaMalloc(&d_NL, 3 * N * sizeof(int)); cudaMalloc(&d_NR2, 3 * N * sizeof(int)); cudaMalloc(&d_NL2, 3 * N * sizeof(int)); cudaMalloc(&d_NRs, 3 * N * sizeof(int)); cudaMalloc(&d_NLs, 3 * N * sizeof(int)); cudaMalloc(&d_Nz, params->max_threads_per_block * sizeof(int)); cudaMalloc(&d_RE, params->max_threads_per_block * sizeof(float)); cudaMalloc(&d_seed_indicator, N * sizeof(float)); cudaMalloc(&d_curv, N * sizeof(float)); cudaMalloc(&d_norm, N * sizeof(float)); h_RE = (float *) malloc(params->max_threads_per_block * sizeof(float)); h_Nz = (int *) malloc(params->max_threads_per_block * sizeof(int)); //copy host arrays to device cudaMemcpy(d_v, h_v, N * sizeof(float), cudaMemcpyHostToDevice); // only right node used for diffusion component, so requires values for all edges cudaMemcpy(d_NR, h_NR, E * sizeof(int), cudaMemcpyHostToDevice); // neighbors for curvature, along with first 3N values of NR cudaMemcpy(d_NL, h_NL, 3 * N * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_NR2, h_NR2, 3 * N * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_NL2, h_NL2, 3 * N * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_NRs, h_NRs, 3 * N * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_NLs, h_NLs, 3 * N * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_seed_indicator, h_seed_indicator, N * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_w2, h_w2, E * sizeof(float), cudaMemcpyHostToDevice); // main block sizes const dim3 num_blocks(num_blocks_x, num_blocks_y, num_blocks_z); const dim3 num_threads(threads_per_block_x, threads_per_block_y, threads_per_block_z); // for computing relative error const dim3 re_blocks(1, 1, 1); zero_image<<>>(d_curv, R, C); int curv_running = false; t1 = clock(); for (int t = 0; t < T; t++) { if(t > curv_time_start) curv_running = true; zero_image<<>>(d_dvdtL, R, C); zero_image<<>>(d_dvdtR, R, C); for(int i = 0; i < edge_direction_count; i++){ // compute along all directions compute_dvdt<<>>(d_v, d_dvdtL, d_dvdtR, d_NR, d_w2, Ddt, R, C, i * N); } // average over all directions kernel_add<<>>(d_v, d_dvdtL, d_dvdtR, d_seed_indicator, d_curv, Ddt, kappa, R, C); // compute curvature after initial diffusion at regular intervals for non-zero coefficient if(curv_running && t % curv_interval == 0 && kappa > 0.0001){ //smooth perpendicular to x img_smooth_5<<>>(d_v, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, N); img_smooth_5<<>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N); // compute x-component of the normal vector img_diff<<>>(d_smooth_2, d_norm_gxx, d_NRs, d_NLs, R, C, 0); //smooth perpendicular to y img_smooth_5<<>>(d_v, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, 0); img_smooth_5<<>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N); // compute y-component of the normal vector img_diff<<>>(d_smooth_2, d_norm_gyy, d_NRs, d_NLs, R, C, N); //smooth perpendicular to z (use first step from previous calculation) img_smooth_5<<>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, N); // compute z-component of the normal vector img_diff<<>>(d_smooth_2, d_norm_gzz, d_NRs, d_NLs, R, C, 2 * N); //normalize gradients normalize_grad<<>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, d_norm, R, C); // normalize_grad<<>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, t_norm, R, C); //smooth perpendicular to x img_smooth_5<<>>(d_norm_gxx, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, N); img_smooth_5<<>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N); //compute derivative of x-component of norm vector along x img_diff<<>>(d_smooth_2, d_norm_gxx, d_NRs, d_NLs, R, C, 0); // rescale difference to give gradient scale_image<<>>(d_norm_gxx, 0.5 / grad_step, R, C); //smooth perpendicular to y img_smooth_5<<>>(d_norm_gyy, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, 0); img_smooth_5<<>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N); //compute derivative of y-component of norm vector along y img_diff<<>>(d_smooth_2, d_norm_gyy, d_NRs, d_NLs, R, C, N); // rescale difference to give gradient scale_image<<>>(d_norm_gyy, 0.5 / grad_step, R, C); //smooth perpendicular to z img_smooth_5<<>>(d_norm_gzz, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, 0); img_smooth_5<<>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, N); //compute derivative of z-component of norm vector along z img_diff<<>>(d_smooth_2, d_norm_gzz, d_NRs, d_NLs, R, C, 2 * N); // rescale difference to give gradient scale_image<<>>(d_norm_gzz, 0.5 / grad_step, R, C); compute_curvature<<>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, d_norm, d_v, d_curv, t_norm, R, C); // compute_curvature<<>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, d_v, d_curv, R, C); } if (t >= test_time_start) { if(t % test_interval == 0){ kernel_Nz<<>>( d_v, d_seed_indicator, d_Nz, R, C, num_blocks_x, num_blocks_y, num_blocks_z, threads_per_block_x, threads_per_block_y, threads_per_block_z ); cudaMemcpy(h_Nz, d_Nz, sizeof(int), cudaMemcpyDeviceToHost); int Nnz = N - h_Nz[0]; //number of non-zero elements RE<<>>( d_v, d_dvdtL, d_dvdtR, d_seed_indicator, d_curv, d_RE, Ddt, kappa, R, C, num_blocks_x, num_blocks_y, num_blocks_z, threads_per_block_x, threads_per_block_y, threads_per_block_z ); cudaMemcpy(h_RE, d_RE, sizeof(float), cudaMemcpyDeviceToHost); h_RE[0] = h_RE[0] / Nnz; //we only consider the relative error for non-zero elements t2 = clock(); printf("t: %d, h_RE: %f Nnz: %d, timestamp: %f\n", t, h_RE[0], Nnz,(double) (t2 - t1) / (double) CLOCKS_PER_SEC); if (h_RE[0] < re_stop) { if(curv_running){ printf("stopped at t: %d, h_RE: %f\n", t, h_RE[0]); break; } else { curv_running = true; printf("adding curvature at t: %d, h_RE: %f\n", t, h_RE[0]); } } } } } cudaMemcpy(h_v, d_v, N * sizeof(float), cudaMemcpyDeviceToHost); cudaFree(d_v); cudaFree(d_smooth_2); cudaFree(d_smooth); cudaFree(d_dvdtL); cudaFree(d_dvdtR); cudaFree(d_norm_gxx); cudaFree(d_norm_gyy); cudaFree(d_norm_gzz); cudaFree(d_w2); cudaFree(d_NR); cudaFree(d_NL); cudaFree(d_NR2); cudaFree(d_NL2); cudaFree(d_NRs); cudaFree(d_NLs); cudaFree(d_RE); cudaFree(d_Nz); cudaFree(d_seed_indicator); cudaFree(d_curv); cudaFree(d_norm); free(h_RE); free(h_Nz); return 0; } /////////////// void setup_GPU() { // cudaError_t is a type defined in cuda.h cudaError_t err; //printf("Checking for CUDA devices...\n"); // Call a CUDA helper function to count the number // of CUDA available devices in the system. int count; err = cudaGetDeviceCount(&count); if ((count == 0)) { printf("No CUDA supported devices are available in this system.\n"); exit(EXIT_FAILURE); } else if (err != cudaSuccess){ printf("Error getting number of devices.\n"); } else { printf("Found %d CUDA devices in this system\n", count); } // cudaDeviceProp is a type of struct which we will // populate with information about the available // GPU compute devices. cudaDeviceProp prop; // Loop over the available CUDA devices int idev; for (idev = 0; idev < count; idev++) { // Call another CUDA helper function to populate prop err = cudaGetDeviceProperties(&prop, idev); if (err != cudaSuccess) { printf("Error getting device properties\n"); exit(EXIT_FAILURE); } // Print out a member of the prop struct which tells // us the name of the CUDA device. Other members of this // struct tell us the clock speed and compute capability // of the device. printf("Device %d : %s\n", idev, prop.name); } cudaSetDevice(idev); err = cudaGetDevice(&idev); if (err != cudaSuccess) { printf("Error identifying active device\n"); exit(EXIT_FAILURE); } printf("Using device %d\n", idev); } ///////////////