/*
 * segmentation.cu
 *
 *  Created on: Oct 30, 2019
 *      Author: lutton
 */

#include <cuda.h>
#include <stdio.h>
#include <time.h>  // clock() and clock_t

#include "segmentation.h"

/** normalized 5-point smoothing filter, using binomial approximation to Gaussian
 * d_img			input 3D image
 * d_convolved		output 3D image
 * step				direction of smoothing (1,C, or R * C)
 * R				number of rows in image
 * C				number of columns in image
 * N				size of image
 */
__global__ void img_smooth_5(
		float *d_img,
		float *d_convolved,
		int *d_NR, int *d_NL,
		int *d_NR2, int *d_NL2,
		int R, int C,
		int edge_offset
		){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;
	int e = p + edge_offset;

	d_convolved[p] = (d_img[d_NR2[e]] + 4 * d_img[d_NR[e]] + 6 * d_img[p] + 4 * d_img[d_NL[e]] + d_img[d_NL2[e]]) * 0.0625;
}

/** two point gradient with variable radius
  * d_img				input 3D image
  * d_convolved			output 3D image
  * step				product of direction (1,R, or R*C) and step size
  * R					number or rows in image
  * C					number of columns in image
  * N					size of image
  */
__global__ void img_diff(
		float *d_img,
		float *d_convolved,
		int *d_NRs, int *d_NLs,
		int R, int C, int edge_offset
		){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;
	int e = p + edge_offset;

	d_convolved[p] = (d_img[d_NRs[e]] - d_img[d_NLs[e]]);
}


/** normalize gradient and store normal values
 * gradient directions have a minimum size to avoid introducing normals where no gradient exists.
 * d_norm_gxx			gradient in x to be normalized
 * d_norm_gyy			gradient in y to be normalized
 * d_norm_gzz			gradient in z to be normalized
 * d_norm				gradient size
 * R					number of rows
 * C					number of columns
 */
/*
__global__ void normalize_grad(
		float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz,
		float t_norm,
		int R, int C
		){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;

	float d_norm = norm3df(d_norm_gxx[p], d_norm_gyy[p], d_norm_gzz[p]);
	if(d_norm > t_norm){
		d_norm_gxx[p] = d_norm_gxx[p] / d_norm;
		d_norm_gyy[p] = d_norm_gyy[p] / d_norm;
		d_norm_gzz[p] = d_norm_gzz[p] / d_norm;
	} else {
		d_norm_gxx[p] = 0.0;
		d_norm_gyy[p] = 0.0;
		d_norm_gzz[p] = 0.0;
	}
}
*/
__global__ void normalize_grad(
		float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz,
		float *d_norm,
		int R, int C
		){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;

	d_norm_gxx[p] = (0.5 + copysignf(0.5,fabsf(d_norm_gxx[p]) - 0.001)) * d_norm_gxx[p];
	d_norm_gyy[p] = (0.5 + copysignf(0.5,fabsf(d_norm_gyy[p]) - 0.001)) * d_norm_gyy[p];
	d_norm_gzz[p] = (0.5 + copysignf(0.5,fabsf(d_norm_gzz[p]) - 0.001)) * d_norm_gzz[p];

	d_norm[p] = fmaxf(norm3df(d_norm_gxx[p], d_norm_gyy[p], d_norm_gzz[p]), 0.000001);

	d_norm_gxx[p] = d_norm_gxx[p] / d_norm[p];
	d_norm_gyy[p] = d_norm_gyy[p] / d_norm[p];
	d_norm_gzz[p] = d_norm_gzz[p] / d_norm[p];
}


/** set image to zero
 *
 * d_img			image to be set to zero
 * R				number of rows
 * C				number of columns
 */
__global__ void zero_image(
		float *d_img,
		int R, int C){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;

	d_img[p] = 0;
}

/** multiply all image values by a given scale factor
 *
 * d_img			image to be scaled
 * scale			scale factor (multiplier
 * R				number of rows
 * C				number of columns
 */
__global__ void scale_image(
		float *d_img,
		float scale,
		int R, int C){
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;

	d_img[p] = scale * d_img[p];
}

/** compute the mean curvature given normals, gradient size, and foreground probabilities
 * mean curvature is calculated using the divergence formula, with a threshold on the gradient
 * size to exclude areas where there is no change in probabilities (gradient effectively zero)
 *
 * d_norm_gxx		normal in the x-direction
 * d_norm_gyy		normal in the y-direction
 * d_norm_gzz		normal in the z-direction
 * d_norm			gradient magnitude
 * d_v				foreground probability
 * d_curv			mean curvature
 * t_norm			gradient threshold
 * R				number of rows
 * C				number of columns
 */
/*
__global__ void compute_curvature(
		float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz, float *d_v,
		float *d_curv,
		int R, int C
		) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;
	float curv = -d_norm_gxx[p] - d_norm_gyy[p] - d_norm_gzz[p];
	d_curv[p] = (d_v[p] < 0.5) ? fminf(curv, 0) : fmaxf(curv, 0);
}
*/
__global__ void compute_curvature(
		float *d_norm_gxx, float *d_norm_gyy, float *d_norm_gzz, float *d_norm, float *d_v,
		float *d_curv,
		float t_norm,
		int R, int C
		) {
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;
	float curv = -d_norm_gxx[p] - d_norm_gyy[p] - d_norm_gzz[p];
	d_curv[p] = (d_norm[p] > t_norm) ? ((d_v[p] < 0.5) ? fminf(curv, 0) : fmaxf(curv, 0)) : 0;
}

// TODO: finish this comment
/** compute the derivative of the foreground probabilities (d_v)
 * differences are taken for each edge of each node in each direction
 * for a given direction, edges pointing forward are referred to as 'right' edges,
 * while edges pointing backward are referred to as 'left' edges
 * these differences are subsequently summed to give the weighted second derivative
 *
 * d_v			foreground probabilities
 * d_dvdtL		derivative on the 'left'
 * d_dvdtR		derivative on the 'right'
 * d_NR		node to the 'right'
 * d_w2		weight of edge
 */
__global__ void compute_dvdt(
		float *d_v,
		float *d_dvdtL, float *d_dvdtR,
		int *d_NR, float *d_w2,
		float Ddt,
		int R, int C,
		int edge_offset
		) {

	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int pL = k * R * C + j * C + i;
	int e = pL + edge_offset;
	int pR = d_NR[e];

	float diff = (d_v[pR] - d_v[pL]) * d_w2[e];

	d_dvdtL[pL] += diff;
	d_dvdtR[pR] -= diff;
}

/** sum the derivatives and curvature term
 *
 * d_v			foreground probabilities
 * d_dvdtL		differences from 'left' edges
 * d_dvdtR		differences from 'right' edges
 * d_curv		mean curvature
 * Ddt			time discretization
 * kappa		curvature coefficient
 * R			number of rows
 * C			number of columns
 */
__global__ void kernel_add(
		float *d_v,
		float *d_dvdtL, float *d_dvdtR, float *d_seed_indicator,
		float *d_curv,
		float Ddt,
		float kappa,
		int R, int C
		) {

	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = k * R * C + j * C + i;
	if(d_seed_indicator[p] > 0.5){
		d_v[p] += Ddt * (d_dvdtL[p] + d_dvdtR[p] + kappa * d_curv[p] * d_v[p] * (1.0 - d_v[p]));
	}
}

// TODO: add full comment on this kernel
//compute relative error
__global__ void RE(
		float *d_v, float *d_dcdtL, float *d_dcdtR, float *d_seed_indicator, float *d_curv,
		float *d_RE,
		float Ddt,
		float kappa,
		int R, int C,
		int num_blocks_x, int num_blocks_y, int num_blocks_z,
		int num_threads_x, int num_threads_y, int num_threads_z
		) {

	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int j = blockDim.y * blockIdx.y + threadIdx.y;
	int k = blockDim.z * blockIdx.z + threadIdx.z;

	int p = i + j * num_threads_x + k * num_threads_x * num_threads_y;
	int idx = i + j * C + k * R * C;
    int b_x;
    int b_y;
    int b_z;
    int idx_n;

	__shared__ float local_RE;

	d_RE[p] = 0;

	// zero the local sum
	if (p == 0) local_RE = 0;

	__syncthreads(); // make sure all threads are caught up

	//RE calc
	for (b_x = 0; b_x < num_blocks_x; ++b_x)
		for (b_y = 0; b_y < num_blocks_y; ++b_y)
			for (b_z = 0; b_z < num_blocks_z; ++b_z){
				idx_n = idx + b_x * num_threads_x + b_y * num_threads_y * C + b_z * num_threads_z * R * C;
				if (d_v[idx_n] > 0 && d_seed_indicator[idx_n] > 0.5) {
					d_RE[p] += Ddt *fabsf(d_dcdtL[idx_n] + d_dcdtR[idx_n] + kappa * d_curv[idx_n] * d_v[idx_n] * (1.0 - d_v[idx_n])) / d_v[idx_n];
				}
			}

	__syncthreads();

	atomicAdd(&local_RE, d_RE[p]);

	// Make sure all threads in the block have caught up
	__syncthreads();

	// One of the threads writes to the device array
	d_RE[p] = local_RE;
}

// TODO: add full comment on this kernel
//compute number of nodes with c=0, iterating over all thread numbers per block, summing over all blocks
__global__ void kernel_Nz(
		float *d_v, float *d_seed_indicator,
		int *d_Nz,
		int R, int C,
		int num_blocks_x, int num_blocks_y, int num_blocks_z,
		int num_threads_x, int num_threads_y, int num_threads_z
		) {

    int i = blockDim.x * blockIdx.x + threadIdx.x;
    int j = blockDim.y * blockIdx.y + threadIdx.y;
    int k = blockDim.z * blockIdx.z + threadIdx.z;

    int p = i + j * num_threads_x + k * num_threads_y * num_threads_x;
    int idx = i + j * C + k * R * C;
    int b_x;
    int b_y;
    int b_z;
    int idx_n;

    __shared__ int local_Nz;

    d_Nz[p] = 0;

    // zero the local sum
    if (p == 0) local_Nz = 0;

    __syncthreads(); // make sure all threads are caught up

    // should be replaced by thrust routines ...
    for (b_x = 0; b_x < num_blocks_x; ++b_x)
    	for (b_y = 0; b_y < num_blocks_y; ++b_y)
    		for (b_z = 0; b_z < num_blocks_z; ++b_z){
    			idx_n = idx + b_x * num_threads_x + b_y * num_threads_y * C + b_z * num_threads_z * R * C;
    			if (d_v[idx_n] == 0 || d_seed_indicator[idx_n] < 0.5)
    				d_Nz[p] += 1;
    		}

    __syncthreads();

    atomicAdd(&local_Nz, d_Nz[p]);

    //Make sure all threads in the block have caught up
    __syncthreads();

    d_Nz[p] = local_Nz;

}

int run_segmentation(
		float *h_v,
		float *h_w2,
		int *h_NR,
		int *h_NL,
		int *h_NR2,
		int *h_NL2,
		int *h_NRs,
		int *h_NLs,
		float *h_seed_indicator,
		struct segmentation_parameters *params,
		struct image_dimensions *dims
		){

	float *d_v; // diffusive variable

	float *d_dvdtL; //sum of differences between a node and a node on it's "left"
	float *d_dvdtR; //sum of differences between a node and a node on it's "right"

	float *d_smooth_2; // v smoothed along 2 axes
	float *d_smooth; // v smoothed along 1 axis
	float *d_norm_gxx; //x-derivative of the x-component of the unit vector in the direction grad(c)
	float *d_norm_gyy; //y-derivative of the y-component of the unit vector in the direction grad(c)
	float *d_norm_gzz; //z-derivative of the z-component of the unit vector in the direction grad(c)
	float *d_curv; // curvature, computed implicitly using the divergence formulation
	float *d_norm; // norm of grad(c), used to filter out fluctuations in small gradients

	float *d_w2; // edge weights
	int *d_NR; // array of "right neighbour" node indices, used for diffusion and curvature
	int *d_NL; // array of "left neighbour" node indices, used for curvature only
	int *d_NR2; // array of "right second neighbour" node indices, used for curvature only
	int *d_NL2; // array of "left second neighbour" node indices, used for curvature only
	int *d_NRs; // array of "right <step>th neighbour" node indices, used for curvature only
	int *d_NLs; // array of "left <step>th neighbour" node indices, used for curvature only

	int *d_Nz; // number of non-zero nodes in the image
	float *d_RE; // relative error; change in v from the previous time point as a proportion of v
	float *d_seed_indicator; // zero at seeds to prevent them from changing

	int *h_Nz; // host copy of d_Nz
	float *h_RE; // host copy of d_RE

	// get image dimensions
	int R = dims->R;
	int C = dims->C;
	int N = dims->N;
	int E = dims->E;
	int edge_direction_count = dims->edge_direction_count;
	int num_blocks_x = dims->num_blocks_x;
	int num_blocks_y = dims->num_blocks_y;
	int num_blocks_z = dims->num_blocks_z;
	int threads_per_block_x = dims->threads_per_block_x;
	int threads_per_block_y = dims->threads_per_block_y;
	int threads_per_block_z = dims->threads_per_block_z;
	// segmentation parameters
	int T = params->T;
	float Ddt = params->Ddt;
	int grad_step = params->grad_step;
	float kappa = params->kappa;
	int curv_interval = params->curv_interval;
	int curv_time_start = params->curv_time_start;
	int test_time_start = params->test_time_start;
	int test_interval = params->test_interval;
	float re_stop = params->re_stop;
	float t_norm = params->t_norm;

	clock_t t1, t2;

	setup_GPU();

	// allocate memory on device
	cudaMalloc(&d_v, N * sizeof(float));
	cudaMalloc(&d_smooth_2, N * sizeof(float));
	cudaMalloc(&d_smooth, N * sizeof(float));
	cudaMalloc(&d_dvdtL, N * sizeof(float));
	cudaMalloc(&d_dvdtR, N * sizeof(float));
	cudaMalloc(&d_norm_gxx, N * sizeof(float));
	cudaMalloc(&d_norm_gyy, N * sizeof(float));
	cudaMalloc(&d_norm_gzz, N * sizeof(float));
	cudaMalloc(&d_w2, E * sizeof(float));
	cudaMalloc(&d_NR, E * sizeof(int));
	cudaMalloc(&d_NL, 3 * N * sizeof(int));
	cudaMalloc(&d_NR2, 3 * N * sizeof(int));
	cudaMalloc(&d_NL2, 3 * N * sizeof(int));
	cudaMalloc(&d_NRs, 3 * N * sizeof(int));
	cudaMalloc(&d_NLs, 3 * N * sizeof(int));
	cudaMalloc(&d_Nz, params->max_threads_per_block * sizeof(int));
	cudaMalloc(&d_RE, params->max_threads_per_block * sizeof(float));
	cudaMalloc(&d_seed_indicator, N * sizeof(float));
	cudaMalloc(&d_curv, N * sizeof(float));
	cudaMalloc(&d_norm, N * sizeof(float));

	h_RE = (float *) malloc(params->max_threads_per_block * sizeof(float));
	h_Nz = (int *) malloc(params->max_threads_per_block * sizeof(int));

	//copy host arrays to device
	cudaMemcpy(d_v, h_v, N * sizeof(float), cudaMemcpyHostToDevice);
	// only right node used for diffusion component, so requires values for all edges
	cudaMemcpy(d_NR, h_NR, E * sizeof(int), cudaMemcpyHostToDevice);
	// neighbors for curvature, along with first 3N values of NR
	cudaMemcpy(d_NL, h_NL, 3 * N * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_NR2, h_NR2, 3 * N * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_NL2, h_NL2, 3 * N * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_NRs, h_NRs, 3 * N * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_NLs, h_NLs, 3 * N * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_seed_indicator, h_seed_indicator, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(d_w2, h_w2, E * sizeof(float), cudaMemcpyHostToDevice);

	// main block sizes
	const dim3 num_blocks(num_blocks_x, num_blocks_y, num_blocks_z);
	const dim3 num_threads(threads_per_block_x, threads_per_block_y, threads_per_block_z);
	// for computing relative error
	const dim3 re_blocks(1, 1, 1);

	zero_image<<<num_blocks, num_threads>>>(d_curv, R, C);
	int curv_running = false;

	t1 = clock();
	for (int t = 0; t < T; t++) {
		if(t > curv_time_start)
			curv_running = true;

		zero_image<<<num_blocks, num_threads>>>(d_dvdtL, R, C);
		zero_image<<<num_blocks, num_threads>>>(d_dvdtR, R, C);

		for(int i = 0; i < edge_direction_count; i++){ // compute along all directions
			compute_dvdt<<<num_blocks, num_threads>>>(d_v, d_dvdtL, d_dvdtR, d_NR, d_w2, Ddt, R, C, i * N);
		}

		// average over all directions
		kernel_add<<<num_blocks, num_threads>>>(d_v, d_dvdtL, d_dvdtR, d_seed_indicator, d_curv, Ddt, kappa, R, C);

		// compute curvature after initial diffusion at regular intervals for non-zero coefficient
		if(curv_running && t % curv_interval == 0 && kappa > 0.0001){
			//smooth perpendicular to x
			img_smooth_5<<<num_blocks, num_threads>>>(d_v, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, N);
			img_smooth_5<<<num_blocks, num_threads>>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N);
			// compute x-component of the normal vector
			img_diff<<<num_blocks, num_threads>>>(d_smooth_2, d_norm_gxx, d_NRs, d_NLs, R, C, 0);

			//smooth perpendicular to y
			img_smooth_5<<<num_blocks, num_threads>>>(d_v, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, 0);
			img_smooth_5<<<num_blocks, num_threads>>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N);
			// compute y-component of the normal vector
			img_diff<<<num_blocks, num_threads>>>(d_smooth_2, d_norm_gyy, d_NRs, d_NLs, R, C, N);

			//smooth perpendicular to z (use first step from previous calculation)
			img_smooth_5<<<num_blocks, num_threads>>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, N);
			// compute z-component of the normal vector
			img_diff<<<num_blocks, num_threads>>>(d_smooth_2, d_norm_gzz, d_NRs, d_NLs, R, C, 2 * N);

			//normalize gradients
			normalize_grad<<<num_blocks, num_threads>>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, d_norm, R, C);
//			normalize_grad<<<num_blocks, num_threads>>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, t_norm, R, C);

			//smooth perpendicular to x
			img_smooth_5<<<num_blocks, num_threads>>>(d_norm_gxx, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, N);
			img_smooth_5<<<num_blocks, num_threads>>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N);
			//compute derivative of x-component of norm vector along x
			img_diff<<<num_blocks, num_threads>>>(d_smooth_2, d_norm_gxx, d_NRs, d_NLs, R, C, 0);
			// rescale difference to give gradient
			scale_image<<<num_blocks, num_threads>>>(d_norm_gxx, 0.5 / grad_step, R, C);

			//smooth perpendicular to y
			img_smooth_5<<<num_blocks, num_threads>>>(d_norm_gyy, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, 0);
			img_smooth_5<<<num_blocks, num_threads>>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, 2 * N);
			//compute derivative of y-component of norm vector along y
			img_diff<<<num_blocks, num_threads>>>(d_smooth_2, d_norm_gyy, d_NRs, d_NLs, R, C, N);
			// rescale difference to give gradient
			scale_image<<<num_blocks, num_threads>>>(d_norm_gyy, 0.5 / grad_step, R, C);

			//smooth perpendicular to z
			img_smooth_5<<<num_blocks, num_threads>>>(d_norm_gzz, d_smooth, d_NR, d_NL, d_NR2, d_NL2, R, C, 0);
			img_smooth_5<<<num_blocks, num_threads>>>(d_smooth, d_smooth_2, d_NR, d_NL, d_NR2, d_NL2, R, C, N);
			//compute derivative of z-component of norm vector along z
			img_diff<<<num_blocks, num_threads>>>(d_smooth_2, d_norm_gzz, d_NRs, d_NLs, R, C, 2 * N);
			// rescale difference to give gradient
			scale_image<<<num_blocks, num_threads>>>(d_norm_gzz, 0.5 / grad_step, R, C);

			compute_curvature<<<num_blocks, num_threads>>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, d_norm, d_v, d_curv, t_norm, R, C);
//			compute_curvature<<<num_blocks, num_threads>>>(d_norm_gxx, d_norm_gyy, d_norm_gzz, d_v, d_curv, R, C);

		}
		if (t >= test_time_start) {
			if(t % test_interval == 0){

				kernel_Nz<<<re_blocks, num_threads>>>(
						d_v, d_seed_indicator,
						d_Nz,
						R, C,
						num_blocks_x, num_blocks_y, num_blocks_z,
						threads_per_block_x, threads_per_block_y, threads_per_block_z
				);
				cudaMemcpy(h_Nz, d_Nz, sizeof(int), cudaMemcpyDeviceToHost);
				int Nnz = N - h_Nz[0]; //number of non-zero elements

				RE<<<re_blocks, num_threads>>>(
						d_v, d_dvdtL, d_dvdtR, d_seed_indicator, d_curv,
						d_RE,
						Ddt,
						kappa,
						R, C,
						num_blocks_x, num_blocks_y, num_blocks_z,
						threads_per_block_x, threads_per_block_y, threads_per_block_z
				);

				cudaMemcpy(h_RE, d_RE, sizeof(float), cudaMemcpyDeviceToHost);
				h_RE[0] = h_RE[0] / Nnz; //we only consider the relative error for non-zero elements

				t2 = clock();
				printf("t: %d, h_RE: %f Nnz: %d, timestamp: %f\n", t, h_RE[0], Nnz,(double) (t2 - t1) / (double) CLOCKS_PER_SEC);

				if (h_RE[0] < re_stop) {
					if(curv_running){
						printf("stopped at t: %d, h_RE: %f\n", t, h_RE[0]);
						break;
					} else {
						curv_running = true;
						printf("adding curvature at t: %d, h_RE: %f\n", t, h_RE[0]);
					}
				}
			}
		}
	}

	cudaMemcpy(h_v, d_v, N * sizeof(float), cudaMemcpyDeviceToHost);

	cudaFree(d_v);
	cudaFree(d_smooth_2);
	cudaFree(d_smooth);
	cudaFree(d_dvdtL);
	cudaFree(d_dvdtR);
	cudaFree(d_norm_gxx);
	cudaFree(d_norm_gyy);
	cudaFree(d_norm_gzz);
	cudaFree(d_w2);
	cudaFree(d_NR);
	cudaFree(d_NL);
	cudaFree(d_NR2);
	cudaFree(d_NL2);
	cudaFree(d_NRs);
	cudaFree(d_NLs);
	cudaFree(d_RE);
	cudaFree(d_Nz);
	cudaFree(d_seed_indicator);
	cudaFree(d_curv);
	cudaFree(d_norm);

	free(h_RE);
	free(h_Nz);

	return 0;
}

///////////////

void setup_GPU() {

	// cudaError_t is a type defined in cuda.h
	cudaError_t err;

	//printf("Checking for CUDA devices...\n");

	// Call a CUDA helper function to count the number
	// of CUDA available devices in the system.
	int count;
	err = cudaGetDeviceCount(&count);
	if ((count == 0)) {
		printf("No CUDA supported devices are available in this system.\n");
		exit(EXIT_FAILURE);
	} else if (err != cudaSuccess){
		printf("Error getting number of devices.\n");
	} else {
		printf("Found %d CUDA devices in this system\n", count);
	}

	// cudaDeviceProp is a type of struct which we will
	// populate with information about the available
	// GPU compute devices.
	cudaDeviceProp prop;

	// Loop over the available CUDA devices
	int idev;
	for (idev = 0; idev < count; idev++) {

		// Call another CUDA helper function to populate prop
		err = cudaGetDeviceProperties(&prop, idev);
		if (err != cudaSuccess) {
			printf("Error getting device properties\n");
			exit(EXIT_FAILURE);
		}

		// Print out a member of the prop struct which tells
		// us the name of the CUDA device. Other members of this
		// struct tell us the clock speed and compute capability
		// of the device.
		printf("Device %d : %s\n", idev, prop.name);

	}

	cudaSetDevice(idev);

	err = cudaGetDevice(&idev);
	if (err != cudaSuccess) {
		printf("Error identifying active device\n");
		exit(EXIT_FAILURE);
	}
	printf("Using device %d\n", idev);
}

///////////////