// stim/cuda/branch_detection2.cuh

#include <stim/cuda/templates/gaussian_blur.cuh>
#include <stim/cuda/templates/gradient.cuh>
#include <stim/cuda/arraymath.cuh>
#include <stim/cuda/ivote.cuh>
// Fills cpuTable with a square lookup table of atan2 angles.
//   cpuTable : destination buffer; must hold (2*rmax + 1) * (2*rmax + 1) floats
//   rmax     : half-width of the window; the window centre sits at (rmax, rmax)
// Entry (col, row) is stored at index row * (2*rmax + 1) + col and holds
// atan2(rmax - row, rmax - col), i.e. the angle of the offset from that entry to the centre.
void atan_2(float* cpuTable, unsigned int rmax){
	// the window is square, so one side length and one centre coordinate serve both axes
	const int side   = 2*rmax +1;
	const int center = rmax;

	// walk the table one row at a time
	for (int row = 0; row < side; ++row){
		// vertical offset from this row to the centre of the window
		const float dy = center - row;
		// first entry of the current row inside the flat table
		float* line = cpuTable + row * side;

		for (int col = 0; col < side; ++col){
			// horizontal offset from this column to the centre of the window
			const float dx = center - col;
			// store the angle of the (dx, dy) offset
			line[col] = atan2(dy, dx);
		}
	}
}
// Runs a sequence of stim::cuda GPU helpers on an image and copies the intermediate
// results back to host buffers. In order, the visible body calls:
//   gpu_down_sample -> gpu_gaussian_blur2 -> gpu_gradient_2d -> gpu_cart2polar
//   -> `iter` rounds of gpu_vote / gpu_update_dir -> gpu_local_max
//
// Parameters:
//   texbufferID, texType : not referenced anywhere in the visible body
//   x, y                 : image dimensions (presumably width and height - TODO confirm);
//                          both are overwritten with the downsampled dimensions part-way through
//
// NOTE(review): this function looks incomplete in the text under review:
//   - the cpuTable initializer is cut off mid-expression;
//   - rmax, resize, gpuI0, sigma, t, iter, phi, dphi, conn and every cpu* output buffer
//     are used but not declared in the visible body (declarations are presumably
//     missing or live elsewhere - verify);
//   - no return statement is visible although the declared return type is a std::vector;
//   - none of the cudaMalloc'd buffers is released with cudaFree in the visible body;
//   - no return code of cudaMalloc / cudaMemcpy / cudaDeviceSynchronize is checked.
std::vector<stim::vec<float> > 
find_branch(GLint texbufferID, GLenum texType, unsigned int x, unsigned int y)
{
	// NOTE(review): the statement below is truncated in this view (no closing cast, no terminator)
	float* cpuTable		= (float
	// size of the full-resolution image in pixels and in bytes
	// (`bytes` is not used again in the visible body)
	unsigned int pixels = x * y;
	unsigned int bytes = sizeof(float) * pixels;
	//calculate the number of bytes in the atan2 table
	unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(float);
	//allocate space on the GPU for the atan2 table
	float* gpuTable;
	cudaMalloc(&gpuTable, bytes_table);
	// upload the host-side atan2 table to the device
	cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice);
	// downsampling step, stored as an unsigned integer
	// NOTE(review): the type of `resize` is not visible; if 1/resize truncates to 0 the
	// division and modulo on the next two lines divide by zero - confirm
	unsigned int sigma_ds = 1/resize;
	// downsampled dimensions, rounded up when x or y is not a multiple of sigma_ds
	unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
	unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
	// bytes needed for one float per downsampled pixel
	unsigned int bytes_ds = sizeof(float) * x_ds * y_ds;
	
	// device buffer for the downsampled image (one float per pixel)
	float* gpuI;
	cudaMalloc(&gpuI, bytes_ds);
	
	// device buffer for the gradient (two floats per downsampled pixel)
	float* gpuGrad;
	cudaMalloc(&gpuGrad, bytes_ds*2);
	// device buffer for the vote image (one float per pixel)
	float* gpuVote;
	cudaMalloc(&gpuVote, bytes_ds);
	// allocate space on the GPU for the detected cell centers
	float* gpuCenters;
	cudaMalloc(&gpuCenters, bytes_ds);		
	// downsample gpuI0 into gpuI, then copy the result to the host buffer cpuResize
	stim::cuda::gpu_down_sample<float>(gpuI, gpuI0, resize, x , y);
	cudaMemcpy(cpuResize, gpuI, bytes_ds, cudaMemcpyDeviceToHost);
	// from here on x and y refer to the downsampled dimensions
x = x_ds;
	y = y_ds;
	// scale t by the resize factor (t is later passed to gpu_local_max)
	t = t * resize;
	//sigma = sigma * resize;
	cudaDeviceSynchronize();
	// blur gpuI in place, then copy the result to cpuBlur
	stim::cuda::gpu_gaussian_blur2<float>(gpuI,sigma, x, y);
	cudaDeviceSynchronize();
	cudaMemcpy(cpuBlur, gpuI, bytes_ds, cudaMemcpyDeviceToHost);
	cudaDeviceSynchronize();
	
	// compute the 2D gradient of gpuI into gpuGrad, then copy it to cpuGradient
	stim::cuda::gpu_gradient_2d<float>(gpuGrad, gpuI, x, y);
	cudaDeviceSynchronize();
	cudaMemcpy(cpuGradient, gpuGrad, bytes_ds*2, cudaMemcpyDeviceToHost);
	// convert gpuGrad in place with gpu_cart2polar, then copy it to cpuCart2Polar
	stim::cuda::gpu_cart2polar<float>(gpuGrad, x, y);
	cudaDeviceSynchronize();
	cudaMemcpy(cpuCart2Polar, gpuGrad, bytes_ds*2, cudaMemcpyDeviceToHost);
	
	//multiply the gradient by a constant and calculate the absolute value (to save an image)
	// both calls act on cpuCart2Polar, the host copy filled by the cudaMemcpy above; gpuGrad is not touched
	stim::cuda::cpu_multiply<float>(cpuCart2Polar, 40, x * y * 2);
	cudaDeviceSynchronize();
	stim::cuda::cpu_abs<float>(cpuCart2Polar, x * y * 2);
	cudaDeviceSynchronize();
		
	// run `iter` rounds of voting followed by a direction update, decreasing phi by dphi each round
	for (int i =0; i<iter; i++){
		
		stim::cuda::gpu_vote<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
		cudaDeviceSynchronize();
		stim::cuda::gpu_update_dir<float>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
		cudaDeviceSynchronize();
		// copy this round's vote image to a host buffer: rounds 0-3 go to cpuVote1..cpuVote4,
		// round 4 and every later round go to cpuVote5 (later rounds overwrite it)
		switch (i){
		case 0 : cudaMemcpy(cpuVote1, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
			break;
		case 1 : cudaMemcpy(cpuVote2, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
			break;
		case 2 : cudaMemcpy(cpuVote3, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
			break;
		case 3 : cudaMemcpy(cpuVote4, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
			break;
		case 4 : cudaMemcpy(cpuVote5, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
			break;
		default : cudaMemcpy(cpuVote5, gpuVote, bytes_ds, cudaMemcpyDeviceToHost);
			break;
		}
		phi = phi - dphi;
	}
	
	// run gpu_local_max on the final vote image (with t and conn) and copy the result to cpuCenters
	stim::cuda::gpu_local_max<float>(gpuCenters, gpuVote, t, conn, x, y);
	cudaMemcpy(cpuCenters, gpuCenters, bytes_ds, cudaMemcpyDeviceToHost);
	
	// NOTE(review): the visible body ends here without a return statement and without
	// freeing gpuTable, gpuI, gpuGrad, gpuVote or gpuCenters
}