update_dir3.cuh 6.53 KB
#ifndef STIM_CUDA_UPDATE_DIR3_H
#define STIM_CUDA_UPDATE_DIR3_H

# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>
#include "cpyToshare.cuh"

		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
		template<typename T>
		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
			extern __shared__ float s_vote[];
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			//compute the global 1D index for this pixel
			int i = zi * x * y + yi * x + xi;

			if(xi >= x|| yi >= y || zi>= z) return;
			// find the starting points for this block along the x and y directions
			int bxi = blockIdx.x * blockDim.x;
			int byi = blockidx_y * blockDim.y;
			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
			int bxs = bxi - rx;
			int bys = byi - ry;
			int xwidth = 2 * rx + blockDim.x;
			int ywidth = 2 * ry + blockDim.y;
			//compute the coordinations of this pixel in the 2D-shared memory.
			int sx_rx = threadIdx.x + rx;
			int sy_ry = threadIdx.y + ry;
			//find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter
			float g_v_x = gpu_grad[i * 3 + 0];
			float g_v_y = gpu_grad[i * 3 + 1];
			float g_v_z = gpu_grad[i * 3 + 2];
			float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);

			
			// define a local variable to maximum value of the vote image in the voting area for this voter
			float max = 0;
			float l_vote = 0;
			// define local variables for the x, y, and z coordinations point to the vote direction
			float id_x = g_v_x;
			float id_y = g_v_y;
			float id_z = g_v_z;

			int rx_sq = rx * rx;
			int ry_sq = ry * ry;
			int rz_sq = rz * rz;

			for (int z_p = -rz; z_p<=rz; z_p++){
				int zi_p = zi + z_p;
				if ((zi_p) >=0 && (zi_p) < z){
					//call the function to copy one slide of vote date to the 2D-shared memory.
					__syncthreads();
					cpyG2S2D<float>(s_vote, gpu_vote, bxs, bys, zi + z_p, xwidth, ywidth, threadIdx, blockDim, x, y);
					__syncthreads();								
					float z_sq = z_p * z_p;		
					float d_z_sq = (z_sq)/rz_sq;
					for(int y_p = -ry; y_p <= ry; y_p++){

						float y_sq = y_p * y_p;
						float yz_sq = y_sq + z_sq;
						int yi_p = (yi + y_p) ;
						float d_yz_sq = (y_sq)/ry_sq + d_z_sq;
						unsigned int s_y1d = (sy_ry + y_p) * xwidth;
						for(int x_p = -rx; x_p <= rx; x_p++){

							//check if the current pixel is inside of the data-set.
							int xi_p = (xi + x_p) ;
							if (yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){

								//calculate the distance between the pixel and the current voter.
								float x_sq = x_p * x_p;															
								float d_pv = sqrt(x_sq + yz_sq);

								// calculate the angle between the pixel and the current voter.
								float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);

								if ((((x_sq)/rx_sq + d_yz_sq )<= 1) && (cos_diff >= cos_phi)){

									//calculate the 1D index for the current pixel in the 2D-shared memory
									unsigned int id_s = s_y1d + (sx_rx + x_p);
									l_vote = s_vote[id_s];
						
								// compare the vote value of this pixel with the max value to find the maxima and its index.
									if  (l_vote>max) {

										max = l_vote;
										id_x = x_p;
										id_y = y_p;
										id_z = z_p;
									}
								}
							}
						}
					}
				}
			}
							
		
		float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z);
			gpu_dir[i * 3 + 0] = mag_v * (id_x/m_id);
			gpu_dir[i * 3 + 1] = mag_v * (id_y/m_id);
			gpu_dir[i * 3 + 2] = mag_v * (id_z/m_id);
		}
		
		// this kernel updates the gradient direction by the calculated voting direction.
		template<typename T>
		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){

			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;

			if(xi >= x || yi >= y || zi >= z) return;
			//update the gradient image with the new direction direction
			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];
			gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
			gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
		}
		
		template<typename T>
		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
			unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*sizeof(T);			
			// allocate space on the GPU for the updated vote direction
			T* gpu_dir;
			cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);	

			//call the kernel to calculate the new voting direction
			update_dir3 <<< blocks, threads, shared_bytes >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
			
			
			//call the kernel to update the gradient direction
			update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
			
			//free allocated memory
			cudaFree(gpu_dir);

		}
		
		template<typename T>
		void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			//calculate the number of bytes in the array
			unsigned int bytes = x * y * z * sizeof(T);

			//allocate space on the GPU for the Vote data
			T* gpu_vote;
			cudaMalloc(&gpu_vote, bytes);

			//copy the input vote data to the GPU
			cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice);	

			//allocate space on the GPU for the Gradient data
			T* gpu_grad;
			cudaMalloc(&gpu_grad, bytes*3);

			//copy the Gradient data to the GPU
			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);

			//call the GPU version of the update direction function
			gpu_update_dir3<T>(gpu_grad, gpu_vote, cos_phi, r, x , y, z);
							
			//copy the new gradient image back to the CPU
			cudaMemcpy(cpu_grad, gpu_grad, bytes*3, cudaMemcpyDeviceToHost) ;

			//free allocated memory
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
		}
		


#endif