vote3.cuh 5.2 KB
#ifndef STIM_CUDA_VOTE3_H
#define STIM_CUDA_VOTE3_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include "cpyToshare.cuh"

		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
		template<typename T>
		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){

			extern __shared__ float s[];
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;			
			
			if(xi>=x || yi>=y || zi>=z) return;
			
			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
			int bxs = blockIdx.x * blockDim.x - rx;
			int bys = blockidx_y * blockDim.y - ry;
			int xwidth = 2 * rx + blockDim.x;
			int ywidth = 2 * ry + blockDim.y;
			//calculate the starting point of shared memory for storing the magnitude.
			unsigned int b_s = 3 * xwidth * ywidth;
			//compute the coordinations of this pixel in the 2D-shared memory.
			int sx_rx = threadIdx.x + rx;
			int sy_ry = threadIdx.y + ry;

			// define a local variable to sum the votes from the voters
			float sum = 0;
			
			int rx_sq = rx * rx;
			int ry_sq = ry * ry;
			int rz_sq = rz * rz;

			for (int z_v = -rz; z_v<=rz; z_v++){
				int zi_v = zi + z_v;
				if ((zi_v) >=0 && (zi_v) <z){
					//call the function to copy one slide of the gradient from global to the 2D-shared memory.					
					__syncthreads();
					cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
					__syncthreads();
					mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
					__syncthreads();
					float z_sq = z_v * z_v;	
					float d_z_sq = z_sq/rz_sq;

					for(int y_v = -ry; y_v <= ry; y_v++){
						int yi_v = (yi + y_v) ;
						//compute the position of the current voter in the shared memory along the y axis.
						unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;

						float y_sq = y_v * y_v;
						float yz_sq = z_sq + y_sq;
						float d_yz_sq = y_sq/ry_sq + d_z_sq;
						for(int x_v = -rx; x_v <= rx; x_v++){

							//check if the current voter is inside of the data-set
							int xi_v = (xi + x_v) ;
							if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){

								//compute the position of the current voter in the 2D-shared memory along the x axis.
								unsigned int sIdx_x = (sx_rx + x_v);
								//find the 1D index of this voter in the 2D-shared memory.
								unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
								unsigned int s_Idx3 = s_Idx * 3;
								
								//save the gradient values for the current voter to the local variables and compute the gradient magnitude.					
								float g_v_x = s[s_Idx3];
								float g_v_y = s[s_Idx3 + 1];
								float g_v_z = s[s_Idx3 + 2];																						
								float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);										
						
								//calculate the distance between the pixel and the current voter.
								float x_sq = x_v * x_v;									
								float d_pv = sqrt(x_sq + yz_sq);

								// calculate the angle between the pixel and the current voter.
								float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
						
								// check if the current voter is located in the voting area of this pixel.
								if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
												
									sum += mag_v;	
								}
							}
						}	
					}
				}
			}
						
			gpu_vote[i] = sum;			
		}

		template<typename T>
		void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
			unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);			
			//call the kernel to do the voting
			vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);

		}


		template<typename T>
		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			//calculate the number of bytes in the array
			unsigned int bytes = x * y * z * sizeof(T);

			
			//allocate space on the GPU for the Vote Image
			T* gpu_vote;
			cudaMalloc(&gpu_vote, bytes);		

			//allocate space on the GPU for the input Gradient image
			T* gpu_grad;
			cudaMalloc(&gpu_grad, bytes*3);
			
			
			//copy the Gradient data to the GPU
			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
			
					
			//call the GPU version of the vote calculation function
			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
							
			//copy the Vote Data back to the CPU
			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;

			//free allocated memory
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
			
		}


#endif