cpp/vote3.cuh

#ifndef STIM_CUDA_VOTE3_H
#define STIM_CUDA_VOTE3_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>
		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
		template<typename T>
		__global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;			
			
			if(xi>=x || yi>=y || zi>=z) return;
			
			// define a local variable to sum the votes from the voters
			float sum = 0;
			
			int rx_sq = rx * rx;
			int ry_sq = ry * ry;
			int rz_sq = rz * rz;
			for (int z_v = -rz; z_v<=rz; z_v++){
							
				for(int y_v = -ry; y_v <= ry; y_v++){
					for(int x_v = -rx; x_v <= rx; x_v++){
						//calculate the x, y ,z indices for the current voter
						int xi_v = (xi + x_v) ;
						int yi_v = (yi + y_v) ;
						int zi_v = (zi + z_v) ; 
						if (zi_v >=0 && zi_v < z && yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
														
							//calculate the 1D index for the current voter
							unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
							//find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
							float mag_v = gpu_mag[id_v];
							float g_v_x = gpu_grad[id_v * 3 + 0];
							float g_v_y = gpu_grad[id_v * 3 + 1];
							float g_v_z = gpu_grad[id_v * 3 + 2];
							float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
							//calculate the distance between the pixel and the current voter.
							float x_sq = x_v * x_v;
							float y_sq = y_v * y_v;
							float z_sq = z_v * z_v;
							float d_pv = sqrt(x_sq + y_sq + z_sq);
							// calculate the angle between the pixel and the current voter.
							float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * g_v_m);
						
							// check if the current voter is located in the voting area of this pixel.
							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
												
								sum += mag_v;	
							}
						}
					}	
				}
			}
						
			gpu_vote[i] = sum;			
		}
		template<typename T>
		void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			int rx = r[0];
			int ry = r[1];
			int rz = r[2];
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
						
			//call the kernel to do the voting
			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z);
		}
		template<typename T>
		void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			//calculate the number of bytes in the array
			unsigned int bytes = x * y * z * sizeof(T);
			
			//allocate space on the GPU for the Vote Image
			T* gpu_vote;
			cudaMalloc(&gpu_vote, bytes);		
			//allocate space on the GPU for the input Gradient image
			T* gpu_grad;
			cudaMalloc(&gpu_grad, bytes*3);
			
			//allocate space on the GPU for the Gradient magnitude
			T* gpu_mag;
			cudaMalloc(&gpu_mag, bytes);
			
			//copy the Gradient data to the GPU
			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
			
			//copy the gradient magnitude to the GPU
			cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice);
						
			//call the GPU version of the vote calculation function
			gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z);
							
			//copy the Vote Data back to the CPU
			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
			//free allocated memory
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
			cudaFree(gpu_mag);
		}
#endif