cpp/vote3_atomic.cuh

#ifndef STIM_CUDA_VOTE3_ATOMIC_H
#define STIM_CUDA_VOTE3_ATOMIC_H
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/cudatools/error.h>
#include "cpyToshare.cuh"
		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
		template<typename T>
		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
			int xi = blockIdx.x * blockDim.x + threadIdx.x;						//calculate x,y,z coordinates for this thread
			
			int grid_y = y / blockDim.y;										//find the grid size along y
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
						
			if(xi>=x || yi>=y || zi>=z) return;
			int i = zi * x * y + yi * x + xi;	  // calculate the 1D index of the voter
			float gx_v = gpu_grad[3*i];           // find the gradient information in cartesian coordinate for the voter - global memory fetch
			float gy_v = gpu_grad[3*i+1];
			float gz_v = gpu_grad[3*i+2];
			float mag_v = sqrt(gx_v*gx_v + gy_v*gy_v + gz_v*gz_v);  // compute the gradient magnitude for the voter
			float gx_v_n = gx_v/mag_v;      // normalize the gradient vector for the voter
			float gy_v_n = gy_v/mag_v;
			float gz_v_n = gz_v/mag_v;
			float rx_sq = rx * rx;        // compute the square for rmax 
			float ry_sq = ry * ry;
			float rz_sq = rz * rz;
			float x_sq, y_sq, z_sq, d_c, cos_diff;
			int xi_c, yi_c, zi_c, idx_c;
			for (int z_c=-rz; z_c<=rz; z_c++){
				zi_c = zi + z_c;              // calculate the z position for the current counter
				if (zi_c <z && zi_c>=0){      // make sure the current counter is inside the image
					z_sq = z_c * z_c;
					for (int y_c=-ry; y_c<=ry; y_c++){
						yi_c = yi + y_c;
						if (yi_c < y && yi_c>=0){
							y_sq = y_c * y_c;
							for (int x_c=-rx; x_c<=rx; x_c++){
								xi_c = xi + x_c;
								if (xi_c < x && xi_c>=0){
									x_sq = x_c * x_c;
									d_c = sqrt(x_sq + y_sq + z_sq);			//calculate the distance between the voter and the current counter
									cos_diff = (gx_v_n * x_c + gy_v_n * y_c +  gz_v_n * z_c)/(d_c);			 // calculate the cosine of angle between the voter and the current counter
									if ( ( (x_sq/rx_sq + y_sq/ry_sq + z_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){      
										idx_c = (zi_c * y + yi_c) * x + xi_c;			//calculate the 1D index for the current counter
										atomicAdd (&gpu_vote[idx_c] , mag_v);
									}
								}
							}
						}
					}
				}
			}	
		}
		template<typename T>
		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
			//unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);			
			//call the kernel to do the voting
			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
		}
		template<typename T>
		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			//calculate the number of bytes in the array
			unsigned int bytes = x * y * z * sizeof(T);
			
			//allocate space on the GPU for the Vote Image
			T* gpu_vote;
			cudaMalloc(&gpu_vote, bytes);		
			//allocate space on the GPU for the input Gradient image
			T* gpu_grad;
			cudaMalloc(&gpu_grad, bytes*3);
			
			
			//copy the Gradient data to the GPU
			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
			
					
			//call the GPU version of the vote calculation function
			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
							
			//copy the Vote Data back to the CPU
			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
			//free allocated memory
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
			
		}
#endif