// vote3.cuh

#ifndef STIM_CUDA_VOTE3_H
#define STIM_CUDA_VOTE3_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>


		// Vote kernel: each thread handles one voxel (xi, yi, zi) of an x*y*z volume and writes
		// to gpu_vote the sum of gpu_mag over every neighbouring voxel ("voter") that
		//   (1) lies inside the axis-aligned ellipsoid with radii (rx, ry, rz) centred on the voxel, and
		//   (2) has a gradient whose angle to the voter->voxel direction has a cosine >= cos_phi.
		//
		// Launch layout: 2D thread blocks. blockIdx.x/threadIdx.x tile the x axis; blockIdx.y
		// packs the y tile and the z slice as (zi * grid_y + y_tile), with
		// grid_y = ceil(y / blockDim.y). The grid needs at least grid_y * z blocks along y;
		// surplus blocks decode to zi >= z and exit immediately.
		//
		//   gpu_vote : output, one value per voxel (linear index zi*x*y + yi*x + xi)
		//   gpu_grad : input, 3 interleaved gradient components (x, y, z) per voxel
		//   gpu_mag  : input, gradient magnitude per voxel; this is the quantity that is summed
		//   cos_phi  : cosine threshold of the angular test
		//   rx,ry,rz : search half-widths / ellipsoid radii, in voxels
		//   x, y, z  : volume dimensions, in voxels
		//
		// NOTE: a radius of 0 along any axis makes the ellipsoid test evaluate 0/0 (NaN), so no
		// voter is accepted and every vote is 0.
		template<typename T>
		__global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){

			// an empty volume has nothing to write; this also keeps grid_y below from being 0
			if(x <= 0 || y <= 0 || z <= 0) return;

			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;

			// number of y tiles per z slice, rounded UP so that the last partial tile is
			// addressed when y is not a multiple of blockDim.y (and so it is never 0)
			int grid_y = (y + blockDim.y - 1) / blockDim.y;
			int tile_y = blockIdx.y % grid_y;
			int yi = tile_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;

			if(xi>=x || yi>=y || zi>=z) return;

			// linear index of this voxel, computed in size_t so large volumes cannot wrap
			size_t i = ((size_t)zi * y + yi) * x + xi;

			// define a local variable to sum the votes from the voters
			float sum = 0.0f;

			// squared radii, as float, for the ellipsoid test
			float rx_sq = (float)(rx * rx);
			float ry_sq = (float)(ry * ry);
			float rz_sq = (float)(rz * rz);

			for (int z_v = -rz; z_v <= rz; z_v++){

				// skip voter slices that fall outside the volume
				int zi_v = zi + z_v;
				if (zi_v < 0 || zi_v >= z) continue;
				float z_sq = (float)(z_v * z_v);

				for (int y_v = -ry; y_v <= ry; y_v++){

					int yi_v = yi + y_v;
					if (yi_v < 0 || yi_v >= y) continue;
					float y_sq = (float)(y_v * y_v);

					for (int x_v = -rx; x_v <= rx; x_v++){

						int xi_v = xi + x_v;
						if (xi_v < 0 || xi_v >= x) continue;
						float x_sq = (float)(x_v * x_v);

						// geometric test first: it needs no memory access, so voters outside
						// the ellipsoid are rejected before any global load is issued.
						// Written as !(... <= 1) so a NaN result is rejected as well.
						if (!((x_sq / rx_sq + y_sq / ry_sq + z_sq / rz_sq) <= 1.0f)) continue;

						//calculate the 1D index for the current voter
						size_t id_v = ((size_t)zi_v * y + yi_v) * x + xi_v;

						//find the gradient values along the x, y ,z axis for this voter
						float g_v_x = gpu_grad[id_v * 3 + 0];
						float g_v_y = gpu_grad[id_v * 3 + 1];
						float g_v_z = gpu_grad[id_v * 3 + 2];
						float g_v_m = sqrtf(g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);

						//calculate the distance between the pixel and the current voter.
						float d_pv = sqrtf(x_sq + y_sq + z_sq);

						// the angle is undefined for the centre voxel (d_pv == 0) and for a
						// voter with a zero-length gradient (g_v_m == 0); such voters do not vote
						float denom = d_pv * g_v_m;
						if (denom == 0.0f) continue;

						// cosine of the angle between the voter's gradient and the direction
						// from the voter to this pixel, i.e. (-x_v, -y_v, -z_v)
						float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v)) / denom;

						// the voter's gradient points at this pixel closely enough: count its vote
						if (cos_diff >= cos_phi){
							float mag_v = gpu_mag[id_v];
							sum += mag_v;
						}
					}
				}
			}

			gpu_vote[i] = sum;
		}

		// Launches the vote3 kernel on data that is already resident on the device.
		//
		//   gpu_vote : device pointer, output, x*y*z elements
		//   gpu_grad : device pointer, input gradient, 3 interleaved components per voxel
		//   gpu_mag  : device pointer, input gradient magnitude, x*y*z elements
		//   cos_phi  : cosine threshold of the angular test
		//   r        : voting radii {rx, ry, rz}, in voxels
		//   x, y, z  : volume dimensions, in voxels
		//
		// The launch uses square 2D blocks. The grid's x dimension tiles the volume's x axis;
		// its y dimension carries (tiles along y) * z, so each block row maps to one
		// (y tile, z slice) pair. The call does not synchronize. A launch failure (for example
		// a grid y dimension beyond the device limit) is reported on std::cerr and left
		// pending in the CUDA error state so callers can still query it.
		template<typename T>
		void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			// nothing to compute for an empty volume (and a zero-sized grid is an invalid launch)
			if (x == 0 || y == 0 || z == 0) return;

			int rx = r[0];
			int ry = r[1];
			int rz = r[2];

			// largest square block that fits within the per-block thread limit
			unsigned int max_threads = stim::maxThreadsPerBlock();
			unsigned int side = (unsigned int)sqrt((double)max_threads);
			if (side == 0) side = 1;
			dim3 threads(side, side);

			// ceil-division: exactly enough tiles to cover x and y, no surplus blocks
			unsigned int tiles_x = (x + threads.x - 1) / threads.x;
			unsigned int tiles_y = (y + threads.y - 1) / threads.y;
			dim3 blocks(tiles_x, tiles_y * z);

			//call the kernel to do the voting
			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z);

			// kernel launches return no status; peek (without clearing) so that a bad launch
			// configuration is not silently ignored
			cudaError_t err = cudaPeekAtLastError();
			if (err != cudaSuccess)
				std::cerr << "gpu_vote3: kernel launch failed: " << cudaGetErrorString(err) << std::endl;

		}


		// Host-memory entry point: copies the gradient and gradient-magnitude volumes to the
		// GPU, runs gpu_vote3, and copies the resulting vote volume back to the host.
		//
		//   cpu_vote : host pointer, output, x*y*z elements
		//   cpu_grad : host pointer, input gradient, 3*x*y*z elements
		//   cpu_mag  : host pointer, input gradient magnitude, x*y*z elements
		//   cos_phi  : cosine threshold of the angular test
		//   r        : voting radii {rx, ry, rz}, in voxels
		//   x, y, z  : volume dimensions, in voxels
		//
		// On any CUDA failure the device buffers are released, the error is reported on
		// std::cerr, and cpu_vote is left unmodified.
		template<typename T>
		void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

			//calculate the number of bytes in the array (in size_t so large volumes do not wrap)
			size_t bytes = (size_t)x * y * z * sizeof(T);

			// an empty volume needs no device work
			if (bytes == 0) return;

			// device buffers start as NULL so the cleanup below is valid on every path
			T* gpu_vote = NULL;		// vote image
			T* gpu_grad = NULL;		// input gradient image (3 components per voxel)
			T* gpu_mag = NULL;		// gradient magnitude

			// each step runs only if all previous steps succeeded
			cudaError_t err = cudaMalloc(&gpu_vote, bytes);
			if (err == cudaSuccess) err = cudaMalloc(&gpu_grad, bytes * 3);
			if (err == cudaSuccess) err = cudaMalloc(&gpu_mag, bytes);

			//copy the Gradient data to the GPU
			if (err == cudaSuccess) err = cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice);

			//copy the gradient magnitude to the GPU
			if (err == cudaSuccess) err = cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice);

			//call the GPU version of the vote calculation function
			if (err == cudaSuccess){
				gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z);
				// pick up (and clear) any launch error so a failed launch is not copied back as a result
				err = cudaGetLastError();
			}

			//copy the Vote Data back to the CPU (blocking, so it also surfaces kernel execution errors)
			if (err == cudaSuccess) err = cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost);

			//free allocated memory (cudaFree accepts NULL, so this is safe after a partial failure)
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
			cudaFree(gpu_mag);

			if (err != cudaSuccess)
				std::cerr << "cpu_vote3: CUDA error: " << cudaGetErrorString(err) << std::endl;
		}


#endif