cpp/update_dir3.cuh

#ifndef STIM_CUDA_UPDATE_DIR3_H
#define STIM_CUDA_UPDATE_DIR3_H
# include <iostream>
# include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <cuda_fp16.h>
		// this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
		template<typename T>
		__global__ void update_dir3(T* gpu_dir, T* gpu_grad, T* gpu_vote, T cos_phi, int rx, int ry, int rz,  int x,  int y, int z){
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;
			if(xi >= x|| yi >= y || zi>= z) return;
			//find the gradient values along the x, y ,z axis, and the gradient magnitude for the voter
			float g_v_x = gpu_grad[i * 3 + 0];
			float g_v_y = gpu_grad[i * 3 + 1];
			float g_v_z = gpu_grad[i * 3 + 2];
			float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
			
			// define a local variable to maximum value of the vote image in the voting area for this voter
			float max = 0;
			float l_vote = 0;
			// define local variables for the x, y, and z coordinations point to the vote direction
			float id_x = g_v_x;
			float id_y = g_v_y;
			float id_z = g_v_z;
			int rx_sq = rx * rx;
			int ry_sq = ry * ry;
			int rz_sq = rz * rz;
			for (int z_p = -rz; z_p<=rz; z_p++){
							
				for(int y_p = -ry; y_p <= ry; y_p++){
					for(int x_p = -rx; x_p <= rx; x_p++){
						//calculate the x, y ,z indices for the current pixel
						int xi_p = (xi + x_p) ;
						int yi_p = (yi + y_p) ;
						int zi_p = (zi + z_p) ; 
						if (zi_p >=0 && zi_p < z && yi_p >=0 && yi_p < y && xi_p >=0 && xi_p < x){
							//calculate the distance between the pixel and the current voter.
							float x_sq = x_p * x_p;
							float y_sq = y_p * y_p;
							float z_sq = z_p * z_p;
							float d_pv = sqrt(x_sq + y_sq + z_sq);
							// calculate the angle between the pixel and the current voter.
							float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
								//calculate the 1D index for the current pixel
								unsigned int id_p = (zi_p) * x * y + (yi_p) * x + (xi_p);
								l_vote = gpu_vote[id_p];
						
							// compare the vote value of this pixel with the max value to find the maxima and its index.
								if  (l_vote>max) {
									max = l_vote;
									id_x = x_p;
									id_y = y_p;
									id_z = z_p;
								}
							}
						}
					}
				}
			}
							
		
		float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z);
			gpu_dir[i * 3 + 0] = mag_v * (id_x/m_id);
			gpu_dir[i * 3 + 1] = mag_v * (id_y/m_id);
			gpu_dir[i * 3 + 2] = mag_v * (id_z/m_id);
		}
		
		// this kernel updates the gradient direction by the calculated voting direction.
		template<typename T>
		__global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
			//calculate x,y,z coordinates for this thread
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			//find the grid size along y
			int grid_y = y / blockDim.y;
			int blockidx_y = blockIdx.y % grid_y;
			int yi = blockidx_y * blockDim.y + threadIdx.y;
			int zi = blockIdx.y / grid_y;
			int i = zi * x * y + yi * x + xi;
			if(xi >= x || yi >= y || zi >= z) return;
			//update the gradient image with the new direction direction
			gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0];
			gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
			gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
		}
		
		template<typename T>
		void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			unsigned int max_threads = stim::maxThreadsPerBlock();
			dim3 threads(sqrt (max_threads),sqrt (max_threads));
			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
			
			// allocate space on the GPU for the updated vote direction
			T* gpu_dir;
			cudaMalloc(&gpu_dir, x * y * z * sizeof(T) * 3);	
			//call the kernel to calculate the new voting direction
			update_dir3 <<< blocks, threads >>>(gpu_dir, gpu_grad, gpu_vote, cos_phi, r[0], r[1], r[2], x , y, z);
			
			
			//call the kernel to update the gradient direction
			update_grad3 <<< blocks, threads >>>(gpu_grad, gpu_dir, x , y, z);
			
			//free allocated memory
			cudaFree(gpu_dir);
		}
		
		template<typename T>
		void cpu_update_dir3(T* cpu_grad, T* cpu_vote, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
			//calculate the number of bytes in the array
			unsigned int bytes = x * y * z * sizeof(T);
			//allocate space on the GPU for the Vote data
			T* gpu_vote;
			cudaMalloc(&gpu_vote, bytes);
			//copy the input vote data to the GPU
			cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice);	
			//allocate space on the GPU for the Gradient data
			T* gpu_grad;
			cudaMalloc(&gpu_grad, bytes*3);
			//copy the Gradient data to the GPU
			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
			//call the GPU version of the update direction function
			gpu_update_dir3<T>(gpu_grad, gpu_vote, cos_phi, r, x , y, z);
							
			//copy the new gradient image back to the CPU
			cudaMemcpy(cpu_grad, gpu_grad, bytes*3, cudaMemcpyDeviceToHost) ;
			//free allocated memory
			cudaFree(gpu_vote);
			cudaFree(gpu_grad);
		}
		
#endif