Change the vote, update_dir, and cudafunc code to use three channels for storing the gradient instead of four channels

Change the vote, update_dir, and cudafunc code to use three channels for storing the gradient instead of four channels
Laila Saadatifard
1 parent 6ef1dab9
Showing 3 changed files with 20 additions and 40 deletions Show diff stats
cpp/cudafunc.cu
cpp/update_dir3.cuh
cpp/vote3.cuh
@@ -27,9 +27,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
  
 	cudaDeviceSynchronize();
  
-	//copy the blur data back to the cpu
-	//cudaMemcpy(img, gpuI0, bytes, cudaMemcpyDeviceToHost);
-	
 	//assign memory on the gpu for the gradient along the X, y, z.
 	float* gpu_grad;
 	cudaMalloc(&gpu_grad, bytes*3);
@@ -38,14 +35,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
 	gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z);
 	cudaFree(gpuI0);
  
-	//assign memory on the gpu for the gradient magnitude
-	float* gpu_mag;
-	cudaMalloc(&gpu_mag, bytes);
-	
-	//call the magnitude function
-	gpu_mag3<float>(gpu_mag, gpu_grad, x, y, z);
-	//cudaMemcpy(img, gpu_mag, bytes, cudaMemcpyDeviceToHost);
-	//assign memory on the gpu for the vote.
 	float* gpu_vote;
 	cudaMalloc(&gpu_vote, bytes);
  
@@ -54,7 +43,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
 	//call the vote function.
 	for (int i = 0; i < iter; i++){
  
-		gpu_vote3<float>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x, y, z);
+		gpu_vote3<float>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
 		cudaDeviceSynchronize();
 		if (i==0)
 			cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
@@ -68,8 +57,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
  
 	}
  
-	cudaFree(gpu_grad);
-	cudaFree(gpu_mag);
+	cudaFree(gpu_grad);	
 	cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  
 	//allocate space on the gpu for the final detected cells.
@@ -26,7 +26,7 @@
 			float g_v_x = gpu_grad[i * 3 + 0];
 			float g_v_y = gpu_grad[i * 3 + 1];
 			float g_v_z = gpu_grad[i * 3 + 2];
-			float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
+			float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
  
  
 			// define a local variable to maximum value of the vote image in the voting area for this voter
@@ -60,7 +60,7 @@
 							float d_pv = sqrt(x_sq + y_sq + z_sq);
  
 							// calculate the angle between the pixel and the current voter.
-							float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * g_v_m);
+							float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
  
 							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
  
@@ -82,10 +82,11 @@
 				}
 			}
  
-		__syncthreads();
-			gpu_dir[i * 3 + 0] = id_x;
-			gpu_dir[i * 3 + 1] = id_y;
-			gpu_dir[i * 3 + 2] = id_z;
+		
+		float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z);
+			gpu_dir[i * 3 + 0] = mag_v * (id_x/m_id);
+			gpu_dir[i * 3 + 1] = mag_v * (id_y/m_id);
+			gpu_dir[i * 3 + 2] = mag_v * (id_z/m_id);
 		}
  
 		// this kernel updates the gradient direction by the calculated voting direction.
@@ -10,7 +10,7 @@
  
 		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
 		template<typename T>
-		__global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
+		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
  
 			//calculate x,y,z coordinates for this thread
 			int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -46,12 +46,11 @@
 							unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
  
 							//find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
-							float mag_v = gpu_mag[id_v];
+							
 							float g_v_x = gpu_grad[id_v * 3 + 0];
 							float g_v_y = gpu_grad[id_v * 3 + 1];
 							float g_v_z = gpu_grad[id_v * 3 + 2];
-							float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
-
+							float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
 							//calculate the distance between the pixel and the current voter.
 							float x_sq = x_v * x_v;
 							float y_sq = y_v * y_v;
@@ -59,7 +58,7 @@
 							float d_pv = sqrt(x_sq + y_sq + z_sq);
  
 							// calculate the angle between the pixel and the current voter.
-							float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * g_v_m);
+							float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
  
 							// check if the current voter is located in the voting area of this pixel.
 							if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
@@ -75,23 +74,21 @@
 		}
  
 		template<typename T>
-		void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  
-			int rx = r[0];
-			int ry = r[1];
-			int rz = r[2];
+			
 			unsigned int max_threads = stim::maxThreadsPerBlock();
 			dim3 threads(sqrt (max_threads),sqrt (max_threads));
 			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  
 			//call the kernel to do the voting
-			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z);
+			vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
  
 		}
  
  
 		template<typename T>
-		void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
+		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  
 			//calculate the number of bytes in the array
 			unsigned int bytes = x * y * z * sizeof(T);
@@ -105,19 +102,13 @@
 			T* gpu_grad;
 			cudaMalloc(&gpu_grad, bytes*3);
  
-			//allocate space on the GPU for the Gradient magnitude
-			T* gpu_mag;
-			cudaMalloc(&gpu_mag, bytes);
  
-
 			//copy the Gradient data to the GPU
 			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
  
-			//copy the gradient magnitude to the GPU
-			cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice);
-						
+					
 			//call the GPU version of the vote calculation function
-			gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z);
+			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
  
 			//copy the Vote Data back to the CPU
 			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
@@ -125,7 +116,7 @@
 			//free allocated memory
 			cudaFree(gpu_vote);
 			cudaFree(gpu_grad);
-			cudaFree(gpu_mag);
+			
 		}