Commit 02fb26b3ee6354848bb0fb35b9e970b571b251b7

Authored by Laila Saadatifard
1 parent 6ef1dab9

change the vote and update_dir and cudafunc codes to use three channels for storing the gradient instead of four channels
Showing 3 changed files with 20 additions and 40 deletions   Show diff stats
@@ -27,9 +27,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un @@ -27,9 +27,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
27 27
28 cudaDeviceSynchronize(); 28 cudaDeviceSynchronize();
29 29
30 - //copy the blur data back to the cpu  
31 - //cudaMemcpy(img, gpuI0, bytes, cudaMemcpyDeviceToHost);  
32 -  
33 //assign memory on the gpu for the gradient along the X, y, z. 30 //assign memory on the gpu for the gradient along the X, y, z.
34 float* gpu_grad; 31 float* gpu_grad;
35 cudaMalloc(&gpu_grad, bytes*3); 32 cudaMalloc(&gpu_grad, bytes*3);
@@ -38,14 +35,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un @@ -38,14 +35,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
38 gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); 35 gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z);
39 cudaFree(gpuI0); 36 cudaFree(gpuI0);
40 37
41 - //assign memory on the gpu for the gradient magnitude  
42 - float* gpu_mag;  
43 - cudaMalloc(&gpu_mag, bytes);  
44 -  
45 - //call the magnitude function  
46 - gpu_mag3<float>(gpu_mag, gpu_grad, x, y, z);  
47 - //cudaMemcpy(img, gpu_mag, bytes, cudaMemcpyDeviceToHost);  
48 - //assign memory on the gpu for the vote.  
49 float* gpu_vote; 38 float* gpu_vote;
50 cudaMalloc(&gpu_vote, bytes); 39 cudaMalloc(&gpu_vote, bytes);
51 40
@@ -54,7 +43,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un @@ -54,7 +43,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
54 //call the vote function. 43 //call the vote function.
55 for (int i = 0; i < iter; i++){ 44 for (int i = 0; i < iter; i++){
56 45
57 - gpu_vote3<float>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x, y, z); 46 + gpu_vote3<float>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
58 cudaDeviceSynchronize(); 47 cudaDeviceSynchronize();
59 if (i==0) 48 if (i==0)
60 cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); 49 cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
@@ -68,8 +57,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un @@ -68,8 +57,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un
68 57
69 } 58 }
70 59
71 - cudaFree(gpu_grad);  
72 - cudaFree(gpu_mag); 60 + cudaFree(gpu_grad);
73 cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); 61 cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost);
74 62
75 //allocate space on the gpu for the final detected cells. 63 //allocate space on the gpu for the final detected cells.
cpp/update_dir3.cuh
@@ -26,7 +26,7 @@ @@ -26,7 +26,7 @@
26 float g_v_x = gpu_grad[i * 3 + 0]; 26 float g_v_x = gpu_grad[i * 3 + 0];
27 float g_v_y = gpu_grad[i * 3 + 1]; 27 float g_v_y = gpu_grad[i * 3 + 1];
28 float g_v_z = gpu_grad[i * 3 + 2]; 28 float g_v_z = gpu_grad[i * 3 + 2];
29 - float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); 29 + float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
30 30
31 31
32 // define a local variable to maximum value of the vote image in the voting area for this voter 32 // define a local variable to maximum value of the vote image in the voting area for this voter
@@ -60,7 +60,7 @@ @@ -60,7 +60,7 @@
60 float d_pv = sqrt(x_sq + y_sq + z_sq); 60 float d_pv = sqrt(x_sq + y_sq + z_sq);
61 61
62 // calculate the angle between the pixel and the current voter. 62 // calculate the angle between the pixel and the current voter.
63 - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * g_v_m); 63 + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v);
64 64
65 if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ 65 if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
66 66
@@ -82,10 +82,11 @@ @@ -82,10 +82,11 @@
82 } 82 }
83 } 83 }
84 84
85 - __syncthreads();  
86 - gpu_dir[i * 3 + 0] = id_x;  
87 - gpu_dir[i * 3 + 1] = id_y;  
88 - gpu_dir[i * 3 + 2] = id_z; 85 +
  86 + float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z);
  87 + gpu_dir[i * 3 + 0] = mag_v * (id_x/m_id);
  88 + gpu_dir[i * 3 + 1] = mag_v * (id_y/m_id);
  89 + gpu_dir[i * 3 + 2] = mag_v * (id_z/m_id);
89 } 90 }
90 91
91 // this kernel updates the gradient direction by the calculated voting direction. 92 // this kernel updates the gradient direction by the calculated voting direction.
@@ -10,7 +10,7 @@ @@ -10,7 +10,7 @@
10 10
11 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area 11 // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
12 template<typename T> 12 template<typename T>
13 - __global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ 13 + __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
14 14
15 //calculate x,y,z coordinates for this thread 15 //calculate x,y,z coordinates for this thread
16 int xi = blockIdx.x * blockDim.x + threadIdx.x; 16 int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -46,12 +46,11 @@ @@ -46,12 +46,11 @@
46 unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); 46 unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v);
47 47
48 //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter 48 //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter
49 - float mag_v = gpu_mag[id_v]; 49 +
50 float g_v_x = gpu_grad[id_v * 3 + 0]; 50 float g_v_x = gpu_grad[id_v * 3 + 0];
51 float g_v_y = gpu_grad[id_v * 3 + 1]; 51 float g_v_y = gpu_grad[id_v * 3 + 1];
52 float g_v_z = gpu_grad[id_v * 3 + 2]; 52 float g_v_z = gpu_grad[id_v * 3 + 2];
53 - float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);  
54 - 53 + float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);
55 //calculate the distance between the pixel and the current voter. 54 //calculate the distance between the pixel and the current voter.
56 float x_sq = x_v * x_v; 55 float x_sq = x_v * x_v;
57 float y_sq = y_v * y_v; 56 float y_sq = y_v * y_v;
@@ -59,7 +58,7 @@ @@ -59,7 +58,7 @@
59 float d_pv = sqrt(x_sq + y_sq + z_sq); 58 float d_pv = sqrt(x_sq + y_sq + z_sq);
60 59
61 // calculate the angle between the pixel and the current voter. 60 // calculate the angle between the pixel and the current voter.
62 - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * g_v_m); 61 + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
63 62
64 // check if the current voter is located in the voting area of this pixel. 63 // check if the current voter is located in the voting area of this pixel.
65 if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ 64 if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){
@@ -75,23 +74,21 @@ @@ -75,23 +74,21 @@
75 } 74 }
76 75
77 template<typename T> 76 template<typename T>
78 - void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ 77 + void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
79 78
80 - int rx = r[0];  
81 - int ry = r[1];  
82 - int rz = r[2]; 79 +
83 unsigned int max_threads = stim::maxThreadsPerBlock(); 80 unsigned int max_threads = stim::maxThreadsPerBlock();
84 dim3 threads(sqrt (max_threads),sqrt (max_threads)); 81 dim3 threads(sqrt (max_threads),sqrt (max_threads));
85 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 82 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
86 83
87 //call the kernel to do the voting 84 //call the kernel to do the voting
88 - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z); 85 + vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
89 86
90 } 87 }
91 88
92 89
93 template<typename T> 90 template<typename T>
94 - void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ 91 + void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
95 92
96 //calculate the number of bytes in the array 93 //calculate the number of bytes in the array
97 unsigned int bytes = x * y * z * sizeof(T); 94 unsigned int bytes = x * y * z * sizeof(T);
@@ -105,19 +102,13 @@ @@ -105,19 +102,13 @@
105 T* gpu_grad; 102 T* gpu_grad;
106 cudaMalloc(&gpu_grad, bytes*3); 103 cudaMalloc(&gpu_grad, bytes*3);
107 104
108 - //allocate space on the GPU for the Gradient magnitude  
109 - T* gpu_mag;  
110 - cudaMalloc(&gpu_mag, bytes);  
111 105
112 -  
113 //copy the Gradient data to the GPU 106 //copy the Gradient data to the GPU
114 cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); 107 cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
115 108
116 - //copy the gradient magnitude to the GPU  
117 - cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice);  
118 - 109 +
119 //call the GPU version of the vote calculation function 110 //call the GPU version of the vote calculation function
120 - gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z); 111 + gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
121 112
122 //copy the Vote Data back to the CPU 113 //copy the Vote Data back to the CPU
123 cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; 114 cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
@@ -125,7 +116,7 @@ @@ -125,7 +116,7 @@
125 //free allocated memory 116 //free allocated memory
126 cudaFree(gpu_vote); 117 cudaFree(gpu_vote);
127 cudaFree(gpu_grad); 118 cudaFree(gpu_grad);
128 - cudaFree(gpu_mag); 119 +
129 } 120 }
130 121
131 122