Commit 02fb26b3ee6354848bb0fb35b9e970b571b251b7
1 parent
6ef1dab9
Change the vote, update_dir, and cudafunc code to use three channels for storing the gradient instead of four channels
Showing
3 changed files
with
20 additions
and
40 deletions
Show diff stats
cpp/cudafunc.cu
@@ -27,9 +27,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | @@ -27,9 +27,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | ||
27 | 27 | ||
28 | cudaDeviceSynchronize(); | 28 | cudaDeviceSynchronize(); |
29 | 29 | ||
30 | - //copy the blur data back to the cpu | ||
31 | - //cudaMemcpy(img, gpuI0, bytes, cudaMemcpyDeviceToHost); | ||
32 | - | ||
33 | //assign memory on the gpu for the gradient along the X, y, z. | 30 | //assign memory on the gpu for the gradient along the X, y, z. |
34 | float* gpu_grad; | 31 | float* gpu_grad; |
35 | cudaMalloc(&gpu_grad, bytes*3); | 32 | cudaMalloc(&gpu_grad, bytes*3); |
@@ -38,14 +35,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | @@ -38,14 +35,6 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | ||
38 | gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); | 35 | gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); |
39 | cudaFree(gpuI0); | 36 | cudaFree(gpuI0); |
40 | 37 | ||
41 | - //assign memory on the gpu for the gradient magnitude | ||
42 | - float* gpu_mag; | ||
43 | - cudaMalloc(&gpu_mag, bytes); | ||
44 | - | ||
45 | - //call the magnitude function | ||
46 | - gpu_mag3<float>(gpu_mag, gpu_grad, x, y, z); | ||
47 | - //cudaMemcpy(img, gpu_mag, bytes, cudaMemcpyDeviceToHost); | ||
48 | - //assign memory on the gpu for the vote. | ||
49 | float* gpu_vote; | 38 | float* gpu_vote; |
50 | cudaMalloc(&gpu_vote, bytes); | 39 | cudaMalloc(&gpu_vote, bytes); |
51 | 40 | ||
@@ -54,7 +43,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | @@ -54,7 +43,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | ||
54 | //call the vote function. | 43 | //call the vote function. |
55 | for (int i = 0; i < iter; i++){ | 44 | for (int i = 0; i < iter; i++){ |
56 | 45 | ||
57 | - gpu_vote3<float>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x, y, z); | 46 | + gpu_vote3<float>(gpu_vote, gpu_grad, cos_phi, r, x, y, z); |
58 | cudaDeviceSynchronize(); | 47 | cudaDeviceSynchronize(); |
59 | if (i==0) | 48 | if (i==0) |
60 | cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); | 49 | cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); |
@@ -68,8 +57,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | @@ -68,8 +57,7 @@ void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, un | ||
68 | 57 | ||
69 | } | 58 | } |
70 | 59 | ||
71 | - cudaFree(gpu_grad); | ||
72 | - cudaFree(gpu_mag); | 60 | + cudaFree(gpu_grad); |
73 | cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); | 61 | cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); |
74 | 62 | ||
75 | //allocate space on the gpu for the final detected cells. | 63 | //allocate space on the gpu for the final detected cells. |
cpp/update_dir3.cuh
@@ -26,7 +26,7 @@ | @@ -26,7 +26,7 @@ | ||
26 | float g_v_x = gpu_grad[i * 3 + 0]; | 26 | float g_v_x = gpu_grad[i * 3 + 0]; |
27 | float g_v_y = gpu_grad[i * 3 + 1]; | 27 | float g_v_y = gpu_grad[i * 3 + 1]; |
28 | float g_v_z = gpu_grad[i * 3 + 2]; | 28 | float g_v_z = gpu_grad[i * 3 + 2]; |
29 | - float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); | 29 | + float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); |
30 | 30 | ||
31 | 31 | ||
32 | // define a local variable to maximum value of the vote image in the voting area for this voter | 32 | // define a local variable to maximum value of the vote image in the voting area for this voter |
@@ -60,7 +60,7 @@ | @@ -60,7 +60,7 @@ | ||
60 | float d_pv = sqrt(x_sq + y_sq + z_sq); | 60 | float d_pv = sqrt(x_sq + y_sq + z_sq); |
61 | 61 | ||
62 | // calculate the angle between the pixel and the current voter. | 62 | // calculate the angle between the pixel and the current voter. |
63 | - float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * g_v_m); | 63 | + float cos_diff = (g_v_x * x_p + g_v_y * y_p + g_v_z * z_p)/(d_pv * mag_v); |
64 | 64 | ||
65 | if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ | 65 | if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ |
66 | 66 | ||
@@ -82,10 +82,11 @@ | @@ -82,10 +82,11 @@ | ||
82 | } | 82 | } |
83 | } | 83 | } |
84 | 84 | ||
85 | - __syncthreads(); | ||
86 | - gpu_dir[i * 3 + 0] = id_x; | ||
87 | - gpu_dir[i * 3 + 1] = id_y; | ||
88 | - gpu_dir[i * 3 + 2] = id_z; | 85 | + |
86 | + float m_id = sqrt (id_x*id_x + id_y*id_y + id_z*id_z); | ||
87 | + gpu_dir[i * 3 + 0] = mag_v * (id_x/m_id); | ||
88 | + gpu_dir[i * 3 + 1] = mag_v * (id_y/m_id); | ||
89 | + gpu_dir[i * 3 + 2] = mag_v * (id_z/m_id); | ||
89 | } | 90 | } |
90 | 91 | ||
91 | // this kernel updates the gradient direction by the calculated voting direction. | 92 | // this kernel updates the gradient direction by the calculated voting direction. |
cpp/vote3.cuh
@@ -10,7 +10,7 @@ | @@ -10,7 +10,7 @@ | ||
10 | 10 | ||
11 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | 11 | // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area |
12 | template<typename T> | 12 | template<typename T> |
13 | - __global__ void vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ | 13 | + __global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){ |
14 | 14 | ||
15 | //calculate x,y,z coordinates for this thread | 15 | //calculate x,y,z coordinates for this thread |
16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
@@ -46,12 +46,11 @@ | @@ -46,12 +46,11 @@ | ||
46 | unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); | 46 | unsigned int id_v = (zi_v) * x * y + (yi_v) * x + (xi_v); |
47 | 47 | ||
48 | //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter | 48 | //find the gradient values along the x, y ,z axis, and the gradient magnitude for this voter |
49 | - float mag_v = gpu_mag[id_v]; | 49 | + |
50 | float g_v_x = gpu_grad[id_v * 3 + 0]; | 50 | float g_v_x = gpu_grad[id_v * 3 + 0]; |
51 | float g_v_y = gpu_grad[id_v * 3 + 1]; | 51 | float g_v_y = gpu_grad[id_v * 3 + 1]; |
52 | float g_v_z = gpu_grad[id_v * 3 + 2]; | 52 | float g_v_z = gpu_grad[id_v * 3 + 2]; |
53 | - float g_v_m = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); | ||
54 | - | 53 | + float mag_v = sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z); |
55 | //calculate the distance between the pixel and the current voter. | 54 | //calculate the distance between the pixel and the current voter. |
56 | float x_sq = x_v * x_v; | 55 | float x_sq = x_v * x_v; |
57 | float y_sq = y_v * y_v; | 56 | float y_sq = y_v * y_v; |
@@ -59,7 +58,7 @@ | @@ -59,7 +58,7 @@ | ||
59 | float d_pv = sqrt(x_sq + y_sq + z_sq); | 58 | float d_pv = sqrt(x_sq + y_sq + z_sq); |
60 | 59 | ||
61 | // calculate the angle between the pixel and the current voter. | 60 | // calculate the angle between the pixel and the current voter. |
62 | - float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * g_v_m); | 61 | + float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v); |
63 | 62 | ||
64 | // check if the current voter is located in the voting area of this pixel. | 63 | // check if the current voter is located in the voting area of this pixel. |
65 | if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ | 64 | if ((((x_sq)/rx_sq + (y_sq)/ry_sq + (z_sq)/rz_sq)<= 1) && (cos_diff >= cos_phi)){ |
@@ -75,23 +74,21 @@ | @@ -75,23 +74,21 @@ | ||
75 | } | 74 | } |
76 | 75 | ||
77 | template<typename T> | 76 | template<typename T> |
78 | - void gpu_vote3(T* gpu_vote, T* gpu_grad, T* gpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | 77 | + void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ |
79 | 78 | ||
80 | - int rx = r[0]; | ||
81 | - int ry = r[1]; | ||
82 | - int rz = r[2]; | 79 | + |
83 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 80 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
84 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); | 81 | dim3 threads(sqrt (max_threads),sqrt (max_threads)); |
85 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | 82 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
86 | 83 | ||
87 | //call the kernel to do the voting | 84 | //call the kernel to do the voting |
88 | - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, gpu_mag, cos_phi, rx, ry, rz, x , y, z); | 85 | + vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z); |
89 | 86 | ||
90 | } | 87 | } |
91 | 88 | ||
92 | 89 | ||
93 | template<typename T> | 90 | template<typename T> |
94 | - void cpu_vote3(T* cpu_vote, T* cpu_grad, T* cpu_mag, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ | 91 | + void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ |
95 | 92 | ||
96 | //calculate the number of bytes in the array | 93 | //calculate the number of bytes in the array |
97 | unsigned int bytes = x * y * z * sizeof(T); | 94 | unsigned int bytes = x * y * z * sizeof(T); |
@@ -105,19 +102,13 @@ | @@ -105,19 +102,13 @@ | ||
105 | T* gpu_grad; | 102 | T* gpu_grad; |
106 | cudaMalloc(&gpu_grad, bytes*3); | 103 | cudaMalloc(&gpu_grad, bytes*3); |
107 | 104 | ||
108 | - //allocate space on the GPU for the Gradient magnitude | ||
109 | - T* gpu_mag; | ||
110 | - cudaMalloc(&gpu_mag, bytes); | ||
111 | 105 | ||
112 | - | ||
113 | //copy the Gradient data to the GPU | 106 | //copy the Gradient data to the GPU |
114 | cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); | 107 | cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice); |
115 | 108 | ||
116 | - //copy the gradient magnitude to the GPU | ||
117 | - cudaMemcpy(gpu_mag, cpu_mag, bytes, cudaMemcpyHostToDevice); | ||
118 | - | 109 | + |
119 | //call the GPU version of the vote calculation function | 110 | //call the GPU version of the vote calculation function |
120 | - gpu_vote3<T>(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x , y, z); | 111 | + gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z); |
121 | 112 | ||
122 | //copy the Vote Data back to the CPU | 113 | //copy the Vote Data back to the CPU |
123 | cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; | 114 | cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ; |
@@ -125,7 +116,7 @@ | @@ -125,7 +116,7 @@ | ||
125 | //free allocated memory | 116 | //free allocated memory |
126 | cudaFree(gpu_vote); | 117 | cudaFree(gpu_vote); |
127 | cudaFree(gpu_grad); | 118 | cudaFree(gpu_grad); |
128 | - cudaFree(gpu_mag); | 119 | + |
129 | } | 120 | } |
130 | 121 | ||
131 | 122 |