Commit 3f0e43cee10e36ee424753707555ba8e23b3273f
1 parent
c986ebb7
compute the local max without thresholding
Showing
4 changed files
with
133 additions
and
32 deletions
Show diff stats
cpp/CMakeLists.txt
... | ... | @@ -58,3 +58,5 @@ target_link_libraries(ivote3 |
58 | 58 | configure_file(nissl-raw-data/nissl-float-256.256.256.vol nissl-float-256.256.256.vol COPYONLY) |
59 | 59 | configure_file(nissl-raw-data/nissl-float-128.128.128.vol nissl-float-128.128.128.vol COPYONLY) |
60 | 60 | configure_file(nissl-raw-data/nissl-float-64.64.64.vol nissl-float-64.64.64.vol COPYONLY) |
61 | + | |
62 | +configure_file(nissl-raw-data/nissl-float-128.128.128.vol ${CMAKE_CURRENT_BINARY_DIR}/nissl-float-128.128.128_at.vol @ONLY) | |
61 | 63 | \ No newline at end of file | ... | ... |
cpp/local_max3.cuh
... | ... | @@ -6,7 +6,7 @@ |
6 | 6 | #include <stim/cuda/cudatools.h> |
7 | 7 | |
8 | 8 | template<typename T> |
9 | -__global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int conn_y, int conn_z, int x, int y, int z){ | |
9 | +__global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn_y, int conn_z, int x, int y, int z){ | |
10 | 10 | |
11 | 11 | //calculate x,y,z coordinates for this thread |
12 | 12 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
... | ... | @@ -25,9 +25,6 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int |
25 | 25 | //initialize the max value to the vote value for this pixle. |
26 | 26 | T lv_i = gpu_vote[i]; |
27 | 27 | |
28 | - //check the vote value is greater than threshold | |
29 | - if (lv_i < t) return; | |
30 | - | |
31 | 28 | for (int xl = xi - conn_x; xl <= xi + conn_x; xl++){ |
32 | 29 | |
33 | 30 | for (int yl = yi - conn_y; yl <= yi + conn_y; yl++){ |
... | ... | @@ -38,12 +35,17 @@ __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int |
38 | 35 | |
39 | 36 | int i_l = zl * x * y + yl * x + xl; |
40 | 37 | if (gpu_vote[i_l] > lv_i) return; |
38 | + if (gpu_vote[i_l] == lv_i){ | |
39 | + if( i_l > i){ | |
40 | + return; | |
41 | + } | |
42 | + } | |
41 | 43 | } |
42 | 44 | } |
43 | 45 | } |
44 | 46 | } |
45 | 47 | |
46 | - gpu_center[i] = 1; | |
48 | + gpu_center[i] = lv_i; | |
47 | 49 | } |
48 | 50 | |
49 | 51 | template<typename T> |
... | ... | @@ -55,7 +57,7 @@ void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsign |
55 | 57 | dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); |
56 | 58 | |
57 | 59 | //call the kernel to find the local max |
58 | - cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, t, conn[0], conn[1], conn[2], x, y, z); | |
60 | + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z); | |
59 | 61 | |
60 | 62 | |
61 | 63 | ... | ... |
1 | +#ifndef STIM_CUDA_LOCAL_MAX3_THRESHOLD_H | |
2 | +#define STIM_CUDA_LOCAL_MAX3_THRESHOLD_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | + | |
8 | +template<typename T> | |
9 | +__global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, T t, int conn_x, int conn_y, int conn_z, int x, int y, int z){ | |
10 | + | |
11 | + //calculate x,y,z coordinates for this thread | |
12 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
13 | + //find the grid size along y | |
14 | + int grid_y = y / blockDim.y; | |
15 | + int blockidx_y = blockIdx.y % grid_y; | |
16 | + int yi = blockidx_y * blockDim.y + threadIdx.y; | |
17 | + int zi = blockIdx.y / grid_y; | |
18 | + int i = zi * x * y + yi * x + xi; | |
19 | + | |
20 | + if (xi>=x || yi>=y || zi>=z) return; | |
21 | + | |
22 | + //initilize the center value for this pixel to zero | |
23 | + gpu_center[i] = 0; | |
24 | + | |
25 | + //initialize the max value to the vote value for this pixle. | |
26 | + T lv_i = gpu_vote[i]; | |
27 | + | |
28 | + //check the vote value is greater than threshold | |
29 | + if (lv_i < t) return; | |
30 | + | |
31 | + for (int xl = xi - conn_x; xl <= xi + conn_x; xl++){ | |
32 | + | |
33 | + for (int yl = yi - conn_y; yl <= yi + conn_y; yl++){ | |
34 | + | |
35 | + for (int zl = zi - conn_z ; zl <= zi + conn_z; zl++){ | |
36 | + | |
37 | + if (xl>=0 && yl>=0 && zl>=0 && xl<x && yl<y && zl<z){ | |
38 | + | |
39 | + int i_l = zl * x * y + yl * x + xl; | |
40 | + if (gpu_vote[i_l] > lv_i) return; | |
41 | + if (gpu_vote[i_l] == lv_i){ | |
42 | + if( i_l > i){ | |
43 | + return; | |
44 | + } | |
45 | + } | |
46 | + } | |
47 | + } | |
48 | + } | |
49 | + } | |
50 | + | |
51 | + gpu_center[i] = 1; | |
52 | +} | |
53 | + | |
54 | +template<typename T> | |
55 | +void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | |
56 | + | |
57 | + //find the max number of threads per block. | |
58 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
59 | + dim3 threads(sqrt (max_threads),sqrt (max_threads)); | |
60 | + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); | |
61 | + | |
62 | + //call the kernel to find the local max | |
63 | + cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, t, conn[0], conn[1], conn[2], x, y, z); | |
64 | + | |
65 | + | |
66 | + | |
67 | +} | |
68 | + | |
69 | +template<typename T> | |
70 | +void cpu_local_max3(T* cpu_output, T* cpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ | |
71 | + | |
72 | + //calculate the number of bytes in the data set. | |
73 | + unsigned int bytes = x * y * z * sizeof(T); | |
74 | + | |
75 | + //allocate space on the gpu for the vote data and the output. | |
76 | + T* gpu_vote; | |
77 | + cudaMalloc(&gpu_vote, bytes); | |
78 | + | |
79 | + T* gpu_output; | |
80 | + cudaMalloc(&gpu_output, bytes); | |
81 | + | |
82 | + //copy the vote data to the gpu. | |
83 | + cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice); | |
84 | + | |
85 | + //call the gpu version of local max function. | |
86 | + gpu_local_max3<T>(gpu_output, gpu_vote, t, conn, x ,y, z); | |
87 | + | |
88 | + //copy the detected result to the cpu. | |
89 | + cudaMemcpy(cpu_output, gpu_output, bytes, cudaMemcpyDeviceToHost); | |
90 | + | |
91 | + cudaFree(gpu_vote); | |
92 | + cudaFree(gpu_output); | |
93 | +} | |
94 | + | |
95 | +#endif | |
0 | 96 | \ No newline at end of file | ... | ... |
cpp/main.cpp
... | ... | @@ -215,7 +215,7 @@ int main(int argc, char** argv){ |
215 | 215 | unsigned int r[3] = { 12, rmax, rmax}; |
216 | 216 | float std = 5; |
217 | 217 | float sigma[3] = { std, std, std}; |
218 | - unsigned int nlmax = 1; | |
218 | + unsigned int nlmax = 5; | |
219 | 219 | unsigned int conn[3] = { nlmax, nlmax, nlmax}; |
220 | 220 | float phi_deg = 25.0; |
221 | 221 | float phi = phi_deg * pi /180; |
... | ... | @@ -269,31 +269,33 @@ int main(int argc, char** argv){ |
269 | 269 | |
270 | 270 | // creat a file for saving the list centers |
271 | 271 | |
272 | - //std::ofstream list("shared2D-v8/" + OutName.str()+std::to_string(t0)+".obj"); | |
273 | - //// set the number of detected cells to zero. | |
274 | - //int nod = 0; | |
275 | - //if (list.is_open()){ | |
276 | - | |
277 | - // for (int iz=0; iz<z; iz++){ | |
278 | - // for (int iy=0; iy<y; iy++){ | |
279 | - // for (int ix=0; ix<x; ix++){ | |
280 | - | |
281 | - // int idx = iz * x * y + iy * x + ix; | |
282 | - // if (cpu_out[idx]==1){ | |
283 | - // nod++; | |
284 | - // list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << '\n' ; | |
285 | - // | |
286 | - // } | |
287 | - // } | |
288 | - // } | |
289 | - // } | |
290 | - // list << "p" << "\t"; | |
291 | - // for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){ | |
292 | - // list << i_nod << "\t"; | |
293 | - // } | |
294 | - | |
295 | - //list.close(); | |
296 | - //} | |
272 | + std::ofstream list(OutName.str()+std::to_string(t0)+".obj"); | |
273 | + // set the number of detected cells to zero. | |
274 | + int nod = 0; | |
275 | + if (list.is_open()){ | |
276 | + | |
277 | + for (int iz=0; iz<z; iz++){ | |
278 | + for (int iy=0; iy<y; iy++){ | |
279 | + for (int ix=0; ix<x; ix++){ | |
280 | + | |
281 | + int idx = iz * x * y + iy * x + ix; | |
282 | + if (cpu_out[idx]>0){ | |
283 | + nod++; | |
284 | + list << "v" << "\t" << ix << "\t" << iy << "\t"<< iz << "\t" << cpu_out[idx] << '\n' ; | |
285 | + | |
286 | + } | |
287 | + } | |
288 | + } | |
289 | + } | |
290 | + list << "p" << "\t"; | |
291 | + for (unsigned int i_nod =1 ; i_nod <=nod; i_nod++){ | |
292 | + list << i_nod << "\t"; | |
293 | + } | |
294 | + | |
295 | + list.close(); | |
296 | + } | |
297 | + | |
298 | + | |
297 | 299 | //} |
298 | 300 | cudaDeviceReset(); |
299 | 301 | ... | ... |