Commit 310a169858bcdd5efb908f5548e3fd086cd76488

Authored by Laila Saadatifard
1 parent 2558ee86

update the ivote3 project

Matlab_3D/local_max.m
@@ -26,8 +26,8 @@ for li = 1: nl @@ -26,8 +26,8 @@ for li = 1: nl
26 ind_m = sub2ind(size(Ivote), cx, cy, cz); 26 ind_m = sub2ind(size(Ivote), cx, cy, cz);
27 lm_Ivote = A(ind_m); 27 lm_Ivote = A(ind_m);
28 [~, lm_indx] = max(lm_Ivote(:)); 28 [~, lm_indx] = max(lm_Ivote(:));
29 - if ind_f(li) == ind_m(lm_indx);  
30 - Icenter(ind_f(li)) = 255; 29 + if ind_f(li) == ind_m(lm_indx)
  30 + Icenter(ind_f(li)) = A(ind_f(li)); %255
31 end 31 end
32 end 32 end
33 33
@@ -5,71 +5,65 @@ @@ -5,71 +5,65 @@
5 #include "vote3_atomic_aabb.cuh" 5 #include "vote3_atomic_aabb.cuh"
6 #include "update_dir3_aabb.cuh" 6 #include "update_dir3_aabb.cuh"
7 #include "local_max3.cuh" 7 #include "local_max3.cuh"
8 - 8 +#include <fstream>
9 9
10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[], 10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
11 - int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ 11 + int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
12 12
13 13
14 - cudaSetDevice(0);  
15 -  
16 - unsigned int bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data 14 + size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
17 15
18 float* gpuI0; //assign memory on gpu for the input data 16 float* gpuI0; //assign memory on gpu for the input data
19 cudaMalloc(&gpuI0, bytes); 17 cudaMalloc(&gpuI0, bytes);
20 cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU. 18 cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU.
21 19
22 -  
23 gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z); //call the blurring function from the gpu. 20 gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z); //call the blurring function from the gpu.
24 cudaDeviceSynchronize(); 21 cudaDeviceSynchronize();
25 22
26 float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z. 23 float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z.
27 cudaMalloc(&gpu_grad, bytes*3); 24 cudaMalloc(&gpu_grad, bytes*3);
28 25
29 - gpu_gradient3<float>(gpu_grad, gpuI0, 1, x, y, z); //call the gradient function from the gpu. 26 + gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); //call the gradient function from the gpu.
30 cudaFree(gpuI0); 27 cudaFree(gpuI0);
31 28
32 float* gpu_vote; 29 float* gpu_vote;
33 cudaMalloc(&gpu_vote, bytes); 30 cudaMalloc(&gpu_vote, bytes);
34 31
35 float cos_phi = cos(phi); 32 float cos_phi = cos(phi);
36 - 33 +
37 //call the vote function. 34 //call the vote function.
38 for (int i = 0; i < iter; i++){ 35 for (int i = 0; i < iter; i++){
39 36
40 cudaMemset(gpu_vote, 0, bytes); 37 cudaMemset(gpu_vote, 0, bytes);
41 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z); 38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
42 cudaDeviceSynchronize(); 39 cudaDeviceSynchronize();
43 -  
44 - //if (phi >= d_phi){ 40 + if (i == 0) {
  41 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  42 + std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);
  43 + fvote.write((char*)img, bytes);
  44 + fvote.close();
  45 + }
  46 + if (i == 1) {
  47 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  48 + std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);
  49 + fvote.write((char*)img, bytes);
  50 + fvote.close();
  51 + }
45 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z); 52 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
46 - cudaDeviceSynchronize();  
47 - phi = phi - d_phi;  
48 - cos_phi = cos(phi);  
49 - //}  
50 - 53 + cudaDeviceSynchronize();
  54 + phi = phi - d_phi;
  55 + cos_phi = cos(phi);
51 } 56 }
52 57
53 cudaFree(gpu_grad); 58 cudaFree(gpu_grad);
54 cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); 59 cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
55 60
56 - //allocate space on the gpu for the final detected cells.  
57 - //float* gpu_output;  
58 - //cudaMalloc(&gpu_output, bytes);  
59 -  
60 - ////call the local max function  
61 - //gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z);  
62 -  
63 - ////copy the final result to the cpu.  
64 - //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);  
65 - //  
66 - //  
67 cudaFree(gpu_vote); 61 cudaFree(gpu_vote);
68 //cudaFree(gpu_output); 62 //cudaFree(gpu_output);
69 63
70 } 64 }
71 65
72 -void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ 66 +void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
73 unsigned int bytes = x * y * z * sizeof(float); 67 unsigned int bytes = x * y * z * sizeof(float);
74 68
75 cudaSetDevice(0); 69 cudaSetDevice(0);
@@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u @@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u
89 83
90 cudaFree(gpuV); 84 cudaFree(gpuV);
91 cudaFree(gpuOut); 85 cudaFree(gpuOut);
92 -}  
93 - 86 +}
94 \ No newline at end of file 87 \ No newline at end of file
cpp/gaussian_blur3.cuh
@@ -8,6 +8,7 @@ @@ -8,6 +8,7 @@
8 8
9 #define pi 3.14159 9 #define pi 3.14159
10 10
  11 +
11 12
12 template<typename T> 13 template<typename T>
13 __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){ 14 __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
@@ -22,7 +23,7 @@ @@ -22,7 +23,7 @@
22 int i = zi * x * y + yi * x + xi; 23 int i = zi * x * y + yi * x + xi;
23 24
24 // calculate the kernel size 25 // calculate the kernel size
25 - int k_size = sigma * 4; 26 + T k_size = sigma * 4;
26 27
27 //if the current pixel is outside of the image 28 //if the current pixel is outside of the image
28 if(xi >= x || yi >= y || zi>=z) 29 if(xi >= x || yi >= y || zi>=z)
@@ -33,14 +34,21 @@ @@ -33,14 +34,21 @@
33 T G; 34 T G;
34 T sum = 0; //running weighted sum across the kernel 35 T sum = 0; //running weighted sum across the kernel
35 out[i] = 0; 36 out[i] = 0;
36 - T sigma_sq = 2 * sigma * sigma;  
37 - T a = 1.0 / (sigma * sqrt(2 * pi)); 37 + T sigma_sq = 2.0 * sigma * sigma;
  38 + T a = 1.0 / (sigma * sqrt(2.0 * pi));
  39 +
  40 + //handle the boundary in x direction
  41 + int kl = -(int)k_size;
  42 + //if (xi < k_size) kl = -xi;
  43 + int kh = (int)k_size;
  44 + //if (xi >= x - (int)k_size) kh = x - xi - 1;
  45 +
38 46
39 //for each element of the kernel 47 //for each element of the kernel
40 - for(int ki = -k_size; ki <= k_size; ki++){ 48 + for(int ki = kl; ki <= kh; ki++){
41 49
42 //calculate the gaussian value 50 //calculate the gaussian value
43 - G = a * exp(-(ki*ki) / (sigma_sq)); 51 + G = a * exp(-(T)(ki*ki) / (sigma_sq));
44 //calculate the global coordinates for this point in the kernel 52 //calculate the global coordinates for this point in the kernel
45 gx = (xi + ki) % x; 53 gx = (xi + ki) % x;
46 gi = zi * x * y + yi * x + gx; 54 gi = zi * x * y + yi * x + gx;
@@ -65,7 +73,7 @@ @@ -65,7 +73,7 @@
65 int i = zi * x * y + yi * x + xi; 73 int i = zi * x * y + yi * x + xi;
66 74
67 // calculate the kernel size 75 // calculate the kernel size
68 - int k_size = sigma * 4; 76 + T k_size = sigma * 4;
69 77
70 //if the current pixel is outside of the image 78 //if the current pixel is outside of the image
71 if(xi >= x || yi >= y || zi>=z) 79 if(xi >= x || yi >= y || zi>=z)
@@ -76,14 +84,22 @@ @@ -76,14 +84,22 @@
76 T G; 84 T G;
77 T sum = 0; //running weighted sum across the kernel 85 T sum = 0; //running weighted sum across the kernel
78 out[i] = 0; 86 out[i] = 0;
79 - T sigma_sq = 2 * sigma * sigma;  
80 - T a = 1.0 / (sigma * sqrt(2 * pi)); 87 + T sigma_sq = 2.0 * sigma * sigma;
  88 + T a = 1.0 / (sigma * sqrt(2.0 * pi));
  89 +
  90 + //handle the boundary in y direction
  91 + int kl = -(int)k_size;
  92 + //if (yi < k_size) kl = -yi;
  93 + int kh = (int)k_size;
  94 + //if (yi >= y - (int)k_size) kh = y - yi - 1;
  95 +
  96 +
81 97
82 //for each element of the kernel 98 //for each element of the kernel
83 - for(int ki = -k_size; ki <= k_size; ki++){ 99 + for(int ki = kl; ki <= kh; ki++){
84 100
85 //calculate the gaussian value 101 //calculate the gaussian value
86 - G = a * exp(-(ki*ki) / sigma_sq); 102 + G = a * exp(-(T)(ki*ki) / sigma_sq);
87 //calculate the global coordinates for this point in the kernel 103 //calculate the global coordinates for this point in the kernel
88 gy = (yi + ki ) % y; 104 gy = (yi + ki ) % y;
89 gi = zi * x * y + gy * x + xi; 105 gi = zi * x * y + gy * x + xi;
@@ -108,7 +124,7 @@ @@ -108,7 +124,7 @@
108 int i = zi * x * y + yi * x + xi; 124 int i = zi * x * y + yi * x + xi;
109 125
110 // calculate the kernel size 126 // calculate the kernel size
111 - int k_size = sigma * 4; 127 + T k_size = sigma * 4;
112 128
113 //if the current pixel is outside of the image 129 //if the current pixel is outside of the image
114 if(xi >= x || yi >= y || zi>=z) 130 if(xi >= x || yi >= y || zi>=z)
@@ -119,14 +135,20 @@ @@ -119,14 +135,20 @@
119 T G; 135 T G;
120 T sum = 0; //running weighted sum across the kernel 136 T sum = 0; //running weighted sum across the kernel
121 out[i] = 0; 137 out[i] = 0;
122 - T sigma_sq = 2 * sigma * sigma;  
123 - T a = 1.0 / (sigma * sqrt(2 * pi)); 138 + T sigma_sq = 2.0 * sigma * sigma;
  139 + T a = 1.0 / (sigma * sqrt(2.0 * pi));
  140 +
  141 + //handle the boundary in z direction
  142 + int kl = -(int)k_size;
  143 + //if (zi < k_size) kl = -zi;
  144 + int kh = (int)k_size;
  145 + //if (zi >= z - (int)k_size) kh = z - zi - 1;
124 146
125 //for each element of the kernel 147 //for each element of the kernel
126 - for(int ki = -k_size; ki <= k_size; ki++){ 148 + for(int ki = kl; ki <= kh; ki++){
127 149
128 //calculate the gaussian value 150 //calculate the gaussian value
129 - G = a * exp(-(ki*ki) / sigma_sq); 151 + G = a * exp(-(T)(ki*ki) / sigma_sq);
130 //calculate the global coordinates for this point in the kernel 152 //calculate the global coordinates for this point in the kernel
131 gz = (zi + ki) % z; 153 gz = (zi + ki) % z;
132 gi = gz * x * y + yi * x + xi; 154 gi = gz * x * y + yi * x + xi;
@@ -138,13 +160,13 @@ @@ -138,13 +160,13 @@
138 } 160 }
139 161
140 template<typename T> 162 template<typename T>
141 - void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ 163 + void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){
142 164
143 //get the number of pixels in the image 165 //get the number of pixels in the image
144 - unsigned int pixels = x * y * z;  
145 - unsigned int bytes = sizeof(T) * pixels; 166 + size_t pixels = x * y * z;
  167 + size_t bytes = sizeof(T) * pixels;
146 168
147 - int max_threads = stim::maxThreadsPerBlock(); 169 + unsigned int max_threads = stim::maxThreadsPerBlock();
148 dim3 threads(sqrt (max_threads),sqrt (max_threads)); 170 dim3 threads(sqrt (max_threads),sqrt (max_threads));
149 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 171 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
150 172
@@ -7,7 +7,7 @@ @@ -7,7 +7,7 @@
7 #include <stim/cuda/cudatools/error.h> 7 #include <stim/cuda/cudatools/error.h>
8 8
9 template<typename T> 9 template<typename T>
10 -__global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ 10 +__global__ void gradient3(T* out, T* in, int x, int y, int z){
11 11
12 //calculate x,y,z coordinates for this thread 12 //calculate x,y,z coordinates for this thread
13 int xi = blockIdx.x * blockDim.x + threadIdx.x; 13 int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){ @@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){
55 if(zi > 0 && zi < z-1) 55 if(zi > 0 && zi < z-1)
56 out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2; 56 out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2;
57 57
58 - out[i * 3 + 2] *= 1/anisotropy;  
59 -  
60 } 58 }
61 59
62 template<typename T> 60 template<typename T>
63 61
64 -void gpu_gradient3(T* gpuGrad, T* gpuI, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ 62 +void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
65 63
66 64
67 - int max_threads = stim::maxThreadsPerBlock(); 65 + unsigned int max_threads = stim::maxThreadsPerBlock();
68 dim3 threads(sqrt (max_threads),sqrt (max_threads)); 66 dim3 threads(sqrt (max_threads),sqrt (max_threads));
69 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); 67 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
70 68
71 //call the GPU kernel to determine the gradient 69 //call the GPU kernel to determine the gradient
72 - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, anisotropy, x, y, z); 70 + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
73 71
74 } 72 }
75 73
76 template<typename T> 74 template<typename T>
77 -void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int y, unsigned int z){ 75 +void cpu_gradient3(T* out, T* in, size_t x, size_t y, size_t z){
78 76
79 //get the number of pixels in the image 77 //get the number of pixels in the image
80 - unsigned int pixels = x * y * z;  
81 - unsigned int bytes = pixels * sizeof(T); 78 + size_t pixels = x * y * z;
  79 + size_t bytes = pixels * sizeof(T);
82 80
83 //allocate space on the GPU for the input image 81 //allocate space on the GPU for the input image
84 T* gpuIn; 82 T* gpuIn;
@@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int @@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int
92 cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y) 90 cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y)
93 91
94 //call the GPU version of this function 92 //call the GPU version of this function
95 - gpu_gradient3(gpuOut, gpuIn, anisotropy, x, y, z); 93 + gpu_gradient3(gpuOut, gpuIn, x, y, z);
96 94
97 //copy the results to the CPU 95 //copy the results to the CPU
98 cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost); 96 cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost);
@@ -19,21 +19,22 @@ float phi; @@ -19,21 +19,22 @@ float phi;
19 size_t x, y, z; 19 size_t x, y, z;
20 20
21 21
22 -void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[],  
23 - unsigned int x, unsigned int y, unsigned int z);  
24 -void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z);  
25 -  
26 -void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){  
27 - for(int ix = 0; ix < x; ix++){  
28 - for (int iy = 0; iy < y; iy++){  
29 - for (int iz = 0; iz < z; iz++){  
30 - int idx = iz * x * y + iy * x + ix;  
31 - cpuI[idx] = 255 - cpuI[idx];  
32 - } 22 +void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[],
  23 + size_t x, size_t y, size_t z);
  24 +void lmax(float* center, float* vote, float t1, unsigned int conn[], size_t x, size_t y, size_t z);
  25 +
  26 +void invert_data(float* cpuI, size_t x, size_t y, size_t z) {
  27 + size_t idx;
  28 + for (size_t ix = 0; ix < x; ix++) {
  29 + for (size_t iy = 0; iy < y; iy++) {
  30 + for (size_t iz = 0; iz < z; iz++) {
  31 + idx = iz * x * y + iy * x + ix;
  32 + cpuI[idx] = 255 - cpuI[idx];
33 } 33 }
34 } 34 }
35 } 35 }
36 - 36 +}
  37 +
37 38
38 39
39 void advertise() { 40 void advertise() {
@@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) { @@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) {
57 args.add("x", "size of the dataset along X axis", "positive value"); 58 args.add("x", "size of the dataset along X axis", "positive value");
58 args.add("y", "size of the dataset along Y axis", "positive value"); 59 args.add("y", "size of the dataset along Y axis", "positive value");
59 args.add("z", "size of the dataset along Z axis", "positive value"); 60 args.add("z", "size of the dataset along Z axis", "positive value");
60 - args.add("t", "threshold value for the final result", "positive valu"); 61 + args.add("t", "threshold value for the final result", "0", "positive valu");
61 args.add("invert", "to invert the input data set", "string"); 62 args.add("invert", "to invert the input data set", "string");
62 args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]"); 63 args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]");
63 args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180"); 64 args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180");
64 args.add("iter", "number of iterations for voting", "8", "i > 0"); 65 args.add("iter", "number of iterations for voting", "8", "i > 0");
65 - args.add("sigma", "the gaussian blur standard deviation", "5", "s >=0 (s = 0, no blurring)"); 66 + args.add("sigma", "the gaussian blur standard deviation", "3", "s >=0 (s = 0, no blurring)");
66 args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]"); 67 args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]");
67 //parse the command line arguments. 68 //parse the command line arguments.
68 args.parse(argc, argv); 69 args.parse(argc, argv);
@@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) { @@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) {
95 t = args["t"].as_float(); 96 t = args["t"].as_float();
96 sigma = args["sigma"].as_float(); 97 sigma = args["sigma"].as_float();
97 phi = (float)args["phi"].as_float() * (float)stim::PI / 180; 98 phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
98 - 99 +
99 } 100 }
100 -int main(int argc, char** argv){ 101 +int main(int argc, char** argv) {
101 102
102 103
103 cudaDeviceProp prop; 104 cudaDeviceProp prop;
104 int count; 105 int count;
105 cudaGetDeviceCount(&count); 106 cudaGetDeviceCount(&count);
106 - for (int i=0; i<count; i++){ 107 + for (int i = 0; i<count; i++) {
107 cudaGetDeviceProperties(&prop, i); 108 cudaGetDeviceProperties(&prop, i);
108 printf("current device ID: %d\n", i); 109 printf("current device ID: %d\n", i);
109 printf("device name: %s\n", prop.name); 110 printf("device name: %s\n", prop.name);
110 printf("total global mem: %lu\n", prop.totalGlobalMem); 111 printf("total global mem: %lu\n", prop.totalGlobalMem);
111 printf("shared memory per block: %lu\n", prop.sharedMemPerBlock); 112 printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
112 } 113 }
113 - 114 +
114 init_args(argc, argv); 115 init_args(argc, argv);
115 116
116 - unsigned int r[3] = { rmax , rmax, rmax};  
117 -  
118 - float sigma3[3] = { sigma, sigma, sigma};  
119 - unsigned int conn[3] = { nlmax, nlmax, nlmax};  
120 - float d_phi = phi/(iter+2);  
121 -  
122 - size_t bytes = x*y*z*sizeof(float); 117 + unsigned int r[3] = { rmax , rmax, rmax };
  118 +
  119 + float sigma3[3] = { sigma, sigma, sigma };
  120 + unsigned int conn[3] = { nlmax, nlmax, nlmax };
  121 + float d_phi = phi / (iter);
  122 +
  123 + size_t bytes = x*y*z * sizeof(float);
123 124
124 //allocate space on the cpu for the input data 125 //allocate space on the cpu for the input data
125 - float* cpuI = (float*) malloc(bytes); 126 + float* cpuI = (float*)malloc(bytes);
126 127
127 //load the input file into the cpuI 128 //load the input file into the cpuI
128 std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary); 129 std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary);
129 nissl.read((char*)cpuI, bytes); 130 nissl.read((char*)cpuI, bytes);
130 nissl.close(); 131 nissl.close();
131 - if(args["invert"].is_set()) 132 + if (args["invert"].is_set())
132 invert_data(cpuI, x, y, z); 133 invert_data(cpuI, x, y, z);
133 - 134 +
134 //write a new file from the cpuI. 135 //write a new file from the cpuI.
135 std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary); 136 std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary);
136 original.write((char*)cpuI, bytes); 137 original.write((char*)cpuI, bytes);
137 original.close(); 138 original.close();
138 -  
139 - 139 +
  140 +
140 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function 141 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function
141 -  
142 - std::ofstream fvote("0-vote8.vol", std::ofstream::out | std::ofstream::binary); 142 +
  143 + std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
143 fvote.write((char*)cpuI, bytes); 144 fvote.write((char*)cpuI, bytes);
144 fvote.close(); 145 fvote.close();
145 - 146 +
146 //allocate space on the cpu for the output result 147 //allocate space on the cpu for the output result
147 float* cpu_out = (float*)malloc(bytes * 3); 148 float* cpu_out = (float*)malloc(bytes * 3);
148 149
149 //write the output file. 150 //write the output file.
150 //for (int t0=0; t0<=5000; t0+=100){ 151 //for (int t0=0; t0<=5000; t0+=100){
151 // float t1 = t0; 152 // float t1 = t0;
152 - int t0 = t;  
153 - lmax(cpu_out, cpuI, t, conn, x, y, z);  
154 - //std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);  
155 - std::ofstream fo( args.arg(1), std::ofstream::out | std::ofstream::binary);  
156 - fo.write((char*)cpu_out, bytes);  
157 - fo.close(); 153 + int t0 = t;
  154 + lmax(cpu_out, cpuI, t, conn, x, y, z);
158 155
  156 + std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
  157 + fo.write((char*)cpu_out, bytes);
  158 + fo.close();
  159 +
159 // creat a file for saving the list centers 160 // creat a file for saving the list centers
160 -  
161 - std::ofstream list(args.arg(2));  
162 - // set the number of detected cells to zero.  
163 - int nod = 0;  
164 - if (list.is_open()){  
165 -  
166 - for (int iz=0; iz<z; iz++){  
167 - for (int iy=0; iy<y; iy++){  
168 - for (int ix=0; ix<x; ix++){  
169 -  
170 - int idx = iz * x * y + iy * x + ix;  
171 - if (cpu_out[idx]>0){  
172 - nod++;  
173 - list << ix << " " << iy << " "<< iz << " " << cpu_out[idx] << '\n' ;  
174 -  
175 - }  
176 - } 161 +
  162 + std::ofstream list(args.arg(2));
  163 + // set the number of detected cells to zero.
  164 + int nod = 0;
  165 + if (list.is_open()) {
  166 +
  167 + for (int iz = 0; iz<z; iz++) {
  168 + for (int iy = 0; iy<y; iy++) {
  169 + for (int ix = 0; ix<x; ix++) {
  170 +
  171 + int idx = iz * x * y + iy * x + ix;
  172 + if (cpu_out[idx]>0) {
  173 + nod++;
  174 + list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
  175 +
177 } 176 }
178 } 177 }
179 -  
180 - list.close(); 178 + }
181 } 179 }
182 180
  181 + list.close();
  182 + }
  183 +
183 184
184 //} 185 //}
185 - cudaDeviceReset();  
186 -  
187 -} 186 + cudaDeviceReset();
188 187
  188 +}
189 \ No newline at end of file 189 \ No newline at end of file
cpp/update_dir3_aabb.cuh
@@ -46,7 +46,7 @@ @@ -46,7 +46,7 @@
46 float step = 360.0/n; 46 float step = 360.0/n;
47 stim::circle<float> cir(center, d, norm); 47 stim::circle<float> cir(center, d, norm);
48 stim::aabb3<int> bb(xi,yi,zi); 48 stim::aabb3<int> bb(xi,yi,zi);
49 - bb.insert(xc,yc,zc); 49 + bb.insert((int) xc, (int)yc, (int)zc);
50 for(float j = 0; j <360.0; j += step){ 50 for(float j = 0; j <360.0; j += step){
51 stim::vec3<float> out = cir.p(j); 51 stim::vec3<float> out = cir.p(j);
52 bb.insert(out[0], out[1], out[2]); 52 bb.insert(out[0], out[1], out[2]);
@@ -67,7 +67,7 @@ @@ -67,7 +67,7 @@
67 float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction 67 float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction
68 float id_y = g[1]; 68 float id_y = g[1];
69 float id_z = g[2]; 69 float id_z = g[2];
70 - 70 +
71 for (bz=bb.low[2]; bz<=bb.high[2]; bz++){ 71 for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
72 dz = bz - zi; //compute the distance bw the voter and the current counter along z axis 72 dz = bz - zi; //compute the distance bw the voter and the current counter along z axis
73 dz_sq = dz * dz; 73 dz_sq = dz * dz;
@@ -85,9 +85,9 @@ @@ -85,9 +85,9 @@
85 l_vote = gpu_vote[idx_c]; 85 l_vote = gpu_vote[idx_c];
86 if (l_vote>max) { 86 if (l_vote>max) {
87 max = l_vote; 87 max = l_vote;
88 - id_x = dx;  
89 - id_y = dy;  
90 - id_z = dz; 88 + id_x = (float)dx;
  89 + id_y = (float)dy;
  90 + id_z = (float)dz;
91 } 91 }
92 } 92 }
93 } 93 }
@@ -103,25 +103,26 @@ @@ -103,25 +103,26 @@
103 103
104 // this kernel updates the gradient direction by the calculated voting direction. 104 // this kernel updates the gradient direction by the calculated voting direction.
105 template<typename T> 105 template<typename T>
106 - __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){ 106 + __global__ void update_grad3(T* gpu_grad, T* gpu_dir, size_t x, size_t y, size_t z){
107 107
108 - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread 108 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
  109 +
  110 + size_t grid_y = y / blockDim.y; //find the grid size along y
  111 + size_t blockidx_y = blockIdx.y % grid_y;
  112 + size_t yi = blockidx_y * blockDim.y + threadIdx.y;
  113 + size_t zi = blockIdx.y / grid_y;
109 114
110 - int grid_y = y / blockDim.y; //find the grid size along y  
111 - int blockidx_y = blockIdx.y % grid_y;  
112 - int yi = blockidx_y * blockDim.y + threadIdx.y;  
113 - int zi = blockIdx.y / grid_y;  
114 - int i = zi * x * y + yi * x + xi;  
115 -  
116 if(xi >= x || yi >= y || zi >= z) return; 115 if(xi >= x || yi >= y || zi >= z) return;
117 116
  117 + size_t i = zi * x * y + yi * x + xi;
  118 +
118 gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction 119 gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction
119 gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1]; 120 gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
120 gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2]; 121 gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
121 } 122 }
122 123
123 template<typename T> 124 template<typename T>
124 - void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ 125 + void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
125 126
126 unsigned int max_threads = stim::maxThreadsPerBlock(); 127 unsigned int max_threads = stim::maxThreadsPerBlock();
127 dim3 threads(sqrt (max_threads),sqrt (max_threads)); 128 dim3 threads(sqrt (max_threads),sqrt (max_threads));
cpp/vote3_atomic_aabb.cuh
@@ -13,121 +13,121 @@ @@ -13,121 +13,121 @@
13 #include <stim/math/vector.h> 13 #include <stim/math/vector.h>
14 #include <stim/visualization/aabb3.h> 14 #include <stim/visualization/aabb3.h>
15 15
16 - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area  
17 - template<typename T>  
18 - __global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){  
19 -  
20 - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread  
21 -  
22 - int grid_y = y / blockDim.y; //find the grid size along y  
23 - int blockidx_y = blockIdx.y % grid_y;  
24 - int yi = blockidx_y * blockDim.y + threadIdx.y;  
25 - int zi = blockIdx.y / grid_y;  
26 -  
27 - if(xi>=x || yi>=y || zi>=z) return;  
28 -  
29 - int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter  
30 -  
31 -  
32 - float rx_sq = rx * rx; // compute the square for rmax  
33 - float ry_sq = ry * ry;  
34 - float rz_sq = rz * rz;  
35 -  
36 - stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector  
37 - stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector  
38 - float n =8; //set the number of points to find the boundaries of the conical voting area  
39 - float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter  
40 - float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;  
41 - float zc = rz * cos(g_sph[2]) ;  
42 - float r = sqrt(xc*xc + yc*yc + zc*zc);  
43 - xc+=xi;  
44 - yc+=yi;  
45 - zc+=zi;  
46 - stim::vec3<float> center(xc,yc,zc);  
47 -  
48 - float d = 2 * r * tan(phi); //find the diameter of the conical voting area  
49 - stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector  
50 - float step = 360.0/n;  
51 - stim::circle<float> cir(center, d, norm);  
52 - stim::aabb3<int> bb(xi,yi,zi);  
53 - bb.insert((int)xc, (int)yc, (int)zc);  
54 - for(float j = 0; j <360.0; j += step){  
55 - stim::vec3<float> out = cir.p(j);  
56 - bb.insert(out[0], out[1], out[2]); 16 +// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  17 +template<typename T>
  18 +__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) {
  19 +
  20 + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
  21 +
  22 + int grid_y = y / blockDim.y; //find the grid size along y
  23 + int blockidx_y = blockIdx.y % grid_y;
  24 + int yi = blockidx_y * blockDim.y + threadIdx.y;
  25 + int zi = blockIdx.y / grid_y;
  26 +
  27 + if (xi >= x || yi >= y || zi >= z) return;
  28 +
  29 + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter
  30 +
  31 +
  32 + float rx_sq = rx * rx; // compute the square for rmax
  33 + float ry_sq = ry * ry;
  34 + float rz_sq = rz * rz;
  35 +
  36 + stim::vec3<float> g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]); // form a vec3 variable for the gradient vector
  37 + stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector
  38 + float n = 8; //set the number of points to find the boundaries of the conical voting area
  39 + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter
  40 + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]);
  41 + float zc = rz * cos(g_sph[2]);
  42 + float r = sqrt(xc*xc + yc*yc + zc*zc);
  43 + xc += xi;
  44 + yc += yi;
  45 + zc += zi;
  46 + stim::vec3<float> center(xc, yc, zc);
  47 +
  48 + float d = 2 * r * tan(phi); //find the diameter of the conical voting area
  49 + stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector
  50 + float step = 360.0 / n;
  51 + stim::circle<float> cir(center, d, norm);
  52 + stim::aabb3<int> bb(xi, yi, zi);
  53 + bb.insert((int)xc, (int)yc, (int)zc);
  54 + for (float j = 0; j <360.0; j += step) {
  55 + stim::vec3<float> out = cir.p(j);
  56 + bb.insert(out[0], out[1], out[2]);
  57 + }
  58 + bb.trim_low(xi - rx, yi - ry, zi - rz);
  59 + bb.trim_low(0, 0, 0);
  60 + bb.trim_high(xi + rx, yi + ry, zi + rz);
  61 + bb.trim_high(x - 1, y - 1, z - 1);
  62 + int bx, by, bz;
  63 + int dx, dy, dz;
  64 + float dx_sq, dy_sq, dz_sq;
  65 + float dist, cos_diff;
  66 + int idx_c;
  67 + for (bz = bb.low[2]; bz <= bb.high[2]; bz++) {
  68 + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis
  69 + dz_sq = dz * dz;
  70 + for (by = bb.low[1]; by <= bb.high[1]; by++) {
  71 + dy = by - yi; //compute the distance bw the voter and the current counter along y axis
  72 + dy_sq = dy * dy;
  73 + for (bx = bb.low[0]; bx <= bb.high[0]; bx++) {
  74 + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis
  75 + dx_sq = dx * dx;
  76 +
  77 + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter
  78 + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist; // calculate the cosine of angle between the voter and the current counter
  79 + if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) { //check if the current counter located in the voting area of the voter
  80 + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter
  81 + atomicAdd(&gpu_vote[idx_c], g_sph[0]);
  82 + }
57 } 83 }
58 - bb.trim_low(xi-rx, yi-ry, zi-rz);  
59 - bb.trim_low(0,0,0);  
60 - bb.trim_high(xi+rx, yi+ry, zi+rz);  
61 - bb.trim_high(x-1, y-1, z-1);  
62 - int bx,by,bz;  
63 - int dx, dy, dz;  
64 - float dx_sq, dy_sq, dz_sq;  
65 - float dist, cos_diff;  
66 - int idx_c;  
67 - for (bz=bb.low[2]; bz<=bb.high[2]; bz++){  
68 - dz = bz - zi; //compute the distance bw the voter and the current counter along z axis  
69 - dz_sq = dz * dz;  
70 - for (by=bb.low[1]; by<=bb.high[1]; by++){  
71 - dy = by - yi; //compute the distance bw the voter and the current counter along y axis  
72 - dy_sq = dy * dy;  
73 - for (bx=bb.low[0]; bx<=bb.high[0]; bx++){  
74 - dx = bx - xi; //compute the distance bw the voter and the current counter along x axis  
75 - dx_sq = dx * dx;  
76 -  
77 - dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter  
78 - cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter  
79 - if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter  
80 - idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter  
81 - atomicAdd (&gpu_vote[idx_c] , g_sph[0]);  
82 - }  
83 - }  
84 - }  
85 - }  
86 } 84 }
  85 + }
  86 +}
87 87
88 - template<typename T>  
89 - void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){ 88 +template<typename T>
  89 +void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) {
90 90
91 -  
92 - unsigned int max_threads = stim::maxThreadsPerBlock();  
93 - dim3 threads(sqrt (max_threads),sqrt (max_threads));  
94 - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);  
95 - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting  
96 91
97 - } 92 + unsigned int max_threads = stim::maxThreadsPerBlock();
  93 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  94 + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  95 + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting
98 96
  97 +}
99 98
100 - template<typename T>  
101 - void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){  
102 -  
103 - //calculate the number of bytes in the array  
104 - unsigned int bytes = x * y * z * sizeof(T);  
105 -  
106 -  
107 - //allocate space on the GPU for the Vote Image  
108 - T* gpu_vote;  
109 - cudaMalloc(&gpu_vote, bytes);  
110 -  
111 - //allocate space on the GPU for the input Gradient image  
112 - T* gpu_grad;  
113 - cudaMalloc(&gpu_grad, bytes*3);  
114 -  
115 -  
116 - //copy the Gradient data to the GPU  
117 - cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);  
118 -  
119 -  
120 - //call the GPU version of the vote calculation function  
121 - gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);  
122 -  
123 - //copy the Vote Data back to the CPU  
124 - cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;  
125 -  
126 - //free allocated memory  
127 - cudaFree(gpu_vote);  
128 - cudaFree(gpu_grad);  
129 -  
130 - } 99 +
  100 +template<typename T>
  101 +void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) {
  102 +
  103 + //calculate the number of bytes in the array
  104 + unsigned int bytes = x * y * z * sizeof(T);
  105 +
  106 +
  107 + //allocate space on the GPU for the Vote Image
  108 + T* gpu_vote;
  109 + cudaMalloc(&gpu_vote, bytes);
  110 +
  111 + //allocate space on the GPU for the input Gradient image
  112 + T* gpu_grad;
  113 + cudaMalloc(&gpu_grad, bytes * 3);
  114 +
  115 +
  116 + //copy the Gradient data to the GPU
  117 + cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice);
  118 +
  119 +
  120 + //call the GPU version of the vote calculation function
  121 + gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
  122 +
  123 + //copy the Vote Data back to the CPU
  124 + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  125 +
  126 + //free allocated memory
  127 + cudaFree(gpu_vote);
  128 + cudaFree(gpu_grad);
  129 +
  130 +}
131 131
132 132
133 #endif 133 #endif
134 \ No newline at end of file 134 \ No newline at end of file