Commit 310a169858bcdd5efb908f5548e3fd086cd76488

Authored by Laila Saadatifard
1 parent 2558ee86

update the ivote3 project

Matlab_3D/local_max.m
... ... @@ -26,8 +26,8 @@ for li = 1: nl
26 26 ind_m = sub2ind(size(Ivote), cx, cy, cz);
27 27 lm_Ivote = A(ind_m);
28 28 [~, lm_indx] = max(lm_Ivote(:));
29   - if ind_f(li) == ind_m(lm_indx);
30   - Icenter(ind_f(li)) = 255;
  29 + if ind_f(li) == ind_m(lm_indx)
  30 + Icenter(ind_f(li)) = A(ind_f(li)); %255
31 31 end
32 32 end
33 33  
... ...
cpp/cudafunc.cu
... ... @@ -5,71 +5,65 @@
5 5 #include "vote3_atomic_aabb.cuh"
6 6 #include "update_dir3_aabb.cuh"
7 7 #include "local_max3.cuh"
8   -
  8 +#include <fstream>
9 9  
10 10 void ivote3(float* img, float sigma[], float phi, float d_phi, unsigned int r[],
11   - int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
  11 + int iter, float t, unsigned int conn[], size_t x, size_t y, size_t z){
12 12  
13 13  
14   - cudaSetDevice(0);
15   -
16   - unsigned int bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
  14 + size_t bytes = x * y * z * sizeof(float); // compute the number of bytes in the input data
17 15  
18 16 float* gpuI0; //assign memory on gpu for the input data
19 17 cudaMalloc(&gpuI0, bytes);
20 18 cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); //copy the image data to the GPU.
21 19  
22   -
23 20 gpu_gaussian_blur3<float>(gpuI0, sigma, x, y, z); //call the blurring function from the gpu.
24 21 cudaDeviceSynchronize();
25 22  
26 23 float* gpu_grad; //assign memory on the gpu for the gradient along the X, y, z.
27 24 cudaMalloc(&gpu_grad, bytes*3);
28 25  
29   - gpu_gradient3<float>(gpu_grad, gpuI0, 1, x, y, z); //call the gradient function from the gpu.
  26 + gpu_gradient3<float>(gpu_grad, gpuI0, x, y, z); //call the gradient function from the gpu.
30 27 cudaFree(gpuI0);
31 28  
32 29 float* gpu_vote;
33 30 cudaMalloc(&gpu_vote, bytes);
34 31  
35 32 float cos_phi = cos(phi);
36   -
  33 +
37 34 //call the vote function.
38 35 for (int i = 0; i < iter; i++){
39 36  
40 37 cudaMemset(gpu_vote, 0, bytes);
41 38 gpu_vote3<float>(gpu_vote, gpu_grad, phi, cos_phi, r, x, y, z);
42 39 cudaDeviceSynchronize();
43   -
44   - //if (phi >= d_phi){
  40 + if (i == 0) {
  41 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  42 + std::ofstream fvote("00-vote1_aabb.vol", std::ofstream::out | std::ofstream::binary);
  43 + fvote.write((char*)img, bytes);
  44 + fvote.close();
  45 + }
  46 + if (i == 1) {
  47 + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  48 + std::ofstream fvote("00-vote2_aabb.vol", std::ofstream::out | std::ofstream::binary);
  49 + fvote.write((char*)img, bytes);
  50 + fvote.close();
  51 + }
45 52 gpu_update_dir3<float>(gpu_grad, gpu_vote, phi, cos_phi, r, x, y, z);
46   - cudaDeviceSynchronize();
47   - phi = phi - d_phi;
48   - cos_phi = cos(phi);
49   - //}
50   -
  53 + cudaDeviceSynchronize();
  54 + phi = phi - d_phi;
  55 + cos_phi = cos(phi);
51 56 }
52 57  
53 58 cudaFree(gpu_grad);
54 59 cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost);
55 60  
56   - //allocate space on the gpu for the final detected cells.
57   - //float* gpu_output;
58   - //cudaMalloc(&gpu_output, bytes);
59   -
60   - ////call the local max function
61   - //gpu_local_max3<float>(gpu_output, gpu_vote, t, conn, x, y, z);
62   -
63   - ////copy the final result to the cpu.
64   - //cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost);
65   - //
66   - //
67 61 cudaFree(gpu_vote);
68 62 //cudaFree(gpu_output);
69 63  
70 64 }
71 65  
72   -void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
  66 +void lmax(float* out, float* in, float t, unsigned int conn[], size_t x, size_t y, size_t z){
73 67 unsigned int bytes = x * y * z * sizeof(float);
74 68  
75 69 cudaSetDevice(0);
... ... @@ -89,5 +83,4 @@ void lmax(float* out, float* in, float t, unsigned int conn[], unsigned int x, u
89 83  
90 84 cudaFree(gpuV);
91 85 cudaFree(gpuOut);
92   -}
93   -
  86 +}
94 87 \ No newline at end of file
... ...
cpp/gaussian_blur3.cuh
... ... @@ -8,6 +8,7 @@
8 8  
9 9 #define pi 3.14159
10 10  
  11 +
11 12  
12 13 template<typename T>
13 14 __global__ void blur_x(T* out, T* in, T sigma, int x, int y, int z){
... ... @@ -22,7 +23,7 @@
22 23 int i = zi * x * y + yi * x + xi;
23 24  
24 25 // calculate the kernel size
25   - int k_size = sigma * 4;
  26 + T k_size = sigma * 4;
26 27  
27 28 //if the current pixel is outside of the image
28 29 if(xi >= x || yi >= y || zi>=z)
... ... @@ -33,14 +34,21 @@
33 34 T G;
34 35 T sum = 0; //running weighted sum across the kernel
35 36 out[i] = 0;
36   - T sigma_sq = 2 * sigma * sigma;
37   - T a = 1.0 / (sigma * sqrt(2 * pi));
  37 + T sigma_sq = 2.0 * sigma * sigma;
  38 + T a = 1.0 / (sigma * sqrt(2.0 * pi));
  39 +
  40 + //handle the boundary in x direction
  41 + int kl = -(int)k_size;
  42 + //if (xi < k_size) kl = -xi;
  43 + int kh = (int)k_size;
  44 + //if (xi >= x - (int)k_size) kh = x - xi - 1;
  45 +
38 46  
39 47 //for each element of the kernel
40   - for(int ki = -k_size; ki <= k_size; ki++){
  48 + for(int ki = kl; ki <= kh; ki++){
41 49  
42 50 //calculate the gaussian value
43   - G = a * exp(-(ki*ki) / (sigma_sq));
  51 + G = a * exp(-(T)(ki*ki) / (sigma_sq));
44 52 //calculate the global coordinates for this point in the kernel
45 53 gx = (xi + ki) % x;
46 54 gi = zi * x * y + yi * x + gx;
... ... @@ -65,7 +73,7 @@
65 73 int i = zi * x * y + yi * x + xi;
66 74  
67 75 // calculate the kernel size
68   - int k_size = sigma * 4;
  76 + T k_size = sigma * 4;
69 77  
70 78 //if the current pixel is outside of the image
71 79 if(xi >= x || yi >= y || zi>=z)
... ... @@ -76,14 +84,22 @@
76 84 T G;
77 85 T sum = 0; //running weighted sum across the kernel
78 86 out[i] = 0;
79   - T sigma_sq = 2 * sigma * sigma;
80   - T a = 1.0 / (sigma * sqrt(2 * pi));
  87 + T sigma_sq = 2.0 * sigma * sigma;
  88 + T a = 1.0 / (sigma * sqrt(2.0 * pi));
  89 +
  90 + //handle the boundary in y direction
  91 + int kl = -(int)k_size;
  92 + //if (yi < k_size) kl = -yi;
  93 + int kh = (int)k_size;
  94 + //if (yi >= y - (int)k_size) kh = y - yi - 1;
  95 +
  96 +
81 97  
82 98 //for each element of the kernel
83   - for(int ki = -k_size; ki <= k_size; ki++){
  99 + for(int ki = kl; ki <= kh; ki++){
84 100  
85 101 //calculate the gaussian value
86   - G = a * exp(-(ki*ki) / sigma_sq);
  102 + G = a * exp(-(T)(ki*ki) / sigma_sq);
87 103 //calculate the global coordinates for this point in the kernel
88 104 gy = (yi + ki ) % y;
89 105 gi = zi * x * y + gy * x + xi;
... ... @@ -108,7 +124,7 @@
108 124 int i = zi * x * y + yi * x + xi;
109 125  
110 126 // calculate the kernel size
111   - int k_size = sigma * 4;
  127 + T k_size = sigma * 4;
112 128  
113 129 //if the current pixel is outside of the image
114 130 if(xi >= x || yi >= y || zi>=z)
... ... @@ -119,14 +135,20 @@
119 135 T G;
120 136 T sum = 0; //running weighted sum across the kernel
121 137 out[i] = 0;
122   - T sigma_sq = 2 * sigma * sigma;
123   - T a = 1.0 / (sigma * sqrt(2 * pi));
  138 + T sigma_sq = 2.0 * sigma * sigma;
  139 + T a = 1.0 / (sigma * sqrt(2.0 * pi));
  140 +
  141 + //handle the boundary in z direction
  142 + int kl = -(int)k_size;
  143 + //if (zi < k_size) kl = -zi;
  144 + int kh = (int)k_size;
  145 + //if (zi >= z - (int)k_size) kh = z - zi - 1;
124 146  
125 147 //for each element of the kernel
126   - for(int ki = -k_size; ki <= k_size; ki++){
  148 + for(int ki = kl; ki <= kh; ki++){
127 149  
128 150 //calculate the gaussian value
129   - G = a * exp(-(ki*ki) / sigma_sq);
  151 + G = a * exp(-(T)(ki*ki) / sigma_sq);
130 152 //calculate the global coordinates for this point in the kernel
131 153 gz = (zi + ki) % z;
132 154 gi = gz * x * y + yi * x + xi;
... ... @@ -138,13 +160,13 @@
138 160 }
139 161  
140 162 template<typename T>
141   - void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){
  163 + void gpu_gaussian_blur3(T* image, T sigma[], size_t x, size_t y, size_t z){
142 164  
143 165 //get the number of pixels in the image
144   - unsigned int pixels = x * y * z;
145   - unsigned int bytes = sizeof(T) * pixels;
  166 + size_t pixels = x * y * z;
  167 + size_t bytes = sizeof(T) * pixels;
146 168  
147   - int max_threads = stim::maxThreadsPerBlock();
  169 + unsigned int max_threads = stim::maxThreadsPerBlock();
148 170 dim3 threads(sqrt (max_threads),sqrt (max_threads));
149 171 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
150 172  
... ...
cpp/gradient3.cuh
... ... @@ -7,7 +7,7 @@
7 7 #include <stim/cuda/cudatools/error.h>
8 8  
9 9 template<typename T>
10   -__global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){
  10 +__global__ void gradient3(T* out, T* in, int x, int y, int z){
11 11  
12 12 //calculate x,y,z coordinates for this thread
13 13 int xi = blockIdx.x * blockDim.x + threadIdx.x;
... ... @@ -55,30 +55,28 @@ __global__ void gradient3(T* out, T* in, float anisotropy, int x, int y, int z){
55 55 if(zi > 0 && zi < z-1)
56 56 out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2;
57 57  
58   - out[i * 3 + 2] *= 1/anisotropy;
59   -
60 58 }
61 59  
62 60 template<typename T>
63 61  
64   -void gpu_gradient3(T* gpuGrad, T* gpuI, float anisotropy, unsigned int x, unsigned int y, unsigned int z){
  62 +void gpu_gradient3(T* gpuGrad, T* gpuI, size_t x, size_t y, size_t z){
65 63  
66 64  
67   - int max_threads = stim::maxThreadsPerBlock();
  65 + unsigned int max_threads = stim::maxThreadsPerBlock();
68 66 dim3 threads(sqrt (max_threads),sqrt (max_threads));
69 67 dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
70 68  
71 69 //call the GPU kernel to determine the gradient
72   - gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, anisotropy, x, y, z);
  70 + gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
73 71  
74 72 }
75 73  
76 74 template<typename T>
77   -void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int y, unsigned int z){
  75 +void cpu_gradient3(T* out, T* in, size_t x, size_t y, size_t z){
78 76  
79 77 //get the number of pixels in the image
80   - unsigned int pixels = x * y * z;
81   - unsigned int bytes = pixels * sizeof(T);
  78 + size_t pixels = x * y * z;
  79 + size_t bytes = pixels * sizeof(T);
82 80  
83 81 //allocate space on the GPU for the input image
84 82 T* gpuIn;
... ... @@ -92,7 +90,7 @@ void cpu_gradient3(T* out, T* in, float anisotropy, unsigned int x, unsigned int
92 90 cudaMalloc(&gpuOut, bytes * 3); //the output image will have two channels (x, y)
93 91  
94 92 //call the GPU version of this function
95   - gpu_gradient3(gpuOut, gpuIn, anisotropy, x, y, z);
  93 + gpu_gradient3(gpuOut, gpuIn, x, y, z);
96 94  
97 95 //copy the results to the CPU
98 96 cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost);
... ...
cpp/main.cpp
... ... @@ -19,21 +19,22 @@ float phi;
19 19 size_t x, y, z;
20 20  
21 21  
22   -void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[],
23   - unsigned int x, unsigned int y, unsigned int z);
24   -void lmax(float* center, float* vote, float t1, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z);
25   -
26   -void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){
27   - for(int ix = 0; ix < x; ix++){
28   - for (int iy = 0; iy < y; iy++){
29   - for (int iz = 0; iz < z; iz++){
30   - int idx = iz * x * y + iy * x + ix;
31   - cpuI[idx] = 255 - cpuI[idx];
32   - }
  22 +void ivote3(float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[],
  23 + size_t x, size_t y, size_t z);
  24 +void lmax(float* center, float* vote, float t1, unsigned int conn[], size_t x, size_t y, size_t z);
  25 +
  26 +void invert_data(float* cpuI, size_t x, size_t y, size_t z) {
  27 + size_t idx;
  28 + for (size_t ix = 0; ix < x; ix++) {
  29 + for (size_t iy = 0; iy < y; iy++) {
  30 + for (size_t iz = 0; iz < z; iz++) {
  31 + idx = iz * x * y + iy * x + ix;
  32 + cpuI[idx] = 255 - cpuI[idx];
33 33 }
34 34 }
35 35 }
36   -
  36 +}
  37 +
37 38  
38 39  
39 40 void advertise() {
... ... @@ -57,12 +58,12 @@ void init_args(int argc, char* argv[]) {
57 58 args.add("x", "size of the dataset along X axis", "positive value");
58 59 args.add("y", "size of the dataset along Y axis", "positive value");
59 60 args.add("z", "size of the dataset along Z axis", "positive value");
60   - args.add("t", "threshold value for the final result", "positive valu");
  61 + args.add("t", "threshold value for the final result", "0", "positive valu");
61 62 args.add("invert", "to invert the input data set", "string");
62 63 args.add("rmax", "maximum possible radius of the cells in the input image", "10", "[positive value]");
63 64 args.add("phi", "starting angle for the vote region (in degrees)", "25.0", "0 <= phi < 180");
64 65 args.add("iter", "number of iterations for voting", "8", "i > 0");
65   - args.add("sigma", "the gaussian blur standard deviation", "5", "s >=0 (s = 0, no blurring)");
  66 + args.add("sigma", "the gaussian blur standard deviation", "3", "s >=0 (s = 0, no blurring)");
66 67 args.add("conn", "the number of connected neighbors for calculating the local maxima", "5", "[positive value]");
67 68 //parse the command line arguments.
68 69 args.parse(argc, argv);
... ... @@ -95,94 +96,93 @@ void init_args(int argc, char* argv[]) {
95 96 t = args["t"].as_float();
96 97 sigma = args["sigma"].as_float();
97 98 phi = (float)args["phi"].as_float() * (float)stim::PI / 180;
98   -
  99 +
99 100 }
100   -int main(int argc, char** argv){
  101 +int main(int argc, char** argv) {
101 102  
102 103  
103 104 cudaDeviceProp prop;
104 105 int count;
105 106 cudaGetDeviceCount(&count);
106   - for (int i=0; i<count; i++){
  107 + for (int i = 0; i<count; i++) {
107 108 cudaGetDeviceProperties(&prop, i);
108 109 printf("current device ID: %d\n", i);
109 110 printf("device name: %s\n", prop.name);
110 111 printf("total global mem: %lu\n", prop.totalGlobalMem);
111 112 printf("shared memory per block: %lu\n", prop.sharedMemPerBlock);
112 113 }
113   -
  114 +
114 115 init_args(argc, argv);
115 116  
116   - unsigned int r[3] = { rmax , rmax, rmax};
117   -
118   - float sigma3[3] = { sigma, sigma, sigma};
119   - unsigned int conn[3] = { nlmax, nlmax, nlmax};
120   - float d_phi = phi/(iter+2);
121   -
122   - size_t bytes = x*y*z*sizeof(float);
  117 + unsigned int r[3] = { rmax , rmax, rmax };
  118 +
  119 + float sigma3[3] = { sigma, sigma, sigma };
  120 + unsigned int conn[3] = { nlmax, nlmax, nlmax };
  121 + float d_phi = phi / (iter);
  122 +
  123 + size_t bytes = x*y*z * sizeof(float);
123 124  
124 125 //allocate space on the cpu for the input data
125   - float* cpuI = (float*) malloc(bytes);
  126 + float* cpuI = (float*)malloc(bytes);
126 127  
127 128 //load the input file into the cpuI
128 129 std::ifstream nissl(args.arg(0), std::ios::in | std::ios::binary);
129 130 nissl.read((char*)cpuI, bytes);
130 131 nissl.close();
131   - if(args["invert"].is_set())
  132 + if (args["invert"].is_set())
132 133 invert_data(cpuI, x, y, z);
133   -
  134 +
134 135 //write a new file from the cpuI.
135 136 std::ofstream original("0-inv-128.vol", std::ofstream::out | std::ofstream::binary);
136 137 original.write((char*)cpuI, bytes);
137 138 original.close();
138   -
139   -
  139 +
  140 +
140 141 ivote3(cpuI, sigma3, phi, d_phi, r, iter, t, conn, x, y, z); // call the ivote function
141   -
142   - std::ofstream fvote("0-vote8.vol", std::ofstream::out | std::ofstream::binary);
  142 +
  143 + std::ofstream fvote("00-vote8_aabb.vol", std::ofstream::out | std::ofstream::binary);
143 144 fvote.write((char*)cpuI, bytes);
144 145 fvote.close();
145   -
  146 +
146 147 //allocate space on the cpu for the output result
147 148 float* cpu_out = (float*)malloc(bytes * 3);
148 149  
149 150 //write the output file.
150 151 //for (int t0=0; t0<=5000; t0+=100){
151 152 // float t1 = t0;
152   - int t0 = t;
153   - lmax(cpu_out, cpuI, t, conn, x, y, z);
154   - //std::ofstream fo("shared2D-v8/" + OutName.str(), std::ofstream::out | std::ofstream::binary);
155   - std::ofstream fo( args.arg(1), std::ofstream::out | std::ofstream::binary);
156   - fo.write((char*)cpu_out, bytes);
157   - fo.close();
  153 + int t0 = t;
  154 + lmax(cpu_out, cpuI, t, conn, x, y, z);
158 155  
  156 + std::ofstream fo(args.arg(1), std::ofstream::out | std::ofstream::binary);
  157 + fo.write((char*)cpu_out, bytes);
  158 + fo.close();
  159 +
159 160 // creat a file for saving the list centers
160   -
161   - std::ofstream list(args.arg(2));
162   - // set the number of detected cells to zero.
163   - int nod = 0;
164   - if (list.is_open()){
165   -
166   - for (int iz=0; iz<z; iz++){
167   - for (int iy=0; iy<y; iy++){
168   - for (int ix=0; ix<x; ix++){
169   -
170   - int idx = iz * x * y + iy * x + ix;
171   - if (cpu_out[idx]>0){
172   - nod++;
173   - list << ix << " " << iy << " "<< iz << " " << cpu_out[idx] << '\n' ;
174   -
175   - }
176   - }
  161 +
  162 + std::ofstream list(args.arg(2));
  163 + // set the number of detected cells to zero.
  164 + int nod = 0;
  165 + if (list.is_open()) {
  166 +
  167 + for (int iz = 0; iz<z; iz++) {
  168 + for (int iy = 0; iy<y; iy++) {
  169 + for (int ix = 0; ix<x; ix++) {
  170 +
  171 + int idx = iz * x * y + iy * x + ix;
  172 + if (cpu_out[idx]>0) {
  173 + nod++;
  174 + list << ix << " " << iy << " " << iz << " " << cpu_out[idx] << '\n';
  175 +
177 176 }
178 177 }
179   -
180   - list.close();
  178 + }
181 179 }
182 180  
  181 + list.close();
  182 + }
  183 +
183 184  
184 185 //}
185   - cudaDeviceReset();
186   -
187   -}
  186 + cudaDeviceReset();
188 187  
  188 +}
189 189 \ No newline at end of file
... ...
cpp/update_dir3_aabb.cuh
... ... @@ -46,7 +46,7 @@
46 46 float step = 360.0/n;
47 47 stim::circle<float> cir(center, d, norm);
48 48 stim::aabb3<int> bb(xi,yi,zi);
49   - bb.insert(xc,yc,zc);
  49 + bb.insert((int) xc, (int)yc, (int)zc);
50 50 for(float j = 0; j <360.0; j += step){
51 51 stim::vec3<float> out = cir.p(j);
52 52 bb.insert(out[0], out[1], out[2]);
... ... @@ -67,7 +67,7 @@
67 67 float id_x = g[0]; // define local variables for the x, y, and z coordinations point to the vote direction
68 68 float id_y = g[1];
69 69 float id_z = g[2];
70   -
  70 +
71 71 for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
72 72 dz = bz - zi; //compute the distance bw the voter and the current counter along z axis
73 73 dz_sq = dz * dz;
... ... @@ -85,9 +85,9 @@
85 85 l_vote = gpu_vote[idx_c];
86 86 if (l_vote>max) {
87 87 max = l_vote;
88   - id_x = dx;
89   - id_y = dy;
90   - id_z = dz;
  88 + id_x = (float)dx;
  89 + id_y = (float)dy;
  90 + id_z = (float)dz;
91 91 }
92 92 }
93 93 }
... ... @@ -103,25 +103,26 @@
103 103  
104 104 // this kernel updates the gradient direction by the calculated voting direction.
105 105 template<typename T>
106   - __global__ void update_grad3(T* gpu_grad, T* gpu_dir, int x, int y, int z){
  106 + __global__ void update_grad3(T* gpu_grad, T* gpu_dir, size_t x, size_t y, size_t z){
107 107  
108   - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
  108 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
  109 +
  110 + size_t grid_y = y / blockDim.y; //find the grid size along y
  111 + size_t blockidx_y = blockIdx.y % grid_y;
  112 + size_t yi = blockidx_y * blockDim.y + threadIdx.y;
  113 + size_t zi = blockIdx.y / grid_y;
109 114  
110   - int grid_y = y / blockDim.y; //find the grid size along y
111   - int blockidx_y = blockIdx.y % grid_y;
112   - int yi = blockidx_y * blockDim.y + threadIdx.y;
113   - int zi = blockIdx.y / grid_y;
114   - int i = zi * x * y + yi * x + xi;
115   -
116 115 if(xi >= x || yi >= y || zi >= z) return;
117 116  
  117 + size_t i = zi * x * y + yi * x + xi;
  118 +
118 119 gpu_grad[i * 3 + 0] = gpu_dir [i * 3 + 0]; //update the gradient image with the new direction direction
119 120 gpu_grad[i * 3 + 1] = gpu_dir [i * 3 + 1];
120 121 gpu_grad[i * 3 + 2] = gpu_dir [i * 3 + 2];
121 122 }
122 123  
123 124 template<typename T>
124   - void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  125 + void gpu_update_dir3(T* gpu_grad, T* gpu_vote, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z){
125 126  
126 127 unsigned int max_threads = stim::maxThreadsPerBlock();
127 128 dim3 threads(sqrt (max_threads),sqrt (max_threads));
... ...
cpp/vote3_atomic_aabb.cuh
... ... @@ -13,121 +13,121 @@
13 13 #include <stim/math/vector.h>
14 14 #include <stim/visualization/aabb3.h>
15 15  
16   - // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
17   - template<typename T>
18   - __global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
19   -
20   - int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
21   -
22   - int grid_y = y / blockDim.y; //find the grid size along y
23   - int blockidx_y = blockIdx.y % grid_y;
24   - int yi = blockidx_y * blockDim.y + threadIdx.y;
25   - int zi = blockIdx.y / grid_y;
26   -
27   - if(xi>=x || yi>=y || zi>=z) return;
28   -
29   - int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter
30   -
31   -
32   - float rx_sq = rx * rx; // compute the square for rmax
33   - float ry_sq = ry * ry;
34   - float rz_sq = rz * rz;
35   -
36   - stim::vec3<float> g(gpu_grad[3*i],gpu_grad[3*i+1],gpu_grad[3*i+2]); // form a vec3 variable for the gradient vector
37   - stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector
38   - float n =8; //set the number of points to find the boundaries of the conical voting area
39   - float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter
40   - float yc = ry * sin(g_sph[1]) * sin(g_sph[2]) ;
41   - float zc = rz * cos(g_sph[2]) ;
42   - float r = sqrt(xc*xc + yc*yc + zc*zc);
43   - xc+=xi;
44   - yc+=yi;
45   - zc+=zi;
46   - stim::vec3<float> center(xc,yc,zc);
47   -
48   - float d = 2 * r * tan(phi); //find the diameter of the conical voting area
49   - stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector
50   - float step = 360.0/n;
51   - stim::circle<float> cir(center, d, norm);
52   - stim::aabb3<int> bb(xi,yi,zi);
53   - bb.insert((int)xc, (int)yc, (int)zc);
54   - for(float j = 0; j <360.0; j += step){
55   - stim::vec3<float> out = cir.p(j);
56   - bb.insert(out[0], out[1], out[2]);
  16 +// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  17 +template<typename T>
  18 +__global__ void vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, int rx, int ry, int rz, int x, int y, int z) {
  19 +
  20 + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate x,y,z coordinates for this thread
  21 +
  22 + int grid_y = y / blockDim.y; //find the grid size along y
  23 + int blockidx_y = blockIdx.y % grid_y;
  24 + int yi = blockidx_y * blockDim.y + threadIdx.y;
  25 + int zi = blockIdx.y / grid_y;
  26 +
  27 + if (xi >= x || yi >= y || zi >= z) return;
  28 +
  29 + int i = zi * x * y + yi * x + xi; // calculate the 1D index of the voter
  30 +
  31 +
  32 + float rx_sq = rx * rx; // compute the square for rmax
  33 + float ry_sq = ry * ry;
  34 + float rz_sq = rz * rz;
  35 +
  36 + stim::vec3<float> g(gpu_grad[3 * i], gpu_grad[3 * i + 1], gpu_grad[3 * i + 2]); // form a vec3 variable for the gradient vector
  37 + stim::vec3<float> g_sph = g.cart2sph(); //convert cartesian coordinate to spherical for the gradient vector
  38 + float n = 8; //set the number of points to find the boundaries of the conical voting area
  39 + float xc = rx * cos(g_sph[1]) * sin(g_sph[2]); //calculate the center point of the surface of the voting area for the voter
  40 + float yc = ry * sin(g_sph[1]) * sin(g_sph[2]);
  41 + float zc = rz * cos(g_sph[2]);
  42 + float r = sqrt(xc*xc + yc*yc + zc*zc);
  43 + xc += xi;
  44 + yc += yi;
  45 + zc += zi;
  46 + stim::vec3<float> center(xc, yc, zc);
  47 +
  48 + float d = 2 * r * tan(phi); //find the diameter of the conical voting area
  49 + stim::vec3<float> norm = g.norm(); //compute the normalize gradient vector
  50 + float step = 360.0 / n;
  51 + stim::circle<float> cir(center, d, norm);
  52 + stim::aabb3<int> bb(xi, yi, zi);
  53 + bb.insert((int)xc, (int)yc, (int)zc);
  54 + for (float j = 0; j <360.0; j += step) {
  55 + stim::vec3<float> out = cir.p(j);
  56 + bb.insert(out[0], out[1], out[2]);
  57 + }
  58 + bb.trim_low(xi - rx, yi - ry, zi - rz);
  59 + bb.trim_low(0, 0, 0);
  60 + bb.trim_high(xi + rx, yi + ry, zi + rz);
  61 + bb.trim_high(x - 1, y - 1, z - 1);
  62 + int bx, by, bz;
  63 + int dx, dy, dz;
  64 + float dx_sq, dy_sq, dz_sq;
  65 + float dist, cos_diff;
  66 + int idx_c;
  67 + for (bz = bb.low[2]; bz <= bb.high[2]; bz++) {
  68 + dz = bz - zi; //compute the distance bw the voter and the current counter along z axis
  69 + dz_sq = dz * dz;
  70 + for (by = bb.low[1]; by <= bb.high[1]; by++) {
  71 + dy = by - yi; //compute the distance bw the voter and the current counter along y axis
  72 + dy_sq = dy * dy;
  73 + for (bx = bb.low[0]; bx <= bb.high[0]; bx++) {
  74 + dx = bx - xi; //compute the distance bw the voter and the current counter along x axis
  75 + dx_sq = dx * dx;
  76 +
  77 + dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter
  78 + cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz) / dist; // calculate the cosine of angle between the voter and the current counter
  79 + if (((dx_sq / rx_sq + dy_sq / ry_sq + dz_sq / rz_sq) <= 1) && (cos_diff >= cos_phi)) { //check if the current counter located in the voting area of the voter
  80 + idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter
  81 + atomicAdd(&gpu_vote[idx_c], g_sph[0]);
  82 + }
57 83 }
58   - bb.trim_low(xi-rx, yi-ry, zi-rz);
59   - bb.trim_low(0,0,0);
60   - bb.trim_high(xi+rx, yi+ry, zi+rz);
61   - bb.trim_high(x-1, y-1, z-1);
62   - int bx,by,bz;
63   - int dx, dy, dz;
64   - float dx_sq, dy_sq, dz_sq;
65   - float dist, cos_diff;
66   - int idx_c;
67   - for (bz=bb.low[2]; bz<=bb.high[2]; bz++){
68   - dz = bz - zi; //compute the distance bw the voter and the current counter along z axis
69   - dz_sq = dz * dz;
70   - for (by=bb.low[1]; by<=bb.high[1]; by++){
71   - dy = by - yi; //compute the distance bw the voter and the current counter along y axis
72   - dy_sq = dy * dy;
73   - for (bx=bb.low[0]; bx<=bb.high[0]; bx++){
74   - dx = bx - xi; //compute the distance bw the voter and the current counter along x axis
75   - dx_sq = dx * dx;
76   -
77   - dist = sqrt(dx_sq + dy_sq + dz_sq); //calculate the distance between the voter and the current counter
78   - cos_diff = (norm[0] * dx + norm[1] * dy + norm[2] * dz)/dist; // calculate the cosine of angle between the voter and the current counter
79   - if ( ( (dx_sq/rx_sq + dy_sq/ry_sq + dz_sq/rz_sq) <=1 ) && (cos_diff >=cos_phi) ){ //check if the current counter located in the voting area of the voter
80   - idx_c = (bz* y + by) * x + bx; //calculate the 1D index for the current counter
81   - atomicAdd (&gpu_vote[idx_c] , g_sph[0]);
82   - }
83   - }
84   - }
85   - }
86 84 }
  85 + }
  86 +}
87 87  
88   - template<typename T>
89   - void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
  88 +template<typename T>
  89 +void gpu_vote3(T* gpu_vote, T* gpu_grad, T phi, T cos_phi, unsigned int r[], size_t x, size_t y, size_t z) {
90 90  
91   -
92   - unsigned int max_threads = stim::maxThreadsPerBlock();
93   - dim3 threads(sqrt (max_threads),sqrt (max_threads));
94   - dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
95   - vote3 <T> <<< blocks, threads >>>(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x , y, z); //call the kernel to do the voting
96 91  
97   - }
  92 + unsigned int max_threads = stim::maxThreadsPerBlock();
  93 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  94 + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  95 + vote3 <T> << < blocks, threads >> >(gpu_vote, gpu_grad, phi, cos_phi, r[0], r[1], r[2], x, y, z); //call the kernel to do the voting
98 96  
  97 +}
99 98  
100   - template<typename T>
101   - void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
102   -
103   - //calculate the number of bytes in the array
104   - unsigned int bytes = x * y * z * sizeof(T);
105   -
106   -
107   - //allocate space on the GPU for the Vote Image
108   - T* gpu_vote;
109   - cudaMalloc(&gpu_vote, bytes);
110   -
111   - //allocate space on the GPU for the input Gradient image
112   - T* gpu_grad;
113   - cudaMalloc(&gpu_grad, bytes*3);
114   -
115   -
116   - //copy the Gradient data to the GPU
117   - cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
118   -
119   -
120   - //call the GPU version of the vote calculation function
121   - gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
122   -
123   - //copy the Vote Data back to the CPU
124   - cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
125   -
126   - //free allocated memory
127   - cudaFree(gpu_vote);
128   - cudaFree(gpu_grad);
129   -
130   - }
  99 +
  100 +template<typename T>
  101 +void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z) {
  102 +
  103 + //calculate the number of bytes in the array
  104 + unsigned int bytes = x * y * z * sizeof(T);
  105 +
  106 +
  107 + //allocate space on the GPU for the Vote Image
  108 + T* gpu_vote;
  109 + cudaMalloc(&gpu_vote, bytes);
  110 +
  111 + //allocate space on the GPU for the input Gradient image
  112 + T* gpu_grad;
  113 + cudaMalloc(&gpu_grad, bytes * 3);
  114 +
  115 +
  116 + //copy the Gradient data to the GPU
  117 + cudaMemcpy(gpu_grad, cpu_grad, bytes * 3, cudaMemcpyHostToDevice);
  118 +
  119 +
  120 + //call the GPU version of the vote calculation function
  121 + gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x, y, z);
  122 +
  123 + //copy the Vote Data back to the CPU
  124 + cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost);
  125 +
  126 + //free allocated memory
  127 + cudaFree(gpu_vote);
  128 + cudaFree(gpu_grad);
  129 +
  130 +}
131 131  
132 132  
133 133 #endif
134 134 \ No newline at end of file
... ...