Commit af825cb9e3a875e096d138d147b325ab67c2beb0
fixed merge conflicts
Showing
28 changed files
with
2063 additions
and
621 deletions
Show diff stats
1 | +function S = loadAgilent(filename) | |
2 | + | |
3 | +fid = fopen(filename); %open the file for reading | |
4 | +fseek(fid, 9, 'bof'); %skip past the first 9 bytes of the header | |
5 | + | |
6 | +bands = fread(fid, 1, 'int16'); %read the number of bands in the file | |
7 | +fseek(fid, 13, 'cof'); %skip the next 13 bytes in the header | |
8 | + | |
9 | +samples = fread(fid, 1, 'int16'); %read the number of samples (X) | |
10 | +lines = fread(fid, 1, 'int16'); %read the number of lines (Y) | |
11 | + | |
12 | +fseek(fid, 1020, 'bof'); %skip past the entire header | |
13 | +S = fread(fid, [samples lines*bands], 'float32'); %read all the data | |
14 | +S = reshape(S, [samples, lines, bands]); | |
15 | +fclose(fid); %close the file | |
16 | + | |
17 | + | ... | ... |
1 | +#ifndef STIM_CELLSET_H | |
2 | +#define STIM_CELLSET_H | |
3 | + | |
4 | +#include <stim/math/vec3.h> | |
5 | +#include <vector> | |
6 | +#include <unordered_map> | |
7 | +#include <fstream> | |
8 | + | |
9 | +namespace stim{ | |
10 | + | |
11 | +class cellset{ | |
12 | +private: | |
13 | + static const char delim = ' '; | |
14 | +protected: | |
15 | + std::vector<double*> cells; //vector storing field data for each cell | |
16 | + std::unordered_map<std::string, size_t> fields; //unordered map storing field->index information for each field | |
17 | + size_t ip[3]; //hard code to position indices (for speed) | |
18 | + | |
19 | + void init(){ | |
20 | + | |
21 | + } | |
22 | +public: | |
23 | + /// Constructor - create an empty cell set | |
24 | + cellset(){ | |
25 | + init(); | |
26 | + } | |
27 | + | |
28 | + /// Constructor - load a cellset from a file | |
29 | + cellset(std::string filename){ | |
30 | + init(); //initialize an empty cellset | |
31 | + load(filename); //load the cellset from an existing file | |
32 | + } | |
33 | + | |
34 | + /// Loads a cellset from a file | |
35 | + void load(std::string filename){ | |
36 | + std::ifstream infile(filename); | |
37 | + std::string header; //allocate space for the file header | |
38 | + std::getline(infile, header); //get the file header | |
39 | + | |
40 | + // break the header into fields | |
41 | + std::stringstream ss(header); //create a string stream | |
42 | + std::string field; //store a single field name | |
43 | + size_t i = 0; //current field index | |
44 | + while (std::getline(ss, field, delim)) { //split the header into individual fields | |
45 | + std::pair<std::string, size_t> p(field, i); //create a pair associating the header name with the index | |
46 | + fields.insert(p); //insert the pair into the fields map | |
47 | + i++; //increment the data index | |
48 | + } | |
49 | + size_t nfields = fields.size(); //store the number of fields for each cell | |
50 | + | |
51 | + //load each cell and all associated fields | |
52 | + std::string cell_line; //string holds all information for a cell | |
53 | + std::list<std::string> cell_list; //list will be temporary storage for the cell fields | |
54 | + while(std::getline(infile, cell_line)){ //for each cell entry | |
55 | + cell_list.push_back(cell_line); //push the cell entry into the list | |
56 | + } | |
57 | + | |
58 | + //convert the list into actual data | |
59 | + size_t ncells = cell_list.size(); //count the number of cells | |
60 | + cells.resize(ncells); //allocate enough space in the array to store all cells | |
61 | + for(size_t c = 0; c < ncells; c++){ //for each cell entry in the list | |
62 | + cells[c] = (double*) malloc(sizeof(double) * nfields); //allocate enough space for each field | |
63 | + std::stringstream fss(cell_list.front()); //turn the string representing the cell list into a stringstream | |
64 | + for(size_t f = 0; f < nfields; f++){ //for each field | |
65 | + fss>>cells[c][f]; //load the field | |
66 | + } | |
67 | + cell_list.pop_front(); //pop the read string off of the front of the list | |
68 | + } | |
69 | + infile.close(); //close the input file | |
70 | + | |
71 | + ip[0] = fields["x"]; //hard code the position indices for speed | |
72 | + ip[1] = fields["y"]; // this assumes all cells have positions | |
73 | + ip[2] = fields["z"]; | |
74 | + } | |
75 | + | |
76 | + /// Return the value a specified field for a cell | |
77 | + /// @param c is the cell index | |
78 | + /// @param f is the field | |
79 | + double value(size_t c, std::string f){ | |
80 | + size_t idx = fields[f]; | |
81 | + return cells[c][idx]; | |
82 | + } | |
83 | + | |
84 | + /// returns an ID used to look up a field | |
85 | + bool exists(std::string f){ | |
86 | + std::unordered_map<std::string, size_t>::iterator iter = fields.find(f); | |
87 | + if(iter == fields.end()) return false; | |
88 | + else return true; | |
89 | + } | |
90 | + | |
91 | + /// Return the position of cell [i] | |
92 | + stim::vec3<double> p(size_t i){ | |
93 | + stim::vec3<double> pos(cells[i][ip[0]], cells[i][ip[1]], cells[i][ip[2]]); | |
94 | + return pos; | |
95 | + } | |
96 | + | |
97 | + /// Return the number of cells in the set | |
98 | + size_t size(){ | |
99 | + return cells.size(); | |
100 | + } | |
101 | + | |
102 | + /// Return the maximum value of a field in this cell set | |
103 | + double max(std::string field){ | |
104 | + size_t idx = fields[field]; //get the field index | |
105 | + size_t ncells = cells.size(); //get the total number of cells | |
106 | + double maxval, val; //stores the current and maximum values | |
107 | + for(size_t c = 0; c < ncells; c++){ //for each cell | |
108 | + val = cells[c][idx]; //get the field value for this cell | |
109 | + if(c == 0) maxval = val; //if this is the first cell, just assign the maximum | |
110 | + else if(val > maxval) maxval = val; // otherwise text for the size of val and assign it as appropriate | |
111 | + } | |
112 | + return maxval; | |
113 | + } | |
114 | + | |
115 | + /// Return the maximum value of a field in this cell set | |
116 | + double min(std::string field){ | |
117 | + size_t idx = fields[field]; //get the field index | |
118 | + size_t ncells = cells.size(); //get the total number of cells | |
119 | + double minval, val; //stores the current and maximum values | |
120 | + for(size_t c = 0; c < ncells; c++){ //for each cell | |
121 | + val = cells[c][idx]; //get the field value for this cell | |
122 | + if(c == 0) minval = val; //if this is the first cell, just assign the maximum | |
123 | + else if(val < minval) minval = val; // otherwise text for the size of val and assign it as appropriate | |
124 | + } | |
125 | + return minval; | |
126 | + } | |
127 | + | |
128 | + | |
129 | +}; //end class cellset | |
130 | +}; //end namespace stim | |
131 | + | |
132 | +#endif | |
0 | 133 | \ No newline at end of file | ... | ... |
stim/cuda/ivote/local_max.cuh
... | ... | @@ -14,7 +14,7 @@ namespace stim{ |
14 | 14 | |
15 | 15 | // calculate the 2D coordinates for this current thread. |
16 | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
17 | - int yi = blockIdx.y; | |
17 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
18 | 18 | |
19 | 19 | if(xi >= x || yi >= y) |
20 | 20 | return; |
... | ... | @@ -63,8 +63,10 @@ namespace stim{ |
63 | 63 | void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ |
64 | 64 | |
65 | 65 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
66 | - dim3 threads(max_threads, 1); | |
67 | - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
66 | + /*dim3 threads(max_threads, 1); | |
67 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/ | |
68 | + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | |
69 | + dim3 blocks(x/threads.x + 1, y/threads.y + 1); | |
68 | 70 | |
69 | 71 | //call the kernel to find the local maximum. |
70 | 72 | cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); | ... | ... |
1 | +#ifndef STIM_CUDA_RE_SAMPLE_H | |
2 | +#define STIM_CUDA_RE_SAMPLE_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/templates/gaussian_blur.cuh> | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + template<typename T> | |
13 | + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | |
14 | + | |
15 | + unsigned int sigma_ds = 1/resize; | |
16 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
17 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
18 | + | |
19 | + | |
20 | + // calculate the 2D coordinates for this current thread. | |
21 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
22 | + int yi = blockIdx.y; | |
23 | + // convert 2D coordinates to 1D | |
24 | + int i = yi * x + xi; | |
25 | + | |
26 | + if(xi< x && yi< y){ | |
27 | + if(xi%sigma_ds==0){ | |
28 | + if(yi%sigma_ds==0){ | |
29 | + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds]; | |
30 | + } | |
31 | + } | |
32 | + else gpuI[i] = 0; | |
33 | + | |
34 | + //int x_org = xi * sigma_ds ; | |
35 | + //int y_org = yi * sigma_ds ; | |
36 | + //int i_org = y_org * x + x_org; | |
37 | + //gpuI[i] = gpuI0[i_org]; | |
38 | + } | |
39 | + | |
40 | + } | |
41 | + | |
42 | + | |
43 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | |
44 | + template<typename T> | |
45 | + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | |
46 | + | |
47 | + | |
48 | + //unsigned int sigma_ds = 1/resize; | |
49 | + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
50 | + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
51 | + | |
52 | + //get the number of pixels in the image | |
53 | + //unsigned int pixels_ds = x_ds * y_ds; | |
54 | + | |
55 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
56 | + dim3 threads(max_threads, 1); | |
57 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
58 | + | |
59 | + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y); | |
60 | + | |
61 | + //resample the image | |
62 | + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | |
63 | + | |
64 | + } | |
65 | + | |
66 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | |
67 | + template<typename T> | |
68 | + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){ | |
69 | + | |
70 | + //get the number of pixels in the image | |
71 | + unsigned int pixels = x*y; | |
72 | + unsigned int bytes = sizeof(T) * pixels; | |
73 | + | |
74 | + unsigned int sigma_ds = 1/resize; | |
75 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
76 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
77 | + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; | |
78 | + | |
79 | + | |
80 | + | |
81 | + //allocate space on the GPU for the original image | |
82 | + T* gpuI0; | |
83 | + cudaMalloc(&gpuI0, bytes_ds); | |
84 | + | |
85 | + | |
86 | + //copy the image data to the GPU | |
87 | + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice); | |
88 | + | |
89 | + //allocate space on the GPU for the down sampled image | |
90 | + T* gpuI; | |
91 | + cudaMalloc(&gpuI, bytes); | |
92 | + | |
93 | + //run the GPU-based version of the algorithm | |
94 | + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y); | |
95 | + | |
96 | + //copy the image data to the GPU | |
97 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); | |
98 | + | |
99 | + cudaFree(gpuI0); | |
100 | + cudeFree(gpuI); | |
101 | + } | |
102 | + | |
103 | + } | |
104 | +} | |
105 | + | |
106 | +#endif | |
0 | 107 | \ No newline at end of file | ... | ... |
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
1 | -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | |
2 | -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_BB_H | |
2 | +#define STIM_CUDA_UPDATE_DIR_BB_H | |
3 | 3 | |
4 | 4 | # include <iostream> |
5 | 5 | # include <cuda.h> |
... | ... | @@ -7,8 +7,7 @@ |
7 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | 8 | #include <stim/visualization/aabb2.h> |
9 | 9 | #include <stim/visualization/colormap.h> |
10 | -#include <math.h> | |
11 | -#include "cpyToshare.cuh" | |
10 | +#include <math.h> | |
12 | 11 | |
13 | 12 | //#define RMAX_TEST 8 |
14 | 13 | |
... | ... | @@ -76,68 +75,6 @@ namespace stim{ |
76 | 75 | gpuDir[i] = atan2((T)max_dy, (T)max_dx); |
77 | 76 | } |
78 | 77 | |
79 | - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. | |
80 | - template<typename T> | |
81 | - __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
82 | - | |
83 | - | |
84 | - // calculate the 2D coordinates for this current thread. | |
85 | - int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
86 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
87 | - | |
88 | - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | |
89 | - | |
90 | - int i = yi * x + xi; // convert 2D coordinates to 1D | |
91 | - | |
92 | - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | |
93 | - gpuDir[i] = 0; //initialize the vote direction to zero | |
94 | - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | |
95 | - int id_x = 0; // define two local variables for the x and y position of the maximum | |
96 | - int id_y = 0; | |
97 | - | |
98 | - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | |
99 | - int rmax_sq = rmax * rmax; | |
100 | - int tx_rmax = threadIdx.x + rmax; | |
101 | - float atan_angle; | |
102 | - float vote_c; | |
103 | - int xidx, yidx, yr_sq, xr_sq; | |
104 | - for(int yr = -rmax; yr <= rmax; yr++){ | |
105 | - yidx = yi + yr; //compute the index into the image | |
106 | - if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image | |
107 | - yr_sq = yr * yr; //compute the square of yr, to save time later | |
108 | - for(int xr = -rmax; xr <= rmax; xr++){ | |
109 | - xidx = xi + xr; | |
110 | - if(xidx >= 0 && xidx < x){ | |
111 | - xr_sq = xr * xr; | |
112 | - unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
113 | - | |
114 | - // calculate the angle between the voter and the current pixel in x and y directions | |
115 | - atan_angle = gpuTable[ind_t]; | |
116 | - //atan_angle = atan2((T)yr, (T)xr); | |
117 | - | |
118 | - // check if the current pixel is located in the voting area of this voter. | |
119 | - if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
120 | - | |
121 | - vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter | |
122 | - // compare the vote value of this pixel with the max value to find the maxima and its index. | |
123 | - if (vote_c>max) { | |
124 | - | |
125 | - max = vote_c; | |
126 | - id_x = xr; | |
127 | - id_y = yr; | |
128 | - } | |
129 | - } | |
130 | - } | |
131 | - } | |
132 | - } | |
133 | - } | |
134 | - | |
135 | - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
136 | - float new_angle = gpuTable[ind_m]; | |
137 | - | |
138 | - if(xi < x && yi < y) | |
139 | - gpuDir[i] = new_angle; | |
140 | - } //end kernel | |
141 | 78 | |
142 | 79 | |
143 | 80 | // this kernel updates the gradient direction by the calculated voting direction. |
... | ... | @@ -168,9 +105,7 @@ namespace stim{ |
168 | 105 | HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) ); |
169 | 106 | |
170 | 107 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
171 | - //dim3 threads(min(x, max_threads), 1); | |
172 | - //dim3 blocks(x/threads.x, y); | |
173 | - | |
108 | + | |
174 | 109 | dim3 threads( sqrt(max_threads), sqrt(max_threads) ); |
175 | 110 | dim3 blocks(x/threads.x + 1, y/threads.y + 1); |
176 | 111 | |
... | ... | @@ -188,12 +123,12 @@ namespace stim{ |
188 | 123 | |
189 | 124 | //call the kernel to calculate the new voting direction |
190 | 125 | cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); |
191 | - stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); | |
126 | + //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer); | |
192 | 127 | |
193 | 128 | //exit(0); |
194 | 129 | |
195 | - threads = dim3( sqrt(max_threads), sqrt(max_threads) ); | |
196 | - blocks = dim3(x/threads.x + 1, y/threads.y + 1); | |
130 | + //threads = dim3( sqrt(max_threads), sqrt(max_threads) ); | |
131 | + //blocks = dim3(x/threads.x + 1, y/threads.y + 1); | |
197 | 132 | |
198 | 133 | //call the kernel to update the gradient direction |
199 | 134 | cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | ... | ... |
stim/cuda/ivote/david_update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_threshold_global.cuh
1 | -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H | |
2 | -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H | |
1 | +#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBALD_H | |
2 | +#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H | |
3 | 3 | |
4 | 4 | # include <iostream> |
5 | 5 | # include <cuda.h> |
6 | 6 | #include <stim/cuda/cudatools.h> |
7 | 7 | #include <stim/cuda/sharedmem.cuh> |
8 | -#include <math.h> | |
9 | -#include "cpyToshare.cuh" | |
10 | - | |
11 | -#define RMAX_TEST 8 | |
8 | +#include "cpyToshare.cuh" | |
12 | 9 | |
13 | 10 | namespace stim{ |
14 | 11 | namespace cuda{ |
15 | 12 | |
16 | 13 | // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area. |
17 | 14 | template<typename T> |
18 | - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
19 | - extern __shared__ T atan2_table[]; | |
20 | - | |
21 | - //calculate the start point for this block | |
22 | - //int bxi = blockIdx.x * blockDim.x; | |
23 | - | |
24 | - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x); | |
15 | + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){ | |
25 | 16 | |
26 | - __syncthreads(); | |
27 | 17 | |
28 | - // calculate the 2D coordinates for this current thread. | |
29 | - //int xi = bxi + threadIdx.x; | |
18 | + | |
19 | + // calculate the coordinate for this current thread. | |
30 | 20 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
31 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
32 | - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel | |
33 | - | |
34 | - int i = yi * x + xi; // convert 2D coordinates to 1D | |
21 | + // calculate the voting direction based on the grtadient direction | |
22 | + float theta = gpuTh[3*xi]; | |
35 | 23 | |
36 | - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch | |
37 | - gpuDir[i] = 0; //initialize the vote direction to zero | |
38 | - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter | |
39 | - int id_x = 0; // define two local variables for the x and y position of the maximum | |
40 | - int id_y = 0; | |
24 | + //calculate the position and x, y coordinations of this voter in the original image | |
25 | + unsigned int i_v = gpuTh[3*xi+2]; | |
26 | + unsigned int y_v = i_v/x; | |
27 | + unsigned int x_v = i_v - (y_v*x); | |
41 | 28 | |
42 | - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter | |
29 | + //initialize the vote direction to zero | |
30 | + gpuDir[xi] = 0; | |
31 | + | |
32 | + // define a local variable to maximum value of the vote image in the voting area for this voter | |
33 | + float max = 0; | |
34 | + | |
35 | + // define two local variables for the x and y coordinations where the maximum happened | |
36 | + int id_x = 0; | |
37 | + int id_y = 0; | |
38 | + | |
39 | + // compute the size of window which will be checked for finding the voting area for this voter | |
40 | + int x_table = 2*rmax +1; | |
43 | 41 | int rmax_sq = rmax * rmax; |
44 | 42 | int tx_rmax = threadIdx.x + rmax; |
45 | - float atan_angle; | |
46 | - float vote_c; | |
47 | - unsigned int ind_t; | |
48 | - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction | |
49 | - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image) | |
50 | - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction | |
51 | - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter | |
52 | - | |
53 | - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table | |
54 | - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table | |
55 | - | |
56 | - //atan_angle = atan2((float)yr, (float)xr); | |
57 | - | |
58 | - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter. | |
59 | - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter | |
60 | - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index. | |
61 | - max = vote_c; | |
62 | - id_x = xr; | |
63 | - id_y = yr; | |
64 | - } | |
43 | + if(xi < th_size){ | |
44 | + | |
45 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
46 | + | |
47 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
48 | + | |
49 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
50 | + | |
51 | + // find the angle between the voter and the current pixel in x and y directions | |
52 | + float atan_angle = gpuTable[ind_t]; | |
53 | + | |
54 | + // check if the current pixel is located in the voting area of this voter. | |
55 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
56 | + // find the vote value for the current counter | |
57 | + float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)]; | |
58 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | |
59 | + if (vote_c>max) { | |
60 | + | |
61 | + max = vote_c; | |
62 | + id_x = xr; | |
63 | + id_y = yr; | |
65 | 64 | } |
66 | 65 | } |
67 | 66 | } |
68 | 67 | } |
69 | - } | |
68 | + | |
70 | 69 | |
71 | - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
72 | - float new_angle = gpuTable[ind_m]; | |
70 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
71 | + float new_angle = gpuTable[ind_m]; | |
72 | + gpuDir[xi] = new_angle; | |
73 | + } | |
73 | 74 | |
74 | - if(xi < x && yi < y) | |
75 | - gpuDir[i] = new_angle; | |
76 | - } //end kernel | |
75 | + } | |
77 | 76 | |
78 | 77 | // this kernel updates the gradient direction by the calculated voting direction. |
79 | 78 | template<typename T> |
80 | - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){ | |
79 | + __global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){ | |
81 | 80 | |
82 | - // calculate the 2D coordinates for this current thread. | |
81 | + // calculate the coordinate for this current thread. | |
83 | 82 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
84 | - int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
85 | - | |
86 | - // convert 2D coordinates to 1D | |
87 | - int i = yi * x + xi; | |
88 | 83 | |
84 | + | |
89 | 85 | //update the gradient image with the vote direction |
90 | - gpuGrad[2*i] = gpuDir[i]; | |
86 | + gpuTh[3*xi] = gpuDir[xi]; | |
91 | 87 | } |
92 | 88 | |
93 | 89 | template<typename T> |
94 | - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
95 | - | |
96 | - | |
90 | + void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){ | |
97 | 91 | |
98 | 92 | //calculate the number of bytes in the array |
99 | - unsigned int bytes = x * y * sizeof(T); | |
93 | + unsigned int bytes_th = th_size* sizeof(T); | |
100 | 94 | |
101 | 95 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
102 | - | |
103 | - dim3 threads(sqrt(max_threads), sqrt(max_threads)); | |
104 | - dim3 blocks(x/threads.x + 1, y/threads.y + 1); | |
105 | - | |
106 | - | |
96 | + dim3 threads(max_threads); | |
97 | + dim3 blocks(th_size/threads.x+1); | |
107 | 98 | |
108 | 99 | // allocate space on the GPU for the updated vote direction |
109 | 100 | T* gpuDir; |
110 | - cudaMalloc(&gpuDir, bytes); | |
111 | - | |
112 | - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2); | |
113 | - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl; | |
101 | + cudaMalloc(&gpuDir, bytes_th); | |
114 | 102 | |
115 | 103 | //call the kernel to calculate the new voting direction |
116 | - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
104 | + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y); | |
117 | 105 | |
118 | 106 | //call the kernel to update the gradient direction |
119 | - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | |
107 | + cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y); | |
120 | 108 | |
121 | 109 | //free allocated memory |
122 | 110 | cudaFree(gpuDir); | ... | ... |
1 | +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H | |
2 | +#define STIM_CUDA_VOTE_ATOMIC_BB_H | |
3 | + | |
4 | +# include <iostream> | |
5 | +# include <cuda.h> | |
6 | +#include <stim/cuda/cudatools.h> | |
7 | +#include <stim/cuda/sharedmem.cuh> | |
8 | +#include <stim/visualization/aabb2.h> | |
9 | +#include <stim/visualization/colormap.h> | |
10 | +#include <math.h> | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
16 | + template<typename T> | |
17 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
18 | + | |
19 | + extern __shared__ T S[]; | |
20 | + T* shared_atan = S; | |
21 | + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1); | |
22 | + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x); | |
23 | + | |
24 | + // calculate the 2D coordinates for this current thread. | |
25 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
26 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
27 | + | |
28 | + if(xi >= x || yi >= y) return; | |
29 | + // convert 2D coordinates to 1D | |
30 | + int i = yi * x + xi; | |
31 | + | |
32 | + // calculate the voting direction based on the grtadient direction | |
33 | + float theta = gpuGrad[2*i]; | |
34 | + //calculate the amount of vote for the voter | |
35 | + float mag = gpuGrad[2*i + 1]; | |
36 | + | |
37 | + | |
38 | + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point | |
39 | + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta))); | |
40 | + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box | |
41 | + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box | |
42 | + | |
43 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
44 | + int x_table = 2*rmax +1; | |
45 | + int rmax_sq = rmax * rmax; | |
46 | + | |
47 | + int lut_i; | |
48 | + T dx_sq, dy_sq; | |
49 | + | |
50 | + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image | |
51 | + bb.trim_high(x-1, y-1); | |
52 | + | |
53 | + int by, bx; | |
54 | + int dx, dy; | |
55 | + | |
56 | + unsigned int ind_g; //initialize the maximum vote value to zero | |
57 | + T alpha; | |
58 | + | |
59 | + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box | |
60 | + dy = by - yi; //calculate the y coordinate of the current point relative to yi | |
61 | + dy_sq = dy * dy; | |
62 | + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){ | |
63 | + dx = bx - xi; | |
64 | + dx_sq = dx * dx; | |
65 | + lut_i = (rmax - dy) * x_table + rmax - dx; | |
66 | + alpha = shared_atan[lut_i]; | |
67 | + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){ | |
68 | + ind_g = (by)*x + (bx); | |
69 | + atomicAdd(&gpuVote[ind_g], mag); | |
70 | + | |
71 | + } | |
72 | + } | |
73 | + } | |
74 | + | |
75 | + } | |
76 | + | |
77 | + | |
78 | + template<typename T> | |
79 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
80 | + | |
81 | + | |
82 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
83 | + dim3 threads( sqrt(max_threads), sqrt(max_threads) ); | |
84 | + dim3 blocks(x/threads.x + 1, y/threads.y + 1); | |
85 | + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1); | |
86 | + size_t shared_mem_req = table_bytes;// + template_bytes; | |
87 | + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl; | |
88 | + size_t shared_mem = stim::sharedMemPerBlock(); | |
89 | + if(shared_mem_req > shared_mem){ | |
90 | + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl; | |
91 | + exit(1); | |
92 | + } | |
93 | + | |
94 | + //call the kernel to do the voting | |
95 | + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
96 | + | |
97 | + } | |
98 | + | |
99 | + | |
100 | + template<typename T> | |
101 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
102 | + | |
103 | + //calculate the number of bytes in the array | |
104 | + unsigned int bytes = x * y * sizeof(T); | |
105 | + | |
106 | + //calculate the number of bytes in the atan2 table | |
107 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
108 | + | |
109 | + //allocate space on the GPU for the Vote Image | |
110 | + T* gpuVote; | |
111 | + cudaMalloc(&gpuVote, bytes); | |
112 | + | |
113 | + //allocate space on the GPU for the input Gradient image | |
114 | + T* gpuGrad; | |
115 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
116 | + | |
117 | + //copy the Gradient Magnitude data to the GPU | |
118 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
119 | + | |
120 | + //allocate space on the GPU for the atan2 table | |
121 | + T* gpuTable; | |
122 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
123 | + | |
124 | + //copy the atan2 values to the GPU | |
125 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
126 | + | |
127 | + //call the GPU version of the vote calculation function | |
128 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
129 | + | |
130 | + //copy the Vote Data back to the CPU | |
131 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
132 | + | |
133 | + //free allocated memory | |
134 | + cudaFree(gpuTable); | |
135 | + cudaFree(gpuVote); | |
136 | + cudaFree(gpuGrad); | |
137 | + } | |
138 | + | |
139 | + } | |
140 | +} | |
141 | + | |
142 | +#endif | |
0 | 143 | \ No newline at end of file | ... | ... |
stim/cuda/ivote/vote_atomic_shared.cuh
1 | +#ifndef STIM_CUDA_VOTE_SHARED_H | |
2 | +#define STIM_CUDA_VOTE_SHARED | |
3 | +# include <iostream> | |
4 | +# include <cuda.h> | |
5 | +#include <stim/cuda/cudatools.h> | |
6 | +#include <stim/cuda/sharedmem.cuh> | |
7 | +#include "cpyToshare.cuh" | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
13 | + template<typename T> | |
14 | + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){ | |
15 | + | |
16 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
17 | + extern __shared__ float s_grad[]; | |
18 | + | |
19 | + //calculate the start point for this block | |
20 | + int bxi = blockIdx.x * blockDim.x; | |
21 | + | |
22 | + // calculate the 2D coordinates for this current thread. | |
23 | + int xi = bxi + threadIdx.x; | |
24 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
25 | + // convert 2D coordinates to 1D | |
26 | + int i = yi * x + xi; | |
27 | + | |
28 | + // define a local variable to sum the votes from the voters | |
29 | + float sum = 0; | |
30 | + | |
31 | + //calculate the width of the shared memory block | |
32 | + int swidth = 2 * rmax + blockDim.x; | |
33 | + | |
34 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
35 | + int x_table = 2*rmax +1; | |
36 | + int rmax_sq = rmax * rmax; | |
37 | + int tx_rmax = threadIdx.x + rmax; | |
38 | + int bxs = bxi - rmax; | |
39 | + | |
40 | + //for every line (along y) | |
41 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
42 | + if (yi+yr<y && yi+yr>=0){ | |
43 | + //copy the portion of the image necessary for this block to shared memory | |
44 | + __syncthreads(); | |
45 | + cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y); | |
46 | + __syncthreads(); | |
47 | + | |
48 | + if(xi < x && yi < y){ | |
49 | + | |
50 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
51 | + | |
52 | + //find the location of this voter in the atan2 table | |
53 | + int id_t = (yr + rmax) * x_table + xr + rmax; | |
54 | + | |
55 | + // calculate the angle between the pixel and the current voter in x and y directions | |
56 | + float atan_angle = gpuTable[id_t]; | |
57 | + | |
58 | + // calculate the voting direction based on the grtadient direction | |
59 | + int idx_share = xr + tx_rmax ; | |
60 | + float theta = s_grad[idx_share*2]; | |
61 | + float mag = s_grad[idx_share*2 + 1]; | |
62 | + | |
63 | + | |
64 | + // check if the current voter is located in the voting area of this pixel. | |
65 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
66 | + sum += mag; | |
67 | + | |
68 | + } | |
69 | + } | |
70 | + | |
71 | + } | |
72 | + } | |
73 | + } | |
74 | + if(xi < x && yi < y) | |
75 | + gpuVote[i] = sum; | |
76 | + | |
77 | + } | |
78 | + | |
79 | + template<typename T> | |
80 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
81 | + | |
82 | + | |
83 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
84 | + dim3 threads(max_threads, 1); | |
85 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
86 | + | |
87 | + | |
88 | + // specify share memory | |
89 | + unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T); | |
90 | + | |
91 | + //call the kernel to do the voting | |
92 | + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
93 | + | |
94 | + } | |
95 | + | |
96 | + | |
97 | + template<typename T> | |
98 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
99 | + | |
100 | + //calculate the number of bytes in the array | |
101 | + unsigned int bytes = x * y * sizeof(T); | |
102 | + | |
103 | + //calculate the number of bytes in the atan2 table | |
104 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
105 | + | |
106 | + //allocate space on the GPU for the Vote Image | |
107 | + T* gpuVote; | |
108 | + cudaMalloc(&gpuVote, bytes); | |
109 | + | |
110 | + //allocate space on the GPU for the input Gradient image | |
111 | + T* gpuGrad; | |
112 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
113 | + | |
114 | + //copy the Gradient Magnitude data to the GPU | |
115 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
116 | + | |
117 | + //allocate space on the GPU for the atan2 table | |
118 | + T* gpuTable; | |
119 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
120 | + | |
121 | + //copy the atan2 values to the GPU | |
122 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
123 | + | |
124 | + //call the GPU version of the vote calculation function | |
125 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
126 | + | |
127 | + //copy the Vote Data back to the CPU | |
128 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
129 | + | |
130 | + //free allocated memory | |
131 | + cudaFree(gpuTable); | |
132 | + cudaFree(gpuVote); | |
133 | + cudaFree(gpuGrad); | |
134 | + } | |
135 | + | |
136 | + } | |
137 | +} | |
138 | + | |
139 | +#endif | |
0 | 140 | \ No newline at end of file | ... | ... |
1 | +#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H | |
2 | +#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H | |
3 | +# include <iostream> | |
4 | +# include <cuda.h> | |
5 | +#include <stim/cuda/cudatools.h> | |
6 | +#include <stim/cuda/sharedmem.cuh> | |
7 | +#include "cpyToshare.cuh" | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area | |
13 | + template<typename T> | |
14 | + __global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){ | |
15 | + | |
16 | + | |
17 | + // calculate the x coordinate for this current thread. | |
18 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
19 | + | |
20 | + // calculate the voting direction based on the grtadient direction | |
21 | + float theta = gpuTh[3*xi]; | |
22 | + //find the gradient magnitude for the current voter | |
23 | + float mag = gpuTh[3*xi + 1]; | |
24 | + //calculate the position and x, y coordinations of this voter in the original image | |
25 | + unsigned int i_v = gpuTh[3*xi+2]; | |
26 | + unsigned int y_v = i_v/x; | |
27 | + unsigned int x_v = i_v - (y_v*x); | |
28 | + | |
29 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
30 | + int x_table = 2*rmax +1; | |
31 | + int rmax_sq = rmax * rmax; | |
32 | + if(xi < th_size){ | |
33 | + for(int yr = -rmax; yr <= rmax; yr++){ | |
34 | + for(int xr = -rmax; xr <= rmax; xr++){ | |
35 | + if ((y_v+yr)>=0 && (y_v+yr)<y && (x_v+xr)>=0 && (x_v+xr)<x){ | |
36 | + | |
37 | + //find the location of the current pixel in the atan2 table | |
38 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
39 | + | |
40 | + // calculate the angle between the voter and the current pixel in x and y directions | |
41 | + float atan_angle = gpuTable[ind_t]; | |
42 | + | |
43 | + // check if the current pixel is located in the voting area of this voter. | |
44 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
45 | + // calculate the 1D index for the current pixel in global memory | |
46 | + unsigned int ind_g = (y_v+yr)*x + (x_v+xr); | |
47 | + atomicAdd(&gpuVote[ind_g], mag); | |
48 | + | |
49 | + } | |
50 | + } | |
51 | + } | |
52 | + } | |
53 | + } | |
54 | + } | |
55 | + | |
56 | + template<typename T> | |
57 | + void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){ | |
58 | + | |
59 | + | |
60 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
61 | + dim3 threads(max_threads); | |
62 | + dim3 blocks(th_size/threads.x + 1); | |
63 | + | |
64 | + //call the kernel to do the voting | |
65 | + cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y); | |
66 | + | |
67 | + } | |
68 | + | |
69 | + | |
70 | + template<typename T> | |
71 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
72 | + | |
73 | + //calculate the number of bytes in the array | |
74 | + unsigned int bytes = x * y * sizeof(T); | |
75 | + | |
76 | + //calculate the number of bytes in the atan2 table | |
77 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
78 | + | |
79 | + //allocate space on the GPU for the Vote Image | |
80 | + T* gpuVote; | |
81 | + cudaMalloc(&gpuVote, bytes); | |
82 | + | |
83 | + //allocate space on the GPU for the input Gradient image | |
84 | + T* gpuGrad; | |
85 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
86 | + | |
87 | + //copy the Gradient Magnitude data to the GPU | |
88 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
89 | + | |
90 | + //allocate space on the GPU for the atan2 table | |
91 | + T* gpuTable; | |
92 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
93 | + | |
94 | + //copy the atan2 values to the GPU | |
95 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
96 | + | |
97 | + //call the GPU version of the vote calculation function | |
98 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
99 | + | |
100 | + //copy the Vote Data back to the CPU | |
101 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
102 | + | |
103 | + //free allocated memory | |
104 | + cudaFree(gpuTable); | |
105 | + cudaFree(gpuVote); | |
106 | + cudaFree(gpuGrad); | |
107 | + } | |
108 | + | |
109 | + } | |
110 | +} | |
111 | + | |
112 | +#endif | |
0 | 113 | \ No newline at end of file | ... | ... |
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1 | -#ifndef STIM_CUDA_IVOTE_ATOMIC_H | |
2 | -#define STIM_CUDA_IVOTE_ATOMIC_H | |
1 | +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H | |
2 | +#define STIM_CUDA_IVOTE_ATOMIC_BB_H | |
3 | 3 | |
4 | 4 | #include <stim/cuda/ivote/down_sample.cuh> |
5 | 5 | #include <stim/cuda/ivote/local_max.cuh> |
6 | -#include <stim/cuda/ivote/update_dir_global.cuh> | |
7 | -//#include <stim/cuda/ivote/vote_shared_32-32.cuh> | |
8 | -#include <stim/cuda/ivote/vote_atomic_shared.cuh> | |
9 | -//#include <stim/cuda/ivote/re_sample.cuh> | |
6 | +#include <stim/cuda/ivote/update_dir_bb.cuh> | |
7 | +#include <stim/cuda/ivote/vote_atomic_bb.cuh> | |
8 | + | |
10 | 9 | namespace stim{ |
11 | 10 | namespace cuda{ |
12 | 11 | ... | ... |
stim/envi/agilent_binary.h
... | ... | @@ -35,26 +35,28 @@ public: |
35 | 35 | void alloc(){ |
36 | 36 | ptr = (T*) malloc(bytes()); |
37 | 37 | } |
38 | - void alloc(short x, short y, short z){ | |
38 | + void alloc(size_t x, size_t y, size_t z){ | |
39 | 39 | R[0] = x; |
40 | 40 | R[1] = y; |
41 | 41 | R[2] = z; |
42 | 42 | alloc(); |
43 | 43 | } |
44 | 44 | |
45 | + /// Create a deep copy of an agileng_binary object | |
45 | 46 | void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){ |
46 | 47 | dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory |
47 | 48 | memcpy(dst->ptr, src->ptr, bytes()); //copy the data |
48 | 49 | memcpy(dst->Z, src->Z, sizeof(double) * 2); //copy the data z range |
49 | 50 | } |
50 | 51 | |
52 | + /// Default constructor, sets the resolution to zero and the data pointer to NULL | |
51 | 53 | agilent_binary(){ |
52 | - memset(R, 0, sizeof(short) * 3); //set the resolution to zero | |
54 | + memset(R, 0, sizeof(size_t) * 3); //set the resolution to zero | |
53 | 55 | ptr = NULL; |
54 | 56 | } |
55 | 57 | |
56 | 58 | /// Constructor with resolution |
57 | - agilent_binary(short x, short y, short z){ | |
59 | + agilent_binary(size_t x, size_t y, size_t z){ | |
58 | 60 | alloc(x, y, z); |
59 | 61 | } |
60 | 62 | |
... | ... | @@ -109,13 +111,11 @@ public: |
109 | 111 | |
110 | 112 | char zero = 0; |
111 | 113 | for(size_t i = 0; i < 9; i++) outfile.write(&zero, 1); //write 9 zeros |
112 | - outfile.write((char*)&R[0], 2); | |
114 | + outfile.write((char*)&R[2], 2); | |
113 | 115 | for(size_t i = 0; i < 13; i++) outfile.write(&zero, 1); //write 13 zeros |
116 | + outfile.write((char*)&R[0], 2); | |
114 | 117 | outfile.write((char*)&R[1], 2); |
115 | - outfile.write((char*)&R[2], 2); | |
116 | 118 | for(size_t i = 0; i < 992; i++) outfile.write(&zero, 1); //write 992 zeros |
117 | - //char zerovec[1020]; | |
118 | - //outfile.write((char*)zerovec, 1020); | |
119 | 119 | |
120 | 120 | size_t b = bytes(); |
121 | 121 | outfile.write((char*)ptr, b); //write the data to the output file |
... | ... | @@ -149,7 +149,7 @@ public: |
149 | 149 | |
150 | 150 | #ifdef CUDA_FOUND |
151 | 151 | /// Perform an FFT and return a binary file with bands in the specified range |
152 | - agilent_binary<T> fft(float band_min, float band_max){ | |
152 | + agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){ | |
153 | 153 | auto total_start = std::chrono::high_resolution_clock::now(); |
154 | 154 | |
155 | 155 | auto start = std::chrono::high_resolution_clock::now(); |
... | ... | @@ -177,8 +177,8 @@ public: |
177 | 177 | |
178 | 178 | start = std::chrono::high_resolution_clock::now(); |
179 | 179 | int N[1]; //create an array with the interferogram size (required for cuFFT input) |
180 | - N[0] = R[2]; //set the only array value to the interferogram size | |
181 | - if(cufftPlanMany(&plan, 1, N, NULL, 1, R[2], NULL, 1, R[2], CUFFT_R2C, batch) != CUFFT_SUCCESS){ | |
180 | + N[0] = (int)R[2]; //set the only array value to the interferogram size | |
181 | + if(cufftPlanMany(&plan, 1, N, NULL, 1, (int)R[2], NULL, 1, (int)R[2], CUFFT_R2C, (int)batch) != CUFFT_SUCCESS){ | |
182 | 182 | std::cout<<"cuFFT Error: unable to create 1D plan."<<std::endl; |
183 | 183 | exit(1); |
184 | 184 | } |
... | ... | @@ -199,12 +199,13 @@ public: |
199 | 199 | std::complex<T>* cpu_fft = (std::complex<T>*) malloc( R[0] * R[1] * (R[2]/2+1) * sizeof(std::complex<T>) ); |
200 | 200 | HANDLE_ERROR(cudaMemcpy(cpu_fft, gpu_fft, R[0] * R[1] * (R[2]/2+1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost)); //copy data from the host to the device |
201 | 201 | |
202 | - double int_delta = 0.00012656; //interferogram sample spacing in centimeters | |
202 | + //double int_delta = 0.00012656; //interferogram sample spacing in centimeters | |
203 | + double int_delta = (1.0 / ELWN) * ((double)UDR / 2.0); //calculate the interferogram spacing | |
203 | 204 | double int_length = int_delta * R[2]; //interferogram length in centimeters |
204 | 205 | double fft_delta = 1/int_length; //spectrum spacing (in inverse centimeters, wavenumber |
205 | 206 | |
206 | - size_t start_i = std::ceil(band_min / fft_delta); //calculate the first band to store | |
207 | - size_t size_i = std::floor(band_max / fft_delta) - start_i; //calculate the number of bands to store | |
207 | + size_t start_i = (size_t)std::ceil(band_min / fft_delta); //calculate the first band to store | |
208 | + size_t size_i = (size_t)std::floor(band_max / fft_delta) - start_i; //calculate the number of bands to store | |
208 | 209 | size_t end_i = start_i + size_i; //last band number |
209 | 210 | agilent_binary<T> result(R[0], R[1], size_i); |
210 | 211 | result.Z[0] = start_i * fft_delta; //set the range for the FFT result | ... | ... |
stim/envi/bil.h
... | ... | @@ -1309,6 +1309,66 @@ public: |
1309 | 1309 | |
1310 | 1310 | } |
1311 | 1311 | |
1312 | + bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){ | |
1313 | + unsigned long long B = Z(); //calculate the number of bands | |
1314 | + unsigned long long ZX = Z() * X(); | |
1315 | + unsigned long long XY = X() * Y(); //calculate the number of pixels in a band | |
1316 | + unsigned long long S = XY * sizeof(T); //calculate the number of bytes in a band | |
1317 | + unsigned long long L = ZX * sizeof(T); | |
1318 | + | |
1319 | + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file | |
1320 | + | |
1321 | + T * c; //pointer to the current ZX slice | |
1322 | + c = (T*)malloc( L ); //allocate space for the slice | |
1323 | + | |
1324 | + for(unsigned long long j = 0; j < Y(); j++){ //for each line | |
1325 | + read_plane_y(c, j); //load the line into memory | |
1326 | + for(unsigned long long i = 0; i < B; i++){ //for each band | |
1327 | + for(unsigned long long m = 0; m < X(); m++){ //for each sample | |
1328 | + if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked | |
1329 | + c[m + i * X()] *= (T)v; | |
1330 | + } | |
1331 | + } | |
1332 | + target.write(reinterpret_cast<const char*>(c), L); //write normalized data into destination | |
1333 | + | |
1334 | + if(PROGRESS) progress = (double)(j+1) / Y() * 100; //update the progress | |
1335 | + } | |
1336 | + | |
1337 | + free(c); //free the slice memory | |
1338 | + target.close(); //close the output file | |
1339 | + return true; | |
1340 | + } | |
1341 | + | |
1342 | + bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){ | |
1343 | + unsigned long long B = Z(); //calculate the number of bands | |
1344 | + unsigned long long ZX = Z() * X(); | |
1345 | + unsigned long long XY = X() * Y(); //calculate the number of pixels in a band | |
1346 | + unsigned long long S = XY * sizeof(T); //calculate the number of bytes in a band | |
1347 | + unsigned long long L = ZX * sizeof(T); | |
1348 | + | |
1349 | + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file | |
1350 | + | |
1351 | + T * c; //pointer to the current ZX slice | |
1352 | + c = (T*)malloc( L ); //allocate space for the slice | |
1353 | + | |
1354 | + for(unsigned long long j = 0; j < Y(); j++){ //for each line | |
1355 | + read_plane_y(c, j); //load the line into memory | |
1356 | + for(unsigned long long i = 0; i < B; i++){ //for each band | |
1357 | + for(unsigned long long m = 0; m < X(); m++){ //for each sample | |
1358 | + if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked | |
1359 | + c[m + i * X()] += (T)v; | |
1360 | + } | |
1361 | + } | |
1362 | + target.write(reinterpret_cast<const char*>(c), L); //write normalized data into destination | |
1363 | + | |
1364 | + if(PROGRESS) progress = (double)(j+1) / Y() * 100; //update the progress | |
1365 | + } | |
1366 | + | |
1367 | + free(c); //free the slice memory | |
1368 | + target.close(); //close the output file | |
1369 | + return true; | |
1370 | + } | |
1371 | + | |
1312 | 1372 | /// Close the file. |
1313 | 1373 | bool close(){ |
1314 | 1374 | file.close(); | ... | ... |
stim/envi/binary.h
... | ... | @@ -7,6 +7,13 @@ |
7 | 7 | #include "../math/vector.h" |
8 | 8 | #include <fstream> |
9 | 9 | #include <sys/stat.h> |
10 | +#include <cstring> | |
11 | + | |
12 | +#ifdef _WIN32 | |
13 | +#include <Windows.h> | |
14 | +#else | |
15 | +#include <unistd.h> | |
16 | +#endif | |
10 | 17 | |
11 | 18 | namespace stim{ |
12 | 19 | |
... | ... | @@ -30,14 +37,16 @@ protected: |
30 | 37 | |
31 | 38 | double progress; //stores the progress on the current operation (accessible using a thread) |
32 | 39 | |
40 | + size_t buffer_size; //available memory for processing large files | |
33 | 41 | |
34 | 42 | /// Private initialization function used to set default parameters in the data structure. |
35 | 43 | void init(){ |
36 | - memset(R, 0, sizeof(unsigned long long) * D); //initialize the resolution to zero | |
37 | - header = 0; //initialize the header size to zero | |
44 | + std::memset(R, 0, sizeof(unsigned long long) * D); //initialize the resolution to zero | |
45 | + header = 0; //initialize the header size to zero | |
38 | 46 | mask = NULL; |
39 | 47 | |
40 | 48 | progress = 0; |
49 | + set_buffer(); //set the maximum buffer size to the default | |
41 | 50 | } |
42 | 51 | |
43 | 52 | /// Private helper function that returns the size of the file on disk using system functions. |
... | ... | @@ -105,6 +114,11 @@ protected: |
105 | 114 | |
106 | 115 | public: |
107 | 116 | |
117 | + //default constructor | |
118 | + binary(){ | |
119 | + init(); | |
120 | + } | |
121 | + | |
108 | 122 | double get_progress(){ |
109 | 123 | return progress; |
110 | 124 | } |
... | ... | @@ -113,6 +127,20 @@ public: |
113 | 127 | progress = 0; |
114 | 128 | } |
115 | 129 | |
130 | + //specify the maximum fraction of available memory that this class will use for buffering | |
131 | + void set_buffer(double mem_frac = 0.5){ //default to 50% | |
132 | +#ifdef _WIN32 | |
133 | + MEMORYSTATUSEX statex; | |
134 | + statex.dwLength = sizeof (statex); | |
135 | + GlobalMemoryStatusEx (&statex); | |
136 | + buffer_size = (size_t)(statex.ullAvailPhys * mem_frac); | |
137 | +#else | |
138 | + size_t pages = sysconf(_SC_PHYS_PAGES); | |
139 | + size_t page_size = sysconf(_SC_PAGE_SIZE); | |
140 | + buffer_size = (size_t)(pages * page_size * mem_frac); | |
141 | +#endif | |
142 | + } | |
143 | + | |
116 | 144 | /// Open a binary file for streaming. |
117 | 145 | |
118 | 146 | /// @param filename is the name of the binary file |
... | ... | @@ -375,6 +403,96 @@ public: |
375 | 403 | return read_pixel(p, i); |
376 | 404 | } |
377 | 405 | |
406 | + /// Reads a block specified by an (x, y, z) position and size using the largest possible contiguous reads | |
407 | + bool read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){ | |
408 | + | |
409 | + size_t size_bytes = sx * sy * sz * sizeof(T); //size of the block to read in bytes | |
410 | + | |
411 | + size_t start = z * R[0] * R[1] + y * R[0] + x; //calculate the start postion | |
412 | + size_t start_bytes = start * sizeof(T); //start position in bytes | |
413 | + file.seekg(start * sizeof(T), std::ios::beg); //seek to the start position | |
414 | + | |
415 | + | |
416 | + if(sx == R[0] && sy == R[1]){ //if sx and sy result in a contiguous volume along z | |
417 | + file.read((char*)dest, size_bytes); //read the block in one pass | |
418 | + return true; | |
419 | + } | |
420 | + | |
421 | + if(sx == R[0]){ //if sx is contiguous, read each z-axis slice can be read in one pass | |
422 | + size_t jump_bytes = (R[1] - sy) * R[0] * sizeof(T); //jump between each slice | |
423 | + size_t slice_bytes = sx * sy * sizeof(T); //size of the slice to be read | |
424 | + for(size_t zi = 0; zi < sz; zi++){ //for each z-axis slice | |
425 | + file.read((char*)dest, slice_bytes); //read the slice | |
426 | + dest += sx * sy; //move the destination pointer to the next slice | |
427 | + file.seekg(jump_bytes, std::ios::cur); //skip to the next slice in the file | |
428 | + } | |
429 | + return true; | |
430 | + } | |
431 | + | |
432 | + //in this case, x is not contiguous so the volume must be read line-by-line | |
433 | + size_t jump_x_bytes = (R[0] - sx) * sizeof(T); //number of bytes skipped in the x direction | |
434 | + size_t jump_y_bytes = (R[1] - sy) * R[0] * sizeof(T) + jump_x_bytes; //number of bytes skipped between slices | |
435 | + size_t line_bytes = sx * sizeof(T); //size of the line to be read | |
436 | + size_t zi, yi; | |
437 | + for(zi = 0; zi < sz; zi++){ //for each slice | |
438 | + file.read((char*)dest, line_bytes); //read the first line | |
439 | + for(yi = 1; yi < sy; yi++){ //read each additional line | |
440 | + dest += sx; //move the pointer in the destination block to the next line | |
441 | + file.seekg(jump_x_bytes, std::ios::cur); //skip to the next line in the file | |
442 | + file.read((char*)dest, line_bytes); //read the line to the destination block | |
443 | + } | |
444 | + file.seekg(jump_y_bytes, std::ios::cur); //skip to the beginning of the next slice | |
445 | + } | |
446 | + return false; | |
447 | + } | |
448 | + | |
449 | + // permutes a block of data from the current interleave to the interleave specified (re-arranged dimensions to the order specified by [d0, d1, d2]) | |
450 | + | |
451 | + void permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){ | |
452 | + size_t d[3] = {d0, d1, d2}; | |
453 | + size_t s[3] = {sx, sy, sz}; | |
454 | + size_t p[3];// = {x, y, z}; | |
455 | + | |
456 | + if(d[0] == 0 && d[1] == 1 && d[2] == 2){ | |
457 | + //this isn't actually a permute - just copy the data | |
458 | + memcpy(dest, src, sizeof(T) * sx * sy * sz); | |
459 | + } | |
460 | + else if(d[0] == 0){ //the individual lines are contiguous, so you can memcpy line-by-line | |
461 | + size_t y, z; | |
462 | + size_t src_idx, dest_idx; | |
463 | + size_t x_bytes = sizeof(T) * sx; | |
464 | + for(z = 0; z < sz; z++){ | |
465 | + p[2] = z; | |
466 | + for(y = 0; y < sy; y++){ | |
467 | + p[1] = y; | |
468 | + src_idx = z * sx * sy + y * sx; | |
469 | + dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]]; | |
470 | + //std::cout<<z<<", "<<y<<" ------- "<<p[d[2]]<<" * "<<s[d[0]]<<" * "<<s[d[1]]<<" + "<<p[d[1]]<<" * "<<s[d[0]]<<std::endl; | |
471 | + memcpy(dest + dest_idx, src + src_idx, x_bytes); | |
472 | + } | |
473 | + } | |
474 | + } | |
475 | + else{ //loop through every damn point | |
476 | + size_t x, y, z; | |
477 | + size_t src_idx, dest_idx; | |
478 | + size_t src_z, src_y; | |
479 | + for(z = 0; z < sz; z++){ | |
480 | + p[2] = z; | |
481 | + src_z = z * sx * sy; | |
482 | + for(y = 0; y < sy; y++){ | |
483 | + p[1] = y; | |
484 | + src_y = src_z + y * sx; | |
485 | + for(x = 0; x < sx; x++){ | |
486 | + p[0] = x; | |
487 | + src_idx = src_y + x; | |
488 | + dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]] + p[d[0]]; | |
489 | + dest[dest_idx] = src[src_idx]; | |
490 | + } | |
491 | + } | |
492 | + } | |
493 | + } | |
494 | + } | |
495 | + | |
378 | 496 | }; |
379 | 497 | |
380 | 498 | } | ... | ... |
stim/envi/bip.h
... | ... | @@ -373,7 +373,7 @@ public: |
373 | 373 | for(size_t xy = 0; xy < XY; xy++){ //for each pixel |
374 | 374 | memset(spec, 0, Bb); //set the spectrum to zero |
375 | 375 | if(mask == NULL || mask[xy]){ //if the pixel is masked |
376 | - len = 0; //initialize the | |
376 | + len = 0; //initialize the | |
377 | 377 | file.read((char*)spec, Bb); //read a spectrum |
378 | 378 | for(size_t b = 0; b < B; b++) //for each band |
379 | 379 | len += spec[b]*spec[b]; //add the square of the spectral band |
... | ... | @@ -385,7 +385,7 @@ public: |
385 | 385 | file.seekg(Bb, std::ios::cur); //otherwise skip a spectrum |
386 | 386 | target.write((char*)spec, Bb); //output the normalized spectrum |
387 | 387 | if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100; //update the progress |
388 | - } | |
388 | + } | |
389 | 389 | } |
390 | 390 | |
391 | 391 | |
... | ... | @@ -1088,6 +1088,232 @@ public: |
1088 | 1088 | return true; |
1089 | 1089 | } |
1090 | 1090 | |
1091 | + | |
1092 | +#ifdef CUDA_FOUND | |
1093 | + /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS | |
1094 | + /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra | |
1095 | + bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1096 | + | |
1097 | + cudaError_t cudaStat; | |
1098 | + cublasStatus_t stat; | |
1099 | + cublasHandle_t handle; | |
1100 | + | |
1101 | + progress = 0; //initialize the progress to zero (0) | |
1102 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | |
1103 | + unsigned long long B = Z(); //calculate the number of spectral elements | |
1104 | + | |
1105 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | |
1106 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | |
1107 | + | |
1108 | + double* s2_dev; // device pointer on the GPU | |
1109 | + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device | |
1110 | + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0) | |
1111 | + | |
1112 | + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU | |
1113 | + double* avg_dev; //declare a device pointer that will store the average spectrum | |
1114 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | |
1115 | + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix | |
1116 | + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0) | |
1117 | + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum | |
1118 | + stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device | |
1119 | + | |
1120 | + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product) | |
1121 | + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction) | |
1122 | + | |
1123 | + stat = cublasCreate(&handle); //create a cuBLAS instance | |
1124 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1125 | + printf ("CUBLAS initialization failed\n"); | |
1126 | + return EXIT_FAILURE; | |
1127 | + } | |
1128 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | |
1129 | + if (mask == NULL || mask[xy] != 0){ | |
1130 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1131 | + | |
1132 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1133 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum | |
1134 | + | |
1135 | + cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev ) | |
1136 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | |
1137 | + | |
1138 | + | |
1139 | + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product) | |
1140 | + } | |
1141 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | |
1142 | + | |
1143 | + } | |
1144 | + | |
1145 | + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B); //copy the result from the GPU to the CPU | |
1146 | + | |
1147 | + cudaFree(A_dev); //clean up allocated device memory | |
1148 | + cudaFree(s_dev); | |
1149 | + cudaFree(s2_dev); | |
1150 | + cudaFree(avg_dev); | |
1151 | + | |
1152 | + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion | |
1153 | + for(unsigned long long j = i+1; j < B; j++){ | |
1154 | + coN[B * i + j] = coN[B * j + i]; | |
1155 | + } | |
1156 | + } | |
1157 | + | |
1158 | + return true; | |
1159 | + } | |
1160 | +#endif | |
1161 | + | |
1162 | + /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision. | |
1163 | + | |
1164 | + /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix | |
1165 | + /// @param avg is a pointer to memory of size B that stores the average spectrum | |
1166 | + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location | |
1167 | + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){ | |
1168 | + | |
1169 | +#ifdef CUDA_FOUND | |
1170 | + int dev_count; | |
1171 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | |
1172 | + cudaDeviceProp prop; | |
1173 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | |
1174 | + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1175 | + return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1176 | +#endif | |
1177 | + | |
1178 | + | |
1179 | + | |
1180 | + progress = 0; | |
1181 | + //memory allocation | |
1182 | + unsigned long long XY = X() * Y(); | |
1183 | + unsigned long long B = Z(); | |
1184 | + T* temp = (T*)malloc(sizeof(T) * B); | |
1185 | + | |
1186 | + unsigned long long count = nnz(mask); //count the number of masked pixels | |
1187 | + | |
1188 | + //initialize covariance matrix of noise | |
1189 | + memset(coN, 0, B * B * sizeof(double)); | |
1190 | + | |
1191 | + //calculate covariance matrix | |
1192 | + double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix | |
1193 | + double* temp_precise = (double*) malloc(B * sizeof(double)); | |
1194 | + memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros | |
1195 | + unsigned long long idx; //stores i*B to speed indexing | |
1196 | + for (unsigned long long xy = 0; xy < XY; xy++){ | |
1197 | + if (mask == NULL || mask[xy] != 0){ | |
1198 | + pixel(temp, xy); //retreive the spectrum at the current xy pixel location | |
1199 | + for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision | |
1200 | + temp_precise[b] = (double)temp[b] - (double)avg[b]; | |
1201 | + | |
1202 | + for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them ) | |
1203 | + temp_precise[b2] -= temp_precise[b2+1]; | |
1204 | + | |
1205 | + idx = 0; | |
1206 | + for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band | |
1207 | + for (unsigned long long b1 = b0; b1 < B; b1++) | |
1208 | + coN_half[idx++] += temp_precise[b0] * temp_precise[b1]; | |
1209 | + } | |
1210 | + } | |
1211 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; | |
1212 | + } | |
1213 | + idx = 0; | |
1214 | + for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix | |
1215 | + for (unsigned long long j = i; j < B; j++){ | |
1216 | + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count; | |
1217 | + } | |
1218 | + } | |
1219 | + | |
1220 | + free(temp); | |
1221 | + free(temp_precise); | |
1222 | + return true; | |
1223 | + } | |
1224 | + | |
1225 | + #ifdef CUDA_FOUND | |
1226 | + /// Project the spectra onto a set of basis functions | |
1227 | + /// @param outfile is the name of the new binary output file that will be created | |
1228 | + /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) | |
1229 | + /// @param basis a set of basis vectors that the data set will be projected onto (after centering) | |
1230 | + /// @param M is the number of basis vectors | |
1231 | + /// @param mask is a character mask used to limit processing to valid pixels | |
1232 | + bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ | |
1233 | + | |
1234 | + cudaError_t cudaStat; | |
1235 | + cublasStatus_t stat; | |
1236 | + cublasHandle_t handle; | |
1237 | + | |
1238 | + std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file | |
1239 | + | |
1240 | + progress = 0; //initialize the progress to zero (0) | |
1241 | + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image | |
1242 | + unsigned long long B = Z(); //calculate the number of spectral elements | |
1243 | + | |
1244 | + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file | |
1245 | + double* s_dev; //declare a device pointer that will store the spectrum on the GPU | |
1246 | + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum | |
1247 | + | |
1248 | + | |
1249 | + double* basis_dev; // device pointer on the GPU | |
1250 | + cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device | |
1251 | + cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0) | |
1252 | + | |
1253 | + | |
1254 | + /// transposing basis matrix (because cuBLAS is column-major) | |
1255 | + double *basis_Transposed = (double*)malloc(M * B * sizeof(double)); | |
1256 | + memset(basis_Transposed, 0, M * B * sizeof(double)); | |
1257 | + for (int i = 0; i<M; i++) | |
1258 | + for (int j = 0; j<B; j++) | |
1259 | + basis_Transposed[i+j*M] = basis[i*B+j]; | |
1260 | + | |
1261 | + stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format) | |
1262 | + | |
1263 | + double* center_dev; //declare a device pointer that will store the center (average) | |
1264 | + cudaStat = cudaMalloc(¢er_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average) | |
1265 | + stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device) | |
1266 | + | |
1267 | + | |
1268 | + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host | |
1269 | + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU | |
1270 | + cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel | |
1271 | + cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0) | |
1272 | + | |
1273 | + double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction) | |
1274 | + double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication | |
1275 | + double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor) | |
1276 | + | |
1277 | + stat = cublasCreate(&handle); //create a cuBLAS instance | |
1278 | + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid | |
1279 | + printf ("CUBLAS initialization failed\n"); | |
1280 | + return EXIT_FAILURE; | |
1281 | + } | |
1282 | + | |
1283 | + T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc | |
1284 | + size_t i; | |
1285 | + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel | |
1286 | + if (mask == NULL || mask[xy] != 0){ | |
1287 | + pixeld(s, xy); //retreive the spectrum at the current xy pixel location | |
1288 | + | |
1289 | + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device | |
1290 | + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average) | |
1291 | + stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication | |
1292 | + stat = cublasGetVector((int)B, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU) | |
1293 | + | |
1294 | + //std::copy<double*, T*>(A, A + M, temp); | |
1295 | + for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is | |
1296 | + } | |
1297 | + | |
1298 | + target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M); //write the projected vector | |
1299 | + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress | |
1300 | + | |
1301 | + } | |
1302 | + | |
1303 | + //clean up allocated device memory | |
1304 | + cudaFree(A_dev); | |
1305 | + cudaFree(s_dev); | |
1306 | + cudaFree(basis_dev); | |
1307 | + cudaFree(center_dev); | |
1308 | + free(A); | |
1309 | + free(s); | |
1310 | + free(temp); | |
1311 | + target.close(); //close the output file | |
1312 | + | |
1313 | + return true; | |
1314 | + } | |
1315 | +#endif | |
1316 | + | |
1091 | 1317 | /// Project the spectra onto a set of basis functions |
1092 | 1318 | /// @param outfile is the name of the new binary output file that will be created |
1093 | 1319 | /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering) |
... | ... | @@ -1096,6 +1322,14 @@ public: |
1096 | 1322 | /// @param mask is a character mask used to limit processing to valid pixels |
1097 | 1323 | bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){ |
1098 | 1324 | |
1325 | +#ifdef CUDA_FOUND | |
1326 | + int dev_count; | |
1327 | + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices | |
1328 | + cudaDeviceProp prop; | |
1329 | + cudaGetDeviceProperties(&prop, 0); //get the property of the first device | |
1330 | + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator | |
1331 | + return project_cublas(outfile,center,basis,M,mask,PROGRESS); //use cuBLAS to calculate the covariance matrix | |
1332 | +#endif | |
1099 | 1333 | std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file |
1100 | 1334 | //std::string headername = outfile + ".hdr"; //the header file name |
1101 | 1335 | |
... | ... | @@ -1125,7 +1359,7 @@ public: |
1125 | 1359 | free(s); //free temporary storage arrays |
1126 | 1360 | free(rs); |
1127 | 1361 | target.close(); //close the output file |
1128 | - | |
1362 | + | |
1129 | 1363 | return true; |
1130 | 1364 | } |
1131 | 1365 | |
... | ... | @@ -1395,6 +1629,52 @@ public: |
1395 | 1629 | } |
1396 | 1630 | } |
1397 | 1631 | |
1632 | + bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){ | |
1633 | + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file | |
1634 | + std::string headername = outname + ".hdr"; //the header file name | |
1635 | + | |
1636 | + unsigned long long N = X() * Y(); //calculate the total number of pixels to be processed | |
1637 | + unsigned long long B = Z(); //get the number of bands | |
1638 | + T* s = (T*)malloc(sizeof(T) * B); //allocate memory to store a pixel | |
1639 | + for(unsigned long long n = 0; n < N; n++){ //for each pixel in the image | |
1640 | + if(mask == NULL || mask[n]){ //if the pixel is masked | |
1641 | + for(size_t b = 0; b < B; b++) //for each band in the spectrum | |
1642 | + s[b] *= (T)v; //multiply | |
1643 | + } | |
1644 | + | |
1645 | + if(PROGRESS) progress = (double)(n+1) / N * 100; //set the current progress | |
1646 | + | |
1647 | + target.write((char*)s, sizeof(T) * B); //write the corrected data into destination | |
1648 | + } //end for each pixel | |
1649 | + | |
1650 | + free(s); //free the spectrum | |
1651 | + target.close(); //close the output file | |
1652 | + return true; | |
1653 | + } | |
1654 | + | |
1655 | + bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){ | |
1656 | + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file | |
1657 | + std::string headername = outname + ".hdr"; //the header file name | |
1658 | + | |
1659 | + unsigned long long N = X() * Y(); //calculate the total number of pixels to be processed | |
1660 | + unsigned long long B = Z(); //get the number of bands | |
1661 | + T* s = (T*)malloc(sizeof(T) * B); //allocate memory to store a pixel | |
1662 | + for(unsigned long long n = 0; n < N; n++){ //for each pixel in the image | |
1663 | + if(mask == NULL || mask[n]){ //if the pixel is masked | |
1664 | + for(size_t b = 0; b < B; b++) //for each band in the spectrum | |
1665 | + s[b] += (T)v; //multiply | |
1666 | + } | |
1667 | + | |
1668 | + if(PROGRESS) progress = (double)(n+1) / N * 100; //set the current progress | |
1669 | + | |
1670 | + target.write((char*)s, sizeof(T) * B); //write the corrected data into destination | |
1671 | + } //end for each pixel | |
1672 | + | |
1673 | + free(s); //free the spectrum | |
1674 | + target.close(); //close the output file | |
1675 | + return true; | |
1676 | + } | |
1677 | + | |
1398 | 1678 | |
1399 | 1679 | |
1400 | 1680 | /// Close the file. | ... | ... |
stim/envi/bsq.h
... | ... | @@ -9,6 +9,7 @@ |
9 | 9 | #include <vector> |
10 | 10 | #include <deque> |
11 | 11 | #include <chrono> |
12 | +#include <future> | |
12 | 13 | |
13 | 14 | |
14 | 15 | |
... | ... | @@ -376,36 +377,144 @@ public: |
376 | 377 | |
377 | 378 | } |
378 | 379 | |
379 | - /// Convert the current BSQ file to a BIL file with the specified file name. | |
380 | - | |
381 | - /// @param outname is the name of the output BIL file to be saved to disk. | |
382 | - bool bil(std::string outname, bool PROGRESS = false) | |
383 | - { | |
384 | - //simplify image resolution | |
385 | - unsigned long long jump = (Y() - 1) * X() * sizeof(T); | |
380 | + void readlines(T* dest, size_t start, size_t n){ | |
381 | + hsi<T>::read(dest, 0, start, 0, X(), n, Z()); | |
382 | + } | |
386 | 383 | |
387 | - std::ofstream target(outname.c_str(), std::ios::binary); | |
388 | - std::string headername = outname + ".hdr"; | |
384 | + /// Convert this BSQ file to a BIL | |
385 | + bool bil(std::string outname, bool PROGRESS = false){ | |
389 | 386 | |
390 | - unsigned long long L = X(); | |
391 | - T* line = (T*)malloc(sizeof(T) * L); | |
387 | + const size_t buffers = 4; //number of buffers required for this algorithm | |
388 | + size_t mem_per_batch = binary<T>::buffer_size / buffers; //calculate the maximum memory available for a batch | |
392 | 389 | |
393 | - for ( unsigned long long y = 0; y < Y(); y++) //for each y position | |
394 | - { | |
395 | - file.seekg(y * X() * sizeof(T), std::ios::beg); //seek to the beginning of the xz slice | |
396 | - for ( unsigned long long z = 0; z < Z(); z++ ) //for each band | |
397 | - { | |
398 | - file.read((char *)line, sizeof(T) * X()); //read a line | |
399 | - target.write((char*)line, sizeof(T) * X()); //write the line to the output file | |
400 | - file.seekg(jump, std::ios::cur); //seek to the next band | |
401 | - if(PROGRESS) progress = (double)((y+1) * Z() + z + 1) / (Z() * Y()) * 100; //update the progress counter | |
390 | + size_t slice_bytes = X() * Z() * sizeof(T); //number of bytes in an input batch slice (Y-slice in this case) | |
391 | + size_t max_slices_per_batch = mem_per_batch / slice_bytes; //maximum number of slices we can process in one batch given memory constraints | |
392 | + if(max_slices_per_batch == 0){ //if there is insufficient memory for a single slice, throw an error | |
393 | + std::cout<<"error, insufficient memory for stim::bsq::bil()"<<std::endl; | |
394 | + exit(1); | |
395 | + } | |
396 | + size_t max_batch_bytes = max_slices_per_batch * slice_bytes; //calculate the amount of memory that will be allocated for all four buffers | |
397 | + | |
398 | + T* src[2]; //source double-buffer for asynchronous batching | |
399 | + src[0] = (T*) malloc(max_batch_bytes); | |
400 | + src[1] = (T*) malloc(max_batch_bytes); | |
401 | + T* dst[2]; //destination double-buffer for asynchronous batching | |
402 | + dst[0] = (T*) malloc(max_batch_bytes); | |
403 | + dst[1] = (T*) malloc(max_batch_bytes); | |
404 | + | |
405 | + size_t N[2]; //number of slices stored in buffers 0 and 1 | |
406 | + N[0] = N[1] = min(Y(), max_slices_per_batch); //start with the maximum number of slices that can be stored (may be the entire data set) | |
407 | + | |
408 | + std::ofstream target(outname.c_str(), std::ios::binary); //open an ou |