Commit af825cb9e3a875e096d138d147b325ab67c2beb0

Authored by Pavel Govyadinov
2 parents f7b84fb2 cf5b4c92

fixed merge conflicts

matlab/loadAgilent.m 0 → 100644
  1 +function S = loadAgilent(filename)
  2 +
  3 +fid = fopen(filename); %open the file for reading
  4 +fseek(fid, 9, 'bof'); %skip past the first 9 bytes of the header
  5 +
  6 +bands = fread(fid, 1, 'int16'); %read the number of bands in the file
  7 +fseek(fid, 13, 'cof'); %skip the next 13 bytes in the header
  8 +
  9 +samples = fread(fid, 1, 'int16'); %read the number of samples (X)
  10 +lines = fread(fid, 1, 'int16'); %read the number of lines (Y)
  11 +
  12 +fseek(fid, 1020, 'bof'); %skip past the entire header
  13 +S = fread(fid, [samples lines*bands], 'float32'); %read all the data
  14 +S = reshape(S, [samples, lines, bands]);
  15 +fclose(fid); %close the file
  16 +
  17 +
... ...
stim/biomodels/cellset.h 0 → 100644
  1 +#ifndef STIM_CELLSET_H
  2 +#define STIM_CELLSET_H
  3 +
  4 +#include <stim/math/vec3.h>
  5 +#include <vector>
  6 +#include <unordered_map>
  7 +#include <fstream>
  8 +
  9 +namespace stim{
  10 +
  11 +class cellset{
  12 +private:
  13 + static const char delim = ' ';
  14 +protected:
  15 + std::vector<double*> cells; //vector storing field data for each cell
  16 + std::unordered_map<std::string, size_t> fields; //unordered map storing field->index information for each field
  17 + size_t ip[3]; //hard code to position indices (for speed)
  18 +
  19 + void init(){
  20 +
  21 + }
  22 +public:
  23 + /// Constructor - create an empty cell set
  24 + cellset(){
  25 + init();
  26 + }
  27 +
  28 + /// Constructor - load a cellset from a file
  29 + cellset(std::string filename){
  30 + init(); //initialize an empty cellset
  31 + load(filename); //load the cellset from an existing file
  32 + }
  33 +
  34 + /// Loads a cellset from a file
  35 + void load(std::string filename){
  36 + std::ifstream infile(filename);
  37 + std::string header; //allocate space for the file header
  38 + std::getline(infile, header); //get the file header
  39 +
  40 + // break the header into fields
  41 + std::stringstream ss(header); //create a string stream
  42 + std::string field; //store a single field name
  43 + size_t i = 0; //current field index
  44 + while (std::getline(ss, field, delim)) { //split the header into individual fields
  45 + std::pair<std::string, size_t> p(field, i); //create a pair associating the header name with the index
  46 + fields.insert(p); //insert the pair into the fields map
  47 + i++; //increment the data index
  48 + }
  49 + size_t nfields = fields.size(); //store the number of fields for each cell
  50 +
  51 + //load each cell and all associated fields
  52 + std::string cell_line; //string holds all information for a cell
  53 + std::list<std::string> cell_list; //list will be temporary storage for the cell fields
  54 + while(std::getline(infile, cell_line)){ //for each cell entry
  55 + cell_list.push_back(cell_line); //push the cell entry into the list
  56 + }
  57 +
  58 + //convert the list into actual data
  59 + size_t ncells = cell_list.size(); //count the number of cells
  60 + cells.resize(ncells); //allocate enough space in the array to store all cells
  61 + for(size_t c = 0; c < ncells; c++){ //for each cell entry in the list
  62 + cells[c] = (double*) malloc(sizeof(double) * nfields); //allocate enough space for each field
  63 + std::stringstream fss(cell_list.front()); //turn the string representing the cell list into a stringstream
  64 + for(size_t f = 0; f < nfields; f++){ //for each field
  65 + fss>>cells[c][f]; //load the field
  66 + }
  67 + cell_list.pop_front(); //pop the read string off of the front of the list
  68 + }
  69 + infile.close(); //close the input file
  70 +
  71 + ip[0] = fields["x"]; //hard code the position indices for speed
  72 + ip[1] = fields["y"]; // this assumes all cells have positions
  73 + ip[2] = fields["z"];
  74 + }
  75 +
  76 + /// Return the value a specified field for a cell
  77 + /// @param c is the cell index
  78 + /// @param f is the field
  79 + double value(size_t c, std::string f){
  80 + size_t idx = fields[f];
  81 + return cells[c][idx];
  82 + }
  83 +
  84 + /// returns an ID used to look up a field
  85 + bool exists(std::string f){
  86 + std::unordered_map<std::string, size_t>::iterator iter = fields.find(f);
  87 + if(iter == fields.end()) return false;
  88 + else return true;
  89 + }
  90 +
  91 + /// Return the position of cell [i]
  92 + stim::vec3<double> p(size_t i){
  93 + stim::vec3<double> pos(cells[i][ip[0]], cells[i][ip[1]], cells[i][ip[2]]);
  94 + return pos;
  95 + }
  96 +
  97 + /// Return the number of cells in the set
  98 + size_t size(){
  99 + return cells.size();
  100 + }
  101 +
  102 + /// Return the maximum value of a field in this cell set
  103 + double max(std::string field){
  104 + size_t idx = fields[field]; //get the field index
  105 + size_t ncells = cells.size(); //get the total number of cells
  106 + double maxval, val; //stores the current and maximum values
  107 + for(size_t c = 0; c < ncells; c++){ //for each cell
  108 + val = cells[c][idx]; //get the field value for this cell
  109 + if(c == 0) maxval = val; //if this is the first cell, just assign the maximum
  110 + else if(val > maxval) maxval = val; // otherwise text for the size of val and assign it as appropriate
  111 + }
  112 + return maxval;
  113 + }
  114 +
  115 + /// Return the maximum value of a field in this cell set
  116 + double min(std::string field){
  117 + size_t idx = fields[field]; //get the field index
  118 + size_t ncells = cells.size(); //get the total number of cells
  119 + double minval, val; //stores the current and maximum values
  120 + for(size_t c = 0; c < ncells; c++){ //for each cell
  121 + val = cells[c][idx]; //get the field value for this cell
  122 + if(c == 0) minval = val; //if this is the first cell, just assign the maximum
  123 + else if(val < minval) minval = val; // otherwise text for the size of val and assign it as appropriate
  124 + }
  125 + return minval;
  126 + }
  127 +
  128 +
  129 +}; //end class cellset
  130 +}; //end namespace stim
  131 +
  132 +#endif
0 133 \ No newline at end of file
... ...
stim/cuda/ivote/local_max.cuh
... ... @@ -14,7 +14,7 @@ namespace stim{
14 14  
15 15 // calculate the 2D coordinates for this current thread.
16 16 int xi = blockIdx.x * blockDim.x + threadIdx.x;
17   - int yi = blockIdx.y;
  17 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
18 18  
19 19 if(xi >= x || yi >= y)
20 20 return;
... ... @@ -63,8 +63,10 @@ namespace stim{
63 63 void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){
64 64  
65 65 unsigned int max_threads = stim::maxThreadsPerBlock();
66   - dim3 threads(max_threads, 1);
67   - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  66 + /*dim3 threads(max_threads, 1);
  67 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);*/
  68 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  69 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
68 70  
69 71 //call the kernel to find the local maximum.
70 72 cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y);
... ...
stim/cuda/ivote/re_sample.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_RE_SAMPLE_H
  2 +#define STIM_CUDA_RE_SAMPLE_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/templates/gaussian_blur.cuh>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  14 +
  15 + unsigned int sigma_ds = 1/resize;
  16 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  17 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  18 +
  19 +
  20 + // calculate the 2D coordinates for this current thread.
  21 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  22 + int yi = blockIdx.y;
  23 + // convert 2D coordinates to 1D
  24 + int i = yi * x + xi;
  25 +
  26 + if(xi< x && yi< y){
  27 + if(xi%sigma_ds==0){
  28 + if(yi%sigma_ds==0){
  29 + gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
  30 + }
  31 + }
  32 + else gpuI[i] = 0;
  33 +
  34 + //int x_org = xi * sigma_ds ;
  35 + //int y_org = yi * sigma_ds ;
  36 + //int i_org = y_org * x + x_org;
  37 + //gpuI[i] = gpuI0[i_org];
  38 + }
  39 +
  40 + }
  41 +
  42 +
  43 + /// Applies a Gaussian blur to a 2D image stored on the GPU
  44 + template<typename T>
  45 + void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
  46 +
  47 +
  48 + //unsigned int sigma_ds = 1/resize;
  49 + //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  50 + //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  51 +
  52 + //get the number of pixels in the image
  53 + //unsigned int pixels_ds = x_ds * y_ds;
  54 +
  55 + unsigned int max_threads = stim::maxThreadsPerBlock();
  56 + dim3 threads(max_threads, 1);
  57 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  58 +
  59 + //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
  60 +
  61 + //resample the image
  62 + cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
  63 +
  64 + }
  65 +
  66 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  67 + template<typename T>
  68 + void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
  69 +
  70 + //get the number of pixels in the image
  71 + unsigned int pixels = x*y;
  72 + unsigned int bytes = sizeof(T) * pixels;
  73 +
  74 + unsigned int sigma_ds = 1/resize;
  75 + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
  76 + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
  77 + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
  78 +
  79 +
  80 +
  81 + //allocate space on the GPU for the original image
  82 + T* gpuI0;
  83 + cudaMalloc(&gpuI0, bytes_ds);
  84 +
  85 +
  86 + //copy the image data to the GPU
  87 + cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
  88 +
  89 + //allocate space on the GPU for the down sampled image
  90 + T* gpuI;
  91 + cudaMalloc(&gpuI, bytes);
  92 +
  93 + //run the GPU-based version of the algorithm
  94 + gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
  95 +
  96 + //copy the image data to the GPU
  97 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  98 +
  99 + cudaFree(gpuI0);
  100 + cudeFree(gpuI);
  101 + }
  102 +
  103 + }
  104 +}
  105 +
  106 +#endif
0 107 \ No newline at end of file
... ...
stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh
1   -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2   -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  1 +#ifndef STIM_CUDA_UPDATE_DIR_BB_H
  2 +#define STIM_CUDA_UPDATE_DIR_BB_H
3 3  
4 4 # include <iostream>
5 5 # include <cuda.h>
... ... @@ -7,8 +7,7 @@
7 7 #include <stim/cuda/sharedmem.cuh>
8 8 #include <stim/visualization/aabb2.h>
9 9 #include <stim/visualization/colormap.h>
10   -#include <math.h>
11   -#include "cpyToshare.cuh"
  10 +#include <math.h>
12 11  
13 12 //#define RMAX_TEST 8
14 13  
... ... @@ -76,68 +75,6 @@ namespace stim{
76 75 gpuDir[i] = atan2((T)max_dy, (T)max_dx);
77 76 }
78 77  
79   - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
80   - template<typename T>
81   - __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
82   -
83   -
84   - // calculate the 2D coordinates for this current thread.
85   - int xi = blockIdx.x * blockDim.x + threadIdx.x;
86   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
87   -
88   - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
89   -
90   - int i = yi * x + xi; // convert 2D coordinates to 1D
91   -
92   - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
93   - gpuDir[i] = 0; //initialize the vote direction to zero
94   - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
95   - int id_x = 0; // define two local variables for the x and y position of the maximum
96   - int id_y = 0;
97   -
98   - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
99   - int rmax_sq = rmax * rmax;
100   - int tx_rmax = threadIdx.x + rmax;
101   - float atan_angle;
102   - float vote_c;
103   - int xidx, yidx, yr_sq, xr_sq;
104   - for(int yr = -rmax; yr <= rmax; yr++){
105   - yidx = yi + yr; //compute the index into the image
106   - if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image
107   - yr_sq = yr * yr; //compute the square of yr, to save time later
108   - for(int xr = -rmax; xr <= rmax; xr++){
109   - xidx = xi + xr;
110   - if(xidx >= 0 && xidx < x){
111   - xr_sq = xr * xr;
112   - unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
113   -
114   - // calculate the angle between the voter and the current pixel in x and y directions
115   - atan_angle = gpuTable[ind_t];
116   - //atan_angle = atan2((T)yr, (T)xr);
117   -
118   - // check if the current pixel is located in the voting area of this voter.
119   - if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
120   -
121   - vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter
122   - // compare the vote value of this pixel with the max value to find the maxima and its index.
123   - if (vote_c>max) {
124   -
125   - max = vote_c;
126   - id_x = xr;
127   - id_y = yr;
128   - }
129   - }
130   - }
131   - }
132   - }
133   - }
134   -
135   - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
136   - float new_angle = gpuTable[ind_m];
137   -
138   - if(xi < x && yi < y)
139   - gpuDir[i] = new_angle;
140   - } //end kernel
141 78  
142 79  
143 80 // this kernel updates the gradient direction by the calculated voting direction.
... ... @@ -168,9 +105,7 @@ namespace stim{
168 105 HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
169 106  
170 107 unsigned int max_threads = stim::maxThreadsPerBlock();
171   - //dim3 threads(min(x, max_threads), 1);
172   - //dim3 blocks(x/threads.x, y);
173   -
  108 +
174 109 dim3 threads( sqrt(max_threads), sqrt(max_threads) );
175 110 dim3 blocks(x/threads.x + 1, y/threads.y + 1);
176 111  
... ... @@ -188,12 +123,12 @@ namespace stim{
188 123  
189 124 //call the kernel to calculate the new voting direction
190 125 cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
191   - stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  126 + //stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
192 127  
193 128 //exit(0);
194 129  
195   - threads = dim3( sqrt(max_threads), sqrt(max_threads) );
196   - blocks = dim3(x/threads.x + 1, y/threads.y + 1);
  130 + //threads = dim3( sqrt(max_threads), sqrt(max_threads) );
  131 + //blocks = dim3(x/threads.x + 1, y/threads.y + 1);
197 132  
198 133 //call the kernel to update the gradient direction
199 134 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
... ...
stim/cuda/ivote/david_update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_threshold_global.cuh
1   -#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2   -#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  1 +#ifndef STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_THRESHOLD_GLOBAL_H
3 3  
4 4 # include <iostream>
5 5 # include <cuda.h>
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8   -#include <math.h>
9   -#include "cpyToshare.cuh"
10   -
11   -#define RMAX_TEST 8
  8 +#include "cpyToshare.cuh"
12 9  
13 10 namespace stim{
14 11 namespace cuda{
15 12  
16 13 // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
17 14 template<typename T>
18   - __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
19   - extern __shared__ T atan2_table[];
20   -
21   - //calculate the start point for this block
22   - //int bxi = blockIdx.x * blockDim.x;
23   -
24   - stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
  15 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
25 16  
26   - __syncthreads();
27 17  
28   - // calculate the 2D coordinates for this current thread.
29   - //int xi = bxi + threadIdx.x;
  18 +
  19 + // calculate the coordinate for this current thread.
30 20 int xi = blockIdx.x * blockDim.x + threadIdx.x;
31   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
32   - if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
33   -
34   - int i = yi * x + xi; // convert 2D coordinates to 1D
  21 + // calculate the voting direction based on the grtadient direction
  22 + float theta = gpuTh[3*xi];
35 23  
36   - float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
37   - gpuDir[i] = 0; //initialize the vote direction to zero
38   - float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
39   - int id_x = 0; // define two local variables for the x and y position of the maximum
40   - int id_y = 0;
  24 + //calculate the position and x, y coordinations of this voter in the original image
  25 + unsigned int i_v = gpuTh[3*xi+2];
  26 + unsigned int y_v = i_v/x;
  27 + unsigned int x_v = i_v - (y_v*x);
41 28  
42   - int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
  29 + //initialize the vote direction to zero
  30 + gpuDir[xi] = 0;
  31 +
  32 + // define a local variable to maximum value of the vote image in the voting area for this voter
  33 + float max = 0;
  34 +
  35 + // define two local variables for the x and y coordinations where the maximum happened
  36 + int id_x = 0;
  37 + int id_y = 0;
  38 +
  39 + // compute the size of window which will be checked for finding the voting area for this voter
  40 + int x_table = 2*rmax +1;
43 41 int rmax_sq = rmax * rmax;
44 42 int tx_rmax = threadIdx.x + rmax;
45   - float atan_angle;
46   - float vote_c;
47   - unsigned int ind_t;
48   - for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction
49   - if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image)
50   - for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction
51   - if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter
52   -
53   - ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table
54   - atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table
55   -
56   - //atan_angle = atan2((float)yr, (float)xr);
57   -
58   - if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter.
59   - vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter
60   - if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index.
61   - max = vote_c;
62   - id_x = xr;
63   - id_y = yr;
64   - }
  43 + if(xi < th_size){
  44 +
  45 + for(int yr = -rmax; yr <= rmax; yr++){
  46 +
  47 + for(int xr = -rmax; xr <= rmax; xr++){
  48 +
  49 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  50 +
  51 + // find the angle between the voter and the current pixel in x and y directions
  52 + float atan_angle = gpuTable[ind_t];
  53 +
  54 + // check if the current pixel is located in the voting area of this voter.
  55 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  56 + // find the vote value for the current counter
  57 + float vote_c = gpuVote[(y_v+yr)*x + (x_v+xr)];
  58 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  59 + if (vote_c>max) {
  60 +
  61 + max = vote_c;
  62 + id_x = xr;
  63 + id_y = yr;
65 64 }
66 65 }
67 66 }
68 67 }
69   - }
  68 +
70 69  
71   - unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
72   - float new_angle = gpuTable[ind_m];
  70 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  71 + float new_angle = gpuTable[ind_m];
  72 + gpuDir[xi] = new_angle;
  73 + }
73 74  
74   - if(xi < x && yi < y)
75   - gpuDir[i] = new_angle;
76   - } //end kernel
  75 + }
77 76  
78 77 // this kernel updates the gradient direction by the calculated voting direction.
79 78 template<typename T>
80   - __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  79 + __global__ void cuda_update_grad(T* gpuTh, T* gpuDir, int th_size, int x, int y){
81 80  
82   - // calculate the 2D coordinates for this current thread.
  81 + // calculate the coordinate for this current thread.
83 82 int xi = blockIdx.x * blockDim.x + threadIdx.x;
84   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
85   -
86   - // convert 2D coordinates to 1D
87   - int i = yi * x + xi;
88 83  
  84 +
89 85 //update the gradient image with the vote direction
90   - gpuGrad[2*i] = gpuDir[i];
  86 + gpuTh[3*xi] = gpuDir[xi];
91 87 }
92 88  
93 89 template<typename T>
94   - void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
95   -
96   -
  90 + void gpu_update_dir(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
97 91  
98 92 //calculate the number of bytes in the array
99   - unsigned int bytes = x * y * sizeof(T);
  93 + unsigned int bytes_th = th_size* sizeof(T);
100 94  
101 95 unsigned int max_threads = stim::maxThreadsPerBlock();
102   -
103   - dim3 threads(sqrt(max_threads), sqrt(max_threads));
104   - dim3 blocks(x/threads.x + 1, y/threads.y + 1);
105   -
106   -
  96 + dim3 threads(max_threads);
  97 + dim3 blocks(th_size/threads.x+1);
107 98  
108 99 // allocate space on the GPU for the updated vote direction
109 100 T* gpuDir;
110   - cudaMalloc(&gpuDir, bytes);
111   -
112   - size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
113   - std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
  101 + cudaMalloc(&gpuDir, bytes_th);
114 102  
115 103 //call the kernel to calculate the new voting direction
116   - cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  104 + cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
117 105  
118 106 //call the kernel to update the gradient direction
119   - cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  107 + cuda_update_grad <<< blocks, threads >>>(gpuTh, gpuDir, th_size, x , y);
120 108  
121 109 //free allocated memory
122 110 cudaFree(gpuDir);
... ...
stim/cuda/ivote/vote_atomic_bb.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_VOTE_ATOMIC_BB_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/visualization/aabb2.h>
  9 +#include <stim/visualization/colormap.h>
  10 +#include <math.h>
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  16 + template<typename T>
  17 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  18 +
  19 + extern __shared__ T S[];
  20 + T* shared_atan = S;
  21 + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  22 + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  23 +
  24 + // calculate the 2D coordinates for this current thread.
  25 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  26 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  27 +
  28 + if(xi >= x || yi >= y) return;
  29 + // convert 2D coordinates to 1D
  30 + int i = yi * x + xi;
  31 +
  32 + // calculate the voting direction based on the grtadient direction
  33 + float theta = gpuGrad[2*i];
  34 + //calculate the amount of vote for the voter
  35 + float mag = gpuGrad[2*i + 1];
  36 +
  37 +
  38 + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
  39 + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
  40 + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
  41 + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
  42 +
  43 + // compute the size of window which will be checked for finding the proper voters for this pixel
  44 + int x_table = 2*rmax +1;
  45 + int rmax_sq = rmax * rmax;
  46 +
  47 + int lut_i;
  48 + T dx_sq, dy_sq;
  49 +
  50 + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
  51 + bb.trim_high(x-1, y-1);
  52 +
  53 + int by, bx;
  54 + int dx, dy;
  55 +
  56 + unsigned int ind_g; //initialize the maximum vote value to zero
  57 + T alpha;
  58 +
  59 + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
  60 + dy = by - yi; //calculate the y coordinate of the current point relative to yi
  61 + dy_sq = dy * dy;
  62 + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  63 + dx = bx - xi;
  64 + dx_sq = dx * dx;
  65 + lut_i = (rmax - dy) * x_table + rmax - dx;
  66 + alpha = shared_atan[lut_i];
  67 + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  68 + ind_g = (by)*x + (bx);
  69 + atomicAdd(&gpuVote[ind_g], mag);
  70 +
  71 + }
  72 + }
  73 + }
  74 +
  75 + }
  76 +
  77 +
  78 + template<typename T>
  79 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  80 +
  81 +
  82 + unsigned int max_threads = stim::maxThreadsPerBlock();
  83 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  84 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  85 + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  86 + size_t shared_mem_req = table_bytes;// + template_bytes;
  87 + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  88 + size_t shared_mem = stim::sharedMemPerBlock();
  89 + if(shared_mem_req > shared_mem){
  90 + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
  91 + exit(1);
  92 + }
  93 +
  94 + //call the kernel to do the voting
  95 + cuda_vote <<< blocks, threads, shared_mem_req>>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  96 +
  97 + }
  98 +
  99 +
  100 + template<typename T>
  101 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  102 +
  103 + //calculate the number of bytes in the array
  104 + unsigned int bytes = x * y * sizeof(T);
  105 +
  106 + //calculate the number of bytes in the atan2 table
  107 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  108 +
  109 + //allocate space on the GPU for the Vote Image
  110 + T* gpuVote;
  111 + cudaMalloc(&gpuVote, bytes);
  112 +
  113 + //allocate space on the GPU for the input Gradient image
  114 + T* gpuGrad;
  115 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  116 +
  117 + //copy the Gradient Magnitude data to the GPU
  118 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  119 +
  120 + //allocate space on the GPU for the atan2 table
  121 + T* gpuTable;
  122 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  123 +
  124 + //copy the atan2 values to the GPU
  125 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  126 +
  127 + //call the GPU version of the vote calculation function
  128 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  129 +
  130 + //copy the Vote Data back to the CPU
  131 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  132 +
  133 + //free allocated memory
  134 + cudaFree(gpuTable);
  135 + cudaFree(gpuVote);
  136 + cudaFree(gpuGrad);
  137 + }
  138 +
  139 + }
  140 +}
  141 +
  142 +#endif
0 143 \ No newline at end of file
... ...
stim/cuda/ivote/vote_atomic_shared.cuh
... ... @@ -5,7 +5,7 @@
5 5 # include <cuda.h>
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
8   -#include "cpyToshare.cuh"
  8 +
9 9 //#include "writebackshared.cuh"
10 10 namespace stim{
11 11 namespace cuda{
... ...
stim/cuda/ivote/vote_shared.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_SHARED_H
  2 +#define STIM_CUDA_VOTE_SHARED
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  15 +
  16 + //generate a pointer to shared memory (size will be specified as a kernel parameter)
  17 + extern __shared__ float s_grad[];
  18 +
  19 + //calculate the start point for this block
  20 + int bxi = blockIdx.x * blockDim.x;
  21 +
  22 + // calculate the 2D coordinates for this current thread.
  23 + int xi = bxi + threadIdx.x;
  24 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  25 + // convert 2D coordinates to 1D
  26 + int i = yi * x + xi;
  27 +
  28 + // define a local variable to sum the votes from the voters
  29 + float sum = 0;
  30 +
  31 + //calculate the width of the shared memory block
  32 + int swidth = 2 * rmax + blockDim.x;
  33 +
  34 + // compute the size of window which will be checked for finding the proper voters for this pixel
  35 + int x_table = 2*rmax +1;
  36 + int rmax_sq = rmax * rmax;
  37 + int tx_rmax = threadIdx.x + rmax;
  38 + int bxs = bxi - rmax;
  39 +
  40 + //for every line (along y)
  41 + for(int yr = -rmax; yr <= rmax; yr++){
  42 + if (yi+yr<y && yi+yr>=0){
  43 + //copy the portion of the image necessary for this block to shared memory
  44 + __syncthreads();
  45 + cpyG2S1D2ch<float>(s_grad, gpuGrad, bxs, yi + yr , 2*swidth, 1, threadIdx, blockDim, x, y);
  46 + __syncthreads();
  47 +
  48 + if(xi < x && yi < y){
  49 +
  50 + for(int xr = -rmax; xr <= rmax; xr++){
  51 +
  52 + //find the location of this voter in the atan2 table
  53 + int id_t = (yr + rmax) * x_table + xr + rmax;
  54 +
  55 + // calculate the angle between the pixel and the current voter in x and y directions
  56 + float atan_angle = gpuTable[id_t];
  57 +
  58 + // calculate the voting direction based on the grtadient direction
  59 + int idx_share = xr + tx_rmax ;
  60 + float theta = s_grad[idx_share*2];
  61 + float mag = s_grad[idx_share*2 + 1];
  62 +
  63 +
  64 + // check if the current voter is located in the voting area of this pixel.
  65 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  66 + sum += mag;
  67 +
  68 + }
  69 + }
  70 +
  71 + }
  72 + }
  73 + }
  74 + if(xi < x && yi < y)
  75 + gpuVote[i] = sum;
  76 +
  77 + }
  78 +
  79 + template<typename T>
  80 + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  81 +
  82 +
  83 + unsigned int max_threads = stim::maxThreadsPerBlock();
  84 + dim3 threads(max_threads, 1);
  85 + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
  86 +
  87 +
  88 + // specify share memory
  89 + unsigned int share_bytes = (2*rmax + threads.x)*1*2*sizeof(T);
  90 +
  91 + //call the kernel to do the voting
  92 + cuda_vote <<< blocks, threads,share_bytes >>>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  93 +
  94 + }
  95 +
  96 +
  97 + template<typename T>
  98 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  99 +
  100 + //calculate the number of bytes in the array
  101 + unsigned int bytes = x * y * sizeof(T);
  102 +
  103 + //calculate the number of bytes in the atan2 table
  104 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  105 +
  106 + //allocate space on the GPU for the Vote Image
  107 + T* gpuVote;
  108 + cudaMalloc(&gpuVote, bytes);
  109 +
  110 + //allocate space on the GPU for the input Gradient image
  111 + T* gpuGrad;
  112 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  113 +
  114 + //copy the Gradient Magnitude data to the GPU
  115 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  116 +
  117 + //allocate space on the GPU for the atan2 table
  118 + T* gpuTable;
  119 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  120 +
  121 + //copy the atan2 values to the GPU
  122 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  123 +
  124 + //call the GPU version of the vote calculation function
  125 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  126 +
  127 + //copy the Vote Data back to the CPU
  128 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  129 +
  130 + //free allocated memory
  131 + cudaFree(gpuTable);
  132 + cudaFree(gpuVote);
  133 + cudaFree(gpuGrad);
  134 + }
  135 +
  136 + }
  137 +}
  138 +
  139 +#endif
0 140 \ No newline at end of file
... ...
stim/cuda/ivote/vote_threshold_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
  2 +#define STIM_CUDA_VOTE_THRESHOLD_GLOBAL_H
  3 +# include <iostream>
  4 +# include <cuda.h>
  5 +#include <stim/cuda/cudatools.h>
  6 +#include <stim/cuda/sharedmem.cuh>
  7 +#include "cpyToshare.cuh"
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + // this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  13 + template<typename T>
  14 + __global__ void cuda_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, int rmax, int th_size, int x, int y){
  15 +
  16 +
  17 + // calculate the x coordinate for this current thread.
  18 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  19 +
  20 + // calculate the voting direction based on the grtadient direction
  21 + float theta = gpuTh[3*xi];
  22 + //find the gradient magnitude for the current voter
  23 + float mag = gpuTh[3*xi + 1];
  24 + //calculate the position and x, y coordinations of this voter in the original image
  25 + unsigned int i_v = gpuTh[3*xi+2];
  26 + unsigned int y_v = i_v/x;
  27 + unsigned int x_v = i_v - (y_v*x);
  28 +
  29 + // compute the size of window which will be checked for finding the proper voters for this pixel
  30 + int x_table = 2*rmax +1;
  31 + int rmax_sq = rmax * rmax;
  32 + if(xi < th_size){
  33 + for(int yr = -rmax; yr <= rmax; yr++){
  34 + for(int xr = -rmax; xr <= rmax; xr++){
  35 + if ((y_v+yr)>=0 && (y_v+yr)<y && (x_v+xr)>=0 && (x_v+xr)<x){
  36 +
  37 + //find the location of the current pixel in the atan2 table
  38 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  39 +
  40 + // calculate the angle between the voter and the current pixel in x and y directions
  41 + float atan_angle = gpuTable[ind_t];
  42 +
  43 + // check if the current pixel is located in the voting area of this voter.
  44 + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  45 + // calculate the 1D index for the current pixel in global memory
  46 + unsigned int ind_g = (y_v+yr)*x + (x_v+xr);
  47 + atomicAdd(&gpuVote[ind_g], mag);
  48 +
  49 + }
  50 + }
  51 + }
  52 + }
  53 + }
  54 + }
  55 +
  56 + template<typename T>
  57 + void gpu_vote(T* gpuVote, T* gpuTh, T* gpuTable, T phi, unsigned int rmax, unsigned int th_size, unsigned int x, unsigned int y){
  58 +
  59 +
  60 + unsigned int max_threads = stim::maxThreadsPerBlock();
  61 + dim3 threads(max_threads);
  62 + dim3 blocks(th_size/threads.x + 1);
  63 +
  64 + //call the kernel to do the voting
  65 + cuda_vote <<< blocks, threads>>>(gpuVote, gpuTh, gpuTable, phi, rmax, th_size, x , y);
  66 +
  67 + }
  68 +
  69 +
  70 + template<typename T>
  71 + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  72 +
  73 + //calculate the number of bytes in the array
  74 + unsigned int bytes = x * y * sizeof(T);
  75 +
  76 + //calculate the number of bytes in the atan2 table
  77 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  78 +
  79 + //allocate space on the GPU for the Vote Image
  80 + T* gpuVote;
  81 + cudaMalloc(&gpuVote, bytes);
  82 +
  83 + //allocate space on the GPU for the input Gradient image
  84 + T* gpuGrad;
  85 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  86 +
  87 + //copy the Gradient Magnitude data to the GPU
  88 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  89 +
  90 + //allocate space on the GPU for the atan2 table
  91 + T* gpuTable;
  92 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  93 +
  94 + //copy the atan2 values to the GPU
  95 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  96 +
  97 + //call the GPU version of the vote calculation function
  98 + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  99 +
  100 + //copy the Vote Data back to the CPU
  101 + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ;
  102 +
  103 + //free allocated memory
  104 + cudaFree(gpuTable);
  105 + cudaFree(gpuVote);
  106 + cudaFree(gpuGrad);
  107 + }
  108 +
  109 + }
  110 +}
  111 +
  112 +#endif
0 113 \ No newline at end of file
... ...
stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh
1   -#ifndef STIM_CUDA_IVOTE_ATOMIC_H
2   -#define STIM_CUDA_IVOTE_ATOMIC_H
  1 +#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
  2 +#define STIM_CUDA_IVOTE_ATOMIC_BB_H
3 3  
4 4 #include <stim/cuda/ivote/down_sample.cuh>
5 5 #include <stim/cuda/ivote/local_max.cuh>
6   -#include <stim/cuda/ivote/update_dir_global.cuh>
7   -//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
8   -#include <stim/cuda/ivote/vote_atomic_shared.cuh>
9   -//#include <stim/cuda/ivote/re_sample.cuh>
  6 +#include <stim/cuda/ivote/update_dir_bb.cuh>
  7 +#include <stim/cuda/ivote/vote_atomic_bb.cuh>
  8 +
10 9 namespace stim{
11 10 namespace cuda{
12 11  
... ...
stim/envi/agilent_binary.h
... ... @@ -35,26 +35,28 @@ public:
35 35 void alloc(){
36 36 ptr = (T*) malloc(bytes());
37 37 }
38   - void alloc(short x, short y, short z){
  38 + void alloc(size_t x, size_t y, size_t z){
39 39 R[0] = x;
40 40 R[1] = y;
41 41 R[2] = z;
42 42 alloc();
43 43 }
44 44  
  45 + /// Create a deep copy of an agileng_binary object
45 46 void deep_copy(agilent_binary<T>* dst, const agilent_binary<T>* src){
46 47 dst->alloc(src->R[0], src->R[1], src->R[2]); //allocate memory
47 48 memcpy(dst->ptr, src->ptr, bytes()); //copy the data
48 49 memcpy(dst->Z, src->Z, sizeof(double) * 2); //copy the data z range
49 50 }
50 51  
  52 + /// Default constructor, sets the resolution to zero and the data pointer to NULL
51 53 agilent_binary(){
52   - memset(R, 0, sizeof(short) * 3); //set the resolution to zero
  54 + memset(R, 0, sizeof(size_t) * 3); //set the resolution to zero
53 55 ptr = NULL;
54 56 }
55 57  
56 58 /// Constructor with resolution
57   - agilent_binary(short x, short y, short z){
  59 + agilent_binary(size_t x, size_t y, size_t z){
58 60 alloc(x, y, z);
59 61 }
60 62  
... ... @@ -109,13 +111,11 @@ public:
109 111  
110 112 char zero = 0;
111 113 for(size_t i = 0; i < 9; i++) outfile.write(&zero, 1); //write 9 zeros
112   - outfile.write((char*)&R[0], 2);
  114 + outfile.write((char*)&R[2], 2);
113 115 for(size_t i = 0; i < 13; i++) outfile.write(&zero, 1); //write 13 zeros
  116 + outfile.write((char*)&R[0], 2);
114 117 outfile.write((char*)&R[1], 2);
115   - outfile.write((char*)&R[2], 2);
116 118 for(size_t i = 0; i < 992; i++) outfile.write(&zero, 1); //write 992 zeros
117   - //char zerovec[1020];
118   - //outfile.write((char*)zerovec, 1020);
119 119  
120 120 size_t b = bytes();
121 121 outfile.write((char*)ptr, b); //write the data to the output file
... ... @@ -149,7 +149,7 @@ public:
149 149  
150 150 #ifdef CUDA_FOUND
151 151 /// Perform an FFT and return a binary file with bands in the specified range
152   - agilent_binary<T> fft(float band_min, float band_max){
  152 + agilent_binary<T> fft(double band_min, double band_max, double ELWN = 15798, int UDR = 2){
153 153 auto total_start = std::chrono::high_resolution_clock::now();
154 154  
155 155 auto start = std::chrono::high_resolution_clock::now();
... ... @@ -177,8 +177,8 @@ public:
177 177  
178 178 start = std::chrono::high_resolution_clock::now();
179 179 int N[1]; //create an array with the interferogram size (required for cuFFT input)
180   - N[0] = R[2]; //set the only array value to the interferogram size
181   - if(cufftPlanMany(&plan, 1, N, NULL, 1, R[2], NULL, 1, R[2], CUFFT_R2C, batch) != CUFFT_SUCCESS){
  180 + N[0] = (int)R[2]; //set the only array value to the interferogram size
  181 + if(cufftPlanMany(&plan, 1, N, NULL, 1, (int)R[2], NULL, 1, (int)R[2], CUFFT_R2C, (int)batch) != CUFFT_SUCCESS){
182 182 std::cout<<"cuFFT Error: unable to create 1D plan."<<std::endl;
183 183 exit(1);
184 184 }
... ... @@ -199,12 +199,13 @@ public:
199 199 std::complex<T>* cpu_fft = (std::complex<T>*) malloc( R[0] * R[1] * (R[2]/2+1) * sizeof(std::complex<T>) );
200 200 HANDLE_ERROR(cudaMemcpy(cpu_fft, gpu_fft, R[0] * R[1] * (R[2]/2+1) * sizeof(cufftComplex), cudaMemcpyDeviceToHost)); //copy data from the host to the device
201 201  
202   - double int_delta = 0.00012656; //interferogram sample spacing in centimeters
  202 + //double int_delta = 0.00012656; //interferogram sample spacing in centimeters
  203 + double int_delta = (1.0 / ELWN) * ((double)UDR / 2.0); //calculate the interferogram spacing
203 204 double int_length = int_delta * R[2]; //interferogram length in centimeters
204 205 double fft_delta = 1/int_length; //spectrum spacing (in inverse centimeters, wavenumber
205 206  
206   - size_t start_i = std::ceil(band_min / fft_delta); //calculate the first band to store
207   - size_t size_i = std::floor(band_max / fft_delta) - start_i; //calculate the number of bands to store
  207 + size_t start_i = (size_t)std::ceil(band_min / fft_delta); //calculate the first band to store
  208 + size_t size_i = (size_t)std::floor(band_max / fft_delta) - start_i; //calculate the number of bands to store
208 209 size_t end_i = start_i + size_i; //last band number
209 210 agilent_binary<T> result(R[0], R[1], size_i);
210 211 result.Z[0] = start_i * fft_delta; //set the range for the FFT result
... ...
stim/envi/bil.h
... ... @@ -1309,6 +1309,66 @@ public:
1309 1309  
1310 1310 }
1311 1311  
  1312 + bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
  1313 + unsigned long long B = Z(); //calculate the number of bands
  1314 + unsigned long long ZX = Z() * X();
  1315 + unsigned long long XY = X() * Y(); //calculate the number of pixels in a band
  1316 + unsigned long long S = XY * sizeof(T); //calculate the number of bytes in a band
  1317 + unsigned long long L = ZX * sizeof(T);
  1318 +
  1319 + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file
  1320 +
  1321 + T * c; //pointer to the current ZX slice
  1322 + c = (T*)malloc( L ); //allocate space for the slice
  1323 +
  1324 + for(unsigned long long j = 0; j < Y(); j++){ //for each line
  1325 + read_plane_y(c, j); //load the line into memory
  1326 + for(unsigned long long i = 0; i < B; i++){ //for each band
  1327 + for(unsigned long long m = 0; m < X(); m++){ //for each sample
  1328 + if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked
  1329 + c[m + i * X()] *= (T)v;
  1330 + }
  1331 + }
  1332 + target.write(reinterpret_cast<const char*>(c), L); //write normalized data into destination
  1333 +
  1334 + if(PROGRESS) progress = (double)(j+1) / Y() * 100; //update the progress
  1335 + }
  1336 +
  1337 + free(c); //free the slice memory
  1338 + target.close(); //close the output file
  1339 + return true;
  1340 + }
  1341 +
  1342 + bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
  1343 + unsigned long long B = Z(); //calculate the number of bands
  1344 + unsigned long long ZX = Z() * X();
  1345 + unsigned long long XY = X() * Y(); //calculate the number of pixels in a band
  1346 + unsigned long long S = XY * sizeof(T); //calculate the number of bytes in a band
  1347 + unsigned long long L = ZX * sizeof(T);
  1348 +
  1349 + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file
  1350 +
  1351 + T * c; //pointer to the current ZX slice
  1352 + c = (T*)malloc( L ); //allocate space for the slice
  1353 +
  1354 + for(unsigned long long j = 0; j < Y(); j++){ //for each line
  1355 + read_plane_y(c, j); //load the line into memory
  1356 + for(unsigned long long i = 0; i < B; i++){ //for each band
  1357 + for(unsigned long long m = 0; m < X(); m++){ //for each sample
  1358 + if( mask == NULL && mask[m + j * X()] ) //if the pixel is masked
  1359 + c[m + i * X()] += (T)v;
  1360 + }
  1361 + }
  1362 + target.write(reinterpret_cast<const char*>(c), L); //write normalized data into destination
  1363 +
  1364 + if(PROGRESS) progress = (double)(j+1) / Y() * 100; //update the progress
  1365 + }
  1366 +
  1367 + free(c); //free the slice memory
  1368 + target.close(); //close the output file
  1369 + return true;
  1370 + }
  1371 +
1312 1372 /// Close the file.
1313 1373 bool close(){
1314 1374 file.close();
... ...
stim/envi/binary.h
... ... @@ -7,6 +7,13 @@
7 7 #include "../math/vector.h"
8 8 #include <fstream>
9 9 #include <sys/stat.h>
  10 +#include <cstring>
  11 +
  12 +#ifdef _WIN32
  13 +#include <Windows.h>
  14 +#else
  15 +#include <unistd.h>
  16 +#endif
10 17  
11 18 namespace stim{
12 19  
... ... @@ -30,14 +37,16 @@ protected:
30 37  
31 38 double progress; //stores the progress on the current operation (accessible using a thread)
32 39  
  40 + size_t buffer_size; //available memory for processing large files
33 41  
34 42 /// Private initialization function used to set default parameters in the data structure.
35 43 void init(){
36   - memset(R, 0, sizeof(unsigned long long) * D); //initialize the resolution to zero
37   - header = 0; //initialize the header size to zero
  44 + std::memset(R, 0, sizeof(unsigned long long) * D); //initialize the resolution to zero
  45 + header = 0; //initialize the header size to zero
38 46 mask = NULL;
39 47  
40 48 progress = 0;
  49 + set_buffer(); //set the maximum buffer size to the default
41 50 }
42 51  
43 52 /// Private helper function that returns the size of the file on disk using system functions.
... ... @@ -105,6 +114,11 @@ protected:
105 114  
106 115 public:
107 116  
  117 + //default constructor
  118 + binary(){
  119 + init();
  120 + }
  121 +
108 122 double get_progress(){
109 123 return progress;
110 124 }
... ... @@ -113,6 +127,20 @@ public:
113 127 progress = 0;
114 128 }
115 129  
  130 + //specify the maximum fraction of available memory that this class will use for buffering
  131 + void set_buffer(double mem_frac = 0.5){ //default to 50%
  132 +#ifdef _WIN32
  133 + MEMORYSTATUSEX statex;
  134 + statex.dwLength = sizeof (statex);
  135 + GlobalMemoryStatusEx (&statex);
  136 + buffer_size = (size_t)(statex.ullAvailPhys * mem_frac);
  137 +#else
  138 + size_t pages = sysconf(_SC_PHYS_PAGES);
  139 + size_t page_size = sysconf(_SC_PAGE_SIZE);
  140 + buffer_size = (size_t)(pages * page_size * mem_frac);
  141 +#endif
  142 + }
  143 +
116 144 /// Open a binary file for streaming.
117 145  
118 146 /// @param filename is the name of the binary file
... ... @@ -375,6 +403,96 @@ public:
375 403 return read_pixel(p, i);
376 404 }
377 405  
  406 + /// Reads a block specified by an (x, y, z) position and size using the largest possible contiguous reads
  407 + bool read(T* dest, size_t x, size_t y, size_t z, size_t sx, size_t sy, size_t sz){
  408 +
  409 + size_t size_bytes = sx * sy * sz * sizeof(T); //size of the block to read in bytes
  410 +
  411 + size_t start = z * R[0] * R[1] + y * R[0] + x; //calculate the start postion
  412 + size_t start_bytes = start * sizeof(T); //start position in bytes
  413 + file.seekg(start * sizeof(T), std::ios::beg); //seek to the start position
  414 +
  415 +
  416 + if(sx == R[0] && sy == R[1]){ //if sx and sy result in a contiguous volume along z
  417 + file.read((char*)dest, size_bytes); //read the block in one pass
  418 + return true;
  419 + }
  420 +
  421 + if(sx == R[0]){ //if sx is contiguous, read each z-axis slice can be read in one pass
  422 + size_t jump_bytes = (R[1] - sy) * R[0] * sizeof(T); //jump between each slice
  423 + size_t slice_bytes = sx * sy * sizeof(T); //size of the slice to be read
  424 + for(size_t zi = 0; zi < sz; zi++){ //for each z-axis slice
  425 + file.read((char*)dest, slice_bytes); //read the slice
  426 + dest += sx * sy; //move the destination pointer to the next slice
  427 + file.seekg(jump_bytes, std::ios::cur); //skip to the next slice in the file
  428 + }
  429 + return true;
  430 + }
  431 +
  432 + //in this case, x is not contiguous so the volume must be read line-by-line
  433 + size_t jump_x_bytes = (R[0] - sx) * sizeof(T); //number of bytes skipped in the x direction
  434 + size_t jump_y_bytes = (R[1] - sy) * R[0] * sizeof(T) + jump_x_bytes; //number of bytes skipped between slices
  435 + size_t line_bytes = sx * sizeof(T); //size of the line to be read
  436 + size_t zi, yi;
  437 + for(zi = 0; zi < sz; zi++){ //for each slice
  438 + file.read((char*)dest, line_bytes); //read the first line
  439 + for(yi = 1; yi < sy; yi++){ //read each additional line
  440 + dest += sx; //move the pointer in the destination block to the next line
  441 + file.seekg(jump_x_bytes, std::ios::cur); //skip to the next line in the file
  442 + file.read((char*)dest, line_bytes); //read the line to the destination block
  443 + }
  444 + file.seekg(jump_y_bytes, std::ios::cur); //skip to the beginning of the next slice
  445 + }
  446 + return false;
  447 + }
  448 +
  449 + // permutes a block of data from the current interleave to the interleave specified (re-arranged dimensions to the order specified by [d0, d1, d2])
  450 +
  451 + void permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){
  452 + size_t d[3] = {d0, d1, d2};
  453 + size_t s[3] = {sx, sy, sz};
  454 + size_t p[3];// = {x, y, z};
  455 +
  456 + if(d[0] == 0 && d[1] == 1 && d[2] == 2){
  457 + //this isn't actually a permute - just copy the data
  458 + memcpy(dest, src, sizeof(T) * sx * sy * sz);
  459 + }
  460 + else if(d[0] == 0){ //the individual lines are contiguous, so you can memcpy line-by-line
  461 + size_t y, z;
  462 + size_t src_idx, dest_idx;
  463 + size_t x_bytes = sizeof(T) * sx;
  464 + for(z = 0; z < sz; z++){
  465 + p[2] = z;
  466 + for(y = 0; y < sy; y++){
  467 + p[1] = y;
  468 + src_idx = z * sx * sy + y * sx;
  469 + dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]];
  470 + //std::cout<<z<<", "<<y<<" ------- "<<p[d[2]]<<" * "<<s[d[0]]<<" * "<<s[d[1]]<<" + "<<p[d[1]]<<" * "<<s[d[0]]<<std::endl;
  471 + memcpy(dest + dest_idx, src + src_idx, x_bytes);
  472 + }
  473 + }
  474 + }
  475 + else{ //loop through every damn point
  476 + size_t x, y, z;
  477 + size_t src_idx, dest_idx;
  478 + size_t src_z, src_y;
  479 + for(z = 0; z < sz; z++){
  480 + p[2] = z;
  481 + src_z = z * sx * sy;
  482 + for(y = 0; y < sy; y++){
  483 + p[1] = y;
  484 + src_y = src_z + y * sx;
  485 + for(x = 0; x < sx; x++){
  486 + p[0] = x;
  487 + src_idx = src_y + x;
  488 + dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]] + p[d[0]];
  489 + dest[dest_idx] = src[src_idx];
  490 + }
  491 + }
  492 + }
  493 + }
  494 + }
  495 +
378 496 };
379 497  
380 498 }
... ...
stim/envi/bip.h
... ... @@ -373,7 +373,7 @@ public:
373 373 for(size_t xy = 0; xy < XY; xy++){ //for each pixel
374 374 memset(spec, 0, Bb); //set the spectrum to zero
375 375 if(mask == NULL || mask[xy]){ //if the pixel is masked
376   - len = 0; //initialize the
  376 + len = 0; //initialize the
377 377 file.read((char*)spec, Bb); //read a spectrum
378 378 for(size_t b = 0; b < B; b++) //for each band
379 379 len += spec[b]*spec[b]; //add the square of the spectral band
... ... @@ -385,7 +385,7 @@ public:
385 385 file.seekg(Bb, std::ios::cur); //otherwise skip a spectrum
386 386 target.write((char*)spec, Bb); //output the normalized spectrum
387 387 if(PROGRESS) progress = (double)(xy + 1) / (double)XY * 100; //update the progress
388   - }
  388 + }
389 389 }
390 390  
391 391  
... ... @@ -1088,6 +1088,232 @@ public:
1088 1088 return true;
1089 1089 }
1090 1090  
  1091 +
  1092 +#ifdef CUDA_FOUND
  1093 + /// Calculate the covariance matrix of Noise for masked pixels using cuBLAS
  1094 + /// Note that cuBLAS only supports integer-sized arrays, so there may be issues with large spectra
  1095 + bool coNoise_matrix_cublas(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
  1096 +
  1097 + cudaError_t cudaStat;
  1098 + cublasStatus_t stat;
  1099 + cublasHandle_t handle;
  1100 +
  1101 + progress = 0; //initialize the progress to zero (0)
  1102 + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
  1103 + unsigned long long B = Z(); //calculate the number of spectral elements
  1104 +
  1105 + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
  1106 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1107 +
  1108 + double* s2_dev; // device pointer on the GPU
  1109 + cudaStat = cudaMalloc(&s2_dev, B * sizeof(double)); // allocate space on the CUDA device
  1110 + cudaStat = cudaMemset(s2_dev, 0, B * sizeof(double)); // initialize s2_dev to zero (0)
  1111 +
  1112 + double* A_dev; //declare a device pointer that will store the covariance matrix on the GPU
  1113 + double* avg_dev; //declare a device pointer that will store the average spectrum
  1114 + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum
  1115 + cudaStat = cudaMalloc(&A_dev, B * B * sizeof(double)); //allocate space on the CUDA device for the covariance matrix
  1116 + cudaStat = cudaMemset(A_dev, 0, B * B * sizeof(double)); //initialize the covariance matrix to zero (0)
  1117 + cudaStat = cudaMalloc(&avg_dev, B * sizeof(double)); //allocate space on the CUDA device for the average spectrum
  1118 + stat = cublasSetVector((int)B, sizeof(double), avg, 1, avg_dev, 1); //copy the average spectrum to the CUDA device
  1119 +
  1120 + double ger_alpha = 1.0/(double)XY; //scale the outer product by the inverse of the number of samples (mean outer product)
  1121 + double axpy_alpha = -1; //multiplication factor for the average spectrum (in order to perform a subtraction)
  1122 +
  1123 + stat = cublasCreate(&handle); //create a cuBLAS instance
  1124 + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
  1125 + printf ("CUBLAS initialization failed\n");
  1126 + return EXIT_FAILURE;
  1127 + }
  1128 + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
  1129 + if (mask == NULL || mask[xy] != 0){
  1130 + pixeld(s, xy); //retreive the spectrum at the current xy pixel location
  1131 +
  1132 + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
  1133 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, avg_dev, 1, s_dev, 1); //subtract the average spectrum
  1134 +
  1135 + cudaMemcpy(s2_dev, s_dev + 1 , (B-1) * sizeof(double), cudaMemcpyDeviceToDevice); //copy B-1 elements from shifted source data (s_dev) to device pointer (s2_dev )
  1136 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, s2_dev, 1, s_dev, 1); //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
  1137 +
  1138 +
  1139 + stat = cublasDsyr(handle, CUBLAS_FILL_MODE_UPPER, (int)B, &ger_alpha, s_dev, 1, A_dev, (int)B); //calculate the covariance matrix (symmetric outer product)
  1140 + }
  1141 + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress
  1142 +
  1143 + }
  1144 +
  1145 + cublasGetMatrix((int)B, (int)B, sizeof(double), A_dev, (int)B, coN, (int)B); //copy the result from the GPU to the CPU
  1146 +
  1147 + cudaFree(A_dev); //clean up allocated device memory
  1148 + cudaFree(s_dev);
  1149 + cudaFree(s2_dev);
  1150 + cudaFree(avg_dev);
  1151 +
  1152 + for(unsigned long long i = 0; i < B; i++){ //copy the upper triangular portion to the lower triangular portion
  1153 + for(unsigned long long j = i+1; j < B; j++){
  1154 + coN[B * i + j] = coN[B * j + i];
  1155 + }
  1156 + }
  1157 +
  1158 + return true;
  1159 + }
  1160 +#endif
  1161 +
  1162 + /// Calculate the covariance of noise matrix for all masked pixels in the image with 64-bit floating point precision.
  1163 +
  1164 + /// @param coN is a pointer to pre-allocated memory of size [B * B] that stores the resulting covariance matrix
  1165 + /// @param avg is a pointer to memory of size B that stores the average spectrum
  1166 + /// @param mask is a pointer to memory of size [X * Y] that stores the mask value at each pixel location
  1167 + bool coNoise_matrix(double* coN, double* avg, unsigned char *mask, bool PROGRESS = false){
  1168 +
  1169 +#ifdef CUDA_FOUND
  1170 + int dev_count;
  1171 + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
  1172 + cudaDeviceProp prop;
  1173 + cudaGetDeviceProperties(&prop, 0); //get the property of the first device
  1174 + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
  1175 + return coNoise_matrix_cublas(coN, avg, mask, PROGRESS); //use cuBLAS to calculate the covariance matrix
  1176 +#endif
  1177 +
  1178 +
  1179 +
  1180 + progress = 0;
  1181 + //memory allocation
  1182 + unsigned long long XY = X() * Y();
  1183 + unsigned long long B = Z();
  1184 + T* temp = (T*)malloc(sizeof(T) * B);
  1185 +
  1186 + unsigned long long count = nnz(mask); //count the number of masked pixels
  1187 +
  1188 + //initialize covariance matrix of noise
  1189 + memset(coN, 0, B * B * sizeof(double));
  1190 +
  1191 + //calculate covariance matrix
  1192 + double* coN_half = (double*) malloc(B * B * sizeof(double)); //allocate space for a higher-precision intermediate matrix
  1193 + double* temp_precise = (double*) malloc(B * sizeof(double));
  1194 + memset(coN_half, 0, B * B * sizeof(double)); //initialize the high-precision matrix with zeros
  1195 + unsigned long long idx; //stores i*B to speed indexing
  1196 + for (unsigned long long xy = 0; xy < XY; xy++){
  1197 + if (mask == NULL || mask[xy] != 0){
  1198 + pixel(temp, xy); //retreive the spectrum at the current xy pixel location
  1199 + for(unsigned long long b = 0; b < B; b++) //subtract the mean from this spectrum and increase the precision
  1200 + temp_precise[b] = (double)temp[b] - (double)avg[b];
  1201 +
  1202 + for(unsigned long long b2 = 0; b2 < B-1; b2++) //Minimum/Maximum Autocorrelation Factors (MAF) method : subtranct each pixel from adjacent pixel (z direction is choosed to do so , which is almost the same as x or y direction or even average of them )
  1203 + temp_precise[b2] -= temp_precise[b2+1];
  1204 +
  1205 + idx = 0;
  1206 + for (unsigned long long b0 = 0; b0 < B; b0++){ //for each band
  1207 + for (unsigned long long b1 = b0; b1 < B; b1++)
  1208 + coN_half[idx++] += temp_precise[b0] * temp_precise[b1];
  1209 + }
  1210 + }
  1211 + if(PROGRESS) progress = (double)(xy+1) / XY * 100;
  1212 + }
  1213 + idx = 0;
  1214 + for (unsigned long long i = 0; i < B; i++){ //copy the precision matrix to both halves of the output matrix
  1215 + for (unsigned long long j = i; j < B; j++){
  1216 + coN[j * B + i] = coN[i * B + j] = coN_half[idx++] / (double) count;
  1217 + }
  1218 + }
  1219 +
  1220 + free(temp);
  1221 + free(temp_precise);
  1222 + return true;
  1223 + }
  1224 +
  1225 + #ifdef CUDA_FOUND
  1226 + /// Project the spectra onto a set of basis functions
  1227 + /// @param outfile is the name of the new binary output file that will be created
  1228 + /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
  1229 + /// @param basis a set of basis vectors that the data set will be projected onto (after centering)
  1230 + /// @param M is the number of basis vectors
  1231 + /// @param mask is a character mask used to limit processing to valid pixels
  1232 + bool project_cublas(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
  1233 +
  1234 + cudaError_t cudaStat;
  1235 + cublasStatus_t stat;
  1236 + cublasHandle_t handle;
  1237 +
  1238 + std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
  1239 +
  1240 + progress = 0; //initialize the progress to zero (0)
  1241 + unsigned long long XY = X() * Y(); //calculate the number of elements in a band image
  1242 + unsigned long long B = Z(); //calculate the number of spectral elements
  1243 +
  1244 + double* s = (double*)malloc(sizeof(double) * B); //allocate space for the spectrum that will be pulled from the file
  1245 + double* s_dev; //declare a device pointer that will store the spectrum on the GPU
  1246 + cudaStat = cudaMalloc(&s_dev, B * sizeof(double)); //allocate space on the CUDA device for the spectrum
  1247 +
  1248 +
  1249 + double* basis_dev; // device pointer on the GPU
  1250 + cudaStat = cudaMalloc(&basis_dev, M * B * sizeof(double)); // allocate space on the CUDA device
  1251 + cudaStat = cudaMemset(basis_dev, 0, M * B * sizeof(double)); // initialize basis_dev to zero (0)
  1252 +
  1253 +
  1254 + /// transposing basis matrix (because cuBLAS is column-major)
  1255 + double *basis_Transposed = (double*)malloc(M * B * sizeof(double));
  1256 + memset(basis_Transposed, 0, M * B * sizeof(double));
  1257 + for (int i = 0; i<M; i++)
  1258 + for (int j = 0; j<B; j++)
  1259 + basis_Transposed[i+j*M] = basis[i*B+j];
  1260 +
  1261 + stat = cublasSetMatrix((int)M, (int)B, sizeof(double),basis_Transposed, (int)M, basis_dev, (int)M); //copy the basis_Transposed matrix to the CUDA device (both matrices are stored in column-major format)
  1262 +
  1263 + double* center_dev; //declare a device pointer that will store the center (average)
  1264 + cudaStat = cudaMalloc(&center_dev, B * sizeof(double)); //allocate space on the CUDA device for the center (average)
  1265 + stat = cublasSetVector((int)B, sizeof(double), center, 1, center_dev, 1); //copy the center vector (average) to the CUDA device (from host to device)
  1266 +
  1267 +
  1268 + double* A = (double*)malloc(sizeof(double) * M); //allocate space for the projected pixel on the host
  1269 + double* A_dev; //declare a device pointer that will store the projected pixel on the GPU
  1270 + cudaStat = cudaMalloc(&A_dev,M * sizeof(double)); //allocate space on the CUDA device for the projected pixel
  1271 + cudaStat = cudaMemset(A_dev, 0,M * sizeof(double)); //initialize the projected pixel to zero (0)
  1272 +
  1273 + double axpy_alpha = -1; //multiplication factor for the center (in order to perform a subtraction)
  1274 + double axpy_alpha2 = 1; //multiplication factor for the matrix-vector multiplication
  1275 + double axpy_beta = 0; //multiplication factor for the matrix-vector multiplication (there is no second scalor)
  1276 +
  1277 + stat = cublasCreate(&handle); //create a cuBLAS instance
  1278 + if (stat != CUBLAS_STATUS_SUCCESS) { //test the cuBLAS instance to make sure it is valid
  1279 + printf ("CUBLAS initialization failed\n");
  1280 + return EXIT_FAILURE;
  1281 + }
  1282 +
  1283 + T* temp = (T*)malloc(sizeof(T) * M); //allocate space for the projected pixel to be written on the disc
  1284 + size_t i;
  1285 + for (unsigned long long xy = 0; xy < XY; xy++){ //for each pixel
  1286 + if (mask == NULL || mask[xy] != 0){
  1287 + pixeld(s, xy); //retreive the spectrum at the current xy pixel location
  1288 +
  1289 + stat = cublasSetVector((int)B, sizeof(double), s, 1, s_dev, 1); //copy the spectrum from the host to the device
  1290 + stat = cublasDaxpy(handle, (int)B, &axpy_alpha, center_dev, 1, s_dev, 1); //subtract the center (average)
  1291 + stat = cublasDgemv(handle,CUBLAS_OP_N,(int)M,(int)B,&axpy_alpha2,basis_dev,(int)M,s_dev,1,&axpy_beta,A_dev,1); //performs the matrix-vector multiplication
  1292 + stat = cublasGetVector((int)B, sizeof(double), A_dev, 1, A, 1); //copy the projected pixel to the host (from GPU to CPU)
  1293 +
  1294 + //std::copy<double*, T*>(A, A + M, temp);
  1295 + for(i = 0; i < M; i++) temp[i] = (T)A[i]; //casting projected pixel from double to whatever T is
  1296 + }
  1297 +
  1298 + target.write(reinterpret_cast<const char*>(temp), sizeof(T) * M); //write the projected vector
  1299 + if(PROGRESS) progress = (double)(xy+1) / XY * 100; //record the current progress
  1300 +
  1301 + }
  1302 +
  1303 + //clean up allocated device memory
  1304 + cudaFree(A_dev);
  1305 + cudaFree(s_dev);
  1306 + cudaFree(basis_dev);
  1307 + cudaFree(center_dev);
  1308 + free(A);
  1309 + free(s);
  1310 + free(temp);
  1311 + target.close(); //close the output file
  1312 +
  1313 + return true;
  1314 + }
  1315 +#endif
  1316 +
1091 1317 /// Project the spectra onto a set of basis functions
1092 1318 /// @param outfile is the name of the new binary output file that will be created
1093 1319 /// @param center is a spectrum about which the data set will be rotated (ex. when performing mean centering)
... ... @@ -1096,6 +1322,14 @@ public:
1096 1322 /// @param mask is a character mask used to limit processing to valid pixels
1097 1323 bool project(std::string outfile, double* center, double* basis, unsigned long long M, unsigned char* mask = NULL, bool PROGRESS = false){
1098 1324  
  1325 +#ifdef CUDA_FOUND
  1326 + int dev_count;
  1327 + cudaGetDeviceCount(&dev_count); //get the number of CUDA devices
  1328 + cudaDeviceProp prop;
  1329 + cudaGetDeviceProperties(&prop, 0); //get the property of the first device
  1330 + if(dev_count > 0 && prop.major != 9999) //if the first device is not an emulator
  1331 + return project_cublas(outfile,center,basis,M,mask,PROGRESS); //use cuBLAS to calculate the covariance matrix
  1332 +#endif
1099 1333 std::ofstream target(outfile.c_str(), std::ios::binary); //open the target binary file
1100 1334 //std::string headername = outfile + ".hdr"; //the header file name
1101 1335  
... ... @@ -1125,7 +1359,7 @@ public:
1125 1359 free(s); //free temporary storage arrays
1126 1360 free(rs);
1127 1361 target.close(); //close the output file
1128   -
  1362 +
1129 1363 return true;
1130 1364 }
1131 1365  
... ... @@ -1395,6 +1629,52 @@ public:
1395 1629 }
1396 1630 }
1397 1631  
  1632 + bool multiply(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
  1633 + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file
  1634 + std::string headername = outname + ".hdr"; //the header file name
  1635 +
  1636 + unsigned long long N = X() * Y(); //calculate the total number of pixels to be processed
  1637 + unsigned long long B = Z(); //get the number of bands
  1638 + T* s = (T*)malloc(sizeof(T) * B); //allocate memory to store a pixel
  1639 + for(unsigned long long n = 0; n < N; n++){ //for each pixel in the image
  1640 + if(mask == NULL || mask[n]){ //if the pixel is masked
  1641 + for(size_t b = 0; b < B; b++) //for each band in the spectrum
  1642 + s[b] *= (T)v; //multiply
  1643 + }
  1644 +
  1645 + if(PROGRESS) progress = (double)(n+1) / N * 100; //set the current progress
  1646 +
  1647 + target.write((char*)s, sizeof(T) * B); //write the corrected data into destination
  1648 + } //end for each pixel
  1649 +
  1650 + free(s); //free the spectrum
  1651 + target.close(); //close the output file
  1652 + return true;
  1653 + }
  1654 +
  1655 + bool add(std::string outname, double v, unsigned char* mask = NULL, bool PROGRESS = false){
  1656 + std::ofstream target(outname.c_str(), std::ios::binary); //open the target binary file
  1657 + std::string headername = outname + ".hdr"; //the header file name
  1658 +
  1659 + unsigned long long N = X() * Y(); //calculate the total number of pixels to be processed
  1660 + unsigned long long B = Z(); //get the number of bands
  1661 + T* s = (T*)malloc(sizeof(T) * B); //allocate memory to store a pixel
  1662 + for(unsigned long long n = 0; n < N; n++){ //for each pixel in the image
  1663 + if(mask == NULL || mask[n]){ //if the pixel is masked
  1664 + for(size_t b = 0; b < B; b++) //for each band in the spectrum
  1665 + s[b] += (T)v; //multiply
  1666 + }
  1667 +
  1668 + if(PROGRESS) progress = (double)(n+1) / N * 100; //set the current progress
  1669 +
  1670 + target.write((char*)s, sizeof(T) * B); //write the corrected data into destination
  1671 + } //end for each pixel
  1672 +
  1673 + free(s); //free the spectrum
  1674 + target.close(); //close the output file
  1675 + return true;
  1676 + }
  1677 +
1398 1678  
1399 1679  
1400 1680 /// Close the file.
... ...
stim/envi/bsq.h
... ... @@ -9,6 +9,7 @@
9 9 #include <vector>
10 10 #include <deque>
11 11 #include <chrono>
  12 +#include <future>
12 13  
13 14  
14 15  
... ... @@ -376,36 +377,144 @@ public:
376 377  
377 378 }
378 379  
379   - /// Convert the current BSQ file to a BIL file with the specified file name.
380   -
381   - /// @param outname is the name of the output BIL file to be saved to disk.
382   - bool bil(std::string outname, bool PROGRESS = false)
383   - {
384   - //simplify image resolution
385   - unsigned long long jump = (Y() - 1) * X() * sizeof(T);
  380 + void readlines(T* dest, size_t start, size_t n){
  381 + hsi<T>::read(dest, 0, start, 0, X(), n, Z());
  382 + }
386 383  
387   - std::ofstream target(outname.c_str(), std::ios::binary);
388   - std::string headername = outname + ".hdr";
  384 + /// Convert this BSQ file to a BIL
  385 + bool bil(std::string outname, bool PROGRESS = false){
389 386  
390   - unsigned long long L = X();
391   - T* line = (T*)malloc(sizeof(T) * L);
  387 + const size_t buffers = 4; //number of buffers required for this algorithm
  388 + size_t mem_per_batch = binary<T>::buffer_size / buffers; //calculate the maximum memory available for a batch
392 389  
393   - for ( unsigned long long y = 0; y < Y(); y++) //for each y position
394   - {
395   - file.seekg(y * X() * sizeof(T), std::ios::beg); //seek to the beginning of the xz slice
396   - for ( unsigned long long z = 0; z < Z(); z++ ) //for each band
397   - {
398   - file.read((char *)line, sizeof(T) * X()); //read a line
399   - target.write((char*)line, sizeof(T) * X()); //write the line to the output file
400   - file.seekg(jump, std::ios::cur); //seek to the next band
401   - if(PROGRESS) progress = (double)((y+1) * Z() + z + 1) / (Z() * Y()) * 100; //update the progress counter
  390 + size_t slice_bytes = X() * Z() * sizeof(T); //number of bytes in an input batch slice (Y-slice in this case)
  391 + size_t max_slices_per_batch = mem_per_batch / slice_bytes; //maximum number of slices we can process in one batch given memory constraints
  392 + if(max_slices_per_batch == 0){ //if there is insufficient memory for a single slice, throw an error
  393 + std::cout<<"error, insufficient memory for stim::bsq::bil()"<<std::endl;
  394 + exit(1);
  395 + }
  396 + size_t max_batch_bytes = max_slices_per_batch * slice_bytes; //calculate the amount of memory that will be allocated for all four buffers
  397 +
  398 + T* src[2]; //source double-buffer for asynchronous batching
  399 + src[0] = (T*) malloc(max_batch_bytes);
  400 + src[1] = (T*) malloc(max_batch_bytes);
  401 + T* dst[2]; //destination double-buffer for asynchronous batching
  402 + dst[0] = (T*) malloc(max_batch_bytes);
  403 + dst[1] = (T*) malloc(max_batch_bytes);
  404 +
  405 + size_t N[2]; //number of slices stored in buffers 0 and 1
  406 + N[0] = N[1] = min(Y(), max_slices_per_batch); //start with the maximum number of slices that can be stored (may be the entire data set)
  407 +
  408 + std::ofstream target(outname.c_str(), std::ios::binary); //open an ou