Commit ca99f95130a6562cb8827cbcd69c163b86592449

Authored by David Mayerich
1 parent 63d34974

faster implementation for stim::ivote

stim/cuda/arraymath.cuh
... ... @@ -10,8 +10,8 @@
10 10 #include <stim/cuda/arraymath/array_atan.cuh>
11 11 #include <stim/cuda/arraymath/array_abs.cuh>
12 12 #include <stim/cuda/arraymath/array_cart2polar.cuh>
13   -#include <stim/cuda/arraymath/array_threshold.cuh>
14   -#include <stim/cuda/arraymath/array_add3.cuh>
  13 +//#include <stim/cuda/arraymath/array_threshold.cuh>
  14 +//#include <stim/cuda/arraymath/array_add3.cuh>
15 15 namespace stim{
16 16 namespace cuda{
17 17  
... ...
stim/cuda/ivote/david_update_dir_global.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
  2 +#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
  3 +
  4 +# include <iostream>
  5 +# include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <math.h>
  9 +#include "cpyToshare.cuh"
  10 +
  11 +#define RMAX_TEST 8
  12 +
  13 +namespace stim{
  14 + namespace cuda{
  15 +
  16 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  17 + template<typename T>
  18 + __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  19 + extern __shared__ T atan2_table[];
  20 +
  21 + //calculate the start point for this block
  22 + //int bxi = blockIdx.x * blockDim.x;
  23 +
  24 + stim::cuda::sharedMemcpy(atan2_table, gpuTable, (2 * rmax + 1) * (2 * rmax + 1), threadIdx.x, blockDim.x);
  25 +
  26 + __syncthreads();
  27 +
  28 + // calculate the 2D coordinates for this current thread.
  29 + //int xi = bxi + threadIdx.x;
  30 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  31 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  32 + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
  33 +
  34 + int i = yi * x + xi; // convert 2D coordinates to 1D
  35 +
  36 + float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
  37 + gpuDir[i] = 0; //initialize the vote direction to zero
  38 + float max = 0; // define a local variable to maximum value of the vote image in the voting area for this voter
  39 + int id_x = 0; // define two local variables for the x and y position of the maximum
  40 + int id_y = 0;
  41 +
  42 + int x_table = 2*rmax +1; // compute the size of window which will be checked for finding the voting area for this voter
  43 + int rmax_sq = rmax * rmax;
  44 + int tx_rmax = threadIdx.x + rmax;
  45 + float atan_angle;
  46 + float vote_c;
  47 + unsigned int ind_t;
  48 + for(int yr = -rmax; yr <= rmax; yr++){ //for each counter in the y direction
  49 + if (yi+yr >= 0 && yi + yr < y){ //if the counter exists (we aren't looking outside of the image)
  50 + for(int xr = -rmax; xr <= rmax; xr++){ //for each counter in the x direction
  51 + if((xr * xr + yr *yr)< rmax_sq){ //if the counter is within range of the voter
  52 +
  53 + ind_t = (rmax - yr) * x_table + rmax - xr; //calculate the index to the atan2 table
  54 + atan_angle = atan2_table[ind_t]; //retrieve the direction vector from the table
  55 +
  56 + //atan_angle = atan2((float)yr, (float)xr);
  57 +
  58 + if (abs(atan_angle - theta) <phi){ // check if the current pixel is located in the voting angle of this voter.
  59 + vote_c = gpuVote[(yi+yr)*x + (xi+xr)]; // find the vote value for the current counter
  60 + if(vote_c>max) { // compare the vote value of this pixel with the max value to find the maxima and its index.
  61 + max = vote_c;
  62 + id_x = xr;
  63 + id_y = yr;
  64 + }
  65 + }
  66 + }
  67 + }
  68 + }
  69 + }
  70 +
  71 + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x);
  72 + float new_angle = gpuTable[ind_m];
  73 +
  74 + if(xi < x && yi < y)
  75 + gpuDir[i] = new_angle;
  76 + } //end kernel
  77 +
  78 + // this kernel updates the gradient direction by the calculated voting direction.
  79 + template<typename T>
  80 + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){
  81 +
  82 + // calculate the 2D coordinates for this current thread.
  83 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  84 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  85 +
  86 + // convert 2D coordinates to 1D
  87 + int i = yi * x + xi;
  88 +
  89 + //update the gradient image with the vote direction
  90 + gpuGrad[2*i] = gpuDir[i];
  91 + }
  92 +
  93 + template<typename T>
  94 + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  95 +
  96 +
  97 +
  98 + //calculate the number of bytes in the array
  99 + unsigned int bytes = x * y * sizeof(T);
  100 +
  101 + unsigned int max_threads = stim::maxThreadsPerBlock();
  102 +
  103 + dim3 threads(sqrt(max_threads), sqrt(max_threads));
  104 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  105 +
  106 +
  107 +
  108 + // allocate space on the GPU for the updated vote direction
  109 + T* gpuDir;
  110 + cudaMalloc(&gpuDir, bytes);
  111 +
  112 + size_t shared_mem = sizeof(T) * std::pow((2 * rmax + 1), 2);
  113 + std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;
  114 +
  115 + //call the kernel to calculate the new voting direction
  116 + cuda_update_dir <<< blocks, threads, shared_mem>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  117 +
  118 + //call the kernel to update the gradient direction
  119 + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
  120 +
  121 + //free allocated memory
  122 + cudaFree(gpuDir);
  123 +
  124 + }
  125 +
  126 + template<typename T>
  127 + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){
  128 +
  129 + //calculate the number of bytes in the array
  130 + unsigned int bytes = x * y * sizeof(T);
  131 +
  132 + //calculate the number of bytes in the atan2 table
  133 + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T);
  134 +
  135 + //allocate space on the GPU for the Vote Image
  136 + T* gpuVote;
  137 + cudaMalloc(&gpuVote, bytes);
  138 +
  139 + //copy the input vote image to the GPU
  140 + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));
  141 +
  142 + //allocate space on the GPU for the input Gradient image
  143 + T* gpuGrad;
  144 + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2));
  145 +
  146 + //copy the Gradient data to the GPU
  147 + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice));
  148 +
  149 + //allocate space on the GPU for the atan2 table
  150 + T* gpuTable;
  151 + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
  152 +
  153 + //copy the atan2 values to the GPU
  154 + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));
  155 +
  156 + //call the GPU version of the update direction function
  157 + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  158 +
  159 + //copy the new gradient image back to the CPU
  160 + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ;
  161 +
  162 + //free allocated memory
  163 + cudaFree(gpuTable);
  164 + cudaFree(gpuVote);
  165 + cudaFree(gpuGrad);
  166 + }
  167 +
  168 + }
  169 +}
  170 +
  171 +#endif
0 172 \ No newline at end of file
... ...
stim/cuda/ivote/update_dir_global.cuh
... ... @@ -5,25 +5,88 @@
5 5 # include <cuda.h>
6 6 #include <stim/cuda/cudatools.h>
7 7 #include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/visualization/aabb2.h>
  9 +#include <stim/visualization/colormap.h>
  10 +#include <math.h>
8 11 #include "cpyToshare.cuh"
9 12  
10   -#define RMAX_TEST 8
  13 +//#define RMAX_TEST 8
11 14  
12 15 namespace stim{
13 16 namespace cuda{
14   -
15   - // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  17 +
16 18 template<typename T>
17 19 __global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  20 + extern __shared__ T S[];
  21 + T* shared_atan = S;
  22 + size_t n_table = (rmax * 2 + 1) * (rmax * 2 + 1);
  23 + stim::cuda::threadedMemcpy((char*)shared_atan, (char*)gpuTable, sizeof(T) * n_table, threadIdx.x, blockDim.x);
  24 +
  25 + //T* shared_vote = &S[n_table];
  26 + //size_t template_size_x = (blockDim.x + 2 * rmax);
  27 + //size_t template_size_y = (blockDim.y + 2 * rmax);
  28 + //stim::cuda::threadedMemcpy2D((char*)shared_vote, (char*)gpuVote, template_size_x, template_size_y, x, threadIdx.y * blockDim.x + threadIdx.x, blockDim.x * blockDim.y);
  29 +
  30 + int xi = blockIdx.x * blockDim.x + threadIdx.x; //calculate the 2D coordinates for this current thread.
  31 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
18 32  
  33 + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
  34 +
  35 + int i = yi * x + xi; //convert 2D coordinates to 1D
  36 + float theta = gpuGrad[2*i]; //calculate the voting direction based on the grtadient direction - global memory fetch
19 37  
20   - //calculate the start point for this block
21   - int bxi = blockIdx.x * blockDim.x;
  38 + stim::aabb2<int> bb(xi, yi); //initialize a bounding box at the current point
  39 + bb.insert(xi + ceil(rmax * cos(theta)), ceil(yi + rmax * sin(theta)));
  40 + bb.insert(xi + ceil(rmax * cos(theta - phi)), yi + ceil(rmax * sin(theta - phi))); //insert one corner of the triangle into the bounding box
  41 + bb.insert(xi + ceil(rmax * cos(theta + phi)), yi + ceil(rmax * sin(theta + phi))); //insert the final corner into the bounding box
  42 +
  43 + int x_table = 2*rmax +1;
  44 + int lut_i;
  45 + T rmax_sq = rmax * rmax;
  46 + T dx_sq, dy_sq;
  47 +
  48 + bb.trim_low(0, 0); //make sure the bounding box doesn't go outside the image
  49 + bb.trim_high(x-1, y-1);
  50 +
  51 + int by, bx;
  52 + int dx, dy; //coordinate relative to (xi, yi)
  53 + T v;
  54 + T max_v = 0; //initialize the maximum vote value to zero
  55 + T alpha;
  56 + int max_dx = bb.low[0];
  57 + int max_dy = bb.low[1];
  58 + for(by = bb.low[1]; by <= bb.high[1]; by++){ //for each element in the bounding box
  59 + dy = by - yi; //calculate the y coordinate of the current point relative to yi
  60 + dy_sq = dy * dy;
  61 + for(bx = bb.low[0]; bx <= bb.high[0]; bx++){
  62 + dx = bx - xi;
  63 + dx_sq = dx * dx;
  64 + lut_i = (rmax - dy) * x_table + rmax - dx;
  65 + alpha = shared_atan[lut_i];
  66 + if(dx_sq + dy_sq < rmax_sq && abs(alpha - theta) < phi){
  67 + v = gpuVote[by * x + bx]; // find the vote value for the current counter
  68 + if(v > max_v){
  69 + max_v = v;
  70 + max_dx = dx;
  71 + max_dy = dy;
  72 + }
  73 + }
  74 + }
  75 + }
  76 + gpuDir[i] = atan2((T)max_dy, (T)max_dx);
  77 + }
  78 +
  79 + // this kernel calculates the voting direction for the next iteration based on the angle between the location of this voter and the maximum vote value in its voting area.
  80 + template<typename T>
  81 + __global__ void leila_cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){
  82 +
22 83  
23 84 // calculate the 2D coordinates for this current thread.
24   - int xi = bxi + threadIdx.x;
25   - if(xi >= x) return; //if the index is outside of the image, terminate the kernel
26   - int yi = blockIdx.y * blockDim.y + threadIdx.y;
  85 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  86 + int yi = blockIdx.y * blockDim.y + threadIdx.y;
  87 +
  88 + if(xi >= x || yi >= y) return; //if the index is outside of the image, terminate the kernel
  89 +
27 90 int i = yi * x + xi; // convert 2D coordinates to 1D
28 91  
29 92 float theta = gpuGrad[2*i]; // calculate the voting direction based on the grtadient direction - global memory fetch
... ... @@ -37,27 +100,32 @@ namespace stim{
37 100 int tx_rmax = threadIdx.x + rmax;
38 101 float atan_angle;
39 102 float vote_c;
40   - for(int yr = -RMAX_TEST; yr <= RMAX_TEST; yr++){
41   - if (yi+yr >= 0 && yi + yr < y){
42   - for(int xr = -RMAX_TEST; xr <= RMAX_TEST; xr++){
43   -
44   - unsigned int ind_t = (RMAX_TEST - yr) * x_table + RMAX_TEST - xr;
45   -
46   - // calculate the angle between the voter and the current pixel in x and y directions
47   - atan_angle = gpuTable[ind_t];
48   -
49   - // find the vote value for the current counter
50   - vote_c = gpuVote[(yi+yr)*x + (xi+xr)];
51   -
52   - // check if the current pixel is located in the voting area of this voter.
53   - if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){
54   -
55   - // compare the vote value of this pixel with the max value to find the maxima and its index.
56   - if (vote_c>max) {
57   -
58   - max = vote_c;
59   - id_x = xr;
60   - id_y = yr;
  103 + int xidx, yidx, yr_sq, xr_sq;
  104 + for(int yr = -rmax; yr <= rmax; yr++){
  105 + yidx = yi + yr; //compute the index into the image
  106 + if (yidx >= 0 && yidx < y){ //if the current y-index is inside the image
  107 + yr_sq = yr * yr; //compute the square of yr, to save time later
  108 + for(int xr = -rmax; xr <= rmax; xr++){
  109 + xidx = xi + xr;
  110 + if(xidx >= 0 && xidx < x){
  111 + xr_sq = xr * xr;
  112 + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr;
  113 +
  114 + // calculate the angle between the voter and the current pixel in x and y directions
  115 + atan_angle = gpuTable[ind_t];
  116 + //atan_angle = atan2((T)yr, (T)xr);
  117 +
  118 + // check if the current pixel is located in the voting area of this voter.
  119 + if (((xr_sq + yr_sq)< rmax_sq) && (abs(atan_angle - theta) <phi)){
  120 +
  121 + vote_c = gpuVote[yidx * x + xidx]; // find the vote value for the current counter
  122 + // compare the vote value of this pixel with the max value to find the maxima and its index.
  123 + if (vote_c>max) {
  124 +
  125 + max = vote_c;
  126 + id_x = xr;
  127 + id_y = yr;
  128 + }
61 129 }
62 130 }
63 131 }
... ... @@ -70,6 +138,7 @@ namespace stim{
70 138 if(xi < x && yi < y)
71 139 gpuDir[i] = new_angle;
72 140 } //end kernel
  141 +
73 142  
74 143 // this kernel updates the gradient direction by the calculated voting direction.
75 144 template<typename T>
... ... @@ -78,6 +147,8 @@ namespace stim{
78 147 // calculate the 2D coordinates for this current thread.
79 148 int xi = blockIdx.x * blockDim.x + threadIdx.x;
80 149 int yi = blockIdx.y * blockDim.y + threadIdx.y;
  150 +
  151 + if(xi >= x || yi >= y) return;
81 152  
82 153 // convert 2D coordinates to 1D
83 154 int i = yi * x + xi;
... ... @@ -91,23 +162,43 @@ namespace stim{
91 162  
92 163 //calculate the number of bytes in the array
93 164 unsigned int bytes = x * y * sizeof(T);
94   -
95   - unsigned int max_threads = stim::maxThreadsPerBlock();
96   - dim3 threads(max_threads, 1);
97   - dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
98 165  
99 166 // allocate space on the GPU for the updated vote direction
100 167 T* gpuDir;
101   - cudaMalloc(&gpuDir, bytes);
  168 + HANDLE_ERROR( cudaMalloc(&gpuDir, bytes) );
  169 +
  170 + unsigned int max_threads = stim::maxThreadsPerBlock();
  171 + //dim3 threads(min(x, max_threads), 1);
  172 + //dim3 blocks(x/threads.x, y);
  173 +
  174 + dim3 threads( sqrt(max_threads), sqrt(max_threads) );
  175 + dim3 blocks(x/threads.x + 1, y/threads.y + 1);
  176 +
  177 + size_t table_bytes = sizeof(T) * (rmax * 2 + 1) * (rmax * 2 + 1);
  178 + //size_t curtain = 2 * rmax;
  179 + //size_t template_bytes = sizeof(T) * (threads.x + curtain) * (threads.y + curtain);
  180 + size_t shared_mem_req = table_bytes;// + template_bytes;
  181 + std::cout<<"Shared Memory required: "<<shared_mem_req<<std::endl;
  182 +
  183 + size_t shared_mem = stim::sharedMemPerBlock();
  184 + if(shared_mem_req > shared_mem){
  185 + std::cout<<"Error: insufficient shared memory for this implementation of cuda_update_dir()."<<std::endl;
  186 + exit(1);
  187 + }
102 188  
103 189 //call the kernel to calculate the new voting direction
104   - cuda_update_dir <<< blocks, threads>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  190 + cuda_update_dir <<< blocks, threads, shared_mem_req>>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x , y);
  191 + stim::gpu2image<T>(gpuDir, "dir_david.bmp", x, y, -pi, pi, stim::cmBrewer);
  192 +
  193 + //exit(0);
  194 +
  195 + threads = dim3( sqrt(max_threads), sqrt(max_threads) );
  196 + blocks = dim3(x/threads.x + 1, y/threads.y + 1);
105 197  
106 198 //call the kernel to update the gradient direction
107 199 cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y);
108   -
109 200 //free allocated memory
110   - cudaFree(gpuDir);
  201 + HANDLE_ERROR( cudaFree(gpuDir) );
111 202  
112 203 }
113 204  
... ...
stim/cuda/ivote_atomic.cuh
... ... @@ -6,7 +6,7 @@
6 6 #include <stim/cuda/ivote/update_dir_global.cuh>
7 7 //#include <stim/cuda/ivote/vote_shared_32-32.cuh>
8 8 #include <stim/cuda/ivote/vote_atomic_shared.cuh>
9   -#include <stim/cuda/ivote/re_sample.cuh>
  9 +//#include <stim/cuda/ivote/re_sample.cuh>
10 10 namespace stim{
11 11 namespace cuda{
12 12  
... ...
stim/cuda/sharedmem.cuh
... ... @@ -35,10 +35,8 @@ namespace stim{
35 35 }
36 36 }
37 37  
38   - // Copies values from global memory to shared memory, optimizing threads
39   - template<typename T>
40   - __device__ void sharedMemcpy(T* dest, T* src, size_t N, size_t tid, size_t nt){
41   -
  38 + // Threaded copying of data on a CUDA device.
  39 + __device__ void threadedMemcpy(char* dest, char* src, size_t N, size_t tid, size_t nt){
42 40 size_t I = N / nt + 1; //calculate the number of iterations required to make the copy
43 41 size_t xi = tid; //initialize the source and destination index to the thread ID
44 42 for(size_t i = 0; i < I; i++){ //for each iteration
... ... @@ -48,7 +46,37 @@ namespace stim{
48 46 }
49 47 }
50 48  
51   -
  49 + /// Threaded copying of 2D data on a CUDA device
  50 + /// @param dest is a linear destination array of size nx * ny
  51 + /// @param src is a 2D image stored as a linear array with a pitch of X
  52 + /// @param X is the number of bytes in a row of src
  53 + /// @param tid is a 1D id for the current thread
  54 + /// @param nt is the number of threads in the block
  55 + template<typename T>
  56 + __device__ void threadedMemcpy2D(T* dest, size_t nx, size_t ny,
  57 + T* src, size_t x, size_t y, size_t sX, size_t sY,
  58 + size_t tid, size_t nt){
  59 +
  60 + size_t vals = nx * ny; //calculate the total number of bytes to be copied
  61 + size_t I = vals / nt + 1; //calculate the number of iterations required to perform the copy
  62 +
  63 + size_t src_i, dest_i;
  64 + size_t dest_x, dest_y, src_x, src_y;
  65 + for(size_t i = 0; i < I; i++){ //for each iteration
  66 + dest_i = i * nt + tid; //calculate the index into the destination array
  67 + dest_y = dest_i / nx;
  68 + dest_x = dest_i - dest_y * nx;
  69 +
  70 + if(dest_y < ny && dest_x < nx){
  71 +
  72 + src_x = x + dest_x;
  73 + src_y = y + dest_y;
  74 +
  75 + src_i = src_y * sX + src_x;
  76 + dest[dest_i] = src[src_i];
  77 + }
  78 + }
  79 + }
52 80 }
53 81 }
54 82  
... ...
stim/cuda/templates/conv2sep.cuh
... ... @@ -30,7 +30,8 @@ namespace stim{
30 30 int byi = blockIdx.y;
31 31  
32 32 //copy the portion of the image necessary for this block to shared memory
33   - stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  33 + //stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
  34 + stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
34 35  
35 36 //calculate the thread index
36 37 int ti = threadIdx.x;
... ... @@ -88,7 +89,8 @@ namespace stim{
88 89 int byi = blockIdx.y * blockDim.y;
89 90  
90 91 //copy the portion of the image necessary for this block to shared memory
91   - stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  92 + //stim::cuda::sharedMemcpy_tex2D<float, unsigned char>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
  93 + stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
92 94  
93 95 //calculate the thread index
94 96 int ti = threadIdx.y;
... ...
stim/image/image.h
... ... @@ -160,11 +160,26 @@ public:
160 160 exit(1);
161 161 }
162 162 allocate(cvImage.cols, cvImage.rows, cvImage.channels()); //allocate space for the image
163   - T* cv_ptr = (T*)cvImage.data;
  163 + unsigned char* cv_ptr = (unsigned char*)cvImage.data;
164 164 if(C() == 1) //if this is a single-color image, just copy the data
165 165 memcpy(img, cv_ptr, bytes());
166 166 if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving
167   - set_interleaved_bgr(cv_ptr, X(), Y());
  167 + from_opencv(cv_ptr, X(), Y());
  168 + }
  169 +
  170 + void from_opencv(unsigned char* buffer, size_t width, size_t height){
  171 + allocate(width, height, 3);
  172 + T value;
  173 + size_t i;
  174 + for(size_t c = 0; c < C(); c++){ //copy directly
  175 + for(size_t y = 0; y < Y(); y++){
  176 + for(size_t x = 0; x < X(); x++){
  177 + i = y * X() * C() + x * C() + (2-c);
  178 + value = buffer[i];
  179 + img[idx(x, y, c)] = value;
  180 + }
  181 + }
  182 + }
168 183 }
169 184  
170 185 //save a file
... ... @@ -180,23 +195,35 @@ public:
180 195 cv::imwrite(filename, cvImage);
181 196 }
182 197  
  198 + void set_interleaved(T* buffer, size_t width, size_t height, size_t channels){
  199 + allocate(width, height, channels);
  200 + memcpy(img, buffer, bytes());
  201 + }
  202 +
183 203 //create an image from an interleaved buffer
184 204 void set_interleaved_rgb(T* buffer, size_t width, size_t height){
185   - allocate(width, height, 3);
186   - memcpy(img, buffer, bytes());
  205 + set_interleaved(buffer, width, height, 3);
187 206 }
188 207  
189 208 void set_interleaved_bgr(T* buffer, size_t width, size_t height){
190 209 allocate(width, height, 3);
  210 + T value;
  211 + size_t i;
191 212 for(size_t c = 0; c < C(); c++){ //copy directly
192 213 for(size_t y = 0; y < Y(); y++){
193 214 for(size_t x = 0; x < X(); x++){
194   - img[idx(x, y, c)] = buffer[y * X() * C() + x * C() + (2-c)];
  215 + i = y * X() * C() + x * C() + (2-c);
  216 + value = buffer[i];
  217 + img[idx(x, y, c)] = value;
195 218 }
196 219 }
197 220 }
198 221 }
199 222  
  223 + void set_interleaved(T* buffer, size_t width, size_t height){
  224 + set_interleaved_rgb(buffer, width, height);
  225 + }
  226 +
200 227 void get_interleaved_bgr(T* data){
201 228  
202 229 //for each channel
... ...
stim/optics/scalarfield.h
... ... @@ -71,8 +71,8 @@ public:
71 71 void to_cpu(){
72 72 if(loc == CPUmem) return;
73 73 else{
74   - stim::complex<T>* host_E = (stim::complex<T>*) malloc(e_bytes()); //allocate space in main memory
75   - HANDLE_ERROR( cudaMemcpy(host_E, E, e_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU
  74 + stim::complex<T>* host_E = (stim::complex<T>*) malloc(grid_bytes()); //allocate space in main memory
  75 + HANDLE_ERROR( cudaMemcpy(host_E, E, grid_bytes(), cudaMemcpyDeviceToHost) ); //copy from GPU to CPU
76 76 HANDLE_ERROR( cudaFree(E) ); //free device memory
77 77 E = host_E; //swap pointers
78 78 }
... ...
stim/visualization/aabb2.h 0 → 100644
  1 +#ifndef STIM_AABB2_H
  2 +#define STIM_AABB2_H
  3 +
  4 +#include <stim/cuda/cudatools/callable.h>
  5 +
  6 +namespace stim{
  7 +
  8 +/// Structure for a 2D axis aligned bounding box
  9 +template<typename T>
  10 +struct aabb2{
  11 +
  12 +//protected:
  13 +
  14 + T low[2]; //top left corner position
  15 + T high[2]; //dimensions along x and y
  16 +
  17 +//public:
  18 +
  19 + CUDA_CALLABLE aabb2(T x, T y){ //initialize an axis aligned bounding box of size 0 at the given position
  20 + low[0] = high[0] = x; //set the position to the user specified coordinates
  21 + low[1] = high[1] = y;
  22 + }
  23 +
  24 + //insert a point into the bounding box, growing the box appropriately
  25 + CUDA_CALLABLE void insert(T x, T y){
  26 + if(x < low[0]) low[0] = x;
  27 + if(y < low[1]) low[1] = y;
  28 +
  29 + if(x > high[0]) high[0] = x;
  30 + if(y > high[1]) high[1] = y;
  31 + }
  32 +
  33 + //trim the bounding box so that the lower bounds are (x, y)
  34 + CUDA_CALLABLE void trim_low(T x, T y){
  35 + if(low[0] < x) low[0] = x;
  36 + if(low[1] < y) low[1] = y;
  37 + }
  38 +
  39 + CUDA_CALLABLE void trim_high(T x, T y){
  40 + if(high[0] > x) high[0] = x;
  41 + if(high[1] > y) high[1] = y;
  42 + }
  43 +
  44 +};
  45 +
  46 +}
  47 +
  48 +
  49 +#endif
0 50 \ No newline at end of file
... ...