Commit 3173e54dd0c592a5d06da09f3e0884973e53edd2

Authored by Tianshu Cheng
2 parents 5343a315 93de94e6

Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib into bsds500

stim/cuda/arraymath.cuh
1 1 #ifndef STIM_CUDA_ARRAYMATH_H
2 2 #define STIM_CUDA_ARRAYMATH_H
3 3  
4   -#include <stim/cuda/array_add.cuh>
5   -#include <stim/cuda/array_multiply.cuh>
6   -#include <stim/cuda/array_abs.cuh>
7   -#include <stim/cuda/array_cart2polar.cuh>
8   -#include <stim/cuda/gaussian_blur.cuh>
9   -#include <stim/cuda/conv2.cuh>
  4 +#include <stim/cuda/arraymath/array_add.cuh>
  5 +#include <stim/cuda/arraymath/array_multiply.cuh>
  6 +#include <stim/cuda/arraymath/array_abs.cuh>
  7 +#include <stim/cuda/arraymath/array_cart2polar.cuh>
10 8  
11 9 namespace stim{
12 10 namespace cuda{
... ...
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
... ... @@ -3,8 +3,7 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/error.h>
  6 +#include <stim/cuda/cudatools.h>
8 7  
9 8 namespace stim{
10 9 namespace cuda{
... ...
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
... ... @@ -3,8 +3,7 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/error.h>
  6 +#include <stim/cuda/cudatools.h>
8 7  
9 8 namespace stim{
10 9 namespace cuda{
... ...
stim/cuda/cost.h
... ... @@ -3,11 +3,11 @@
3 3 #include <cuda_runtime.h>
4 4 #include <cublas_v2.h>
5 5 #include <stdio.h>
6   -#include "../visualization/colormap.h"
  6 +#include <stim/visualization/colormap.h>
7 7 #include <sstream>
8   -#include "../math/vector.h"
9   -#include "../cuda/devices.h"
10   -#include "../cuda/threads.h"
  8 +#include <stim/math/vector.h>
  9 +#include <stim/cuda/cudatools/devices.h>
  10 +#include <stim/cuda/cudatools/threads.h>
11 11  
12 12 ///Cost function that works with the gl-spider class to find index of the item with min-cost.
13 13 typedef unsigned char uchar;
... ...
stim/cuda/cudatools.h 0 → 100644
  1 +#include <stim/cuda/cudatools/callable.h>
  2 +#include <stim/cuda/cudatools/devices.h>
  3 +#include <stim/cuda/cudatools/error.h>
  4 +#include <stim/cuda/cudatools/threads.h>
  5 +#include <stim/cuda/cudatools/timer.h>
0 6 \ No newline at end of file
... ...
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
... ... @@ -9,7 +9,7 @@
9 9  
10 10 //#include <cudaHandleError.h>
11 11 #include "cuda_gl_interop.h"
12   -#include "../gl/error.h"
  12 +#include <stim/gl/error.h>
13 13  
14 14 namespace stim
15 15 {
... ...
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
1   -#include "cuda_runtime.h"
  1 +#include "cuda_runtime.h"
2 2 #include "device_launch_parameters.h"
3   -#include "../cuda/callable.h"
  3 +#include <stim/cuda/cudatools/callable.h>
4 4  
5 5 #ifndef CUDA_THREADS_H
6 6 #define CUDA_THREADS_H
... ...
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
1   -#ifndef STIM_CUDA_TIMER
2   -#define STIM_CUDA_TIMER
3   -
4   -static cudaEvent_t tStartEvent;
5   -static cudaEvent_t tStopEvent;
6   -
7   -namespace stim{
8   -
9   -/// These functions calculate the time between GPU functions in milliseconds
10   -static void gpuStartTimer()
11   -{
12   - //set up timing events
13   - cudaEventCreate(&tStartEvent);
14   - cudaEventCreate(&tStopEvent);
15   - cudaEventRecord(tStartEvent, 0);
16   -}
17   -
18   -static float gpuStopTimer()
19   -{
20   - cudaEventRecord(tStopEvent, 0);
21   - cudaEventSynchronize(tStopEvent);
22   - float elapsedTime;
23   - cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
24   - cudaEventDestroy(tStartEvent);
25   - cudaEventDestroy(tStopEvent);
26   - return elapsedTime;
27   -}
28   -
29   -} //end namespace stim
30   -
  1 +#ifndef STIM_CUDA_TIMER
  2 +#define STIM_CUDA_TIMER
  3 +
  4 +static cudaEvent_t tStartEvent;
  5 +static cudaEvent_t tStopEvent;
  6 +
  7 +namespace stim{
  8 +
  9 +/// These functions calculate the time between GPU functions in milliseconds
  10 +static void gpuStartTimer()
  11 +{
  12 + //set up timing events
  13 + cudaEventCreate(&tStartEvent);
  14 + cudaEventCreate(&tStopEvent);
  15 + cudaEventRecord(tStartEvent, 0);
  16 +}
  17 +
  18 +static float gpuStopTimer()
  19 +{
  20 + cudaEventRecord(tStopEvent, 0);
  21 + cudaEventSynchronize(tStopEvent);
  22 + float elapsedTime;
  23 + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
  24 + cudaEventDestroy(tStartEvent);
  25 + cudaEventDestroy(tStopEvent);
  26 + return elapsedTime;
  27 +}
  28 +
  29 +} //end namespace stim
  30 +
31 31 #endif
32 32 \ No newline at end of file
... ...
stim/cuda/ivote.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_IVOTE_H
  2 +#define STIM_CUDA_IVOTE_H
  3 +
  4 +#include <stim/cuda/ivote/down_sample.cuh>
  5 +#include <stim/cuda/ivote/local_max.cuh>
  6 +#include <stim/cuda/ivote/update_dir.cuh>
  7 +#include <stim/cuda/ivote/vote.cuh>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + }
  13 +}
  14 +
  15 +
  16 +
  17 +#endif
0 18 \ No newline at end of file
... ...
stim/cuda/down_sample.cuh renamed to stim/cuda/ivote/down_sample.cuh
... ... @@ -3,9 +3,8 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/timer.h>
8   -#include <stim/cuda/gaussian_blur.cuh>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/templates/gaussian_blur.cuh>
9 8  
10 9 namespace stim{
11 10 namespace cuda{
... ... @@ -51,7 +50,7 @@ namespace stim{
51 50 dim3 threads(max_threads, 1);
52 51 dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
53 52  
54   - stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y);
  53 + stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
55 54  
56 55 //resample the image
57 56 down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
... ... @@ -89,7 +88,7 @@ namespace stim{
89 88 gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
90 89  
91 90 //copy the image data to the GPU
92   - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice);
  91 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
93 92  
94 93 cudaFree(gpuI0);
95 94 cudeFree(gpuI);
... ...
stim/cuda/local_max.cuh renamed to stim/cuda/ivote/local_max.cuh
... ... @@ -4,8 +4,7 @@
4 4  
5 5 # include <iostream>
6 6 # include <cuda.h>
7   -# include <stim/cuda/devices.h>
8   -# include <stim/cuda/error.h>
  7 +#include <stim/cuda/cudatools.h>
9 8  
10 9 namespace stim{
11 10 namespace cuda{
... ...
stim/cuda/update_dir.cuh renamed to stim/cuda/ivote/update_dir.cuh
... ... @@ -4,8 +4,7 @@
4 4  
5 5 # include <iostream>
6 6 # include <cuda.h>
7   -# include <stim/cuda/devices.h>
8   -# include <stim/cuda/error.h>
  7 +#include <stim/cuda/cudatools.h>
9 8 #include <stim/cuda/sharedmem.cuh>
10 9  
11 10 namespace stim{
... ...
stim/cuda/vote.cuh renamed to stim/cuda/ivote/vote.cuh
... ... @@ -4,8 +4,7 @@
4 4  
5 5 # include <iostream>
6 6 # include <cuda.h>
7   -# include <stim/cuda/devices.h>
8   -# include <stim/cuda/error.h>
  7 +#include <stim/cuda/cudatools.h>
9 8 #include <stim/cuda/sharedmem.cuh>
10 9  
11 10  
... ...
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
... ... @@ -3,8 +3,7 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/error.h>
  6 +#include <stim/cuda/cudatools.h>
8 7 #include <cmath>
9 8 #include <algorithm>
10 9  
... ...
stim/cuda/gaussian_blur.cuh renamed to stim/cuda/templates/conv2sep.cuh
1   -#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
2   -#define STIM_CUDA_GAUSSIAN_BLUR_H
  1 +#ifndef STIM_CUDA_CONV2SEP_H
  2 +#define STIM_CUDA_CONV2SEP_H
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/timer.h>
  6 +#include <stim/cuda/cudatools/devices.h>
  7 +#include <stim/cuda/cudatools/timer.h>
8 8 #include <stim/cuda/sharedmem.cuh>
  9 +#include <stim/cuda/cudatools/error.h>
9 10  
10 11 #define pi 3.14159
11 12  
... ... @@ -13,12 +14,13 @@ namespace stim{
13 14 namespace cuda{
14 15  
15 16 template<typename T>
16   - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
  17 + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  18 + T* kernel0, unsigned int k0){
17 19  
18 20 //generate a pointer to shared memory (size will be specified as a kernel parameter)
19 21 extern __shared__ T s[];
20 22  
21   - int kr = sigma * 4; //calculate the kernel radius
  23 + int kr = k0/2; //calculate the kernel radius
22 24  
23 25 //get a pointer to the gaussian in memory
24 26 T* g = (T*)&s[blockDim.x + 2 * kr];
... ... @@ -30,30 +32,25 @@ namespace stim{
30 32 //copy the portion of the image necessary for this block to shared memory
31 33 stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
32 34  
33   - //calculate the thread index and block index
  35 + //calculate the thread index
34 36 int ti = threadIdx.x;
35 37  
36 38 //calculate the spatial coordinate for this thread
37 39 int xi = bxi + ti;
  40 + int yi = byi;
38 41  
39   - //pre-compute the gaussian values for each kernel point
40   - T a = 1.0 / (sigma * sqrt(2 * pi));
41   - T c = - 1.0 / (2*sigma*sigma);
42   - int ki;
43   -
44   - //use the first 2kr+1 threads to evaluate a gaussian and store the result
45   - if(ti <= 2* kr+1){
46   - ki = ti - kr;
47   - g[ti] = a * exp((ki*ki) * c);
  42 +
  43 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  44 + if(ti < k0){
  45 + g[ti] = kernel0[ti];
48 46 }
49 47  
50 48 //make sure that all writing to shared memory is done before continuing
51 49 __syncthreads();
52 50  
53 51 //if the current pixel is outside of the image
54   - if(bxi + ti > x || byi > y)
  52 + if(xi > x || yi > y)
55 53 return;
56   -
57 54  
58 55  
59 56 //calculate the coordinates of the current thread in shared memory
... ... @@ -66,7 +63,7 @@ namespace stim{
66 63 for(int ki = -kr; ki <= kr; ki++){
67 64 sum += g[ki + kr] * s[si + ki];
68 65 }
69   -
  66 +
70 67 //calculate the 1D image index for this thread
71 68 unsigned int i = byi * x + xi;
72 69  
... ... @@ -75,12 +72,13 @@ namespace stim{
75 72 }
76 73  
77 74 template<typename T>
78   - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){
  75 + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  76 + T* kernel0, unsigned int k0){
79 77  
80 78 //generate a pointer to shared memory (size will be specified as a kernel parameter)
81 79 extern __shared__ T s[];
82 80  
83   - int kr = sigma * 4; //calculate the kernel radius
  81 + int kr = k0/2; //calculate the kernel radius
84 82  
85 83 //get a pointer to the gaussian in memory
86 84 T* g = (T*)&s[blockDim.y + 2 * kr];
... ... @@ -92,30 +90,25 @@ namespace stim{
92 90 //copy the portion of the image necessary for this block to shared memory
93 91 stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
94 92  
95   - //calculate the thread index and block index
  93 + //calculate the thread index
96 94 int ti = threadIdx.y;
97 95  
98 96 //calculate the spatial coordinate for this thread
  97 + int xi = bxi;
99 98 int yi = byi + ti;
100 99  
101   - //pre-compute the gaussian values for each kernel point
102   - T a = 1.0 / (sigma * sqrt(2 * pi));
103   - T c = - 1.0 / (2*sigma*sigma);
104   - int ki;
105   -
106   - //use the first 2kr+1 threads to evaluate a gaussian and store the result
107   - if(ti <= 2* kr+1){
108   - ki = ti - kr;
109   - g[ti] = a * exp((ki*ki) * c);
  100 +
  101 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  102 + if(ti < k0){
  103 + g[ti] = kernel0[ti];
110 104 }
111 105  
112 106 //make sure that all writing to shared memory is done before continuing
113 107 __syncthreads();
114 108  
115 109 //if the current pixel is outside of the image
116   - if(bxi >= x || yi >= y)
  110 + if(xi > x || yi > y)
117 111 return;
118   -
119 112  
120 113  
121 114 //calculate the coordinates of the current thread in shared memory
... ... @@ -128,17 +121,55 @@ namespace stim{
128 121 for(int ki = -kr; ki <= kr; ki++){
129 122 sum += g[ki + kr] * s[si + ki];
130 123 }
131   -
  124 +
132 125 //calculate the 1D image index for this thread
133   - unsigned int i = yi * x + bxi;
  126 + unsigned int i = yi * x + xi;
134 127  
135 128 //output the result to global memory
136 129 out[i] = sum;
137 130 }
138 131  
139   - /// Applies a Gaussian blur to a 2D image stored on the GPU
140 132 template<typename T>
141   - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
  133 + void tex_conv2sep(T* out, unsigned int x, unsigned int y,
  134 + cudaTextureObject_t texObj, cudaArray* cuArray,
  135 + T* kernel0, unsigned int k0,
  136 + T* kernel1, unsigned int k1){
  137 +
  138 + //get the maximum number of threads per block for the CUDA device
  139 + int max_threads = stim::maxThreadsPerBlock();
  140 + dim3 threads(max_threads, 1);
  141 +
  142 + //calculate the number of blocks
  143 + dim3 blocks(x / threads.x + 1, y);
  144 +
  145 + //calculate the shared memory used in the kernel
  146 + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed
  147 + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window
  148 + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window
  149 + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used
  150 +
  151 + //blur the image along the x-axis
  152 + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0);
  153 +
  154 + // Copy the x-blurred data from global memory to the texture
  155 + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
  156 + cudaMemcpyDeviceToDevice);
  157 +
  158 + //transpose the block and thread dimensions
  159 + threads.x = 1;
  160 + threads.y = max_threads;
  161 + blocks.x = x;
  162 + blocks.y = y / threads.y + 1;
  163 +
  164 + //blur the image along the y-axis
  165 + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1);
  166 +
  167 + }
  168 +
  169 + template<typename T>
  170 + void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
  171 + T* kernel0, unsigned int k0,
  172 + T* kernel1, unsigned int k1){
142 173  
143 174 //get the number of pixels in the image
144 175 unsigned int pixels = x * y;
... ... @@ -176,36 +207,9 @@ namespace stim{
176 207 cudaTextureObject_t texObj = 0;
177 208 cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
178 209  
  210 + //call the texture version of the separable convolution function
  211 + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);
179 212  
180   - //get the maximum number of threads per block for the CUDA device
181   - int max_threads = stim::maxThreadsPerBlock();
182   - dim3 threads(max_threads, 1);
183   -
184   - //calculate the number of blocks
185   - dim3 blocks(x / threads.x + 1, y);
186   -
187   - //calculate the shared memory used in the kernel
188   - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed
189   - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window
190   - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window
191   - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used
192   -
193   - //blur the image along the x-axis
194   - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
195   -
196   - // Copy the x-blurred data from global memory to the texture
197   - cudaMemcpyToArray(cuArray, 0, 0, image, bytes,
198   - cudaMemcpyDeviceToDevice);
199   -
200   - //transpose the block and thread dimensions
201   - threads.x = 1;
202   - threads.y = max_threads;
203   - blocks.x = x;
204   - blocks.y = y / threads.y + 1;
205   -
206   - //blur the image along the y-axis
207   - gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);
208   -
209 213 //free allocated memory
210 214 cudaFree(cuArray);
211 215  
... ... @@ -213,22 +217,35 @@ namespace stim{
213 217  
214 218 /// Applies a Gaussian blur to a 2D image stored on the CPU
215 219 template<typename T>
216   - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){
  220 + void cpu_conv2sep(T* image, unsigned int x, unsigned int y,
  221 + T* kernel0, unsigned int k0,
  222 + T* kernel1, unsigned int k1){
217 223  
218 224 //get the number of pixels in the image
219 225 unsigned int pixels = x * y;
220 226 unsigned int bytes = sizeof(T) * pixels;
221 227  
222   - //allocate space on the GPU
  228 + //---------Allocate Image---------
  229 + //allocate space on the GPU for the image
223 230 T* gpuI0;
224   - cudaMalloc(&gpuI0, bytes);
225   -
  231 + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));
226 232  
227 233 //copy the image data to the GPU
228   - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice);
  234 + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
  235 +
  236 + //---------Allocate Kernel--------
  237 + //allocate and copy the 0 (x) kernel
  238 + T* gpuK0;
  239 + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
  240 + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
  241 +
  242 + //allocate and copy the 1 (y) kernel
  243 + T* gpuK1;
  244 + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
  245 + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
229 246  
230 247 //run the GPU-based version of the algorithm
231   - gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y);
  248 + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
232 249  
233 250 //copy the image data from the device
234 251 cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
... ...
stim/cuda/templates/gaussian_blur.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
  2 +#define STIM_CUDA_GAUSSIAN_BLUR_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm
  9 +
  10 +#define pi 3.14159
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + template<typename T>
  16 + void gen_gaussian(T* out, T sigma, unsigned int width){
  17 +
  18 + //fill the kernel with a gaussian
  19 + for(unsigned int xi = 0; xi < width; xi++){
  20 +
  21 + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian
  22 + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
  23 + out[xi] = g;
  24 + }
  25 +
  26 + }
  27 +
  28 + template<typename T>
  29 + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
  30 +
  31 + //allocate space for the kernel
  32 + unsigned int kwidth = sigma * 8 + 1;
  33 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  34 +
  35 + //fill the kernel with a gaussian
  36 + gen_gaussian(kernel0, sigma, kwidth);
  37 +
  38 + //copy the kernel to the GPU
  39 + T* gpuKernel0;
  40 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  41 +
  42 + //perform the gaussian blur as a separable convolution
  43 + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
  44 +
  45 + HANDLE_ERROR(cudaFree(gpuKernel0));
  46 +
  47 + }
  48 +
  49 + template<typename T>
  50 + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  51 +
  52 + //allocate space for the kernel
  53 + unsigned int kwidth = sigma * 8 + 1;
  54 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  55 +
  56 + //fill the kernel with a gaussian
  57 + gen_gaussian(kernel0, sigma, kwidth);
  58 +
  59 + //copy the kernel to the GPU
  60 + T* gpuKernel0;
  61 + HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
  62 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  63 +
  64 + //perform the gaussian blur as a separable convolution
  65 + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
  66 +
  67 + HANDLE_ERROR(cudaFree(gpuKernel0));
  68 +
  69 + }
  70 +
  71 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  72 + template<typename T>
  73 + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  74 +
  75 + //allocate space for the kernel
  76 + unsigned int kwidth = sigma * 8 + 1;
  77 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  78 +
  79 + //fill the kernel with a gaussian
  80 + gen_gaussian(kernel0, sigma, kwidth);
  81 +
  82 + //perform the gaussian blur as a separable convolution
  83 + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
  84 +
  85 + }
  86 +
  87 + };
  88 +};
  89 +
  90 +#endif
0 91 \ No newline at end of file
... ...
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh
... ... @@ -3,8 +3,7 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
6   -#include <stim/cuda/devices.h>
7   -#include <stim/cuda/error.h>
  6 +#include <stim/cuda/cudatools.h>
8 7  
9 8 namespace stim{
10 9 namespace cuda{
... ...
stim/gl/gl_spider.h
... ... @@ -14,7 +14,7 @@
14 14 #include "stim/math/rect.h"
15 15 #include "stim/math/matrix.h"
16 16 #include "stim/cuda/cost.h"
17   -#include "stim/cuda/glbind.h"
  17 +#include <stim/cuda/cudatools/glbind.h>
18 18 #include <stim/visualization/obj.h>
19 19 #include <vector>
20 20  
... ...
stim/image/image.h
... ... @@ -174,7 +174,7 @@ public:
174 174  
175 175  
176 176 /// Returns the maximum pixel value in the image
177   - T max(){
  177 + T maxv(){
178 178 float max = 0;
179 179 unsigned long N = width() * height(); //get the number of pixels
180 180  
... ... @@ -190,7 +190,7 @@ public:
190 190 }
191 191  
192 192 /// Returns the minimum pixel value in the image
193   - T min(){
  193 + T minv(){
194 194 float min = 0;
195 195 unsigned long N = width() * height(); //get the number of pixels
196 196  
... ...
stim/math/matrix.h
... ... @@ -5,7 +5,7 @@
5 5 #include <string.h>
6 6 #include <iostream>
7 7 #include <stim/math/vector.h>
8   -#include "../cuda/callable.h"
  8 +#include <stim/cuda/cudatools/callable.h>
9 9  
10 10 namespace stim{
11 11  
... ...
stim/math/quaternion.h
1 1 #ifndef RTS_QUATERNION_H
2 2 #define RTS_QUATERNION_H
3 3  
4   -#include "../math/matrix.h"
5   -#include "../cuda/callable.h"
  4 +#include <stim/math/matrix.h>
  5 +#include <stim/cuda/cudatools/callable.h>
6 6  
7 7 namespace stim{
8 8  
... ...
stim/math/rect.h
... ... @@ -2,7 +2,7 @@
2 2 #define RTS_RECT_H
3 3  
4 4 //enable CUDA_CALLABLE macro
5   -#include <stim/cuda/callable.h>
  5 +#include <stim/cuda/cudatools/callable.h>
6 6 #include <stim/math/vector.h>
7 7 #include <stim/math/triangle.h>
8 8 #include <stim/math/quaternion.h>
... ...
stim/math/triangle.h
... ... @@ -2,7 +2,7 @@
2 2 #define RTS_TRIANGLE_H
3 3  
4 4 //enable CUDA_CALLABLE macro
5   -#include <stim/cuda/callable.h>
  5 +#include <stim/cuda/cudatools/callable.h>
6 6 #include <stim/math/vector.h>
7 7 #include <iostream>
8 8  
... ...
stim/math/vector.h
... ... @@ -5,7 +5,8 @@
5 5 #include <cmath>
6 6 #include <sstream>
7 7 #include <vector>
8   -#include "../cuda/callable.h"
  8 +
  9 +#include <stim/cuda/cudatools/callable.h>
9 10  
10 11 namespace stim
11 12 {
... ...
stim/visualization/colormap.h
... ... @@ -4,12 +4,12 @@
4 4 #include <string>
5 5 #include <stdlib.h>
6 6 #ifdef __CUDACC__
7   -#include "../cuda/error.h"
  7 +#include <stim/cuda/cudatools/error.h>
8 8 #endif
9 9  
10 10 //saving an image to a file uses the CImg library
11 11 //this currently throws a lot of "unreachable" warnings (as of GCC 4.8.2, nvcc 6.5.12)
12   -#include "../image/image.h"
  12 +#include <stim/image/image.h>
13 13  
14 14  
15 15 #define BREWER_CTRL_PTS 11
... ...