Commit 3173e54dd0c592a5d06da09f3e0884973e53edd2

Authored by Tianshu Cheng
2 parents 5343a315 93de94e6

Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib into bsds500

stim/cuda/arraymath.cuh
1 #ifndef STIM_CUDA_ARRAYMATH_H 1 #ifndef STIM_CUDA_ARRAYMATH_H
2 #define STIM_CUDA_ARRAYMATH_H 2 #define STIM_CUDA_ARRAYMATH_H
3 3
4 -#include <stim/cuda/array_add.cuh>  
5 -#include <stim/cuda/array_multiply.cuh>  
6 -#include <stim/cuda/array_abs.cuh>  
7 -#include <stim/cuda/array_cart2polar.cuh>  
8 -#include <stim/cuda/gaussian_blur.cuh>  
9 -#include <stim/cuda/conv2.cuh> 4 +#include <stim/cuda/arraymath/array_add.cuh>
  5 +#include <stim/cuda/arraymath/array_multiply.cuh>
  6 +#include <stim/cuda/arraymath/array_abs.cuh>
  7 +#include <stim/cuda/arraymath/array_cart2polar.cuh>
10 8
11 namespace stim{ 9 namespace stim{
12 namespace cuda{ 10 namespace cuda{
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
@@ -3,8 +3,7 @@ @@ -3,8 +3,7 @@
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
6 -#include <stim/cuda/devices.h>  
7 -#include <stim/cuda/error.h> 6 +#include <stim/cuda/cudatools.h>
8 7
9 namespace stim{ 8 namespace stim{
10 namespace cuda{ 9 namespace cuda{
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
@@ -3,8 +3,7 @@ @@ -3,8 +3,7 @@
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
6 -#include <stim/cuda/devices.h>  
7 -#include <stim/cuda/error.h> 6 +#include <stim/cuda/cudatools.h>
8 7
9 namespace stim{ 8 namespace stim{
10 namespace cuda{ 9 namespace cuda{
@@ -3,11 +3,11 @@ @@ -3,11 +3,11 @@
3 #include <cuda_runtime.h> 3 #include <cuda_runtime.h>
4 #include <cublas_v2.h> 4 #include <cublas_v2.h>
5 #include <stdio.h> 5 #include <stdio.h>
6 -#include "../visualization/colormap.h" 6 +#include <stim/visualization/colormap.h>
7 #include <sstream> 7 #include <sstream>
8 -#include "../math/vector.h"  
9 -#include "../cuda/devices.h"  
10 -#include "../cuda/threads.h" 8 +#include <stim/math/vector.h>
  9 +#include <stim/cuda/cudatools/devices.h>
  10 +#include <stim/cuda/cudatools/threads.h>
11 11
12 ///Cost function that works with the gl-spider class to find index of the item with min-cost. 12 ///Cost function that works with the gl-spider class to find index of the item with min-cost.
13 typedef unsigned char uchar; 13 typedef unsigned char uchar;
stim/cuda/cudatools.h 0 → 100644
  1 +#include <stim/cuda/cudatools/callable.h>
  2 +#include <stim/cuda/cudatools/devices.h>
  3 +#include <stim/cuda/cudatools/error.h>
  4 +#include <stim/cuda/cudatools/threads.h>
  5 +#include <stim/cuda/cudatools/timer.h>
0 \ No newline at end of file 6 \ No newline at end of file
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
@@ -9,7 +9,7 @@ @@ -9,7 +9,7 @@
9 9
10 //#include <cudaHandleError.h> 10 //#include <cudaHandleError.h>
11 #include "cuda_gl_interop.h" 11 #include "cuda_gl_interop.h"
12 -#include "../gl/error.h" 12 +#include <stim/gl/error.h>
13 13
14 namespace stim 14 namespace stim
15 { 15 {
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
1 -#include "cuda_runtime.h" 1 +#include "cuda_runtime.h"
2 #include "device_launch_parameters.h" 2 #include "device_launch_parameters.h"
3 -#include "../cuda/callable.h" 3 +#include <stim/cuda/cudatools/callable.h>
4 4
5 #ifndef CUDA_THREADS_H 5 #ifndef CUDA_THREADS_H
6 #define CUDA_THREADS_H 6 #define CUDA_THREADS_H
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
1 -#ifndef STIM_CUDA_TIMER  
2 -#define STIM_CUDA_TIMER  
3 -  
4 -static cudaEvent_t tStartEvent;  
5 -static cudaEvent_t tStopEvent;  
6 -  
7 -namespace stim{  
8 -  
9 -/// These functions calculate the time between GPU functions in milliseconds  
10 -static void gpuStartTimer()  
11 -{  
12 - //set up timing events  
13 - cudaEventCreate(&tStartEvent);  
14 - cudaEventCreate(&tStopEvent);  
15 - cudaEventRecord(tStartEvent, 0);  
16 -}  
17 -  
18 -static float gpuStopTimer()  
19 -{  
20 - cudaEventRecord(tStopEvent, 0);  
21 - cudaEventSynchronize(tStopEvent);  
22 - float elapsedTime;  
23 - cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);  
24 - cudaEventDestroy(tStartEvent);  
25 - cudaEventDestroy(tStopEvent);  
26 - return elapsedTime;  
27 -}  
28 -  
29 -} //end namespace stim  
30 - 1 +#ifndef STIM_CUDA_TIMER
  2 +#define STIM_CUDA_TIMER
  3 +
  4 +static cudaEvent_t tStartEvent;
  5 +static cudaEvent_t tStopEvent;
  6 +
  7 +namespace stim{
  8 +
  9 +/// These functions calculate the time between GPU functions in milliseconds
  10 +static void gpuStartTimer()
  11 +{
  12 + //set up timing events
  13 + cudaEventCreate(&tStartEvent);
  14 + cudaEventCreate(&tStopEvent);
  15 + cudaEventRecord(tStartEvent, 0);
  16 +}
  17 +
  18 +static float gpuStopTimer()
  19 +{
  20 + cudaEventRecord(tStopEvent, 0);
  21 + cudaEventSynchronize(tStopEvent);
  22 + float elapsedTime;
  23 + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
  24 + cudaEventDestroy(tStartEvent);
  25 + cudaEventDestroy(tStopEvent);
  26 + return elapsedTime;
  27 +}
  28 +
  29 +} //end namespace stim
  30 +
31 #endif 31 #endif
32 \ No newline at end of file 32 \ No newline at end of file
stim/cuda/ivote.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_IVOTE_H
  2 +#define STIM_CUDA_IVOTE_H
  3 +
  4 +#include <stim/cuda/ivote/down_sample.cuh>
  5 +#include <stim/cuda/ivote/local_max.cuh>
  6 +#include <stim/cuda/ivote/update_dir.cuh>
  7 +#include <stim/cuda/ivote/vote.cuh>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + }
  13 +}
  14 +
  15 +
  16 +
  17 +#endif
0 \ No newline at end of file 18 \ No newline at end of file
stim/cuda/down_sample.cuh renamed to stim/cuda/ivote/down_sample.cuh
@@ -3,9 +3,8 @@ @@ -3,9 +3,8 @@
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
6 -#include <stim/cuda/devices.h>  
7 -#include <stim/cuda/timer.h>  
8 -#include <stim/cuda/gaussian_blur.cuh> 6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/templates/gaussian_blur.cuh>
9 8
10 namespace stim{ 9 namespace stim{
11 namespace cuda{ 10 namespace cuda{
@@ -51,7 +50,7 @@ namespace stim{ @@ -51,7 +50,7 @@ namespace stim{
51 dim3 threads(max_threads, 1); 50 dim3 threads(max_threads, 1);
52 dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); 51 dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds);
53 52
54 - stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y); 53 + stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
55 54
56 //resample the image 55 //resample the image
57 down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); 56 down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
@@ -89,7 +88,7 @@ namespace stim{ @@ -89,7 +88,7 @@ namespace stim{
89 gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); 88 gpu_down_sample<T>(gpuI, gpuI0, resize, x, y);
90 89
91 //copy the image data to the GPU 90 //copy the image data to the GPU
92 - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); 91 + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
93 92
94 cudaFree(gpuI0); 93 cudaFree(gpuI0);
95 cudeFree(gpuI); 94 cudeFree(gpuI);
stim/cuda/local_max.cuh renamed to stim/cuda/ivote/local_max.cuh
@@ -4,8 +4,7 @@ @@ -4,8 +4,7 @@
4 4
5 # include <iostream> 5 # include <iostream>
6 # include <cuda.h> 6 # include <cuda.h>
7 -# include <stim/cuda/devices.h>  
8 -# include <stim/cuda/error.h> 7 +#include <stim/cuda/cudatools.h>
9 8
10 namespace stim{ 9 namespace stim{
11 namespace cuda{ 10 namespace cuda{
stim/cuda/update_dir.cuh renamed to stim/cuda/ivote/update_dir.cuh
@@ -4,8 +4,7 @@ @@ -4,8 +4,7 @@
4 4
5 # include <iostream> 5 # include <iostream>
6 # include <cuda.h> 6 # include <cuda.h>
7 -# include <stim/cuda/devices.h>  
8 -# include <stim/cuda/error.h> 7 +#include <stim/cuda/cudatools.h>
9 #include <stim/cuda/sharedmem.cuh> 8 #include <stim/cuda/sharedmem.cuh>
10 9
11 namespace stim{ 10 namespace stim{
stim/cuda/vote.cuh renamed to stim/cuda/ivote/vote.cuh
@@ -4,8 +4,7 @@ @@ -4,8 +4,7 @@
4 4
5 # include <iostream> 5 # include <iostream>
6 # include <cuda.h> 6 # include <cuda.h>
7 -# include <stim/cuda/devices.h>  
8 -# include <stim/cuda/error.h> 7 +#include <stim/cuda/cudatools.h>
9 #include <stim/cuda/sharedmem.cuh> 8 #include <stim/cuda/sharedmem.cuh>
10 9
11 10
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
@@ -3,8 +3,7 @@ @@ -3,8 +3,7 @@
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
6 -#include <stim/cuda/devices.h>  
7 -#include <stim/cuda/error.h> 6 +#include <stim/cuda/cudatools.h>
8 #include <cmath> 7 #include <cmath>
9 #include <algorithm> 8 #include <algorithm>
10 9
stim/cuda/gaussian_blur.cuh renamed to stim/cuda/templates/conv2sep.cuh
1 -#ifndef STIM_CUDA_GAUSSIAN_BLUR_H  
2 -#define STIM_CUDA_GAUSSIAN_BLUR_H 1 +#ifndef STIM_CUDA_CONV2SEP_H
  2 +#define STIM_CUDA_CONV2SEP_H
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
6 -#include <stim/cuda/devices.h>  
7 -#include <stim/cuda/timer.h> 6 +#include <stim/cuda/cudatools/devices.h>
  7 +#include <stim/cuda/cudatools/timer.h>
8 #include <stim/cuda/sharedmem.cuh> 8 #include <stim/cuda/sharedmem.cuh>
  9 +#include <stim/cuda/cudatools/error.h>
9 10
10 #define pi 3.14159 11 #define pi 3.14159
11 12
@@ -13,12 +14,13 @@ namespace stim{ @@ -13,12 +14,13 @@ namespace stim{
13 namespace cuda{ 14 namespace cuda{
14 15
15 template<typename T> 16 template<typename T>
16 - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ 17 + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  18 + T* kernel0, unsigned int k0){
17 19
18 //generate a pointer to shared memory (size will be specified as a kernel parameter) 20 //generate a pointer to shared memory (size will be specified as a kernel parameter)
19 extern __shared__ T s[]; 21 extern __shared__ T s[];
20 22
21 - int kr = sigma * 4; //calculate the kernel radius 23 + int kr = k0/2; //calculate the kernel radius
22 24
23 //get a pointer to the gaussian in memory 25 //get a pointer to the gaussian in memory
24 T* g = (T*)&s[blockDim.x + 2 * kr]; 26 T* g = (T*)&s[blockDim.x + 2 * kr];
@@ -30,30 +32,25 @@ namespace stim{ @@ -30,30 +32,25 @@ namespace stim{
30 //copy the portion of the image necessary for this block to shared memory 32 //copy the portion of the image necessary for this block to shared memory
31 stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); 33 stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim);
32 34
33 - //calculate the thread index and block index 35 + //calculate the thread index
34 int ti = threadIdx.x; 36 int ti = threadIdx.x;
35 37
36 //calculate the spatial coordinate for this thread 38 //calculate the spatial coordinate for this thread
37 int xi = bxi + ti; 39 int xi = bxi + ti;
  40 + int yi = byi;
38 41
39 - //pre-compute the gaussian values for each kernel point  
40 - T a = 1.0 / (sigma * sqrt(2 * pi));  
41 - T c = - 1.0 / (2*sigma*sigma);  
42 - int ki;  
43 -  
44 - //use the first 2kr+1 threads to evaluate a gaussian and store the result  
45 - if(ti <= 2* kr+1){  
46 - ki = ti - kr;  
47 - g[ti] = a * exp((ki*ki) * c); 42 +
  43 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  44 + if(ti < k0){
  45 + g[ti] = kernel0[ti];
48 } 46 }
49 47
50 //make sure that all writing to shared memory is done before continuing 48 //make sure that all writing to shared memory is done before continuing
51 __syncthreads(); 49 __syncthreads();
52 50
53 //if the current pixel is outside of the image 51 //if the current pixel is outside of the image
54 - if(bxi + ti > x || byi > y) 52 + if(xi > x || yi > y)
55 return; 53 return;
56 -  
57 54
58 55
59 //calculate the coordinates of the current thread in shared memory 56 //calculate the coordinates of the current thread in shared memory
@@ -66,7 +63,7 @@ namespace stim{ @@ -66,7 +63,7 @@ namespace stim{
66 for(int ki = -kr; ki <= kr; ki++){ 63 for(int ki = -kr; ki <= kr; ki++){
67 sum += g[ki + kr] * s[si + ki]; 64 sum += g[ki + kr] * s[si + ki];
68 } 65 }
69 - 66 +
70 //calculate the 1D image index for this thread 67 //calculate the 1D image index for this thread
71 unsigned int i = byi * x + xi; 68 unsigned int i = byi * x + xi;
72 69
@@ -75,12 +72,13 @@ namespace stim{ @@ -75,12 +72,13 @@ namespace stim{
75 } 72 }
76 73
77 template<typename T> 74 template<typename T>
78 - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ 75 + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y,
  76 + T* kernel0, unsigned int k0){
79 77
80 //generate a pointer to shared memory (size will be specified as a kernel parameter) 78 //generate a pointer to shared memory (size will be specified as a kernel parameter)
81 extern __shared__ T s[]; 79 extern __shared__ T s[];
82 80
83 - int kr = sigma * 4; //calculate the kernel radius 81 + int kr = k0/2; //calculate the kernel radius
84 82
85 //get a pointer to the gaussian in memory 83 //get a pointer to the gaussian in memory
86 T* g = (T*)&s[blockDim.y + 2 * kr]; 84 T* g = (T*)&s[blockDim.y + 2 * kr];
@@ -92,30 +90,25 @@ namespace stim{ @@ -92,30 +90,25 @@ namespace stim{
92 //copy the portion of the image necessary for this block to shared memory 90 //copy the portion of the image necessary for this block to shared memory
93 stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); 91 stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim);
94 92
95 - //calculate the thread index and block index 93 + //calculate the thread index
96 int ti = threadIdx.y; 94 int ti = threadIdx.y;
97 95
98 //calculate the spatial coordinate for this thread 96 //calculate the spatial coordinate for this thread
  97 + int xi = bxi;
99 int yi = byi + ti; 98 int yi = byi + ti;
100 99
101 - //pre-compute the gaussian values for each kernel point  
102 - T a = 1.0 / (sigma * sqrt(2 * pi));  
103 - T c = - 1.0 / (2*sigma*sigma);  
104 - int ki;  
105 -  
106 - //use the first 2kr+1 threads to evaluate a gaussian and store the result  
107 - if(ti <= 2* kr+1){  
108 - ki = ti - kr;  
109 - g[ti] = a * exp((ki*ki) * c); 100 +
  101 + //use the first 2kr+1 threads to transfer the kernel to shared memory
  102 + if(ti < k0){
  103 + g[ti] = kernel0[ti];
110 } 104 }
111 105
112 //make sure that all writing to shared memory is done before continuing 106 //make sure that all writing to shared memory is done before continuing
113 __syncthreads(); 107 __syncthreads();
114 108
115 //if the current pixel is outside of the image 109 //if the current pixel is outside of the image
116 - if(bxi >= x || yi >= y) 110 + if(xi > x || yi > y)
117 return; 111 return;
118 -  
119 112
120 113
121 //calculate the coordinates of the current thread in shared memory 114 //calculate the coordinates of the current thread in shared memory
@@ -128,17 +121,55 @@ namespace stim{ @@ -128,17 +121,55 @@ namespace stim{
128 for(int ki = -kr; ki <= kr; ki++){ 121 for(int ki = -kr; ki <= kr; ki++){
129 sum += g[ki + kr] * s[si + ki]; 122 sum += g[ki + kr] * s[si + ki];
130 } 123 }
131 - 124 +
132 //calculate the 1D image index for this thread 125 //calculate the 1D image index for this thread
133 - unsigned int i = yi * x + bxi; 126 + unsigned int i = yi * x + xi;
134 127
135 //output the result to global memory 128 //output the result to global memory
136 out[i] = sum; 129 out[i] = sum;
137 } 130 }
138 131
139 - /// Applies a Gaussian blur to a 2D image stored on the GPU  
140 template<typename T> 132 template<typename T>
141 - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ 133 + void tex_conv2sep(T* out, unsigned int x, unsigned int y,
  134 + cudaTextureObject_t texObj, cudaArray* cuArray,
  135 + T* kernel0, unsigned int k0,
  136 + T* kernel1, unsigned int k1){
  137 +
  138 + //get the maximum number of threads per block for the CUDA device
  139 + int max_threads = stim::maxThreadsPerBlock();
  140 + dim3 threads(max_threads, 1);
  141 +
  142 + //calculate the number of blocks
  143 + dim3 blocks(x / threads.x + 1, y);
  144 +
  145 + //calculate the shared memory used in the kernel
  146 + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed
  147 + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window
  148 + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window
  149 + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used
  150 +
  151 + //blur the image along the x-axis
  152 + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0);
  153 +
  154 + // Copy the x-blurred data from global memory to the texture
  155 + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T),
  156 + cudaMemcpyDeviceToDevice);
  157 +
  158 + //transpose the block and thread dimensions
  159 + threads.x = 1;
  160 + threads.y = max_threads;
  161 + blocks.x = x;
  162 + blocks.y = y / threads.y + 1;
  163 +
  164 + //blur the image along the y-axis
  165 + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1);
  166 +
  167 + }
  168 +
  169 + template<typename T>
  170 + void gpu_conv2sep(T* image, unsigned int x, unsigned int y,
  171 + T* kernel0, unsigned int k0,
  172 + T* kernel1, unsigned int k1){
142 173
143 //get the number of pixels in the image 174 //get the number of pixels in the image
144 unsigned int pixels = x * y; 175 unsigned int pixels = x * y;
@@ -176,36 +207,9 @@ namespace stim{ @@ -176,36 +207,9 @@ namespace stim{
176 cudaTextureObject_t texObj = 0; 207 cudaTextureObject_t texObj = 0;
177 cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); 208 cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
178 209
  210 + //call the texture version of the separable convolution function
  211 + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1);
179 212
180 - //get the maximum number of threads per block for the CUDA device  
181 - int max_threads = stim::maxThreadsPerBlock();  
182 - dim3 threads(max_threads, 1);  
183 -  
184 - //calculate the number of blocks  
185 - dim3 blocks(x / threads.x + 1, y);  
186 -  
187 - //calculate the shared memory used in the kernel  
188 - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed  
189 - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window  
190 - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window  
191 - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used  
192 -  
193 - //blur the image along the x-axis  
194 - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);  
195 -  
196 - // Copy the x-blurred data from global memory to the texture  
197 - cudaMemcpyToArray(cuArray, 0, 0, image, bytes,  
198 - cudaMemcpyDeviceToDevice);  
199 -  
200 - //transpose the block and thread dimensions  
201 - threads.x = 1;  
202 - threads.y = max_threads;  
203 - blocks.x = x;  
204 - blocks.y = y / threads.y + 1;  
205 -  
206 - //blur the image along the y-axis  
207 - gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y);  
208 -  
209 //free allocated memory 213 //free allocated memory
210 cudaFree(cuArray); 214 cudaFree(cuArray);
211 215
@@ -213,22 +217,35 @@ namespace stim{ @@ -213,22 +217,35 @@ namespace stim{
213 217
214 /// Applies a Gaussian blur to a 2D image stored on the CPU 218 /// Applies a Gaussian blur to a 2D image stored on the CPU
215 template<typename T> 219 template<typename T>
216 - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ 220 + void cpu_conv2sep(T* image, unsigned int x, unsigned int y,
  221 + T* kernel0, unsigned int k0,
  222 + T* kernel1, unsigned int k1){
217 223
218 //get the number of pixels in the image 224 //get the number of pixels in the image
219 unsigned int pixels = x * y; 225 unsigned int pixels = x * y;
220 unsigned int bytes = sizeof(T) * pixels; 226 unsigned int bytes = sizeof(T) * pixels;
221 227
222 - //allocate space on the GPU 228 + //---------Allocate Image---------
  229 + //allocate space on the GPU for the image
223 T* gpuI0; 230 T* gpuI0;
224 - cudaMalloc(&gpuI0, bytes);  
225 - 231 + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes));
226 232
227 //copy the image data to the GPU 233 //copy the image data to the GPU
228 - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); 234 + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice));
  235 +
  236 + //---------Allocate Kernel--------
  237 + //allocate and copy the 0 (x) kernel
  238 + T* gpuK0;
  239 + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T)));
  240 + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice));
  241 +
  242 + //allocate and copy the 1 (y) kernel
  243 + T* gpuK1;
  244 + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T)));
  245 + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice));
229 246
230 //run the GPU-based version of the algorithm 247 //run the GPU-based version of the algorithm
231 - gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); 248 + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1);
232 249
233 //copy the image data from the device 250 //copy the image data from the device
234 cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); 251 cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost);
stim/cuda/templates/gaussian_blur.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
  2 +#define STIM_CUDA_GAUSSIAN_BLUR_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm
  9 +
  10 +#define pi 3.14159
  11 +
  12 +namespace stim{
  13 + namespace cuda{
  14 +
  15 + template<typename T>
  16 + void gen_gaussian(T* out, T sigma, unsigned int width){
  17 +
  18 + //fill the kernel with a gaussian
  19 + for(unsigned int xi = 0; xi < width; xi++){
  20 +
  21 + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian
  22 + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) );
  23 + out[xi] = g;
  24 + }
  25 +
  26 + }
  27 +
  28 + template<typename T>
  29 + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
  30 +
  31 + //allocate space for the kernel
  32 + unsigned int kwidth = sigma * 8 + 1;
  33 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  34 +
  35 + //fill the kernel with a gaussian
  36 + gen_gaussian(kernel0, sigma, kwidth);
  37 +
  38 + //copy the kernel to the GPU
  39 + T* gpuKernel0;
  40 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  41 +
  42 + //perform the gaussian blur as a separable convolution
  43 + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
  44 +
  45 + HANDLE_ERROR(cudaFree(gpuKernel0));
  46 +
  47 + }
  48 +
  49 + template<typename T>
  50 + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  51 +
  52 + //allocate space for the kernel
  53 + unsigned int kwidth = sigma * 8 + 1;
  54 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  55 +
  56 + //fill the kernel with a gaussian
  57 + gen_gaussian(kernel0, sigma, kwidth);
  58 +
  59 + //copy the kernel to the GPU
  60 + T* gpuKernel0;
  61 + HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
  62 + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
  63 +
  64 + //perform the gaussian blur as a separable convolution
  65 + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
  66 +
  67 + HANDLE_ERROR(cudaFree(gpuKernel0));
  68 +
  69 + }
  70 +
  71 + /// Applies a Gaussian blur to a 2D image stored on the CPU
  72 + template<typename T>
  73 + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
  74 +
  75 + //allocate space for the kernel
  76 + unsigned int kwidth = sigma * 8 + 1;
  77 + float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
  78 +
  79 + //fill the kernel with a gaussian
  80 + gen_gaussian(kernel0, sigma, kwidth);
  81 +
  82 + //perform the gaussian blur as a separable convolution
  83 + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth);
  84 +
  85 + }
  86 +
  87 + };
  88 +};
  89 +
  90 +#endif
0 \ No newline at end of file 91 \ No newline at end of file
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh
@@ -3,8 +3,7 @@ @@ -3,8 +3,7 @@
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
6 -#include <stim/cuda/devices.h>  
7 -#include <stim/cuda/error.h> 6 +#include <stim/cuda/cudatools.h>
8 7
9 namespace stim{ 8 namespace stim{
10 namespace cuda{ 9 namespace cuda{
stim/gl/gl_spider.h
@@ -14,7 +14,7 @@ @@ -14,7 +14,7 @@
14 #include "stim/math/rect.h" 14 #include "stim/math/rect.h"
15 #include "stim/math/matrix.h" 15 #include "stim/math/matrix.h"
16 #include "stim/cuda/cost.h" 16 #include "stim/cuda/cost.h"
17 -#include "stim/cuda/glbind.h" 17 +#include <stim/cuda/cudatools/glbind.h>
18 #include <stim/visualization/obj.h> 18 #include <stim/visualization/obj.h>
19 #include <vector> 19 #include <vector>
20 20
stim/image/image.h
@@ -174,7 +174,7 @@ public: @@ -174,7 +174,7 @@ public:
174 174
175 175
176 /// Returns the maximum pixel value in the image 176 /// Returns the maximum pixel value in the image
177 - T max(){ 177 + T maxv(){
178 float max = 0; 178 float max = 0;
179 unsigned long N = width() * height(); //get the number of pixels 179 unsigned long N = width() * height(); //get the number of pixels
180 180
@@ -190,7 +190,7 @@ public: @@ -190,7 +190,7 @@ public:
190 } 190 }
191 191
192 /// Returns the minimum pixel value in the image 192 /// Returns the minimum pixel value in the image
193 - T min(){ 193 + T minv(){
194 float min = 0; 194 float min = 0;
195 unsigned long N = width() * height(); //get the number of pixels 195 unsigned long N = width() * height(); //get the number of pixels
196 196
stim/math/matrix.h
@@ -5,7 +5,7 @@ @@ -5,7 +5,7 @@
5 #include <string.h> 5 #include <string.h>
6 #include <iostream> 6 #include <iostream>
7 #include <stim/math/vector.h> 7 #include <stim/math/vector.h>
8 -#include "../cuda/callable.h" 8 +#include <stim/cuda/cudatools/callable.h>
9 9
10 namespace stim{ 10 namespace stim{
11 11
stim/math/quaternion.h
1 #ifndef RTS_QUATERNION_H 1 #ifndef RTS_QUATERNION_H
2 #define RTS_QUATERNION_H 2 #define RTS_QUATERNION_H
3 3
4 -#include "../math/matrix.h"  
5 -#include "../cuda/callable.h" 4 +#include <stim/math/matrix.h>
  5 +#include <stim/cuda/cudatools/callable.h>
6 6
7 namespace stim{ 7 namespace stim{
8 8
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 #define RTS_RECT_H 2 #define RTS_RECT_H
3 3
4 //enable CUDA_CALLABLE macro 4 //enable CUDA_CALLABLE macro
5 -#include <stim/cuda/callable.h> 5 +#include <stim/cuda/cudatools/callable.h>
6 #include <stim/math/vector.h> 6 #include <stim/math/vector.h>
7 #include <stim/math/triangle.h> 7 #include <stim/math/triangle.h>
8 #include <stim/math/quaternion.h> 8 #include <stim/math/quaternion.h>
stim/math/triangle.h
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 #define RTS_TRIANGLE_H 2 #define RTS_TRIANGLE_H
3 3
4 //enable CUDA_CALLABLE macro 4 //enable CUDA_CALLABLE macro
5 -#include <stim/cuda/callable.h> 5 +#include <stim/cuda/cudatools/callable.h>
6 #include <stim/math/vector.h> 6 #include <stim/math/vector.h>
7 #include <iostream> 7 #include <iostream>
8 8
stim/math/vector.h
@@ -5,7 +5,8 @@ @@ -5,7 +5,8 @@
5 #include <cmath> 5 #include <cmath>
6 #include <sstream> 6 #include <sstream>
7 #include <vector> 7 #include <vector>
8 -#include "../cuda/callable.h" 8 +
  9 +#include <stim/cuda/cudatools/callable.h>
9 10
10 namespace stim 11 namespace stim
11 { 12 {
stim/visualization/colormap.h
@@ -4,12 +4,12 @@ @@ -4,12 +4,12 @@
4 #include <string> 4 #include <string>
5 #include <stdlib.h> 5 #include <stdlib.h>
6 #ifdef __CUDACC__ 6 #ifdef __CUDACC__
7 -#include "../cuda/error.h" 7 +#include <stim/cuda/cudatools/error.h>
8 #endif 8 #endif
9 9
10 //saving an image to a file uses the CImg library 10 //saving an image to a file uses the CImg library
11 //this currently throws a lot of "unreachable" warnings (as of GCC 4.8.2, nvcc 6.5.12) 11 //this currently throws a lot of "unreachable" warnings (as of GCC 4.8.2, nvcc 6.5.12)
12 -#include "../image/image.h" 12 +#include <stim/image/image.h>
13 13
14 14
15 #define BREWER_CTRL_PTS 11 15 #define BREWER_CTRL_PTS 11