Commit 3173e54dd0c592a5d06da09f3e0884973e53edd2
Merge branch 'master' of git.stim.ee.uh.edu:codebase/stimlib into bsds500
Showing
30 changed files
with
265 additions
and
145 deletions
Show diff stats
stim/cuda/arraymath.cuh
1 | #ifndef STIM_CUDA_ARRAYMATH_H | 1 | #ifndef STIM_CUDA_ARRAYMATH_H |
2 | #define STIM_CUDA_ARRAYMATH_H | 2 | #define STIM_CUDA_ARRAYMATH_H |
3 | 3 | ||
4 | -#include <stim/cuda/array_add.cuh> | ||
5 | -#include <stim/cuda/array_multiply.cuh> | ||
6 | -#include <stim/cuda/array_abs.cuh> | ||
7 | -#include <stim/cuda/array_cart2polar.cuh> | ||
8 | -#include <stim/cuda/gaussian_blur.cuh> | ||
9 | -#include <stim/cuda/conv2.cuh> | 4 | +#include <stim/cuda/arraymath/array_add.cuh> |
5 | +#include <stim/cuda/arraymath/array_multiply.cuh> | ||
6 | +#include <stim/cuda/arraymath/array_abs.cuh> | ||
7 | +#include <stim/cuda/arraymath/array_cart2polar.cuh> | ||
10 | 8 | ||
11 | namespace stim{ | 9 | namespace stim{ |
12 | namespace cuda{ | 10 | namespace cuda{ |
stim/cuda/array_abs.cuh renamed to stim/cuda/arraymath/array_abs.cuh
stim/cuda/array_add.cuh renamed to stim/cuda/arraymath/array_add.cuh
stim/cuda/array_cart2polar.cuh renamed to stim/cuda/arraymath/array_cart2polar.cuh
stim/cuda/array_multiply.cuh renamed to stim/cuda/arraymath/array_multiply.cuh
stim/cuda/cost.h
@@ -3,11 +3,11 @@ | @@ -3,11 +3,11 @@ | ||
3 | #include <cuda_runtime.h> | 3 | #include <cuda_runtime.h> |
4 | #include <cublas_v2.h> | 4 | #include <cublas_v2.h> |
5 | #include <stdio.h> | 5 | #include <stdio.h> |
6 | -#include "../visualization/colormap.h" | 6 | +#include <stim/visualization/colormap.h> |
7 | #include <sstream> | 7 | #include <sstream> |
8 | -#include "../math/vector.h" | ||
9 | -#include "../cuda/devices.h" | ||
10 | -#include "../cuda/threads.h" | 8 | +#include <stim/math/vector.h> |
9 | +#include <stim/cuda/cudatools/devices.h> | ||
10 | +#include <stim/cuda/cudatools/threads.h> | ||
11 | 11 | ||
12 | ///Cost function that works with the gl-spider class to find index of the item with min-cost. | 12 | ///Cost function that works with the gl-spider class to find index of the item with min-cost. |
13 | typedef unsigned char uchar; | 13 | typedef unsigned char uchar; |
stim/cuda/callable.h renamed to stim/cuda/cudatools/callable.h
stim/cuda/devices.h renamed to stim/cuda/cudatools/devices.h
stim/cuda/error.h renamed to stim/cuda/cudatools/error.h
stim/cuda/glbind.h renamed to stim/cuda/cudatools/glbind.h
stim/cuda/threads.h renamed to stim/cuda/cudatools/threads.h
1 | -#include "cuda_runtime.h" | 1 | +#include "cuda_runtime.h" |
2 | #include "device_launch_parameters.h" | 2 | #include "device_launch_parameters.h" |
3 | -#include "../cuda/callable.h" | 3 | +#include <stim/cuda/cudatools/callable.h> |
4 | 4 | ||
5 | #ifndef CUDA_THREADS_H | 5 | #ifndef CUDA_THREADS_H |
6 | #define CUDA_THREADS_H | 6 | #define CUDA_THREADS_H |
stim/cuda/timer.h renamed to stim/cuda/cudatools/timer.h
1 | -#ifndef STIM_CUDA_TIMER | ||
2 | -#define STIM_CUDA_TIMER | ||
3 | - | ||
4 | -static cudaEvent_t tStartEvent; | ||
5 | -static cudaEvent_t tStopEvent; | ||
6 | - | ||
7 | -namespace stim{ | ||
8 | - | ||
9 | -/// These functions calculate the time between GPU functions in milliseconds | ||
10 | -static void gpuStartTimer() | ||
11 | -{ | ||
12 | - //set up timing events | ||
13 | - cudaEventCreate(&tStartEvent); | ||
14 | - cudaEventCreate(&tStopEvent); | ||
15 | - cudaEventRecord(tStartEvent, 0); | ||
16 | -} | ||
17 | - | ||
18 | -static float gpuStopTimer() | ||
19 | -{ | ||
20 | - cudaEventRecord(tStopEvent, 0); | ||
21 | - cudaEventSynchronize(tStopEvent); | ||
22 | - float elapsedTime; | ||
23 | - cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); | ||
24 | - cudaEventDestroy(tStartEvent); | ||
25 | - cudaEventDestroy(tStopEvent); | ||
26 | - return elapsedTime; | ||
27 | -} | ||
28 | - | ||
29 | -} //end namespace stim | ||
30 | - | 1 | +#ifndef STIM_CUDA_TIMER |
2 | +#define STIM_CUDA_TIMER | ||
3 | + | ||
4 | +static cudaEvent_t tStartEvent; | ||
5 | +static cudaEvent_t tStopEvent; | ||
6 | + | ||
7 | +namespace stim{ | ||
8 | + | ||
9 | +/// These functions calculate the time between GPU functions in milliseconds | ||
10 | +static void gpuStartTimer() | ||
11 | +{ | ||
12 | + //set up timing events | ||
13 | + cudaEventCreate(&tStartEvent); | ||
14 | + cudaEventCreate(&tStopEvent); | ||
15 | + cudaEventRecord(tStartEvent, 0); | ||
16 | +} | ||
17 | + | ||
18 | +static float gpuStopTimer() | ||
19 | +{ | ||
20 | + cudaEventRecord(tStopEvent, 0); | ||
21 | + cudaEventSynchronize(tStopEvent); | ||
22 | + float elapsedTime; | ||
23 | + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); | ||
24 | + cudaEventDestroy(tStartEvent); | ||
25 | + cudaEventDestroy(tStopEvent); | ||
26 | + return elapsedTime; | ||
27 | +} | ||
28 | + | ||
29 | +} //end namespace stim | ||
30 | + | ||
31 | #endif | 31 | #endif |
32 | \ No newline at end of file | 32 | \ No newline at end of file |
1 | +#ifndef STIM_CUDA_IVOTE_H | ||
2 | +#define STIM_CUDA_IVOTE_H | ||
3 | + | ||
4 | +#include <stim/cuda/ivote/down_sample.cuh> | ||
5 | +#include <stim/cuda/ivote/local_max.cuh> | ||
6 | +#include <stim/cuda/ivote/update_dir.cuh> | ||
7 | +#include <stim/cuda/ivote/vote.cuh> | ||
8 | + | ||
9 | +namespace stim{ | ||
10 | + namespace cuda{ | ||
11 | + | ||
12 | + } | ||
13 | +} | ||
14 | + | ||
15 | + | ||
16 | + | ||
17 | +#endif | ||
0 | \ No newline at end of file | 18 | \ No newline at end of file |
stim/cuda/down_sample.cuh renamed to stim/cuda/ivote/down_sample.cuh
@@ -3,9 +3,8 @@ | @@ -3,9 +3,8 @@ | ||
3 | 3 | ||
4 | #include <iostream> | 4 | #include <iostream> |
5 | #include <cuda.h> | 5 | #include <cuda.h> |
6 | -#include <stim/cuda/devices.h> | ||
7 | -#include <stim/cuda/timer.h> | ||
8 | -#include <stim/cuda/gaussian_blur.cuh> | 6 | +#include <stim/cuda/cudatools.h> |
7 | +#include <stim/cuda/templates/gaussian_blur.cuh> | ||
9 | 8 | ||
10 | namespace stim{ | 9 | namespace stim{ |
11 | namespace cuda{ | 10 | namespace cuda{ |
@@ -51,7 +50,7 @@ namespace stim{ | @@ -51,7 +50,7 @@ namespace stim{ | ||
51 | dim3 threads(max_threads, 1); | 50 | dim3 threads(max_threads, 1); |
52 | dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); | 51 | dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); |
53 | 52 | ||
54 | - stim::cuda::gpu_gaussian_blur_2d<float>(gpuI0, sigma_ds,x ,y); | 53 | + stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y); |
55 | 54 | ||
56 | //resample the image | 55 | //resample the image |
57 | down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | 56 | down_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); |
@@ -89,7 +88,7 @@ namespace stim{ | @@ -89,7 +88,7 @@ namespace stim{ | ||
89 | gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); | 88 | gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); |
90 | 89 | ||
91 | //copy the image data to the GPU | 90 | //copy the image data to the GPU |
92 | - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); | 91 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); |
93 | 92 | ||
94 | cudaFree(gpuI0); | 93 | cudaFree(gpuI0); |
95 | cudeFree(gpuI); | 94 | cudeFree(gpuI); |
stim/cuda/local_max.cuh renamed to stim/cuda/ivote/local_max.cuh
@@ -4,8 +4,7 @@ | @@ -4,8 +4,7 @@ | ||
4 | 4 | ||
5 | # include <iostream> | 5 | # include <iostream> |
6 | # include <cuda.h> | 6 | # include <cuda.h> |
7 | -# include <stim/cuda/devices.h> | ||
8 | -# include <stim/cuda/error.h> | 7 | +#include <stim/cuda/cudatools.h> |
9 | 8 | ||
10 | namespace stim{ | 9 | namespace stim{ |
11 | namespace cuda{ | 10 | namespace cuda{ |
stim/cuda/update_dir.cuh renamed to stim/cuda/ivote/update_dir.cuh
@@ -4,8 +4,7 @@ | @@ -4,8 +4,7 @@ | ||
4 | 4 | ||
5 | # include <iostream> | 5 | # include <iostream> |
6 | # include <cuda.h> | 6 | # include <cuda.h> |
7 | -# include <stim/cuda/devices.h> | ||
8 | -# include <stim/cuda/error.h> | 7 | +#include <stim/cuda/cudatools.h> |
9 | #include <stim/cuda/sharedmem.cuh> | 8 | #include <stim/cuda/sharedmem.cuh> |
10 | 9 | ||
11 | namespace stim{ | 10 | namespace stim{ |
stim/cuda/vote.cuh renamed to stim/cuda/ivote/vote.cuh
@@ -4,8 +4,7 @@ | @@ -4,8 +4,7 @@ | ||
4 | 4 | ||
5 | # include <iostream> | 5 | # include <iostream> |
6 | # include <cuda.h> | 6 | # include <cuda.h> |
7 | -# include <stim/cuda/devices.h> | ||
8 | -# include <stim/cuda/error.h> | 7 | +#include <stim/cuda/cudatools.h> |
9 | #include <stim/cuda/sharedmem.cuh> | 8 | #include <stim/cuda/sharedmem.cuh> |
10 | 9 | ||
11 | 10 |
stim/cuda/conv2.cuh renamed to stim/cuda/templates/conv2.cuh
@@ -3,8 +3,7 @@ | @@ -3,8 +3,7 @@ | ||
3 | 3 | ||
4 | #include <iostream> | 4 | #include <iostream> |
5 | #include <cuda.h> | 5 | #include <cuda.h> |
6 | -#include <stim/cuda/devices.h> | ||
7 | -#include <stim/cuda/error.h> | 6 | +#include <stim/cuda/cudatools.h> |
8 | #include <cmath> | 7 | #include <cmath> |
9 | #include <algorithm> | 8 | #include <algorithm> |
10 | 9 |
stim/cuda/gaussian_blur.cuh renamed to stim/cuda/templates/conv2sep.cuh
1 | -#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | ||
2 | -#define STIM_CUDA_GAUSSIAN_BLUR_H | 1 | +#ifndef STIM_CUDA_CONV2SEP_H |
2 | +#define STIM_CUDA_CONV2SEP_H | ||
3 | 3 | ||
4 | #include <iostream> | 4 | #include <iostream> |
5 | #include <cuda.h> | 5 | #include <cuda.h> |
6 | -#include <stim/cuda/devices.h> | ||
7 | -#include <stim/cuda/timer.h> | 6 | +#include <stim/cuda/cudatools/devices.h> |
7 | +#include <stim/cuda/cudatools/timer.h> | ||
8 | #include <stim/cuda/sharedmem.cuh> | 8 | #include <stim/cuda/sharedmem.cuh> |
9 | +#include <stim/cuda/cudatools/error.h> | ||
9 | 10 | ||
10 | #define pi 3.14159 | 11 | #define pi 3.14159 |
11 | 12 | ||
@@ -13,12 +14,13 @@ namespace stim{ | @@ -13,12 +14,13 @@ namespace stim{ | ||
13 | namespace cuda{ | 14 | namespace cuda{ |
14 | 15 | ||
15 | template<typename T> | 16 | template<typename T> |
16 | - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | 17 | + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, |
18 | + T* kernel0, unsigned int k0){ | ||
17 | 19 | ||
18 | //generate a pointer to shared memory (size will be specified as a kernel parameter) | 20 | //generate a pointer to shared memory (size will be specified as a kernel parameter) |
19 | extern __shared__ T s[]; | 21 | extern __shared__ T s[]; |
20 | 22 | ||
21 | - int kr = sigma * 4; //calculate the kernel radius | 23 | + int kr = k0/2; //calculate the kernel radius |
22 | 24 | ||
23 | //get a pointer to the gaussian in memory | 25 | //get a pointer to the gaussian in memory |
24 | T* g = (T*)&s[blockDim.x + 2 * kr]; | 26 | T* g = (T*)&s[blockDim.x + 2 * kr]; |
@@ -30,30 +32,25 @@ namespace stim{ | @@ -30,30 +32,25 @@ namespace stim{ | ||
30 | //copy the portion of the image necessary for this block to shared memory | 32 | //copy the portion of the image necessary for this block to shared memory |
31 | stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | 33 | stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); |
32 | 34 | ||
33 | - //calculate the thread index and block index | 35 | + //calculate the thread index |
34 | int ti = threadIdx.x; | 36 | int ti = threadIdx.x; |
35 | 37 | ||
36 | //calculate the spatial coordinate for this thread | 38 | //calculate the spatial coordinate for this thread |
37 | int xi = bxi + ti; | 39 | int xi = bxi + ti; |
40 | + int yi = byi; | ||
38 | 41 | ||
39 | - //pre-compute the gaussian values for each kernel point | ||
40 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | ||
41 | - T c = - 1.0 / (2*sigma*sigma); | ||
42 | - int ki; | ||
43 | - | ||
44 | - //use the first 2kr+1 threads to evaluate a gaussian and store the result | ||
45 | - if(ti <= 2* kr+1){ | ||
46 | - ki = ti - kr; | ||
47 | - g[ti] = a * exp((ki*ki) * c); | 42 | + |
43 | + //use the first 2kr+1 threads to transfer the kernel to shared memory | ||
44 | + if(ti < k0){ | ||
45 | + g[ti] = kernel0[ti]; | ||
48 | } | 46 | } |
49 | 47 | ||
50 | //make sure that all writing to shared memory is done before continuing | 48 | //make sure that all writing to shared memory is done before continuing |
51 | __syncthreads(); | 49 | __syncthreads(); |
52 | 50 | ||
53 | //if the current pixel is outside of the image | 51 | //if the current pixel is outside of the image |
54 | - if(bxi + ti > x || byi > y) | 52 | + if(xi > x || yi > y) |
55 | return; | 53 | return; |
56 | - | ||
57 | 54 | ||
58 | 55 | ||
59 | //calculate the coordinates of the current thread in shared memory | 56 | //calculate the coordinates of the current thread in shared memory |
@@ -66,7 +63,7 @@ namespace stim{ | @@ -66,7 +63,7 @@ namespace stim{ | ||
66 | for(int ki = -kr; ki <= kr; ki++){ | 63 | for(int ki = -kr; ki <= kr; ki++){ |
67 | sum += g[ki + kr] * s[si + ki]; | 64 | sum += g[ki + kr] * s[si + ki]; |
68 | } | 65 | } |
69 | - | 66 | + |
70 | //calculate the 1D image index for this thread | 67 | //calculate the 1D image index for this thread |
71 | unsigned int i = byi * x + xi; | 68 | unsigned int i = byi * x + xi; |
72 | 69 | ||
@@ -75,12 +72,13 @@ namespace stim{ | @@ -75,12 +72,13 @@ namespace stim{ | ||
75 | } | 72 | } |
76 | 73 | ||
77 | template<typename T> | 74 | template<typename T> |
78 | - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | 75 | + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, |
76 | + T* kernel0, unsigned int k0){ | ||
79 | 77 | ||
80 | //generate a pointer to shared memory (size will be specified as a kernel parameter) | 78 | //generate a pointer to shared memory (size will be specified as a kernel parameter) |
81 | extern __shared__ T s[]; | 79 | extern __shared__ T s[]; |
82 | 80 | ||
83 | - int kr = sigma * 4; //calculate the kernel radius | 81 | + int kr = k0/2; //calculate the kernel radius |
84 | 82 | ||
85 | //get a pointer to the gaussian in memory | 83 | //get a pointer to the gaussian in memory |
86 | T* g = (T*)&s[blockDim.y + 2 * kr]; | 84 | T* g = (T*)&s[blockDim.y + 2 * kr]; |
@@ -92,30 +90,25 @@ namespace stim{ | @@ -92,30 +90,25 @@ namespace stim{ | ||
92 | //copy the portion of the image necessary for this block to shared memory | 90 | //copy the portion of the image necessary for this block to shared memory |
93 | stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | 91 | stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); |
94 | 92 | ||
95 | - //calculate the thread index and block index | 93 | + //calculate the thread index |
96 | int ti = threadIdx.y; | 94 | int ti = threadIdx.y; |
97 | 95 | ||
98 | //calculate the spatial coordinate for this thread | 96 | //calculate the spatial coordinate for this thread |
97 | + int xi = bxi; | ||
99 | int yi = byi + ti; | 98 | int yi = byi + ti; |
100 | 99 | ||
101 | - //pre-compute the gaussian values for each kernel point | ||
102 | - T a = 1.0 / (sigma * sqrt(2 * pi)); | ||
103 | - T c = - 1.0 / (2*sigma*sigma); | ||
104 | - int ki; | ||
105 | - | ||
106 | - //use the first 2kr+1 threads to evaluate a gaussian and store the result | ||
107 | - if(ti <= 2* kr+1){ | ||
108 | - ki = ti - kr; | ||
109 | - g[ti] = a * exp((ki*ki) * c); | 100 | + |
101 | + //use the first 2kr+1 threads to transfer the kernel to shared memory | ||
102 | + if(ti < k0){ | ||
103 | + g[ti] = kernel0[ti]; | ||
110 | } | 104 | } |
111 | 105 | ||
112 | //make sure that all writing to shared memory is done before continuing | 106 | //make sure that all writing to shared memory is done before continuing |
113 | __syncthreads(); | 107 | __syncthreads(); |
114 | 108 | ||
115 | //if the current pixel is outside of the image | 109 | //if the current pixel is outside of the image |
116 | - if(bxi >= x || yi >= y) | 110 | + if(xi > x || yi > y) |
117 | return; | 111 | return; |
118 | - | ||
119 | 112 | ||
120 | 113 | ||
121 | //calculate the coordinates of the current thread in shared memory | 114 | //calculate the coordinates of the current thread in shared memory |
@@ -128,17 +121,55 @@ namespace stim{ | @@ -128,17 +121,55 @@ namespace stim{ | ||
128 | for(int ki = -kr; ki <= kr; ki++){ | 121 | for(int ki = -kr; ki <= kr; ki++){ |
129 | sum += g[ki + kr] * s[si + ki]; | 122 | sum += g[ki + kr] * s[si + ki]; |
130 | } | 123 | } |
131 | - | 124 | + |
132 | //calculate the 1D image index for this thread | 125 | //calculate the 1D image index for this thread |
133 | - unsigned int i = yi * x + bxi; | 126 | + unsigned int i = yi * x + xi; |
134 | 127 | ||
135 | //output the result to global memory | 128 | //output the result to global memory |
136 | out[i] = sum; | 129 | out[i] = sum; |
137 | } | 130 | } |
138 | 131 | ||
139 | - /// Applies a Gaussian blur to a 2D image stored on the GPU | ||
140 | template<typename T> | 132 | template<typename T> |
141 | - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | 133 | + void tex_conv2sep(T* out, unsigned int x, unsigned int y, |
134 | + cudaTextureObject_t texObj, cudaArray* cuArray, | ||
135 | + T* kernel0, unsigned int k0, | ||
136 | + T* kernel1, unsigned int k1){ | ||
137 | + | ||
138 | + //get the maximum number of threads per block for the CUDA device | ||
139 | + int max_threads = stim::maxThreadsPerBlock(); | ||
140 | + dim3 threads(max_threads, 1); | ||
141 | + | ||
142 | + //calculate the number of blocks | ||
143 | + dim3 blocks(x / threads.x + 1, y); | ||
144 | + | ||
145 | + //calculate the shared memory used in the kernel | ||
146 | + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed | ||
147 | + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window | ||
148 | + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window | ||
149 | + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used | ||
150 | + | ||
151 | + //blur the image along the x-axis | ||
152 | + conv2sep_0<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0); | ||
153 | + | ||
154 | + // Copy the x-blurred data from global memory to the texture | ||
155 | + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T), | ||
156 | + cudaMemcpyDeviceToDevice); | ||
157 | + | ||
158 | + //transpose the block and thread dimensions | ||
159 | + threads.x = 1; | ||
160 | + threads.y = max_threads; | ||
161 | + blocks.x = x; | ||
162 | + blocks.y = y / threads.y + 1; | ||
163 | + | ||
164 | + //blur the image along the y-axis | ||
165 | + conv2sep_1<T> <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1); | ||
166 | + | ||
167 | + } | ||
168 | + | ||
169 | + template<typename T> | ||
170 | + void gpu_conv2sep(T* image, unsigned int x, unsigned int y, | ||
171 | + T* kernel0, unsigned int k0, | ||
172 | + T* kernel1, unsigned int k1){ | ||
142 | 173 | ||
143 | //get the number of pixels in the image | 174 | //get the number of pixels in the image |
144 | unsigned int pixels = x * y; | 175 | unsigned int pixels = x * y; |
@@ -176,36 +207,9 @@ namespace stim{ | @@ -176,36 +207,9 @@ namespace stim{ | ||
176 | cudaTextureObject_t texObj = 0; | 207 | cudaTextureObject_t texObj = 0; |
177 | cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | 208 | cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); |
178 | 209 | ||
210 | + //call the texture version of the separable convolution function | ||
211 | + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1); | ||
179 | 212 | ||
180 | - //get the maximum number of threads per block for the CUDA device | ||
181 | - int max_threads = stim::maxThreadsPerBlock(); | ||
182 | - dim3 threads(max_threads, 1); | ||
183 | - | ||
184 | - //calculate the number of blocks | ||
185 | - dim3 blocks(x / threads.x + 1, y); | ||
186 | - | ||
187 | - //calculate the shared memory used in the kernel | ||
188 | - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed | ||
189 | - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window | ||
190 | - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window | ||
191 | - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used | ||
192 | - | ||
193 | - //blur the image along the x-axis | ||
194 | - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | ||
195 | - | ||
196 | - // Copy the x-blurred data from global memory to the texture | ||
197 | - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | ||
198 | - cudaMemcpyDeviceToDevice); | ||
199 | - | ||
200 | - //transpose the block and thread dimensions | ||
201 | - threads.x = 1; | ||
202 | - threads.y = max_threads; | ||
203 | - blocks.x = x; | ||
204 | - blocks.y = y / threads.y + 1; | ||
205 | - | ||
206 | - //blur the image along the y-axis | ||
207 | - gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | ||
208 | - | ||
209 | //free allocated memory | 213 | //free allocated memory |
210 | cudaFree(cuArray); | 214 | cudaFree(cuArray); |
211 | 215 | ||
@@ -213,22 +217,35 @@ namespace stim{ | @@ -213,22 +217,35 @@ namespace stim{ | ||
213 | 217 | ||
214 | /// Applies a Gaussian blur to a 2D image stored on the CPU | 218 | /// Applies a Gaussian blur to a 2D image stored on the CPU |
215 | template<typename T> | 219 | template<typename T> |
216 | - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | 220 | + void cpu_conv2sep(T* image, unsigned int x, unsigned int y, |
221 | + T* kernel0, unsigned int k0, | ||
222 | + T* kernel1, unsigned int k1){ | ||
217 | 223 | ||
218 | //get the number of pixels in the image | 224 | //get the number of pixels in the image |
219 | unsigned int pixels = x * y; | 225 | unsigned int pixels = x * y; |
220 | unsigned int bytes = sizeof(T) * pixels; | 226 | unsigned int bytes = sizeof(T) * pixels; |
221 | 227 | ||
222 | - //allocate space on the GPU | 228 | + //---------Allocate Image--------- |
229 | + //allocate space on the GPU for the image | ||
223 | T* gpuI0; | 230 | T* gpuI0; |
224 | - cudaMalloc(&gpuI0, bytes); | ||
225 | - | 231 | + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes)); |
226 | 232 | ||
227 | //copy the image data to the GPU | 233 | //copy the image data to the GPU |
228 | - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | 234 | + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice)); |
235 | + | ||
236 | + //---------Allocate Kernel-------- | ||
237 | + //allocate and copy the 0 (x) kernel | ||
238 | + T* gpuK0; | ||
239 | + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T))); | ||
240 | + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice)); | ||
241 | + | ||
242 | + //allocate and copy the 1 (y) kernel | ||
243 | + T* gpuK1; | ||
244 | + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T))); | ||
245 | + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice)); | ||
229 | 246 | ||
230 | //run the GPU-based version of the algorithm | 247 | //run the GPU-based version of the algorithm |
231 | - gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); | 248 | + gpu_conv2sep<T>(gpuI0, x, y, gpuK0, k0, gpuK1, k1); |
232 | 249 | ||
233 | //copy the image data from the device | 250 | //copy the image data from the device |
234 | cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | 251 | cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); |
1 | +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | ||
2 | +#define STIM_CUDA_GAUSSIAN_BLUR_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/cudatools.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include <stim/cuda/templates/conv2sep.cuh> //GPU-based separable convolution algorithm | ||
9 | + | ||
10 | +#define pi 3.14159 | ||
11 | + | ||
12 | +namespace stim{ | ||
13 | + namespace cuda{ | ||
14 | + | ||
15 | + template<typename T> | ||
16 | + void gen_gaussian(T* out, T sigma, unsigned int width){ | ||
17 | + | ||
18 | + //fill the kernel with a gaussian | ||
19 | + for(unsigned int xi = 0; xi < width; xi++){ | ||
20 | + | ||
21 | + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian | ||
22 | + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); | ||
23 | + out[xi] = g; | ||
24 | + } | ||
25 | + | ||
26 | + } | ||
27 | + | ||
28 | + template<typename T> | ||
29 | + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ | ||
30 | + | ||
31 | + //allocate space for the kernel | ||
32 | + unsigned int kwidth = sigma * 8 + 1; | ||
33 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
34 | + | ||
35 | + //fill the kernel with a gaussian | ||
36 | + gen_gaussian(kernel0, sigma, kwidth); | ||
37 | + | ||
38 | + //copy the kernel to the GPU | ||
39 | + T* gpuKernel0; | ||
40 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | ||
41 | + | ||
42 | + //perform the gaussian blur as a separable convolution | ||
43 | + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); | ||
44 | + | ||
45 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | ||
46 | + | ||
47 | + } | ||
48 | + | ||
49 | + template<typename T> | ||
50 | + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | ||
51 | + | ||
52 | + //allocate space for the kernel | ||
53 | + unsigned int kwidth = sigma * 8 + 1; | ||
54 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
55 | + | ||
56 | + //fill the kernel with a gaussian | ||
57 | + gen_gaussian(kernel0, sigma, kwidth); | ||
58 | + | ||
59 | + //copy the kernel to the GPU | ||
60 | + T* gpuKernel0; | ||
61 | + HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T))); | ||
62 | + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); | ||
63 | + | ||
64 | + //perform the gaussian blur as a separable convolution | ||
65 | + stim::cuda::gpu_conv2sep<float>(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); | ||
66 | + | ||
67 | + HANDLE_ERROR(cudaFree(gpuKernel0)); | ||
68 | + | ||
69 | + } | ||
70 | + | ||
71 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | ||
72 | + template<typename T> | ||
73 | + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ | ||
74 | + | ||
75 | + //allocate space for the kernel | ||
76 | + unsigned int kwidth = sigma * 8 + 1; | ||
77 | + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); | ||
78 | + | ||
79 | + //fill the kernel with a gaussian | ||
80 | + gen_gaussian(kernel0, sigma, kwidth); | ||
81 | + | ||
82 | + //perform the gaussian blur as a separable convolution | ||
83 | + stim::cuda::cpu_conv2sep<float>(image, x, y, kernel0, kwidth, kernel0, kwidth); | ||
84 | + | ||
85 | + } | ||
86 | + | ||
87 | + }; | ||
88 | +}; | ||
89 | + | ||
90 | +#endif | ||
0 | \ No newline at end of file | 91 | \ No newline at end of file |
stim/cuda/gradient.cuh renamed to stim/cuda/templates/gradient.cuh
stim/gl/gl_spider.h
@@ -14,7 +14,7 @@ | @@ -14,7 +14,7 @@ | ||
14 | #include "stim/math/rect.h" | 14 | #include "stim/math/rect.h" |
15 | #include "stim/math/matrix.h" | 15 | #include "stim/math/matrix.h" |
16 | #include "stim/cuda/cost.h" | 16 | #include "stim/cuda/cost.h" |
17 | -#include "stim/cuda/glbind.h" | 17 | +#include <stim/cuda/cudatools/glbind.h> |
18 | #include <stim/visualization/obj.h> | 18 | #include <stim/visualization/obj.h> |
19 | #include <vector> | 19 | #include <vector> |
20 | 20 |
stim/image/image.h
@@ -174,7 +174,7 @@ public: | @@ -174,7 +174,7 @@ public: | ||
174 | 174 | ||
175 | 175 | ||
176 | /// Returns the maximum pixel value in the image | 176 | /// Returns the maximum pixel value in the image |
177 | - T max(){ | 177 | + T maxv(){ |
178 | float max = 0; | 178 | float max = 0; |
179 | unsigned long N = width() * height(); //get the number of pixels | 179 | unsigned long N = width() * height(); //get the number of pixels |
180 | 180 | ||
@@ -190,7 +190,7 @@ public: | @@ -190,7 +190,7 @@ public: | ||
190 | } | 190 | } |
191 | 191 | ||
192 | /// Returns the minimum pixel value in the image | 192 | /// Returns the minimum pixel value in the image |
193 | - T min(){ | 193 | + T minv(){ |
194 | float min = 0; | 194 | float min = 0; |
195 | unsigned long N = width() * height(); //get the number of pixels | 195 | unsigned long N = width() * height(); //get the number of pixels |
196 | 196 |
stim/math/matrix.h
@@ -5,7 +5,7 @@ | @@ -5,7 +5,7 @@ | ||
5 | #include <string.h> | 5 | #include <string.h> |
6 | #include <iostream> | 6 | #include <iostream> |
7 | #include <stim/math/vector.h> | 7 | #include <stim/math/vector.h> |
8 | -#include "../cuda/callable.h" | 8 | +#include <stim/cuda/cudatools/callable.h> |
9 | 9 | ||
10 | namespace stim{ | 10 | namespace stim{ |
11 | 11 |
stim/math/quaternion.h
stim/math/rect.h
@@ -2,7 +2,7 @@ | @@ -2,7 +2,7 @@ | ||
2 | #define RTS_RECT_H | 2 | #define RTS_RECT_H |
3 | 3 | ||
4 | //enable CUDA_CALLABLE macro | 4 | //enable CUDA_CALLABLE macro |
5 | -#include <stim/cuda/callable.h> | 5 | +#include <stim/cuda/cudatools/callable.h> |
6 | #include <stim/math/vector.h> | 6 | #include <stim/math/vector.h> |
7 | #include <stim/math/triangle.h> | 7 | #include <stim/math/triangle.h> |
8 | #include <stim/math/quaternion.h> | 8 | #include <stim/math/quaternion.h> |
stim/math/triangle.h
@@ -2,7 +2,7 @@ | @@ -2,7 +2,7 @@ | ||
2 | #define RTS_TRIANGLE_H | 2 | #define RTS_TRIANGLE_H |
3 | 3 | ||
4 | //enable CUDA_CALLABLE macro | 4 | //enable CUDA_CALLABLE macro |
5 | -#include <stim/cuda/callable.h> | 5 | +#include <stim/cuda/cudatools/callable.h> |
6 | #include <stim/math/vector.h> | 6 | #include <stim/math/vector.h> |
7 | #include <iostream> | 7 | #include <iostream> |
8 | 8 |
stim/math/vector.h
stim/visualization/colormap.h
@@ -4,12 +4,12 @@ | @@ -4,12 +4,12 @@ | ||
4 | #include <string> | 4 | #include <string> |
5 | #include <stdlib.h> | 5 | #include <stdlib.h> |
6 | #ifdef __CUDACC__ | 6 | #ifdef __CUDACC__ |
7 | -#include "../cuda/error.h" | 7 | +#include <stim/cuda/cudatools/error.h> |
8 | #endif | 8 | #endif |
9 | 9 | ||
10 | //saving an image to a file uses the CImg library | 10 | //saving an image to a file uses the CImg library |
11 | //this currently throws a lot of "unreachable" warnings (as of GCC 4.8.2, nvcc 6.5.12) | 11 | //this currently throws a lot of "unreachable" warnings (as of GCC 4.8.2, nvcc 6.5.12) |
12 | -#include "../image/image.h" | 12 | +#include <stim/image/image.h> |
13 | 13 | ||
14 | 14 | ||
15 | #define BREWER_CTRL_PTS 11 | 15 | #define BREWER_CTRL_PTS 11 |