Commit 6b747d25f8386ac98393a6ae14a4eb1803c8ffa3
Merge branch 'bsds500' of git.stim.ee.uh.edu:codebase/stimlib
Showing 3 changed files with 220 additions and 0 deletions

stim/cuda/array_add.cuh
+#ifndef STIM_CUDA_ARRAY_ADD_H
+#define STIM_CUDA_ARRAY_ADD_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                sum[idx] = ptr1[idx] + ptr2[idx];
+            }
+
+        }
+
+        template<typename T>
+        void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks needed to cover all N elements
+            int blocks = N / threads + (N % threads == 0 ? 0 : 1);
+
+            //call the kernel to do the addition
+            cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);
+
+        }
+
+        template<typename T>
+        void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_ptr2;
+            T* gpu_sum;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) );
+
+            //copy the arrays to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_ptr2);
+            cudaFree(gpu_sum);
+
+        }
+
+    }
+}
+
+
+
+#endif
\ No newline at end of file
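A host-side caller for this header might look like the following minimal sketch; the include path is an assumption based on the header guard and the sibling headers in stim/cuda/. cpu_add handles all device allocation, transfers, and cleanup internally, so the caller only supplies host arrays:

#include <stim/cuda/array_add.cuh>  //assumed path, matching the STIM_CUDA_ARRAY_ADD_H guard
#include <vector>
#include <iostream>

int main(){
    unsigned int N = 1024;                              //hypothetical array size
    std::vector<float> a(N, 1.0f), b(N, 2.0f), sum(N);

    //add the two arrays on the GPU and copy the result back into sum
    stim::cuda::cpu_add(a.data(), b.data(), sum.data(), N);

    std::cout << "sum[0] = " << sum[0] << std::endl;    //prints 3
    return 0;
}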
stim/cuda/arraymath.cuh
@@ -5,6 +5,8 @@
 #include <stim/cuda/array_multiply.cuh>
 #include <stim/cuda/array_abs.cuh>
 #include <stim/cuda/array_cart2polar.cuh>
+#include <stim/cuda/gaussian_blur.cuh>
+#include <stim/cuda/conv2.cuh>
 
 namespace stim{
     namespace cuda{
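With these two additions, arraymath.cuh keeps acting as the umbrella header for the array kernels, so downstream code needs only a single include. A minimal sketch, assuming the stim source root is on the compiler's include path:

#include <stim/cuda/arraymath.cuh>  //now also pulls in the conv2 and gaussian_blur kernels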
stim/cuda/conv2.cuh

+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <cmath>
+#include <algorithm>
+#include <cstring>      //for memset
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
+
+            //the radius of the mask
+            int r = (M - 1) / 2;
+
+            //convert the 1D thread index into 2D image coordinates
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            int j = blockIdx.y;
+
+            //1D index of this pixel in the output image
+            int idx = j * w + i;
+
+            if(i < w && j < h){
+
+                //accumulate the convolution result for this pixel
+                T sum = 0;
+
+                for (int y = j - r; y <= j + r; y++){
+
+                    for (int x = i - r; x <= i + r; x++){
+
+                        //map image coordinates (x, y) to mask coordinates (xx, yy)
+                        unsigned int xx = x - (i - r);
+                        unsigned int yy = y - (j - r);
+
+                        //fetch the pixel through the texture; clamp addressing handles out-of-bounds reads at the border
+                        sum += tex2D<float>(texObj, x, y) * mask[yy * M + xx];
+                    }
+                }
+                copy[idx] = sum;
+            }
+        }
+
+
+        template<typename T>
+        void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){
+
+            unsigned long N = w * h;
+
+            //define a channel descriptor for a single 32-bit float channel
+            //(the texture setup assumes T is a 32-bit float)
+            cudaChannelFormatDesc channelDesc =
+                cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                                             //declare the cuda array
+            HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, w, h) );  //allocate the cuda array
+
+            //copy the image data from global memory to the array
+            HANDLE_ERROR( cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice) );
+
+            //specify the texture resource
+            struct cudaResourceDesc resDesc;                //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));           //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;        //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;              //add a pointer to the cuda array
+
+            //specify texture object parameters
+            struct cudaTextureDesc texDesc;                 //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));           //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeClamp;  //clamp reads at the image edges (wrap is unsupported with unnormalized coordinates)
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.filterMode = cudaFilterModePoint;       //use point (nearest-neighbor) sampling
+            texDesc.readMode = cudaReadModeElementType;     //read data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 0;                   //not using normalized coordinates
+
+            //create the texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks: one block row per image row
+            dim3 blocks(w / threads + 1, h);
+
+            //call the kernel to do the convolution
+            cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
+
+            //wait for the kernel to finish, then release the texture resources
+            cudaDeviceSynchronize();
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+
+        }
+
+        template<typename T>
+        void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){
+
+            unsigned long N = w * h;
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_img;
+            T* gpu_mask;
+            T* gpu_copy;
+            HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_mask, M * M * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
+
+            //copy the image and the mask to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, M * M * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_conv2<T>(gpu_img, gpu_mask, gpu_copy, w, h, M);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_img);
+            cudaFree(gpu_mask);
+            cudaFree(gpu_copy);
+
+        }
+
+    }
+}
+
+
+#endif
\ No newline at end of file
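As with the array_add example above, the cpu_ entry point owns all device-side memory management. A minimal host-side sketch follows; the image size and the 3x3 averaging mask are hypothetical, and only T = float works in practice since the kernel fetches pixels through a 32-bit float texture:

#include <stim/cuda/conv2.cuh>
#include <vector>

int main(){
    unsigned int w = 512, h = 512;                      //hypothetical image dimensions
    unsigned M = 3;                                     //mask width (must be odd)
    std::vector<float> img(w * h, 1.0f);                //flat, row-major image
    std::vector<float> mask(M * M, 1.0f / (M * M));     //3x3 averaging (box) mask
    std::vector<float> out(w * h);

    //cpu_conv2 uploads img and mask, runs the texture-based kernel, and copies the result back
    stim::cuda::cpu_conv2(img.data(), mask.data(), out.data(), w, h, M);

    return 0;
}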