Commit 348f8ab9945f81ca275a3d51e5586ba618e0b7db
1 parent
c0fc9427
non-separable 2D convolution
Showing
1 changed file
with
147 additions
and
0 deletions
Show diff stats
1 | +#ifndef STIM_CUDA_CONV2_H | |
2 | +#define STIM_CUDA_CONV2_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | +#include <cmath> | |
9 | +#include <algorithm> | |
10 | + | |
11 | +namespace stim{ | |
12 | + namespace cuda{ | |
13 | + | |
template<typename T>
//Kernel: non-separable 2D convolution of a (w x h) image with an (M x M) mask.
//Launch layout: one thread per output pixel — gridDim.x * blockDim.x covers a row,
//gridDim.y == h (one block row per image row). M is assumed odd.
//The image is read through texObj; its wrap address mode handles out-of-bounds taps.
//img is unused here (the texture holds the image data) but kept for interface stability.
__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){

	//half-width (radius) of the mask; signed so that (i - r) can go negative
	//near the image border without unsigned wrap-around
	int r = ((int)M - 1) / 2;

	//2D image coordinates for this thread
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y;

	//1D index of the output pixel
	int idx = j * w + i;

	if(i < (int)w && j < (int)h){

		//accumulate the weighted sum over the mask footprint centered at (i, j)
		T sum = 0;

		for (int y = j - r; y <= j + r; y++){

			for (int x = i - r; x <= i + r; x++){

				//convert image coordinates (x, y) to mask coordinates (xx, yy)
				unsigned int xx = x - (i - r);
				unsigned int yy = y - (j - r);

				//texture fetch wraps at the edges (cudaAddressModeWrap)
				sum += tex2D<float>(texObj, x, y) * mask[yy * M + xx];
			}
		}
		copy[idx] = sum;
	}
}
63 | + | |
64 | + | |
template<typename T>
//Convolves a (w x h) device image with an (M x M) device mask, writing the
//result to the device buffer copy. The image is staged into a CUDA array and
//sampled through a texture object so out-of-bounds taps wrap around the edges.
//All pointers must be device pointers. Blocks until the kernel completes.
void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){

	unsigned long N = w * h;		//number of image pixels

	//define a channel descriptor for a single 32-bit float channel
	cudaChannelFormatDesc channelDesc =
		cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
	cudaArray* cuArray;			//declare the cuda array
	HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, w, h) );	//allocate the cuda array

	//copy the image data from global memory to the array
	//NOTE(review): cudaMemcpyToArray is deprecated in newer toolkits — migrate to cudaMemcpy2DToArray
	HANDLE_ERROR( cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T),
					cudaMemcpyDeviceToDevice) );

	//specify the texture resource
	struct cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));			//zero all fields
	resDesc.resType = cudaResourceTypeArray;		//the resource is a CUDA array
	resDesc.res.array.array = cuArray;			//attach the array

	//specify texture object parameters
	struct cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(texDesc));			//zero all fields
	texDesc.addressMode[0] = cudaAddressModeWrap;		//wrap around the image edges
	texDesc.addressMode[1] = cudaAddressModeWrap;
	texDesc.filterMode = cudaFilterModePoint;		//nearest-neighbor sampling (no interpolation)
	texDesc.readMode = cudaReadModeElementType;		//return raw 32-bit float elements
	texDesc.normalizedCoords = 0;				//unnormalized (texel) coordinates

	//create the texture object
	cudaTextureObject_t texObj = 0;
	HANDLE_ERROR( cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL) );

	//get the maximum number of threads per block for the CUDA device
	int threads = stim::maxThreadsPerBlock();

	//one thread per pixel: ceil(w / threads) blocks across, one block row per image row
	dim3 blocks((w + threads - 1) / threads, h);

	//launch the convolution kernel
	cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
	HANDLE_ERROR( cudaGetLastError() );			//catch launch-configuration errors

	//wait for the kernel to finish before releasing the texture resources it reads
	HANDLE_ERROR( cudaDeviceSynchronize() );
	HANDLE_ERROR( cudaDestroyTextureObject(texObj) );
	HANDLE_ERROR( cudaFreeArray(cuArray) );
}
113 | + | |
template<typename T>
//Host-side wrapper: copies img (w x h) and mask (M x M) from host memory to the
//GPU, runs the convolution via gpu_conv2, and copies the result into cpu_copy.
//img, mask, and cpu_copy are host pointers; cpu_copy must hold w*h elements.
void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){

	unsigned long N = w * h;			//number of image pixels
	unsigned long M2 = (unsigned long)M * M;	//number of mask elements (integer math, not pow())

	//allocate memory on the GPU
	T* gpu_img;
	T* gpu_mask;
	T* gpu_copy;
	HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
	HANDLE_ERROR( cudaMalloc( &gpu_mask, M2 * sizeof(T) ) );
	HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );

	//copy the image and the mask to the GPU
	HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
	HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, M2 * sizeof(T), cudaMemcpyHostToDevice) );

	//call the GPU version of this function
	gpu_conv2<T>(gpu_img, gpu_mask, gpu_copy, w, h, M);

	//copy the result back to the CPU (blocking copy — synchronizes with the kernel)
	HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );

	//free allocated device memory
	HANDLE_ERROR( cudaFree(gpu_img) );
	HANDLE_ERROR( cudaFree(gpu_mask) );
	HANDLE_ERROR( cudaFree(gpu_copy) );
}
142 | + | |
143 | + } | |
144 | +} | |
145 | + | |
146 | + | |
147 | +#endif | |
0 | 148 | \ No newline at end of file | ... | ... |