Blame view

stim/cuda/templates/conv2.cuh 4.21 KB
348f8ab9   Tianshu Cheng   inseparable 2D co...
1
2
3
4
5
  #ifndef STIM_CUDA_CONV2_H
  #define STIM_CUDA_CONV2_H
  
  #include <iostream>
  #include <cuda.h>
96f9b10f   Laila Saadatifard   change the header...
6
  #include <stim/cuda/cudatools.h>
348f8ab9   Tianshu Cheng   inseparable 2D co...
7
8
9
10
11
12
13
  #include <cmath>
  #include <algorithm>
  
  namespace stim{
  	namespace cuda{
  
  		template<typename T>
f186dbda   Tianshu Cheng   header file for b...
14
  		__global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
348f8ab9   Tianshu Cheng   inseparable 2D co...
15
16
17
  
  
  			//the radius of mask
aa1bc80d   David Mayerich   fixed the signed/...
18
  			int r = (M - 1)/2;
348f8ab9   Tianshu Cheng   inseparable 2D co...
19
20
21
22
23
24
25
26
27
28
29
30
31
32
  
  
  			//calculate the 1D index for this thread
  			//int idx = blockIdx.x * blockDim.x + threadIdx.x;
  
  			//change 1D index to 2D cordinates
  			int i = blockIdx.x * blockDim.x + threadIdx.x;
  			int j = blockIdx.y;
  
  			int idx = j * w + i;
  			//unsigned long N = w * h;
  
  			if(i < w && j < h){
  
aa1bc80d   David Mayerich   fixed the signed/...
33
  				//copy[idx] = tex2D<float>(texObj, i+100, j+100);
15f80db6   Tianshu Cheng   add image_contour...
34
  				//return;
348f8ab9   Tianshu Cheng   inseparable 2D co...
35
  
5343a315   Tianshu Cheng   make changes to i...
36
  				tex2D<float>(texObj, (float)i/w, (float)j/h);
348f8ab9   Tianshu Cheng   inseparable 2D co...
37
38
39
40
41
42
43
44
45
46
47
48
49
  
  				//allocate memory for result
  				T sum = 0;
  
  				//for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){
  
  					//for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){
  
  				for (int y = j - r; y <= j + r; y++){
  
  					for (int x = i - r; x <= i + r; x++){
  
  						//idx to mask cordinates(xx, yy)
aa1bc80d   David Mayerich   fixed the signed/...
50
51
  						int xx = x - (i - r);
  						int yy = y - (j - r);
348f8ab9   Tianshu Cheng   inseparable 2D co...
52
  
5343a315   Tianshu Cheng   make changes to i...
53
  						sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
348f8ab9   Tianshu Cheng   inseparable 2D co...
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  					}		
  				}
  				copy[idx] = sum;
  			 }
  		}
  		
  
  		template<typename T>
  		void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){
  
  			unsigned long N = w * h;
  
  			// Allocate CUDA array in device memory
  			
  			//define a channel descriptor for a single 32-bit channel
  			cudaChannelFormatDesc channelDesc =
  					   cudaCreateChannelDesc(32, 0, 0, 0,
  											 cudaChannelFormatKindFloat);
  			cudaArray* cuArray;												//declare the cuda array
  			cudaMallocArray(&cuArray, &channelDesc, w, h);			//allocate the cuda array
  
  			// Copy the image data from global memory to the array
  			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T),
  							  cudaMemcpyDeviceToDevice);
  
  			// Specify texture
  			struct cudaResourceDesc resDesc;				//create a resource descriptor
  			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
  			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
  			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
  
  			// Specify texture object parameters
  			struct cudaTextureDesc texDesc;							//create a texture descriptor
  			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
5343a315   Tianshu Cheng   make changes to i...
88
89
  			texDesc.addressMode[0]   = cudaAddressModeClamp;			//use wrapping (around the edges)
  			texDesc.addressMode[1]   = cudaAddressModeClamp;
348f8ab9   Tianshu Cheng   inseparable 2D co...
90
91
  			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
  			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
5343a315   Tianshu Cheng   make changes to i...
92
  			texDesc.normalizedCoords = 1;							//using normalized coordinates
348f8ab9   Tianshu Cheng   inseparable 2D co...
93
94
95
96
97
98
99
100
101
102
103
104
  
  			// Create texture object
  			cudaTextureObject_t texObj = 0;
  			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  
  			//get the maximum number of threads per block for the CUDA device
  			int threads = stim::maxThreadsPerBlock();
  
  			//calculate the number of blocks
  			dim3 blocks(w / threads + 1, h);
  
  			//call the kernel to do the multiplication
f186dbda   Tianshu Cheng   header file for b...
105
  			cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M);
348f8ab9   Tianshu Cheng   inseparable 2D co...
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  
  		}
  
  		template<typename T>
  		void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){
  
  			unsigned long N = w * h;
  			//allocate memory on the GPU for the array
  			T* gpu_img; 
  			T* gpu_mask; 
  			T* gpu_copy;
  			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
  			HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) );
  			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
  
  			//copy the array to the GPU
  			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
  			HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) );
  
  			//call the GPU version of this function
  			gpu_conv2<T>(gpu_img, gpu_mask ,gpu_copy, w, h, M);
  
  			//copy the array back to the CPU
  			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
  
  			//free allocated memory
  			cudaFree(gpu_img);
  			cudaFree(gpu_mask);
  			cudaFree(gpu_copy);
  
  		}
  		
  	}
  }
  
  
  #endif