Commit f186dbdaad1858d7e18d4b35a1e099ef81ff1f42
Parent: 3173e54d
header file for bsds500

12 changed files with 628 additions and 10 deletions
stim/cuda/arraymath.cuh
@@ -3,6 +3,11 @@
 
 #include <stim/cuda/arraymath/array_add.cuh>
 #include <stim/cuda/arraymath/array_multiply.cuh>
+#include <stim/cuda/arraymath/array_multiply2.cuh>
+#include <stim/cuda/arraymath/array_divide.cuh>
+#include <stim/cuda/arraymath/array_cos.cuh>
+#include <stim/cuda/arraymath/array_sin.cuh>
+#include <stim/cuda/arraymath/array_atan.cuh>
 #include <stim/cuda/arraymath/array_abs.cuh>
 #include <stim/cuda/arraymath/array_cart2polar.cuh>
stim/cuda/arraymath/array_atan.cuh (new file)
+#ifndef STIM_CUDA_ARRAY_ATAN_H
+#define STIM_CUDA_ARRAY_ATAN_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_atan(T* ptr1, T* out, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                out[idx] = atan(ptr1[idx]);
+            }
+        }
+
+        template<typename T>
+        void gpu_atan(T* ptr1, T* out, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + 1;
+
+            //call the kernel to compute the arctangent
+            cuda_atan <<< blocks, threads >>>(ptr1, out, N);
+        }
+
+        template<typename T>
+        void cpu_atan(T* ptr1, T* cpu_out, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_out;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
+
+            //copy the input array to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_atan<T>(gpu_ptr1, gpu_out, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_out);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
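Each of the new arraymath headers follows the same three-layer pattern: a cuda_ kernel, a gpu_ wrapper that sizes the launch, and a cpu_ wrapper that handles allocation and transfer. A minimal host-side sketch of the unary functions (compiled with nvcc; the array contents here are purely illustrative):

    #include <cstdlib>
    #include <stim/cuda/arraymath.cuh>

    int main(){
        unsigned int N = 1024;
        float* in = (float*) malloc(N * sizeof(float));
        float* out = (float*) malloc(N * sizeof(float));
        for(unsigned int i = 0; i < N; i++)
            in[i] = (float)i / N;                    //sample values in [0, 1)
        stim::cuda::cpu_atan<float>(in, out, N);     //cpu_sin and cpu_cos are called the same way
        free(in);
        free(out);
        return 0;
    }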
stim/cuda/arraymath/array_cos.cuh (new file)
+#ifndef STIM_CUDA_ARRAY_COS_H
+#define STIM_CUDA_ARRAY_COS_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_cos(T* ptr1, T* out, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                out[idx] = cos(ptr1[idx]);
+            }
+        }
+
+        template<typename T>
+        void gpu_cos(T* ptr1, T* out, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + 1;
+
+            //call the kernel to compute the cosine
+            cuda_cos <<< blocks, threads >>>(ptr1, out, N);
+        }
+
+        template<typename T>
+        void cpu_cos(T* ptr1, T* cpu_out, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_out;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
+
+            //copy the input array to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_cos<T>(gpu_ptr1, gpu_out, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_out);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
stim/cuda/arraymath/array_divide.cuh (new file)
+#ifndef STIM_CUDA_ARRAY_DIVIDE_H
+#define STIM_CUDA_ARRAY_DIVIDE_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                quotient[idx] = ptr1[idx] / ptr2[idx];
+            }
+        }
+
+        template<typename T>
+        void gpu_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + 1;
+
+            //call the kernel to do the division
+            cuda_divide <<< blocks, threads >>>(ptr1, ptr2, quotient, N);
+        }
+
+        template<typename T>
+        void cpu_divide(T* ptr1, T* ptr2, T* cpu_quotient, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_ptr2;
+            T* gpu_quotient;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_quotient, N * sizeof(T) ) );
+
+            //copy the input arrays to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_divide<T>(gpu_ptr1, gpu_ptr2, gpu_quotient, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_quotient, gpu_quotient, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_ptr2);
+            cudaFree(gpu_quotient);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
stim/cuda/arraymath/array_multiply2.cuh (new file)
+#ifndef STIM_CUDA_ARRAY_MULTIPLY2_H
+#define STIM_CUDA_ARRAY_MULTIPLY2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                product[idx] = ptr1[idx] * ptr2[idx];
+            }
+        }
+
+        template<typename T>
+        void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + 1;
+
+            //call the kernel to do the multiplication
+            cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);
+        }
+
+        template<typename T>
+        void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_ptr2;
+            T* gpu_product;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_product, N * sizeof(T) ) );
+
+            //copy the input arrays to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_multiply<T>(gpu_ptr1, gpu_ptr2, gpu_product, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_ptr2);
+            cudaFree(gpu_product);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
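The two binary headers (array_divide and array_multiply2) take a second input array. A sketch under the same assumptions as the earlier example, with illustrative values:

    unsigned int N = 1024;
    float* a = (float*) malloc(N * sizeof(float));
    float* b = (float*) malloc(N * sizeof(float));
    float* c = (float*) malloc(N * sizeof(float));
    for(unsigned int i = 0; i < N; i++){
        a[i] = (float)(i + 1);
        b[i] = 2.0f;
    }
    stim::cuda::cpu_multiply<float>(a, b, c, N);    //element-wise product; cpu_divide is analogous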
stim/cuda/arraymath/array_sin.cuh (new file)
+#ifndef STIM_CUDA_ARRAY_SIN_H
+#define STIM_CUDA_ARRAY_SIN_H
+
+#include <iostream>
+#include <cuda.h>
+#include <cmath>
+#include <stim/cuda/cudatools.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_sin(T* ptr1, T* out, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                out[idx] = sin(ptr1[idx]);
+            }
+        }
+
+        template<typename T>
+        void gpu_sin(T* ptr1, T* out, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks
+            int blocks = N / threads + 1;
+
+            //call the kernel to compute the sine
+            cuda_sin <<< blocks, threads >>>(ptr1, out, N);
+        }
+
+        template<typename T>
+        void cpu_sin(T* ptr1, T* cpu_out, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_out;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
+
+            //copy the input array to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_sin<T>(gpu_ptr1, gpu_out, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_out);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
stim/cuda/cudatools/devices.h
@@ -13,6 +13,15 @@ int maxThreadsPerBlock()
     cudaGetDeviceProperties(&props, device);
     return props.maxThreadsPerBlock;
 }
+
+int sharedMemPerBlock()
+{
+    int device;
+    cudaGetDevice(&device);        //get the id of the current device
+    cudaDeviceProp props;          //device property structure
+    cudaGetDeviceProperties(&props, device);
+    return props.sharedMemPerBlock;
+}
 }    //end namespace rts
 
 #endif
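A sketch of how this query pairs with a dynamically sized shared-memory launch; hist_kernel, launch, and their parameters are hypothetical, not part of this commit:

    __global__ void hist_kernel(unsigned short* out, unsigned int n){
        extern __shared__ unsigned short smem[];     //dynamically sized shared memory
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < n){
            smem[threadIdx.x] = (unsigned short)(i % 256);
            out[i] = smem[threadIdx.x];
        }
    }

    void launch(unsigned short* gpu_out, unsigned int n){
        int threads = stim::maxThreadsPerBlock();
        int sharemax = stim::sharedMemPerBlock();    //shared memory per block, in bytes
        unsigned int shared_bytes = threads * sizeof(unsigned short);
        if(shared_bytes <= (unsigned int)sharemax)   //only launch if the request fits
            hist_kernel<<< n / threads + 1, threads, shared_bytes >>>(gpu_out, n);
    }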
chi_grad.cuh (new file)
+#ifndef STIM_CUDA_CHI_GRAD_H
+#define STIM_CUDA_CHI_GRAD_H
+
+#include <iostream>
+#include <cstdlib>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stim/cuda/sharedmem.cuh>
+#include <cmath>
+#include <algorithm>
+
+#define PI 3.14159265358979
+
+namespace stim{
+    namespace cuda{
+
+        /// template parameter @param T is the data type
+        template<typename T>
+        __global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+            double theta_r = (theta * PI)/180;    //convert the angle from degrees to radians
+            float sum = 0;
+
+            //convert the 1D index to 2D coordinates
+            int xi = blockIdx.x * blockDim.x + threadIdx.x;
+            int yj = blockIdx.y;
+            int idx = yj * w + xi;
+            int shareidx = threadIdx.x;
+
+            extern __shared__ unsigned short bin[];    //each thread owns bin_n consecutive bins
+
+            if(xi < w && yj < h){
+
+                int gidx;
+                int hidx;
+
+                //initialize the histogram bins to zero (each thread only touches
+                //its own slice of shared memory, so no synchronization is needed)
+                for(unsigned int i = 0; i < bin_n; i++)
+                    bin[shareidx * bin_n + i] = 0;
+
+                //build the histogram of the first half of the disc
+                for (int y = yj - r; y <= yj + r; y++){
+                    for (int x = xi - r; x <= xi + r; x++){
+
+                        if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) > 0){
+                            gidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
+                            bin[shareidx * bin_n + gidx]++;
+                        }
+                    }
+                }
+
+                //copy the first histogram to gbin
+                unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
+                memset(gbin, 0, bin_n*sizeof(unsigned short));
+                for (unsigned int gi = 0; gi < bin_n; gi++)
+                    gbin[gi] = bin[shareidx * bin_n + gi];
+
+                //re-initialize the histogram bins to zero
+                for(unsigned int j = 0; j < bin_n; j++)
+                    bin[shareidx * bin_n + j] = 0;
+
+                //build the histogram of the second half of the disc
+                for (int y = yj - r; y <= yj + r; y++){
+                    for (int x = xi - r; x <= xi + r; x++){
+
+                        if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) < 0){
+                            hidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
+                            bin[shareidx * bin_n + hidx]++;
+                        }
+                    }
+                }
+
+                //copy the second histogram to hbin
+                unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
+                memset(hbin, 0, bin_n*sizeof(unsigned short));
+                for (unsigned int hi = 0; hi < bin_n; hi++)
+                    hbin[hi] = bin[shareidx * bin_n + hi];
+
+                //compare gbin and hbin to calculate the chi-squared distance
+                for (unsigned int k = 0; k < bin_n; k++){
+
+                    float flag;    //clamp the denominator to 1 to avoid dividing by zero
+
+                    if ((gbin[k] + hbin[k]) == 0)
+                        flag = 1;
+                    else
+                        flag = (gbin[k] + hbin[k]);
+
+                    sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag;
+                }
+
+                //return the chi-squared distance for this pixel
+                copy[idx] = sum;
+
+                free(gbin);
+                free(hbin);
+            }
+        }
+
+        template<typename T>
+        void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+            unsigned long N = w * h;
+
+            // Allocate a CUDA array in device memory
+
+            //define a channel descriptor for a single 32-bit channel
+            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                               //declare the cuda array
+            cudaMallocArray(&cuArray, &channelDesc, w, h);    //allocate the cuda array
+
+            // Copy the image data from global memory to the array
+            cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);
+
+            // Specify the texture resource
+            struct cudaResourceDesc resDesc;            //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));       //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;    //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;          //add a pointer to the cuda array
+
+            // Specify texture object parameters
+            struct cudaTextureDesc texDesc;             //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));       //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeMirror;    //mirror the image at the boundaries
+            texDesc.addressMode[1] = cudaAddressModeMirror;
+            texDesc.filterMode = cudaFilterModePoint;          //use point (nearest-neighbor) sampling
+            texDesc.readMode = cudaReadModeElementType;        //read data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 1;                      //use normalized coordinates
+
+            // Create the texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+            int sharemax = stim::sharedMemPerBlock();    //get the size of shared memory available per block, in bytes
+            unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short);
+
+            //make sure the requested per-block histogram storage fits in shared memory
+            if(shared_bytes > (unsigned int)sharemax){
+                std::cout << "Error: shared_bytes exceeds the device limit." << '\n';
+                exit(1);
+            }
+
+            //calculate the number of blocks
+            dim3 blocks(w / threads + 1, h);
+
+            //call the kernel to compute the chi-squared distance at each pixel
+            cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);
+
+            //clean up the texture object and the CUDA array
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+        }
+
+        template<typename T>
+        void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
+
+            unsigned long N = w * h;
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_img;
+            T* gpu_copy;
+            HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
+
+            //copy the image to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_chi_grad<T>(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_img);
+            cudaFree(gpu_copy);
+        }
+
+    }
+}
+
+#endif
\ No newline at end of file
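For each pixel, the kernel splits a disc of radius r along the diameter at angle theta, histograms the two halves into g and h, and computes the chi-squared distance sum_k (g_k - h_k)^2 / (g_k + h_k), clamping the denominator to 1 when both bins are empty. A host-side sketch, with every parameter value here purely illustrative:

    unsigned int w = 256, h = 256;
    float* img = (float*) malloc(w * h * sizeof(float));
    float* chi = (float*) malloc(w * h * sizeof(float));
    for(unsigned int i = 0; i < w * h; i++)
        img[i] = (float)(i % 256);    //sample intensities in [0, 255]
    int r = 5;                        //disc radius in pixels
    unsigned int bin_n = 16;          //number of histogram bins
    unsigned int bin_size = 16;       //intensity range per bin (16 * 16 covers 0-255)
    float theta = 90;                 //orientation of the splitting diameter, in degrees
    stim::cuda::cpu_chi_grad<float>(img, chi, w, h, r, bin_n, bin_size, theta);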
stim/cuda/templates/conv2.cuh
@@ -11,8 +11,7 @@ namespace stim{
     namespace cuda{
 
         template<typename T>
-        //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
-        __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
+        __global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
 
 
             //the radius of mask
@@ -51,8 +50,6 @@ namespace stim{
                 int xx = x - (i - r);
                 int yy = y - (j - r);
 
-                //T temp = img[y * w + x] * mask[yy * M + xx];
-                //sum += img[y * w + x] * mask[yy * M + xx];
                 sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
             }
         }
@@ -105,8 +102,7 @@ namespace stim{
             dim3 blocks(w / threads + 1, h);
 
             //call the kernel to do the multiplication
-            //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M);
-            cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
+            cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M);
 
         }
stim/image/image.h
@@ -31,8 +31,12 @@ public:
     }
 
     /// Constructor initializes an image to a given size
-    image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
+    /*image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
         img = cimg_library::CImg<T>(x, y, z);
+    }*/
+
+    image(unsigned int x, unsigned int y = 1, unsigned int z = 1, unsigned int c = 1){
+        img = cimg_library::CImg<T>(x, y, z, c);
     }
 
     //Load an image from a file
@@ -90,6 +94,23 @@ public:
 
     }
 
+    /// Copy the given data to the specified channel
+
+    /// @param c is the channel number that the data will be copied to
+    /// @param buffer is a pointer to the image to be copied to channel c
+
+    void set_channel(unsigned int c, T* buffer){
+
+        //calculate the number of pixels in a channel
+        unsigned int channel_size = width() * height();
+
+        //retrieve a pointer to the raw image data
+        T* ptr = img.data() + channel_size * c;
+
+        //copy the buffer to the specified channel
+        memcpy(ptr, buffer, sizeof(T) * channel_size);
+    }
+
     image<T> getslice(unsigned int c){
 
         //create a new image
@@ -228,6 +249,18 @@ public:
     }
 
 
+    image<T> rotate(float angle, float cx, float cy){
+
+        image<T> result;
+        float zoom = 1;
+        unsigned int interpolation = 1;    //linear interpolation
+        unsigned int boundary = 1;         //Neumann boundary conditions
+        result.img = img.get_rotate(angle, cx, cy, zoom, interpolation, boundary);
+
+        return result;
+    }
+
 };
 
};    //end namespace stim
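A short sketch of the two additions (the buffer contents are illustrative):

    stim::image<unsigned char> img(64, 64, 1, 3);    //64 x 64 image with 3 channels
    unsigned char* buffer = (unsigned char*) malloc(64 * 64);
    memset(buffer, 255, 64 * 64);                    //a solid channel, just for illustration
    img.set_channel(0, buffer);                      //copy the buffer into channel 0
    stim::image<unsigned char> rot = img.rotate(45, 32, 32);    //rotate 45 degrees about the center
    free(buffer);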
stim/image/image_contour_detection.h
@@ -1,4 +1,17 @@
 
-stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
-stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s);
-stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s);
\ No newline at end of file
+//stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
+//stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s);
+//stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s);
+
+stim::image<float> Gd1(stim::image<float> image, int r, unsigned int sigma_n);
+stim::image<float> Gd2(stim::image<float> image, int r, unsigned int sigma_n);
+stim::image<float> Gd_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
+stim::image<float> Gd_even(stim::image<float> image, int r, unsigned int sigma_n, float theta);
+stim::image<float> Gd_center(stim::image<float> image, int r, unsigned int sigma_n);
+
+stim::image<float> textons(stim::image<float> image, unsigned int theta_n);
+stim::image<float> kmeans(stim::image<float> textons, unsigned int K);
+stim::image<float> Pb(stim::image<float> image, int r, unsigned int sigma_n);
+stim::image<float> cPb(stim::image<float> img, int* r, float* alpha, int s);
+stim::image<float> tPb(stim::image<float> img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned int K);
\ No newline at end of file
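These declarations outline the BSDS500 contour-detection pipeline: Gaussian-derivative filters (Gd1, Gd2, Gd_odd, Gd_even, Gd_center), texton computation via k-means, and the brightness/texture probability-of-boundary terms cPb and tPb. A hedged sketch of how the top-level terms might be invoked; the file name and every parameter value here are illustrative assumptions, not part of this commit:

    stim::image<float> img;
    img.load("input.bmp");                     //hypothetical input image
    int r[3] = {3, 5, 10};                     //disc radii at three scales
    float alpha[3] = {0.33f, 0.33f, 0.33f};    //per-scale weights
    stim::image<float> c = cPb(img, r, alpha, 3);    //brightness gradient term over 3 scales
    stim::image<float> t = tPb(img, r, alpha, 8, 16, 3, 32);    //8 orientations, 16 bins, 3 scales, 32 textons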