Commit f186dbdaad1858d7e18d4b35a1e099ef81ff1f42
1 parent: 3173e54d
header file for bsds500
Showing 12 changed files with 628 additions and 10 deletions
Show diff stats
stim/cuda/arraymath.cuh
@@ -3,6 +3,11 @@ | @@ -3,6 +3,11 @@ | ||
3 | 3 | ||
4 | #include <stim/cuda/arraymath/array_add.cuh> | 4 | #include <stim/cuda/arraymath/array_add.cuh> |
5 | #include <stim/cuda/arraymath/array_multiply.cuh> | 5 | #include <stim/cuda/arraymath/array_multiply.cuh> |
6 | +#include <stim/cuda/arraymath/array_multiply2.cuh> | ||
7 | +#include <stim/cuda/arraymath/array_divide.cuh> | ||
8 | +#include <stim/cuda/arraymath/array_cos.cuh> | ||
9 | +#include <stim/cuda/arraymath/array_sin.cuh> | ||
10 | +#include <stim/cuda/arraymath/array_atan.cuh> | ||
6 | #include <stim/cuda/arraymath/array_abs.cuh> | 11 | #include <stim/cuda/arraymath/array_abs.cuh> |
7 | #include <stim/cuda/arraymath/array_cart2polar.cuh> | 12 | #include <stim/cuda/arraymath/array_cart2polar.cuh> |
8 | 13 |
stim/cuda/arraymath/array_atan.cuh (header mislabeled as array_add.cuh in this view; the new file's guard is STIM_CUDA_ARRAY_ATAN_H)
#ifndef STIM_CUDA_ARRAY_ATAN_H
#define STIM_CUDA_ARRAY_ATAN_H

#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: out[i] = atan(ptr1[i]) for each of the N elements.
		/// One thread per element, launched as a 1D grid.
		template<typename T>
		__global__ void cuda_atan(T* ptr1, T* out, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid generally contains more threads than elements
			if(idx < N){
				out[idx] = atan(ptr1[idx]);
			}

		}

		/// Applies atan element-wise to a device array.
		/// @param ptr1 device pointer to the input array
		/// @param out device pointer to the output array
		/// @param N number of elements
		template<typename T>
		void gpu_atan(T* ptr1, T* out, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division, so no extra
			//block is launched when N is an exact multiple of threads)
			int blocks = (N + threads - 1) / threads;
			if(blocks == 0) blocks = 1;		//keep the launch valid when N == 0

			//call the kernel to compute the arctangent
			cuda_atan <<< blocks, threads >>>(ptr1, out, N);

		}

		/// CPU interface: copies the input to the GPU, computes atan
		/// element-wise, and copies the result back to host memory.
		/// @param ptr1 host pointer to the input array
		/// @param cpu_out host pointer to the output array
		/// @param N number of elements
		template<typename T>
		void cpu_atan(T* ptr1, T* cpu_out, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_out;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );

			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_atan<T>(gpu_ptr1, gpu_out, N);

			//copy the result back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_out);

		}

	}
}

#endif
\ No newline at end of file
stim/cuda/arraymath/array_cos.cuh
#ifndef STIM_CUDA_ARRAY_COS_H
#define STIM_CUDA_ARRAY_COS_H

#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: out[i] = cos(ptr1[i]) for each of the N elements.
		/// One thread per element, launched as a 1D grid.
		template<typename T>
		__global__ void cuda_cos(T* ptr1, T* out, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid generally contains more threads than elements
			if(idx < N){
				out[idx] = cos(ptr1[idx]);
			}

		}

		/// Applies cos element-wise to a device array.
		/// @param ptr1 device pointer to the input array
		/// @param out device pointer to the output array
		/// @param N number of elements
		template<typename T>
		void gpu_cos(T* ptr1, T* out, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division, so no extra
			//block is launched when N is an exact multiple of threads)
			int blocks = (N + threads - 1) / threads;
			if(blocks == 0) blocks = 1;		//keep the launch valid when N == 0

			//call the kernel to compute the cosine
			cuda_cos <<< blocks, threads >>>(ptr1, out, N);

		}

		/// CPU interface: copies the input to the GPU, computes cos
		/// element-wise, and copies the result back to host memory.
		/// @param ptr1 host pointer to the input array
		/// @param cpu_out host pointer to the output array
		/// @param N number of elements
		template<typename T>
		void cpu_cos(T* ptr1, T* cpu_out, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_out;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );

			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_cos<T>(gpu_ptr1, gpu_out, N);

			//copy the result back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_out);

		}

	}
}

#endif
\ No newline at end of file
stim/cuda/arraymath/array_divide.cuh
#ifndef STIM_CUDA_ARRAY_DIVIDE_H
#define STIM_CUDA_ARRAY_DIVIDE_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: quotient[i] = ptr1[i] / ptr2[i] (element-wise division).
		/// NOTE: division by zero is not guarded; for floating-point T it
		/// yields inf/NaN per IEEE-754, for integer T it is undefined —
		/// callers are assumed to supply nonzero divisors (TODO confirm).
		template<typename T>
		__global__ void cuda_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid generally contains more threads than elements
			if(idx < N){
				quotient[idx] = ptr1[idx] / ptr2[idx];
			}

		}

		/// Divides two device arrays element-wise.
		/// @param ptr1 device pointer to the numerator array
		/// @param ptr2 device pointer to the denominator array
		/// @param quotient device pointer to the output array
		/// @param N number of elements
		template<typename T>
		void gpu_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division, so no extra
			//block is launched when N is an exact multiple of threads)
			int blocks = (N + threads - 1) / threads;
			if(blocks == 0) blocks = 1;		//keep the launch valid when N == 0

			//call the kernel to do the division
			cuda_divide <<< blocks, threads >>>(ptr1, ptr2, quotient, N);

		}

		/// CPU interface: uploads both inputs, divides on the GPU, and
		/// downloads the quotient.
		/// @param ptr1 host pointer to the numerator array
		/// @param ptr2 host pointer to the denominator array
		/// @param cpu_quotient host pointer to the output array
		/// @param N number of elements
		template<typename T>
		void cpu_divide(T* ptr1, T* ptr2, T* cpu_quotient, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_quotient;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_quotient, N * sizeof(T) ) );

			//copy the arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_divide<T>(gpu_ptr1, gpu_ptr2, gpu_quotient, N);

			//copy the result back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_quotient, gpu_quotient, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_ptr2);
			cudaFree(gpu_quotient);

		}

	}
}

#endif
\ No newline at end of file
stim/cuda/arraymath/array_multiply2.cuh
// NOTE(review): this header is included by arraymath.cuh as array_multiply2.cuh,
// but its include guard (STIM_CUDA_ARRAY_MULTIPLY_H) and its function names
// (cuda_multiply, gpu_multiply, cpu_multiply) are identical to those in the
// pre-existing array_multiply.cuh, which is also included. Whichever header is
// included first defines the guard and the other is skipped entirely, so this
// file is effectively dead code as written. The guard (and probably the
// function names) should be made unique — TODO confirm intended contents.
#ifndef STIM_CUDA_ARRAY_MULTIPLY_H
#define STIM_CUDA_ARRAY_MULTIPLY_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: product[i] = ptr1[i] * ptr2[i] (element-wise multiplication).
		/// One thread per element, launched as a 1D grid.
		template<typename T>
		__global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){

			//calculate the 1D index for this thread
			int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid may contain more threads than elements
			if(idx < N){
				product[idx] = ptr1[idx] * ptr2[idx];
			}

		}

		/// Multiplies two device arrays element-wise.
		/// @param ptr1, ptr2 device pointers to the input arrays
		/// @param product device pointer to the output array
		/// @param N number of elements
		template<typename T>
		void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks
			int blocks = N / threads + 1;

			//call the kernel to do the multiplication
			cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);

		}

		/// CPU interface: uploads both inputs, multiplies on the GPU, and
		/// downloads the product.
		/// @param ptr1, ptr2 host pointers to the input arrays
		/// @param cpu_product host pointer to the output array
		/// @param N number of elements
		template<typename T>
		void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_product;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_product, N * sizeof(T) ) );

			//copy the arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_multiply<T>(gpu_ptr1, gpu_ptr2 ,gpu_product, N);

			//copy the result back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_ptr2);
			cudaFree(gpu_product);

		}

	}
}

#endif
\ No newline at end of file
stim/cuda/arraymath/array_sin.cuh
#ifndef STIM_CUDA_ARRAY_SIN_H
#define STIM_CUDA_ARRAY_SIN_H

#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: out[i] = sin(ptr1[i]) for each of the N elements.
		/// One thread per element, launched as a 1D grid.
		template<typename T>
		__global__ void cuda_sin(T* ptr1, T* out, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid generally contains more threads than elements
			if(idx < N){
				out[idx] = sin(ptr1[idx]);
			}

		}

		/// Applies sin element-wise to a device array.
		/// @param ptr1 device pointer to the input array
		/// @param out device pointer to the output array
		/// @param N number of elements
		template<typename T>
		void gpu_sin(T* ptr1, T* out, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division, so no extra
			//block is launched when N is an exact multiple of threads)
			int blocks = (N + threads - 1) / threads;
			if(blocks == 0) blocks = 1;		//keep the launch valid when N == 0

			//call the kernel to compute the sine
			cuda_sin <<< blocks, threads >>>(ptr1, out, N);

		}

		/// CPU interface: copies the input to the GPU, computes sin
		/// element-wise, and copies the result back to host memory.
		/// @param ptr1 host pointer to the input array
		/// @param cpu_out host pointer to the output array
		/// @param N number of elements
		template<typename T>
		void cpu_sin(T* ptr1, T* cpu_out, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_out;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );

			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_sin<T>(gpu_ptr1, gpu_out, N);

			//copy the result back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_out);

		}

	}
}

#endif
0 | \ No newline at end of file | 68 | \ No newline at end of file |
stim/cuda/cudatools/devices.h
@@ -13,6 +13,15 @@ int maxThreadsPerBlock() | @@ -13,6 +13,15 @@ int maxThreadsPerBlock() | ||
13 | cudaGetDeviceProperties(&props, device); | 13 | cudaGetDeviceProperties(&props, device); |
14 | return props.maxThreadsPerBlock; | 14 | return props.maxThreadsPerBlock; |
15 | } | 15 | } |
16 | + | ||
//returns the shared memory available per block (in bytes) for the current CUDA device
int sharedMemPerBlock()
{
	int device;
	cudaGetDevice(&device);			//get the id of the current device
	cudaDeviceProp props;			//device property structure
	cudaGetDeviceProperties(&props, device);
	//props.sharedMemPerBlock is a size_t; cast explicitly so the narrowing
	//to the int return type is intentional rather than an implicit conversion
	return (int)props.sharedMemPerBlock;
}
16 | } //end namespace rts | 25 | } //end namespace rts |
17 | 26 | ||
18 | #endif | 27 | #endif |
1 | +#ifndef STIM_CUDA_CHI_GRAD_H | ||
2 | +#define STIM_CUDA_CHI_GRAD_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <cuda_runtime.h> | ||
7 | +#include <stim/cuda/sharedmem.cuh> | ||
8 | +#include <cmath> | ||
9 | +#include <algorithm> | ||
10 | + | ||
11 | +#define PI 3.14159265358979 | ||
12 | + | ||
13 | +namespace stim{ | ||
14 | + namespace cuda{ | ||
15 | + | ||
16 | + /// template parameter @param T is the data type | ||
17 | + template<typename T> | ||
18 | + __global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){ | ||
19 | + | ||
20 | + double theta_r = ((theta) * PI)/180; //change angle unit from degree to rad | ||
21 | + float sum = 0; | ||
22 | + unsigned int N = w * h; | ||
23 | + | ||
24 | + //change 1D index to 2D cordinates | ||
25 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
26 | + int yj = blockIdx.y; | ||
27 | + int idx = yj * w + xi; | ||
28 | + int shareidx = threadIdx.x; | ||
29 | + | ||
30 | + extern __shared__ unsigned short bin[]; | ||
31 | + | ||
32 | + | ||
33 | + if(xi < w && yj < h){ | ||
34 | + | ||
35 | + int gidx; | ||
36 | + int hidx; | ||
37 | + | ||
38 | + //initialize histogram bin to zeros | ||
39 | + for(int i = 0; i < bin_n; i++){ | ||
40 | + | ||
41 | + bin[shareidx * bin_n + i] = 0; | ||
42 | + __syncthreads(); | ||
43 | + | ||
44 | + } | ||
45 | + | ||
46 | + //get the histogram of the first half of disc and store in bin | ||
47 | + for (int y = yj - r; y <= yj + r; y++){ | ||
48 | + for (int x = xi - r; x <= xi + r; x++){ | ||
49 | + | ||
50 | + if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) > 0){ | ||
51 | + | ||
52 | + gidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size; | ||
53 | + __syncthreads(); | ||
54 | + | ||
55 | + bin[shareidx * bin_n + gidx]++; | ||
56 | + __syncthreads(); | ||
57 | + | ||
58 | + } | ||
59 | + | ||
60 | + else{} | ||
61 | + } | ||
62 | + } | ||
63 | + | ||
64 | + //initiallize the gbin | ||
65 | + unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short)); | ||
66 | + memset (gbin, 0, bin_n*sizeof(unsigned short)); | ||
67 | + | ||
68 | + //copy the histogram to gbin | ||
69 | + for (unsigned int gi = 0; gi < bin_n; gi++){ | ||
70 | + | ||
71 | + gbin[gi] = bin[shareidx * bin_n + gi]; | ||
72 | + | ||
73 | + } | ||
74 | + | ||
75 | + //initialize histogram bin to zeros | ||
76 | + for(int j = 0; j < bin_n; j++){ //initialize histogram bin to zeros | ||
77 | + | ||
78 | + bin[shareidx * bin_n + j] = 0; | ||
79 | + __syncthreads(); | ||
80 | + } | ||
81 | + | ||
82 | + //get the histogram of the second half of disc and store in bin | ||
83 | + for (int y = yj - r; y <= yj + r; y++){ | ||
84 | + for (int x = xi - r; x <= xi + r; x++){ | ||
85 | + | ||
86 | + if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) < 0){ | ||
87 | + | ||
88 | + hidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size; | ||
89 | + __syncthreads(); | ||
90 | + | ||
91 | + bin[shareidx * bin_n + hidx]++; | ||
92 | + __syncthreads(); | ||
93 | + | ||
94 | + } | ||
95 | + else{} | ||
96 | + } | ||
97 | + } | ||
98 | + | ||
99 | + //initiallize the gbin | ||
100 | + unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short)); | ||
101 | + memset (hbin, 0, bin_n*sizeof(unsigned short)); | ||
102 | + | ||
103 | + //copy the histogram to hbin | ||
104 | + for (unsigned int hi = 0; hi < bin_n; hi++){ | ||
105 | + | ||
106 | + hbin[hi] = bin[shareidx * bin_n + hi]; | ||
107 | + | ||
108 | + } | ||
109 | + | ||
110 | + //compare gbin, hbin and calculate the chi distance | ||
111 | + for (int k = 0; k < bin_n; k++){ | ||
112 | + | ||
113 | + float flag; // set flag to avoid zero denominator | ||
114 | + | ||
115 | + if ((gbin[k] + hbin[k]) == 0){ | ||
116 | + flag = 1; | ||
117 | + } | ||
118 | + else { | ||
119 | + flag = (gbin[k] + hbin[k]); | ||
120 | + __syncthreads(); | ||
121 | + } | ||
122 | + | ||
123 | + sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag; | ||
124 | + __syncthreads(); | ||
125 | + | ||
126 | + } | ||
127 | + | ||
128 | + // return chi-distance for each pixel | ||
129 | + copy[idx] = sum; | ||
130 | + | ||
131 | + free(gbin); | ||
132 | + free(hbin); | ||
133 | + } | ||
134 | + } | ||
135 | + | ||
136 | + | ||
137 | + template<typename T> | ||
138 | + void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){ | ||
139 | + | ||
140 | + unsigned long N = w * h; | ||
141 | + | ||
142 | + // Allocate CUDA array in device memory | ||
143 | + | ||
144 | + //define a channel descriptor for a single 32-bit channel | ||
145 | + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); | ||
146 | + cudaArray* cuArray; //declare the cuda array | ||
147 | + cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array | ||
148 | + | ||
149 | + // Copy the image data from global memory to the array | ||
150 | + cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice); | ||
151 | + | ||
152 | + // Specify texture | ||
153 | + struct cudaResourceDesc resDesc; //create a resource descriptor | ||
154 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | ||
155 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | ||
156 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | ||
157 | + | ||
158 | + // Specify texture object parameters | ||
159 | + struct cudaTextureDesc texDesc; //create a texture descriptor | ||
160 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | ||
161 | + texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges) | ||
162 | + texDesc.addressMode[1] = cudaAddressModeMirror; | ||
163 | + texDesc.filterMode = cudaFilterModePoint; //use linear filtering | ||
164 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | ||
165 | + texDesc.normalizedCoords = 1; //using normalized coordinates | ||
166 | + | ||
167 | + // Create texture object | ||
168 | + cudaTextureObject_t texObj = 0; | ||
169 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | ||
170 | + | ||
171 | + //get the maximum number of threads per block for the CUDA device | ||
172 | + int threads = stim::maxThreadsPerBlock(); | ||
173 | + int sharemax = stim::sharedMemPerBlock(); //get the size of Shared memory available per block in bytes | ||
174 | + unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short); | ||
175 | + | ||
176 | + if(threads * bin_n > sharemax){ | ||
177 | + | ||
178 | + cout <<"Error: shared_bytes exceeds the max value."<<'\n'; | ||
179 | + exit(1); | ||
180 | + | ||
181 | + } | ||
182 | + | ||
183 | + | ||
184 | + //calculate the number of blocks | ||
185 | + dim3 blocks(w / threads + 1, h); | ||
186 | + | ||
187 | + //call the kernel to do the multiplication | ||
188 | + cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta); | ||
189 | + | ||
190 | + } | ||
191 | + | ||
192 | + template<typename T> | ||
193 | + void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){ | ||
194 | + | ||
195 | + unsigned long N = w * h; | ||
196 | + //allocate memory on the GPU for the array | ||
197 | + T* gpu_img; | ||
198 | + T* gpu_copy; | ||
199 | + HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) ); | ||
200 | + HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) ); | ||
201 | + | ||
202 | + //copy the array to the GPU | ||
203 | + HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) ); | ||
204 | + | ||
205 | + //call the GPU version of this function | ||
206 | + gpu_chi_grad<T>(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta); | ||
207 | + | ||
208 | + //copy the array back to the CPU | ||
209 | + HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) ); | ||
210 | + | ||
211 | + //free allocated memory | ||
212 | + cudaFree(gpu_img); | ||
213 | + cudaFree(gpu_copy); | ||
214 | + | ||
215 | + } | ||
216 | + | ||
217 | + } | ||
218 | +} | ||
219 | + | ||
220 | + | ||
221 | +#endif | ||
0 | \ No newline at end of file | 222 | \ No newline at end of file |
stim/cuda/templates/conv2.cuh
@@ -11,8 +11,7 @@ namespace stim{ | @@ -11,8 +11,7 @@ namespace stim{ | ||
11 | namespace cuda{ | 11 | namespace cuda{ |
12 | 12 | ||
13 | template<typename T> | 13 | template<typename T> |
14 | - //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ | ||
15 | - __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){ | 14 | + __global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){ |
16 | 15 | ||
17 | 16 | ||
18 | //the radius of mask | 17 | //the radius of mask |
@@ -51,8 +50,6 @@ namespace stim{ | @@ -51,8 +50,6 @@ namespace stim{ | ||
51 | int xx = x - (i - r); | 50 | int xx = x - (i - r); |
52 | int yy = y - (j - r); | 51 | int yy = y - (j - r); |
53 | 52 | ||
54 | - //T temp = img[y * w + x] * mask[yy * M + xx]; | ||
55 | - //sum += img[y * w + x] * mask[yy * M + xx]; | ||
56 | sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx]; | 53 | sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx]; |
57 | } | 54 | } |
58 | } | 55 | } |
@@ -105,8 +102,7 @@ namespace stim{ | @@ -105,8 +102,7 @@ namespace stim{ | ||
105 | dim3 blocks(w / threads + 1, h); | 102 | dim3 blocks(w / threads + 1, h); |
106 | 103 | ||
107 | //call the kernel to do the multiplication | 104 | //call the kernel to do the multiplication |
108 | - //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M); | ||
109 | - cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M); | 105 | + cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M); |
110 | 106 | ||
111 | } | 107 | } |
112 | 108 |
stim/image/image.h
@@ -31,8 +31,12 @@ public: | @@ -31,8 +31,12 @@ public: | ||
31 | } | 31 | } |
32 | 32 | ||
33 | /// Constructor initializes an image to a given size | 33 | /// Constructor initializes an image to a given size |
34 | - image(unsigned int x, unsigned int y = 1, unsigned int z = 1){ | 34 | + /*image(unsigned int x, unsigned int y = 1, unsigned int z = 1){ |
35 | img = cimg_library::CImg<T>(x, y, z); | 35 | img = cimg_library::CImg<T>(x, y, z); |
36 | + }*/ | ||
37 | + | ||
38 | + image(unsigned int x, unsigned int y = 1, unsigned int z = 1, unsigned int c = 1){ | ||
39 | + img = cimg_library::CImg<T>(x, y, z, c); | ||
36 | } | 40 | } |
37 | 41 | ||
38 | //Load an image from a file | 42 | //Load an image from a file |
@@ -90,6 +94,23 @@ public: | @@ -90,6 +94,23 @@ public: | ||
90 | 94 | ||
91 | } | 95 | } |
92 | 96 | ||
	/// Copy the given data to the specified channel

	/// @param c is the channel number that the data will be copied to
	/// @param buffer is a pointer to the image to be copied to channel c

	void set_channel(unsigned int c, T* buffer){

		//calculate the number of pixels in a channel
		unsigned int channel_size = width() * height();

		//retrieve a pointer to the raw image data
		//(assumes the underlying CImg storage is planar, i.e. channels are
		//stored one after another — TODO confirm; no bounds check on c)
		T* ptr = img.data() + channel_size * c;

		//copy the buffer to the specified channel
		//(buffer must hold at least width()*height() elements)
		memcpy(ptr, buffer, sizeof(T) * channel_size);
	}
113 | + | ||
93 | image<T> getslice(unsigned int c){ | 114 | image<T> getslice(unsigned int c){ |
94 | 115 | ||
95 | //create a new image | 116 | //create a new image |
@@ -228,6 +249,18 @@ public: | @@ -228,6 +249,18 @@ public: | ||
228 | } | 249 | } |
229 | 250 | ||
230 | 251 | ||
	/// Returns a copy of this image rotated by the given angle about (cx, cy).

	/// @param angle rotation angle (degrees, per CImg convention — TODO confirm)
	/// @param cx, cy coordinates of the rotation center, in pixels
	image<T> rotate(float angle, float cx, float cy){

		image<T> result;
		float zoom = 1;						//no scaling
		unsigned int interpolation = 1;		//CImg interpolation mode 1 (linear) — TODO confirm
		unsigned int boundary = 1;			//CImg boundary condition 1 (Neumann/clamp) — TODO confirm
		result.img = img.get_rotate (angle, cx, cy, zoom, interpolation, boundary);
		//result.save("data_output/test_rotate_neum.bmp");

		return result;
	}
263 | + | ||
231 | }; | 264 | }; |
232 | 265 | ||
233 | }; //end namespace stim | 266 | }; //end namespace stim |
stim/image/image_contour_detection.h
1 | 1 | ||
2 | -stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta); | ||
3 | -stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s); | ||
4 | -stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s); | ||
5 | \ No newline at end of file | 2 | \ No newline at end of file |
3 | +//stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta); | ||
4 | +//stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s); | ||
5 | +//stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s); | ||
6 | + | ||
7 | +stim::image<float> Gd1(stim::image<float> image, int r, unsigned int sigma_n); | ||
8 | +stim::image<float> Gd2(stim::image<float> image, int r, unsigned int sigma_n); | ||
9 | +stim::image<float> Gd_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta); | ||
10 | +stim::image<float> Gd_even(stim::image<float> image, int r, unsigned int sigma_n, float theta); | ||
11 | +stim::image<float> Gd_center(stim::image<float> image, int r, unsigned int sigma_n); | ||
12 | + | ||
13 | +stim::image<float> textons(stim::image<float> image, unsigned int theta_n); | ||
14 | +stim::image<float> kmeans(stim::image<float> textons, unsigned int K); | ||
15 | +stim::image<float> Pb(stim::image<float> image, int r, unsigned int sigma_n); | ||
16 | +stim::image<float> cPb(stim::image<float> img, int* r, float* alpha, int s); | ||
17 | +stim::image<float> tPb(stim::image<float> img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned int K); | ||
6 | \ No newline at end of file | 18 | \ No newline at end of file |