spider_cost.cuh
5.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#ifndef STIM_SPIDER_COST_H
#define STIM_SPIDER_COST_H
#include <assert.h>
#include <cuda.h>
//#include <cuda_runtime.h>
#include <stdio.h>
#include <stim/visualization/colormap.h>
#include <sstream>
#include <stim/math/vector.h>
#include <stim/cuda/cudatools/timer.h>
#include <stim/cuda/cudatools/devices.h>
#include <stim/cuda/cudatools/threads.h>
#include <stim/cuda/cuda_texture.cuh>
namespace stim{
namespace cuda
{
///Texture wrapper shared by the functions in this file: get_cost() maps a GL
///texture into it before launching the kernel and unmaps it afterwards.
///NOTE(review): this and `result` are non-inline definitions inside a header;
///presumably the header is meant to be included from a single translation
///unit only -- confirm.
stim::cuda::cuda_texture t; //texture object.
///Device buffer handed to get_diff as its output array. It is allocated by
///initArray() and released by cleanUP().
float* result;
// float* print; (debug buffer, currently disabled)
///Initialization function, allocates the device buffer `result` that the
///kernel writes its output into. Pair every call with a call to cleanUP().
///@param DIM_Y	--number of floats to allocate on the device.
void initArray(int DIM_Y)
{
	//Report an allocation failure right here instead of letting a later
	//kernel launch or copy fail on an invalid pointer.
	HANDLE_ERROR(
		cudaMalloc( (void**) &result, DIM_Y*sizeof(float))
	);
}
///Deinit function that frees the device memory allocated by initArray().
///The pointer is reset afterwards, so calling this function again without an
///intervening initArray() is a harmless no-op.
void cleanUP()
{
	HANDLE_ERROR(
		cudaFree(result)
	);
	//Do not leave a dangling device pointer in the global.
	result = NULL;
}
///A virtual representation of a uniform template with three bright bands:
///one at each end of the range and one in the middle.
///Returns the value of the template pixel.
///@param x		--location of a pixel along the template.
///@param max_x	--width of the template; the band limits are the integer
///			  quotients max_x/6, max_x*2/6, max_x*4/6 and max_x*5/6.
///@return 1.0 when x lies inside a bright band, 0.0 otherwise.
__device__
float Template(int x, int max_x)
{
	//leading band
	if(x < max_x/6)
		return 1.0f;
	//trailing band
	if(x > max_x*5/6)
		return 1.0f;
	//central band (both limits exclusive)
	if(x > max_x*2/6 && x < max_x*4/6)
		return 1.0f;
	return 0.0f;
}
///Find the difference of the given set of samples and the template
///using cuda acceleration.
///Every thread reads one texel, compares it with the template value for its
///column and the block then sums the absolute differences in shared memory.
///Thread (0,0) of each block stores that sum in result[blockIdx.y].
///
///Launch requirements:
///	- dynamic shared memory: blockDim.x*blockDim.y*sizeof(float) bytes.
///	- gridDim.x should be 1: the output slot depends on blockIdx.y only, so
///	  several blocks along x would overwrite each other's result.
///	- block dimensions do not need to be powers of two.
///@param texIn	--texture object, sampled as unsigned char at integer (x, y).
///@param result	--device array with at least gridDim.y elements.
///@param dx		--width of the template, forwarded to Template().
///@param dy		--height of a sample; not used inside the kernel.
__global__
void get_diff (cudaTextureObject_t texIn, float *result, int dx, int dy)
{
	extern __shared__ float shared[];
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	const unsigned int x_t = threadIdx.x;
	const unsigned int y_t = threadIdx.y;
	//Row stride of the shared tile: taken from the real block width so the
	//indexing always agrees with the launch configuration.
	const unsigned int row = blockDim.x;
	const unsigned int idx = y_t*row + x_t;

	//Normalize the texel to [0,1] in single precision and store |sample - template|.
	float valIn = tex2D<unsigned char>(texIn, x, y)/255.0f;
	float valTemp = Template(x, dx);
	shared[idx] = fabsf(valIn - valTemp);
	__syncthreads();

	//Fold every row onto its first element. `half` is rounded up so that an
	//odd width keeps its middle element instead of dropping it.
	for(unsigned int width = blockDim.x; width > 1; )
	{
		const unsigned int half = (width + 1) >> 1;
		if(x_t + half < width)
		{
			shared[idx] += shared[idx + half];
		}
		__syncthreads();		//one barrier per step is sufficient.
		width = half;
	}

	//Fold the rows onto row 0, using the same scheme along y.
	for(unsigned int height = blockDim.y; height > 1; )
	{
		const unsigned int half = (height + 1) >> 1;
		if(y_t + half < height)
		{
			shared[idx] += shared[idx + half*row];
		}
		__syncthreads();
		height = half;
	}

	//shared[0] now holds the sum over the whole block.
	if(x_t == 0 && y_t == 0)
		result[blockIdx.y] = shared[0];
}
///External access-point to the cuda function.
///Maps the GL texture into cuda, launches get_diff with one thread block per
///sample, copies the per-sample costs back and picks the smallest one.
///@param GLuint texbufferID	--GLtexture (must be contained in a framebuffer object)
///				  that holds the data that will be handed to cuda.
///@param GLenum texType	--either GL_TEXTURE_1D, GL_TEXTURE_2D or GL_TEXTURE_3D
///				  may work with other gl texture types, but untested.
///@param DIM_Y, the number of samples in the template.
///@param dx	--width of a thread block, also passed to the kernel as the template width.
///@param dy	--height of a thread block.
///@return vector whose element [0] is the index of the sample with the lowest
///	   cost and whose element [1] is that cost truncated to int. The
///	   initial (0, 0) vector is returned unchanged when DIM_Y <= 0 or when
///	   the host buffer cannot be allocated.
extern "C"
stim::vec<int> get_cost(GLint texbufferID, GLenum texType, int DIM_Y,int dx = 16, int dy = 8)
{
	stim::vec<int> ret(0, 0);

	//Nothing to compare: bail out before touching GL/cuda resources. Without
	//this guard the code below would read output[0] from an empty buffer.
	if(DIM_Y <= 0)
		return ret;

	//Host copy of the per-sample costs.
	float* output = (float* ) malloc(DIM_Y*sizeof(float));
	if(output == NULL)
	{
		fprintf(stderr, "get_cost: unable to allocate %d floats on the host\n", DIM_Y);
		return ret;
	}

	//Bind the Texture in GL and allow access to cuda.
	t.MapCudaTexture(texbufferID, texType);

	//allocate the device-side result array.
	initArray(DIM_Y);

	//cuda launch variables: one block of dx*dy threads per sample.
	dim3 numBlocks(1, DIM_Y);
	dim3 threadsPerBlock(dx, dy);
	get_diff <<< numBlocks, threadsPerBlock, dx*dy*sizeof(float) >>> (t.getTexture(), result, dx, dy);
	//A bad launch configuration is only reported through cudaGetLastError();
	//faults during execution surface at the synchronization point.
	HANDLE_ERROR(
		cudaGetLastError()
	);
	HANDLE_ERROR(
		cudaDeviceSynchronize()
	);

	HANDLE_ERROR(
		cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost)
	);

	//variables for finding the min.
	float mini = 10000000000000000.0f;
	int idx = 0;
	for( int i = 0; i<DIM_Y; i++){
		if(output[i] < mini){
			mini = output[i];
			idx = i;
		}
	}

	//hand the texture back to GL and release the device buffer.
	t.UnmapCudaTexture();
	cleanUP();

	ret[0] = idx; ret[1] = (int) output[idx];
	free(output);
	return ret;
}
}
}
#endif