Commit 79a9bf3f36b37c05bd5cf144c9824b62e9fe4824
1 parent
fa737592
new implementation of cost.h without using cublas. Significant speedup, gl_spide…
…r's Bind and Unbind methods have been made public to lower the number of times they are called
Showing
2 changed files
with
149 additions
and
106 deletions
Show diff stats
stim/cuda/cost.h
... | ... | @@ -3,19 +3,33 @@ |
3 | 3 | #include <cuda_runtime.h> |
4 | 4 | #include <cublas_v2.h> |
5 | 5 | #include <stdio.h> |
6 | -#include <stim/visualization/colormap.h> | |
6 | +#include "../visualization/colormap.h" | |
7 | 7 | #include <sstream> |
8 | -#include <stim/math/vector.h> | |
9 | -#include <stim/cuda/devices.h> | |
10 | -#include <stim/cuda/threads.h> | |
8 | +#include "../math/vector.h" | |
9 | +#include "../cuda/devices.h" | |
10 | +#include "../cuda/threads.h" | |
11 | 11 | |
12 | 12 | ///Cost function that works with the gl-spider class to find index of the item with min-cost. |
13 | 13 | typedef unsigned char uchar; |
14 | 14 | texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn; |
15 | 15 | float *result; |
16 | -float* v_dif; | |
17 | 16 | cudaArray* srcArray; |
18 | 17 | bool testing = false; |
18 | +/* | |
19 | +struct SharedMemory | |
20 | +{ | |
21 | + __device__ inline operator float* () | |
22 | + { | |
23 | + extern __shared__ float __smem[]; | |
24 | + return (float *)__smem; | |
25 | + } | |
26 | + | |
27 | + __device__ inline operator const float* () const | |
28 | + { | |
29 | + extern __shared__ float __smem[]; | |
30 | + return (float *)__smem; | |
31 | + } | |
32 | +};*/ | |
19 | 33 | |
20 | 34 | inline void checkCUDAerrors(const char *msg) |
21 | 35 | { |
... | ... | @@ -26,30 +40,12 @@ inline void checkCUDAerrors(const char *msg) |
26 | 40 | } |
27 | 41 | } |
28 | 42 | |
29 | -///Finds the sum of all the pixes in a gives template element. | |
30 | -///Returns the abosolute value. | |
31 | -///@param *diff, a pointer to the memory block that holds the pixel-differences. | |
32 | -float get_sum(float *diff) | |
33 | -{ | |
34 | - | |
35 | - cublasStatus_t ret; | |
36 | - cublasHandle_t handle; | |
37 | - ret = cublasCreate(&handle); | |
38 | - | |
39 | - ret = cublasSetVector(20*10, sizeof(*diff), diff, 1, v_dif, 1); | |
40 | - float out; | |
41 | - ret = cublasSasum(handle, 20*10, v_dif, 1, &out); | |
42 | -// cublasDestroy(ret); | |
43 | - cublasDestroy(handle); | |
44 | - return out; | |
45 | -} | |
46 | - | |
47 | 43 | ///A virtual representation of a uniform template. |
48 | 44 | ///Returns the value of the template pixel. |
49 | 45 | ///@param x, location of a pixel. |
50 | 46 | __device__ float Template(int x) |
51 | 47 | { |
52 | - if(x < 20/6 || x > 20*5/6 || (x > 20*2/6 && x < 20*4/6)){ | |
48 | + if(x < 16/6 || x > 16*5/6 || (x > 16*2/6 && x < 16*4/6)){ | |
53 | 49 | return 1.0; |
54 | 50 | }else{ |
55 | 51 | return 0.0; |
... | ... | @@ -63,15 +59,66 @@ __device__ float Template(int x) |
63 | 59 | __global__ |
64 | 60 | void get_diff (float *result) |
65 | 61 | { |
66 | - //cuPrintf("Hello"); | |
62 | + //float* shared = SharedMemory(); | |
63 | + __shared__ float shared[16][8]; | |
67 | 64 | int x = threadIdx.x + blockIdx.x * blockDim.x; |
68 | 65 | int y = threadIdx.y + blockIdx.y * blockDim.y; |
69 | - int idx = y*20+x; | |
66 | + int x_t = threadIdx.x; | |
67 | + int y_t = threadIdx.y; | |
68 | + //int idx = y*16+x; | |
69 | + int g_idx = blockIdx.y; | |
70 | 70 | |
71 | 71 | float valIn = tex2D(texIn, x, y)/255.0; |
72 | 72 | float valTemp = Template(x); |
73 | - result[idx] = abs(valIn-valTemp); | |
74 | - //result[idx] = abs(valIn); | |
73 | + shared[x_t][y_t] = abs(valIn-valTemp); | |
74 | + | |
75 | + __syncthreads(); | |
76 | + | |
77 | + for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1) | |
78 | + { | |
79 | + __syncthreads(); | |
80 | + if (x_t < step) | |
81 | + { | |
82 | + shared[x_t][y_t] += shared[x_t + step][y_t]; | |
83 | + } | |
84 | + __syncthreads(); | |
85 | + } | |
86 | + __syncthreads(); | |
87 | + | |
88 | + for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1) | |
89 | + { | |
90 | + __syncthreads(); | |
91 | + if(y_t < step) | |
92 | + { | |
93 | + shared[x_t][y_t] += shared[x_t][y_t + step]; | |
94 | + } | |
95 | + __syncthreads(); | |
96 | + } | |
97 | + __syncthreads(); | |
98 | +/* for(unsigned int step = 1; step < blockDim.x; step *= 2) | |
99 | + { | |
100 | + __syncthreads(); | |
101 | + if (x_t %(2*step) == 0) | |
102 | + { | |
103 | + shared[x_t][y_t] += shared[x_t + step][y_t]; | |
104 | + } | |
105 | + } | |
106 | + __syncthreads(); | |
107 | + | |
108 | + for(unsigned int step = 1; step < blockDim.y; step *= 2) | |
109 | + { | |
110 | + __syncthreads(); | |
111 | + if(y_t%(2*step) == 0) | |
112 | + { | |
113 | + shared[x_t][y_t] += shared[x_t][y_t + step]; | |
114 | + } | |
115 | + } | |
116 | + __syncthreads(); */ | |
117 | + if(x_t == 0 && y_t == 0) | |
118 | + result[g_idx] = shared[0][0]; | |
119 | + | |
120 | + | |
121 | +// //result[idx] = abs(valIn); | |
75 | 122 | } |
76 | 123 | |
77 | 124 | |
... | ... | @@ -82,12 +129,6 @@ void get_diff (float *result) |
82 | 129 | ///@param DIM_Y, integer controlling how much memory to allocate. |
83 | 130 | void initArray(cudaGraphicsResource_t src, int DIM_Y) |
84 | 131 | { |
85 | - //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar> (); | |
86 | - //cudaMallocArray(&result, &channelDesc, DIM_X, DIM_Y, 0); | |
87 | - //HANDLE_ERROR( | |
88 | - // cudaGraphicsGLRegisterImage(&src, | |
89 | - // fboID, | |
90 | - // GL_TEXTURE_2D, | |
91 | 132 | HANDLE_ERROR( |
92 | 133 | cudaGraphicsMapResources(1, &src) |
93 | 134 | ); |
... | ... | @@ -97,10 +138,8 @@ void initArray(cudaGraphicsResource_t src, int DIM_Y) |
97 | 138 | HANDLE_ERROR( |
98 | 139 | cudaBindTextureToArray(texIn, srcArray) |
99 | 140 | ); |
100 | - cudaMalloc( (void**) &result, 20*DIM_Y*sizeof(float)); | |
141 | + cudaMalloc( (void**) &result, DIM_Y*sizeof(float)); | |
101 | 142 | checkCUDAerrors("Memory Allocation Issue 1"); |
102 | - cudaMalloc((void **) &v_dif, 20*10*sizeof(float)); | |
103 | - checkCUDAerrors("Memory Allocation Issue 2"); | |
104 | 143 | //HANDLE_ERROR( |
105 | 144 | // cudaBindTextureToArray(texIn, ptr, &channelDesc) |
106 | 145 | // ); |
... | ... | @@ -117,9 +156,6 @@ void cleanUP(cudaGraphicsResource_t src) |
117 | 156 | cudaGraphicsUnmapResources(1,&src) |
118 | 157 | ); |
119 | 158 | HANDLE_ERROR( |
120 | - cudaFree(v_dif) | |
121 | - ); | |
122 | - HANDLE_ERROR( | |
123 | 159 | cudaUnbindTexture(texIn) |
124 | 160 | ); |
125 | 161 | } |
... | ... | @@ -151,25 +187,32 @@ stim::vec<int> get_cost(cudaGraphicsResource_t src, int DIM_Y) |
151 | 187 | // name << "sample_" << inter << "_" << idx << ".bmp"; |
152 | 188 | // stim::gpu2image<float>(v_dif, name.str(), 20,10,0,1); |
153 | 189 | |
154 | - float output[DIM_Y]; | |
190 | + //float output[DIM_Y]; | |
191 | + float *output; | |
192 | + output = (float* ) malloc(DIM_Y*sizeof(float)); | |
155 | 193 | stim::vec<int> ret(0, 0); |
156 | 194 | float mini = 10000000000000000.0; |
157 | - int idx; | |
158 | - initArray(src, DIM_Y*10); | |
159 | - dim3 grid(20/2, DIM_Y*10/2); | |
160 | - dim3 block(2, 2); | |
161 | - | |
162 | - get_diff <<< grid, block >>> (result); | |
163 | - for (int i = 0; i < DIM_Y; i++){ | |
164 | - output[i] = get_sum(result+(20*10*i)); | |
165 | - if(output[i] <= mini){ | |
195 | + int idx = 0; | |
196 | + initArray(src, DIM_Y*8); | |
197 | + dim3 numBlocks(1, DIM_Y); | |
198 | + dim3 threadsPerBlock(16, 8); | |
199 | + | |
200 | + | |
201 | + get_diff <<< numBlocks, threadsPerBlock >>> (result); | |
202 | + cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost); | |
203 | + | |
204 | + for( int i = 0; i<DIM_Y; i++){ | |
205 | +// std::cout << output[i] << std::endl; | |
206 | + if(output[i] < mini){ | |
166 | 207 | mini = output[i]; |
167 | 208 | idx = i; |
168 | 209 | } |
169 | - } | |
170 | - | |
171 | - output[idx] = get_sum(result+(20*10*idx)); | |
210 | + } | |
211 | + | |
212 | +// std::cout << "hello" << std::endl; | |
213 | + //output[idx] = get_sum(result+(16*8*idx)); | |
172 | 214 | cleanUP(src); |
173 | 215 | ret[0] = idx; ret[1] = (int) output[idx]; |
216 | + free(output); | |
174 | 217 | return ret; |
175 | 218 | } | ... | ... |
stim/gl/gl_spider.h
... | ... | @@ -185,7 +185,7 @@ class gl_spider |
185 | 185 | ver = stim::rect<float>(mag, |
186 | 186 | pos, temp, |
187 | 187 | hor.n()); |
188 | - UpdateBuffer(0.0, 0.0+idx*10.0); | |
188 | + UpdateBuffer(0.0, 0.0+idx*8.0); | |
189 | 189 | CHECK_OPENGL_ERROR |
190 | 190 | } |
191 | 191 | } |
... | ... | @@ -233,7 +233,7 @@ class gl_spider |
233 | 233 | ver = stim::rect<float>(mag, |
234 | 234 | temp, dir, |
235 | 235 | hor.n()); |
236 | - UpdateBuffer(0.0, 0.0+idx*10.0); | |
236 | + UpdateBuffer(0.0, 0.0+idx*8.0); | |
237 | 237 | CHECK_OPENGL_ERROR |
238 | 238 | } |
239 | 239 | } |
... | ... | @@ -244,7 +244,7 @@ class gl_spider |
244 | 244 | ///Method for populating the buffer with the sampled texture. |
245 | 245 | ///uses the default m <1,1,0> |
246 | 246 | void |
247 | - genMagnitudeVectors(float delta = 0.5) | |
247 | + genMagnitudeVectors(float delta = 0.70) | |
248 | 248 | { |
249 | 249 | |
250 | 250 | //Set up the vectors necessary for Rectangle creation. |
... | ... | @@ -274,7 +274,7 @@ class gl_spider |
274 | 274 | ver = stim::rect<float>(temp, |
275 | 275 | pos, dir, |
276 | 276 | hor.n()); |
277 | - UpdateBuffer(0.0, 0.0+i*10.0); | |
277 | + UpdateBuffer(0.0, 0.0+i*8.0); | |
278 | 278 | CHECK_OPENGL_ERROR |
279 | 279 | } |
280 | 280 | glEndList(); |
... | ... | @@ -286,7 +286,7 @@ class gl_spider |
286 | 286 | void |
287 | 287 | UpdateBuffer(float v_x, float v_y) |
288 | 288 | { |
289 | - float len = 10.0; | |
289 | + float len = 8.0; | |
290 | 290 | stim::vec<float>p1; |
291 | 291 | stim::vec<float>p2; |
292 | 292 | stim::vec<float>p3; |
... | ... | @@ -338,13 +338,13 @@ class gl_spider |
338 | 338 | p2[1], |
339 | 339 | p2[2] |
340 | 340 | ); |
341 | - glVertex2f(v_x+2*len, v_y); | |
341 | + glVertex2f(v_x+2.0*len, v_y); | |
342 | 342 | glTexCoord3f( |
343 | 343 | p3[0], |
344 | 344 | p3[1], |
345 | 345 | p3[2] |
346 | 346 | ); |
347 | - glVertex2f(v_x+2*len, v_y+len); | |
347 | + glVertex2f(v_x+2.0*len, v_y+len); | |
348 | 348 | glTexCoord3f( |
349 | 349 | p4[0], |
350 | 350 | p4[1], |
... | ... | @@ -383,47 +383,6 @@ class gl_spider |
383 | 383 | glBindTexture(GL_TEXTURE_2D, 0); |
384 | 384 | } |
385 | 385 | |
386 | - ///Method for controling the buffer and texture binding in order to properly | |
387 | - ///do the render to texture. | |
388 | - void | |
389 | - Bind() | |
390 | - { | |
391 | - float len = 10.0; | |
392 | - glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer | |
393 | - glFramebufferTexture2D( | |
394 | - GL_FRAMEBUFFER, | |
395 | - GL_COLOR_ATTACHMENT0, | |
396 | - GL_TEXTURE_2D, | |
397 | - texbufferID, | |
398 | - 0); | |
399 | - glBindFramebuffer(GL_FRAMEBUFFER, fboID); | |
400 | - GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0}; | |
401 | - glDrawBuffers(1, DrawBuffers); | |
402 | - glBindTexture(GL_TEXTURE_2D, texbufferID); | |
403 | - glClearColor(1,1,1,1); | |
404 | - glClear(GL_COLOR_BUFFER_BIT); | |
405 | - glMatrixMode(GL_PROJECTION); | |
406 | - glLoadIdentity(); | |
407 | - glMatrixMode(GL_MODELVIEW); | |
408 | - glLoadIdentity(); | |
409 | - glViewport(0,0,2.0*len, numSamples*len); | |
410 | - gluOrtho2D(0.0,2.0*len,0.0,numSamples*len); | |
411 | - glEnable(GL_TEXTURE_3D); | |
412 | - glBindTexture(GL_TEXTURE_3D, texID); | |
413 | - | |
414 | - CHECK_OPENGL_ERROR | |
415 | - } | |
416 | - | |
417 | - ///Method for Unbinding all of the texture resources | |
418 | - void | |
419 | - Unbind() | |
420 | - { | |
421 | - //Finalize GL_buffer | |
422 | - glBindTexture(GL_TEXTURE_3D, 0); | |
423 | - glDisable(GL_TEXTURE_3D); | |
424 | - glBindFramebuffer(GL_FRAMEBUFFER,0); | |
425 | - glBindTexture(GL_TEXTURE_2D, 0); | |
426 | - } | |
427 | 386 | |
428 | 387 | ///Method for using the gl manipulation to alighn templates from |
429 | 388 | ///Template space (-0.5 0.5) to Texture space (0.0, 1.0), |
... | ... | @@ -558,7 +517,7 @@ class gl_spider |
558 | 517 | attachSpider(GLuint id) |
559 | 518 | { |
560 | 519 | texID = id; |
561 | - GenerateFBO(20, numSamples*10); | |
520 | + GenerateFBO(16, numSamples*8); | |
562 | 521 | setDims(0.6, 0.6, 1.0); |
563 | 522 | setSize(512.0, 512.0, 426.0); |
564 | 523 | setMatrix(); |
... | ... | @@ -704,6 +663,47 @@ class gl_spider |
704 | 663 | return fboID; |
705 | 664 | } |
706 | 665 | |
666 | + ///Method for controling the buffer and texture binding in order to properly | |
667 | + ///do the render to texture. | |
668 | + void | |
669 | + Bind() | |
670 | + { | |
671 | + float len = 8.0; | |
672 | + glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer | |
673 | + glFramebufferTexture2D( | |
674 | + GL_FRAMEBUFFER, | |
675 | + GL_COLOR_ATTACHMENT0, | |
676 | + GL_TEXTURE_2D, | |
677 | + texbufferID, | |
678 | + 0); | |
679 | + glBindFramebuffer(GL_FRAMEBUFFER, fboID); | |
680 | + GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0}; | |
681 | + glDrawBuffers(1, DrawBuffers); | |
682 | + glBindTexture(GL_TEXTURE_2D, texbufferID); | |
683 | + glClearColor(1,1,1,1); | |
684 | + glClear(GL_COLOR_BUFFER_BIT); | |
685 | + glMatrixMode(GL_PROJECTION); | |
686 | + glLoadIdentity(); | |
687 | + glMatrixMode(GL_MODELVIEW); | |
688 | + glLoadIdentity(); | |
689 | + glViewport(0,0,2.0*len, numSamples*len); | |
690 | + gluOrtho2D(0.0,2.0*len,0.0,numSamples*len); | |
691 | + glEnable(GL_TEXTURE_3D); | |
692 | + glBindTexture(GL_TEXTURE_3D, texID); | |
693 | + | |
694 | + CHECK_OPENGL_ERROR | |
695 | + } | |
696 | + | |
697 | + ///Method for Unbinding all of the texture resources | |
698 | + void | |
699 | + Unbind() | |
700 | + { | |
701 | + //Finalize GL_buffer | |
702 | + glBindTexture(GL_TEXTURE_3D, 0); | |
703 | + glDisable(GL_TEXTURE_3D); | |
704 | + glBindFramebuffer(GL_FRAMEBUFFER,0); | |
705 | + glBindTexture(GL_TEXTURE_2D, 0); | |
706 | + } | |
707 | 707 | //--------------------------------------------------------------------------// |
708 | 708 | //-----------------------------TEMPORARY METHODS----------------------------// |
709 | 709 | //--------------------------------------------------------------------------// |
... | ... | @@ -725,12 +725,12 @@ class gl_spider |
725 | 725 | int |
726 | 726 | Step() |
727 | 727 | { |
728 | - Bind(); | |
728 | + // Bind(); | |
729 | 729 | findOptimalDirection(); |
730 | 730 | findOptimalPosition(); |
731 | 731 | findOptimalScale(); |
732 | 732 | // branchDetection(); |
733 | - Unbind(); | |
733 | + // Unbind(); | |
734 | 734 | return current_cost; |
735 | 735 | } |
736 | 736 | |
... | ... | @@ -776,9 +776,9 @@ class gl_spider |
776 | 776 | glTexCoord3f(x,y,z0); |
777 | 777 | glVertex2f(0.0, j*0.1+0.1); |
778 | 778 | glTexCoord3f(x,y,z1); |
779 | - glVertex2f(20.0, j*0.1+0.1); | |
779 | + glVertex2f(16.0, j*0.1+0.1); | |
780 | 780 | glTexCoord3f(xold,yold,z1); |
781 | - glVertex2f(20.0, j*0.1); | |
781 | + glVertex2f(16.0, j*0.1); | |
782 | 782 | glTexCoord3f(xold,yold,z0); |
783 | 783 | glVertex2f(0.0, j*0.1); |
784 | 784 | xold=x; | ... | ... |