Commit 79a9bf3f36b37c05bd5cf144c9824b62e9fe4824

Authored by Pavel Govyadinov
1 parent fa737592

new implementation of cost.h without using cublas. Significant speedup; gl_spider's Bind and Unbind methods have been made public to lower the number of times they are called
Showing 2 changed files with 149 additions and 106 deletions   Show diff stats
@@ -3,19 +3,33 @@ @@ -3,19 +3,33 @@
3 #include <cuda_runtime.h> 3 #include <cuda_runtime.h>
4 #include <cublas_v2.h> 4 #include <cublas_v2.h>
5 #include <stdio.h> 5 #include <stdio.h>
6 -#include <stim/visualization/colormap.h> 6 +#include "../visualization/colormap.h"
7 #include <sstream> 7 #include <sstream>
8 -#include <stim/math/vector.h>  
9 -#include <stim/cuda/devices.h>  
10 -#include <stim/cuda/threads.h> 8 +#include "../math/vector.h"
  9 +#include "../cuda/devices.h"
  10 +#include "../cuda/threads.h"
11 11
12 ///Cost function that works with the gl-spider class to find index of the item with min-cost. 12 ///Cost function that works with the gl-spider class to find index of the item with min-cost.
13 typedef unsigned char uchar; 13 typedef unsigned char uchar;
14 texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn; 14 texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
15 float *result; 15 float *result;
16 -float* v_dif;  
17 cudaArray* srcArray; 16 cudaArray* srcArray;
18 bool testing = false; 17 bool testing = false;
  18 +/*
  19 +struct SharedMemory
  20 +{
  21 + __device__ inline operator float* ()
  22 + {
  23 + extern __shared__ float __smem[];
  24 + return (float *)__smem;
  25 + }
  26 +
  27 + __device__ inline operator const float* () const
  28 + {
  29 + extern __shared__ float __smem[];
  30 + return (float *)__smem;
  31 + }
  32 +};*/
19 33
20 inline void checkCUDAerrors(const char *msg) 34 inline void checkCUDAerrors(const char *msg)
21 { 35 {
@@ -26,30 +40,12 @@ inline void checkCUDAerrors(const char *msg) @@ -26,30 +40,12 @@ inline void checkCUDAerrors(const char *msg)
26 } 40 }
27 } 41 }
28 42
29 -///Finds the sum of all the pixes in a gives template element.  
30 -///Returns the abosolute value.  
31 -///@param *diff, a pointer to the memory block that holds the pixel-differences.  
32 -float get_sum(float *diff)  
33 -{  
34 -  
35 - cublasStatus_t ret;  
36 - cublasHandle_t handle;  
37 - ret = cublasCreate(&handle);  
38 -  
39 - ret = cublasSetVector(20*10, sizeof(*diff), diff, 1, v_dif, 1);  
40 - float out;  
41 - ret = cublasSasum(handle, 20*10, v_dif, 1, &out);  
42 -// cublasDestroy(ret);  
43 - cublasDestroy(handle);  
44 - return out;  
45 -}  
46 -  
47 ///A virtual representation of a uniform template. 43 ///A virtual representation of a uniform template.
48 ///Returns the value of the template pixel. 44 ///Returns the value of the template pixel.
49 ///@param x, location of a pixel. 45 ///@param x, location of a pixel.
50 __device__ float Template(int x) 46 __device__ float Template(int x)
51 { 47 {
52 - if(x < 20/6 || x > 20*5/6 || (x > 20*2/6 && x < 20*4/6)){ 48 + if(x < 16/6 || x > 16*5/6 || (x > 16*2/6 && x < 16*4/6)){
53 return 1.0; 49 return 1.0;
54 }else{ 50 }else{
55 return 0.0; 51 return 0.0;
@@ -63,15 +59,66 @@ __device__ float Template(int x) @@ -63,15 +59,66 @@ __device__ float Template(int x)
63 __global__ 59 __global__
64 void get_diff (float *result) 60 void get_diff (float *result)
65 { 61 {
66 - //cuPrintf("Hello"); 62 + //float* shared = SharedMemory();
  63 + __shared__ float shared[16][8];
67 int x = threadIdx.x + blockIdx.x * blockDim.x; 64 int x = threadIdx.x + blockIdx.x * blockDim.x;
68 int y = threadIdx.y + blockIdx.y * blockDim.y; 65 int y = threadIdx.y + blockIdx.y * blockDim.y;
69 - int idx = y*20+x; 66 + int x_t = threadIdx.x;
  67 + int y_t = threadIdx.y;
  68 + //int idx = y*16+x;
  69 + int g_idx = blockIdx.y;
70 70
71 float valIn = tex2D(texIn, x, y)/255.0; 71 float valIn = tex2D(texIn, x, y)/255.0;
72 float valTemp = Template(x); 72 float valTemp = Template(x);
73 - result[idx] = abs(valIn-valTemp);  
74 - //result[idx] = abs(valIn); 73 + shared[x_t][y_t] = abs(valIn-valTemp);
  74 +
  75 + __syncthreads();
  76 +
  77 + for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
  78 + {
  79 + __syncthreads();
  80 + if (x_t < step)
  81 + {
  82 + shared[x_t][y_t] += shared[x_t + step][y_t];
  83 + }
  84 + __syncthreads();
  85 + }
  86 + __syncthreads();
  87 +
  88 + for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1)
  89 + {
  90 + __syncthreads();
  91 + if(y_t < step)
  92 + {
  93 + shared[x_t][y_t] += shared[x_t][y_t + step];
  94 + }
  95 + __syncthreads();
  96 + }
  97 + __syncthreads();
  98 +/* for(unsigned int step = 1; step < blockDim.x; step *= 2)
  99 + {
  100 + __syncthreads();
  101 + if (x_t %(2*step) == 0)
  102 + {
  103 + shared[x_t][y_t] += shared[x_t + step][y_t];
  104 + }
  105 + }
  106 + __syncthreads();
  107 +
  108 + for(unsigned int step = 1; step < blockDim.y; step *= 2)
  109 + {
  110 + __syncthreads();
  111 + if(y_t%(2*step) == 0)
  112 + {
  113 + shared[x_t][y_t] += shared[x_t][y_t + step];
  114 + }
  115 + }
  116 + __syncthreads(); */
  117 + if(x_t == 0 && y_t == 0)
  118 + result[g_idx] = shared[0][0];
  119 +
  120 +
  121 +// //result[idx] = abs(valIn);
75 } 122 }
76 123
77 124
@@ -82,12 +129,6 @@ void get_diff (float *result) @@ -82,12 +129,6 @@ void get_diff (float *result)
82 ///@param DIM_Y, integer controlling how much memory to allocate. 129 ///@param DIM_Y, integer controlling how much memory to allocate.
83 void initArray(cudaGraphicsResource_t src, int DIM_Y) 130 void initArray(cudaGraphicsResource_t src, int DIM_Y)
84 { 131 {
85 - //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar> ();  
86 - //cudaMallocArray(&result, &channelDesc, DIM_X, DIM_Y, 0);  
87 - //HANDLE_ERROR(  
88 - // cudaGraphicsGLRegisterImage(&src,  
89 - // fboID,  
90 - // GL_TEXTURE_2D,  
91 HANDLE_ERROR( 132 HANDLE_ERROR(
92 cudaGraphicsMapResources(1, &src) 133 cudaGraphicsMapResources(1, &src)
93 ); 134 );
@@ -97,10 +138,8 @@ void initArray(cudaGraphicsResource_t src, int DIM_Y) @@ -97,10 +138,8 @@ void initArray(cudaGraphicsResource_t src, int DIM_Y)
97 HANDLE_ERROR( 138 HANDLE_ERROR(
98 cudaBindTextureToArray(texIn, srcArray) 139 cudaBindTextureToArray(texIn, srcArray)
99 ); 140 );
100 - cudaMalloc( (void**) &result, 20*DIM_Y*sizeof(float)); 141 + cudaMalloc( (void**) &result, DIM_Y*sizeof(float));
101 checkCUDAerrors("Memory Allocation Issue 1"); 142 checkCUDAerrors("Memory Allocation Issue 1");
102 - cudaMalloc((void **) &v_dif, 20*10*sizeof(float));  
103 - checkCUDAerrors("Memory Allocation Issue 2");  
104 //HANDLE_ERROR( 143 //HANDLE_ERROR(
105 // cudaBindTextureToArray(texIn, ptr, &channelDesc) 144 // cudaBindTextureToArray(texIn, ptr, &channelDesc)
106 // ); 145 // );
@@ -117,9 +156,6 @@ void cleanUP(cudaGraphicsResource_t src) @@ -117,9 +156,6 @@ void cleanUP(cudaGraphicsResource_t src)
117 cudaGraphicsUnmapResources(1,&src) 156 cudaGraphicsUnmapResources(1,&src)
118 ); 157 );
119 HANDLE_ERROR( 158 HANDLE_ERROR(
120 - cudaFree(v_dif)  
121 - );  
122 - HANDLE_ERROR(  
123 cudaUnbindTexture(texIn) 159 cudaUnbindTexture(texIn)
124 ); 160 );
125 } 161 }
@@ -151,25 +187,32 @@ stim::vec<int> get_cost(cudaGraphicsResource_t src, int DIM_Y) @@ -151,25 +187,32 @@ stim::vec<int> get_cost(cudaGraphicsResource_t src, int DIM_Y)
151 // name << "sample_" << inter << "_" << idx << ".bmp"; 187 // name << "sample_" << inter << "_" << idx << ".bmp";
152 // stim::gpu2image<float>(v_dif, name.str(), 20,10,0,1); 188 // stim::gpu2image<float>(v_dif, name.str(), 20,10,0,1);
153 189
154 - float output[DIM_Y]; 190 + //float output[DIM_Y];
  191 + float *output;
  192 + output = (float* ) malloc(DIM_Y*sizeof(float));
155 stim::vec<int> ret(0, 0); 193 stim::vec<int> ret(0, 0);
156 float mini = 10000000000000000.0; 194 float mini = 10000000000000000.0;
157 - int idx;  
158 - initArray(src, DIM_Y*10);  
159 - dim3 grid(20/2, DIM_Y*10/2);  
160 - dim3 block(2, 2);  
161 -  
162 - get_diff <<< grid, block >>> (result);  
163 - for (int i = 0; i < DIM_Y; i++){  
164 - output[i] = get_sum(result+(20*10*i));  
165 - if(output[i] <= mini){ 195 + int idx = 0;
  196 + initArray(src, DIM_Y*8);
  197 + dim3 numBlocks(1, DIM_Y);
  198 + dim3 threadsPerBlock(16, 8);
  199 +
  200 +
  201 + get_diff <<< numBlocks, threadsPerBlock >>> (result);
  202 + cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost);
  203 +
  204 + for( int i = 0; i<DIM_Y; i++){
  205 +// std::cout << output[i] << std::endl;
  206 + if(output[i] < mini){
166 mini = output[i]; 207 mini = output[i];
167 idx = i; 208 idx = i;
168 } 209 }
169 - }  
170 -  
171 - output[idx] = get_sum(result+(20*10*idx)); 210 + }
  211 +
  212 +// std::cout << "hello" << std::endl;
  213 + //output[idx] = get_sum(result+(16*8*idx));
172 cleanUP(src); 214 cleanUP(src);
173 ret[0] = idx; ret[1] = (int) output[idx]; 215 ret[0] = idx; ret[1] = (int) output[idx];
  216 + free(output);
174 return ret; 217 return ret;
175 } 218 }
stim/gl/gl_spider.h
@@ -185,7 +185,7 @@ class gl_spider @@ -185,7 +185,7 @@ class gl_spider
185 ver = stim::rect<float>(mag, 185 ver = stim::rect<float>(mag,
186 pos, temp, 186 pos, temp,
187 hor.n()); 187 hor.n());
188 - UpdateBuffer(0.0, 0.0+idx*10.0); 188 + UpdateBuffer(0.0, 0.0+idx*8.0);
189 CHECK_OPENGL_ERROR 189 CHECK_OPENGL_ERROR
190 } 190 }
191 } 191 }
@@ -233,7 +233,7 @@ class gl_spider @@ -233,7 +233,7 @@ class gl_spider
233 ver = stim::rect<float>(mag, 233 ver = stim::rect<float>(mag,
234 temp, dir, 234 temp, dir,
235 hor.n()); 235 hor.n());
236 - UpdateBuffer(0.0, 0.0+idx*10.0); 236 + UpdateBuffer(0.0, 0.0+idx*8.0);
237 CHECK_OPENGL_ERROR 237 CHECK_OPENGL_ERROR
238 } 238 }
239 } 239 }
@@ -244,7 +244,7 @@ class gl_spider @@ -244,7 +244,7 @@ class gl_spider
244 ///Method for populating the buffer with the sampled texture. 244 ///Method for populating the buffer with the sampled texture.
245 ///uses the default m <1,1,0> 245 ///uses the default m <1,1,0>
246 void 246 void
247 - genMagnitudeVectors(float delta = 0.5) 247 + genMagnitudeVectors(float delta = 0.70)
248 { 248 {
249 249
250 //Set up the vectors necessary for Rectangle creation. 250 //Set up the vectors necessary for Rectangle creation.
@@ -274,7 +274,7 @@ class gl_spider @@ -274,7 +274,7 @@ class gl_spider
274 ver = stim::rect<float>(temp, 274 ver = stim::rect<float>(temp,
275 pos, dir, 275 pos, dir,
276 hor.n()); 276 hor.n());
277 - UpdateBuffer(0.0, 0.0+i*10.0); 277 + UpdateBuffer(0.0, 0.0+i*8.0);
278 CHECK_OPENGL_ERROR 278 CHECK_OPENGL_ERROR
279 } 279 }
280 glEndList(); 280 glEndList();
@@ -286,7 +286,7 @@ class gl_spider @@ -286,7 +286,7 @@ class gl_spider
286 void 286 void
287 UpdateBuffer(float v_x, float v_y) 287 UpdateBuffer(float v_x, float v_y)
288 { 288 {
289 - float len = 10.0; 289 + float len = 8.0;
290 stim::vec<float>p1; 290 stim::vec<float>p1;
291 stim::vec<float>p2; 291 stim::vec<float>p2;
292 stim::vec<float>p3; 292 stim::vec<float>p3;
@@ -338,13 +338,13 @@ class gl_spider @@ -338,13 +338,13 @@ class gl_spider
338 p2[1], 338 p2[1],
339 p2[2] 339 p2[2]
340 ); 340 );
341 - glVertex2f(v_x+2*len, v_y); 341 + glVertex2f(v_x+2.0*len, v_y);
342 glTexCoord3f( 342 glTexCoord3f(
343 p3[0], 343 p3[0],
344 p3[1], 344 p3[1],
345 p3[2] 345 p3[2]
346 ); 346 );
347 - glVertex2f(v_x+2*len, v_y+len); 347 + glVertex2f(v_x+2.0*len, v_y+len);
348 glTexCoord3f( 348 glTexCoord3f(
349 p4[0], 349 p4[0],
350 p4[1], 350 p4[1],
@@ -383,47 +383,6 @@ class gl_spider @@ -383,47 +383,6 @@ class gl_spider
383 glBindTexture(GL_TEXTURE_2D, 0); 383 glBindTexture(GL_TEXTURE_2D, 0);
384 } 384 }
385 385
386 - ///Method for controling the buffer and texture binding in order to properly  
387 - ///do the render to texture.  
388 - void  
389 - Bind()  
390 - {  
391 - float len = 10.0;  
392 - glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer  
393 - glFramebufferTexture2D(  
394 - GL_FRAMEBUFFER,  
395 - GL_COLOR_ATTACHMENT0,  
396 - GL_TEXTURE_2D,  
397 - texbufferID,  
398 - 0);  
399 - glBindFramebuffer(GL_FRAMEBUFFER, fboID);  
400 - GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0};  
401 - glDrawBuffers(1, DrawBuffers);  
402 - glBindTexture(GL_TEXTURE_2D, texbufferID);  
403 - glClearColor(1,1,1,1);  
404 - glClear(GL_COLOR_BUFFER_BIT);  
405 - glMatrixMode(GL_PROJECTION);  
406 - glLoadIdentity();  
407 - glMatrixMode(GL_MODELVIEW);  
408 - glLoadIdentity();  
409 - glViewport(0,0,2.0*len, numSamples*len);  
410 - gluOrtho2D(0.0,2.0*len,0.0,numSamples*len);  
411 - glEnable(GL_TEXTURE_3D);  
412 - glBindTexture(GL_TEXTURE_3D, texID);  
413 -  
414 - CHECK_OPENGL_ERROR  
415 - }  
416 -  
417 - ///Method for Unbinding all of the texture resources  
418 - void  
419 - Unbind()  
420 - {  
421 - //Finalize GL_buffer  
422 - glBindTexture(GL_TEXTURE_3D, 0);  
423 - glDisable(GL_TEXTURE_3D);  
424 - glBindFramebuffer(GL_FRAMEBUFFER,0);  
425 - glBindTexture(GL_TEXTURE_2D, 0);  
426 - }  
427 386
428 ///Method for using the gl manipulation to alighn templates from 387 ///Method for using the gl manipulation to alighn templates from
429 ///Template space (-0.5 0.5) to Texture space (0.0, 1.0), 388 ///Template space (-0.5 0.5) to Texture space (0.0, 1.0),
@@ -558,7 +517,7 @@ class gl_spider @@ -558,7 +517,7 @@ class gl_spider
558 attachSpider(GLuint id) 517 attachSpider(GLuint id)
559 { 518 {
560 texID = id; 519 texID = id;
561 - GenerateFBO(20, numSamples*10); 520 + GenerateFBO(16, numSamples*8);
562 setDims(0.6, 0.6, 1.0); 521 setDims(0.6, 0.6, 1.0);
563 setSize(512.0, 512.0, 426.0); 522 setSize(512.0, 512.0, 426.0);
564 setMatrix(); 523 setMatrix();
@@ -704,6 +663,47 @@ class gl_spider @@ -704,6 +663,47 @@ class gl_spider
704 return fboID; 663 return fboID;
705 } 664 }
706 665
  666 + ///Method for controling the buffer and texture binding in order to properly
  667 + ///do the render to texture.
  668 + void
  669 + Bind()
  670 + {
  671 + float len = 8.0;
  672 + glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer
  673 + glFramebufferTexture2D(
  674 + GL_FRAMEBUFFER,
  675 + GL_COLOR_ATTACHMENT0,
  676 + GL_TEXTURE_2D,
  677 + texbufferID,
  678 + 0);
  679 + glBindFramebuffer(GL_FRAMEBUFFER, fboID);
  680 + GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0};
  681 + glDrawBuffers(1, DrawBuffers);
  682 + glBindTexture(GL_TEXTURE_2D, texbufferID);
  683 + glClearColor(1,1,1,1);
  684 + glClear(GL_COLOR_BUFFER_BIT);
  685 + glMatrixMode(GL_PROJECTION);
  686 + glLoadIdentity();
  687 + glMatrixMode(GL_MODELVIEW);
  688 + glLoadIdentity();
  689 + glViewport(0,0,2.0*len, numSamples*len);
  690 + gluOrtho2D(0.0,2.0*len,0.0,numSamples*len);
  691 + glEnable(GL_TEXTURE_3D);
  692 + glBindTexture(GL_TEXTURE_3D, texID);
  693 +
  694 + CHECK_OPENGL_ERROR
  695 + }
  696 +
  697 + ///Method for Unbinding all of the texture resources
  698 + void
  699 + Unbind()
  700 + {
  701 + //Finalize GL_buffer
  702 + glBindTexture(GL_TEXTURE_3D, 0);
  703 + glDisable(GL_TEXTURE_3D);
  704 + glBindFramebuffer(GL_FRAMEBUFFER,0);
  705 + glBindTexture(GL_TEXTURE_2D, 0);
  706 + }
707 //--------------------------------------------------------------------------// 707 //--------------------------------------------------------------------------//
708 //-----------------------------TEMPORARY METHODS----------------------------// 708 //-----------------------------TEMPORARY METHODS----------------------------//
709 //--------------------------------------------------------------------------// 709 //--------------------------------------------------------------------------//
@@ -725,12 +725,12 @@ class gl_spider @@ -725,12 +725,12 @@ class gl_spider
725 int 725 int
726 Step() 726 Step()
727 { 727 {
728 - Bind(); 728 + // Bind();
729 findOptimalDirection(); 729 findOptimalDirection();
730 findOptimalPosition(); 730 findOptimalPosition();
731 findOptimalScale(); 731 findOptimalScale();
732 // branchDetection(); 732 // branchDetection();
733 - Unbind(); 733 + // Unbind();
734 return current_cost; 734 return current_cost;
735 } 735 }
736 736
@@ -776,9 +776,9 @@ class gl_spider @@ -776,9 +776,9 @@ class gl_spider
776 glTexCoord3f(x,y,z0); 776 glTexCoord3f(x,y,z0);
777 glVertex2f(0.0, j*0.1+0.1); 777 glVertex2f(0.0, j*0.1+0.1);
778 glTexCoord3f(x,y,z1); 778 glTexCoord3f(x,y,z1);
779 - glVertex2f(20.0, j*0.1+0.1); 779 + glVertex2f(16.0, j*0.1+0.1);
780 glTexCoord3f(xold,yold,z1); 780 glTexCoord3f(xold,yold,z1);
781 - glVertex2f(20.0, j*0.1); 781 + glVertex2f(16.0, j*0.1);
782 glTexCoord3f(xold,yold,z0); 782 glTexCoord3f(xold,yold,z0);
783 glVertex2f(0.0, j*0.1); 783 glVertex2f(0.0, j*0.1);
784 xold=x; 784 xold=x;