Commit 79a9bf3f36b37c05bd5cf144c9824b62e9fe4824

Authored by Pavel Govyadinov
1 parent fa737592

new implementation of cost.h without using cublas. Significant speedup, gl_spide…

…r's Bind and Unbind methods have been made public to lower the number of times they are called
Showing 2 changed files with 149 additions and 106 deletions   Show diff stats
stim/cuda/cost.h
... ... @@ -3,19 +3,33 @@
3 3 #include <cuda_runtime.h>
4 4 #include <cublas_v2.h>
5 5 #include <stdio.h>
6   -#include <stim/visualization/colormap.h>
  6 +#include "../visualization/colormap.h"
7 7 #include <sstream>
8   -#include <stim/math/vector.h>
9   -#include <stim/cuda/devices.h>
10   -#include <stim/cuda/threads.h>
  8 +#include "../math/vector.h"
  9 +#include "../cuda/devices.h"
  10 +#include "../cuda/threads.h"
11 11  
12 12 ///Cost function that works with the gl-spider class to find index of the item with min-cost.
13 13 typedef unsigned char uchar;
14 14 texture<uchar, cudaTextureType2D, cudaReadModeElementType> texIn;
15 15 float *result;
16   -float* v_dif;
17 16 cudaArray* srcArray;
18 17 bool testing = false;
  18 +/*
  19 +struct SharedMemory
  20 +{
  21 + __device__ inline operator float* ()
  22 + {
  23 + extern __shared__ float __smem[];
  24 + return (float *)__smem;
  25 + }
  26 +
  27 + __device__ inline operator const float* () const
  28 + {
  29 + extern __shared__ float __smem[];
  30 + return (float *)__smem;
  31 + }
  32 +};*/
19 33  
20 34 inline void checkCUDAerrors(const char *msg)
21 35 {
... ... @@ -26,30 +40,12 @@ inline void checkCUDAerrors(const char *msg)
26 40 }
27 41 }
28 42  
29   -///Finds the sum of all the pixes in a gives template element.
30   -///Returns the abosolute value.
31   -///@param *diff, a pointer to the memory block that holds the pixel-differences.
32   -float get_sum(float *diff)
33   -{
34   -
35   - cublasStatus_t ret;
36   - cublasHandle_t handle;
37   - ret = cublasCreate(&handle);
38   -
39   - ret = cublasSetVector(20*10, sizeof(*diff), diff, 1, v_dif, 1);
40   - float out;
41   - ret = cublasSasum(handle, 20*10, v_dif, 1, &out);
42   -// cublasDestroy(ret);
43   - cublasDestroy(handle);
44   - return out;
45   -}
46   -
47 43 ///A virtual representation of a uniform template.
48 44 ///Returns the value of the template pixel.
49 45 ///@param x, location of a pixel.
50 46 __device__ float Template(int x)
51 47 {
52   - if(x < 20/6 || x > 20*5/6 || (x > 20*2/6 && x < 20*4/6)){
  48 + if(x < 16/6 || x > 16*5/6 || (x > 16*2/6 && x < 16*4/6)){
53 49 return 1.0;
54 50 }else{
55 51 return 0.0;
... ... @@ -63,15 +59,66 @@ __device__ float Template(int x)
63 59 __global__
64 60 void get_diff (float *result)
65 61 {
66   - //cuPrintf("Hello");
  62 + //float* shared = SharedMemory();
  63 + __shared__ float shared[16][8];
67 64 int x = threadIdx.x + blockIdx.x * blockDim.x;
68 65 int y = threadIdx.y + blockIdx.y * blockDim.y;
69   - int idx = y*20+x;
  66 + int x_t = threadIdx.x;
  67 + int y_t = threadIdx.y;
  68 + //int idx = y*16+x;
  69 + int g_idx = blockIdx.y;
70 70  
71 71 float valIn = tex2D(texIn, x, y)/255.0;
72 72 float valTemp = Template(x);
73   - result[idx] = abs(valIn-valTemp);
74   - //result[idx] = abs(valIn);
  73 + shared[x_t][y_t] = abs(valIn-valTemp);
  74 +
  75 + __syncthreads();
  76 +
  77 + for(unsigned int step = blockDim.x/2; step >= 1; step >>= 1)
  78 + {
  79 + __syncthreads();
  80 + if (x_t < step)
  81 + {
  82 + shared[x_t][y_t] += shared[x_t + step][y_t];
  83 + }
  84 + __syncthreads();
  85 + }
  86 + __syncthreads();
  87 +
  88 + for(unsigned int step = blockDim.y/2; step >= 1; step >>= 1)
  89 + {
  90 + __syncthreads();
  91 + if(y_t < step)
  92 + {
  93 + shared[x_t][y_t] += shared[x_t][y_t + step];
  94 + }
  95 + __syncthreads();
  96 + }
  97 + __syncthreads();
  98 +/* for(unsigned int step = 1; step < blockDim.x; step *= 2)
  99 + {
  100 + __syncthreads();
  101 + if (x_t %(2*step) == 0)
  102 + {
  103 + shared[x_t][y_t] += shared[x_t + step][y_t];
  104 + }
  105 + }
  106 + __syncthreads();
  107 +
  108 + for(unsigned int step = 1; step < blockDim.y; step *= 2)
  109 + {
  110 + __syncthreads();
  111 + if(y_t%(2*step) == 0)
  112 + {
  113 + shared[x_t][y_t] += shared[x_t][y_t + step];
  114 + }
  115 + }
  116 + __syncthreads(); */
  117 + if(x_t == 0 && y_t == 0)
  118 + result[g_idx] = shared[0][0];
  119 +
  120 +
  121 +// //result[idx] = abs(valIn);
75 122 }
76 123  
77 124  
... ... @@ -82,12 +129,6 @@ void get_diff (float *result)
82 129 ///@param DIM_Y, integer controlling how much memory to allocate.
83 130 void initArray(cudaGraphicsResource_t src, int DIM_Y)
84 131 {
85   - //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar> ();
86   - //cudaMallocArray(&result, &channelDesc, DIM_X, DIM_Y, 0);
87   - //HANDLE_ERROR(
88   - // cudaGraphicsGLRegisterImage(&src,
89   - // fboID,
90   - // GL_TEXTURE_2D,
91 132 HANDLE_ERROR(
92 133 cudaGraphicsMapResources(1, &src)
93 134 );
... ... @@ -97,10 +138,8 @@ void initArray(cudaGraphicsResource_t src, int DIM_Y)
97 138 HANDLE_ERROR(
98 139 cudaBindTextureToArray(texIn, srcArray)
99 140 );
100   - cudaMalloc( (void**) &result, 20*DIM_Y*sizeof(float));
  141 + cudaMalloc( (void**) &result, DIM_Y*sizeof(float));
101 142 checkCUDAerrors("Memory Allocation Issue 1");
102   - cudaMalloc((void **) &v_dif, 20*10*sizeof(float));
103   - checkCUDAerrors("Memory Allocation Issue 2");
104 143 //HANDLE_ERROR(
105 144 // cudaBindTextureToArray(texIn, ptr, &channelDesc)
106 145 // );
... ... @@ -117,9 +156,6 @@ void cleanUP(cudaGraphicsResource_t src)
117 156 cudaGraphicsUnmapResources(1,&src)
118 157 );
119 158 HANDLE_ERROR(
120   - cudaFree(v_dif)
121   - );
122   - HANDLE_ERROR(
123 159 cudaUnbindTexture(texIn)
124 160 );
125 161 }
... ... @@ -151,25 +187,32 @@ stim::vec&lt;int&gt; get_cost(cudaGraphicsResource_t src, int DIM_Y)
151 187 // name << "sample_" << inter << "_" << idx << ".bmp";
152 188 // stim::gpu2image<float>(v_dif, name.str(), 20,10,0,1);
153 189  
154   - float output[DIM_Y];
  190 + //float output[DIM_Y];
  191 + float *output;
  192 + output = (float* ) malloc(DIM_Y*sizeof(float));
155 193 stim::vec<int> ret(0, 0);
156 194 float mini = 10000000000000000.0;
157   - int idx;
158   - initArray(src, DIM_Y*10);
159   - dim3 grid(20/2, DIM_Y*10/2);
160   - dim3 block(2, 2);
161   -
162   - get_diff <<< grid, block >>> (result);
163   - for (int i = 0; i < DIM_Y; i++){
164   - output[i] = get_sum(result+(20*10*i));
165   - if(output[i] <= mini){
  195 + int idx = 0;
  196 + initArray(src, DIM_Y*8);
  197 + dim3 numBlocks(1, DIM_Y);
  198 + dim3 threadsPerBlock(16, 8);
  199 +
  200 +
  201 + get_diff <<< numBlocks, threadsPerBlock >>> (result);
  202 + cudaMemcpy(output, result, DIM_Y*sizeof(float), cudaMemcpyDeviceToHost);
  203 +
  204 + for( int i = 0; i<DIM_Y; i++){
  205 +// std::cout << output[i] << std::endl;
  206 + if(output[i] < mini){
166 207 mini = output[i];
167 208 idx = i;
168 209 }
169   - }
170   -
171   - output[idx] = get_sum(result+(20*10*idx));
  210 + }
  211 +
  212 +// std::cout << "hello" << std::endl;
  213 + //output[idx] = get_sum(result+(16*8*idx));
172 214 cleanUP(src);
173 215 ret[0] = idx; ret[1] = (int) output[idx];
  216 + free(output);
174 217 return ret;
175 218 }
... ...
stim/gl/gl_spider.h
... ... @@ -185,7 +185,7 @@ class gl_spider
185 185 ver = stim::rect<float>(mag,
186 186 pos, temp,
187 187 hor.n());
188   - UpdateBuffer(0.0, 0.0+idx*10.0);
  188 + UpdateBuffer(0.0, 0.0+idx*8.0);
189 189 CHECK_OPENGL_ERROR
190 190 }
191 191 }
... ... @@ -233,7 +233,7 @@ class gl_spider
233 233 ver = stim::rect<float>(mag,
234 234 temp, dir,
235 235 hor.n());
236   - UpdateBuffer(0.0, 0.0+idx*10.0);
  236 + UpdateBuffer(0.0, 0.0+idx*8.0);
237 237 CHECK_OPENGL_ERROR
238 238 }
239 239 }
... ... @@ -244,7 +244,7 @@ class gl_spider
244 244 ///Method for populating the buffer with the sampled texture.
245 245 ///uses the default m <1,1,0>
246 246 void
247   - genMagnitudeVectors(float delta = 0.5)
  247 + genMagnitudeVectors(float delta = 0.70)
248 248 {
249 249  
250 250 //Set up the vectors necessary for Rectangle creation.
... ... @@ -274,7 +274,7 @@ class gl_spider
274 274 ver = stim::rect<float>(temp,
275 275 pos, dir,
276 276 hor.n());
277   - UpdateBuffer(0.0, 0.0+i*10.0);
  277 + UpdateBuffer(0.0, 0.0+i*8.0);
278 278 CHECK_OPENGL_ERROR
279 279 }
280 280 glEndList();
... ... @@ -286,7 +286,7 @@ class gl_spider
286 286 void
287 287 UpdateBuffer(float v_x, float v_y)
288 288 {
289   - float len = 10.0;
  289 + float len = 8.0;
290 290 stim::vec<float>p1;
291 291 stim::vec<float>p2;
292 292 stim::vec<float>p3;
... ... @@ -338,13 +338,13 @@ class gl_spider
338 338 p2[1],
339 339 p2[2]
340 340 );
341   - glVertex2f(v_x+2*len, v_y);
  341 + glVertex2f(v_x+2.0*len, v_y);
342 342 glTexCoord3f(
343 343 p3[0],
344 344 p3[1],
345 345 p3[2]
346 346 );
347   - glVertex2f(v_x+2*len, v_y+len);
  347 + glVertex2f(v_x+2.0*len, v_y+len);
348 348 glTexCoord3f(
349 349 p4[0],
350 350 p4[1],
... ... @@ -383,47 +383,6 @@ class gl_spider
383 383 glBindTexture(GL_TEXTURE_2D, 0);
384 384 }
385 385  
386   - ///Method for controling the buffer and texture binding in order to properly
387   - ///do the render to texture.
388   - void
389   - Bind()
390   - {
391   - float len = 10.0;
392   - glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer
393   - glFramebufferTexture2D(
394   - GL_FRAMEBUFFER,
395   - GL_COLOR_ATTACHMENT0,
396   - GL_TEXTURE_2D,
397   - texbufferID,
398   - 0);
399   - glBindFramebuffer(GL_FRAMEBUFFER, fboID);
400   - GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0};
401   - glDrawBuffers(1, DrawBuffers);
402   - glBindTexture(GL_TEXTURE_2D, texbufferID);
403   - glClearColor(1,1,1,1);
404   - glClear(GL_COLOR_BUFFER_BIT);
405   - glMatrixMode(GL_PROJECTION);
406   - glLoadIdentity();
407   - glMatrixMode(GL_MODELVIEW);
408   - glLoadIdentity();
409   - glViewport(0,0,2.0*len, numSamples*len);
410   - gluOrtho2D(0.0,2.0*len,0.0,numSamples*len);
411   - glEnable(GL_TEXTURE_3D);
412   - glBindTexture(GL_TEXTURE_3D, texID);
413   -
414   - CHECK_OPENGL_ERROR
415   - }
416   -
417   - ///Method for Unbinding all of the texture resources
418   - void
419   - Unbind()
420   - {
421   - //Finalize GL_buffer
422   - glBindTexture(GL_TEXTURE_3D, 0);
423   - glDisable(GL_TEXTURE_3D);
424   - glBindFramebuffer(GL_FRAMEBUFFER,0);
425   - glBindTexture(GL_TEXTURE_2D, 0);
426   - }
427 386  
428 387 ///Method for using the gl manipulation to alighn templates from
429 388 ///Template space (-0.5 0.5) to Texture space (0.0, 1.0),
... ... @@ -558,7 +517,7 @@ class gl_spider
558 517 attachSpider(GLuint id)
559 518 {
560 519 texID = id;
561   - GenerateFBO(20, numSamples*10);
  520 + GenerateFBO(16, numSamples*8);
562 521 setDims(0.6, 0.6, 1.0);
563 522 setSize(512.0, 512.0, 426.0);
564 523 setMatrix();
... ... @@ -704,6 +663,47 @@ class gl_spider
704 663 return fboID;
705 664 }
706 665  
  666 + ///Method for controling the buffer and texture binding in order to properly
  667 + ///do the render to texture.
  668 + void
  669 + Bind()
  670 + {
  671 + float len = 8.0;
  672 + glBindFramebuffer(GL_FRAMEBUFFER, fboID);//set up GL buffer
  673 + glFramebufferTexture2D(
  674 + GL_FRAMEBUFFER,
  675 + GL_COLOR_ATTACHMENT0,
  676 + GL_TEXTURE_2D,
  677 + texbufferID,
  678 + 0);
  679 + glBindFramebuffer(GL_FRAMEBUFFER, fboID);
  680 + GLenum DrawBuffers[1] = {GL_COLOR_ATTACHMENT0};
  681 + glDrawBuffers(1, DrawBuffers);
  682 + glBindTexture(GL_TEXTURE_2D, texbufferID);
  683 + glClearColor(1,1,1,1);
  684 + glClear(GL_COLOR_BUFFER_BIT);
  685 + glMatrixMode(GL_PROJECTION);
  686 + glLoadIdentity();
  687 + glMatrixMode(GL_MODELVIEW);
  688 + glLoadIdentity();
  689 + glViewport(0,0,2.0*len, numSamples*len);
  690 + gluOrtho2D(0.0,2.0*len,0.0,numSamples*len);
  691 + glEnable(GL_TEXTURE_3D);
  692 + glBindTexture(GL_TEXTURE_3D, texID);
  693 +
  694 + CHECK_OPENGL_ERROR
  695 + }
  696 +
  697 + ///Method for Unbinding all of the texture resources
  698 + void
  699 + Unbind()
  700 + {
  701 + //Finalize GL_buffer
  702 + glBindTexture(GL_TEXTURE_3D, 0);
  703 + glDisable(GL_TEXTURE_3D);
  704 + glBindFramebuffer(GL_FRAMEBUFFER,0);
  705 + glBindTexture(GL_TEXTURE_2D, 0);
  706 + }
707 707 //--------------------------------------------------------------------------//
708 708 //-----------------------------TEMPORARY METHODS----------------------------//
709 709 //--------------------------------------------------------------------------//
... ... @@ -725,12 +725,12 @@ class gl_spider
725 725 int
726 726 Step()
727 727 {
728   - Bind();
  728 + // Bind();
729 729 findOptimalDirection();
730 730 findOptimalPosition();
731 731 findOptimalScale();
732 732 // branchDetection();
733   - Unbind();
  733 + // Unbind();
734 734 return current_cost;
735 735 }
736 736  
... ... @@ -776,9 +776,9 @@ class gl_spider
776 776 glTexCoord3f(x,y,z0);
777 777 glVertex2f(0.0, j*0.1+0.1);
778 778 glTexCoord3f(x,y,z1);
779   - glVertex2f(20.0, j*0.1+0.1);
  779 + glVertex2f(16.0, j*0.1+0.1);
780 780 glTexCoord3f(xold,yold,z1);
781   - glVertex2f(20.0, j*0.1);
  781 + glVertex2f(16.0, j*0.1);
782 782 glTexCoord3f(xold,yold,z0);
783 783 glVertex2f(0.0, j*0.1);
784 784 xold=x;
... ...