codebase / stimlib

Browse Code »

Commit 3f0de7ddf2ecadd35cfb1915abe205be37a89e97

Authored by Laila Saadatifard 2016-08-17 16:31:11 -0500

1 parent e0edbe13

Upload the vote and update_dir kernels that use a bounding box

Showing 5 changed files with 115 additions and 181 deletions Show diff stats

Inline Side-by-side

stim/cuda/ivote/david_update_dir_global.cuh deleted

View file @e0edbe1

1		-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2		-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
3		-
4		-# include <iostream>
5		-# include <cuda.h>
6		-#include <stim/cuda/cudatools.h>
7		-#include <stim/cuda/sharedmem.cuh>
8		-#include <math.h>
9		-#include "cpyToshare.cuh"
10		-
11		-#define RMAX_TEST 8
12		-
13		-namespace stim{
14		- namespace cuda{
15		-
// Computes the voting direction for the next iteration. For each pixel (voter) the kernel
// scans the disc of radius rmax around it, keeps only the neighbours whose tabulated angle
// lies within phi of the voter's current direction (gpuGrad[2*i]), and stores in gpuDir[i]
// the tabulated angle of the neighbour with the largest vote.
//
//   gpuDir   : output, one value per pixel (x * y elements)
//   gpuVote  : input vote image (x * y elements)
//   gpuGrad  : input gradient image, two values per pixel; element 2*i is read as the direction
//   gpuTable : (2*rmax+1) x (2*rmax+1) lookup table of angles, indexed [(rmax - dy)][(rmax - dx)]
//   phi      : half-width of the accepted angular range
//   rmax     : search radius in pixels
//   x, y     : image width and height
//
// Launch requirements: a 2D grid of 2D blocks covering the image, with
// (2*rmax+1)*(2*rmax+1)*sizeof(T) bytes of dynamic shared memory.
// NOTE(review): the angular test is a plain absolute difference with no wrap-around;
// confirm the table/gradient angle convention makes that acceptable.
template<typename T>
__global__ void cuda_update_dir(T* gpuDir, T* gpuVote, T* gpuGrad, T* gpuTable, T phi, int rmax, int x, int y){

	// Declare the dynamic shared memory as raw bytes and cast it: an
	// "extern __shared__ T name[]" inside a template clashes as soon as the
	// kernel is instantiated for a second element type.
	extern __shared__ __align__(16) unsigned char update_dir_smem[];
	T* atan2_table = reinterpret_cast<T*>(update_dir_smem);

	int x_table = 2 * rmax + 1;						//width of the lookup table

	//stage the lookup table in shared memory
	stim::cuda::sharedMemcpy(atan2_table, gpuTable, x_table * x_table, threadIdx.x, blockDim.x);

	//the barrier sits before the early return below so that every thread of the block reaches it
	__syncthreads();

	//calculate the 2D coordinates for this thread
	int xi = blockIdx.x * blockDim.x + threadIdx.x;
	int yi = blockIdx.y * blockDim.y + threadIdx.y;
	if(xi >= x || yi >= y) return;					//threads outside of the image have nothing to do

	int i = yi * x + xi;							//convert 2D coordinates to 1D

	float theta = gpuGrad[2 * i];					//current voting direction of this voter
	float best_vote = 0;							//largest vote found so far in the voting area
	int best_dx = 0;								//offset of that vote relative to the voter
	int best_dy = 0;
	int rmax_sq = rmax * rmax;

	for(int yr = -rmax; yr <= rmax; yr++){
		int yv = yi + yr;
		if(yv < 0 || yv >= y) continue;				//row is outside of the image

		for(int xr = -rmax; xr <= rmax; xr++){
			int xv = xi + xr;
			//column is outside of the image: without this test the read below would
			//wrap into a neighbouring row or leave the buffer entirely
			if(xv < 0 || xv >= x) continue;

			if(xr * xr + yr * yr >= rmax_sq) continue;	//neighbour is outside of the search radius

			//direction from the table for this offset
			float atan_angle = atan2_table[(rmax - yr) * x_table + (rmax - xr)];

			//only neighbours inside the voting angle are considered
			if(fabsf(atan_angle - theta) < phi){
				float vote_c = gpuVote[yv * x + xv];
				if(vote_c > best_vote){
					best_vote = vote_c;
					best_dx = xr;
					best_dy = yr;
				}
			}
		}
	}

	//the new direction points at the strongest vote (offset (0,0) when nothing was found)
	gpuDir[i] = atan2_table[(rmax - best_dy) * x_table + (rmax - best_dx)];
}	//end kernel
77		-
// Copies the freshly computed voting direction into the gradient image:
// gpuGrad[2*i] = gpuDir[i] for every pixel i of an x-by-y image. The second
// value stored per pixel in gpuGrad (index 2*i + 1) is left untouched.
// Expects a 2D grid of 2D blocks; the grid may be larger than the image.
template<typename T>
__global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, int x, int y){

	//calculate the 2D coordinates for this thread
	int xi = blockIdx.x * blockDim.x + threadIdx.x;
	int yi = blockIdx.y * blockDim.y + threadIdx.y;

	//threads of partially filled blocks fall outside of the image and must not write
	if(xi >= x || yi >= y) return;

	//convert 2D coordinates to 1D
	int i = yi * x + xi;

	//update the gradient image with the vote direction
	gpuGrad[2 * i] = gpuDir[i];
}
92		-
// Updates the voting direction stored in gpuGrad (element 2*i of every pixel) from the
// current vote image. All pointers refer to device memory:
//   gpuVote  : vote image, x * y elements
//   gpuGrad  : gradient image, 2 * x * y elements, updated in place
//   gpuTable : angle lookup table, (2*rmax+1) * (2*rmax+1) elements
//   phi      : half-width of the voting angle
//   rmax     : search radius in pixels
//   x, y     : image width and height
// A temporary x * y buffer is allocated for the new directions and released before returning.
template<typename T>
void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){

	//calculate the number of bytes in the direction image (size_t so large images do not wrap)
	size_t bytes = (size_t)x * y * sizeof(T);

	//square thread blocks, as large as the device allows
	unsigned int max_threads = stim::maxThreadsPerBlock();
	unsigned int side = (unsigned int)sqrt((double)max_threads);
	dim3 threads(side, side);
	dim3 blocks(x / threads.x + 1, y / threads.y + 1);

	//allocate space on the GPU for the updated vote direction
	T* gpuDir;
	HANDLE_ERROR(cudaMalloc(&gpuDir, bytes));

	//dynamic shared memory needed for the lookup table, computed in integer arithmetic
	size_t table_width = 2 * (size_t)rmax + 1;
	size_t shared_mem = sizeof(T) * table_width * table_width;
	std::cout<<"Shared memory for atan2 table: "<<shared_mem<<std::endl;

	//call the kernel to calculate the new voting direction
	cuda_update_dir<T> <<< blocks, threads, shared_mem >>>(gpuDir, gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);
	HANDLE_ERROR(cudaGetLastError());				//a bad launch configuration is reported here, not by the launch itself

	//call the kernel to update the gradient direction
	cuda_update_grad<T> <<< blocks, threads >>>(gpuGrad, gpuDir, x, y);
	HANDLE_ERROR(cudaGetLastError());

	//free allocated memory
	HANDLE_ERROR(cudaFree(gpuDir));
}
125		-
// Host-side wrapper around gpu_update_dir: uploads the vote image, the gradient image and
// the angle lookup table, runs the update on the GPU and copies the updated gradient back.
//   cpuVote  : vote image, x * y elements (host memory, read only)
//   cpuGrad  : gradient image, 2 * x * y elements (host memory, overwritten with the result)
//   cpuTable : angle lookup table, (2*rmax+1) * (2*rmax+1) elements (host memory)
//   phi      : half-width of the voting angle
//   rmax     : search radius in pixels
//   x, y     : image width and height
template<typename T>
void cpu_update_dir(T* cpuVote, T* cpuGrad, T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){

	//calculate the number of bytes in the vote image (size_t so large images do not wrap)
	size_t bytes = (size_t)x * y * sizeof(T);

	//calculate the number of bytes in the atan2 table
	size_t table_width = 2 * (size_t)rmax + 1;
	size_t bytes_table = table_width * table_width * sizeof(T);

	//allocate space on the GPU for the vote image and copy it over
	T* gpuVote;
	HANDLE_ERROR(cudaMalloc(&gpuVote, bytes));
	HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice));

	//allocate space on the GPU for the gradient image (two values per pixel) and copy it over
	T* gpuGrad;
	HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes * 2));
	HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes * 2, cudaMemcpyHostToDevice));

	//allocate space on the GPU for the atan2 table and copy it over
	T* gpuTable;
	HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table));
	HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice));

	//call the GPU version of the update direction function
	gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x, y);

	//copy the new gradient image back to the CPU
	HANDLE_ERROR(cudaMemcpy(cpuGrad, gpuGrad, bytes * 2, cudaMemcpyDeviceToHost));

	//free allocated memory
	HANDLE_ERROR(cudaFree(gpuTable));
	HANDLE_ERROR(cudaFree(gpuVote));
	HANDLE_ERROR(cudaFree(gpuGrad));
}
167		-
168		- }
169		-}
170		-
171		-#endif
172	0	\ No newline at end of file

stim/cuda/ivote/re_sample.cuh 0 → 100644

Wrap text Show/Hide comments View file @3f0de7d

	1	+#ifndef STIM_CUDA_RE_SAMPLE_H
	2	+#define STIM_CUDA_RE_SAMPLE_H
	3	+
	4	+#include <iostream>
	5	+#include <cuda.h>
	6	+#include <stim/cuda/cudatools.h>
	7	+#include <stim/cuda/templates/gaussian_blur.cuh>
	8	+
	9	+namespace stim{
	10	+ namespace cuda{
	11	+
	12	+ template<typename T>
	13	+ __global__ void cuda_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
	14	+
	15	+ unsigned int sigma_ds = 1/resize;
	16	+ unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
	17	+ unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
	18	+
	19	+
	20	+ // calculate the 2D coordinates for this current thread.
	21	+ int xi = blockIdx.x * blockDim.x + threadIdx.x;
	22	+ int yi = blockIdx.y;
	23	+ // convert 2D coordinates to 1D
	24	+ int i = yi * x + xi;
	25	+
	26	+ if(xi< x && yi< y){
	27	+ if(xi%sigma_ds==0){
	28	+ if(yi%sigma_ds==0){
	29	+ gpuI[i] = gpuI0[(yi/sigma_ds)*x_ds + xi/sigma_ds];
	30	+ }
	31	+ }
	32	+ else gpuI[i] = 0;
	33	+
	34	+ //int x_org = xi * sigma_ds ;
	35	+ //int y_org = yi * sigma_ds ;
	36	+ //int i_org = y_org * x + x_org;
	37	+ //gpuI[i] = gpuI0[i_org];
	38	+ }
	39	+
	40	+ }
	41	+
	42	+
	43	+ /// Applies a Gaussian blur to a 2D image stored on the GPU
	44	+ template<typename T>
	45	+ void gpu_re_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){
	46	+
	47	+
	48	+ //unsigned int sigma_ds = 1/resize;
	49	+ //unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
	50	+ //unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
	51	+
	52	+ //get the number of pixels in the image
	53	+ //unsigned int pixels_ds = x_ds * y_ds;
	54	+
	55	+ unsigned int max_threads = stim::maxThreadsPerBlock();
	56	+ dim3 threads(max_threads, 1);
	57	+ dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y);
	58	+
	59	+ //stim::cuda::gpu_gaussian_blur2<float>(gpuI0, sigma_ds,x ,y);
	60	+
	61	+ //resample the image
	62	+ cuda_re_sample<float> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y);
	63	+
	64	+ }
	65	+
	66	+ /// Applies a Gaussian blur to a 2D image stored on the CPU
	67	+ template<typename T>
	68	+ void cpu_re_sample(T* out, T* in, T resize, unsigned int x, unsigned int y){
	69	+
	70	+ //get the number of pixels in the image
	71	+ unsigned int pixels = x*y;
	72	+ unsigned int bytes = sizeof(T) * pixels;
	73	+
	74	+ unsigned int sigma_ds = 1/resize;
	75	+ unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1));
	76	+ unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1));
	77	+ unsigned int bytes_ds = sizeof(T) * x_ds * y_ds;
	78	+
	79	+
	80	+
	81	+ //allocate space on the GPU for the original image
	82	+ T* gpuI0;
	83	+ cudaMalloc(&gpuI0, bytes_ds);
	84	+
	85	+
	86	+ //copy the image data to the GPU
	87	+ cudaMemcpy(gpuI0, in, bytes_ds, cudaMemcpyHostToDevice);
	88	+
	89	+ //allocate space on the GPU for the down sampled image
	90	+ T* gpuI;
	91	+ cudaMalloc(&gpuI, bytes);
	92	+
	93	+ //run the GPU-based version of the algorithm
	94	+ gpu_re_sample<T>(gpuI, gpuI0, resize, x, y);
	95	+
	96	+ //copy the image data to the GPU
	97	+ cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice);
	98	+
	99	+ cudaFree(gpuI0);
	100	+ cudeFree(gpuI);
	101	+ }
	102	+
	103	+ }
	104	+}
	105	+
	106	+#endif
0	107	\ No newline at end of file
...	...

stim/cuda/ivote/update_dir_global.cuh renamed to stim/cuda/ivote/update_dir_bb.cuh

Wrap text Show/Hide comments View file @3f0de7d

1		-#ifndef STIM_CUDA_UPDATE_DIR_GLOBALD_H
2		-#define STIM_CUDA_UPDATE_DIR_GLOBAL_H
	1	+#ifndef STIM_CUDA_UPDATE_DIR_BB_H
	2	+#define STIM_CUDA_UPDATE_DIR_BB_H
3	3
4	4	# include <iostream>
5	5	# include <cuda.h>
...	...

stim/cuda/ivote/vote_atomic_global.cuh renamed to stim/cuda/ivote/vote_atomic_bb.cuh

Wrap text Show/Hide comments View file @3f0de7d

1		-#ifndef STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
2		-#define STIM_CUDA_VOTE_ATOMIC_GLOBAL_H
	1	+#ifndef STIM_CUDA_VOTE_ATOMIC_BB_H
	2	+#define STIM_CUDA_VOTE_ATOMIC_BB_H
3	3
4	4	# include <iostream>
5	5	# include <cuda.h>
...	...

stim/cuda/ivote_atomic.cuh renamed to stim/cuda/ivote_atomic_bb.cuh

Wrap text Show/Hide comments View file @3f0de7d

1		-#ifndef STIM_CUDA_IVOTE_ATOMIC_H
2		-#define STIM_CUDA_IVOTE_ATOMIC_H
	1	+#ifndef STIM_CUDA_IVOTE_ATOMIC_BB_H
	2	+#define STIM_CUDA_IVOTE_ATOMIC_BB_H
3	3
4	4	#include <stim/cuda/ivote/down_sample.cuh>
5	5	#include <stim/cuda/ivote/local_max.cuh>
6		-#include <stim/cuda/ivote/update_dir_global.cuh>
7		-//#include <stim/cuda/ivote/vote_shared_32-32.cuh>
8		-#include <stim/cuda/ivote/vote_atomic_global.cuh>
9		-//#include <stim/cuda/ivote/re_sample.cuh>
	6	+#include <stim/cuda/ivote/update_dir_bb.cuh>
	7	+#include <stim/cuda/ivote/vote_atomic_bb.cuh>
	8	+
10	9	namespace stim{
11	10	namespace cuda{
12	11
...	...