Blame view

cpp/vote3.cuh 5.22 KB
5c079506   Laila Saadatifard   upload the ivote ...
1
2
3
4
5
6
7
8
  #ifndef STIM_CUDA_VOTE3_H
  #define STIM_CUDA_VOTE3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include <stim/cuda/cudatools/error.h>
89604e92   Laila Saadatifard   ivote3 run on the...
9
  #include "cpyToshare.cuh"
5c079506   Laila Saadatifard   upload the ivote ...
10
11
12
  
  		// this kernel calculates the vote value by adding up the gradient magnitudes of every voter that this pixel is located in their voting area
  		template<typename T>
02fb26b3   Laila Saadatifard   change the vote a...
13
  		__global__ void vote3(T* gpu_vote, T* gpu_grad, T cos_phi, int rx, int ry, int rz, int x, int y, int z){
5c079506   Laila Saadatifard   upload the ivote ...
14
  
89604e92   Laila Saadatifard   ivote3 run on the...
15
  			extern __shared__ float s[];
5c079506   Laila Saadatifard   upload the ivote ...
16
17
18
19
20
21
22
23
24
25
26
  			//calculate x,y,z coordinates for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;
  			//find the grid size along y
  			int grid_y = y / blockDim.y;
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;
  			int i = zi * x * y + yi * x + xi;			
  			
  			if(xi>=x || yi>=y || zi>=z) return;
  			
89604e92   Laila Saadatifard   ivote3 run on the...
27
28
29
30
31
32
33
34
35
36
37
  			//find the starting points and the size of the window, which will be copied to the 2D-shared memory
  			int bxs = blockIdx.x * blockDim.x - rx;
  			int bys = blockidx_y * blockDim.y - ry;
  			int xwidth = 2 * rx + blockDim.x;
  			int ywidth = 2 * ry + blockDim.y;
  			//calculate the starting point of shared memory for storing the magnitude.
  			unsigned int b_s = 3 * xwidth * ywidth;
  			//compute the coordinations of this pixel in the 2D-shared memory.
  			int sx_rx = threadIdx.x + rx;
  			int sy_ry = threadIdx.y + ry;
  
5c079506   Laila Saadatifard   upload the ivote ...
38
39
40
41
42
43
44
45
  			// define a local variable to sum the votes from the voters
  			float sum = 0;
  			
  			int rx_sq = rx * rx;
  			int ry_sq = ry * ry;
  			int rz_sq = rz * rz;
  
  			for (int z_v = -rz; z_v<=rz; z_v++){
89604e92   Laila Saadatifard   ivote3 run on the...
46
47
48
49
50
51
52
53
54
55
56
57
  				int zi_v = zi + z_v;
  				if ((zi_v) >=0 && (zi_v) <z){
  					//call the function to copy one slide of the gradient from global to the 2D-shared memory.					
  					__syncthreads();
  					cpyG2S2D3ch<float>(s, gpu_grad, bxs, bys, zi + z_v, 3*xwidth, ywidth, threadIdx, blockDim, x, y);
  					__syncthreads();
  					mag_share2D<float>(s, b_s, xwidth, ywidth, threadIdx, blockDim);
  					__syncthreads();
  					float z_sq = z_v * z_v;	
  					float d_z_sq = z_sq/rz_sq;
  
  					for(int y_v = -ry; y_v <= ry; y_v++){
5c079506   Laila Saadatifard   upload the ivote ...
58
  						int yi_v = (yi + y_v) ;
89604e92   Laila Saadatifard   ivote3 run on the...
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
  						//compute the position of the current voter in the shared memory along the y axis.
  						unsigned int sIdx_y1d = (sy_ry + y_v)* xwidth;
  
  						float y_sq = y_v * y_v;
  						float yz_sq = z_sq + y_sq;
  						float d_yz_sq = y_sq/ry_sq + d_z_sq;
  						for(int x_v = -rx; x_v <= rx; x_v++){
  
  							//check if the current voter is inside of the data-set
  							int xi_v = (xi + x_v) ;
  							if (yi_v >=0 && yi_v < y && xi_v >=0 && xi_v < x){
  
  								//compute the position of the current voter in the 2D-shared memory along the x axis.
  								unsigned int sIdx_x = (sx_rx + x_v);
  								//find the 1D index of this voter in the 2D-shared memory.
  								unsigned int s_Idx = (sIdx_y1d  + sIdx_x);
  								unsigned int s_Idx3 = s_Idx * 3;
  								
  								//save the gradient values for the current voter to the local variables and compute the gradient magnitude.					
  								float g_v_x = s[s_Idx3];
  								float g_v_y = s[s_Idx3 + 1];
  								float g_v_z = s[s_Idx3 + 2];																						
  								float mag_v = s[b_s + s_Idx]; //sqrt( g_v_x * g_v_x + g_v_y * g_v_y + g_v_z * g_v_z);										
  						
  								//calculate the distance between the pixel and the current voter.
  								float x_sq = x_v * x_v;									
  								float d_pv = sqrt(x_sq + yz_sq);
5c079506   Laila Saadatifard   upload the ivote ...
86
  
89604e92   Laila Saadatifard   ivote3 run on the...
87
88
  								// calculate the angle between the pixel and the current voter.
  								float cos_diff = (g_v_x * (-x_v) + g_v_y * (-y_v) + g_v_z * (-z_v))/(d_pv * mag_v);
5c079506   Laila Saadatifard   upload the ivote ...
89
  						
89604e92   Laila Saadatifard   ivote3 run on the...
90
91
  								// check if the current voter is located in the voting area of this pixel.
  								if ((((x_sq)/rx_sq + d_yz_sq)<= 1) && (cos_diff >= cos_phi)){
5c079506   Laila Saadatifard   upload the ivote ...
92
  												
89604e92   Laila Saadatifard   ivote3 run on the...
93
94
  									sum += mag_v;	
  								}
5c079506   Laila Saadatifard   upload the ivote ...
95
  							}
89604e92   Laila Saadatifard   ivote3 run on the...
96
97
  						}	
  					}
5c079506   Laila Saadatifard   upload the ivote ...
98
99
100
101
102
103
104
  				}
  			}
  						
  			gpu_vote[i] = sum;			
  		}
  
  		template<typename T>
02fb26b3   Laila Saadatifard   change the vote a...
105
  		void gpu_vote3(T* gpu_vote, T* gpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
5c079506   Laila Saadatifard   upload the ivote ...
106
  
02fb26b3   Laila Saadatifard   change the vote a...
107
  			
5c079506   Laila Saadatifard   upload the ivote ...
108
109
110
  			unsigned int max_threads = stim::maxThreadsPerBlock();
  			dim3 threads(sqrt (max_threads),sqrt (max_threads));
  			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
89604e92   Laila Saadatifard   ivote3 run on the...
111
  			unsigned int shared_bytes = (threads.x + 2*r[0])*(threads.y + 2*r[1])*4*sizeof(T);			
5c079506   Laila Saadatifard   upload the ivote ...
112
  			//call the kernel to do the voting
89604e92   Laila Saadatifard   ivote3 run on the...
113
  			vote3 <T> <<< blocks, threads, shared_bytes >>>(gpu_vote, gpu_grad, cos_phi, r[0], r[1], r[2], x , y, z);
5c079506   Laila Saadatifard   upload the ivote ...
114
115
116
117
118
  
  		}
  
  
  		template<typename T>
02fb26b3   Laila Saadatifard   change the vote a...
119
  		void cpu_vote3(T* cpu_vote, T* cpu_grad, T cos_phi, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){
5c079506   Laila Saadatifard   upload the ivote ...
120
121
122
123
124
125
126
127
128
129
130
131
132
  
  			//calculate the number of bytes in the array
  			unsigned int bytes = x * y * z * sizeof(T);
  
  			
  			//allocate space on the GPU for the Vote Image
  			T* gpu_vote;
  			cudaMalloc(&gpu_vote, bytes);		
  
  			//allocate space on the GPU for the input Gradient image
  			T* gpu_grad;
  			cudaMalloc(&gpu_grad, bytes*3);
  			
5c079506   Laila Saadatifard   upload the ivote ...
133
  			
5c079506   Laila Saadatifard   upload the ivote ...
134
135
136
  			//copy the Gradient data to the GPU
  			cudaMemcpy(gpu_grad, cpu_grad, bytes*3, cudaMemcpyHostToDevice);
  			
02fb26b3   Laila Saadatifard   change the vote a...
137
  					
5c079506   Laila Saadatifard   upload the ivote ...
138
  			//call the GPU version of the vote calculation function
02fb26b3   Laila Saadatifard   change the vote a...
139
  			gpu_vote3<T>(gpu_vote, gpu_grad, cos_phi, r, x , y, z);
5c079506   Laila Saadatifard   upload the ivote ...
140
141
142
143
144
145
146
  							
  			//copy the Vote Data back to the CPU
  			cudaMemcpy(cpu_vote, gpu_vote, bytes, cudaMemcpyDeviceToHost) ;
  
  			//free allocated memory
  			cudaFree(gpu_vote);
  			cudaFree(gpu_grad);
02fb26b3   Laila Saadatifard   change the vote a...
147
  			
5c079506   Laila Saadatifard   upload the ivote ...
148
149
150
151
  		}
  
  
  #endif