Blame view

cpp/local_max3.cuh 2.45 KB
5c079506   Laila Saadatifard   upload the ivote ...
1
2
3
4
5
6
7
8
  #ifndef STIM_CUDA_LOCAL_MAX3_H
  #define STIM_CUDA_LOCAL_MAX3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  
  template<typename T>
3f0e43ce   Laila Saadatifard   compute the local...
9
  __global__ void cuda_local_max3(T* gpu_center, T* gpu_vote, int conn_x, int conn_y, int conn_z, int x, int y, int z){
5c079506   Laila Saadatifard   upload the ivote ...
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
  
  	//calculate x,y,z coordinates for this thread
  	int xi = blockIdx.x * blockDim.x + threadIdx.x;
  	//find the grid size along y
  	int grid_y = y / blockDim.y;
  	int blockidx_y = blockIdx.y % grid_y;
  	int yi = blockidx_y * blockDim.y + threadIdx.y;
  	int zi = blockIdx.y / grid_y;
  	int i = zi * x * y + yi * x + xi;
  	
  	if (xi>=x || yi>=y || zi>=z) return;
  
  	//initilize the center value for this pixel to zero
  	gpu_center[i] = 0;
  
  	//initialize the max value to the vote value for this pixle.
  	T lv_i = gpu_vote[i];
  	
5c079506   Laila Saadatifard   upload the ivote ...
28
29
30
31
32
33
34
35
  	for (int xl = xi - conn_x; xl <= xi + conn_x; xl++){
  
  		for (int yl = yi - conn_y; yl <= yi + conn_y; yl++){
  
  			for (int zl = zi - conn_z ; zl <= zi + conn_z; zl++){
  
  				if (xl>=0 && yl>=0 && zl>=0 && xl<x && yl<y && zl<z){
  					
94d437dd   Laila Saadatifard   ivote3 code compi...
36
  					int i_l = zl * x * y + yl * x + xl;
5c079506   Laila Saadatifard   upload the ivote ...
37
  					if (gpu_vote[i_l] > lv_i) return;
3f0e43ce   Laila Saadatifard   compute the local...
38
39
40
41
42
  					if (gpu_vote[i_l] == lv_i){
  							if( i_l > i){
  								return;
  						}
  					}
5c079506   Laila Saadatifard   upload the ivote ...
43
44
45
46
47
  				}
  			}
  		}
  	}
  
3f0e43ce   Laila Saadatifard   compute the local...
48
  	gpu_center[i] = lv_i;
5c079506   Laila Saadatifard   upload the ivote ...
49
50
51
52
53
54
55
56
57
58
59
  }
  
  template<typename T>
  void gpu_local_max3(T* gpu_output, T* gpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
  
  	//find the max number of threads per block.
  	unsigned int max_threads = stim::maxThreadsPerBlock();
  	dim3 threads(sqrt (max_threads),sqrt (max_threads));
  	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
  
  	//call the kernel to find the local max
3f0e43ce   Laila Saadatifard   compute the local...
60
  	cuda_local_max3<T><<<blocks, threads>>>(gpu_output, gpu_vote, conn[0], conn[1], conn[2], x, y, z);
5c079506   Laila Saadatifard   upload the ivote ...
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
  
  
  
  }
  
  template<typename T>
  void cpu_local_max3(T* cpu_output, T* cpu_vote, T t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){
  	
  	//calculate the number of bytes in the data set.
  	unsigned int bytes = x * y * z * sizeof(T);
  
  	//allocate space on the gpu for the vote data and the output.
  	T* gpu_vote;
  	cudaMalloc(&gpu_vote, bytes);
  
  	T* gpu_output;
  	cudaMalloc(&gpu_output, bytes);
  
  	//copy the vote data to the gpu.
  	cudaMemcpy(gpu_vote, cpu_vote, bytes, cudaMemcpyHostToDevice);
  
  	//call the gpu version of local max function.
  	gpu_local_max3<T>(gpu_output, gpu_vote, t, conn, x ,y, z);
  
  	//copy the detected result to the cpu.
  	cudaMemcpy(cpu_output, gpu_output, bytes, cudaMemcpyDeviceToHost);
  
  	cudaFree(gpu_vote);
  	cudaFree(gpu_output);
  }
  
  #endif