Blame view

cpp/gradient3.cuh 2.86 KB
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
1
2
3
4
5
6
7
8
9
  #ifndef STIM_CUDA_GRADIENT3_H
  #define STIM_CUDA_GRADIENT3_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/cudatools/error.h>
  
  template<typename T>
f12505fb   Laila Saadatifard   upload the ivote ...
10
  __global__ void gradient3(T* out, T* in, int x, int y, int z){
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
11
12
13
  
  	//calculate x,y,z coordinates for this thread
  	int xi = blockIdx.x * blockDim.x + threadIdx.x;
f12505fb   Laila Saadatifard   upload the ivote ...
14
15
16
17
18
  	//find the grid size along y
  	int grid_y = y / blockDim.y;
  	int blockidx_y = blockIdx.y % grid_y;
  	int yi = blockidx_y * blockDim.y + threadIdx.y;
  	int zi = blockIdx.y / grid_y;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
19
20
21
  	int i = zi * x * y + yi * x + xi;
  	
  	//return if the pixel is outside of the image
f12505fb   Laila Saadatifard   upload the ivote ...
22
  	if(xi >= x || yi >= y || zi>=z) return;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
23
24
25
26
27
28
  
  	//calculate indices for the forward difference
  	int i_xp = zi * x * y + yi * x + (xi + 1);
  	int i_yp = zi * x * y + (yi + 1) * x + xi;
  	int i_zp = (zi + 1) * x * y + yi * x + xi;
  
f12505fb   Laila Saadatifard   upload the ivote ...
29
30
31
32
33
  	//calculate indices for the backward difference
  	int i_xn = zi * x * y + yi * x + (xi - 1);
  	int i_yn = zi * x * y + (yi - 1) * x + xi;
  	int i_zn = (zi - 1) * x * y + yi * x + xi;
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
34
35
36
37
38
39
40
41
  	//use forward differences if a coordinate is zero
  	if(xi == 0)
  		out[i * 3 + 0] = in[i_xp] - in[i];
  	if(yi == 0)
  		out[i * 3 + 1] = in[i_yp] - in[i];
  	if (zi==0)
  		out[i * 3 + 2] = in[i_zp] - in[i];
  
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
42
43
44
45
46
47
48
49
50
51
52
  	//use backward differences if the coordinate is at the maximum edge
  	if(xi == x-1)
  		out[i * 3 + 0] = in[i] - in[i_xn];
  	if(yi == y-1)
  		out[i * 3 + 1] = in[i] - in[i_yn];
  	if(zi == z-1)
  		out[i * 3 + 2] = in[i] - in[i_zn];
  
  	//otherwise use central differences
  	if(xi > 0 && xi < x-1)
  		out[i * 3 + 0] = (in[i_xp] - in[i_xn]) / 2;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
53
54
55
  	if(yi > 0 && yi < y-1)
  		out[i * 3 + 1] = (in[i_yp] - in[i_yn]) / 2;
  	if(zi > 0 && zi < z-1)
f12505fb   Laila Saadatifard   upload the ivote ...
56
  		out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2;
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
57
58
59
60
61
62
63
  
  }
  
  template<typename T>
  
  void gpu_gradient3(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y, unsigned int z){
  
f12505fb   Laila Saadatifard   upload the ivote ...
64
  			
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
65
  	int max_threads = stim::maxThreadsPerBlock();
f12505fb   Laila Saadatifard   upload the ivote ...
66
67
  	dim3 threads(sqrt (max_threads),sqrt (max_threads));
  	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);
c9aba18a   Laila Saadatifard   ivote3 on the GPU...
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  	
  	//call the GPU kernel to determine the gradient
  	gradient3<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z);
  
  }
  
  template<typename T>
  void cpu_gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigned int z){
  
  	//get the number of pixels in the image
  	unsigned int pixels = x * y * z;
  	unsigned int bytes = pixels * sizeof(T);
  
  	//allocate space on the GPU for the input image
  	T* gpuIn;
  	HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
  
  	//copy the image data to the GPU
  	HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
  
  	//allocate space on the GPU for the output gradient image
  	T* gpuOut;
  	cudaMalloc(&gpuOut, bytes * 3);		//the output image will have two channels (x, y)
  
  	//call the GPU version of this function
  	gpu_gradient3(gpuOut, gpuIn, x, y, z);	
  
  	//copy the results to the CPU
  	cudaMemcpy(out, gpuOut, bytes * 3, cudaMemcpyDeviceToHost);
  
  	//free allocated memory
  	cudaFree(gpuIn);
  	cudaFree(gpuOut);
  }
  
  
  
  #endif