Blame view

cpp/float_to_half.cuh 1.93 KB
5c079506   Laila Saadatifard   upload the ivote ...
1
2
3
4
5
6
7
8
  #ifndef STIM_CUDA_FLOAT_TO_HALF_H
  #define STIM_CUDA_FLOAT_TO_HALF_H
  
  #include <iostream>
  #include <cuda.h>
  #include <stim/cuda/cudatools.h>
  #include <stim/cuda/sharedmem.cuh>
  #include <stim/cuda/cudatools/error.h>
6ef1dab9   Laila Saadatifard   fix one bug in th...
9
10
  #include <cuda_fp16.h>
  #include <stdio.h>
5c079506   Laila Saadatifard   upload the ivote ...
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
  		/// Kernel: converts an x*y*z volume of floats to half precision, one element per thread.
  		///
  		/// Expected launch layout (the one used by gpu_f2h in this file): 2D thread blocks;
  		/// gridDim.x covers the x axis, gridDim.y holds the y-blocks of all z slices laid
  		/// end to end, i.e. gridDim.y == (blocks per slice along y) * z.
  		///
  		/// @param gpu_half   output array of x*y*z half values
  		/// @param gpu_float  input array of x*y*z float values
  		/// @param x          size of the volume along x (fastest-varying index)
  		/// @param y          size of the volume along y
  		/// @param z          size of the volume along z (slowest-varying index)
  		__global__ void cuda_f2h(half* gpu_half, float* gpu_float, int x, int y, int z){

  			//nothing to convert; this also protects the division by z below
  			if(x <= 0 || y <= 0 || z <= 0) return;

  			//calculate the x coordinate for this thread
  			int xi = blockIdx.x * blockDim.x + threadIdx.x;

  			//find the grid size along y for a single z slice
  			//this is taken from the actual launch grid rather than recomputed as y / blockDim.y:
  			//the launcher rounds the block count up, so y / blockDim.y can be too small
  			//(leaving trailing rows unconverted) or even zero (division by zero below)
  			int grid_y = gridDim.y / z;
  			if(grid_y == 0) return;			//the grid does not follow the expected layout

  			//split the stacked block index into a y-block within the slice and a slice index
  			int blockidx_y = blockIdx.y % grid_y;
  			int yi = blockidx_y * blockDim.y + threadIdx.y;
  			int zi = blockIdx.y / grid_y;

  			//threads in the padding of the grid have no element to process
  			if(xi >= x || yi >= y || zi >= z) return;

  			//flat index computed in size_t so volumes above 2^31 elements do not overflow
  			size_t i = ((size_t)zi * y + yi) * x + xi;

  			gpu_half[i] = __float2half(gpu_float[i]);
  		}
  
  
  		/// Converts an x*y*z float volume to half precision; both arrays are passed to the kernel as-is.
  		///
  		/// The kernel launch is asynchronous: this function does not wait for it to finish.
  		/// A failed launch is reported on stderr; the CUDA error state is left untouched
  		/// so that callers can still query it themselves.
  		///
  		/// @param gpu_half   output array of x*y*z half values
  		/// @param gpu_float  input array of x*y*z float values
  		/// @param x          size of the volume along x
  		/// @param y          size of the volume along y
  		/// @param z          size of the volume along z
  		void gpu_f2h(half* gpu_half, float* gpu_float, unsigned int x, unsigned int y, unsigned int z){

  			//an empty volume would yield a zero-sized grid, which is an invalid launch configuration
  			if(x == 0 || y == 0 || z == 0) return;

  			//use the largest square thread block that fits in the per-block thread limit
  			int max_threads = stim::maxThreadsPerBlock();
  			unsigned int side = (unsigned int)sqrt((double)max_threads);
  			dim3 threads(side, side);

  			//grid.x covers the x axis; grid.y stacks the y-blocks of every z slice
  			//(surplus threads are discarded by the bounds check in the kernel)
  			dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);

  			//call the GPU kernel to convert the array to half precision
  			cuda_f2h <<< blocks, threads >>>(gpu_half, gpu_float, x, y, z);

  			//kernel launches return no status, so check for a launch error explicitly
  			//(peek rather than get, so the error is not cleared for the caller)
  			cudaError_t err = cudaPeekAtLastError();
  			if(err != cudaSuccess)
  				fprintf(stderr, "gpu_f2h: kernel launch failed: %s\n", cudaGetErrorString(err));

  		}
  
  
  		
  		/// Converts an x*y*z float volume to half precision using the GPU, with input and output in host memory.
  		///
  		/// Allocates temporary device buffers, uploads the input, runs gpu_f2h, downloads
  		/// the result and releases the buffers. If any CUDA call fails, the error is
  		/// reported on stderr and the contents of h_out are not valid.
  		///
  		/// @param h_out  host output array with room for x*y*z half values
  		/// @param f_in   host input array of x*y*z float values
  		/// @param x      size of the volume along x
  		/// @param y      size of the volume along y
  		/// @param z      size of the volume along z
  		void cpu_f2h(half* h_out, float* f_in, unsigned int x, unsigned int y, unsigned int z){

  			//calculate the number of pixels in the array
  			//(size_t arithmetic: the product of three unsigned ints can exceed 32 bits)
  			size_t pix = (size_t)x * y * z;

  			//nothing to convert
  			if(pix == 0) return;

  			//device buffers start as NULL so the cleanup below is safe on every path
  			float* gpu_float = NULL;
  			half* gpu_half = NULL;

  			//allocate memory on the GPU for the input float precision
  			cudaError_t err = cudaMalloc(&gpu_float, pix * sizeof(float));

  			//allocate memory on the GPU for the output half precision
  			if(err == cudaSuccess)
  				err = cudaMalloc(&gpu_half, pix * sizeof(half));

  			//copy the input array to the GPU
  			if(err == cudaSuccess)
  				err = cudaMemcpy(gpu_float, f_in, pix * sizeof(float), cudaMemcpyHostToDevice);

  			if(err == cudaSuccess){
  				//call the GPU version of this function
  				gpu_f2h(gpu_half, gpu_float, x, y, z);

  				//copy the array back to the CPU
  				err = cudaMemcpy(h_out, gpu_half, pix * sizeof(half), cudaMemcpyDeviceToHost);
  			}

  			if(err != cudaSuccess)
  				fprintf(stderr, "cpu_f2h: CUDA error: %s\n", cudaGetErrorString(err));

  			//free allocated memory (cudaFree on a NULL pointer performs no operation)
  			cudaFree(gpu_float);
  			cudaFree(gpu_half);

  		}
  
  
  #endif