Blame view

stim/cuda/arraymath/array_add.cuh 1.68 KB
9dad0974   Tianshu Cheng   removed image con...
1
2
3
4
5
  #ifndef STIM_CUDA_ARRAY_ADD_H
  #define STIM_CUDA_ARRAY_ADD_H
  
  #include <iostream>
  #include <cuda.h>
f186dbda   Tianshu Cheng   header file for b...
6
  //#include <cmath>
96f9b10f   Laila Saadatifard   change the header...
7
  #include <stim/cuda/cudatools.h>
9dad0974   Tianshu Cheng   removed image con...
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
  
  namespace stim{
  	namespace cuda{
  
  		// Element-wise addition kernel: sum[i] = ptr1[i] + ptr2[i] for every i < N.
  		//
  		// Only the x components of blockIdx/blockDim/threadIdx are used, so the kernel
  		// is written for a 1D launch; each thread handles at most one element.
  		//	ptr1, ptr2	input arrays, read at indices [0, N)
  		//	sum		output array, written at indices [0, N)
  		//	N		number of elements to add
  		// All three pointers are dereferenced inside the kernel.
  		template<typename T>
  		__global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
  
  			//calculate the 1D index for this thread; the arithmetic is done in size_t
  			//because a signed int index turns negative once the global index exceeds
  			//INT_MAX, which would index in front of the arrays
  			size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  
  			//threads of the last block that fall past the end of the data do nothing
  			if(idx < N){
  				sum[idx] = ptr1[idx] + ptr2[idx];
  			}
  
  		}
  
  		// Launches cuda_add to compute sum[i] = ptr1[i] + ptr2[i] for N elements.
  		//
  		// The pointers are handed to the kernel unchanged, so they have to be valid
  		// on the device. The launch goes to the default stream and this function
  		// does not synchronize afterwards; only launch errors are reported here.
  		//	ptr1, ptr2	input arrays of N elements
  		//	sum		output array of N elements
  		//	N		number of elements to add (0 is a no-op)
  		template<typename T>
  		void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
  
  			//nothing to do for an empty array (a launch with zero blocks would be invalid)
  			if(N == 0) return;
  
  			//get the maximum number of threads per block for the CUDA device
  			int threads = stim::maxThreadsPerBlock();
  
  			//calculate the number of blocks as ceil(N / threads); written with a
  			//remainder test so that N + threads cannot overflow for very large N
  			unsigned int blocks = N / threads + (N % threads != 0 ? 1 : 0);
  
  			//call the kernel to do the addition
  			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);
  
  			//a kernel launch returns no status; an invalid launch configuration is
  			//only visible through cudaGetLastError()
  			HANDLE_ERROR( cudaGetLastError() );
  
  		}
  
  		// Adds two host arrays element by element using the GPU.
  		//
  		// Temporary device buffers are allocated for both inputs and the result, the
  		// inputs are copied to the device, gpu_add is run on them, and the result is
  		// copied back into cpu_sum before the device buffers are released.
  		//	ptr1, ptr2	host input arrays of N elements
  		//	cpu_sum		host output array of N elements
  		//	N		number of elements to add (0 is a no-op)
  		// Every CUDA runtime call is routed through HANDLE_ERROR.
  		template<typename T>
  		void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){
  
  			//nothing to add: skip the zero-byte allocations and the empty launch
  			if(N == 0) return;
  
  			//size of one array in bytes (sizeof yields size_t, so the product is not truncated to 32 bits)
  			size_t bytes = N * sizeof(T);
  
  			//allocate memory on the GPU for the arrays
  			T* gpu_ptr1;
  			T* gpu_ptr2;
  			T* gpu_sum;
  			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, bytes ) );
  			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, bytes ) );
  			HANDLE_ERROR( cudaMalloc( &gpu_sum, bytes ) );
  
  			//copy the input arrays to the GPU
  			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, bytes, cudaMemcpyHostToDevice) );
  			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, bytes, cudaMemcpyHostToDevice) );
  
  			//call the GPU version of this function
  			gpu_add<T>(gpu_ptr1, gpu_ptr2 ,gpu_sum, N);
  
  			//copy the result back to the CPU
  			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, bytes, cudaMemcpyDeviceToHost) );
  
  			//free allocated memory; the return codes are checked like every other runtime call
  			HANDLE_ERROR( cudaFree(gpu_ptr1) );
  			HANDLE_ERROR( cudaFree(gpu_ptr2) );
  			HANDLE_ERROR( cudaFree(gpu_sum) );
  
  		}
  		
  	}
  }
  
  
  
  #endif