Blame view

stim/cuda/arraymath/array_multiply.cuh 2.97 KB
13fe3c84   Laila Saadatifard   update the stimli...
1
2
3
4
5
  #ifndef STIM_CUDA_ARRAY_MULTIPLY_H
  #define STIM_CUDA_ARRAY_MULTIPLY_H
  
  #include <iostream>
  #include <cuda.h>
96f9b10f   Laila Saadatifard   change the header...
6
  #include <stim/cuda/cudatools.h>
13fe3c84   Laila Saadatifard   update the stimli...
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
  
  namespace stim{
  	namespace cuda{
  
  		//Kernel: multiplies the first N elements of lhs by the scalar rhs, in place.
  		//	lhs	array that is read and overwritten by device code
  		//	rhs	scalar multiplier applied to every element
  		//	N	number of elements in lhs
  		//Each thread handles at most one element, using the x dimension of the grid and
  		//	block only, so the launch must provide at least N threads along x.
  		template<typename T>
  		__global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){

  			//calculate the 1D index for this thread in size_t: the 32-bit product
  			//	blockIdx.x * blockDim.x can wrap around for very large arrays, and a signed
  			//	int index turns negative once it passes 2^31
  			size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  			//threads that fall past the end of the array do nothing
  			if(i < N)
  				lhs[i] *= rhs;
  		}
  
  		//Launches the scalar cuda_multiply kernel so that the first N elements of lhs are
  		//	multiplied by rhs in place.
  		//	lhs	array passed straight to the kernel, so it must be addressable from device code
  		//	rhs	scalar multiplier
  		//	N	number of elements in lhs; nothing is launched when N is zero
  		//The launch status is not queried here; callers can use cudaGetLastError() for that.
  		template<typename T>
  		void gpu_multiply(T* lhs, T rhs, unsigned int N){

  			//an empty array needs no work, and a grid of zero blocks is an invalid launch configuration
  			if(N == 0) return;

  			//get the maximum number of threads per block for the CUDA device
  			unsigned int threads = (unsigned int)stim::maxThreadsPerBlock();

  			//calculate the number of blocks: N / threads, rounded up so a partial last block is covered
  			unsigned int blocks = N / threads + (N % threads == 0 ? 0 : 1);

  			//call the kernel to do the multiplication
  			cuda_multiply <<< blocks, threads >>>(lhs, rhs, N);

  		}
  
  		//Multiplies the first N elements of the host array lhs by rhs, in place, using the GPU:
  		//	the array is copied to a temporary device buffer, scaled there by gpu_multiply,
  		//	and copied back into lhs.
  		//	lhs	host array that is read and overwritten
  		//	rhs	scalar multiplier
  		//	N	number of elements in lhs; the function returns immediately when N is zero
  		//Every CUDA runtime call is reported through HANDLE_ERROR.
  		template<typename T>
  		void cpu_multiply(T* lhs, T rhs, unsigned int N){

  			//nothing to allocate, copy or launch for an empty array
  			if(N == 0) return;

  			//calculate the number of bytes in the array (size_t, so arrays of 4 GiB or more
  			//	are not truncated to a 32-bit byte count)
  			size_t bytes = (size_t)N * sizeof(T);

  			//allocate memory on the GPU for the array
  			T* gpuLHS;
  			HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) );

  			//copy the array to the GPU
  			HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) );

  			//call the GPU version of this function
  			gpu_multiply<T>(gpuLHS, rhs, N);

  			//a kernel launch returns no status itself, so ask the runtime whether it was accepted
  			HANDLE_ERROR( cudaGetLastError() );

  			//copy the array back to the CPU
  			HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) );

  			//free allocated memory
  			HANDLE_ERROR( cudaFree(gpuLHS) );
  		}
85025dd1   David Mayerich   adapted bsds500 f...
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  
  
  //	element-wise multiplication of two arrays (array .* array)
  
  		//Kernel: element-wise product, product[idx] = ptr1[idx] * ptr2[idx] for idx in [0, N).
  		//	ptr1, ptr2	input arrays read by device code
  		//	product		output array written by device code
  		//	N		number of elements processed in each array
  		//Each thread handles at most one element, using the x dimension of the grid and
  		//	block only, so the launch must provide at least N threads along x.
  		template<typename T>
  		__global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){

  			//calculate the 1D index for this thread in size_t: the 32-bit product
  			//	blockIdx.x * blockDim.x can wrap around for very large arrays, and a signed
  			//	int index turns negative once it passes 2^31
  			size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  			//threads that fall past the end of the arrays do nothing
  			if(idx < N){
  				product[idx] = ptr1[idx] * ptr2[idx];
  			}

  		}
  
  		//Launches the element-wise cuda_multiply kernel: product[i] = ptr1[i] * ptr2[i]
  		//	for the first N elements.
  		//	ptr1, ptr2	input arrays, passed straight to the kernel (must be addressable from device code)
  		//	product		output array, passed straight to the kernel
  		//	N		number of elements; nothing is launched when N is zero
  		//The launch status is not queried here; callers can use cudaGetLastError() for that.
  		template<typename T>
  		void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){

  			//an empty array needs no work
  			if(N == 0) return;

  			//get the maximum number of threads per block for the CUDA device
  			unsigned int threads = (unsigned int)stim::maxThreadsPerBlock();

  			//calculate the number of blocks: N / threads, rounded up; no extra block is
  			//	launched when N is an exact multiple of the block size
  			unsigned int blocks = N / threads + (N % threads == 0 ? 0 : 1);

  			//call the kernel to do the multiplication
  			cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);

  		}
  
  		//Computes the element-wise product of two host arrays on the GPU:
  		//	cpu_product[i] = ptr1[i] * ptr2[i] for the first N elements. Both inputs are copied
  		//	to temporary device buffers, multiplied by gpu_multiply, and the result is copied back.
  		//	ptr1, ptr2	host input arrays (only read)
  		//	cpu_product	host output array (overwritten)
  		//	N		number of elements; the function returns immediately when N is zero
  		//Every CUDA runtime call is reported through HANDLE_ERROR.
  		template<typename T>
  		void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){

  			//nothing to allocate, copy or launch for empty arrays
  			if(N == 0) return;

  			//calculate the number of bytes in each array
  			size_t bytes = (size_t)N * sizeof(T);

  			//allocate memory on the GPU for the arrays
  			T* gpu_ptr1;
  			T* gpu_ptr2;
  			T* gpu_product;
  			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, bytes ) );
  			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, bytes ) );
  			HANDLE_ERROR( cudaMalloc( &gpu_product, bytes ) );

  			//copy the input arrays to the GPU
  			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, bytes, cudaMemcpyHostToDevice) );
  			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, bytes, cudaMemcpyHostToDevice) );

  			//call the GPU version of this function
  			gpu_multiply<T>(gpu_ptr1, gpu_ptr2, gpu_product, N);

  			//a kernel launch returns no status itself, so ask the runtime whether it was accepted
  			HANDLE_ERROR( cudaGetLastError() );

  			//copy the result back to the CPU
  			HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, bytes, cudaMemcpyDeviceToHost) );

  			//free allocated memory
  			HANDLE_ERROR( cudaFree(gpu_ptr1) );
  			HANDLE_ERROR( cudaFree(gpu_ptr2) );
  			HANDLE_ERROR( cudaFree(gpu_product) );

  		}
13fe3c84   Laila Saadatifard   update the stimli...
114
115
116
117
118
119
120
  		
  	}
  }
  
  
  
  #endif