Commit 9dad097417cd1fb2374291310961d0f33200069f
1 parent
6d30a707
removed image contour
Showing 1 changed file with 71 additions and 0 deletions.
1 | +#ifndef STIM_CUDA_ARRAY_ADD_H | |
2 | +#define STIM_CUDA_ARRAY_ADD_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + template<typename T> | |
13 | + __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ | |
14 | + | |
15 | + //calculate the 1D index for this thread | |
16 | + int idx = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + | |
18 | + if(idx < N){ | |
19 | + sum[idx] = ptr1[idx] + ptr2[idx]; | |
20 | + } | |
21 | + | |
22 | + } | |
23 | + | |
24 | + template<typename T> | |
25 | + void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ | |
26 | + | |
27 | + //get the maximum number of threads per block for the CUDA device | |
28 | + int threads = stim::maxThreadsPerBlock(); | |
29 | + | |
30 | + //calculate the number of blocks | |
31 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | |
32 | + | |
33 | + //call the kernel to do the multiplication | |
34 | + cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N); | |
35 | + | |
36 | + } | |
37 | + | |
38 | + template<typename T> | |
39 | + void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){ | |
40 | + | |
41 | + //allocate memory on the GPU for the array | |
42 | + T* gpu_ptr1; | |
43 | + T* gpu_ptr2; | |
44 | + T* gpu_sum; | |
45 | + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) ); | |
46 | + HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) ); | |
47 | + HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) ); | |
48 | + | |
49 | + //copy the array to the GPU | |
50 | + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) ); | |
51 | + HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) ); | |
52 | + | |
53 | + //call the GPU version of this function | |
54 | + gpu_add<T>(gpu_ptr1, gpu_ptr2 ,gpu_sum, N); | |
55 | + | |
56 | + //copy the array back to the CPU | |
57 | + HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) ); | |
58 | + | |
59 | + //free allocated memory | |
60 | + cudaFree(gpu_ptr1); | |
61 | + cudaFree(gpu_ptr2); | |
62 | + cudaFree(gpu_sum); | |
63 | + | |
64 | + } | |
65 | + | |
66 | + } | |
67 | +} | |
68 | + | |
69 | + | |
70 | + | |
71 | +#endif | |
0 | 72 | \ No newline at end of file | ... | ... |