Commit 9dad097417cd1fb2374291310961d0f33200069f
1 parent
6d30a707
removed image contour
Showing 1 changed file with 71 additions and 0 deletions. Show diff stats.
1 | +#ifndef STIM_CUDA_ARRAY_ADD_H | ||
2 | +#define STIM_CUDA_ARRAY_ADD_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/devices.h> | ||
7 | +#include <stim/cuda/error.h> | ||
8 | + | ||
namespace stim{
	namespace cuda{

		/// Element-wise addition kernel: sum[idx] = ptr1[idx] + ptr2[idx].
		/// Expected launch: 1D grid of 1D blocks with at least N total threads.
		/// @param ptr1 device pointer to the first input array (length N)
		/// @param ptr2 device pointer to the second input array (length N)
		/// @param sum  device pointer to the output array (length N)
		/// @param N    number of elements in each array
		template<typename T>
		__global__ void cuda_add(const T* ptr1, const T* ptr2, T* sum, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is rounded up, so trailing threads must exit
			if(idx < N){
				sum[idx] = ptr1[idx] + ptr2[idx];
			}

		}

		/// Adds two arrays already resident on the GPU: sum = ptr1 + ptr2.
		/// All three pointers must be device pointers to at least N elements.
		/// @param ptr1 device pointer to the first input array
		/// @param ptr2 device pointer to the second input array
		/// @param sum  device pointer to the output array
		/// @param N    number of elements in each array
		template<typename T>
		void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling of N / threads)
			int blocks = N / threads + (N % threads == 0 ? 0 : 1);

			//call the kernel to do the addition
			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);

			//kernel launches do not return an error code directly;
			//check for launch-configuration errors explicitly
			HANDLE_ERROR( cudaGetLastError() );

		}

		/// Adds two arrays resident on the CPU: cpu_sum = ptr1 + ptr2.
		/// Allocates temporary device buffers, copies the inputs up,
		/// runs gpu_add(), and copies the result back.
		/// @param ptr1    host pointer to the first input array (length N)
		/// @param ptr2    host pointer to the second input array (length N)
		/// @param cpu_sum host pointer to the output array (length N)
		/// @param N       number of elements in each array
		template<typename T>
		void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){

			//allocate memory on the GPU for the input and output arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_sum;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_sum,  N * sizeof(T) ) );

			//copy the input arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);

			//copy the result back to the CPU
			//(cudaMemcpy is blocking, so no explicit synchronization is needed)
			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated device memory, checking for errors
			//for consistency with the other runtime calls above
			HANDLE_ERROR( cudaFree(gpu_ptr1) );
			HANDLE_ERROR( cudaFree(gpu_ptr2) );
			HANDLE_ERROR( cudaFree(gpu_sum) );

		}

	}
}
68 | + | ||
69 | + | ||
70 | + | ||
71 | +#endif | ||
0 | \ No newline at end of file | 72 | \ No newline at end of file |