#ifndef STIM_CUDA_ARRAY_ADD_H
#define STIM_CUDA_ARRAY_ADD_H

//NOTE(review): the original #include targets were destroyed when angle
//brackets were stripped from this listing. <iostream>/<cuda.h>/<cmath> cover
//what this header uses directly; the project header that declares
//HANDLE_ERROR and stim::maxThreadsPerBlock() must be confirmed against the
//stim repository (presumably stim/cuda/cudatools.h) — TODO confirm.
#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>	//TODO confirm: provides HANDLE_ERROR and stim::maxThreadsPerBlock()

namespace stim{
	namespace cuda{

		/// kernel: element-wise sum of two device arrays.
		///   sum[i] = ptr1[i] + ptr2[i] for i in [0, N)
		/// Expects a 1D grid of 1D blocks covering at least N threads;
		/// the idx < N guard handles the ragged final block.
		template<typename T>
		__global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){

			//calculate the flat 1D index for this thread
			//(unsigned so the comparison with N is not mixed-sign)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			if(idx < N){
				sum[idx] = ptr1[idx] + ptr2[idx];
			}

		}

		/// launch cuda_add on DEVICE pointers: sum = ptr1 + ptr2, element-wise.
		///   @param ptr1, ptr2 device arrays of N elements (inputs)
		///   @param sum        device array of N elements (output)
		///   @param N          number of elements
		/// Launches on the default stream; asynchronous with respect to the host.
		template<typename T>
		void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//ceiling-divide so a partial final block still covers all N elements
			int blocks = (N + threads - 1) / threads;

			//call the kernel to do the addition
			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);

			//surface launch-configuration errors immediately
			//(kernel launches do not return an error code themselves)
			HANDLE_ERROR( cudaGetLastError() );

		}

		/// host wrapper: add two HOST arrays on the GPU.
		/// Allocates device buffers, copies ptr1/ptr2 up, runs gpu_add,
		/// copies the result back into cpu_sum, and frees the device memory.
		///   @param ptr1, ptr2 host arrays of N elements (inputs)
		///   @param cpu_sum    host array of N elements (output)
		///   @param N          number of elements
		template<typename T>
		void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_sum;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_sum,  N * sizeof(T) ) );

			//copy the input arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_add(gpu_ptr1, gpu_ptr2, gpu_sum, N);

			//copy the result back to the CPU
			//(cudaMemcpy is blocking, so this also synchronizes with the kernel)
			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated device memory
			HANDLE_ERROR( cudaFree(gpu_ptr1) );
			HANDLE_ERROR( cudaFree(gpu_ptr2) );
			HANDLE_ERROR( cudaFree(gpu_sum) );

		}

	}
}

#endif