Commit 9dad097417cd1fb2374291310961d0f33200069f

Authored by Tianshu Cheng
1 parent 6d30a707

removed image contour

Showing 1 changed file with 71 additions and 0 deletions   Show diff stats
stim/cuda/array_add.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_ADD_H
  2 +#define STIM_CUDA_ARRAY_ADD_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/devices.h>
  7 +#include <stim/cuda/error.h>
  8 +
namespace stim{
	namespace cuda{

		/// CUDA kernel: element-wise addition of two arrays.
		/// Computes sum[i] = ptr1[i] + ptr2[i] for every i in [0, N).
		/// Expects a 1D grid of 1D blocks covering at least N threads;
		/// out-of-range threads are guarded out below.
		/// All three pointers must be device pointers to at least N elements.
		template<typename T>
		__global__ void cuda_add(const T* ptr1, const T* ptr2, T* sum, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid rarely divides N evenly
			if(idx < N){
				sum[idx] = ptr1[idx] + ptr2[idx];
			}

		}

		/// GPU entry point: adds two device arrays of length N into a third.
		/// ptr1, ptr2 and sum must be device pointers with room for N elements.
		/// The launch is asynchronous; launch-configuration errors are caught
		/// via cudaGetLastError(), execution errors surface at the next sync.
		template<typename T>
		void gpu_add(const T* ptr1, const T* ptr2, T* sum, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division so all N elements are covered)
			int blocks = N / threads + (N % threads == 0 ? 0 : 1);

			//call the kernel to do the addition
			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);

			//kernel launches do not return errors directly; catch bad launch configs here
			HANDLE_ERROR( cudaGetLastError() );

		}

		/// CPU entry point: adds two host arrays of length N into a third.
		/// ptr1, ptr2 and cpu_sum are host pointers; this function allocates
		/// temporary device buffers, copies the inputs up, runs gpu_add, and
		/// copies the result back. The final cudaMemcpy is blocking, so the
		/// result is valid in cpu_sum when this function returns.
		template<typename T>
		void cpu_add(const T* ptr1, const T* ptr2, T* cpu_sum, unsigned int N){

			//allocate memory on the GPU for the two inputs and the output
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_sum;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_sum,  N * sizeof(T) ) );

			//copy the input arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);

			//copy the result back to the CPU (blocking, so it also synchronizes)
			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated device memory (checked, consistent with the calls above)
			HANDLE_ERROR( cudaFree(gpu_ptr1) );
			HANDLE_ERROR( cudaFree(gpu_ptr2) );
			HANDLE_ERROR( cudaFree(gpu_sum) );

		}

	}
}
  68 +
  69 +
  70 +
  71 +#endif
0 72 \ No newline at end of file
... ...