#ifndef STIM_CUDA_ARRAY_ADD_H
#define STIM_CUDA_ARRAY_ADD_H

//NOTE(review): the original #include targets were destroyed when angle
//brackets were stripped from this listing. <iostream>/<cuda.h>/<cmath> cover
//what this header uses directly; the project header that declares
//HANDLE_ERROR and stim::maxThreadsPerBlock() must be confirmed against the
//stim repository (presumably stim/cuda/cudatools.h) — TODO confirm.
#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>	//TODO confirm: provides HANDLE_ERROR and stim::maxThreadsPerBlock()

namespace stim{
	namespace cuda{

		/// kernel: element-wise sum of two device arrays.
		///   sum[i] = ptr1[i] + ptr2[i] for i in [0, N)
		/// Expects a 1D grid of 1D blocks covering at least N threads;
		/// the idx < N guard handles the ragged final block.
		template<typename T>
		__global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){

			//calculate the flat 1D index for this thread
			//(unsigned so the comparison with N is not mixed-sign)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			if(idx < N){
				sum[idx] = ptr1[idx] + ptr2[idx];
			}

		}

		/// launch cuda_add on DEVICE pointers: sum = ptr1 + ptr2, element-wise.
		///   @param ptr1, ptr2 device arrays of N elements (inputs)
		///   @param sum        device array of N elements (output)
		///   @param N          number of elements
		/// Launches on the default stream; asynchronous with respect to the host.
		template<typename T>
		void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//ceiling-divide so a partial final block still covers all N elements
			int blocks = (N + threads - 1) / threads;

			//call the kernel to do the addition
			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);

			//surface launch-configuration errors immediately
			//(kernel launches do not return an error code themselves)
			HANDLE_ERROR( cudaGetLastError() );

		}

		/// host wrapper: add two HOST arrays on the GPU.
		/// Allocates device buffers, copies ptr1/ptr2 up, runs gpu_add,
		/// copies the result back into cpu_sum, and frees the device memory.
		///   @param ptr1, ptr2 host arrays of N elements (inputs)
		///   @param cpu_sum    host array of N elements (output)
		///   @param N          number of elements
		template<typename T>
		void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){

			//allocate memory on the GPU for the arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_sum;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_sum,  N * sizeof(T) ) );

			//copy the input arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_add(gpu_ptr1, gpu_ptr2, gpu_sum, N);

			//copy the result back to the CPU
			//(cudaMemcpy is blocking, so this also synchronizes with the kernel)
			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated device memory
			HANDLE_ERROR( cudaFree(gpu_ptr1) );
			HANDLE_ERROR( cudaFree(gpu_ptr2) );
			HANDLE_ERROR( cudaFree(gpu_sum) );

		}

	}
}

#endif