Commit 9dad097417cd1fb2374291310961d0f33200069f
1 parent
6d30a707
removed image contour
Showing 1 changed file with 71 additions and 0 deletions. Show diff stats.
1 | +#ifndef STIM_CUDA_ARRAY_ADD_H | ||
2 | +#define STIM_CUDA_ARRAY_ADD_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <stim/cuda/devices.h> | ||
7 | +#include <stim/cuda/error.h> | ||
8 | + | ||
namespace stim{
	namespace cuda{

		/// Element-wise addition kernel: sum[idx] = ptr1[idx] + ptr2[idx].
		/// Expected launch: 1D grid of 1D blocks with at least N total threads.
		/// @param ptr1 device pointer to the first input array (length N)
		/// @param ptr2 device pointer to the second input array (length N)
		/// @param sum  device pointer to the output array (length N)
		/// @param N    number of elements in each array
		template<typename T>
		__global__ void cuda_add(const T* ptr1, const T* ptr2, T* sum, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is rounded up, so trailing threads must exit
			if(idx < N){
				sum[idx] = ptr1[idx] + ptr2[idx];
			}

		}

		/// Adds two arrays already resident on the GPU: sum = ptr1 + ptr2.
		/// All three pointers must be device pointers to at least N elements.
		/// @param ptr1 device pointer to the first input array
		/// @param ptr2 device pointer to the second input array
		/// @param sum  device pointer to the output array
		/// @param N    number of elements in each array
		template<typename T>
		void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling of N / threads)
			int blocks = N / threads + (N % threads == 0 ? 0 : 1);

			//call the kernel to do the addition
			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);

			//kernel launches do not return an error code directly;
			//check for launch-configuration errors explicitly
			HANDLE_ERROR( cudaGetLastError() );

		}

		/// Adds two arrays resident on the CPU: cpu_sum = ptr1 + ptr2.
		/// Allocates temporary device buffers, copies the inputs up,
		/// runs gpu_add(), and copies the result back.
		/// @param ptr1    host pointer to the first input array (length N)
		/// @param ptr2    host pointer to the second input array (length N)
		/// @param cpu_sum host pointer to the output array (length N)
		/// @param N       number of elements in each array
		template<typename T>
		void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){

			//allocate memory on the GPU for the input and output arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_sum;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_sum,  N * sizeof(T) ) );

			//copy the input arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);

			//copy the result back to the CPU
			//(cudaMemcpy is blocking, so no explicit synchronization is needed)
			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated device memory, checking for errors
			//for consistency with the other runtime calls above
			HANDLE_ERROR( cudaFree(gpu_ptr1) );
			HANDLE_ERROR( cudaFree(gpu_ptr2) );
			HANDLE_ERROR( cudaFree(gpu_sum) );

		}

	}
}
68 | + | ||
69 | + | ||
70 | + | ||
71 | +#endif | ||
0 | \ No newline at end of file | 72 | \ No newline at end of file |