Commit 9dad097417cd1fb2374291310961d0f33200069f

Authored by Tianshu Cheng
1 parent 6d30a707

removed image contour

Showing 1 changed file with 71 additions and 0 deletions   Show diff stats
stim/cuda/array_add.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_ADD_H
  2 +#define STIM_CUDA_ARRAY_ADD_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/devices.h>
  7 +#include <stim/cuda/error.h>
  8 +
namespace stim{
	namespace cuda{

		/// CUDA kernel: element-wise addition of two arrays.
		/// Computes sum[i] = ptr1[i] + ptr2[i] for every i in [0, N).
		/// Expects a 1D grid of 1D blocks covering at least N threads;
		/// out-of-range threads are guarded out below.
		/// All three pointers must be device pointers to at least N elements.
		template<typename T>
		__global__ void cuda_add(const T* ptr1, const T* ptr2, T* sum, unsigned int N){

			//calculate the 1D index for this thread
			//(unsigned to match N and avoid a signed/unsigned comparison)
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid rarely divides N evenly
			if(idx < N){
				sum[idx] = ptr1[idx] + ptr2[idx];
			}

		}

		/// GPU entry point: adds two device arrays of length N into a third.
		/// ptr1, ptr2 and sum must be device pointers with room for N elements.
		/// The launch is asynchronous; launch-configuration errors are caught
		/// via cudaGetLastError(), execution errors surface at the next sync.
		template<typename T>
		void gpu_add(const T* ptr1, const T* ptr2, T* sum, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division so all N elements are covered)
			int blocks = N / threads + (N % threads == 0 ? 0 : 1);

			//call the kernel to do the addition
			cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);

			//kernel launches do not return errors directly; catch bad launch configs here
			HANDLE_ERROR( cudaGetLastError() );

		}

		/// CPU entry point: adds two host arrays of length N into a third.
		/// ptr1, ptr2 and cpu_sum are host pointers; this function allocates
		/// temporary device buffers, copies the inputs up, runs gpu_add, and
		/// copies the result back. The final cudaMemcpy is blocking, so the
		/// result is valid in cpu_sum when this function returns.
		template<typename T>
		void cpu_add(const T* ptr1, const T* ptr2, T* cpu_sum, unsigned int N){

			//allocate memory on the GPU for the two inputs and the output
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_sum;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_sum,  N * sizeof(T) ) );

			//copy the input arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);

			//copy the result back to the CPU (blocking, so it also synchronizes)
			HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated device memory (checked, consistent with the calls above)
			HANDLE_ERROR( cudaFree(gpu_ptr1) );
			HANDLE_ERROR( cudaFree(gpu_ptr2) );
			HANDLE_ERROR( cudaFree(gpu_sum) );

		}

	}
}
  68 +
  69 +
  70 +
  71 +#endif
0 72 \ No newline at end of file
... ...