From 5cc0976c6028f692e7663810b4504bc34e9c8f10 Mon Sep 17 00:00:00 2001
From: David
Date: Mon, 14 Sep 2015 00:10:29 -0500
Subject: [PATCH] added separable convolution, organized the stim/cuda
 directory

---
 stim/cuda/array_abs.cuh                   |  59 ------------
 stim/cuda/array_add.cuh                   |  71 --------------
 stim/cuda/array_cart2polar.cuh            |  66 -------------
 stim/cuda/array_multiply.cuh              |  64 -------------
 stim/cuda/arraymath.cuh                   |  10 +-
 stim/cuda/arraymath/array_abs.cuh         |  59 ++++++++++++
 stim/cuda/arraymath/array_add.cuh         |  71 ++++++++++++++
 stim/cuda/arraymath/array_cart2polar.cuh  |  66 +++++++++++++
 stim/cuda/arraymath/array_multiply.cuh    |  64 ++++++++++++
 stim/cuda/callable.h                      |  10 --
 stim/cuda/conv2.cuh                       | 147 -----------------------------
 stim/cuda/cudatools.h                     |   5 +
 stim/cuda/cudatools/callable.h            |  10 ++
 stim/cuda/cudatools/devices.h             |  18 ++++
 stim/cuda/cudatools/error.h               |  48 ++++++++++
 stim/cuda/cudatools/glbind.h              |  79 ++++++++++++++++
 stim/cuda/cudatools/threads.h             |  34 +++++++
 stim/cuda/cudatools/timer.h               |  31 ++++++
 stim/cuda/devices.h                       |  18 ----
 stim/cuda/down_sample.cuh                 |   2 +-
 stim/cuda/error.h                         |  48 ----------
 stim/cuda/gaussian_blur.cuh               | 238 ++++++++---------------------------------------
 stim/cuda/glbind.h                        |  79 ----------------
 stim/cuda/gradient.cuh                    | 115 -----------------------
 stim/cuda/templates/conv2.cuh             | 147 +++++++++++++++++++++++++++++
 stim/cuda/templates/conv2sep.cuh          | 260 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 stim/cuda/templates/gaussian_blur.cuh     |  89 ++++++++++++++++++
 stim/cuda/templates/gradient.cuh          | 115 +++++++++++++++++++++++
 stim/cuda/threads.h                       |  34 -------
 stim/cuda/timer.h                         |  31 ------
 30 files changed, 1143 insertions(+), 945 deletions(-)
 delete mode 100644 stim/cuda/array_abs.cuh
 delete mode 100644 stim/cuda/array_add.cuh
 delete mode 100644 stim/cuda/array_cart2polar.cuh
 delete mode 100644 stim/cuda/array_multiply.cuh
 create mode
100644 stim/cuda/arraymath/array_abs.cuh create mode 100644 stim/cuda/arraymath/array_add.cuh create mode 100644 stim/cuda/arraymath/array_cart2polar.cuh create mode 100644 stim/cuda/arraymath/array_multiply.cuh delete mode 100644 stim/cuda/callable.h delete mode 100644 stim/cuda/conv2.cuh create mode 100644 stim/cuda/cudatools.h create mode 100644 stim/cuda/cudatools/callable.h create mode 100644 stim/cuda/cudatools/devices.h create mode 100644 stim/cuda/cudatools/error.h create mode 100644 stim/cuda/cudatools/glbind.h create mode 100644 stim/cuda/cudatools/threads.h create mode 100644 stim/cuda/cudatools/timer.h delete mode 100644 stim/cuda/devices.h delete mode 100644 stim/cuda/error.h delete mode 100644 stim/cuda/glbind.h delete mode 100644 stim/cuda/gradient.cuh create mode 100644 stim/cuda/templates/conv2.cuh create mode 100644 stim/cuda/templates/conv2sep.cuh create mode 100644 stim/cuda/templates/gaussian_blur.cuh create mode 100644 stim/cuda/templates/gradient.cuh delete mode 100644 stim/cuda/threads.h delete mode 100644 stim/cuda/timer.h diff --git a/stim/cuda/array_abs.cuh b/stim/cuda/array_abs.cuh deleted file mode 100644 index fc4d176..0000000 --- a/stim/cuda/array_abs.cuh +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_ABS_H -#define STIM_CUDA_ARRAY_ABS_H - -namespace stim{ - namespace cuda{ - template - __global__ void cuda_abs(T* a, unsigned int N){ - - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N) - a[i] = abs(a[i]); - } - - - template - void gpu_abs(T* a, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N%threads == 0 ? 0:1); - - //call the kernel to do the multiplication - cuda_abs <<< blocks, threads >>>(a, N); - - } - - - template - void cpu_abs(T* a, unsigned int N){ - - //calculate the number of bytes in the array - unsigned int bytes = N * sizeof(T); - - //allocate memory on the GPU for the array - T* gpuA; - HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_abs(gpuA, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpuA); - - } - - } //end namespace cuda -} //end namespace stim - -#endif \ No newline at end of file diff --git a/stim/cuda/array_add.cuh b/stim/cuda/array_add.cuh deleted file mode 100644 index a05353c..0000000 --- a/stim/cuda/array_add.cuh +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_ADD_H -#define STIM_CUDA_ARRAY_ADD_H - -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ - - //calculate the 1D index for this thread - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if(idx < N){ - sum[idx] = ptr1[idx] + ptr2[idx]; - } - - } - - template - void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N%threads == 0 ? 
0:1); - - //call the kernel to do the multiplication - cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N); - - } - - template - void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){ - - //allocate memory on the GPU for the array - T* gpu_ptr1; - T* gpu_ptr2; - T* gpu_sum; - HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) ); - HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_add(gpu_ptr1, gpu_ptr2 ,gpu_sum, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpu_ptr1); - cudaFree(gpu_ptr2); - cudaFree(gpu_sum); - - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/array_cart2polar.cuh b/stim/cuda/array_cart2polar.cuh deleted file mode 100644 index 8889b03..0000000 --- a/stim/cuda/array_cart2polar.cuh +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_CART2POLAR_H -#define STIM_CUDA_ARRAY_CART2POLAR_H - -namespace stim{ - namespace cuda{ - template - __global__ void cuda_cart2polar(T* a, unsigned int N){ - - - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N){ - float x = a[i * 2 + 0]; - float y = a[i * 2 + 1]; - float theta = atan2( y, x ) ; - float r = sqrt(x * x + y * y); - a[i * 2 + 0] = theta; - a[i * 2 + 1] = r; - } - } - - - template - void gpu_cart2polar(T* gpuGrad, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N % threads == 0 ? 0:1); - - //call the kernel to do the multiplication - cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); - - } - - - template - void cpu_cart2polar(T* a, unsigned int N){ - - //calculate the number of bytes in the array - unsigned int bytes = N * sizeof(T) * 2; - - //allocate memory on the GPU for the array - T* gpuA; - HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_cart2polar(gpuA, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpuA); - - } - - } -} - -#endif \ No newline at end of file diff --git a/stim/cuda/array_multiply.cuh b/stim/cuda/array_multiply.cuh deleted file mode 100644 index add19ba..0000000 --- a/stim/cuda/array_multiply.cuh +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef STIM_CUDA_ARRAY_MULTIPLY_H -#define STIM_CUDA_ARRAY_MULTIPLY_H - -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ - - //calculate the 1D index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - if(i < N) - lhs[i] *= rhs; - } - - template - void gpu_multiply(T* lhs, T rhs, unsigned int N){ - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = N / threads + (N%threads == 0 ? 
0:1); - - //call the kernel to do the multiplication - cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); - - } - - template - void cpu_multiply(T* lhs, T rhs, unsigned int N){ - - //calculate the number of bytes in the array - unsigned int bytes = N * sizeof(T); - - //allocate memory on the GPU for the array - T* gpuLHS; - HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_multiply(gpuLHS, rhs, N); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpuLHS); - } - - } -} - - - -#endif \ No newline at end of file diff --git a/stim/cuda/arraymath.cuh b/stim/cuda/arraymath.cuh index 16a36b8..43d0154 100644 --- a/stim/cuda/arraymath.cuh +++ b/stim/cuda/arraymath.cuh @@ -1,12 +1,10 @@ #ifndef STIM_CUDA_ARRAYMATH_H #define STIM_CUDA_ARRAYMATH_H -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include namespace stim{ namespace cuda{ diff --git a/stim/cuda/arraymath/array_abs.cuh b/stim/cuda/arraymath/array_abs.cuh new file mode 100644 index 0000000..fc4d176 --- /dev/null +++ b/stim/cuda/arraymath/array_abs.cuh @@ -0,0 +1,59 @@ +#ifndef STIM_CUDA_ARRAY_ABS_H +#define STIM_CUDA_ARRAY_ABS_H + +namespace stim{ + namespace cuda{ + template + __global__ void cuda_abs(T* a, unsigned int N){ + + //calculate the 1D index for this thread + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < N) + a[i] = abs(a[i]); + } + + + template + void gpu_abs(T* a, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N%threads == 0 ? 0:1); + + //call the kernel to do the multiplication + cuda_abs <<< blocks, threads >>>(a, N); + + } + + + template + void cpu_abs(T* a, unsigned int N){ + + //calculate the number of bytes in the array + unsigned int bytes = N * sizeof(T); + + //allocate memory on the GPU for the array + T* gpuA; + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_abs(gpuA, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpuA); + + } + + } //end namespace cuda +} //end namespace stim + +#endif \ No newline at end of file diff --git a/stim/cuda/arraymath/array_add.cuh b/stim/cuda/arraymath/array_add.cuh new file mode 100644 index 0000000..a05353c --- /dev/null +++ b/stim/cuda/arraymath/array_add.cuh @@ -0,0 +1,71 @@ +#ifndef STIM_CUDA_ARRAY_ADD_H +#define STIM_CUDA_ARRAY_ADD_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ + + //calculate the 1D index for this thread + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if(idx < N){ + sum[idx] = ptr1[idx] + ptr2[idx]; + } + + } + + template + void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N%threads == 0 ? 
0:1); + + //call the kernel to do the multiplication + cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N); + + } + + template + void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){ + + //allocate memory on the GPU for the array + T* gpu_ptr1; + T* gpu_ptr2; + T* gpu_sum; + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_add(gpu_ptr1, gpu_ptr2 ,gpu_sum, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpu_ptr1); + cudaFree(gpu_ptr2); + cudaFree(gpu_sum); + + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/arraymath/array_cart2polar.cuh b/stim/cuda/arraymath/array_cart2polar.cuh new file mode 100644 index 0000000..8889b03 --- /dev/null +++ b/stim/cuda/arraymath/array_cart2polar.cuh @@ -0,0 +1,66 @@ +#ifndef STIM_CUDA_ARRAY_CART2POLAR_H +#define STIM_CUDA_ARRAY_CART2POLAR_H + +namespace stim{ + namespace cuda{ + template + __global__ void cuda_cart2polar(T* a, unsigned int N){ + + + //calculate the 1D index for this thread + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < N){ + float x = a[i * 2 + 0]; + float y = a[i * 2 + 1]; + float theta = atan2( y, x ) ; + float r = sqrt(x * x + y * y); + a[i * 2 + 0] = theta; + a[i * 2 + 1] = r; + } + } + + + template + void gpu_cart2polar(T* gpuGrad, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N % threads == 0 ? 0:1); + + //call the kernel to do the multiplication + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); + + } + + + template + void cpu_cart2polar(T* a, unsigned int N){ + + //calculate the number of bytes in the array + unsigned int bytes = N * sizeof(T) * 2; + + //allocate memory on the GPU for the array + T* gpuA; + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_cart2polar(gpuA, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpuA); + + } + + } +} + +#endif \ No newline at end of file diff --git a/stim/cuda/arraymath/array_multiply.cuh b/stim/cuda/arraymath/array_multiply.cuh new file mode 100644 index 0000000..add19ba --- /dev/null +++ b/stim/cuda/arraymath/array_multiply.cuh @@ -0,0 +1,64 @@ +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H +#define STIM_CUDA_ARRAY_MULTIPLY_H + +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ + + //calculate the 1D index for this thread + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < N) + lhs[i] *= rhs; + } + + template + void gpu_multiply(T* lhs, T rhs, unsigned int N){ + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + int blocks = N / threads + (N%threads == 0 ? 
0:1); + + //call the kernel to do the multiplication + cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); + + } + + template + void cpu_multiply(T* lhs, T rhs, unsigned int N){ + + //calculate the number of bytes in the array + unsigned int bytes = N * sizeof(T); + + //allocate memory on the GPU for the array + T* gpuLHS; + HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_multiply(gpuLHS, rhs, N); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpuLHS); + } + + } +} + + + +#endif \ No newline at end of file diff --git a/stim/cuda/callable.h b/stim/cuda/callable.h deleted file mode 100644 index eefc437..0000000 --- a/stim/cuda/callable.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef CUDA_CALLABLE - -//define the CUDA_CALLABLE macro (will prefix all members) -#ifdef __CUDACC__ -#define CUDA_CALLABLE __host__ __device__ -#else -#define CUDA_CALLABLE -#endif - -#endif diff --git a/stim/cuda/conv2.cuh b/stim/cuda/conv2.cuh deleted file mode 100644 index a40fdd0..0000000 --- a/stim/cuda/conv2.cuh +++ /dev/null @@ -1,147 +0,0 @@ -#ifndef STIM_CUDA_CONV2_H -#define STIM_CUDA_CONV2_H - -#include -#include -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ - __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ - - - //the radius of mask - int r = (M - 1)/2; - - - //calculate the 1D index for this thread - //int idx = blockIdx.x * blockDim.x + threadIdx.x; - - //change 1D index to 2D cordinates - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y; - - int idx = j * w + i; - //unsigned long N = w * h; - - if(i < w && j < h){ - - //copy[idx] = tex2D(texObj, i+100, j+100); - //return; - - //tex2D(texObj, i, j); - - //allocate memory for result - T sum = 0; - - //for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){ - - //for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){ - - for (int y = j - r; y <= j + r; y++){ - - for (int x = i - r; x <= i + r; x++){ - - //idx to mask cordinates(xx, yy) - int xx = x - (i - r); - int yy = y - (j - r); - - //T temp = img[y * w + x] * mask[yy * M + xx]; - //sum += img[y * w + x] * mask[yy * M + xx]; - sum += tex2D(texObj, x, y) * 1.0;//mask[yy * M + xx]; - } - } - copy[idx] = sum; - } - } - - - template - void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){ - - unsigned long N = w * h; - - // Allocate CUDA array in device memory - - //define a channel descriptor for a single 32-bit channel - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, - cudaChannelFormatKindFloat); - cudaArray* cuArray; //declare the cuda array - cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array - - // Copy the image data from global memory to the array - cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), - cudaMemcpyDeviceToDevice); - - // Specify texture - struct cudaResourceDesc resDesc; //create a resource descriptor - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type - resDesc.res.array.array = 
cuArray; //add a pointer to the cuda array - - // Specify texture object parameters - struct cudaTextureDesc texDesc; //create a texture descriptor - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero - texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges) - texDesc.addressMode[1] = cudaAddressModeMirror; - texDesc.filterMode = cudaFilterModePoint; //use linear filtering - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) - texDesc.normalizedCoords = 0; //not using normalized coordinates - - // Create texture object - cudaTextureObject_t texObj = 0; - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - dim3 blocks(w / threads + 1, h); - - //call the kernel to do the multiplication - //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M); - cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M); - - } - - template - void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){ - - unsigned long N = w * h; - //allocate memory on the GPU for the array - T* gpu_img; - T* gpu_mask; - T* gpu_copy; - HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) ); - HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) ); - - //copy the array to the GPU - HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) ); - HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) ); - - //call the GPU version of this function - gpu_conv2(gpu_img, gpu_mask ,gpu_copy, w, h, M); - - //copy the array back to the CPU - HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) ); - - //free allocated memory - cudaFree(gpu_img); - cudaFree(gpu_mask); - cudaFree(gpu_copy); - - } - - } -} - - -#endif \ No newline at end of file diff --git a/stim/cuda/cudatools.h b/stim/cuda/cudatools.h new file mode 100644 index 0000000..1f1a9e6 --- /dev/null +++ b/stim/cuda/cudatools.h @@ -0,0 +1,5 @@ +#include +#include +#include +#include +#include \ No newline at end of file diff --git a/stim/cuda/cudatools/callable.h b/stim/cuda/cudatools/callable.h new file mode 100644 index 0000000..eefc437 --- /dev/null +++ b/stim/cuda/cudatools/callable.h @@ -0,0 +1,10 @@ +#ifndef CUDA_CALLABLE + +//define the CUDA_CALLABLE macro (will prefix all members) +#ifdef __CUDACC__ +#define CUDA_CALLABLE __host__ __device__ +#else +#define CUDA_CALLABLE +#endif + +#endif diff --git a/stim/cuda/cudatools/devices.h b/stim/cuda/cudatools/devices.h new file mode 100644 index 0000000..4dce378 --- /dev/null +++ b/stim/cuda/cudatools/devices.h @@ -0,0 +1,18 @@ +#ifndef RTS_CUDA_DEVICES +#define RTS_CUDA_DEVICES + +#include + +namespace stim{ + +int maxThreadsPerBlock() +{ + int device; + cudaGetDevice(&device); //get the id of the current device + cudaDeviceProp props; //device property structure + cudaGetDeviceProperties(&props, device); + return props.maxThreadsPerBlock; +} +} //end namespace rts + +#endif diff --git a/stim/cuda/cudatools/error.h b/stim/cuda/cudatools/error.h new file mode 100644 index 0000000..b49cf4b --- /dev/null +++ b/stim/cuda/cudatools/error.h @@ -0,0 +1,48 @@ +#include +#include +using namespace std; +#include "cuda_runtime.h" +#include "device_launch_parameters.h" 
+#include "cufft.h"
+
+#ifndef CUDA_HANDLE_ERROR_H
+#define CUDA_HANDLE_ERROR_H
+
+//handle error macro
+static void HandleError( cudaError_t err, const char *file, int line ) {
+	if (err != cudaSuccess) {
+		//FILE* outfile = fopen("cudaErrorLog.txt", "w");
+		//fprintf(outfile, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
+		//fclose(outfile);
+		printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
+		//exit( EXIT_FAILURE );
+
+	}
+}
+#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
+
+static void CufftError( cufftResult err )
+{
+	if (err != CUFFT_SUCCESS)
+	{
+		if(err == CUFFT_INVALID_PLAN)
+			cout<<"The plan parameter is not a valid handle."<<endl;
+		else if(err == CUFFT_ALLOC_FAILED)
+			cout<<"CUFFT failed to allocate GPU or CPU memory."<<endl;
+		else if(err == CUFFT_INVALID_VALUE)
+			cout<<"User specified an invalid pointer or parameter."<<endl;
+		else if(err == CUFFT_INTERNAL_ERROR)
+			cout<<"Driver or internal CUFFT library error."<<endl;
+		else if(err == CUFFT_EXEC_FAILED)
+			cout<<"CUFFT failed to execute an FFT on the GPU."<<endl;
+		else if(err == CUFFT_SETUP_FAILED)
+			cout<<"The CUFFT library failed to initialize."<<endl;
+		else if(err == CUFFT_INVALID_SIZE)
+			cout<<"User specified an invalid transform size."<<endl;
+		else
+			cout<<"Unknown CUFFT error."<<endl;
+	}
+}
+
+#endif
diff --git a/stim/cuda/cudatools/glbind.h b/stim/cuda/cudatools/glbind.h
new file mode 100644
index 0000000..bd841cc
--- /dev/null
+++ b/stim/cuda/cudatools/glbind.h
@@ -0,0 +1,79 @@
+#ifndef RTS_GL_BIND_H
+#define RTS_GL_BIND_H
+
+#include <GL/glew.h>
+#include <GL/gl.h>
+
+#include <stdio.h>
+#include <string.h>
+
+//#include
+#include "cuda_gl_interop.h"
+#include "../../gl/error.h"
+
+namespace stim
+{
+
+static void InitGLEW()
+{
+	//Initialize the GLEW toolkit
+
+	GLenum err = glewInit();
+	if(GLEW_OK != err)
+	{
+		printf("Error starting GLEW.");
+	}
+	fprintf(stdout, "Status: Using GLEW %s\n", glewGetString(GLEW_VERSION));
+}
+
+static void cudaSetDevice(int major = 1, int minor = 3)
+{
+	cudaDeviceProp prop;
+	int dev;
+
+	//find a CUDA device that can handle an offscreen buffer
+	int num_gpu;
+	HANDLE_ERROR(cudaGetDeviceCount(&num_gpu));
+	printf("Number of CUDA devices detected: %d\n", num_gpu);
+	memset(&prop, 0, sizeof(cudaDeviceProp));
+	prop.major=major;
+	prop.minor=minor;
+	HANDLE_ERROR(cudaChooseDevice(&dev, &prop));
+	HANDLE_ERROR(cudaGetDeviceProperties(&prop, dev));
+	HANDLE_ERROR(cudaGLSetGLDevice(dev));
+}
+
+static void* cudaMapResource(cudaGraphicsResource* cudaBufferResource)
+{
+	//this function takes a predefined CUDA resource and maps it to a pointer
+	void* buffer;
+	HANDLE_ERROR(cudaGraphicsMapResources(1, &cudaBufferResource, NULL));
+	size_t size;
+	HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer( (void**)&buffer, &size, cudaBufferResource));
+	return buffer;
+}
+static void cudaUnmapResource(cudaGraphicsResource* resource)
+{
+	//this function unmaps the CUDA resource so it can be used by OpenGL
+	HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL));
+}
+
+static void cudaCreateRenderBuffer(GLuint &glBufferName, cudaGraphicsResource* &cudaBufferResource, int resX, int resY)
+{
+	//delete the previous buffer name and resource
+	if(cudaBufferResource != 0)
+		HANDLE_ERROR(cudaGraphicsUnregisterResource(cudaBufferResource));
+	if(glBufferName != 0)
+		glDeleteBuffers(1, &glBufferName);
+
+	//generate an OpenGL offscreen buffer
+	glGenBuffers(1, &glBufferName);
+
+	//bind the buffer - directs all calls to this buffer
+	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, glBufferName);
+	glBufferData(GL_PIXEL_UNPACK_BUFFER, resX * resY * sizeof(uchar3), NULL, GL_DYNAMIC_DRAW_ARB);
+	CHECK_OPENGL_ERROR
+	HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&cudaBufferResource, glBufferName, cudaGraphicsMapFlagsNone));
+}
+}
+#endif
diff --git a/stim/cuda/cudatools/threads.h b/stim/cuda/cudatools/threads.h
new file mode 100644
index 0000000..0315082
--- /dev/null
+++ b/stim/cuda/cudatools/threads.h
@@ -0,0 +1,34 @@
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include <stim/cuda/cudatools/callable.h>
+
+#ifndef CUDA_THREADS_H
+#define CUDA_THREADS_H
+
+#define MAX_GRID 65535
+
+__device__ unsigned int ThreadIndex1D()
+{
+	return blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+dim3 GenGrid1D(unsigned int N, unsigned int blocksize = 128)
+{
+	dim3 dimgrid;
+
+	dimgrid.x = (N + blocksize - 1)/blocksize;
+	dimgrid.y = 1;
+	dimgrid.z = 1;
+
+
if(dimgrid.x > MAX_GRID) + { + dimgrid.y = (dimgrid.x + MAX_GRID - 1) / MAX_GRID; + dimgrid.x = MAX_GRID; + } + + return dimgrid; + +} + + +#endif diff --git a/stim/cuda/cudatools/timer.h b/stim/cuda/cudatools/timer.h new file mode 100644 index 0000000..27d6352 --- /dev/null +++ b/stim/cuda/cudatools/timer.h @@ -0,0 +1,31 @@ +#ifndef STIM_CUDA_TIMER +#define STIM_CUDA_TIMER + +static cudaEvent_t tStartEvent; +static cudaEvent_t tStopEvent; + +namespace stim{ + +/// These functions calculate the time between GPU functions in milliseconds +static void gpuStartTimer() +{ + //set up timing events + cudaEventCreate(&tStartEvent); + cudaEventCreate(&tStopEvent); + cudaEventRecord(tStartEvent, 0); +} + +static float gpuStopTimer() +{ + cudaEventRecord(tStopEvent, 0); + cudaEventSynchronize(tStopEvent); + float elapsedTime; + cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent); + cudaEventDestroy(tStartEvent); + cudaEventDestroy(tStopEvent); + return elapsedTime; +} + +} //end namespace stim + +#endif \ No newline at end of file diff --git a/stim/cuda/devices.h b/stim/cuda/devices.h deleted file mode 100644 index 4dce378..0000000 --- a/stim/cuda/devices.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef RTS_CUDA_DEVICES -#define RTS_CUDA_DEVICES - -#include - -namespace stim{ - -int maxThreadsPerBlock() -{ - int device; - cudaGetDevice(&device); //get the id of the current device - cudaDeviceProp props; //device property structure - cudaGetDeviceProperties(&props, device); - return props.maxThreadsPerBlock; -} -} //end namespace rts - -#endif diff --git a/stim/cuda/down_sample.cuh b/stim/cuda/down_sample.cuh index 73081e3..ecd19c9 100644 --- a/stim/cuda/down_sample.cuh +++ b/stim/cuda/down_sample.cuh @@ -89,7 +89,7 @@ namespace stim{ gpu_down_sample(gpuI, gpuI0, resize, x, y); //copy the image data to the GPU - cudaMemcpy(re_image, gpuI, bytes_ds, cudaMemcpyHostToDevice); + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyHostToDevice); cudaFree(gpuI0); cudeFree(gpuI); diff --git a/stim/cuda/error.h b/stim/cuda/error.h deleted file mode 100644 index b49cf4b..0000000 --- a/stim/cuda/error.h +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -using namespace std; -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "cufft.h" - -#ifndef CUDA_HANDLE_ERROR_H -#define CUDA_HANDLE_ERROR_H - -//handle error macro -static void HandleError( cudaError_t err, const char *file, int line ) { - if (err != cudaSuccess) { - //FILE* outfile = fopen("cudaErrorLog.txt", "w"); - //fprintf(outfile, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); - //fclose(outfile); - printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line ); - //exit( EXIT_FAILURE ); - - } -} -#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) - -static void CufftError( cufftResult err ) -{ - if (err != CUFFT_SUCCESS) - { - if(err == CUFFT_INVALID_PLAN) - cout<<"The plan parameter is not a valid handle."< #include -#include -#include +#include #include +#include //GPU-based separable convolution algorithm #define pi 3.14159 @@ -13,228 +13,74 @@ namespace stim{ namespace cuda{ template - __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ + void gen_gaussian(T* out, T sigma, unsigned int width){ - //generate a pointer to shared memory (size will be specified as a kernel parameter) - extern __shared__ T s[]; + //fill the kernel with a gaussian + for(unsigned int xi = 0; xi < width; xi++){ - int kr = sigma * 4; //calculate the kernel 
radius - - //get a pointer to the gaussian in memory - T* g = (T*)&s[blockDim.x + 2 * kr]; - - //calculate the start point for this block - int bxi = blockIdx.x * blockDim.x; - int byi = blockIdx.y; - - //copy the portion of the image necessary for this block to shared memory - stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); - - //calculate the thread index and block index - int ti = threadIdx.x; - - //calculate the spatial coordinate for this thread - int xi = bxi + ti; - - //pre-compute the gaussian values for each kernel point - T a = 1.0 / (sigma * sqrt(2 * pi)); - T c = - 1.0 / (2*sigma*sigma); - int ki; - - //use the first 2kr+1 threads to evaluate a gaussian and store the result - if(ti <= 2* kr+1){ - ki = ti - kr; - g[ti] = a * exp((ki*ki) * c); + float x = (float)xi - (float)(width/2); //calculate the x position of the gaussian + float g = 1.0 / (sigma * sqrt(2 * 3.14159)) * exp( - (x*x) / (2*sigma*sigma) ); + out[xi] = g; } - //make sure that all writing to shared memory is done before continuing - __syncthreads(); - - //if the current pixel is outside of the image - if(bxi + ti > x || byi > y) - return; - - - - //calculate the coordinates of the current thread in shared memory - int si = ti + kr; - - T sum = 0; //running weighted sum across the kernel - - - //for each element of the kernel - for(int ki = -kr; ki <= kr; ki++){ - sum += g[ki + kr] * s[si + ki]; - } - - //calculate the 1D image index for this thread - unsigned int i = byi * x + xi; - - //output the result to global memory - out[i] = sum; } template - __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ - - //generate a pointer to shared memory (size will be specified as a kernel parameter) - extern __shared__ T s[]; - - int kr = sigma * 4; //calculate the kernel radius + void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){ - //get a pointer to the gaussian in memory - T* g = (T*)&s[blockDim.y + 2 * kr]; + //allocate space for the kernel + unsigned int kwidth = sigma * 8 + 1; + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - //calculate the start point for this block - int bxi = blockIdx.x; - int byi = blockIdx.y * blockDim.y; + //fill the kernel with a gaussian + gen_gaussian(kernel0, sigma, kwidth); - //copy the portion of the image necessary for this block to shared memory - stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); + //copy the kernel to the GPU + T* gpuKernel0; + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); - //calculate the thread index and block index - int ti = threadIdx.y; + //perform the gaussian blur as a separable convolution + stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth); - //calculate the spatial coordinate for this thread - int yi = byi + ti; + HANDLE_ERROR(cudaFree(gpuKernel0)); - //pre-compute the gaussian values for each kernel point - T a = 1.0 / (sigma * sqrt(2 * pi)); - T c = - 1.0 / (2*sigma*sigma); - int ki; - - //use the first 2kr+1 threads to evaluate a gaussian and store the result - if(ti <= 2* kr+1){ - ki = ti - kr; - g[ti] = a * exp((ki*ki) * c); - } - - //make sure that all writing to shared memory is done before continuing - __syncthreads(); - - //if the current pixel is outside of the image - if(bxi >= x || yi >= y) - return; - - - - //calculate the 
coordinates of the current thread in shared memory - int si = ti + kr; - - T sum = 0; //running weighted sum across the kernel - - - //for each element of the kernel - for(int ki = -kr; ki <= kr; ki++){ - sum += g[ki + kr] * s[si + ki]; - } - - //calculate the 1D image index for this thread - unsigned int i = yi * x + bxi; - - //output the result to global memory - out[i] = sum; } - /// Applies a Gaussian blur to a 2D image stored on the GPU template - void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ + void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; + //allocate space for the kernel + unsigned int kwidth = sigma * 8 + 1; + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - // Allocate CUDA array in device memory - - //define a channel descriptor for a single 32-bit channel - cudaChannelFormatDesc channelDesc = - cudaCreateChannelDesc(32, 0, 0, 0, - cudaChannelFormatKindFloat); - cudaArray* cuArray; //declare the cuda array - cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array - - // Copy the image data from global memory to the array - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, - cudaMemcpyDeviceToDevice); - - // Specify texture - struct cudaResourceDesc resDesc; //create a resource descriptor - memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero - resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type - resDesc.res.array.array = cuArray; //add a pointer to the cuda array - - // Specify texture object parameters - struct cudaTextureDesc texDesc; //create a texture descriptor - memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero - texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) - texDesc.addressMode[1] = cudaAddressModeWrap; - texDesc.filterMode = cudaFilterModePoint; //use linear filtering - texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) - texDesc.normalizedCoords = 0; //not using normalized coordinates - - // Create texture object - cudaTextureObject_t texObj = 0; - cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + //fill the kernel with a gaussian + gen_gaussian(kernel0, sigma, kwidth); - - //get the maximum number of threads per block for the CUDA device - int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(max_threads, 1); - - //calculate the number of blocks - dim3 blocks(x / threads.x + 1, y); - - //calculate the shared memory used in the kernel - unsigned int pixel_bytes = max_threads * 4; //bytes devoted to pixel data being processed - unsigned int apron_bytes = sigma * 8 * 4; //bytes devoted to pixels outside the window - unsigned int gaussian_bytes = (sigma * 8 + 1) * 4; //bytes devoted to memory used to store the pre-computed Gaussian window - unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes shared memory used - - //blur the image along the x-axis - gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); - - // Copy the x-blurred data from global memory to the texture - cudaMemcpyToArray(cuArray, 0, 0, image, bytes, - cudaMemcpyDeviceToDevice); - - //transpose the block and thread dimensions - threads.x = 1; - threads.y = max_threads; - blocks.x = x; - blocks.y = y / threads.y + 1; - - //blur the image along the y-axis - 
gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); + //copy the kernel to the GPU + T* gpuKernel0; + HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice)); + + //perform the gaussian blur as a separable convolution + stim::cuda::gpu_conv2sep(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth); - //free allocated memory - cudaFree(cuArray); + HANDLE_ERROR(cudaFree(gpuKernel0)); } /// Applies a Gaussian blur to a 2D image stored on the CPU template - void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ + void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){ - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = sizeof(T) * pixels; + //allocate space for the kernel + unsigned int kwidth = sigma * 8 + 1; + float* kernel0 = (float*) malloc( kwidth * sizeof(float) ); - //allocate space on the GPU - T* gpuI0; - cudaMalloc(&gpuI0, bytes); - - - //copy the image data to the GPU - cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); - - //run the GPU-based version of the algorithm - gpu_gaussian_blur_2d(gpuI0, sigma, x, y); + //fill the kernel with a gaussian + gen_gaussian(kernel0, sigma, kwidth); - //copy the image data from the device - cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); - - //free allocated memory - cudaFree(gpuI0); + //perform the gaussian blur as a separable convolution + stim::cuda::cpu_conv2sep(image, x, y, kernel0, kwidth, kernel0, kwidth); + } }; diff --git a/stim/cuda/glbind.h b/stim/cuda/glbind.h deleted file mode 100644 index bd841cc..0000000 --- a/stim/cuda/glbind.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef RTS_GL_BIND_H -#define RTS_GL_BIND_H - -#include -#include - -#include -#include - -//#include -#include "cuda_gl_interop.h" -#include "../gl/error.h" - -namespace stim -{ - -static void InitGLEW() -{ - //Initialize the GLEW toolkit - - GLenum err = glewInit(); - if(GLEW_OK != err) - { - printf("Error starting GLEW."); - } - fprintf(stdout, "Status: Using GLEW %s\n", glewGetString(GLEW_VERSION)); -} - -static void cudaSetDevice(int major = 1, int minor = 3) -{ - cudaDeviceProp prop; - int dev; - - //find a CUDA device that can handle an offscreen buffer - int num_gpu; - HANDLE_ERROR(cudaGetDeviceCount(&num_gpu)); - printf("Number of CUDA devices detected: %d\n", num_gpu); - memset(&prop, 0, sizeof(cudaDeviceProp)); - prop.major=major; - prop.minor=minor; - HANDLE_ERROR(cudaChooseDevice(&dev, &prop)); - HANDLE_ERROR(cudaGetDeviceProperties(&prop, dev)); - HANDLE_ERROR(cudaGLSetGLDevice(dev)); -} - -static void* cudaMapResource(cudaGraphicsResource* cudaBufferResource) -{ - //this function takes a predefined CUDA resource and maps it to a pointer - void* buffer; - HANDLE_ERROR(cudaGraphicsMapResources(1, &cudaBufferResource, NULL)); - size_t size; - HANDLE_ERROR(cudaGraphicsResourceGetMappedPointer( (void**)&buffer, &size, cudaBufferResource)); - return buffer; -} -static void cudaUnmapResource(cudaGraphicsResource* resource) -{ - //this function unmaps the CUDA resource so it can be used by OpenGL - HANDLE_ERROR(cudaGraphicsUnmapResources(1, &resource, NULL)); -} - -static void cudaCreateRenderBuffer(GLuint &glBufferName, cudaGraphicsResource* &cudaBufferResource, int resX, int resY) -{ - //delete the previous buffer name and resource - if(cudaBufferResource != 0) - HANDLE_ERROR(cudaGraphicsUnregisterResource(cudaBufferResource)); - if(glBufferName != 0) - glDeleteBuffers(1, &glBufferName); - - //generate 
an OpenGL offscreen buffer - glGenBuffers(1, &glBufferName); - - //bind the buffer - directs all calls to this buffer - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, glBufferName); - glBufferData(GL_PIXEL_UNPACK_BUFFER, resX * resY * sizeof(uchar3), NULL, GL_DYNAMIC_DRAW_ARB); - CHECK_OPENGL_ERROR - HANDLE_ERROR(cudaGraphicsGLRegisterBuffer(&cudaBufferResource, glBufferName, cudaGraphicsMapFlagsNone)); -} -} -#endif diff --git a/stim/cuda/gradient.cuh b/stim/cuda/gradient.cuh deleted file mode 100644 index 95931dc..0000000 --- a/stim/cuda/gradient.cuh +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef STIM_CUDA_GRADIENT_H -#define STIM_CUDA_GRADIENT_H - -#include -#include -#include -#include - -namespace stim{ - namespace cuda{ - - template - __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ - - //calculate the 1D image index for this thread - int i = blockIdx.x * blockDim.x + threadIdx.x; - - //convert this to 2D pixel coordinates - int yi = i / x; - int xi = i - (yi * x); - - //return if the pixel is outside of the image - if(xi >= x || yi >= y) return; - - //calculate indices for the forward difference - int i_xp = yi * x + (xi + 1); - int i_yp = (yi + 1) * x + xi; - - //use forward differences if a coordinate is zero - if(xi == 0) - out[i * 2 + 0] = in[i_xp] - in[i]; - if(yi == 0) - out[i * 2 + 1] = in[i_yp] - in[i]; - - //calculate indices for the backward difference - int i_xn = yi * x + (xi - 1); - int i_yn = (yi - 1) * x + xi; - - //use backward differences if the coordinate is at the maximum edge - if(xi == x-1) - out[i * 2 + 0] = in[i] - in[i_xn]; - if(yi == y-1) - out[i * 2 + 1] = in[i] - in[i_yn]; - - //otherwise use central differences - if(xi > 0 && xi < x-1) - out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2; - - if(yi > 0 && yi < y-1) - out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2; - - } - - template - //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ - void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - - //allocate space on the GPU for the input image - //T* gpuI; - //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); - - //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); - - - //allocate space on the GPU for the output gradient image - //T* gpuGrad; - //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) - - //get the maximum number of threads per block for the CUDA device - int threads = stim::maxThreadsPerBlock(); - - //calculate the number of blocks - int blocks = pixels / threads + (pixels%threads == 0 ? 
0:1); - - //call the GPU kernel to determine the gradient - gradient_2d <<< blocks, threads >>>(gpuGrad, gpuI, x, y); - - } - - template - void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ - - //get the number of pixels in the image - unsigned int pixels = x * y; - unsigned int bytes = pixels * sizeof(T); - - //allocate space on the GPU for the input image - T* gpuIn; - HANDLE_ERROR(cudaMalloc(&gpuIn, bytes)); - - //copy the image data to the GPU - HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice)); - - //allocate space on the GPU for the output gradient image - T* gpuOut; - cudaMalloc(&gpuOut, bytes * 2); //the output image will have two channels (x, y) - - //call the GPU version of this function - gpu_gradient_2d(gpuOut, gpuIn, x, y); - - //copy the results to the CPU - cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost); - - //free allocated memory - cudaFree(gpuOut); - } - - } -} - - -#endif \ No newline at end of file diff --git a/stim/cuda/templates/conv2.cuh b/stim/cuda/templates/conv2.cuh new file mode 100644 index 0000000..a40fdd0 --- /dev/null +++ b/stim/cuda/templates/conv2.cuh @@ -0,0 +1,147 @@ +#ifndef STIM_CUDA_CONV2_H +#define STIM_CUDA_CONV2_H + +#include +#include +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ + __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ + + + //the radius of mask + int r = (M - 1)/2; + + + //calculate the 1D index for this thread + //int idx = blockIdx.x * blockDim.x + threadIdx.x; + + //change 1D index to 2D cordinates + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y; + + int idx = j * w + i; + //unsigned long N = w * h; + + if(i < w && j < h){ + + //copy[idx] = tex2D(texObj, i+100, j+100); + //return; + + //tex2D(texObj, i, j); + + //allocate memory for result + T sum = 0; + + //for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){ + + //for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){ + + for (int y = j - r; y <= j + r; y++){ + + for (int x = i - r; x <= i + r; x++){ + + //idx to mask cordinates(xx, yy) + int xx = x - (i - r); + int yy = y - (j - r); + + //T temp = img[y * w + x] * mask[yy * M + xx]; + //sum += img[y * w + x] * mask[yy * M + xx]; + sum += tex2D(texObj, x, y) * 1.0;//mask[yy * M + xx]; + } + } + copy[idx] = sum; + } + } + + + template + void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){ + + unsigned long N = w * h; + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; 
//create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeMirror; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + dim3 blocks(w / threads + 1, h); + + //call the kernel to do the multiplication + //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M); + cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M); + + } + + template + void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){ + + unsigned long N = w * h; + //allocate memory on the GPU for the array + T* gpu_img; + T* gpu_mask; + T* gpu_copy; + HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_conv2(gpu_img, gpu_mask ,gpu_copy, w, h, M); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpu_img); + cudaFree(gpu_mask); + cudaFree(gpu_copy); + + } + + } +} + + +#endif \ No newline at end of file diff --git a/stim/cuda/templates/conv2sep.cuh b/stim/cuda/templates/conv2sep.cuh new file mode 100644 index 0000000..d63d4e0 --- /dev/null +++ b/stim/cuda/templates/conv2sep.cuh @@ -0,0 +1,260 @@ +#ifndef STIM_CUDA_CONV2SEP_H +#define STIM_CUDA_CONV2SEP_H + +#include +#include +#include +#include +#include +#include + +#define pi 3.14159 + +namespace stim{ + namespace cuda{ + + template + __global__ void conv2sep_0(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ T s[]; + + int kr = k0/2; //calculate the kernel radius + + //get a pointer to the gaussian in memory + T* g = (T*)&s[blockDim.x + 2 * kr]; + + //calculate the start point for this block + int bxi = blockIdx.x * blockDim.x; + int byi = blockIdx.y; + + //copy the portion of the image necessary for this block to shared memory + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); + + //calculate the thread index + int ti = threadIdx.x; + + //calculate the spatial coordinate for this thread + int xi = bxi + ti; + int yi = byi; + + + //use the first 2kr+1 threads to transfer the kernel to shared memory + if(ti < k0){ + g[ti] = kernel0[ti]; + } + + //make sure that all writing to shared memory is done before continuing + __syncthreads(); + + //if the current pixel is outside of the image + if(xi > x || yi > y) + return; + + + //calculate the coordinates of the current thread in shared memory + int si 
= ti + kr; + + T sum = 0; //running weighted sum across the kernel + + + //for each element of the kernel + for(int ki = -kr; ki <= kr; ki++){ + sum += g[ki + kr] * s[si + ki]; + } + + //calculate the 1D image index for this thread + unsigned int i = byi * x + xi; + + //output the result to global memory + out[i] = sum; + } + + template + __global__ void conv2sep_1(T* out, cudaTextureObject_t in, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0){ + + //generate a pointer to shared memory (size will be specified as a kernel parameter) + extern __shared__ T s[]; + + int kr = k0/2; //calculate the kernel radius + + //get a pointer to the gaussian in memory + T* g = (T*)&s[blockDim.y + 2 * kr]; + + //calculate the start point for this block + int bxi = blockIdx.x; + int byi = blockIdx.y * blockDim.y; + + //copy the portion of the image necessary for this block to shared memory + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); + + //calculate the thread index + int ti = threadIdx.y; + + //calculate the spatial coordinate for this thread + int xi = bxi; + int yi = byi + ti; + + + //use the first 2kr+1 threads to transfer the kernel to shared memory + if(ti < k0){ + g[ti] = kernel0[ti]; + } + + //make sure that all writing to shared memory is done before continuing + __syncthreads(); + + //if the current pixel is outside of the image + if(xi > x || yi > y) + return; + + + //calculate the coordinates of the current thread in shared memory + int si = ti + kr; + + T sum = 0; //running weighted sum across the kernel + + + //for each element of the kernel + for(int ki = -kr; ki <= kr; ki++){ + sum += g[ki + kr] * s[si + ki]; + } + + //calculate the 1D image index for this thread + unsigned int i = yi * x + xi; + + //output the result to global memory + out[i] = sum; + } + + template + void tex_conv2sep(T* out, unsigned int x, unsigned int y, + cudaTextureObject_t texObj, cudaArray* cuArray, + T* kernel0, unsigned int k0, + T* kernel1, unsigned int k1){ + + //get the maximum number of threads per block for the CUDA device + int max_threads = stim::maxThreadsPerBlock(); + dim3 threads(max_threads, 1); + + //calculate the number of blocks + dim3 blocks(x / threads.x + 1, y); + + //calculate the shared memory used in the kernel + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed + unsigned int apron_bytes = k0/2 * sizeof(T); //bytes devoted to the apron on each side of the window + unsigned int gaussian_bytes = k0 * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window + unsigned int shared_bytes = pixel_bytes + 2 * apron_bytes + gaussian_bytes; //total number of bytes shared memory used + + //blur the image along the x-axis + conv2sep_0 <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel0, k0); + + // Copy the x-blurred data from global memory to the texture + cudaMemcpyToArray(cuArray, 0, 0, out, x * y * sizeof(T), + cudaMemcpyDeviceToDevice); + + //transpose the block and thread dimensions + threads.x = 1; + threads.y = max_threads; + blocks.x = x; + blocks.y = y / threads.y + 1; + + //blur the image along the y-axis + conv2sep_1 <<< blocks, threads, shared_bytes >>>(out, texObj, x, y, kernel1, k1); + + } + + template + void gpu_conv2sep(T* image, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0, + T* kernel1, unsigned int k1){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * 
pixels; + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; //create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeWrap; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + //call the texture version of the separable convolution function + tex_conv2sep(image, x, y, texObj, cuArray, kernel0, k0, kernel1, k1); + + //free allocated memory + cudaFree(cuArray); + + } + + /// Applies a Gaussian blur to a 2D image stored on the CPU + template + void cpu_conv2sep(T* image, unsigned int x, unsigned int y, + T* kernel0, unsigned int k0, + T* kernel1, unsigned int k1){ + + //get the number of pixels in the image + unsigned int pixels = x * y; + unsigned int bytes = sizeof(T) * pixels; + + //---------Allocate Image--------- + //allocate space on the GPU for the image + T* gpuI0; + HANDLE_ERROR(cudaMalloc(&gpuI0, bytes)); + + //copy the image data to the GPU + HANDLE_ERROR(cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice)); + + //---------Allocate Kernel-------- + //allocate and copy the 0 (x) kernel + T* gpuK0; + HANDLE_ERROR(cudaMalloc(&gpuK0, k0 * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(gpuK0, kernel0, k0 * sizeof(T), cudaMemcpyHostToDevice)); + + //allocate and copy the 1 (y) kernel + T* gpuK1; + HANDLE_ERROR(cudaMalloc(&gpuK1, k1 * sizeof(T))); + HANDLE_ERROR(cudaMemcpy(gpuK1, kernel1, k1 * sizeof(T), cudaMemcpyHostToDevice)); + + //run the GPU-based version of the algorithm + gpu_conv2sep(gpuI0, x, y, gpuK0, k0, gpuK1, k1); + + //copy the image data from the device + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); + + //free allocated memory + cudaFree(gpuI0); + } + + }; +}; + +#endif \ No newline at end of file diff --git a/stim/cuda/templates/gaussian_blur.cuh b/stim/cuda/templates/gaussian_blur.cuh new file mode 100644 index 0000000..0b060f0 --- /dev/null +++ b/stim/cuda/templates/gaussian_blur.cuh @@ -0,0 +1,89 @@ +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H +#define STIM_CUDA_GAUSSIAN_BLUR_H + +#include +#include +#include +#include +#include //GPU-based separable convolution algorithm + +#define pi 3.14159 + +namespace stim{ + namespace cuda{ + + template + void gen_gaussian(T* out, T sigma, unsigned int width){ + + //fill the kernel with a gaussian + for(unsigned int xi = 0; xi < width; xi++){ + + float x = 
diff --git a/stim/cuda/templates/gaussian_blur.cuh b/stim/cuda/templates/gaussian_blur.cuh
new file mode 100644
index 0000000..0b060f0
--- /dev/null
+++ b/stim/cuda/templates/gaussian_blur.cuh
@@ -0,0 +1,89 @@
+#ifndef STIM_CUDA_GAUSSIAN_BLUR_H
+#define STIM_CUDA_GAUSSIAN_BLUR_H
+
+#include
+#include
+#include
+#include
+#include <stim/cuda/templates/conv2sep.cuh>		//GPU-based separable convolution algorithm
+
+#define pi	3.14159
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		void gen_gaussian(T* out, T sigma, unsigned int width){
+
+			//fill the kernel with a gaussian
+			for(unsigned int xi = 0; xi < width; xi++){
+
+				float x = (float)xi - (float)(width/2);		//calculate the x position of the gaussian
+				float g = 1.0 / (sigma * sqrt(2 * pi)) * exp( - (x*x) / (2*sigma*sigma) );
+				out[xi] = g;
+			}
+
+		}
+
+		template<typename T>
+		void tex_gaussian_blur2(T* out, T sigma, unsigned int x, unsigned int y, cudaTextureObject_t texObj, cudaArray* cuArray){
+
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//allocate space for the kernel on the GPU and copy it there
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::tex_conv2sep(out, x, y, texObj, cuArray, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		template<typename T>
+		void gpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//allocate space for the kernel on the GPU and copy it there
+			T* gpuKernel0;
+			HANDLE_ERROR(cudaMalloc(&gpuKernel0, kwidth * sizeof(T)));
+			HANDLE_ERROR(cudaMemcpy(gpuKernel0, kernel0, kwidth * sizeof(T), cudaMemcpyHostToDevice));
+			free(kernel0);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::gpu_conv2sep(image, x, y, gpuKernel0, kwidth, gpuKernel0, kwidth);
+
+			HANDLE_ERROR(cudaFree(gpuKernel0));
+
+		}
+
+		/// Applies a Gaussian blur to a 2D image stored on the CPU
+		template<typename T>
+		void cpu_gaussian_blur2(T* image, T sigma, unsigned int x, unsigned int y){
+
+			//allocate space for the kernel
+			unsigned int kwidth = sigma * 8 + 1;
+			float* kernel0 = (float*) malloc( kwidth * sizeof(float) );
+
+			//fill the kernel with a gaussian
+			gen_gaussian(kernel0, sigma, kwidth);
+
+			//perform the gaussian blur as a separable convolution
+			stim::cuda::cpu_conv2sep(image, x, y, kernel0, kwidth, kernel0, kwidth);
+
+			free(kernel0);
+		}
+
+	}	//end namespace cuda
+}	//end namespace stim
+
+#endif
\ No newline at end of file
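A similar sketch for cpu_gaussian_blur2, blurring an impulse image in place (not part of the patch; the values are hypothetical, and note that the kernel width is computed internally as sigma * 8 + 1 taps):

#include <vector>
#include <stim/cuda/templates/gaussian_blur.cuh>

int main(){
	unsigned int w = 512, h = 512;					//hypothetical image dimensions
	std::vector<float> image(w * h, 0.0f);
	image[(h/2) * w + (w/2)] = 1.0f;				//unit impulse at the image center

	float sigma = 2.0f;								//kernel width = 2 * 8 + 1 = 17 taps
	stim::cuda::cpu_gaussian_blur2(image.data(), sigma, w, h);	//blur in place
	return 0;
}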
diff --git a/stim/cuda/templates/gradient.cuh b/stim/cuda/templates/gradient.cuh
new file mode 100644
index 0000000..95931dc
--- /dev/null
+++ b/stim/cuda/templates/gradient.cuh
@@ -0,0 +1,115 @@
+#ifndef STIM_CUDA_GRADIENT_H
+#define STIM_CUDA_GRADIENT_H
+
+#include
+#include
+#include
+#include
+
+namespace stim{
+	namespace cuda{
+
+		template<typename T>
+		__global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//calculate the 1D image index for this thread
+			int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+			//convert this to 2D pixel coordinates
+			int yi = i / x;
+			int xi = i - (yi * x);
+
+			//return if the pixel is outside of the image
+			if(xi >= x || yi >= y) return;
+
+			//calculate indices for the forward difference
+			int i_xp = yi * x + (xi + 1);
+			int i_yp = (yi + 1) * x + xi;
+
+			//use forward differences if a coordinate is at the minimum (zero) edge
+			if(xi == 0)
+				out[i * 2 + 0] = in[i_xp] - in[i];
+			if(yi == 0)
+				out[i * 2 + 1] = in[i_yp] - in[i];
+
+			//calculate indices for the backward difference
+			int i_xn = yi * x + (xi - 1);
+			int i_yn = (yi - 1) * x + xi;
+
+			//use backward differences if a coordinate is at the maximum edge
+			if(xi == x-1)
+				out[i * 2 + 0] = in[i] - in[i_xn];
+			if(yi == y-1)
+				out[i * 2 + 1] = in[i] - in[i_yn];
+
+			//otherwise use central differences
+			if(xi > 0 && xi < x-1)
+				out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2;
+
+			if(yi > 0 && yi < y-1)
+				out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2;
+
+		}
+
+		template<typename T>
+		void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+
+			//get the maximum number of threads per block for the CUDA device
+			int threads = stim::maxThreadsPerBlock();
+
+			//calculate the number of blocks
+			int blocks = pixels / threads + (pixels%threads == 0 ? 0:1);
+
+			//call the GPU kernel to determine the gradient
+			gradient_2d <<< blocks, threads >>>(gpuGrad, gpuI, x, y);
+
+		}
+
+		template<typename T>
+		void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){
+
+			//get the number of pixels in the image
+			unsigned int pixels = x * y;
+			unsigned int bytes = pixels * sizeof(T);
+
+			//allocate space on the GPU for the input image
+			T* gpuIn;
+			HANDLE_ERROR(cudaMalloc(&gpuIn, bytes));
+
+			//copy the image data to the GPU
+			HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice));
+
+			//allocate space on the GPU for the output gradient image
+			T* gpuOut;
+			HANDLE_ERROR(cudaMalloc(&gpuOut, bytes * 2));	//the output image will have two channels (x, y)
+
+			//call the GPU version of this function
+			gpu_gradient_2d(gpuOut, gpuIn, x, y);
+
+			//copy the results to the CPU
+			HANDLE_ERROR(cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost));
+
+			//free allocated memory
+			cudaFree(gpuIn);
+			cudaFree(gpuOut);
+		}
+
+	}	//end namespace cuda
+}	//end namespace stim
+
+
+#endif
\ No newline at end of file
diff --git a/stim/cuda/threads.h b/stim/cuda/threads.h
deleted file mode 100644
index 87c443e..0000000
--- a/stim/cuda/threads.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "../cuda/callable.h"
-
-#ifndef CUDA_THREADS_H
-#define CUDA_THREADS_H
-
-#define MAX_GRID 65535
-
-__device__ unsigned int ThreadIndex1D()
-{
-	return blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
-}
-
-dim3 GenGrid1D(unsigned int N, unsigned int blocksize = 128)
-{
-	dim3 dimgrid;
-
-	dimgrid.x = (N + blocksize - 1)/blocksize;
-	dimgrid.y = 1;
-	dimgrid.z = 1;
-
-	if(dimgrid.x > MAX_GRID)
-	{
-		dimgrid.y = (dimgrid.x + MAX_GRID - 1) / MAX_GRID;
-		dimgrid.x = MAX_GRID;
-	}
-
-	return dimgrid;
-
-}
-
-
-#endif
diff --git a/stim/cuda/timer.h b/stim/cuda/timer.h
deleted file mode 100644
index f7ce5e1..0000000
--- a/stim/cuda/timer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef STIM_CUDA_TIMER
-#define STIM_CUDA_TIMER
-
-static cudaEvent_t tStartEvent;
-static cudaEvent_t tStopEvent;
-
-namespace stim{
-
-/// These functions calculate the time between GPU functions in milliseconds
-static void gpuStartTimer()
-{
-	//set up timing events
-	cudaEventCreate(&tStartEvent);
-	cudaEventCreate(&tStopEvent);
-	cudaEventRecord(tStartEvent, 0);
-}
-
-static float gpuStopTimer()
-{
-	cudaEventRecord(tStopEvent, 0);
-	cudaEventSynchronize(tStopEvent);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, tStartEvent, tStopEvent);
-	cudaEventDestroy(tStartEvent);
-	cudaEventDestroy(tStopEvent);
-	return elapsedTime;
-}
-
-} //end namespace stim
-
-#endif
\ No newline at end of file
-- 
libgit2 0.21.4
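Finally, a usage sketch for the new gradient template (not part of the patch; values are hypothetical). The output buffer holds two interleaved channels per pixel, dI/dx followed by dI/dy, so it must be twice the size of the input image:

#include <vector>
#include <stim/cuda/templates/gradient.cuh>

int main(){
	unsigned int w = 256, h = 256;
	std::vector<float> image(w * h);
	for(unsigned int i = 0; i < w * h; i++)
		image[i] = (float)(i % w);					//horizontal ramp: dI/dx = 1, dI/dy = 0

	std::vector<float> grad(w * h * 2);				//interleaved output: [dx0, dy0, dx1, dy1, ...]
	stim::cuda::cpu_gradient_2d(grad.data(), image.data(), w, h);
	return 0;
}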