From 348f8ab9945f81ca275a3d51e5586ba618e0b7db Mon Sep 17 00:00:00 2001 From: Tianshu Cheng Date: Fri, 4 Sep 2015 15:33:10 -0500 Subject: [PATCH] inseparable 2D convolution --- stim/cuda/conv2.cuh | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+), 0 deletions(-) create mode 100644 stim/cuda/conv2.cuh diff --git a/stim/cuda/conv2.cuh b/stim/cuda/conv2.cuh new file mode 100644 index 0000000..b1d154e --- /dev/null +++ b/stim/cuda/conv2.cuh @@ -0,0 +1,147 @@ +#ifndef STIM_CUDA_CONV2_H +#define STIM_CUDA_CONV2_H + +#include +#include +#include +#include +#include +#include + +namespace stim{ + namespace cuda{ + + template + //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ + __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ + + + //the radius of mask + unsigned r = (M - 1)/2; + + + //calculate the 1D index for this thread + //int idx = blockIdx.x * blockDim.x + threadIdx.x; + + //change 1D index to 2D cordinates + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y; + + int idx = j * w + i; + //unsigned long N = w * h; + + if(i < w && j < h){ + + copy[idx] = tex2D(texObj, i+100, j+100); + return; + + //tex2D(texObj, i, j); + + //allocate memory for result + T sum = 0; + + //for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){ + + //for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){ + + for (int y = j - r; y <= j + r; y++){ + + for (int x = i - r; x <= i + r; x++){ + + //idx to mask cordinates(xx, yy) + unsigned int xx = x - (i - r); + unsigned int yy = y - (j - r); + + //T temp = img[y * w + x] * mask[yy * M + xx]; + //sum += img[y * w + x] * mask[yy * M + xx]; + sum += tex2D(texObj, x, y);// * mask[yy * M + xx]; + } + } + copy[idx] = sum; + } + } + + + template + void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){ + + unsigned long N = w * h; + + // Allocate CUDA array in device memory + + //define a channel descriptor for a single 32-bit channel + cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, + cudaChannelFormatKindFloat); + cudaArray* cuArray; //declare the cuda array + cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array + + // Copy the image data from global memory to the array + cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), + cudaMemcpyDeviceToDevice); + + // Specify texture + struct cudaResourceDesc resDesc; //create a resource descriptor + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type + resDesc.res.array.array = cuArray; //add a pointer to the cuda array + + // Specify texture object parameters + struct cudaTextureDesc texDesc; //create a texture descriptor + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) + texDesc.addressMode[1] = cudaAddressModeWrap; + texDesc.filterMode = cudaFilterModePoint; //use linear filtering + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) + texDesc.normalizedCoords = 0; //not using normalized coordinates + + // Create texture object + cudaTextureObject_t texObj = 0; + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); + + //get the maximum number of threads per block for the CUDA device + int threads = stim::maxThreadsPerBlock(); + + //calculate the number of blocks + dim3 blocks(w / threads + 1, h); + + //call the kernel to do the multiplication + //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M); + cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M); + + } + + template + void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){ + + unsigned long N = w * h; + //allocate memory on the GPU for the array + T* gpu_img; + T* gpu_mask; + T* gpu_copy; + HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) ); + HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) ); + + //copy the array to the GPU + HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) ); + HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) ); + + //call the GPU version of this function + gpu_conv2(gpu_img, gpu_mask ,gpu_copy, w, h, M); + + //copy the array back to the CPU + HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) ); + + //free allocated memory + cudaFree(gpu_img); + cudaFree(gpu_mask); + cudaFree(gpu_copy); + + } + + } +} + + +#endif \ No newline at end of file -- libgit2 0.21.4