conv2.cuh 4.21 KB

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <cmath>
#include <algorithm>

namespace stim{
	namespace cuda{

		template<typename T>
		__global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){

			//the radius of mask
			int r = (M - 1)/2;

			//calculate the 1D index for this thread
			//int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//change 1D index to 2D cordinates
			int i = blockIdx.x * blockDim.x + threadIdx.x;
			int j = blockIdx.y;

			int idx = j * w + i;
			//unsigned long N = w * h;

			if(i < w && j < h){

				//copy[idx] = tex2D<float>(texObj, i+100, j+100);

				tex2D<float>(texObj, (float)i/w, (float)j/h);

				//allocate memory for result
				T sum = 0;

				//for (unsigned int y = max(j - r, 0); y <= min(j + r, h - 1); y++){

					//for (unsigned int x = max(i - r, 0); x <= min(i + r, w - 1); x++){

				for (int y = j - r; y <= j + r; y++){

					for (int x = i - r; x <= i + r; x++){

						//idx to mask cordinates(xx, yy)
						int xx = x - (i - r);
						int yy = y - (j - r);

						sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
				copy[idx] = sum;

		template<typename T>
		void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){

			unsigned long N = w * h;

			// Allocate CUDA array in device memory
			//define a channel descriptor for a single 32-bit channel
			cudaChannelFormatDesc channelDesc =
					   cudaCreateChannelDesc(32, 0, 0, 0,
			cudaArray* cuArray;												//declare the cuda array
			cudaMallocArray(&cuArray, &channelDesc, w, h);			//allocate the cuda array

			// Copy the image data from global memory to the array
			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T),

			// Specify texture
			struct cudaResourceDesc resDesc;				//create a resource descriptor
			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array

			// Specify texture object parameters
			struct cudaTextureDesc texDesc;							//create a texture descriptor
			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
			texDesc.addressMode[0]   = cudaAddressModeClamp;			//use wrapping (around the edges)
			texDesc.addressMode[1]   = cudaAddressModeClamp;
			texDesc.filterMode       = cudaFilterModePoint;		//use linear filtering
			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
			texDesc.normalizedCoords = 1;							//using normalized coordinates

			// Create texture object
			cudaTextureObject_t texObj = 0;
			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks
			dim3 blocks(w / threads + 1, h);

			//call the kernel to do the multiplication
			cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M);


		template<typename T>
		void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){

			unsigned long N = w * h;
			//allocate memory on the GPU for the array
			T* gpu_img; 
			T* gpu_mask; 
			T* gpu_copy;
			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_mask, pow(M, 2) * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );

			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, pow(M, 2) * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_conv2<T>(gpu_img, gpu_mask ,gpu_copy, w, h, M);

			//copy the array back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory