Commit 6b747d25f8386ac98393a6ae14a4eb1803c8ffa3
Merge branch 'bsds500' of git.stim.ee.uh.edu:codebase/stimlib
Showing 3 changed files with 220 additions and 0 deletions

stim/cuda/array_add.cuh
+#ifndef STIM_CUDA_ARRAY_ADD_H
+#define STIM_CUDA_ARRAY_ADD_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
+
+            //calculate the 1D index for this thread
+            int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+            if(idx < N){
+                sum[idx] = ptr1[idx] + ptr2[idx];
+            }
+
+        }
+
+        template<typename T>
+        void gpu_add(T* ptr1, T* ptr2, T* sum, unsigned int N){
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks needed to cover all N elements
+            int blocks = N / threads + (N % threads == 0 ? 0 : 1);
+
+            //call the kernel to do the addition
+            cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);
+
+        }
+
+        template<typename T>
+        void cpu_add(T* ptr1, T* ptr2, T* cpu_sum, unsigned int N){
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_ptr1;
+            T* gpu_ptr2;
+            T* gpu_sum;
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_sum, N * sizeof(T) ) );
+
+            //copy the arrays to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_add<T>(gpu_ptr1, gpu_ptr2, gpu_sum, N);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_sum, gpu_sum, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_ptr1);
+            cudaFree(gpu_ptr2);
+            cudaFree(gpu_sum);
+
+        }
+
+    }
+}
+
+
+
+#endif
\ No newline at end of file
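A host-side caller for this header might look like the following minimal sketch; the include path is an assumption based on the header guard and the sibling headers in stim/cuda/. cpu_add handles all device allocation, transfers, and cleanup internally, so the caller only supplies host arrays:

#include <stim/cuda/array_add.cuh>  //assumed path, matching the STIM_CUDA_ARRAY_ADD_H guard
#include <vector>
#include <iostream>

int main(){
    unsigned int N = 1024;                              //hypothetical array size
    std::vector<float> a(N, 1.0f), b(N, 2.0f), sum(N);

    //add the two arrays on the GPU and copy the result back into sum
    stim::cuda::cpu_add(a.data(), b.data(), sum.data(), N);

    std::cout << "sum[0] = " << sum[0] << std::endl;    //prints 3
    return 0;
}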
stim/cuda/arraymath.cuh
@@ -5,6 +5,8 @@
 #include <stim/cuda/array_multiply.cuh>
 #include <stim/cuda/array_abs.cuh>
 #include <stim/cuda/array_cart2polar.cuh>
+#include <stim/cuda/gaussian_blur.cuh>
+#include <stim/cuda/conv2.cuh>
 
 namespace stim{
     namespace cuda{
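With these two additions, arraymath.cuh keeps acting as the umbrella header for the array kernels, so downstream code needs only a single include. A minimal sketch, assuming the stim source root is on the compiler's include path:

#include <stim/cuda/arraymath.cuh>  //now also pulls in the conv2 and gaussian_blur kernels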
stim/cuda/conv2.cuh

+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+
+#include <iostream>
+#include <cuda.h>
+#include <stim/cuda/devices.h>
+#include <stim/cuda/error.h>
+#include <cmath>
+#include <algorithm>
+#include <cstring>      //for memset
+
+namespace stim{
+    namespace cuda{
+
+        template<typename T>
+        __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
+
+            //the radius of the mask
+            int r = (M - 1) / 2;
+
+            //convert the 1D thread index into 2D image coordinates
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            int j = blockIdx.y;
+
+            //1D index of this pixel in the output image
+            int idx = j * w + i;
+
+            if(i < w && j < h){
+
+                //accumulate the convolution result for this pixel
+                T sum = 0;
+
+                for (int y = j - r; y <= j + r; y++){
+
+                    for (int x = i - r; x <= i + r; x++){
+
+                        //map image coordinates (x, y) to mask coordinates (xx, yy)
+                        unsigned int xx = x - (i - r);
+                        unsigned int yy = y - (j - r);
+
+                        //fetch the pixel through the texture; clamp addressing handles out-of-bounds reads at the border
+                        sum += tex2D<float>(texObj, x, y) * mask[yy * M + xx];
+                    }
+                }
+                copy[idx] = sum;
+            }
+        }
+
+
+        template<typename T>
+        void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){
+
+            unsigned long N = w * h;
+
+            //define a channel descriptor for a single 32-bit float channel
+            //(the texture setup assumes T is a 32-bit float)
+            cudaChannelFormatDesc channelDesc =
+                cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+            cudaArray* cuArray;                                             //declare the cuda array
+            HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, w, h) );  //allocate the cuda array
+
+            //copy the image data from global memory to the array
+            HANDLE_ERROR( cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice) );
+
+            //specify the texture resource
+            struct cudaResourceDesc resDesc;                //create a resource descriptor
+            memset(&resDesc, 0, sizeof(resDesc));           //set all values to zero
+            resDesc.resType = cudaResourceTypeArray;        //specify the resource descriptor type
+            resDesc.res.array.array = cuArray;              //add a pointer to the cuda array
+
+            //specify texture object parameters
+            struct cudaTextureDesc texDesc;                 //create a texture descriptor
+            memset(&texDesc, 0, sizeof(texDesc));           //set all values in the texture descriptor to zero
+            texDesc.addressMode[0] = cudaAddressModeClamp;  //clamp reads at the image edges (wrap is unsupported with unnormalized coordinates)
+            texDesc.addressMode[1] = cudaAddressModeClamp;
+            texDesc.filterMode = cudaFilterModePoint;       //use point (nearest-neighbor) sampling
+            texDesc.readMode = cudaReadModeElementType;     //read data based on the element type (32-bit floats)
+            texDesc.normalizedCoords = 0;                   //not using normalized coordinates
+
+            //create the texture object
+            cudaTextureObject_t texObj = 0;
+            cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+
+            //get the maximum number of threads per block for the CUDA device
+            int threads = stim::maxThreadsPerBlock();
+
+            //calculate the number of blocks: one block row per image row
+            dim3 blocks(w / threads + 1, h);
+
+            //call the kernel to do the convolution
+            cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
+
+            //wait for the kernel to finish, then release the texture resources
+            cudaDeviceSynchronize();
+            cudaDestroyTextureObject(texObj);
+            cudaFreeArray(cuArray);
+
+        }
+
+        template<typename T>
+        void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){
+
+            unsigned long N = w * h;
+
+            //allocate memory on the GPU for the arrays
+            T* gpu_img;
+            T* gpu_mask;
+            T* gpu_copy;
+            HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_mask, M * M * sizeof(T) ) );
+            HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
+
+            //copy the image and the mask to the GPU
+            HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
+            HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, M * M * sizeof(T), cudaMemcpyHostToDevice) );
+
+            //call the GPU version of this function
+            gpu_conv2<T>(gpu_img, gpu_mask, gpu_copy, w, h, M);
+
+            //copy the result back to the CPU
+            HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
+
+            //free allocated memory
+            cudaFree(gpu_img);
+            cudaFree(gpu_mask);
+            cudaFree(gpu_copy);
+
+        }
+
+    }
+}
+
+
+#endif
\ No newline at end of file
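As with the array_add example above, the cpu_ entry point owns all device-side memory management. A minimal host-side sketch follows; the image size and the 3x3 averaging mask are hypothetical, and only T = float works in practice since the kernel fetches pixels through a 32-bit float texture:

#include <stim/cuda/conv2.cuh>
#include <vector>

int main(){
    unsigned int w = 512, h = 512;                      //hypothetical image dimensions
    unsigned M = 3;                                     //mask width (must be odd)
    std::vector<float> img(w * h, 1.0f);                //flat, row-major image
    std::vector<float> mask(M * M, 1.0f / (M * M));     //3x3 averaging (box) mask
    std::vector<float> out(w * h);

    //cpu_conv2 uploads img and mask, runs the texture-based kernel, and copies the result back
    stim::cuda::cpu_conv2(img.data(), mask.data(), out.data(), w, h, M);

    return 0;
}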