Commit 348f8ab9945f81ca275a3d51e5586ba618e0b7db
1 parent
c0fc9427
non-separable 2D convolution
Showing
1 changed file
with
147 additions
and
0 deletions
Show diff stats
1 | +#ifndef STIM_CUDA_CONV2_H | |
2 | +#define STIM_CUDA_CONV2_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | +#include <cmath> | |
9 | +#include <algorithm> | |
10 | + | |
11 | +namespace stim{ | |
12 | + namespace cuda{ | |
13 | + | |
template<typename T>
//Kernel: non-separable 2D convolution of a (w x h) image with an (M x M) mask.
//Launch layout: one thread per output pixel — gridDim.x * blockDim.x covers a row,
//gridDim.y == h (one block row per image row). M is assumed odd.
//The image is read through texObj; its wrap address mode handles out-of-bounds taps.
//img is unused here (the texture holds the image data) but kept for interface stability.
__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){

	//half-width (radius) of the mask; signed so that (i - r) can go negative
	//near the image border without unsigned wrap-around
	int r = ((int)M - 1) / 2;

	//2D image coordinates for this thread
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y;

	//1D index of the output pixel
	int idx = j * w + i;

	if(i < (int)w && j < (int)h){

		//accumulate the weighted sum over the mask footprint centered at (i, j)
		T sum = 0;

		for (int y = j - r; y <= j + r; y++){

			for (int x = i - r; x <= i + r; x++){

				//convert image coordinates (x, y) to mask coordinates (xx, yy)
				unsigned int xx = x - (i - r);
				unsigned int yy = y - (j - r);

				//texture fetch wraps at the edges (cudaAddressModeWrap)
				sum += tex2D<float>(texObj, x, y) * mask[yy * M + xx];
			}
		}
		copy[idx] = sum;
	}
}
63 | + | |
64 | + | |
template<typename T>
//Convolves a (w x h) device image with an (M x M) device mask, writing the
//result to the device buffer copy. The image is staged into a CUDA array and
//sampled through a texture object so out-of-bounds taps wrap around the edges.
//All pointers must be device pointers. Blocks until the kernel completes.
void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){

	unsigned long N = w * h;		//number of image pixels

	//define a channel descriptor for a single 32-bit float channel
	cudaChannelFormatDesc channelDesc =
		cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
	cudaArray* cuArray;			//declare the cuda array
	HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, w, h) );	//allocate the cuda array

	//copy the image data from global memory to the array
	//NOTE(review): cudaMemcpyToArray is deprecated in newer toolkits — migrate to cudaMemcpy2DToArray
	HANDLE_ERROR( cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T),
					cudaMemcpyDeviceToDevice) );

	//specify the texture resource
	struct cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));			//zero all fields
	resDesc.resType = cudaResourceTypeArray;		//the resource is a CUDA array
	resDesc.res.array.array = cuArray;			//attach the array

	//specify texture object parameters
	struct cudaTextureDesc texDesc;
	memset(&texDesc, 0, sizeof(texDesc));			//zero all fields
	texDesc.addressMode[0] = cudaAddressModeWrap;		//wrap around the image edges
	texDesc.addressMode[1] = cudaAddressModeWrap;
	texDesc.filterMode = cudaFilterModePoint;		//nearest-neighbor sampling (no interpolation)
	texDesc.readMode = cudaReadModeElementType;		//return raw 32-bit float elements
	texDesc.normalizedCoords = 0;				//unnormalized (texel) coordinates

	//create the texture object
	cudaTextureObject_t texObj = 0;
	HANDLE_ERROR( cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL) );

	//get the maximum number of threads per block for the CUDA device
	int threads = stim::maxThreadsPerBlock();

	//one thread per pixel: ceil(w / threads) blocks across, one block row per image row
	dim3 blocks((w + threads - 1) / threads, h);

	//launch the convolution kernel
	cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
	HANDLE_ERROR( cudaGetLastError() );			//catch launch-configuration errors

	//wait for the kernel to finish before releasing the texture resources it reads
	HANDLE_ERROR( cudaDeviceSynchronize() );
	HANDLE_ERROR( cudaDestroyTextureObject(texObj) );
	HANDLE_ERROR( cudaFreeArray(cuArray) );
}
113 | + | |
template<typename T>
//Host-side wrapper: copies img (w x h) and mask (M x M) from host memory to the
//GPU, runs the convolution via gpu_conv2, and copies the result into cpu_copy.
//img, mask, and cpu_copy are host pointers; cpu_copy must hold w*h elements.
void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){

	unsigned long N = w * h;			//number of image pixels
	unsigned long M2 = (unsigned long)M * M;	//number of mask elements (integer math, not pow())

	//allocate memory on the GPU
	T* gpu_img;
	T* gpu_mask;
	T* gpu_copy;
	HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
	HANDLE_ERROR( cudaMalloc( &gpu_mask, M2 * sizeof(T) ) );
	HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );

	//copy the image and the mask to the GPU
	HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
	HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, M2 * sizeof(T), cudaMemcpyHostToDevice) );

	//call the GPU version of this function
	gpu_conv2<T>(gpu_img, gpu_mask, gpu_copy, w, h, M);

	//copy the result back to the CPU (blocking copy — synchronizes with the kernel)
	HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );

	//free allocated device memory
	HANDLE_ERROR( cudaFree(gpu_img) );
	HANDLE_ERROR( cudaFree(gpu_mask) );
	HANDLE_ERROR( cudaFree(gpu_copy) );
}
142 | + | |
143 | + } | |
144 | +} | |
145 | + | |
146 | + | |
147 | +#endif | |
0 | 148 | \ No newline at end of file | ... | ... |