Commit 13fe3c84bbffad233bbd8728590c870364472ebc
1 parent 788862df
update the stimlib by adding some gpu-functions (abs, cart2polar, multiply, down_sample, gaussian_blur, gradient, local_max, sharedmem, update_dir, vote)
Showing 11 changed files with 1277 additions and 0 deletions
stim/cuda/array_abs.cuh
1 | +#ifndef STIM_CUDA_ARRAY_ABS_H | |
2 | +#define STIM_CUDA_ARRAY_ABS_H | |
3 | +#include <cuda.h> | |
4 | +#include <stim/cuda/devices.h> | |
5 | +#include <stim/cuda/error.h> | |
6 | + | |
4 | +namespace stim{ | |
5 | + namespace cuda{ | |
6 | + template<typename T> | |
7 | + __global__ void cuda_abs(T* a, unsigned int N){ | |
8 | + | |
9 | + //calculate the 1D index for this thread | |
10 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
11 | + | |
12 | + if(i < N) | |
13 | + a[i] = abs(a[i]); | |
14 | + } | |
15 | + | |
16 | + | |
17 | + template<typename T> | |
18 | + void gpu_abs(T* a, unsigned int N){ | |
19 | + | |
20 | + //get the maximum number of threads per block for the CUDA device | |
21 | + int threads = stim::maxThreadsPerBlock(); | |
22 | + | |
23 | + //calculate the number of blocks | |
24 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | |
25 | + | |
26 | + //call the kernel to compute the absolute value | |
27 | + cuda_abs <<< blocks, threads >>>(a, N); | |
28 | + | |
29 | + } | |
30 | + | |
31 | + | |
32 | + template<typename T> | |
33 | + void cpu_abs(T* a, unsigned int N){ | |
34 | + | |
35 | + //calculate the number of bytes in the array | |
36 | + unsigned int bytes = N * sizeof(T); | |
37 | + | |
38 | + //allocate memory on the GPU for the array | |
39 | + T* gpuA; | |
40 | + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); | |
41 | + | |
42 | + //copy the array to the GPU | |
43 | + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | |
44 | + | |
45 | + //call the GPU version of this function | |
46 | + gpu_abs<T>(gpuA, N); | |
47 | + | |
48 | + //copy the array back to the CPU | |
49 | + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | |
50 | + | |
51 | + //free allocated memory | |
52 | + cudaFree(gpuA); | |
53 | + | |
54 | + } | |
55 | + | |
56 | + } //end namespace cuda | |
57 | +} //end namespace stim | |
58 | + | |
59 | +#endif | |
\ No newline at end of file
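A minimal host-side usage sketch for the new abs routines (assuming the headers above are on the include path and the file is compiled with nvcc):

    #include <stim/cuda/array_abs.cuh>

    int main(){
        float a[4] = { -1.0f, 2.0f, -3.5f, 0.0f };
        stim::cuda::cpu_abs(a, 4);          //a now holds { 1.0, 2.0, 3.5, 0.0 }
        return 0;
    }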
stim/cuda/array_cart2polar.cuh
1 | +#ifndef STIM_CUDA_ARRAY_CART2POLAR_H | |
2 | +#define STIM_CUDA_ARRAY_CART2POLAR_H | |
3 | +#include <cuda.h> | |
4 | +#include <stim/cuda/devices.h> | |
5 | +#include <stim/cuda/error.h> | |
6 | + | |
4 | +namespace stim{ | |
5 | + namespace cuda{ | |
6 | + template<typename T> | |
7 | + __global__ void cuda_cart2polar(T* a, unsigned int N){ | |
8 | + | |
9 | + | |
10 | + //calculate the 1D index for this thread | |
11 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
12 | + | |
13 | + if(i < N){ | |
14 | + float x = a[i * 2 + 0]; | |
15 | + float y = a[i * 2 + 1]; | |
16 | + float theta = atan2( y, x ) ; | |
17 | + float r = sqrt(x * x + y * y); | |
18 | + a[i * 2 + 0] = theta; | |
19 | + a[i * 2 + 1] = r; | |
20 | + } | |
21 | + } | |
22 | + | |
23 | + | |
24 | + template<typename T> | |
25 | + void gpu_cart2polar(T* gpuGrad, unsigned int N){ | |
26 | + | |
27 | + //get the maximum number of threads per block for the CUDA device | |
28 | + int threads = stim::maxThreadsPerBlock(); | |
29 | + | |
30 | + //calculate the number of blocks | |
31 | + int blocks = N / threads + (N % threads == 0 ? 0:1); | |
32 | + | |
33 | + //call the kernel to do the conversion | |
34 | + cuda_cart2polar <<< blocks, threads >>>(gpuGrad, N); | |
35 | + | |
36 | + } | |
37 | + | |
38 | + | |
39 | + template<typename T> | |
40 | + void cpu_cart2polar(T* a, unsigned int N){ | |
41 | + | |
42 | + //calculate the number of bytes in the array | |
43 | + unsigned int bytes = N * sizeof(T) * 2; | |
44 | + | |
45 | + //allocate memory on the GPU for the array | |
46 | + T* gpuA; | |
47 | + HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); | |
48 | + | |
49 | + //copy the array to the GPU | |
50 | + HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); | |
51 | + | |
52 | + //call the GPU version of this function | |
53 | + gpu_cart2polar<T>(gpuA, N); | |
54 | + | |
55 | + //copy the array back to the CPU | |
56 | + HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); | |
57 | + | |
58 | + //free allocated memory | |
59 | + cudaFree(gpuA); | |
60 | + | |
61 | + } | |
62 | + | |
63 | + } | |
64 | +} | |
65 | + | |
66 | +#endif | |
\ No newline at end of file
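Note that N here is the number of (x, y) pairs, not the number of scalars: the kernel reads interleaved Cartesian coordinates and writes (theta, r) back in place. A short usage sketch:

    #include <stim/cuda/array_cart2polar.cuh>

    int main(){
        //two interleaved 2D vectors: (1, 0) and (0, 2)
        float v[4] = { 1.0f, 0.0f, 0.0f, 2.0f };
        stim::cuda::cpu_cart2polar(v, 2);   //v now holds (theta, r) pairs: (0, 1) and (pi/2, 2)
        return 0;
    }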
stim/cuda/array_multiply.cuh
1 | +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H | |
2 | +#define STIM_CUDA_ARRAY_MULTIPLY_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + template<typename T> | |
13 | + __global__ void cuda_multiply(T* lhs, T rhs, unsigned int N){ | |
14 | + | |
15 | + //calculate the 1D index for this thread | |
16 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + | |
18 | + if(i < N) | |
19 | + lhs[i] *= rhs; | |
20 | + } | |
21 | + | |
22 | + template<typename T> | |
23 | + void gpu_multiply(T* lhs, T rhs, unsigned int N){ | |
24 | + | |
25 | + //get the maximum number of threads per block for the CUDA device | |
26 | + int threads = stim::maxThreadsPerBlock(); | |
27 | + | |
28 | + //calculate the number of blocks | |
29 | + int blocks = N / threads + (N%threads == 0 ? 0:1); | |
30 | + | |
31 | + //call the kernel to do the multiplication | |
32 | + cuda_multiply <<< blocks, threads >>>(lhs, rhs, N); | |
33 | + | |
34 | + } | |
35 | + | |
36 | + template<typename T> | |
37 | + void cpu_multiply(T* lhs, T rhs, unsigned int N){ | |
38 | + | |
39 | + //calculate the number of bytes in the array | |
40 | + unsigned int bytes = N * sizeof(T); | |
41 | + | |
42 | + //allocate memory on the GPU for the array | |
43 | + T* gpuLHS; | |
44 | + HANDLE_ERROR( cudaMalloc(&gpuLHS, bytes) ); | |
45 | + | |
46 | + //copy the array to the GPU | |
47 | + HANDLE_ERROR( cudaMemcpy(gpuLHS, lhs, bytes, cudaMemcpyHostToDevice) ); | |
48 | + | |
49 | + //call the GPU version of this function | |
50 | + gpu_multiply<T>(gpuLHS, rhs, N); | |
51 | + | |
52 | + //copy the array back to the CPU | |
53 | + HANDLE_ERROR( cudaMemcpy(lhs, gpuLHS, bytes, cudaMemcpyDeviceToHost) ); | |
54 | + | |
55 | + //free allocated memory | |
56 | + cudaFree(gpuLHS); | |
57 | + } | |
58 | + | |
59 | + } | |
60 | +} | |
61 | + | |
62 | + | |
63 | + | |
64 | +#endif | |
\ No newline at end of file
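The multiply routines scale an array in place by a scalar, e.g.:

    #include <stim/cuda/array_multiply.cuh>

    int main(){
        float a[3] = { 1.0f, 2.0f, 3.0f };
        stim::cuda::cpu_multiply(a, 0.5f, 3);   //a now holds { 0.5, 1.0, 1.5 }
        return 0;
    }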
stim/cuda/arraymath.cuh
1 | +#ifndef STIM_CUDA_ARRAYMATH_H | |
2 | +#define STIM_CUDA_ARRAYMATH_H | |
3 | + | |
4 | +#include <stim/cuda/array_multiply.cuh> | |
5 | +#include <stim/cuda/array_abs.cuh> | |
6 | +#include <stim/cuda/array_cart2polar.cuh> | |
7 | + | |
8 | +namespace stim{ | |
9 | + namespace cuda{ | |
10 | + | |
11 | + } | |
12 | +} | |
13 | + | |
14 | + | |
15 | + | |
16 | +#endif | |
\ No newline at end of file
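arraymath.cuh is a convenience umbrella: a single include pulls in all three array operations above.

    #include <stim/cuda/arraymath.cuh>   //provides the abs, multiply, and cart2polar routines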
stim/cuda/down_sample.cuh
1 | +#ifndef STIM_CUDA_DOWN_SAMPLE_H | |
2 | +#define STIM_CUDA_DOWN_SAMPLE_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/timer.h> | |
8 | +#include <stim/cuda/gaussian_blur.cuh> | |
9 | + | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + template<typename T> | |
14 | + __global__ void down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | |
15 | + | |
16 | + unsigned int sigma_ds = 1/resize; | |
17 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
18 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
19 | + | |
20 | + | |
21 | + // calculate the 2D coordinates for this current thread. | |
22 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
23 | + int yi = blockIdx.y; | |
24 | + // convert 2D coordinates to 1D | |
25 | + int i = yi * x_ds + xi; | |
26 | + | |
27 | + if(xi< x_ds && yi< y_ds){ | |
28 | + | |
29 | + int x_org = xi * sigma_ds; | |
30 | + int y_org = yi * sigma_ds; | |
31 | + int i_org = y_org * x + x_org; | |
32 | + gpuI[i] = gpuI0[i_org]; | |
33 | + } | |
34 | + | |
35 | + } | |
36 | + | |
37 | + | |
38 | + /// Blurs and down-samples a 2D image stored on the GPU | |
39 | + template<typename T> | |
40 | + void gpu_down_sample(T* gpuI, T* gpuI0, T resize, unsigned int x, unsigned int y){ | |
41 | + | |
42 | + | |
43 | + unsigned int sigma_ds = 1/resize; | |
44 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
45 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
46 | + | |
47 | + //get the number of pixels in the image | |
48 | + unsigned int pixels_ds = x_ds * y_ds; | |
49 | + | |
50 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
51 | + dim3 threads(max_threads, 1); | |
52 | + dim3 blocks(x_ds/threads.x + (x_ds %threads.x == 0 ? 0:1) , y_ds); | |
53 | + | |
54 | + stim::cuda::gpu_gaussian_blur_2d<T>(gpuI0, sigma_ds, x, y); | |
55 | + | |
56 | + //resample the image | |
57 | + down_sample<T> <<< blocks, threads >>>(gpuI, gpuI0, resize, x, y); | |
58 | + | |
59 | + } | |
60 | + | |
61 | + /// Blurs and down-samples a 2D image stored on the CPU | |
62 | + template<typename T> | |
63 | + void cpu_down_sample(T* re_img, T* image, T resize, unsigned int x, unsigned int y){ | |
64 | + | |
65 | + //get the number of pixels in the image | |
66 | + unsigned int pixels = x * y; | |
67 | + unsigned int bytes = sizeof(T) * pixels; | |
68 | + | |
69 | + unsigned int sigma_ds = 1/resize; | |
70 | + unsigned int x_ds = (x/sigma_ds + (x %sigma_ds == 0 ? 0:1)); | |
71 | + unsigned int y_ds = (y/sigma_ds + (y %sigma_ds == 0 ? 0:1)); | |
72 | + unsigned int bytes_ds = sizeof(T) * x_ds * y_ds; | |
73 | + | |
74 | + | |
75 | + | |
76 | + //allocate space on the GPU for the original image | |
77 | + T* gpuI0; | |
78 | + cudaMalloc(&gpuI0, bytes); | |
79 | + | |
80 | + | |
81 | + //copy the image data to the GPU | |
82 | + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | |
83 | + | |
84 | + //allocate space on the GPU for the down sampled image | |
85 | + T* gpuI; | |
86 | + cudaMalloc(&gpuI, bytes_ds); | |
87 | + | |
88 | + //run the GPU-based version of the algorithm | |
89 | + gpu_down_sample<T>(gpuI, gpuI0, resize, x, y); | |
90 | + | |
91 | + //copy the down-sampled image back to the CPU | |
92 | + cudaMemcpy(re_img, gpuI, bytes_ds, cudaMemcpyDeviceToHost); | |
93 | + | |
94 | + cudaFree(gpuI0); | |
95 | + cudaFree(gpuI); | |
96 | + } | |
97 | + | |
98 | + } | |
99 | +} | |
100 | + | |
101 | +#endif | |
\ No newline at end of file
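A hedged usage sketch: the caller allocates the output at the down-sampled size, computed the same way the kernel computes x_ds and y_ds (the image size and resize factor below are illustrative):

    #include <cstdlib>
    #include <stim/cuda/down_sample.cuh>

    int main(){
        unsigned int x = 512, y = 512;                      //illustrative image size
        float resize = 0.5f;                                //down-sample by a factor of 2
        unsigned int s = (unsigned int)(1.0f / resize);     //sampling stride, as in the kernel
        unsigned int x_ds = x / s + (x % s == 0 ? 0 : 1);   //down-sampled width
        unsigned int y_ds = y / s + (y % s == 0 ? 0 : 1);   //down-sampled height

        float* img = (float*)calloc(x * y, sizeof(float));           //input image (blank here)
        float* img_ds = (float*)malloc(x_ds * y_ds * sizeof(float)); //down-sampled output

        stim::cuda::cpu_down_sample(img_ds, img, resize, x, y);

        free(img);
        free(img_ds);
        return 0;
    }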
stim/cuda/gaussian_blur.cuh
1 | +#ifndef STIM_CUDA_GAUSSIAN_BLUR_H | |
2 | +#define STIM_CUDA_GAUSSIAN_BLUR_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/timer.h> | |
8 | +#include <stim/cuda/sharedmem.cuh> | |
9 | + | |
10 | +#define pi 3.14159 | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + template<typename T> | |
16 | + __global__ void gaussian_blur_x(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | |
17 | + | |
18 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
19 | + extern __shared__ T s[]; | |
20 | + | |
21 | + int kr = sigma * 4; //calculate the kernel radius | |
22 | + | |
23 | + //get a pointer to the gaussian in memory | |
24 | + T* g = (T*)&s[blockDim.x + 2 * kr]; | |
25 | + | |
26 | + //calculate the start point for this block | |
27 | + int bxi = blockIdx.x * blockDim.x; | |
28 | + int byi = blockIdx.y; | |
29 | + | |
30 | + //copy the portion of the image necessary for this block to shared memory | |
31 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi - kr, byi, 2 * kr + blockDim.x, 1, threadIdx, blockDim); | |
32 | + | |
33 | + //calculate the thread index and block index | |
34 | + int ti = threadIdx.x; | |
35 | + | |
36 | + //calculate the spatial coordinate for this thread | |
37 | + int xi = bxi + ti; | |
38 | + | |
39 | + //pre-compute the gaussian values for each kernel point | |
40 | + T a = 1.0 / (sigma * sqrt(2 * pi)); | |
41 | + T c = - 1.0 / (2*sigma*sigma); | |
42 | + int ki; | |
43 | + | |
44 | + //use the first 2kr+1 threads to evaluate a gaussian and store the result | |
45 | + if(ti < 2 * kr + 1){ | |
46 | + ki = ti - kr; | |
47 | + g[ti] = a * exp((ki*ki) * c); | |
48 | + } | |
49 | + | |
50 | + //make sure that all writing to shared memory is done before continuing | |
51 | + __syncthreads(); | |
52 | + | |
53 | + //if the current pixel is outside of the image | |
54 | + if(xi >= x || byi >= y) | |
55 | + return; | |
56 | + | |
57 | + | |
58 | + | |
59 | + //calculate the coordinates of the current thread in shared memory | |
60 | + int si = ti + kr; | |
61 | + | |
62 | + T sum = 0; //running weighted sum across the kernel | |
63 | + | |
64 | + | |
65 | + //for each element of the kernel | |
66 | + for(int ki = -kr; ki <= kr; ki++){ | |
67 | + sum += g[ki + kr] * s[si + ki]; | |
68 | + } | |
69 | + | |
70 | + //calculate the 1D image index for this thread | |
71 | + unsigned int i = byi * x + xi; | |
72 | + | |
73 | + //output the result to global memory | |
74 | + out[i] = sum; | |
75 | + } | |
76 | + | |
77 | + template<typename T> | |
78 | + __global__ void gaussian_blur_y(T* out, cudaTextureObject_t in, T sigma, unsigned int x, unsigned int y){ | |
79 | + | |
80 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
81 | + extern __shared__ T s[]; | |
82 | + | |
83 | + int kr = sigma * 4; //calculate the kernel radius | |
84 | + | |
85 | + //get a pointer to the gaussian in memory | |
86 | + T* g = (T*)&s[blockDim.y + 2 * kr]; | |
87 | + | |
88 | + //calculate the start point for this block | |
89 | + int bxi = blockIdx.x; | |
90 | + int byi = blockIdx.y * blockDim.y; | |
91 | + | |
92 | + //copy the portion of the image necessary for this block to shared memory | |
93 | + stim::cuda::sharedMemcpy_tex2D(s, in, bxi, byi - kr, 1, 2 * kr + blockDim.y, threadIdx, blockDim); | |
94 | + | |
95 | + //calculate the thread index and block index | |
96 | + int ti = threadIdx.y; | |
97 | + | |
98 | + //calculate the spatial coordinate for this thread | |
99 | + int yi = byi + ti; | |
100 | + | |
101 | + //pre-compute the gaussian values for each kernel point | |
102 | + T a = 1.0 / (sigma * sqrt(2 * pi)); | |
103 | + T c = - 1.0 / (2*sigma*sigma); | |
104 | + int ki; | |
105 | + | |
106 | + //use the first 2kr+1 threads to evaluate a gaussian and store the result | |
107 | + if(ti < 2 * kr + 1){ | |
108 | + ki = ti - kr; | |
109 | + g[ti] = a * exp((ki*ki) * c); | |
110 | + } | |
111 | + | |
112 | + //make sure that all writing to shared memory is done before continuing | |
113 | + __syncthreads(); | |
114 | + | |
115 | + //if the current pixel is outside of the image | |
116 | + if(bxi >= x || yi >= y) | |
117 | + return; | |
118 | + | |
119 | + | |
120 | + | |
121 | + //calculate the coordinates of the current thread in shared memory | |
122 | + int si = ti + kr; | |
123 | + | |
124 | + T sum = 0; //running weighted sum across the kernel | |
125 | + | |
126 | + | |
127 | + //for each element of the kernel | |
128 | + for(int ki = -kr; ki <= kr; ki++){ | |
129 | + sum += g[ki + kr] * s[si + ki]; | |
130 | + } | |
131 | + | |
132 | + //calculate the 1D image index for this thread | |
133 | + unsigned int i = yi * x + bxi; | |
134 | + | |
135 | + //output the result to global memory | |
136 | + out[i] = sum; | |
137 | + } | |
138 | + | |
139 | + /// Applies a Gaussian blur to a 2D image stored on the GPU | |
140 | + template<typename T> | |
141 | + void gpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | |
142 | + | |
143 | + //get the number of pixels in the image | |
144 | + unsigned int pixels = x * y; | |
145 | + unsigned int bytes = sizeof(T) * pixels; | |
146 | + | |
147 | + // Allocate CUDA array in device memory | |
148 | + | |
149 | + //define a channel descriptor for a single 32-bit channel | |
150 | + cudaChannelFormatDesc channelDesc = | |
151 | + cudaCreateChannelDesc(32, 0, 0, 0, | |
152 | + cudaChannelFormatKindFloat); | |
153 | + cudaArray* cuArray; //declare the cuda array | |
154 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
155 | + | |
156 | + // Copy the image data from global memory to the array | |
157 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
158 | + cudaMemcpyDeviceToDevice); | |
159 | + | |
160 | + // Specify texture | |
161 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
162 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
163 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
164 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
165 | + | |
166 | + // Specify texture object parameters | |
167 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
168 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
169 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
170 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
171 | + texDesc.filterMode = cudaFilterModePoint; //use point (nearest-neighbor) sampling | |
172 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
173 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
174 | + | |
175 | + // Create texture object | |
176 | + cudaTextureObject_t texObj = 0; | |
177 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
178 | + | |
179 | + | |
180 | + //get the maximum number of threads per block for the CUDA device | |
181 | + int max_threads = stim::maxThreadsPerBlock(); | |
182 | + dim3 threads(max_threads, 1); | |
183 | + | |
184 | + //calculate the number of blocks | |
185 | + dim3 blocks(x / threads.x + 1, y); | |
186 | + | |
187 | + //calculate the shared memory used in the kernel | |
188 | + unsigned int pixel_bytes = max_threads * sizeof(T); //bytes devoted to pixel data being processed | |
189 | + unsigned int apron_bytes = sigma * 8 * sizeof(T); //bytes devoted to pixels outside the window | |
190 | + unsigned int gaussian_bytes = (sigma * 8 + 1) * sizeof(T); //bytes devoted to memory used to store the pre-computed Gaussian window | |
191 | + unsigned int shared_bytes = pixel_bytes + apron_bytes + gaussian_bytes; //total number of bytes of shared memory used | |
192 | + | |
193 | + //blur the image along the x-axis | |
194 | + gaussian_blur_x <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | |
195 | + | |
196 | + // Copy the x-blurred data from global memory to the texture | |
197 | + cudaMemcpyToArray(cuArray, 0, 0, image, bytes, | |
198 | + cudaMemcpyDeviceToDevice); | |
199 | + | |
200 | + //transpose the block and thread dimensions | |
201 | + threads.x = 1; | |
202 | + threads.y = max_threads; | |
203 | + blocks.x = x; | |
204 | + blocks.y = y / threads.y + 1; | |
205 | + | |
206 | + //blur the image along the y-axis | |
207 | + gaussian_blur_y <<< blocks, threads, shared_bytes >>>(image, texObj, sigma, x, y); | |
208 | + | |
209 | + //free allocated memory | |
210 | + cudaDestroyTextureObject(texObj); | |
211 | + cudaFreeArray(cuArray); | |
211 | + | |
212 | + } | |
213 | + | |
214 | + /// Applies a Gaussian blur to a 2D image stored on the CPU | |
215 | + template<typename T> | |
216 | + void cpu_gaussian_blur_2d(T* image, T sigma, unsigned int x, unsigned int y){ | |
217 | + | |
218 | + //get the number of pixels in the image | |
219 | + unsigned int pixels = x * y; | |
220 | + unsigned int bytes = sizeof(T) * pixels; | |
221 | + | |
222 | + //allocate space on the GPU | |
223 | + T* gpuI0; | |
224 | + cudaMalloc(&gpuI0, bytes); | |
225 | + | |
226 | + | |
227 | + //copy the image data to the GPU | |
228 | + cudaMemcpy(gpuI0, image, bytes, cudaMemcpyHostToDevice); | |
229 | + | |
230 | + //run the GPU-based version of the algorithm | |
231 | + gpu_gaussian_blur_2d<T>(gpuI0, sigma, x, y); | |
232 | + | |
233 | + //copy the image data from the device | |
234 | + cudaMemcpy(image, gpuI0, bytes, cudaMemcpyDeviceToHost); | |
235 | + | |
236 | + //free allocated memory | |
237 | + cudaFree(gpuI0); | |
238 | + } | |
239 | + | |
240 | + } | |
241 | +} | |
242 | + | |
243 | +#endif | |
\ No newline at end of file
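A minimal in-place blur of a host image (sigma is in pixels; the kernel radius is 4 * sigma, so the image should be comfortably larger than the apron):

    #include <cstdlib>
    #include <stim/cuda/gaussian_blur.cuh>

    int main(){
        unsigned int x = 256, y = 256;                      //illustrative image size
        float* img = (float*)calloc(x * y, sizeof(float));  //host image (blank here)

        stim::cuda::cpu_gaussian_blur_2d(img, 2.0f, x, y);  //blur in place with sigma = 2

        free(img);
        return 0;
    }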
stim/cuda/gradient.cuh
1 | +#ifndef STIM_CUDA_GRADIENT_H | |
2 | +#define STIM_CUDA_GRADIENT_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <stim/cuda/devices.h> | |
7 | +#include <stim/cuda/error.h> | |
8 | + | |
9 | +namespace stim{ | |
10 | + namespace cuda{ | |
11 | + | |
12 | + template<typename T> | |
13 | + __global__ void gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | |
14 | + | |
15 | + //calculate the 1D image index for this thread | |
16 | + int i = blockIdx.x * blockDim.x + threadIdx.x; | |
17 | + | |
18 | + //convert this to 2D pixel coordinates | |
19 | + int yi = i / x; | |
20 | + int xi = i - (yi * x); | |
21 | + | |
22 | + //return if the pixel is outside of the image | |
23 | + if(xi >= x || yi >= y) return; | |
24 | + | |
25 | + //calculate indices for the forward difference | |
26 | + int i_xp = yi * x + (xi + 1); | |
27 | + int i_yp = (yi + 1) * x + xi; | |
28 | + | |
29 | + //use forward differences if a coordinate is zero | |
30 | + if(xi == 0) | |
31 | + out[i * 2 + 0] = in[i_xp] - in[i]; | |
32 | + if(yi == 0) | |
33 | + out[i * 2 + 1] = in[i_yp] - in[i]; | |
34 | + | |
35 | + //calculate indices for the backward difference | |
36 | + int i_xn = yi * x + (xi - 1); | |
37 | + int i_yn = (yi - 1) * x + xi; | |
38 | + | |
39 | + //use backward differences if the coordinate is at the maximum edge | |
40 | + if(xi == x-1) | |
41 | + out[i * 2 + 0] = in[i] - in[i_xn]; | |
42 | + if(yi == y-1) | |
43 | + out[i * 2 + 1] = in[i] - in[i_yn]; | |
44 | + | |
45 | + //otherwise use central differences | |
46 | + if(xi > 0 && xi < x-1) | |
47 | + out[i * 2 + 0] = (in[i_xp] - in[i_xn]) / 2; | |
48 | + | |
49 | + if(yi > 0 && yi < y-1) | |
50 | + out[i * 2 + 1] = (in[i_yp] - in[i_yn]) / 2; | |
51 | + | |
52 | + } | |
53 | + | |
54 | + template<typename T> | |
55 | + //void gpu_gradient_2d(T* gpuOut, T* gpuIn, unsigned int x, unsigned int y){ | |
56 | + void gpu_gradient_2d(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y){ | |
57 | + | |
58 | + //get the number of pixels in the image | |
59 | + unsigned int pixels = x * y; | |
60 | + | |
61 | + //allocate space on the GPU for the input image | |
62 | + //T* gpuI; | |
63 | + //HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); | |
64 | + | |
65 | + //cudaMemcpy(gpuI, gpuI0, bytes, cudaMemcpyDeviceToDevice); | |
66 | + | |
67 | + | |
68 | + //allocate space on the GPU for the output gradient image | |
69 | + //T* gpuGrad; | |
70 | + //cudaMalloc(&gpuGrad, bytes * 2); //the output image will have two channels (x, y) | |
71 | + | |
72 | + //get the maximum number of threads per block for the CUDA device | |
73 | + int threads = stim::maxThreadsPerBlock(); | |
74 | + | |
75 | + //calculate the number of blocks | |
76 | + int blocks = pixels / threads + (pixels%threads == 0 ? 0:1); | |
77 | + | |
78 | + //call the GPU kernel to determine the gradient | |
79 | + gradient_2d<T> <<< blocks, threads >>>(gpuGrad, gpuI, x, y); | |
80 | + | |
81 | + } | |
82 | + | |
83 | + template<typename T> | |
84 | + void cpu_gradient_2d(T* out, T* in, unsigned int x, unsigned int y){ | |
85 | + | |
86 | + //get the number of pixels in the image | |
87 | + unsigned int pixels = x * y; | |
88 | + unsigned int bytes = pixels * sizeof(T); | |
89 | + | |
90 | + //allocate space on the GPU for the input image | |
91 | + T* gpuIn; | |
92 | + HANDLE_ERROR(cudaMalloc(&gpuIn, bytes)); | |
93 | + | |
94 | + //copy the image data to the GPU | |
95 | + HANDLE_ERROR(cudaMemcpy(gpuIn, in, bytes, cudaMemcpyHostToDevice)); | |
96 | + | |
97 | + //allocate space on the GPU for the output gradient image | |
98 | + T* gpuOut; | |
99 | + cudaMalloc(&gpuOut, bytes * 2); //the output image will have two channels (x, y) | |
100 | + | |
101 | + //call the GPU version of this function | |
102 | + gpu_gradient_2d(gpuOut, gpuIn, x, y); | |
103 | + | |
104 | + //copy the results to the CPU | |
105 | + cudaMemcpy(out, gpuOut, bytes * 2, cudaMemcpyDeviceToHost); | |
106 | + | |
107 | + //free allocated memory | |
108 | + cudaFree(gpuIn); | |
109 | + cudaFree(gpuOut); | |
109 | + } | |
110 | + | |
111 | + } | |
112 | +} | |
113 | + | |
114 | + | |
115 | +#endif | |
\ No newline at end of file
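The output gradient image is interleaved, with two channels per pixel. A short usage sketch:

    #include <cstdlib>
    #include <stim/cuda/gradient.cuh>

    int main(){
        unsigned int x = 256, y = 256;                           //illustrative image size
        float* img = (float*)calloc(x * y, sizeof(float));       //host input image
        float* grad = (float*)malloc(x * y * 2 * sizeof(float)); //two channels per pixel

        stim::cuda::cpu_gradient_2d(grad, img, x, y);
        //grad[2*i + 0] is the x-derivative and grad[2*i + 1] the y-derivative of pixel i

        free(img);
        free(grad);
        return 0;
    }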
stim/cuda/local_max.cuh
1 | +#ifndef STIM_CUDA_LOCAL_MAX_H | |
2 | +#define STIM_CUDA_LOCAL_MAX_H | |
3 | + | |
4 | + | |
5 | +#include <iostream> | |
6 | +#include <cuda.h> | |
7 | +#include <stim/cuda/devices.h> | |
8 | +#include <stim/cuda/error.h> | |
9 | + | |
10 | +namespace stim{ | |
11 | + namespace cuda{ | |
12 | + | |
13 | + | |
14 | + // this kernel calculates the local maximum for finding the cell centers | |
15 | + template<typename T> | |
16 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
17 | + | |
18 | + // calculate the 2D coordinates for this current thread. | |
19 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
20 | + int yi = blockIdx.y; | |
21 | + // convert 2D coordinates to 1D | |
22 | + int i = yi * x + xi; | |
23 | + | |
24 | + //return if the pixel is outside of the image | |
25 | + if(xi >= x || yi >= y) return; | |
26 | + //calculate the lower limit of the neighborhood for this pixel; the neighborhood size is defined by 'conn' | |
27 | + int xl = xi - conn; | |
28 | + int yl = yi - conn; | |
29 | + | |
30 | + // clamp the lower limits to zero if xi or yi is less than conn | |
31 | + if (xi <= conn) | |
32 | + xl = 0; | |
33 | + if (yi <= conn) | |
34 | + yl = 0; | |
35 | + | |
36 | + //calculate the upper limit of the neighborhood for this pixel | |
37 | + int xh = xi + conn; | |
38 | + int yh = yi + conn; | |
39 | + | |
40 | + // clamp the upper limits to the last row/column if xi or yi is within conn of the image edge | |
41 | + if (xi >= x - conn) | |
42 | + xh = x - 1; | |
43 | + if (yi >= y - conn) | |
44 | + yh = y - 1; | |
45 | + | |
46 | + // calculate the limits for finding the local maximum location in the connected neighbors for the current pixel | |
47 | + int n_l = yl * x + xl; | |
48 | + int n_h = yh * x + xh; | |
49 | + | |
50 | + //initialize the output centers image to zero | |
51 | + gpuCenters[i] = 0; | |
52 | + | |
53 | + | |
54 | + int n = n_l; | |
55 | + | |
56 | + float l_value = 0; | |
57 | + | |
58 | + // check if the vote value for this pixel is greater than the threshold, in which case it may be a local max | |
61 | + if (gpuVote[i]>final_t){ | |
62 | + | |
63 | + // compare the vote value for this pixel with the vote values with its neighbors. | |
64 | + while (n<=n_h){ | |
65 | + | |
66 | + // check if this vote value is a local max in its neighborhood. | |
67 | + if (gpuVote[i] < gpuVote[n]){ | |
68 | + l_value = 0; | |
69 | + n =n_h+1; | |
70 | + } | |
71 | + else if (n == n_h){ | |
72 | + l_value = 1; | |
73 | + n = n+1; | |
74 | + } | |
75 | + // check if the current neighbor is the last one at the current row | |
76 | + else if ((n - n_l - 2*conn)% x ==0){ | |
77 | + n = n + x - 2*conn -1; | |
78 | + n ++; | |
79 | + } | |
80 | + else | |
81 | + n ++; | |
82 | + } | |
83 | + // set the center value for this pixel to 1 if it is a local max, and 0 otherwise | |
84 | + gpuCenters[i] = l_value ; | |
85 | + } | |
86 | + | |
87 | + } | |
88 | + | |
89 | + | |
90 | + | |
91 | + template<typename T> | |
92 | + void gpu_local_max(T* gpuCenters, T* gpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
93 | + | |
94 | + | |
95 | + | |
96 | + | |
97 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
98 | + dim3 threads(max_threads, 1); | |
99 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
100 | + | |
101 | + | |
102 | + | |
103 | + //call the kernel to find the local maximum. | |
104 | + cuda_local_max <<< blocks, threads >>>(gpuCenters, gpuVote, final_t, conn, x, y); | |
105 | + | |
106 | + | |
107 | + } | |
108 | + | |
109 | + | |
110 | + | |
111 | + template<typename T> | |
112 | + void cpu_local_max(T* cpuCenters, T* cpuVote, T final_t, unsigned int conn, unsigned int x, unsigned int y){ | |
113 | + | |
114 | + | |
115 | + //calculate the number of bytes in the array | |
116 | + unsigned int bytes = x * y * sizeof(T); | |
117 | + | |
118 | + // allocate space on the GPU for the detected cell centers | |
119 | + T* gpuCenters; | |
120 | + cudaMalloc(&gpuCenters, bytes); | |
121 | + | |
122 | + | |
123 | + //allocate space on the GPU for the input Vote Image | |
124 | + T* gpuVote; | |
125 | + cudaMalloc(&gpuVote, bytes); | |
126 | + | |
127 | + | |
128 | + //copy the Vote image data to the GPU | |
129 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | |
130 | + | |
131 | + | |
132 | + //call the GPU version of the local max function | |
133 | + gpu_local_max<T>(gpuCenters, gpuVote, final_t, conn, x, y); | |
134 | + | |
135 | + | |
136 | + //copy the cell centers data to the CPU | |
137 | + cudaMemcpy(cpuCenters, gpuCenters, bytes, cudaMemcpyDeviceToHost) ; | |
138 | + | |
139 | + | |
140 | + //free allocated memory | |
141 | + cudaFree(gpuCenters); | |
142 | + cudaFree(gpuVote); | |
144 | + } | |
145 | + | |
146 | + } | |
147 | +} | |
148 | + | |
149 | + | |
150 | + | |
151 | +#endif | |
\ No newline at end of file
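A hedged sketch of calling the local-max detector on a vote image (the threshold and connectivity values are illustrative assumptions, not taken from this commit):

    #include <cstdlib>
    #include <stim/cuda/local_max.cuh>

    int main(){
        unsigned int x = 256, y = 256;                          //illustrative image size
        float* vote = (float*)calloc(x * y, sizeof(float));     //vote image (blank here)
        float* centers = (float*)malloc(x * y * sizeof(float)); //1 at detected centers, 0 elsewhere

        stim::cuda::cpu_local_max(centers, vote, 100.0f, 4u, x, y); //threshold = 100, conn = 4

        free(vote);
        free(centers);
        return 0;
    }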
stim/cuda/sharedmem.cuh
1 | + | |
2 | +#ifndef STIM_CUDA_SHAREDMEM_H | |
3 | +#define STIM_CUDA_SHAREDMEM_H | |
4 | + | |
5 | +namespace stim{ | |
6 | + namespace cuda{ | |
7 | + | |
8 | + // Copies values from global memory to shared memory, optimizing threads | |
9 | + template<typename T> | |
10 | + __device__ void sharedMemcpy_tex2D(T* dest, cudaTextureObject_t src, | |
11 | + unsigned int x, unsigned int y, unsigned int X, unsigned int Y, | |
12 | + dim3 threadIdx, dim3 blockDim){ | |
13 | + | |
14 | + //calculate the number of iterations required for the copy | |
15 | + unsigned int xI, yI; | |
16 | + xI = X/blockDim.x + 1; //number of iterations along X | |
17 | + yI = Y/blockDim.y + 1; //number of iterations along Y | |
18 | + | |
19 | + //for each iteration | |
20 | + for(unsigned int xi = 0; xi < xI; xi++){ | |
21 | + for(unsigned int yi = 0; yi < yI; yi++){ | |
22 | + | |
23 | + //calculate the index into shared memory | |
24 | + unsigned int sx = xi * blockDim.x + threadIdx.x; | |
25 | + unsigned int sy = yi * blockDim.y + threadIdx.y; | |
26 | + | |
27 | + //calculate the index into the texture | |
28 | + unsigned int tx = x + sx; | |
29 | + unsigned int ty = y + sy; | |
30 | + | |
31 | + //perform the copy | |
32 | + if(sx < X && sy < Y) | |
33 | + dest[sy * X + sx] = tex2D<T>(src, tx, ty); | |
34 | + } | |
35 | + } | |
36 | + } | |
37 | + | |
38 | + } | |
39 | +} | |
40 | + | |
41 | + | |
42 | +#endif | |
\ No newline at end of file
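A sketch of the calling pattern this helper expects, mirroring gaussian_blur_x above (the kernel itself is a hypothetical illustration): each block loads its tile plus an apron of kr pixels on either side into dynamic shared memory, then synchronizes before reading.

    __global__ void example_kernel(cudaTextureObject_t in, int kr){
        extern __shared__ float s[];                //dynamic shared memory
        int bxi = blockIdx.x * blockDim.x;          //first pixel of this block

        //load blockDim.x + 2*kr pixels of row blockIdx.y, starting kr pixels early
        stim::cuda::sharedMemcpy_tex2D<float>(s, in, bxi - kr, blockIdx.y,
                2 * kr + blockDim.x, 1, threadIdx, blockDim);
        __syncthreads();                            //wait until the whole tile is loaded

        //each thread can now read s[threadIdx.x + kr] and its kr neighbors on either side
    }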
stim/cuda/update_dir.cuh
1 | +#ifndef STIM_CUDA_UPDATE_DIR_H | |
2 | +#define STIM_CUDA_UPDATE_DIR_H | |
3 | + | |
4 | + | |
5 | +#include <iostream> | |
6 | +#include <cuda.h> | |
7 | +#include <stim/cuda/devices.h> | |
8 | +#include <stim/cuda/error.h> | |
9 | +#include <stim/cuda/sharedmem.cuh> | |
10 | + | |
11 | +namespace stim{ | |
12 | + namespace cuda{ | |
13 | + | |
14 | + // this kernel calculates the voting direction for the next iteration, based on the direction from each voter to the maximum vote value within its voting area | |
15 | + template<typename T> | |
16 | + __global__ void cuda_update_dir(T* gpuDir, cudaTextureObject_t in, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
17 | + | |
18 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
19 | + extern __shared__ float s_vote[]; | |
20 | + | |
21 | + //calculate the start point for this block | |
22 | + int bxi = blockIdx.x * blockDim.x; | |
23 | + | |
24 | + //calculate the width of the shared memory block | |
25 | + int swidth = 2 * rmax + blockDim.x; | |
26 | + | |
27 | + // calculate the 2D coordinates for this current thread. | |
28 | + int xi = bxi + threadIdx.x; | |
29 | + int yi = blockIdx.y; | |
30 | + | |
31 | + // convert 2D coordinates to 1D | |
32 | + int i = yi * x + xi; | |
33 | + | |
34 | + // read the current voting direction for this voter from the gradient image | |
35 | + float theta = 0; | |
36 | + if(xi < x && yi < y){ | |
37 | + theta = gpuGrad[2*i]; | |
38 | + //initialize the vote direction to zero | |
39 | + gpuDir[i] = 0; | |
40 | + } | |
39 | + | |
41 | + // define a local variable for the maximum vote value in the voting area of this voter | |
41 | + float max = 0; | |
42 | + | |
44 | + // define two local variables for the x and y coordinates where the maximum occurred | |
44 | + int id_x = 0; | |
45 | + int id_y = 0; | |
46 | + | |
47 | + // compute the size of window which will be checked for finding the voting area for this voter | |
48 | + unsigned int x_table = 2*rmax +1; | |
49 | + unsigned int rmax_sq = rmax * rmax; | |
50 | + int r = (int)rmax; | |
51 | + int tx_rmax = threadIdx.x + rmax; | |
52 | + int bxs = bxi - rmax; | |
53 | + | |
54 | + for(int yr = -r; yr <= r; yr++){ | |
55 | + | |
56 | + //copy the portion of the image necessary for this block to shared memory | |
57 | + __syncthreads(); | |
58 | + stim::cuda::sharedMemcpy_tex2D<float>(s_vote, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); | |
59 | + __syncthreads(); | |
60 | + | |
61 | + //if the current thread is outside of the image, it doesn't have to be computed | |
62 | + if(xi < x && yi < y){ | |
63 | + | |
64 | + for(int xr = -r; xr <= r; xr++){ | |
65 | + | |
66 | + unsigned int ind_t = (rmax - yr) * x_table + rmax - xr; | |
67 | + | |
68 | + // calculate the angle between the voter and the current pixel in x and y directions | |
69 | + float atan_angle = gpuTable[ind_t]; | |
70 | + | |
71 | + | |
72 | + // read the vote value for the current pixel from shared memory | |
73 | + int idx_share_update = xr + tx_rmax ; | |
74 | + float share_vote = s_vote[idx_share_update]; | |
75 | + | |
76 | + // check if the current pixel is located in the voting area of this voter. | |
77 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
78 | + | |
79 | + // compare the vote value of this pixel with the max value to find the maxima and its index. | |
80 | + if (share_vote>max) { | |
81 | + | |
82 | + max = share_vote; | |
83 | + id_x = xr; | |
84 | + id_y = yr; | |
85 | + } | |
86 | + } | |
87 | + } | |
88 | + } | |
89 | + } | |
90 | + | |
91 | + | |
92 | + //look up the new vote direction from the atan2 table | |
93 | + unsigned int ind_m = (rmax - id_y) * x_table + (rmax - id_x); | |
94 | + | |
95 | + float new_angle = gpuTable[ind_m]; | |
96 | + | |
97 | + if(xi < x && yi < y) | |
98 | + gpuDir[i] = new_angle; | |
98 | + | |
99 | + } | |
100 | + | |
101 | + // this kernel updates the gradient direction by the calculated voting direction. | |
102 | + template<typename T> | |
103 | + __global__ void cuda_update_grad(T* gpuGrad, T* gpuDir, unsigned int x, unsigned int y){ | |
104 | + | |
105 | + //************ when the number of threads are (1024,1) ************* | |
106 | + | |
107 | + // calculate the 2D coordinates for this current thread. | |
108 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
109 | + int yi = blockIdx.y; | |
110 | + // convert 2D coordinates to 1D | |
111 | + int i = yi * x + xi; | |
112 | + | |
113 | + //return if the pixel is outside of the image | |
114 | + if(xi >= x || yi >= y) return; | |
115 | + | |
116 | + //update the gradient image with the vote direction | |
117 | + gpuGrad[2*i] = gpuDir[i]; | |
116 | + } | |
117 | + | |
118 | + | |
119 | + template<typename T> | |
120 | + void gpu_update_dir(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
121 | + | |
122 | + //get the number of pixels in the image | |
123 | + unsigned int pixels = x * y; | |
124 | + unsigned int bytes = sizeof(T) * pixels; | |
125 | + | |
126 | + | |
127 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
128 | + dim3 threads(max_threads, 1); | |
129 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
130 | + | |
131 | + // Allocate CUDA array in device memory | |
132 | + | |
133 | + //define a channel descriptor for a single 32-bit channel | |
134 | + cudaChannelFormatDesc channelDesc = | |
135 | + cudaCreateChannelDesc(32, 0, 0, 0, | |
136 | + cudaChannelFormatKindFloat); | |
137 | + cudaArray* cuArray; //declare the cuda array | |
138 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
139 | + | |
140 | + // Copy the image data from global memory to the array | |
141 | + cudaMemcpyToArray(cuArray, 0, 0, gpuVote, bytes, | |
142 | + cudaMemcpyDeviceToDevice); | |
143 | + | |
144 | + // Specify texture | |
145 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
146 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
147 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
148 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
149 | + | |
150 | + // Specify texture object parameters | |
151 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
152 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
153 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
154 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
155 | + texDesc.filterMode = cudaFilterModePoint; //use point (nearest-neighbor) sampling | |
156 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
157 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
158 | + | |
159 | + // Create texture object | |
160 | + cudaTextureObject_t texObj = 0; | |
161 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
162 | + | |
163 | + // specify the shared memory size (in bytes) for the kernel | |
164 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*4; | |
165 | + | |
166 | + // allocate space on the GPU for the updated vote direction | |
167 | + T* gpuDir; | |
168 | + cudaMalloc(&gpuDir, bytes); | |
169 | + | |
170 | + //call the kernel to calculate the new voting direction | |
171 | + cuda_update_dir <<< blocks, threads, share_bytes >>>(gpuDir, texObj, gpuGrad, gpuTable, phi, rmax, x , y); | |
172 | + | |
173 | + //call the kernel to update the gradient direction | |
174 | + cuda_update_grad <<< blocks, threads >>>(gpuGrad, gpuDir, x , y); | |
175 | + | |
176 | + | |
177 | + //free allocated memory | |
178 | + cudaDestroyTextureObject(texObj); | |
179 | + cudaFreeArray(cuArray); | |
180 | + cudaFree(gpuDir); | |
179 | + | |
180 | + } | |
181 | + | |
182 | + | |
183 | + template<typename T> | |
184 | + void cpu_update_dir(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
185 | + | |
186 | + //calculate the number of bytes in the array | |
187 | + unsigned int bytes = x * y * sizeof(T); | |
188 | + | |
189 | + //calculate the number of bytes in the atan2 table | |
190 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
191 | + | |
192 | + //allocate space on the GPU for the Vote Image | |
193 | + T* gpuVote; | |
194 | + cudaMalloc(&gpuVote, bytes); | |
195 | + | |
196 | + //copy the input vote image to the GPU | |
197 | + HANDLE_ERROR(cudaMemcpy(gpuVote, cpuVote, bytes, cudaMemcpyHostToDevice)); | |
198 | + | |
199 | + //allocate space on the GPU for the input Gradient image | |
200 | + T* gpuGrad; | |
201 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
202 | + | |
203 | + //copy the Gradient data to the GPU | |
204 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
205 | + | |
206 | + //allocate space on the GPU for the atan2 table | |
207 | + T* gpuTable; | |
208 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
209 | + | |
210 | + //copy the atan2 values to the GPU | |
211 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
212 | + | |
213 | + | |
214 | + //call the GPU version of the update direction function | |
215 | + gpu_update_dir<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
216 | + | |
217 | + | |
218 | + //copy the new gradient image back to the CPU | |
219 | + cudaMemcpy(cpuGrad, gpuGrad, bytes*2, cudaMemcpyDeviceToHost) ; | |
220 | + | |
221 | + //free allocated memory | |
222 | + cudaFree(gpuTable); | |
223 | + cudaFree(gpuVote); | |
224 | + cudaFree(gpuGrad); | |
225 | + } | |
226 | + | |
227 | + } | |
228 | +} | |
229 | + | |
230 | + | |
231 | + | |
232 | +#endif | |
\ No newline at end of file
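Both cuda_update_dir and cuda_vote index a precomputed atan2 lookup table of size (2*rmax+1) x (2*rmax+1); the table itself is not built anywhere in this commit. A plausible CPU-side sketch, inferred from how ind_t and id_t are computed in the kernels (the helper name is hypothetical):

    #include <cmath>

    //build a (2*rmax+1) x (2*rmax+1) table of atan2 values (hypothetical helper)
    void build_atan2_table(float* table, unsigned int rmax){
        int r = (int)rmax;
        int w = 2 * r + 1;                          //table width
        for(int yr = -r; yr <= r; yr++)
            for(int xr = -r; xr <= r; xr++)
                //angle from the window center to (xr, yr), matching the kernel indexing
                table[(yr + r) * w + (xr + r)] = atan2f((float)yr, (float)xr);
    }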
stim/cuda/vote.cuh
1 | +#ifndef STIM_CUDA_VOTE_H | |
2 | +#define STIM_CUDA_VOTE_H | |
3 | + | |
4 | + | |
5 | +#include <iostream> | |
6 | +#include <cuda.h> | |
7 | +#include <stim/cuda/devices.h> | |
8 | +#include <stim/cuda/error.h> | |
9 | +#include <stim/cuda/sharedmem.cuh> | |
10 | + | |
11 | + | |
12 | +namespace stim{ | |
13 | + namespace cuda{ | |
14 | + | |
15 | + // this kernel calculates the vote value for each pixel by summing the gradient magnitudes of all voters whose voting area contains that pixel | |
16 | + template<typename T> | |
17 | + __global__ void cuda_vote(T* gpuVote, cudaTextureObject_t in, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
18 | + | |
19 | + //generate a pointer to shared memory (size will be specified as a kernel parameter) | |
20 | + extern __shared__ float2 s_grad[]; | |
21 | + | |
22 | + //calculate the start point for this block | |
23 | + int bxi = blockIdx.x * blockDim.x; | |
24 | + | |
25 | + //calculate the width of the shared memory block | |
26 | + int swidth = 2 * rmax + blockDim.x; | |
27 | + | |
28 | + // calculate the 2D coordinates for this current thread. | |
29 | + int xi = bxi + threadIdx.x; | |
30 | + int yi = blockIdx.y; | |
31 | + // convert 2D coordinates to 1D | |
32 | + int i = yi * x + xi; | |
33 | + | |
34 | + | |
35 | + // define a local variable to sum the votes from the voters | |
36 | + float sum = 0; | |
37 | + | |
38 | + // compute the size of window which will be checked for finding the proper voters for this pixel | |
39 | + unsigned int x_table = 2*rmax +1; | |
40 | + | |
41 | + unsigned int rmax_sq = rmax * rmax; | |
42 | + int r = (int)rmax; | |
43 | + int tx_rmax = threadIdx.x + rmax; | |
44 | + int bxs = bxi - rmax; | |
45 | + | |
46 | + | |
47 | + for(int yr = -r; yr <= r; yr++){ | |
48 | + | |
49 | + //copy the portion of the image necessary for this block to shared memory | |
50 | + __syncthreads(); | |
51 | + stim::cuda::sharedMemcpy_tex2D<float2>(s_grad, in, bxs, yi + yr , swidth, 1, threadIdx, blockDim); | |
52 | + __syncthreads(); | |
53 | + | |
54 | + //if the current thread is outside of the image, it doesn't have to be computed | |
55 | + if(xi < x && yi < y){ | |
56 | + | |
57 | + for(int xr = -r; xr <= r; xr++){ | |
58 | + | |
59 | + //find the location of this voter in the atan2 table | |
60 | + unsigned int id_t = (yr + rmax) * x_table + xr + rmax; | |
61 | + | |
62 | + // calculate the angle between the pixel and the current voter in x and y directions | |
63 | + float atan_angle = gpuTable[id_t]; | |
64 | + | |
65 | + | |
66 | + // read the gradient of the current voter from shared memory | |
67 | + int idx_share = xr + tx_rmax ; | |
68 | + float2 g = s_grad[idx_share]; | |
69 | + float theta = g.x; | |
70 | + | |
71 | + // check if the current voter is located in the voting area of this pixel. | |
72 | + if (((xr * xr + yr *yr)< rmax_sq) && (abs(atan_angle - theta) <phi)){ | |
73 | + sum += g.y; | |
74 | + | |
75 | + } | |
76 | + | |
77 | + } | |
78 | + } | |
79 | + } | |
80 | + | |
81 | + if(xi < x && yi < y) | |
82 | + gpuVote[i] = sum; | |
82 | + } | |
83 | + | |
84 | + template<typename T> | |
85 | + void gpu_vote(T* gpuVote, T* gpuGrad, T* gpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
86 | + | |
87 | + //get the number of pixels in the image | |
88 | + unsigned int pixels = x * y; | |
89 | + unsigned int bytes = sizeof(T) * pixels; | |
90 | + | |
91 | + | |
92 | + unsigned int max_threads = stim::maxThreadsPerBlock(); | |
93 | + //unsigned int thread_dim = sqrt(max_threads); | |
94 | + dim3 threads(max_threads, 1); | |
95 | + dim3 blocks(x/threads.x + (x %threads.x == 0 ? 0:1) , y); | |
96 | + | |
97 | + // Allocate CUDA array in device memory | |
98 | + | |
99 | + //define a channel descriptor for a single 32-bit channel | |
100 | + cudaChannelFormatDesc channelDesc = | |
101 | + cudaCreateChannelDesc(32, 32, 0, 0, | |
102 | + cudaChannelFormatKindFloat); | |
103 | + cudaArray* cuArray; //declare the cuda array | |
104 | + cudaMallocArray(&cuArray, &channelDesc, x, y); //allocate the cuda array | |
105 | + | |
106 | + // Copy the image data from global memory to the array | |
107 | + cudaMemcpyToArray(cuArray, 0, 0, gpuGrad, bytes*2, | |
108 | + cudaMemcpyDeviceToDevice); | |
109 | + | |
110 | + // Specify texture | |
111 | + struct cudaResourceDesc resDesc; //create a resource descriptor | |
112 | + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero | |
113 | + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type | |
114 | + resDesc.res.array.array = cuArray; //add a pointer to the cuda array | |
115 | + | |
116 | + // Specify texture object parameters | |
117 | + struct cudaTextureDesc texDesc; //create a texture descriptor | |
118 | + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero | |
119 | + texDesc.addressMode[0] = cudaAddressModeWrap; //use wrapping (around the edges) | |
120 | + texDesc.addressMode[1] = cudaAddressModeWrap; | |
121 | + texDesc.filterMode = cudaFilterModePoint; //use point (nearest-neighbor) sampling | |
122 | + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats) | |
123 | + texDesc.normalizedCoords = 0; //not using normalized coordinates | |
124 | + | |
125 | + // Create texture object | |
126 | + cudaTextureObject_t texObj = 0; | |
127 | + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL); | |
128 | + | |
129 | + // specify the shared memory size: two 4-byte channels per shared pixel | |
130 | + unsigned int share_bytes = (2*rmax + threads.x)*(1)*2*4; | |
131 | + | |
132 | + | |
133 | + //call the kernel to do the voting | |
134 | + cuda_vote <<< blocks, threads, share_bytes >>>(gpuVote, texObj, gpuTable, phi, rmax, x , y); | |
135 | + | |
136 | + //free the texture object and cuda array | |
137 | + cudaDestroyTextureObject(texObj); | |
138 | + cudaFreeArray(cuArray); | |
137 | + } | |
138 | + | |
139 | + | |
140 | + template<typename T> | |
141 | + void cpu_vote(T* cpuVote, T* cpuGrad,T* cpuTable, T phi, unsigned int rmax, unsigned int x, unsigned int y){ | |
142 | + | |
143 | + //calculate the number of bytes in the array | |
144 | + unsigned int bytes = x * y * sizeof(T); | |
145 | + | |
146 | + //calculate the number of bytes in the atan2 table | |
147 | + unsigned int bytes_table = (2*rmax+1) * (2*rmax+1) * sizeof(T); | |
148 | + | |
149 | + //allocate space on the GPU for the Vote Image | |
150 | + T* gpuVote; | |
151 | + cudaMalloc(&gpuVote, bytes); | |
152 | + | |
153 | + //allocate space on the GPU for the input Gradient image | |
154 | + T* gpuGrad; | |
155 | + HANDLE_ERROR(cudaMalloc(&gpuGrad, bytes*2)); | |
156 | + | |
157 | + //copy the Gradient Magnitude data to the GPU | |
158 | + HANDLE_ERROR(cudaMemcpy(gpuGrad, cpuGrad, bytes*2, cudaMemcpyHostToDevice)); | |
159 | + | |
160 | + //allocate space on the GPU for the atan2 table | |
161 | + T* gpuTable; | |
162 | + HANDLE_ERROR(cudaMalloc(&gpuTable, bytes_table)); | |
163 | + | |
164 | + //copy the atan2 values to the GPU | |
165 | + HANDLE_ERROR(cudaMemcpy(gpuTable, cpuTable, bytes_table, cudaMemcpyHostToDevice)); | |
166 | + | |
167 | + //cudaMemcpyToSymbol (cstTable, cpuTable, bytes_table ); | |
168 | + | |
169 | + | |
170 | + //call the GPU version of the vote calculation function | |
171 | + gpu_vote<T>(gpuVote, gpuGrad, gpuTable, phi, rmax, x , y); | |
172 | + | |
173 | + | |
174 | + //copy the Vote Data back to the CPU | |
175 | + cudaMemcpy(cpuVote, gpuVote, bytes, cudaMemcpyDeviceToHost) ; | |
176 | + | |
177 | + //free allocated memory | |
178 | + cudaFree(gpuTable); | |
179 | + cudaFree(gpuVote); | |
180 | + cudaFree(gpuGrad); | |
181 | + } | |
182 | + | |
183 | + } | |
184 | +} | |
185 | + | |
186 | + | |
187 | + | |
188 | +#endif | |
\ No newline at end of file
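Taken together, these files implement one pass of an iterative voting scheme for detecting cell centers. A hedged end-to-end sketch (the iteration count, phi schedule, threshold, and conn values are illustrative assumptions, not taken from this commit; table must hold (2*rmax+1)^2 precomputed atan2 values):

    #include <cstdlib>
    #include <stim/cuda/gradient.cuh>
    #include <stim/cuda/arraymath.cuh>
    #include <stim/cuda/vote.cuh>
    #include <stim/cuda/update_dir.cuh>
    #include <stim/cuda/local_max.cuh>

    void find_centers(float* centers, float* image, float* table,
                      unsigned int rmax, unsigned int x, unsigned int y){

        float phi = 0.5f;                    //illustrative voting-cone half-angle
        float final_t = 100.0f;              //illustrative vote threshold
        unsigned int conn = 4;               //illustrative local-max neighborhood

        float* grad = (float*)malloc(x * y * 2 * sizeof(float));
        float* vote = (float*)malloc(x * y * sizeof(float));

        stim::cuda::cpu_gradient_2d(grad, image, x, y);  //(dx, dy) per pixel
        stim::cuda::cpu_cart2polar(grad, x * y);         //(theta, magnitude) per pixel

        for(int it = 0; it < 8; it++){                   //illustrative iteration count
            stim::cuda::cpu_vote(vote, grad, table, phi, rmax, x, y);
            stim::cuda::cpu_update_dir(vote, grad, table, phi, rmax, x, y);
            phi = phi * 0.5f;                            //shrink the voting cone (assumption)
        }

        stim::cuda::cpu_local_max(centers, vote, final_t, conn, x, y);

        free(grad);
        free(vote);
    }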