diff --git a/stim/math/filters/conv2.cuh b/stim/math/filters/conv2.cuh
new file mode 100644
index 0000000..48ea7f7
--- /dev/null
+++ b/stim/math/filters/conv2.cuh
@@ -0,0 +1,119 @@
+#ifndef STIM_CUDA_CONV2_H
+#define STIM_CUDA_CONV2_H
+//#define __CUDACC__
+
+#include
+#include
+
+namespace stim {
+	//Kernel function that performs the 2D convolution.
+	template<typename T, typename K>
+	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		extern __shared__ T s[];	//declare a shared memory array
+		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
+		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
+		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
+		size_t nt = blockDim.x * blockDim.y;
+
+		size_t cx = blockIdx.x * blockDim.x;	//find the upper left corner of the input region
+		size_t cy = blockIdx.y * blockDim.y;
+
+		size_t X = sx - kx + 1;	//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+
+		if (cx >= X || cy >= Y) return;	//return if the entire block is outside the image
+		size_t smx = min(blockDim.x + kx - 1, sx - cx);	//size of the shared copy of the input image
+		size_t smy = min(blockDim.y + ky - 1, sy - cy);	//	min function is used to deal with boundary blocks
+		stim::cuda::threadedMemcpy2D(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
+		__syncthreads();
+
+		if (xi >= X || yi >= Y) return;	//returns if the thread is outside of the output image
+
+		//loop through the kernel
+		size_t kxi, kyi;
+		K v = 0;
+		for (kyi = 0; kyi < ky; kyi++) {
+			for (kxi = 0; kxi < kx; kxi++) {
+				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
+				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
+			}
+		}
+		out[yi * X + xi] = (T)v;	//write the result to global memory
+
+	}
+
+	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image, which is of size (sx - kx + 1) x (sy - ky + 1)
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
+		size_t X = sx - kx + 1;	//calculate the size of the output image
+		size_t Y = sy - ky + 1;
+		dim3 nb(X / nt.x + 1, Y / nt.y + 1);	//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);	//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
+	}
+//#endif
+	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;	//x size of the output image
+		size_t Y = sy - ky + 1;	//y size of the output image
+
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_kernel;
+		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);	//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_kernel));
+		HANDLE_ERROR(cudaFree(gpu_out));
+/*	CPU CODE
+		K v;	//register stores the integral of the current pixel value
+		size_t yi, xi, kyi, kxi, yi_kyi_sx;
+		for (yi = 0; yi < Y; yi++) {	//for each pixel in the output image
+			for (xi = 0; xi < X; xi++) {
+				v = 0;
+				for (kyi = 0; kyi < ky; kyi++) {	//for each pixel in the kernel
+					yi_kyi_sx = (yi + kyi) * sx;
+					for (kxi = 0; kxi < kx; kxi++) {
+						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
+					}
+				}
+				out[yi * X + xi] = v;	//save the result to the output array
+			}
+		}
+
+	*/
+	}
+
+
+}
+
+
+#endif
\ No newline at end of file
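
cpu_conv2 above uses the valid-pixel convention, so the caller allocates the output at (sx - kx + 1) x (sy - ky + 1). The following host-side sketch is not part of the patch: the image size and the 3x3 box kernel are illustrative, and it assumes the translation unit is compiled with nvcc, since the new cpu_conv2 always routes through gpu_conv2.

#include <vector>
#include "stim/math/filters/conv2.cuh"	//header added by this patch; install path assumed

int main() {
	size_t sx = 512, sy = 512;	//input image size (example values)
	size_t kx = 3, ky = 3;	//kernel size
	size_t X = sx - kx + 1, Y = sy - ky + 1;	//valid-region output size
	std::vector<float> in(sx * sy, 1.0f);	//placeholder input image
	std::vector<float> k(kx * ky, 1.0f / 9.0f);	//3x3 box (averaging) kernel
	std::vector<float> out(X * Y);
	stim::cpu_conv2(out.data(), in.data(), k.data(), sx, sy, kx, ky);	//convolve on the GPU
	return 0;
}
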
diff --git a/stim/math/filters/conv2.h b/stim/math/filters/conv2.h
deleted file mode 100644
index 94e9f02..0000000
--- a/stim/math/filters/conv2.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef STIM_CUDA_CONV2_H
-#define STIM_CUDA_CONV2_H
-//#define __CUDACC__
-
-#ifdef __CUDACC__
-#include
-#include
-#endif
-
-namespace stim {
-#ifdef __CUDACC__
-	//Kernel function that performs the 2D convolution.
-	template<typename T, typename K>
-	__global__ void kernel_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
-		extern __shared__ T s[];	//declare a shared memory array
-		size_t xi = blockIdx.x * blockDim.x + threadIdx.x;	//threads correspond to indices into the output image
-		size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
-		size_t tid = threadIdx.y * blockDim.x + threadIdx.x;
-		size_t nt = blockDim.x * blockDim.y;
-
-		size_t cx = blockIdx.x * blockDim.x;	//find the upper left corner of the input region
-		size_t cy = blockIdx.y * blockDim.y;
-
-		size_t X = sx - kx + 1;	//calculate the size of the output image
-		size_t Y = sy - ky + 1;
-
-		if (cx >= X || cy >= Y) return;	//return if the entire block is outside the image
-		size_t smx = min(blockDim.x + kx - 1, sx - cx);	//size of the shared copy of the input image
-		size_t smy = min(blockDim.y + ky - 1, sy - cy);	//	min function is used to deal with boundary blocks
-		stim::cuda::threadedMemcpy2D(s, smx, smy, in, cx, cy, sx, sy, tid, nt);	//copy the input region to shared memory
-		__syncthreads();
-
-		if (xi >= X || yi >= Y) return;	//returns if the thread is outside of the output image
-
-		//loop through the kernel
-		size_t kxi, kyi;
-		K v = 0;
-		for (kyi = 0; kyi < ky; kyi++) {
-			for (kxi = 0; kxi < kx; kxi++) {
-				v += s[(threadIdx.y + kyi) * smx + threadIdx.x + kxi] * kernel[kyi * kx + kxi];
-				//v += in[(yi + kyi) * sx + xi + kxi] * kernel[kyi * kx + kxi];
-			}
-		}
-		out[yi * X + xi] = (T)v;	//write the result to global memory
-
-	}
-
-	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
-	//@param out is a pointer to the output image, which is of size (sx - kx + 1) x (sy - ky + 1)
-	//@param in is a pointer to the input image
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void gpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
-		cudaDeviceProp p;
-		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
-		size_t tmax = p.maxThreadsPerBlock;
-		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
-		size_t X = sx - kx + 1;	//calculate the size of the output image
-		size_t Y = sy - ky + 1;
-		dim3 nb(X / nt.x + 1, Y / nt.y + 1);	//calculate the grid dimensions
-		size_t sm = (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T);	//shared memory bytes required to store block data
-		if (sm > p.sharedMemPerBlock) {
-			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<<nb, nt, sm>>> (out, in, kernel, sx, sy, kx, ky);	//launch the kernel
-	}
-#endif
-	//Performs a convolution of a 2D image. Only valid pixels based on the kernel are returned.
-	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void cpu_conv2(T* out, T* in, K* kernel, size_t sx, size_t sy, size_t kx, size_t ky) {
-		size_t X = sx - kx + 1;	//x size of the output image
-		size_t Y = sy - ky + 1;	//y size of the output image
-
-#ifdef __CUDACC__
-		//allocate memory and copy everything to the GPU
-		T* gpu_in;
-		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
-		K* gpu_kernel;
-		HANDLE_ERROR(cudaMalloc(&gpu_kernel, kx * ky * sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kernel, kernel, kx * ky * sizeof(K), cudaMemcpyHostToDevice));
-		T* gpu_out;
-		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
-		gpu_conv2(gpu_out, gpu_in, gpu_kernel, sx, sy, kx, ky);	//execute the GPU kernel
-		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
-		HANDLE_ERROR(cudaFree(gpu_in));
-		HANDLE_ERROR(cudaFree(gpu_kernel));
-		HANDLE_ERROR(cudaFree(gpu_out));
-#else
-		K v;	//register stores the integral of the current pixel value
-		size_t yi, xi, kyi, kxi, yi_kyi_sx;
-		for (yi = 0; yi < Y; yi++) {	//for each pixel in the output image
-			for (xi = 0; xi < X; xi++) {
-				v = 0;
-				for (kyi = 0; kyi < ky; kyi++) {	//for each pixel in the kernel
-					yi_kyi_sx = (yi + kyi) * sx;
-					for (kxi = 0; kxi < kx; kxi++) {
-						v += in[yi_kyi_sx + xi + kxi] * kernel[kyi * kx + kxi];
-					}
-				}
-				out[yi * X + xi] = v;	//save the result to the output array
-			}
-		}
-
-#endif
-	}
-
-
-}
-
-
-#endif
\ No newline at end of file
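
gpu_conv2 sizes its shared-memory tile at (nt.x + kx - 1) * (nt.y + ky - 1) * sizeof(T) bytes and calls exit(1) when that exceeds sharedMemPerBlock. One possible alternative is to shrink the block until the tile fits; the helper below is a sketch under that assumption (conv2_block_size is hypothetical and not part of the patch).

#include <cmath>
#include <cuda_runtime.h>

//Sketch: choose a square block whose conv2 shared-memory tile fits on the current device.
template<typename T>
dim3 conv2_block_size(size_t kx, size_t ky) {
	cudaDeviceProp p;
	cudaGetDeviceProperties(&p, 0);
	size_t b = (size_t)sqrt((double)p.maxThreadsPerBlock);	//start from the largest square block
	while (b > 1 && (b + kx - 1) * (b + ky - 1) * sizeof(T) > p.sharedMemPerBlock)
		b /= 2;	//halve the block until the (b+kx-1) x (b+ky-1) tile fits in shared memory
	return dim3((unsigned)b, (unsigned)b);
}
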
diff --git a/stim/math/filters/gauss2.cuh b/stim/math/filters/gauss2.cuh
new file mode 100644
index 0000000..b27cd07
--- /dev/null
+++ b/stim/math/filters/gauss2.cuh
@@ -0,0 +1,47 @@
+#ifndef STIM_CUDA_GAUSS2_H
+#define STIM_CUDA_GAUSS2_H
+
+#include
+#include
+#include
+
+namespace stim {
+
+	template<typename T>
+	T gauss1d(T x, T mu, T std) {
+		return ((T)1 / (T)sqrt(stim::TAU * std * std) * exp(-(x - mu)*(x - mu) / (2 * std*std)));
+	}
+	///Perform a 2D gaussian convolution on an input image.
+	///@param in is a pointer to the input image
+	///@param stdx is the standard deviation (in pixels) along the x axis
+	///@param stdy is the standard deviation (in pixels) along the y axis
+	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
+	template<typename T, typename K>
+	stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
+		size_t kx = stdx * nstds * 2;	//calculate the kernel sizes
+		size_t ky = stdy * nstds * 2;
+		size_t X = in.width() - kx + 1;	//calculate the size of the output image
+		size_t Y = in.height() - ky + 1;
+		stim::image<T> r(X, Y, in.channels());	//create an output image
+
+		K* gx = (K*)malloc(kx * sizeof(K));	//allocate space for the gaussian kernels
+		K* gy = (K*)malloc(ky * sizeof(K));
+		K mux = (K)kx / (K)2;	//calculate the mean of the gaussian (center of the kernel)
+		K muy = (K)ky / (K)2;
+		for (size_t xi = 0; xi < kx; xi++)	//evaluate the kernels
+			gx[xi] = gauss1d((K)xi, mux, stdx);
+		for (size_t yi = 0; yi < ky; yi++)
+			gy[yi] = gauss1d((K)yi, muy, stdy);
+
+		std::vector< stim::image<T> > IN = in.split();	//split the input image into channels
+		std::vector< stim::image<T> > R = r.split();	//split the output image into channels
+		for (size_t c = 0; c < in.channels(); c++)	//for each channel
+			cpu_sepconv2(R[c].data(), IN[c].data(), gx, gy, IN[c].width(), IN[c].height(), kx, ky);
+
+		r.merge(R);	//merge the blurred channels into the final image
+		return r;
+	}
+}
+
+
+#endif
\ No newline at end of file
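
Because gx and gy are point samples of the continuous Gaussian evaluated by gauss1d, their taps sum only approximately to 1 (the error grows as stdx or stdy shrinks), so the blur can slightly rescale image intensity. A possible normalization pass is sketched below as a hypothetical helper; it is not part of the patch.

//Sketch: rescale a sampled 1D kernel so its taps sum to 1, preserving overall brightness.
template<typename K>
void normalize_kernel(K* k, size_t n) {
	K sum = 0;
	for (size_t i = 0; i < n; i++) sum += k[i];	//total weight of the sampled taps
	if (sum != 0)
		for (size_t i = 0; i < n; i++) k[i] /= sum;	//rescale so the taps sum to 1
}

//possible usage inside cpu_gauss2, after the kernels are evaluated:
//	normalize_kernel(gx, kx);
//	normalize_kernel(gy, ky);
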
diff --git a/stim/math/filters/gauss2.h b/stim/math/filters/gauss2.h
deleted file mode 100644
index 44cae82..0000000
--- a/stim/math/filters/gauss2.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef STIM_CUDA_GAUSS2_H
-#define STIM_CUDA_GAUSS2_H
-
-#include
-#include
-#include
-
-namespace stim {
-
-	template<typename T>
-	T gauss1d(T x, T mu, T std) {
-		return ((T)1 / (T)sqrt(stim::TAU * std * std) * exp(-(x - mu)*(x - mu) / (2 * std*std)));
-	}
-	///Perform a 2D gaussian convolution on an input image.
-	///@param in is a pointer to the input image
-	///@param stdx is the standard deviation (in pixels) along the x axis
-	///@param stdy is the standard deviation (in pixels) along the y axis
-	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel
-	template<typename T, typename K>
-	stim::image<T> cpu_gauss2(const stim::image<T>& in, K stdx, K stdy, size_t nstds = 3) {
-		size_t kx = stdx * nstds * 2;	//calculate the kernel sizes
-		size_t ky = stdy * nstds * 2;
-		size_t X = in.width() - kx + 1;	//calculate the size of the output image
-		size_t Y = in.height() - ky + 1;
-		stim::image<T> r(X, Y, in.channels());	//create an output image
-
-		K* gx = (K*)malloc(kx * sizeof(K));	//allocate space for the gaussian kernels
-		K* gy = (K*)malloc(ky * sizeof(K));
-		K mux = (K)kx / (K)2;	//calculate the mean of the gaussian (center of the kernel)
-		K muy = (K)ky / (K)2;
-		for (size_t xi = 0; xi < kx; xi++)	//evaluate the kernels
-			gx[xi] = gauss1d((K)xi, mux, stdx);
-		for (size_t yi = 0; yi < ky; yi++)
-			gy[yi] = gauss1d((K)yi, muy, stdy);
-
-		std::vector< stim::image<T> > IN = in.split();	//split the input image into channels
-		std::vector< stim::image<T> > R = r.split();	//split the output image into channels
-		for (size_t c = 0; c < in.channels(); c++)	//for each channel
-			cpu_sepconv2(R[c].data(), IN[c].data(), gx, gy, IN[c].width(), IN[c].height(), kx, ky);
-
-		r.merge(R);	//merge the blurred channels into the final image
-		return r;
-	}
-}
-
-
-#endif
\ No newline at end of file
diff --git a/stim/math/filters/gauss3.cuh b/stim/math/filters/gauss3.cuh
new file mode 100644
index 0000000..4b72dd9
--- /dev/null
+++ b/stim/math/filters/gauss3.cuh
@@ -0,0 +1,55 @@
+#ifndef STIM_CUDA_GAUSS3_H
+#define STIM_CUDA_GAUSS3_H
+#include
+#include
+#include
+
+namespace stim
+{
+	///Perform a 3D gaussian convolution on an input image.
+	///@param in is a pointer to the input data.
+	///@param dimx is the size of in* in the x direction.
+	///@param dimy is the size of in* in the y direction.
+	///@param dimz is the size of in* in the z direction.
+	///@param stdx is the standard deviation (in pixels) along the x axis.
+	///@param stdy is the standard deviation (in pixels) along the y axis.
+	///@param stdz is the standard deviation (in pixels) along the z axis.
+	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
+	template<typename T, typename K>
+	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
+	{
+		//Set up the sizes of the gaussian kernels.
+		size_t kx = stdx * nstds * 2;
+		size_t ky = stdy * nstds * 2;
+		size_t kz = stdz * nstds * 2;
+
+		//Set up the sizes of the new output, which will be kx, ky, kz smaller than the input.
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+		size_t Z = dimz - kz + 1;
+		T* out = (T*) malloc(X*Y*Z* sizeof(T));
+
+		///Set up the memory that will store the gaussians
+		K* gaussx = (K*)malloc(kx * sizeof(K));
+		K* gaussy = (K*)malloc(ky * sizeof(K));
+		K* gaussz = (K*)malloc(kz * sizeof(K));
+
+		///Set up the midpoints of the gaussians.
+		K midgaussx = (K) kx / (K)2;
+		K midgaussy = (K) ky / (K)2;
+		K midgaussz = (K) kz / (K)2;
+
+		///Evaluate the kernels in each cardinal direction.
+		for(size_t i = 0; i < kx; i++)
+			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
+
+		for(size_t i = 0; i < ky; i++)
+			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
+
+		for(size_t i = 0; i < kz; i++)
+			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
+
+		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
+
+	}
+}
+#endif
diff --git a/stim/math/filters/gauss3.h b/stim/math/filters/gauss3.h
deleted file mode 100644
index f6a5588..0000000
--- a/stim/math/filters/gauss3.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef STIM_CUDA_GAUSS3_H
-#define STIM_CUDA_GAUSS3_H
-#include
-#include
-#include
-
-namespace stim
-{
-	///Perform a 3D gaussian convolution on an input image.
-	///@param in is a pointer to the input data.
-	///@param dimx is the size of in* in the x direction.
-	///@param dimx is the size of in* in the y direction.
-	///@param dimx is the size of in* in the z direction.
-	///@param stdx is the standard deviation (in pixels) along the x axis.
-	///@param stdy is the standard deviation (in pixels) along the y axis.
-	///@param nstds specifies the number of standard deviations of the Gaussian that will be kept in the kernel.
-	template<typename T, typename K>
-	void cpu_gauss3(T* in, K dimx, K dimy, K dimz, K stdx, K stdy, K stdz, size_t nstds = 3)
-	{
-		//Set up the sizes of the gaussian Kernels.
-		size_t kx = stdx * nstds * 2;
-		size_t ky = stdy * nstds * 2;
-		size_t kz = stdz * nstds * 2;
-
-		//Set up the sizes of the new output, which will be kx, ky, kz, smaller than the input.
-		size_t X = dimx - kx + 1;
-		size_t Y = dimy - ky + 1;
-		size_t Z = dimz - kz + 1;
-		T* out = (T*) malloc(X*Y*Z* sizeof(T));
-
-		///Set up the memory that will store the gaussians
-		K* gaussx = (K*)malloc(kx * sizeof(K));
-		K* gaussy = (K*)malloc(ky * sizeof(K));
-		K* gaussz = (K*)malloc(kz * sizeof(K));
-
-		///Set up the midpoints of the gaussians.
-		K midgaussx = (K) kx / (K)2;
-		K midgaussy = (K) ky / (K)2;
-		K midgaussz = (K) kz / (K)2;
-
-		///Evaluate the kernels in each cardinal direction.
-		for(size_t i = 0; i < kx; i++)
-			gaussx[i] = gauss1d((K) i, midgaussx, stdx);
-
-		for(size_t i = 0; i < kx; i++)
-			gaussy[i] = gauss1d((K) i, midgaussy, stdy);
-
-		for(size_t i = 0; i < kx; i++)
-			gaussz[i] = gauss1d((K) i, midgaussz, stdz);
-
-		cpu_sepconv3(out, in, gaussx, gaussy, gaussz, dimx, dimy, dimz, kx, ky, kz);
-
-	}
-}
-#endif
diff --git a/stim/math/filters/resample2.cuh b/stim/math/filters/resample2.cuh
new file mode 100644
index 0000000..a3263ae
--- /dev/null
+++ b/stim/math/filters/resample2.cuh
@@ -0,0 +1,16 @@
+#ifndef STIM_CUDA_RESAMPLE2_H
+#define STIM_CUDA_RESAMPLE2_H
+
+#include
+#include
+
+
+///Downsamples a 2D image by a factor f using a box filter. Any pixels outside of the valid region
+///	(for example if X%f != 0) are chopped.
+template<typename T>
+void cpu_resample2(T* out, T* in, size_t f, size_t sx, size_t sy) {
+
+}
+
+
+#endif
\ No newline at end of file
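
cpu_resample2 is committed here as an empty stub. Based only on the comment above it (box filter of factor f, with any partial blocks chopped), one plausible implementation could look like the sketch below; the name cpu_resample2_sketch and the assumption of a floating-point T are illustrative, not the author's code.

//Sketch: average each complete f x f block of the input into one output pixel.
template<typename T>
void cpu_resample2_sketch(T* out, T* in, size_t f, size_t sx, size_t sy) {
	size_t X = sx / f;	//output size; pixels that don't fill a complete f x f block are chopped
	size_t Y = sy / f;
	for (size_t yi = 0; yi < Y; yi++) {
		for (size_t xi = 0; xi < X; xi++) {
			T sum = 0;
			for (size_t fy = 0; fy < f; fy++)	//sum the f x f block
				for (size_t fx = 0; fx < f; fx++)
					sum += in[(yi * f + fy) * sx + (xi * f + fx)];
			out[yi * X + xi] = sum / (T)(f * f);	//store the box average (assumes floating-point T)
		}
	}
}
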
diff --git a/stim/math/filters/sepconv2.cuh b/stim/math/filters/sepconv2.cuh
new file mode 100644
index 0000000..5946c6a
--- /dev/null
+++ b/stim/math/filters/sepconv2.cuh
@@ -0,0 +1,89 @@
+#ifndef STIM_CUDA_SEPCONV2_H
+#define STIM_CUDA_SEPCONV2_H
+#include
+#ifdef __CUDACC__
+#include
+#include
+#endif
+
+namespace stim {
+#ifdef __CUDACC__
+	//Performs a separable convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
+		size_t X = sx - kx + 1;	//calculate the x size of the output image
+		T* temp;	//declare a temporary variable to store the intermediate image
+		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
+
+		dim3 nb(X / nt.x + 1, sy / nt.y + 1);	//calculate the grid dimensions
+		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);	//shared memory bytes required to store block data
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
+
+		size_t Y = sy - ky + 1;	//calculate the y size of the output image
+		nb.y = Y / nt.y + 1;	//update the grid dimensions to reflect the Y-axis size of the output image
+		sm = nt.x * (nt.y + ky - 1) * sizeof(T);	//calculate the amount of shared memory needed for the second pass
+		if (sm > p.sharedMemPerBlock) {
+			std::cout << "Error in stim::gpu_sepconv2() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
+		HANDLE_ERROR(cudaFree(temp));	//free memory allocated for the intermediate image
+	}
+#endif
+	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param sx is the size of the input image along X
+	//@param sy is the size of the input image along Y
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	template<typename T, typename K>
+	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
+		size_t X = sx - kx + 1;	//x size of the output image
+		size_t Y = sy - ky + 1;
+#ifdef __CUDACC__
+		//allocate memory and copy everything to the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_k0;
+		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_k1;
+		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
+		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);	//execute the GPU kernel
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_k0));
+		HANDLE_ERROR(cudaFree(gpu_k1));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		T* temp = (T*)malloc(X * sy * sizeof(T));	//allocate space for the intermediate image
+		cpu_conv2(temp, in, k0, sx, sy, kx, 1);	//evaluate the intermediate image
+		cpu_conv2(out, temp, k1, X, sy, 1, ky);	//evaluate the final image
+		free(temp);	//free the memory for the intermediate image
+#endif
+	}
+}
+
+#endif
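
The value of the separable path is cost: a kx x ky kernel applied directly needs kx * ky multiply-adds per output pixel, while the two 1D passes need only kx + ky (for a 25 x 25 Gaussian that is 50 instead of 625, about 12.5x fewer). The two paths agree whenever the 2D kernel is the outer product of the 1D filters; the helper below (illustrative only, not part of the patch) builds that equivalent 2D kernel so results can be compared against cpu_conv2 in conv2.cuh.

#include <vector>

//Sketch: a separable kernel is the outer product k[y*kx + x] = k1[y] * k0[x], so
//cpu_sepconv2(out, in, k0, k1, ...) should match cpu_conv2 with this full kx x ky kernel.
template<typename K>
std::vector<K> outer_product(const K* k0, size_t kx, const K* k1, size_t ky) {
	std::vector<K> k(kx * ky);
	for (size_t y = 0; y < ky; y++)
		for (size_t x = 0; x < kx; x++)
			k[y * kx + x] = k1[y] * k0[x];	//full 2D kernel equivalent to the separable pair
	return k;
}
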
diff --git a/stim/math/filters/sepconv2.h b/stim/math/filters/sepconv2.h
deleted file mode 100644
index d05b697..0000000
--- a/stim/math/filters/sepconv2.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef STIM_CUDA_SEPCONV2_H
-#define STIM_CUDA_SEPCONV2_H
-#include
-#ifdef __CUDACC__
-#include
-#include
-#endif
-
-namespace stim {
-#ifdef __CUDACC__
-	//Performs a convolution of a 2D image using the GPU. All pointers are assumed to be to memory on the current device.
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void gpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
-		cudaDeviceProp p;
-		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
-		size_t tmax = p.maxThreadsPerBlock;
-		dim3 nt(sqrt(tmax), sqrt(tmax));	//calculate the block dimensions
-		size_t X = sx - kx + 1;	//calculate the x size of the output image
-		T* temp;	//declare a temporary variable to store the intermediate image
-		HANDLE_ERROR(cudaMalloc(&temp, X * sy * sizeof(T)));	//allocate memory for the intermediate image
-
-		dim3 nb(X / nt.x + 1, sy / nt.y + 1);	//calculate the grid dimensions
-		size_t sm = (nt.x + kx - 1) * nt.y * sizeof(T);	//shared memory bytes required to store block data
-		if (sm > p.sharedMemPerBlock) {
-			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<<nb, nt, sm>>> (temp, in, k0, sx, sy, kx, 1);	//launch the kernel to compute the intermediate image
-
-		size_t Y = sy - ky + 1;	//calculate the y size of the output image
-		nb.y = Y / nt.y + 1;	//update the grid dimensions to reflect the Y-axis size of the output image
-		sm = nt.x * (nt.y + ky - 1) * sizeof(T);	//calculate the amount of shared memory needed for the second pass
-		if (sm > p.sharedMemPerBlock) {
-			std::cout << "Error in stim::gpu_conv2() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<<nb, nt, sm>>> (out, temp, k1, X, sy, 1, ky);	//launch the kernel to compute the final image
-		HANDLE_ERROR(cudaFree(temp));	//free memory allocated for the intermediate image
-	}
-#endif
-	//Performs a separable convolution of a 2D image. Only valid pixels based on the kernel are returned.
-	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1)
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param k0 is the x-axis convolution filter
-	//@param k1 is the y-axis convolution filter
-	//@param sx is the size of the input image along X
-	//@param sy is the size of the input image along Y
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	template<typename T, typename K>
-	void cpu_sepconv2(T* out, T* in, K* k0, K* k1, size_t sx, size_t sy, size_t kx, size_t ky) {
-		size_t X = sx - kx + 1;	//x size of the output image
-		size_t Y = sy - ky + 1;
-#ifdef __CUDACC__
-		//allocate memory and copy everything to the GPU
-		T* gpu_in;
-		HANDLE_ERROR(cudaMalloc(&gpu_in, sx * sy * sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_in, in, sx * sy * sizeof(T), cudaMemcpyHostToDevice));
-		K* gpu_k0;
-		HANDLE_ERROR(cudaMalloc(&gpu_k0, kx * sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_k0, k0, kx * sizeof(K), cudaMemcpyHostToDevice));
-		K* gpu_k1;
-		HANDLE_ERROR(cudaMalloc(&gpu_k1, ky * sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_k1, k1, ky * sizeof(K), cudaMemcpyHostToDevice));
-		T* gpu_out;
-		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * sizeof(T)));
-		gpu_sepconv2(gpu_out, gpu_in, gpu_k0, gpu_k1, sx, sy, kx, ky);	//execute the GPU kernel
-		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X * Y * sizeof(T), cudaMemcpyDeviceToHost));	//copy the result to the host
-		HANDLE_ERROR(cudaFree(gpu_in));
-		HANDLE_ERROR(cudaFree(gpu_k0));
-		HANDLE_ERROR(cudaFree(gpu_k1));
-		HANDLE_ERROR(cudaFree(gpu_out));
-#else
-		T* temp = (T*)malloc(X * sy * sizeof(T));	//allocate space for the intermediate image
-		cpu_conv2(temp, in, k0, sx, sy, kx, 1);	//evaluate the intermediate image
-		cpu_conv2(out, temp, k1, X, sy, 1, ky);	//evaluate the final image
-		free(temp);	//free the memory for the intermediate image
-#endif
-	}
-}
-
-#endif
diff --git a/stim/math/filters/sepconv3.cuh b/stim/math/filters/sepconv3.cuh
new file mode 100644
index 0000000..7250d26
--- /dev/null
+++ b/stim/math/filters/sepconv3.cuh
@@ -0,0 +1,123 @@
+#ifndef STIM_CUDA_SEPCONV3_H
+#define STIM_CUDA_SEPCONV3_H
+
+#include
+#include
+#ifdef __CUDACC__
+	#include
+	#include
+#endif
+
+namespace stim
+{
+#ifdef __CUDACC__
+	template<typename T, typename K>
+	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+		size_t Z = dimz - kz + 1;
+
+		T* temp_out;
+		int idx_IN;
+		int idx_OUT;
+		HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
+
+		for(int i = 0; i < dimz; i++)
+		{
+			idx_IN = (dimx*dimy)*i-i;
+			idx_OUT = (X*Y)*i-i;
+			gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
+		}
+
+		cudaDeviceProp p;
+		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
+		size_t tmax = p.maxThreadsPerBlock;
+
+		dim3 numThreads(sqrt(tmax), sqrt(tmax));
+		dim3 numBlocks(X*Y/numThreads.x + 1, dimz/numThreads.y + 1);
+		size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
+		if(sharedMem > p.sharedMemPerBlock)
+		{
+			std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
+			exit(1);
+		}
+		kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
+		HANDLE_ERROR(cudaFree(temp_out));
+	}
+#endif
+
+	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
+	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1, kz-1)
+	//@param out is a pointer to the output image
+	//@param in is a pointer to the input image
+	//@param k0 is the x-axis convolution filter
+	//@param k1 is the y-axis convolution filter
+	//@param k2 is the z-axis convolution filter
+	//@param dimx is the size of the input image along X
+	//@param dimy is the size of the input image along Y
+	//@param dimz is the size of the input image along Z
+	//@param kx is the size of the kernel along X
+	//@param ky is the size of the kernel along Y
+	//@param kz is the size of the kernel along Z
+	template<typename T, typename K>
+	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
+	{
+		//Set up the sizes of the new output, which will be kx, ky, kz smaller than the input.
+		size_t X = dimx - kx + 1;
+		size_t Y = dimy - ky + 1;
+		size_t Z = dimz - kz + 1;
+
+#ifdef __CUDACC__
+		///Set up all of the memory on the GPU
+		T* gpu_in;
+		HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
+		HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T), cudaMemcpyHostToDevice));
+		K* gpu_kx;
+		HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_ky;
+		HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K), cudaMemcpyHostToDevice));
+		K* gpu_kz;
+		HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
+		HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K), cudaMemcpyHostToDevice));
+		T* gpu_out;
+		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z * sizeof(T)));
+
+		///run the kernel
+		gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
+
+		///Copy the output
+		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
+
+		///Free all the memory used.
+		HANDLE_ERROR(cudaFree(gpu_in));
+		HANDLE_ERROR(cudaFree(gpu_kx));
+		HANDLE_ERROR(cudaFree(gpu_ky));
+		HANDLE_ERROR(cudaFree(gpu_kz));
+		HANDLE_ERROR(cudaFree(gpu_out));
+#else
+		size_t idx_IN, idx_OUT;	//slice offsets into the input and intermediate volumes
+		T* temp = (T*) malloc(X * dimy * sizeof(T));	//row-convolved slice
+		T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));	//XY-convolved volume
+		for(int i = 0; i < dimz; i++)
+		{
+			idx_IN = (dimx*dimy)*i-i;
+			idx_OUT = (X*Y)*i-i;
+			cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1);
+			cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
+		}
+		cpu_conv2(out, temp3, k2, X*Y, dimz, 1, kz);	//convolve along Z, treating the volume as an (X*Y) x dimz image
+		free(temp);
+		free(temp3);
+
+#endif
+	}
+}
+
+
+#endif
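
A minimal host-side usage sketch for cpu_sepconv3 (not part of the patch): it builds three 1D Gaussian kernels with gauss1d from gauss2.cuh and blurs a small float volume. All sizes, standard deviations, and include paths are illustrative assumptions, and it presumes the stim headers are on the include path and the file is built with nvcc.

#include <vector>
#include "stim/math/filters/gauss2.cuh"	//for stim::gauss1d (path from this patch)
#include "stim/math/filters/sepconv3.cuh"

int main() {
	size_t dimx = 64, dimy = 64, dimz = 32;	//input volume size (example values)
	size_t kx = 7, ky = 7, kz = 5;	//kernel sizes
	size_t X = dimx - kx + 1, Y = dimy - ky + 1, Z = dimz - kz + 1;	//valid-region output size
	std::vector<float> in(dimx * dimy * dimz, 1.0f);	//placeholder volume
	std::vector<float> k0(kx), k1(ky), k2(kz);	//1D Gaussian kernels
	for (size_t i = 0; i < kx; i++) k0[i] = stim::gauss1d((float)i, kx / 2.0f, 1.5f);
	for (size_t i = 0; i < ky; i++) k1[i] = stim::gauss1d((float)i, ky / 2.0f, 1.5f);
	for (size_t i = 0; i < kz; i++) k2[i] = stim::gauss1d((float)i, kz / 2.0f, 1.0f);
	std::vector<float> out(X * Y * Z);
	stim::cpu_sepconv3(out.data(), in.data(), k0.data(), k1.data(), k2.data(),
		dimx, dimy, dimz, kx, ky, kz);	//separable 3D blur
	return 0;
}
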
diff --git a/stim/math/filters/sepconv3.h b/stim/math/filters/sepconv3.h
deleted file mode 100644
index 11f1b0d..0000000
--- a/stim/math/filters/sepconv3.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef STIM_CUDA_SEPCONV3_H
-#define STIM_CUDA_SEPCONV3_H
-
-#include
-#include
-#ifdef __CUDACC__
-	#include
-	#include
-#endif
-
-namespace stim
-{
-#ifdef __CUDACC__
-	template<typename T, typename K>
-	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
-	{
-		size_t X = dimx - kx + 1;
-		size_t Y = dimy - ky + 1;
-		size_t Z = dimz - kz + 1;
-
-		T* temp_out;
-		int idx_IN;
-		int idx_OUT;
-		HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));
-
-		for(int i = 0; i < dimz; i++)
-		{
-			idx_IN = (dimx*dimy)*i-i;
-			idx_OUT = (X*Y)*i-i;
-			gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
-		}
-
-		cudaDeviceProp p;
-		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
-		size_t tmax = p.maxThreadsPerBlock;
-
-		dim3 numThreads(sqrt(tmax), sqrt(tmax));
-		dim3 numBlocks(X*Y/numThreads.x + 1, dimz/numThreads.y + 1);
-		size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
-		if(sharedMem > p.sharedMemPerBlock)
-		{
-			std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
-			exit(1);
-		}
-		kernel_conv2 <<< numBlocks, numThreads, sharedMem >>> (out, temp_out, k2, X*Y, dimz, 1, kz);
-		HANDLE_ERROR(cudaFree(temp_out));
-	}
-#endif
-
-	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
-	//	As a result, the output image will be smaller than the input image by (kx-1, ky-1, kz-1)
-	//@param out is a pointer to the output image
-	//@param in is a pointer to the input image
-	//@param k0 is the x-axis convolution filter
-	//@param k1 is the y-axis convolution filter
-	//@param k2 is the z-axis convolution filter
-	//@param dimx is the size of the input image along X
-	//@param dimy is the size of the input image along Y
-	//@param dimz is the size of the input image along Z
-	//@param kx is the size of the kernel along X
-	//@param ky is the size of the kernel along Y
-	//@param kz is the size of the kernel along Z
-	template<typename T, typename K>
-	void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz)
-	{
-		//Set up the sizes of the new output, which will be kx, ky, kz smaller than the input.
-		size_t X = dimx - kx + 1;
-		size_t Y = dimy - ky + 1;
-		size_t Z = dimz - kz + 1;
-
-#ifdef __CUDACC__
-		///Set up all of the memory on the GPU
-		T* gpu_in;
-		HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T)));
-		HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T), cudaMemcpyHostToDevice));
-		K* gpu_kx;
-		HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K), cudaMemcpyHostToDevice));
-		K* gpu_ky;
-		HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K), cudaMemcpyHostToDevice));
-		K* gpu_kz;
-		HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K)));
-		HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K), cudaMemcpyHostToDevice));
-		T* gpu_out;
-		HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T)));
-
-		///run the kernel
-		gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz);
-
-		///Copy the output
-		HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost));
-
-		///Free all the memory used.
-		HANDLE_ERROR(cudaFree(gpu_in));
-		HANDLE_ERROR(cudaFree(gpu_kx));
-		HANDLE_ERROR(cudaFree(gpu_ky));
-		HANDLE_ERROR(cudaFree(gpu_kz));
-		HANDLE_ERROR(cudaFree(gpu_out));
-#else
-		T* temp = (T*) malloc(X * dimy * sizeof(T));
-		T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T));
-		for(int i = 0; i < dimz; i++)
-		{
-			idx_IN = (dimx*dimy)*i-i;
-			idx_OUT = (X*Y)*i-i;
-			cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1)
-			cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky);
-		}
-		cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz);
-		free(temp);
-		free(temp3);
-
-#endif
-	}
-}
-
-
-#endif
--
libgit2 0.21.4