#ifndef STIM_CUDA_SEPCONV3_H
#define STIM_CUDA_SEPCONV3_H

// NOTE(review): the angle-bracket contents of these #include directives and the
// template parameter lists were stripped during text extraction; they are restored
// here from the standard stim-lib layout -- confirm against the repository.
#include <stim/math/filters/sepconv2.cuh>
#include <stim/math/filters/conv2.cuh>
#ifdef __CUDACC__
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#endif

namespace stim {
#ifdef __CUDACC__
	// Performs a "valid"-mode separable 3D convolution entirely on the GPU.
	// All pointers are device pointers; the output volume is
	// (dimx-kx+1) x (dimy-ky+1) x (dimz-kz+1).
	//@param out is a device pointer to the output image
	//@param in is a device pointer to the input image
	//@param k0 is a device pointer to the x-axis convolution filter
	//@param k1 is a device pointer to the y-axis convolution filter
	//@param k2 is a device pointer to the z-axis convolution filter
	//@param dimx, dimy, dimz are the sizes of the input image along X, Y, Z
	//@param kx, ky, kz are the sizes of the kernel along X, Y, Z
	template<typename T, typename K>
	void gpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) {
		size_t X = dimx - kx + 1;							//valid-region output size along each axis
		size_t Y = dimy - ky + 1;
		size_t Z = dimz - kz + 1;
		(void)Z;											//Z is implied by the final kernel_conv2 launch (dimz - kz + 1)
		T* temp_out;										//intermediate volume: every z-slice convolved along x and y
		int idx_IN;
		int idx_OUT;
		HANDLE_ERROR(cudaMalloc(&temp_out, X*Y*dimz*sizeof(T)));

		//convolve each z-slice with the separable 2D (x, y) filter
		for (int i = 0; i < dimz; i++) {
			// NOTE(review): the "-i" terms look suspect -- slice i of a dimx*dimy
			// image normally starts at (dimx*dimy)*i. Preserved from the original;
			// confirm against gpu_sepconv2's expected layout before changing.
			idx_IN = (dimx*dimy)*i - i;
			idx_OUT = (X*Y)*i - i;
			gpu_sepconv2(&temp_out[idx_OUT], &in[idx_IN], k0, k1, dimx, dimy, kx, ky);
		}

		//filter along z by treating the volume as a 2D image of width X*Y and height dimz
		cudaDeviceProp p;
		HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
		size_t tmax = p.maxThreadsPerBlock;
		dim3 numThreads(sqrt(tmax), sqrt(tmax));			//square block that saturates the device thread limit
		dim3 numBlocks(X*Y / numThreads.x + 1, dimz / numThreads.y + 1);
		//shared tile holds the block's rows plus a (kz - 1) halo for the kernel overlap
		size_t sharedMem = (numThreads.x + kz - 1) * numThreads.y * sizeof(T);
		if (sharedMem > p.sharedMemPerBlock) {
			std::cout << "Error in stim::gpu_sepconv3() - insufficient shared memory for this kernel." << std::endl;
			exit(1);
		}
		kernel_conv2<<<numBlocks, numThreads, sharedMem>>>(out, temp_out, k2, X*Y, dimz, 1, kz);
		HANDLE_ERROR(cudaFree(temp_out));					//release the intermediate volume
	}
#endif
	//Performs a separable convolution of a 3D image. Only valid pixels based on the kernel are returned.
// As a result, the output image will be smaller than the input image by (kx-1, ky-1 , kz-1) //@param out is a pointer to the output image //@param in is a pointer to the input image //@param kx is the x-axis convolution filter //@param ky is the y-axis convolution filter //@param kz is the z-axis convolution filter //@param dimx is the size of the input image along X //@param dimy is the size of the input image along Y //@param dimz is the size of the input image along Z //@param kx is the size of the kernel along X //@param ky is the size of the kernel along Y //@param kz is the size of the kernel along Z template void cpu_sepconv3(T* out, T* in, K* k0, K* k1, K* k2, size_t dimx, size_t dimy, size_t dimz, size_t kx, size_t ky, size_t kz) { //Set up the sizes of the new output, which will be kx, ky, kz, smaller than the i nput. size_t X = dimx - kx + 1; size_t Y = dimy - ky + 1; size_t Z = dimz - kz + 1; #ifdef __CUDACC__ ///Set up all of the memory on the GPU T* gpu_in; HANDLE_ERROR(cudaMalloc(&gpu_in, dimx*dimy*dimz*sizeof(T))); HANDLE_ERROR(cudaMemcpy(gpu_in, in, dimx*dimy*dimz*sizeof(T),cudaMemcpyHostToDevice)); K* gpu_kx; HANDLE_ERROR(cudaMalloc(&gpu_kx, kx*sizeof(K))); HANDLE_ERROR(cudaMemcpy(gpu_kx, k0, kx*sizeof(K),cudaMemcpyHostToDevice)); K* gpu_ky; HANDLE_ERROR(cudaMalloc(&gpu_ky, ky*sizeof(K))); HANDLE_ERROR(cudaMemcpy(gpu_ky, k1, ky*sizeof(K),cudaMemcpyHostToDevice)); K* gpu_kz; HANDLE_ERROR(cudaMalloc(&gpu_kz, kz*sizeof(K))); HANDLE_ERROR(cudaMemcpy(gpu_kz, k2, kz*sizeof(K),cudaMemcpyHostToDevice)); T* gpu_out; HANDLE_ERROR(cudaMalloc(&gpu_out, X * Y * Z*sizeof(T))); ///run the kernel gpu_sepconv3(gpu_out, gpu_in, gpu_kx, gpu_ky, gpu_kz, dimx, dimy, dimz, kx, ky, kz); ///Copy the output HANDLE_ERROR(cudaMemcpy(out, gpu_out, X*Y*Z*sizeof(T), cudaMemcpyDeviceToHost)); ///Free all the memory used. 
HANDLE_ERROR(cudaFree(gpu_in)); HANDLE_ERROR(cudaFree(gpu_kx)); HANDLE_ERROR(cudaFree(gpu_ky)); HANDLE_ERROR(cudaFree(gpu_kz)); HANDLE_ERROR(cudaFree(gpu_out)); #else T* temp = (T*) malloc(X * dimy * sizeof(T)); T* temp3 = (T*) malloc(X * Y * dimz * sizeof(T)); for(int i = 0; i < dimz; i++) { idx_IN = (dimx*dimy)*i-i; idx_OUT = (X*Y)*i-i; cpu_conv2(temp, &in[idx_IN], k0, dimx, dimy, kx, 1) cpu_conv2(&temp3[idx_OUT], temp, k1, X, dimy, 1, ky); } cpu_conv2(out, temp, k2, X*Y, dimz, 1, kz); free(temp); free(temp3); #endif } } #endif