#ifndef STIM_CUDA_CONV2_H
#define STIM_CUDA_CONV2_H

// NOTE(review): the original #include targets were destroyed by extraction
// (angle-bracket content stripped). The list below covers every name this file
// uses (std::memset via cudatools, HANDLE_ERROR, stim::maxThreadsPerBlock,
// CUDA runtime API) — confirm against the repository's other headers.
#include <iostream>
#include <cmath>
#include <algorithm>
#include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
namespace cuda{

	// 2D convolution kernel: computes one output pixel per thread.
	//
	// Launch layout (set by gpu_conv2): gridDim.x * blockDim.x covers the image
	// width, gridDim.y == h covers the height (one row of blocks per image row).
	//
	// mask   - M x M convolution mask in device global memory (row-major)
	// copy   - output image (w x h, row-major) in device global memory
	// texObj - texture bound to the source image; must use normalized
	//          coordinates, point filtering, and clamp addressing (edge pixels
	//          are handled by the clamp — no explicit bounds test on x/y)
	// w, h   - image width and height in pixels
	// M      - mask edge length; assumed odd so the radius (M-1)/2 is centered
	template<typename T>
	__global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){

		//the radius of the (square, odd-sized) mask
		int r = (M - 1) / 2;

		//2D image coordinates for this thread
		int i = blockIdx.x * blockDim.x + threadIdx.x;
		int j = blockIdx.y;
		int idx = j * w + i;				//1D index into the output image

		//guard against the partial block at the right edge of the image
		if(i < w && j < h){

			T sum = 0;

			//iterate over the mask footprint centered on (i, j); reads that
			//fall outside the image are clamped by the texture address mode
			for (int y = j - r; y <= j + r; y++){
				for (int x = i - r; x <= i + r; x++){

					//convert image coordinates to mask coordinates
					int xx = x - (i - r);
					int yy = y - (j - r);

					//sample at the texel CENTER: with normalized coordinates
					//and point filtering, x/w lands on the texel boundary and
					//can fetch the neighboring texel, so offset by half a texel
					sum += tex2D<T>(texObj, (x + 0.5f) / (float)w, (y + 0.5f) / (float)h) * mask[yy * M + xx];
				}
			}

			copy[idx] = sum;
		}
	}

	// Convolves a device-resident image with a device-resident mask.
	//
	// img  - source image (w x h) in device global memory
	// mask - M x M mask in device global memory
	// copy - output image (w x h) in device global memory
	//
	// The image is staged into a CUDA array and read through a texture object
	// so edge clamping comes for free. NOTE(review): the channel descriptor
	// hard-codes single-channel 32-bit float texels, so this effectively
	// requires T == float — confirm intended instantiations.
	template<typename T>
	void gpu_conv2(T* img, T* mask, T* copy, unsigned int w, unsigned int h, unsigned M){

		//channel descriptor for a single 32-bit float channel
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
		cudaArray* cuArray;														//declare the CUDA array
		HANDLE_ERROR( cudaMallocArray(&cuArray, &channelDesc, w, h) );			//allocate the CUDA array

		//copy the image (already in device memory) into the array;
		//cudaMemcpy2DToArray replaces the deprecated cudaMemcpyToArray
		HANDLE_ERROR( cudaMemcpy2DToArray(cuArray, 0, 0, img, w * sizeof(T), w * sizeof(T), h, cudaMemcpyDeviceToDevice) );

		//resource descriptor pointing at the CUDA array
		struct cudaResourceDesc resDesc;
		memset(&resDesc, 0, sizeof(resDesc));
		resDesc.resType = cudaResourceTypeArray;
		resDesc.res.array.array = cuArray;

		//texture descriptor: clamp at the image edges, nearest-neighbor (point)
		//sampling, raw element reads, normalized [0,1) coordinates
		struct cudaTextureDesc texDesc;
		memset(&texDesc, 0, sizeof(texDesc));
		texDesc.addressMode[0] = cudaAddressModeClamp;
		texDesc.addressMode[1] = cudaAddressModeClamp;
		texDesc.filterMode = cudaFilterModePoint;
		texDesc.readMode = cudaReadModeElementType;
		texDesc.normalizedCoords = 1;

		//create the texture object
		cudaTextureObject_t texObj = 0;
		HANDLE_ERROR( cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL) );

		//launch configuration: one thread per pixel; blockDim.x threads cover
		//the width, gridDim.y covers the height
		int threads = stim::maxThreadsPerBlock();
		dim3 blocks((w + threads - 1) / threads, h);	//ceil-div: no extra block when w divides evenly

		//run the convolution kernel
		cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M);
		HANDLE_ERROR( cudaGetLastError() );				//catch launch-configuration errors

		//the launch is asynchronous: wait for the kernel before destroying the
		//texture and array it reads from (the original leaked both)
		HANDLE_ERROR( cudaDeviceSynchronize() );
		HANDLE_ERROR( cudaDestroyTextureObject(texObj) );
		HANDLE_ERROR( cudaFreeArray(cuArray) );
	}

	// Host-side entry point: convolves a host image with a host mask, leaving
	// the result in host memory.
	//
	// img      - source image (w x h) in host memory
	// mask     - M x M mask in host memory
	// cpu_copy - output buffer (w x h) in host memory
	template<typename T>
	void cpu_conv2(T* img, T* mask, T* cpu_copy, unsigned int w, unsigned int h, unsigned M){

		unsigned long N = w * h;

		//allocate device memory for the image, mask, and result
		T* gpu_img;
		T* gpu_mask;
		T* gpu_copy;
		HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
		HANDLE_ERROR( cudaMalloc( &gpu_mask, M * M * sizeof(T) ) );		//M*M (exact integer) instead of pow(M,2) (double round-trip)
		HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );

		//copy the inputs to the GPU
		HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
		HANDLE_ERROR( cudaMemcpy( gpu_mask, mask, M * M * sizeof(T), cudaMemcpyHostToDevice) );

		//run the GPU convolution
		gpu_conv2<T>(gpu_img, gpu_mask, gpu_copy, w, h, M);

		//copy the result back to the CPU (blocking copy — synchronizes)
		HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );

		//free allocated device memory
		cudaFree(gpu_img);
		cudaFree(gpu_mask);
		cudaFree(gpu_copy);
	}

}
}

#endif