#ifndef STIM_CUDA_CHI_GRAD_H
#define STIM_CUDA_CHI_GRAD_H

//NOTE: the original #include targets were lost in extraction; the headers below
//      are the ones this code needs (the stim cudatools header is assumed to
//      provide stim::maxThreadsPerBlock(), stim::sharedMemPerBlock(), and HANDLE_ERROR)
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stim/cuda/cudatools.h>

#define PI 3.14159265358979

namespace stim{
	namespace cuda{

		/// template parameter @param T is the data type
		template<typename T>
		__global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){

			double theta_r = ((theta) * PI)/180;			//convert the angle from degrees to radians
			double ct = cos(theta_r);						//precompute the edge direction
			double st = sin(theta_r);
			float sum = 0;

			//convert the 1D thread index to 2D pixel coordinates
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			int yj = blockIdx.y;
			int idx = yj * w + xi;
			int shareidx = threadIdx.x;

			//each thread gets its own bin_n-element slice of shared memory, so no
			//__syncthreads() is required anywhere in this kernel (the original calls
			//were also invalid inside divergent branches and have been removed)
			extern __shared__ unsigned short bin[];

			if(xi < w && yj < h){

				int gidx;
				int hidx;

				//initialize this thread's histogram bins to zero
				for(unsigned int i = 0; i < bin_n; i++){
					bin[shareidx * bin_n + i] = 0;
				}

				//build the histogram of the first half of the disc
				for (int y = yj - r; y <= yj + r; y++){
					for (int x = xi - r; x <= xi + r; x++){
						if ((y - yj)*ct + (x - xi)*st > 0){
							//the caller must guarantee that bin_n * bin_size covers the intensity range
							gidx = (int) tex2D<float>(texObj, (float)x/w, (float)y/h)/bin_size;
							bin[shareidx * bin_n + gidx]++;
						}
					}
				}

				//copy the first-half histogram out of shared memory into gbin
				unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
				for (unsigned int gi = 0; gi < bin_n; gi++){
					gbin[gi] = bin[shareidx * bin_n + gi];
				}

				//re-initialize the histogram bins to zero
				for(unsigned int j = 0; j < bin_n; j++){
					bin[shareidx * bin_n + j] = 0;
				}

				//build the histogram of the second half of the disc
				for (int y = yj - r; y <= yj + r; y++){
					for (int x = xi - r; x <= xi + r; x++){
						if ((y - yj)*ct + (x - xi)*st < 0){
							hidx = (int) tex2D<float>(texObj, (float)x/w, (float)y/h)/bin_size;
							bin[shareidx * bin_n + hidx]++;
						}
					}
				}

				//copy the second-half histogram out of shared memory into hbin
				unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
				for (unsigned int hi = 0; hi < bin_n; hi++){
					hbin[hi] = bin[shareidx * bin_n + hi];
				}

				//compare gbin and hbin and accumulate the chi-squared distance
				for (unsigned int k = 0; k < bin_n; k++){
					float flag;									//guard against a zero denominator
					if ((gbin[k] + hbin[k]) == 0){
						flag = 1;
					}
					else {
						flag = (gbin[k] + hbin[k]);
					}
					sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag;
				}

				//store the chi-squared distance for this pixel
				copy[idx] = sum;

				free(gbin);
				free(hbin);
			}
		}
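		//For reference, the per-pixel output is the chi-squared distance between
		//the intensity histograms g and h of the two half-discs:
		//
		//    chi2(g, h) = sum_k (g_k - h_k)^2 / (g_k + h_k)    (terms with g_k + h_k = 0 contribute 0)
		//
		//The function below is a minimal single-pixel CPU sketch of that computation,
		//added purely to document the kernel's logic; it is not part of the original
		//header. The name cpu_chi_pixel is hypothetical, it skips out-of-image samples
		//instead of mirroring them the way the texture fetch does, and it clamps
		//out-of-range bins as an extra assumption.
		template<typename T>
		float cpu_chi_pixel(const T* img, unsigned int w, unsigned int h,
							int xi, int yj, int r,
							unsigned int bin_n, unsigned int bin_size, float theta){
			double theta_r = theta * PI / 180;
			double ct = cos(theta_r), st = sin(theta_r);
			unsigned short* g = new unsigned short[bin_n]();		//zero-initialized histograms
			unsigned short* hst = new unsigned short[bin_n]();
			for (int y = yj - r; y <= yj + r; y++){
				for (int x = xi - r; x <= xi + r; x++){
					if (x < 0 || x >= (int)w || y < 0 || y >= (int)h) continue;
					double side = (y - yj)*ct + (x - xi)*st;		//which half of the disc holds this sample?
					unsigned int b = (unsigned int)(img[y * w + x] / bin_size);
					if (b >= bin_n) b = bin_n - 1;					//clamp (assumption; the kernel does not)
					if (side > 0) g[b]++;
					else if (side < 0) hst[b]++;
				}
			}
			float sum = 0;
			for (unsigned int k = 0; k < bin_n; k++){
				float d = (float)g[k] - (float)hst[k];
				float denom = (float)(g[k] + hst[k]);
				if (denom > 0) sum += d * d / denom;
			}
			delete[] g;
			delete[] hst;
			return sum;
		}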
		template<typename T>
		void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){

			unsigned long N = w * h;

			// Allocate a CUDA array in device memory
			cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);	//define a channel descriptor for a single 32-bit channel
			cudaArray* cuArray;												//declare the cuda array
			cudaMallocArray(&cuArray, &channelDesc, w, h);					//allocate the cuda array

			// Copy the image data from global memory to the array
			// (cudaMemcpyToArray is deprecated in newer CUDA releases; cudaMemcpy2DToArray is the modern replacement)
			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);

			// Specify the texture resource
			struct cudaResourceDesc resDesc;								//create a resource descriptor
			memset(&resDesc, 0, sizeof(resDesc));							//set all values to zero
			resDesc.resType = cudaResourceTypeArray;						//specify the resource descriptor type
			resDesc.res.array.array = cuArray;								//add a pointer to the cuda array

			// Specify texture object parameters
			struct cudaTextureDesc texDesc;									//create a texture descriptor
			memset(&texDesc, 0, sizeof(texDesc));							//set all values in the texture descriptor to zero
			texDesc.addressMode[0] = cudaAddressModeMirror;					//mirror the image at the edges
			texDesc.addressMode[1] = cudaAddressModeMirror;
			texDesc.filterMode = cudaFilterModePoint;						//use point (nearest-neighbor) sampling
			texDesc.readMode = cudaReadModeElementType;						//read data based on the element type (32-bit floats)
			texDesc.normalizedCoords = 1;									//use normalized coordinates

			// Create the texture object
			cudaTextureObject_t texObj = 0;
			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();
			//get the amount of shared memory available per block (in bytes)
			int sharemax = stim::sharedMemPerBlock();

			//each thread needs bin_n histogram bins in shared memory
			unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short);

			if(shared_bytes > (unsigned int)sharemax){
				std::cout << "Error: shared_bytes exceeds the shared memory available per block." << '\n';
				exit(1);
			}

			//calculate the number of blocks
			dim3 blocks(w / threads + 1, h);

			//call the kernel to compute the per-pixel chi-squared distance
			cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);

			//clean up the texture object and the array once the kernel has finished
			cudaDeviceSynchronize();
			cudaDestroyTextureObject(texObj);
			cudaFreeArray(cuArray);
		}

		template<typename T>
		void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){

			unsigned long N = w * h;

			//allocate memory on the GPU for the image and the result
			T* gpu_img;
			T* gpu_copy;
			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );

			//copy the image to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_chi_grad(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);

			//copy the result back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_img);
			cudaFree(gpu_copy);
		}
	}
}

#endif
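/* Example usage (a hypothetical host-side sketch, not part of the original
   header; `image`, `result`, and the parameter values are illustrative, and the
   input is assumed to be a grayscale float image with values in [0, bin_n * bin_size)):

	#include <vector>

	unsigned int w = 512, h = 512;
	std::vector<float> image(w * h);	//input image (fill with pixel data)
	std::vector<float> result(w * h);	//receives the per-pixel chi-squared distance
	//disc radius 5, 16 bins of width 16, half-discs split along a 45-degree edge
	stim::cuda::cpu_chi_grad(image.data(), result.data(), w, h, 5, 16, 16, 45.0f);
*/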