stim/cuda/templates/chi_gradient.cuh

#ifndef STIM_CUDA_CHI_GRAD_H
#define STIM_CUDA_CHI_GRAD_H
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stim/cuda/sharedmem.cuh>
#include <cmath>
#include <algorithm>
#define PI 3.14159265358979
namespace stim{
	namespace cuda{
		/// template parameter @param T is the data type
		template<typename T>
		__global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
			double theta_r = ((theta) * PI)/180; //change angle unit from degree to rad
			float sum = 0;
			unsigned int N = w * h;
			//change 1D index to 2D cordinates
			int xi = blockIdx.x * blockDim.x + threadIdx.x;
			int yj = blockIdx.y;
			int idx = yj * w + xi;
			int shareidx = threadIdx.x;
			
			extern __shared__ unsigned short bin[];
			if(xi < w && yj < h){
				int gidx;
				int hidx;
				//initialize histogram bin to zeros
				for(int i = 0; i < bin_n; i++){      
			
				bin[shareidx * bin_n + i] = 0;
				__syncthreads();
				}
				
				//get the histogram of the first half of disc and store in bin
				for (int y = yj - r; y <= yj + r; y++){
					for (int x = xi - r; x <= xi + r; x++){
						
							if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) > 0){
								gidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
								__syncthreads();
								bin[shareidx * bin_n + gidx]++;
								__syncthreads();
							}
							else{}
					}		
				}
				//initiallize the gbin
				unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
				memset (gbin, 0, bin_n*sizeof(unsigned short));  
				//copy the histogram to gbin
				for (unsigned int gi = 0; gi < bin_n; gi++){
					gbin[gi] = bin[shareidx * bin_n + gi];
				
				}
				//initialize histogram bin to zeros
				for(int j = 0; j < bin_n; j++){      //initialize histogram bin to zeros
			
				bin[shareidx * bin_n + j] = 0;
				__syncthreads();
				}
				//get the histogram of the second half of disc and store in bin
				for (int y = yj - r; y <= yj + r; y++){
					for (int x = xi - r; x <= xi + r; x++){
						
							if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) < 0){
								hidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
								__syncthreads();
								bin[shareidx * bin_n + hidx]++;
								__syncthreads();	
							}
							else{}
					}		
				}
				//initiallize the gbin
				unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
				memset (hbin, 0, bin_n*sizeof(unsigned short));      
				//copy the histogram to hbin
				for (unsigned int hi = 0; hi < bin_n; hi++){
					hbin[hi] = bin[shareidx * bin_n + hi];
				
				}
				//compare gbin, hbin and calculate the chi distance
				for (int k = 0; k < bin_n; k++){
					float flag;              // set flag to avoid zero denominator
					
					if ((gbin[k] + hbin[k]) == 0){
						flag = 1;
					}
					else {
						flag = (gbin[k] + hbin[k]);
						__syncthreads();
					}
					sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag;
					__syncthreads();
				}
				// return chi-distance for each pixel
				copy[idx] = sum;
				free(gbin);
				free(hbin);
			 }
		}
		
		template<typename T>
		void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
			unsigned long N = w * h;
			// Allocate CUDA array in device memory
			
			//define a channel descriptor for a single 32-bit channel
			cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
			cudaArray* cuArray;												//declare the cuda array
			cudaMallocArray(&cuArray, &channelDesc, w, h);			//allocate the cuda array
			// Copy the image data from global memory to the array
			cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);
			// Specify texture
			struct cudaResourceDesc resDesc;				//create a resource descriptor
			memset(&resDesc, 0, sizeof(resDesc));			//set all values to zero
			resDesc.resType = cudaResourceTypeArray;		//specify the resource descriptor type
			resDesc.res.array.array = cuArray;				//add a pointer to the cuda array
			// Specify texture object parameters
			struct cudaTextureDesc texDesc;							//create a texture descriptor
			memset(&texDesc, 0, sizeof(texDesc));					//set all values in the texture descriptor to zero
			texDesc.addressMode[0]   = cudaAddressModeMirror;			//use wrapping (around the edges)
			texDesc.addressMode[1]   = cudaAddressModeMirror;
			texDesc.filterMode       = cudaFilterModePoint;			//use linear filtering
			texDesc.readMode         = cudaReadModeElementType;		//reads data based on the element type (32-bit floats)
			texDesc.normalizedCoords = 1;							//using normalized coordinates
			// Create texture object
			cudaTextureObject_t texObj = 0;
			cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();
			int sharemax = stim::sharedMemPerBlock();                   //get the size of Shared memory available per block in bytes
			unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short);
			if(threads * bin_n > sharemax){
				cout <<"Error: shared_bytes exceeds the max value."<<'\n';
				exit(1);
			
			}
			
			//calculate the number of blocks
			dim3 blocks(w / threads + 1, h);
			//call the kernel to do the multiplication
			cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);
		}
		template<typename T>
		void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
			
			unsigned long N = w * h;
			//allocate memory on the GPU for the array
			T* gpu_img; 
			T* gpu_copy;
			HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
			//copy the array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
			//call the GPU version of this function
			gpu_chi_grad<T>(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);
			//copy the array back to the CPU
			HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
			//free allocated memory
			cudaFree(gpu_img);
			cudaFree(gpu_copy);
		}
		
	}
}
#endif