#ifndef GA_GPU_CU
#define GA_GPU_CU

//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#include <iostream>
#include <fstream>
#include <cmath>
#include <chrono>
#include "timer.h"   // Timer; NOTE(review): HANDLE_ERROR is used below but not visibly
                     // declared -- presumably it comes from timer.h or a CUDA helper
                     // header that was stripped from this file. TODO confirm.

extern Timer timer;  // global timer shared with the rest of the application

/// Compute the between-class scatter matrix Sb (p x f x f) for every gnome.
///
/// Expected launch layout: x and y index the (i, j) entry of the f x f scatter
/// matrix, z indexes the gnome. gpuSb must be zeroed before launch because the
/// result is accumulated with +=.
///
/// @param gpuSb destination for the scatter matrices (p * f * f floats, pre-zeroed)
/// @param gpuP population matrix (p gnomes, f selected feature indices each)
/// @param gpuM global mean of the feature matrix (ub floats)
/// @param gpuCM per-class mean matrix (nC * ub floats)
/// @param ub number of features (bands) in the original feature matrix
/// @param f number of selected features per gnome
/// @param p population size
/// @param nC number of classes
/// @param gpu_nPxInCls number of pixels in each class (nC entries)
__global__ void kernel_computeSb(float* gpuSb, unsigned int* gpuP, float* gpuM, float* gpuCM,
                                 size_t ub, size_t f, size_t p, size_t nC,
                                 unsigned int* gpu_nPxInCls){
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;         // row index of the f x f scatter matrix
    size_t j = blockIdx.y * blockDim.y + threadIdx.y;         // column index of the f x f scatter matrix
    size_t gnomeIndx = blockIdx.z * blockDim.z + threadIdx.z; // gnome index in the population

    if(gnomeIndx >= p || i >= f || j >= f) return;            // guard: grid may overshoot the data

    // The selected feature indices do not depend on the class, so read them from
    // global memory once instead of once per class iteration.
    size_t n1 = gpuP[gnomeIndx * f + i]; // actual feature index in original feature matrix
    size_t n2 = gpuP[gnomeIndx * f + j]; // actual feature index in original feature matrix

    // Accumulate per-class contributions locally, then add the sum to Sb once:
    //   Sb[i][j] += nPx(c) * (classMean - mean)[n1] * (classMean - mean)[n2]
    // This thread is the only writer of its (gnome, i, j) cell, so the local
    // accumulator is equivalent to the per-class global += and touches global
    // memory only once.
    float sbAcc = 0.0f;
    for(size_t c = 0; c < nC; c++){
        size_t classIndx = c * ub;  // offset of class c inside the class-mean matrix
        sbAcc += ((gpuCM[classIndx + n1] - gpuM[n1]) * (gpuCM[classIndx + n2] - gpuM[n2]))
                 * (float)gpu_nPxInCls[c];
    }
    gpuSb[gnomeIndx * f * f + j * f + i] += sbAcc;
}

/// Compute the within-class scatter matrix Sw (p x f x f) for every gnome from
/// the full feature matrix gpuF (tP x ub).
///
/// Same launch layout as kernel_computeSb; gpuSw must be pre-zeroed because the
/// result is accumulated with +=.
///
/// @param gpuSw destination for the scatter matrices (p * f * f floats, pre-zeroed)
/// @param gpuP population matrix (p gnomes, f selected feature indices each)
/// @param gpuCM per-class mean matrix (nC * ub floats)
/// @param gpuF full feature matrix on the GPU (tP * ub floats)
/// @param gpuT class label of each pixel; labels are 1-based (compared to c+1)
/// @param ub number of features (bands) in the original feature matrix
/// @param f number of selected features per gnome
/// @param p population size
/// @param nC number of classes
/// @param tP total number of pixels (training samples)
__global__ void kernel_computeSw(float* gpuSw, unsigned int* gpuP, float* gpuCM, float* gpuF,
                                 unsigned int* gpuT, size_t ub, size_t f, size_t p,
                                 size_t nC, size_t tP){
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;         // row index of the f x f scatter matrix
    size_t j = blockIdx.y * blockDim.y + threadIdx.y;         // column index of the f x f scatter matrix
    size_t gnomeIndx = blockIdx.z * blockDim.z + threadIdx.z; // gnome index in the population

    if(gnomeIndx >= p || i >= f || j >= f) return;            // guard: grid may overshoot the data

    size_t n1 = gpuP[gnomeIndx * f + i]; // actual feature index in original feature matrix
    size_t n2 = gpuP[gnomeIndx * f + j]; // actual feature index in original feature matrix

    // Sum (x - classMean) outer-product terms over every pixel of every class.
    // As in kernel_computeSb, this thread owns its output cell, so a local
    // accumulator with one final global += is equivalent to the per-class +=.
    float swAcc = 0.0f;
    for(size_t c = 0; c < nC; c++){
        for(size_t k = 0; k < tP; k++){
            if(gpuT[k] == (c + 1)){ // pixel k belongs to class c (labels are 1-based)
                swAcc += (gpuF[k * ub + n1] - gpuCM[c * ub + n1])
                       * (gpuF[k * ub + n2] - gpuCM[c * ub + n2]);
            }
        }
    }
    gpuSw[gnomeIndx * f * f + j * f + i] += swAcc;
}

//=============================gpu intialization=============================================
/// Allocate all GPU buffers used by the GA-GPU algorithm and upload the constant
/// inputs (class means, global mean, class sizes, feature matrix, labels).
/// The population buffer and both scatter buffers are allocated but not filled;
/// gpucomputeSbSw uploads/zeroes them every generation.
/// (The name keeps the original "Intialization" spelling so existing callers
/// keep working.)
/// @param gpuP [out] device population buffer (p * f unsigned ints, allocated only)
/// @param p is the population size
/// @param f is the number of desired features
/// @param gpuCM [out] device class-mean matrix, filled from cpuCM
/// @param cpuCM class means on the CPU (nC * ub floats)
/// @param nC number of classes
/// @param ub number of features (bands) in the original feature matrix
/// @param gpuM [out] device global mean, filled from cpuM
/// @param cpuM global mean of the feature matrix on the CPU (ub floats)
/// @param gpu_nPxInCls [out] device per-class pixel counts, filled from cpu_nPxInCls
/// @param gpuSb [out] device between-class scatter buffer (p * f * f floats, allocated only)
/// @param gpuSw [out] device within-class scatter buffer (p * f * f floats, allocated only)
/// @param gpuF [out] device feature matrix, filled from cpuF
/// @param cpuF the complete feature matrix on the CPU (tP * ub floats)
/// @param gpuT [out] device label vector, filled from cpuT
/// @param cpuT per-pixel class labels on the CPU (tP entries)
/// @param tP total number of pixels (training samples)
/// @param cpu_nPxInCls per-class pixel counts on the CPU (nC entries)
void gpuIntialization(unsigned int** gpuP, size_t p, size_t f, //variables required for the population allocation
                      float** gpuCM, float* cpuCM, size_t nC, unsigned int ub,
                      float** gpuM, float* cpuM,
                      unsigned int** gpu_nPxInCls,
                      float** gpuSb, float** gpuSw,
                      float** gpuF, float* cpuF,
                      unsigned int** gpuT, unsigned int* cpuT, size_t tP,
                      unsigned int* cpu_nPxInCls){
    // population: allocated only; contents change every generation
    HANDLE_ERROR(cudaMalloc(gpuP, p * f * sizeof(unsigned int)));
    // class mean matrix: allocate and upload
    HANDLE_ERROR(cudaMalloc(gpuCM, nC * ub * sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(*gpuCM, cpuCM, nC * ub * sizeof(float), cudaMemcpyHostToDevice));
    // global mean of the feature matrix: allocate and upload
    HANDLE_ERROR(cudaMalloc(gpuM, ub * sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(*gpuM, cpuM, ub * sizeof(float), cudaMemcpyHostToDevice));
    // number of pixels in each class: allocate and upload
    HANDLE_ERROR(cudaMalloc(gpu_nPxInCls, nC * sizeof(unsigned int)));
    HANDLE_ERROR(cudaMemcpy(*gpu_nPxInCls, cpu_nPxInCls, nC * sizeof(unsigned int), cudaMemcpyHostToDevice));
    // scatter matrices: each class's contribution is accumulated into these by the kernels,
    // so they are zeroed (in gpucomputeSbSw) rather than uploaded
    HANDLE_ERROR(cudaMalloc(gpuSb, p * f * f * sizeof(float)));
    HANDLE_ERROR(cudaMalloc(gpuSw, p * f * f * sizeof(float)));
    // feature matrix and per-pixel labels: allocate and upload
    HANDLE_ERROR(cudaMalloc(gpuF, tP * ub * sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(*gpuF, cpuF, tP * ub * sizeof(float), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMalloc(gpuT, tP * sizeof(unsigned int)));
    HANDLE_ERROR(cudaMemcpy(*gpuT, cpuT, tP * sizeof(unsigned int), cudaMemcpyHostToDevice));
}

//computation on GPU
/// Compute the between-class (Sb) and within-class (Sw) scatter matrices for the
/// whole population on the GPU and copy the results back to the host, timing and
/// reporting both phases during the final generation.
/// @param gpuP device population buffer; this function uploads cpuP into it
/// @param cpuP the population on the CPU (p * f feature indices)
/// @param p is the population size
/// @param f is the number of desired features
/// @param gpuSb is a GPU memory pointer to the between-class scatter matrices
/// @param cpuSb host destination for the between-class scatter matrices (this function copies the GPU result there)
/// @param gpuSw is a GPU memory pointer to the within-class scatter matrices
/// @param cpuSw host destination for the within-class scatter matrices (this function copies the GPU result there)
/// @param gpuCM device class-mean matrix
/// @param gpuM device global mean of the feature matrix
/// @param gpu_nPxInCls device per-class pixel counts
///
/// @param gpuF device feature matrix (uploaded at initialization)
/// @param cpuF is the complete feature matrix on the CPU
/// @param props device properties used to size the launch configuration
/// @param gen current generation; @param gnrtn total generations (timing is
///        printed only when gen > gnrtn - 2, i.e. the final generation)
/// @param profilefile stream receiving the timing report
void gpucomputeSbSw(unsigned int* gpuP, unsigned int* cpuP, size_t p, size_t f,
                    float* gpuSb, float* cpuSb,
                    float* gpuSw, float* cpuSw,
                    float* gpuF, unsigned int* gpuT, float* gpuM, float* gpuCM,
                    size_t nC, size_t tP, cudaDeviceProp props,
                    size_t gen, size_t gnrtn, size_t ub,
                    unsigned int* gpu_nPxInCls, std::ofstream& profilefile){
    timer.start();
    // upload the current population and zero Sb (the kernel accumulates with +=)
    HANDLE_ERROR(cudaMemcpy(gpuP, cpuP, p * f * sizeof(unsigned int), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemset(gpuSb, 0, p * f * f * sizeof(float)));

    // Grid configuration: a square block tiles the (i, j) feature pair of the
    // f x f scatter matrix; the Z dimension of the grid covers all p gnomes.
    size_t threads = (size_t)sqrt((double)props.maxThreadsPerBlock);
    if(threads > f) threads = f;
    size_t numberofblocksfor_f = (size_t)ceil((float)f / threads); // ceil-div so the grid covers all f
    dim3 blockdim((int)threads, (int)threads, 1);
    dim3 griddim((int)numberofblocksfor_f, (int)numberofblocksfor_f, (int)p);

    // Dynamic shared-memory request, clamped to what the device allows.
    // NOTE(review): neither kernel declares extern __shared__ storage, so this
    // request is currently unused by the kernels ("copy population to shared
    // memory" appears to be unfinished work) -- kept for compatibility; confirm
    // before removing.
    size_t sharedBytes = p * f * sizeof(unsigned int);
    if(props.sharedMemPerBlock < sharedBytes) sharedBytes = props.sharedMemPerBlock;

    // launch kernel to compute the between-class scatter matrix
    kernel_computeSb<<<griddim, blockdim, sharedBytes>>>(gpuSb, gpuP, gpuM, gpuCM, ub, f, p, nC, gpu_nPxInCls);
    HANDLE_ERROR(cudaGetLastError());      // catch bad launch configurations
    HANDLE_ERROR(cudaDeviceSynchronize()); // surface asynchronous kernel faults
    // copy the between-class scatter from GPU to CPU
    HANDLE_ERROR(cudaMemcpy(cpuSb, gpuSb, p * f * f * sizeof(float), cudaMemcpyDeviceToHost));

    const auto elapsedg1 = timer.time_elapsed();
    if(gen > gnrtn - 2){ // report timing only for the final generation
        std::cout << "Sb gpu time " << std::chrono::duration_cast<std::chrono::microseconds>(elapsedg1).count() << "us" << std::endl;
        profilefile << "Sb gpu time " << std::chrono::duration_cast<std::chrono::microseconds>(elapsedg1).count() << "us" << std::endl;
    }

    timer.start();
    // zero Sw, then launch the kernel to compute the within-class scatter matrix
    HANDLE_ERROR(cudaMemset(gpuSw, 0, p * f * f * sizeof(float)));
    kernel_computeSw<<<griddim, blockdim, sharedBytes>>>(gpuSw, gpuP, gpuCM, gpuF, gpuT, ub, f, p, nC, tP);
    HANDLE_ERROR(cudaGetLastError());
    HANDLE_ERROR(cudaDeviceSynchronize());
    // copy the within-class scatter from GPU to CPU
    HANDLE_ERROR(cudaMemcpy(cpuSw, gpuSw, p * f * f * sizeof(float), cudaMemcpyDeviceToHost));

    const auto elapsedg2 = timer.time_elapsed();
    if(gen > gnrtn - 2){
        std::cout << "Sw gpu time " << std::chrono::duration_cast<std::chrono::microseconds>(elapsedg2).count() << "us" << std::endl;
        profilefile << "Sw gpu time " << std::chrono::duration_cast<std::chrono::microseconds>(elapsedg2).count() << "us" << std::endl;
    }
}

/// Release every GPU allocation created by gpuIntialization.
/// Safe to call once at shutdown; pointers are not nulled afterwards.
void gpuDestroy(unsigned int* gpuP, float* gpuCM, float* gpuM, unsigned int* gpu_nPxInCls,
                float* gpuSb, float* gpuSw, float* gpuF, unsigned int* gpuT){
    HANDLE_ERROR(cudaFree(gpuP));
    HANDLE_ERROR(cudaFree(gpuCM));
    HANDLE_ERROR(cudaFree(gpuM));
    HANDLE_ERROR(cudaFree(gpu_nPxInCls));
    HANDLE_ERROR(cudaFree(gpuSb));
    HANDLE_ERROR(cudaFree(gpuSw));
    HANDLE_ERROR(cudaFree(gpuF));
    HANDLE_ERROR(cudaFree(gpuT));
}

#endif // GA_GPU_CU