ga_gpu.cu
9.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#ifndef GA_GPU_CU
#define GA_GPU_CU
//#include <cuda.h>
//#include "cuda_runtime.h"
//#include <cuda_runtime_api.h>
//#include "device_launch_parameters.h"
#include <stim/cuda/cudatools/error.h>
#include "timer.h"
//#include <stdio.h>
//#include <stdlib.h>
#include <iostream>
#include <fstream>
extern Timer timer;
__global__ void kernel_computeSb(float* gpuSb, unsigned int* gpuP, float* gpuM, float* gpuCM, size_t ub, size_t f, size_t p, size_t nC, unsigned int* gpu_nPxInCls){
size_t i = blockIdx.x * blockDim.x + threadIdx.x; //gnomeindex in population matrix
size_t j = blockIdx.y * blockDim.y + threadIdx.y; //index of feature index from gnome
size_t gnomeIndx = blockIdx.z * blockDim.z + threadIdx.z; //if we use 3d grid then it is needed
if(gnomeIndx >= p || i >= f || j >= f) return; //handling segmentation fault
//form a sb matrix from vector sbVec, multiply each element in matrix with num of pixels in the current class
//and add it to previous value of between class scatter matrix sb
float tempsbval;
size_t n1;
size_t n2;
size_t classIndx; //class index in class mean matrix
for(size_t c = 0; c < nC; c++){
tempsbval = 0;
classIndx = c * ub;
n1 = gpuP[gnomeIndx * f + i]; //actual feature index in original feature matrix
n2 = gpuP[gnomeIndx * f + j]; //actual feature index in original feature matrix
tempsbval = ((gpuCM[classIndx + n1] - gpuM[n1]) *(gpuCM[classIndx + n2] - gpuM[n2])) * (float)gpu_nPxInCls[c] ;
gpuSb[gnomeIndx * f * f + j * f + i] += tempsbval;
}
}
//Compute within class scatter sw (p x f x f) of all gnome features phe(tP x f)
__global__ void kernel_computeSw(float* gpuSw, unsigned int* gpuP, float* gpuCM, float* gpuF, unsigned int* gpuT, size_t ub, size_t f, size_t p, size_t nC, size_t tP){
size_t i = blockIdx.x * blockDim.x + threadIdx.x; //gnomeindex in population matrix
size_t j = blockIdx.y * blockDim.y + threadIdx.y; //index of feature index from gnome
size_t gnomeIndx = blockIdx.z * blockDim.z + threadIdx.z; //total number of individuals
if(gnomeIndx >= p || i >= f || j >= f) return; //handling segmentation fault
float tempswval;
size_t n1 = gpuP[gnomeIndx * f + i]; //actual feature index in original feature matrix
size_t n2 = gpuP[gnomeIndx * f + j]; //actual feature index in original feature matrix
tempswval = 0;
for(size_t c = 0; c < nC; c++){
tempswval = 0;
for(size_t k = 0; k < tP; k++){
if(gpuT[k] == (c+1) ){
tempswval += ((gpuF[ k * ub + n1] - gpuCM[c * ub + n1]) * (gpuF[k * ub + n2] - gpuCM[c * ub + n2]));
}
}
gpuSw[gnomeIndx * f * f + j * f + i] += tempswval;
}
}
//=============================gpu intialization=============================================
/// Initialize all GPU pointers used in the GA-GPU algorithm
/// @param gpuP is a pointer to GPU memory location, will point to memory space allocated for the population
/// @param p is the population size
/// @param f is the number of desired features
/// @param gpuCM is a pointer to a GPU memory location, will point to the class mean
/// @param cpuM is a pointer to the class mean on the CPU
/// @param gpu_nPxInCls is a pointer to a GPU memory location storing the number of pixels in each class
/// @param gpu_nPxInCls is a CPU array storing the number of pixels in each class
/// @param gpuSb is a GPU memory pointer to the between-class scatter matrices
/// @param gpuSw is a GPU memory pointer to the within-class scatter matrices
/// @param gpuF is the destination for the GPU feature matrix
/// @param cpuF is the complete feature matrix on the CPU
void gpuIntialization(unsigned int** gpuP, size_t p, size_t f, //variables required for the population allocation
float** gpuCM, float* cpuCM, size_t nC, unsigned int ub,
float** gpuM, float* cpuM, unsigned int** gpu_nPxInCls,
float** gpuSb, float** gpuSw,
float** gpuF, float* cpuF,
unsigned int** gpuT, unsigned int* cpuT, size_t tP, unsigned int* cpu_nPxInCls){
HANDLE_ERROR(cudaMalloc(gpuP, p * f * sizeof(unsigned int))); //allocate space for the population on the GPU
HANDLE_ERROR(cudaMalloc(gpuCM, nC * ub * sizeof(float))); //allocate space for the class mean and copy it to the GPU
HANDLE_ERROR(cudaMemcpy(*gpuCM, cpuCM, nC * ub * sizeof(float), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMalloc(gpuM, ub * sizeof(float))); //allocate space for the mean of the feature matrix
HANDLE_ERROR(cudaMemcpy(*gpuM, cpuM, ub * sizeof(float), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMalloc(gpu_nPxInCls, nC * sizeof(unsigned int))); //number of pixels in each class
HANDLE_ERROR(cudaMemcpy(*gpu_nPxInCls, cpu_nPxInCls, nC * sizeof(unsigned int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMalloc(gpuSb, p * f * f * sizeof(float))); //allocate memory for sb which is calculated for eery class separately and added together in different kernel
HANDLE_ERROR(cudaMalloc(gpuSw, p * f * f * sizeof(float)));
HANDLE_ERROR(cudaMalloc(gpuF, tP * ub * sizeof(float)));
HANDLE_ERROR(cudaMemcpy(*gpuF, cpuF, tP * ub * sizeof(float), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMalloc(gpuT, tP * sizeof(unsigned int)));
HANDLE_ERROR(cudaMemcpy(*gpuT, cpuT, tP* sizeof(unsigned int), cudaMemcpyHostToDevice));
}
//computation on GPU
/// Initialize all GPU pointers used in the GA-GPU algorithm
/// @param gpuP is a pointer to GPU memory location, will point to memory space allocated for the population
/// @param p is the population size
/// @param f is the number of desired features
/// @param gpuSb is a GPU memory pointer to the between-class scatter matrices
/// @param cpuSb is the between-class scatter matrix on the GPU (this function will copy the GPU result there)
/// @param gpuSw is a GPU memory pointer to the within-class scatter matrices
/// @param cpuSw is the within-class scatter matrix on the GPU (this function will copy the GPU result there)
/// @param gpuCM is a pointer to a GPU memory location, will point to the class mean
/// @param cpuM is a pointer to the class mean on the CPU
/// @param gpu_nPxInCls is a pointer to a GPU memory location storing the number of pixels in each class
/// @param gpu_nPxInCls is a CPU array storing the number of pixels in each class
/// @param gpuF is the destination for the GPU feature matrix
/// @param cpuF is the complete feature matrix on the CPU
void gpucomputeSbSw(unsigned int* gpuP, unsigned int* cpuP, size_t p, size_t f,
float* gpuSb, float* cpuSb,
float* gpuSw, float* cpuSw,
float* gpuF, unsigned int* gpuT,float* gpuM, float* gpuCM,
size_t nC, size_t tP, cudaDeviceProp props, size_t gen, size_t gnrtn, size_t ub, unsigned int* gpu_nPxInCls, std::ofstream& profilefile){
timer.start();
HANDLE_ERROR(cudaMemcpy(gpuP, cpuP, p * f * sizeof(unsigned int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemset(gpuSb, 0, p * f * f * sizeof(float)));
//grid configuration of GPU
size_t threads = (size_t)sqrt(props.maxThreadsPerBlock);
if(threads > f) threads = f;
size_t numberofblocksfor_f = (size_t)ceil((float)f/ threads);
dim3 blockdim((int)threads, (int)threads, 1);
dim3 griddim((int)numberofblocksfor_f, (int)numberofblocksfor_f, (int)p); //X dimension blocks will cover all gnomes of the population and each block will have as many gnomes as it can feet
//sharedbytes calculation
size_t sharedBytes = p * f * sizeof(unsigned int); //copy population to shared memory
if(props.sharedMemPerBlock < sharedBytes) sharedBytes = props.sharedMemPerBlock;
//launch kernel to compute sb matrix
kernel_computeSb<<<griddim, blockdim, sharedBytes>>>(gpuSb, gpuP, gpuM, gpuCM, ub, f, p, nC, gpu_nPxInCls);
cudaDeviceSynchronize();
HANDLE_ERROR(cudaMemcpy(cpuSb, gpuSb, p * f * f * sizeof(float), cudaMemcpyDeviceToHost)); //copy between class scatter from gpu to cpu
const auto elapsedg1 = timer.time_elapsed();
if(gen > gnrtn -2){
std::cout << "Sb gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg1).count() << "us" << std::endl;
profilefile << "Sb gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg1).count() << "us" << std::endl;
}
timer.start();
//Compute within class scatter
HANDLE_ERROR(cudaMemset(gpuSw, 0, p * f * f * sizeof(float)));
//launch kernel to compute sb matrix
kernel_computeSw<<<griddim, blockdim>>>(gpuSw, gpuP, gpuCM, gpuF, gpuT, ub, f, p, nC, tP);
cudaDeviceSynchronize();
//copy between class scatter from gpu to cpu
HANDLE_ERROR(cudaMemcpy(cpuSw, gpuSw, p * f * f * sizeof(float), cudaMemcpyDeviceToHost));
const auto elapsedg2 = timer.time_elapsed();
if(gen > gnrtn - 2){
std::cout << "Sw gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg2).count() << "us" << std::endl;
profilefile<< "Sw gpu time "<<std::chrono::duration_cast<std::chrono::microseconds>(elapsedg2).count() << "us" << std::endl;
}
}
//free all gpu pointers
void gpuDestroy(unsigned int* gpuP, float* gpuCM, float* gpuM, unsigned int* gpu_nPxInCls, float* gpuSb, float* gpuSw, float* gpuF, unsigned int* gpuT){
HANDLE_ERROR(cudaFree(gpuP));
HANDLE_ERROR(cudaFree(gpuCM));
HANDLE_ERROR(cudaFree(gpuM));
HANDLE_ERROR(cudaFree(gpu_nPxInCls));
HANDLE_ERROR(cudaFree(gpuSb));
HANDLE_ERROR(cudaFree(gpuSw));
HANDLE_ERROR(cudaFree(gpuF));
HANDLE_ERROR(cudaFree(gpuT));
}
#endif