Commit 7f02091aaf8a5310eba59b3ab6158a809d9d7311
1 parent
efbb4bf7
added cuda kmeans clustering
Showing
1 changed file
with
353 additions
and
0 deletions
Show diff stats
1 | + | |
/*
 * malloc2D: allocate a contiguous 2-D array of `type` with dimensions
 * [xDim][yDim] and assign the row-pointer table to `name`.
 * The payload is one contiguous slab reachable via name[0]; release with
 *     free(name[0]); free(name);
 * BUG FIX: macro parameters are now parenthesized so expression arguments
 * such as `n + 1` expand correctly; the slab size is computed in size_t to
 * avoid int overflow for large xDim*yDim products.
 */
#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc((xDim) * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc((size_t)(xDim) * (yDim) * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < (size_t)(xDim); i++) \
        name[i] = name[i-1] + (yDim); \
} while (0)
10 | + | |
11 | + | |
12 | + | |
/*
 * Abort the program with a diagnostic when a CUDA API call has failed.
 * `file` and `line` identify the call site (supplied by the handle_error
 * macro below).  Does nothing when error == cudaSuccess.
 *
 * BUG FIX: the message now goes to stderr (it previously went to stdout
 * via `cout`, so redirecting program output swallowed the diagnostic).
 */
static void handleError(cudaError_t error, const char* file, int line){

    if(error != cudaSuccess){
        std::cerr << cudaGetErrorString(error) << " in " << file
                  << " at line " << line << std::endl;
        exit(1);
    }
}

// Wraps handleError so every call site automatically reports its own
// source file and line number.
#define handle_error(error) handleError(error, __FILE__ , __LINE__)
22 | + | |
23 | + | |
24 | + | |
/*
 * Round n up to the next power of two; a value that is already a power of
 * two is returned unchanged (e.g. 5 -> 8, 16 -> 16, 1 -> 1).
 *
 * Classic bit-smearing trick: after the OR cascade every bit below the
 * highest set bit is 1, so incrementing yields a power of two.
 *
 * BUG FIX: the arithmetic is now done in unsigned int.  The previous
 * signed version right-shifted a negative value when n <= 0, which is
 * implementation-defined in C++.  Results for n >= 1 are identical, and
 * n == 0 still yields 0 (0-1 wraps to all-ones, +1 wraps back to 0).
 */
static inline int nextPowerOfTwo(int n) {
    unsigned int v = (unsigned int)n - 1u;

    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    // v |= v >> 32; // For 64-bit ints

    return (int)(v + 1u);
}
37 | + | |
/*----< euclid_dist_2() >----------------------------------------------------*/
/* Squared Euclidean distance between object `objectId` and cluster center
 * `clusterId`.  Both arrays are coordinate-major: coordinate c of object o
 * lives at objects[c * numObjs + o], and coordinate c of cluster k lives at
 * clusters[c * numClusters + k].  The square root is deliberately omitted:
 * callers only compare distances, and sqrt preserves ordering. */
__host__ __device__ inline static
float euclid_dist_2(int numCoords,
                    int numObjs,
                    int numClusters,
                    float *objects, // [numCoords][numObjs]
                    float *clusters, // [numCoords][numClusters]
                    int objectId,
                    int clusterId)
{
    float sum = 0.0f;

    for (int coord = 0; coord < numCoords; coord++) {
        float diff = objects[coord * numObjs + objectId]
                   - clusters[coord * numClusters + clusterId];
        sum += diff * diff;
    }

    return sum;
}
59 | + | |
/*----< find_nearest_cluster() >---------------------------------------------*/
/* One thread per object: assign each object to its nearest cluster center
 * and reduce a per-block count of membership changes into
 * intermediates[blockIdx.x].
 *
 * Launch requirements:
 *   - blockDim.x must be a power of two (shared-memory tree reduction);
 *   - dynamic shared memory: blockDim.x bytes for membershipChanged, plus
 *     numClusters * numCoords floats when BLOCK_SHARED_MEM_OPTIMIZATION
 *     is defined.
 */
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects, // [numCoords][numObjs]
                          float *deviceClusters, // [numCoords][numClusters]
                          int *membership, // [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // blockDim.x is a power of two >= 32, so the float region stays aligned.
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // BUG FIX: the barriers below used to sit inside the
    // `objectId < numObjs` guard.  In the final (partial) block the
    // out-of-range threads skipped them while in-range threads waited —
    // undefined behavior.  __syncthreads() must be reached by every thread
    // in the block, so the reduction now runs unconditionally (tail threads
    // contribute their initialized 0).
    __syncthreads(); // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
141 | + | |
/* Single-block tree reduction: sum the per-block change counts in
 * deviceIntermediates[0 .. numIntermediates) and leave the grand total in
 * deviceIntermediates[0].  Must be launched as ONE block of exactly
 * numIntermediates2 threads (a power of two), with
 * numIntermediates2 * sizeof(unsigned int) bytes of dynamic shared memory. */
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates, // The actual number of intermediates
                   int numIntermediates2) // The next power of two
{
    // Shared staging area; must hold numIntermediates2 entries, one per
    // launched thread.
    extern __shared__ unsigned int partialSums[];

    unsigned int tid = threadIdx.x;

    // Stage global values in shared memory; threads past the real element
    // count contribute a neutral 0 so the tree stays balanced.
    partialSums[tid] = (tid < numIntermediates) ? deviceIntermediates[tid] : 0;

    __syncthreads();

    // Pairwise reduction, halving the active range each pass.  Relies on
    // numIntermediates2 being a power of two.
    for (unsigned int stride = numIntermediates2 / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            partialSums[tid] += partialSums[tid + stride];
        __syncthreads();
    }

    if (tid == 0)
        deviceIntermediates[0] = partialSums[0];
}
170 | + | |
/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects [numObjs][numCoords]
// clusters [numClusters][numCoords]
// dimObjects [numCoords][numObjs]
// dimClusters [numCoords][numClusters]
// newClusters [numCoords][numClusters]
// deviceObjects [numCoords][numObjs]
// deviceClusters [numCoords][numClusters]
// ----------------------------------------
//
/* Run k-means on the GPU.  Returns a newly allocated array of cluster
 * centers of size [numClusters][numCoords]; the caller owns it and releases
 * it with free(result[0]); free(result).  membership[] receives the final
 * cluster index of every object.  Iterates until the fraction of objects
 * that changed cluster drops to `threshold` or `loops` iterations elapse. */
float** cuda_kmeans(float **objects, /* in: [numObjs][numCoords] */
                    unsigned int numCoords, /* no. features */
                    unsigned int numObjs, /* no. objects */
                    unsigned int numClusters, /* no. clusters */
                    float threshold, /* % objects change membership */
                    int *membership, /* out: [numObjs] */
                    int loops)
{
    int i, j, index, loop=0;
    int *newClusterSize; /* [numClusters]: no. objects assigned in each
                            new cluster */
    float delta; /* % of objects change their clusters */
    float **dimObjects;
    float **clusters; /* out: [numClusters][numCoords] */
    float **dimClusters;
    float **newClusters; /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout.  Coordinate-major lets adjacent kernel
    // threads read adjacent elements (coalesced global loads).
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[]: -1 guarantees every object "changes" cluster
       on the first pass */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the shared-memory reduction, numThreadsPerClusterBlock
    // *must* be a power of two, and because the kernel accumulates
    // per-thread change flags in unsigned char (partial sums can reach
    // blockDim.x), it must not exceed 255.
    //
    // BUG FIX: this used to be props.maxThreadsPerBlock (1024 on modern
    // GPUs), which silently overflowed the unsigned char reduction and
    // corrupted delta.  128 is the largest always-safe power of two.
    const unsigned int numThreadsPerClusterBlock = 128;
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

    cudaDeviceProp props;
    handle_error(cudaGetDeviceProperties(&props, 0));

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    // Reuse the properties fetched above (a second
    // cudaGetDeviceProperties call here was redundant).
    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
        std::cout << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    // compute_delta runs as a single block of numReductionThreads threads;
    // fail loudly instead of making an invalid launch when the input is
    // large enough to exceed the device's per-block thread limit.
    if (numReductionThreads > (unsigned int)props.maxThreadsPerBlock) {
        std::cout << "ERROR: too many objects for the single-block delta reduction" << endl;
        exit(1);
    }

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        // Kernel launches do not return errors directly: cudaGetLastError
        // catches bad launch configurations, and the (now checked)
        // synchronize surfaces asynchronous execution faults.
        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        // TODO: Flip the nesting order
        // TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sum and replace old cluster centers with newClusters;
           empty clusters keep their previous center */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0; /* set back to 0 */
            }
            newClusterSize[i] = 0; /* set back to 0 */
        }

        delta /= numObjs;
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposed back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}