Commit e1f4b2bc4cd43a6682791bbf75a7392d1037bbe8
Merged branch davar into master
Showing
1 changed file
with
366 additions
and
0 deletions
Show diff stats
1 | +//This software is derived from Professor Wei-keng Liao's parallel k-means | |
2 | +//clustering code obtained on November 21, 2010 from | |
3 | +// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html | |
4 | +//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz). | |
5 | +// | |
6 | +//With his permission, Serban Giuroiu is publishing his CUDA implementation based on his code | |
7 | +//under the open-source MIT license. See the LICENSE file for more details. | |
8 | + | |
9 | +// The original code can be found on Github ( https://github.com/serban/kmeans ) | |
10 | +// Here I have just made a few changes to get it to work | |
11 | + | |
12 | + | |
13 | + | |
14 | + | |
/* Allocate a contiguous 2D array: name[xDim][yDim] of `type`.
 * Two separate malloc()s back it: a row-pointer vector and one flat data
 * block, so row i is name[0] + i*yDim and the whole thing is freed with
 *     free(name[0]); free(name);
 * Fixes over the original: every macro argument is parenthesized so
 * expressions (e.g. `n + 1`) expand safely, and the flat-block size is
 * computed in size_t to avoid int overflow for large xDim*yDim. */
#define malloc2D(name, xDim, yDim, type) do {                               \
    (name) = (type **)malloc((xDim) * sizeof(type *));                      \
    assert((name) != NULL);                                                 \
    (name)[0] = (type *)malloc((size_t)(xDim) * (yDim) * sizeof(type));     \
    assert((name)[0] != NULL);                                              \
    for (size_t i = 1; i < (xDim); i++)                                     \
        (name)[i] = (name)[i-1] + (yDim);                                   \
} while (0)
23 | + | |
24 | + | |
25 | + | |
/* Abort with a readable message when a CUDA runtime call fails.
 *
 * error - status code returned by the CUDA runtime call
 * file  - source file of the failing call (pass __FILE__)
 * line  - line number of the failing call (pass __LINE__)
 *
 * Fix over the original: diagnostics go to stderr (std::cerr) rather than
 * stdout, so they are not lost or interleaved when stdout is redirected.
 * Fully qualified names remove the dependency on a using-directive. */
static void handleError(cudaError_t error, const char* file, int line){

    if(error != cudaSuccess){
        std::cerr << cudaGetErrorString(error)
                  << " in " << file << " at line " << line << std::endl;
        exit(1);
    }
}

/* Wrap every CUDA runtime call: handle_error(cudaMalloc(...)); */
#define handle_error(error) handleError(error, __FILE__ , __LINE__)
35 | + | |
36 | + | |
37 | + | |
/* Smallest power of two >= n (valid for 1 <= n <= 2^30 on 32-bit int).
 * Bit-smearing trick: after the OR cascade every bit below the highest set
 * bit of n-1 is set, so adding one rolls over to the next power of two.
 * Decrementing first keeps exact powers of two unchanged. */
static inline int nextPowerOfTwo(int n) {
    n -= 1;

    n |= n >> 1;
    n |= n >> 2;
    n |= n >> 4;
    n |= n >> 8;
    n |= n >> 16;
    // A 64-bit variant would additionally need: n |= n >> 32;

    return n + 1;
}
50 | + | |
/*----< euclid_dist_2() >----------------------------------------------------*/
/* Squared Euclidean distance between object `objectId` and cluster center
 * `clusterId`.  Both arrays are flattened, coordinate-major:
 *   objects  [numCoords][numObjs]
 *   clusters [numCoords][numClusters]
 * so adjacent threads (consecutive objectIds) read adjacent elements.
 * The square root is skipped: callers only compare distances, and x^2 is
 * monotone in x.
 * Fix over the original: the per-coordinate difference was computed twice;
 * it is now hoisted into a local, and the accumulator uses a float literal. */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    float ans = 0.0f;

    for (int i = 0; i < numCoords; i++) {
        float diff = objects[numObjs * i + objectId] -
                     clusters[numClusters * i + clusterId];
        ans += diff * diff;
    }

    return ans;
}
72 | + | |
/*----< find_nearest_cluster() >---------------------------------------------*/
/* One thread per object: assign object `objectId` to its closest cluster
 * center, then tree-reduce a per-block count of membership changes into
 * intermediates[blockIdx.x].
 *
 * Launch requirements (see cuda_kmeans()):
 *   - blockDim.x *must* be a power of two (for the tree reduction) and must
 *     not exceed 128: each block's change count is accumulated in an
 *     unsigned char, so a larger block could overflow 255.
 *   - Dynamic shared memory: blockDim.x bytes for membershipChanged, plus
 *     numClusters*numCoords floats when BLOCK_SHARED_MEM_OPTIMIZATION is on.
 *
 * Fix over the original: __syncthreads() and the reduction loop were inside
 * `if (objectId < numObjs)`, so tail threads of the last block skipped
 * barriers the other threads waited on -- undefined behavior.  All threads
 * now reach every barrier; tail threads contribute their zero-initialized
 * membershipChanged entries to the reduction. */
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects,           //  [numCoords][numObjs]
                          float *deviceClusters,    //  [numCoords][numClusters]
                          int *membership,          //  [numObjs]
                          int *intermediates)       //  [gridDim.x]
{
    extern __shared__ char sharedMemory[];

    // One change-flag byte per thread; tree-reduced below.
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // Cluster centers cached in shared memory, placed after the flag array.
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: this can overrun shared memory if there are too many clusters
    // or coordinates; the host checks sharedMemPerBlock before launching.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        /* find the cluster id that has min distance to object */
        int index = 0;
        float min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                       objects, clusters, objectId, 0);

        for (int i = 1; i < numClusters; i++) {
            float dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                       objects, clusters, objectId, i);
            /* no need for the square root: x^2 is monotone in x */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    __syncthreads();    // all threads, including the tail, must reach this

    // Tree reduction over the block; blockDim.x *must* be a power of two.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
154 | + | |
/* Sum the per-block change counts deviceIntermediates[0..numIntermediates)
 * and leave the grand total in deviceIntermediates[0].
 * Launched as a single block of numIntermediates2 threads, where
 * numIntermediates2 is the next power of two >= numIntermediates; the
 * tree reduction below relies on that power-of-two width. */
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    // The actual number of intermediates
                   int numIntermediates2)   // The next power of two
{
    // Shared scratch: numIntermediates2 entries, one per launched thread.
    // It *must* be a power of two in size.
    extern __shared__ unsigned int intermediates[];

    unsigned int tid = threadIdx.x;

    // Stage the global values into shared memory, zero-padding the tail.
    intermediates[tid] =
        (tid < numIntermediates) ? deviceIntermediates[tid] : 0;

    __syncthreads();

    // Power-of-two tree reduction (numIntermediates2 *must* be a power of two).
    for (unsigned int stride = numIntermediates2 / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            intermediates[tid] += intermediates[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}
183 | + | |
/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects         [numObjs][numCoords]
// clusters        [numClusters][numCoords]
// dimObjects      [numCoords][numObjs]
// dimClusters     [numCoords][numClusters]
// newClusters     [numCoords][numClusters]
// deviceObjects   [numCoords][numObjs]
// deviceClusters  [numCoords][numClusters]
// ----------------------------------------
//
/* GPU k-means driver: repeat (assign objects to nearest center on the
 * device, recompute centers on the host) until the fraction of objects that
 * changed membership drops to `threshold` or below, or `loops` iterations
 * have run.  The first numClusters objects seed the initial centers.
 *
 * objects    in:  [numObjs][numCoords] host data (not modified)
 * membership out: [numObjs] final cluster index of each object
 *
 * Returns a malloc2D'd [numClusters][numCoords] array of cluster centers;
 * the caller releases it with free(clusters[0]); free(clusters);
 *
 * Fixes over the original:
 *  - block size capped at 128 instead of props.maxThreadsPerBlock: the
 *    kernel accumulates each block's change count in an unsigned char, so
 *    a 1024-thread block could overflow 255 and under-report delta;
 *  - kernel launches are followed by cudaGetLastError(), and every runtime
 *    call (including cudaDeviceSynchronize) goes through handle_error();
 *  - device properties are queried once, for the *current* device, and used
 *    consistently for both the shared-memory and thread-limit checks. */
float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
                   unsigned int     numCoords,    /* no. features */
                   unsigned int     numObjs,      /* no. objects */
                   unsigned int     numClusters,  /* no. clusters */
                   float    threshold,    /* % objects change membership */
                   int    *membership,    /* out: [numObjs] */
                   int     loops)
{
    int      i, j, index, loop=0;
    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
                                new cluster */
    float    delta;          /* fraction of objects that changed clusters */
    float  **dimObjects;     /* [numCoords][numObjs]  transposed input */
    float  **clusters;       /* out: [numClusters][numCoords] */
    float  **dimClusters;    /* [numCoords][numClusters] current centers */
    float  **newClusters;    /* [numCoords][numClusters] running sums */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    //  Copy objects given in [numObjs][numCoords] layout to new
    //  [numCoords][numObjs] layout so device reads coalesce.
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[]: -1 forces every object to count as "changed"
       on the first iteration */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the in-kernel tree reduction, numThreadsPerClusterBlock
    // *must* be a power of two.  It must also be <= 128: each block's
    // membership-change count lives in an unsigned char inside the kernel,
    // and a larger block could overflow 255 (props.maxThreadsPerBlock is
    // typically 1024, which silently overflowed).
    const unsigned int numThreadsPerClusterBlock = 128;
    const unsigned int numClusterBlocks =          /* ceil-div, no fp math */
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

    // Query the current device once; used for all capability checks below.
    cudaDeviceProp props;
    int deviceNum;
    handle_error(cudaGetDevice(&deviceNum));
    handle_error(cudaGetDeviceProperties(&props, deviceNum));

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    /* flag bytes + a shared-memory copy of the cluster centers */
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
        std::cout << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    // compute_delta runs in a single block, so the reduction width cannot
    // exceed the per-block thread limit.
    assert(numReductionThreads <= (unsigned int)props.maxThreadsPerBlock);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        /* push the current centers; they change every iteration */
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaGetLastError());       /* launch-config errors */
        handle_error(cudaDeviceSynchronize());  /* async execution errors */

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        /* total number of objects that changed membership this iteration */
        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of the nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        // TODO: Flip the nesting order
        // TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sums into the new centers, then reset accumulators */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0f;   /* set back to 0 */
            }
            newClusterSize[i] = 0;   /* set back to 0 */
        }

        delta /= numObjs;   /* count -> fraction of objects */
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposing back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}