Commit e1f4b2bc4cd43a6682791bbf75a7392d1037bbe8
Merged branch davar into master
Showing 1 changed file with 366 additions and 0 deletions.
//This software is derived from Professor Wei-keng Liao's parallel k-means
2 | +//clustering code obtained on November 21, 2010 from | ||
3 | +// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html | ||
4 | +//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz). | ||
5 | +// | ||
6 | +//With his permission, Serban Giuroiu is publishing his CUDA implementation based on his code | ||
7 | +//under the open-source MIT license. See the LICENSE file for more details. | ||
8 | + | ||
9 | +// The original code can be found on Github ( https://github.com/serban/kmeans ) | ||
10 | +// Here I have just made a few changes to get it to work | ||
11 | + | ||
12 | + | ||
13 | + | ||
14 | + | ||
// Allocate a 2D array of `type` with dimensions [xDim][yDim]: a row-pointer
// table plus ONE contiguous backing buffer, so both name[i][j] indexing and
// flat access through name[0] (e.g. memset / cudaMemcpy over the whole
// array) work. Aborts via assert() if either malloc fails.
// Free with: free(name[0]); free(name);
#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc(xDim * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < xDim; i++) \
        name[i] = name[i-1] + yDim; \
} while (0)
23 | + | ||
24 | + | ||
25 | + | ||
// Print a readable message for a failed CUDA runtime call and abort.
//
//   error      status code returned by a CUDA runtime API call
//   file/line  call site, captured by the handle_error() macro below
//
// Diagnostics go to stderr (not stdout) so they are visible even when the
// program's normal output is redirected, and the names are std::-qualified
// so this block does not depend on a `using namespace std`.
static void handleError(cudaError_t error, const char* file, int line){

    if(error != cudaSuccess){
        std::cerr << cudaGetErrorString(error)
                  << " in " << file << " at line " << line << std::endl;
        exit(1);
    }
}

// Wrap every CUDA runtime call: handle_error(cudaMalloc(...));
#define handle_error(error) handleError(error, __FILE__ , __LINE__)
35 | + | ||
36 | + | ||
37 | + | ||
// Round n up to the nearest power of two (for 32-bit ints; n = 16 -> 16,
// n = 17 -> 32). Classic bit-smearing trick: decrement, OR the highest set
// bit into every lower position, then increment.
static inline int nextPowerOfTwo(int n) {
    n--;

    // Fold the top bit down into all lower bits: after shifts of
    // 1, 2, 4, 8, 16 every bit below the highest set bit is set.
    // (A shift of 32 would be needed for 64-bit ints.)
    for (int shift = 1; shift <= 16; shift <<= 1) {
        n |= n >> shift;
    }

    // n is now 2^k - 1; the final increment yields 2^k.
    return n + 1;
}
50 | + | ||
/*----< euclid_dist_2() >----------------------------------------------------*/
/* Square of the Euclidean distance between object objectId and cluster
 * center clusterId. Both arrays are stored coordinate-major so that, on the
 * device, consecutive threads (consecutive objectIds) read consecutive
 * addresses (coalesced access).
 *
 *   objects   [numCoords][numObjs]      flattened
 *   clusters  [numCoords][numClusters]  flattened
 *
 * The square root is deliberately skipped: callers only compare distances.
 */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    int i;
    float ans = 0.0f;   // float literal: avoid silent promotion to double

    for (i = 0; i < numCoords; i++) {
        // Compute the per-coordinate difference once instead of twice.
        float diff = objects[numObjs * i + objectId] -
                     clusters[numClusters * i + clusterId];
        ans += diff * diff;
    }

    return(ans);
}
72 | + | ||
/*----< find_nearest_cluster() >---------------------------------------------*/
// One thread per object: assign each object to its nearest cluster center,
// then tree-reduce the per-thread "membership changed" flags in shared
// memory and write the block's change count to intermediates[blockIdx.x].
//
// Launch requirements (see cuda_kmeans()):
//   - blockDim.x is a power of two and <= 255 (flags are summed into an
//     unsigned char, so the block-wide sum must fit in one)
//   - dynamic shared memory: blockDim.x bytes, plus the cluster array when
//     BLOCK_SHARED_MEM_OPTIMIZATION is defined
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects,           //  [numCoords][numObjs]
                          float *deviceClusters,    //  [numCoords][numClusters]
                          int *membership,          //  [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index    = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // BUG FIX: these barriers and the reduction used to live INSIDE the
    // `if (objectId < numObjs)` guard. __syncthreads() must be reached by
    // every thread in the block; when numObjs is not a multiple of
    // blockDim.x, the tail threads skipped the barrier -> undefined
    // behavior. Out-of-range threads left their flag at 0 above, so they
    // are neutral for the sum.
    __syncthreads();    // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
154 | + | ||
// Single-block kernel: sum the per-block change counts produced by
// find_nearest_cluster() and leave the grand total in
// deviceIntermediates[0].
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    // The actual number of intermediates
                   int numIntermediates2)   // The next power of two
{
    // Shared scratch for the tree reduction. The launch sizes this to
    // numIntermediates2 unsigned ints — one per thread — and that count
    // *must* be a power of two.
    extern __shared__ unsigned int intermediates[];

    unsigned int tid = threadIdx.x;

    // Stage the global partials into shared memory, zero-padding the slots
    // beyond the real data so they are neutral for the sum.
    intermediates[tid] = (tid < numIntermediates) ? deviceIntermediates[tid] : 0;
    __syncthreads();

    // Halving tree reduction (relies on numIntermediates2 being a power
    // of two).
    for (unsigned int half = numIntermediates2 / 2; half > 0; half /= 2) {
        if (tid < half)
            intermediates[tid] += intermediates[tid + half];
        __syncthreads();
    }

    // Thread 0 publishes the total back to global memory.
    if (tid == 0)
        deviceIntermediates[0] = intermediates[0];
}
183 | + | ||
/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects            [numObjs][numCoords]
// clusters           [numClusters][numCoords]
// dimObjects         [numCoords][numObjs]
// dimClusters        [numCoords][numClusters]
// newClusters        [numCoords][numClusters]
// deviceObjects      [numCoords][numObjs]
// deviceClusters     [numCoords][numClusters]
// ----------------------------------------
//
/* Run k-means on the GPU: iterate until fewer than `threshold` (fraction)
 * of the objects change membership, or `loops` iterations have run.
 * membership[] (out) receives each object's final cluster index.
 * Returns a newly allocated array of cluster centers,
 * size [numClusters][numCoords]; caller frees it via free(ret[0]); free(ret).
 */
float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
                   unsigned int  numCoords,    /* no. features */
                   unsigned int  numObjs,      /* no. objects */
                   unsigned int  numClusters,  /* no. clusters */
                   float    threshold,    /* % objects change membership */
                   int     *membership,   /* out: [numObjs] */
                   int      loops)
{
    int      i, j, index, loop=0;
    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
                                new cluster */
    float    delta;          /* % of objects change their clusters */
    float  **dimObjects;     /* [numCoords][numObjs], transposed input copy */
    float  **clusters;       /* out: [numClusters][numCoords] */
    float  **dimClusters;    /* [numCoords][numClusters] */
    float  **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout so consecutive threads (consecutive
    // objectIds) make coalesced global-memory accesses in the kernel.
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[]: -1 == not yet assigned, so every object
       counts as "changed" on the first iteration */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the in-kernel reduction, numThreadsPerClusterBlock *must*
    // be a power of two, AND — because the per-thread change flags are
    // summed into an `unsigned char` — the block-wide sum (at most
    // blockDim.x) must fit in one, i.e. blockDim.x <= 255.
    // BUG FIX: this used to be props.maxThreadsPerBlock (1024 on most
    // GPUs), which silently overflowed the unsigned char reduction and
    // corrupted delta. Use 128, as in the original upstream code.
    const unsigned int numThreadsPerClusterBlock = 128;
    // Integer ceiling division: enough blocks to cover every object.
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    /* change flags + a per-block copy of all cluster centers */
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    cudaDeviceProp deviceProp;
    int deviceNum;
    handle_error(cudaGetDevice(&deviceNum));
    handle_error(cudaGetDeviceProperties(&deviceProp, deviceNum));

    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
        std::cerr << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    /* compute_delta runs as a single block; pad the thread count up to a
       power of two so its tree reduction works */
    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        /* upload the cluster centers computed by the previous iteration */
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaGetLastError());       /* catch bad launch config */
        handle_error(cudaDeviceSynchronize());  /* catch async kernel faults */

        /* sum the per-block change counts into deviceIntermediates[0] */
        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        // TODO: Flip the nesting order
        // TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sum and replace old cluster centers with newClusters;
           empty clusters keep their previous center */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0f;   /* set back to 0 */
            }
            newClusterSize[i] = 0;  /* set back to 0 */
        }

        delta /= numObjs;   /* change count -> fraction of all objects */
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposed back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}