Commit 7f02091aaf8a5310eba59b3ab6158a809d9d7311 (parent: efbb4bf7)
"added cuda kmeans clustering"
Showing 1 changed file with 353 additions and 0 deletions.
1 | + | ||
/* Allocate a contiguous 2D array of `type`, viewed as name[xDim][yDim].
 * All rows share one backing buffer, so name[0] can be handed directly to
 * cudaMemcpy; release with free(name[0]); free(name).
 * Fix: macro arguments are parenthesized so expression arguments
 * (e.g. `n + 1`) expand correctly, and the element-count product is
 * computed in size_t to avoid int overflow for large arrays. */
#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc((xDim) * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc((size_t)(xDim) * (yDim) * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < (size_t)(xDim); i++) \
        name[i] = name[i-1] + (yDim); \
} while (0)
10 | + | ||
11 | + | ||
12 | + | ||
/* If `error` is not cudaSuccess, print the CUDA error string together with
 * the source location and terminate the program.  Use via the handle_error()
 * macro below so __FILE__/__LINE__ are filled in automatically. */
static void handleError(cudaError_t error, const char* file, int line){

    if(error != cudaSuccess){
        // Report to stderr (not stdout) so the diagnostic survives
        // redirected or buffered standard output.
        std::cerr << cudaGetErrorString(error) << " in " << file
                  << " at line " << line << std::endl;
        exit(1);
    }
}

#define handle_error(error) handleError(error, __FILE__ , __LINE__)
22 | + | ||
23 | + | ||
24 | + | ||
/* Round `n` up to the next power of two (returns n itself when n is already
 * a power of two; returns 0 for n <= 0).  The classic bit-smearing trick:
 * subtract one, OR in all lower bits, add one back.
 * Fix: the smear is done in unsigned arithmetic — the original shifted a
 * signed value that is negative for n == 0, which is implementation-defined
 * behavior in C/C++. */
static inline int nextPowerOfTwo(int n) {
    unsigned int v = (unsigned int)(n - 1);

    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    // v |= v >> 32; // For 64-bit ints

    return (int)(v + 1);
}
37 | + | ||
38 | +/*----< euclid_dist_2() >----------------------------------------------------*/ | ||
39 | +/* square of Euclid distance between two multi-dimensional points */ | ||
/*----< euclid_dist_2() >----------------------------------------------------*/
/* Squared Euclidean distance between object `objectId` and cluster center
 * `clusterId`.  Both arrays are stored coordinate-major:
 *   objects  [numCoords][numObjs],  clusters [numCoords][numClusters],
 * so consecutive threads reading the same coordinate access consecutive
 * elements.  No sqrt is taken: the square preserves the ordering needed
 * for nearest-center selection. */
__host__ __device__ inline static
float euclid_dist_2(int numCoords,
                    int numObjs,
                    int numClusters,
                    float *objects, // [numCoords][numObjs]
                    float *clusters, // [numCoords][numClusters]
                    int objectId,
                    int clusterId)
{
    float sum = 0.0;

    for (int coord = 0; coord < numCoords; coord++) {
        float diff = objects[numObjs * coord + objectId]
                   - clusters[numClusters * coord + clusterId];
        sum += diff * diff;
    }

    return sum;
}
59 | + | ||
60 | +/*----< find_nearest_cluster() >---------------------------------------------*/ | ||
/*----< find_nearest_cluster() >---------------------------------------------*/
/* Assign each object to its nearest cluster center, recording the result in
 * membership[].  Each block then reduces its threads' 0/1 "membership
 * changed" flags (one unsigned char per thread in dynamic shared memory) and
 * writes the per-block count to intermediates[blockIdx.x].
 *
 * Launch requirements (see cuda_kmeans()):
 *   - blockDim.x is a power of two (required by the tree reduction) and
 *     <= 255 so the unsigned char per-block sum cannot overflow;
 *   - dynamic shared memory: blockDim.x bytes, plus
 *     numClusters*numCoords*sizeof(float) when BLOCK_SHARED_MEM_OPTIMIZATION
 *     is defined. */
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects, // [numCoords][numObjs]
                          float *deviceClusters, // [numCoords][numClusters]
                          int *membership, // [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // BUG FIX: the barriers below were previously inside the
    // `objectId < numObjs` guard.  __syncthreads() must be reached by every
    // thread of the block — tail-block threads with objectId >= numObjs
    // skipped it, which is undefined behavior.  The reduction therefore runs
    // unconditionally; out-of-range threads contribute their initial 0 flag.
    __syncthreads(); // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
141 | + | ||
/* Sum the per-block change counts in
 * deviceIntermediates[0..numIntermediates-1] and leave the grand total in
 * deviceIntermediates[0].  Launched as a single block of numIntermediates2
 * threads, where numIntermediates2 == nextPowerOfTwo(numIntermediates), with
 * numIntermediates2 * sizeof(unsigned int) bytes of dynamic shared memory. */
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates, // The actual number of intermediates
                   int numIntermediates2) // The next power of two
{
    // Shared scratch for the tree reduction; holds numIntermediates2
    // elements (a power of two) per the launch configuration.
    extern __shared__ unsigned int intermediates[];

    unsigned int tid = threadIdx.x;

    // Stage this thread's value; threads past the real data pad with zero.
    intermediates[tid] = (tid < numIntermediates) ? deviceIntermediates[tid] : 0;

    __syncthreads();

    // Classic power-of-two tree reduction in shared memory.
    for (unsigned int stride = numIntermediates2 / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            intermediates[tid] += intermediates[tid + stride];
        __syncthreads();
    }

    // Thread 0 publishes the total back to global memory.
    if (tid == 0)
        deviceIntermediates[0] = intermediates[0];
}
170 | + | ||
171 | +/*----< cuda_kmeans() >-------------------------------------------------------*/ | ||
172 | +// | ||
173 | +// ---------------------------------------- | ||
174 | +// DATA LAYOUT | ||
175 | +// | ||
176 | +// objects [numObjs][numCoords] | ||
177 | +// clusters [numClusters][numCoords] | ||
178 | +// dimObjects [numCoords][numObjs] | ||
179 | +// dimClusters [numCoords][numClusters] | ||
180 | +// newClusters [numCoords][numClusters] | ||
181 | +// deviceObjects [numCoords][numObjs] | ||
182 | +// deviceClusters [numCoords][numClusters] | ||
183 | +// ---------------------------------------- | ||
184 | +// | ||
185 | +/* return an array of cluster centers of size [numClusters][numCoords] */ | ||
float** cuda_kmeans(float **objects, /* in: [numObjs][numCoords] */
                    unsigned int numCoords, /* no. features */
                    unsigned int numObjs, /* no. objects */
                    unsigned int numClusters, /* no. clusters */
                    float threshold, /* % objects change membership */
                    int *membership, /* out: [numObjs] */
                    int loops)
{
    int i, j, index, loop=0;
    int *newClusterSize; /* [numClusters]: no. objects assigned in each
                            new cluster */
    float delta;         /* fraction of objects changing their cluster */
    float **dimObjects;  /* [numCoords][numObjs] transpose of objects */
    float **clusters;    /* out: [numClusters][numCoords] */
    float **dimClusters; /* [numCoords][numClusters] */
    float **newClusters; /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout so kernel global-memory loads coalesce.
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < (int)numCoords; i++) {
        for (j = 0; j < (int)numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < (int)numCoords; i++) {
        for (j = 0; j < (int)numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[]: -1 means "not yet assigned" */
    for (i=0; i<(int)numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the in-kernel reduction, numThreadsPerClusterBlock *must*
    // be a power of two.  It must also be small enough that summing one 0/1
    // flag per thread into an unsigned char cannot overflow (<= 255).
    // BUG FIX: the previous code used props.maxThreadsPerBlock (typically
    // 1024), which overflows the unsigned char accumulator in
    // find_nearest_cluster; cap the block size at 128.
    cudaDeviceProp props;
    handle_error(cudaGetDeviceProperties(&props, 0));
    const unsigned int numThreadsPerClusterBlock =
        (props.maxThreadsPerBlock < 128)
            ? (unsigned int)props.maxThreadsPerBlock : 128u;
    const unsigned int numClusterBlocks =      /* ceil-divide objects over blocks */
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    /* one changed-flag byte per thread plus a shared copy of the centers */
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
        std::cerr << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    // compute_delta runs as one block of numReductionThreads threads, so the
    // per-block counts must fit in a single block's worth of threads.
    if (numReductionThreads > (unsigned int)props.maxThreadsPerBlock) {
        std::cerr << "ERROR: data set needs more blocks than the single-block delta reduction can handle" << std::endl;
        exit(1);
    }

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        /* upload current centers, then assign each object to its nearest one */
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaGetLastError());      /* catch bad launch configs */
        handle_error(cudaDeviceSynchronize()); /* catch in-kernel faults */

        /* sum the per-block change counts into deviceIntermediates[0] */
        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<(int)numObjs; i++) {
            /* find the array index of the nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<(int)numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        /* average the sum and replace old cluster centers with newClusters */
        for (i=0; i<(int)numClusters; i++) {
            for (j=0; j<(int)numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0; /* set back to 0 */
            }
            newClusterSize[i] = 0; /* set back to 0 */
        }

        delta /= numObjs; /* fraction of objects that switched clusters */
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposed back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < (int)numClusters; i++) {
        for (j = 0; j < (int)numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}