Commit e1f4b2bc4cd43a6682791bbf75a7392d1037bbe8

Authored by David Mayerich
2 parents 7a615d7e 6911de0d

Merged branch davar into master

Showing 1 changed file with 366 additions and 0 deletions
stim/cuda/kmeans.cuh 0 → 100644
  1 +//This software is derived from Professor Wei-keng Liao's parallel k-means
  2 +//clustering code obtained on November 21, 2010 from
  3 +// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html
  4 +//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz).
  5 +//
  6 +//With his permission, Serban Giuroiu published a CUDA implementation based on that code
  7 +//under the open-source MIT license. See the LICENSE file for more details.
  8 +
  9 +// The original code can be found on GitHub ( https://github.com/serban/kmeans )
  10 +// Here I have made only a few changes to get it to work
  11 +#include <iostream>
  12 +#include <cstdlib>
  13 +#include <cassert>
  14 +#include <cstring>
  15 +#define malloc2D(name, xDim, yDim, type) do { \
  16 + name = (type **)malloc(xDim * sizeof(type *)); \
  17 + assert(name != NULL); \
  18 + name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
  19 + assert(name[0] != NULL); \
  20 + for (size_t i = 1; i < xDim; i++) \
  21 + name[i] = name[i-1] + yDim; \
  22 +} while (0)
  23 +
  24 +
  25 +
  26 +static void handleError(cudaError_t error, const char* file, int line){
  27 +
  28 + if(error != cudaSuccess){
  29 + std::cout << cudaGetErrorString(error) << " in " << file << " at line " << line << std::endl;
  30 + exit(1);
  31 + }
  32 +}
  33 +
  34 +#define handle_error(error) handleError(error, __FILE__ , __LINE__)
  35 +
  36 +
  37 +
  38 +static inline int nextPowerOfTwo(int n) {
  39 + n--; // ensures a value that is already a power of two maps to itself
  40 + // smear the highest set bit into all lower bits, then add one to round up
  41 + n = n >> 1 | n;
  42 + n = n >> 2 | n;
  43 + n = n >> 4 | n;
  44 + n = n >> 8 | n;
  45 + n = n >> 16 | n;
  46 + // n = n >> 32 | n; // For 64-bit ints
  47 +
  48 + return ++n;
  49 +}
  50 +
  51 +/*----< euclid_dist_2() >----------------------------------------------------*/
  52 +/* squared Euclidean distance between two multi-dimensional points */
  53 +__host__ __device__ inline static
  54 +float euclid_dist_2(int numCoords,
  55 + int numObjs,
  56 + int numClusters,
  57 + float *objects, // [numCoords][numObjs]
  58 + float *clusters, // [numCoords][numClusters]
  59 + int objectId,
  60 + int clusterId)
  61 +{
  62 + int i;
  63 + float ans=0.0;
  64 +
  65 + for (i = 0; i < numCoords; i++) {
  66 + ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
  67 + (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
  68 + }
  69 +
  70 + return(ans);
  71 +}
  72 +
  73 +/*----< find_nearest_cluster() >---------------------------------------------*/
  74 +__global__ static
  75 +void find_nearest_cluster(int numCoords,
  76 + int numObjs,
  77 + int numClusters,
  78 + float *objects, // [numCoords][numObjs]
  79 + float *deviceClusters, // [numCoords][numClusters]
  80 + int *membership, // [numObjs]
  81 + int *intermediates)
  82 +{
  83 + extern __shared__ char sharedMemory[];
  84 +
  85 + // The type chosen for membershipChanged must be large enough to hold the
  86 + // per-block sum produced by the reduction below. There are blockDim.x
  87 + // elements, one per thread; see numThreadsPerClusterBlock in cuda_kmeans().
  88 + unsigned int *membershipChanged = (unsigned int *)sharedMemory;
  89 +#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
  90 + float *clusters = (float *)(sharedMemory + blockDim.x * sizeof(unsigned int));
  91 +#else
  92 + float *clusters = deviceClusters;
  93 +#endif
  94 +
  95 + membershipChanged[threadIdx.x] = 0;
  96 +
  97 +#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
  98 + // BEWARE: We can overrun our shared memory here if there are too many
  99 + // clusters or too many coordinates! For reference, a Tesla C1060 has 16
  100 + // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
  101 + // shared memory per block.
  102 + for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
  103 + for (int j = 0; j < numCoords; j++) {
  104 + clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
  105 + }
  106 + }
  107 + __syncthreads();
  108 +#endif
  109 +
  110 + int objectId = blockDim.x * blockIdx.x + threadIdx.x;
  111 +
  112 + if (objectId < numObjs) {
  113 + int index, i;
  114 + float dist, min_dist;
  115 +
  116 + /* find the cluster id that has min distance to object */
  117 + index = 0;
  118 + min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
  119 + objects, clusters, objectId, 0);
  120 +
  121 + for (i=1; i<numClusters; i++) {
  122 + dist = euclid_dist_2(numCoords, numObjs, numClusters,
  123 + objects, clusters, objectId, i);
  124 + /* no need to compute the square root */
  125 + if (dist < min_dist) { /* find the min and its array index */
  126 + min_dist = dist;
  127 + index = i;
  128 + }
  129 + }
  130 +
  131 + if (membership[objectId] != index) {
  132 + membershipChanged[threadIdx.x] = 1;
  133 + }
  134 +
  135 + /* assign the membership to object objectId */
  136 + membership[objectId] = index;
  137 + }
  138 +
  139 + // every thread in the block must reach the barriers below, so the reduction sits outside the objectId check
  140 + __syncthreads(); // For membershipChanged[]
  141 +
  142 + // blockDim.x *must* be a power of two!
  143 + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
  144 + if (threadIdx.x < s) {
  145 + membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
  146 + }
  147 + __syncthreads();
  148 + }
  149 +
  150 + if (threadIdx.x == 0) {
  151 + intermediates[blockIdx.x] = membershipChanged[0];
  152 + }
  153 +}
  154 +
  155 +__global__ static
  156 +void compute_delta(int *deviceIntermediates,
  157 + int numIntermediates, // The actual number of intermediates
  158 + int numIntermediates2) // The next power of two
  159 +{
  160 + // The number of elements in this array should be equal to
  161 + // numIntermediates2, the number of threads launched. It *must* be a power
  162 + // of two!
  163 + extern __shared__ unsigned int intermediates[];
  164 +
  165 + // Copy global intermediate values into shared memory.
  166 + intermediates[threadIdx.x] =
  167 + (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;
  168 +
  169 + __syncthreads();
  170 +
  171 + // numIntermediates2 *must* be a power of two!
  172 + for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
  173 + if (threadIdx.x < s) {
  174 + intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
  175 + }
  176 + __syncthreads();
  177 + }
  178 +
  179 + if (threadIdx.x == 0) {
  180 + deviceIntermediates[0] = intermediates[0];
  181 + }
  182 +}
  183 +
  184 +/*----< cuda_kmeans() >-------------------------------------------------------*/
  185 +//
  186 +// ----------------------------------------
  187 +// DATA LAYOUT
  188 +//
  189 +// objects [numObjs][numCoords]
  190 +// clusters [numClusters][numCoords]
  191 +// dimObjects [numCoords][numObjs]
  192 +// dimClusters [numCoords][numClusters]
  193 +// newClusters [numCoords][numClusters]
  194 +// deviceObjects [numCoords][numObjs]
  195 +// deviceClusters [numCoords][numClusters]
  196 +// ----------------------------------------
  197 +//
  198 +/* return an array of cluster centers of size [numClusters][numCoords] */
  199 +float** cuda_kmeans(float **objects, /* in: [numObjs][numCoords] */
  200 + unsigned int numCoords, /* no. features */
  201 + unsigned int numObjs, /* no. objects */
  202 + unsigned int numClusters, /* no. clusters */
  203 + float threshold, /* % objects change membership */
  204 + int *membership, /* out: [numObjs] */
  205 + int loops)
  206 +{
  207 + int i, j, index, loop=0;
  208 + int *newClusterSize; /* [numClusters]: no. objects assigned in each
  209 + new cluster */
  210 + float delta; /* % of objects change their clusters */
  211 + float **dimObjects;
  212 + float **clusters; /* out: [numClusters][numCoords] */
  213 + float **dimClusters;
  214 + float **newClusters; /* [numCoords][numClusters] */
  215 +
  216 + float *deviceObjects;
  217 + float *deviceClusters;
  218 + int *deviceMembership;
  219 + int *deviceIntermediates;
  220 +
  221 + // Copy objects given in [numObjs][numCoords] layout to new
  222 + // [numCoords][numObjs] layout
  223 + malloc2D(dimObjects, numCoords, numObjs, float);
  224 + for (i = 0; i < numCoords; i++) {
  225 + for (j = 0; j < numObjs; j++) {
  226 + dimObjects[i][j] = objects[j][i];
  227 + }
  228 + }
  229 +
  230 + /* pick first numClusters elements of objects[] as initial cluster centers*/
  231 + malloc2D(dimClusters, numCoords, numClusters, float);
  232 + for (i = 0; i < numCoords; i++) {
  233 + for (j = 0; j < numClusters; j++) {
  234 + dimClusters[i][j] = dimObjects[i][j];
  235 + }
  236 + }
  237 +
  238 + /* initialize membership[] */
  239 + for (i=0; i<numObjs; i++) membership[i] = -1;
  240 +
  241 + /* need to initialize newClusterSize and newClusters[0] to all 0 */
  242 + newClusterSize = (int*) calloc(numClusters, sizeof(int));
  243 + assert(newClusterSize != NULL);
  244 +
  245 + malloc2D(newClusters, numCoords, numClusters, float);
  246 + memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));
  247 +
  248 + // To support the reduction, numThreadsPerClusterBlock *must* be a power of
  249 + // two. Each thread records its membership change in an unsigned int in
  250 + // shared memory, so the per-block sum cannot overflow even when every
  251 + // thread in a full-sized block changes cluster.
  252 + cudaDeviceProp props;
  253 + handle_error(cudaGetDeviceProperties(&props, 0));
  254 + const unsigned int numThreadsPerClusterBlock = props.maxThreadsPerBlock;
  255 + const unsigned int numClusterBlocks =
  256 + (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
  257 +
  258 +#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
  259 + const unsigned int clusterBlockSharedDataSize =
  260 + numThreadsPerClusterBlock * sizeof(unsigned int) +
  261 + numClusters * numCoords * sizeof(float);
  262 +
  263 + cudaDeviceProp deviceProp;
  264 + int deviceNum;
  265 + cudaGetDevice(&deviceNum);
  266 + cudaGetDeviceProperties(&deviceProp, deviceNum);
  267 +
  268 + if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
  269 + std::cout << "ERROR: insufficient shared memory per block; rebuild without defining BLOCK_SHARED_MEM_OPTIMIZATION" << std::endl;
  270 + exit(1);
  271 + }
  272 +#else
  273 + const unsigned int clusterBlockSharedDataSize =
  274 + numThreadsPerClusterBlock * sizeof(unsigned int);
  275 +#endif
  276 +
  277 + const unsigned int numReductionThreads =
  278 + nextPowerOfTwo(numClusterBlocks);
  279 + const unsigned int reductionBlockSharedDataSize =
  280 + numReductionThreads * sizeof(unsigned int);
  281 +
  282 + handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
  283 + handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
  284 + handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
  285 + handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));
  286 +
  287 + handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
  288 + numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
  289 + handle_error(cudaMemcpy(deviceMembership, membership,
  290 + numObjs*sizeof(int), cudaMemcpyHostToDevice));
  291 +
  292 + do {
  293 + handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
  294 + numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));
  295 +
  296 + find_nearest_cluster
  297 + <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
  298 + (numCoords, numObjs, numClusters,
  299 + deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);
  300 +
  301 + handle_error(cudaDeviceSynchronize());
  302 +
  303 + compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
  304 + (deviceIntermediates, numClusterBlocks, numReductionThreads);
  305 +
  306 + handle_error(cudaDeviceSynchronize());
  307 +
  308 + int d;
  309 + handle_error(cudaMemcpy(&d, deviceIntermediates,
  310 + sizeof(int), cudaMemcpyDeviceToHost));
  311 + delta = (float)d;
  312 +
  313 + handle_error(cudaMemcpy(membership, deviceMembership,
  314 + numObjs*sizeof(int), cudaMemcpyDeviceToHost));
  315 +
  316 + for (i=0; i<numObjs; i++) {
  317 + /* find the array index of the nearest cluster center */
  318 + index = membership[i];
  319 +
  320 + /* update new cluster centers: sum the objects assigned to each cluster */
  321 + newClusterSize[index]++;
  322 + for (j=0; j<numCoords; j++)
  323 + newClusters[j][index] += objects[i][j];
  324 + }
  325 +
  326 + // TODO: Flip the nesting order
  327 + // TODO: Change layout of newClusters to [numClusters][numCoords]
  328 + /* average the sum and replace old cluster centers with newClusters */
  329 + for (i=0; i<numClusters; i++) {
  330 + for (j=0; j<numCoords; j++) {
  331 + if (newClusterSize[i] > 0)
  332 + dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
  333 + newClusters[j][i] = 0.0; /* set back to 0 */
  334 + }
  335 + newClusterSize[i] = 0; /* set back to 0 */
  336 + }
  337 +
  338 + delta /= numObjs;
  339 + } while (delta > threshold && loop++ < loops);
  340 +
  341 +
  342 +
  343 + /* allocate a 2D space for returning variable clusters[] (coordinates
  344 + of cluster centers) */
  345 + malloc2D(clusters, numClusters, numCoords, float);
  346 + for (i = 0; i < numClusters; i++) {
  347 + for (j = 0; j < numCoords; j++) {
  348 + clusters[i][j] = dimClusters[j][i];
  349 + }
  350 + }
  351 +
  352 + handle_error(cudaFree(deviceObjects));
  353 + handle_error(cudaFree(deviceClusters));
  354 + handle_error(cudaFree(deviceMembership));
  355 + handle_error(cudaFree(deviceIntermediates));
  356 +
  357 + free(dimObjects[0]);
  358 + free(dimObjects);
  359 + free(dimClusters[0]);
  360 + free(dimClusters);
  361 + free(newClusters[0]);
  362 + free(newClusters);
  363 + free(newClusterSize);
  364 +
  365 + return clusters;
  366 +}
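
For reference, below is a minimal host-side sketch of how cuda_kmeans() might be called, compiled with nvcc since the header contains device code. The include path, data values, threshold of 0.001, and loop cap of 500 are illustrative assumptions, not part of this commit.

// Hypothetical usage sketch: cluster numObjs three-dimensional points into
// numClusters groups using the cuda_kmeans() defined above.
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include "stim/cuda/kmeans.cuh"   // assumed include path for this header

int main() {
    const unsigned int numObjs = 10000, numCoords = 3, numClusters = 8;

    // objects[numObjs][numCoords], contiguous, matching the documented layout
    float **objects;
    malloc2D(objects, numObjs, numCoords, float);
    for (unsigned int i = 0; i < numObjs; i++)
        for (unsigned int j = 0; j < numCoords; j++)
            objects[i][j] = (float)rand() / RAND_MAX;   // placeholder data

    int *membership = (int *)malloc(numObjs * sizeof(int));
    assert(membership != NULL);

    // threshold 0.001: stop once fewer than 0.1% of objects change cluster
    // in an iteration; 500 caps the number of iterations.
    float **centers = cuda_kmeans(objects, numCoords, numObjs, numClusters,
                                  0.001f, membership, 500);

    printf("first center: %f %f %f\n", centers[0][0], centers[0][1], centers[0][2]);

    free(objects[0]); free(objects);
    free(centers[0]); free(centers);
    free(membership);
    return 0;
}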