Commit e1f4b2bc4cd43a6682791bbf75a7392d1037bbe8
Merged branch davar into master
Showing 1 changed file with 366 additions and 0 deletions.
//This software is derived from Professor Wei-keng Liao's parallel k-means
2 | +//clustering code obtained on November 21, 2010 from | ||
3 | +// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html | ||
4 | +//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz). | ||
5 | +// | ||
6 | +//With his permission, Serban Giuroiu is publishing his CUDA implementation based on his code | ||
7 | +//under the open-source MIT license. See the LICENSE file for more details. | ||
8 | + | ||
9 | +// The original code can be found on Github ( https://github.com/serban/kmeans ) | ||
10 | +// Here I have just made a few changes to get it to work | ||
11 | + | ||
12 | + | ||
13 | + | ||
14 | + | ||
// Allocate a 2D array of `type` with dimensions [xDim][yDim]: a row-pointer
// table plus ONE contiguous backing buffer, so both name[i][j] indexing and
// flat access through name[0] (e.g. memset / cudaMemcpy over the whole
// array) work. Aborts via assert() if either malloc fails.
// Free with: free(name[0]); free(name);
#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc(xDim * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < xDim; i++) \
        name[i] = name[i-1] + yDim; \
} while (0)
23 | + | ||
24 | + | ||
25 | + | ||
// Print a readable message for a failed CUDA runtime call and abort.
//
//   error      status code returned by a CUDA runtime API call
//   file/line  call site, captured by the handle_error() macro below
//
// Diagnostics go to stderr (not stdout) so they are visible even when the
// program's normal output is redirected, and the names are std::-qualified
// so this block does not depend on a `using namespace std`.
static void handleError(cudaError_t error, const char* file, int line){

    if(error != cudaSuccess){
        std::cerr << cudaGetErrorString(error)
                  << " in " << file << " at line " << line << std::endl;
        exit(1);
    }
}

// Wrap every CUDA runtime call: handle_error(cudaMalloc(...));
#define handle_error(error) handleError(error, __FILE__ , __LINE__)
35 | + | ||
36 | + | ||
37 | + | ||
// Round n up to the nearest power of two (for 32-bit ints; n = 16 -> 16,
// n = 17 -> 32). Classic bit-smearing trick: decrement, OR the highest set
// bit into every lower position, then increment.
static inline int nextPowerOfTwo(int n) {
    n--;

    // Fold the top bit down into all lower bits: after shifts of
    // 1, 2, 4, 8, 16 every bit below the highest set bit is set.
    // (A shift of 32 would be needed for 64-bit ints.)
    for (int shift = 1; shift <= 16; shift <<= 1) {
        n |= n >> shift;
    }

    // n is now 2^k - 1; the final increment yields 2^k.
    return n + 1;
}
50 | + | ||
/*----< euclid_dist_2() >----------------------------------------------------*/
/* Square of the Euclidean distance between object objectId and cluster
 * center clusterId. Both arrays are stored coordinate-major so that, on the
 * device, consecutive threads (consecutive objectIds) read consecutive
 * addresses (coalesced access).
 *
 *   objects   [numCoords][numObjs]      flattened
 *   clusters  [numCoords][numClusters]  flattened
 *
 * The square root is deliberately skipped: callers only compare distances.
 */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    int i;
    float ans = 0.0f;   // float literal: avoid silent promotion to double

    for (i = 0; i < numCoords; i++) {
        // Compute the per-coordinate difference once instead of twice.
        float diff = objects[numObjs * i + objectId] -
                     clusters[numClusters * i + clusterId];
        ans += diff * diff;
    }

    return(ans);
}
72 | + | ||
/*----< find_nearest_cluster() >---------------------------------------------*/
// One thread per object: assign each object to its nearest cluster center,
// then tree-reduce the per-thread "membership changed" flags in shared
// memory and write the block's change count to intermediates[blockIdx.x].
//
// Launch requirements (see cuda_kmeans()):
//   - blockDim.x is a power of two and <= 255 (flags are summed into an
//     unsigned char, so the block-wide sum must fit in one)
//   - dynamic shared memory: blockDim.x bytes, plus the cluster array when
//     BLOCK_SHARED_MEM_OPTIMIZATION is defined
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects,           //  [numCoords][numObjs]
                          float *deviceClusters,    //  [numCoords][numClusters]
                          int *membership,          //  [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index    = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // BUG FIX: these barriers and the reduction used to live INSIDE the
    // `if (objectId < numObjs)` guard. __syncthreads() must be reached by
    // every thread in the block; when numObjs is not a multiple of
    // blockDim.x, the tail threads skipped the barrier -> undefined
    // behavior. Out-of-range threads left their flag at 0 above, so they
    // are neutral for the sum.
    __syncthreads();    // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
154 | + | ||
// Single-block kernel: sum the per-block change counts produced by
// find_nearest_cluster() and leave the grand total in
// deviceIntermediates[0].
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    // The actual number of intermediates
                   int numIntermediates2)   // The next power of two
{
    // Shared scratch for the tree reduction. The launch sizes this to
    // numIntermediates2 unsigned ints — one per thread — and that count
    // *must* be a power of two.
    extern __shared__ unsigned int intermediates[];

    unsigned int tid = threadIdx.x;

    // Stage the global partials into shared memory, zero-padding the slots
    // beyond the real data so they are neutral for the sum.
    intermediates[tid] = (tid < numIntermediates) ? deviceIntermediates[tid] : 0;
    __syncthreads();

    // Halving tree reduction (relies on numIntermediates2 being a power
    // of two).
    for (unsigned int half = numIntermediates2 / 2; half > 0; half /= 2) {
        if (tid < half)
            intermediates[tid] += intermediates[tid + half];
        __syncthreads();
    }

    // Thread 0 publishes the total back to global memory.
    if (tid == 0)
        deviceIntermediates[0] = intermediates[0];
}
183 | + | ||
/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects            [numObjs][numCoords]
// clusters           [numClusters][numCoords]
// dimObjects         [numCoords][numObjs]
// dimClusters        [numCoords][numClusters]
// newClusters        [numCoords][numClusters]
// deviceObjects      [numCoords][numObjs]
// deviceClusters     [numCoords][numClusters]
// ----------------------------------------
//
/* Run k-means on the GPU: iterate until fewer than `threshold` (fraction)
 * of the objects change membership, or `loops` iterations have run.
 * membership[] (out) receives each object's final cluster index.
 * Returns a newly allocated array of cluster centers,
 * size [numClusters][numCoords]; caller frees it via free(ret[0]); free(ret).
 */
float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
                   unsigned int  numCoords,    /* no. features */
                   unsigned int  numObjs,      /* no. objects */
                   unsigned int  numClusters,  /* no. clusters */
                   float    threshold,    /* % objects change membership */
                   int     *membership,   /* out: [numObjs] */
                   int      loops)
{
    int      i, j, index, loop=0;
    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
                                new cluster */
    float    delta;          /* % of objects change their clusters */
    float  **dimObjects;     /* [numCoords][numObjs], transposed input copy */
    float  **clusters;       /* out: [numClusters][numCoords] */
    float  **dimClusters;    /* [numCoords][numClusters] */
    float  **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout so consecutive threads (consecutive
    // objectIds) make coalesced global-memory accesses in the kernel.
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[]: -1 == not yet assigned, so every object
       counts as "changed" on the first iteration */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the in-kernel reduction, numThreadsPerClusterBlock *must*
    // be a power of two, AND — because the per-thread change flags are
    // summed into an `unsigned char` — the block-wide sum (at most
    // blockDim.x) must fit in one, i.e. blockDim.x <= 255.
    // BUG FIX: this used to be props.maxThreadsPerBlock (1024 on most
    // GPUs), which silently overflowed the unsigned char reduction and
    // corrupted delta. Use 128, as in the original upstream code.
    const unsigned int numThreadsPerClusterBlock = 128;
    // Integer ceiling division: enough blocks to cover every object.
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    /* change flags + a per-block copy of all cluster centers */
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    cudaDeviceProp deviceProp;
    int deviceNum;
    handle_error(cudaGetDevice(&deviceNum));
    handle_error(cudaGetDeviceProperties(&deviceProp, deviceNum));

    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
        std::cerr << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    /* compute_delta runs as a single block; pad the thread count up to a
       power of two so its tree reduction works */
    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        /* upload the cluster centers computed by the previous iteration */
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaGetLastError());       /* catch bad launch config */
        handle_error(cudaDeviceSynchronize());  /* catch async kernel faults */

        /* sum the per-block change counts into deviceIntermediates[0] */
        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        // TODO: Flip the nesting order
        // TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sum and replace old cluster centers with newClusters;
           empty clusters keep their previous center */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0f;   /* set back to 0 */
            }
            newClusterSize[i] = 0;  /* set back to 0 */
        }

        delta /= numObjs;   /* change count -> fraction of all objects */
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposed back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}