Commit 7f02091aaf8a5310eba59b3ab6158a809d9d7311 (parent: efbb4bf7)
"added cuda kmeans clustering"
Showing 1 changed file with 353 additions and 0 deletions.
1 | + | ||
/* Allocate a contiguous 2D array of `type`, viewed as name[xDim][yDim].
 * All rows share one backing buffer, so name[0] can be handed directly to
 * cudaMemcpy; release with free(name[0]); free(name).
 * Fix: macro arguments are parenthesized so expression arguments
 * (e.g. `n + 1`) expand correctly, and the element-count product is
 * computed in size_t to avoid int overflow for large arrays. */
#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc((xDim) * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc((size_t)(xDim) * (yDim) * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < (size_t)(xDim); i++) \
        name[i] = name[i-1] + (yDim); \
} while (0)
10 | + | ||
11 | + | ||
12 | + | ||
/* If `error` is not cudaSuccess, print the CUDA error string together with
 * the source location and terminate the program.  Use via the handle_error()
 * macro below so __FILE__/__LINE__ are filled in automatically. */
static void handleError(cudaError_t error, const char* file, int line){

    if(error != cudaSuccess){
        // Report to stderr (not stdout) so the diagnostic survives
        // redirected or buffered standard output.
        std::cerr << cudaGetErrorString(error) << " in " << file
                  << " at line " << line << std::endl;
        exit(1);
    }
}

#define handle_error(error) handleError(error, __FILE__ , __LINE__)
22 | + | ||
23 | + | ||
24 | + | ||
/* Round `n` up to the next power of two (returns n itself when n is already
 * a power of two; returns 0 for n <= 0).  The classic bit-smearing trick:
 * subtract one, OR in all lower bits, add one back.
 * Fix: the smear is done in unsigned arithmetic — the original shifted a
 * signed value that is negative for n == 0, which is implementation-defined
 * behavior in C/C++. */
static inline int nextPowerOfTwo(int n) {
    unsigned int v = (unsigned int)(n - 1);

    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    // v |= v >> 32; // For 64-bit ints

    return (int)(v + 1);
}
37 | + | ||
38 | +/*----< euclid_dist_2() >----------------------------------------------------*/ | ||
39 | +/* square of Euclid distance between two multi-dimensional points */ | ||
/*----< euclid_dist_2() >----------------------------------------------------*/
/* Squared Euclidean distance between object `objectId` and cluster center
 * `clusterId`.  Both arrays are stored coordinate-major:
 *   objects  [numCoords][numObjs],  clusters [numCoords][numClusters],
 * so consecutive threads reading the same coordinate access consecutive
 * elements.  No sqrt is taken: the square preserves the ordering needed
 * for nearest-center selection. */
__host__ __device__ inline static
float euclid_dist_2(int numCoords,
                    int numObjs,
                    int numClusters,
                    float *objects, // [numCoords][numObjs]
                    float *clusters, // [numCoords][numClusters]
                    int objectId,
                    int clusterId)
{
    float sum = 0.0;

    for (int coord = 0; coord < numCoords; coord++) {
        float diff = objects[numObjs * coord + objectId]
                   - clusters[numClusters * coord + clusterId];
        sum += diff * diff;
    }

    return sum;
}
59 | + | ||
60 | +/*----< find_nearest_cluster() >---------------------------------------------*/ | ||
/*----< find_nearest_cluster() >---------------------------------------------*/
/* Assign each object to its nearest cluster center, recording the result in
 * membership[].  Each block then reduces its threads' 0/1 "membership
 * changed" flags (one unsigned char per thread in dynamic shared memory) and
 * writes the per-block count to intermediates[blockIdx.x].
 *
 * Launch requirements (see cuda_kmeans()):
 *   - blockDim.x is a power of two (required by the tree reduction) and
 *     <= 255 so the unsigned char per-block sum cannot overflow;
 *   - dynamic shared memory: blockDim.x bytes, plus
 *     numClusters*numCoords*sizeof(float) when BLOCK_SHARED_MEM_OPTIMIZATION
 *     is defined. */
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects, // [numCoords][numObjs]
                          float *deviceClusters, // [numCoords][numClusters]
                          int *membership, // [numObjs]
                          int *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // BUG FIX: the barriers below were previously inside the
    // `objectId < numObjs` guard.  __syncthreads() must be reached by every
    // thread of the block — tail-block threads with objectId >= numObjs
    // skipped it, which is undefined behavior.  The reduction therefore runs
    // unconditionally; out-of-range threads contribute their initial 0 flag.
    __syncthreads(); // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}
141 | + | ||
/* Sum the per-block change counts in
 * deviceIntermediates[0..numIntermediates-1] and leave the grand total in
 * deviceIntermediates[0].  Launched as a single block of numIntermediates2
 * threads, where numIntermediates2 == nextPowerOfTwo(numIntermediates), with
 * numIntermediates2 * sizeof(unsigned int) bytes of dynamic shared memory. */
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates, // The actual number of intermediates
                   int numIntermediates2) // The next power of two
{
    // Shared scratch for the tree reduction; holds numIntermediates2
    // elements (a power of two) per the launch configuration.
    extern __shared__ unsigned int intermediates[];

    unsigned int tid = threadIdx.x;

    // Stage this thread's value; threads past the real data pad with zero.
    intermediates[tid] = (tid < numIntermediates) ? deviceIntermediates[tid] : 0;

    __syncthreads();

    // Classic power-of-two tree reduction in shared memory.
    for (unsigned int stride = numIntermediates2 / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            intermediates[tid] += intermediates[tid + stride];
        __syncthreads();
    }

    // Thread 0 publishes the total back to global memory.
    if (tid == 0)
        deviceIntermediates[0] = intermediates[0];
}
170 | + | ||
171 | +/*----< cuda_kmeans() >-------------------------------------------------------*/ | ||
172 | +// | ||
173 | +// ---------------------------------------- | ||
174 | +// DATA LAYOUT | ||
175 | +// | ||
176 | +// objects [numObjs][numCoords] | ||
177 | +// clusters [numClusters][numCoords] | ||
178 | +// dimObjects [numCoords][numObjs] | ||
179 | +// dimClusters [numCoords][numClusters] | ||
180 | +// newClusters [numCoords][numClusters] | ||
181 | +// deviceObjects [numCoords][numObjs] | ||
182 | +// deviceClusters [numCoords][numClusters] | ||
183 | +// ---------------------------------------- | ||
184 | +// | ||
185 | +/* return an array of cluster centers of size [numClusters][numCoords] */ | ||
float** cuda_kmeans(float **objects, /* in: [numObjs][numCoords] */
                    unsigned int numCoords, /* no. features */
                    unsigned int numObjs, /* no. objects */
                    unsigned int numClusters, /* no. clusters */
                    float threshold, /* % objects change membership */
                    int *membership, /* out: [numObjs] */
                    int loops)
{
    int i, j, index, loop=0;
    int *newClusterSize; /* [numClusters]: no. objects assigned in each
                            new cluster */
    float delta;         /* fraction of objects changing their cluster */
    float **dimObjects;  /* [numCoords][numObjs] transpose of objects */
    float **clusters;    /* out: [numClusters][numCoords] */
    float **dimClusters; /* [numCoords][numClusters] */
    float **newClusters; /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout so kernel global-memory loads coalesce.
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < (int)numCoords; i++) {
        for (j = 0; j < (int)numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < (int)numCoords; i++) {
        for (j = 0; j < (int)numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[]: -1 means "not yet assigned" */
    for (i=0; i<(int)numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the in-kernel reduction, numThreadsPerClusterBlock *must*
    // be a power of two.  It must also be small enough that summing one 0/1
    // flag per thread into an unsigned char cannot overflow (<= 255).
    // BUG FIX: the previous code used props.maxThreadsPerBlock (typically
    // 1024), which overflows the unsigned char accumulator in
    // find_nearest_cluster; cap the block size at 128.
    cudaDeviceProp props;
    handle_error(cudaGetDeviceProperties(&props, 0));
    const unsigned int numThreadsPerClusterBlock =
        (props.maxThreadsPerBlock < 128)
            ? (unsigned int)props.maxThreadsPerBlock : 128u;
    const unsigned int numClusterBlocks =      /* ceil-divide objects over blocks */
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    /* one changed-flag byte per thread plus a shared copy of the centers */
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
        std::cerr << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    // compute_delta runs as one block of numReductionThreads threads, so the
    // per-block counts must fit in a single block's worth of threads.
    if (numReductionThreads > (unsigned int)props.maxThreadsPerBlock) {
        std::cerr << "ERROR: data set needs more blocks than the single-block delta reduction can handle" << std::endl;
        exit(1);
    }

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
              numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
              numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        /* upload current centers, then assign each object to its nearest one */
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                  numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaGetLastError());      /* catch bad launch configs */
        handle_error(cudaDeviceSynchronize()); /* catch in-kernel faults */

        /* sum the per-block change counts into deviceIntermediates[0] */
        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                  sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                  numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<(int)numObjs; i++) {
            /* find the array index of the nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<(int)numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        /* average the sum and replace old cluster centers with newClusters */
        for (i=0; i<(int)numClusters; i++) {
            for (j=0; j<(int)numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0; /* set back to 0 */
            }
            newClusterSize[i] = 0; /* set back to 0 */
        }

        delta /= numObjs; /* fraction of objects that switched clusters */
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposed back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < (int)numClusters; i++) {
        for (j = 0; j < (int)numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}