Commit 7f02091aaf8a5310eba59b3ab6158a809d9d7311

Authored by Davar
1 parent efbb4bf7

Added CUDA k-means clustering

Showing 1 changed file with 353 additions and 0 deletions
stim/cuda/kmeans.cuh 0 → 100644
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>

#include <cuda_runtime.h>

#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc(xDim * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < xDim; i++) \
        name[i] = name[i-1] + yDim; \
} while (0)
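
// malloc2D allocates the whole xDim-by-yDim array as a single contiguous
// block and points name[1..xDim-1] into it, which is what lets the code
// below hand name[0] directly to memset() and cudaMemcpy().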

static void handleError(cudaError_t error, const char* file, int line){
    if(error != cudaSuccess){
        std::cout << cudaGetErrorString(error) << " in " << file
                  << " at line " << line << std::endl;
        exit(1);
    }
}

#define handle_error(error) handleError(error, __FILE__, __LINE__)

static inline int nextPowerOfTwo(int n) {
    n--;

    n = n >> 1 | n;
    n = n >> 2 | n;
    n = n >> 4 | n;
    n = n >> 8 | n;
    n = n >> 16 | n;
    // n = n >> 32 | n; // For 64-bit ints

    return ++n;
}
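
// Example: for n = 37, n-1 = 36 = 0b100100; the shift-or cascade smears the
// highest set bit into every lower position (0b111111 = 63) and ++n returns
// 64. Powers of two map to themselves: nextPowerOfTwo(64) == 64.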

/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of the Euclidean distance between two multi-dimensional points */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    int i;
    float ans = 0.0;

    for (i = 0; i < numCoords; i++) {
        ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
               (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
    }

    return(ans);
}

/*----< find_nearest_cluster() >---------------------------------------------*/
__global__ static
void find_nearest_cluster(int    numCoords,
                          int    numObjs,
                          int    numClusters,
                          float *objects,         // [numCoords][numObjs]
                          float *deviceClusters,  // [numCoords][numClusters]
                          int   *membership,      // [numObjs]
                          int   *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has the minimum distance to this object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i = 1; i < numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need for a square root: it preserves the ordering */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // The barriers below must be reached by *every* thread in the block,
    // including those with objectId >= numObjs, so the reduction sits
    // outside the guard above (out-of-range threads contribute 0).
    __syncthreads();    // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}

__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    // The actual number of intermediates
                   int numIntermediates2)   // The next power of two
{
    // The number of elements in this array should be equal to
    // numIntermediates2, the number of threads launched. It *must* be a power
    // of two!
    extern __shared__ unsigned int intermediates[];

    // Copy global intermediate values into shared memory.
    intermediates[threadIdx.x] =
        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;

    __syncthreads();

    // numIntermediates2 *must* be a power of two!
    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}
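
// Reduction walk-through for numIntermediates2 = 8 and per-block change
// counts {3,1,4,1,5,0,0,0}: s = 4 gives {8,1,4,1,...}, s = 2 gives
// {12,2,...}, s = 1 gives {14,...}, so deviceIntermediates[0] ends up
// holding the total number of objects that changed membership.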

/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects        [numObjs][numCoords]
// clusters       [numClusters][numCoords]
// dimObjects     [numCoords][numObjs]
// dimClusters    [numCoords][numClusters]
// newClusters    [numCoords][numClusters]
// deviceObjects  [numCoords][numObjs]
// deviceClusters [numCoords][numClusters]
// ----------------------------------------
//
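// The transposed [numCoords][numObjs] layout is there for coalescing: in
// find_nearest_cluster, consecutive threads hold consecutive objectIds, so
// each read of objects[numObjs * i + objectId] touches consecutive
// addresses in global memory.
//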
/* return an array of cluster centers of size [numClusters][numCoords] */
float** cuda_kmeans(float **objects,          /* in: [numObjs][numCoords] */
                    unsigned int numCoords,   /* no. features */
                    unsigned int numObjs,     /* no. objects */
                    unsigned int numClusters, /* no. clusters */
                    float threshold,          /* fraction of objects changing membership */
                    int *membership,          /* out: [numObjs] */
                    int loops)
{
    int i, j, index, loop = 0;
    int *newClusterSize;    /* [numClusters]: no. objects assigned in each
                               new cluster */
    float delta;            /* fraction of objects that change their clusters */
    float **dimObjects;
    float **clusters;       /* out: [numClusters][numCoords] */
    float **dimClusters;
    float **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers */
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[] */
    for (i = 0; i < numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the reduction, numThreadsPerClusterBlock *must* be a power
    // of two, and because the per-thread change flags are summed into an
    // unsigned char it must not exceed 128. maxThreadsPerBlock (typically
    // 1024) would overflow that reduction, so cap the block size.
    cudaDeviceProp props;
    handle_error(cudaGetDeviceProperties(&props, 0));
    const unsigned int numThreadsPerClusterBlock =
        props.maxThreadsPerBlock < 128 ? props.maxThreadsPerBlock : 128;
    const unsigned int numClusterBlocks =
        ceil(numObjs / (double)numThreadsPerClusterBlock);

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
        std::cout << "ERROR: insufficient shared memory; rebuild without "
                     "BLOCK_SHARED_MEM_OPTIMIZATION defined" << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
        numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
        numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
            numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaDeviceSynchronize());

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
            sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
            numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i = 0; i < numObjs; i++) {
            /* find the array index of the nearest cluster center */
            index = membership[i];

            /* update new cluster centers: sum of objects located within */
            newClusterSize[index]++;
            for (j = 0; j < numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        // TODO: Flip the nesting order
        // TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sums and replace old cluster centers with newClusters */
        for (i = 0; i < numClusters; i++) {
            for (j = 0; j < numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0;    /* set back to 0 */
            }
            newClusterSize[i] = 0;          /* set back to 0 */
        }

        delta /= numObjs;
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for the returned clusters[] (coordinates
       of cluster centers) */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}
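
Below is a minimal host-side sketch of how cuda_kmeans might be driven. The
random test data, the cluster count, the 0.001 threshold, and the include
path are illustrative assumptions, not part of this commit; compile with
nvcc, since the header contains device code.

#include <cstdio>
#include <cstdlib>
#include "stim/cuda/kmeans.cuh"

int main() {
    const unsigned int numObjs = 10000, numCoords = 3, numClusters = 8;

    // Build a [numObjs][numCoords] table of random points in the unit cube.
    float **objects;
    malloc2D(objects, numObjs, numCoords, float);
    for (unsigned int i = 0; i < numObjs; i++)
        for (unsigned int j = 0; j < numCoords; j++)
            objects[i][j] = rand() / (float)RAND_MAX;

    // cuda_kmeans fills membership[] and returns [numClusters][numCoords].
    int *membership = (int *)malloc(numObjs * sizeof(int));
    float **centers = cuda_kmeans(objects, numCoords, numObjs, numClusters,
                                  0.001f, membership, 500);

    for (unsigned int k = 0; k < numClusters; k++)
        printf("center %u: %f %f %f\n", k, centers[k][0], centers[k][1], centers[k][2]);

    // Both 2-D arrays are contiguous (see malloc2D), so two frees each.
    free(objects[0]);  free(objects);
    free(centers[0]);  free(centers);
    free(membership);
    return 0;
}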