// This software is derived from Professor Wei-keng Liao's parallel k-means
// clustering code obtained on November 21, 2010 from
// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html
// (http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz).
//
// With his permission, Serban Giuroiu published his CUDA implementation,
// based on that code, under the open-source MIT license. See the LICENSE
// file for more details. The original code can be found on GitHub
// ( https://github.com/serban/kmeans ).
//
// Here I have only made a few small changes to get it to build and run.

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iostream>

#define malloc2D(name, xDim, yDim, type) do {                   \
    name = (type **)malloc(xDim * sizeof(type *));              \
    assert(name != NULL);                                       \
    name[0] = (type *)malloc(xDim * yDim * sizeof(type));       \
    assert(name[0] != NULL);                                    \
    for (size_t i = 1; i < xDim; i++)                           \
        name[i] = name[i-1] + yDim;                             \
} while (0)

static void handleError(cudaError_t error, const char* file, int line)
{
    if (error != cudaSuccess) {
        std::cerr << cudaGetErrorString(error) << " in " << file
                  << " at line " << line << std::endl;
        exit(1);
    }
}

#define handle_error(error) handleError(error, __FILE__, __LINE__)

/* round n up to the next power of two (valid for 32-bit ints) */
static inline int nextPowerOfTwo(int n)
{
    n--;
    n = n >>  1 | n;
    n = n >>  2 | n;
    n = n >>  4 | n;
    n = n >>  8 | n;
    n = n >> 16 | n;
//  n = n >> 32 | n;    //  For 64-bit ints
    return ++n;
}
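// A minimal self-check for nextPowerOfTwo(), compiled only when
// KMEANS_SELF_TEST is defined. The macro name and this helper are assumptions
// added here for illustration; they are not part of the original build.
#ifdef KMEANS_SELF_TEST
static void testNextPowerOfTwo()
{
    assert(nextPowerOfTwo(1)   == 1);    // already a power of two
    assert(nextPowerOfTwo(5)   == 8);    // rounds 5 up to 8
    assert(nextPowerOfTwo(128) == 128);  // exact powers map to themselves
    assert(nextPowerOfTwo(129) == 256);  // one past a power doubles
}
#endif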
/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of the Euclidean distance between two multi-dimensional points     */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    int   i;
    float ans = 0.0;

    for (i = 0; i < numCoords; i++) {
        ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
               (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
    }

    return ans;
}

/*----< find_nearest_cluster() >---------------------------------------------*/
__global__ static
void find_nearest_cluster(int    numCoords,
                          int    numObjs,
                          int    numClusters,
                          float *objects,          // [numCoords][numObjs]
                          float *deviceClusters,   // [numCoords][numClusters]
                          int   *membership,       // [numObjs]
                          int   *intermediates)
{
    extern __shared__ char sharedMemory[];

    //  The type chosen for membershipChanged must be large enough to support
    //  reductions! There are blockDim.x elements, one for each thread in the
    //  block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    //  The float array starts blockDim.x bytes in; this offset is
    //  float-aligned for the 128-thread blocks used in cuda_kmeans().
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    //  BEWARE: We can overrun our shared memory here if there are too many
    //  clusters or too many coordinates! For reference, a Tesla C1060 has 16
    //  KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    //  shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int   index, i;
        float dist, min_dist;

        /* find the cluster id that has the minimum distance to this object */
        index    = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i = 1; i < numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need for a square root: it is monotonic */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index    = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    //  Every thread, including those with objectId >= numObjs, must reach
    //  the barriers below, so the reduction stays outside the if-block.
    __syncthreads();    //  for membershipChanged[]

    //  blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}

__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    //  The actual number of intermediates
                   int numIntermediates2)   //  The next power of two
{
    //  The number of elements in this array should be equal to
    //  numIntermediates2, the number of threads launched. It *must* be a
    //  power of two!
    extern __shared__ unsigned int intermediates[];

    //  Copy global intermediate values into shared memory.
    intermediates[threadIdx.x] =
        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;

    __syncthreads();

    //  numIntermediates2 *must* be a power of two!
    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}
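// For clarity: compute_delta() is just a parallel tree sum of the per-block
// change counts. A host-side equivalent (a hypothetical helper compiled only
// under KMEANS_SELF_TEST, added here for illustration) would be:
#ifdef KMEANS_SELF_TEST
static int computeDeltaReference(const int *intermediates, int numIntermediates)
{
    //  Entries past numIntermediates are zero-filled in the kernel's
    //  shared-memory copy, so a plain sum over the valid range matches it.
    int sum = 0;
    for (int i = 0; i < numIntermediates; i++)
        sum += intermediates[i];
    return sum;
}
#endif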
/*----< cuda_kmeans() >-------------------------------------------------------*/
//
//  ----------------------------------------
//  DATA LAYOUT
//
//  objects         [numObjs][numCoords]
//  clusters        [numClusters][numCoords]
//  dimObjects      [numCoords][numObjs]
//  dimClusters     [numCoords][numClusters]
//  newClusters     [numCoords][numClusters]
//  deviceObjects   [numCoords][numObjs]
//  deviceClusters  [numCoords][numClusters]
//  ----------------------------------------
//
/* return an array of cluster centers of size [numClusters][numCoords] */
float** cuda_kmeans(float **objects,          /* in: [numObjs][numCoords] */
                    unsigned int numCoords,   /* no. features */
                    unsigned int numObjs,     /* no. objects */
                    unsigned int numClusters, /* no. clusters */
                    float threshold,          /* % objects change membership */
                    int *membership,          /* out: [numObjs] */
                    int loops)
{
    int      i, j, index, loop = 0;
    int     *newClusterSize;  /* [numClusters]: no. objects assigned in each
                                 new cluster */
    float    delta;           /* % of objects that change their clusters */
    float  **dimObjects;
    float  **clusters;        /* out: [numClusters][numCoords] */
    float  **dimClusters;
    float  **newClusters;     /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int   *deviceMembership;
    int   *deviceIntermediates;

    //  Copy objects given in [numObjs][numCoords] layout to new
    //  [numCoords][numObjs] layout
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick the first numClusters elements of objects[] as initial cluster
       centers */
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[] */
    for (i = 0; i < numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    //  To support the reduction, numThreadsPerClusterBlock *must* be a power
    //  of two, and it *must* be no larger than the number of bits that fit
    //  into an unsigned char, the type used to keep track of membership
    //  changes in the kernel.
    const unsigned int numThreadsPerClusterBlock = 128;
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    cudaDeviceProp deviceProp;
    int deviceNum;
    cudaGetDevice(&deviceNum);
    cudaGetDeviceProperties(&deviceProp, deviceNum);

    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
        std::cerr << "ERROR: insufficient shared memory per block. "
                     "Recompile without defining BLOCK_SHARED_MEM_OPTIMIZATION."
                  << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects,
                            numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters,
                            numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates,
                            numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
                            numObjs*numCoords*sizeof(float),
                            cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
                            numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                                numClusters*numCoords*sizeof(float),
                                cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock,
                clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership,
             deviceIntermediates);

        handle_error(cudaDeviceSynchronize());

        compute_delta
            <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates, sizeof(int),
                                cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                                numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i = 0; i < numObjs; i++) {
            /* find the array index of the nearest cluster center */
            index = membership[i];

            /* update new cluster centers: sum of objects located within */
            newClusterSize[index]++;
            for (j = 0; j < numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        /* average the sums and replace old cluster centers with newClusters */
        for (i = 0; i < numClusters; i++) {
            for (j = 0; j < numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0;  /* set back to 0 */
            }
            newClusterSize[i] = 0;        /* set back to 0 */
        }

        delta /= numObjs;
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for the return variable clusters[] (coordinates
       of cluster centers) */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}
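// Example driver showing how cuda_kmeans() is intended to be called. This is
// a hypothetical harness added for illustration only (the macro name, data
// sizes, and random initialization are assumptions, not part of the original
// code); build with -DKMEANS_EXAMPLE to try it.
#ifdef KMEANS_EXAMPLE
int main()
{
    const unsigned int numObjs     = 1024; /* assumed sample size */
    const unsigned int numCoords   = 2;
    const unsigned int numClusters = 4;

    /* objects[numObjs][numCoords], filled with random coordinates in [0,1) */
    float **objects;
    malloc2D(objects, numObjs, numCoords, float);
    for (size_t i = 0; i < numObjs; i++)
        for (size_t j = 0; j < numCoords; j++)
            objects[i][j] = rand() / (float)RAND_MAX;

    int *membership = (int *)malloc(numObjs * sizeof(int));
    assert(membership != NULL);

    /* threshold 0.001: stop once fewer than 0.1% of objects change cluster;
       500: upper bound on the number of iterations */
    float **clusters = cuda_kmeans(objects, numCoords, numObjs, numClusters,
                                   0.001f, membership, 500);

    std::cout << "first center: (" << clusters[0][0] << ", "
              << clusters[0][1] << ")" << std::endl;

    free(objects[0]);  free(objects);
    free(membership);
    free(clusters[0]); free(clusters);
    return 0;
}
#endif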