// kmeans.cuh
//This software is derived from Professor Wei-keng Liao's parallel k-means
//clustering code obtained on November 21, 2010 from
// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html
//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz).
//
//With his permission, Serban Giuroiu is publishing his CUDA implementation based on his code
//under the open-source MIT license. See the LICENSE file for more details.

// The original code can be found on Github ( https://github.com/serban/kmeans )
// Here I have just made a few changes to get it to work

/* malloc2D: allocate a 2D array of `type` with dimensions [xDim][yDim] as a
 * single contiguous backing buffer plus a row-pointer table.
 *   - name[0] points at the contiguous xDim*yDim element buffer, so the whole
 *     array can be memcpy'd/memset as one block (see newClusters[0] below).
 *   - Rows 1..xDim-1 are aliased views into that buffer at yDim strides.
 * Free with: free(name[0]); free(name);
 * Aborts via assert() on allocation failure. */
#define malloc2D(name, xDim, yDim, type) do {               \
name = (type **)malloc(xDim * sizeof(type *));          \
assert(name != NULL);                                   \
name[0] = (type *)malloc(xDim * yDim * sizeof(type));   \
assert(name[0] != NULL);                                \
for (size_t i = 1; i < xDim; i++)                       \
name[i] = name[i-1] + yDim;                         \
} while (0)

/* Report a failed CUDA runtime call and abort the process.
 * Does nothing when `error` is cudaSuccess; otherwise prints the CUDA error
 * string plus the call site (file/line) and exits with status 1.
 * Normally invoked through the handle_error() macro below. */
static void handleError(cudaError_t error, const char* file, int line){
    if (error == cudaSuccess)
        return;
    cout << cudaGetErrorString(error) << " in " << file << " at line " << line << endl;
    exit(1);
}

#define  handle_error(error)  handleError(error, __FILE__ , __LINE__)

/* Round n up to the next power of two; a value that is already a power of
 * two is returned unchanged. Uses the classic bit-smearing trick: after the
 * shifts every bit below the highest set bit of n-1 is set, so adding one
 * yields the next power of two. Valid for 32-bit ints (n >= 1). */
static inline int nextPowerOfTwo(int n) {
    --n;

    n |= n >> 1;
    n |= n >> 2;
    n |= n >> 4;
    n |= n >> 8;
    n |= n >> 16;
    // n |= n >> 32;    //  For 64-bit ints

    return n + 1;
}

/*----< euclid_dist_2() >----------------------------------------------------*/
/* Square of the Euclidean distance between object `objectId` and cluster
 * center `clusterId`. Both arrays are stored coordinate-major, so coordinate
 * c of object o lives at objects[numObjs * c + o] (coalesced on the device).
 * The square root is deliberately omitted: it is monotonic, so comparisons
 * for the nearest cluster are unaffected.                                   */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    float sum = 0.0;

    for (int c = 0; c < numCoords; c++) {
        float diff = objects[numObjs * c + objectId]
                   - clusters[numClusters * c + clusterId];
        sum += diff * diff;
    }

    return sum;
}

/*----< find_nearest_cluster() >---------------------------------------------*/
/* One thread per object: assign each object to its nearest cluster center,
 * record whether its membership changed, and tree-reduce the per-block count
 * of changes into intermediates[blockIdx.x].
 *
 * Launch requirements (see cuda_kmeans()):
 *   - blockDim.x *must* be a power of two (shared-memory tree reduction)
 *   - dynamic shared memory: blockDim.x bytes for membershipChanged, plus
 *     numClusters*numCoords floats when BLOCK_SHARED_MEM_OPTIMIZATION is set
 */
__global__ static
void find_nearest_cluster(int numCoords,
                          int numObjs,
                          int numClusters,
                          float *objects,           //  [numCoords][numObjs]
                          float *deviceClusters,    //  [numCoords][numClusters]
                          int *membership,          //  [numObjs]
                          int *intermediates)       //  [numClusterBlocks]
{
    extern __shared__ char sharedMemory[];

    //  The type chosen for membershipChanged must be large enough to support
    //  reductions! There are blockDim.x elements, one for each thread in the
    //  block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    //  Shared memory is uninitialized; every thread must zero its own flag
    //  before the reduction, including threads with no object to process.
    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    //  BEWARE: We can overrun our shared memory here if there are too many
    //  clusters or too many coordinates! For reference, a Tesla C1060 has 16
    //  KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    //  shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();    //  clusters[] must be fully staged before use
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int   index, i;
        float dist, min_dist;

        /* find the cluster id that has min distance to object */
        index    = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i=1; i<numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need square root */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index    = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    //  The barrier and reduction sit OUTSIDE the objectId guard so that every
    //  thread in the block reaches __syncthreads(), even in the last,
    //  partially-filled block (a barrier in divergent code is undefined).
    __syncthreads();    //  For membershipChanged[]

    //  blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}

/* Sum the per-block membership-change counts into deviceIntermediates[0].
 * Launched as a single block: <<<1, numIntermediates2,
 * numIntermediates2 * sizeof(unsigned int)>>>, where numIntermediates2 is
 * nextPowerOfTwo(numIntermediates) so the tree reduction is well-formed. */
__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    //  The actual number of intermediates
                   int numIntermediates2)   //  The next power of two
{
    //  The number of elements in this array should be equal to
    //  numIntermediates2, the number of threads launched. It *must* be a power
    //  of two!
    extern __shared__ unsigned int intermediates[];

    //  Copy global intermediate values into shared memory, zero-padding the
    //  tail up to the power-of-two thread count.
    intermediates[threadIdx.x] =
        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;

    __syncthreads();

    //  numIntermediates2 *must* be a power of two!
    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}

/*----< cuda_kmeans() >-------------------------------------------------------*/
//
//  ----------------------------------------
//  DATA LAYOUT
//
//  objects         [numObjs][numCoords]
//  clusters        [numClusters][numCoords]
//  dimObjects      [numCoords][numObjs]
//  dimClusters     [numCoords][numClusters]
//  newClusters     [numCoords][numClusters]
//  deviceObjects   [numCoords][numObjs]
//  deviceClusters  [numCoords][numClusters]
//  ----------------------------------------
//
/* return an array of cluster centers of size [numClusters][numCoords];
   caller owns the result (free(clusters[0]); free(clusters);).             */
float** cuda_kmeans(float **objects,      /* in: [numObjs][numCoords] */
                    unsigned int     numCoords,    /* no. features */
                    unsigned int     numObjs,      /* no. objects */
                    unsigned int     numClusters,  /* no. clusters */
                    float   threshold,    /* % objects change membership */
                    int    *membership,   /* out: [numObjs] */
                    int	   loops)         /* max no. of iterations */
{
    int      i, j, index, loop=0;
    int     *newClusterSize; /* [numClusters]: no. objects assigned in each
                                new cluster */
    float    delta;          /* % of objects change their clusters */
    float  **dimObjects;     /* transposed copy of objects */
    float  **clusters;       /* out: [numClusters][numCoords] */
    float  **dimClusters;
    float  **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    //  Copy objects given in [numObjs][numCoords] layout to new
    //  [numCoords][numObjs] layout so device reads are coalesced
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers*/
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[] */
    for (i=0; i<numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    //  To support reduction, numThreadsPerClusterBlock *must* be a power of
    //  two, and it *must* be no larger than the number of bits that will
    //  fit into an unsigned char, the type used to keep track of membership
    //  changes in the kernel.
    const unsigned int numThreadsPerClusterBlock = 128;
    const unsigned int numClusterBlocks =
        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    //  One membership-change flag per thread plus a block-local copy of the
    //  cluster centers.
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    cudaDeviceProp deviceProp;
    int deviceNum;
    cudaGetDevice(&deviceNum);
    cudaGetDeviceProperties(&deviceProp, deviceNum);

    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
        std::cout << "ERROR: insufficient shared memory. Please don't use the definition 'BLOCK_SHARED_MEM_OPTIMIZATION'" << endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads = nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
                 numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
                 numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
                     numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaGetLastError());       /* launch-configuration errors */
        handle_error(cudaDeviceSynchronize());  /* asynchronous execution errors */

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaGetLastError());
        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
                     sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
                     numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i=0; i<numObjs; i++) {
            /* find the array index of nearest cluster center */
            index = membership[i];

            /* update new cluster centers : sum of objects located within */
            newClusterSize[index]++;
            for (j=0; j<numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        //  TODO: Flip the nesting order
        //  TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sum and replace old cluster centers with newClusters */
        for (i=0; i<numClusters; i++) {
            for (j=0; j<numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0;   /* set back to 0 */
            }
            newClusterSize[i] = 0;   /* set back to 0 */
        }

        delta /= numObjs;
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for returning variable clusters[] (coordinates
       of cluster centers), transposed back to [numClusters][numCoords] */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}