Commit e1f4b2bc4cd43a6682791bbf75a7392d1037bbe8

Authored by David Mayerich
2 parents 7a615d7e 6911de0d

Merged branch davar into master

Showing 1 changed file with 366 additions and 0 deletions
stim/cuda/kmeans.cuh 0 → 100644
  1 +//This software is derived from Professor Wei-keng Liao's parallel k-means
  2 +//clustering code obtained on November 21, 2010 from
  3 +// http://users.eecs.northwestern.edu/~wkliao/Kmeans/index.html
  4 +//(http://users.eecs.northwestern.edu/~wkliao/Kmeans/simple_kmeans.tar.gz).
  5 +//
  6 +//With his permission, Serban Giuroiu published a CUDA implementation based on that code
  7 +//under the open-source MIT license. See the LICENSE file for more details.
  8 +
  9 +// The original code can be found on GitHub ( https://github.com/serban/kmeans )
  10 +// Here I have made only a few changes to get it to work
  11 +#include <iostream>
  12 +#include <cstdlib>
  13 +#include <cassert>
  14 +#include <cstring>
  15 +#define malloc2D(name, xDim, yDim, type) do { \
  16 + name = (type **)malloc(xDim * sizeof(type *)); \
  17 + assert(name != NULL); \
  18 + name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
  19 + assert(name[0] != NULL); \
  20 + for (size_t i = 1; i < xDim; i++) \
  21 + name[i] = name[i-1] + yDim; \
  22 +} while (0)
  23 +
  24 +
  25 +
  26 +static void handleError(cudaError_t error, const char* file, int line){
  27 +
  28 + if(error != cudaSuccess){
  29 + std::cout << cudaGetErrorString(error) << " in " << file << " at line " << line << std::endl;
  30 + exit(1);
  31 + }
  32 +}
  33 +
  34 +#define handle_error(error) handleError(error, __FILE__ , __LINE__)
  35 +
  36 +
  37 +
  38 +static inline int nextPowerOfTwo(int n) {
  39 + n--; // ensures a value that is already a power of two maps to itself
  40 + // smear the highest set bit into all lower bits, then add one to round up
  41 + n = n >> 1 | n;
  42 + n = n >> 2 | n;
  43 + n = n >> 4 | n;
  44 + n = n >> 8 | n;
  45 + n = n >> 16 | n;
  46 + // n = n >> 32 | n; // For 64-bit ints
  47 +
  48 + return ++n;
  49 +}
  50 +
  51 +/*----< euclid_dist_2() >----------------------------------------------------*/
  52 +/* squared Euclidean distance between two multi-dimensional points */
  53 +__host__ __device__ inline static
  54 +float euclid_dist_2(int numCoords,
  55 + int numObjs,
  56 + int numClusters,
  57 + float *objects, // [numCoords][numObjs]
  58 + float *clusters, // [numCoords][numClusters]
  59 + int objectId,
  60 + int clusterId)
  61 +{
  62 + int i;
  63 + float ans=0.0;
  64 +
  65 + for (i = 0; i < numCoords; i++) {
  66 + ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
  67 + (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
  68 + }
  69 +
  70 + return(ans);
  71 +}
  72 +
  73 +/*----< find_nearest_cluster() >---------------------------------------------*/
  74 +__global__ static
  75 +void find_nearest_cluster(int numCoords,
  76 + int numObjs,
  77 + int numClusters,
  78 + float *objects, // [numCoords][numObjs]
  79 + float *deviceClusters, // [numCoords][numClusters]
  80 + int *membership, // [numObjs]
  81 + int *intermediates)
  82 +{
  83 + extern __shared__ char sharedMemory[];
  84 +
  85 + // The type chosen for membershipChanged must be large enough to hold the
  86 + // per-block sum produced by the reduction below. There are blockDim.x
  87 + // elements, one per thread; see numThreadsPerClusterBlock in cuda_kmeans().
  88 + unsigned int *membershipChanged = (unsigned int *)sharedMemory;
  89 +#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
  90 + float *clusters = (float *)(sharedMemory + blockDim.x * sizeof(unsigned int));
  91 +#else
  92 + float *clusters = deviceClusters;
  93 +#endif
  94 +
  95 + membershipChanged[threadIdx.x] = 0;
  96 +
  97 +#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
  98 + // BEWARE: We can overrun our shared memory here if there are too many
  99 + // clusters or too many coordinates! For reference, a Tesla C1060 has 16
  100 + // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
  101 + // shared memory per block.
  102 + for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
  103 + for (int j = 0; j < numCoords; j++) {
  104 + clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
  105 + }
  106 + }
  107 + __syncthreads();
  108 +#endif
  109 +
  110 + int objectId = blockDim.x * blockIdx.x + threadIdx.x;
  111 +
  112 + if (objectId < numObjs) {
  113 + int index, i;
  114 + float dist, min_dist;
  115 +
  116 + /* find the cluster id that has min distance to object */
  117 + index = 0;
  118 + min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
  119 + objects, clusters, objectId, 0);
  120 +
  121 + for (i=1; i<numClusters; i++) {
  122 + dist = euclid_dist_2(numCoords, numObjs, numClusters,
  123 + objects, clusters, objectId, i);
  124 + /* no need to compute the square root */
  125 + if (dist < min_dist) { /* find the min and its array index */
  126 + min_dist = dist;
  127 + index = i;
  128 + }
  129 + }
  130 +
  131 + if (membership[objectId] != index) {
  132 + membershipChanged[threadIdx.x] = 1;
  133 + }
  134 +
  135 + /* assign the membership to object objectId */
  136 + membership[objectId] = index;
  137 + }
  138 +
  139 + // every thread in the block must reach the barriers below, so the reduction sits outside the objectId check
  140 + __syncthreads(); // For membershipChanged[]
  141 +
  142 + // blockDim.x *must* be a power of two!
  143 + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
  144 + if (threadIdx.x < s) {
  145 + membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
  146 + }
  147 + __syncthreads();
  148 + }
  149 +
  150 + if (threadIdx.x == 0) {
  151 + intermediates[blockIdx.x] = membershipChanged[0];
  152 + }
  153 +}
  154 +
  155 +__global__ static
  156 +void compute_delta(int *deviceIntermediates,
  157 + int numIntermediates, // The actual number of intermediates
  158 + int numIntermediates2) // The next power of two
  159 +{
  160 + // The number of elements in this array should be equal to
  161 + // numIntermediates2, the number of threads launched. It *must* be a power
  162 + // of two!
  163 + extern __shared__ unsigned int intermediates[];
  164 +
  165 + // Copy global intermediate values into shared memory.
  166 + intermediates[threadIdx.x] =
  167 + (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;
  168 +
  169 + __syncthreads();
  170 +
  171 + // numIntermediates2 *must* be a power of two!
  172 + for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
  173 + if (threadIdx.x < s) {
  174 + intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
  175 + }
  176 + __syncthreads();
  177 + }
  178 +
  179 + if (threadIdx.x == 0) {
  180 + deviceIntermediates[0] = intermediates[0];
  181 + }
  182 +}
  183 +
  184 +/*----< cuda_kmeans() >-------------------------------------------------------*/
  185 +//
  186 +// ----------------------------------------
  187 +// DATA LAYOUT
  188 +//
  189 +// objects [numObjs][numCoords]
  190 +// clusters [numClusters][numCoords]
  191 +// dimObjects [numCoords][numObjs]
  192 +// dimClusters [numCoords][numClusters]
  193 +// newClusters [numCoords][numClusters]
  194 +// deviceObjects [numCoords][numObjs]
  195 +// deviceClusters [numCoords][numClusters]
  196 +// ----------------------------------------
  197 +//
  198 +/* return an array of cluster centers of size [numClusters][numCoords] */
  199 +float** cuda_kmeans(float **objects, /* in: [numObjs][numCoords] */
  200 + unsigned int numCoords, /* no. features */
  201 + unsigned int numObjs, /* no. objects */
  202 + unsigned int numClusters, /* no. clusters */
  203 + float threshold, /* % objects change membership */
  204 + int *membership, /* out: [numObjs] */
  205 + int loops)
  206 +{
  207 + int i, j, index, loop=0;
  208 + int *newClusterSize; /* [numClusters]: no. objects assigned in each
  209 + new cluster */
  210 + float delta; /* % of objects change their clusters */
  211 + float **dimObjects;
  212 + float **clusters; /* out: [numClusters][numCoords] */
  213 + float **dimClusters;
  214 + float **newClusters; /* [numCoords][numClusters] */
  215 +
  216 + float *deviceObjects;
  217 + float *deviceClusters;
  218 + int *deviceMembership;
  219 + int *deviceIntermediates;
  220 +
  221 + // Copy objects given in [numObjs][numCoords] layout to new
  222 + // [numCoords][numObjs] layout
  223 + malloc2D(dimObjects, numCoords, numObjs, float);
  224 + for (i = 0; i < numCoords; i++) {
  225 + for (j = 0; j < numObjs; j++) {
  226 + dimObjects[i][j] = objects[j][i];
  227 + }
  228 + }
  229 +
  230 + /* pick first numClusters elements of objects[] as initial cluster centers*/
  231 + malloc2D(dimClusters, numCoords, numClusters, float);
  232 + for (i = 0; i < numCoords; i++) {
  233 + for (j = 0; j < numClusters; j++) {
  234 + dimClusters[i][j] = dimObjects[i][j];
  235 + }
  236 + }
  237 +
  238 + /* initialize membership[] */
  239 + for (i=0; i<numObjs; i++) membership[i] = -1;
  240 +
  241 + /* need to initialize newClusterSize and newClusters[0] to all 0 */
  242 + newClusterSize = (int*) calloc(numClusters, sizeof(int));
  243 + assert(newClusterSize != NULL);
  244 +
  245 + malloc2D(newClusters, numCoords, numClusters, float);
  246 + memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));
  247 +
  248 + // To support the reduction, numThreadsPerClusterBlock *must* be a power of
  249 + // two. Each thread records its membership change in an unsigned int in
  250 + // shared memory, so the per-block sum cannot overflow even when every
  251 + // thread in a full-sized block changes cluster.
  252 + cudaDeviceProp props;
  253 + handle_error(cudaGetDeviceProperties(&props, 0));
  254 + const unsigned int numThreadsPerClusterBlock = props.maxThreadsPerBlock;
  255 + const unsigned int numClusterBlocks =
  256 + (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
  257 +
  258 +#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
  259 + const unsigned int clusterBlockSharedDataSize =
  260 + numThreadsPerClusterBlock * sizeof(unsigned int) +
  261 + numClusters * numCoords * sizeof(float);
  262 +
  263 + cudaDeviceProp deviceProp;
  264 + int deviceNum;
  265 + cudaGetDevice(&deviceNum);
  266 + cudaGetDeviceProperties(&deviceProp, deviceNum);
  267 +
  268 + if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
  269 + std::cout << "ERROR: insufficient shared memory per block; rebuild without defining BLOCK_SHARED_MEM_OPTIMIZATION" << std::endl;
  270 + exit(1);
  271 + }
  272 +#else
  273 + const unsigned int clusterBlockSharedDataSize =
  274 + numThreadsPerClusterBlock * sizeof(unsigned int);
  275 +#endif
  276 +
  277 + const unsigned int numReductionThreads =
  278 + nextPowerOfTwo(numClusterBlocks);
  279 + const unsigned int reductionBlockSharedDataSize =
  280 + numReductionThreads * sizeof(unsigned int);
  281 +
  282 + handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
  283 + handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
  284 + handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
  285 + handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));
  286 +
  287 + handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
  288 + numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
  289 + handle_error(cudaMemcpy(deviceMembership, membership,
  290 + numObjs*sizeof(int), cudaMemcpyHostToDevice));
  291 +
  292 + do {
  293 + handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
  294 + numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));
  295 +
  296 + find_nearest_cluster
  297 + <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
  298 + (numCoords, numObjs, numClusters,
  299 + deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);
  300 +
  301 + handle_error(cudaDeviceSynchronize());
  302 +
  303 + compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
  304 + (deviceIntermediates, numClusterBlocks, numReductionThreads);
  305 +
  306 + handle_error(cudaDeviceSynchronize());
  307 +
  308 + int d;
  309 + handle_error(cudaMemcpy(&d, deviceIntermediates,
  310 + sizeof(int), cudaMemcpyDeviceToHost));
  311 + delta = (float)d;
  312 +
  313 + handle_error(cudaMemcpy(membership, deviceMembership,
  314 + numObjs*sizeof(int), cudaMemcpyDeviceToHost));
  315 +
  316 + for (i=0; i<numObjs; i++) {
  317 + /* find the array index of the nearest cluster center */
  318 + index = membership[i];
  319 +
  320 + /* update new cluster centers: sum the objects assigned to each cluster */
  321 + newClusterSize[index]++;
  322 + for (j=0; j<numCoords; j++)
  323 + newClusters[j][index] += objects[i][j];
  324 + }
  325 +
  326 + // TODO: Flip the nesting order
  327 + // TODO: Change layout of newClusters to [numClusters][numCoords]
  328 + /* average the sum and replace old cluster centers with newClusters */
  329 + for (i=0; i<numClusters; i++) {
  330 + for (j=0; j<numCoords; j++) {
  331 + if (newClusterSize[i] > 0)
  332 + dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
  333 + newClusters[j][i] = 0.0; /* set back to 0 */
  334 + }
  335 + newClusterSize[i] = 0; /* set back to 0 */
  336 + }
  337 +
  338 + delta /= numObjs;
  339 + } while (delta > threshold && loop++ < loops);
  340 +
  341 +
  342 +
  343 + /* allocate a 2D space for returning variable clusters[] (coordinates
  344 + of cluster centers) */
  345 + malloc2D(clusters, numClusters, numCoords, float);
  346 + for (i = 0; i < numClusters; i++) {
  347 + for (j = 0; j < numCoords; j++) {
  348 + clusters[i][j] = dimClusters[j][i];
  349 + }
  350 + }
  351 +
  352 + handle_error(cudaFree(deviceObjects));
  353 + handle_error(cudaFree(deviceClusters));
  354 + handle_error(cudaFree(deviceMembership));
  355 + handle_error(cudaFree(deviceIntermediates));
  356 +
  357 + free(dimObjects[0]);
  358 + free(dimObjects);
  359 + free(dimClusters[0]);
  360 + free(dimClusters);
  361 + free(newClusters[0]);
  362 + free(newClusters);
  363 + free(newClusterSize);
  364 +
  365 + return clusters;
  366 +}
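
For reference, below is a minimal host-side sketch of how cuda_kmeans() might be called, compiled with nvcc since the header contains device code. The include path, data values, threshold of 0.001, and loop cap of 500 are illustrative assumptions, not part of this commit.

// Hypothetical usage sketch: cluster numObjs three-dimensional points into
// numClusters groups using the cuda_kmeans() defined above.
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include "stim/cuda/kmeans.cuh"   // assumed include path for this header

int main() {
    const unsigned int numObjs = 10000, numCoords = 3, numClusters = 8;

    // objects[numObjs][numCoords], contiguous, matching the documented layout
    float **objects;
    malloc2D(objects, numObjs, numCoords, float);
    for (unsigned int i = 0; i < numObjs; i++)
        for (unsigned int j = 0; j < numCoords; j++)
            objects[i][j] = (float)rand() / RAND_MAX;   // placeholder data

    int *membership = (int *)malloc(numObjs * sizeof(int));
    assert(membership != NULL);

    // threshold 0.001: stop once fewer than 0.1% of objects change cluster
    // in an iteration; 500 caps the number of iterations.
    float **centers = cuda_kmeans(objects, numCoords, numObjs, numClusters,
                                  0.001f, membership, 500);

    printf("first center: %f %f %f\n", centers[0][0], centers[0][1], centers[0][2]);

    free(objects[0]); free(objects);
    free(centers[0]); free(centers);
    free(membership);
    return 0;
}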