Commit 7f02091aaf8a5310eba59b3ab6158a809d9d7311

Authored by Davar
1 parent efbb4bf7

Added CUDA k-means clustering

Showing 1 changed file with 353 additions and 0 deletions
stim/cuda/kmeans.cuh 0 → 100644
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>

#include <cuda_runtime.h>

#define malloc2D(name, xDim, yDim, type) do { \
    name = (type **)malloc(xDim * sizeof(type *)); \
    assert(name != NULL); \
    name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
    assert(name[0] != NULL); \
    for (size_t i = 1; i < xDim; i++) \
        name[i] = name[i-1] + yDim; \
} while (0)
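
// malloc2D allocates the whole xDim-by-yDim array as a single contiguous
// block and points name[1..xDim-1] into it, which is what lets the code
// below hand name[0] directly to memset() and cudaMemcpy().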

static void handleError(cudaError_t error, const char* file, int line){
    if(error != cudaSuccess){
        std::cout << cudaGetErrorString(error) << " in " << file
                  << " at line " << line << std::endl;
        exit(1);
    }
}

#define handle_error(error) handleError(error, __FILE__, __LINE__)

static inline int nextPowerOfTwo(int n) {
    n--;

    n = n >> 1 | n;
    n = n >> 2 | n;
    n = n >> 4 | n;
    n = n >> 8 | n;
    n = n >> 16 | n;
    // n = n >> 32 | n; // For 64-bit ints

    return ++n;
}
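
// Example: for n = 37, n-1 = 36 = 0b100100; the shift-or cascade smears the
// highest set bit into every lower position (0b111111 = 63) and ++n returns
// 64. Powers of two map to themselves: nextPowerOfTwo(64) == 64.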

/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of the Euclidean distance between two multi-dimensional points */
__host__ __device__ inline static
float euclid_dist_2(int    numCoords,
                    int    numObjs,
                    int    numClusters,
                    float *objects,     // [numCoords][numObjs]
                    float *clusters,    // [numCoords][numClusters]
                    int    objectId,
                    int    clusterId)
{
    int i;
    float ans = 0.0;

    for (i = 0; i < numCoords; i++) {
        ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
               (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
    }

    return(ans);
}

/*----< find_nearest_cluster() >---------------------------------------------*/
__global__ static
void find_nearest_cluster(int    numCoords,
                          int    numObjs,
                          int    numClusters,
                          float *objects,         // [numCoords][numObjs]
                          float *deviceClusters,  // [numCoords][numClusters]
                          int   *membership,      // [numObjs]
                          int   *intermediates)
{
    extern __shared__ char sharedMemory[];

    // The type chosen for membershipChanged must be large enough to support
    // reductions! There are blockDim.x elements, one for each thread in the
    // block. See numThreadsPerClusterBlock in cuda_kmeans().
    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    float *clusters = (float *)(sharedMemory + blockDim.x);
#else
    float *clusters = deviceClusters;
#endif

    membershipChanged[threadIdx.x] = 0;

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    // BEWARE: We can overrun our shared memory here if there are too many
    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
    // shared memory per block.
    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
        for (int j = 0; j < numCoords; j++) {
            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
        }
    }
    __syncthreads();
#endif

    int objectId = blockDim.x * blockIdx.x + threadIdx.x;

    if (objectId < numObjs) {
        int index, i;
        float dist, min_dist;

        /* find the cluster id that has the minimum distance to this object */
        index = 0;
        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, 0);

        for (i = 1; i < numClusters; i++) {
            dist = euclid_dist_2(numCoords, numObjs, numClusters,
                                 objects, clusters, objectId, i);
            /* no need for a square root: it preserves the ordering */
            if (dist < min_dist) { /* find the min and its array index */
                min_dist = dist;
                index = i;
            }
        }

        if (membership[objectId] != index) {
            membershipChanged[threadIdx.x] = 1;
        }

        /* assign the membership to object objectId */
        membership[objectId] = index;
    }

    // The barriers below must be reached by *every* thread in the block,
    // including those with objectId >= numObjs, so the reduction sits
    // outside the guard above (out-of-range threads contribute 0).
    __syncthreads();    // For membershipChanged[]

    // blockDim.x *must* be a power of two!
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            membershipChanged[threadIdx.x] +=
                membershipChanged[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        intermediates[blockIdx.x] = membershipChanged[0];
    }
}

__global__ static
void compute_delta(int *deviceIntermediates,
                   int numIntermediates,    // The actual number of intermediates
                   int numIntermediates2)   // The next power of two
{
    // The number of elements in this array should be equal to
    // numIntermediates2, the number of threads launched. It *must* be a power
    // of two!
    extern __shared__ unsigned int intermediates[];

    // Copy global intermediate values into shared memory.
    intermediates[threadIdx.x] =
        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;

    __syncthreads();

    // numIntermediates2 *must* be a power of two!
    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        deviceIntermediates[0] = intermediates[0];
    }
}
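
// Reduction walk-through for numIntermediates2 = 8 and per-block change
// counts {3,1,4,1,5,0,0,0}: s = 4 gives {8,1,4,1,...}, s = 2 gives
// {12,2,...}, s = 1 gives {14,...}, so deviceIntermediates[0] ends up
// holding the total number of objects that changed membership.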

/*----< cuda_kmeans() >-------------------------------------------------------*/
//
// ----------------------------------------
// DATA LAYOUT
//
// objects        [numObjs][numCoords]
// clusters       [numClusters][numCoords]
// dimObjects     [numCoords][numObjs]
// dimClusters    [numCoords][numClusters]
// newClusters    [numCoords][numClusters]
// deviceObjects  [numCoords][numObjs]
// deviceClusters [numCoords][numClusters]
// ----------------------------------------
//
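// The transposed [numCoords][numObjs] layout is there for coalescing: in
// find_nearest_cluster, consecutive threads hold consecutive objectIds, so
// each read of objects[numObjs * i + objectId] touches consecutive
// addresses in global memory.
//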
/* return an array of cluster centers of size [numClusters][numCoords] */
float** cuda_kmeans(float **objects,          /* in: [numObjs][numCoords] */
                    unsigned int numCoords,   /* no. features */
                    unsigned int numObjs,     /* no. objects */
                    unsigned int numClusters, /* no. clusters */
                    float threshold,          /* fraction of objects changing membership */
                    int *membership,          /* out: [numObjs] */
                    int loops)
{
    int i, j, index, loop = 0;
    int *newClusterSize;    /* [numClusters]: no. objects assigned in each
                               new cluster */
    float delta;            /* fraction of objects that change their clusters */
    float **dimObjects;
    float **clusters;       /* out: [numClusters][numCoords] */
    float **dimClusters;
    float **newClusters;    /* [numCoords][numClusters] */

    float *deviceObjects;
    float *deviceClusters;
    int *deviceMembership;
    int *deviceIntermediates;

    // Copy objects given in [numObjs][numCoords] layout to new
    // [numCoords][numObjs] layout
    malloc2D(dimObjects, numCoords, numObjs, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numObjs; j++) {
            dimObjects[i][j] = objects[j][i];
        }
    }

    /* pick first numClusters elements of objects[] as initial cluster centers */
    malloc2D(dimClusters, numCoords, numClusters, float);
    for (i = 0; i < numCoords; i++) {
        for (j = 0; j < numClusters; j++) {
            dimClusters[i][j] = dimObjects[i][j];
        }
    }

    /* initialize membership[] */
    for (i = 0; i < numObjs; i++) membership[i] = -1;

    /* need to initialize newClusterSize and newClusters[0] to all 0 */
    newClusterSize = (int*) calloc(numClusters, sizeof(int));
    assert(newClusterSize != NULL);

    malloc2D(newClusters, numCoords, numClusters, float);
    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));

    // To support the reduction, numThreadsPerClusterBlock *must* be a power
    // of two, and because the per-thread change flags are summed into an
    // unsigned char it must not exceed 128. maxThreadsPerBlock (typically
    // 1024) would overflow that reduction, so cap the block size.
    cudaDeviceProp props;
    handle_error(cudaGetDeviceProperties(&props, 0));
    const unsigned int numThreadsPerClusterBlock =
        props.maxThreadsPerBlock < 128 ? props.maxThreadsPerBlock : 128;
    const unsigned int numClusterBlocks =
        ceil(numObjs / (double)numThreadsPerClusterBlock);

#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char) +
        numClusters * numCoords * sizeof(float);

    if (clusterBlockSharedDataSize > props.sharedMemPerBlock) {
        std::cout << "ERROR: insufficient shared memory; rebuild without "
                     "BLOCK_SHARED_MEM_OPTIMIZATION defined" << std::endl;
        exit(1);
    }
#else
    const unsigned int clusterBlockSharedDataSize =
        numThreadsPerClusterBlock * sizeof(unsigned char);
#endif

    const unsigned int numReductionThreads =
        nextPowerOfTwo(numClusterBlocks);
    const unsigned int reductionBlockSharedDataSize =
        numReductionThreads * sizeof(unsigned int);

    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));

    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
        numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
    handle_error(cudaMemcpy(deviceMembership, membership,
        numObjs*sizeof(int), cudaMemcpyHostToDevice));

    do {
        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
            numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));

        find_nearest_cluster
            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
            (numCoords, numObjs, numClusters,
             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);

        handle_error(cudaDeviceSynchronize());

        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
            (deviceIntermediates, numClusterBlocks, numReductionThreads);

        handle_error(cudaDeviceSynchronize());

        int d;
        handle_error(cudaMemcpy(&d, deviceIntermediates,
            sizeof(int), cudaMemcpyDeviceToHost));
        delta = (float)d;

        handle_error(cudaMemcpy(membership, deviceMembership,
            numObjs*sizeof(int), cudaMemcpyDeviceToHost));

        for (i = 0; i < numObjs; i++) {
            /* find the array index of the nearest cluster center */
            index = membership[i];

            /* update new cluster centers: sum of objects located within */
            newClusterSize[index]++;
            for (j = 0; j < numCoords; j++)
                newClusters[j][index] += objects[i][j];
        }

        // TODO: Flip the nesting order
        // TODO: Change layout of newClusters to [numClusters][numCoords]
        /* average the sums and replace old cluster centers with newClusters */
        for (i = 0; i < numClusters; i++) {
            for (j = 0; j < numCoords; j++) {
                if (newClusterSize[i] > 0)
                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
                newClusters[j][i] = 0.0;    /* set back to 0 */
            }
            newClusterSize[i] = 0;          /* set back to 0 */
        }

        delta /= numObjs;
    } while (delta > threshold && loop++ < loops);

    /* allocate a 2D space for the returned clusters[] (coordinates
       of cluster centers) */
    malloc2D(clusters, numClusters, numCoords, float);
    for (i = 0; i < numClusters; i++) {
        for (j = 0; j < numCoords; j++) {
            clusters[i][j] = dimClusters[j][i];
        }
    }

    handle_error(cudaFree(deviceObjects));
    handle_error(cudaFree(deviceClusters));
    handle_error(cudaFree(deviceMembership));
    handle_error(cudaFree(deviceIntermediates));

    free(dimObjects[0]);
    free(dimObjects);
    free(dimClusters[0]);
    free(dimClusters);
    free(newClusters[0]);
    free(newClusters);
    free(newClusterSize);

    return clusters;
}
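
Below is a minimal host-side sketch of how cuda_kmeans might be driven. The
random test data, the cluster count, the 0.001 threshold, and the include
path are illustrative assumptions, not part of this commit; compile with
nvcc, since the header contains device code.

#include <cstdio>
#include <cstdlib>
#include "stim/cuda/kmeans.cuh"

int main() {
    const unsigned int numObjs = 10000, numCoords = 3, numClusters = 8;

    // Build a [numObjs][numCoords] table of random points in the unit cube.
    float **objects;
    malloc2D(objects, numObjs, numCoords, float);
    for (unsigned int i = 0; i < numObjs; i++)
        for (unsigned int j = 0; j < numCoords; j++)
            objects[i][j] = rand() / (float)RAND_MAX;

    // cuda_kmeans fills membership[] and returns [numClusters][numCoords].
    int *membership = (int *)malloc(numObjs * sizeof(int));
    float **centers = cuda_kmeans(objects, numCoords, numObjs, numClusters,
                                  0.001f, membership, 500);

    for (unsigned int k = 0; k < numClusters; k++)
        printf("center %u: %f %f %f\n", k, centers[k][0], centers[k][1], centers[k][2]);

    // Both 2-D arrays are contiguous (see malloc2D), so two frees each.
    free(objects[0]);  free(objects);
    free(centers[0]);  free(centers);
    free(membership);
    return 0;
}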