diff --git a/stim/cuda/kmeans.cuh b/stim/cuda/kmeans.cuh
new file mode 100644
index 0000000..f5a56bd
--- /dev/null
+++ b/stim/cuda/kmeans.cuh
@@ -0,0 +1,353 @@
+#ifndef STIM_CUDA_KMEANS_H
+#define STIM_CUDA_KMEANS_H
+
+#include <iostream>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <cuda_runtime.h>
+
+// Allocate a 2D array of xDim rows and yDim columns as one contiguous block,
+// so that name[0] can be passed directly to cudaMemcpy.
+#define malloc2D(name, xDim, yDim, type) do { \
+    name = (type **)malloc(xDim * sizeof(type *)); \
+    assert(name != NULL); \
+    name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
+    assert(name[0] != NULL); \
+    for (size_t i = 1; i < xDim; i++) \
+        name[i] = name[i-1] + yDim; \
+} while (0)
+
+static void handleError(cudaError_t error, const char* file, int line){
+    if(error != cudaSuccess){
+        std::cout << cudaGetErrorString(error) << " in " << file << " at line " << line << std::endl;
+        exit(1);
+    }
+}
+
+#define handle_error(error) handleError(error, __FILE__, __LINE__)
+
+// Round n up to the next power of two by smearing the highest set bit into
+// all lower bits, then adding one.
+static inline int nextPowerOfTwo(int n) {
+    n--;
+
+    n = n >> 1 | n;
+    n = n >> 2 | n;
+    n = n >> 4 | n;
+    n = n >> 8 | n;
+    n = n >> 16 | n;
+    // n = n >> 32 | n;    // For 64-bit ints
+
+    return ++n;
+}
+
+/*----< euclid_dist_2() >----------------------------------------------------*/
+/* square of the Euclidean distance between two multi-dimensional points */
+__host__ __device__ inline static
+float euclid_dist_2(int    numCoords,
+                    int    numObjs,
+                    int    numClusters,
+                    float *objects,     // [numCoords][numObjs]
+                    float *clusters,    // [numCoords][numClusters]
+                    int    objectId,
+                    int    clusterId)
+{
+    int i;
+    float ans = 0.0;
+
+    for (i = 0; i < numCoords; i++) {
+        ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
+               (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
+    }
+
+    return(ans);
+}
+
+/*----< find_nearest_cluster() >---------------------------------------------*/
+__global__ static
+void find_nearest_cluster(int    numCoords,
+                          int    numObjs,
+                          int    numClusters,
+                          float *objects,           // [numCoords][numObjs]
+                          float *deviceClusters,    // [numCoords][numClusters]
+                          int   *membership,        // [numObjs]
+                          int   *intermediates)
+{
+    extern __shared__ char sharedMemory[];
+
+    // The type chosen for membershipChanged must be large enough to support
+    // reductions! There are blockDim.x elements, one for each thread in the
+    // block. See numThreadsPerClusterBlock in cuda_kmeans().
+    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    float *clusters = (float *)(sharedMemory + blockDim.x);
+#else
+    float *clusters = deviceClusters;
+#endif
+
+    membershipChanged[threadIdx.x] = 0;
+
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    // BEWARE: We can overrun our shared memory here if there are too many
+    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
+    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
+    // shared memory per block.
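+    // The space needed is blockDim.x * sizeof(unsigned char) +
+    // numClusters * numCoords * sizeof(float) bytes; cuda_kmeans() below
+    // checks this against the device's sharedMemPerBlock before launching.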
+    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
+        for (int j = 0; j < numCoords; j++) {
+            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
+        }
+    }
+    __syncthreads();
+#endif
+
+    int objectId = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (objectId < numObjs) {
+        int   index, i;
+        float dist, min_dist;
+
+        /* find the cluster id that has min distance to object */
+        index    = 0;
+        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
+                                 objects, clusters, objectId, 0);
+
+        for (i = 1; i < numClusters; i++) {
+            dist = euclid_dist_2(numCoords, numObjs, numClusters,
+                                 objects, clusters, objectId, i);
+            /* no need for the square root */
+            if (dist < min_dist) { /* find the min and its array index */
+                min_dist = dist;
+                index    = i;
+            }
+        }
+
+        if (membership[objectId] != index) {
+            membershipChanged[threadIdx.x] = 1;
+        }
+
+        /* assign the membership to object objectId */
+        membership[objectId] = index;
+    }
+
+    // Every thread in the block must reach the barriers below, so the
+    // reduction runs outside the objectId bounds check; threads past the end
+    // of the data contribute the 0 they stored above.
+    __syncthreads();    // for membershipChanged[]
+
+    // blockDim.x *must* be a power of two!
+    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (threadIdx.x < s) {
+            membershipChanged[threadIdx.x] +=
+                membershipChanged[threadIdx.x + s];
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        intermediates[blockIdx.x] = membershipChanged[0];
+    }
+}
+
+__global__ static
+void compute_delta(int *deviceIntermediates,
+                   int numIntermediates,     // The actual number of intermediates
+                   int numIntermediates2)    // The next power of two
+{
+    // The number of elements in this array should be equal to
+    // numIntermediates2, the number of threads launched. It *must* be a power
+    // of two!
+    extern __shared__ unsigned int intermediates[];
+
+    // Copy global intermediate values into shared memory.
+    intermediates[threadIdx.x] =
+        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;
+
+    __syncthreads();
+
+    // numIntermediates2 *must* be a power of two!
+    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
+        if (threadIdx.x < s) {
+            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        deviceIntermediates[0] = intermediates[0];
+    }
+}
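+
+// Example: if find_nearest_cluster ran with numClusterBlocks = 5, then
+// nextPowerOfTwo(5) = 8 reduction threads are launched; threads 5..7 load 0,
+// and the tree reduction sums the five per-block counts into
+// deviceIntermediates[0] in log2(8) = 3 steps.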
+
+/*----< cuda_kmeans() >------------------------------------------------------*/
+//
+//  ----------------------------------------
+//  DATA LAYOUT
+//
+//  objects         [numObjs][numCoords]
+//  clusters        [numClusters][numCoords]
+//  dimObjects      [numCoords][numObjs]
+//  dimClusters     [numCoords][numClusters]
+//  newClusters     [numCoords][numClusters]
+//  deviceObjects   [numCoords][numObjs]
+//  deviceClusters  [numCoords][numClusters]
+//  ----------------------------------------
+//
+/* return an array of cluster centers of size [numClusters][numCoords] */
+float** cuda_kmeans(float **objects,          /* in: [numObjs][numCoords] */
+                    unsigned int numCoords,   /* no. features */
+                    unsigned int numObjs,     /* no. objects */
+                    unsigned int numClusters, /* no. clusters */
+                    float threshold,          /* % objects change membership */
+                    int *membership,          /* out: [numObjs] */
+                    int loops)
+{
+    int i, j, index, loop = 0;
+    int *newClusterSize;    /* [numClusters]: no. objects assigned in each
+                               new cluster */
+    float delta;            /* % of objects change their clusters */
+    float **dimObjects;
+    float **clusters;       /* out: [numClusters][numCoords] */
+    float **dimClusters;
+    float **newClusters;    /* [numCoords][numClusters] */
+
+    float *deviceObjects;
+    float *deviceClusters;
+    int *deviceMembership;
+    int *deviceIntermediates;
+
+    // Copy objects given in [numObjs][numCoords] layout to new
+    // [numCoords][numObjs] layout
+    malloc2D(dimObjects, numCoords, numObjs, float);
+    for (i = 0; i < numCoords; i++) {
+        for (j = 0; j < numObjs; j++) {
+            dimObjects[i][j] = objects[j][i];
+        }
+    }
+
+    /* pick the first numClusters elements of objects[] as initial cluster centers */
+    malloc2D(dimClusters, numCoords, numClusters, float);
+    for (i = 0; i < numCoords; i++) {
+        for (j = 0; j < numClusters; j++) {
+            dimClusters[i][j] = dimObjects[i][j];
+        }
+    }
+
+    /* initialize membership[] */
+    for (i = 0; i < numObjs; i++) membership[i] = -1;
+
+    /* initialize newClusterSize and newClusters[0] to all 0 */
+    newClusterSize = (int*) calloc(numClusters, sizeof(int));
+    assert(newClusterSize != NULL);
+
+    malloc2D(newClusters, numCoords, numClusters, float);
+    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));
+
+    // To support reduction, numThreadsPerClusterBlock *must* be a power of
+    // two, and it *must* be no larger than the number of bits that will fit
+    // into an unsigned char, the type used to keep track of membership
+    // changes in the kernel.
+    const unsigned int numThreadsPerClusterBlock = 128;
+    const unsigned int numClusterBlocks =
+        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
+
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    const unsigned int clusterBlockSharedDataSize =
+        numThreadsPerClusterBlock * sizeof(unsigned char) +
+        numClusters * numCoords * sizeof(float);
+
+    cudaDeviceProp deviceProp;
+    int deviceNum;
+    cudaGetDevice(&deviceNum);
+    cudaGetDeviceProperties(&deviceProp, deviceNum);
+
+    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
+        std::cout << "ERROR: insufficient block shared memory; recompile without defining BLOCK_SHARED_MEM_OPTIMIZATION" << std::endl;
+        exit(1);
+    }
+#else
+    const unsigned int clusterBlockSharedDataSize =
+        numThreadsPerClusterBlock * sizeof(unsigned char);
+#endif
+
+    const unsigned int numReductionThreads =
+        nextPowerOfTwo(numClusterBlocks);
+    const unsigned int reductionBlockSharedDataSize =
+        numReductionThreads * sizeof(unsigned int);
+
+    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
+    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
+    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
+    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));
+
+    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
+                 numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
+    handle_error(cudaMemcpy(deviceMembership, membership,
+                 numObjs*sizeof(int), cudaMemcpyHostToDevice));
+
+    do {
+        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
+                     numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));
+
+        find_nearest_cluster
+            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
+            (numCoords, numObjs, numClusters,
+             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);
+
+        cudaDeviceSynchronize();
+
+        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
+            (deviceIntermediates, numClusterBlocks, numReductionThreads);
+
+        cudaDeviceSynchronize();
+
+        int d;
+        handle_error(cudaMemcpy(&d, deviceIntermediates,
+                     sizeof(int), cudaMemcpyDeviceToHost));
+        delta = (float)d;
+
+        handle_error(cudaMemcpy(membership, deviceMembership,
+                     numObjs*sizeof(int), cudaMemcpyDeviceToHost));
+
+        for (i = 0; i < numObjs; i++) {
+            /* find the array index of the nearest cluster center */
+            index = membership[i];
+
+            /* update new cluster centers: sum of objects located within */
+            newClusterSize[index]++;
+            for (j = 0; j < numCoords; j++)
+                newClusters[j][index] += objects[i][j];
+        }
+
+        /* average the sum and replace old cluster centers with newClusters */
+        for (i = 0; i < numClusters; i++) {
+            for (j = 0; j < numCoords; j++) {
+                if (newClusterSize[i] > 0)
+                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
+                newClusters[j][i] = 0.0;  /* set back to 0 */
+            }
+            newClusterSize[i] = 0;  /* set back to 0 */
+        }
+
+        delta /= numObjs;
+    } while (delta > threshold && loop++ < loops);
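+
+    // delta is now the fraction of objects that changed clusters in the last
+    // iteration; the loop exits once it falls to the threshold or once the
+    // iteration cap 'loops' is reached.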
+
+    /* allocate a 2D space for the return variable clusters[] (coordinates
+       of cluster centers) */
+    malloc2D(clusters, numClusters, numCoords, float);
+    for (i = 0; i < numClusters; i++) {
+        for (j = 0; j < numCoords; j++) {
+            clusters[i][j] = dimClusters[j][i];
+        }
+    }
+
+    handle_error(cudaFree(deviceObjects));
+    handle_error(cudaFree(deviceClusters));
+    handle_error(cudaFree(deviceMembership));
+    handle_error(cudaFree(deviceIntermediates));
+
+    free(dimObjects[0]);
+    free(dimObjects);
+    free(dimClusters[0]);
+    free(dimClusters);
+    free(newClusters[0]);
+    free(newClusters);
+    free(newClusterSize);
+
+    return clusters;
+}
+
+#endif // STIM_CUDA_KMEANS_H
--
libgit2 0.21.4
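A minimal host-side sketch of how cuda_kmeans might be called; the names
data, n, d, k and the parameter values are illustrative, not part of the
patch:

    float **data;                       // [n][d], filled by the caller
    malloc2D(data, n, d, float);
    // ... load n objects with d features each into data ...
    int *labels = (int *)malloc(n * sizeof(int));
    float **centers = cuda_kmeans(data, d, n, k,
                                  0.001f,  // stop when <0.1% of objects move
                                  labels,  // out: cluster index per object
                                  500);    // iteration cap
    // centers is [k][d]; free centers[0], centers, and labels when done.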