diff --git a/stim/cuda/kmeans.cuh b/stim/cuda/kmeans.cuh
new file mode 100644
index 0000000..f5a56bd
--- /dev/null
+++ b/stim/cuda/kmeans.cuh
@@ -0,0 +1,353 @@
+#ifndef STIM_CUDA_KMEANS_H
+#define STIM_CUDA_KMEANS_H
+
+#include <iostream>
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <cuda_runtime.h>
+
+// Allocate a 2D array of xDim rows and yDim columns as one contiguous block,
+// so that name[0] can be passed directly to cudaMemcpy.
+#define malloc2D(name, xDim, yDim, type) do { \
+    name = (type **)malloc(xDim * sizeof(type *)); \
+    assert(name != NULL); \
+    name[0] = (type *)malloc(xDim * yDim * sizeof(type)); \
+    assert(name[0] != NULL); \
+    for (size_t i = 1; i < xDim; i++) \
+        name[i] = name[i-1] + yDim; \
+} while (0)
+
+static void handleError(cudaError_t error, const char* file, int line){
+    if(error != cudaSuccess){
+        std::cout << cudaGetErrorString(error) << " in " << file << " at line " << line << std::endl;
+        exit(1);
+    }
+}
+
+#define handle_error(error) handleError(error, __FILE__, __LINE__)
+
+// Round n up to the next power of two by smearing the highest set bit into
+// all lower bits, then adding one.
+static inline int nextPowerOfTwo(int n) {
+    n--;
+
+    n = n >> 1 | n;
+    n = n >> 2 | n;
+    n = n >> 4 | n;
+    n = n >> 8 | n;
+    n = n >> 16 | n;
+    // n = n >> 32 | n;    // For 64-bit ints
+
+    return ++n;
+}
+
+/*----< euclid_dist_2() >----------------------------------------------------*/
+/* square of the Euclidean distance between two multi-dimensional points */
+__host__ __device__ inline static
+float euclid_dist_2(int    numCoords,
+                    int    numObjs,
+                    int    numClusters,
+                    float *objects,     // [numCoords][numObjs]
+                    float *clusters,    // [numCoords][numClusters]
+                    int    objectId,
+                    int    clusterId)
+{
+    int i;
+    float ans = 0.0;
+
+    for (i = 0; i < numCoords; i++) {
+        ans += (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]) *
+               (objects[numObjs * i + objectId] - clusters[numClusters * i + clusterId]);
+    }
+
+    return(ans);
+}
+
+/*----< find_nearest_cluster() >---------------------------------------------*/
+__global__ static
+void find_nearest_cluster(int    numCoords,
+                          int    numObjs,
+                          int    numClusters,
+                          float *objects,           // [numCoords][numObjs]
+                          float *deviceClusters,    // [numCoords][numClusters]
+                          int   *membership,        // [numObjs]
+                          int   *intermediates)
+{
+    extern __shared__ char sharedMemory[];
+
+    // The type chosen for membershipChanged must be large enough to support
+    // reductions! There are blockDim.x elements, one for each thread in the
+    // block. See numThreadsPerClusterBlock in cuda_kmeans().
+    unsigned char *membershipChanged = (unsigned char *)sharedMemory;
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    float *clusters = (float *)(sharedMemory + blockDim.x);
+#else
+    float *clusters = deviceClusters;
+#endif
+
+    membershipChanged[threadIdx.x] = 0;
+
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    // BEWARE: We can overrun our shared memory here if there are too many
+    // clusters or too many coordinates! For reference, a Tesla C1060 has 16
+    // KiB of shared memory per block, and a GeForce GTX 480 has 48 KiB of
+    // shared memory per block.
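+    // The space needed is blockDim.x * sizeof(unsigned char) +
+    // numClusters * numCoords * sizeof(float) bytes; cuda_kmeans() below
+    // checks this against the device's sharedMemPerBlock before launching.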
+    for (int i = threadIdx.x; i < numClusters; i += blockDim.x) {
+        for (int j = 0; j < numCoords; j++) {
+            clusters[numClusters * j + i] = deviceClusters[numClusters * j + i];
+        }
+    }
+    __syncthreads();
+#endif
+
+    int objectId = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (objectId < numObjs) {
+        int   index, i;
+        float dist, min_dist;
+
+        /* find the cluster id that has min distance to object */
+        index    = 0;
+        min_dist = euclid_dist_2(numCoords, numObjs, numClusters,
+                                 objects, clusters, objectId, 0);
+
+        for (i = 1; i < numClusters; i++) {
+            dist = euclid_dist_2(numCoords, numObjs, numClusters,
+                                 objects, clusters, objectId, i);
+            /* no need for the square root */
+            if (dist < min_dist) { /* find the min and its array index */
+                min_dist = dist;
+                index    = i;
+            }
+        }
+
+        if (membership[objectId] != index) {
+            membershipChanged[threadIdx.x] = 1;
+        }
+
+        /* assign the membership to object objectId */
+        membership[objectId] = index;
+    }
+
+    // Every thread in the block must reach the barriers below, so the
+    // reduction runs outside the objectId bounds check; threads past the end
+    // of the data contribute the 0 they stored above.
+    __syncthreads();    // for membershipChanged[]
+
+    // blockDim.x *must* be a power of two!
+    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
+        if (threadIdx.x < s) {
+            membershipChanged[threadIdx.x] +=
+                membershipChanged[threadIdx.x + s];
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        intermediates[blockIdx.x] = membershipChanged[0];
+    }
+}
+
+__global__ static
+void compute_delta(int *deviceIntermediates,
+                   int numIntermediates,     // The actual number of intermediates
+                   int numIntermediates2)    // The next power of two
+{
+    // The number of elements in this array should be equal to
+    // numIntermediates2, the number of threads launched. It *must* be a power
+    // of two!
+    extern __shared__ unsigned int intermediates[];
+
+    // Copy global intermediate values into shared memory.
+    intermediates[threadIdx.x] =
+        (threadIdx.x < numIntermediates) ? deviceIntermediates[threadIdx.x] : 0;
+
+    __syncthreads();
+
+    // numIntermediates2 *must* be a power of two!
+    for (unsigned int s = numIntermediates2 / 2; s > 0; s >>= 1) {
+        if (threadIdx.x < s) {
+            intermediates[threadIdx.x] += intermediates[threadIdx.x + s];
+        }
+        __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+        deviceIntermediates[0] = intermediates[0];
+    }
+}
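+
+// Example: if find_nearest_cluster ran with numClusterBlocks = 5, then
+// nextPowerOfTwo(5) = 8 reduction threads are launched; threads 5..7 load 0,
+// and the tree reduction sums the five per-block counts into
+// deviceIntermediates[0] in log2(8) = 3 steps.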
+
+/*----< cuda_kmeans() >------------------------------------------------------*/
+//
+//  ----------------------------------------
+//  DATA LAYOUT
+//
+//  objects         [numObjs][numCoords]
+//  clusters        [numClusters][numCoords]
+//  dimObjects      [numCoords][numObjs]
+//  dimClusters     [numCoords][numClusters]
+//  newClusters     [numCoords][numClusters]
+//  deviceObjects   [numCoords][numObjs]
+//  deviceClusters  [numCoords][numClusters]
+//  ----------------------------------------
+//
+/* return an array of cluster centers of size [numClusters][numCoords] */
+float** cuda_kmeans(float **objects,          /* in: [numObjs][numCoords] */
+                    unsigned int numCoords,   /* no. features */
+                    unsigned int numObjs,     /* no. objects */
+                    unsigned int numClusters, /* no. clusters */
+                    float threshold,          /* % objects change membership */
+                    int *membership,          /* out: [numObjs] */
+                    int loops)
+{
+    int i, j, index, loop = 0;
+    int *newClusterSize;    /* [numClusters]: no. objects assigned in each
+                               new cluster */
+    float delta;            /* % of objects change their clusters */
+    float **dimObjects;
+    float **clusters;       /* out: [numClusters][numCoords] */
+    float **dimClusters;
+    float **newClusters;    /* [numCoords][numClusters] */
+
+    float *deviceObjects;
+    float *deviceClusters;
+    int *deviceMembership;
+    int *deviceIntermediates;
+
+    // Copy objects given in [numObjs][numCoords] layout to new
+    // [numCoords][numObjs] layout
+    malloc2D(dimObjects, numCoords, numObjs, float);
+    for (i = 0; i < numCoords; i++) {
+        for (j = 0; j < numObjs; j++) {
+            dimObjects[i][j] = objects[j][i];
+        }
+    }
+
+    /* pick the first numClusters elements of objects[] as initial cluster centers */
+    malloc2D(dimClusters, numCoords, numClusters, float);
+    for (i = 0; i < numCoords; i++) {
+        for (j = 0; j < numClusters; j++) {
+            dimClusters[i][j] = dimObjects[i][j];
+        }
+    }
+
+    /* initialize membership[] */
+    for (i = 0; i < numObjs; i++) membership[i] = -1;
+
+    /* initialize newClusterSize and newClusters[0] to all 0 */
+    newClusterSize = (int*) calloc(numClusters, sizeof(int));
+    assert(newClusterSize != NULL);
+
+    malloc2D(newClusters, numCoords, numClusters, float);
+    memset(newClusters[0], 0, numCoords * numClusters * sizeof(float));
+
+    // To support reduction, numThreadsPerClusterBlock *must* be a power of
+    // two, and it *must* be no larger than the number of bits that will fit
+    // into an unsigned char, the type used to keep track of membership
+    // changes in the kernel.
+    const unsigned int numThreadsPerClusterBlock = 128;
+    const unsigned int numClusterBlocks =
+        (numObjs + numThreadsPerClusterBlock - 1) / numThreadsPerClusterBlock;
+
+#ifdef BLOCK_SHARED_MEM_OPTIMIZATION
+    const unsigned int clusterBlockSharedDataSize =
+        numThreadsPerClusterBlock * sizeof(unsigned char) +
+        numClusters * numCoords * sizeof(float);
+
+    cudaDeviceProp deviceProp;
+    int deviceNum;
+    cudaGetDevice(&deviceNum);
+    cudaGetDeviceProperties(&deviceProp, deviceNum);
+
+    if (clusterBlockSharedDataSize > deviceProp.sharedMemPerBlock) {
+        std::cout << "ERROR: insufficient block shared memory; recompile without defining BLOCK_SHARED_MEM_OPTIMIZATION" << std::endl;
+        exit(1);
+    }
+#else
+    const unsigned int clusterBlockSharedDataSize =
+        numThreadsPerClusterBlock * sizeof(unsigned char);
+#endif
+
+    const unsigned int numReductionThreads =
+        nextPowerOfTwo(numClusterBlocks);
+    const unsigned int reductionBlockSharedDataSize =
+        numReductionThreads * sizeof(unsigned int);
+
+    handle_error(cudaMalloc((void**)&deviceObjects, numObjs*numCoords*sizeof(float)));
+    handle_error(cudaMalloc((void**)&deviceClusters, numClusters*numCoords*sizeof(float)));
+    handle_error(cudaMalloc((void**)&deviceMembership, numObjs*sizeof(int)));
+    handle_error(cudaMalloc((void**)&deviceIntermediates, numReductionThreads*sizeof(unsigned int)));
+
+    handle_error(cudaMemcpy(deviceObjects, dimObjects[0],
+                 numObjs*numCoords*sizeof(float), cudaMemcpyHostToDevice));
+    handle_error(cudaMemcpy(deviceMembership, membership,
+                 numObjs*sizeof(int), cudaMemcpyHostToDevice));
+
+    do {
+        handle_error(cudaMemcpy(deviceClusters, dimClusters[0],
+                     numClusters*numCoords*sizeof(float), cudaMemcpyHostToDevice));
+
+        find_nearest_cluster
+            <<< numClusterBlocks, numThreadsPerClusterBlock, clusterBlockSharedDataSize >>>
+            (numCoords, numObjs, numClusters,
+             deviceObjects, deviceClusters, deviceMembership, deviceIntermediates);
+
+        cudaDeviceSynchronize();
+
+        compute_delta <<< 1, numReductionThreads, reductionBlockSharedDataSize >>>
+            (deviceIntermediates, numClusterBlocks, numReductionThreads);
+
+        cudaDeviceSynchronize();
+
+        int d;
+        handle_error(cudaMemcpy(&d, deviceIntermediates,
+                     sizeof(int), cudaMemcpyDeviceToHost));
+        delta = (float)d;
+
+        handle_error(cudaMemcpy(membership, deviceMembership,
+                     numObjs*sizeof(int), cudaMemcpyDeviceToHost));
+
+        for (i = 0; i < numObjs; i++) {
+            /* find the array index of the nearest cluster center */
+            index = membership[i];
+
+            /* update new cluster centers: sum of objects located within */
+            newClusterSize[index]++;
+            for (j = 0; j < numCoords; j++)
+                newClusters[j][index] += objects[i][j];
+        }
+
+        /* average the sum and replace old cluster centers with newClusters */
+        for (i = 0; i < numClusters; i++) {
+            for (j = 0; j < numCoords; j++) {
+                if (newClusterSize[i] > 0)
+                    dimClusters[j][i] = newClusters[j][i] / newClusterSize[i];
+                newClusters[j][i] = 0.0;  /* set back to 0 */
+            }
+            newClusterSize[i] = 0;  /* set back to 0 */
+        }
+
+        delta /= numObjs;
+    } while (delta > threshold && loop++ < loops);
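+
+    // delta is now the fraction of objects that changed clusters in the last
+    // iteration; the loop exits once it falls to the threshold or once the
+    // iteration cap 'loops' is reached.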
+
+    /* allocate a 2D space for the return variable clusters[] (coordinates
+       of cluster centers) */
+    malloc2D(clusters, numClusters, numCoords, float);
+    for (i = 0; i < numClusters; i++) {
+        for (j = 0; j < numCoords; j++) {
+            clusters[i][j] = dimClusters[j][i];
+        }
+    }
+
+    handle_error(cudaFree(deviceObjects));
+    handle_error(cudaFree(deviceClusters));
+    handle_error(cudaFree(deviceMembership));
+    handle_error(cudaFree(deviceIntermediates));
+
+    free(dimObjects[0]);
+    free(dimObjects);
+    free(dimClusters[0]);
+    free(dimClusters);
+    free(newClusters[0]);
+    free(newClusters);
+    free(newClusterSize);
+
+    return clusters;
+}
+
+#endif // STIM_CUDA_KMEANS_H
--
libgit2 0.21.4
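A minimal host-side sketch of how cuda_kmeans might be called; the names
data, n, d, k and the parameter values are illustrative, not part of the
patch:

    float **data;                       // [n][d], filled by the caller
    malloc2D(data, n, d, float);
    // ... load n objects with d features each into data ...
    int *labels = (int *)malloc(n * sizeof(int));
    float **centers = cuda_kmeans(data, d, n, k,
                                  0.001f,  // stop when <0.1% of objects move
                                  labels,  // out: cluster index per object
                                  500);    // iteration cap
    // centers is [k][d]; free centers[0], centers, and labels when done.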