From f6438346ee2bdf15cdc3f057f58447935dfe13af Mon Sep 17 00:00:00 2001 From: Mahsa Date: Fri, 19 Apr 2019 18:01:41 -0500 Subject: [PATCH] initial commit after PLoS submission --- FindSTIM.cmake | 9 +++++++++ hypersnakuscule.h | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ main.cu | 1157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1279 insertions(+), 0 deletions(-) create mode 100644 FindSTIM.cmake create mode 100644 hypersnakuscule.h create mode 100644 main.cu diff --git a/FindSTIM.cmake b/FindSTIM.cmake new file mode 100644 index 0000000..bf5762a --- /dev/null +++ b/FindSTIM.cmake @@ -0,0 +1,9 @@ +include(FindPackageHandleStandardArgs) + +set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH}) + +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR) + +if(STIM_FOUND) + set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR}) +endif() diff --git a/hypersnakuscule.h b/hypersnakuscule.h new file mode 100644 index 0000000..a0cff51 --- /dev/null +++ b/hypersnakuscule.h @@ -0,0 +1,113 @@ +#ifndef SNAKUSCULE_H +#define SNAKUSCULE_H + +template +class point { +public: + + T x; + T y; + T z; + + //default constructor + CUDA_CALLABLE point() { + x = 0; + y = 0; + z = 0; + }; + //constructor definition + CUDA_CALLABLE point(T a, T b, T c) { + x = a; + y = b; + z = c; + } + + CUDA_CALLABLE point operator= (const point &rhs) { + this->x = rhs.x; + this->y = rhs.y; + this->z = rhs.z; + return (*this); + } + + CUDA_CALLABLE point operator+ (const point rhs) { + point temp; + temp.x = this->x + rhs.x; + temp.y = this->y + rhs.y; + temp.z = this->z + rhs.z; + return temp; + + } + + CUDA_CALLABLE point operator- (const point rhs) { + point temp; + temp.x = this->x - rhs.x; + temp.y = this->y - rhs.y; + temp.z = this->z - rhs.z; + return temp; + + } + CUDA_CALLABLE point operator*(const float rhs) { + return point(rhs * this->x, rhs * this->y, rhs * this->z ); + } + + + CUDA_CALLABLE point operator/ (const float rhs) { + return point(this->x / rhs , this->y / rhs, this->z/ rhs ); + + } +}; + + +class sphere { + +public: + + point p; + point q; + //default constructor + CUDA_CALLABLE sphere() { + p.x = 0; + p.y = 0; + p.z = 0; + q.x = 0; + q.y = 0; + q.z = 0; + }; + + //CUDA_CALLABLE point p() { return point(center.x - radius, center.y); } + //CUDA_CALLABLE point q() { return point(center.x + radius, center.y); } + //CUDA_CALLABLE point c() { return center; } + CUDA_CALLABLE point c() { + point center; + point sum; // sum of two points to get center + sum = p + q; + center.x = 0.5f * sum.x; + center.y = 0.5f * sum.y; + center.z = 0.5f * sum.z; + return center; } + + //CUDA_CALLABLE float r() { return radius; } + CUDA_CALLABLE float r() { + float radius; + point d; //distance + d = p - q; + radius = 0.5f* sqrt((d.x * d.x) + (d.y * d.y) + (d.z * d.z)); + return radius; } + + CUDA_CALLABLE void update(point energyGradp , point energyGradq , float dt){ + p = p - (energyGradp * dt); + q = q - (energyGradq * dt); + } + + CUDA_UNCALLABLE std::string str() { + std::stringstream ss; + ss << "q = (" << q.x << ", " << q.y << ", " << q.z << ")" << std::endl; + ss << "p = (" << p.x << ", " << p.y << ", " << p.z << ")" << std::endl; + point center = c(); + ss << "c = (" << center.x << ", " << center.y << ", " << center.z << ")" << std::endl; + ss << "r = " << r() << std::endl; + return ss.str(); + } +}; + +#endif \ No newline at end of file diff --git a/main.cu b/main.cu new file mode 100644 index 0000000..c50553f --- /dev/null +++ b/main.cu @@ -0,0 +1,1157 @@ +#include +#include +#include +#include +#include "device_launch_parameters.h" +#include +#include +#include +#include +#include +#include +#include +#include "opencv2/core/core.hpp" +#include +#include +#include + +#define USING_OPENCV +#include +#include +#include +#include +#include +#include "stim/parser/arguments.h" +#include +#include // std::iota +//#include +#include "hypersnakuscule.h" +//#include "median2.cuh" + +#define deltaR 2.0f +#define deltar 1.5874f // deltar=deltaR/cubeRoot2 +#define cubeRoot2 1.2599f +#define pi 3.14159f +//#define Energy_th -3 +//----------------------------------------------functions-------------------------------------------------------------- +void stretch(float* I, size_t size, int low, int high) { + //size: number of image pixel + float max_val = I[0]; + float min_val = I[0]; + for (int n = 0; n < size; n++) { + if (I[n] > max_val) { + max_val = I[n]; + } + if (I[n] < min_val) { + min_val = I[n]; + } + } + float range = max_val - min_val; + float desired_range = (float)high - (float)low; + for (size_t n = 0; n < size; n++) { //for each element in the image + I[n] = desired_range * (I[n] - min_val) / range + low; + } + +} + +/// random generator function: generate random numbers in a sphere with radius 1 in the center (0, 0, 0) +void randGenerator(point* r, int sampleNum, bool debug = false) { + + for (size_t i = 0; i < sampleNum; i++) { + double rn = (double)rand() / (double)(RAND_MAX); + double theta = (double)rand() / (double)(RAND_MAX)* stim::TAU; + double cosphi = 1.0 - 2.0 * ((double)rand() / (double)(RAND_MAX)); + double phi = std::acos(cosphi); + //double phi = std::acos(2.0 * v - 1.0); + //double phi = (double)rand() / (double)(RAND_MAX)* stim::PI; + //std::cout << "rn=" << rn << "\t theta=" << theta << "\tphi=" << phi << std::endl; + r[i].x = (float)(std::cbrt(rn) * cos(theta) * sin(phi)); + r[i].y = (float)(std::cbrt(rn) * sin(theta) * sin(phi)); + r[i].z = (float)(std::cbrt(rn) * cos(phi)); + + + } + if (debug) { + std::ofstream outfile("randomSamples.txt"); //open a file for writing + for (size_t i = 0; i < sampleNum; i++) { + outfile << r[i].x << " " << r[i].y << " " << r[i].z << std::endl; //output the center and radius + } + outfile.close(); + } + +} +///create random numbers in a cube and delete the one outside the sphere +void randGenerator_cube(point* r, int sampleNum, bool debug = false) { + size_t counter = 0; + while (counter < sampleNum) { + double x = ((double)rand() / (double)(RAND_MAX) * 2.0) - 1.0; + double y = ((double)rand() / (double)(RAND_MAX) * 2.0) - 1.0; + double z = ((double)rand() / (double)(RAND_MAX) * 2.0) - 1.0; + double d = sqrt((x * x) + (y * y) + (z * z)); + + if (d < 1.1) { + r[counter].x = (float)x; + r[counter].y = (float)y; + r[counter].z = (float)z; + counter++; + } + } + + if (debug) { + std::ofstream outfile("randomSamples.txt"); //open a file for writing + for (size_t i = 0; i < sampleNum; i++) { + outfile << r[i].x << " " << r[i].y << " " << r[i].z << std::endl; //output the center and radius + } + outfile.close(); + } + + +} + +/// random generator function: generate random numbers in a sphere with radius 1 in the center (0 , 0, 0) +//void randGenerator1(point* r, int sampleNum, bool debug = false) { +// for (int i = 0; i < sampleNum; i++) { +// std::default_random_engine generator1(100 + i); +// std::uniform_real_distribution distribution1(0.0f, 1.0f); +// double rn = distribution1(generator1); +// std::default_random_engine generator2(50.0f + 5.0f * i); +// std::uniform_real_distribution distribution2(0.0, 2.0 * stim::PI); +// double theta = distribution2(generator2); +// std::default_random_engine generator3(30.0f + 3.0f * i); +// std::uniform_real_distribution distribution3(0, stim::PI); +// double phi = distribution3(generator3); +// r[i].x = (float)(std::cbrt(rn) * cos(theta) * sin(phi)); +// r[i].y = (float)(std::cbrt(rn) * sin(theta) * sin(phi)); +// r[i].z = (float)(std::cbrt(rn) * cos(phi)); +// } +// if (debug) { +// std::ofstream outfile("randomSamples.txt"); //open a file for writing +// for (size_t i = 0; i < sampleNum; i++) { +// outfile << r[i].x << " " << r[i].y << " " << r[i].z << std::endl; //output the center and radius +// } +// outfile.close(); +// } +//} + + + +/// saves the snakes specified by the idx array +void SaveSnakes(std::string filename, sphere* snakes, std::vector idx) { + std::ofstream outfile(filename); //open a file for writing + for (size_t i = 0; i < idx.size(); i++) { + point center = snakes[idx[i]].c(); //get the centerpoint of the snake + outfile << center.x << " " << center.y << " " << center.z << " " << snakes[idx[i]].r() << std::endl; //output the center and radius + } + outfile.close(); + + + +} +void initialSwarmSnake(sphere * snakes, size_t &counter, size_t w, size_t h, size_t d, float radius) { + std::cout << "W=" << w << "\t h=" << h << "\td=" << d << std::endl; + int D = int(sqrt(1.5)*radius); //distance between hypersnakes + //int D = 30; //for phantom + int k1 = 0; + counter = 0; + float startpoint = 0; + for (float k = radius; k < (d - radius); k += D ) {// for phantom D + for (float j = radius; j < (h - radius); j += D) { + k1++; + if (k1 % 2 == 1) + startpoint = 0; //for phantom D + else + startpoint = (float)D / 2.0f;//for phantom D + for (float i = startpoint; i < (w - radius); i += D) { + point temp_p(i, j, k); + snakes[counter].p = temp_p; + point temp_q(i + (2 * radius), j, k); + snakes[counter].q = temp_q; + counter = counter + 1; + } + } + } + std::ofstream outfile("initials.txt"); + for (int i = 0; i < counter; i++) { + outfile << snakes[i].c().x << " " << snakes[i].c().y << " " << snakes[i].c().z << " " << snakes[i].r() << std::endl; + } + outfile.close(); +} + +__host__ __device__ void sum_dE(point& dEdp, point& dEdq, point c, point s, float R, float f) { + + float r = R / cubeRoot2; // radius of inner snake + float dz2 = (s.z - c.z)*(s.z - c.z); + float dy2 = (s.y - c.y)*(s.y - c.y); + float dx2 = (s.x - c.x)*(s.x - c.x); + float d = sqrt(dz2 + dy2 + dx2); //distance bw given sample and center of contour + + float gx = 2.0f * R; // gx= snake.q.x - snake.p.x ; + float dx = (c.x - s.x) / d; + float dy = (c.y - s.y) / d; + float dz = (c.z - s.z) / d; + + if (d < (r - 0.5f*deltar)) { // mouth of snake + dEdp.x -= 3.0f / gx * f; //calculate the growth/shrinking term for the snake + dEdq.x += 3.0f / gx * f; + } + else if (d < (r + 0.5f*deltar)) { // throat of snake + float S = (2.0f / deltar) * (d - r); // weight function value in the given point + dEdp.x += f * ((3.0f / gx) * S + (dx + (1.0f / cubeRoot2)) / deltar); + dEdp.y += f *(dy / deltar); + dEdp.z += f *(dz / deltar); + + dEdq.x += f * ((-3.0f / gx) * S + (dx - (1.0f / cubeRoot2)) / deltar); + dEdq.y = dEdp.y; + dEdq.z = dEdp.z; + + } + else if (d < (R - 0.5f*deltaR)) { // coil of snake + dEdp.x += 3.0f / gx * f; + dEdq.x -= 3.0f / gx * f; + } + + else if (d < (R + 0.5f*deltaR)) { // fangs of snake + float S = -(1.0f / deltaR) * (d - (R + deltaR / 2.0f)); + + dEdp.x += f * ((3.0f * S / gx) - (0.5f * (dx + 1.0f) / deltaR)); + dEdp.y -= 0.5f *(dy / deltaR) * f; + dEdp.z -= 0.5f *(dz / deltaR) * f; + + dEdq.x += f *((-3.0f * S / gx) - (0.5f * (dx - 1.0f) / deltaR)); + dEdq.y = dEdp.y; + dEdq.z = dEdp.z; + } + +} + +//sum_dE in debug mode +__host__ __device__ void sum_dE_debug(point& dEdp, point& dEdq, int &counter, point c, point s, float R, float f) { + + float r = R / cubeRoot2; // radius of inner snake + float dz2 = (s.z - c.z)*(s.z - c.z); + float dy2 = (s.y - c.y)*(s.y - c.y); + float dx2 = (s.x - c.x)*(s.x - c.x); + float d = sqrt(dz2 + dy2 + dx2); //distance bw given sample and center of contour + + float gx = 2.0f * R; // gx= snake.q.x - snake.p.x ; + float dx = (c.x - s.x) / d; + float dy = (c.y - s.y) / d; + float dz = (c.z - s.z) / d; + int trivial = 0; //to test if any point is out of the contour + if (d < (r - 0.5f*deltar)) { // mouth of snake + counter++; + dEdp.x -= 3.0f / gx * f; //calculate the growth/shrinking term for the snake + dEdq.x += 3.0f / gx * f; + } + else if (d < (r + 0.5f*deltar)) { // throat of snake + counter++; + float S = (2.0f / deltar) * (d - r); // weight function value in the given point + dEdp.x += f * ((3.0f / gx) * S + (dx + (1.0f / cubeRoot2)) / deltar); + dEdp.y += f *(dy / deltar); + dEdp.z += f *(dz / deltar); + + dEdq.x += f * ((-3.0f / gx) * S + (dx - (1.0f / cubeRoot2)) / deltar); + dEdq.y = dEdp.y; + dEdq.z = dEdp.z; + + } + else if (d < (R - 0.5f*deltaR)) { // coil of snake + counter++; + dEdp.x += 3.0f / gx * f; + dEdq.x -= 3.0f / gx * f; + } + + else if (d < (R + 0.5f*deltaR)) { // fangs of snake + counter++; + float S = -(1.0f / deltaR) * (d - (R + deltaR / 2.0f)); + + dEdp.x += f * ((3.0f * S / gx) - (0.5f * (dx + 1.0f) / deltaR)); + dEdp.y -= 0.5f *(dy / deltaR) * f; + dEdp.z -= 0.5f *(dz / deltaR) * f; + + dEdq.x += f *((-3.0f * S / gx) - (0.5f * (dx - 1.0f) / deltaR)); + dEdq.y = dEdp.y; + dEdq.z = dEdp.z; + } + else + trivial++; + +} + +// this function calculate gradient of energy with respect to two point p and q +__host__ __device__ void snake_Engrad(point&dEdp, point&dEdq, sphere snake, float* I, size_t w, size_t h, size_t d, bool debug = false) { + + float radius = snake.r(); // radius of outer snake + point c = snake.c(); // center of snake + dEdp = point(0, 0, 0); //initialize dEdp and dEdq to zero + dEdq = point(0, 0, 0); + + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f))); + if (radius < threshold) { + if (debug) printf("\t RADIUS IS OUT OF RANGE\n"); + return; + } + + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere + float tempXmax = ceil(c.x + radius + 1); + float tempYmin = floor(c.y - radius - 1); + float tempYmax = ceil(c.y + radius + 1); + float tempZmin = floor(c.z - radius - 1); + float tempZmax = ceil(c.z + radius + 1); + + float xmin = max((float)tempXmin, (float) 0.0); //clamp the bounding box to the image edges + float xmax = min((float)tempXmax, (float)(w - 1)); + float ymin = max((float)tempYmin, (float)0); + float ymax = min((float)tempYmax, (float)(h - 1)); + float zmin = max((float)tempZmin, (float)0); + float zmax = min((float)tempZmax, (float)(d - 1)); + + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) { + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)"); + return; + } + + float R = radius; //simplify radius to R + float R3 = R * R * R; //calculate R^2 (radius squared) + for (unsigned int z = (unsigned int)zmin; z <= (unsigned int)zmax; z++) { // for each section + for (unsigned int x = (unsigned int)xmin; x <= (unsigned int)xmax; x++) { //for each column of section in the bounding box + for (unsigned int y = (unsigned int)ymin; y <= (unsigned int)ymax; y++) { //for each pixel p in the column + + point s(x, y, z); // a sample inside the contour + float f; // image value in given position + + int position = (int)((w * h * z) + (x * h + y)); + if (position < (w * h * d)) { + f = I[position]; + + + if (!f == 0) + sum_dE(dEdp, dEdq, c, s, R, f); + + } + + } + } + } + + dEdp = dEdp / (8 * R3); + dEdq = dEdq / (8 * R3); + + +} + +//// this function calculate gradient of energy with respect to two point p and q using Monte Carlo +__host__ __device__ void snake_Engrad_MC(point&dEdp, point&dEdq, sphere snake, float* I, point* samples, size_t sampleNum, size_t w, size_t h, size_t d, bool debug = false) { + + float radius = snake.r(); // radius of outer snake + point c = snake.c(); // center of snake + dEdp = point(0, 0, 0); //initialize dEdp and dEdq to zero + dEdq = point(0, 0, 0); + + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f))); + if (radius < threshold) { + if (debug) printf("\t RADIUS IS OUT OF RANGE\n"); + return; + } + + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere + float tempXmax = ceil(c.x + radius + 1); + float tempYmin = floor(c.y - radius - 1); + float tempYmax = ceil(c.y + radius + 1); + float tempZmin = floor(c.z - radius - 1); + float tempZmax = ceil(c.z + radius + 1); + + float xmin = max((float)tempXmin, (float) 0.0); //clsamples(amp the bounding box to the image edges + float xmax = min((float)tempXmax, (float)(w - 1.0)); + float ymin = max((float)tempYmin, (float)0.0); + float ymax = min((float)tempYmax, (float)(h - 1.0)); + float zmin = max((float)tempZmin, (float)0.0); + float zmax = min((float)tempZmax, (float)(d - 1.0)); + + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) { + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)"); + return; + } + + + + + float R = radius; //simplify radius to R + float R3 = R * R * R; //calculate R^3 (radius cube) + int counter = 0; + float sumf = 0.0; + + for (int i = 0; i < sampleNum; i++) { + float sx = samples[i].x; + float sy = samples[i].y; + float sz = samples[i].z; + + float x = (R + 1.0f) * sx + c.x; + float y = (R + 1.0f) * sy + c.y; + float z = (R + 1.0f) * sz + c.z; + + int xi = (int)round(x); + int yi = (int)round(y); + int zi = (int)round(z); + + point s(xi, yi, zi); // a sample inside the contour + float f; // image value in given position + + int position = (int)((w * h * zi) + (xi * h + yi)); + if (position < (w * h * d) && xi >= xmin && xi <= xmax && yi >= ymin && yi <= ymax && zi >= zmin && zi <= zmax) { // && d1< (R + 1) + counter++; + f = (float)I[position]; + sumf += f; + //if (debug) + sum_dE_debug(dEdp, dEdq, counter, c, s, R, f); + //sum_dE(dEdp, dEdq, c, s, R, f); + } + } + + + + + + float volume = pi * (((R + 1.0f) * (R + 1.0f) * (zmax - zmin)) - ((1.0f / 3.0f) * (powf((zmax - c.z), 3.0f) + powf((c.z - zmin), 3.0f)))); + + //if (debug) printf("volume_sphere=%f and volume=%f\n", volume_sphere, volume); + //if (debug) printf("sampleNum=%u and (float)counter= %f \n", sampleNum, (float)counter); + + dEdp = dEdp * volume / (float)counter; + + dEdq = dEdq * volume / (float)counter; + + + dEdp = dEdp / (8.0f * R3); + dEdq = dEdq / (8.0f * R3); + +} + + +// this function calculate gradient of energy with respect to two point p and q using Monte Carlo +__host__ __device__ void snake_Engrad_MC_parallel(point&dEdp, point&dEdq, int &counter, sphere snake, float* I, point sample, size_t w, size_t h, size_t d, bool debug = false) { + + float radius = snake.r(); // radius of outer snake + point c = snake.c(); // center of snake + + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere + float tempXmax = ceil(c.x + radius + 1); + float tempYmin = floor(c.y - radius - 1); + float tempYmax = ceil(c.y + radius + 1); + float tempZmin = floor(c.z - radius - 1); + float tempZmax = ceil(c.z + radius + 1); + + float xmin = max((float)tempXmin, (float) 0.0); //clsamples(amp the bounding box to the image edges + float xmax = min((float)tempXmax, (float)(w - 1.0)); + float ymin = max((float)tempYmin, (float)0.0); + float ymax = min((float)tempYmax, (float)(h - 1.0)); + float zmin = max((float)tempZmin, (float)0.0); + float zmax = min((float)tempZmax, (float)(d - 1.0)); + + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) { + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)"); + return; + } + + + + + float R = radius; //simplify radius to R + float sumf = 0.0; + + //for (int i = 0; i < sampleNum; i++) { + float sx = sample.x; + float sy = sample.y; + float sz = sample.z; + + float x = (R + 1.0f) * sx + c.x; + float y = (R + 1.0f) * sy + c.y; + float z = (R + 1.0f) * sz + c.z; + + int xi = (int)round(x); + int yi = (int)round(y); + int zi = (int)round(z); + + point s(xi, yi, zi); // a sample inside the contour + float f; // image value in given position + + int position = (int)((w * h * zi) + (xi * h + yi)); + if (position < (w * h * d) && xi >= xmin && xi <= xmax && yi >= ymin && yi <= ymax && zi >= zmin && zi <= zmax) { // && d1< (R + 1) + + f = (float)I[position]; + sumf += f; + sum_dE_debug(dEdp, dEdq, counter, c, s, R, f); + //printf("dEdp.x=%f and dEdq.x=%f \t dEdp.y=%f and dEdq.y=%f \t dEdp.z=%f and dEdq.z=%f", dEdp.x, dEdq.x, dEdp.y, dEdq.y, dEdp.z, dEdq.z); + + } + + +} + + + + + +void snake_evolve(sphere &snakes, float* I, int w, int h, int d, float dt, int itr, point* samples, size_t sampleNum, bool MC, bool debug = false) { + point dEdp(0.0, 0.0, 0.0); // energy gradient wrt p + point dEdq(0.0, 0.0, 0.0); // energy gradient wrt q + for (int numItr = 0; numItr < itr; numItr++) { + if (MC) + snake_Engrad_MC(dEdp, dEdq, snakes, I, samples, sampleNum, w, h, d, debug); + else + snake_Engrad(dEdp, dEdq, snakes, I, w, h, d, debug); + + if (debug) printf("dEdp.x=%f \t dEdq.x=%f \n dEdp.y=%f \t dEdp.z=%f \n\n", dEdp.x, dEdq.x, dEdp.y, dEdp.z); + float factor = sqrt(float(numItr + 1)); // step size in gradient descent decreasing by number of iterations + + snakes.update(dEdp, dEdq, dt / factor); + + } + + +} + + + +//-------------------------------------------Kernels-------------------------------------------------------------------------- + +//__global__ void kernel_snake_evolve_MC(sphere * snakes, float* I, point* samples, int sampleNum, size_t snakeNum, size_t w, size_t h, size_t d, int itr, float dt, bool debug = false) { +// +// int idx = blockDim.x * blockIdx.x + threadIdx.x; +// +// if (idx >= snakeNum) // return if the number of threads is more than snakes +// return; +// +// +// if (idx == 0) +// printf("\n\n \t\t=============>>>>we are in the MC kernel\n\n"); +// point dEdp(0.0, 0.0, 0.0); // energy gradient wrt p +// point dEdq(0.0, 0.0, 0.0); // energy gradient wrt q +// sphere s = snakes[idx]; +// +// for (int i = 0; i < itr; i++) { +// if (debug) printf("\n\n---------------->> iteration %u\n", i); +// snake_Engrad_MC(dEdp, dEdq, s, I, samples, sampleNum, w, h, d, debug); +// float factor = sqrtf(float(i + 1)); +// if (debug) +// printf("dEdp.x=%f and dEdp.y=%f and dEdp.z=%f and dEdq.x=%f\n", dEdp.x, dEdp.y, dEdp.z, dEdq.x); +// s.update(dEdp, dEdq, dt / factor); +// +// } +// +// snakes[idx] = s; +// +// +//} + +__global__ void kernel_snake_evolve_MC_parallel(sphere * snakes, float* I, point* samples, int sampleNum, size_t snakeNum, size_t threads, size_t w, size_t h, size_t d, int itr, float dt, bool debug = false) { + + extern __shared__ float sharedPtr[]; // define shared memory to save result of each thread there + int n = floorf(sampleNum / threads) ; //# given sample points to one thread + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + if (idx >= (snakeNum * threads)) // return if the number of threads is more than snakes + return; + + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f))); //snake cannot be smaller than threshold + /*if (idx == 0) { + printf("\n\n \t\t=============>>>>we are in the MC kernel\n\n"); + printf("number of samples per thread=%d\n", n); + printf("idx=%d\n", (snakeNum * threads)); + printf("blockdim.x=%d and threads=%u \n", blockDim.x, threads); + }*/ + + point thread_sample; //sample goes to the thread + //sum_counter is real number of samples inside the contour averaged. some points of samples may round out of contour and did not contribute in averaging + for (int i = 0; i < itr; i++) { + //check the snake, if it pass the threshold + + if (snakes[blockIdx.x].r() < threshold) { + if (debug) printf("\t RADIUS IS OUT OF RANGE\n"); + return; + } + + float radius = snakes[blockIdx.x].r(); // radius of outer snake + point c = snakes[blockIdx.x].c(); // center of snake + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere + float tempXmax = ceil(c.x + radius + 1); + float tempYmin = floor(c.y - radius - 1); + float tempYmax = ceil(c.y + radius + 1); + float tempZmin = floor(c.z - radius - 1); + float tempZmax = ceil(c.z + radius + 1); + + float xmin = max((float)tempXmin, (float) 0.0); //clsamples(amp the bounding box to the image edges + float xmax = min((float)tempXmax, (float)(w - 1.0)); + float ymin = max((float)tempYmin, (float)0.0); + float ymax = min((float)tempYmax, (float)(h - 1.0)); + float zmin = max((float)tempZmin, (float)0.0); + float zmax = min((float)tempZmax, (float)(d - 1.0)); + + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) { + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)"); + return; + } + + + + if (debug) printf("\n\n---------------->> iteration %u\n", i); + int counter = 0; + point dEdp(0.0f, 0.0f, 0.0f); // energy gradient wrt p + point dEdq(0.0f, 0.0f, 0.0f); // energy gradient wrt q + for (int j = 0; j < n; j++) { + //sphere single_snake = snakes[blockIdx.x]; // each block is assigned to one snake. all threads in a block are working for that snake + thread_sample = samples[threadIdx.x * n + j]; + snake_Engrad_MC_parallel(dEdp, dEdq, counter, snakes[blockIdx.x], I, thread_sample, w, h, d, debug); + + } + /*if (idx == 0) { + printf("points in the contour for thread0=%d\n", counter); + printf("dEdp.x=%f and dEdq.x=%f \t dEdp.y=%f and dEdq.y=%f \t dEdp.z=%f and dEdq.z=%f", dEdp.x, dEdq.x, dEdp.y, dEdq.y, dEdp.z, dEdq.z); + } +*/ + + //copy the result of each thread in shared memory + sharedPtr[threadIdx.x * 7 + 0] = dEdp.x; + sharedPtr[threadIdx.x * 7 + 1] = dEdp.y; + sharedPtr[threadIdx.x * 7 + 2] = dEdp.z; + + sharedPtr[threadIdx.x * 7 + 3] = dEdq.x; + sharedPtr[threadIdx.x * 7 + 4] = dEdq.y; + sharedPtr[threadIdx.x * 7 + 5] = dEdq.z; + + sharedPtr[threadIdx.x * 7 + 6] = counter; + + __syncthreads(); + + //combine threads + dEdp = point(0.0f, 0.0f, 0.0f); // energy gradient wrt p + dEdq = point(0.0f, 0.0f, 0.0f); + counter = 0; + if (threadIdx.x == 0) { + float R = snakes[blockIdx.x].r(); // radius of outer snake + float R3 = R * R * R; //calculate R^3 (radius cube) + point c = snakes[blockIdx.x].c(); // center of snake + + for (int i = 0; i < threads; i++) { + dEdp.x += sharedPtr[i * 7 + 0]; + dEdp.y += sharedPtr[i * 7 + 1]; + dEdp.z += sharedPtr[i * 7 + 2]; + + dEdq.x += sharedPtr[i * 7 + 3]; + dEdq.y += sharedPtr[i * 7 + 4]; + dEdq.z += sharedPtr[i * 7 + 5]; + + counter += sharedPtr[i * 7 + 6]; + + } + + float volume = pi * (((R + 1.0f) * (R + 1.0f) * (zmax - zmin)) - ((1.0f / 3.0f) * (powf((zmax - c.z), 3.0f) + powf((c.z - zmin), 3.0f)))); + //float volume = (4.0f / 3.0f) * pi * (R + 1.0f)*(R + 1.0f)*(R + 1.0f); //(4.0f / 3.0f) * pi * powf((R + 1), 3.0f); + dEdp = dEdp * volume / (float)counter; + dEdq = dEdq * volume / (float)counter; + + + dEdp = dEdp / (8.0f * R3); + dEdq = dEdq / (8.0f * R3); + + //if (idx == 0) + //printf("dEdp.x=%f and dEdq.x=%f \t dEdp.y=%f and dEdq.y=%f \t dEdp.z=%f and dEdq.z=%f", dEdp.x, dEdq.x, dEdp.y, dEdq.y, dEdp.z, dEdq.z); + + float factor = sqrtf(float(i + 1)); + snakes[blockIdx.x].update(dEdp, dEdq, dt / factor); + + //printf("snakes[blockIdx.x].p.x=%f and snakes[blockIdx.x].p.y=%f \n snakes[blockIdx.x].q.x=%f and snakes[blockIdx.x].q.y=%f\n\n", snakes[blockIdx.x].p.x, snakes[blockIdx.x].p.y, snakes[blockIdx.x].q.x, snakes[blockIdx.x].q.y); + } + __syncthreads(); + + } + + + + + + +} + +__global__ void kernel_snake_evolve(sphere* snakes, float *I, size_t snakeNum, size_t w, size_t h, size_t d, int itr, float dt, bool debug = false) { + + //__launch_bounds__(1024, 1); + int idx = blockDim.x * blockIdx.x + threadIdx.x; + + if (idx >= snakeNum) // return if the number of threads is more than snakes + return; + + + if (idx == 0) + printf("\n\n \t\t=============>>>>we are in the kernel\n\n"); + point dEdp(0.0, 0.0, 0.0); // energy gradient wrt p + point dEdq(0.0, 0.0, 0.0); // energy gradient wrt q + sphere s = snakes[idx]; + + for (int i = 0; i < itr; i++) { + + snake_Engrad(dEdp, dEdq, s, I, w, h, d, debug); + if (debug) + printf("dEdp.x=%f and dEdp.y=%f and dEdp.z=%f and dEdq.x=%f\n", dEdp.x, dEdp.y, dEdp.z, dEdq.x); + float factor = sqrtf(float(i + 1)); + s.update(dEdp, dEdq, dt / factor); + } + + snakes[idx] = s; + +} + + +// -----------------------------------Energy computaion and compare hypersnakes------------------------------------------------------------------------ +__host__ __device__ void sum_E(float& E, point c, point s, float R, float f) { + + float r = R / cubeRoot2; // radius of inner snake + float dz2 = (s.z - c.z)*(s.z - c.z); + float dy2 = (s.y - c.y)*(s.y - c.y); + float dx2 = (s.x - c.x)*(s.x - c.x); + float d = sqrt(dz2 + dy2 + dx2); //distance bw given sample and center of contour + + + if (d < (r - 0.5f*deltar)) // mouth of snake + E -= f; + + else if (d < (r + 0.5f*deltar)) { // throat of snake + float S = (2.0f / deltar) * (d - r); // weight function value in the given point + E += (S * f); + } + + else if (d < (R - 0.5f*deltaR)) // coil of snake + E += f; + + else if (d < (R + 0.5f*deltaR)) { // fangs of snake + float S = -(1.0f / deltaR) * (d - (R + deltaR / 2.0f)); + E += (S * f); + } + +} +__host__ __device__ void snake_energy(float &energy, sphere snake, float* I, size_t w, size_t h, size_t d) { + + float radius = snake.r(); // radius of outer snake + point c = snake.c(); // center of snake + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f))); + if (radius < threshold) { + //printf("radius is out of range\n"); + return; + } + + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere + float tempXmax = ceil(c.x + radius + 1); + float tempYmin = floor(c.y - radius - 1); + float tempYmax = ceil(c.y + radius + 1); + float tempZmin = floor(c.z - radius - 1); + float tempZmax = ceil(c.z + radius + 1); + + float xmin = max((float)tempXmin, (float) 0.0); //clamp the bounding box to the image edges + float xmax = min((float)tempXmax, (float)(w - 1)); + float ymin = max((float)tempYmin, (float)0); + float ymax = min((float)tempYmax, (float)(h - 1)); + float zmin = max((float)tempZmin, (float)0); + float zmax = min((float)tempZmax, (float)(d - 1)); + + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) { + //printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)"); + return; + } + + float E = 0.0f; + float R = radius; //simplify radius to R + float R3 = R * R * R; //calculate R^2 (radius squared) + + for (unsigned int z = (unsigned int)zmin; z <= (unsigned int)zmax; z++) { // for each section + for (unsigned int y = (unsigned int)ymin; y <= (unsigned int)ymax; y++) { //for each row of section in the bounding box + for (unsigned int x = (unsigned int)xmin; x <= (unsigned int)xmax; x++) { //for each pixel p in the row + + point s(x, y, z); // a sample inside the contour + float f; // image value in given position + int position = (int)((w * h * z) + (x * h + y)); + if (position < (w * h * d)) { + f = I[position]; + if (!f == 0) + sum_E(E, c, s, R, f); + } + + } + } + } + energy = E / (8 * R3); +} + +//compute energy of snakes +__global__ void kernel_snake_energy(float* energy, sphere* snakes, float* I, size_t snakeNum, size_t w, size_t h, size_t d) { + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i >= snakeNum) return; // return if the number of threads is more than snakes + + if (i == 0) { + printf("we are in energy kernel\n"); + } + float energy_temp; + snake_energy(energy_temp, snakes[i], I, w, h, d); + energy[i] = energy_temp; + +} + + +// returns a set of snake indices that meet specific criteria(to be determined and refined by Mahsa) +std::vector DetectValidSnakes_GPU(sphere* snakes, size_t snakeNum, float* I, size_t w, size_t h, size_t d, size_t threads, float energy_th) { + + ////calculate energy + float *gpu_energy; + HANDLE_ERROR(cudaMalloc(&gpu_energy, snakeNum * sizeof(float))); + size_t blocks = snakeNum / threads + 1; + + //// allocate memory and copy snakes to the device + sphere* gpu_snakes = new sphere[snakeNum]; + HANDLE_ERROR(cudaMalloc(&gpu_snakes, snakeNum * sizeof(sphere))); + HANDLE_ERROR(cudaMemcpy(gpu_snakes, snakes, snakeNum * sizeof(sphere), cudaMemcpyHostToDevice)); + + //// allocate memory and copy image to device + float *gpu_I; + HANDLE_ERROR(cudaMalloc(&gpu_I, w * h * d * sizeof(float))); + HANDLE_ERROR(cudaMemcpy(gpu_I, I, w * h * d * sizeof(float), cudaMemcpyHostToDevice)); + //create an array to store the snake energies' on cpu + float *energy; + energy = (float*)malloc(snakeNum * sizeof(float)); + //memset(energy, 0, snakeNum * sizeof(float)); + kernel_snake_energy << < (unsigned int)blocks, (unsigned int)threads >> > (gpu_energy, gpu_snakes, gpu_I, snakeNum, w, h, d); + + HANDLE_ERROR(cudaMemcpy(energy, gpu_energy, snakeNum * sizeof(float), cudaMemcpyDeviceToHost)); + //std::ofstream outfile("energy.txt"); //open a file for writing + //for (size_t i = 0; i < snakeNum; i++) { + // outfile << energy[i] << std::endl; //output the energy + //} + //outfile.close(); + + cudaFree(gpu_energy); + cudaFree(gpu_I); + cudaFree(gpu_snakes); + + + // compare snakes in possible overlaps + std::vector id; //store indices of snakes which have overlaps with snake i + std::vector idx; //create a vector to store indices of snakes which must be deleted. + + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f))); + + for (size_t i = 0; i < snakeNum; i++) { + if (snakes[i].r() < threshold) + idx.push_back(i); + + if (std::find(idx.begin(), idx.end(), i) == idx.end()) { // check if snake is already deleted + id.clear(); + for (size_t j = 0; j < snakeNum; j++) { + if (j != i) { + if (snakes[j].c().x > snakes[i].c().x - 2 * snakes[i].r() && snakes[j].c().x < snakes[i].c().x + 2 * snakes[i].r() && snakes[j].c().y > snakes[i].c().y - 2 * snakes[i].r() && snakes[j].c().y < snakes[i].c().y + 2 * snakes[i].r()) { + if (snakes[j].r() < threshold) + idx.push_back(j); + if (std::find(idx.begin(), idx.end(), j) == idx.end()) { + float centerDistance_x = snakes[i].c().x - snakes[j].c().x; // centers distance of snakes i and j- in x direction + float centerDistance_y = snakes[i].c().y - snakes[j].c().y; // centers distance of snakes i and j- in y direction + float centerDistance_z = snakes[i].c().z - snakes[j].c().z; // centers distance of snakes i and j- in z direction + float centerDistance = sqrt((centerDistance_x * centerDistance_x) + (centerDistance_y * centerDistance_y) + 4 * (centerDistance_z * centerDistance_z)); // euclidean distance bw center snakes i and j + float maxRadius = max(snakes[i].r(), snakes[j].r()); // maximum of radius of snake i and snake j + if (centerDistance < (maxRadius / cubeRoot2)) + id.push_back(j); // store indices of overlapped snakes with snake i + } + } + } + } + if (!id.empty()) { + id.push_back(i); + float smallest = energy[id[0]]; + size_t smallest_id = id[0]; // index of snake should be kept + for (int k = 0; k < id.size(); k++) { // find snake with smallest energy among id vector + if (energy[id[k]] < smallest) { + smallest = energy[id[k]]; + smallest_id = id[k]; + } + } + for (int m = 0; m < id.size(); m++) { + if (id[m] != smallest_id) // among snakes with overlaps, the one with smallest energy survies. + idx.push_back(id[m]); // idx stores snakes which should be deleted + + } + } + } + + } + std::vector th_idx; //create a vector to store final indices exclusive idx (indices of snakes with large energy) and the ones with energy higher than threshold + for (size_t c = 0; c < snakeNum; c++) { + if (std::find(idx.begin(), idx.end(), c) == idx.end()) { + if (energy[c] < energy_th) + th_idx.push_back(c); //if the snake exceeds the energy threshold, store the index + } + } + free(energy); + return th_idx; //return the indices of valid snakes +} + + + + +///..................................................................main function............................................................................................. +void advertise() { + std::cout << "this is Hypersnakuscule implementation" << std::endl; + std::cout << "reference papre for 2D is (Snakuscules by Philippe Thévenaz and Michael Unser)" << std::endl; + std::cout << "implemented by Mahsa Lotfollahi" << std::endl << std::endl; + std::cout << "Usage: snakuscules input_image [options]" << std::endl; +} + +int main(int argc, char* argv[]) { + stim::arglist args; // create an argument list + + args.add("help", "prints this help"); + args.add("iter", "number of iteration for evolving contour", "400", "positive value"); + args.add("radius", "initial radius", "15", "real positive value"); + args.add("size", "specify size of image in 3 dimension", "", "[w h d]"); + args.add("dt", "gradient descend stepsize", "10", "real positive value"); + args.add("cuda", "specify the device used for CUDA calculations", "0", "device ID, -1 for CPU"); + args.add("mc", "specify using Monte Carlo sampling", "", "MC=1 for Monte Carlo sampling and 0 for original integration"); + args.add("single", "specify a single contour to evolve", "", "[x y z r]"); + args.add("filter", "specify filter type for preprocessing like log", "", "name of filter"); + args.add("energy_th", "specify energy threshold, snakes with energies less than that survive", "-1", "small negative value"); + args.add("debug", "output debugging information"); + args.parse(argc, argv); + + if (args["help"].is_set()) { //output help if requested by the user + advertise(); + std::cout << args.str() << std::endl; + return 1; + } + + if (args.nargs() < 1) { + std::cout << "ERROR: no input file specified" << std::endl; + return 1; + } + + std::string output_file = "output.txt"; //set the default output file name to "output.txt" + if (args.nargs() >= 2) output_file = args.arg(1); //if an output is specified by the user, use that instead + + int itr = args["iter"].as_int(); //get input parameters and set variables + float radius = (float)args["radius"].as_float(); + if (!args["size"]) { //get size of image + std::cout << "you should specify size of image in 3 dimension" << std::endl; + return 1; + } + + int w = args["size"].as_int(0); + int h = args["size"].as_int(1); + int d = args["size"].as_int(2); + + float energy_th = (float)args["energy_th"].as_float(); + float dt = (float)args["dt"].as_float(); + int cuda_device = args["cuda"].as_int(); //get the desired CUDA device + bool MC = false; + int sampleNum; + if (args["mc"]) MC = true; + sampleNum = 20000; + + if (args["mc"].nargs() > 0) + sampleNum = args["mc"].as_int(); + + bool swarm = true; + if (args["single"]) swarm = false; //if the user specifies a single snake parameter, don't use the swarm algorithm + + bool Filter = false; + std::string filter_name; + int kernel_size; // kernel size + if (args["filter"]) { + Filter = true; + filter_name = "log"; + kernel_size = 5; + if (args["filter"].nargs() > 0) + filter_name = args["filter"].as_string(0); + if (args["filter"].nargs() > 1) + kernel_size = args["filter"].as_int(1); + + } + + + //allocate memory in cpu for input image + size_t bytes = w * h * d * sizeof(float); //number of bytes needed to store image + float* I = (float*)malloc(bytes); + // load input image+ + std::ifstream inputfile(args.arg(0), std::ios::in | std::ios::binary); + if (!inputfile) { + std::cout << "cannot open specified input file" << std::endl; + return; + } + inputfile.read((char*)I, bytes); + inputfile.close(); + size_t N = w * h * d; // number of pixels (# array elements) + float* I_original = (float*)malloc(bytes); // keep the original image without pre-processing + memcpy(I_original, I, bytes); + stretch(I_original, N, 0, 255); + stretch(I, N, 0, 255); + + + if (Filter) { // compute log of image + if (filter_name == "log") { + for (int i = 0; i < N; i++) + I[i] = log(I[i] + 1); + } + if (filter_name == "median") { // apply median filter on each section + float* I_2D = (float*)malloc(w*h * sizeof(float)); // allocate memory to 2_D sections + + cv::Mat t_I_2D_mat(w, h, CV_32F); //image is stored column major but open cv read and write row major---matrix is transposed + cv::Mat I2d_blurred(w, h, CV_32F); //allocate memory for blured image + + for (int zz = 0; zz < d; zz++) { + memcpy(I_2D, I_original + (zz * w *h), w * h * sizeof(float)); // copy each section of 3D image(I) in an array + cv::Mat I_2D_mat(h, w, CV_32F, I_2D); // create a Mat to copy data to that and be able to use open cv median filter + cv::transpose(I_2D_mat, t_I_2D_mat); + cv::medianBlur(t_I_2D_mat, I2d_blurred, kernel_size); // apply median filter + cv::transpose(I2d_blurred, I_2D_mat); // transpose to be transfered to array + I_2D = (float *)I_2D_mat.data; + memcpy(I + (zz * w *h), I_2D, w * h * sizeof(float)); + + } + free(I_2D); + t_I_2D_mat.release(); + I2d_blurred.release(); + } + + } + + stretch(I, N, 0, 255); + size_t snakeNum; + if (swarm) { + int D = int(sqrt(1.5)*radius); //distance between hypersnakes + snakeNum = w * h * d / (D * D * D ); // approximate number of hypersnakes lying on image + } + else + snakeNum = 1; + sphere* snakes = new sphere[snakeNum]; //create an array of spheres.(one for each snake) + memset(snakes, 0, snakeNum * sizeof(sphere)); + + if (swarm) { + initialSwarmSnake(snakes, snakeNum, w, h, d, radius); + std::cout << "number of snakes=" << snakeNum << std::endl; + } + + + else { + point center; + center.x = (float)args["single"].as_float(0); + center.y = (float)args["single"].as_float(1); + center.z = (float)args["single"].as_float(2); + //cout << "number of args" << args["single"].nargs() << endl; + if (args["single"].nargs() >= 4) + radius = (float)args["single"].as_float(3); + + snakes[0].p.x = center.x - radius; // define p and q + snakes[0].q.x = center.x + radius; + snakes[0].p.y = snakes[0].q.y = center.y; + snakes[0].p.z = snakes[0].q.z = center.z; + + } + + point* samples = (point*)malloc(sampleNum * sizeof(point)); + memset(samples, 0, sampleNum * sizeof(point)); + if (MC) { + if (args["debug"]) { + randGenerator_cube(samples, sampleNum, true); + + } + else randGenerator_cube(samples, sampleNum); + } + + std::cout << "energy_th=" << energy_th << std::endl; + std::cout << "initial radius=" << radius << std::endl; + //-----------------------------------GPU implementation---------------------------------------------------------------------------- + if (cuda_device >= 0) { + + cudaDeviceProp prop; + HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0)); + size_t threads = (size_t)prop.maxThreadsPerBlock; + size_t blocks = snakeNum; + size_t nbyte_shared = 7 * 4 * threads; + // allocate memory to snakes and copy them to device + sphere* gpu_snakes; + HANDLE_ERROR(cudaMalloc(&gpu_snakes, snakeNum * sizeof(sphere))); + HANDLE_ERROR(cudaMemcpy(gpu_snakes, snakes, snakeNum * sizeof(sphere), cudaMemcpyHostToDevice)); + + //allocate memory to image in device and copy from cpu to device + float* gpu_I; + HANDLE_ERROR(cudaMalloc(&gpu_I, bytes)); + HANDLE_ERROR(cudaMemcpy(gpu_I, I, bytes, cudaMemcpyHostToDevice)); + stim::gpuStartTimer(); + if (MC) { + // allocate memory to random samples on device + point* G_samples; + HANDLE_ERROR(cudaMalloc(&G_samples, sampleNum * sizeof(point))); + HANDLE_ERROR(cudaMemset(G_samples, 0, sampleNum * sizeof(point))); + HANDLE_ERROR(cudaMemcpy(G_samples, samples, sampleNum * sizeof(point), cudaMemcpyHostToDevice)); + + if (args["debug"]) + kernel_snake_evolve_MC_parallel << < blocks, threads, nbyte_shared >> > (gpu_snakes, gpu_I, G_samples, sampleNum, snakeNum, threads, w, h, d, itr, dt, true); + else + kernel_snake_evolve_MC_parallel << < blocks, threads, nbyte_shared >> > (gpu_snakes, gpu_I, G_samples, sampleNum, snakeNum, threads, w, h, d, itr, dt); + + } + else { + if (args["debug"]) + kernel_snake_evolve << > > (gpu_snakes, gpu_I, snakeNum, w, h, d, itr, dt, true); + else + kernel_snake_evolve << > > (gpu_snakes, gpu_I, snakeNum, w, h, d, itr, dt); + } + + std::cout << "gpuruntime = " << stim::gpuStopTimer() << " ms" << std::endl; + HANDLE_ERROR(cudaMemcpy(snakes, gpu_snakes, snakeNum * sizeof(sphere), cudaMemcpyDeviceToHost)); + cudaFree(gpu_I); + cudaFree(gpu_snakes); + + std::vector idx = DetectValidSnakes_GPU(snakes, snakeNum, I_original, w, h, d, threads, energy_th); + SaveSnakes(output_file, snakes, idx); + + + + } + + //--------------------------------------------CPU implementation---------------------------------------------------------------------------- + else { + unsigned int start = time(NULL); + std::cout << "it is running on CPU" << std::endl; + for (int i = 0; i < snakeNum; i++) { + //printf("\n\n---------------->> iteration %u", numItr); + if (args["debug"]) + snake_evolve(snakes[i], I, w, h, d, dt, itr, samples, sampleNum, MC, true); + else + snake_evolve(snakes[i], I, w, h, d, dt, itr, samples, sampleNum, MC); + + std::cout << "Output Snakes------------------" << std::endl; + std::cout << snakes[i].str() << std::endl; + + } + unsigned int end = time(NULL); + std::cout << "cpuRunTime=" << end - start << "s" << std::endl; + + + std::vector idx = DetectValidSnakes_GPU(snakes, snakeNum, I_original, w, h, d, 512, energy_th); + SaveSnakes(output_file, snakes, idx); + + } + + free(I); + free(I_original); + if (args["debug"]) { + std::vector idx(snakeNum); + std::iota(idx.begin(), idx.end(), 0); + //SaveSnakes(debug_file, snakes, idx); //saves the snakes to an output file + std::cout << "Output Snakes------------------" << std::endl; + for (size_t i = 0; i < idx.size(); i++) { //for each snake + std::cout << snakes[idx[i]].str() << std::endl; + } + } + + std::vector idx(snakeNum); + std::iota(idx.begin(), idx.end(), 0); + SaveSnakes("no-delete.txt", snakes, idx); //saves the snakes to an output file + return 0; + +} -- libgit2 0.21.4