Commit f6438346ee2bdf15cdc3f057f58447935dfe13af

Authored by Mahsa Lotfollahi
1 parent 547f6b98

initial commit after PLoS submission

Showing 3 changed files with 1279 additions and 0 deletions   Show diff stats
FindSTIM.cmake 0 → 100644
  1 +include(FindPackageHandleStandardArgs)
  2 +
  3 +set(STIM_INCLUDE_DIR $ENV{STIMLIB_PATH})
  4 +
  5 +find_package_handle_standard_args(STIM DEFAULT_MSG STIM_INCLUDE_DIR)
  6 +
  7 +if(STIM_FOUND)
  8 + set(STIM_INCLUDE_DIRS ${STIM_INCLUDE_DIR})
  9 +endif()
... ...
hypersnakuscule.h 0 → 100644
  1 +#ifndef SNAKUSCULE_H
  2 +#define SNAKUSCULE_H
  3 +
  4 +template <typename T >
  5 +class point {
  6 +public:
  7 +
  8 + T x;
  9 + T y;
  10 + T z;
  11 +
  12 + //default constructor
  13 + CUDA_CALLABLE point() {
  14 + x = 0;
  15 + y = 0;
  16 + z = 0;
  17 + };
  18 + //constructor definition
  19 + CUDA_CALLABLE point(T a, T b, T c) {
  20 + x = a;
  21 + y = b;
  22 + z = c;
  23 + }
  24 +
  25 + CUDA_CALLABLE point <T> operator= (const point<T> &rhs) {
  26 + this->x = rhs.x;
  27 + this->y = rhs.y;
  28 + this->z = rhs.z;
  29 + return (*this);
  30 + }
  31 +
  32 + CUDA_CALLABLE point <T> operator+ (const point<T> rhs) {
  33 + point<T> temp;
  34 + temp.x = this->x + rhs.x;
  35 + temp.y = this->y + rhs.y;
  36 + temp.z = this->z + rhs.z;
  37 + return temp;
  38 +
  39 + }
  40 +
  41 + CUDA_CALLABLE point<T> operator- (const point<T> rhs) {
  42 + point<T> temp;
  43 + temp.x = this->x - rhs.x;
  44 + temp.y = this->y - rhs.y;
  45 + temp.z = this->z - rhs.z;
  46 + return temp;
  47 +
  48 + }
  49 + CUDA_CALLABLE point<T> operator*(const float rhs) {
  50 + return point(rhs * this->x, rhs * this->y, rhs * this->z );
  51 + }
  52 +
  53 +
  54 + CUDA_CALLABLE point<T> operator/ (const float rhs) {
  55 + return point(this->x / rhs , this->y / rhs, this->z/ rhs );
  56 +
  57 + }
  58 +};
  59 +
  60 +
  61 +class sphere {
  62 +
  63 +public:
  64 +
  65 + point<float> p;
  66 + point<float> q;
  67 + //default constructor
  68 + CUDA_CALLABLE sphere() {
  69 + p.x = 0;
  70 + p.y = 0;
  71 + p.z = 0;
  72 + q.x = 0;
  73 + q.y = 0;
  74 + q.z = 0;
  75 + };
  76 +
  77 + //CUDA_CALLABLE point<float> p() { return point<float>(center.x - radius, center.y); }
  78 + //CUDA_CALLABLE point<float> q() { return point<float>(center.x + radius, center.y); }
  79 + //CUDA_CALLABLE point<float> c() { return center; }
  80 + CUDA_CALLABLE point<float> c() {
  81 + point<float> center;
  82 + point<float> sum; // sum of two points to get center
  83 + sum = p + q;
  84 + center.x = 0.5f * sum.x;
  85 + center.y = 0.5f * sum.y;
  86 + center.z = 0.5f * sum.z;
  87 + return center; }
  88 +
  89 + //CUDA_CALLABLE float r() { return radius; }
  90 + CUDA_CALLABLE float r() {
  91 + float radius;
  92 + point <float> d; //distance
  93 + d = p - q;
  94 + radius = 0.5f* sqrt((d.x * d.x) + (d.y * d.y) + (d.z * d.z));
  95 + return radius; }
  96 +
  97 + CUDA_CALLABLE void update(point<float> energyGradp , point<float> energyGradq , float dt){
  98 + p = p - (energyGradp * dt);
  99 + q = q - (energyGradq * dt);
  100 + }
  101 +
  102 + CUDA_UNCALLABLE std::string str() {
  103 + std::stringstream ss;
  104 + ss << "q = (" << q.x << ", " << q.y << ", " << q.z << ")" << std::endl;
  105 + ss << "p = (" << p.x << ", " << p.y << ", " << p.z << ")" << std::endl;
  106 + point<float> center = c();
  107 + ss << "c = (" << center.x << ", " << center.y << ", " << center.z << ")" << std::endl;
  108 + ss << "r = " << r() << std::endl;
  109 + return ss.str();
  110 + }
  111 +};
  112 +
  113 +#endif
0 114 \ No newline at end of file
... ...
main.cu 0 → 100644
  1 +#include <iostream>
  2 +#include<string>
  3 +#include<cuda.h>
  4 +#include <cuda_runtime.h>
  5 +#include "device_launch_parameters.h"
  6 +#include <fstream>
  7 +#include <curand.h>
  8 +#include <curand_kernel.h>
  9 +#include <time.h>
  10 +#include <chrono>
  11 +#include <stdio.h>
  12 +#include <math.h>
  13 +#include "opencv2/core/core.hpp"
  14 +#include <opencv2/imgproc/imgproc.hpp>
  15 +#include <opencv2/opencv.hpp>
  16 +#include <opencv2/highgui/highgui_c.h>
  17 +
  18 +#define USING_OPENCV
  19 +#include<stim/image/image.h>
  20 +#include <stim/cuda/cudatools/callable.h>
  21 +#include <stim/cuda/cudatools/error.h>
  22 +#include <stim/cuda/cudatools/timer.h>
  23 +#include <stim/math/constants.h>
  24 +#include "stim/parser/arguments.h"
  25 +#include<random>
  26 +#include <numeric> // std::iota
  27 +//#include<stim/math/random.h>
  28 +#include "hypersnakuscule.h"
  29 +//#include "median2.cuh"
  30 +
  31 +#define deltaR 2.0f
  32 +#define deltar 1.5874f // deltar=deltaR/cubeRoot2
  33 +#define cubeRoot2 1.2599f
  34 +#define pi 3.14159f
  35 +//#define Energy_th -3
  36 +//----------------------------------------------functions--------------------------------------------------------------
  37 +void stretch(float* I, size_t size, int low, int high) {
  38 + //size: number of image pixel
  39 + float max_val = I[0];
  40 + float min_val = I[0];
  41 + for (int n = 0; n < size; n++) {
  42 + if (I[n] > max_val) {
  43 + max_val = I[n];
  44 + }
  45 + if (I[n] < min_val) {
  46 + min_val = I[n];
  47 + }
  48 + }
  49 + float range = max_val - min_val;
  50 + float desired_range = (float)high - (float)low;
  51 + for (size_t n = 0; n < size; n++) { //for each element in the image
  52 + I[n] = desired_range * (I[n] - min_val) / range + low;
  53 + }
  54 +
  55 +}
  56 +
  57 +/// random generator function: generate random numbers in a sphere with radius 1 in the center (0, 0, 0)
  58 +void randGenerator(point<float>* r, int sampleNum, bool debug = false) {
  59 +
  60 + for (size_t i = 0; i < sampleNum; i++) {
  61 + double rn = (double)rand() / (double)(RAND_MAX);
  62 + double theta = (double)rand() / (double)(RAND_MAX)* stim::TAU;
  63 + double cosphi = 1.0 - 2.0 * ((double)rand() / (double)(RAND_MAX));
  64 + double phi = std::acos(cosphi);
  65 + //double phi = std::acos(2.0 * v - 1.0);
  66 + //double phi = (double)rand() / (double)(RAND_MAX)* stim::PI;
  67 + //std::cout << "rn=" << rn << "\t theta=" << theta << "\tphi=" << phi << std::endl;
  68 + r[i].x = (float)(std::cbrt(rn) * cos(theta) * sin(phi));
  69 + r[i].y = (float)(std::cbrt(rn) * sin(theta) * sin(phi));
  70 + r[i].z = (float)(std::cbrt(rn) * cos(phi));
  71 +
  72 +
  73 + }
  74 + if (debug) {
  75 + std::ofstream outfile("randomSamples.txt"); //open a file for writing
  76 + for (size_t i = 0; i < sampleNum; i++) {
  77 + outfile << r[i].x << " " << r[i].y << " " << r[i].z << std::endl; //output the center and radius
  78 + }
  79 + outfile.close();
  80 + }
  81 +
  82 +}
  83 +///create random numbers in a cube and delete the one outside the sphere
  84 +void randGenerator_cube(point<float>* r, int sampleNum, bool debug = false) {
  85 + size_t counter = 0;
  86 + while (counter < sampleNum) {
  87 + double x = ((double)rand() / (double)(RAND_MAX) * 2.0) - 1.0;
  88 + double y = ((double)rand() / (double)(RAND_MAX) * 2.0) - 1.0;
  89 + double z = ((double)rand() / (double)(RAND_MAX) * 2.0) - 1.0;
  90 + double d = sqrt((x * x) + (y * y) + (z * z));
  91 +
  92 + if (d < 1.1) {
  93 + r[counter].x = (float)x;
  94 + r[counter].y = (float)y;
  95 + r[counter].z = (float)z;
  96 + counter++;
  97 + }
  98 + }
  99 +
  100 + if (debug) {
  101 + std::ofstream outfile("randomSamples.txt"); //open a file for writing
  102 + for (size_t i = 0; i < sampleNum; i++) {
  103 + outfile << r[i].x << " " << r[i].y << " " << r[i].z << std::endl; //output the center and radius
  104 + }
  105 + outfile.close();
  106 + }
  107 +
  108 +
  109 +}
  110 +
  111 +/// random generator function: generate random numbers in a sphere with radius 1 in the center (0 , 0, 0)
  112 +//void randGenerator1(point<float>* r, int sampleNum, bool debug = false) {
  113 +// for (int i = 0; i < sampleNum; i++) {
  114 +// std::default_random_engine generator1(100 + i);
  115 +// std::uniform_real_distribution<double> distribution1(0.0f, 1.0f);
  116 +// double rn = distribution1(generator1);
  117 +// std::default_random_engine generator2(50.0f + 5.0f * i);
  118 +// std::uniform_real_distribution<double> distribution2(0.0, 2.0 * stim::PI);
  119 +// double theta = distribution2(generator2);
  120 +// std::default_random_engine generator3(30.0f + 3.0f * i);
  121 +// std::uniform_real_distribution<double> distribution3(0, stim::PI);
  122 +// double phi = distribution3(generator3);
  123 +// r[i].x = (float)(std::cbrt(rn) * cos(theta) * sin(phi));
  124 +// r[i].y = (float)(std::cbrt(rn) * sin(theta) * sin(phi));
  125 +// r[i].z = (float)(std::cbrt(rn) * cos(phi));
  126 +// }
  127 +// if (debug) {
  128 +// std::ofstream outfile("randomSamples.txt"); //open a file for writing
  129 +// for (size_t i = 0; i < sampleNum; i++) {
  130 +// outfile << r[i].x << " " << r[i].y << " " << r[i].z << std::endl; //output the center and radius
  131 +// }
  132 +// outfile.close();
  133 +// }
  134 +//}
  135 +
  136 +
  137 +
  138 +/// saves the snakes specified by the idx array
  139 +void SaveSnakes(std::string filename, sphere* snakes, std::vector<size_t> idx) {
  140 + std::ofstream outfile(filename); //open a file for writing
  141 + for (size_t i = 0; i < idx.size(); i++) {
  142 + point<float> center = snakes[idx[i]].c(); //get the centerpoint of the snake
  143 + outfile << center.x << " " << center.y << " " << center.z << " " << snakes[idx[i]].r() << std::endl; //output the center and radius
  144 + }
  145 + outfile.close();
  146 +
  147 +
  148 +
  149 +}
  150 +void initialSwarmSnake(sphere * snakes, size_t &counter, size_t w, size_t h, size_t d, float radius) {
  151 + std::cout << "W=" << w << "\t h=" << h << "\td=" << d << std::endl;
  152 + int D = int(sqrt(1.5)*radius); //distance between hypersnakes
  153 + //int D = 30; //for phantom
  154 + int k1 = 0;
  155 + counter = 0;
  156 + float startpoint = 0;
  157 + for (float k = radius; k < (d - radius); k += D ) {// for phantom D
  158 + for (float j = radius; j < (h - radius); j += D) {
  159 + k1++;
  160 + if (k1 % 2 == 1)
  161 + startpoint = 0; //for phantom D
  162 + else
  163 + startpoint = (float)D / 2.0f;//for phantom D
  164 + for (float i = startpoint; i < (w - radius); i += D) {
  165 + point<float> temp_p(i, j, k);
  166 + snakes[counter].p = temp_p;
  167 + point<float> temp_q(i + (2 * radius), j, k);
  168 + snakes[counter].q = temp_q;
  169 + counter = counter + 1;
  170 + }
  171 + }
  172 + }
  173 + std::ofstream outfile("initials.txt");
  174 + for (int i = 0; i < counter; i++) {
  175 + outfile << snakes[i].c().x << " " << snakes[i].c().y << " " << snakes[i].c().z << " " << snakes[i].r() << std::endl;
  176 + }
  177 + outfile.close();
  178 +}
  179 +
  180 +__host__ __device__ void sum_dE(point<float>& dEdp, point<float>& dEdq, point<float> c, point<float> s, float R, float f) {
  181 +
  182 + float r = R / cubeRoot2; // radius of inner snake
  183 + float dz2 = (s.z - c.z)*(s.z - c.z);
  184 + float dy2 = (s.y - c.y)*(s.y - c.y);
  185 + float dx2 = (s.x - c.x)*(s.x - c.x);
  186 + float d = sqrt(dz2 + dy2 + dx2); //distance bw given sample and center of contour
  187 +
  188 + float gx = 2.0f * R; // gx= snake.q.x - snake.p.x ;
  189 + float dx = (c.x - s.x) / d;
  190 + float dy = (c.y - s.y) / d;
  191 + float dz = (c.z - s.z) / d;
  192 +
  193 + if (d < (r - 0.5f*deltar)) { // mouth of snake
  194 + dEdp.x -= 3.0f / gx * f; //calculate the growth/shrinking term for the snake
  195 + dEdq.x += 3.0f / gx * f;
  196 + }
  197 + else if (d < (r + 0.5f*deltar)) { // throat of snake
  198 + float S = (2.0f / deltar) * (d - r); // weight function value in the given point
  199 + dEdp.x += f * ((3.0f / gx) * S + (dx + (1.0f / cubeRoot2)) / deltar);
  200 + dEdp.y += f *(dy / deltar);
  201 + dEdp.z += f *(dz / deltar);
  202 +
  203 + dEdq.x += f * ((-3.0f / gx) * S + (dx - (1.0f / cubeRoot2)) / deltar);
  204 + dEdq.y = dEdp.y;
  205 + dEdq.z = dEdp.z;
  206 +
  207 + }
  208 + else if (d < (R - 0.5f*deltaR)) { // coil of snake
  209 + dEdp.x += 3.0f / gx * f;
  210 + dEdq.x -= 3.0f / gx * f;
  211 + }
  212 +
  213 + else if (d < (R + 0.5f*deltaR)) { // fangs of snake
  214 + float S = -(1.0f / deltaR) * (d - (R + deltaR / 2.0f));
  215 +
  216 + dEdp.x += f * ((3.0f * S / gx) - (0.5f * (dx + 1.0f) / deltaR));
  217 + dEdp.y -= 0.5f *(dy / deltaR) * f;
  218 + dEdp.z -= 0.5f *(dz / deltaR) * f;
  219 +
  220 + dEdq.x += f *((-3.0f * S / gx) - (0.5f * (dx - 1.0f) / deltaR));
  221 + dEdq.y = dEdp.y;
  222 + dEdq.z = dEdp.z;
  223 + }
  224 +
  225 +}
  226 +
  227 +//sum_dE in debug mode
  228 +__host__ __device__ void sum_dE_debug(point<float>& dEdp, point<float>& dEdq, int &counter, point<float> c, point<float> s, float R, float f) {
  229 +
  230 + float r = R / cubeRoot2; // radius of inner snake
  231 + float dz2 = (s.z - c.z)*(s.z - c.z);
  232 + float dy2 = (s.y - c.y)*(s.y - c.y);
  233 + float dx2 = (s.x - c.x)*(s.x - c.x);
  234 + float d = sqrt(dz2 + dy2 + dx2); //distance bw given sample and center of contour
  235 +
  236 + float gx = 2.0f * R; // gx= snake.q.x - snake.p.x ;
  237 + float dx = (c.x - s.x) / d;
  238 + float dy = (c.y - s.y) / d;
  239 + float dz = (c.z - s.z) / d;
  240 + int trivial = 0; //to test if any point is out of the contour
  241 + if (d < (r - 0.5f*deltar)) { // mouth of snake
  242 + counter++;
  243 + dEdp.x -= 3.0f / gx * f; //calculate the growth/shrinking term for the snake
  244 + dEdq.x += 3.0f / gx * f;
  245 + }
  246 + else if (d < (r + 0.5f*deltar)) { // throat of snake
  247 + counter++;
  248 + float S = (2.0f / deltar) * (d - r); // weight function value in the given point
  249 + dEdp.x += f * ((3.0f / gx) * S + (dx + (1.0f / cubeRoot2)) / deltar);
  250 + dEdp.y += f *(dy / deltar);
  251 + dEdp.z += f *(dz / deltar);
  252 +
  253 + dEdq.x += f * ((-3.0f / gx) * S + (dx - (1.0f / cubeRoot2)) / deltar);
  254 + dEdq.y = dEdp.y;
  255 + dEdq.z = dEdp.z;
  256 +
  257 + }
  258 + else if (d < (R - 0.5f*deltaR)) { // coil of snake
  259 + counter++;
  260 + dEdp.x += 3.0f / gx * f;
  261 + dEdq.x -= 3.0f / gx * f;
  262 + }
  263 +
  264 + else if (d < (R + 0.5f*deltaR)) { // fangs of snake
  265 + counter++;
  266 + float S = -(1.0f / deltaR) * (d - (R + deltaR / 2.0f));
  267 +
  268 + dEdp.x += f * ((3.0f * S / gx) - (0.5f * (dx + 1.0f) / deltaR));
  269 + dEdp.y -= 0.5f *(dy / deltaR) * f;
  270 + dEdp.z -= 0.5f *(dz / deltaR) * f;
  271 +
  272 + dEdq.x += f *((-3.0f * S / gx) - (0.5f * (dx - 1.0f) / deltaR));
  273 + dEdq.y = dEdp.y;
  274 + dEdq.z = dEdp.z;
  275 + }
  276 + else
  277 + trivial++;
  278 +
  279 +}
  280 +
  281 +// this function calculate gradient of energy with respect to two point p and q
  282 +__host__ __device__ void snake_Engrad(point<float>&dEdp, point<float>&dEdq, sphere snake, float* I, size_t w, size_t h, size_t d, bool debug = false) {
  283 +
  284 + float radius = snake.r(); // radius of outer snake
  285 + point<float> c = snake.c(); // center of snake
  286 + dEdp = point<float>(0, 0, 0); //initialize dEdp and dEdq to zero
  287 + dEdq = point<float>(0, 0, 0);
  288 +
  289 + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f)));
  290 + if (radius < threshold) {
  291 + if (debug) printf("\t RADIUS IS OUT OF RANGE\n");
  292 + return;
  293 + }
  294 +
  295 + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere
  296 + float tempXmax = ceil(c.x + radius + 1);
  297 + float tempYmin = floor(c.y - radius - 1);
  298 + float tempYmax = ceil(c.y + radius + 1);
  299 + float tempZmin = floor(c.z - radius - 1);
  300 + float tempZmax = ceil(c.z + radius + 1);
  301 +
  302 + float xmin = max((float)tempXmin, (float) 0.0); //clamp the bounding box to the image edges
  303 + float xmax = min((float)tempXmax, (float)(w - 1));
  304 + float ymin = max((float)tempYmin, (float)0);
  305 + float ymax = min((float)tempYmax, (float)(h - 1));
  306 + float zmin = max((float)tempZmin, (float)0);
  307 + float zmax = min((float)tempZmax, (float)(d - 1));
  308 +
  309 + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) {
  310 + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)");
  311 + return;
  312 + }
  313 +
  314 + float R = radius; //simplify radius to R
  315 + float R3 = R * R * R; //calculate R^2 (radius squared)
  316 + for (unsigned int z = (unsigned int)zmin; z <= (unsigned int)zmax; z++) { // for each section
  317 + for (unsigned int x = (unsigned int)xmin; x <= (unsigned int)xmax; x++) { //for each column of section in the bounding box
  318 + for (unsigned int y = (unsigned int)ymin; y <= (unsigned int)ymax; y++) { //for each pixel p in the column
  319 +
  320 + point<float> s(x, y, z); // a sample inside the contour
  321 + float f; // image value in given position
  322 +
  323 + int position = (int)((w * h * z) + (x * h + y));
  324 + if (position < (w * h * d)) {
  325 + f = I[position];
  326 +
  327 +
  328 + if (!f == 0)
  329 + sum_dE(dEdp, dEdq, c, s, R, f);
  330 +
  331 + }
  332 +
  333 + }
  334 + }
  335 + }
  336 +
  337 + dEdp = dEdp / (8 * R3);
  338 + dEdq = dEdq / (8 * R3);
  339 +
  340 +
  341 +}
  342 +
  343 +//// this function calculate gradient of energy with respect to two point p and q using Monte Carlo
  344 +__host__ __device__ void snake_Engrad_MC(point<float>&dEdp, point<float>&dEdq, sphere snake, float* I, point<float>* samples, size_t sampleNum, size_t w, size_t h, size_t d, bool debug = false) {
  345 +
  346 + float radius = snake.r(); // radius of outer snake
  347 + point<float> c = snake.c(); // center of snake
  348 + dEdp = point<float>(0, 0, 0); //initialize dEdp and dEdq to zero
  349 + dEdq = point<float>(0, 0, 0);
  350 +
  351 + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f)));
  352 + if (radius < threshold) {
  353 + if (debug) printf("\t RADIUS IS OUT OF RANGE\n");
  354 + return;
  355 + }
  356 +
  357 + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere
  358 + float tempXmax = ceil(c.x + radius + 1);
  359 + float tempYmin = floor(c.y - radius - 1);
  360 + float tempYmax = ceil(c.y + radius + 1);
  361 + float tempZmin = floor(c.z - radius - 1);
  362 + float tempZmax = ceil(c.z + radius + 1);
  363 +
  364 + float xmin = max((float)tempXmin, (float) 0.0); //clsamples(amp the bounding box to the image edges
  365 + float xmax = min((float)tempXmax, (float)(w - 1.0));
  366 + float ymin = max((float)tempYmin, (float)0.0);
  367 + float ymax = min((float)tempYmax, (float)(h - 1.0));
  368 + float zmin = max((float)tempZmin, (float)0.0);
  369 + float zmax = min((float)tempZmax, (float)(d - 1.0));
  370 +
  371 + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) {
  372 + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)");
  373 + return;
  374 + }
  375 +
  376 +
  377 +
  378 +
  379 + float R = radius; //simplify radius to R
  380 + float R3 = R * R * R; //calculate R^3 (radius cube)
  381 + int counter = 0;
  382 + float sumf = 0.0;
  383 +
  384 + for (int i = 0; i < sampleNum; i++) {
  385 + float sx = samples[i].x;
  386 + float sy = samples[i].y;
  387 + float sz = samples[i].z;
  388 +
  389 + float x = (R + 1.0f) * sx + c.x;
  390 + float y = (R + 1.0f) * sy + c.y;
  391 + float z = (R + 1.0f) * sz + c.z;
  392 +
  393 + int xi = (int)round(x);
  394 + int yi = (int)round(y);
  395 + int zi = (int)round(z);
  396 +
  397 + point<float> s(xi, yi, zi); // a sample inside the contour
  398 + float f; // image value in given position
  399 +
  400 + int position = (int)((w * h * zi) + (xi * h + yi));
  401 + if (position < (w * h * d) && xi >= xmin && xi <= xmax && yi >= ymin && yi <= ymax && zi >= zmin && zi <= zmax) { // && d1< (R + 1)
  402 + counter++;
  403 + f = (float)I[position];
  404 + sumf += f;
  405 + //if (debug)
  406 + sum_dE_debug(dEdp, dEdq, counter, c, s, R, f);
  407 + //sum_dE(dEdp, dEdq, c, s, R, f);
  408 + }
  409 + }
  410 +
  411 +
  412 +
  413 +
  414 +
  415 + float volume = pi * (((R + 1.0f) * (R + 1.0f) * (zmax - zmin)) - ((1.0f / 3.0f) * (powf((zmax - c.z), 3.0f) + powf((c.z - zmin), 3.0f))));
  416 +
  417 + //if (debug) printf("volume_sphere=%f and volume=%f\n", volume_sphere, volume);
  418 + //if (debug) printf("sampleNum=%u and (float)counter= %f \n", sampleNum, (float)counter);
  419 +
  420 + dEdp = dEdp * volume / (float)counter;
  421 +
  422 + dEdq = dEdq * volume / (float)counter;
  423 +
  424 +
  425 + dEdp = dEdp / (8.0f * R3);
  426 + dEdq = dEdq / (8.0f * R3);
  427 +
  428 +}
  429 +
  430 +
  431 +// this function calculate gradient of energy with respect to two point p and q using Monte Carlo
  432 +__host__ __device__ void snake_Engrad_MC_parallel(point<float>&dEdp, point<float>&dEdq, int &counter, sphere snake, float* I, point<float> sample, size_t w, size_t h, size_t d, bool debug = false) {
  433 +
  434 + float radius = snake.r(); // radius of outer snake
  435 + point<float> c = snake.c(); // center of snake
  436 +
  437 + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere
  438 + float tempXmax = ceil(c.x + radius + 1);
  439 + float tempYmin = floor(c.y - radius - 1);
  440 + float tempYmax = ceil(c.y + radius + 1);
  441 + float tempZmin = floor(c.z - radius - 1);
  442 + float tempZmax = ceil(c.z + radius + 1);
  443 +
  444 + float xmin = max((float)tempXmin, (float) 0.0); //clsamples(amp the bounding box to the image edges
  445 + float xmax = min((float)tempXmax, (float)(w - 1.0));
  446 + float ymin = max((float)tempYmin, (float)0.0);
  447 + float ymax = min((float)tempYmax, (float)(h - 1.0));
  448 + float zmin = max((float)tempZmin, (float)0.0);
  449 + float zmax = min((float)tempZmax, (float)(d - 1.0));
  450 +
  451 + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) {
  452 + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)");
  453 + return;
  454 + }
  455 +
  456 +
  457 +
  458 +
  459 + float R = radius; //simplify radius to R
  460 + float sumf = 0.0;
  461 +
  462 + //for (int i = 0; i < sampleNum; i++) {
  463 + float sx = sample.x;
  464 + float sy = sample.y;
  465 + float sz = sample.z;
  466 +
  467 + float x = (R + 1.0f) * sx + c.x;
  468 + float y = (R + 1.0f) * sy + c.y;
  469 + float z = (R + 1.0f) * sz + c.z;
  470 +
  471 + int xi = (int)round(x);
  472 + int yi = (int)round(y);
  473 + int zi = (int)round(z);
  474 +
  475 + point<float> s(xi, yi, zi); // a sample inside the contour
  476 + float f; // image value in given position
  477 +
  478 + int position = (int)((w * h * zi) + (xi * h + yi));
  479 + if (position < (w * h * d) && xi >= xmin && xi <= xmax && yi >= ymin && yi <= ymax && zi >= zmin && zi <= zmax) { // && d1< (R + 1)
  480 +
  481 + f = (float)I[position];
  482 + sumf += f;
  483 + sum_dE_debug(dEdp, dEdq, counter, c, s, R, f);
  484 + //printf("dEdp.x=%f and dEdq.x=%f \t dEdp.y=%f and dEdq.y=%f \t dEdp.z=%f and dEdq.z=%f", dEdp.x, dEdq.x, dEdp.y, dEdq.y, dEdp.z, dEdq.z);
  485 +
  486 + }
  487 +
  488 +
  489 +}
  490 +
  491 +
  492 +
  493 +
  494 +
  495 +void snake_evolve(sphere &snakes, float* I, int w, int h, int d, float dt, int itr, point<float>* samples, size_t sampleNum, bool MC, bool debug = false) {
  496 + point<float> dEdp(0.0, 0.0, 0.0); // energy gradient wrt p
  497 + point<float> dEdq(0.0, 0.0, 0.0); // energy gradient wrt q
  498 + for (int numItr = 0; numItr < itr; numItr++) {
  499 + if (MC)
  500 + snake_Engrad_MC(dEdp, dEdq, snakes, I, samples, sampleNum, w, h, d, debug);
  501 + else
  502 + snake_Engrad(dEdp, dEdq, snakes, I, w, h, d, debug);
  503 +
  504 + if (debug) printf("dEdp.x=%f \t dEdq.x=%f \n dEdp.y=%f \t dEdp.z=%f \n\n", dEdp.x, dEdq.x, dEdp.y, dEdp.z);
  505 + float factor = sqrt(float(numItr + 1)); // step size in gradient descent decreasing by number of iterations
  506 +
  507 + snakes.update(dEdp, dEdq, dt / factor);
  508 +
  509 + }
  510 +
  511 +
  512 +}
  513 +
  514 +
  515 +
  516 +//-------------------------------------------Kernels--------------------------------------------------------------------------
  517 +
  518 +//__global__ void kernel_snake_evolve_MC(sphere * snakes, float* I, point<float>* samples, int sampleNum, size_t snakeNum, size_t w, size_t h, size_t d, int itr, float dt, bool debug = false) {
  519 +//
  520 +// int idx = blockDim.x * blockIdx.x + threadIdx.x;
  521 +//
  522 +// if (idx >= snakeNum) // return if the number of threads is more than snakes
  523 +// return;
  524 +//
  525 +//
  526 +// if (idx == 0)
  527 +// printf("\n\n \t\t=============>>>>we are in the MC kernel\n\n");
  528 +// point<float> dEdp(0.0, 0.0, 0.0); // energy gradient wrt p
  529 +// point<float> dEdq(0.0, 0.0, 0.0); // energy gradient wrt q
  530 +// sphere s = snakes[idx];
  531 +//
  532 +// for (int i = 0; i < itr; i++) {
  533 +// if (debug) printf("\n\n---------------->> iteration %u\n", i);
  534 +// snake_Engrad_MC(dEdp, dEdq, s, I, samples, sampleNum, w, h, d, debug);
  535 +// float factor = sqrtf(float(i + 1));
  536 +// if (debug)
  537 +// printf("dEdp.x=%f and dEdp.y=%f and dEdp.z=%f and dEdq.x=%f\n", dEdp.x, dEdp.y, dEdp.z, dEdq.x);
  538 +// s.update(dEdp, dEdq, dt / factor);
  539 +//
  540 +// }
  541 +//
  542 +// snakes[idx] = s;
  543 +//
  544 +//
  545 +//}
  546 +
  547 +__global__ void kernel_snake_evolve_MC_parallel(sphere * snakes, float* I, point<float>* samples, int sampleNum, size_t snakeNum, size_t threads, size_t w, size_t h, size_t d, int itr, float dt, bool debug = false) {
  548 +
  549 + extern __shared__ float sharedPtr[]; // define shared memory to save result of each thread there
  550 + int n = floorf(sampleNum / threads) ; //# given sample points to one thread
  551 + int idx = blockDim.x * blockIdx.x + threadIdx.x;
  552 +
  553 + if (idx >= (snakeNum * threads)) // return if the number of threads is more than snakes
  554 + return;
  555 +
  556 + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f))); //snake cannot be smaller than threshold
  557 + /*if (idx == 0) {
  558 + printf("\n\n \t\t=============>>>>we are in the MC kernel\n\n");
  559 + printf("number of samples per thread=%d\n", n);
  560 + printf("idx=%d\n", (snakeNum * threads));
  561 + printf("blockdim.x=%d and threads=%u \n", blockDim.x, threads);
  562 + }*/
  563 +
  564 + point<float> thread_sample; //sample goes to the thread
  565 + //sum_counter is real number of samples inside the contour averaged. some points of samples may round out of contour and did not contribute in averaging
  566 + for (int i = 0; i < itr; i++) {
  567 + //check the snake, if it pass the threshold
  568 +
  569 + if (snakes[blockIdx.x].r() < threshold) {
  570 + if (debug) printf("\t RADIUS IS OUT OF RANGE\n");
  571 + return;
  572 + }
  573 +
  574 + float radius = snakes[blockIdx.x].r(); // radius of outer snake
  575 + point<float> c = snakes[blockIdx.x].c(); // center of snake
  576 + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere
  577 + float tempXmax = ceil(c.x + radius + 1);
  578 + float tempYmin = floor(c.y - radius - 1);
  579 + float tempYmax = ceil(c.y + radius + 1);
  580 + float tempZmin = floor(c.z - radius - 1);
  581 + float tempZmax = ceil(c.z + radius + 1);
  582 +
  583 + float xmin = max((float)tempXmin, (float) 0.0); //clsamples(amp the bounding box to the image edges
  584 + float xmax = min((float)tempXmax, (float)(w - 1.0));
  585 + float ymin = max((float)tempYmin, (float)0.0);
  586 + float ymax = min((float)tempYmax, (float)(h - 1.0));
  587 + float zmin = max((float)tempZmin, (float)0.0);
  588 + float zmax = min((float)tempZmax, (float)(d - 1.0));
  589 +
  590 + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) {
  591 + if (debug) printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)");
  592 + return;
  593 + }
  594 +
  595 +
  596 +
  597 + if (debug) printf("\n\n---------------->> iteration %u\n", i);
  598 + int counter = 0;
  599 + point<float> dEdp(0.0f, 0.0f, 0.0f); // energy gradient wrt p
  600 + point<float> dEdq(0.0f, 0.0f, 0.0f); // energy gradient wrt q
  601 + for (int j = 0; j < n; j++) {
  602 + //sphere single_snake = snakes[blockIdx.x]; // each block is assigned to one snake. all threads in a block are working for that snake
  603 + thread_sample = samples[threadIdx.x * n + j];
  604 + snake_Engrad_MC_parallel(dEdp, dEdq, counter, snakes[blockIdx.x], I, thread_sample, w, h, d, debug);
  605 +
  606 + }
  607 + /*if (idx == 0) {
  608 + printf("points in the contour for thread0=%d\n", counter);
  609 + printf("dEdp.x=%f and dEdq.x=%f \t dEdp.y=%f and dEdq.y=%f \t dEdp.z=%f and dEdq.z=%f", dEdp.x, dEdq.x, dEdp.y, dEdq.y, dEdp.z, dEdq.z);
  610 + }
  611 +*/
  612 +
  613 + //copy the result of each thread in shared memory
  614 + sharedPtr[threadIdx.x * 7 + 0] = dEdp.x;
  615 + sharedPtr[threadIdx.x * 7 + 1] = dEdp.y;
  616 + sharedPtr[threadIdx.x * 7 + 2] = dEdp.z;
  617 +
  618 + sharedPtr[threadIdx.x * 7 + 3] = dEdq.x;
  619 + sharedPtr[threadIdx.x * 7 + 4] = dEdq.y;
  620 + sharedPtr[threadIdx.x * 7 + 5] = dEdq.z;
  621 +
  622 + sharedPtr[threadIdx.x * 7 + 6] = counter;
  623 +
  624 + __syncthreads();
  625 +
  626 + //combine threads
  627 + dEdp = point<float>(0.0f, 0.0f, 0.0f); // energy gradient wrt p
  628 + dEdq = point<float>(0.0f, 0.0f, 0.0f);
  629 + counter = 0;
  630 + if (threadIdx.x == 0) {
  631 + float R = snakes[blockIdx.x].r(); // radius of outer snake
  632 + float R3 = R * R * R; //calculate R^3 (radius cube)
  633 + point<float> c = snakes[blockIdx.x].c(); // center of snake
  634 +
  635 + for (int i = 0; i < threads; i++) {
  636 + dEdp.x += sharedPtr[i * 7 + 0];
  637 + dEdp.y += sharedPtr[i * 7 + 1];
  638 + dEdp.z += sharedPtr[i * 7 + 2];
  639 +
  640 + dEdq.x += sharedPtr[i * 7 + 3];
  641 + dEdq.y += sharedPtr[i * 7 + 4];
  642 + dEdq.z += sharedPtr[i * 7 + 5];
  643 +
  644 + counter += sharedPtr[i * 7 + 6];
  645 +
  646 + }
  647 +
  648 + float volume = pi * (((R + 1.0f) * (R + 1.0f) * (zmax - zmin)) - ((1.0f / 3.0f) * (powf((zmax - c.z), 3.0f) + powf((c.z - zmin), 3.0f))));
  649 + //float volume = (4.0f / 3.0f) * pi * (R + 1.0f)*(R + 1.0f)*(R + 1.0f); //(4.0f / 3.0f) * pi * powf((R + 1), 3.0f);
  650 + dEdp = dEdp * volume / (float)counter;
  651 + dEdq = dEdq * volume / (float)counter;
  652 +
  653 +
  654 + dEdp = dEdp / (8.0f * R3);
  655 + dEdq = dEdq / (8.0f * R3);
  656 +
  657 + //if (idx == 0)
  658 + //printf("dEdp.x=%f and dEdq.x=%f \t dEdp.y=%f and dEdq.y=%f \t dEdp.z=%f and dEdq.z=%f", dEdp.x, dEdq.x, dEdp.y, dEdq.y, dEdp.z, dEdq.z);
  659 +
  660 + float factor = sqrtf(float(i + 1));
  661 + snakes[blockIdx.x].update(dEdp, dEdq, dt / factor);
  662 +
  663 + //printf("snakes[blockIdx.x].p.x=%f and snakes[blockIdx.x].p.y=%f \n snakes[blockIdx.x].q.x=%f and snakes[blockIdx.x].q.y=%f\n\n", snakes[blockIdx.x].p.x, snakes[blockIdx.x].p.y, snakes[blockIdx.x].q.x, snakes[blockIdx.x].q.y);
  664 + }
  665 + __syncthreads();
  666 +
  667 + }
  668 +
  669 +
  670 +
  671 +
  672 +
  673 +
  674 +}
  675 +
  676 +__global__ void kernel_snake_evolve(sphere* snakes, float *I, size_t snakeNum, size_t w, size_t h, size_t d, int itr, float dt, bool debug = false) {
  677 +
  678 + //__launch_bounds__(1024, 1);
  679 + int idx = blockDim.x * blockIdx.x + threadIdx.x;
  680 +
  681 + if (idx >= snakeNum) // return if the number of threads is more than snakes
  682 + return;
  683 +
  684 +
  685 + if (idx == 0)
  686 + printf("\n\n \t\t=============>>>>we are in the kernel\n\n");
  687 + point<float> dEdp(0.0, 0.0, 0.0); // energy gradient wrt p
  688 + point<float> dEdq(0.0, 0.0, 0.0); // energy gradient wrt q
  689 + sphere s = snakes[idx];
  690 +
  691 + for (int i = 0; i < itr; i++) {
  692 +
  693 + snake_Engrad(dEdp, dEdq, s, I, w, h, d, debug);
  694 + if (debug)
  695 + printf("dEdp.x=%f and dEdp.y=%f and dEdp.z=%f and dEdq.x=%f\n", dEdp.x, dEdp.y, dEdp.z, dEdq.x);
  696 + float factor = sqrtf(float(i + 1));
  697 + s.update(dEdp, dEdq, dt / factor);
  698 + }
  699 +
  700 + snakes[idx] = s;
  701 +
  702 +}
  703 +
  704 +
  705 +// -----------------------------------Energy computaion and compare hypersnakes------------------------------------------------------------------------
  706 +__host__ __device__ void sum_E(float& E, point<float> c, point<float> s, float R, float f) {
  707 +
  708 + float r = R / cubeRoot2; // radius of inner snake
  709 + float dz2 = (s.z - c.z)*(s.z - c.z);
  710 + float dy2 = (s.y - c.y)*(s.y - c.y);
  711 + float dx2 = (s.x - c.x)*(s.x - c.x);
  712 + float d = sqrt(dz2 + dy2 + dx2); //distance bw given sample and center of contour
  713 +
  714 +
  715 + if (d < (r - 0.5f*deltar)) // mouth of snake
  716 + E -= f;
  717 +
  718 + else if (d < (r + 0.5f*deltar)) { // throat of snake
  719 + float S = (2.0f / deltar) * (d - r); // weight function value in the given point
  720 + E += (S * f);
  721 + }
  722 +
  723 + else if (d < (R - 0.5f*deltaR)) // coil of snake
  724 + E += f;
  725 +
  726 + else if (d < (R + 0.5f*deltaR)) { // fangs of snake
  727 + float S = -(1.0f / deltaR) * (d - (R + deltaR / 2.0f));
  728 + E += (S * f);
  729 + }
  730 +
  731 +}
  732 +__host__ __device__ void snake_energy(float &energy, sphere snake, float* I, size_t w, size_t h, size_t d) {
  733 +
  734 + float radius = snake.r(); // radius of outer snake
  735 + point<float> c = snake.c(); // center of snake
  736 + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f)));
  737 + if (radius < threshold) {
  738 + //printf("radius is out of range\n");
  739 + return;
  740 + }
  741 +
  742 + float tempXmin = floor(c.x - radius - 1); //calculate a bounding box around the sphere
  743 + float tempXmax = ceil(c.x + radius + 1);
  744 + float tempYmin = floor(c.y - radius - 1);
  745 + float tempYmax = ceil(c.y + radius + 1);
  746 + float tempZmin = floor(c.z - radius - 1);
  747 + float tempZmax = ceil(c.z + radius + 1);
  748 +
  749 + float xmin = max((float)tempXmin, (float) 0.0); //clamp the bounding box to the image edges
  750 + float xmax = min((float)tempXmax, (float)(w - 1));
  751 + float ymin = max((float)tempYmin, (float)0);
  752 + float ymax = min((float)tempYmax, (float)(h - 1));
  753 + float zmin = max((float)tempZmin, (float)0);
  754 + float zmax = min((float)tempZmax, (float)(d - 1));
  755 +
  756 + if ((xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)) {
  757 + //printf("(xmax <= xmin) || (ymax <= ymin) || (zmax <= zmin)");
  758 + return;
  759 + }
  760 +
  761 + float E = 0.0f;
  762 + float R = radius; //simplify radius to R
  763 + float R3 = R * R * R; //calculate R^2 (radius squared)
  764 +
  765 + for (unsigned int z = (unsigned int)zmin; z <= (unsigned int)zmax; z++) { // for each section
  766 + for (unsigned int y = (unsigned int)ymin; y <= (unsigned int)ymax; y++) { //for each row of section in the bounding box
  767 + for (unsigned int x = (unsigned int)xmin; x <= (unsigned int)xmax; x++) { //for each pixel p in the row
  768 +
  769 + point<float> s(x, y, z); // a sample inside the contour
  770 + float f; // image value in given position
  771 + int position = (int)((w * h * z) + (x * h + y));
  772 + if (position < (w * h * d)) {
  773 + f = I[position];
  774 + if (!f == 0)
  775 + sum_E(E, c, s, R, f);
  776 + }
  777 +
  778 + }
  779 + }
  780 + }
  781 + energy = E / (8 * R3);
  782 +}
  783 +
  784 +//compute energy of snakes
  785 +__global__ void kernel_snake_energy(float* energy, sphere* snakes, float* I, size_t snakeNum, size_t w, size_t h, size_t d) {
  786 + size_t i = blockDim.x * blockIdx.x + threadIdx.x;
  787 +
  788 + if (i >= snakeNum) return; // return if the number of threads is more than snakes
  789 +
  790 + if (i == 0) {
  791 + printf("we are in energy kernel\n");
  792 + }
  793 + float energy_temp;
  794 + snake_energy(energy_temp, snakes[i], I, w, h, d);
  795 + energy[i] = energy_temp;
  796 +
  797 +}
  798 +
  799 +
  800 +// returns a set of snake indices that meet specific criteria(to be determined and refined by Mahsa)
  801 +std::vector<size_t> DetectValidSnakes_GPU(sphere* snakes, size_t snakeNum, float* I, size_t w, size_t h, size_t d, size_t threads, float energy_th) {
  802 +
  803 + ////calculate energy
  804 + float *gpu_energy;
  805 + HANDLE_ERROR(cudaMalloc(&gpu_energy, snakeNum * sizeof(float)));
  806 + size_t blocks = snakeNum / threads + 1;
  807 +
  808 + //// allocate memory and copy snakes to the device
  809 + sphere* gpu_snakes = new sphere[snakeNum];
  810 + HANDLE_ERROR(cudaMalloc(&gpu_snakes, snakeNum * sizeof(sphere)));
  811 + HANDLE_ERROR(cudaMemcpy(gpu_snakes, snakes, snakeNum * sizeof(sphere), cudaMemcpyHostToDevice));
  812 +
  813 + //// allocate memory and copy image to device
  814 + float *gpu_I;
  815 + HANDLE_ERROR(cudaMalloc(&gpu_I, w * h * d * sizeof(float)));
  816 + HANDLE_ERROR(cudaMemcpy(gpu_I, I, w * h * d * sizeof(float), cudaMemcpyHostToDevice));
  817 + //create an array to store the snake energies' on cpu
  818 + float *energy;
  819 + energy = (float*)malloc(snakeNum * sizeof(float));
  820 + //memset(energy, 0, snakeNum * sizeof(float));
  821 + kernel_snake_energy << < (unsigned int)blocks, (unsigned int)threads >> > (gpu_energy, gpu_snakes, gpu_I, snakeNum, w, h, d);
  822 +
  823 + HANDLE_ERROR(cudaMemcpy(energy, gpu_energy, snakeNum * sizeof(float), cudaMemcpyDeviceToHost));
  824 + //std::ofstream outfile("energy.txt"); //open a file for writing
  825 + //for (size_t i = 0; i < snakeNum; i++) {
  826 + // outfile << energy[i] << std::endl; //output the energy
  827 + //}
  828 + //outfile.close();
  829 +
  830 + cudaFree(gpu_energy);
  831 + cudaFree(gpu_I);
  832 + cudaFree(gpu_snakes);
  833 +
  834 +
  835 + // compare snakes in possible overlaps
  836 + std::vector<size_t> id; //store indices of snakes which have overlaps with snake i
  837 + std::vector<size_t> idx; //create a vector to store indices of snakes which must be deleted.
  838 +
  839 + float threshold = ((1 + cubeRoot2) / (cubeRoot2 - 1))*(deltar / pow(2.0f, (2.0f / 3.0f)));
  840 +
  841 + for (size_t i = 0; i < snakeNum; i++) {
  842 + if (snakes[i].r() < threshold)
  843 + idx.push_back(i);
  844 +
  845 + if (std::find(idx.begin(), idx.end(), i) == idx.end()) { // check if snake is already deleted
  846 + id.clear();
  847 + for (size_t j = 0; j < snakeNum; j++) {
  848 + if (j != i) {
  849 + if (snakes[j].c().x > snakes[i].c().x - 2 * snakes[i].r() && snakes[j].c().x < snakes[i].c().x + 2 * snakes[i].r() && snakes[j].c().y > snakes[i].c().y - 2 * snakes[i].r() && snakes[j].c().y < snakes[i].c().y + 2 * snakes[i].r()) {
  850 + if (snakes[j].r() < threshold)
  851 + idx.push_back(j);
  852 + if (std::find(idx.begin(), idx.end(), j) == idx.end()) {
  853 + float centerDistance_x = snakes[i].c().x - snakes[j].c().x; // centers distance of snakes i and j- in x direction
  854 + float centerDistance_y = snakes[i].c().y - snakes[j].c().y; // centers distance of snakes i and j- in y direction
  855 + float centerDistance_z = snakes[i].c().z - snakes[j].c().z; // centers distance of snakes i and j- in z direction
  856 + float centerDistance = sqrt((centerDistance_x * centerDistance_x) + (centerDistance_y * centerDistance_y) + 4 * (centerDistance_z * centerDistance_z)); // euclidean distance bw center snakes i and j
  857 + float maxRadius = max(snakes[i].r(), snakes[j].r()); // maximum of radius of snake i and snake j
  858 + if (centerDistance < (maxRadius / cubeRoot2))
  859 + id.push_back(j); // store indices of overlapped snakes with snake i
  860 + }
  861 + }
  862 + }
  863 + }
  864 + if (!id.empty()) {
  865 + id.push_back(i);
  866 + float smallest = energy[id[0]];
  867 + size_t smallest_id = id[0]; // index of snake should be kept
  868 + for (int k = 0; k < id.size(); k++) { // find snake with smallest energy among id vector
  869 + if (energy[id[k]] < smallest) {
  870 + smallest = energy[id[k]];
  871 + smallest_id = id[k];
  872 + }
  873 + }
  874 + for (int m = 0; m < id.size(); m++) {
  875 + if (id[m] != smallest_id) // among snakes with overlaps, the one with smallest energy survies.
  876 + idx.push_back(id[m]); // idx stores snakes which should be deleted
  877 +
  878 + }
  879 + }
  880 + }
  881 +
  882 + }
  883 + std::vector<size_t> th_idx; //create a vector to store final indices exclusive idx (indices of snakes with large energy) and the ones with energy higher than threshold
  884 + for (size_t c = 0; c < snakeNum; c++) {
  885 + if (std::find(idx.begin(), idx.end(), c) == idx.end()) {
  886 + if (energy[c] < energy_th)
  887 + th_idx.push_back(c); //if the snake exceeds the energy threshold, store the index
  888 + }
  889 + }
  890 + free(energy);
  891 + return th_idx; //return the indices of valid snakes
  892 +}
  893 +
  894 +
  895 +
  896 +
  897 +///..................................................................main function.............................................................................................
  898 +void advertise() {
  899 + std::cout << "this is Hypersnakuscule implementation" << std::endl;
  900 + std::cout << "reference papre for 2D is (Snakuscules by Philippe Thévenaz and Michael Unser)" << std::endl;
  901 + std::cout << "implemented by Mahsa Lotfollahi" << std::endl << std::endl;
  902 + std::cout << "Usage: snakuscules input_image [options]" << std::endl;
  903 +}
  904 +
  905 +int main(int argc, char* argv[]) {
  906 + stim::arglist args; // create an argument list
  907 +
  908 + args.add("help", "prints this help");
  909 + args.add("iter", "number of iteration for evolving contour", "400", "positive value");
  910 + args.add("radius", "initial radius", "15", "real positive value");
  911 + args.add("size", "specify size of image in 3 dimension", "", "[w h d]");
  912 + args.add("dt", "gradient descend stepsize", "10", "real positive value");
  913 + args.add("cuda", "specify the device used for CUDA calculations", "0", "device ID, -1 for CPU");
  914 + args.add("mc", "specify using Monte Carlo sampling", "", "MC=1 for Monte Carlo sampling and 0 for original integration");
  915 + args.add("single", "specify a single contour to evolve", "", "[x y z r]");
  916 + args.add("filter", "specify filter type for preprocessing like log", "", "name of filter");
  917 + args.add("energy_th", "specify energy threshold, snakes with energies less than that survive", "-1", "small negative value");
  918 + args.add("debug", "output debugging information");
  919 + args.parse(argc, argv);
  920 +
  921 + if (args["help"].is_set()) { //output help if requested by the user
  922 + advertise();
  923 + std::cout << args.str() << std::endl;
  924 + return 1;
  925 + }
  926 +
  927 + if (args.nargs() < 1) {
  928 + std::cout << "ERROR: no input file specified" << std::endl;
  929 + return 1;
  930 + }
  931 +
  932 + std::string output_file = "output.txt"; //set the default output file name to "output.txt"
  933 + if (args.nargs() >= 2) output_file = args.arg(1); //if an output is specified by the user, use that instead
  934 +
  935 + int itr = args["iter"].as_int(); //get input parameters and set variables
  936 + float radius = (float)args["radius"].as_float();
  937 + if (!args["size"]) { //get size of image
  938 + std::cout << "you should specify size of image in 3 dimension" << std::endl;
  939 + return 1;
  940 + }
  941 +
  942 + int w = args["size"].as_int(0);
  943 + int h = args["size"].as_int(1);
  944 + int d = args["size"].as_int(2);
  945 +
  946 + float energy_th = (float)args["energy_th"].as_float();
  947 + float dt = (float)args["dt"].as_float();
  948 + int cuda_device = args["cuda"].as_int(); //get the desired CUDA device
  949 + bool MC = false;
  950 + int sampleNum;
  951 + if (args["mc"]) MC = true;
  952 + sampleNum = 20000;
  953 +
  954 + if (args["mc"].nargs() > 0)
  955 + sampleNum = args["mc"].as_int();
  956 +
  957 + bool swarm = true;
  958 + if (args["single"]) swarm = false; //if the user specifies a single snake parameter, don't use the swarm algorithm
  959 +
  960 + bool Filter = false;
  961 + std::string filter_name;
  962 + int kernel_size; // kernel size
  963 + if (args["filter"]) {
  964 + Filter = true;
  965 + filter_name = "log";
  966 + kernel_size = 5;
  967 + if (args["filter"].nargs() > 0)
  968 + filter_name = args["filter"].as_string(0);
  969 + if (args["filter"].nargs() > 1)
  970 + kernel_size = args["filter"].as_int(1);
  971 +
  972 + }
  973 +
  974 +
  975 + //allocate memory in cpu for input image
  976 + size_t bytes = w * h * d * sizeof(float); //number of bytes needed to store image
  977 + float* I = (float*)malloc(bytes);
  978 + // load input image+
  979 + std::ifstream inputfile(args.arg(0), std::ios::in | std::ios::binary);
  980 + if (!inputfile) {
  981 + std::cout << "cannot open specified input file" << std::endl;
  982 + return;
  983 + }
  984 + inputfile.read((char*)I, bytes);
  985 + inputfile.close();
  986 + size_t N = w * h * d; // number of pixels (# array elements)
  987 + float* I_original = (float*)malloc(bytes); // keep the original image without pre-processing
  988 + memcpy(I_original, I, bytes);
  989 + stretch(I_original, N, 0, 255);
  990 + stretch(I, N, 0, 255);
  991 +
  992 +
  993 + if (Filter) { // compute log of image
  994 + if (filter_name == "log") {
  995 + for (int i = 0; i < N; i++)
  996 + I[i] = log(I[i] + 1);
  997 + }
  998 + if (filter_name == "median") { // apply median filter on each section
  999 + float* I_2D = (float*)malloc(w*h * sizeof(float)); // allocate memory to 2_D sections
  1000 +
  1001 + cv::Mat t_I_2D_mat(w, h, CV_32F); //image is stored column major but open cv read and write row major---matrix is transposed
  1002 + cv::Mat I2d_blurred(w, h, CV_32F); //allocate memory for blured image
  1003 +
  1004 + for (int zz = 0; zz < d; zz++) {
  1005 + memcpy(I_2D, I_original + (zz * w *h), w * h * sizeof(float)); // copy each section of 3D image(I) in an array
  1006 + cv::Mat I_2D_mat(h, w, CV_32F, I_2D); // create a Mat to copy data to that and be able to use open cv median filter
  1007 + cv::transpose(I_2D_mat, t_I_2D_mat);
  1008 + cv::medianBlur(t_I_2D_mat, I2d_blurred, kernel_size); // apply median filter
  1009 + cv::transpose(I2d_blurred, I_2D_mat); // transpose to be transfered to array
  1010 + I_2D = (float *)I_2D_mat.data;
  1011 + memcpy(I + (zz * w *h), I_2D, w * h * sizeof(float));
  1012 +
  1013 + }
  1014 + free(I_2D);
  1015 + t_I_2D_mat.release();
  1016 + I2d_blurred.release();
  1017 + }
  1018 +
  1019 + }
  1020 +
  1021 + stretch(I, N, 0, 255);
  1022 + size_t snakeNum;
  1023 + if (swarm) {
  1024 + int D = int(sqrt(1.5)*radius); //distance between hypersnakes
  1025 + snakeNum = w * h * d / (D * D * D ); // approximate number of hypersnakes lying on image
  1026 + }
  1027 + else
  1028 + snakeNum = 1;
  1029 + sphere* snakes = new sphere[snakeNum]; //create an array of spheres.(one for each snake)
  1030 + memset(snakes, 0, snakeNum * sizeof(sphere));
  1031 +
  1032 + if (swarm) {
  1033 + initialSwarmSnake(snakes, snakeNum, w, h, d, radius);
  1034 + std::cout << "number of snakes=" << snakeNum << std::endl;
  1035 + }
  1036 +
  1037 +
  1038 + else {
  1039 + point<float> center;
  1040 + center.x = (float)args["single"].as_float(0);
  1041 + center.y = (float)args["single"].as_float(1);
  1042 + center.z = (float)args["single"].as_float(2);
  1043 + //cout << "number of args" << args["single"].nargs() << endl;
  1044 + if (args["single"].nargs() >= 4)
  1045 + radius = (float)args["single"].as_float(3);
  1046 +
  1047 + snakes[0].p.x = center.x - radius; // define p and q
  1048 + snakes[0].q.x = center.x + radius;
  1049 + snakes[0].p.y = snakes[0].q.y = center.y;
  1050 + snakes[0].p.z = snakes[0].q.z = center.z;
  1051 +
  1052 + }
  1053 +
  1054 + point<float>* samples = (point<float>*)malloc(sampleNum * sizeof(point<float>));
  1055 + memset(samples, 0, sampleNum * sizeof(point<float>));
  1056 + if (MC) {
  1057 + if (args["debug"]) {
  1058 + randGenerator_cube(samples, sampleNum, true);
  1059 +
  1060 + }
  1061 + else randGenerator_cube(samples, sampleNum);
  1062 + }
  1063 +
  1064 + std::cout << "energy_th=" << energy_th << std::endl;
  1065 + std::cout << "initial radius=" << radius << std::endl;
  1066 + //-----------------------------------GPU implementation----------------------------------------------------------------------------
  1067 + if (cuda_device >= 0) {
  1068 +
  1069 + cudaDeviceProp prop;
  1070 + HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
  1071 + size_t threads = (size_t)prop.maxThreadsPerBlock;
  1072 + size_t blocks = snakeNum;
  1073 + size_t nbyte_shared = 7 * 4 * threads;
  1074 + // allocate memory to snakes and copy them to device
  1075 + sphere* gpu_snakes;
  1076 + HANDLE_ERROR(cudaMalloc(&gpu_snakes, snakeNum * sizeof(sphere)));
  1077 + HANDLE_ERROR(cudaMemcpy(gpu_snakes, snakes, snakeNum * sizeof(sphere), cudaMemcpyHostToDevice));
  1078 +
  1079 + //allocate memory to image in device and copy from cpu to device
  1080 + float* gpu_I;
  1081 + HANDLE_ERROR(cudaMalloc(&gpu_I, bytes));
  1082 + HANDLE_ERROR(cudaMemcpy(gpu_I, I, bytes, cudaMemcpyHostToDevice));
  1083 + stim::gpuStartTimer();
  1084 + if (MC) {
  1085 + // allocate memory to random samples on device
  1086 + point<float>* G_samples;
  1087 + HANDLE_ERROR(cudaMalloc(&G_samples, sampleNum * sizeof(point<float>)));
  1088 + HANDLE_ERROR(cudaMemset(G_samples, 0, sampleNum * sizeof(point<float>)));
  1089 + HANDLE_ERROR(cudaMemcpy(G_samples, samples, sampleNum * sizeof(point<float>), cudaMemcpyHostToDevice));
  1090 +
  1091 + if (args["debug"])
  1092 + kernel_snake_evolve_MC_parallel << < blocks, threads, nbyte_shared >> > (gpu_snakes, gpu_I, G_samples, sampleNum, snakeNum, threads, w, h, d, itr, dt, true);
  1093 + else
  1094 + kernel_snake_evolve_MC_parallel << < blocks, threads, nbyte_shared >> > (gpu_snakes, gpu_I, G_samples, sampleNum, snakeNum, threads, w, h, d, itr, dt);
  1095 +
  1096 + }
  1097 + else {
  1098 + if (args["debug"])
  1099 + kernel_snake_evolve << <blocks, threads >> > (gpu_snakes, gpu_I, snakeNum, w, h, d, itr, dt, true);
  1100 + else
  1101 + kernel_snake_evolve << <blocks, threads >> > (gpu_snakes, gpu_I, snakeNum, w, h, d, itr, dt);
  1102 + }
  1103 +
  1104 + std::cout << "gpuruntime = " << stim::gpuStopTimer() << " ms" << std::endl;
  1105 + HANDLE_ERROR(cudaMemcpy(snakes, gpu_snakes, snakeNum * sizeof(sphere), cudaMemcpyDeviceToHost));
  1106 + cudaFree(gpu_I);
  1107 + cudaFree(gpu_snakes);
  1108 +
  1109 + std::vector<size_t> idx = DetectValidSnakes_GPU(snakes, snakeNum, I_original, w, h, d, threads, energy_th);
  1110 + SaveSnakes(output_file, snakes, idx);
  1111 +
  1112 +
  1113 +
  1114 + }
  1115 +
  1116 + //--------------------------------------------CPU implementation----------------------------------------------------------------------------
  1117 + else {
  1118 + unsigned int start = time(NULL);
  1119 + std::cout << "it is running on CPU" << std::endl;
  1120 + for (int i = 0; i < snakeNum; i++) {
  1121 + //printf("\n\n---------------->> iteration %u", numItr);
  1122 + if (args["debug"])
  1123 + snake_evolve(snakes[i], I, w, h, d, dt, itr, samples, sampleNum, MC, true);
  1124 + else
  1125 + snake_evolve(snakes[i], I, w, h, d, dt, itr, samples, sampleNum, MC);
  1126 +
  1127 + std::cout << "Output Snakes------------------" << std::endl;
  1128 + std::cout << snakes[i].str() << std::endl;
  1129 +
  1130 + }
  1131 + unsigned int end = time(NULL);
  1132 + std::cout << "cpuRunTime=" << end - start << "s" << std::endl;
  1133 +
  1134 +
  1135 + std::vector<size_t> idx = DetectValidSnakes_GPU(snakes, snakeNum, I_original, w, h, d, 512, energy_th);
  1136 + SaveSnakes(output_file, snakes, idx);
  1137 +
  1138 + }
  1139 +
  1140 + free(I);
  1141 + free(I_original);
  1142 + if (args["debug"]) {
  1143 + std::vector<size_t> idx(snakeNum);
  1144 + std::iota(idx.begin(), idx.end(), 0);
  1145 + //SaveSnakes(debug_file, snakes, idx); //saves the snakes to an output file
  1146 + std::cout << "Output Snakes------------------" << std::endl;
  1147 + for (size_t i = 0; i < idx.size(); i++) { //for each snake
  1148 + std::cout << snakes[idx[i]].str() << std::endl;
  1149 + }
  1150 + }
  1151 +
  1152 + std::vector<size_t> idx(snakeNum);
  1153 + std::iota(idx.begin(), idx.end(), 0);
  1154 + SaveSnakes("no-delete.txt", snakes, idx); //saves the snakes to an output file
  1155 + return 0;
  1156 +
  1157 +}
... ...