Commit 5de3a9c2be3a6d1931e2d2040de0245261ce4dc2

Authored by Pavel Govyadinov
2 parents 1306fd96 91d8912e

CHECKPOINT: before the swap of globj for glnetwork in the use of segmentation.

stim/cuda/arraymath.cuh
@@ -3,6 +3,11 @@ @@ -3,6 +3,11 @@
3 3
4 #include <stim/cuda/arraymath/array_add.cuh> 4 #include <stim/cuda/arraymath/array_add.cuh>
5 #include <stim/cuda/arraymath/array_multiply.cuh> 5 #include <stim/cuda/arraymath/array_multiply.cuh>
  6 +#include <stim/cuda/arraymath/array_multiply2.cuh>
  7 +#include <stim/cuda/arraymath/array_divide.cuh>
  8 +#include <stim/cuda/arraymath/array_cos.cuh>
  9 +#include <stim/cuda/arraymath/array_sin.cuh>
  10 +#include <stim/cuda/arraymath/array_atan.cuh>
6 #include <stim/cuda/arraymath/array_abs.cuh> 11 #include <stim/cuda/arraymath/array_abs.cuh>
7 #include <stim/cuda/arraymath/array_cart2polar.cuh> 12 #include <stim/cuda/arraymath/array_cart2polar.cuh>
8 13
stim/cuda/arraymath/array_add.cuh
@@ -3,6 +3,7 @@ @@ -3,6 +3,7 @@
3 3
4 #include <iostream> 4 #include <iostream>
5 #include <cuda.h> 5 #include <cuda.h>
  6 +//#include <cmath>
6 #include <stim/cuda/cudatools.h> 7 #include <stim/cuda/cudatools.h>
7 8
8 namespace stim{ 9 namespace stim{
@@ -27,7 +28,7 @@ namespace stim{ @@ -27,7 +28,7 @@ namespace stim{
27 int threads = stim::maxThreadsPerBlock(); 28 int threads = stim::maxThreadsPerBlock();
28 29
29 //calculate the number of blocks 30 //calculate the number of blocks
30 - int blocks = N / threads + (N%threads == 0 ? 0:1); 31 + int blocks = N / threads + 1;
31 32
32 //call the kernel to do the multiplication 33 //call the kernel to do the multiplication
33 cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N); 34 cuda_add <<< blocks, threads >>>(ptr1, ptr2, sum, N);
stim/cuda/arraymath/array_atan.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_ATAN_H
  2 +#define STIM_CUDA_ARRAY_ATAN_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cmath>
  7 +#include <stim/cuda/cudatools.h>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_atan(T* ptr1, T* out, unsigned int N){
  14 +
  15 + //calculate the 1D index for this thread
  16 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  17 +
  18 + if(idx < N){
  19 + out[idx] = atan(ptr1[idx]);
  20 + }
  21 +
  22 + }
  23 +
  24 + template<typename T>
  25 + void gpu_atan(T* ptr1, T* out, unsigned int N){
  26 +
  27 + //get the maximum number of threads per block for the CUDA device
  28 + int threads = stim::maxThreadsPerBlock();
  29 +
  30 + //calculate the number of blocks
  31 + int blocks = N / threads + 1;
  32 +
  33 + //call the kernel to do the multiplication
  34 + cuda_atan <<< blocks, threads >>>(ptr1, out, N);
  35 +
  36 + }
  37 +
  38 + template<typename T>
  39 + void cpu_atan(T* ptr1, T* cpu_out, unsigned int N){
  40 +
  41 + //allocate memory on the GPU for the array
  42 + T* gpu_ptr1;
  43 + T* gpu_out;
  44 + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
  45 + HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
  46 +
  47 + //copy the array to the GPU
  48 + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
  49 +
  50 + //call the GPU version of this function
  51 + gpu_atan<T>(gpu_ptr1 ,gpu_out, N);
  52 +
  53 + //copy the array back to the CPU
  54 + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
  55 +
  56 + //free allocated memory
  57 + cudaFree(gpu_ptr1);
  58 + cudaFree(gpu_out);
  59 +
  60 + }
  61 +
  62 + }
  63 +}
  64 +
  65 +
  66 +
  67 +#endif
0 \ No newline at end of file 68 \ No newline at end of file
stim/cuda/arraymath/array_cos.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_COS_H
  2 +#define STIM_CUDA_ARRAY_COS_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cmath>
  7 +#include <stim/cuda/cudatools.h>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_cos(T* ptr1, T* out, unsigned int N){
  14 +
  15 + //calculate the 1D index for this thread
  16 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  17 +
  18 + if(idx < N){
  19 + out[idx] = cos(ptr1[idx]);
  20 + }
  21 +
  22 + }
  23 +
  24 + template<typename T>
  25 + void gpu_cos(T* ptr1, T* out, unsigned int N){
  26 +
  27 + //get the maximum number of threads per block for the CUDA device
  28 + int threads = stim::maxThreadsPerBlock();
  29 +
  30 + //calculate the number of blocks
  31 + int blocks = N / threads + 1;
  32 +
  33 + //call the kernel to do the multiplication
  34 + cuda_cos <<< blocks, threads >>>(ptr1, out, N);
  35 +
  36 + }
  37 +
  38 + template<typename T>
  39 + void cpu_cos(T* ptr1, T* cpu_out, unsigned int N){
  40 +
  41 + //allocate memory on the GPU for the array
  42 + T* gpu_ptr1;
  43 + T* gpu_out;
  44 + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
  45 + HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
  46 +
  47 + //copy the array to the GPU
  48 + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
  49 +
  50 + //call the GPU version of this function
  51 + gpu_cos<T>(gpu_ptr1 ,gpu_out, N);
  52 +
  53 + //copy the array back to the CPU
  54 + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
  55 +
  56 + //free allocated memory
  57 + cudaFree(gpu_ptr1);
  58 + cudaFree(gpu_out);
  59 +
  60 + }
  61 +
  62 + }
  63 +}
  64 +
  65 +
  66 +
  67 +#endif
0 \ No newline at end of file 68 \ No newline at end of file
stim/cuda/arraymath/array_divide.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_DIVIDE_H
  2 +#define STIM_CUDA_ARRAY_DIVIDE_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +
  8 +namespace stim{
  9 + namespace cuda{
  10 +
  11 + template<typename T>
  12 + __global__ void cuda_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
  13 +
  14 + //calculate the 1D index for this thread
  15 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  16 +
  17 + if(idx < N){
  18 + quotient[idx] = ptr1[idx] / ptr2[idx];
  19 + }
  20 +
  21 + }
  22 +
  23 + template<typename T>
  24 + void gpu_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){
  25 +
  26 + //get the maximum number of threads per block for the CUDA device
  27 + int threads = stim::maxThreadsPerBlock();
  28 +
  29 + //calculate the number of blocks
  30 + int blocks = N / threads + 1;
  31 +
  32 + //call the kernel to do the multiplication
  33 + cuda_divide <<< blocks, threads >>>(ptr1, ptr2, quotient, N);
  34 +
  35 + }
  36 +
  37 + template<typename T>
  38 + void cpu_divide(T* ptr1, T* ptr2, T* cpu_quotient, unsigned int N){
  39 +
  40 + //allocate memory on the GPU for the array
  41 + T* gpu_ptr1;
  42 + T* gpu_ptr2;
  43 + T* gpu_quotient;
  44 + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
  45 + HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
  46 + HANDLE_ERROR( cudaMalloc( &gpu_quotient, N * sizeof(T) ) );
  47 +
  48 + //copy the array to the GPU
  49 + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
  50 + HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
  51 +
  52 + //call the GPU version of this function
  53 + gpu_divide<T>(gpu_ptr1, gpu_ptr2 ,gpu_quotient, N);
  54 +
  55 + //copy the array back to the CPU
  56 + HANDLE_ERROR( cudaMemcpy( cpu_quotient, gpu_quotient, N * sizeof(T), cudaMemcpyDeviceToHost) );
  57 +
  58 + //free allocated memory
  59 + cudaFree(gpu_ptr1);
  60 + cudaFree(gpu_ptr2);
  61 + cudaFree(gpu_quotient);
  62 +
  63 + }
  64 +
  65 + }
  66 +}
  67 +
  68 +
  69 +
  70 +#endif
0 \ No newline at end of file 71 \ No newline at end of file
stim/cuda/arraymath/array_multiply2.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_MULTIPLY_H
  2 +#define STIM_CUDA_ARRAY_MULTIPLY_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <stim/cuda/cudatools.h>
  7 +
  8 +namespace stim{
  9 + namespace cuda{
  10 +
  11 + template<typename T>
  12 + __global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
  13 +
  14 + //calculate the 1D index for this thread
  15 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  16 +
  17 + if(idx < N){
  18 + product[idx] = ptr1[idx] * ptr2[idx];
  19 + }
  20 +
  21 + }
  22 +
  23 + template<typename T>
  24 + void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){
  25 +
  26 + //get the maximum number of threads per block for the CUDA device
  27 + int threads = stim::maxThreadsPerBlock();
  28 +
  29 + //calculate the number of blocks
  30 + int blocks = N / threads + 1;
  31 +
  32 + //call the kernel to do the multiplication
  33 + cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);
  34 +
  35 + }
  36 +
  37 + template<typename T>
  38 + void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){
  39 +
  40 + //allocate memory on the GPU for the array
  41 + T* gpu_ptr1;
  42 + T* gpu_ptr2;
  43 + T* gpu_product;
  44 + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
  45 + HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
  46 + HANDLE_ERROR( cudaMalloc( &gpu_product, N * sizeof(T) ) );
  47 +
  48 + //copy the array to the GPU
  49 + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
  50 + HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );
  51 +
  52 + //call the GPU version of this function
  53 + gpu_multiply<T>(gpu_ptr1, gpu_ptr2 ,gpu_product, N);
  54 +
  55 + //copy the array back to the CPU
  56 + HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, N * sizeof(T), cudaMemcpyDeviceToHost) );
  57 +
  58 + //free allocated memory
  59 + cudaFree(gpu_ptr1);
  60 + cudaFree(gpu_ptr2);
  61 + cudaFree(gpu_product);
  62 +
  63 + }
  64 +
  65 + }
  66 +}
  67 +
  68 +
  69 +
  70 +#endif
0 \ No newline at end of file 71 \ No newline at end of file
stim/cuda/arraymath/array_sin.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_ARRAY_SIN_H
  2 +#define STIM_CUDA_ARRAY_SIN_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cmath>
  7 +#include <stim/cuda/cudatools.h>
  8 +
  9 +namespace stim{
  10 + namespace cuda{
  11 +
  12 + template<typename T>
  13 + __global__ void cuda_sin(T* ptr1, T* out, unsigned int N){
  14 +
  15 + //calculate the 1D index for this thread
  16 + int idx = blockIdx.x * blockDim.x + threadIdx.x;
  17 +
  18 + if(idx < N){
  19 + out[idx] = sin(ptr1[idx]);
  20 + }
  21 +
  22 + }
  23 +
  24 + template<typename T>
  25 + void gpu_sin(T* ptr1, T* out, unsigned int N){
  26 +
  27 + //get the maximum number of threads per block for the CUDA device
  28 + int threads = stim::maxThreadsPerBlock();
  29 +
  30 + //calculate the number of blocks
  31 + int blocks = N / threads + 1;
  32 +
  33 + //call the kernel to do the multiplication
  34 + cuda_sin <<< blocks, threads >>>(ptr1, out, N);
  35 +
  36 + }
  37 +
  38 + template<typename T>
  39 + void cpu_sin(T* ptr1, T* cpu_out, unsigned int N){
  40 +
  41 + //allocate memory on the GPU for the array
  42 + T* gpu_ptr1;
  43 + T* gpu_out;
  44 + HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
  45 + HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );
  46 +
  47 + //copy the array to the GPU
  48 + HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
  49 +
  50 + //call the GPU version of this function
  51 + gpu_sin<T>(gpu_ptr1 ,gpu_out, N);
  52 +
  53 + //copy the array back to the CPU
  54 + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );
  55 +
  56 + //free allocated memory
  57 + cudaFree(gpu_ptr1);
  58 + cudaFree(gpu_out);
  59 +
  60 + }
  61 +
  62 + }
  63 +}
  64 +
  65 +
  66 +
  67 +#endif
0 \ No newline at end of file 68 \ No newline at end of file
stim/cuda/bsds500/cPb.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +#include <sstream>
  6 +
  7 +
  8 +void array_multiply(float* lhs, float rhs, unsigned int N);
  9 +void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
  10 +
  11 +/// This function evaluates the cPb given an multi-channel image
  12 +
  13 +/// @param img is the multi-channel image
  14 +/// @param r is an array of radii for different scaled discs(filters)
  15 +/// @param alpha is is an array of weights for different scaled discs(filters)
  16 +/// @param s is the number of scales
  17 +
  18 +stim::image<float> cPb(stim::image<float> img, int* r, float* alpha, int s){
  19 +
  20 + unsigned int w = img.width(); // get the width of picture
  21 + unsigned int h = img.height(); // get the height of picture
  22 + unsigned int c = img.channels(); // get the channels of picture
  23 +
  24 +
  25 + stim::image<float> cPb(w, h, 1); // allocate space for cPb
  26 + unsigned size = cPb.size(); // get the size of cPb
  27 + memset ( cPb.data(), 0, size * sizeof(float)); // initialize all the pixels of cPb to 0
  28 +
  29 +
  30 + unsigned int N = w * h; // get the number of pixels
  31 + int sigma_n = 3; // set the number of standard deviations used to define the sigma
  32 +
  33 + std::ostringstream ss; // (optional) set the stream to designate the test result file
  34 +
  35 + stim::image<float> temp; // set the temporary image to store the addtion result
  36 +
  37 + for (int i = 0; i < c; i++){
  38 + for (int j = 0; j < s; j++){
  39 +
  40 + ss << "data_output/cPb_slice"<< i*s + j << ".bmp"; // set the name for test result file (optional)
  41 + std::string sss = ss.str();
  42 +
  43 + // get the gaussian gradient by convolving each image slice with the mask
  44 + temp = Pb(img.channel(i), r[i*s + j], sigma_n);
  45 +
  46 + // output the test result of each slice (optional)
  47 + //stim::cpu2image(temp.data(), sss, w, h, stim::cmBrewer);
  48 +
  49 + // multiply each gaussian gradient with its weight
  50 + array_multiply(temp.data(), alpha[i*s + j], N);
  51 +
  52 + // add up all the weighted gaussian gradients
  53 + array_add(cPb.data(), temp.data(), cPb.data(), N);
  54 +
  55 + ss.str(""); //(optional) clear the space for stream
  56 +
  57 + }
  58 + }
  59 +
  60 + float max = cPb.maxv(); // get the maximum of cPb used for normalization
  61 + array_multiply(cPb.data(), 1/max, N); // normalize the cPb
  62 +
  63 + // output the test result of cPb (optional)
  64 + //stim::cpu2image(cPb.data(), "data_output/cPb_0916.bmp", w, h, stim::cmBrewer);
  65 +
  66 + return cPb;
  67 +}
stim/cuda/bsds500/dG1_conv2.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +//#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +
  6 +/// This function generates the first-order gaussian derivative filter gx gy,
  7 +/// convolves the image with gx gy,
  8 +/// and returns an image class which channel(0) is Ix and channel(1) is Iy
  9 +
  10 +/// @param img is the one-channel image
  11 +/// @param r is an array of radii for different scaled discs(filters)
  12 +/// @param sigma_n is the number of standard deviations used to define the sigma
  13 +
  14 +void conv2_sep(float* img, unsigned int x, unsigned int y, float* kernel0, unsigned int k0, float* kernel1, unsigned int k1);
  15 +//void array_abs(float* img, unsigned int N);
  16 +
  17 +stim::image<float> Gd1(stim::image<float> image, int r, unsigned int sigma_n){
  18 +
  19 + unsigned int w = image.width(); // get the width of picture
  20 + unsigned int h = image.height(); // get the height of picture
  21 + unsigned N = w * h; // get the number of pixels of picture
  22 + int winsize = 2 * r + 1; // set the winsdow size of disc(filter)
  23 + float sigma = float(r)/float(sigma_n); // calculate the sigma used in gaussian function
  24 +
  25 + stim::image<float> I(w, h, 1, 2); // allocate space for return image class
  26 + stim::image<float> Ix(w, h); // allocate space for Ix
  27 + stim::image<float> Iy(w, h); // allocate space for Iy
  28 + Ix = image; // initialize Ix
  29 + Iy = image; // initialize Iy
  30 +
  31 + float* array_x1;
  32 + array_x1 = new float[winsize]; //allocate space for the 1D x-oriented gaussian derivative filter array_x1 for gx
  33 + float* array_y1;
  34 + array_y1 = new float[winsize]; //allocate space for the 1D y-oriented gaussian derivative filter array_y1 for gx
  35 + float* array_x2;
  36 + array_x2 = new float[winsize]; //allocate space for the 1D x-oriented gaussian derivative filter array_x2 for gy
  37 + float* array_y2;
  38 + array_y2 = new float[winsize]; //allocate space for the 1D y-oriented gaussian derivative filter array_y2 for gy
  39 +
  40 +
  41 + for (int i = 0; i < winsize; i++){
  42 +
  43 + int x = i - r; //range of x
  44 + int y = i - r; //range of y
  45 +
  46 + // create the 1D x-oriented gaussian derivative filter array_x1 for gx
  47 + array_x1[i] = (-1) * x * exp((-1)*(pow(x, 2))/(2*pow(sigma, 2)));
  48 + // create the 1D y-oriented gaussian derivative filter array_y1 for gx
  49 + array_y1[i] = exp((-1)*(pow(y, 2))/(2*pow(sigma, 2)));
  50 + // create the 1D x-oriented gaussian derivative filter array_x2 for gy
  51 + array_x2[i] = exp((-1)*(pow(x, 2))/(2*pow(sigma, 2)));
  52 + // create the 1D y-oriented gaussian derivative filter array_y2 for gy
  53 + array_y2[i] = (-1) * y * exp((-1)*(pow(y, 2))/(2*pow(sigma, 2)));
  54 + }
  55 +
  56 + //stim::cpu2image(array_x1, "data_output/array_x1_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  57 + //stim::cpu2image(array_y1, "data_output/array_y1_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  58 + //stim::cpu2image(array_x2, "data_output/array_x2_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  59 + //stim::cpu2image(array_y2, "data_output/array_y2_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  60 +
  61 + // get Ix by convolving the image with gx
  62 + conv2_sep(Ix.data(), w, h, array_x1, winsize, array_y1, winsize);
  63 +
  64 + //stim::cpu2image(Ix.data(), "data_output/Ix_0915.bmp", w, h, stim::cmBrewer);
  65 + // get Iy by convolving the image with gy
  66 + conv2_sep(Iy.data(), w, h, array_x2, winsize, array_y2, winsize);
  67 +
  68 + //stim::cpu2image(Iy.data(), "data_output/Iy_0915.bmp", w, h, stim::cmBrewer);
  69 +
  70 + delete [] array_x1; //free the memory of array_x1
  71 + delete [] array_y1; //free the memory of array_y1
  72 + delete [] array_x2; //free the memory of array_x2
  73 + delete [] array_y2; //free the memory of array_y2
  74 +
  75 + I.set_channel(0, Ix.data());
  76 + I.set_channel(1, Iy.data());
  77 +
  78 + return I;
  79 +
  80 +}
0 \ No newline at end of file 81 \ No newline at end of file
stim/cuda/bsds500/dG1_theta_conv2.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +
  6 +#define PI 3.1415926
  7 +
  8 +void array_multiply(float* lhs, float rhs, unsigned int N);
  9 +void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
  10 +void array_abs(float* img, unsigned int N);
  11 +
  12 +/// This function evaluates the theta-dependent odd symmetric gaussian derivative gradient of an one-channel image
  13 +
  14 +/// @param img is the one-channel image
  15 +/// @param r is an array of radii for different scaled discs(filters)
  16 +/// @param sigma_n is the number of standard deviations used to define the sigma
  17 +/// @param theta is angle used for computing the gradient
  18 +
  19 +stim::image<float> Gd_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta){
  20 +
  21 + float theta_r = (theta * PI)/180; //change angle unit from degree to rad
  22 +
  23 + unsigned int w = image.width(); // get the width of picture
  24 + unsigned int h = image.height(); // get the height of picture
  25 + unsigned N = w * h; // get the number of pixels of picture
  26 + int winsize = 2 * r + 1; // set the winsdow size of disc(filter)
  27 +
  28 + stim::image<float> I(w, h, 1, 2); // allocate space for return image of Gd1
  29 + stim::image<float> Ix(w, h); // allocate space for Ix
  30 + stim::image<float> Iy(w, h); // allocate space for Iy
  31 + stim::image<float> Gd_odd_theta(w, h); // allocate space for Pb
  32 +
  33 + I = Gd1(image, r, sigma_n); // calculate the Ix, Iy
  34 + Ix = I.channel(0);
  35 + Iy = I.channel(1);
  36 +
  37 + array_multiply(Ix.data(), cos(theta_r), N); //Ix = Ix*cos(theta_r)
  38 + array_multiply(Iy.data(), sin(theta_r), N); //Iy = Iy*sin(theta_r)
  39 + array_add(Ix.data(), Iy.data(), Gd_odd_theta.data(), N); //Gd_odd_theta = Ix + Iy;
  40 + array_abs(Gd_odd_theta.data(), N);
  41 +
  42 + //stim::cpu2image(I.channel(0).data(), "data_output/Gd_odd_x_0919.bmp", w, h, stim::cmBrewer);
  43 + //stim::cpu2image(I.channel(1).data(), "data_output/Gd_odd_y_0919.bmp", w, h, stim::cmBrewer);
  44 + //stim::cpu2image(Gd_odd_theta.data(), "data_output/Gd_odd_theta_0919.bmp", w, h, stim::cmBrewer);
  45 +
  46 + return Gd_odd_theta;
  47 +
  48 +}
stim/cuda/bsds500/dG2_conv2.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +//#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +
  6 +/// This function generates the second-order gaussian derivative filter gxx gyy,
  7 +/// convolves the image with gxx gyy,
  8 +/// and returns an image class which channel(0) is Ixx and channel(1) is Iyy
  9 +
  10 +/// @param img is the one-channel image
  11 +/// @param r is an array of radii for different scaled discs(filters)
  12 +/// @param sigma_n is the number of standard deviations used to define the sigma
  13 +
  14 +void conv2_sep(float* img, unsigned int x, unsigned int y, float* kernel0, unsigned int k0, float* kernel1, unsigned int k1);
  15 +//void array_abs(float* img, unsigned int N);
  16 +
  17 +stim::image<float> Gd2(stim::image<float> image, int r, unsigned int sigma_n){
  18 +
  19 + unsigned int w = image.width(); // get the width of picture
  20 + unsigned int h = image.height(); // get the height of picture
  21 + unsigned N = w * h; // get the number of pixels of picture
  22 + int winsize = 2 * r + 1; // set the winsdow size of disc(filter)
  23 + float sigma = float(r)/float(sigma_n); // calculate the sigma used in gaussian function
  24 +
  25 + stim::image<float> I(w, h, 1, 2); // allocate space for return image class
  26 + stim::image<float> Ixx(w, h); // allocate space for Ixx
  27 + stim::image<float> Iyy(w, h); // allocate space for Iyy
  28 + Ixx = image; // initialize Ixx
  29 + Iyy = image; // initialize Iyy
  30 +
  31 + float* array_x1;
  32 + array_x1 = new float[winsize]; //allocate space for the 1D x-oriented gaussian derivative filter array_x1 for gxx
  33 + float* array_y1;
  34 + array_y1 = new float[winsize]; //allocate space for the 1D y-oriented gaussian derivative filter array_y1 for gxx
  35 + float* array_x2;
  36 + array_x2 = new float[winsize]; //allocate space for the 1D x-oriented gaussian derivative filter array_x2 for gyy
  37 + float* array_y2;
  38 + array_y2 = new float[winsize]; //allocate space for the 1D y-oriented gaussian derivative filter array_y2 for gyy
  39 +
  40 +
  41 + for (int i = 0; i < winsize; i++){
  42 +
  43 + int x = i - r; //range of x
  44 + int y = i - r; //range of y
  45 +
  46 + // create the 1D x-oriented gaussian derivative filter array_x1 for gxx
  47 + array_x1[i] = (-1) * (1 - pow(x, 2)) * exp((-1)*(pow(x, 2))/(2*pow(sigma, 2)));
  48 + // create the 1D y-oriented gaussian derivative filter array_y1 for gxx
  49 + array_y1[i] = exp((-1)*(pow(y, 2))/(2*pow(sigma, 2)));
  50 + // create the 1D x-oriented gaussian derivative filter array_x2 for gyy
  51 + array_x2[i] = exp((-1)*(pow(x, 2))/(2*pow(sigma, 2)));
  52 + // create the 1D y-oriented gaussian derivative filter array_y2 for gyy
  53 + array_y2[i] = (-1) * (1 - pow(y, 2)) * exp((-1)*(pow(y, 2))/(2*pow(sigma, 2)));
  54 + }
  55 +
  56 + //stim::cpu2image(array_x1, "data_output/array_x1_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  57 + //stim::cpu2image(array_y1, "data_output/array_y1_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  58 + //stim::cpu2image(array_x2, "data_output/array_x2_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  59 + //stim::cpu2image(array_y2, "data_output/array_y2_0915.bmp", winsize, 1, stim::cmBrewer); // (optional) show the mask result
  60 +
  61 + // get Ixx by convolving the image with gxx
  62 + conv2_sep(Ixx.data(), w, h, array_x1, winsize, array_y1, winsize);
  63 +
  64 + //stim::cpu2image(Ixx.data(), "data_output/Ixx_0915.bmp", w, h, stim::cmBrewer);
  65 + // get Iyy by convolving the image with gyy
  66 + conv2_sep(Iyy.data(), w, h, array_x2, winsize, array_y2, winsize);
  67 +
  68 + //stim::cpu2image(Iyy.data(), "data_output/Iyy_0915.bmp", w, h, stim::cmBrewer);
  69 +
  70 + delete [] array_x1; //free the memory of array_x1
  71 + delete [] array_y1; //free the memory of array_y1
  72 + delete [] array_x2; //free the memory of array_x2
  73 + delete [] array_y2; //free the memory of array_y2
  74 +
  75 + I.set_channel(0, Ixx.data());
  76 + I.set_channel(1, Iyy.data());
  77 +
  78 + return I;
  79 +
  80 +}
0 \ No newline at end of file 81 \ No newline at end of file
stim/cuda/bsds500/dG_d2x_theta_conv2.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +
  6 +/// This function evaluates the theta-dependent even-symmetric gaussian derivative gradient of an one-channel image
  7 +
  8 +/// @param img is the one-channel image
  9 +/// @param r is an array of radii for different scaled discs(filters)
  10 +/// @param sigma_n is the number of standard deviations used to define the sigma
  11 +/// @param theta is angle used for computing the gradient
  12 +
  13 +void conv2(float* img, float* mask, float* cpu_copy, unsigned int w, unsigned int h, unsigned int M);
  14 +void array_abs(float* img, unsigned int N);
  15 +
  16 +stim::image<float> Gd_even(stim::image<float> image, int r, unsigned int sigma_n, float theta){
  17 +
  18 + unsigned int w = image.width(); // get the width of picture
  19 + unsigned int h = image.height(); // get the height of picture
  20 + unsigned N = w * h; // get the number of pixels of picture
  21 + int winsize = 2 * r + 1; // set the winsdow size of disc(filter)
  22 + float sigma = float(r)/float(sigma_n); // calculate the sigma used in gaussian function
  23 +
  24 + stim::image<float> I(w, h, 1, 2); // allocate space for return image class
  25 + stim::image<float> Gd_even_theta(w, h); // allocate space for Gd_even_theta
  26 + stim::image<float> mask_x(winsize, winsize); // allocate space for x-axis-oriented filter
  27 + stim::image<float> mask_r(winsize, winsize); // allocate space for theta-oriented filter
  28 +
  29 + for (int j = 0; j < winsize; j++){
  30 + for (int i = 0; i< winsize; i++){
  31 +
  32 + int x = i - r; //range of x
  33 + int y = j - r; //range of y
  34 +
  35 + // create the x-oriented gaussian derivative filter mask_x
  36 + mask_x.data()[j*winsize + i] = (-1) * (1 - pow(x, 2)) * exp((-1)*(pow(x, 2))/(2*pow(sigma, 2))) * exp((-1)*(pow(y, 2))/(2*pow(sigma, 2)));
  37 +
  38 + }
  39 + }
  40 +
  41 + mask_r = mask_x.rotate(theta, r, r);
  42 + //mask_r = mask_x.rotate(45, r, r);
  43 + //stim::cpu2image(mask_r.data(), "data_output/mask_r_0919.bmp", winsize, winsize, stim::cmBrewer);
  44 +
  45 + // do the 2D convolution with image and mask
  46 + conv2(image.data(), mask_r.data(), Gd_even_theta.data(), w, h, winsize);
  47 + array_abs(Gd_even_theta.data(), N);
  48 +
  49 + //stim::cpu2image(Gd_even_theta.data(), "data_output/Gd_even_theta_0919.bmp", w, h, stim::cmGrayscale);
  50 +
  51 + return Gd_even_theta;
  52 +}
0 \ No newline at end of file 53 \ No newline at end of file
stim/cuda/bsds500/kmeans.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +//#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +#include <opencv2/opencv.hpp>
  6 +#include <iostream>
  7 +
  8 +/// This function use cvkmeans to cluster given textons
  9 +
  10 +/// @param testons is a multi-channel image
  11 +/// @param k is the number of clusters
  12 +
  13 +stim::image<float> kmeans(stim::image<float> textons, unsigned int K){
  14 +
  15 + unsigned int w = textons.width(); // get the width of picture
  16 + unsigned int h = textons.height(); // get the height of picture
  17 + unsigned int feature_n = textons.channels(); // get the spectrum of picture
  18 + unsigned int N = w * h; // get the number of pixels
  19 +
  20 + float* sample1 = (float*) malloc(sizeof(float) * N * feature_n); //allocate the space for textons
  21 +
  22 + //reallocate a multi-channel texton image to a single-channel image
  23 + for(unsigned int c = 0; c < feature_n; c++){
  24 +
  25 + stim::image<float> temp;
  26 + temp = textons.channel(c);
  27 +
  28 + for(unsigned int j = 0; j < N; j++){
  29 +
  30 + sample1[c + j * feature_n] = temp.data()[j];
  31 + }
  32 + }
  33 +
  34 +
  35 + cv::Mat sample2(N, feature_n, CV_32F, sample1); //copy image to cv::mat
  36 +
  37 + //(optional) show the test result
  38 + //imshow("sample2", sample2);
  39 +
  40 +
  41 + cv::TermCriteria criteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 0.1); // set stop-criteria for kmeans iteration
  42 + cv::Mat labels(N, 1, CV_8U, cvScalarAll(0)); // allocate space for kmeans output
  43 + cv::Mat centers; // allocate space for kmeans output
  44 +
  45 + unsigned int test_times = 2; // set the number of times of trying kmeans, it will return the best result
  46 +
  47 + cv::kmeans(sample2, K, labels, criteria, test_times, cv::KMEANS_PP_CENTERS, centers); // kmeans clustering
  48 +
  49 + //(optional) show the test result
  50 + //imwrite( "data_output/labels_1D.bmp", labels);
  51 +
  52 + stim::image<float> texture(w, h, 1, 1); // allocate space for texture
  53 +
  54 + for(unsigned int i = 0; i < N; i++){ // reshape the labels from iD array to image
  55 +
  56 + texture.data()[i] = labels.at<int>(i);
  57 +
  58 + }
  59 +
  60 + //texture.save("data_output/kmeans_test0924_2.bmp");
  61 +
  62 + //(optional) show the test result
  63 + //stim::cpu2image(texture.data(), "data_output/kmeans_test.bmp", w, h, stim::cmBrewer);
  64 +
  65 + return texture;
  66 +
  67 +}
0 \ No newline at end of file 68 \ No newline at end of file
stim/cuda/bsds500/laplacian_conv2.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +
  6 +#define PI 3.1415926
  7 +
  8 +void array_multiply(float* lhs, float rhs, unsigned int N);
  9 +void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
  10 +void array_abs(float* img, unsigned int N);
  11 +
  12 +/// This function evaluates the center-surround(Laplacian of Gaussian) gaussian derivative gradient of an one-channel image
  13 +
  14 +/// @param img is the one-channel image
  15 +/// @param r is an array of radii for different scaled discs(filters)
  16 +/// @param sigma_n is the number of standard deviations used to define the sigma
  17 +
  18 +stim::image<float> Gd_center(stim::image<float> image, int r, unsigned int sigma_n){
  19 +
  20 + unsigned int w = image.width(); // get the width of picture
  21 + unsigned int h = image.height(); // get the height of picture
  22 + unsigned N = w * h; // get the number of pixels of picture
  23 + int winsize = 2 * r + 1; // set the winsdow size of disc(filter)
  24 +
  25 + stim::image<float> I(w, h, 1, 2); // allocate space for return image of Gd2
  26 + stim::image<float> Ixx(w, h); // allocate space for Ixx
  27 + stim::image<float> Iyy(w, h); // allocate space for Iyy
  28 + stim::image<float> Gd_center(w, h); // allocate space for Pb
  29 +
  30 + I = Gd2(image, r, sigma_n); // calculate the Ixx, Iyy
  31 + Ixx = I.channel(0);
  32 + Iyy = I.channel(1);
  33 +
  34 + array_add(Ixx.data(), Iyy.data(), Gd_center.data(), N); //Gd_center = Ixx + Iyy;
  35 + array_abs(Gd_center.data(), N);
  36 +
  37 + //stim::cpu2image(Gd_center.data(), "data_output/Gd_center_0919.bmp", w, h, stim::cmBrewer);
  38 +
  39 + return Gd_center;
  40 +
  41 +}
stim/cuda/bsds500/tPb.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +#include <sstream>
  6 +
  7 +
  8 +void array_multiply(float* lhs, float rhs, unsigned int N);
  9 +void array_add(float* ptr1, float* ptr2, float* sum, unsigned int N);
  10 +void chi_grad(float* img, float* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta);
  11 +
  12 +/// This function evaluates the tPb given a grayscale image
  13 +
  14 +/// @param img is the multi-channel image
  15 +/// @param theta_n is the number of angles used for computing oriented chi-gradient
  16 +/// @param r is an array of radii for different scaled discs(filters)
  17 +/// @param alpha is is an array of weights for different scaled discs(filters)
  18 +/// @param s is the number of scales
  19 +/// @param K is the number of clusters
  20 +
  21 +stim::image<float> tPb(stim::image<float> img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned K){
  22 +
  23 + unsigned int w = img.width(); // get the width of picture
  24 + unsigned int h = img.height(); // get the height of picture
  25 + unsigned int N = w * h; // get the number of pixels
  26 +
  27 + stim::image<float> img_textons(w, h, 1, theta_n*2+1); // allocate space for img_textons
  28 + stim::image<float> img_texture(w, h, 1, 1); // allocate space for img_texture
  29 + stim::image<float> tPb_theta(w, h, 1, 1); // allocate space for tPb_theta
  30 + stim::image<float> tPb(w, h, 1, 1); // allocate space for tPb
  31 + unsigned size = tPb_theta.size(); // get the size of tPb_theta
  32 + memset (tPb.data(), 0, size * sizeof(float)); // initialize all the pixels of tPb to 0
  33 + stim::image<float> temp(w, h, 1, 1); // set the temporary image to store the addtion result
  34 +
  35 + std::ostringstream ss; // (optional) set the stream to designate the test result file
  36 +
  37 +
  38 + img_textons = textons(img, theta_n);
  39 +
  40 + img_texture = kmeans(img_textons, K); // changing kmeans result into float type is required
  41 +
  42 + stim::cpu2image(img_texture.data(), "data_output/texture_0925.bmp", w, h, stim::cmBrewer);
  43 +
  44 +
  45 + unsigned int max1 = img_texture.maxv(); // get the maximum of Pb used for normalization
  46 + unsigned int bin_size = (max1 + 1)/bin_n; // (whether"+1" or not depends on kmeans result)
  47 +
  48 + for (int i = 0; i < theta_n; i++){
  49 +
  50 + float theta = 180 * ((float)i/theta_n); // calculate the even-splited angle for each tPb_theta
  51 +
  52 + memset (tPb_theta.data(), 0, size * sizeof(float)); // initialize all the pixels of tPb_theta to 0
  53 +
  54 + //ss << "data_output/0922tPb_theta"<< theta << ".bmp"; // set the name for test result file (optional)
  55 + //std::string sss = ss.str();
  56 +
  57 + for (int j = 0; j < s; j++){
  58 +
  59 + // get the chi-gradient by convolving each image slice with the mask
  60 + chi_grad(img_texture.data(), temp.data(), w, h, r[j], bin_n, bin_size, theta);
  61 +
  62 + float max2 = temp.maxv(); // get the maximum of tPb_theta used for normalization
  63 + array_multiply(temp.data(), 1/max2, N); // normalize the tPb_theta
  64 +
  65 + //output the test result of each slice (optional)
  66 + //stim::cpu2image(temp.data(), "data_output/tPb_slice0924_2.bmp", w, h, stim::cmBrewer);
  67 +
  68 + // multiply each chi-gradient with its weight
  69 + array_multiply(temp.data(), alpha[j], N);
  70 +
  71 + // add up all the weighted chi-gradients
  72 + array_add(tPb_theta.data(), temp.data(), tPb_theta.data(), N);
  73 +
  74 +
  75 + }
  76 +
  77 + //ss.str(""); //(optional) clear the space for stream
  78 +
  79 + for(unsigned long ti = 0; ti < N; ti++){
  80 +
  81 + if(tPb_theta.data()[ti] > tPb.data()[ti]){ //get the maximum value among all tPb_theta for ith pixel
  82 + tPb.data()[ti] = tPb_theta.data()[ti];
  83 + }
  84 +
  85 + else{
  86 + }
  87 + }
  88 + }
  89 +
  90 + float max3 = tPb.maxv(); // get the maximum of tPb used for normalization
  91 + array_multiply(tPb.data(), 1/max3, N); // normalize the tPb
  92 +
  93 + //output the test result of tPb (optional)
  94 + //stim::cpu2image(tPb.data(), "data_output/tPb_0922.bmp", w, h, stim::cmBrewer);
  95 +
  96 + return tPb;
  97 +}
stim/cuda/bsds500/textons.cpp 0 → 100644
  1 +#include <stim/image/image.h>
  2 +//#include <cmath>
  3 +#include <stim/visualization/colormap.h>
  4 +#include <stim/image/image_contour_detection.h>
  5 +#include <sstream>
  6 +
  7 +/// This function convolve the grayscale image with a set of oriented Gaussian
  8 +/// derivative filters, and return a texton image with (theta_n*2+1) channels
  9 +
  10 +/// @param image is an one-channel grayscale image
  11 +/// @param theta_n is the number of angles used for computing the gradient
  12 +
  13 +stim::image<float> textons(stim::image<float> image, unsigned int theta_n){
  14 +
  15 + unsigned int w = image.width(); // get the width of picture
  16 + unsigned int h = image.height(); // get the height of picture
  17 + unsigned N = w * h; // get the number of pixels of picture
  18 +
  19 + stim::image<float> textons(w, h, 1, theta_n*2+1); // allocate space for textons
  20 + stim::image<float> temp(w, h); // allocate space for temp
  21 +
  22 + unsigned int r_odd = 3; // set disc radii for odd-symmetric filter
  23 + unsigned int sigma_n_odd = 3; // set sigma_n for odd-symmetric filter
  24 + unsigned int r_even = 3; // set disc radii for even-symmetric filter
  25 + unsigned int sigma_n_even = 3; // set sigma_n for even-symmetric filter
  26 + unsigned int r_center = 3; // set disc radii for center-surround filter
  27 + unsigned int sigma_n_center = 3; // set sigma_n for center-surround filter
  28 +
  29 + //std::ostringstream ss1, ss2; // (optional) set the stream to designate the test result file
  30 +
  31 + for (unsigned int i = 0; i < theta_n; i++){
  32 +
  33 + //ss1 << "data_output/textons_channel_"<< i << ".bmp"; // set the name for test result file (optional)
  34 + //std::string sss1 = ss1.str();
  35 + //ss2 << "data_output/textons_channel_"<< i+theta_n << ".bmp"; // set the name for test result file (optional)
  36 + //std::string sss2 = ss2.str();
  37 +
  38 + float theta = 180 * ((float)i/theta_n); // calculate the even-splited angle for each oriented filter
  39 +
  40 + temp = Gd_odd(image, r_odd, sigma_n_odd, theta); // return Gd_odd to temp
  41 + //stim::cpu2image(temp.data(), sss1, w, h, stim::cmBrewer);
  42 + textons.set_channel(i, temp.data()); // copy temp to ith channel of textons
  43 +
  44 + temp = Gd_even(image, r_even, sigma_n_even, theta); // return Gd_even to temp
  45 + //stim::cpu2image(temp.data(), sss2, w, h, stim::cmBrewer);
  46 + textons.set_channel(i + theta_n, temp.data()); // copy temp to (i+theta_n)th channel of textons
  47 +
  48 + //ss1.str(""); //(optional) clear the space for stream
  49 + //ss2.str(""); //(optional) clear the space for stream
  50 +
  51 + }
  52 +
  53 + temp = Gd_center(image, r_center, sigma_n_center); // return Gd_center to temp
  54 + //stim::cpu2image(temp.data(), "data_output/textons_channel_16.bmp", w, h, stim::cmBrewer);
  55 + textons.set_channel(theta_n*2, temp.data()); // copy temp to (theta_n*2)th channel of textons
  56 +
  57 + return textons;
  58 +
  59 +}
  60 +
  61 +
0 \ No newline at end of file 62 \ No newline at end of file
stim/cuda/cudatools/devices.h
@@ -4,7 +4,7 @@ @@ -4,7 +4,7 @@
4 #include <cuda.h> 4 #include <cuda.h>
5 5
6 namespace stim{ 6 namespace stim{
7 - 7 +extern "C"
8 int maxThreadsPerBlock() 8 int maxThreadsPerBlock()
9 { 9 {
10 int device; 10 int device;
@@ -13,6 +13,16 @@ int maxThreadsPerBlock() @@ -13,6 +13,16 @@ int maxThreadsPerBlock()
13 cudaGetDeviceProperties(&props, device); 13 cudaGetDeviceProperties(&props, device);
14 return props.maxThreadsPerBlock; 14 return props.maxThreadsPerBlock;
15 } 15 }
  16 +
  17 +extern "C"
  18 +int sharedMemPerBlock()
  19 +{
  20 + int device;
  21 + cudaGetDevice(&device); //get the id of the current device
  22 + cudaDeviceProp props; //device property structure
  23 + cudaGetDeviceProperties(&props, device);
  24 + return props.sharedMemPerBlock;
  25 +}
16 } //end namespace rts 26 } //end namespace rts
17 27
18 #endif 28 #endif
stim/cuda/ivote/update_dir.cuh
@@ -165,6 +165,7 @@ namespace stim{ @@ -165,6 +165,7 @@ namespace stim{
165 cudaFree(gpuDir); 165 cudaFree(gpuDir);
166 166
167 cudaDestroyTextureObject(texObj); 167 cudaDestroyTextureObject(texObj);
  168 + cudaFreeArray(cuArray);
168 169
169 } 170 }
170 171
stim/cuda/templates/chi_gradient.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_CHI_GRAD_H
  2 +#define STIM_CUDA_CHI_GRAD_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cuda_runtime.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <cmath>
  9 +#include <algorithm>
  10 +
  11 +#define PI 3.14159265358979
  12 +
  13 +namespace stim{
  14 + namespace cuda{
  15 +
  16 + /// template parameter @param T is the data type
  17 + template<typename T>
  18 + __global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
  19 +
  20 + double theta_r = ((theta) * PI)/180; //change angle unit from degree to rad
  21 + float sum = 0;
  22 + unsigned int N = w * h;
  23 +
  24 + //change 1D index to 2D cordinates
  25 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  26 + int yj = blockIdx.y;
  27 + int idx = yj * w + xi;
  28 + int shareidx = threadIdx.x;
  29 +
  30 + extern __shared__ unsigned short bin[];
  31 +
  32 +
  33 + if(xi < w && yj < h){
  34 +
  35 + int gidx;
  36 + int hidx;
  37 +
  38 + //initialize histogram bin to zeros
  39 + for(int i = 0; i < bin_n; i++){
  40 +
  41 + bin[shareidx * bin_n + i] = 0;
  42 + __syncthreads();
  43 +
  44 + }
  45 +
  46 + //get the histogram of the first half of disc and store in bin
  47 + for (int y = yj - r; y <= yj + r; y++){
  48 + for (int x = xi - r; x <= xi + r; x++){
  49 +
  50 + if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) > 0){
  51 +
  52 + gidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
  53 + __syncthreads();
  54 +
  55 + bin[shareidx * bin_n + gidx]++;
  56 + __syncthreads();
  57 +
  58 + }
  59 +
  60 + else{}
  61 + }
  62 + }
  63 +
  64 + //initiallize the gbin
  65 + unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
  66 + memset (gbin, 0, bin_n*sizeof(unsigned short));
  67 +
  68 + //copy the histogram to gbin
  69 + for (unsigned int gi = 0; gi < bin_n; gi++){
  70 +
  71 + gbin[gi] = bin[shareidx * bin_n + gi];
  72 +
  73 + }
  74 +
  75 + //initialize histogram bin to zeros
  76 + for(int j = 0; j < bin_n; j++){ //initialize histogram bin to zeros
  77 +
  78 + bin[shareidx * bin_n + j] = 0;
  79 + __syncthreads();
  80 + }
  81 +
  82 + //get the histogram of the second half of disc and store in bin
  83 + for (int y = yj - r; y <= yj + r; y++){
  84 + for (int x = xi - r; x <= xi + r; x++){
  85 +
  86 + if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) < 0){
  87 +
  88 + hidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
  89 + __syncthreads();
  90 +
  91 + bin[shareidx * bin_n + hidx]++;
  92 + __syncthreads();
  93 +
  94 + }
  95 + else{}
  96 + }
  97 + }
  98 +
  99 + //initiallize the gbin
  100 + unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
  101 + memset (hbin, 0, bin_n*sizeof(unsigned short));
  102 +
  103 + //copy the histogram to hbin
  104 + for (unsigned int hi = 0; hi < bin_n; hi++){
  105 +
  106 + hbin[hi] = bin[shareidx * bin_n + hi];
  107 +
  108 + }
  109 +
  110 + //compare gbin, hbin and calculate the chi distance
  111 + for (int k = 0; k < bin_n; k++){
  112 +
  113 + float flag; // set flag to avoid zero denominator
  114 +
  115 + if ((gbin[k] + hbin[k]) == 0){
  116 + flag = 1;
  117 + }
  118 + else {
  119 + flag = (gbin[k] + hbin[k]);
  120 + __syncthreads();
  121 + }
  122 +
  123 + sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag;
  124 + __syncthreads();
  125 +
  126 + }
  127 +
  128 + // return chi-distance for each pixel
  129 + copy[idx] = sum;
  130 +
  131 + free(gbin);
  132 + free(hbin);
  133 + }
  134 + }
  135 +
  136 +
  137 + template<typename T>
  138 + void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
  139 +
  140 + unsigned long N = w * h;
  141 +
  142 + // Allocate CUDA array in device memory
  143 +
  144 + //define a channel descriptor for a single 32-bit channel
  145 + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  146 + cudaArray* cuArray; //declare the cuda array
  147 + cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array
  148 +
  149 + // Copy the image data from global memory to the array
  150 + cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);
  151 +
  152 + // Specify texture
  153 + struct cudaResourceDesc resDesc; //create a resource descriptor
  154 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  155 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  156 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  157 +
  158 + // Specify texture object parameters
  159 + struct cudaTextureDesc texDesc; //create a texture descriptor
  160 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  161 + texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges)
  162 + texDesc.addressMode[1] = cudaAddressModeMirror;
  163 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  164 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  165 + texDesc.normalizedCoords = 1; //using normalized coordinates
  166 +
  167 + // Create texture object
  168 + cudaTextureObject_t texObj = 0;
  169 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  170 +
  171 + //get the maximum number of threads per block for the CUDA device
  172 + int threads = stim::maxThreadsPerBlock();
  173 + int sharemax = stim::sharedMemPerBlock(); //get the size of Shared memory available per block in bytes
  174 + unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short);
  175 +
  176 + if(threads * bin_n > sharemax){
  177 +
  178 + cout <<"Error: shared_bytes exceeds the max value."<<'\n';
  179 + exit(1);
  180 +
  181 + }
  182 +
  183 +
  184 + //calculate the number of blocks
  185 + dim3 blocks(w / threads + 1, h);
  186 +
  187 + //call the kernel to do the multiplication
  188 + cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);
  189 +
  190 + }
  191 +
  192 + template<typename T>
  193 + void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
  194 +
  195 + unsigned long N = w * h;
  196 + //allocate memory on the GPU for the array
  197 + T* gpu_img;
  198 + T* gpu_copy;
  199 + HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
  200 + HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
  201 +
  202 + //copy the array to the GPU
  203 + HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
  204 +
  205 + //call the GPU version of this function
  206 + gpu_chi_grad<T>(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);
  207 +
  208 + //copy the array back to the CPU
  209 + HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
  210 +
  211 + //free allocated memory
  212 + cudaFree(gpu_img);
  213 + cudaFree(gpu_copy);
  214 +
  215 + }
  216 +
  217 + }
  218 +}
  219 +
  220 +
  221 +#endif
0 \ No newline at end of file 222 \ No newline at end of file
stim/cuda/templates/conv2.cuh
@@ -11,8 +11,7 @@ namespace stim{ @@ -11,8 +11,7 @@ namespace stim{
11 namespace cuda{ 11 namespace cuda{
12 12
13 template<typename T> 13 template<typename T>
14 - //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){  
15 - __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){ 14 + __global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
16 15
17 16
18 //the radius of mask 17 //the radius of mask
@@ -34,7 +33,7 @@ namespace stim{ @@ -34,7 +33,7 @@ namespace stim{
34 //copy[idx] = tex2D<float>(texObj, i+100, j+100); 33 //copy[idx] = tex2D<float>(texObj, i+100, j+100);
35 //return; 34 //return;
36 35
37 - //tex2D<float>(texObj, i, j); 36 + tex2D<float>(texObj, (float)i/w, (float)j/h);
38 37
39 //allocate memory for result 38 //allocate memory for result
40 T sum = 0; 39 T sum = 0;
@@ -51,9 +50,7 @@ namespace stim{ @@ -51,9 +50,7 @@ namespace stim{
51 int xx = x - (i - r); 50 int xx = x - (i - r);
52 int yy = y - (j - r); 51 int yy = y - (j - r);
53 52
54 - //T temp = img[y * w + x] * mask[yy * M + xx];  
55 - //sum += img[y * w + x] * mask[yy * M + xx];  
56 - sum += tex2D<T>(texObj, x, y) * 1.0;//mask[yy * M + xx]; 53 + sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
57 } 54 }
58 } 55 }
59 copy[idx] = sum; 56 copy[idx] = sum;
@@ -88,11 +85,11 @@ namespace stim{ @@ -88,11 +85,11 @@ namespace stim{
88 // Specify texture object parameters 85 // Specify texture object parameters
89 struct cudaTextureDesc texDesc; //create a texture descriptor 86 struct cudaTextureDesc texDesc; //create a texture descriptor
90 memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero 87 memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
91 - texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges)  
92 - texDesc.addressMode[1] = cudaAddressModeMirror; 88 + texDesc.addressMode[0] = cudaAddressModeClamp; //use wrapping (around the edges)
  89 + texDesc.addressMode[1] = cudaAddressModeClamp;
93 texDesc.filterMode = cudaFilterModePoint; //use linear filtering 90 texDesc.filterMode = cudaFilterModePoint; //use linear filtering
94