Commit f186dbdaad1858d7e18d4b35a1e099ef81ff1f42

Authored by Tianshu Cheng
1 parent 3173e54d

header file for bsds500

stim/cuda/arraymath.cuh
... ... @@ -3,6 +3,11 @@
3 3  
4 4 #include <stim/cuda/arraymath/array_add.cuh>
5 5 #include <stim/cuda/arraymath/array_multiply.cuh>
  6 +#include <stim/cuda/arraymath/array_multiply2.cuh>
  7 +#include <stim/cuda/arraymath/array_divide.cuh>
  8 +#include <stim/cuda/arraymath/array_cos.cuh>
  9 +#include <stim/cuda/arraymath/array_sin.cuh>
  10 +#include <stim/cuda/arraymath/array_atan.cuh>
6 11 #include <stim/cuda/arraymath/array_abs.cuh>
7 12 #include <stim/cuda/arraymath/array_cart2polar.cuh>
8 13  
... ...
stim/cuda/arraymath/array_add.cuh
... ... @@ -3,6 +3,7 @@
3 3  
4 4 #include <iostream>
5 5 #include <cuda.h>
  6 +//#include <cmath>
6 7 #include <stim/cuda/cudatools.h>
7 8  
8 9 namespace stim{
... ...
stim/cuda/arraymath/array_atan.cuh 0 → 100644
#ifndef STIM_CUDA_ARRAY_ATAN_H
#define STIM_CUDA_ARRAY_ATAN_H

#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: out[i] = atan(ptr1[i]) for each of the N elements.
		/// One thread per element; threads past the end exit via the bounds check.
		template<typename T>
		__global__ void cuda_atan(T* ptr1, T* out, unsigned int N){

			//calculate the 1D index for this thread
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is generally larger than N
			if(idx < N){
				out[idx] = atan(ptr1[idx]);
			}

		}

		/// GPU entry point: ptr1 and out are device pointers to N elements.
		template<typename T>
		void gpu_atan(T* ptr1, T* out, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division covers the tail)
			int blocks = (N + threads - 1) / threads;

			//call the kernel to compute the element-wise arctangent
			cuda_atan <<< blocks, threads >>>(ptr1, out, N);

		}

		/// CPU convenience wrapper: uploads ptr1, runs gpu_atan, and copies the
		/// result back into cpu_out. Both pointers are host pointers to N elements.
		template<typename T>
		void cpu_atan(T* ptr1, T* cpu_out, unsigned int N){

			//allocate memory on the GPU for the input and output arrays
			T* gpu_ptr1;
			T* gpu_out;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );

			//copy the input array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_atan<T>(gpu_ptr1, gpu_out, N);

			//copy the result back to the CPU (cudaMemcpy blocks until the kernel finishes)
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_out);

		}

	}
}

#endif
0 68 \ No newline at end of file
... ...
stim/cuda/arraymath/array_cos.cuh 0 → 100644
#ifndef STIM_CUDA_ARRAY_COS_H
#define STIM_CUDA_ARRAY_COS_H

#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: out[i] = cos(ptr1[i]) for each of the N elements.
		/// One thread per element; threads past the end exit via the bounds check.
		template<typename T>
		__global__ void cuda_cos(T* ptr1, T* out, unsigned int N){

			//calculate the 1D index for this thread
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is generally larger than N
			if(idx < N){
				out[idx] = cos(ptr1[idx]);
			}

		}

		/// GPU entry point: ptr1 and out are device pointers to N elements.
		template<typename T>
		void gpu_cos(T* ptr1, T* out, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division covers the tail)
			int blocks = (N + threads - 1) / threads;

			//call the kernel to compute the element-wise cosine
			cuda_cos <<< blocks, threads >>>(ptr1, out, N);

		}

		/// CPU convenience wrapper: uploads ptr1, runs gpu_cos, and copies the
		/// result back into cpu_out. Both pointers are host pointers to N elements.
		template<typename T>
		void cpu_cos(T* ptr1, T* cpu_out, unsigned int N){

			//allocate memory on the GPU for the input and output arrays
			T* gpu_ptr1;
			T* gpu_out;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );

			//copy the input array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_cos<T>(gpu_ptr1, gpu_out, N);

			//copy the result back to the CPU (cudaMemcpy blocks until the kernel finishes)
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_out);

		}

	}
}

#endif
0 68 \ No newline at end of file
... ...
stim/cuda/arraymath/array_divide.cuh 0 → 100644
#ifndef STIM_CUDA_ARRAY_DIVIDE_H
#define STIM_CUDA_ARRAY_DIVIDE_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: quotient[i] = ptr1[i] / ptr2[i] for each of the N elements.
		/// NOTE: zero divisors are not guarded here — for floating-point T a zero
		/// in ptr2 yields inf/NaN; for integer T it is undefined behavior. The
		/// caller is responsible for guaranteeing a valid divisor array.
		template<typename T>
		__global__ void cuda_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){

			//calculate the 1D index for this thread
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is generally larger than N
			if(idx < N){
				quotient[idx] = ptr1[idx] / ptr2[idx];
			}

		}

		/// GPU entry point: all three pointers are device pointers to N elements.
		template<typename T>
		void gpu_divide(T* ptr1, T* ptr2, T* quotient, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division covers the tail)
			int blocks = (N + threads - 1) / threads;

			//call the kernel to do the element-wise division
			cuda_divide <<< blocks, threads >>>(ptr1, ptr2, quotient, N);

		}

		/// CPU convenience wrapper: uploads both operands, runs gpu_divide, and
		/// copies the quotient back into cpu_quotient (host pointers, N elements).
		template<typename T>
		void cpu_divide(T* ptr1, T* ptr2, T* cpu_quotient, unsigned int N){

			//allocate memory on the GPU for the operand and result arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_quotient;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_quotient, N * sizeof(T) ) );

			//copy both operand arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_divide<T>(gpu_ptr1, gpu_ptr2, gpu_quotient, N);

			//copy the result back to the CPU (cudaMemcpy blocks until the kernel finishes)
			HANDLE_ERROR( cudaMemcpy( cpu_quotient, gpu_quotient, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_ptr2);
			cudaFree(gpu_quotient);

		}

	}
}

#endif
0 71 \ No newline at end of file
... ...
stim/cuda/arraymath/array_multiply2.cuh 0 → 100644
//fix: the original guard STIM_CUDA_ARRAY_MULTIPLY_H is the natural guard name of
//array_multiply.cuh, which arraymath.cuh includes alongside this file — a duplicated
//guard silently skips whichever header is included second. Renamed to *_MULTIPLY2_H.
#ifndef STIM_CUDA_ARRAY_MULTIPLY2_H
#define STIM_CUDA_ARRAY_MULTIPLY2_H

#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		//NOTE(review): cuda_multiply / gpu_multiply / cpu_multiply may also be defined
		//by array_multiply.cuh with a different signature — confirm there is no
		//ambiguous-overload clash when both headers are included.

		/// Kernel: product[i] = ptr1[i] * ptr2[i] for each of the N elements.
		template<typename T>
		__global__ void cuda_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){

			//calculate the 1D index for this thread
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is generally larger than N
			if(idx < N){
				product[idx] = ptr1[idx] * ptr2[idx];
			}

		}

		/// GPU entry point: all three pointers are device pointers to N elements.
		template<typename T>
		void gpu_multiply(T* ptr1, T* ptr2, T* product, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division covers the tail)
			int blocks = (N + threads - 1) / threads;

			//call the kernel to do the element-wise multiplication
			cuda_multiply <<< blocks, threads >>>(ptr1, ptr2, product, N);

		}

		/// CPU convenience wrapper: uploads both operands, runs gpu_multiply, and
		/// copies the product back into cpu_product (host pointers, N elements).
		template<typename T>
		void cpu_multiply(T* ptr1, T* ptr2, T* cpu_product, unsigned int N){

			//allocate memory on the GPU for the operand and result arrays
			T* gpu_ptr1;
			T* gpu_ptr2;
			T* gpu_product;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_ptr2, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_product, N * sizeof(T) ) );

			//copy both operand arrays to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );
			HANDLE_ERROR( cudaMemcpy( gpu_ptr2, ptr2, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_multiply<T>(gpu_ptr1, gpu_ptr2, gpu_product, N);

			//copy the result back to the CPU (cudaMemcpy blocks until the kernel finishes)
			HANDLE_ERROR( cudaMemcpy( cpu_product, gpu_product, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_ptr2);
			cudaFree(gpu_product);

		}

	}
}

#endif
0 71 \ No newline at end of file
... ...
stim/cuda/arraymath/array_sin.cuh 0 → 100644
#ifndef STIM_CUDA_ARRAY_SIN_H
#define STIM_CUDA_ARRAY_SIN_H

#include <iostream>
#include <cuda.h>
#include <cmath>
#include <stim/cuda/cudatools.h>

namespace stim{
	namespace cuda{

		/// Kernel: out[i] = sin(ptr1[i]) for each of the N elements.
		/// One thread per element; threads past the end exit via the bounds check.
		template<typename T>
		__global__ void cuda_sin(T* ptr1, T* out, unsigned int N){

			//calculate the 1D index for this thread
			unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

			//bounds check: the grid is generally larger than N
			if(idx < N){
				out[idx] = sin(ptr1[idx]);
			}

		}

		/// GPU entry point: ptr1 and out are device pointers to N elements.
		template<typename T>
		void gpu_sin(T* ptr1, T* out, unsigned int N){

			//get the maximum number of threads per block for the CUDA device
			int threads = stim::maxThreadsPerBlock();

			//calculate the number of blocks (ceiling division covers the tail)
			int blocks = (N + threads - 1) / threads;

			//call the kernel to compute the element-wise sine
			cuda_sin <<< blocks, threads >>>(ptr1, out, N);

		}

		/// CPU convenience wrapper: uploads ptr1, runs gpu_sin, and copies the
		/// result back into cpu_out. Both pointers are host pointers to N elements.
		template<typename T>
		void cpu_sin(T* ptr1, T* cpu_out, unsigned int N){

			//allocate memory on the GPU for the input and output arrays
			T* gpu_ptr1;
			T* gpu_out;
			HANDLE_ERROR( cudaMalloc( &gpu_ptr1, N * sizeof(T) ) );
			HANDLE_ERROR( cudaMalloc( &gpu_out, N * sizeof(T) ) );

			//copy the input array to the GPU
			HANDLE_ERROR( cudaMemcpy( gpu_ptr1, ptr1, N * sizeof(T), cudaMemcpyHostToDevice) );

			//call the GPU version of this function
			gpu_sin<T>(gpu_ptr1, gpu_out, N);

			//copy the result back to the CPU (cudaMemcpy blocks until the kernel finishes)
			HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N * sizeof(T), cudaMemcpyDeviceToHost) );

			//free allocated memory
			cudaFree(gpu_ptr1);
			cudaFree(gpu_out);

		}

	}
}

#endif
0 68 \ No newline at end of file
... ...
stim/cuda/cudatools/devices.h
... ... @@ -13,6 +13,15 @@ int maxThreadsPerBlock()
13 13 cudaGetDeviceProperties(&props, device);
14 14 return props.maxThreadsPerBlock;
15 15 }
  16 +
/// Returns the shared memory available per block (in bytes) on the current
/// CUDA device. The device property is a size_t; it is narrowed explicitly to
/// keep the int return type existing callers rely on.
int sharedMemPerBlock()
{
	int device;
	cudaGetDevice(&device);			//get the id of the current device
	cudaDeviceProp props;			//device property structure
	cudaGetDeviceProperties(&props, device);
	return (int)props.sharedMemPerBlock;	//explicit cast: size_t -> int
}
16 25 } //end namespace rts
17 26  
18 27 #endif
... ...
stim/cuda/templates/chi_gradient.cuh 0 → 100644
  1 +#ifndef STIM_CUDA_CHI_GRAD_H
  2 +#define STIM_CUDA_CHI_GRAD_H
  3 +
  4 +#include <iostream>
  5 +#include <cuda.h>
  6 +#include <cuda_runtime.h>
  7 +#include <stim/cuda/sharedmem.cuh>
  8 +#include <cmath>
  9 +#include <algorithm>
  10 +
  11 +#define PI 3.14159265358979
  12 +
  13 +namespace stim{
  14 + namespace cuda{
  15 +
  16 + /// template parameter @param T is the data type
  17 + template<typename T>
  18 + __global__ void cuda_chi_grad(T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
  19 +
  20 + double theta_r = ((theta) * PI)/180; //change angle unit from degree to rad
  21 + float sum = 0;
  22 + unsigned int N = w * h;
  23 +
  24 + //change 1D index to 2D cordinates
  25 + int xi = blockIdx.x * blockDim.x + threadIdx.x;
  26 + int yj = blockIdx.y;
  27 + int idx = yj * w + xi;
  28 + int shareidx = threadIdx.x;
  29 +
  30 + extern __shared__ unsigned short bin[];
  31 +
  32 +
  33 + if(xi < w && yj < h){
  34 +
  35 + int gidx;
  36 + int hidx;
  37 +
  38 + //initialize histogram bin to zeros
  39 + for(int i = 0; i < bin_n; i++){
  40 +
  41 + bin[shareidx * bin_n + i] = 0;
  42 + __syncthreads();
  43 +
  44 + }
  45 +
  46 + //get the histogram of the first half of disc and store in bin
  47 + for (int y = yj - r; y <= yj + r; y++){
  48 + for (int x = xi - r; x <= xi + r; x++){
  49 +
  50 + if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) > 0){
  51 +
  52 + gidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
  53 + __syncthreads();
  54 +
  55 + bin[shareidx * bin_n + gidx]++;
  56 + __syncthreads();
  57 +
  58 + }
  59 +
  60 + else{}
  61 + }
  62 + }
  63 +
  64 + //initiallize the gbin
  65 + unsigned short* gbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
  66 + memset (gbin, 0, bin_n*sizeof(unsigned short));
  67 +
  68 + //copy the histogram to gbin
  69 + for (unsigned int gi = 0; gi < bin_n; gi++){
  70 +
  71 + gbin[gi] = bin[shareidx * bin_n + gi];
  72 +
  73 + }
  74 +
  75 + //initialize histogram bin to zeros
  76 + for(int j = 0; j < bin_n; j++){ //initialize histogram bin to zeros
  77 +
  78 + bin[shareidx * bin_n + j] = 0;
  79 + __syncthreads();
  80 + }
  81 +
  82 + //get the histogram of the second half of disc and store in bin
  83 + for (int y = yj - r; y <= yj + r; y++){
  84 + for (int x = xi - r; x <= xi + r; x++){
  85 +
  86 + if ((y - yj)*cos(theta_r) + (x - xi)*sin(theta_r) < 0){
  87 +
  88 + hidx = (int) tex2D<T>(texObj, (float)x/w, (float)y/h)/bin_size;
  89 + __syncthreads();
  90 +
  91 + bin[shareidx * bin_n + hidx]++;
  92 + __syncthreads();
  93 +
  94 + }
  95 + else{}
  96 + }
  97 + }
  98 +
  99 + //initiallize the gbin
  100 + unsigned short* hbin = (unsigned short*) malloc(bin_n*sizeof(unsigned short));
  101 + memset (hbin, 0, bin_n*sizeof(unsigned short));
  102 +
  103 + //copy the histogram to hbin
  104 + for (unsigned int hi = 0; hi < bin_n; hi++){
  105 +
  106 + hbin[hi] = bin[shareidx * bin_n + hi];
  107 +
  108 + }
  109 +
  110 + //compare gbin, hbin and calculate the chi distance
  111 + for (int k = 0; k < bin_n; k++){
  112 +
  113 + float flag; // set flag to avoid zero denominator
  114 +
  115 + if ((gbin[k] + hbin[k]) == 0){
  116 + flag = 1;
  117 + }
  118 + else {
  119 + flag = (gbin[k] + hbin[k]);
  120 + __syncthreads();
  121 + }
  122 +
  123 + sum += (gbin[k] - hbin[k])*(gbin[k] - hbin[k])/flag;
  124 + __syncthreads();
  125 +
  126 + }
  127 +
  128 + // return chi-distance for each pixel
  129 + copy[idx] = sum;
  130 +
  131 + free(gbin);
  132 + free(hbin);
  133 + }
  134 + }
  135 +
  136 +
  137 + template<typename T>
  138 + void gpu_chi_grad(T* img, T* copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
  139 +
  140 + unsigned long N = w * h;
  141 +
  142 + // Allocate CUDA array in device memory
  143 +
  144 + //define a channel descriptor for a single 32-bit channel
  145 + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  146 + cudaArray* cuArray; //declare the cuda array
  147 + cudaMallocArray(&cuArray, &channelDesc, w, h); //allocate the cuda array
  148 +
  149 + // Copy the image data from global memory to the array
  150 + cudaMemcpyToArray(cuArray, 0, 0, img, N * sizeof(T), cudaMemcpyDeviceToDevice);
  151 +
  152 + // Specify texture
  153 + struct cudaResourceDesc resDesc; //create a resource descriptor
  154 + memset(&resDesc, 0, sizeof(resDesc)); //set all values to zero
  155 + resDesc.resType = cudaResourceTypeArray; //specify the resource descriptor type
  156 + resDesc.res.array.array = cuArray; //add a pointer to the cuda array
  157 +
  158 + // Specify texture object parameters
  159 + struct cudaTextureDesc texDesc; //create a texture descriptor
  160 + memset(&texDesc, 0, sizeof(texDesc)); //set all values in the texture descriptor to zero
  161 + texDesc.addressMode[0] = cudaAddressModeMirror; //use wrapping (around the edges)
  162 + texDesc.addressMode[1] = cudaAddressModeMirror;
  163 + texDesc.filterMode = cudaFilterModePoint; //use linear filtering
  164 + texDesc.readMode = cudaReadModeElementType; //reads data based on the element type (32-bit floats)
  165 + texDesc.normalizedCoords = 1; //using normalized coordinates
  166 +
  167 + // Create texture object
  168 + cudaTextureObject_t texObj = 0;
  169 + cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
  170 +
  171 + //get the maximum number of threads per block for the CUDA device
  172 + int threads = stim::maxThreadsPerBlock();
  173 + int sharemax = stim::sharedMemPerBlock(); //get the size of Shared memory available per block in bytes
  174 + unsigned int shared_bytes = threads * bin_n * sizeof(unsigned short);
  175 +
  176 + if(threads * bin_n > sharemax){
  177 +
  178 + cout <<"Error: shared_bytes exceeds the max value."<<'\n';
  179 + exit(1);
  180 +
  181 + }
  182 +
  183 +
  184 + //calculate the number of blocks
  185 + dim3 blocks(w / threads + 1, h);
  186 +
  187 + //call the kernel to do the multiplication
  188 + cuda_chi_grad <<< blocks, threads, shared_bytes >>>(copy, texObj, w, h, r, bin_n, bin_size, theta);
  189 +
  190 + }
  191 +
  192 + template<typename T>
  193 + void cpu_chi_grad(T* img, T* cpu_copy, unsigned int w, unsigned int h, int r, unsigned int bin_n, unsigned int bin_size, float theta){
  194 +
  195 + unsigned long N = w * h;
  196 + //allocate memory on the GPU for the array
  197 + T* gpu_img;
  198 + T* gpu_copy;
  199 + HANDLE_ERROR( cudaMalloc( &gpu_img, N * sizeof(T) ) );
  200 + HANDLE_ERROR( cudaMalloc( &gpu_copy, N * sizeof(T) ) );
  201 +
  202 + //copy the array to the GPU
  203 + HANDLE_ERROR( cudaMemcpy( gpu_img, img, N * sizeof(T), cudaMemcpyHostToDevice) );
  204 +
  205 + //call the GPU version of this function
  206 + gpu_chi_grad<T>(gpu_img, gpu_copy, w, h, r, bin_n, bin_size, theta);
  207 +
  208 + //copy the array back to the CPU
  209 + HANDLE_ERROR( cudaMemcpy( cpu_copy, gpu_copy, N * sizeof(T), cudaMemcpyDeviceToHost) );
  210 +
  211 + //free allocated memory
  212 + cudaFree(gpu_img);
  213 + cudaFree(gpu_copy);
  214 +
  215 + }
  216 +
  217 + }
  218 +}
  219 +
  220 +
  221 +#endif
0 222 \ No newline at end of file
... ...
stim/cuda/templates/conv2.cuh
... ... @@ -11,8 +11,7 @@ namespace stim{
11 11 namespace cuda{
12 12  
13 13 template<typename T>
14   - //__global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned M){
15   - __global__ void cuda_conv2(T* img, T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
  14 + __global__ void cuda_conv2(T* mask, T* copy, cudaTextureObject_t texObj, unsigned int w, unsigned int h, unsigned int M){
16 15  
17 16  
18 17 //the radius of mask
... ... @@ -51,8 +50,6 @@ namespace stim{
51 50 int xx = x - (i - r);
52 51 int yy = y - (j - r);
53 52  
54   - //T temp = img[y * w + x] * mask[yy * M + xx];
55   - //sum += img[y * w + x] * mask[yy * M + xx];
56 53 sum += tex2D<T>(texObj, (float)x/w, (float)y/h) * mask[yy * M + xx];
57 54 }
58 55 }
... ... @@ -105,8 +102,7 @@ namespace stim{
105 102 dim3 blocks(w / threads + 1, h);
106 103  
107 104 //call the kernel to do the multiplication
108   - //cuda_conv2 <<< blocks, threads >>>(img, mask, copy, w, h, M);
109   - cuda_conv2 <<< blocks, threads >>>(img, mask, copy, texObj, w, h, M);
  105 + cuda_conv2 <<< blocks, threads >>>(mask, copy, texObj, w, h, M);
110 106  
111 107 }
112 108  
... ...
stim/image/image.h
... ... @@ -31,8 +31,12 @@ public:
31 31 }
32 32  
33 33 /// Constructor initializes an image to a given size
34   - image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
  34 + /*image(unsigned int x, unsigned int y = 1, unsigned int z = 1){
35 35 img = cimg_library::CImg<T>(x, y, z);
  36 + }*/
  37 +
	/// Constructor initializes an image to a given size
	/// @param c is the number of channels (added relative to the 3-argument version)
	image(unsigned int x, unsigned int y = 1, unsigned int z = 1, unsigned int c = 1){
		img = cimg_library::CImg<T>(x, y, z, c);
	}
37 41  
38 42 //Load an image from a file
... ... @@ -90,6 +94,23 @@ public:
90 94  
91 95 }
92 96  
	/// Copy the given data to the specified channel

	/// @param c is the channel number that the data will be copied to
	/// @param buffer is a pointer to the image to be copied to channel c

	void set_channel(unsigned int c, T* buffer){

		//calculate the number of pixels in a channel
		//NOTE(review): CImg stores channels as contiguous planes; this offset of
		//width()*height() per channel assumes a single z-slice (depth == 1) —
		//confirm before using on 3D images
		unsigned int channel_size = width() * height();

		//retrieve a pointer to the start of channel c in the raw image data
		T* ptr = img.data() + channel_size * c;

		//copy the buffer to the specified channel (no bounds check on c)
		memcpy(ptr, buffer, sizeof(T) * channel_size);
	}
  113 +
93 114 image<T> getslice(unsigned int c){
94 115  
95 116 //create a new image
... ... @@ -228,6 +249,18 @@ public:
228 249 }
229 250  
230 251  
  252 + image<T> rotate(float angle, float cx, float cy){
  253 +
  254 + image<T> result;
  255 + float zoom = 1;
  256 + unsigned int interpolation = 1;
  257 + unsigned int boundary = 1;
  258 + result.img = img.get_rotate (angle, cx, cy, zoom, interpolation, boundary);
  259 + //result.save("data_output/test_rotate_neum.bmp");
  260 +
  261 + return result;
  262 + }
  263 +
231 264 };
232 265  
233 266 }; //end namespace stim
... ...
stim/image/image_contour_detection.h
1 1  
2   -stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
3   -stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s);
4   -stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s);
5 2 \ No newline at end of file
  3 +//stim::image<float> gaussian_derivative_filter_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
  4 +//stim::image<float> func_mPb_theta(stim::image<float> img, float theta, int* r, float* alpha, int s);
  5 +//stim::image<float> func_mPb(stim::image<float> img, unsigned int theta_n, int* r, float* alpha, int s);
  6 +
  7 +stim::image<float> Gd1(stim::image<float> image, int r, unsigned int sigma_n);
  8 +stim::image<float> Gd2(stim::image<float> image, int r, unsigned int sigma_n);
  9 +stim::image<float> Gd_odd(stim::image<float> image, int r, unsigned int sigma_n, float theta);
  10 +stim::image<float> Gd_even(stim::image<float> image, int r, unsigned int sigma_n, float theta);
  11 +stim::image<float> Gd_center(stim::image<float> image, int r, unsigned int sigma_n);
  12 +
  13 +stim::image<float> textons(stim::image<float> image, unsigned int theta_n);
  14 +stim::image<float> kmeans(stim::image<float> textons, unsigned int K);
  15 +stim::image<float> Pb(stim::image<float> image, int r, unsigned int sigma_n);
  16 +stim::image<float> cPb(stim::image<float> img, int* r, float* alpha, int s);
  17 +stim::image<float> tPb(stim::image<float> img, int* r, float* alpha, unsigned int theta_n, unsigned int bin_n, int s, unsigned int K);
6 18 \ No newline at end of file
... ...