-
Status changed to merged
-
mentioned in commit 9e5edd36627f01110c78cf4f07135a741c945813
Showing
6 changed files
Show diff stats
No preview for this file type
1 | +# -*- coding: utf-8 -*- | |
2 | +""" | |
3 | +Created on Sat Jan 19 2018 | |
4 | + | |
5 | +@author: Jiabing | |
6 | +""" | |
7 | + | |
8 | +import struct | |
9 | +import numpy as np | |
10 | +import scipy as sp | |
11 | +import networkx as nx | |
12 | +import matplotlib.pyplot as plt | |
13 | +import math | |
14 | + | |
15 | + | |
16 | + | |
17 | +alkjasfdljkhfsadlkjhfsad | |
18 | +class Point: | |
19 | + def __init__(self, x, y, z, radius): | |
20 | + self.x = x | |
21 | + self.y = y | |
22 | + self.z = z | |
23 | + self.r = radius | |
24 | + | |
25 | + | |
26 | +class Fiber: | |
27 | + def __init__(self, fiber_idx,point_idx): | |
28 | + self.fidx = fiber_idx | |
29 | + self.pidx = point_idx | |
30 | + | |
31 | + | |
32 | + | |
33 | +class Node: | |
34 | + def __init__(self, point_idx, fidx): | |
35 | + self.pidx= point_idx | |
36 | + self.fidx = fidx | |
37 | + | |
38 | + | |
39 | +#class NWT: | |
40 | + | |
41 | +#class network(point, Fiber, Node): | ... | ... |
stim/iVote/ivote2.cuh
... | ... | @@ -152,13 +152,17 @@ namespace stim { |
152 | 152 | |
153 | 153 | |
154 | 154 | template<typename T> |
155 | - void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) { | |
155 | + void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, float &gpu_time, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) { | |
156 | 156 | size_t bytes = x*y * sizeof(T); |
157 | 157 | T* gpuI; //allocate space on the gpu to save the input image |
158 | + | |
159 | + gpuTimer_start(); | |
158 | 160 | HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); |
159 | 161 | HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice)); //copy the image to the gpu |
160 | 162 | stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug); //call the gpu version of the ivote |
161 | 163 | HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost)); //copy the output to the cpu |
164 | + | |
165 | + gpu_time = gpuTimer_end(); | |
162 | 166 | } |
163 | 167 | } |
164 | 168 | #endif |
165 | 169 | \ No newline at end of file | ... | ... |
stim/iVote/ivote2/local_max.cuh
... | ... | @@ -10,7 +10,7 @@ namespace stim{ |
10 | 10 | |
11 | 11 | // this kernel calculates the local maximum for finding the cell centers |
12 | 12 | template<typename T> |
13 | - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ | |
13 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ | |
14 | 14 | |
15 | 15 | // calculate the 2D coordinates for this current thread. |
16 | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
... | ... | @@ -20,16 +20,16 @@ namespace stim{ |
20 | 20 | return; |
21 | 21 | |
22 | 22 | // convert 2D coordinates to 1D |
23 | - int i = yi * x + xi; | |
23 | + int i = yi * x + xi; | |
24 | 24 | |
25 | 25 | gpuCenters[i] = 0; //initialize the value at this location to zero |
26 | 26 | |
27 | 27 | T val = gpuVote[i]; |
28 | 28 | |
29 | - for(int xl = xi - conn; xl < xi + conn; xl++){ | |
30 | - for(int yl = yi - conn; yl < yi + conn; yl++){ | |
29 | + for(unsigned int xl = xi - conn; xl < xi + conn; xl++){ | |
30 | + for(unsigned int yl = yi - conn; yl < yi + conn; yl++){ | |
31 | 31 | if(xl >= 0 && xl < x && yl >= 0 && yl < y){ |
32 | - int il = yl * x + xl; | |
32 | + unsigned int il = yl * x + xl; | |
33 | 33 | if(gpuVote[il] > val){ |
34 | 34 | return; |
35 | 35 | } |
... | ... | @@ -47,7 +47,7 @@ namespace stim{ |
47 | 47 | } |
48 | 48 | |
49 | 49 | template<typename T> |
50 | - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){ | |
50 | + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){ | |
51 | 51 | |
52 | 52 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
53 | 53 | /*dim3 threads(max_threads, 1); | ... | ... |
stim/image/image.h
... | ... | @@ -275,10 +275,10 @@ public: |
275 | 275 | int channels = cvImage.channels(); |
276 | 276 | allocate(cols, rows, channels); //allocate space for the image |
277 | 277 | size_t img_bytes = bytes(); |
278 | - unsigned char* cv_ptr = (unsigned char*)cvImage.data; | |
279 | - //if (C() == 1) //if this is a single-color image, just copy the data | |
280 | - // type_copy<T, T>(cv_ptr, img, size()); | |
281 | - memcpy(img, cv_ptr, bytes()); | |
278 | + unsigned char* cv_ptr = (unsigned char*)cvImage.data; | |
279 | + if (C() == 1) //if this is a single-color image, just copy the data | |
280 | + type_copy<unsigned char, T>(cv_ptr, img, size()); | |
281 | + //memcpy(img, cv_ptr, bytes()); | |
282 | 282 | if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving |
283 | 283 | from_opencv(cv_ptr, X(), Y()); |
284 | 284 | #else | ... | ... |
1 | +#ifndef STIM_CUDA_MEDIAN2_H | |
2 | +#define STIM_CUDA_MEDIAN2_H | |
3 | + | |
4 | +#include <iostream> | |
5 | +#include <cuda.h> | |
6 | +#include <cmath> | |
7 | +#include <algorithm> | |
8 | + | |
9 | +#ifdef USING_CUDA | |
10 | +#include "cuda_runtime.h" | |
11 | +#include "device_launch_parameters.h" | |
12 | +#include <stim/cuda/cudatools.h> | |
13 | +#endif | |
14 | + | |
15 | + | |
16 | +//#include <thrust/sort.h> | |
17 | +//#include <thrust/execution_policy.h> | |
18 | +//#include <thrust/binary_search.h> | |
19 | +//#include <thrust/device_ptr.h> | |
20 | + | |
21 | +template <typename T> | |
22 | +__device__ void cuswap ( T& a, T& b ){ | |
23 | + T c(a); | |
24 | + a=b; | |
25 | + b=c; | |
26 | +} | |
27 | + | |
28 | +namespace stim{ | |
29 | + namespace cuda{ | |
30 | + | |
31 | + template<typename T> | |
32 | + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){ | |
33 | + | |
34 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | |
35 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | |
36 | + | |
37 | + size_t Xout = X - K + 1; | |
38 | + size_t Yout = Y - K + 1; | |
39 | + int i = yi * Xout + xi; | |
40 | + | |
41 | + //return if (i,j) is outside the matrix | |
42 | + if(xi >= Xout || yi >= Yout) return; | |
43 | + | |
44 | + //scan the image, copy data from input to 2D window | |
45 | + size_t kxi, kyi; | |
46 | + for(kyi = 0; kyi < K; kyi++){ | |
47 | + for(kxi = 0; kxi < K; kxi++){ | |
48 | + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi]; | |
49 | + } | |
50 | + } | |
51 | + | |
52 | + //calculate kernel radius 4 | |
53 | + int r = (K*K)/2; | |
54 | + | |
55 | + //sort the smallest half pixel values inside the window, calculate the middle one | |
56 | + size_t Ki = i * K * K; | |
57 | + | |
58 | + //sort the smallest half pixel values inside the window, calculate the middle one | |
59 | + for (int p = 0; p < r+1; p++){ | |
60 | + for(int kk = p + 1; kk < K*K; kk++){ | |
61 | + if (kernel[Ki + kk] < kernel[Ki + p]){ | |
62 | + cuswap<T>(kernel[Ki + kk], kernel[Ki + p]); | |
63 | + } | |
64 | + } | |
65 | + } | |
66 | + | |
67 | + //copy the middle pixel value inside the window to output | |
68 | + out[i] = kernel[Ki + r]; | |
69 | + | |
70 | + } | |
71 | + | |
72 | + | |
73 | + template<typename T> | |
74 | + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){ | |
75 | + | |
76 | + //get the maximum number of threads per block for the CUDA device | |
77 | + int threads_total = stim::maxThreadsPerBlock(); | |
78 | + | |
79 | + //set threads in each block | |
80 | + dim3 threads(sqrt(threads_total), sqrt(threads_total)); | |
81 | + | |
82 | + //calculate the number of blocks | |
83 | + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1); | |
84 | + | |
85 | + //call the kernel to perform median filter function | |
86 | + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K); | |
87 | + | |
88 | + } | |
89 | + | |
90 | + template<typename T> | |
91 | + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){ | |
92 | + | |
93 | + #ifndef USING_CUDA | |
94 | + | |
95 | + //output image width and height | |
96 | + size_t X_out = X + K -1; | |
97 | + size_t Y_out = Y + K -1; | |
98 | + | |
99 | + float* window = (float*)malloc(K * k *sizeof(float)); | |
100 | + | |
101 | + for(int i = 0; i< Y; i++){ | |
102 | + for(int j =0; j< X; j++){ | |
103 | + | |
104 | + //Pick up window elements | |
105 | + int k = 0; | |
106 | + for (int m=0; m< kernel_width; m++){ | |
107 | + for(int n = 0; n < kernel_width; n++){ | |
108 | + window[k] = in_image.at<float>(m+i, n+j); | |
109 | + k++; | |
110 | + } | |
111 | + } | |
112 | + | |
113 | + //calculate kernel radius 4 | |
114 | + r_ker = K * K/2; | |
115 | + //Order elements (only half of them) | |
116 | + for(int p = 0; p<r_ker+1; p++){ | |
117 | + //Find position of minimum element | |
118 | + for (int l = p + 1; l < size_kernel; l++){ | |
119 | + | |
120 | + if(window[l] < window[p]){ | |
121 | + float t = window[p]; | |
122 | + window[p] = window[l]; | |
123 | + window[l] = t; } | |
124 | + } | |
125 | + } | |
126 | + | |
127 | + //Get result - the middle element | |
128 | + cpu_out[i * X_out + j] = window[r_ker]; | |
129 | + } | |
130 | + } | |
131 | + #else | |
132 | + //calculate input and out image pixels & calculate kernel size | |
133 | + size_t N_in = X * Y; //number of pixels in the input image | |
134 | + size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image | |
135 | + //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel | |
136 | + | |
137 | + //allocate memory on the GPU for the array | |
138 | + T* gpu_in; //allocate device memory for the input image | |
139 | + HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) ); | |
140 | + T* gpu_kernel; //allocate device memory for the kernel | |
141 | + HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) ); | |
142 | + T* gpu_out; //allocate device memory for the output image | |
143 | + HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) ); | |
144 | + | |
145 | + //copy the array to the GPU | |
146 | + HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) ); | |
147 | + | |
148 | + //call the GPU version of this function | |
149 | + gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K); | |
150 | + | |
151 | + //copy the array back to the CPU | |
152 | + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) ); | |
153 | + | |
154 | + //free allocated memory | |
155 | + cudaFree(gpu_in); | |
156 | + cudaFree(gpu_kernel); | |
157 | + cudaFree(gpu_out); | |
158 | + | |
159 | + } | |
160 | + | |
161 | + } | |
162 | + #endif | |
163 | +} | |
164 | + | |
165 | + | |
166 | +#endif | ... | ... |