Commit 9e5edd36627f01110c78cf4f07135a741c945813
Merge branch 'jiabing' into 'master'
Jiabing See merge request !39
Showing
6 changed files
with
222 additions
and
11 deletions
Show diff stats
No preview for this file type
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +Created on Sat Jan 19 2018 | ||
4 | + | ||
5 | +@author: Jiabing | ||
6 | +""" | ||
7 | + | ||
8 | +import struct | ||
9 | +import numpy as np | ||
10 | +import scipy as sp | ||
11 | +import networkx as nx | ||
12 | +import matplotlib.pyplot as plt | ||
13 | +import math | ||
14 | + | ||
15 | + | ||
16 | + | ||
17 | +alkjasfdljkhfsadlkjhfsad | ||
18 | +class Point: | ||
19 | + def __init__(self, x, y, z, radius): | ||
20 | + self.x = x | ||
21 | + self.y = y | ||
22 | + self.z = z | ||
23 | + self.r = radius | ||
24 | + | ||
25 | + | ||
26 | +class Fiber: | ||
27 | + def __init__(self, fiber_idx,point_idx): | ||
28 | + self.fidx = fiber_idx | ||
29 | + self.pidx = point_idx | ||
30 | + | ||
31 | + | ||
32 | + | ||
33 | +class Node: | ||
34 | + def __init__(self, point_idx, fidx): | ||
35 | + self.pidx= point_idx | ||
36 | + self.fidx = fidx | ||
37 | + | ||
38 | + | ||
39 | +#class NWT: | ||
40 | + | ||
41 | +#class network(point, Fiber, Node): |
stim/iVote/ivote2.cuh
@@ -152,13 +152,17 @@ namespace stim { | @@ -152,13 +152,17 @@ namespace stim { | ||
152 | 152 | ||
153 | 153 | ||
154 | template<typename T> | 154 | template<typename T> |
155 | - void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) { | 155 | + void cpu_ivote2(T* cpuI, unsigned int rmax, size_t x, size_t y, float &gpu_time, bool invert = false, T t = 0, int iter = 8, T phi = 15.0f * (float)stim::PI / 180, int conn = 8, bool debug = false) { |
156 | size_t bytes = x*y * sizeof(T); | 156 | size_t bytes = x*y * sizeof(T); |
157 | T* gpuI; //allocate space on the gpu to save the input image | 157 | T* gpuI; //allocate space on the gpu to save the input image |
158 | + | ||
159 | + gpuTimer_start(); | ||
158 | HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); | 160 | HANDLE_ERROR(cudaMalloc(&gpuI, bytes)); |
159 | HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice)); //copy the image to the gpu | 161 | HANDLE_ERROR(cudaMemcpy(gpuI, cpuI, bytes, cudaMemcpyHostToDevice)); //copy the image to the gpu |
160 | stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug); //call the gpu version of the ivote | 162 | stim::gpu_ivote2<T>(gpuI, rmax, x, y, invert, t, iter, phi, conn, debug); //call the gpu version of the ivote |
161 | HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost)); //copy the output to the cpu | 163 | HANDLE_ERROR(cudaMemcpy(cpuI, gpuI, bytes, cudaMemcpyDeviceToHost)); //copy the output to the cpu |
164 | + | ||
165 | + gpu_time = gpuTimer_end(); | ||
162 | } | 166 | } |
163 | } | 167 | } |
164 | #endif | 168 | #endif |
165 | \ No newline at end of file | 169 | \ No newline at end of file |
stim/iVote/ivote2/local_max.cuh
@@ -10,7 +10,7 @@ namespace stim{ | @@ -10,7 +10,7 @@ namespace stim{ | ||
10 | 10 | ||
11 | // this kernel calculates the local maximum for finding the cell centers | 11 | // this kernel calculates the local maximum for finding the cell centers |
12 | template<typename T> | 12 | template<typename T> |
13 | - __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ | 13 | + __global__ void cuda_local_max(T* gpuCenters, T* gpuVote, int conn, int x, int y){ |
14 | 14 | ||
15 | // calculate the 2D coordinates for this current thread. | 15 | // calculate the 2D coordinates for this current thread. |
16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; | 16 | int xi = blockIdx.x * blockDim.x + threadIdx.x; |
@@ -20,16 +20,16 @@ namespace stim{ | @@ -20,16 +20,16 @@ namespace stim{ | ||
20 | return; | 20 | return; |
21 | 21 | ||
22 | // convert 2D coordinates to 1D | 22 | // convert 2D coordinates to 1D |
23 | - int i = yi * x + xi; | 23 | + int i = yi * x + xi; |
24 | 24 | ||
25 | gpuCenters[i] = 0; //initialize the value at this location to zero | 25 | gpuCenters[i] = 0; //initialize the value at this location to zero |
26 | 26 | ||
27 | T val = gpuVote[i]; | 27 | T val = gpuVote[i]; |
28 | 28 | ||
29 | - for(int xl = xi - conn; xl < xi + conn; xl++){ | ||
30 | - for(int yl = yi - conn; yl < yi + conn; yl++){ | 29 | + for(unsigned int xl = xi - conn; xl < xi + conn; xl++){ |
30 | + for(unsigned int yl = yi - conn; yl < yi + conn; yl++){ | ||
31 | if(xl >= 0 && xl < x && yl >= 0 && yl < y){ | 31 | if(xl >= 0 && xl < x && yl >= 0 && yl < y){ |
32 | - int il = yl * x + xl; | 32 | + unsigned int il = yl * x + xl; |
33 | if(gpuVote[il] > val){ | 33 | if(gpuVote[il] > val){ |
34 | return; | 34 | return; |
35 | } | 35 | } |
@@ -47,7 +47,7 @@ namespace stim{ | @@ -47,7 +47,7 @@ namespace stim{ | ||
47 | } | 47 | } |
48 | 48 | ||
49 | template<typename T> | 49 | template<typename T> |
50 | - void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, size_t x, size_t y){ | 50 | + void gpu_local_max(T* gpuCenters, T* gpuVote, unsigned int conn, unsigned int x, unsigned int y){ |
51 | 51 | ||
52 | unsigned int max_threads = stim::maxThreadsPerBlock(); | 52 | unsigned int max_threads = stim::maxThreadsPerBlock(); |
53 | /*dim3 threads(max_threads, 1); | 53 | /*dim3 threads(max_threads, 1); |
stim/image/image.h
@@ -275,10 +275,10 @@ public: | @@ -275,10 +275,10 @@ public: | ||
275 | int channels = cvImage.channels(); | 275 | int channels = cvImage.channels(); |
276 | allocate(cols, rows, channels); //allocate space for the image | 276 | allocate(cols, rows, channels); //allocate space for the image |
277 | size_t img_bytes = bytes(); | 277 | size_t img_bytes = bytes(); |
278 | - unsigned char* cv_ptr = (unsigned char*)cvImage.data; | ||
279 | - //if (C() == 1) //if this is a single-color image, just copy the data | ||
280 | - // type_copy<T, T>(cv_ptr, img, size()); | ||
281 | - memcpy(img, cv_ptr, bytes()); | 278 | + unsigned char* cv_ptr = (unsigned char*)cvImage.data; |
279 | + if (C() == 1) //if this is a single-color image, just copy the data | ||
280 | + type_copy<unsigned char, T>(cv_ptr, img, size()); | ||
281 | + //memcpy(img, cv_ptr, bytes()); | ||
282 | if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving | 282 | if(C() == 3) //if this is a 3-color image, OpenCV uses BGR interleaving |
283 | from_opencv(cv_ptr, X(), Y()); | 283 | from_opencv(cv_ptr, X(), Y()); |
284 | #else | 284 | #else |
1 | +#ifndef STIM_CUDA_MEDIAN2_H | ||
2 | +#define STIM_CUDA_MEDIAN2_H | ||
3 | + | ||
4 | +#include <iostream> | ||
5 | +#include <cuda.h> | ||
6 | +#include <cmath> | ||
7 | +#include <algorithm> | ||
8 | + | ||
9 | +#ifdef USING_CUDA | ||
10 | +#include "cuda_runtime.h" | ||
11 | +#include "device_launch_parameters.h" | ||
12 | +#include <stim/cuda/cudatools.h> | ||
13 | +#endif | ||
14 | + | ||
15 | + | ||
16 | +//#include <thrust/sort.h> | ||
17 | +//#include <thrust/execution_policy.h> | ||
18 | +//#include <thrust/binary_search.h> | ||
19 | +//#include <thrust/device_ptr.h> | ||
20 | + | ||
21 | +template <typename T> | ||
22 | +__device__ void cuswap ( T& a, T& b ){ | ||
23 | + T c(a); | ||
24 | + a=b; | ||
25 | + b=c; | ||
26 | +} | ||
27 | + | ||
28 | +namespace stim{ | ||
29 | + namespace cuda{ | ||
30 | + | ||
31 | + template<typename T> | ||
32 | + __global__ void cuda_median2(T* in, T* out, T* kernel, size_t X, size_t Y, size_t K){ | ||
33 | + | ||
34 | + int xi = blockIdx.x * blockDim.x + threadIdx.x; | ||
35 | + int yi = blockIdx.y * blockDim.y + threadIdx.y; | ||
36 | + | ||
37 | + size_t Xout = X - K + 1; | ||
38 | + size_t Yout = Y - K + 1; | ||
39 | + int i = yi * Xout + xi; | ||
40 | + | ||
41 | + //return if (i,j) is outside the matrix | ||
42 | + if(xi >= Xout || yi >= Yout) return; | ||
43 | + | ||
44 | + //scan the image, copy data from input to 2D window | ||
45 | + size_t kxi, kyi; | ||
46 | + for(kyi = 0; kyi < K; kyi++){ | ||
47 | + for(kxi = 0; kxi < K; kxi++){ | ||
48 | + kernel[i * K * K + kyi * K + kxi] = in[(yi + kyi)* X + xi + kxi]; | ||
49 | + } | ||
50 | + } | ||
51 | + | ||
52 | + //calculate kernel radius 4 | ||
53 | + int r = (K*K)/2; | ||
54 | + | ||
55 | + //sort the smallest half pixel values inside the window, calculate the middle one | ||
56 | + size_t Ki = i * K * K; | ||
57 | + | ||
58 | + //sort the smallest half pixel values inside the window, calculate the middle one | ||
59 | + for (int p = 0; p < r+1; p++){ | ||
60 | + for(int kk = p + 1; kk < K*K; kk++){ | ||
61 | + if (kernel[Ki + kk] < kernel[Ki + p]){ | ||
62 | + cuswap<T>(kernel[Ki + kk], kernel[Ki + p]); | ||
63 | + } | ||
64 | + } | ||
65 | + } | ||
66 | + | ||
67 | + //copy the middle pixel value inside the window to output | ||
68 | + out[i] = kernel[Ki + r]; | ||
69 | + | ||
70 | + } | ||
71 | + | ||
72 | + | ||
73 | + template<typename T> | ||
74 | + void gpu_median2(T* gpu_in, T* gpu_out, T* gpu_kernel, size_t X, size_t Y, size_t K){ | ||
75 | + | ||
76 | + //get the maximum number of threads per block for the CUDA device | ||
77 | + int threads_total = stim::maxThreadsPerBlock(); | ||
78 | + | ||
79 | + //set threads in each block | ||
80 | + dim3 threads(sqrt(threads_total), sqrt(threads_total)); | ||
81 | + | ||
82 | + //calculate the number of blocks | ||
83 | + dim3 blocks(( (X - K + 1) / threads.x) + 1, ((Y - K + 1) / threads.y) + 1); | ||
84 | + | ||
85 | + //call the kernel to perform median filter function | ||
86 | + cuda_median2 <<< blocks, threads >>>( gpu_in, gpu_out, gpu_kernel, X, Y, K); | ||
87 | + | ||
88 | + } | ||
89 | + | ||
90 | + template<typename T> | ||
91 | + void cpu_median2(T* cpu_in, T* cpu_out, size_t X, size_t Y, size_t K){ | ||
92 | + | ||
93 | + #ifndef USING_CUDA | ||
94 | + | ||
95 | + //output image width and height | ||
96 | + size_t X_out = X + K -1; | ||
97 | + size_t Y_out = Y + K -1; | ||
98 | + | ||
99 | + float* window = (float*)malloc(K * k *sizeof(float)); | ||
100 | + | ||
101 | + for(int i = 0; i< Y; i++){ | ||
102 | + for(int j =0; j< X; j++){ | ||
103 | + | ||
104 | + //Pick up window elements | ||
105 | + int k = 0; | ||
106 | + for (int m=0; m< kernel_width; m++){ | ||
107 | + for(int n = 0; n < kernel_width; n++){ | ||
108 | + window[k] = in_image.at<float>(m+i, n+j); | ||
109 | + k++; | ||
110 | + } | ||
111 | + } | ||
112 | + | ||
113 | + //calculate kernel radius 4 | ||
114 | + r_ker = K * K/2; | ||
115 | + //Order elements (only half of them) | ||
116 | + for(int p = 0; p<r_ker+1; p++){ | ||
117 | + //Find position of minimum element | ||
118 | + for (int l = p + 1; l < size_kernel; l++){ | ||
119 | + | ||
120 | + if(window[l] < window[p]){ | ||
121 | + float t = window[p]; | ||
122 | + window[p] = window[l]; | ||
123 | + window[l] = t; } | ||
124 | + } | ||
125 | + } | ||
126 | + | ||
127 | + //Get result - the middle element | ||
128 | + cpu_out[i * X_out + j] = window[r_ker]; | ||
129 | + } | ||
130 | + } | ||
131 | + #else | ||
132 | + //calculate input and out image pixels & calculate kernel size | ||
133 | + size_t N_in = X * Y; //number of pixels in the input image | ||
134 | + size_t N_out = (X - K + 1) * (Y - K + 1); //number of pixels in the output image | ||
135 | + //size_t kernel_size = kernel_width*kernel_width; //total number of pixels in the kernel | ||
136 | + | ||
137 | + //allocate memory on the GPU for the array | ||
138 | + T* gpu_in; //allocate device memory for the input image | ||
139 | + HANDLE_ERROR( cudaMalloc( &gpu_in, N_in * sizeof(T) ) ); | ||
140 | + T* gpu_kernel; //allocate device memory for the kernel | ||
141 | + HANDLE_ERROR( cudaMalloc( &gpu_kernel, K * K * N_out * sizeof(T) ) ); | ||
142 | + T* gpu_out; //allocate device memory for the output image | ||
143 | + HANDLE_ERROR( cudaMalloc( &gpu_out, N_out * sizeof(T) ) ); | ||
144 | + | ||
145 | + //copy the array to the GPU | ||
146 | + HANDLE_ERROR( cudaMemcpy( gpu_in, cpu_in, N_in * sizeof(T), cudaMemcpyHostToDevice) ); | ||
147 | + | ||
148 | + //call the GPU version of this function | ||
149 | + gpu_median2<T>(gpu_in, gpu_out, gpu_kernel, X, Y, K); | ||
150 | + | ||
151 | + //copy the array back to the CPU | ||
152 | + HANDLE_ERROR( cudaMemcpy( cpu_out, gpu_out, N_out * sizeof(T), cudaMemcpyDeviceToHost) ); | ||
153 | + | ||
154 | + //free allocated memory | ||
155 | + cudaFree(gpu_in); | ||
156 | + cudaFree(gpu_kernel); | ||
157 | + cudaFree(gpu_out); | ||
158 | + | ||
159 | + } | ||
160 | + | ||
161 | + } | ||
162 | + #endif | ||
163 | +} | ||
164 | + | ||
165 | + | ||
166 | +#endif |