diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f4b5661..d21dd51 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -46,4 +46,5 @@ target_link_libraries(ivote3 ) #copy an image test case -configure_file(data/nissl/nissl-rat.vol nissl-rat.vol COPYONLY) +configure_file(nissl-raw-data/nissl-float-256.256.256.vol nissl-float-256.256.256.vol COPYONLY) +configure_file(nissl-raw-data/nissl-float-128.128.128.vol nissl-float-128.128.128.vol COPYONLY) diff --git a/cpp/cudafunc.cu b/cpp/cudafunc.cu index 41de304..83fc0d5 100644 --- a/cpp/cudafunc.cu +++ b/cpp/cudafunc.cu @@ -1,27 +1,87 @@ #include "gaussian_blur3.cuh" #include "gradient3.cuh" +#include "mag3.cuh" +#include "vote3.cuh" +#include "update_dir3.cuh" +#include "local_max3.cuh" -void ivote3(float* out, float* img, float sigma, unsigned int x, unsigned int y, unsigned int z){ - //cpu_gaussian_blur_conv3(Ib, img, rhs, x, y, z); - +void ivote3(float* center, float* img, float sigma[], float phi, float d_phi, unsigned int r[], + int iter, float t, unsigned int conn[], unsigned int x, unsigned int y, unsigned int z){ + + + // compute the number of bytes in the input data unsigned int bytes = x * y * z * sizeof(float); + + //assign memory on gpu for the input data. float* gpuI0; - cudaMalloc(&gpuI0, bytes); - //copy the image data to the GPU + cudaMalloc(&gpuI0, bytes); + + //copy the image data to the GPU. cudaMemcpy(gpuI0, img, bytes, cudaMemcpyHostToDevice); + //call the blurring function from the gpu. gpu_gaussian_blur3(gpuI0, sigma, x, y, z); + cudaDeviceSynchronize(); + + //copy the blur data back to the cpu + //cudaMemcpy(img, gpuI0, bytes, cudaMemcpyDeviceToHost); + + //assign memory on the gpu for the gradient along the X, y, z. float* gpu_grad; cudaMalloc(&gpu_grad, bytes*3); - gpu_gradient3(gpu_grad, gpuI0, x, y, z); + //call the gradient function from the gpu. + gpu_gradient3(gpu_grad, gpuI0, x, y, z); + cudaFree(gpuI0); + + //assign memory on the gpu for the gradient magnitude + float* gpu_mag; + cudaMalloc(&gpu_mag, bytes); + + //call the magnitude function + gpu_mag3(gpu_mag, gpu_grad, x, y, z); + //cudaMemcpy(img, gpu_mag, bytes, cudaMemcpyDeviceToHost); + //assign memory on the gpu for the vote. + float* gpu_vote; + cudaMalloc(&gpu_vote, bytes); - //copy the image data back to the cpu - cudaMemcpy(img, gpuI0, x*y*z*sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(out, gpu_grad, bytes*3, cudaMemcpyDeviceToHost); + float cos_phi = cos(phi); + + //call the vote function. + for (int i = 0; i < iter; i++){ + + gpu_vote3(gpu_vote, gpu_grad, gpu_mag, cos_phi, r, x, y, z); + cudaDeviceSynchronize(); + if (i==0) + cudaMemcpy(img, gpu_vote, bytes, cudaMemcpyDeviceToHost); + + if (phi >= d_phi){ + gpu_update_dir3(gpu_grad, gpu_vote, cos_phi, r, x, y, z); + cudaDeviceSynchronize(); + phi = phi - d_phi; + cos_phi = cos(phi); + } + + } - cudaFree(gpuI0); cudaFree(gpu_grad); + cudaFree(gpu_mag); + //cudaMemcpy(center, gpu_vote, bytes, cudaMemcpyDeviceToHost); + + //allocate space on the gpu for the final detected cells. + float* gpu_output; + cudaMalloc(&gpu_output, bytes); + + //call the local max function + gpu_local_max3(gpu_output, gpu_vote, t, conn, x, y, z); + + //copy the final result to the cpu. + cudaMemcpy(center, gpu_output, bytes, cudaMemcpyDeviceToHost); + + + cudaFree(gpu_vote); + cudaFree(gpu_output); + } \ No newline at end of file diff --git a/cpp/gaussian_blur3.cuh b/cpp/gaussian_blur3.cuh index 465dd17..f42e8d4 100644 --- a/cpp/gaussian_blur3.cuh +++ b/cpp/gaussian_blur3.cuh @@ -6,6 +6,7 @@ #include #include #include +#include "cuda_fp16.h" #define pi 3.14159 @@ -13,14 +14,16 @@ template __global__ void blur_x(T* out, T* in, T sigma, unsigned int x, unsigned int y, unsigned int z){ - //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - int zi = blockIdx.z * blockDim.z + threadIdx.z; + //find the grid size along y + int grid_y = y / blockDim.y; + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; - // calculate the kernel size + // calculate the kernel size int k_size = sigma * 4; //if the current pixel is outside of the image @@ -28,37 +31,26 @@ return; - int gx, gy, gz, gi; + int gx, gi; T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; - + T sigma_sq = 2 * sigma * sigma; + T a = 1.0 / (sigma * sqrt(2 * pi)); //for each element of the kernel for(int ki = -k_size; ki <= k_size; ki++){ //calculate the gaussian value - G = (1.0 / (sigma * sqrt(2 * pi))) * exp(-(ki*ki) / (2*sigma*sigma)); - - - + G = a * exp(-(ki*ki) / (sigma_sq)); //calculate the global coordinates for this point in the kernel - gx = xi + ki; - gy = yi; - gz = zi; - - //make sure we are inside the bounds of the image - if(gx >= 0 && gx < x ){ - gi = gz * x * y + gy * x + gx; - //perform the integration - sum += G * in[gi]; - } - + gx = (xi + ki) % x; + gi = zi * x * y + yi * x + gx; + sum += G * in[gi]; } //output the result to global memory out[i] = sum; - } @@ -67,11 +59,14 @@ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - int zi = blockIdx.z * blockDim.z + threadIdx.z; + //find the grid size along y + int grid_y = y / blockDim.y; + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; - - // calculate the kernel size + + // calculate the kernel size int k_size = sigma * 4; //if the current pixel is outside of the image @@ -79,34 +74,25 @@ return; - int gx, gy, gz, gi; + int gy, gi; T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; - + T sigma_sq = 2 * sigma * sigma; + T a = 1.0 / (sigma * sqrt(2 * pi)); //for each element of the kernel for(int ki = -k_size; ki <= k_size; ki++){ //calculate the gaussian value - G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma)); - - - + G = a * exp(-(ki*ki) / sigma_sq); //calculate the global coordinates for this point in the kernel - gx = xi; - gy = yi + ki; - gz = zi; - - //make sure we are inside the bounds of the image - if(gx >= 0 && gy >= 0 && gz >= 0 && gx < x && gy < y && gz < z){ - gi = gz * x * y + gy * x + gx; - //perform the integration - sum += G * in[gi]; - } - + gy = (yi + ki ) % y; + gi = zi * x * y + gy * x + xi; + sum += G * in[gi]; } + //output the result to global memory out[i] = sum; } @@ -116,11 +102,14 @@ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - int zi = blockIdx.z * blockDim.z + threadIdx.z; + //find the grid size along y + int grid_y = y / blockDim.y; + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; - - // calculate the kernel size + + // calculate the kernel size int k_size = sigma * 4; //if the current pixel is outside of the image @@ -128,32 +117,22 @@ return; - int gx, gy, gz, gi; + int gz, gi; T G; T sum = 0; //running weighted sum across the kernel out[i] = 0; - + T sigma_sq = 2 * sigma * sigma; + T a = 1.0 / (sigma * sqrt(2 * pi)); //for each element of the kernel for(int ki = -k_size; ki <= k_size; ki++){ //calculate the gaussian value - G = 1.0 / (sigma * sqrt(2 * pi)) * exp(-(ki*ki) / (2*sigma*sigma)); - - - + G = a * exp(-(ki*ki) / sigma_sq); //calculate the global coordinates for this point in the kernel - gx = xi; - gy = yi; - gz = zi + ki; - - //make sure we are inside the bounds of the image - if(gx >= 0 && gy >= 0 && gz >= 0 && gx < x && gy < y && gz < z){ - gi = gz * x * y + gy * x + gx; - //perform the integration - sum += G * in[gi]; - } - + gz = (zi + ki) % z; + gi = gz * x * y + yi * x + xi; + sum += G * in[gi]; } //output the result to global memory @@ -161,15 +140,15 @@ } template - void gpu_gaussian_blur3(T* image, T sigma, unsigned int x, unsigned int y, unsigned int z){ + void gpu_gaussian_blur3(T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ //get the number of pixels in the image unsigned int pixels = x * y * z; unsigned int bytes = sizeof(T) * pixels; int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(512, 1, 1); - dim3 blocks(x / threads.x + 1, y, z); + dim3 threads(sqrt (max_threads),sqrt (max_threads)); + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); //allocate temporary space on the GPU T* gpuIb_x; @@ -179,26 +158,25 @@ T* gpuIb_y; cudaMalloc(&gpuIb_y, bytes); - // blur the original image in the x direction - blur_x <<< blocks, threads >>>(gpuIb_x, image, sigma, x, y, z); + // blur the original image along the x direction + blur_x <<< blocks, threads >>>(gpuIb_x, image, sigma[0], x, y, z); - // blur the original image in the y direction - blur_y <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma, x, y, z); + // blur the x-blurred image along the y direction + blur_y <<< blocks, threads >>>(gpuIb_y, gpuIb_x, sigma[1], x, y, z); - // blur the original image in the z direction - blur_z <<< blocks, threads >>>(image, gpuIb_y, sigma, x, y, z); + // blur the xy-blurred image along the z direction + blur_z <<< blocks, threads >>>(image, gpuIb_y, sigma[2], x, y, z); //cudaMemcpy(image, gpuIb_y, bytes, cudaMemcpyDeviceToDevice); cudaFree(gpuIb_x); cudaFree(gpuIb_y); - } /// Applies a Gaussian blur to a 2D image stored on the CPU template - void cpu_gaussian_blur3(T* blur, T* image, T sigma, unsigned int x, unsigned int y, unsigned int z){ + void cpu_gaussian_blur3(T* blur, T* image, T sigma[], unsigned int x, unsigned int y, unsigned int z){ //get the number of pixels in the image diff --git a/cpp/gradient3.cuh b/cpp/gradient3.cuh index 3c7f018..bcdf4e3 100644 --- a/cpp/gradient3.cuh +++ b/cpp/gradient3.cuh @@ -7,23 +7,30 @@ #include template -__global__ void gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigned int z){ +__global__ void gradient3(T* out, T* in, int x, int y, int z){ //calculate x,y,z coordinates for this thread int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - int zi = blockIdx.z * blockDim.z + threadIdx.z; + //find the grid size along y + int grid_y = y / blockDim.y; + int blockidx_y = blockIdx.y % grid_y; + int yi = blockidx_y * blockDim.y + threadIdx.y; + int zi = blockIdx.y / grid_y; int i = zi * x * y + yi * x + xi; //return if the pixel is outside of the image - if(xi >= x || yi >= y || zi>=z) - return; + if(xi >= x || yi >= y || zi>=z) return; //calculate indices for the forward difference int i_xp = zi * x * y + yi * x + (xi + 1); int i_yp = zi * x * y + (yi + 1) * x + xi; int i_zp = (zi + 1) * x * y + yi * x + xi; + //calculate indices for the backward difference + int i_xn = zi * x * y + yi * x + (xi - 1); + int i_yn = zi * x * y + (yi - 1) * x + xi; + int i_zn = (zi - 1) * x * y + yi * x + xi; + //use forward differences if a coordinate is zero if(xi == 0) out[i * 3 + 0] = in[i_xp] - in[i]; @@ -32,11 +39,6 @@ __global__ void gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigne if (zi==0) out[i * 3 + 2] = in[i_zp] - in[i]; - //calculate indices for the backward difference - int i_xn = zi * x * y + yi * x + (xi - 1); - int i_yn = zi * x * y + (yi - 1) * x + xi; - int i_zn = (zi - 1) * x * y + yi * x + xi; - //use backward differences if the coordinate is at the maximum edge if(xi == x-1) out[i * 3 + 0] = in[i] - in[i_xn]; @@ -48,11 +50,10 @@ __global__ void gradient3(T* out, T* in, unsigned int x, unsigned int y, unsigne //otherwise use central differences if(xi > 0 && xi < x-1) out[i * 3 + 0] = (in[i_xp] - in[i_xn]) / 2; - if(yi > 0 && yi < y-1) out[i * 3 + 1] = (in[i_yp] - in[i_yn]) / 2; if(zi > 0 && zi < z-1) - out[i * 3 + 1] = (in[i_zp] - in[i_zn]) / 2; + out[i * 3 + 2] = (in[i_zp] - in[i_zn]) / 2; } @@ -60,12 +61,10 @@ template void gpu_gradient3(T* gpuGrad, T* gpuI, unsigned int x, unsigned int y, unsigned int z){ - //get the number of pixels in the image - unsigned int pixels = x * y * z; - + int max_threads = stim::maxThreadsPerBlock(); - dim3 threads(512, 1, 1); - dim3 blocks(x / threads.x + 1, y, z); + dim3 threads(sqrt (max_threads),sqrt (max_threads)); + dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z); //call the GPU kernel to determine the gradient gradient3 <<< blocks, threads >>>(gpuGrad, gpuI, x, y, z); diff --git a/cpp/main.cpp b/cpp/main.cpp index ad2d9d4..65c74e1 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -1,6 +1,5 @@ #include #include -//#include #include #include #include @@ -8,85 +7,126 @@ #include #include #include - +#include #define pi 3.14159 -void ivote3(float* out, float* img, float std, unsigned int x, unsigned int y, unsigned int z); - -int main(){ - - float sigma =3; - - std::string filename = "nissl-float-512.512.512.vol"; - unsigned int x = 512; - unsigned int y = 512; - unsigned int z = 512; +void ivote3(float* center, float* img, float std[], float phi, float d_phi, unsigned int r[], int iter, float t, unsigned int conn[], + unsigned int x, unsigned int y, unsigned int z); + +void invert_data(float* cpuI, unsigned int x, unsigned int y, unsigned int z){ + for(int ix = 0; ix < x; ix++){ + for (int iy = 0; iy < y; iy++){ + for (int iz = 0; iz < z; iz++){ + int idx = iz * x * y + iy * x + ix; + cpuI[idx] = 255 - cpuI[idx]; + } + } + } + } + +int main(int argc, char** argv){ + + //output advertisement + std::cout< I; - unsigned int x =512; - unsigned int y =512; - unsigned int z =512; - unsigned int bytes = x*y*z* sizeof(float); + //allocate space on the cpu for the output result + float* cpu_out = (float*) malloc(bytes); - I.read("nissl-float-512.512.512.vol", 512, 512, 512, 1, 0); - //float test = I.get(128, 128, 128); - //std::cout<< test<(I.data(), "after_loading.bmp", 512, 512); - - float* cpuI = (float*) malloc(bytes); + // call the ivote function + ivote3(cpu_out, cpuI, sigma, phi, d_phi, r, iter, t, conn, x, y, z); - memcpy (cpuI, ptr0, bytes); - - - std::ofstream infile ("original-2.vol"); - infile.write((char*)cpuI, bytes); - infile.close();*/ - //stim::cpu2image(cpuI, "00-cpuI.bmp", 512, 1024); + //write the blurred file from the cpuI. + std::ofstream fblur("output/test0.vol", std::ofstream::out | std::ofstream::binary); + fblur.write((char*)cpuI, bytes); + fblur.close(); + + //write the output file. + std::ofstream fo("output/" + OutName.str(), std::ofstream::out | std::ofstream::binary); + fo.write((char*)cpu_out, bytes); + fo.close(); - //std::fstream file2; - //file2.write((char*)cpuI, 512*512*512*sizeof(unsigned char)); - //I.write("data\\volume1.raw"); - //test = ptr0[128*128*128]; - //std::cout<< test<