#ifndef STIM_CUDA_SET_RMAX_H
#define STIM_CUDA_SET_RMAX_H

// NOTE(review): the targets of the five original #include directives were
// missing from the reviewed text. The project headers below are reconstructed
// (stim::maxThreadsPerBlock() is expected to come from the cudatools headers)
// -- TODO confirm against the repository before merging.
#include <cmath>
#include <cstddef>
#include <iostream>
#include <cuda.h>
#include <stim/cuda/cudatools.h>
#include <stim/cuda/sharedmem.cuh>
#include <stim/cuda/cudatools/error.h>

/// Kernel: writes the triple (rx, ry, rz) into every voxel of an x*y*z volume.
///
/// Memory layout: gpu_r holds 3 interleaved values per voxel; voxel (xi, yi, zi)
/// starts at element 3 * (zi*x*y + yi*x + xi).
///
/// Expected launch layout (this is what gpu_set_rmax() uses):
///   blockDim = (bx, by)
///   gridDim  = (ceil(x / bx), ceil(y / by) * z)
/// The y and z block coordinates are both packed into blockIdx.y, so the
/// kernel and the host MUST agree on the number of blocks per z-slab
/// (ceil(y / by)).
///
/// @param gpu_r          device pointer to 3*x*y*z elements of type T
/// @param rx, ry, rz     values stored in components 0, 1 and 2 of each voxel
/// @param x, y, z        volume dimensions in voxels
template<typename T>
__global__ void cuda_set_rmax(T* gpu_r, int rx, int ry, int rz, int x, int y, int z){

	//nothing to do for an empty volume (also protects the division below)
	if(x <= 0 || y <= 0 || z <= 0) return;

	//number of blocks that cover one z-slab along y (ceiling division, so it
	//is never zero and matches the grid built by gpu_set_rmax)
	const unsigned int grid_y = ((unsigned int)y + blockDim.y - 1) / blockDim.y;

	//calculate x,y,z coordinates for this thread
	int xi = blockIdx.x * blockDim.x + threadIdx.x;
	int yi = (blockIdx.y % grid_y) * blockDim.y + threadIdx.y;
	int zi = blockIdx.y / grid_y;

	//threads in the padding of the grid have no voxel to write
	if(xi >= x || yi >= y || zi >= z) return;

	//linear voxel index, computed in size_t so large volumes do not overflow
	const size_t i = ((size_t)zi * (size_t)y + (size_t)yi) * (size_t)x + (size_t)xi;

	gpu_r[i * 3 + 0] = rx;
	gpu_r[i * 3 + 1] = ry;
	gpu_r[i * 3 + 2] = rz;
}

/// Fills a device-resident volume with the triple r[0], r[1], r[2].
///
/// A launch failure is reported on std::cerr; the function still returns
/// normally. The launch is asynchronous: execution errors surface at the
/// caller's next synchronizing CUDA call.
///
/// NOTE: the y and z extents share gridDim.y, so ceil(y / block side) * z must
/// stay within the device's maximum y grid dimension.
///
/// @param gpu_r     device pointer to 3*x*y*z elements of type T
/// @param r         host array of at least 3 values
/// @param x, y, z   volume dimensions in voxels
template<typename T>
void gpu_set_rmax(T* gpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

	//an empty volume would otherwise request a zero-sized grid
	if(x == 0 || y == 0 || z == 0) return;

	//use the largest square block that fits in the per-block thread limit
	unsigned int max_threads = stim::maxThreadsPerBlock();
	unsigned int side = (unsigned int)std::sqrt((double)max_threads);
	if(side == 0) side = 1;
	dim3 threads(side, side);

	//ceiling division along x and y; the z-slabs are stacked along grid y
	dim3 blocks((x + threads.x - 1) / threads.x,
	            ((y + threads.y - 1) / threads.y) * z);

	//call the kernel to fill the volume
	cuda_set_rmax<T> <<< blocks, threads >>>(gpu_r, r[0], r[1], r[2], x, y, z);

	//kernel launches do not return a status; query it explicitly
	cudaError_t err = cudaGetLastError();
	if(err != cudaSuccess)
		std::cerr << "gpu_set_rmax: kernel launch failed: " << cudaGetErrorString(err) << std::endl;
}

/// Fills a host-resident volume with the triple r[0], r[1], r[2] by staging it
/// through a temporary device buffer.
///
/// CUDA failures are reported on std::cerr. If the allocation or the upload
/// fails, cpu_r is left untouched.
///
/// @param cpu_r     host pointer to 3*x*y*z elements of type T
/// @param r         host array of at least 3 values
/// @param x, y, z   volume dimensions in voxels
template<typename T>
void cpu_set_rmax(T* cpu_r, unsigned int r[], unsigned int x, unsigned int y, unsigned int z){

	//calculate the number of bytes in the array (3 components per voxel);
	//size_t arithmetic avoids 32-bit overflow for large volumes
	const size_t bytes = (size_t)x * (size_t)y * (size_t)z * 3 * sizeof(T);
	if(bytes == 0) return;

	//allocate space on the GPU for the rmax
	T* gpu_r = NULL;
	cudaError_t err = cudaMalloc(&gpu_r, bytes);
	if(err != cudaSuccess){
		std::cerr << "cpu_set_rmax: cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
		return;
	}

	err = cudaMemcpy(gpu_r, cpu_r, bytes, cudaMemcpyHostToDevice);
	if(err == cudaSuccess){
		//call the GPU version of the fill function
		gpu_set_rmax<T>(gpu_r, r, x, y, z);

		//copy the result back to the CPU (blocks until the kernel has finished)
		err = cudaMemcpy(cpu_r, gpu_r, bytes, cudaMemcpyDeviceToHost);
	}
	if(err != cudaSuccess)
		std::cerr << "cpu_set_rmax: CUDA error: " << cudaGetErrorString(err) << std::endl;

	//free allocated memory
	cudaFree(gpu_r);
}

#endif