#ifndef STIM_CUDA_MAG3_H
#define STIM_CUDA_MAG3_H

#include <cmath>
#include <cuda_runtime.h>
#include <stim/cuda/cudatools.h>		//assumed to provide stim::maxThreadsPerBlock()

template<typename T>
__global__ void mag3(T* out, T* in, int x, int y, int z){

	//calculate the x coordinate for this thread
	int xi = blockIdx.x * blockDim.x + threadIdx.x;

	//the grid packs one group of block rows per z slice along blockIdx.y,
	//so recover the y block index and the z slice from blockIdx.y
	int grid_y = y / blockDim.y + 1;					//blocks along y per z slice (must match the launch in gpu_mag3)
	int blockidx_y = blockIdx.y % grid_y;
	int yi = blockidx_y * blockDim.y + threadIdx.y;
	int zi = blockIdx.y / grid_y;

	//compute the linear index of this voxel
	int i = zi * x * y + yi * x + xi;

	//return if the thread falls outside the volume
	if(xi >= x || yi >= y || zi >= z) return;

	//load the three vector components (stored interleaved: x, y, z per voxel)
	T xl = in[i * 3 + 0];
	T yl = in[i * 3 + 1];
	T zl = in[i * 3 + 2];

	//store the Euclidean magnitude of the vector
	out[i] = sqrt(xl * xl + yl * yl + zl * zl);
}

template<typename T>
void gpu_mag3(T* gpu_mag, T* gpu_cart, unsigned int x, unsigned int y, unsigned int z){

	//use the largest square 2D block supported by the device
	int max_threads = stim::maxThreadsPerBlock();
	unsigned int side = (unsigned int)sqrt((double)max_threads);
	dim3 threads(side, side);

	//enough blocks to cover the x dimension, and one group of y block rows per z slice
	dim3 blocks(x / threads.x + 1, (y / threads.y + 1) * z);

	//call the GPU kernel to compute the vector magnitudes
	mag3 <<< blocks, threads >>>(gpu_mag, gpu_cart, x, y, z);
}

template<typename T>
void cpu_mag3(T* out, T* in, unsigned int x, unsigned int y, unsigned int z){

	//calculate the number of bytes in the output array (one scalar per voxel)
	unsigned int bytes = x * y * z * sizeof(T);

	//allocate memory on the GPU for the input vectors (three components per voxel) and copy them over
	T* gpu_cart;
	cudaMalloc(&gpu_cart, bytes * 3);
	cudaMemcpy(gpu_cart, in, bytes * 3, cudaMemcpyHostToDevice);

	//allocate memory on the GPU for the magnitude
	T* gpu_mag;
	cudaMalloc(&gpu_mag, bytes);

	//call the GPU version of this function
	gpu_mag3(gpu_mag, gpu_cart, x, y, z);

	//copy the result back to the CPU
	cudaMemcpy(out, gpu_mag, bytes, cudaMemcpyDeviceToHost);

	//free allocated memory
	cudaFree(gpu_cart);
	cudaFree(gpu_mag);
}

#endif
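
/* Usage sketch (an illustration only, not part of the original header): cpu_mag3 expects
   the input as interleaved 3D vectors (x0,y0,z0, x1,y1,z1, ...) and writes one magnitude
   per voxel. The dimensions below are hypothetical, and the caller would need <vector>.

	unsigned int X = 64, Y = 64, Z = 16;
	std::vector<float> cart(X * Y * Z * 3, 1.0f);		//every vector is (1, 1, 1)
	std::vector<float> mag(X * Y * Z);
	cpu_mag3(mag.data(), cart.data(), X, Y, Z);			//each output entry is now sqrt(3)
*/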