#ifndef STIM_CUDA_ARRAY_CART2POLAR_H #define STIM_CUDA_ARRAY_CART2POLAR_H namespace stim{ namespace cuda{ template __global__ void cuda_cart2polar(T* a, size_t x, size_t y, float rotation){ // calculate the 2D coordinates for this current thread. size_t xi = blockIdx.x * blockDim.x + threadIdx.x; size_t yi = blockIdx.y * blockDim.y + threadIdx.y; // convert 2D coordinates to 1D size_t i = yi * x + xi; if(xi >= x|| yi >= y) return; float xl = a[i * 2 + 0]; float yl = a[i * 2 + 1]; float theta = atan2( yl, xl ) ; float r = sqrt(xl * xl + yl * yl); a[i * 2 + 0] = theta + rotation; a[i * 2 + 1] = r; } template void gpu_cart2polar(T* gpuGrad, size_t x, size_t y, float rotation = 0){ unsigned int max_threads = stim::maxThreadsPerBlock(); dim3 threads(max_threads, 1); dim3 blocks(((unsigned int)x/threads.x) + ((unsigned int)x %threads.x == 0 ? 0:1) , (unsigned int)y); //call the kernel to do the multiplication cuda_cart2polar <<< blocks, threads >>>(gpuGrad, x, y, rotation); } template void cpu_cart2polar(T* a, size_t x, size_t y){ //calculate the number of bytes in the array size_t N = x *y; size_t bytes = N * sizeof(T) * 2; //allocate memory on the GPU for the array T* gpuA; HANDLE_ERROR( cudaMalloc(&gpuA, bytes) ); //copy the array to the GPU HANDLE_ERROR( cudaMemcpy(gpuA, a, bytes, cudaMemcpyHostToDevice) ); //call the GPU version of this function gpu_cart2polar(gpuA, x, y); //copy the array back to the CPU HANDLE_ERROR( cudaMemcpy(a, gpuA, bytes, cudaMemcpyDeviceToHost) ); //free allocated memory cudaFree(gpuA); } } } #endif