#ifndef STIM_CUDA_cpyToshare_H
#define STIM_CUDA_cpyToshare_H

//Cooperatively copies one channel of data from a global-memory image into a flat (1D) destination
//buffer (intended to be shared memory). Every thread of the block must call this function.
//	dest			destination buffer, written at indices [0, n)
//	src				source image, read starting at the element addressed by (x, y, z)
//	x, y, z			start position of the copy inside the source image
//	X				number of elements to copy (used to derive the number of passes)
//	Y				only used in the upper bound X*Y on the destination index
//	threadIdx		index of the calling thread within its block
//	blockDim		dimensions of the block
//	I_x, I_y		width and height of the source image, used to linearize (x, y, z)
//No barrier is issued here; the caller has to synchronize (e.g. __syncthreads()) before reading dest.
//The source index is not checked against the extent of the image.
//NOTE(review): the pass count is derived from X but the early-out tests against X*Y, so for Y > 1 up to
//min((X/tThreads + 1) * tThreads, X*Y) elements are copied, not exactly X. Confirm that callers pass Y == 1.
template<typename T>
__device__ void cpyG2S1D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
						dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//total number of threads available for the copy
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

	//flattened (1D) ID of the current thread within the block
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of passes required for the copy (each pass moves at most tThreads elements)
	const unsigned int I = X/tThreads + 1;

	//linear index of the start position (x, y, z) in the source image
	const unsigned int gstart = z*I_x*I_y + y*I_x + x;

	//exclusive upper bound on the destination index
	const unsigned int sEnd = X*Y;

	for (unsigned int i = 0; i < I; i++){

		//pass i covers destination indices [i * tThreads, (i + 1) * tThreads); this thread handles one of them
		const unsigned int sIdx = i * tThreads + ti;

		//later passes only produce larger indices, so this thread is done
		if (sIdx >= sEnd) return;

		//source and destination advance together from their respective start positions
		dest[sIdx] = src[gstart + sIdx];
	}
}

//Copies one channel of data from global to shared memory as a two-dimensional tile of X*Y elements.
//	dest			destination tile of X*Y elements, stored row by row (row length X)
//	src				source image of I_x by I_y elements per slice
//	x, y, z			position in the source image of the first element of the tile
//	X, Y			width and height of the tile
//	threadIdx		index of the calling thread within its block
//	blockDim		dimensions of the block
//	I_x, I_y		width and height of the source image
//Every thread of the block must call this function. No barrier is issued here; the caller has to
//synchronize (e.g. __syncthreads()) before reading dest.
//NOTE(review): the text of this function was truncated after "if (gx". The bounds test, the global index
//and the zero fill for out-of-range positions below are a reconstruction - confirm against the upstream source.
template<typename T>
__device__ void cpyG2S2D(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
						dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//total number of threads available for the copy
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

	//flattened (1D) ID of the current thread within the block
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of elements in the tile and number of passes required to cover them
	const unsigned int sEnd = X*Y;
	const unsigned int I = sEnd/tThreads + 1;

	//offset of slice z in the source image
	const unsigned int gz1 = z*I_x*I_y;

	for (unsigned int i = 0; i < I; i++){

		//destination index handled by this thread in pass i
		const unsigned int sIdx = i * tThreads + ti;
		if (sIdx >= sEnd) return;

		//position of the element inside the tile
		const unsigned int sy = sIdx/X;
		const unsigned int sx = sIdx - (sy * X);

		//corresponding position in the source image
		const unsigned int gx = x + sx;
		const unsigned int gy = y + sy;

		if (gx < I_x && gy < I_y){
			//copy global to shared
			dest[sIdx] = src[gz1 + gy * I_x + gx];
		}
		else{
			//the tile extends past the image border: pad with zero
			dest[sIdx] = 0;
		}
	}
}

//Cooperatively copies three-channel (interleaved) data from global memory into a flat (1D) destination
//buffer (intended to be shared memory). Every thread of the block must call this function.
//	dest			destination buffer, written at indices [0, n)
//	src				source image with three consecutive values per position
//	x, y, z			start position of the copy inside the source image (the linear index is scaled by 3)
//	X				number of values to copy (used to derive the number of passes)
//	Y				only used in the upper bound X*Y on the destination index
//	threadIdx		index of the calling thread within its block
//	blockDim		dimensions of the block
//	I_x, I_y		width and height of the source image, used to linearize (x, y, z)
//No barrier is issued here; the caller has to synchronize (e.g. __syncthreads()) before reading dest.
//The source index is not checked against the extent of the image.
//NOTE(review): the pass count is derived from X but the early-out tests against X*Y, so for Y > 1 up to
//min((X/tThreads + 1) * tThreads, X*Y) values are copied, not exactly X. Confirm that callers pass Y == 1.
template<typename T>
__device__ void cpyG2S1D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
						dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//total number of threads available for the copy
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

	//flattened (1D) ID of the current thread within the block
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of passes required for the copy (each pass moves at most tThreads values)
	const unsigned int I = X/tThreads + 1;

	//linear index of the start position (x, y, z) in the source image, counted in positions
	const unsigned int gstart = z*I_x*I_y + y*I_x + x;

	//exclusive upper bound on the destination index
	const unsigned int sEnd = X*Y;

	for (unsigned int i = 0; i < I; i++){

		//pass i covers destination indices [i * tThreads, (i + 1) * tThreads); this thread handles one of them
		const unsigned int sIdx = i * tThreads + ti;

		//later passes only produce larger indices, so this thread is done
		if (sIdx >= sEnd) return;

		//three values per position: scale the start position by 3, then advance value by value
		dest[sIdx] = src[gstart*3 + sIdx];
	}
}

//Copies three-channel data from global to shared memory as a two-dimensional tile of X*Y values.
//	dest			destination tile of X*Y values, stored row by row (row length X)
//	src				source image of I_x by I_y positions per slice, three consecutive values per position
//	x, y, z			position in the source image of the first element of the tile
//	X, Y			width (in values; sx/3 below maps a value to its position) and height of the tile
//	threadIdx		index of the calling thread within its block
//	blockDim		dimensions of the block
//	I_x, I_y		width and height of the source image, in positions
//Every thread of the block must call this function. No barrier is issued here; the caller has to
//synchronize (e.g. __syncthreads()) before reading dest.
//NOTE(review): the text of this function was truncated after "if (gx". The bounds test, the interleaved
//global index and the zero fill for out-of-range positions below are a reconstruction - confirm against
//the upstream source.
template<typename T>
__device__ void cpyG2S2D3ch(T* dest,T* src, unsigned int x, unsigned int y, unsigned int z, unsigned int X, unsigned int Y,
						dim3 threadIdx, dim3 blockDim, unsigned int I_x, unsigned int I_y){

	//total number of threads available for the copy
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

	//flattened (1D) ID of the current thread within the block
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of values in the tile and number of passes required to cover them
	const unsigned int sEnd = X*Y;
	const unsigned int I = sEnd/tThreads + 1;

	//offset of slice z in the source image, counted in positions
	const unsigned int gz1 = z*I_x*I_y;

	for (unsigned int i = 0; i < I; i++){

		//destination index handled by this thread in pass i
		const unsigned int sIdx = i * tThreads + ti;
		if (sIdx >= sEnd) return;

		//position of the value inside the tile
		const unsigned int sy = sIdx/X;
		const unsigned int sx = sIdx - (sy * X);

		//corresponding position in the source image (three values share one x position)
		const unsigned int gx = x + sx/3;
		const unsigned int gy = y + sy;

		if (gx < I_x && gy < I_y){
			//copy global to shared: position index scaled by 3, plus the channel of this value
			dest[sIdx] = src[(gz1 + gy * I_x + gx)*3 + (sx % 3)];
		}
		else{
			//the tile extends past the image border: pad with zero
			dest[sIdx] = 0;
		}
	}
}

//Computes the magnitude of X*Y three-component vectors stored in grad and writes the results back into grad.
//	grad			buffer holding the vectors as consecutive triples: vector k occupies grad[3k], grad[3k+1], grad[3k+2];
//					the magnitude of vector k is written to grad[bs + k]
//	bs				offset in grad at which the X*Y magnitudes are stored
//	X, Y			the number of vectors processed is X*Y
//	threadIdx		index of the calling thread within its block
//	blockDim		dimensions of the block
//The arithmetic is carried out in single precision regardless of T.
//Every thread of the block must call this function. No barrier is issued here: the caller has to make sure
//the triples are fully written before the call and has to synchronize (e.g. __syncthreads()) before reading
//the magnitudes. If bs < 3*X*Y the output range overlaps triples that other threads may not have read yet.
template<typename T>
__device__ void mag_share2D(T* grad, unsigned int bs, unsigned int X, unsigned int Y, dim3 threadIdx, dim3 blockDim){

	//total number of threads available
	const unsigned int tThreads = blockDim.x * blockDim.y * blockDim.z;

	//flattened (1D) ID of the current thread within the block
	const unsigned int ti = threadIdx.z * (blockDim.x * blockDim.y) + threadIdx.y * (blockDim.x) + threadIdx.x;

	//number of vectors and number of passes required to cover them
	const unsigned int sEnd = X*Y;
	const unsigned int I = sEnd/tThreads + 1;

	for (unsigned int i = 0; i < I; i++){

		//vector handled by this thread in pass i
		const unsigned int sIdx = i * tThreads + ti;
		if (sIdx >= sEnd) return;

		//load the three components of the vector
		const float gx = grad[sIdx*3];
		const float gy = grad[sIdx*3 + 1];
		const float gz = grad[sIdx*3 + 2];

		//Euclidean length, single-precision square root
		grad[bs + sIdx] = sqrtf(gx*gx + gy*gy + gz*gz);
	}
}

#endif