// NOTE(review): the two original #include targets were lost when this file was
// extracted; if they named project headers, restore them alongside these.
#include <cstddef>
#include <cstdio>

/// Permute the axes of a dense 3D volume of fixed-size elements.
///
/// The source volume is addressed as src[typesize * (z * sx * sy + y * sx + x)].
/// Destination axis k takes its coordinate and extent from source axis d[k],
/// where d = {d0, d1, d2}, so the destination extents are
/// (s[d0], s[d1], s[d2]) with s = {sx, sy, sz}.
///
/// Each thread starts at its own (x, y, z) grid position and advances by the
/// total grid extent along each axis, so the kernel produces a complete result
/// for any launch configuration, including grids smaller than the volume.
///
/// @param dest      output buffer, dereferenced in device code; holds sx*sy*sz*typesize bytes
/// @param src       input buffer, dereferenced in device code; holds sx*sy*sz*typesize bytes
/// @param sx,sy,sz  extents of the source volume along x, y and z (in elements)
/// @param d0,d1,d2  source axis (0, 1 or 2) feeding destination axis 0, 1 and 2;
///                  used as indices into 3-element arrays, so each must be < 3
/// @param typesize  size of one element in bytes
__global__ void kernel_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize){
    const size_t d[3] = {d0, d1, d2};
    const size_t s[3] = {sx, sy, sz};

    // Widen to size_t before multiplying: blockIdx * blockDim is otherwise
    // evaluated in 32 bits and can wrap for very large volumes.
    const size_t x0 = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t y0 = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t z0 = (size_t)blockIdx.z * blockDim.z + threadIdx.z;

    const size_t step_x = (size_t)gridDim.x * blockDim.x;
    const size_t step_y = (size_t)gridDim.y * blockDim.y;
    const size_t step_z = (size_t)gridDim.z * blockDim.z;

    for(size_t zi = z0; zi < sz; zi += step_z){
        for(size_t yi = y0; yi < sy; yi += step_y){
            for(size_t xi = x0; xi < sx; xi += step_x){
                const size_t p[3] = {xi, yi, zi};

                // byte offsets of this element in the source and destination
                const size_t si = typesize * (zi * sx * sy + yi * sx + xi);
                const size_t di = typesize * (p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]] + p[d[0]]);

                // copy one element, byte by byte
                for(size_t i = 0; i < typesize; i++)
                    dest[di + i] = src[si + i];
            }
        }
    }
}

/// Number of blocks needed to cover `extent` elements with blocks of
/// `block_extent` threads, clamped to 65535 (the grid limit in y and z, and in
/// x on older devices). The kernel's strided loops cover whatever the clamp
/// leaves out.
static unsigned int permute_grid_dim(size_t extent, unsigned int block_extent){
    const size_t max_blocks = 65535;
    // ceiling division written so that it cannot overflow
    const size_t needed = extent / block_extent + (extent % block_extent != 0 ? 1 : 0);
    return (unsigned int)(needed < max_blocks ? needed : max_blocks);
}

/// Launch kernel_permute over a 3D volume.
///
/// The launch is issued on the default stream and this function does not
/// synchronize; the caller must synchronize (or perform a blocking copy)
/// before reading `dest`.
///
/// Nothing is launched when the volume is empty (any extent or typesize is 0)
/// or when (d0, d1, d2) is not a permutation of (0, 1, 2); the latter is
/// reported on stderr. A failed launch is also reported on stderr, using
/// cudaPeekAtLastError so the error remains visible to the caller's own checks.
///
/// @param dest      device output buffer of sx*sy*sz*typesize bytes
/// @param src       device input buffer of sx*sy*sz*typesize bytes
/// @param sx,sy,sz  extents of the source volume along x, y and z (in elements)
/// @param d0,d1,d2  source axis feeding destination axis 0, 1 and 2
/// @param typesize  size of one element in bytes
void gpu_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize){

    // nothing to copy
    if(sx == 0 || sy == 0 || sz == 0 || typesize == 0) return;

    // the kernel indexes 3-element arrays with d0..d2 and assumes each source
    // axis is used exactly once
    if(d0 > 2 || d1 > 2 || d2 > 2 || d0 == d1 || d0 == d2 || d1 == d2){
        fprintf(stderr, "gpu_permute: (%lu, %lu, %lu) is not a permutation of (0, 1, 2)\n",
                (unsigned long)d0, (unsigned long)d1, (unsigned long)d2);
        return;
    }

    const unsigned int block_edge = 32;                 // 32 x 32 x 1 = 1024 threads per block
    const dim3 threads(block_edge, block_edge, 1);
    const dim3 blocks(permute_grid_dim(sx, threads.x),
                      permute_grid_dim(sy, threads.y),
                      permute_grid_dim(sz, threads.z));

    kernel_permute<<<blocks, threads>>>(dest, src, sx, sy, sz, d0, d1, d2, typesize);

    // report a bad launch without clearing the error state
    const cudaError_t launch_status = cudaPeekAtLastError();
    if(launch_status != cudaSuccess)
        fprintf(stderr, "gpu_permute: kernel launch failed: %s\n", cudaGetErrorString(launch_status));
}