Commit a3776b35f90e189eb229c94e78f95281722d8830

Authored by David Mayerich
1 parent e69a42db

added GPU kernel

Showing 1 changed file with 28 additions and 0 deletions   Show diff stats
stim/envi/convert.cu 0 → 100644
  1 +#include <iostream>
  2 +#include <stim/cuda/cudatools/error.h>
  3 +
  4 +__global__ void kernel_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize){
  5 + size_t xi = blockIdx.x * blockDim.x + threadIdx.x;
  6 + size_t yi = blockIdx.y * blockDim.y + threadIdx.y;
  7 + size_t zi = blockIdx.z * blockDim.z + threadIdx.z;
  8 +
  9 + if(xi >= sx || yi >= sy || zi >= sz) return; //return if we are outside the grid
  10 +
  11 + size_t d[3] = {d0, d1, d2};
  12 + size_t s[3] = {sx, sy, sz};
  13 + size_t p[3] = {xi, yi, zi};
  14 +
  15 + size_t si = typesize * (zi * sx * sy + yi * sx + xi);
  16 + size_t di = typesize * (p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]] + p[d[0]]);
  17 +
  18 + for(int i = 0; i < typesize; i++)
  19 + dest[di+i] = src[si + i];
  20 +}
  21 +
  22 +void gpu_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize){
  23 +
  24 + int threads_per_block = 1024;
  25 + dim3 threads(sqrt(threads_per_block), sqrt(threads_per_block), 1);
  26 + dim3 blocks(sx/threads.x + 1, sy/threads.y + 1, sz/threads.z +1);
  27 + kernel_permute<<<blocks, threads>>>(dest, src, sx, sy, sz, d0, d1, d2, typesize);
  28 +}
0 29 \ No newline at end of file
... ...