Commit a3776b35f90e189eb229c94e78f95281722d8830
1 parent
e69a42db
added GPU kernel
Showing
1 changed file
with
28 additions
and
0 deletions
Show diff stats
1 | +#include <iostream> | |
2 | +#include <stim/cuda/cudatools/error.h> | |
3 | + | |
/// Permutes the axes of a 3D array of fixed-size elements, one element per thread.
///
/// The source is read as an sx-by-sy-by-sz array with x varying fastest. The destination
/// is written with source axis d0 varying fastest, then d1, then d2, i.e. its extents are
/// (s[d0], s[d1], s[d2]) where s = (sx, sy, sz).
///
/// Launch layout: a 3D grid in which thread (xi, yi, zi) handles source element
/// (xi, yi, zi). Threads that fall outside the source extents exit immediately, so the
/// grid may be larger than the array.
///
/// @param dest      output buffer, dereferenced on the device; must hold at least
///                  sx * sy * sz * typesize bytes
/// @param src       input buffer, dereferenced on the device; same size as dest
/// @param sx        source extent along x (fastest-varying axis)
/// @param sy        source extent along y
/// @param sz        source extent along z (slowest-varying axis)
/// @param d0        source axis (0, 1 or 2) that becomes the fastest destination axis
/// @param d1        source axis that becomes the middle destination axis
/// @param d2        source axis that becomes the slowest destination axis
/// @param typesize  size of one element in bytes
///
/// If (d0, d1, d2) is not a permutation of (0, 1, 2) the kernel writes nothing.
__global__ void kernel_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize){
    // widen before multiplying so the product cannot wrap in 32-bit arithmetic
    size_t xi = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t yi = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    size_t zi = (size_t)blockIdx.z * blockDim.z + threadIdx.z;

    if(xi >= sx || yi >= sy || zi >= sz) return;        //return if we are outside the grid

    // d0..d2 index the 3-element arrays below; an out-of-range or repeated axis would
    // read past those arrays or compute a destination offset outside the buffer
    if(d0 > 2 || d1 > 2 || d2 > 2 || d0 == d1 || d0 == d2 || d1 == d2) return;

    size_t d[3] = {d0, d1, d2};
    size_t s[3] = {sx, sy, sz};
    size_t p[3] = {xi, yi, zi};

    // byte offset of this element in the source (x fastest)
    size_t si = typesize * (zi * sx * sy + yi * sx + xi);
    // byte offset of the same element in the destination (axis d0 fastest)
    size_t di = typesize * (p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]] + p[d[0]]);

    // copy the element one byte at a time; the counter matches typesize's type
    for(size_t i = 0; i < typesize; i++)
        dest[di + i] = src[si + i];
}
21 | + | |
/// Launches kernel_permute over an sx-by-sy-by-sz array of typesize-byte elements.
///
/// dest and src are handed straight to the kernel, so both must be pointers the device
/// can dereference. The launch uses the default stream and this function does not
/// synchronize with the device afterwards: only launch-time errors are reported here
/// (to std::cerr); faults during kernel execution surface at a later synchronizing call.
///
/// @param dest      output buffer for the permuted array
/// @param src       input buffer holding the source array (x varying fastest)
/// @param sx        source extent along x
/// @param sy        source extent along y
/// @param sz        source extent along z
/// @param d0        source axis (0, 1 or 2) that becomes the fastest destination axis
/// @param d1        source axis that becomes the middle destination axis
/// @param d2        source axis that becomes the slowest destination axis
/// @param typesize  size of one element in bytes
void gpu_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize){

    // nothing to move; also avoids launching a grid with a zero dimension
    if(sx == 0 || sy == 0 || sz == 0 || typesize == 0) return;

    // reject axis orders that the kernel cannot index safely
    if(d0 > 2 || d1 > 2 || d2 > 2 || d0 == d1 || d0 == d2 || d1 == d2){
        std::cerr << "gpu_permute: (d0, d1, d2) must be a permutation of (0, 1, 2)" << std::endl;
        return;
    }

    // 32 x 32 x 1 = 1024 threads per block, written as integer constants rather than
    // derived through a floating-point sqrt
    dim3 threads(32, 32, 1);

    // ceiling division: just enough blocks to cover each extent
    dim3 blocks((unsigned int)((sx + threads.x - 1) / threads.x),
                (unsigned int)((sy + threads.y - 1) / threads.y),
                (unsigned int)((sz + threads.z - 1) / threads.z));

    kernel_permute<<<blocks, threads>>>(dest, src, sx, sy, sz, d0, d1, d2, typesize);

    // a kernel launch returns no status; an invalid configuration (e.g. a grid
    // dimension above the device limit) is only visible through cudaGetLastError()
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess)
        std::cerr << "gpu_permute: kernel launch failed: " << cudaGetErrorString(err) << std::endl;
}
0 | 29 | \ No newline at end of file | ... | ... |