diff --git a/stim/envi/binary.h b/stim/envi/binary.h index aeab5b5..62138ce 100644 --- a/stim/envi/binary.h +++ b/stim/envi/binary.h @@ -15,6 +15,12 @@ #include #endif +#ifdef CUDA_FOUND +//CUDA externs +void gpu_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize); +#include +#endif + namespace stim{ /// This class calculates the optimal setting for independent parameter b (batch size) for @@ -638,10 +644,27 @@ public: // permutes a block of data from the current interleave to the interleave specified (re-arranged dimensions to the order specified by [d0, d1, d2]) size_t permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){ - auto t0 = std::chrono::high_resolution_clock::now(); + std::chrono::high_resolution_clock::time_point t0, t1; + t0 = std::chrono::high_resolution_clock::now(); + +#ifdef CUDA_FOUND + T* gpu_src; + HANDLE_ERROR( cudaMalloc(&gpu_src, sx*sy*sz*sizeof(T)) ); + HANDLE_ERROR( cudaMemcpy(gpu_src, src, sx*sy*sz*sizeof(T), cudaMemcpyHostToDevice) ); + T* gpu_dest; + HANDLE_ERROR( cudaMalloc(&gpu_dest, sx*sy*sz*sizeof(T)) ); + gpu_permute((char*)gpu_dest, (char*)gpu_src, sx, sy, sz, d0, d1, d2, sizeof(T)); + HANDLE_ERROR( cudaMemcpy(dest, gpu_dest, sx*sy*sz*sizeof(T), cudaMemcpyDeviceToHost) ); + HANDLE_ERROR( cudaFree(gpu_src) ); + HANDLE_ERROR( cudaFree(gpu_dest) ); + t1 = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(t1-t0).count(); + +#endif + size_t d[3] = {d0, d1, d2}; size_t s[3] = {sx, sy, sz}; - size_t p[3];// = {x, y, z}; + size_t p[3]; if(d[0] == 0 && d[1] == 1 && d[2] == 2){ //this isn't actually a permute - just copy the data @@ -680,7 +703,7 @@ public: } } } - auto t1 = std::chrono::high_resolution_clock::now(); + t1 = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(t1-t0).count(); } -- libgit2 0.21.4