added GPU support for permutes

David Mayerich
1 parent 5db80c2e
Showing 1 changed file with 26 additions and 3 deletions Show diff stats
stim/envi/binary.h
@@ -15,6 +15,12 @@
 #include <unistd.h>
 #endif
+#ifdef CUDA_FOUND
+//CUDA externs (functions declared here are defined outside this header)
+void gpu_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize);
+#include <stim/cuda/cudatools/error.h>
+#endif
+
 namespace stim{
 /// This class calculates the optimal setting for independent parameter b (batch size) for
@@ -638,10 +644,27 @@ public:
 	// permutes a block of data from the current interleave to the specified interleave (dimensions are re-arranged into the order given by [d0, d1, d2]); returns the elapsed time in milliseconds
 	size_t permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){
-		auto t0 = std::chrono::high_resolution_clock::now();
+		std::chrono::high_resolution_clock::time_point t0, t1;
+		t0 = std::chrono::high_resolution_clock::now();
+
+#ifdef CUDA_FOUND
+		T* gpu_src;
+		HANDLE_ERROR( cudaMalloc(&gpu_src, sx*sy*sz*sizeof(T)) );
+		HANDLE_ERROR( cudaMemcpy(gpu_src, src, sx*sy*sz*sizeof(T), cudaMemcpyHostToDevice) );
+		T* gpu_dest;
+		HANDLE_ERROR( cudaMalloc(&gpu_dest, sx*sy*sz*sizeof(T)) );		
+		gpu_permute((char*)gpu_dest, (char*)gpu_src, sx, sy, sz, d0, d1, d2, sizeof(T));
+		HANDLE_ERROR( cudaMemcpy(dest, gpu_dest, sx*sy*sz*sizeof(T), cudaMemcpyDeviceToHost) );
+		HANDLE_ERROR( cudaFree(gpu_src) );
+		HANDLE_ERROR( cudaFree(gpu_dest) );
+		t1 = std::chrono::high_resolution_clock::now();
+		return std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
+		
+#endif
+		
 		size_t d[3] = {d0, d1, d2};
 		size_t s[3] = {sx, sy, sz};
-		size_t p[3];// = {x, y, z};
+		size_t p[3];
 		if(d[0] == 0 && d[1] == 1 && d[2] == 2){
 			//this isn't actually a permute - just copy the data
@@ -680,7 +703,7 @@ public:
 				}
 			}
 		}
-		auto t1 = std::chrono::high_resolution_clock::now();
+		t1 = std::chrono::high_resolution_clock::now();
 		return std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count();
 	}