//make sure that this header file is only loaded once #ifndef RTS_BINARY_H #define RTS_BINARY_H #include #include #include #include #include #include #ifdef _WIN32 #include #else #include #endif #ifdef USE_CUDA //CUDA externs void gpu_permute(char* dest, char* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2, size_t typesize); #include #endif namespace stim{ /// This class calculates the optimal setting for independent parameter b (batch size) for /// minimizing the dependent parameter bps (bytes per second) class stream_optimizer{ protected: size_t Bps[2]; //bytes per second for the previous batch size_t interval_B; //number of bytes processed this interval size_t interval_ms; //number of milliseconds spent in the current interval size_t n[2]; //current batch size (in bytes) size_t h; //spacing used for finite difference calculations size_t dn; //delta value (in bytes) for setting the batch size (minimum change in batch parameter) size_t maxn; //maximum value for the batch size double alpha; //alpha value controls the factor of the gradient that is used to calculate the next point (speed of convergence) bool sample_step; //calculating the derivative (this class alternates between calculating dBps and B) bool forward_diff; //evaluate the derivative using forward differences size_t window_ms; //size of the interval (in milliseconds) integrated to get a reliable bps value // This function rounds x to the nearest value within dB size_t round_limit(double n0){ if(n0 < 0) return dn; //if n0 is less than zero, return the lowest possible n size_t new_n = (size_t)(n0 + 0.5); //now n0 must be positive, so round it to the nearest integer if(new_n > maxn) new_n = maxn; //limit the returned size of x to within the specified bounds size_t lowest = new_n / dn; size_t highest = lowest + dn; size_t diff[2] = {new_n - lowest, highest - new_n}; //calculate the two differences if(diff[0] < diff[1]) return lowest; return highest; } public: //constructor 
initializes a stream optimizer stream_optimizer(size_t min_batch_size, size_t max_batch_size, double a = 0.003, size_t probe_step = 5, size_t window = 2000){ //Bps = 0; //initialize to zero bytes per second processed Bps[0] = Bps[1] = 0; //initialize the bits per second to 0 interval_B = 0; //zero bytes have been processed at initialization interval_ms = 0; //no time has been spent on the batch so far dn = min_batch_size; //set the minimum batch size as the minimum change in batch size maxn = max_batch_size; //set the maximum batch size n[0] = max_batch_size; //set B h = (max_batch_size / min_batch_size) / probe_step * dn; std::cout<<"h = "< r, unsigned long long h = 0, stim::iotype io = stim::io_in){ for(unsigned long long i = 0; i < D; i++) //set the dimensions of the binary file object R[i] = r[i]; header = h; //save the header size if(!open_file(filename), io) return false; //open the binary file //reset(); return test_file_size(); }
	// NOTE(review): the line above is reproduced byte-for-byte as found. Text is missing
	// after `std::cout<<"h = "<`, so the rest of the stream_optimizer constructor and
	// everything up to the parameter list of a file-opening method cannot be recovered here.
	// NOTE(review): in that fragment, `if(!open_file(filename), io)` uses the comma operator,
	// so the branch tests `io` and not the result of open_file. Presumably
	// `open_file(filename, io)` was intended - confirm once the full definition is restored.

	/// Reports whether the file stream held by this object is open.
	/// @return the result of file.is_open()
	bool is_open(){
		return file.is_open();
	}

	/// Creates a new binary file for streaming
	/// The file is filled with r[0] * r[1] * r[2] zero-valued elements of type T, closed, and then
	/// opened through open_file(). This function stores the offset but writes no header bytes itself.
	/// @param filename is the name of the binary file to be created
	/// @param r is a STIM vector specifying the size of the file along each dimension
	/// @param offset specifies how many bytes to offset the file (used to leave room for a header)
	/// @return false if the file cannot be created, written or opened; otherwise the result of test_file_size()
	// NOTE(review): `vec` appears without a template argument in this copy of the file;
	// the parameter list is kept exactly as found.
	bool create(std::string filename, vec r, unsigned long long offset = 0){

		for(unsigned long long i = 0; i < D; i++)			//set the dimensions of the binary file object
			R[i] = r[i];
		header = offset;									//save the header size

		{
			std::ofstream target(filename.c_str(), std::ios::binary);
			if(!target.is_open()) return false;				//the file could not be created

			const T zero = 0;								//value used to initialize the binary file
			const auto count = r[0] * r[1] * r[2];			//number of elements to write
			for(unsigned long long i = 0; i < count; i++)
				target.write((const char*)(&zero), sizeof(T));

			target.close();									//flush everything to disk before the file is reopened and measured
			if(!target) return false;						//a write or the final flush failed
		}

		if(!open_file(filename)) return false;				//open the binary file

		return test_file_size();
	}

	/// Writes a single page of data to disk. A page consists of a sequence of data of size R[0] * R[1] * ... * R[D-1].
/// @param p is a pointer to the data to be written /// @param page is the page number (index of the highest-numbered dimension) bool write_page( T * p, unsigned long long page){ if(p == NULL){ std::cout<<"ERROR: unable to write into file, empty pointer"<= R[2]){ //make sure the bank number is right std::cout<<"ERROR: page out of range"< R[0] * R[1]){ //make sure the sample and line number is right std::cout<<"ERROR: sample or line out of range in "<<__FILE__<<" (line "<<__LINE__<<")"<= R[0] || y >= R[1]){ //make sure the sample and line number is right std::cout<<"ERROR: sample or line out of range"<= R[1] || z >= R[2] ){ std::cout<<"ERROR: sample ("<= R[0] || z >= R[2] ){ std::cout<<"ERROR: sample or line out of range in "<<__FILE__<<" (line "<<__LINE__<<")"<= R[0]){ //make sure the number is within the possible range std::cout<<"ERROR: sample or line out of range in "<<__FILE__<<" (line "<<__LINE__<<")"<= R[1]){ //make sure the bank number is right std::cout<<"ERROR read_plane_1: page out of range"<= R[0] * R[1] * R[2]){ std::cout<<"ERROR read_pixel: n is out of range"<= R[0] || y < 0 || y >= R[1] || z < 0 || z > R[2]){ std::cout<<"ERROR read_pixel: (x,y,z) is out of range"<(t1-t0).count(); } // permutes a block of data from the current interleave to the interleave specified (re-arranged dimensions to the order specified by [d0, d1, d2]) size_t permute(T* dest, T* src, size_t sx, size_t sy, size_t sz, size_t d0, size_t d1, size_t d2){ std::chrono::high_resolution_clock::time_point t0, t1; t0 = std::chrono::high_resolution_clock::now(); #ifdef USE_CUDA T* gpu_src; HANDLE_ERROR( cudaMalloc(&gpu_src, sx*sy*sz*sizeof(T)) ); HANDLE_ERROR( cudaMemcpy(gpu_src, src, sx*sy*sz*sizeof(T), cudaMemcpyHostToDevice) ); T* gpu_dest; HANDLE_ERROR( cudaMalloc(&gpu_dest, sx*sy*sz*sizeof(T)) ); gpu_permute((char*)gpu_dest, (char*)gpu_src, sx, sy, sz, d0, d1, d2, sizeof(T)); HANDLE_ERROR( cudaMemcpy(dest, gpu_dest, sx*sy*sz*sizeof(T), cudaMemcpyDeviceToHost) ); HANDLE_ERROR( 
cudaFree(gpu_src) ); HANDLE_ERROR( cudaFree(gpu_dest) ); t1 = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(t1-t0).count(); #endif size_t d[3] = {d0, d1, d2}; size_t s[3] = {sx, sy, sz}; size_t p[3]; if(d[0] == 0 && d[1] == 1 && d[2] == 2){ //this isn't actually a permute - just copy the data memcpy(dest, src, sizeof(T) * sx * sy * sz); } else if(d[0] == 0){ //the individual lines are contiguous, so you can memcpy line-by-line size_t y, z; size_t src_idx, dest_idx; size_t x_bytes = sizeof(T) * sx; for(z = 0; z < sz; z++){ p[2] = z; for(y = 0; y < sy; y++){ p[1] = y; src_idx = z * sx * sy + y * sx; dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]]; memcpy(dest + dest_idx, src + src_idx, x_bytes); } } } else{ //loop through every damn point size_t x, y, z; size_t src_idx, dest_idx; size_t src_z, src_y; for(z = 0; z < sz; z++){ p[2] = z; src_z = z * sx * sy; for(y = 0; y < sy; y++){ p[1] = y; src_y = src_z + y * sx; for(x = 0; x < sx; x++){ p[0] = x; src_idx = src_y + x; dest_idx = p[d[2]] * s[d[0]] * s[d[1]] + p[d[1]] * s[d[0]] + p[d[0]]; dest[dest_idx] = src[src_idx]; } } } } t1 = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(t1-t0).count(); } }; } #endif