From 0152908c4bd0ccb722e0c1b2c4052edbfbc18327 Mon Sep 17 00:00:00 2001 From: Huanchen Zhai Date: Sun, 8 Dec 2024 22:59:05 -0500 Subject: [PATCH] alternative scratch --- pyblock2/driver/core.py | 23 +++++++++++++ src/core/allocator.hpp | 2 ++ src/core/fp_codec.hpp | 8 +++-- src/dmrg/moving_environment.hpp | 59 +++++++++++++++++++++++++++++---- src/dmrg/sweep_algorithm.hpp | 20 ++++++++--- src/pybind/pybind_core.hpp | 5 +++ src/pybind/pybind_dmrg.hpp | 6 ++++ 7 files changed, 111 insertions(+), 12 deletions(-) diff --git a/pyblock2/driver/core.py b/pyblock2/driver/core.py index ade60d9b..2207eb89 100644 --- a/pyblock2/driver/core.py +++ b/pyblock2/driver/core.py @@ -559,6 +559,8 @@ def __init__( restart_dir=None, restart_dir_per_sweep=None, mps_dir=None, + scratch_quota=None, + alt_scratch=None, n_threads=None, n_mkl_threads=1, symm_type=SymmetryTypes.SU2, @@ -596,6 +598,11 @@ def __init__( mps_dir : None or str If not None, MPS will be stored in the given directory instead of the scratch directory. Default is None (MPS will be stored in the scratch directory). + scratch_quota : None or int + If not None, will save intermediates to "alt_scratch" when the intermediate size in "scratch" is + above or equal to this number (in bytes). Default is None (no disk quota). + alt_scratch : None or str + Alternative scratch directory. Default is None. n_threads : None or int Number of threads. When MPI is used, this is the number of threads for each MPI processor. Default is None, and the max number of threads available on this node will be used. @@ -636,6 +643,8 @@ def __init__( self._mps_dir = mps_dir self._restart_dir = restart_dir self._restart_dir_per_sweep = restart_dir_per_sweep + self._scratch_quota = 0 if scratch_quota is None else scratch_quota + self._alt_scratch = alt_scratch self.stack_mem = stack_mem self.stack_mem_ratio = stack_mem_ratio self.fp_codec_cutoff = fp_codec_cutoff @@ -770,6 +779,7 @@ def set_symm_type(self, symm_type, reset_frame=True): self.frame.use_main_stack = False self.frame.compressed_sparse_tensor_storage = self.compressed_mps_storage self.frame.minimal_memory_usage = self.min_mpo_mem + self.frame.save_dir_quota = self._scratch_quota if self.mpi: self.mpi = bw.brs.MPICommunicator() @@ -797,6 +807,19 @@ def set_symm_type(self, symm_type, reset_frame=True): self.mpi.barrier() self.frame.mps_dir = self.mps_dir + if self._alt_scratch is not None: + import os + + if self.mpi is None or self.mpi.rank == self.mpi.root: + if not os.path.isdir(self._alt_scratch): + os.makedirs(self._alt_scratch) + if self.mpi is not None: + self.mpi.barrier() + if not os.path.isdir(self._alt_scratch): + os.makedirs(self._alt_scratch) + self.mpi.barrier() + self.frame.alt_save_dir = self._alt_scratch + if self.restart_dir_per_sweep is not None: self.frame.restart_dir_per_sweep = self.restart_dir_per_sweep diff --git a/src/core/allocator.hpp b/src/core/allocator.hpp index 1b4d2683..59c323ca 100644 --- a/src/core/allocator.hpp +++ b/src/core/allocator.hpp @@ -354,6 +354,8 @@ template struct DataFrame { string restart_dir_optimal_mps_per_sweep = ""; //!< If not empty, save the optimal MPS from each sweep to this dir //!< with sweep index as suffix. + size_t save_dir_quota = 0; //!< Disk quota for save_dir (in bytes). + string alt_save_dir = ""; //!< Alternative scartch folder. string prefix = "F", //!< Filename prefix for common scratch files (such as //!< MPS tensors). prefix_distri = diff --git a/src/core/fp_codec.hpp b/src/core/fp_codec.hpp index 9d40e74c..b52a086a 100644 --- a/src/core/fp_codec.hpp +++ b/src/core/fp_codec.hpp @@ -165,7 +165,9 @@ struct FPCodec { mutable size_t ndata = 0, //!< Length of the array of the data that has been compressed. - ncpsd = 0; //!< Length of the array for the compressed data. + ncpsd = 0; //!< Length of the array for the compressed data. + mutable size_t ncpsd_last = + 0; //!< Length of the array written in the last call. size_t chunk_size = 4096; //!< Length of the array elements that should be //!< processed at one time. size_t n_parallel_chunks = @@ -269,6 +271,7 @@ struct FPCodec { T *pdata = new T[(chunk_size + 1) * min(nchunk, n_parallel_chunks)]; vector cplens(n_parallel_chunks); int ntg = threading->activate_global(); + ncpsd_last = 0; #pragma omp parallel num_threads(ntg) for (size_t ib = 0; ib < nbatch; ib++) { size_t n_this_chunk = @@ -288,9 +291,10 @@ struct FPCodec { size_t cplen = cplens[ic]; ofs.write((char *)&cplen, sizeof(cplen)); ofs.write((char *)(pdata + offset + ic), sizeof(T) * cplen); - ncpsd += cplen; + ncpsd_last += cplen; } } + ncpsd += ncpsd_last; delete[] pdata; threading->activate_normal(); ofs.write((char *)tail.c_str(), 4); diff --git a/src/dmrg/moving_environment.hpp b/src/dmrg/moving_environment.hpp index 55e82624..30d67e68 100644 --- a/src/dmrg/moving_environment.hpp +++ b/src/dmrg/moving_environment.hpp @@ -190,6 +190,8 @@ template struct MovingEnvironment { int fuse_center; // Set this to false for non-propagate expectation bool save_environments = true; + mutable map> left_part_files; + mutable map> right_part_files; MovingEnvironment(const shared_ptr> &mpo, const shared_ptr> &bra, const shared_ptr> &ket, @@ -425,6 +427,11 @@ template struct MovingEnvironment { Partition::deallocate_op_infos_notrunc(left_op_infos_notrunc); if (save_environments) { frame_()->save_data(1, get_left_partition_filename(i)); + left_part_files[i] = make_pair(get_left_partition_filename(i), + renormal_mem * sizeof(FL)); + if (frame_()->fp_codec != nullptr) + left_part_files[i].second = + frame_()->fp_codec->ncpsd_last * sizeof(FP); if (save_partition_info) { frame_()->activate(1); envs[i]->save_data(true, get_left_partition_filename(i, true)); @@ -648,6 +655,11 @@ template struct MovingEnvironment { Partition::deallocate_op_infos_notrunc(right_op_infos_notrunc); if (save_environments) { frame_()->save_data(1, get_right_partition_filename(i)); + right_part_files[i] = make_pair(get_right_partition_filename(i), + renormal_mem * sizeof(FL)); + if (frame_()->fp_codec != nullptr) + right_part_files[i].second = + frame_()->fp_codec->ncpsd_last * sizeof(FP); if (save_partition_info) { frame_()->activate(1); envs[i]->save_data(false, @@ -831,17 +843,39 @@ template struct MovingEnvironment { << ".AR." << tag << ".RIGHT." << Parsing::to_string(i); return ss.str(); } + size_t get_used_save_dir_size() const { + size_t used = 0; + const string xdir = frame_()->save_dir + "/"; + for (const auto &p : left_part_files) + if (p.second.first.rfind(xdir, 0) == 0) + used += p.second.second; + for (const auto &p : right_part_files) + if (p.second.first.rfind(xdir, 0) == 0) + used += p.second.second; + return used; + } string get_left_partition_filename(int i, bool info = false) const { stringstream ss; - ss << frame_()->save_dir << "/" << frame_()->prefix_distri - << ".PART." << (info ? "INFO." : "") << tag << ".LEFT." - << Parsing::to_string(i); + string xdir = frame_()->save_dir; + if (!info && left_part_files.count(i)) + return left_part_files.at(i).first; + else if (!info && frame_()->save_dir_quota != 0 && + get_used_save_dir_size() >= frame_()->save_dir_quota) + xdir = frame_()->alt_save_dir; + ss << xdir << "/" << frame_()->prefix_distri << ".PART." + << (info ? "INFO." : "") << tag << ".LEFT." << Parsing::to_string(i); return ss.str(); } string get_right_partition_filename(int i, bool info = false) const { stringstream ss; - ss << frame_()->save_dir << "/" << frame_()->prefix_distri - << ".PART." << (info ? "INFO." : "") << tag << ".RIGHT." + string xdir = frame_()->save_dir; + if (!info && right_part_files.count(i)) + return right_part_files.at(i).first; + else if (!info && frame_()->save_dir_quota != 0 && + get_used_save_dir_size() >= frame_()->save_dir_quota) + xdir = frame_()->alt_save_dir; + ss << xdir << "/" << frame_()->prefix_distri << ".PART." + << (info ? "INFO." : "") << tag << ".RIGHT." << Parsing::to_string(i); return ss.str(); } @@ -1417,10 +1451,15 @@ template struct MovingEnvironment { left_contract_rotate(center); } for (int i = n_sites - 1; i >= center; i--) - if (envs[i]->right != nullptr) + if (envs[i]->right != nullptr) { frame_()->rename_data( get_right_partition_filename(i), get_right_partition_filename(i + 1)); + if (right_part_files.count(i)) + right_part_files[i + 1] = + make_pair(get_right_partition_filename(i + 1), + right_part_files[i].second); + } for (int i = n_sites - 1; i >= 0; i--) { envs[i]->middle.resize(1); if (i > start_site) { @@ -1470,9 +1509,13 @@ template struct MovingEnvironment { string left_data_name = get_left_partition_filename(i, info); if (Parsing::file_exists(left_data_name)) Parsing::remove_file(left_data_name); + if (info == 0 && left_part_files.count(i)) + left_part_files.erase(left_part_files.find(i)); string right_data_name = get_right_partition_filename(i, info); if (Parsing::file_exists(right_data_name)) Parsing::remove_file(right_data_name); + if (info == 0 && right_part_files.count(i)) + right_part_files.erase(right_part_files.find(i)); } } // Move the center site by one @@ -1503,6 +1546,8 @@ template struct MovingEnvironment { string old_data_name = get_right_partition_filename(center - 1); if (Parsing::file_exists(old_data_name)) Parsing::remove_file(old_data_name); + if (right_part_files.count(center - 1)) + right_part_files.erase(center - 1); } } else if (i < center) { if (envs[center]->right != nullptr && @@ -1519,6 +1564,8 @@ template struct MovingEnvironment { string old_data_name = get_left_partition_filename(center + 1); if (Parsing::file_exists(old_data_name)) Parsing::remove_file(old_data_name); + if (left_part_files.count(center + 1)) + left_part_files.erase(center + 1); } } if (para_rule != nullptr) diff --git a/src/dmrg/sweep_algorithm.hpp b/src/dmrg/sweep_algorithm.hpp index 504ed545..f3e05060 100644 --- a/src/dmrg/sweep_algorithm.hpp +++ b/src/dmrg/sweep_algorithm.hpp @@ -3178,8 +3178,7 @@ template struct DMRG { << " | Twrite = " << frame_()->twrite << " | Tfpread = " << frame_()->fpread << " | Tfpwrite = " << frame_()->fpwrite - << " | Tmporead = " << me->mpo->tread - << " | Tasync = " << frame_()->tasync << endl; + << " | Tmporead = " << me->mpo->tread << endl; if (frame_()->fp_codec != nullptr) sout << " | data = " @@ -3187,8 +3186,13 @@ template struct DMRG { frame_()->fp_codec->ndata * sizeof(FPS)) << " | cpsd = " << Parsing::to_size_string( - frame_()->fp_codec->ncpsd * sizeof(FPS)) - << endl; + frame_()->fp_codec->ncpsd * + sizeof(FPS)); + if (frame_()->save_dir_quota != 0) + sout << " | quota-used = " + << Parsing::to_size_string( + me->get_used_save_dir_size()); + sout << " | Tasync = " << frame_()->tasync << endl; sout << " | Trot = " << me->trot << " | Tctr = " << me->tctr << " | Tint = " << me->tint << " | Tmid = " << me->tmid << " | Tdctr = " << me->tdctr @@ -5181,6 +5185,10 @@ template struct Linear { << Parsing::to_size_string( frame_()->fp_codec->ncpsd * sizeof(FPS)); + if (frame_()->save_dir_quota != 0) + cout << " | quota-used = " + << Parsing::to_size_string( + lme->get_used_save_dir_size()); cout << " | Tasync = " << frame_()->tasync << endl; if (lme != nullptr) cout << " | Trot = " << lme->trot @@ -6506,6 +6514,10 @@ struct Expect { << " | cpsd = " << Parsing::to_size_string( frame_()->fp_codec->ncpsd * sizeof(FPS)); + if (frame_()->save_dir_quota != 0) + cout << " | quota-used = " + << Parsing::to_size_string( + me->get_used_save_dir_size()); cout << " | Tasync = " << frame_()->tasync << endl; if (me != nullptr) cout << " | Trot = " << me->trot << " | Tctr = " << me->tctr diff --git a/src/pybind/pybind_core.hpp b/src/pybind/pybind_core.hpp index 3f2879e1..af889185 100644 --- a/src/pybind/pybind_core.hpp +++ b/src/pybind/pybind_core.hpp @@ -71,6 +71,7 @@ PYBIND11_MAKE_OPAQUE(vector>); PYBIND11_MAKE_OPAQUE(unordered_map); PYBIND11_MAKE_OPAQUE(vector>); PYBIND11_MAKE_OPAQUE(unordered_map>); +PYBIND11_MAKE_OPAQUE(map>); PYBIND11_MAKE_OPAQUE(vector>>); PYBIND11_MAKE_OPAQUE(vector>); PYBIND11_MAKE_OPAQUE(vector); @@ -1503,6 +1504,7 @@ template void bind_data(py::module &m) { py::bind_map>(m, "MapIntInt"); py::bind_vector>>(m, "VectorMapIntInt"); py::bind_map>>(m, "MapIntPIntInt"); + py::bind_map>>(m, "MapIntPStrULLInt"); py::bind_vector>>>( m, "VectorMapIntPIntInt"); py::bind_vector>>(m, "VectorPDoubleStr"); @@ -2668,6 +2670,7 @@ template void bind_fl_io(py::module &m, const string &name) { .def(py::init()) .def_readwrite("ndata", &FPCodec::ndata) .def_readwrite("ncpsd", &FPCodec::ncpsd) + .def_readwrite("ncpsd_last", &FPCodec::ncpsd_last) .def("encode", [](FPCodec *self, py::array_t arr) { FL *tmp = new FL[arr.size() + 2]; @@ -2761,6 +2764,8 @@ template void bind_fl_io(py::module &m, const string &name) { &DataFrame::restart_dir_optimal_mps) .def_readwrite("restart_dir_optimal_mps_per_sweep", &DataFrame::restart_dir_optimal_mps_per_sweep) + .def_readwrite("save_dir_quota", &DataFrame::save_dir_quota) + .def_readwrite("alt_save_dir", &DataFrame::alt_save_dir) .def_readwrite("prefix", &DataFrame::prefix) .def_readwrite("prefix_distri", &DataFrame::prefix_distri) .def_readwrite("prefix_can_write", &DataFrame::prefix_can_write) diff --git a/src/pybind/pybind_dmrg.hpp b/src/pybind/pybind_dmrg.hpp index 87575fd7..658435b5 100644 --- a/src/pybind/pybind_dmrg.hpp +++ b/src/pybind/pybind_dmrg.hpp @@ -812,6 +812,10 @@ void bind_fl_moving_environment(py::module &m, const string &name) { &MovingEnvironment::lowmem_numerical_transform) .def_readwrite("save_environments", &MovingEnvironment::save_environments) + .def_readwrite("left_part_files", + &MovingEnvironment::left_part_files) + .def_readwrite("right_part_files", + &MovingEnvironment::right_part_files) .def("left_contract_rotate", &MovingEnvironment::left_contract_rotate) .def("right_contract_rotate", @@ -879,6 +883,8 @@ void bind_fl_moving_environment(py::module &m, const string &name) { &MovingEnvironment::get_middle_archive_filename) .def("get_right_archive_filename", &MovingEnvironment::get_right_archive_filename) + .def("get_used_save_dir_size", + &MovingEnvironment::get_used_save_dir_size) .def("get_left_partition_filename", &MovingEnvironment::get_left_partition_filename) .def("get_right_partition_filename",