From 59f1487f92512d4c0e4e2ae52e8e79c6e1608829 Mon Sep 17 00:00:00 2001 From: daquexian Date: Mon, 26 Apr 2021 23:07:35 +0800 Subject: [PATCH] release tensor by ReleaseTensor instruction (#4737) * release tensor by instructions, update shut_down_util, skip tensor releasing when exiting Signed-off-by: daquexian * Captures shared_ptr instread of raw pointer Co-authored-by: lixinqi Former-commit-id: 5170ffaeafb137f708b33a950a54bc530f0907d5 --- ...nterpreter_util.cpp => shut_down_util.cpp} | 4 ++-- oneflow/core/eager/eager_blob_object.cpp | 2 ++ .../core/framework/instructions_builder.cpp | 17 ++++++++++++++ oneflow/core/framework/instructions_builder.h | 3 +++ oneflow/core/framework/object.h | 4 ++-- ...nterpreter_util.cpp => shut_down_util.cpp} | 23 ++++++++----------- ...on_interpreter_util.h => shut_down_util.h} | 4 ++-- oneflow/core/framework/tensor_impl.cpp | 7 ++++++ oneflow/core/framework/tensor_storage.cpp | 3 ++- 9 files changed, 47 insertions(+), 20 deletions(-) rename oneflow/api/python/framework/{python_interpreter_util.cpp => shut_down_util.cpp} (85%) rename oneflow/core/framework/{python_interpreter_util.cpp => shut_down_util.cpp} (57%) rename oneflow/core/framework/{python_interpreter_util.h => shut_down_util.h} (93%) diff --git a/oneflow/api/python/framework/python_interpreter_util.cpp b/oneflow/api/python/framework/shut_down_util.cpp similarity index 85% rename from oneflow/api/python/framework/python_interpreter_util.cpp rename to oneflow/api/python/framework/shut_down_util.cpp index 74e89267aec..9f0d769ad56 100644 --- a/oneflow/api/python/framework/python_interpreter_util.cpp +++ b/oneflow/api/python/framework/shut_down_util.cpp @@ -15,14 +15,14 @@ limitations under the License. */ #include #include "oneflow/api/python/of_api_registry.h" -#include "oneflow/core/framework/python_interpreter_util.h" +#include "oneflow/core/framework/shut_down_util.h" namespace py = pybind11; namespace oneflow { ONEFLOW_API_PYBIND11_MODULE("", m) { - m.def("SetShuttingDown", []() { return SetShuttingDown().GetOrThrow(); }); + m.def("SetShuttingDown", []() { return SetShuttingDown(); }); } } // namespace oneflow diff --git a/oneflow/core/eager/eager_blob_object.cpp b/oneflow/core/eager/eager_blob_object.cpp index 9bb011481f2..7e0e4b8f298 100644 --- a/oneflow/core/eager/eager_blob_object.cpp +++ b/oneflow/core/eager/eager_blob_object.cpp @@ -17,6 +17,7 @@ limitations under the License. #include "oneflow/core/vm/allocator.h" #include "oneflow/core/job/parallel_desc.h" #include "oneflow/core/framework/to_string.h" +#include "oneflow/core/framework/shut_down_util.h" namespace oneflow { namespace eager { @@ -78,6 +79,7 @@ Maybe EagerBlobObject::TryAllocateBlobBodyMemory(DeviceCtx* device_ctx) { { // reset tensor_buffer_; const auto& Free = [allocator, required_body_bytes](char* dptr) { + if (IsShuttingDown()) { return; } allocator->Deallocate(dptr, required_body_bytes); }; char* dptr = nullptr; diff --git a/oneflow/core/framework/instructions_builder.cpp b/oneflow/core/framework/instructions_builder.cpp index 2b087b672d7..4a48ceedb69 100644 --- a/oneflow/core/framework/instructions_builder.cpp +++ b/oneflow/core/framework/instructions_builder.cpp @@ -34,6 +34,7 @@ limitations under the License. #include "oneflow/core/vm/read_tensor_shape_arg_cb_phy_instr_operand.h" #include "oneflow/core/vm/no_arg_cb_phy_instr_operand.h" #include "oneflow/core/vm/access_blob_arg_cb_phy_instr_operand.h" +#include "oneflow/core/vm/release_tensor_arg_phy_instr_operand.h" #include "oneflow/core/framework/vm_local_dep_object.h" #include "oneflow/core/framework/tensor.h" @@ -888,6 +889,22 @@ Maybe InstructionsBuilder::FeedBlob( return Maybe::Ok(); } +Maybe InstructionsBuilder::ReleaseTensor( + const std::shared_ptr& eager_blob_object, + const std::shared_ptr& parallel_desc) { + std::string instr_name = parallel_desc->device_tag() + ".ReleaseTensor"; + ObjectMsgPtr instruction = ObjectMsgPtr::New(instr_name); + const std::shared_ptr& infer_local_dep_object = + JUST(eager_blob_object->infer_local_dep_object()); + const std::shared_ptr& compute_local_dep_object = + JUST(eager_blob_object->compute_local_dep_object()); + *instruction->mutable_phy_instr_operand() = std::make_shared( + eager_blob_object, infer_local_dep_object, compute_local_dep_object); + instruction->set_parallel_desc_symbol_id(JUST(parallel_desc->symbol_id())); + instruction_list_->EmplaceBack(std::move(instruction.Mutable())); + return Maybe::Ok(); +} + Maybe InstructionsBuilder::AccessBlobByCallback( const std::shared_ptr& tensor, const std::function& callback, const std::string& modifier) { diff --git a/oneflow/core/framework/instructions_builder.h b/oneflow/core/framework/instructions_builder.h index 08a856795f9..a7bac34cac0 100644 --- a/oneflow/core/framework/instructions_builder.h +++ b/oneflow/core/framework/instructions_builder.h @@ -115,6 +115,9 @@ class InstructionsBuilder : public std::enable_shared_from_this& blob_object, const std::shared_ptr& op_arg_parallel_attr); + Maybe ReleaseTensor(const std::shared_ptr& eager_blob_object, + const std::shared_ptr& parallel_desc); + Maybe AccessBlobByCallback(const std::shared_ptr& tensor, const std::function& callback, const std::string& modifier); diff --git a/oneflow/core/framework/object.h b/oneflow/core/framework/object.h index 14aa44985eb..ad39c5c890a 100644 --- a/oneflow/core/framework/object.h +++ b/oneflow/core/framework/object.h @@ -18,7 +18,7 @@ limitations under the License. #include #include "oneflow/core/framework/op_arg_util.h" -#include "oneflow/core/framework/python_interpreter_util.h" +#include "oneflow/core/framework/shut_down_util.h" namespace oneflow { @@ -42,7 +42,7 @@ class BlobObject : public Object { BlobObject(int64_t object_id, const std::shared_ptr& op_arg_parallel_attr, const std::shared_ptr& op_arg_blob_attr); ~BlobObject() override { - if (!(CHECK_JUST(IsShuttingDown()))) { ForceReleaseAll(); } + if (!IsShuttingDown()) { ForceReleaseAll(); } } std::shared_ptr op_arg_parallel_attr() const; diff --git a/oneflow/core/framework/python_interpreter_util.cpp b/oneflow/core/framework/shut_down_util.cpp similarity index 57% rename from oneflow/core/framework/python_interpreter_util.cpp rename to oneflow/core/framework/shut_down_util.cpp index 50b42583b9c..abbfcf5d1da 100644 --- a/oneflow/core/framework/python_interpreter_util.cpp +++ b/oneflow/core/framework/shut_down_util.cpp @@ -13,32 +13,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include "oneflow/core/framework/python_interpreter_util.h" +#include "oneflow/core/framework/shut_down_util.h" namespace oneflow { namespace { -Maybe*> GetShuttingDown() { - static std::vector shutting_down{false}; +std::atomic* GetShuttingDown() { + static std::atomic shutting_down{false}; return &shutting_down; } } // namespace -Maybe IsShuttingDown() { - auto* shutting_down = JUST(GetShuttingDown()); - CHECK_EQ_OR_RETURN(shutting_down->size(), 1); - bool is_interpreter_shutdown = (*shutting_down)[0]; +bool IsShuttingDown() { + auto* shutting_down = GetShuttingDown(); + bool is_interpreter_shutdown = *shutting_down; return is_interpreter_shutdown; } -Maybe SetShuttingDown() { - auto* shutting_down = JUST(GetShuttingDown()); - CHECK_EQ_OR_RETURN(shutting_down->size(), 1); - (*shutting_down)[0] = true; - return Maybe::Ok(); +void SetShuttingDown() { + auto* shutting_down = GetShuttingDown(); + CHECK_EQ(*shutting_down, false); + *shutting_down = true; } } // namespace oneflow diff --git a/oneflow/core/framework/python_interpreter_util.h b/oneflow/core/framework/shut_down_util.h similarity index 93% rename from oneflow/core/framework/python_interpreter_util.h rename to oneflow/core/framework/shut_down_util.h index 5690e19fe5b..6ec72066d11 100644 --- a/oneflow/core/framework/python_interpreter_util.h +++ b/oneflow/core/framework/shut_down_util.h @@ -20,9 +20,9 @@ limitations under the License. namespace oneflow { -Maybe IsShuttingDown(); +bool IsShuttingDown(); -Maybe SetShuttingDown(); +void SetShuttingDown(); } // namespace oneflow diff --git a/oneflow/core/framework/tensor_impl.cpp b/oneflow/core/framework/tensor_impl.cpp index f8405ad9bac..5d5986c1d59 100644 --- a/oneflow/core/framework/tensor_impl.cpp +++ b/oneflow/core/framework/tensor_impl.cpp @@ -77,6 +77,13 @@ EagerMirroredTensorImpl::EagerMirroredTensorImpl( eager_blob_object_(eager_blob_object) { dtype_ = CHECK_JUST(DType::GetDTypeByDataType(eager_blob_object->blob_desc().data_type())); tensor_storage_ = std::make_shared(eager_blob_object->tensor_buffer()); + const auto& parallel_desc = this->parallel_desc(); + tensor_storage_->set_releaser_hook( + [eager_blob_object, parallel_desc](const std::shared_ptr&) { + PhysicalRun([&](const std::shared_ptr& builder) { + builder->ReleaseTensor(eager_blob_object, parallel_desc); + }); + }); } Maybe EagerMirroredTensorImpl::infer_local_dep_object() const { diff --git a/oneflow/core/framework/tensor_storage.cpp b/oneflow/core/framework/tensor_storage.cpp index aa0fce760dc..6aed5b69d2f 100644 --- a/oneflow/core/framework/tensor_storage.cpp +++ b/oneflow/core/framework/tensor_storage.cpp @@ -16,6 +16,7 @@ limitations under the License. #include "oneflow/core/framework/tensor_storage.h" #include "oneflow/core/eager/eager_blob_object.h" #include "oneflow/core/framework/vm_local_dep_object.h" +#include "oneflow/core/framework/shut_down_util.h" namespace oneflow { namespace one { @@ -27,7 +28,7 @@ TensorStorage::TensorStorage(const std::shared_ptr& tensor_ : buffer_(tensor_buffer) {} TensorStorage::~TensorStorage() { - if (releaser_hook_) { (*releaser_hook_)(buffer_); } + if (!IsShuttingDown() && releaser_hook_) { (*releaser_hook_)(buffer_); } } } // namespace one