Skip to content

Commit

Permalink
fix segfault and infinite loop in python stack getter (#9955)
Browse files Browse the repository at this point in the history
1. <del>#4737 这个很早之前的 PR 在
python 退出后会跳过 tensor 的析构,导致 oom,当时这么做的原因已经不记得了,现在来看不跳过也想不到会有什么问题,所以本 PR
恢复为正常执行析构。</del>
    更新:CI 遇到问题,待进一步定位,这个 PR 先不包含这部分改动了。
原 oom 问题可能和 #9681 这个 PR 把设置
is_shutting_down 的时机提前了有关
3. 修复线程不安全导致的 python stack getter 偶发 segfault 的 bug

---------

Signed-off-by: daquexian <daquexian566@gmail.com>
Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Mar 22, 2023
1 parent 0bb9fee commit 9e87ac4
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 39 deletions.
4 changes: 2 additions & 2 deletions oneflow/core/vm/instruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ class Instruction final : public intrusive::Base {
bool Done() const;
StreamPolicy* mut_stream_policy();
const StreamPolicy& stream_policy() const;
intrusive::shared_ptr<Frame> foreign_frame() const { return foreign_frame_; }
std::shared_ptr<Frame> foreign_frame() const { return foreign_frame_; }

intrusive::Ref::RefCntType ref_cnt() const { return intrusive_ref_.ref_cnt(); }

Expand Down Expand Up @@ -203,7 +203,7 @@ class Instruction final : public intrusive::Base {
Stream* stream_;
std::shared_ptr<InstructionPolicy> instruction_policy_;
InstructionStatusBuffer status_buffer_;
intrusive::shared_ptr<Frame> foreign_frame_;
std::shared_ptr<Frame> foreign_frame_;
};

using InstructionList = intrusive::List<INTRUSIVE_FIELD(Instruction, main_instruction_hook_)>;
Expand Down
14 changes: 4 additions & 10 deletions oneflow/extension/stack/foreign_stack_getter.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,21 @@ limitations under the License.
#include <cstdint>
#include <utility>
#include "oneflow/core/common/thread_local_guard.h"
#include "oneflow/core/intrusive/base.h"
#include "oneflow/core/intrusive/shared_ptr.h"

namespace oneflow {

class Frame : public intrusive::Base {
class Frame {
public:
virtual ~Frame() = default;
intrusive::Ref* mut_intrusive_ref() { return &intrusive_ref_; }

private:
intrusive::Ref intrusive_ref_;
};

using ForeignFrameThreadLocalGuard = ThreadLocalGuard<intrusive::shared_ptr<Frame>>;
using ForeignFrameThreadLocalGuard = ThreadLocalGuard<std::shared_ptr<Frame>>;

class ForeignStackGetter {
public:
virtual ~ForeignStackGetter() = default;
virtual intrusive::shared_ptr<Frame> GetCurrentFrame() const = 0;
virtual std::string GetFormattedStack(intrusive::shared_ptr<Frame> frame) const = 0;
virtual std::shared_ptr<Frame> GetCurrentFrame() const = 0;
virtual std::string GetFormattedStack(std::shared_ptr<Frame> frame) const = 0;
};
} // namespace oneflow

Expand Down
44 changes: 17 additions & 27 deletions oneflow/extension/stack/python/stack_getter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,32 +43,24 @@ std::string PyUnicodeToStdString(const PyObject* py_str) {
}
} // namespace

class PyFrame final
: public Frame,
public intrusive::EnableObjectPool<PyFrame, intrusive::kThreadUnsafeAndDisableDestruct> {
class PyFrame final : public Frame {
public:
PyFrame()
: EnableObjectPool(),
filename(nullptr),
funcname(nullptr),
cpython_frame(nullptr),
lineno(0) {}
void __Init__(PyFrameObject* frame, intrusive::shared_ptr<PyFrame> back) {
// There is no need to increase the reference count of these cpython objects
// because they must be alive during the lifetime of `PyFrame`.
this->filename = frame->f_code->co_filename;
this->funcname = frame->f_code->co_name;
this->cpython_frame = frame;
this->back = std::move(back);
}
OF_DISALLOW_COPY_AND_MOVE(PyFrame);
// There is no need to increase the reference count of these cpython objects
// because they must be alive during the lifetime of `PyFrame`.
PyFrame(PyFrameObject* frame, std::shared_ptr<PyFrame> back)
: filename(frame->f_code->co_filename),
funcname(frame->f_code->co_name),
cpython_frame(frame),
lineno(0),
back(std::move(back)) {}
~PyFrame() = default;
OF_DISALLOW_COPY_AND_MOVE(PyFrame);

PyObject* filename;
PyObject* funcname;
PyFrameObject* cpython_frame;
int lineno;
intrusive::shared_ptr<PyFrame> back;
std::shared_ptr<PyFrame> back;
};

class PyStackGetter final : public ForeignStackGetter {
Expand All @@ -77,18 +69,18 @@ class PyStackGetter final : public ForeignStackGetter {
auto* frame = PyEval_GetFrame();
// Get the first frame. It assumes `import oneflow` is called in global scope,
while (frame->f_back != nullptr) { frame = frame->f_back; }
current_frame_ = object_pool_.make_shared(frame, nullptr);
current_frame_ = std::make_shared<PyFrame>(frame, nullptr);
}
// indended to be called in main thread.
intrusive::shared_ptr<Frame> GetCurrentFrame() const override {
std::shared_ptr<Frame> GetCurrentFrame() const override {
if (IsShuttingDown() || !current_frame_) { return nullptr; }
// See `RecordAndEvalFrame` for documentation.
current_frame_->lineno = PyFrame_GetLineNumber(current_frame_->cpython_frame);
return intrusive::shared_ptr<Frame>(current_frame_.get());
return current_frame_;
}

// bad path, performance is not a concern.
std::string GetFormattedStack(intrusive::shared_ptr<Frame> frame) const override {
std::string GetFormattedStack(std::shared_ptr<Frame> frame) const override {
if (frame == nullptr) { return " <unknown>\n"; }
std::string buffer;
const auto* py_frame = dynamic_cast<const PyFrame*>(frame.get());
Expand Down Expand Up @@ -140,13 +132,11 @@ class PyStackGetter final : public ForeignStackGetter {
}

private:
intrusive::shared_ptr<PyFrame> current_frame_;

intrusive::ObjectPool<PyFrame, intrusive::kThreadUnsafeAndDisableDestruct> object_pool_;
std::shared_ptr<PyFrame> current_frame_;

void PushFrame(PyFrameObject* frame) {
if (auto* f = frame->f_back) { current_frame_->lineno = PyFrame_GetLineNumber(f); }
current_frame_ = object_pool_.make_shared(frame, current_frame_);
current_frame_ = std::make_shared<PyFrame>(frame, current_frame_);
}
void PopFrame() {
CHECK_NOTNULL(current_frame_);
Expand Down

0 comments on commit 9e87ac4

Please sign in to comment.