Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename "deadlock" to "stall" in LoadManager #5341

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/xrpld/app/main/Application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1557,10 +1557,10 @@
if (!config_->standalone())
{
// VFALCO NOTE This seems unnecessary. If we properly refactor the load
// manager then the deadlock detector can just always be
// manager then the stall detector can just always be
// "armed"
//
getLoadManager().activateDeadlockDetector();
getLoadManager().activateStallDetector();

Check warning on line 1563 in src/xrpld/app/main/Application.cpp

View check run for this annotation

Codecov / codecov/patch

src/xrpld/app/main/Application.cpp#L1563

Added line #L1563 was not covered by tests
}

{
Expand Down
61 changes: 30 additions & 31 deletions src/xrpld/app/main/LoadManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
namespace ripple {

LoadManager::LoadManager(Application& app, beast::Journal journal)
: app_(app), journal_(journal), deadLock_(), armed_(false)
: app_(app), journal_(journal), lastHeartbeat_(), armed_(false)
{
}

Expand All @@ -52,19 +52,19 @@
//------------------------------------------------------------------------------

void
LoadManager::activateDeadlockDetector()
LoadManager::activateStallDetector()

Check warning on line 55 in src/xrpld/app/main/LoadManager.cpp

View check run for this annotation

Codecov / codecov/patch

src/xrpld/app/main/LoadManager.cpp#L55

Added line #L55 was not covered by tests
{
std::lock_guard sl(mutex_);
armed_ = true;
deadLock_ = std::chrono::steady_clock::now();
lastHeartbeat_ = std::chrono::steady_clock::now();

Check warning on line 59 in src/xrpld/app/main/LoadManager.cpp

View check run for this annotation

Codecov / codecov/patch

src/xrpld/app/main/LoadManager.cpp#L59

Added line #L59 was not covered by tests
}

void
LoadManager::resetDeadlockDetector()
LoadManager::heartbeat()

Check warning on line 63 in src/xrpld/app/main/LoadManager.cpp

View check run for this annotation

Codecov / codecov/patch

src/xrpld/app/main/LoadManager.cpp#L63

Added line #L63 was not covered by tests
{
auto const detector_start = std::chrono::steady_clock::now();
auto const heartbeat = std::chrono::steady_clock::now();

Check warning on line 65 in src/xrpld/app/main/LoadManager.cpp

View check run for this annotation

Codecov / codecov/patch

src/xrpld/app/main/LoadManager.cpp#L65

Added line #L65 was not covered by tests
std::lock_guard sl(mutex_);
deadLock_ = detector_start;
lastHeartbeat_ = heartbeat;

Check warning on line 67 in src/xrpld/app/main/LoadManager.cpp

View check run for this annotation

Codecov / codecov/patch

src/xrpld/app/main/LoadManager.cpp#L67

Added line #L67 was not covered by tests
}

//------------------------------------------------------------------------------
Expand Down Expand Up @@ -117,63 +117,62 @@
break;

// Copy out shared data under a lock. Use copies outside lock.
auto const deadLock = deadLock_;
auto const lastHeartbeat = lastHeartbeat_;
auto const armed = armed_;
sl.unlock();

// Measure the amount of time we have been deadlocked, in seconds.
// Measure the amount of time we have been stalled, in seconds.
using namespace std::chrono;
auto const timeSpentDeadlocked =
duration_cast<seconds>(steady_clock::now() - deadLock);
auto const timeSpentStalled =
duration_cast<seconds>(steady_clock::now() - lastHeartbeat);

constexpr auto reportingIntervalSeconds = 10s;
constexpr auto deadlockFatalLogMessageTimeLimit = 90s;
constexpr auto deadlockLogicErrorTimeLimit = 600s;
constexpr auto stallFatalLogMessageTimeLimit = 90s;
constexpr auto stallLogicErrorTimeLimit = 600s;

if (armed && (timeSpentDeadlocked >= reportingIntervalSeconds))
if (armed && (timeSpentStalled >= reportingIntervalSeconds))
{
// Report the deadlocked condition every
// reportingIntervalSeconds
if ((timeSpentDeadlocked % reportingIntervalSeconds) == 0s)
// Report the stalled condition every reportingIntervalSeconds
if ((timeSpentStalled % reportingIntervalSeconds) == 0s)
{
if (timeSpentDeadlocked < deadlockFatalLogMessageTimeLimit)
if (timeSpentStalled < stallFatalLogMessageTimeLimit)
{
JLOG(journal_.warn())
<< "Server stalled for " << timeSpentDeadlocked.count()
<< "Server stalled for " << timeSpentStalled.count()
<< " seconds.";

if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.warn()) << app_.getJobQueue().getJson(0);
JLOG(journal_.warn())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
}
}
else
{
JLOG(journal_.fatal())
<< "Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
<< "Server stalled for " << timeSpentStalled.count()
<< " seconds.";
JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
}
}

// If we go over the deadlockTimeLimit spent deadlocked, it
// means that the deadlock resolution code has failed, which
// qualifies as undefined behavior.
//
if (timeSpentDeadlocked >= deadlockLogicErrorTimeLimit)
// If we go over the stallLogicErrorTimeLimit spent stalled, it
// means that the stall resolution code has failed, which qualifies
// as a LogicError
if (timeSpentStalled >= stallLogicErrorTimeLimit)
{
JLOG(journal_.fatal())
<< "LogicError: Deadlock detected. Deadlocked time: "
<< timeSpentDeadlocked.count() << "s";
<< "LogicError: Fatal server stall detected. Stalled time: "
<< timeSpentStalled.count() << "s";
JLOG(journal_.fatal())
<< "JobQueue: " << app_.getJobQueue().getJson(0);
LogicError("Deadlock detected");
LogicError("Fatal server stall detected");
}
}
}

bool change;

bool change = false;
if (app_.getJobQueue().isOverloaded())
{
JLOG(journal_.info()) << "Raising local fee (JQ overload): "
Expand Down
24 changes: 12 additions & 12 deletions src/xrpld/app/main/LoadManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,28 +58,28 @@ class LoadManager
*/
~LoadManager();

/** Turn on deadlock detection.
/** Turn on stall detection.

The deadlock detector begins in a disabled state. After this function
is called, it will report deadlocks using a separate thread whenever
The stall detector begins in a disabled state. After this function
is called, it will report stalls using a separate thread whenever
the reset function is not called at least once per 10 seconds.

@see resetDeadlockDetector
@see resetStallDetector
*/
// VFALCO NOTE it seems that the deadlock detector has an "armed" state
// VFALCO NOTE it seems that the stall detector has an "armed" state
// to prevent it from going off during program startup if
// there's a lengthy initialization operation taking place?
//
void
activateDeadlockDetector();
activateStallDetector();

/** Reset the deadlock detection timer.
/** Reset the stall detection timer.

A dedicated thread monitors the deadlock timer, and if too much
A dedicated thread monitors the stall timer, and if too much
time passes it will produce log warnings.
*/
void
resetDeadlockDetector();
heartbeat();

//--------------------------------------------------------------------------

Expand All @@ -98,12 +98,12 @@ class LoadManager
beast::Journal const journal_;

std::thread thread_;
std::mutex mutex_; // Guards deadLock_, armed_, cv_
std::mutex mutex_; // Guards lastHeartbeat_, armed_, cv_
std::condition_variable cv_;
bool stop_ = false;

std::chrono::steady_clock::time_point
deadLock_; // Detect server deadlocks.
// Detect server stalls
std::chrono::steady_clock::time_point lastHeartbeat_;
bool armed_;

friend std::unique_ptr<LoadManager>
Expand Down
2 changes: 1 addition & 1 deletion src/xrpld/app/misc/NetworkOPs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1018,7 +1018,7 @@ NetworkOPsImp::processHeartbeatTimer()

// VFALCO NOTE This is for diagnosing a crash on exit
LoadManager& mgr(app_.getLoadManager());
mgr.resetDeadlockDetector();
mgr.heartbeat();

std::size_t const numPeers = app_.overlay().size();

Expand Down
Loading