From bd92bc28eb4f4614126f045615da29df9e283113 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Fri, 11 Sep 2020 20:53:44 -0400 Subject: [PATCH 01/12] test: Change the GuarddogImpl's interlock so it is time agnostic. Interlocking based on time can result in confusion when using simulated time and without calls to advanceAndWait between calls to GuarddogImpl::forceCheckForTest. Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 17 ++++++----------- source/server/guarddog_impl.h | 16 ++++++++-------- test/server/guarddog_impl_test.cc | 12 +++++++----- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index a1e11c24b82e..c46abaa6f27c 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -81,11 +81,9 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio GuardDogImpl::~GuardDogImpl() { stop(); } void GuardDogImpl::step() { - { - Thread::LockGuard guard(mutex_); - if (!run_thread_) { - return; - } + Thread::LockGuard guard(mutex_); + if (!run_thread_) { + return; } const auto now = time_source_.monotonicTime(); @@ -156,12 +154,9 @@ void GuardDogImpl::step() { invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now); } - { - Thread::LockGuard guard(mutex_); - test_interlock_hook_->signalFromImpl(now); - if (run_thread_) { - loop_timer_->enableTimer(loop_interval_); - } + test_interlock_hook_->signalFromImpl(); + if (run_thread_) { + loop_timer_->enableTimer(loop_interval_); } } diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index 829ccb5f42d8..697debb973ae 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -45,16 +45,17 @@ class GuardDogImpl : public GuardDog { virtual ~TestInterlockHook() = default; /** - * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs - * up to a particular point in time. + * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs up to a particular + * point in time. Called while the Guarddog mutex is held. */ - virtual void signalFromImpl(MonotonicTime) {} + virtual void signalFromImpl() {} /** - * Called from GuardDog tests to block until the implementation has reached - * the desired point in time. + * Called from GuardDog tests to block until the implementation has reached the desired + * condition. Called while the Guarddog mutex is held. + * @param mutex The Guarddog's mutex for use by Thread::CondVar::wait. */ - virtual void waitFromTest(Thread::MutexBasicLockable&, MonotonicTime) {} + virtual void waitFromTest(Thread::MutexBasicLockable& /*mutex*/) {} }; /** @@ -84,9 +85,8 @@ class GuardDogImpl : public GuardDog { */ void forceCheckForTest() { Thread::LockGuard guard(mutex_); - MonotonicTime now = time_source_.monotonicTime(); loop_timer_->enableTimer(std::chrono::milliseconds(0)); - test_interlock_hook_->waitFromTest(mutex_, now); + test_interlock_hook_->waitFromTest(mutex_); } // Server::GuardDog diff --git a/test/server/guarddog_impl_test.cc b/test/server/guarddog_impl_test.cc index cd284c895ad9..bc4aab8a023a 100644 --- a/test/server/guarddog_impl_test.cc +++ b/test/server/guarddog_impl_test.cc @@ -48,21 +48,23 @@ const int DISABLE_MEGAMISS = 1000000; class DebugTestInterlock : public GuardDogImpl::TestInterlockHook { public: // GuardDogImpl::TestInterlockHook - void signalFromImpl(MonotonicTime time) override { - impl_reached_ = time; + void signalFromImpl() override { + waiting_for_signal_ = false; impl_.notifyAll(); } - void waitFromTest(Thread::MutexBasicLockable& mutex, MonotonicTime time) override + void waitFromTest(Thread::MutexBasicLockable& mutex) override ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex) { - while (impl_reached_ < time) { + ASSERT(!waiting_for_signal_); + waiting_for_signal_ = true; + while (waiting_for_signal_) { impl_.wait(mutex); } } private: Thread::CondVar impl_; - MonotonicTime impl_reached_; + bool waiting_for_signal_ = false; }; // We want to make sure guard-dog is tested with both simulated time and real From e6afe7bd8b12fa7e8c4e951eb4218cd46655ee28 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Fri, 11 Sep 2020 02:05:51 -0400 Subject: [PATCH 02/12] watchdog: Optimize WatchdogImpl::touch in preparation to more frequent petting of the watchdog. Signed-off-by: Antonio Vicente --- include/envoy/server/watchdog.h | 1 - source/server/guarddog_impl.cc | 18 +++++++++++++----- source/server/guarddog_impl.h | 8 ++++++-- source/server/watchdog_impl.h | 15 ++++++--------- test/mocks/server/watch_dog.h | 1 - test/server/guarddog_impl_test.cc | 9 +++++++++ 6 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/envoy/server/watchdog.h b/include/envoy/server/watchdog.h index d230ab48f6fb..cd76f552e244 100644 --- a/include/envoy/server/watchdog.h +++ b/include/envoy/server/watchdog.h @@ -37,7 +37,6 @@ class WatchDog { */ virtual void touch() PURE; virtual Thread::ThreadId threadId() const PURE; - virtual MonotonicTime lastTouchTime() const PURE; }; using WatchDogSharedPtr = std::shared_ptr; diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index c46abaa6f27c..ee4597614943 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -100,7 +100,15 @@ void GuardDogImpl::step() { static_cast(ceil(multi_kill_fraction_ * watched_dogs_.size()))); for (auto& watched_dog : watched_dogs_) { - const auto ltt = watched_dog->dog_->lastTouchTime(); + uint64_t touch_count = watched_dog->dog_->touchCount(); + if (watched_dog->last_touch_count_ != touch_count) { + // Watchdog was touched since guarddog last checked; update time and continue. + watched_dog->last_touch_count_ = touch_count; + watched_dog->last_touch_time_ = now; + continue; + } + + const auto ltt = watched_dog->last_touch_time_; const auto tid = watched_dog->dog_->threadId(); const auto delta = now - ltt; if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < ltt) { @@ -167,14 +175,13 @@ WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id, // accessed out of the locked section below is const (time_source_ has no // state). const auto wd_interval = loop_interval_ / 2; - WatchDogSharedPtr new_watchdog = - std::make_shared(std::move(thread_id), time_source_, wd_interval); + auto new_watchdog = std::make_shared(std::move(thread_id), wd_interval); WatchedDogPtr watched_dog = std::make_unique(stats_scope_, thread_name, new_watchdog); + new_watchdog->touch(); { Thread::LockGuard guard(wd_lock_); watched_dogs_.push_back(std::move(watched_dog)); } - new_watchdog->touch(); return new_watchdog; } @@ -222,8 +229,9 @@ void GuardDogImpl::invokeGuardDogActions( } GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, - const WatchDogSharedPtr& watch_dog) + const WatchDogImplSharedPtr& watch_dog) : dog_(watch_dog), + last_touch_count_(dog_->touchCount()), miss_counter_(stats_scope.counterFromStatName( Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name), stats_scope.symbolTable()) diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index 697debb973ae..baa86661dc7d 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -17,6 +17,7 @@ #include "common/common/logger.h" #include "common/common/thread.h" #include "common/event/libevent.h" +#include "server/watchdog_impl.h" #include "absl/types/optional.h" @@ -110,11 +111,14 @@ class GuardDogImpl : public GuardDog { std::vector> thread_ltt_pairs, MonotonicTime now); + using WatchDogImplSharedPtr = std::shared_ptr; struct WatchedDog { WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, - const WatchDogSharedPtr& watch_dog); + const WatchDogImplSharedPtr& watch_dog); - const WatchDogSharedPtr dog_; + const WatchDogImplSharedPtr dog_; + uint64_t last_touch_count_; + MonotonicTime last_touch_time_; absl::optional last_alert_time_; bool miss_alerted_{}; bool megamiss_alerted_{}; diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index ea4ea82e5c28..6ff737e87bf5 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -19,26 +19,23 @@ class WatchDogImpl : public WatchDog { /** * @param interval WatchDog timer interval (used after startWatchdog()) */ - WatchDogImpl(Thread::ThreadId thread_id, TimeSource& tsource, std::chrono::milliseconds interval) - : thread_id_(thread_id), time_source_(tsource), - latest_touch_time_since_epoch_(tsource.monotonicTime().time_since_epoch()), - timer_interval_(interval) {} + WatchDogImpl(Thread::ThreadId thread_id, std::chrono::milliseconds interval) + : thread_id_(thread_id), timer_interval_(interval) {} Thread::ThreadId threadId() const override { return thread_id_; } - MonotonicTime lastTouchTime() const override { - return MonotonicTime(latest_touch_time_since_epoch_.load()); + uint64_t touchCount() const { + return touch_count_.load(); } // Server::WatchDog void startWatchdog(Event::Dispatcher& dispatcher) override; void touch() override { - latest_touch_time_since_epoch_.store(time_source_.monotonicTime().time_since_epoch()); + ++touch_count_; } private: const Thread::ThreadId thread_id_; - TimeSource& time_source_; - std::atomic latest_touch_time_since_epoch_; + std::atomic touch_count_ = 0; Event::TimerPtr timer_; const std::chrono::milliseconds timer_interval_; }; diff --git a/test/mocks/server/watch_dog.h b/test/mocks/server/watch_dog.h index 105761781c36..a658dd33ac94 100644 --- a/test/mocks/server/watch_dog.h +++ b/test/mocks/server/watch_dog.h @@ -15,7 +15,6 @@ class MockWatchDog : public WatchDog { MOCK_METHOD(void, startWatchdog, (Event::Dispatcher & dispatcher)); MOCK_METHOD(void, touch, ()); MOCK_METHOD(Thread::ThreadId, threadId, (), (const)); - MOCK_METHOD(MonotonicTime, lastTouchTime, (), (const)); }; } // namespace Server } // namespace Envoy diff --git a/test/server/guarddog_impl_test.cc b/test/server/guarddog_impl_test.cc index bc4aab8a023a..14702d7c70d0 100644 --- a/test/server/guarddog_impl_test.cc +++ b/test/server/guarddog_impl_test.cc @@ -309,6 +309,7 @@ TEST_P(GuardDogMissTest, MissTest) { initGuardDog(stats_store_, config_miss_); auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); // We'd better start at 0: checkMiss(0, "MissTest check 1"); // At 300ms we shouldn't have hit the timeout yet: @@ -334,6 +335,7 @@ TEST_P(GuardDogMissTest, MegaMissTest) { initGuardDog(stats_store_, config_mega_); auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); // We'd better start at 0: checkMegaMiss(0, "MegaMissTest check 1"); // This shouldn't be enough to increment the stat: @@ -360,6 +362,7 @@ TEST_P(GuardDogMissTest, MissCountTest) { initGuardDog(stats_store_, config_miss_); auto sometimes_pet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); // These steps are executed once without ever touching the watchdog. // Then the last step is to touch the watchdog and repeat the steps. // This verifies that the behavior is reset back to baseline after a touch. @@ -382,9 +385,11 @@ TEST_P(GuardDogMissTest, MissCountTest) { // When we finally touch the dog we should get one more increment once the // timeout value expires: sometimes_pet_dog->touch(); + guard_dog_->forceCheckForTest(); } time_system_->advanceTimeWait(std::chrono::milliseconds(1000)); sometimes_pet_dog->touch(); + guard_dog_->forceCheckForTest(); // Make sure megamiss still works: checkMegaMiss(0UL, "MissCountTest check 5"); time_system_->advanceTimeWait(std::chrono::milliseconds(1500)); @@ -657,6 +662,7 @@ TEST_P(GuardDogActionsTest, MissShouldSaturateOnMissEvent) { // Touch the watchdog, which should allow the event to trigger again. first_dog_->touch(); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); @@ -719,6 +725,7 @@ TEST_P(GuardDogActionsTest, MegaMissShouldSaturateOnMegaMissEvent) { // Touch the watchdog, which should allow the event to trigger again. first_dog_->touch(); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); @@ -734,6 +741,7 @@ TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) { initGuardDog(fake_stats_, config); auto first_dog = guard_dog_->createWatchDog(Thread::ThreadId(10), "test_thread"); auto second_dog = guard_dog_->createWatchDog(Thread::ThreadId(11), "test_thread"); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); }; @@ -748,6 +756,7 @@ TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) { initGuardDog(fake_stats_, config); auto first_dog = guard_dog_->createWatchDog(Thread::ThreadId(10), "test_thread"); auto second_dog = guard_dog_->createWatchDog(Thread::ThreadId(11), "test_thread"); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); }; From 08ee8752fa68a789a4fd6b8e976d466902ecac72 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 15 Sep 2020 12:52:11 -0400 Subject: [PATCH 03/12] prefer fetch_add over ++ run format Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 3 +-- source/server/guarddog_impl.h | 1 + source/server/watchdog_impl.h | 8 ++------ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index ee4597614943..c59379d26e3f 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -230,8 +230,7 @@ void GuardDogImpl::invokeGuardDogActions( GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, const WatchDogImplSharedPtr& watch_dog) - : dog_(watch_dog), - last_touch_count_(dog_->touchCount()), + : dog_(watch_dog), last_touch_count_(dog_->touchCount()), miss_counter_(stats_scope.counterFromStatName( Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name), stats_scope.symbolTable()) diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index baa86661dc7d..396441fe8469 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -17,6 +17,7 @@ #include "common/common/logger.h" #include "common/common/thread.h" #include "common/event/libevent.h" + #include "server/watchdog_impl.h" #include "absl/types/optional.h" diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index 6ff737e87bf5..64f12ade2c4d 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -23,15 +23,11 @@ class WatchDogImpl : public WatchDog { : thread_id_(thread_id), timer_interval_(interval) {} Thread::ThreadId threadId() const override { return thread_id_; } - uint64_t touchCount() const { - return touch_count_.load(); - } + uint64_t touchCount() const { return touch_count_.load(); } // Server::WatchDog void startWatchdog(Event::Dispatcher& dispatcher) override; - void touch() override { - ++touch_count_; - } + void touch() override { touch_count_.fetch_add(1); } private: const Thread::ThreadId thread_id_; From 3b8b2a9b7d425e70d43af7760185424b48df6407 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 15 Sep 2020 13:36:20 -0400 Subject: [PATCH 04/12] address review comments Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 9 ++++----- source/server/guarddog_impl.h | 1 - source/server/watchdog_impl.h | 7 ++++++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index c59379d26e3f..0f4457be2dd2 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -100,10 +100,9 @@ void GuardDogImpl::step() { static_cast(ceil(multi_kill_fraction_ * watched_dogs_.size()))); for (auto& watched_dog : watched_dogs_) { - uint64_t touch_count = watched_dog->dog_->touchCount(); - if (watched_dog->last_touch_count_ != touch_count) { - // Watchdog was touched since guarddog last checked; update time and continue. - watched_dog->last_touch_count_ = touch_count; + if (watched_dog->dog_->touchCount() > 0) { + // Watchdog was touched since guarddog last checked; update time and reset the count. + watched_dog->dog_->resetTouchCount(); watched_dog->last_touch_time_ = now; continue; } @@ -230,7 +229,7 @@ void GuardDogImpl::invokeGuardDogActions( GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, const WatchDogImplSharedPtr& watch_dog) - : dog_(watch_dog), last_touch_count_(dog_->touchCount()), + : dog_(watch_dog), miss_counter_(stats_scope.counterFromStatName( Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name), stats_scope.symbolTable()) diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index 396441fe8469..3f45dc658717 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -118,7 +118,6 @@ class GuardDogImpl : public GuardDog { const WatchDogImplSharedPtr& watch_dog); const WatchDogImplSharedPtr dog_; - uint64_t last_touch_count_; MonotonicTime last_touch_time_; absl::optional last_alert_time_; bool miss_alerted_{}; diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index 64f12ade2c4d..df232d11cfaf 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -23,7 +23,10 @@ class WatchDogImpl : public WatchDog { : thread_id_(thread_id), timer_interval_(interval) {} Thread::ThreadId threadId() const override { return thread_id_; } + // Accessors for use by GuardDogImpl to tell if the watchdog was touched recently and reset the + // count back to 0 when it has. uint64_t touchCount() const { return touch_count_.load(); } + void resetTouchCount() { touch_count_.store(0); } // Server::WatchDog void startWatchdog(Event::Dispatcher& dispatcher) override; @@ -31,7 +34,9 @@ class WatchDogImpl : public WatchDog { private: const Thread::ThreadId thread_id_; - std::atomic touch_count_ = 0; + // Use a 32-bit atomic counter since 64-bit atomics are not generally available in 32-bit ARM and + // 32-bit is large enough for our purposes. + std::atomic touch_count_ = 0; Event::TimerPtr timer_; const std::chrono::milliseconds timer_interval_; }; From 6b623b6e530a6e18a8bd6d59f632b8d16b9310b3 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 15 Sep 2020 17:43:39 -0400 Subject: [PATCH 05/12] fix return type. Signed-off-by: Antonio Vicente --- source/server/watchdog_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index df232d11cfaf..8014bf521da9 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -25,7 +25,7 @@ class WatchDogImpl : public WatchDog { Thread::ThreadId threadId() const override { return thread_id_; } // Accessors for use by GuardDogImpl to tell if the watchdog was touched recently and reset the // count back to 0 when it has. - uint64_t touchCount() const { return touch_count_.load(); } + uint32_t touchCount() const { return touch_count_.load(); } void resetTouchCount() { touch_count_.store(0); } // Server::WatchDog From 3f962a1421801ff51b6587268e1a3cc9eda98a52 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 15 Sep 2020 20:21:18 -0400 Subject: [PATCH 06/12] fix spelling Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index 0f4457be2dd2..32fc13ba910a 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -101,7 +101,7 @@ void GuardDogImpl::step() { for (auto& watched_dog : watched_dogs_) { if (watched_dog->dog_->touchCount() > 0) { - // Watchdog was touched since guarddog last checked; update time and reset the count. + // Watchdog was touched since the guard dog last checked; update time and reset the count. watched_dog->dog_->resetTouchCount(); watched_dog->last_touch_time_ = now; continue; From bcad016d7a6b56430aab0371bbc1e4d1a7077dbf Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 15 Sep 2020 22:26:55 -0400 Subject: [PATCH 07/12] fix spelling more Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index 3f45dc658717..507628c406d5 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -48,14 +48,14 @@ class GuardDogImpl : public GuardDog { /** * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs up to a particular - * point in time. Called while the Guarddog mutex is held. + * point in time. Called while the GuardDog mutex is held. */ virtual void signalFromImpl() {} /** * Called from GuardDog tests to block until the implementation has reached the desired - * condition. Called while the Guarddog mutex is held. - * @param mutex The Guarddog's mutex for use by Thread::CondVar::wait. + * condition. Called while the GuardDog mutex is held. + * @param mutex The GuardDog's mutex for use by Thread::CondVar::wait. */ virtual void waitFromTest(Thread::MutexBasicLockable& /*mutex*/) {} }; From 1d5b149f460bb61379a9c25211ad77ceed448914 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 22 Sep 2020 01:39:11 -0400 Subject: [PATCH 08/12] Switch to atomic and acq_rel memory_order instead of seq_cst Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 5 ++--- source/server/watchdog_impl.h | 17 +++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index 7e1d32971b9e..f95e4c38015e 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -103,9 +103,8 @@ void GuardDogImpl::step() { static_cast(ceil(multi_kill_fraction_ * watched_dogs_.size()))); for (auto& watched_dog : watched_dogs_) { - if (watched_dog->dog_->touchCount() > 0) { - // Watchdog was touched since the guard dog last checked; update time and reset the count. - watched_dog->dog_->resetTouchCount(); + if (watched_dog->dog_->getTouchedAndReset()) { + // Watchdog was touched since the guard dog last checked; update last check-in time. watched_dog->last_checkin_ = now; continue; } diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index 8014bf521da9..940923415d7c 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -23,20 +23,21 @@ class WatchDogImpl : public WatchDog { : thread_id_(thread_id), timer_interval_(interval) {} Thread::ThreadId threadId() const override { return thread_id_; } - // Accessors for use by GuardDogImpl to tell if the watchdog was touched recently and reset the - // count back to 0 when it has. - uint32_t touchCount() const { return touch_count_.load(); } - void resetTouchCount() { touch_count_.store(0); } + // Used by GuardDogImpl determine if the watchdog was touched recently and reset the touch status. + bool getTouchedAndReset() { return touched_.exchange(false, std::memory_order_acq_rel); } // Server::WatchDog void startWatchdog(Event::Dispatcher& dispatcher) override; - void touch() override { touch_count_.fetch_add(1); } + void touch() override { + // Set touched_ if not already set. + bool expected = false; + touched_.compare_exchange_strong(expected, true, std::memory_order_release, + std::memory_order_acquire); + } private: const Thread::ThreadId thread_id_; - // Use a 32-bit atomic counter since 64-bit atomics are not generally available in 32-bit ARM and - // 32-bit is large enough for our purposes. - std::atomic touch_count_ = 0; + std::atomic touched_{false}; Event::TimerPtr timer_; const std::chrono::milliseconds timer_interval_; }; From e9db878152e9cdf6f71537c18ccded1f9ffbf496 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 22 Sep 2020 03:28:38 -0400 Subject: [PATCH 09/12] add comments Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 4 ++++ source/server/guarddog_impl.h | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index f95e4c38015e..86b9d7a6d0fd 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -84,6 +84,10 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio GuardDogImpl::~GuardDogImpl() { stop(); } void GuardDogImpl::step() { + // Hold mutex_ for the duration of the step() function to ensure that watchdog liveness checks and + // test interlock operations happen in the expected order. Calls to forceCheckForTest() should + // result in a full iteration of the step() function to process recent watchdog touches and + // monotonic time changes. Thread::LockGuard guard(mutex_); if (!run_thread_) { return; diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index c1c130ea71c5..ebc4040e1c81 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -82,9 +82,8 @@ class GuardDogImpl : public GuardDog { const std::chrono::milliseconds loopIntervalForTest() const { return loop_interval_; } /** - * Test hook to force a step() to catch up with the current simulated - * time. This is inlined so that it does not need to be present in the - * production binary. + * Test hook to force a step() to catch up with the current watchdog state and simulated time. + * This is inlined so that it does not need to be present in the production binary. */ void forceCheckForTest() { Thread::LockGuard guard(mutex_); From 5a3ecaf08215c63c0d237b4221594db9fd56cb72 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Tue, 22 Sep 2020 20:23:12 -0400 Subject: [PATCH 10/12] Switch to relaxed memory order Fix spelling Signed-off-by: Antonio Vicente --- source/server/guarddog_impl.cc | 8 ++++---- source/server/watchdog_impl.h | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index 86b9d7a6d0fd..019a4b4c5c89 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -84,10 +84,10 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio GuardDogImpl::~GuardDogImpl() { stop(); } void GuardDogImpl::step() { - // Hold mutex_ for the duration of the step() function to ensure that watchdog liveness checks and - // test interlock operations happen in the expected order. Calls to forceCheckForTest() should - // result in a full iteration of the step() function to process recent watchdog touches and - // monotonic time changes. + // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks + // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in + // a full iteration of the step() function to process recent watchdog touches and monotonic time + // changes. Thread::LockGuard guard(mutex_); if (!run_thread_) { return; diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index 940923415d7c..a18034745885 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -24,15 +24,14 @@ class WatchDogImpl : public WatchDog { Thread::ThreadId threadId() const override { return thread_id_; } // Used by GuardDogImpl determine if the watchdog was touched recently and reset the touch status. - bool getTouchedAndReset() { return touched_.exchange(false, std::memory_order_acq_rel); } + bool getTouchedAndReset() { return touched_.exchange(false, std::memory_order_relaxed); } // Server::WatchDog void startWatchdog(Event::Dispatcher& dispatcher) override; void touch() override { // Set touched_ if not already set. bool expected = false; - touched_.compare_exchange_strong(expected, true, std::memory_order_release, - std::memory_order_acquire); + touched_.compare_exchange_strong(expected, true, std::memory_order_relaxed); } private: From 3b6313a591fd94d31ecf55be752e3c44a39489d9 Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Wed, 23 Sep 2020 00:37:08 -0400 Subject: [PATCH 11/12] Kick CI Signed-off-by: Antonio Vicente From 9ee42f599fcef04e4aeeba327e0cc6422eabe28f Mon Sep 17 00:00:00 2001 From: Antonio Vicente Date: Fri, 16 Oct 2020 15:29:57 -0400 Subject: [PATCH 12/12] Kick CI Signed-off-by: Antonio Vicente