diff --git a/include/envoy/server/watchdog.h b/include/envoy/server/watchdog.h index d230ab48f6fb..cd76f552e244 100644 --- a/include/envoy/server/watchdog.h +++ b/include/envoy/server/watchdog.h @@ -37,7 +37,6 @@ class WatchDog { */ virtual void touch() PURE; virtual Thread::ThreadId threadId() const PURE; - virtual MonotonicTime lastTouchTime() const PURE; }; using WatchDogSharedPtr = std::shared_ptr; diff --git a/source/server/guarddog_impl.cc b/source/server/guarddog_impl.cc index 7c512d20310e..f30f82aa26f2 100644 --- a/source/server/guarddog_impl.cc +++ b/source/server/guarddog_impl.cc @@ -85,11 +85,13 @@ GuardDogImpl::GuardDogImpl(Stats::Scope& stats_scope, const Server::Configuratio GuardDogImpl::~GuardDogImpl() { stop(); } void GuardDogImpl::step() { - { - Thread::LockGuard guard(mutex_); - if (!run_thread_) { - return; - } + // Hold mutex_ for the duration of the step() function to ensure that watchdog still alive checks + // and test interlocks happen in the expected order. Calls to forceCheckForTest() should result in + // a full iteration of the step() function to process recent watchdog touches and monotonic time + // changes. + Thread::LockGuard guard(mutex_); + if (!run_thread_) { + return; } const auto now = time_source_.monotonicTime(); @@ -106,7 +108,13 @@ void GuardDogImpl::step() { static_cast(ceil(multi_kill_fraction_ * watched_dogs_.size()))); for (auto& watched_dog : watched_dogs_) { - const auto last_checkin = watched_dog->dog_->lastTouchTime(); + if (watched_dog->dog_->getTouchedAndReset()) { + // Watchdog was touched since the guard dog last checked; update last check-in time. + watched_dog->last_checkin_ = now; + continue; + } + + const auto last_checkin = watched_dog->last_checkin_; const auto tid = watched_dog->dog_->threadId(); const auto delta = now - last_checkin; if (watched_dog->last_alert_time_ && watched_dog->last_alert_time_.value() < last_checkin) { @@ -160,12 +168,9 @@ void GuardDogImpl::step() { invokeGuardDogActions(WatchDogAction::MISS, miss_threads, now); } - { - Thread::LockGuard guard(mutex_); - test_interlock_hook_->signalFromImpl(now); - if (run_thread_) { - loop_timer_->enableTimer(loop_interval_); - } + test_interlock_hook_->signalFromImpl(); + if (run_thread_) { + loop_timer_->enableTimer(loop_interval_); } } @@ -176,14 +181,13 @@ WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id, // accessed out of the locked section below is const (time_source_ has no // state). const auto wd_interval = loop_interval_ / 2; - WatchDogSharedPtr new_watchdog = - std::make_shared(std::move(thread_id), time_source_, wd_interval); + auto new_watchdog = std::make_shared(std::move(thread_id), wd_interval); WatchedDogPtr watched_dog = std::make_unique(stats_scope_, thread_name, new_watchdog); + new_watchdog->touch(); { Thread::LockGuard guard(wd_lock_); watched_dogs_.push_back(std::move(watched_dog)); } - new_watchdog->touch(); return new_watchdog; } @@ -232,7 +236,7 @@ void GuardDogImpl::invokeGuardDogActions( } GuardDogImpl::WatchedDog::WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, - const WatchDogSharedPtr& watch_dog) + const WatchDogImplSharedPtr& watch_dog) : dog_(watch_dog), miss_counter_(stats_scope.counterFromStatName( Stats::StatNameManagedStorage(fmt::format("server.{}.watchdog_miss", thread_name), diff --git a/source/server/guarddog_impl.h b/source/server/guarddog_impl.h index 3d8503ec9530..ebc4040e1c81 100644 --- a/source/server/guarddog_impl.h +++ b/source/server/guarddog_impl.h @@ -18,6 +18,8 @@ #include "common/common/thread.h" #include "common/event/libevent.h" +#include "server/watchdog_impl.h" + #include "absl/types/optional.h" namespace Envoy { @@ -45,16 +47,17 @@ class GuardDogImpl : public GuardDog { virtual ~TestInterlockHook() = default; /** - * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs - * up to a particular point in time. + * Called from GuardDogImpl to indicate that it has evaluated all watch-dogs up to a particular + * point in time. Called while the GuardDog mutex is held. */ - virtual void signalFromImpl(MonotonicTime) {} + virtual void signalFromImpl() {} /** - * Called from GuardDog tests to block until the implementation has reached - * the desired point in time. + * Called from GuardDog tests to block until the implementation has reached the desired + * condition. Called while the GuardDog mutex is held. + * @param mutex The GuardDog's mutex for use by Thread::CondVar::wait. */ - virtual void waitFromTest(Thread::MutexBasicLockable&, MonotonicTime) {} + virtual void waitFromTest(Thread::MutexBasicLockable& /*mutex*/) {} }; /** @@ -79,15 +82,13 @@ class GuardDogImpl : public GuardDog { const std::chrono::milliseconds loopIntervalForTest() const { return loop_interval_; } /** - * Test hook to force a step() to catch up with the current simulated - * time. This is inlined so that it does not need to be present in the - * production binary. + * Test hook to force a step() to catch up with the current watchdog state and simulated time. + * This is inlined so that it does not need to be present in the production binary. */ void forceCheckForTest() { Thread::LockGuard guard(mutex_); - MonotonicTime now = time_source_.monotonicTime(); loop_timer_->enableTimer(std::chrono::milliseconds(0)); - test_interlock_hook_->waitFromTest(mutex_, now); + test_interlock_hook_->waitFromTest(mutex_); } // Server::GuardDog @@ -111,11 +112,13 @@ class GuardDogImpl : public GuardDog { std::vector> thread_last_checkin_pairs, MonotonicTime now); + using WatchDogImplSharedPtr = std::shared_ptr; struct WatchedDog { WatchedDog(Stats::Scope& stats_scope, const std::string& thread_name, - const WatchDogSharedPtr& watch_dog); + const WatchDogImplSharedPtr& watch_dog); - const WatchDogSharedPtr dog_; + const WatchDogImplSharedPtr dog_; + MonotonicTime last_checkin_; absl::optional last_alert_time_; bool miss_alerted_{}; bool megamiss_alerted_{}; diff --git a/source/server/watchdog_impl.h b/source/server/watchdog_impl.h index ea4ea82e5c28..a18034745885 100644 --- a/source/server/watchdog_impl.h +++ b/source/server/watchdog_impl.h @@ -19,26 +19,24 @@ class WatchDogImpl : public WatchDog { /** * @param interval WatchDog timer interval (used after startWatchdog()) */ - WatchDogImpl(Thread::ThreadId thread_id, TimeSource& tsource, std::chrono::milliseconds interval) - : thread_id_(thread_id), time_source_(tsource), - latest_touch_time_since_epoch_(tsource.monotonicTime().time_since_epoch()), - timer_interval_(interval) {} + WatchDogImpl(Thread::ThreadId thread_id, std::chrono::milliseconds interval) + : thread_id_(thread_id), timer_interval_(interval) {} Thread::ThreadId threadId() const override { return thread_id_; } - MonotonicTime lastTouchTime() const override { - return MonotonicTime(latest_touch_time_since_epoch_.load()); - } + // Used by GuardDogImpl determine if the watchdog was touched recently and reset the touch status. + bool getTouchedAndReset() { return touched_.exchange(false, std::memory_order_relaxed); } // Server::WatchDog void startWatchdog(Event::Dispatcher& dispatcher) override; void touch() override { - latest_touch_time_since_epoch_.store(time_source_.monotonicTime().time_since_epoch()); + // Set touched_ if not already set. + bool expected = false; + touched_.compare_exchange_strong(expected, true, std::memory_order_relaxed); } private: const Thread::ThreadId thread_id_; - TimeSource& time_source_; - std::atomic latest_touch_time_since_epoch_; + std::atomic touched_{false}; Event::TimerPtr timer_; const std::chrono::milliseconds timer_interval_; }; diff --git a/test/mocks/server/watch_dog.h b/test/mocks/server/watch_dog.h index 105761781c36..a658dd33ac94 100644 --- a/test/mocks/server/watch_dog.h +++ b/test/mocks/server/watch_dog.h @@ -15,7 +15,6 @@ class MockWatchDog : public WatchDog { MOCK_METHOD(void, startWatchdog, (Event::Dispatcher & dispatcher)); MOCK_METHOD(void, touch, ()); MOCK_METHOD(Thread::ThreadId, threadId, (), (const)); - MOCK_METHOD(MonotonicTime, lastTouchTime, (), (const)); }; } // namespace Server } // namespace Envoy diff --git a/test/server/guarddog_impl_test.cc b/test/server/guarddog_impl_test.cc index 1709c0972c89..4e45ffa83716 100644 --- a/test/server/guarddog_impl_test.cc +++ b/test/server/guarddog_impl_test.cc @@ -48,21 +48,23 @@ const int DISABLE_MEGAMISS = 1000000; class DebugTestInterlock : public GuardDogImpl::TestInterlockHook { public: // GuardDogImpl::TestInterlockHook - void signalFromImpl(MonotonicTime time) override { - impl_reached_ = time; + void signalFromImpl() override { + waiting_for_signal_ = false; impl_.notifyAll(); } - void waitFromTest(Thread::MutexBasicLockable& mutex, MonotonicTime time) override + void waitFromTest(Thread::MutexBasicLockable& mutex) override ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex) { - while (impl_reached_ < time) { + ASSERT(!waiting_for_signal_); + waiting_for_signal_ = true; + while (waiting_for_signal_) { impl_.wait(mutex); } } private: Thread::CondVar impl_; - MonotonicTime impl_reached_; + bool waiting_for_signal_ = false; }; // We want to make sure guard-dog is tested with both simulated time and real @@ -307,6 +309,7 @@ TEST_P(GuardDogMissTest, MissTest) { initGuardDog(stats_store_, config_miss_); auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); // We'd better start at 0: checkMiss(0, "MissTest check 1"); // At 300ms we shouldn't have hit the timeout yet: @@ -332,6 +335,7 @@ TEST_P(GuardDogMissTest, MegaMissTest) { initGuardDog(stats_store_, config_mega_); auto unpet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); // We'd better start at 0: checkMegaMiss(0, "MegaMissTest check 1"); // This shouldn't be enough to increment the stat: @@ -358,6 +362,7 @@ TEST_P(GuardDogMissTest, MissCountTest) { initGuardDog(stats_store_, config_miss_); auto sometimes_pet_dog = guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(), "test_thread"); + guard_dog_->forceCheckForTest(); // These steps are executed once without ever touching the watchdog. // Then the last step is to touch the watchdog and repeat the steps. // This verifies that the behavior is reset back to baseline after a touch. @@ -380,9 +385,11 @@ TEST_P(GuardDogMissTest, MissCountTest) { // When we finally touch the dog we should get one more increment once the // timeout value expires: sometimes_pet_dog->touch(); + guard_dog_->forceCheckForTest(); } time_system_->advanceTimeWait(std::chrono::milliseconds(1000)); sometimes_pet_dog->touch(); + guard_dog_->forceCheckForTest(); // Make sure megamiss still works: checkMegaMiss(0UL, "MissCountTest check 5"); time_system_->advanceTimeWait(std::chrono::milliseconds(1500)); @@ -656,6 +663,7 @@ TEST_P(GuardDogActionsTest, MissShouldSaturateOnMissEvent) { // Touch the watchdog, which should allow the event to trigger again. first_dog_->touch(); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); @@ -718,6 +726,7 @@ TEST_P(GuardDogActionsTest, MegaMissShouldSaturateOnMegaMissEvent) { // Touch the watchdog, which should allow the event to trigger again. first_dog_->touch(); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); @@ -733,6 +742,7 @@ TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) { initGuardDog(fake_stats_, config); auto first_dog = guard_dog_->createWatchDog(Thread::ThreadId(10), "test_thread"); auto second_dog = guard_dog_->createWatchDog(Thread::ThreadId(11), "test_thread"); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); }; @@ -747,6 +757,7 @@ TEST_P(GuardDogActionsTest, ShouldRespectEventPriority) { initGuardDog(fake_stats_, config); auto first_dog = guard_dog_->createWatchDog(Thread::ThreadId(10), "test_thread"); auto second_dog = guard_dog_->createWatchDog(Thread::ThreadId(11), "test_thread"); + guard_dog_->forceCheckForTest(); time_system_->advanceTimeWait(std::chrono::milliseconds(101)); guard_dog_->forceCheckForTest(); };