Optimize when to acquire ledgers from the network.

ximinez · Mark Travis · ximinez · commit 87a89ba6b21a · 2024-08-23T17:59:06.000-04:00
In particular, avoid acquiring ledgers that are likely to be produced locally very soon. Derived from XRPLF#4764. Co-authored-by: Mark Travis <mtravis@ripple.com>
diff --git a/src/ripple/app/ledger/InboundLedger.h b/src/ripple/app/ledger/InboundLedger.h
@@ -197,6 +197,26 @@ class InboundLedger final : public TimeoutCounter,
     std::unique_ptr<PeerSet> mPeerSet;
 };
 
+/** Render an InboundLedger::Reason as the name of its enumerator.
+ *
+ *  @param reason The acquisition reason to convert.
+ *  @return "HISTORY", "SHARD", "GENERIC" or "CONSENSUS" for the matching
+ *          enumerator. Any other value trips an assert and, if the assert
+ *          is compiled out, produces "unknown".
+ */
+inline std::string
+to_string(InboundLedger::Reason reason)
+{
+    using Reason = InboundLedger::Reason;
+
+    if (reason == Reason::HISTORY)
+        return "HISTORY";
+    if (reason == Reason::SHARD)
+        return "SHARD";
+    if (reason == Reason::GENERIC)
+        return "GENERIC";
+    if (reason == Reason::CONSENSUS)
+        return "CONSENSUS";
+
+    // Not one of the enumerators handled above.
+    assert(false);
+    return "unknown";
+}
+
 }  // namespace ripple
 
 #endif
diff --git a/src/ripple/app/ledger/impl/InboundLedgers.cpp b/src/ripple/app/ledger/impl/InboundLedgers.cpp
@@ -69,23 +69,103 @@ class InboundLedgersImp : public InboundLedgers
         std::uint32_t seq,
         InboundLedger::Reason reason) override
     {
+        std::stringstream ss;
+        ss << "InboundLedger::acquire: "
+           << "Request: " << to_string(hash) << ", " << seq
+           << " NeedNetworkLedger: "
+           << (app_.getOPs().isNeedNetworkLedger() ? "yes" : "no")
+           << " Reason: " << to_string(reason) << " Old rule: ";
+        if (app_.getOPs().isNeedNetworkLedger() &&
+            (reason != InboundLedger::Reason::GENERIC) &&
+            (reason != InboundLedger::Reason::CONSENSUS))
+            ss << "false";
+        else
+            ss << "true";
+
         assert(hash.isNonZero());
         assert(
             reason != InboundLedger::Reason::SHARD ||
             (seq != 0 && app_.getShardStore()));
 
-        // probably not the right rule
-        if (app_.getOPs().isNeedNetworkLedger() &&
-            (reason != InboundLedger::Reason::GENERIC) &&
-            (reason != InboundLedger::Reason::CONSENSUS))
+        /*  Acquiring ledgers is somewhat expensive. It requires lots of
+         *  computation and network communication. Avoid it when it's not
+         *  appropriate. Every validation from a peer for a ledger that
+         *  we do not have locally results in a call to this function: even
+         *  if we are moments away from validating the same ledger.
+         */
+        // If the node is not in "full" state, it needs to sync to the network,
+        // and doesn't have the necessary tx's and ledger entries to build the
+        // ledger.
+        bool const isFull = app_.getOPs().isFull();
+        // fallingBehind means that, when consensus last began, the current
+        // ledger was at least 2 behind the validated ledger (or the previous
+        // ledger could not be found). If the node is falling behind the
+        // network, it probably needs information from the network to catch up.
+        //
+        // The reason this should not simply be only at least 1 behind the
+        // validated ledger is that a slight lag is the normal case, because
+        // some nodes get there slightly later than others. A difference of 2
+        // means that at least a full ledger interval has passed, so the node
+        // is beginning to fall behind.
+        bool const fallingBehind = app_.getOPs().isFallingBehind();
+        // If everything else is ok, don't try to acquire the ledger if the
+        // requested seq is in the near future relative to the validated ledger.
+        // If the requested ledger is between 1 and 19 (inclusive) ledgers
+        // ahead of the validated ledger, this node has not built it yet, but
+        // it's possible/likely it has the tx's necessary to build it and get
+        // caught up. Plus it might not become validated. On the other hand, if
+        // it's 20 or more in the future, this node should request it so that
+        // it can jump ahead and get caught up.
+        LedgerIndex const validSeq =
+            app_.getLedgerMaster().getValidLedgerIndex();
+        constexpr std::size_t lagLeeway = 20;
+        bool const nearFuture =
+            (seq > validSeq) && (seq < validSeq + lagLeeway);
+        // If everything else is ok, don't try to acquire the ledger if the
+        // request is related to consensus. (Note that consensus calls usually
+        // pass a seq of 0, so nearFuture will be false other than on a brand
+        // new network.)
+        bool const consensus = reason == InboundLedger::Reason::CONSENSUS;
+
+        bool const shouldAcquire = [&]() {
+            // If the node is not synced, try to get the ledger.
+            if (!isFull)
+                return true;
+            // If the node is falling behind, try to get the ledger.
+            if (fallingBehind)
+                return true;
+            // If the ledger is in the near future, do NOT get the ledger. This
+            // node is probably about to build it.
+            if (nearFuture)
+                return false;
+            // If the request is because of consensus, do NOT get the ledger.
+            // This node is probably about to build it.
+            if (consensus)
+                return false;
+            return true;
+        }();
+        ss << " Evaluating whether to acquire ledger " << hash
+           << ". full: " << (isFull ? "true" : "false")
+           << ". falling behind: " << (fallingBehind ? "true" : "false")
+           << ". ledger sequence " << seq << ". Valid sequence: " << validSeq
+           << ". Lag leeway: " << lagLeeway
+           << ". request for near future ledger: "
+           << (nearFuture ? "true" : "false")
+           << ". Consensus: " << (consensus ? "true" : "false")
+           << ". Acquiring ledger? " << (shouldAcquire ? "true" : "false");
+
+        if (!shouldAcquire)
+        {
+            JLOG(j_.debug()) << "Abort(rule): " << ss.str();
             return {};
+        }
 
         bool isNew = true;
         std::shared_ptr<InboundLedger> inbound;
         {
             ScopedLockType sl(mLock);
             if (stopping_)
             {
+                JLOG(j_.debug()) << "Abort(stopping): " << ss.str();
                 return {};
             }
 
@@ -111,13 +191,19 @@ class InboundLedgersImp : public InboundLedgers
         }
 
         if (inbound->isFailed())
+        {
+            JLOG(j_.debug()) << "Abort(failed): " << ss.str();
             return {};
+        }
 
         if (!isNew)
             inbound->update(seq);
 
         if (!inbound->isComplete())
+        {
+            JLOG(j_.debug()) << "Abort(incomplete): " << ss.str();
             return {};
+        }
 
         if (reason == InboundLedger::Reason::HISTORY)
         {
@@ -130,14 +216,17 @@ class InboundLedgersImp : public InboundLedgers
             if (!shardStore)
             {
                 JLOG(j_.error())
-                    << "Acquiring shard with no shard store available";
+                    << "Acquiring shard with no shard store available"
+                    << ss.str();
                 return {};
             }
             if (inbound->getLedger()->stateMap().family().isShardBacked())
                 shardStore->setStored(inbound->getLedger());
             else
                 shardStore->storeLedger(inbound->getLedger());
         }
+
+        JLOG(j_.debug()) << "Requesting: " << ss.str();
         return inbound->getLedger();
     }
 
diff --git a/src/ripple/app/misc/NetworkOPs.cpp b/src/ripple/app/misc/NetworkOPs.cpp
@@ -430,6 +430,8 @@ class NetworkOPsImp final : public NetworkOPs
     clearLedgerFetch() override;
     Json::Value
     getLedgerFetchInfo() override;
+    bool
+    isFallingBehind() const override;
     std::uint32_t
     acceptLedger(
         std::optional<std::chrono::milliseconds> consensusDelay) override;
@@ -729,6 +731,7 @@ class NetworkOPsImp final : public NetworkOPs
     std::atomic<bool> amendmentBlocked_{false};
     std::atomic<bool> amendmentWarned_{false};
     std::atomic<bool> unlBlocked_{false};
+    std::atomic<bool> fallingBehind_{false};
 
     ClosureCounter<void, boost::system::error_code const&> waitHandlerCounter_;
     boost::asio::steady_timer heartbeatTimer_;
@@ -1810,22 +1813,69 @@ NetworkOPsImp::beginConsensus(uint256 const& networkClosed)
 
     auto closingInfo = m_ledgerMaster.getCurrentLedger()->info();
 
-    JLOG(m_journal.info()) << "Consensus time for #" << closingInfo.seq
+    JLOG(m_journal.info()) << "beginConsensus time for #" << closingInfo.seq
                            << " with LCL " << closingInfo.parentHash;
 
-    auto prevLedger = m_ledgerMaster.getLedgerByHash(closingInfo.parentHash);
+    fallingBehind_ = false;
+    if (closingInfo.seq < m_ledgerMaster.getValidLedgerIndex() - 1)
+    {
+        fallingBehind_ = true;
+        JLOG(m_journal.warn())
+            << "beginConsensus Current ledger " << closingInfo.seq
+            << " is at least 2 behind validated "
+            << m_ledgerMaster.getValidLedgerIndex();
+    }
+
+    auto const prevLedger =
+        m_ledgerMaster.getLedgerByHash(closingInfo.parentHash);
 
     if (!prevLedger)
     {
+        fallingBehind_ = true;
         // this shouldn't happen unless we jump ledgers
         if (mMode == OperatingMode::FULL)
         {
-            JLOG(m_journal.warn()) << "Don't have LCL, going to tracking";
+            JLOG(m_journal.warn())
+                << "beginConsensus Don't have LCL, going to tracking";
             setMode(OperatingMode::TRACKING);
         }
 
         return false;
     }
+    else if (!m_ledgerMaster.isValidated(*prevLedger))
+    {
+        // Do not merge this block unless it proves useful.
+        auto parentLedger = prevLedger;
+        for (; parentLedger && !m_ledgerMaster.isValidated(*parentLedger) &&
+             parentLedger->info().seq > closingInfo.seq - 20;
+             parentLedger = m_ledgerMaster.getLedgerByHash(
+                 parentLedger->info().parentHash))
+        {
+            JLOG(m_journal.debug())
+                << "beginConsensus for " << closingInfo.seq << ". Ledger "
+                << parentLedger->info().seq << " (" << parentLedger->info().hash
+                << ") is not validated.";
+        }
+        if (parentLedger && m_ledgerMaster.isValidated(*parentLedger))
+        {
+            JLOG(m_journal.debug())
+                << "beginConsensus for " << closingInfo.seq << ". Ledger "
+                << parentLedger->info().seq << " (" << parentLedger->info().hash
+                << ") IS validated.";
+        }
+        else
+        {
+            if (parentLedger)
+                JLOG(m_journal.warn())
+                    << "beginConsensus for " << closingInfo.seq << ". Previous "
+                    << closingInfo.seq - parentLedger->info().seq
+                    << " ledgers are not validated";
+            else
+                JLOG(m_journal.warn())
+                    << "beginConsensus for " << closingInfo.seq
+                    << ". Ran out of parent ledgers to check.";
+        }
+    }
 
     assert(prevLedger->info().hash == closingInfo.parentHash);
     assert(
@@ -1863,7 +1913,7 @@ NetworkOPsImp::beginConsensus(uint256 const& networkClosed)
         mLastConsensusPhase = currPhase;
     }
 
-    JLOG(m_journal.debug()) << "Initiating consensus engine";
+    JLOG(m_journal.debug()) << "beginConsensus Initiating consensus engine";
     return true;
 }
 
@@ -2749,6 +2799,12 @@ NetworkOPsImp::getLedgerFetchInfo()
     return app_.getInboundLedgers().getInfo();
 }
 
+/** Report the value currently stored in the fallingBehind_ flag.
+ *
+ *  @return The flag's current value.
+ */
+bool
+NetworkOPsImp::isFallingBehind() const
+{
+    // Explicit atomic read; same effect as the implicit conversion to bool.
+    return fallingBehind_.load();
+}
+
 void
 NetworkOPsImp::pubProposedTransaction(
     std::shared_ptr<ReadView const> const& ledger,
diff --git a/src/ripple/app/misc/NetworkOPs.h b/src/ripple/app/misc/NetworkOPs.h
@@ -227,6 +227,8 @@ class NetworkOPs : public InfoSub::Source
     clearLedgerFetch() = 0;
     virtual Json::Value
     getLedgerFetchInfo() = 0;
+    virtual bool
+    isFallingBehind() const = 0;
 
     /** Accepts the current transaction tree, return the new ledger's sequence
 
diff --git a/src/ripple/protocol/LedgerHeader.h b/src/ripple/protocol/LedgerHeader.h
@@ -55,6 +55,8 @@ struct LedgerHeader
 
     // If validated is false, it means "not yet validated."
     // Once validated is true, it will never be set false at a later time.
+    // NOTE: If you are accessing this directly, you are probably doing it
+    //   wrong. Use LedgerMaster::isValidated().
     // VFALCO TODO Make this not mutable
     bool mutable validated = false;
     bool accepted = false;