Oxend error ping + unfunded tracking

Currently (from a recent PR) we aren't pinging oxend if not active, but that behaviour ended up being quite wrong because lokinet needs to ping even when decommissioned or deregistered (when decommissioned we need the ping to get commissioned again, and if not registered we need the ping to get past the "lokinet isn't pinging" nag screen to prepare a registration).

This considerably revises the pinging behaviour:
- We ping oxend *unless* there is a specific error with our connections (i.e. we *should* be establishing peer connections but don't have any)
- If we do have such an error, we send a new oxend "error" ping to report the error to oxend and get oxend to hold off on sending uptime proofs.

Along the way this also changes how we handle the current node state: instead of just tracking deregistered/decommissioned, we now track three states:
- LooksRegistered -- which means the SN is known to the network (but not necessarily active or fully staked)
- LooksFunded -- which means it is known *and* is fully funded, but not necessarily active
- LooksDecommissioned -- which means it is known, funded, and not currently active (which implies decommissioned).

The funded (or more precisely, unfunded) state is now tracked in rc_lookup_handler in a "greenlist" -- i.e. new SNs that are so new (i.e. "green") that they aren't even fully staked or active yet.
2 years ago · c5e787b8cb
parent bd869b3b07
commit c5e787b8cb
7 changed files with 147 additions and 81 deletions
--- a/llarp/router/abstractrouter.hpp
+++ b/llarp/router/abstractrouter.hpp
@ -199,14 +199,12 @@ namespace llarp
    virtual bool
    IsServiceNode() const = 0;
-    virtual bool
+    /// Called to determine if we're in a bad state (which gets reported to our oxend) that should
-    IsActiveServiceNode() const = 0;
+    /// prevent uptime proofs from going out to the network (so that the error state gets noticed).
-
+    /// Currently this means we require a decent number of peers whenever we are fully staked
-    /// If we are running as a service node and appear active, i.e. registered and not
+    /// (active or decommed).
-    /// decommissioned, we should *not* ping core if we know of too few peers, to indicate to core
+    virtual std::optional<std::string>
-    /// we are not in a good state.
+    OxendErrorState() const = 0;
    virtual bool
    ShouldPingOxen() const = 0;
    virtual bool
    StartRpcServer() = 0;
@ -315,7 +313,9 @@ namespace llarp
    /// set router's service node whitelist
    virtual void
    SetRouterWhitelist(
-        const std::vector<RouterID>& whitelist, const std::vector<RouterID>& greylist) = 0;
+        const std::vector<RouterID>& whitelist,
        const std::vector<RouterID>& greylist,
        const std::vector<RouterID>& unfundedlist) = 0;
    virtual std::unordered_set<RouterID>
    GetRouterWhitelist() const = 0;
--- a/llarp/router/i_rc_lookup_handler.hpp
+++ b/llarp/router/i_rc_lookup_handler.hpp
@ -34,7 +34,9 @@ namespace llarp
    virtual void
    SetRouterWhitelist(
-        const std::vector<RouterID>& whitelist, const std::vector<RouterID>& greylist) = 0;
+        const std::vector<RouterID>& whitelist,
        const std::vector<RouterID>& greylist,
        const std::vector<RouterID>& greenlist) = 0;
    virtual void
    GetRC(const RouterID& router, RCRequestCallback callback, bool forceLookup = false) = 0;
@ -48,6 +50,12 @@ namespace llarp
    virtual bool
    IsGreylisted(const RouterID& remote) const = 0;
    virtual bool
    IsGreenlisted(const RouterID& remote) const = 0;
    virtual bool
    IsRegistered(const RouterID& remote) const = 0;
    virtual bool
    CheckRC(const RouterContact& rc) const = 0;
--- a/llarp/router/rc_lookup_handler.cpp
+++ b/llarp/router/rc_lookup_handler.cpp
@ -32,26 +32,28 @@ namespace llarp
    whitelistRouters.erase(router);
  }
  static void
  loadColourList(std::unordered_set<RouterID>& beigelist, const std::vector<RouterID>& new_beige)
  {
    beigelist.clear();
    beigelist.insert(new_beige.begin(), new_beige.end());
  }
  void
  RCLookupHandler::SetRouterWhitelist(
-      const std::vector<RouterID>& whitelist, const std::vector<RouterID>& greylist)
+      const std::vector<RouterID>& whitelist,
      const std::vector<RouterID>& greylist,
      const std::vector<RouterID>& greenlist)
  {
    if (whitelist.empty())
      return;
    util::Lock l(_mutex);
-    whitelistRouters.clear();
+    loadColourList(whitelistRouters, whitelist);
-    greylistRouters.clear();
+    loadColourList(greylistRouters, greylist);
-    for (auto& router : whitelist)
+    loadColourList(greenlistRouters, greenlist);
    {
      whitelistRouters.emplace(router);
    }
    for (auto& router : greylist)
    {
      greylistRouters.emplace(router);
    }
-    LogInfo("lokinet service node list now has ", whitelistRouters.size(), " routers");
+    LogInfo("lokinet service node list now has ", whitelistRouters.size(), " active routers");
  }
  bool
@ -140,6 +142,20 @@ namespace llarp
    return greylistRouters.count(remote);
  }
  /// Returns true if `remote` is currently present in the greenlist set.
  bool
  RCLookupHandler::IsGreenlisted(const RouterID& remote) const
  {
    // Hold the handler mutex while reading the greenlist set.
    util::Lock lock{_mutex};
    return greenlistRouters.find(remote) != greenlistRouters.end();
  }
  /// Returns true if `remote` appears in any of the white, grey, or green lists.
  bool
  RCLookupHandler::IsRegistered(const RouterID& remote) const
  {
    // Hold the handler mutex for the duration of all three lookups.
    util::Lock lock{_mutex};
    if (whitelistRouters.count(remote))
      return true;
    if (greylistRouters.count(remote))
      return true;
    return greenlistRouters.count(remote) != 0;
  }
  bool
  RCLookupHandler::PathIsAllowed(const RouterID& remote) const
  {
--- a/llarp/router/rc_lookup_handler.hpp
+++ b/llarp/router/rc_lookup_handler.hpp
@ -42,8 +42,11 @@ namespace llarp
    void
    SetRouterWhitelist(
-        const std::vector<RouterID>& whitelist, const std::vector<RouterID>& greylist) override
+        const std::vector<RouterID>& whitelist,
-        EXCLUDES(_mutex);
+        const std::vector<RouterID>& greylist,
        const std::vector<RouterID>& greenlist
        ) override EXCLUDES(_mutex);
    bool
    HaveReceivedWhitelist() const override;
@ -61,6 +64,16 @@ namespace llarp
    bool
    IsGreylisted(const RouterID& remote) const override EXCLUDES(_mutex);
    // "greenlist" = new routers (i.e. "green") that aren't fully funded yet
    bool
    IsGreenlisted(const RouterID& remote) const override EXCLUDES(_mutex);
    // registered just means that there is at least an operator stake, but doesn't require the node
    // be fully funded, active, or not decommed.  (In other words: it is any of the white, grey, or
    // green list).
    bool
    IsRegistered(const RouterID& remote) const override EXCLUDES(_mutex);
    bool
    CheckRC(const RouterContact& rc) const override;
@ -134,8 +147,12 @@ namespace llarp
    bool useWhitelist = false;
    bool isServiceNode = false;
    // whitelist = active routers
    std::unordered_set<RouterID> whitelistRouters GUARDED_BY(_mutex);
    // greylist = fully funded, but decommissioned routers
    std::unordered_set<RouterID> greylistRouters GUARDED_BY(_mutex);
    // greenlist = registered but not fully-staked routers
    std::unordered_set<RouterID> greenlistRouters GUARDED_BY(_mutex);
    using TimePoint = std::chrono::steady_clock::time_point;
    std::unordered_map<RouterID, TimePoint> _routerLookupTimes;
--- a/llarp/router/router.cpp
+++ b/llarp/router/router.cpp
@ -471,16 +471,14 @@ namespace llarp
    return nodedb()->NumLoaded() < KnownPeerWarningThreshold;
  }
-  bool
+  std::optional<std::string>
-  Router::IsActiveServiceNode() const
+  Router::OxendErrorState() const
  {
-    return IsServiceNode() and not(LooksDeregistered() or LooksDecommissioned());
+    // If we're in the white or gray list then we *should* be establishing connections to other
-  }
+    // routers, so if we have almost no peers then something is almost certainly wrong.
-
+    if (LooksFunded() and TooFewPeers())
-  bool
+      return "too few peer connections; lokinet is not adequately connected to the network";
-  Router::ShouldPingOxen() const
+    return std::nullopt;
  {
    return IsActiveServiceNode() and not TooFewPeers();
  }
  void
@ -508,10 +506,17 @@ namespace llarp
  }
  bool
-  Router::LooksDeregistered() const
+  Router::LooksFunded() const
  {
    return IsServiceNode() and whitelistRouters and _rcLookupHandler.HaveReceivedWhitelist()
-        and not _rcLookupHandler.SessionIsAllowed(pubkey());
+        and _rcLookupHandler.SessionIsAllowed(pubkey());
  }
  /// Returns true only when all of the following hold, checked in this order: we are a service
  /// node, `whitelistRouters` is set, the RC lookup handler reports having received a whitelist,
  /// and the handler reports our own pubkey as registered.
  bool
  Router::LooksRegistered() const
  {
    if (not IsServiceNode())
      return false;
    if (not whitelistRouters)
      return false;
    // Until a whitelist has been received, the registration lookup is not consulted at all.
    if (not _rcLookupHandler.HaveReceivedWhitelist())
      return false;
    return _rcLookupHandler.IsRegistered(pubkey());
  }
  bool
@ -1061,12 +1066,16 @@ namespace llarp
    if (now >= m_NextDecommissionWarn)
    {
      constexpr auto DecommissionWarnInterval = 5min;
-      if (auto dereg = LooksDeregistered(); dereg or decom)
+      if (auto registered = LooksRegistered(), funded = LooksFunded();
          not(registered and funded and not decom))
      {
-        // complain about being deregistered
+        // complain about being deregistered/decommed/unfunded
-        LogError(
+        log::error(
-            "We are running as a service node but we seem to be ",
+            logcat,
-            dereg ? "deregistered" : "decommissioned");
+            "We are running as a service node but we seem to be {}",
            not registered ? "deregistered"
                : decom    ? "decommissioned"
                           : "not fully staked");
        m_NextDecommissionWarn = now + DecommissionWarnInterval;
      }
      else if (isSvcNode and TooFewPeers())
@ -1081,7 +1090,7 @@ namespace llarp
    // if we need more sessions to routers and we are not a service node kicked from the network
    // we shall connect out to others
-    if (connected < connectToNum and not LooksDeregistered())
+    if (connected < connectToNum and LooksFunded())
    {
      size_t dlt = connectToNum - connected;
      LogDebug("connecting to ", dlt, " random routers to keep alive");
@ -1233,9 +1242,11 @@ namespace llarp
  void
  Router::SetRouterWhitelist(
-      const std::vector<RouterID>& whitelist, const std::vector<RouterID>& greylist)
+      const std::vector<RouterID>& whitelist,
      const std::vector<RouterID>& greylist,
      const std::vector<RouterID>& unfundedlist)
  {
-    _rcLookupHandler.SetRouterWhitelist(whitelist, greylist);
+    _rcLookupHandler.SetRouterWhitelist(whitelist, greylist, unfundedlist);
  }
  bool
--- a/llarp/router/router.hpp
+++ b/llarp/router/router.hpp
@ -143,7 +143,9 @@ namespace llarp
    void
    SetRouterWhitelist(
-        const std::vector<RouterID>& whitelist, const std::vector<RouterID>& greylist) override;
+        const std::vector<RouterID>& whitelist,
        const std::vector<RouterID>& greylist,
        const std::vector<RouterID>& unfunded) override;
    std::unordered_set<RouterID>
    GetRouterWhitelist() const override
@ -203,9 +205,16 @@ namespace llarp
    bool
    LooksDecommissioned() const;
-    /// return true if we look like we are a deregistered service node
+    /// return true if we look like we are a registered, fully-staked service node (either active or
    /// decommissioned).  This condition determines when we are allowed to (and attempt to) connect
    /// to other peers when running as a service node.
    bool
-    LooksDeregistered() const;
+    LooksFunded() const;
    /// return true if we are a registered service node; note that this only requires a partial stake,
    /// and does not imply that this service node is *active* or fully funded.
    bool
    LooksRegistered() const;
    /// return true if we look like we are allowed and able to test other routers
    bool
@ -378,12 +387,8 @@ namespace llarp
    bool
    IsServiceNode() const override;
-    /// return true if service node *and* not deregistered or decommissioned
+    std::optional<std::string>
-    bool
+    OxendErrorState() const override;
    IsActiveServiceNode() const override;
    bool
    ShouldPingOxen() const override;
    void
    Close();
@ -556,8 +561,11 @@ namespace llarp
    bool m_isServiceNode = false;
    // Delay warning about being decommed/dereged until we've had enough time to sync up with oxend
    static constexpr auto DECOMM_WARNING_STARTUP_DELAY = 15s;
    llarp_time_t m_LastStatsReport = 0s;
-    llarp_time_t m_NextDecommissionWarn = 0s;
+    llarp_time_t m_NextDecommissionWarn = time_now_ms() + DECOMM_WARNING_STARTUP_DELAY;
    std::shared_ptr<llarp::KeyManager> m_keyManager;
    std::shared_ptr<PeerDb> m_peerDb;
--- a/llarp/rpc/lokid_rpc_client.cpp
+++ b/llarp/rpc/lokid_rpc_client.cpp
@ -174,25 +174,27 @@ namespace llarp
      auto makePingRequest = [self = shared_from_this()]() {
        // send a ping
        PubKey pk{};
-        bool should_ping = false;
+        auto r = self->m_Router.lock();
-        if (auto r = self->m_Router.lock())
+        if (not r)
-        {
+          return;  // router has gone away, maybe shutting down?
-          pk = r->pubkey();
+
-          should_ping = r->ShouldPingOxen();
+        pk = r->pubkey();
-        }
+
-        if (should_ping)
+        nlohmann::json payload = {
-        {
+            {"pubkey_ed25519", oxenc::to_hex(pk.begin(), pk.end())},
-          nlohmann::json payload = {
+            {"version", {VERSION[0], VERSION[1], VERSION[2]}}};
-              {"pubkey_ed25519", oxenc::to_hex(pk.begin(), pk.end())},
+
-              {"version", {VERSION[0], VERSION[1], VERSION[2]}}};
+        if (auto err = r->OxendErrorState())
-          self->Request(
+          payload["error"] = *err;
-              "admin.lokinet_ping",
+
-              [](bool success, std::vector<std::string> data) {
+        self->Request(
-                (void)data;
+            "admin.lokinet_ping",
-                LogDebug("Received response for ping. Successful: ", success);
+            [](bool success, std::vector<std::string> data) {
-              },
+              (void)data;
-              payload.dump());
+              LogDebug("Received response for ping. Successful: ", success);
-        }
+            },
            payload.dump());
        // subscribe to block updates
        self->Request("sub.block", [](bool success, std::vector<std::string> data) {
          if (data.empty() or not success)
@ -216,18 +218,13 @@ namespace llarp
    LokidRpcClient::HandleNewServiceNodeList(const nlohmann::json& j)
    {
      std::unordered_map<RouterID, PubKey> keymap;
-      std::vector<RouterID> activeNodeList, nonActiveNodeList;
+      std::vector<RouterID> activeNodeList, decommNodeList, unfundedNodeList;
      if (not j.is_array())
        throw std::runtime_error{
            "Invalid service node list: expected array of service node states"};
      for (auto& snode : j)
      {
        // Skip unstaked snodes:
        if (const auto funded_itr = snode.find("funded"); funded_itr == snode.end()
            or not funded_itr->is_boolean() or not funded_itr->get<bool>())
          continue;
        const auto ed_itr = snode.find("pubkey_ed25519");
        if (ed_itr == snode.end() or not ed_itr->is_string())
          continue;
@ -238,6 +235,10 @@ namespace llarp
        if (active_itr == snode.end() or not active_itr->is_boolean())
          continue;
        const bool active = active_itr->get<bool>();
        const auto funded_itr = snode.find("funded");
        if (funded_itr == snode.end() or not funded_itr->is_boolean())
          continue;
        const bool funded = funded_itr->get<bool>();
        RouterID rid;
        PubKey pk;
@ -246,7 +247,10 @@ namespace llarp
          continue;
        keymap[rid] = pk;
-        (active ? activeNodeList : nonActiveNodeList).push_back(std::move(rid));
+        (active       ? activeNodeList
             : funded ? decommNodeList
                      : unfundedNodeList)
            .push_back(std::move(rid));
      }
      if (activeNodeList.empty())
@ -254,17 +258,19 @@ namespace llarp
        LogWarn("got empty service node list, ignoring.");
        return;
      }
      // inform router about the new list
      if (auto router = m_Router.lock())
      {
        auto& loop = router->loop();
        loop->call([this,
                    active = std::move(activeNodeList),
-                    inactive = std::move(nonActiveNodeList),
+                    decomm = std::move(decommNodeList),
                    unfunded = std::move(unfundedNodeList),
                    keymap = std::move(keymap),
                    router = std::move(router)]() mutable {
          m_KeyMap = std::move(keymap);
-          router->SetRouterWhitelist(active, inactive);
+          router->SetRouterWhitelist(active, decomm, unfunded);
        });
      }
      else