You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
lokinet/llarp/consensus/reachability_testing.hpp

148 lines
6.0 KiB
C++

#pragma once
#include <llarp/router_id.hpp>
#include <llarp/util/time.hpp>
#include <chrono>
#include <cstddef>
#include <functional>
#include <optional>
#include <queue>
#include <random>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
// Forward declaration: this header only uses `Router` as a pointer parameter type (see
// reachability_testing::next_random), so the full definition is not needed here.
namespace llarp
{
struct Router;
}
namespace llarp::consensus
{
namespace detail
{
  // Monotonic clock used for all reachability-test timing.
  using clock_t = std::chrono::steady_clock;
  // A point in time as measured by clock_t.
  using time_point_t = clock_t::time_point;

  // Comparator for tuple-like values of type T that orders them by their N-th element using
  // std::greater.  Plugged into std::priority_queue this yields a queue whose top() is the entry
  // with the *smallest* N-th element.
  template <typename T, size_t N>
  struct nth_greater
  {
    constexpr bool
    operator()(const T& lhs, const T& rhs) const
    {
      using element_t = std::tuple_element_t<N, T>;
      const auto& left = std::get<N>(lhs);
      const auto& right = std::get<N>(rhs);
      return std::greater<element_t>{}(left, right);
    }
  };

  // Bookkeeping about tests made *against* this node.  All fields start out value-initialized
  // (clock epoch / false).
  struct incoming_test_state
  {
    // Time of the most recent incoming test.
    time_point_t last_test = time_point_t{};
    // Presumably the last time we logged a complaint about not being tested -- confirm against
    // the implementation.
    time_point_t last_whine = time_point_t{};
    // Presumably whether the previous check considered us unreachable -- confirm against the
    // implementation.
    bool was_failing{false};
  };
}  // namespace detail
// Re-export the clock types from `detail` so they can be named directly in this namespace.
using clock_t = detail::clock_t;
using time_point_t = detail::time_point_t;
// Period of the timer tick on which we check whether any tests are due.
constexpr auto REACHABILITY_TESTING_TIMER_INTERVAL = std::chrono::milliseconds{50};
// Bookkeeping for reachability testing: a queue of nodes to test at random, a time-ordered
// queue of failing nodes awaiting a re-test, and a record of when this node itself was last
// tested.  Only declarations are visible in this header.
class reachability_testing
{
public:
// Distribution (mean 10, standard deviation 3) for the seconds between node tests: we throw in
// some randomness to avoid potential clustering of tests. (Note that there is some granularity
// here as the test timer only runs every REACHABILITY_TESTING_TIMER_INTERVAL).
std::normal_distribution<float> TESTING_INTERVAL{10.0, 3.0};
// The linear backoff after each consecutive test failure before we re-test. Specifically we
// schedule the next re-test for (TESTING_BACKOFF*previous_failures) + TESTING_INTERVAL(rng).
inline static constexpr auto TESTING_BACKOFF = 10s;
// The upper bound for the re-test interval.
inline static constexpr auto TESTING_BACKOFF_MAX = 2min;
// The maximum number of nodes that we will re-test at once (i.e. per timer tick; see
// REACHABILITY_TESTING_TIMER_INTERVAL); mainly intended to throttle ourselves if, for instance,
// our own connectivity loss makes us accumulate tons of nodes to test all at once. (Despite the
// random intervals, this can happen if we also get decommissioned, during which we can't test at
// all but still have lots of failing nodes we want to test right away when we get
// recommissioned).
inline static constexpr int MAX_RETESTS_PER_TICK = 4;
// Maximum time without a ping before we start whining about it.
//
// We have a probability of about 0.368* of *not* getting pinged within a ping interval (10s),
// and so the probability of not getting a ping for 2 minutes (i.e. 12 test spans) just because
// we haven't been selected is extremely small (0.0000061). It also coincides nicely with
// blockchain time (i.e. two minutes) and our max testing backoff.
//
// * = approx value of ((n-1)/n)^n for non-tiny values of n
inline static constexpr auto MAX_TIME_WITHOUT_PING = 2min;
// How often we whine in the logs about being unreachable.
inline static constexpr auto WHINING_INTERVAL = 2min;
private:
// Queue of pubkeys of service nodes to test; we pop off the back of this until the queue
// empties, then we refill it with a shuffled list of all pubkeys and pull from it until it is
// empty again, etc.
std::vector<RouterID> testing_queue;
// The next time for a general test.  Starts at time_point_t::min(), presumably so that the
// first general test is due immediately.
time_point_t next_general_test = time_point_t::min();
// When this object was constructed; presumably used so that we hold off on whining about
// missing pings for a while after startup -- confirm against the implementation.
const time_point_t startup = clock_t::now();
// Pubkeys, next test times, and sequential failure counts of service nodes that are currently
// in "failed" status; we retest them first after 10s then back off linearly by an additional
// 10s per failure up to a maximum interval, until we get a successful response.
// NOTE(review): this comment previously gave the maximum as 2m30s, which disagrees with
// TESTING_BACKOFF_MAX (2min) -- confirm which one the implementation applies.
using FailingPK = std::tuple<RouterID, time_point_t, int>;
// Ordered on tuple element 1 (the next test time) with a greater-than comparator, so top() is
// the entry with the earliest next test time.
std::priority_queue<FailingPK, std::vector<FailingPK>, detail::nth_greater<FailingPK, 1>>
failing_queue;
// Set of pubkeys currently considered failing (see remove_node_from_failing); presumably kept
// alongside failing_queue for membership lookup -- confirm against the implementation.
std::unordered_set<RouterID> failing;
// Track the last time *this node* was tested by other network nodes; used to detect and warn
// about possible network issues.
detail::incoming_test_state last;
public:
// If it is time to perform another random test, this returns the next node to test from the
// testing queue, also updating the timer for the next test. If it is not yet time, or if the
// queue is empty and cannot currently be replenished, returns std::nullopt. If the queue
// empties then this builds a new one by shuffling current public keys in the swarm's "all
// nodes" then starts using the new queue for this and subsequent calls.
//
// `now` defaults to the current clock_t time at the point of the call.
//
// `requeue` is mainly for internal use: if false it avoids rebuilding the queue if we run
// out (and instead just returns nullopt).
std::optional<RouterID>
next_random(Router* router, const time_point_t& now = clock_t::now(), bool requeue = true);
// Removes and returns up to MAX_RETESTS_PER_TICK nodes that are due to be tested (i.e.
// next-testing-time <= now). Returns [pubkey, #previous-failures] for each.
std::vector<std::pair<RouterID, int>>
get_failing(const time_point_t& now = clock_t::now());
// Adds a bad node pubkey to the failing list, to be re-tested soon (with a backoff depending on
// `previous_failures`; see TESTING_BACKOFF). `previous_failures` should be the number of
// previous failures *before* this one, i.e. 0 for a random general test; or the failure count
// returned by `get_failing` for repeated failures.
void
add_failing_node(const RouterID& pk, int previous_failures = 0);
/// Removes the public key from the failing set.
void
remove_node_from_failing(const RouterID& pk);
// Called when this router receives an incoming session.
void
incoming_ping(const time_point_t& now = clock_t::now());
// Check whether we received incoming pings recently.
void
check_incoming_tests(const time_point_t& now = clock_t::now());
};
} // namespace llarp::consensus