Skip to content

Feature/more reliable uptime calculation #747

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into the base branch from the feature branch
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions validator-api/migrations/20210819120000_monitor_runs.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- keeping track of all monitor runs that have happened will help to
-- solve an issue of mixnode being online only for a single check and yet being assigned 100% uptime
CREATE TABLE monitor_run
(
    -- surrogate key for the run
    id        INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
    -- unix timestamp at which the network monitor run completed
    timestamp INTEGER NOT NULL
);
12 changes: 12 additions & 0 deletions validator-api/src/network_monitor/monitor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ impl Monitor {
// TODO: slightly more graceful shutdown here
process::exit(1);
}

// indicate our run has completed successfully and should be used in any future
// uptime calculations
if let Err(err) = self.node_status_storage.insert_monitor_run().await {
error!(
"Failed to submit monitor run information to the database - {}",
err
);

// TODO: slightly more graceful shutdown here
process::exit(1);
}
}

// checking it this way with a TestReport is rather suboptimal but given the fact we're only
Expand Down
25 changes: 21 additions & 4 deletions validator-api/src/node_status_api/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use rocket::http::{ContentType, Status};
use rocket::response::{self, Responder, Response};
use rocket::Request;
use serde::{Deserialize, Serialize};
use sqlx::types::time::OffsetDateTime;
use std::convert::TryFrom;
use std::fmt::{self, Display, Formatter};
use std::io::Cursor;
Expand Down Expand Up @@ -90,13 +91,21 @@ pub struct MixnodeStatusReport {

impl MixnodeStatusReport {
pub(crate) fn construct_from_last_day_reports(
report_time: OffsetDateTime,
identity: String,
owner: String,
last_day_ipv4: Vec<NodeStatus>,
last_day_ipv6: Vec<NodeStatus>,
last_hour_test_runs: usize,
last_day_test_runs: usize,
) -> Self {
let node_uptimes =
NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
report_time,
last_day_ipv4,
last_day_ipv6,
last_hour_test_runs,
last_day_test_runs,
);

MixnodeStatusReport {
identity,
Expand Down Expand Up @@ -128,13 +137,21 @@ pub struct GatewayStatusReport {

impl GatewayStatusReport {
pub(crate) fn construct_from_last_day_reports(
report_time: OffsetDateTime,
identity: String,
owner: String,
last_day_ipv4: Vec<NodeStatus>,
last_day_ipv6: Vec<NodeStatus>,
last_hour_test_runs: usize,
last_day_test_runs: usize,
) -> Self {
let node_uptimes =
NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
report_time,
last_day_ipv4,
last_day_ipv6,
last_hour_test_runs,
last_day_test_runs,
);

GatewayStatusReport {
identity,
Expand Down
69 changes: 44 additions & 25 deletions validator-api/src/node_status_api/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
use crate::node_status_api::models::Uptime;
use crate::node_status_api::{FIFTEEN_MINUTES, ONE_HOUR};
use crate::storage::models::NodeStatus;
use log::warn;
use sqlx::types::time::OffsetDateTime;
use std::cmp::max;

// A temporary helper struct used to produce reports for active nodes.
pub(crate) struct ActiveNodeDayStatuses {
Expand All @@ -30,33 +32,23 @@ pub(crate) struct NodeUptimes {

impl NodeUptimes {
pub(crate) fn calculate_from_last_day_reports(
report_time: OffsetDateTime,
last_day_ipv4: Vec<NodeStatus>,
last_day_ipv6: Vec<NodeStatus>,
last_hour_test_runs: usize,
last_day_test_runs: usize,
) -> Self {
let now = OffsetDateTime::now_utc();
let hour_ago = (now - ONE_HOUR).unix_timestamp();
let fifteen_minutes_ago = (now - FIFTEEN_MINUTES).unix_timestamp();
let hour_ago = (report_time - ONE_HOUR).unix_timestamp();
let fifteen_minutes_ago = (report_time - FIFTEEN_MINUTES).unix_timestamp();

let ipv4_day_total = last_day_ipv4.len();
let ipv6_day_total = last_day_ipv6.len();
let mut ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
let mut ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();

let ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
let ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();

let ipv4_hour_total = last_day_ipv4
.iter()
.filter(|report| report.timestamp >= hour_ago)
.count();
let ipv6_hour_total = last_day_ipv6
.iter()
.filter(|report| report.timestamp >= hour_ago)
.count();

let ipv4_hour_up = last_day_ipv4
let mut ipv4_hour_up = last_day_ipv4
.iter()
.filter(|report| report.up && report.timestamp >= hour_ago)
.count();
let ipv6_hour_up = last_day_ipv6
let mut ipv6_hour_up = last_day_ipv6
.iter()
.filter(|report| report.up && report.timestamp >= hour_ago)
.count();
Expand All @@ -73,15 +65,42 @@ impl NodeUptimes {
.map(|status| status.timestamp >= fifteen_minutes_ago && status.up) // make sure its within last 15min
.unwrap_or_default();

// the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results than all results in total
// because both of those values originate from the same vector
// If somehow we have more "up" reports than the actual test runs it means something weird is going on
// (or we just started running this code on old data, so if it appears for first 24h, it's fine and actually expected
// as we would not have any run information from the past)
// Either way, bound the number of "up" reports by the number of test runs and log warnings
// if that happens
// NOTE(review): the bounding below uses `max(up_count, test_runs)`, which does NOT enforce
// this bound — `std::cmp::min` is needed. As written, an excess "up" count passes through
// unchanged (and the other ip version is inflated to 100%), so the later
// `Uptime::from_ratio(...).unwrap()` can still panic. TODO confirm and fix.
if ipv4_hour_up > last_hour_test_runs || ipv6_hour_up > last_hour_test_runs {
warn!(
"We have more 'up' reports than the actual number of test runs in last hour! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
ipv4_hour_up,
ipv6_hour_up,
last_hour_test_runs,
);
ipv4_hour_up = max(ipv4_hour_up, last_hour_test_runs);
ipv6_hour_up = max(ipv6_hour_up, last_hour_test_runs);
}

if ipv4_day_up > last_day_test_runs || ipv6_day_up > last_day_test_runs {
warn!(
"We have more 'up' reports than the actual number of test runs in last day! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
ipv4_day_up,
ipv6_day_up,
last_day_test_runs,
);
ipv4_day_up = max(ipv4_day_up, last_day_test_runs);
ipv6_day_up = max(ipv6_day_up, last_day_test_runs);
}

// the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results
// than total test runs as we just bounded them
NodeUptimes {
most_recent_ipv4,
most_recent_ipv6,
last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, ipv4_hour_total).unwrap(),
last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, ipv6_hour_total).unwrap(),
last_day_ipv4: Uptime::from_ratio(ipv4_day_up, ipv4_day_total).unwrap(),
last_day_ipv6: Uptime::from_ratio(ipv6_day_up, ipv6_day_total).unwrap(),
last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, last_hour_test_runs).unwrap(),
last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, last_hour_test_runs).unwrap(),
last_day_ipv4: Uptime::from_ratio(ipv4_day_up, last_day_test_runs).unwrap(),
last_day_ipv6: Uptime::from_ratio(ipv6_day_up, last_day_test_runs).unwrap(),
}
}
}
59 changes: 45 additions & 14 deletions validator-api/src/storage/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
use crate::network_monitor::monitor::summary_producer::NodeResult;
use crate::node_status_api::models::{HistoricalUptime, Uptime};
use crate::node_status_api::utils::ActiveNodeDayStatuses;
use crate::node_status_api::ONE_DAY;
use crate::storage::models::{ActiveNode, NodeStatus};
use crate::storage::UnixTimestamp;
use sqlx::types::time::OffsetDateTime;
use std::convert::TryFrom;

#[derive(Clone)]
Expand Down Expand Up @@ -463,6 +461,43 @@ impl StorageManager {
Ok(())
}

/// Records a completed network monitor test run in the database.
///
/// # Arguments
///
/// * `timestamp`: unix timestamp at which the monitor test run has occurred
pub(crate) async fn insert_monitor_run(
    &self,
    timestamp: UnixTimestamp,
) -> Result<(), sqlx::Error> {
    // a run is just a single row carrying its completion timestamp;
    // the query result (rows affected) is irrelevant, only the error matters
    sqlx::query!("INSERT INTO monitor_run(timestamp) VALUES (?)", timestamp)
        .execute(&self.connection_pool)
        .await
        .map(|_| ())
}

/// Obtains number of network monitor test runs that have occurred within the specified interval.
///
/// # Arguments
///
/// * `since`: unix timestamp indicating the lower bound interval of the selection.
/// * `until`: unix timestamp indicating the upper bound interval of the selection.
pub(crate) async fn get_monitor_runs_count(
    &self,
    since: UnixTimestamp,
    until: UnixTimestamp,
) -> Result<i32, sqlx::Error> {
    // NOTE(review): both bounds are exclusive (`>` / `<`) here, while the in-memory
    // status filters elsewhere in the crate use `>=` — confirm this asymmetry is intended.
    let record = sqlx::query!(
        "SELECT COUNT(*) as count FROM monitor_run WHERE timestamp > ? AND timestamp < ?",
        since,
        until,
    )
    .fetch_one(&self.connection_pool)
    .await?;

    Ok(record.count)
}

pub(crate) async fn purge_old_mixnode_ipv4_statuses(
&self,
timestamp: UnixTimestamp,
Expand Down Expand Up @@ -579,19 +614,17 @@ impl StorageManager {
// since technically it doesn't touch any SQL directly
pub(crate) async fn get_all_active_mixnodes_statuses(
&self,
since: UnixTimestamp,
) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
let now = OffsetDateTime::now_utc();
let day_ago = (now - ONE_DAY).unix_timestamp();

let active_nodes = self.get_all_active_mixnodes(day_ago).await?;
let active_nodes = self.get_all_active_mixnodes(since).await?;

let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
for active_node in active_nodes.into_iter() {
let ipv4_statuses = self
.get_mixnode_ipv4_statuses_since_by_id(active_node.id, day_ago)
.get_mixnode_ipv4_statuses_since_by_id(active_node.id, since)
.await?;
let ipv6_statuses = self
.get_mixnode_ipv6_statuses_since_by_id(active_node.id, day_ago)
.get_mixnode_ipv6_statuses_since_by_id(active_node.id, since)
.await?;

let statuses = ActiveNodeDayStatuses {
Expand All @@ -614,19 +647,17 @@ impl StorageManager {
// since technically it doesn't touch any SQL directly
pub(crate) async fn get_all_active_gateways_statuses(
&self,
since: UnixTimestamp,
) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
let now = OffsetDateTime::now_utc();
let day_ago = (now - ONE_DAY).unix_timestamp();

let active_nodes = self.get_all_active_gateways(day_ago).await?;
let active_nodes = self.get_all_active_gateways(since).await?;

let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
for active_node in active_nodes.into_iter() {
let ipv4_statuses = self
.get_gateway_ipv4_statuses_since_by_id(active_node.id, day_ago)
.get_gateway_ipv4_statuses_since_by_id(active_node.id, since)
.await?;
let ipv6_statuses = self
.get_gateway_ipv6_statuses_since_by_id(active_node.id, day_ago)
.get_gateway_ipv6_statuses_since_by_id(active_node.id, since)
.await?;

let statuses = ActiveNodeDayStatuses {
Expand Down
Loading