Skip to content

Commit 15ae0f5

Browse files
authored
Feature/more reliable uptime calculation (#747)
* New database table holding monitor run info * SQL interface for new table * Updated uptime calculation to instead rely on number of monitor test runs
1 parent 2923d4b commit 15ae0f5

File tree

6 files changed

+260
-64
lines changed

6 files changed

+260
-64
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- keeping track of all monitor runs that have happened will help to
2+
-- solve an issue of mixnode being online only for a single check and yet being assigned 100% uptime
3+
CREATE TABLE monitor_run
4+
(
5+
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
6+
timestamp INTEGER NOT NULL
7+
)

validator-api/src/network_monitor/monitor/mod.rs

+12
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,18 @@ impl Monitor {
7777
// TODO: slightly more graceful shutdown here
7878
process::exit(1);
7979
}
80+
81+
// indicate our run has completed successfully and should be used in any future
82+
// uptime calculations
83+
if let Err(err) = self.node_status_storage.insert_monitor_run().await {
84+
error!(
85+
"Failed to submit monitor run information to the database - {}",
86+
err
87+
);
88+
89+
// TODO: slightly more graceful shutdown here
90+
process::exit(1);
91+
}
8092
}
8193

8294
// checking it this way with a TestReport is rather suboptimal but given the fact we're only

validator-api/src/node_status_api/models.rs

+21-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use rocket::http::{ContentType, Status};
77
use rocket::response::{self, Responder, Response};
88
use rocket::Request;
99
use serde::{Deserialize, Serialize};
10+
use sqlx::types::time::OffsetDateTime;
1011
use std::convert::TryFrom;
1112
use std::fmt::{self, Display, Formatter};
1213
use std::io::Cursor;
@@ -90,13 +91,21 @@ pub struct MixnodeStatusReport {
9091

9192
impl MixnodeStatusReport {
9293
pub(crate) fn construct_from_last_day_reports(
94+
report_time: OffsetDateTime,
9395
identity: String,
9496
owner: String,
9597
last_day_ipv4: Vec<NodeStatus>,
9698
last_day_ipv6: Vec<NodeStatus>,
99+
last_hour_test_runs: usize,
100+
last_day_test_runs: usize,
97101
) -> Self {
98-
let node_uptimes =
99-
NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
102+
let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
103+
report_time,
104+
last_day_ipv4,
105+
last_day_ipv6,
106+
last_hour_test_runs,
107+
last_day_test_runs,
108+
);
100109

101110
MixnodeStatusReport {
102111
identity,
@@ -128,13 +137,21 @@ pub struct GatewayStatusReport {
128137

129138
impl GatewayStatusReport {
130139
pub(crate) fn construct_from_last_day_reports(
140+
report_time: OffsetDateTime,
131141
identity: String,
132142
owner: String,
133143
last_day_ipv4: Vec<NodeStatus>,
134144
last_day_ipv6: Vec<NodeStatus>,
145+
last_hour_test_runs: usize,
146+
last_day_test_runs: usize,
135147
) -> Self {
136-
let node_uptimes =
137-
NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
148+
let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
149+
report_time,
150+
last_day_ipv4,
151+
last_day_ipv6,
152+
last_hour_test_runs,
153+
last_day_test_runs,
154+
);
138155

139156
GatewayStatusReport {
140157
identity,

validator-api/src/node_status_api/utils.rs

+44-25
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
use crate::node_status_api::models::Uptime;
55
use crate::node_status_api::{FIFTEEN_MINUTES, ONE_HOUR};
66
use crate::storage::models::NodeStatus;
7+
use log::warn;
78
use sqlx::types::time::OffsetDateTime;
9+
use std::cmp::max;
810

911
// A temporary helper struct used to produce reports for active nodes.
1012
pub(crate) struct ActiveNodeDayStatuses {
@@ -30,33 +32,23 @@ pub(crate) struct NodeUptimes {
3032

3133
impl NodeUptimes {
3234
pub(crate) fn calculate_from_last_day_reports(
35+
report_time: OffsetDateTime,
3336
last_day_ipv4: Vec<NodeStatus>,
3437
last_day_ipv6: Vec<NodeStatus>,
38+
last_hour_test_runs: usize,
39+
last_day_test_runs: usize,
3540
) -> Self {
36-
let now = OffsetDateTime::now_utc();
37-
let hour_ago = (now - ONE_HOUR).unix_timestamp();
38-
let fifteen_minutes_ago = (now - FIFTEEN_MINUTES).unix_timestamp();
41+
let hour_ago = (report_time - ONE_HOUR).unix_timestamp();
42+
let fifteen_minutes_ago = (report_time - FIFTEEN_MINUTES).unix_timestamp();
3943

40-
let ipv4_day_total = last_day_ipv4.len();
41-
let ipv6_day_total = last_day_ipv6.len();
44+
let mut ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
45+
let mut ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();
4246

43-
let ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
44-
let ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();
45-
46-
let ipv4_hour_total = last_day_ipv4
47-
.iter()
48-
.filter(|report| report.timestamp >= hour_ago)
49-
.count();
50-
let ipv6_hour_total = last_day_ipv6
51-
.iter()
52-
.filter(|report| report.timestamp >= hour_ago)
53-
.count();
54-
55-
let ipv4_hour_up = last_day_ipv4
47+
let mut ipv4_hour_up = last_day_ipv4
5648
.iter()
5749
.filter(|report| report.up && report.timestamp >= hour_ago)
5850
.count();
59-
let ipv6_hour_up = last_day_ipv6
51+
let mut ipv6_hour_up = last_day_ipv6
6052
.iter()
6153
.filter(|report| report.up && report.timestamp >= hour_ago)
6254
.count();
@@ -73,15 +65,42 @@ impl NodeUptimes {
7365
.map(|status| status.timestamp >= fifteen_minutes_ago && status.up) // make sure it's within the last 15min
7466
.unwrap_or_default();
7567

76-
// the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results than all results in total
77-
// because both of those values originate from the same vector
68+
// If somehow we have more "up" reports than the actual test runs it means something weird is going on
69+
// (or we just started running this code on old data, so if it appears for the first 24h, it's fine and actually expected
70+
// as we would not have any run information from the past)
71+
// Either way, bound the number of "up" reports by the number of test runs and log warnings
72+
// if that happens
73+
if ipv4_hour_up > last_hour_test_runs || ipv6_hour_up > last_hour_test_runs {
74+
warn!(
75+
"We have more 'up' reports than the actual number of test runs in last hour! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
76+
ipv4_hour_up,
77+
ipv6_hour_up,
78+
last_hour_test_runs,
79+
);
80+
ipv4_hour_up = max(ipv4_hour_up, last_hour_test_runs);
81+
ipv6_hour_up = max(ipv6_hour_up, last_hour_test_runs);
82+
}
83+
84+
if ipv4_day_up > last_day_test_runs || ipv6_day_up > last_day_test_runs {
85+
warn!(
86+
"We have more 'up' reports than the actual number of test runs in last day! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
87+
ipv4_day_up,
88+
ipv6_day_up,
89+
last_day_test_runs,
90+
);
91+
ipv4_day_up = max(ipv4_day_up, last_day_test_runs);
92+
ipv6_day_up = max(ipv6_day_up, last_day_test_runs);
93+
}
94+
95+
// the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results
96+
// than total test runs as we just bounded them
7897
NodeUptimes {
7998
most_recent_ipv4,
8099
most_recent_ipv6,
81-
last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, ipv4_hour_total).unwrap(),
82-
last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, ipv6_hour_total).unwrap(),
83-
last_day_ipv4: Uptime::from_ratio(ipv4_day_up, ipv4_day_total).unwrap(),
84-
last_day_ipv6: Uptime::from_ratio(ipv6_day_up, ipv6_day_total).unwrap(),
100+
last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, last_hour_test_runs).unwrap(),
101+
last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, last_hour_test_runs).unwrap(),
102+
last_day_ipv4: Uptime::from_ratio(ipv4_day_up, last_day_test_runs).unwrap(),
103+
last_day_ipv6: Uptime::from_ratio(ipv6_day_up, last_day_test_runs).unwrap(),
85104
}
86105
}
87106
}

validator-api/src/storage/manager.rs

+45-14
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@
44
use crate::network_monitor::monitor::summary_producer::NodeResult;
55
use crate::node_status_api::models::{HistoricalUptime, Uptime};
66
use crate::node_status_api::utils::ActiveNodeDayStatuses;
7-
use crate::node_status_api::ONE_DAY;
87
use crate::storage::models::{ActiveNode, NodeStatus};
98
use crate::storage::UnixTimestamp;
10-
use sqlx::types::time::OffsetDateTime;
119
use std::convert::TryFrom;
1210

1311
#[derive(Clone)]
@@ -463,6 +461,43 @@ impl StorageManager {
463461
Ok(())
464462
}
465463

464+
/// Creates a database entry for a finished network monitor test run.
465+
///
466+
/// # Arguments
467+
///
468+
/// * `timestamp`: unix timestamp at which the monitor test run has occurred
469+
pub(crate) async fn insert_monitor_run(
470+
&self,
471+
timestamp: UnixTimestamp,
472+
) -> Result<(), sqlx::Error> {
473+
sqlx::query!("INSERT INTO monitor_run(timestamp) VALUES (?)", timestamp)
474+
.execute(&self.connection_pool)
475+
.await?;
476+
Ok(())
477+
}
478+
479+
/// Obtains number of network monitor test runs that have occurred within the specified interval.
480+
///
481+
/// # Arguments
482+
///
483+
/// * `since`: unix timestamp indicating the lower bound interval of the selection.
484+
/// * `until`: unix timestamp indicating the upper bound interval of the selection.
485+
pub(crate) async fn get_monitor_runs_count(
486+
&self,
487+
since: UnixTimestamp,
488+
until: UnixTimestamp,
489+
) -> Result<i32, sqlx::Error> {
490+
let count = sqlx::query!(
491+
"SELECT COUNT(*) as count FROM monitor_run WHERE timestamp > ? AND timestamp < ?",
492+
since,
493+
until,
494+
)
495+
.fetch_one(&self.connection_pool)
496+
.await?
497+
.count;
498+
Ok(count)
499+
}
500+
466501
pub(crate) async fn purge_old_mixnode_ipv4_statuses(
467502
&self,
468503
timestamp: UnixTimestamp,
@@ -579,19 +614,17 @@ impl StorageManager {
579614
// since technically it doesn't touch any SQL directly
580615
pub(crate) async fn get_all_active_mixnodes_statuses(
581616
&self,
617+
since: UnixTimestamp,
582618
) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
583-
let now = OffsetDateTime::now_utc();
584-
let day_ago = (now - ONE_DAY).unix_timestamp();
585-
586-
let active_nodes = self.get_all_active_mixnodes(day_ago).await?;
619+
let active_nodes = self.get_all_active_mixnodes(since).await?;
587620

588621
let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
589622
for active_node in active_nodes.into_iter() {
590623
let ipv4_statuses = self
591-
.get_mixnode_ipv4_statuses_since_by_id(active_node.id, day_ago)
624+
.get_mixnode_ipv4_statuses_since_by_id(active_node.id, since)
592625
.await?;
593626
let ipv6_statuses = self
594-
.get_mixnode_ipv6_statuses_since_by_id(active_node.id, day_ago)
627+
.get_mixnode_ipv6_statuses_since_by_id(active_node.id, since)
595628
.await?;
596629

597630
let statuses = ActiveNodeDayStatuses {
@@ -614,19 +647,17 @@ impl StorageManager {
614647
// since technically it doesn't touch any SQL directly
615648
pub(crate) async fn get_all_active_gateways_statuses(
616649
&self,
650+
since: UnixTimestamp,
617651
) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
618-
let now = OffsetDateTime::now_utc();
619-
let day_ago = (now - ONE_DAY).unix_timestamp();
620-
621-
let active_nodes = self.get_all_active_gateways(day_ago).await?;
652+
let active_nodes = self.get_all_active_gateways(since).await?;
622653

623654
let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
624655
for active_node in active_nodes.into_iter() {
625656
let ipv4_statuses = self
626-
.get_gateway_ipv4_statuses_since_by_id(active_node.id, day_ago)
657+
.get_gateway_ipv4_statuses_since_by_id(active_node.id, since)
627658
.await?;
628659
let ipv6_statuses = self
629-
.get_gateway_ipv6_statuses_since_by_id(active_node.id, day_ago)
660+
.get_gateway_ipv6_statuses_since_by_id(active_node.id, since)
630661
.await?;
631662

632663
let statuses = ActiveNodeDayStatuses {

0 commit comments

Comments
 (0)