Skip to content

Commit 15ae0f5

Browse files
authored
Feature/more reliable uptime calculation (#747)
* New database table holding monitor run info * SQL interface for new table * Updated uptime calculation to instead rely on number of monitor test runs
1 parent 2923d4b commit 15ae0f5

File tree

6 files changed

+260
-64
lines changed

6 files changed

+260
-64
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- keeping track of all monitor runs that have happened will help to
2+
-- solve an issue of mixnode being online only for a single check and yet being assigned 100% uptime
3+
CREATE TABLE monitor_run
4+
(
5+
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
6+
timestamp INTEGER NOT NULL
7+
)

validator-api/src/network_monitor/monitor/mod.rs

+12
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,18 @@ impl Monitor {
7777
// TODO: slightly more graceful shutdown here
7878
process::exit(1);
7979
}
80+
81+
// indicate our run has completed successfully and should be used in any future
82+
// uptime calculations
83+
if let Err(err) = self.node_status_storage.insert_monitor_run().await {
84+
error!(
85+
"Failed to submit monitor run information to the database - {}",
86+
err
87+
);
88+
89+
// TODO: slightly more graceful shutdown here
90+
process::exit(1);
91+
}
8092
}
8193

8294
// checking it this way with a TestReport is rather suboptimal but given the fact we're only

validator-api/src/node_status_api/models.rs

+21-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use rocket::http::{ContentType, Status};
77
use rocket::response::{self, Responder, Response};
88
use rocket::Request;
99
use serde::{Deserialize, Serialize};
10+
use sqlx::types::time::OffsetDateTime;
1011
use std::convert::TryFrom;
1112
use std::fmt::{self, Display, Formatter};
1213
use std::io::Cursor;
@@ -90,13 +91,21 @@ pub struct MixnodeStatusReport {
9091

9192
impl MixnodeStatusReport {
9293
pub(crate) fn construct_from_last_day_reports(
94+
report_time: OffsetDateTime,
9395
identity: String,
9496
owner: String,
9597
last_day_ipv4: Vec<NodeStatus>,
9698
last_day_ipv6: Vec<NodeStatus>,
99+
last_hour_test_runs: usize,
100+
last_day_test_runs: usize,
97101
) -> Self {
98-
let node_uptimes =
99-
NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
102+
let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
103+
report_time,
104+
last_day_ipv4,
105+
last_day_ipv6,
106+
last_hour_test_runs,
107+
last_day_test_runs,
108+
);
100109

101110
MixnodeStatusReport {
102111
identity,
@@ -128,13 +137,21 @@ pub struct GatewayStatusReport {
128137

129138
impl GatewayStatusReport {
130139
pub(crate) fn construct_from_last_day_reports(
140+
report_time: OffsetDateTime,
131141
identity: String,
132142
owner: String,
133143
last_day_ipv4: Vec<NodeStatus>,
134144
last_day_ipv6: Vec<NodeStatus>,
145+
last_hour_test_runs: usize,
146+
last_day_test_runs: usize,
135147
) -> Self {
136-
let node_uptimes =
137-
NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
148+
let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
149+
report_time,
150+
last_day_ipv4,
151+
last_day_ipv6,
152+
last_hour_test_runs,
153+
last_day_test_runs,
154+
);
138155

139156
GatewayStatusReport {
140157
identity,

validator-api/src/node_status_api/utils.rs

+44-25
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
use crate::node_status_api::models::Uptime;
55
use crate::node_status_api::{FIFTEEN_MINUTES, ONE_HOUR};
66
use crate::storage::models::NodeStatus;
7+
use log::warn;
78
use sqlx::types::time::OffsetDateTime;
9+
use std::cmp::max;
810

911
// A temporary helper struct used to produce reports for active nodes.
1012
pub(crate) struct ActiveNodeDayStatuses {
@@ -30,33 +32,23 @@ pub(crate) struct NodeUptimes {
3032

3133
impl NodeUptimes {
3234
pub(crate) fn calculate_from_last_day_reports(
35+
report_time: OffsetDateTime,
3336
last_day_ipv4: Vec<NodeStatus>,
3437
last_day_ipv6: Vec<NodeStatus>,
38+
last_hour_test_runs: usize,
39+
last_day_test_runs: usize,
3540
) -> Self {
36-
let now = OffsetDateTime::now_utc();
37-
let hour_ago = (now - ONE_HOUR).unix_timestamp();
38-
let fifteen_minutes_ago = (now - FIFTEEN_MINUTES).unix_timestamp();
41+
let hour_ago = (report_time - ONE_HOUR).unix_timestamp();
42+
let fifteen_minutes_ago = (report_time - FIFTEEN_MINUTES).unix_timestamp();
3943

40-
let ipv4_day_total = last_day_ipv4.len();
41-
let ipv6_day_total = last_day_ipv6.len();
44+
let mut ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
45+
let mut ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();
4246

43-
let ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
44-
let ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();
45-
46-
let ipv4_hour_total = last_day_ipv4
47-
.iter()
48-
.filter(|report| report.timestamp >= hour_ago)
49-
.count();
50-
let ipv6_hour_total = last_day_ipv6
51-
.iter()
52-
.filter(|report| report.timestamp >= hour_ago)
53-
.count();
54-
55-
let ipv4_hour_up = last_day_ipv4
47+
let mut ipv4_hour_up = last_day_ipv4
5648
.iter()
5749
.filter(|report| report.up && report.timestamp >= hour_ago)
5850
.count();
59-
let ipv6_hour_up = last_day_ipv6
51+
let mut ipv6_hour_up = last_day_ipv6
6052
.iter()
6153
.filter(|report| report.up && report.timestamp >= hour_ago)
6254
.count();
@@ -73,15 +65,42 @@ impl NodeUptimes {
7365
.map(|status| status.timestamp >= fifteen_minutes_ago && status.up) // make sure it's within the last 15min
7466
.unwrap_or_default();
7567

76-
// the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results than all results in total
77-
// because both of those values originate from the same vector
68+
// If somehow we have more "up" reports than the actual test runs it means something weird is going on
69+
// (or we just started running this code on old data, so if it appears for the first 24h, it's fine and actually expected
70+
// as we would not have any run information from the past)
71+
// Either way, bound the number of "up" reports by the number of test runs and log warnings
72+
// if that happens
73+
if ipv4_hour_up > last_hour_test_runs || ipv6_hour_up > last_hour_test_runs {
74+
warn!(
75+
"We have more 'up' reports than the actual number of test runs in last hour! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
76+
ipv4_hour_up,
77+
ipv6_hour_up,
78+
last_hour_test_runs,
79+
);
80+
ipv4_hour_up = max(ipv4_hour_up, last_hour_test_runs);
81+
ipv6_hour_up = max(ipv6_hour_up, last_hour_test_runs);
82+
}
83+
84+
if ipv4_day_up > last_day_test_runs || ipv6_day_up > last_day_test_runs {
85+
warn!(
86+
"We have more 'up' reports than the actual number of test runs in last day! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
87+
ipv4_day_up,
88+
ipv6_day_up,
89+
last_day_test_runs,
90+
);
91+
ipv4_day_up = max(ipv4_day_up, last_day_test_runs);
92+
ipv6_day_up = max(ipv6_day_up, last_day_test_runs);
93+
}
94+
95+
// the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results
96+
// than total test runs as we just bounded them
7897
NodeUptimes {
7998
most_recent_ipv4,
8099
most_recent_ipv6,
81-
last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, ipv4_hour_total).unwrap(),
82-
last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, ipv6_hour_total).unwrap(),
83-
last_day_ipv4: Uptime::from_ratio(ipv4_day_up, ipv4_day_total).unwrap(),
84-
last_day_ipv6: Uptime::from_ratio(ipv6_day_up, ipv6_day_total).unwrap(),
100+
last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, last_hour_test_runs).unwrap(),
101+
last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, last_hour_test_runs).unwrap(),
102+
last_day_ipv4: Uptime::from_ratio(ipv4_day_up, last_day_test_runs).unwrap(),
103+
last_day_ipv6: Uptime::from_ratio(ipv6_day_up, last_day_test_runs).unwrap(),
85104
}
86105
}
87106
}

validator-api/src/storage/manager.rs

+45-14
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,8 @@
44
use crate::network_monitor::monitor::summary_producer::NodeResult;
55
use crate::node_status_api::models::{HistoricalUptime, Uptime};
66
use crate::node_status_api::utils::ActiveNodeDayStatuses;
7-
use crate::node_status_api::ONE_DAY;
87
use crate::storage::models::{ActiveNode, NodeStatus};
98
use crate::storage::UnixTimestamp;
10-
use sqlx::types::time::OffsetDateTime;
119
use std::convert::TryFrom;
1210

1311
#[derive(Clone)]
@@ -463,6 +461,43 @@ impl StorageManager {
463461
Ok(())
464462
}
465463

464+
/// Creates a database entry for a finished network monitor test run.
465+
///
466+
/// # Arguments
467+
///
468+
/// * `timestamp`: unix timestamp at which the monitor test run has occurred
469+
pub(crate) async fn insert_monitor_run(
470+
&self,
471+
timestamp: UnixTimestamp,
472+
) -> Result<(), sqlx::Error> {
473+
sqlx::query!("INSERT INTO monitor_run(timestamp) VALUES (?)", timestamp)
474+
.execute(&self.connection_pool)
475+
.await?;
476+
Ok(())
477+
}
478+
479+
/// Obtains number of network monitor test runs that have occurred within the specified interval.
480+
///
481+
/// # Arguments
482+
///
483+
/// * `since`: unix timestamp indicating the lower bound interval of the selection.
484+
/// * `until`: unix timestamp indicating the upper bound interval of the selection.
485+
pub(crate) async fn get_monitor_runs_count(
486+
&self,
487+
since: UnixTimestamp,
488+
until: UnixTimestamp,
489+
) -> Result<i32, sqlx::Error> {
490+
let count = sqlx::query!(
491+
"SELECT COUNT(*) as count FROM monitor_run WHERE timestamp > ? AND timestamp < ?",
492+
since,
493+
until,
494+
)
495+
.fetch_one(&self.connection_pool)
496+
.await?
497+
.count;
498+
Ok(count)
499+
}
500+
466501
pub(crate) async fn purge_old_mixnode_ipv4_statuses(
467502
&self,
468503
timestamp: UnixTimestamp,
@@ -579,19 +614,17 @@ impl StorageManager {
579614
// since technically it doesn't touch any SQL directly
580615
pub(crate) async fn get_all_active_mixnodes_statuses(
581616
&self,
617+
since: UnixTimestamp,
582618
) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
583-
let now = OffsetDateTime::now_utc();
584-
let day_ago = (now - ONE_DAY).unix_timestamp();
585-
586-
let active_nodes = self.get_all_active_mixnodes(day_ago).await?;
619+
let active_nodes = self.get_all_active_mixnodes(since).await?;
587620

588621
let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
589622
for active_node in active_nodes.into_iter() {
590623
let ipv4_statuses = self
591-
.get_mixnode_ipv4_statuses_since_by_id(active_node.id, day_ago)
624+
.get_mixnode_ipv4_statuses_since_by_id(active_node.id, since)
592625
.await?;
593626
let ipv6_statuses = self
594-
.get_mixnode_ipv6_statuses_since_by_id(active_node.id, day_ago)
627+
.get_mixnode_ipv6_statuses_since_by_id(active_node.id, since)
595628
.await?;
596629

597630
let statuses = ActiveNodeDayStatuses {
@@ -614,19 +647,17 @@ impl StorageManager {
614647
// since technically it doesn't touch any SQL directly
615648
pub(crate) async fn get_all_active_gateways_statuses(
616649
&self,
650+
since: UnixTimestamp,
617651
) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
618-
let now = OffsetDateTime::now_utc();
619-
let day_ago = (now - ONE_DAY).unix_timestamp();
620-
621-
let active_nodes = self.get_all_active_gateways(day_ago).await?;
652+
let active_nodes = self.get_all_active_gateways(since).await?;
622653

623654
let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
624655
for active_node in active_nodes.into_iter() {
625656
let ipv4_statuses = self
626-
.get_gateway_ipv4_statuses_since_by_id(active_node.id, day_ago)
657+
.get_gateway_ipv4_statuses_since_by_id(active_node.id, since)
627658
.await?;
628659
let ipv6_statuses = self
629-
.get_gateway_ipv6_statuses_since_by_id(active_node.id, day_ago)
660+
.get_gateway_ipv6_statuses_since_by_id(active_node.id, since)
630661
.await?;
631662

632663
let statuses = ActiveNodeDayStatuses {

0 commit comments

Comments
 (0)