Skip to content

Commit ae39a0d

Browse files
codetheweb authored and philipkiely-baseten committed
[ENH]: add metrics for garbage collection (chroma-core#4173)
## Description of changes

Adds 4 metrics to the garbage collection service as a starting point:

- total jobs, segmented by success/failure
- job duration in milliseconds (histogram)
- total # of files deleted
- total # of versions deleted

## Test plan

*How are these changes tested?*

Ran garbage collection and validated that the produced values in Grafana looked reasonable.

## Documentation Changes

*Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs repository](https://github.com/chroma-core/docs)?*

n/a
1 parent 3be0260 commit ae39a0d

File tree

3 files changed

+57
-1
lines changed

3 files changed

+57
-1
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rust/garbage_collector/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ tempfile = { workspace = true }
3131
tracing = { workspace = true }
3232
thiserror = { workspace = true }
3333
humantime = { workspace = true }
34+
opentelemetry = { workspace = true }
3435

3536
chroma-config = { workspace = true }
3637
chroma-error = { workspace = true }

rust/garbage_collector/src/garbage_collector_component.rs

+55-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use chroma_system::{
1616
use chroma_types::CollectionUuid;
1717
use chrono::{DateTime, Utc};
1818
use futures::{stream::FuturesUnordered, StreamExt};
19+
use opentelemetry::metrics::{Counter, Histogram};
1920
use std::{
2021
collections::{HashMap, HashSet},
2122
fmt::{Debug, Formatter},
@@ -38,6 +39,10 @@ pub(crate) struct GarbageCollector {
3839
system: Option<chroma_system::System>,
3940
default_cleanup_mode: CleanupMode,
4041
tenant_mode_overrides: Option<HashMap<String, CleanupMode>>,
42+
total_jobs_metric: Counter<u64>,
43+
job_duration_ms_metric: Histogram<u64>,
44+
total_files_deleted_metric: Counter<u64>,
45+
total_versions_deleted_metric: Counter<u64>,
4146
}
4247

4348
impl Debug for GarbageCollector {
@@ -66,6 +71,8 @@ impl GarbageCollector {
6671
default_cleanup_mode: CleanupMode,
6772
tenant_mode_overrides: Option<HashMap<String, CleanupMode>>,
6873
) -> Self {
74+
let meter = opentelemetry::global::meter("chroma");
75+
6976
Self {
7077
gc_interval_mins,
7178
relative_cutoff_time,
@@ -77,6 +84,23 @@ impl GarbageCollector {
7784
system: None,
7885
default_cleanup_mode,
7986
tenant_mode_overrides,
87+
total_jobs_metric: meter
88+
.u64_counter("garbage_collector.total_jobs")
89+
.with_description("Total number of garbage collection jobs executed")
90+
.build(),
91+
job_duration_ms_metric: meter
92+
.u64_histogram("garbage_collector.job_duration_ms")
93+
.with_description("Duration of garbage collection jobs in milliseconds")
94+
.with_unit("ms")
95+
.build(),
96+
total_files_deleted_metric: meter
97+
.u64_counter("garbage_collector.total_files_deleted")
98+
.with_description("Total number of files deleted during garbage collection")
99+
.build(),
100+
total_versions_deleted_metric: meter
101+
.u64_counter("garbage_collector.total_versions_deleted")
102+
.with_description("Total number of versions deleted during garbage collection")
103+
.build(),
80104
}
81105
}
82106

@@ -107,7 +131,28 @@ impl GarbageCollector {
107131
);
108132

109133
if let Some(system) = self.system.as_ref() {
110-
return Ok(orchestrator.run(system.clone()).await?);
134+
let started_at = SystemTime::now();
135+
let result = orchestrator.run(system.clone()).await?;
136+
let duration_ms = started_at
137+
.elapsed()
138+
.map(|d| d.as_millis() as u64)
139+
.unwrap_or(0);
140+
self.job_duration_ms_metric.record(duration_ms, &[]);
141+
self.total_files_deleted_metric.add(
142+
result.deletion_list.len() as u64,
143+
&[opentelemetry::KeyValue::new(
144+
"cleanup_mode",
145+
format!("{:?}", cleanup_mode),
146+
)],
147+
);
148+
self.total_versions_deleted_metric.add(
149+
result.num_versions_deleted as u64,
150+
&[opentelemetry::KeyValue::new(
151+
"cleanup_mode",
152+
format!("{:?}", cleanup_mode),
153+
)],
154+
);
155+
return Ok(result);
111156
}
112157
}
113158

@@ -221,6 +266,15 @@ impl Handler<GarbageCollectMessage> for GarbageCollector {
221266
num_failed_jobs
222267
);
223268

269+
self.total_jobs_metric.add(
270+
num_completed_jobs as u64,
271+
&[opentelemetry::KeyValue::new("status", "success")],
272+
);
273+
self.total_jobs_metric.add(
274+
num_failed_jobs as u64,
275+
&[opentelemetry::KeyValue::new("status", "failure")],
276+
);
277+
224278
// Schedule next run
225279
ctx.scheduler.schedule(
226280
GarbageCollectMessage {},

0 commit comments

Comments
 (0)