Skip to content

Commit ddd55eb

Browse files
eliothedemanfacebook-github-bot
authored andcommitted
Improve trace collection for python actors (pytorch-labs#242)
Summary: Pull Request resolved: pytorch-labs#242 stacktraces are now correctly collected from python exceptions. Reviewed By: kaiyuan-li Differential Revision: D76468993
1 parent 9ac7618 commit ddd55eb

File tree

6 files changed

+77
-34
lines changed

6 files changed

+77
-34
lines changed

hyperactor/src/proc.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -876,7 +876,6 @@ impl<A: Actor> Instance<A> {
876876
Ok(actor_handle)
877877
}
878878

879-
#[crate::instrument_infallible(fields(actor_id=tracing::field::display(self.self_id())))]
880879
async fn serve(mut self, mut actor: A) {
881880
let result = self.run_actor_tree(&mut actor).await;
882881

hyperactor_extension/src/telemetry.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
use std::cell::Cell;
1212

1313
use pyo3::prelude::*;
14+
use pyo3::types::PyTraceback;
1415
use tracing::span::EnteredSpan;
1516
// Thread local to store the current span
1617
thread_local! {
@@ -45,15 +46,47 @@ pub fn exit_span() -> PyResult<()> {
4546

4647
/// Log a message with the given metaata
4748
#[pyfunction]
48-
pub fn forward_to_tracing(message: &str, file: &str, lineno: i64, level: i32) {
49+
pub fn forward_to_tracing(py: Python, record: PyObject) -> PyResult<()> {
50+
let message = record.call_method0(py, "getMessage")?;
51+
let message: &str = message.extract(py)?;
52+
let lineno: i64 = record.getattr(py, "lineno")?.extract(py)?;
53+
let file = record.getattr(py, "filename")?;
54+
let file: &str = file.extract(py)?;
55+
let level: i32 = record.getattr(py, "levelno")?.extract(py)?;
56+
4957
// Map level number to level name
5058
match level {
51-
40 => tracing::error!(file = file, lineno = lineno, message),
59+
40 | 50 => {
60+
let exc = record.getattr(py, "exc_info").ok();
61+
let traceback = exc
62+
.and_then(|exc| {
63+
if exc.is_none(py) {
64+
return None;
65+
}
66+
exc.extract::<(PyObject, PyObject, Bound<'_, PyTraceback>)>(py)
67+
.ok()
68+
})
69+
.map(|(_, _, tb)| tb.format().unwrap_or_default());
70+
match traceback {
71+
Some(traceback) => {
72+
tracing::error!(
73+
file = file,
74+
lineno = lineno,
75+
stacktrace = traceback,
76+
message
77+
);
78+
}
79+
None => {
80+
tracing::error!(file = file, lineno = lineno, message);
81+
}
82+
}
83+
}
5284
30 => tracing::warn!(file = file, lineno = lineno, message),
5385
20 => tracing::info!(file = file, lineno = lineno, message),
5486
10 => tracing::debug!(file = file, lineno = lineno, message),
5587
_ => tracing::info!(file = file, lineno = lineno, message),
5688
}
89+
Ok(())
5790
}
5891

5992
pub fn register_python_bindings(module: &Bound<'_, PyModule>) -> PyResult<()> {

python/monarch/_rust_bindings/hyperactor_extension/telemetry.pyi

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,27 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
def forward_to_tracing(message: str, file: str, lineno: int, level: int) -> None:
7+
import logging
8+
9+
def forward_to_tracing(record: logging.LogRecord) -> None:
810
"""
911
Log a message with the given metadata using the tracing system.
1012
1113
This function forwards Python log messages to the Rust tracing system,
1214
preserving the original source location and log level.
1315
1416
Args:
15-
- message (str): The log message content.
16-
- file (str): The file where the log message originated.
17-
- lineno (int): The line number where the log message originated.
18-
- level (int): The log level:
19-
- 10: DEBUG
20-
- 20: INFO
21-
- 30: WARN
22-
- 40: ERROR
23-
- other values default to INFO
17+
- record (logging.LogRecord): The log record containing message, file, lineno, and level information.
18+
The function extracts:
19+
- message: The log message content via record.getMessage()
20+
- file: The filename via record.filename
21+
- lineno: The line number via record.lineno
22+
- level: The log level via record.levelno:
23+
- 10: DEBUG
24+
- 20: INFO
25+
- 30: WARN
26+
- 40: ERROR
27+
- other values default to INFO
2428
"""
2529
...
2630

python/monarch/actor_mesh.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,14 @@ async def instrumented():
525525
enter_span(
526526
the_method.__module__, message.method, str(ctx.mailbox.actor_id)
527527
)
528-
result = await the_method(self.instance, *args, **kwargs)
528+
try:
529+
result = await the_method(self.instance, *args, **kwargs)
530+
except Exception as e:
531+
logging.critical(
532+
"Unahndled exception in actor endpoint",
533+
exc_info=e,
534+
)
535+
raise e
529536
exit_span()
530537
return result
531538

python/monarch/bootstrap_main.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,28 +30,9 @@ def invoke_main():
3030
# behavior of std out as if it were a terminal.
3131
sys.stdout.reconfigure(line_buffering=True)
3232
global bootstrap_main
33-
from monarch._rust_bindings.hyperactor_extension.telemetry import ( # @manual=//monarch/monarch_extension:monarch_extension # @manual=//monarch/monarch_extension:monarch_extension
34-
forward_to_tracing,
35-
)
3633

3734
# TODO: figure out what from worker_main.py we should reproduce here.
38-
39-
class TracingForwarder(logging.Handler):
40-
def emit(self, record: logging.LogRecord) -> None:
41-
try:
42-
forward_to_tracing(
43-
record.getMessage(),
44-
record.filename or "",
45-
record.lineno or 0,
46-
record.levelno,
47-
)
48-
except AttributeError:
49-
forward_to_tracing(
50-
record.__str__(),
51-
record.filename or "",
52-
record.lineno or 0,
53-
record.levelno,
54-
)
35+
from monarch.telemetry import TracingForwarder
5536

5637
if os.environ.get("MONARCH_ERROR_DURING_BOOTSTRAP_FOR_TESTING") == "1":
5738
raise RuntimeError("Error during bootstrap for testing")

python/monarch/telemetry.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-strict
8+
9+
10+
import logging
11+
12+
from monarch._rust_bindings.hyperactor_extension.telemetry import ( # @manual=//monarch/monarch_extension:monarch_extension
13+
forward_to_tracing,
14+
)
15+
16+
17+
class TracingForwarder(logging.Handler):
18+
def emit(self, record: logging.LogRecord) -> None:
19+
forward_to_tracing(record)

0 commit comments

Comments
 (0)