Skip to content

Commit a2092ba

Browse files
: stdio redirection (#900)
Summary: rust startup code (code that runs before `main`) changes the disposition for `SIGPIPE` such that it is silently ignored (that is, runs `signal(Signal::SIGPIPE, SigHandler::SigIgn)` or equivalent). this behavior introduced in 2014, is poorly documented but see rust-lang/rust#62569. a task spawned in `hyperactor::signal_handler::GlobalSignalManager::new` creates an async signal listener using `signal-hook-tokio` crate. it watches for `SIGINT` and `SIGTERM` and on receiving one, executes cleanup code before removing the hooks and re-raising the signals in order to restore and execute the default behaviors (process termination). that signal handling code includes logging calls via `tracing::info!()` and `tracing::error!()`. the problem is, if `SIGTERM` (say) is being handled by an orphan, the earlier death of the parent can mean the orphan's stdout/stderr pipes are closed. normally, writing to a closed pipe would result in signalling `SIGPIPE` and process termination but here a logging call results in an infinite uninterruptible sleep, hanging the process preventing it from shutting down. this diff adds a call to a newly developed function `stdio_redirect::handle_broken_pipes()` which detects this condition and redirects stdio to a file (named derived from the process ID - e.g. `monarch-process-exit-3529266.log`) as needed allowing the process to terminate and write logs normally as it does so. Differential Revision: D80366985
1 parent 955819c commit a2092ba

File tree

4 files changed

+162
-0
lines changed

4 files changed

+162
-0
lines changed

hyperactor/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ unicode-ident = "1.0.12"
6464

6565
[dev-dependencies]
6666
maplit = "1.0"
67+
tempfile = "3.15"
6768
timed_test = { version = "0.0.0", path = "../timed_test" }
6869
tracing-subscriber = { version = "0.3.19", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] }
6970
tracing-test = { version = "0.2.3", features = ["no-env-filter"] }

hyperactor/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ pub mod proc;
8686
pub mod reference;
8787
mod signal_handler;
8888
pub mod simnet;
89+
mod stdio_redirect;
8990
pub mod supervision;
9091
pub mod sync;
9192
/// Test utilities

hyperactor/src/signal_handler.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ impl GlobalSignalManager {
3232
signal_hook_tokio::Signals::new([signal::SIGINT as i32, signal::SIGTERM as i32])
3333
{
3434
if let Some(signal) = signals.next().await {
35+
// If parent died, stdout/stderr are broken pipes
36+
// that cause uninterruptible sleep on write.
37+
// Detect and redirect to file to prevent hanging.
38+
crate::stdio_redirect::handle_broken_pipes();
39+
3540
tracing::info!("received signal: {}", signal);
3641

3742
get_signal_manager().execute_all_cleanups().await;

hyperactor/src/stdio_redirect.rs

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
use std::fs::OpenOptions;
10+
use std::os::fd::BorrowedFd;
11+
use std::os::unix::io::AsRawFd;
12+
13+
use anyhow::Context;
14+
use nix::errno::Errno;
15+
use nix::libc::STDERR_FILENO;
16+
use nix::libc::STDOUT_FILENO;
17+
use nix::unistd::write;
18+
19+
/// Checks if stdout is broken (e.g., due to parent process death).
20+
///
21+
/// Attempts a minimal write to stdout to detect if the pipe is
22+
/// broken. Returns true if stdout is unavailable (broken pipe or bad
23+
/// file descriptor).
24+
pub(crate) fn is_stdout_broken() -> bool {
25+
// SAFETY: `STDOUT_FILENO` (`1`) is a valid file descriptor by
26+
// definition. `BorrowedFd::borrow_raw` is safe here because we're
27+
// only using it for the duration of this function call and not
28+
// storing it.
29+
let fd = unsafe { BorrowedFd::borrow_raw(STDOUT_FILENO) };
30+
matches!(write(fd, b"\0"), Err(Errno::EPIPE | Errno::EBADF))
31+
}
32+
33+
/// Redirects stdout and stderr to the specified file.
34+
///
35+
/// The file is opened in append mode and created if it doesn't exist.
36+
/// This permanently modifies the process's stdio streams.
37+
pub(crate) fn redirect_stdio_to_file(path: &str) -> anyhow::Result<()> {
38+
let file = OpenOptions::new()
39+
.create(true)
40+
.append(true)
41+
.open(path)
42+
.with_context(|| format!("failed to open log file: {}", path))?;
43+
let raw_fd = file.as_raw_fd();
44+
// SAFETY: `raw_fd` is a valid file descriptor obtained from
45+
// `as_raw_fd()` on an open file. `STDOUT_FILENO` (`1`) and
46+
// `STDERR_FILENO` (`2`) are always valid file descriptor numbers.
47+
// `dup2` is safe to call with these valid file descriptors.
48+
unsafe {
49+
if nix::libc::dup2(raw_fd, STDOUT_FILENO) == -1 {
50+
anyhow::bail!(
51+
"failed to redirect stdout: {}",
52+
std::io::Error::last_os_error()
53+
);
54+
}
55+
if nix::libc::dup2(raw_fd, STDERR_FILENO) == -1 {
56+
anyhow::bail!(
57+
"failed to redirect stderr: {}",
58+
std::io::Error::last_os_error()
59+
);
60+
}
61+
}
62+
std::mem::forget(file);
63+
Ok(())
64+
}
65+
66+
/// Redirects stdout and stderr to a user-specific log file in /tmp.
67+
///
68+
/// Creates a log file at `/tmp/{user}/monarch-process-exit-{pid}.log`
69+
/// and redirects stdio to it. The user directory is created if it
70+
/// doesn't exist.
71+
pub(crate) fn redirect_stdio_to_user_pid_file() -> anyhow::Result<()> {
72+
let user = std::env::var("USER").unwrap_or_else(|_| "unknown".to_string());
73+
let pid = std::process::id();
74+
let log_dir = format!("/tmp/{}", user);
75+
std::fs::create_dir_all(&log_dir)?;
76+
let path = format!("{}/monarch-process-exit-{}.log", log_dir, pid);
77+
redirect_stdio_to_file(&path)?;
78+
Ok(())
79+
}
80+
81+
/// Redirects stdio to a log file if stdout is broken.
82+
pub(crate) fn handle_broken_pipes() {
83+
if is_stdout_broken() {
84+
if redirect_stdio_to_user_pid_file().is_ok() {
85+
tracing::info!(
86+
"stdio for {} redirected due to broken pipe",
87+
std::process::id()
88+
);
89+
}
90+
}
91+
}
92+
93+
#[cfg(test)]
94+
mod tests {
95+
use nix::libc::STDERR_FILENO;
96+
use nix::libc::STDOUT_FILENO;
97+
use tempfile::TempDir;
98+
99+
use super::*;
100+
101+
struct StdioGuard {
102+
saved_stdout: i32,
103+
saved_stderr: i32,
104+
}
105+
106+
impl StdioGuard {
107+
fn new() -> Self {
108+
// SAFETY: `STDOUT_FILENO` (`1`) and `STDERR_FILENO` (`2`)
109+
// are always valid file descriptor numbers. `dup()` is
110+
// safe to call on these standard descriptors and will
111+
// return new file descriptors pointing to the same files.
112+
unsafe {
113+
let saved_stdout = nix::libc::dup(STDOUT_FILENO);
114+
let saved_stderr = nix::libc::dup(STDERR_FILENO);
115+
Self {
116+
saved_stdout,
117+
saved_stderr,
118+
}
119+
}
120+
}
121+
}
122+
123+
impl Drop for StdioGuard {
124+
fn drop(&mut self) {
125+
// SAFETY: `saved_stdout` and `saved_stderr` are valid
126+
// file descriptors returned by `dup()` in `new()`.
127+
// `STDOUT_FILENO` and `STDERR_FILENO` are always valid
128+
// target descriptors. `dup2()` and `close()` are safe to
129+
// call with these valid fds.
130+
unsafe {
131+
nix::libc::dup2(self.saved_stdout, STDOUT_FILENO);
132+
nix::libc::dup2(self.saved_stderr, STDERR_FILENO);
133+
nix::libc::close(self.saved_stdout);
134+
nix::libc::close(self.saved_stderr);
135+
}
136+
}
137+
}
138+
139+
#[test]
140+
fn test_is_stdout_broken_with_working_stdout() {
141+
assert!(!is_stdout_broken());
142+
}
143+
144+
#[test]
145+
fn test_redirect_stdio_to_file_creates_file() {
146+
let _guard = StdioGuard::new();
147+
148+
let temp_dir = TempDir::new().unwrap();
149+
let log_path = temp_dir.path().join("test.log");
150+
let path_str = log_path.to_str().unwrap();
151+
152+
assert!(redirect_stdio_to_file(path_str).is_ok());
153+
assert!(log_path.exists());
154+
}
155+
}

0 commit comments

Comments
 (0)