|
7 | 7 | */
|
8 | 8 |
|
9 | 9 | use std::collections::HashMap;
|
| 10 | +use std::str::FromStr; |
10 | 11 | use std::sync::Arc;
|
| 12 | +use std::time::Duration; |
11 | 13 |
|
| 14 | +use anyhow::anyhow; |
| 15 | +use async_trait::async_trait; |
| 16 | +use hyperactor::WorldId; |
| 17 | +use hyperactor::channel::ChannelAddr; |
| 18 | +use hyperactor::channel::ChannelTransport; |
12 | 19 | use hyperactor_extension::alloc::PyAlloc;
|
13 | 20 | use hyperactor_extension::alloc::PyAllocSpec;
|
| 21 | +use hyperactor_mesh::alloc::AllocSpec; |
14 | 22 | use hyperactor_mesh::alloc::Allocator;
|
| 23 | +use hyperactor_mesh::alloc::AllocatorError; |
15 | 24 | use hyperactor_mesh::alloc::LocalAllocator;
|
16 | 25 | use hyperactor_mesh::alloc::ProcessAllocator;
|
| 26 | +use hyperactor_mesh::alloc::remoteprocess::RemoteProcessAlloc; |
| 27 | +use hyperactor_mesh::alloc::remoteprocess::RemoteProcessAllocHost; |
| 28 | +use hyperactor_mesh::alloc::remoteprocess::RemoteProcessAllocInitializer; |
17 | 29 | use pyo3::exceptions::PyRuntimeError;
|
18 | 30 | use pyo3::prelude::*;
|
19 | 31 | use tokio::process::Command;
|
20 | 32 |
|
| 33 | +use crate::channel::PyChannelAddr; |
21 | 34 | use crate::runtime::signal_safe_block_on;
|
22 | 35 |
|
23 | 36 | #[pyclass(
|
@@ -48,7 +61,7 @@ impl PyLocalAllocator {
|
48 | 61 | .allocate(spec)
|
49 | 62 | .await
|
50 | 63 | .map(|inner| PyAlloc::new(Box::new(inner)))
|
51 |
| - .map_err(|e| PyRuntimeError::new_err(format!("{:?}", e))) |
| 64 | + .map_err(|e| PyRuntimeError::new_err(format!("{}", e))) |
52 | 65 | })
|
53 | 66 | }
|
54 | 67 |
|
@@ -132,9 +145,174 @@ impl PyProcessAllocator {
|
132 | 145 | }
|
133 | 146 | }
|
134 | 147 |
|
| 148 | +/// A `[hyperactor_mesh::alloc::RemoteProcessAllocInitializer]` wrapper to enable subclassing from Python. |
| 149 | +/// |
| 150 | +/// Basically follows https://pyo3.rs/v0.25.0/trait-bounds.html. |
| 151 | +/// The Python subclass should implement `def initialize_alloc(self) -> list[str]`. |
| 152 | +pub struct PyRemoteProcessAllocInitializer { |
| 153 | + // instance of a Python subclass of `monarch._rust_bindings.monarch_hyperactor.alloc.RemoteProcessAllocInitializer`. |
| 154 | + py_inner: Py<PyAny>, |
| 155 | +} |
| 156 | + |
| 157 | +impl Clone for PyRemoteProcessAllocInitializer { |
| 158 | + fn clone(&self) -> Self { |
| 159 | + Self { |
| 160 | + py_inner: Python::with_gil(|py| Py::clone_ref(&self.py_inner, py)), |
| 161 | + } |
| 162 | + } |
| 163 | +} |
| 164 | +impl PyRemoteProcessAllocInitializer { |
| 165 | + /// calls the initializer's `initialize_alloc()` as implemented in python |
| 166 | + /// |
| 167 | + /// NOTE: changes to python method calls must be made in sync with |
| 168 | + /// the method signature of `RemoteAllocInitializer` in |
| 169 | + /// `monarch/python/monarch/_rust_bindings/monarch_hyperactor/alloc.pyi` |
| 170 | + async fn py_initialize_alloc(&self) -> PyResult<Vec<String>> { |
| 171 | + // call the function as implemented in python |
| 172 | + let future = Python::with_gil(|py| -> PyResult<_> { |
| 173 | + let coroutine = self.py_inner.bind(py).call_method0("initialize_alloc")?; |
| 174 | + pyo3_async_runtimes::tokio::into_future(coroutine) |
| 175 | + })?; |
| 176 | + |
| 177 | + let addrs = future.await?; |
| 178 | + Python::with_gil(|py| -> PyResult<Vec<String>> { addrs.extract(py) }) |
| 179 | + } |
| 180 | + |
| 181 | + async fn get_transport_and_port(&self) -> PyResult<(ChannelTransport, u16)> { |
| 182 | + // NOTE: the upstream RemoteAllocator APIs take (transport, port, hostnames) |
| 183 | + // (e.g. assumes the same transport and port for all servers). |
| 184 | + // Until that is fixed we have to assume the same here. |
| 185 | + // Get the transport and port from the first address |
| 186 | + // TODO T227130269 |
| 187 | + let addrs = self.py_initialize_alloc().await?; |
| 188 | + let addr = addrs |
| 189 | + .first() |
| 190 | + .ok_or_else(|| anyhow!("initializer must return non-empty list of addresses"))?; |
| 191 | + let channel_addr = PyChannelAddr::parse(addr)?; |
| 192 | + let port = channel_addr.get_port()?; |
| 193 | + let transport = channel_addr.get_transport()?; |
| 194 | + Ok((transport.into(), port)) |
| 195 | + } |
| 196 | +} |
| 197 | + |
| 198 | +#[async_trait] |
| 199 | +impl RemoteProcessAllocInitializer for PyRemoteProcessAllocInitializer { |
| 200 | + async fn initialize_alloc(&mut self) -> Result<Vec<RemoteProcessAllocHost>, anyhow::Error> { |
| 201 | + // call the function as implemented in python |
| 202 | + let addrs = self.py_initialize_alloc().await?; |
| 203 | + addrs |
| 204 | + .iter() |
| 205 | + .map(|channel_addr| { |
| 206 | + let addr = ChannelAddr::from_str(channel_addr)?; |
| 207 | + let (id, hostname) = match addr { |
| 208 | + ChannelAddr::Tcp(socket) => (socket.ip().to_string(), socket.ip().to_string()), |
| 209 | + ChannelAddr::MetaTls(hostname, _) => (hostname.clone(), hostname.clone()), |
| 210 | + ChannelAddr::Unix(_) => (addr.to_string(), addr.to_string()), |
| 211 | + _ => anyhow::bail!("unsupported transport for channel address: `{addr}`"), |
| 212 | + }; |
| 213 | + Ok(RemoteProcessAllocHost { id, hostname }) |
| 214 | + }) |
| 215 | + .collect() |
| 216 | + } |
| 217 | +} |
| 218 | + |
| 219 | +#[pyclass( |
| 220 | + name = "RemoteAllocatorBase", |
| 221 | + module = "monarch._rust_bindings.monarch_hyperactor.alloc", |
| 222 | + subclass |
| 223 | +)] |
| 224 | +#[derive(Clone)] |
| 225 | +pub struct PyRemoteAllocator { |
| 226 | + // IMPORTANT: other than the `initializer` this struct should not hold any non-trivially |
| 227 | + // clonable data (e.g. such that the Clone derive-attribute would not work). |
| 228 | + // This allows us to avoid having yet-another-wrapper for PyRemoteAllocator since |
| 229 | + // PyRemoteProcessAllocInitializer is already a wrapper and its wrapped Py<PyAny> is |
| 230 | + // shared by reference. |
| 231 | + world_id: String, |
| 232 | + initializer: PyRemoteProcessAllocInitializer, |
| 233 | + heartbeat_interval: Duration, |
| 234 | +} |
| 235 | + |
| 236 | +#[async_trait] |
| 237 | +impl Allocator for PyRemoteAllocator { |
| 238 | + type Alloc = RemoteProcessAlloc; |
| 239 | + |
| 240 | + async fn allocate(&mut self, spec: AllocSpec) -> Result<Self::Alloc, AllocatorError> { |
| 241 | + let initializer = self.initializer.clone(); |
| 242 | + let (transport, port) = initializer |
| 243 | + .get_transport_and_port() |
| 244 | + .await |
| 245 | + .map_err(|e| AllocatorError::Other(e.into()))?; |
| 246 | + |
| 247 | + let alloc = RemoteProcessAlloc::new( |
| 248 | + spec, |
| 249 | + WorldId(self.world_id.clone()), |
| 250 | + transport, |
| 251 | + port, |
| 252 | + self.heartbeat_interval, |
| 253 | + initializer, |
| 254 | + ) |
| 255 | + .await?; |
| 256 | + Ok(alloc) |
| 257 | + } |
| 258 | +} |
| 259 | + |
| 260 | +#[pymethods] |
| 261 | +impl PyRemoteAllocator { |
| 262 | + #[new] |
| 263 | + #[pyo3(signature = ( |
| 264 | + world_id, |
| 265 | + initializer, |
| 266 | + heartbeat_interval = Duration::from_secs(5), |
| 267 | + ))] |
| 268 | + fn new( |
| 269 | + world_id: String, |
| 270 | + initializer: Py<PyAny>, |
| 271 | + heartbeat_interval: Duration, |
| 272 | + ) -> PyResult<Self> { |
| 273 | + Ok(Self { |
| 274 | + world_id, |
| 275 | + initializer: PyRemoteProcessAllocInitializer { |
| 276 | + py_inner: initializer, |
| 277 | + }, |
| 278 | + heartbeat_interval, |
| 279 | + }) |
| 280 | + } |
| 281 | + |
| 282 | + fn allocate_nonblocking<'py>( |
| 283 | + &self, |
| 284 | + py: Python<'py>, |
| 285 | + spec: &PyAllocSpec, |
| 286 | + ) -> PyResult<Bound<'py, PyAny>> { |
| 287 | + let spec = spec.inner.clone(); |
| 288 | + let mut cloned = self.clone(); |
| 289 | + |
| 290 | + pyo3_async_runtimes::tokio::future_into_py(py, async move { |
| 291 | + cloned |
| 292 | + .allocate(spec) |
| 293 | + .await |
| 294 | + .map(|alloc| PyAlloc::new(Box::new(alloc))) |
| 295 | + .map_err(|e| PyRuntimeError::new_err(format!("{}", e))) |
| 296 | + }) |
| 297 | + } |
| 298 | + fn allocate_blocking<'py>(&self, py: Python<'py>, spec: &PyAllocSpec) -> PyResult<PyAlloc> { |
| 299 | + let spec = spec.inner.clone(); |
| 300 | + let mut cloned = self.clone(); |
| 301 | + |
| 302 | + signal_safe_block_on(py, async move { |
| 303 | + cloned |
| 304 | + .allocate(spec) |
| 305 | + .await |
| 306 | + .map(|alloc| PyAlloc::new(Box::new(alloc))) |
| 307 | + .map_err(|e| PyRuntimeError::new_err(format!("{:?}", e))) |
| 308 | + })? |
| 309 | + } |
| 310 | +} |
| 311 | + |
135 | 312 | pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> {
|
136 | 313 | hyperactor_mod.add_class::<PyProcessAllocator>()?;
|
137 | 314 | hyperactor_mod.add_class::<PyLocalAllocator>()?;
|
| 315 | + hyperactor_mod.add_class::<PyRemoteAllocator>()?; |
138 | 316 |
|
139 | 317 | Ok(())
|
140 | 318 | }
|
0 commit comments