3
3
import logging
4
4
import os
5
5
from collections .abc import Iterable
6
- from typing import Dict , List , Optional
6
+ from typing import Dict , List , Optional , Type
7
7
8
8
import coolname
9
9
import pandas as pd
10
10
import polars as pl
11
+ import pyarrow .fs as fs
12
+ import pyarrow .parquet as pq
11
13
import ray
12
14
from packaging .version import Version
13
15
from ray .serve import shutdown
@@ -88,8 +90,9 @@ def __init__(
88
90
self ,
89
91
name : str ,
90
92
namespace : str ,
91
- path : Optional [str ] = None ,
92
93
enable_perspective_dashboard : bool = False ,
94
+ filesystem : Type [fs .FileSystem ] = fs .LocalFileSystem ,
95
+ filesystem_kwargs : Optional [dict ] = None ,
93
96
):
94
97
"""An async Ray Actor Class to track task level metadata.
95
98
@@ -114,13 +117,13 @@ def __init__(
114
117
lifetime = "detached" ,
115
118
get_if_exists = True ,
116
119
).remote (name , namespace )
117
- self .path = path
118
120
self .df = None
119
121
self .finished_tasks = {}
120
122
self .user_defined_metadata = {}
121
123
self .perspective_dashboard_enabled = enable_perspective_dashboard
122
124
self .pending_tasks = []
123
125
self .perspective_table_name = f"{ name } _data"
126
+ self .filesystem = filesystem (** (filesystem_kwargs or dict ()))
124
127
125
128
# WARNING: Do not move this import. Importing these modules elsewhere can cause
126
129
# difficult to diagnose, "There is no current event loop in thread 'ray_client_server_" errors.
@@ -306,14 +309,10 @@ def get_proxy_server(self) -> ray.serve.handle.DeploymentHandle:
306
309
return self .proxy_server
307
310
raise Exception ("This task_tracker has no active proxy_server." )
308
311
309
- def save_df (self ) -> None :
310
- """Saves the internally maintained dataframe of task related information from the ray GCS"""
311
- self .get_df ()
312
- if self .path is not None and self .df is not None :
313
- logger .info (f"Writing DataFrame to { self .path } " )
314
- self .df .write_parquet (self .path )
315
- return True
316
- return False
312
+ def save_df (self , path : str ) -> None :
313
+ """Saves the internally maintained dataframe of task related information from the ray GCS to a provided path, using the filesystem attribute"""
314
+ logger .info (f"Writing DataFrame to { path } " )
315
+ pq .write_table (self .get_df ().to_arrow (), path , filesystem = self .filesystem )
317
316
318
317
def clear_df (self ) -> None :
319
318
"""Clears the internally maintained dataframe of task related information from the ray GCS"""
@@ -363,9 +362,9 @@ def get_df(self, process_user_metadata_column=False) -> pl.DataFrame:
363
362
return df_with_user_metadata
364
363
return df
365
364
366
- def save_df (self ) -> None :
365
+ def save_df (self , path : str ) -> None :
367
366
"""Save the dataframe used by this object's AsyncMetadataTracker actor"""
368
- return ray .get (self .tracker .save_df .remote ())
367
+ return ray .get (self .tracker .save_df .remote (path ))
369
368
370
369
def clear (self ) -> None :
371
370
"""Clear the dataframe used by this object's AsyncMetadataTracker actor"""
0 commit comments