
Commit 26df821

fix: Ray deprecation warnings (#2929)
* remove use_legacy_dataset
* remove pytest.mark.modin_index
* replace get_internal_block_refs with iter_internal_ref_bundles
* fix formatting
* fix usage of iter_internal_ref_bundles
1 parent: d09a556

9 files changed (+14, -23 lines)


awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py

Lines changed: 0 additions & 1 deletion
@@ -243,7 +243,6 @@ def __init__( # noqa: PLR0912,PLR0915
                 paths,
                 **dataset_kwargs,
                 filesystem=filesystem,
-                use_legacy_dataset=False,
             )
         except OSError as e:
             _handle_read_os_error(e, paths)
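
Background for this removal: pyarrow deprecated the use_legacy_dataset argument on ParquetDataset, and newer pyarrow releases drop it entirely, which is what produced the warning here. A minimal sketch of the call with the flag removed (the directory path is hypothetical):

import pyarrow.parquet as pq

# The Dataset-backed reader (formerly requested via use_legacy_dataset=False)
# is now the only implementation, so the argument is simply omitted.
dataset = pq.ParquetDataset("path/to/parquet_dir")  # hypothetical local path
table = dataset.read()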

awswrangler/distributed/ray/modin/_data_types.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def pyarrow_types_from_pandas_distributed(
 ) -> dict[str, pa.DataType]:
     """Extract the related Pyarrow data types from a pandas DataFrame."""
     func = ray_remote()(pyarrow_types_from_pandas)
-    first_block_object_ref = _ray_dataset_from_df(df).get_internal_block_refs()[0]
+    first_block_object_ref = next(_ray_dataset_from_df(df).iter_internal_ref_bundles()).block_refs[0]
     return ray_get(  # type: ignore[no-any-return]
         func(
             df=first_block_object_ref,
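
The pattern: iter_internal_ref_bundles() yields RefBundle objects instead of a flat list of block refs, so the first block is now reached through the first bundle. A standalone sketch of the same migration, assuming a Ray version that already ships iter_internal_ref_bundles (the toy DataFrame is illustrative):

import pandas as pd
import ray

ds = ray.data.from_pandas(pd.DataFrame({"c0": [0, 1], "c1": ["foo", "bar"]}))

# Deprecated: first_ref = ds.get_internal_block_refs()[0]
# Replacement: take the first RefBundle, then its first block ref.
first_bundle = next(ds.iter_internal_ref_bundles())
first_ref = first_bundle.block_refs[0]
block = ray.get(first_ref)  # the underlying block (pandas or Arrow)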

awswrangler/distributed/ray/modin/_utils.py

Lines changed: 8 additions & 2 deletions
@@ -51,15 +51,21 @@ def _to_modin(

     return from_partitions(
         partitions=[
-            _block_to_df(block=block, to_pandas_kwargs=_to_pandas_kwargs) for block in dataset.get_internal_block_refs()
+            _block_to_df(block=block_ref, to_pandas_kwargs=_to_pandas_kwargs)
+            for ref_bundle in dataset.iter_internal_ref_bundles()
+            for block_ref in ref_bundle.block_refs
         ],
         axis=0,
         index=index,
     )


 def _split_modin_frame(df: modin_pd.DataFrame, splits: int) -> list[ObjectRef[Any]]:
-    object_refs: list[ObjectRef[Any]] = _ray_dataset_from_df(df).get_internal_block_refs()
+    object_refs: list[ObjectRef[Any]] = [
+        block_ref
+        for ref_bundle in _ray_dataset_from_df(df).iter_internal_ref_bundles()
+        for block_ref in ref_bundle.block_refs
+    ]
     return object_refs
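Where the old API returned one flat list of block refs, the new API returns bundles of refs, so both call sites above flatten with a nested comprehension. A hedged standalone equivalent (the helper name flatten_block_refs is illustrative, not part of the library):

from typing import Any

import ray
from ray.types import ObjectRef


def flatten_block_refs(ds: "ray.data.Dataset") -> list[ObjectRef[Any]]:
    # One entry per block, across all RefBundles of the dataset.
    return [
        block_ref
        for ref_bundle in ds.iter_internal_ref_bundles()
        for block_ref in ref_bundle.block_refs
    ]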

awswrangler/distributed/ray/modin/s3/_write_dataset.py

Lines changed: 5 additions & 1 deletion
@@ -145,7 +145,11 @@ def write_partitions(df: pd.DataFrame, block_index: int) -> tuple[list[str], dic
         )
         return paths, partitions_values

-    block_object_refs = _ray_dataset_from_df(df).get_internal_block_refs()
+    block_object_refs = (
+        block_ref
+        for ref_bundle in _ray_dataset_from_df(df).iter_internal_ref_bundles()
+        for block_ref in ref_bundle.block_refs
+    )
     result = ray_get(
         [write_partitions(object_ref, block_index) for block_index, object_ref in enumerate(block_object_refs)]
     )
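
Design note: unlike _split_modin_frame above, this call site consumes the refs exactly once (inside enumerate), so a generator expression suffices and avoids materializing an intermediate list. A toy illustration of that consumption pattern, with a hypothetical dataset:

import pandas as pd
import ray

ds = ray.data.from_pandas(pd.DataFrame({"c0": range(4)}))

# Block refs are yielded lazily and never stored as a list.
block_refs = (
    block_ref
    for ref_bundle in ds.iter_internal_ref_bundles()
    for block_ref in ref_bundle.block_refs
)
for block_index, ref in enumerate(block_refs):
    block = ray.get(ref)
    print(block_index, block.shape)  # .shape works for pandas and Arrow blocks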

tests/unit/test_athena.py

Lines changed: 0 additions & 5 deletions
@@ -135,7 +135,6 @@ def test_athena_ctas(path, path2, path3, glue_table, glue_table2, glue_database,
     assert len(wr.s3.list_objects(path=path3)) == 0


-@pytest.mark.modin_index
 def test_athena_read_sql_ctas_bucketing(path, path2, glue_table, glue_table2, glue_database, glue_ctas_database):
     df = pd.DataFrame({"c0": [0, 1], "c1": ["foo", "bar"]})
     wr.s3.to_parquet(
@@ -1013,7 +1012,6 @@ def test_bucketing_catalog_parquet_table(path, glue_database, glue_table):
     assert table["StorageDescriptor"]["BucketColumns"] == bucket_cols


-@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2], [False, True, False], ["b", "c", "d"]])
 @pytest.mark.parametrize(
     "dtype",
@@ -1102,7 +1100,6 @@ def test_bucketing_catalog_csv_table(path, glue_database, glue_table):
     assert table["StorageDescriptor"]["BucketColumns"] == bucket_cols


-@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2], [False, True, False], ["b", "c", "d"]])
 @pytest.mark.parametrize(
     "dtype",
@@ -1168,7 +1165,6 @@ def test_bucketing_csv_dataset(path, glue_database, glue_table, bucketing_data,
     assert all(x in bucketing_data for x in loaded_df["c0"].to_list())


-@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2, 3], [False, True, False, True], ["b", "c", "d", "e"]])
 def test_combined_bucketing_partitioning_parquet_dataset(path, glue_database, glue_table, bucketing_data):
     nb_of_buckets = 2
@@ -1296,7 +1292,6 @@ def test_combined_bucketing_partitioning_csv_dataset(path, glue_database, glue_t
     assert all(x in bucketing_data for x in loaded_df["c0"].to_list())


-@pytest.mark.modin_index
 def test_multiple_bucketing_columns_parquet_dataset(path, glue_database, glue_table):
     nb_of_buckets = 2
     df = pd.DataFrame({"c0": [0, 1, 2, 3], "c1": [4, 6, 5, 7], "c2": ["foo", "bar", "baz", "boo"]})

tests/unit/test_athena_csv.py

Lines changed: 0 additions & 1 deletion
@@ -372,7 +372,6 @@ def test_athena_csv_types(path, glue_database, glue_table):
     ensure_data_types_csv(df2)


-@pytest.mark.modin_index
 @pytest.mark.parametrize("use_threads", [True, False])
 @pytest.mark.parametrize("ctas_approach", [True, False])
 @pytest.mark.parametrize("line_count", [1, 2])

tests/unit/test_athena_parquet.py

Lines changed: 0 additions & 1 deletion
@@ -613,7 +613,6 @@ def test_schema_evolution_disabled(path, glue_table, glue_database):
     assert df2.c0.sum() == 3


-@pytest.mark.modin_index
 def test_date_cast(path, glue_table, glue_database):
     df = pd.DataFrame(
         {

tests/unit/test_s3_parquet.py

Lines changed: 0 additions & 8 deletions
@@ -410,7 +410,6 @@ def test_index_recovery_simple_str(path, use_threads):
     assert_pandas_equals(df, df2)


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="https://github.com/ray-project/ray/issues/37771",
@@ -447,7 +446,6 @@ def test_range_index_recovery_simple(path, use_threads):
     assert_pandas_equals(df.reset_index(level=0), df2.reset_index(level=0))


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="https://github.com/ray-project/ray/issues/37771",
@@ -498,7 +496,6 @@ def test_multi_index_recovery_nameless(path, use_threads):
     assert_pandas_equals(df.reset_index(), df2.reset_index())


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=(wr.exceptions.InvalidArgumentCombination, AssertionError),
     reason="Named index not working when partitioning to a single file",
@@ -535,7 +532,6 @@ def test_index_schema_validation(path, glue_database, glue_table, index):
     assert_pandas_equals(pd.concat([df, df]), df2)


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="https://github.com/ray-project/ray/issues/37771",
@@ -625,7 +621,6 @@ def test_to_parquet_dataset_sanitize(path):
     assert df2.par.to_list() == ["a", "b"]


-@pytest.mark.modin_index
 @pytest.mark.parametrize("use_threads", [False, True, 2])
 def test_timezone_file(path, use_threads):
     file_path = f"{path}0.parquet"
@@ -636,7 +631,6 @@ def test_timezone_file(path, use_threads):
     assert_pandas_equals(df, df2)


-@pytest.mark.modin_index
 @pytest.mark.parametrize("use_threads", [True, False, 2])
 def test_timezone_file_columns(path, use_threads):
     file_path = f"{path}0.parquet"
@@ -690,7 +684,6 @@ def test_validate_columns(path, partition_cols) -> None:
         wr.s3.read_parquet(path, columns=["a", "b", "c"], dataset=True, validate_schema=True)


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="https://github.com/ray-project/ray/issues/37771",
@@ -715,7 +708,6 @@ def test_mixed_types_column(path) -> None:
         wr.s3.to_parquet(df, path, dataset=True, partition_cols=["par"])


-@pytest.mark.modin_index
 @pytest.mark.parametrize("compression", [None, "snappy", "gzip", "zstd"])
 def test_parquet_compression(path, compression) -> None:
     df = pd.DataFrame({"id": [1, 2, 3]}, dtype="Int64")

tests/unit/test_s3_text.py

Lines changed: 0 additions & 3 deletions
@@ -188,7 +188,6 @@ def test_csv_dataset_header_modes(path, mode, glue_database, glue_table):
     assert df_res.equals(dfs[-1])


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="https://github.com/ray-project/ray/issues/37771",
@@ -205,7 +204,6 @@ def test_json(path):
     assert df1.equals(wr.s3.read_json(path=[path0, path1], use_threads=True))


-@pytest.mark.modin_index
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="https://github.com/ray-project/ray/issues/37771",
@@ -366,7 +364,6 @@ def test_csv_line_terminator(path, line_terminator):
     assert df.equals(df2)


-@pytest.mark.modin_index
 def test_read_json_versioned(path) -> None:
     path_file = f"{path}0.json"
     dfs = [
