|
1 |
| -import pandas as pd |
| 1 | +import modin.pandas as pd |
2 | 2 | import pytest
|
3 | 3 | import ray
|
4 | 4 |
|
@@ -31,6 +31,13 @@ def big_modin_df():
|
31 | 31 | return frame
|
32 | 32 |
|
33 | 33 |
|
def _modin_repartition(df: pd.DataFrame, num_blocks: int) -> pd.DataFrame:
    """Round-trip *df* through a Ray Dataset to repartition it into ``num_blocks`` blocks.

    The dataframe content is unchanged; only the underlying block layout differs.
    """
    # from_modin -> repartition -> to_modin: Ray handles the block shuffling.
    return ray.data.from_modin(df).repartition(num_blocks).to_modin()


34 | 41 | @pytest.mark.repeat(1)
|
35 | 42 | @pytest.mark.parametrize("benchmark_time", [180])
|
36 | 43 | def test_s3_select(benchmark_time):
|
@@ -90,6 +97,20 @@ def test_s3_write_parquet_dataset(df_s, path, partition_cols, bucketing_info, be
|
90 | 97 | assert timer.elapsed_time < benchmark_time
|
91 | 98 |
|
92 | 99 |
|
@pytest.mark.parametrize("benchmark_time", [200])
@pytest.mark.parametrize("partition_cols", [None, ["payment_type"]])
@pytest.mark.parametrize("num_blocks", [None, 1, 5])
def test_s3_write_parquet_blocks(df_s, path, partition_cols, num_blocks, benchmark_time):
    """Benchmark wr.s3.to_parquet() across partitioning and block-count configurations.

    Writes ``df_s`` to S3 (optionally as a partitioned dataset), reads it back,
    and asserts both round-trip integrity and that the write stayed under
    ``benchmark_time`` seconds.
    """
    # Partitioned writes require dataset mode; bool() replaces the redundant
    # `True if ... else False` ternary.
    dataset = bool(partition_cols)
    # num_blocks=None means "leave the native block layout alone".
    if num_blocks:
        df_s = _modin_repartition(df_s, num_blocks)
    with ExecutionTimer(f"elapsed time of wr.s3.to_parquet() with repartitioning into {num_blocks} blocks") as timer:
        wr.s3.to_parquet(df_s, path=path, dataset=dataset, partition_cols=partition_cols)
    # Round-trip check: reading back must reproduce the original shape.
    df = wr.s3.read_parquet(path=path, dataset=dataset)
    assert df.shape == df_s.shape
    assert timer.elapsed_time < benchmark_time


93 | 114 | @pytest.mark.parametrize("benchmark_time", [5])
|
94 | 115 | def test_s3_delete_objects(path, path2, benchmark_time):
|
95 | 116 | df = pd.DataFrame({"id": [1, 2, 3]})
|
|
0 commit comments