
Commit 04956d9

Add bulk_read option for reading large amounts of Parquet files quickly (#2033)
1 parent 40d9665 commit 04956d9

File tree

7 files changed: +241 additions, -45 deletions

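For context, the new option trades per-file Parquet metadata inspection for raw listing speed when a very large number of files is read. The public wiring of the flag is not among the files shown below, so the following is a hypothetical usage sketch only: wr.s3.read_parquet is the existing reader, and the ray_args={"bulk_read": True} keyword is an assumption about how the option is surfaced.

# Hypothetical usage sketch -- the bulk_read keyword and where it is passed are
# assumptions, not confirmed by the diffs in this section.
import awswrangler as wr

# Default distributed read: the full Parquet datasource inspects per-file metadata.
df = wr.s3.read_parquet(path="s3://my-bucket/my-prefix/")

# Assumed bulk-read path: skip per-file metadata so many files load quickly.
df_fast = wr.s3.read_parquet(path="s3://my-bucket/my-prefix/", ray_args={"bulk_read": True})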

awswrangler/distributed/ray/datasources/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 from awswrangler.distributed.ray.datasources.arrow_csv_datasource import ArrowCSVDatasource
 from awswrangler.distributed.ray.datasources.arrow_json_datasource import ArrowJSONDatasource
+from awswrangler.distributed.ray.datasources.arrow_parquet_base_datasource import ArrowParquetBaseDatasource
 from awswrangler.distributed.ray.datasources.arrow_parquet_datasource import ArrowParquetDatasource
 from awswrangler.distributed.ray.datasources.pandas_file_based_datasource import UserProvidedKeyBlockWritePathProvider
 from awswrangler.distributed.ray.datasources.pandas_text_datasource import (
@@ -14,6 +15,7 @@
 __all__ = [
     "ArrowCSVDatasource",
     "ArrowJSONDatasource",
+    "ArrowParquetBaseDatasource",
     "ArrowParquetDatasource",
     "PandasCSVDataSource",
     "PandasFWFDataSource",
awswrangler/distributed/ray/datasources/arrow_parquet_base_datasource.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@ (new file)
"""Ray ParquetBaseDatasource Module.

This module is pulled from Ray's [ParquetBaseDatasource]
(https://github.com/ray-project/ray/blob/master/python/ray/data/datasource/parquet_base_datasource.py) with a few changes
and customized to ensure compatibility with AWS SDK for pandas behavior. Changes from the original implementation,
are documented in the comments and marked with (AWS SDK for pandas) prefix.
"""

from typing import Any, Dict, List, Optional

# fs required to implicitly trigger S3 subsystem initialization
import pyarrow as pa
import pyarrow.fs
import pyarrow.parquet as pq
from ray.data.block import BlockAccessor

from awswrangler._arrow import _add_table_partitions, _df_to_table
from awswrangler.distributed.ray.datasources.pandas_file_based_datasource import PandasFileBasedDatasource


class ArrowParquetBaseDatasource(PandasFileBasedDatasource):  # pylint: disable=abstract-method
    """(AWS SDK for pandas) Parquet datasource, for reading and writing Parquet files.

    The following are the changes to the original Ray implementation:
    1. Added handling of additional parameters `dtype`, `index`, `compression` and added the ability
       to pass through additional `pyarrow_additional_kwargs` and `s3_additional_kwargs` for writes.
    3. Added `dataset` and `path_root` parameters to allow user to control loading partitions
       relative to the root S3 prefix.
    """

    _FILE_EXTENSION = "parquet"

    def _read_file(  # type: ignore[override]
        self,
        f: pa.NativeFile,
        path: str,
        path_root: str,
        **reader_args: Any,
    ) -> pa.Table:
        use_threads: bool = reader_args.get("use_threads", False)
        columns: Optional[List[str]] = reader_args.get("columns", None)

        dataset_kwargs = reader_args.get("dataset_kwargs", {})
        coerce_int96_timestamp_unit: Optional[str] = dataset_kwargs.get("coerce_int96_timestamp_unit", None)

        table = pq.read_table(
            f,
            use_threads=use_threads,
            columns=columns,
            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
        )

        table = _add_table_partitions(
            table=table,
            path=f"s3://{path}",
            path_root=path_root,
        )

        return table

    def _open_input_source(
        self,
        filesystem: pyarrow.fs.FileSystem,
        path: str,
        **open_args: Any,
    ) -> pa.NativeFile:
        # Parquet requires `open_input_file` due to random access reads
        return filesystem.open_input_file(path, **open_args)

    def _write_block(  # type: ignore[override]
        self,
        f: pa.NativeFile,
        block: BlockAccessor[Any],
        **writer_args: Any,
    ) -> None:
        schema: Optional[pa.schema] = writer_args.get("schema", None)
        dtype: Optional[Dict[str, str]] = writer_args.get("dtype", None)
        index: bool = writer_args.get("index", False)
        compression: Optional[str] = writer_args.get("compression", None)
        pyarrow_additional_kwargs: Dict[str, Any] = writer_args.get("pyarrow_additional_kwargs", {})

        pq.write_table(
            _df_to_table(block.to_pandas(), schema=schema, index=index, dtype=dtype),
            f,
            compression=compression,
            **pyarrow_additional_kwargs,
        )
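_add_table_partitions is an awswrangler-internal helper whose implementation is not part of this commit; the sketch below only approximates its effect for a hive-style layout, recovering partition columns from the object key relative to path_root and appending them to the table:

# Approximation of the partition handling used by _read_file above; the real
# helper lives in awswrangler._arrow and may differ in details (e.g. column types).
import pyarrow as pa


def add_partitions_from_path(table: pa.Table, path: str, path_root: str) -> pa.Table:
    # Directories between the dataset root and the file name are treated as
    # "column=value" partitions, e.g. ".../year=2023/month=01/part-0.parquet".
    relative = path[len(path_root):].lstrip("/")
    for part in relative.split("/")[:-1]:  # drop the file name itself
        if "=" in part:
            column, value = part.split("=", 1)
            table = table.append_column(column, pa.array([value] * len(table)))
    return table


t = add_partitions_from_path(
    pa.table({"x": [1, 2, 3]}),
    path="s3://bucket/root/year=2023/month=01/part-0.parquet",
    path_root="s3://bucket/root/",
)
print(t.column_names)  # ['x', 'year', 'month']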

awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py

Lines changed: 2 additions & 9 deletions
@@ -32,7 +32,7 @@
 
 from awswrangler._arrow import _add_table_partitions, _df_to_table
 from awswrangler.distributed.ray import ray_remote
-from awswrangler.distributed.ray.datasources.pandas_file_based_datasource import PandasFileBasedDatasource
+from awswrangler.distributed.ray.datasources.arrow_parquet_base_datasource import ArrowParquetBaseDatasource
 from awswrangler.s3._write import _COMPRESSION_2_EXT
 
 _logger: logging.Logger = logging.getLogger(__name__)
@@ -72,7 +72,7 @@
 PARQUET_ENCODING_RATIO_ESTIMATE_NUM_ROWS = 5
 
 
-class ArrowParquetDatasource(PandasFileBasedDatasource):  # pylint: disable=abstract-method
+class ArrowParquetDatasource(ArrowParquetBaseDatasource):  # pylint: disable=abstract-method
     """(AWS SDK for pandas) Parquet datasource, for reading and writing Parquet files.
 
     The following are the changes to the original Ray implementation:
@@ -82,8 +82,6 @@ class ArrowParquetDatasource(PandasFileBasedDatasource):  # pylint: disable=abst
     relative to the root S3 prefix.
     """
 
-    _FILE_EXTENSION = "parquet"
-
     def create_reader(self, **kwargs: Dict[str, Any]) -> Reader[Any]:
         """Return a Reader for the given read arguments."""
         return _ArrowParquetDatasourceReader(**kwargs)  # type: ignore[arg-type]
@@ -349,11 +347,6 @@ def _read_pieces(
     schema: Optional[Union[type, "pyarrow.lib.Schema"]],
     serialized_pieces: List[_SerializedPiece],
 ) -> Iterator["pyarrow.Table"]:
-    # This import is necessary to load the tensor extension type.
-    from ray.data.extensions.tensor_extension import (  # type: ignore[attr-defined] # noqa: F401, E501 # pylint: disable=import-outside-toplevel, unused-import
-        ArrowTensorType,
-    )
-
     # Deserialize after loading the filesystem class.
     pieces: List[ParquetFileFragment] = _deserialize_pieces_with_retry(serialized_pieces)
 
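The net effect here is an inheritance refactor: the pieces shared with the new base class (the parquet file extension, plain single-file reads and block writes) are inherited, while the metadata-aware reader stays in this class. A minimal sketch of the resulting hierarchy, with method bodies elided and names taken from this commit:

class PandasFileBasedDatasource:  # existing awswrangler base class
    ...


class ArrowParquetBaseDatasource(PandasFileBasedDatasource):
    _FILE_EXTENSION = "parquet"  # now defined once, in the base class


class ArrowParquetDatasource(ArrowParquetBaseDatasource):
    def create_reader(self, **kwargs):
        ...  # metadata-aware read path (e.g. encoding-ratio sampling) remains here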

awswrangler/distributed/ray/modin/s3/_read_parquet.py

Lines changed: 31 additions & 13 deletions
@@ -4,18 +4,31 @@
 import modin.pandas as pd
 import pyarrow as pa
 from ray.data import read_datasource
+from ray.data.datasource import FastFileMetadataProvider
+from ray.exceptions import RayTaskError
 
-from awswrangler.distributed.ray.datasources import ArrowParquetDatasource
+from awswrangler.distributed.ray.datasources import ArrowParquetBaseDatasource, ArrowParquetDatasource
 from awswrangler.distributed.ray.modin._utils import _to_modin
 
 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
 
 
+def _resolve_datasource_parameters(bulk_read: bool) -> Dict[str, Any]:
+    if bulk_read:
+        return {
+            "datasource": ArrowParquetBaseDatasource(),
+            "meta_provider": FastFileMetadataProvider(),
+        }
+    return {
+        "datasource": ArrowParquetDatasource(),
+    }
+
+
 def _read_parquet_distributed(  # pylint: disable=unused-argument
     paths: List[str],
     path_root: Optional[str],
-    schema: "pa.schema",
+    schema: Optional[pa.schema],
     columns: Optional[List[str]],
     coerce_int96_timestamp_unit: Optional[str],
     use_threads: Union[bool, int],
@@ -24,18 +37,23 @@ def _read_parquet_distributed(  # pylint: disable=unused-argument
     s3_client: Optional["S3Client"],
     s3_additional_kwargs: Optional[Dict[str, Any]],
     arrow_kwargs: Dict[str, Any],
+    bulk_read: bool,
 ) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
     dataset_kwargs = {}
     if coerce_int96_timestamp_unit:
         dataset_kwargs["coerce_int96_timestamp_unit"] = coerce_int96_timestamp_unit
-    dataset = read_datasource(
-        datasource=ArrowParquetDatasource(),
-        parallelism=parallelism,
-        use_threads=use_threads,
-        paths=paths,
-        schema=schema,
-        columns=columns,
-        dataset_kwargs=dataset_kwargs,
-        path_root=path_root,
-    )
-    return _to_modin(dataset=dataset, to_pandas_kwargs=arrow_kwargs, ignore_index=bool(path_root))
+
+    try:
+        dataset = read_datasource(
+            **_resolve_datasource_parameters(bulk_read),
+            parallelism=parallelism,
+            use_threads=use_threads,
+            paths=paths,
+            schema=schema,
+            columns=columns,
+            path_root=path_root,
+            dataset_kwargs=dataset_kwargs,
+        )
+        return _to_modin(dataset=dataset, to_pandas_kwargs=arrow_kwargs, ignore_index=bool(path_root))
+    except RayTaskError as e:
+        raise e.cause
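Two details are worth noting in this hunk. Ray's FastFileMetadataProvider skips per-file metadata and size fetching, which is what makes the bulk_read path cheap for very large listings, at the cost of fewer up-front checks. And the RayTaskError wrapper is stripped so callers see the original exception raised inside the remote task. A minimal, self-contained sketch of that unwrapping pattern (assumes a local Ray runtime; the failing task is purely illustrative):

import ray
from ray.exceptions import RayTaskError


@ray.remote
def _fail() -> None:
    # Stand-in for a read task that raises inside a Ray worker.
    raise ValueError("corrupt parquet file")


ray.init(ignore_reinit_error=True)
try:
    ray.get(_fail.remote())
except RayTaskError as e:
    # e.cause is the original exception (here a ValueError) -- the same attribute
    # the code above re-raises to the caller.
    print(type(e.cause).__name__, e.cause)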
