Commit dd1a0dc

Simplify the refactoring
1 parent 41c79fc commit dd1a0dc

9 files changed: +229 additions, -226 deletions

awswrangler/_arrow.py

Lines changed: 37 additions & 1 deletion
@@ -3,14 +3,50 @@
 import datetime
 import json
 import logging
-from typing import Any, Dict
+from typing import Any, Dict, Optional, Tuple, cast

 import pandas as pd
 import pyarrow as pa

 _logger: logging.Logger = logging.getLogger(__name__)


+def _extract_partitions_from_path(path_root: str, path: str) -> Dict[str, str]:
+    path_root = path_root if path_root.endswith("/") else f"{path_root}/"
+    if path_root not in path:
+        raise Exception(f"Object {path} is not under the root path ({path_root}).")
+    path_wo_filename: str = path.rpartition("/")[0] + "/"
+    path_wo_prefix: str = path_wo_filename.replace(f"{path_root}/", "")
+    dirs: Tuple[str, ...] = tuple(x for x in path_wo_prefix.split("/") if (x != "") and (x.count("=") == 1))
+    if not dirs:
+        return {}
+    values_tups = cast(Tuple[Tuple[str, str]], tuple(tuple(x.split("=")[:2]) for x in dirs))
+    values_dics: Dict[str, str] = dict(values_tups)
+    return values_dics
+
+
+def _add_table_partitions(
+    table: pa.Table,
+    path: str,
+    path_root: Optional[str],
+) -> pa.Table:
+    part = _extract_partitions_from_path(path_root, f"s3://{path}") if path_root else None
+    if part:
+        for col, value in part.items():
+            try:
+                table = table.set_column(
+                    table.schema.get_field_index(col),
+                    col,
+                    pa.array([value] * len(table)).dictionary_encode(),
+                )
+            except pa.ArrowInvalid:
+                table = table.append_column(
+                    col,
+                    pa.array([value] * len(table)).dictionary_encode(),
+                )
+    return table
+
+
 def _apply_timezone(df: pd.DataFrame, metadata: Dict[str, Any]) -> pd.DataFrame:
     for c in metadata["columns"]:
         if "field_name" in c and c["field_name"] is not None:

awswrangler/_threading.py

Lines changed: 4 additions & 9 deletions
@@ -1,7 +1,6 @@
 """Threading Module (PRIVATE)."""

 import concurrent.futures
-import inspect
 import itertools
 import logging
 from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union
@@ -32,14 +31,10 @@ def __init__(self, use_threads: Union[bool, int]):
     def map(self, func: Callable[..., List[str]], boto3_session: boto3.Session, *iterables: Any) -> List[Any]:
         """Map iterables to multi-threaded function."""
         _logger.debug("Map: %s", func)
-        first_arg = tuple(inspect.signature(func).parameters.keys())[0]
         if self._exec is not None:
-            args = iterables
-            if first_arg == "boto3_session":
-                # Deserialize boto3 session into pickable object
-                boto3_primitives = _utils.boto3_to_primitives(boto3_session=boto3_session)
-                args = (itertools.repeat(boto3_primitives), *iterables)
+            # Deserialize boto3 session into pickable object
+            boto3_primitives = _utils.boto3_to_primitives(boto3_session=boto3_session)
+            args = (itertools.repeat(boto3_primitives), *iterables)
             return list(self._exec.map(func, *args))
         # Single-threaded
-        args = (itertools.repeat(boto3_session), *iterables) if first_arg == "boto3_session" else iterables
-        return list(map(func, *args))  # type: ignore
+        return list(map(func, *(itertools.repeat(boto3_session), *iterables)))  # type: ignore
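With the inspect-based signature check removed, every function dispatched through the map method above is now expected to take a boto3 session (or its pickled primitives) as its first parameter. Below is a small self-contained sketch of the single-threaded branch; the worker function and the string standing in for the session are hypothetical.

import itertools
from typing import Callable, List


def delete_key(boto3_session: str, key: str) -> str:
    # Hypothetical worker; a real one would receive a boto3.Session as its first argument.
    return f"deleted {key} using {boto3_session}"


def single_threaded_map(func: Callable[..., str], boto3_session: str, *iterables: List[str]) -> List[str]:
    # itertools.repeat pairs the same session with every element of the iterables,
    # mirroring map(func, *(itertools.repeat(boto3_session), *iterables)) in the diff above.
    return list(map(func, *(itertools.repeat(boto3_session), *iterables)))


print(single_threaded_map(delete_key, "session-0", ["a.parquet", "b.parquet"]))
# ['deleted a.parquet using session-0', 'deleted b.parquet using session-0']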

awswrangler/_utils.py

Lines changed: 1 addition & 86 deletions
@@ -8,14 +8,13 @@
 import random
 import time
 from concurrent.futures import FIRST_COMPLETED, Future, wait
-from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, Iterator, List, Optional, Sequence, Tuple, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Optional, Sequence, Tuple, Union, cast

 import boto3
 import botocore.config
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pyarrow.parquet
 from pyarrow import fs

 from awswrangler import _config, exceptions
@@ -159,15 +158,6 @@ def resolve_filesystem(session: Optional[boto3.Session] = None) -> fs.FileSystem
     )


-def resolve_filesystem_paths(paths: List[str]) -> List[str]:
-    """Resolve and normalize provided paths based on PyArrow filesystem."""
-    resolved_paths = []
-    for path in paths:
-        _, resolved_path = fs._resolve_filesystem_and_path(path=path)  # pylint: disable=protected-access
-        resolved_paths.append(resolved_path)
-    return resolved_paths
-
-
 def parse_path(path: str) -> Tuple[str, str]:
     """Split a full S3 path in bucket and key strings.

@@ -462,78 +452,3 @@ def list_to_arrow_table(
         arrays.append(v)
     # Will raise if metadata is not None
     return pa.Table.from_arrays(arrays, schema=schema, metadata=metadata)
-
-
-def _extract_partitions_from_path(path_root: str, path: str) -> Dict[str, str]:
-    path_root = path_root if path_root.endswith("/") else f"{path_root}/"
-    if path_root not in path:
-        raise exceptions.InvalidArgumentValue(f"Object {path} is not under the root path ({path_root}).")
-    path_wo_filename: str = path.rpartition("/")[0] + "/"
-    path_wo_prefix: str = path_wo_filename.replace(f"{path_root}/", "")
-    dirs: Tuple[str, ...] = tuple(x for x in path_wo_prefix.split("/") if (x != "") and (x.count("=") == 1))
-    if not dirs:
-        return {}
-    values_tups = cast(Tuple[Tuple[str, str]], tuple(tuple(x.split("=")[:2]) for x in dirs))
-    values_dics: Dict[str, str] = dict(values_tups)
-    return values_dics
-
-
-def _add_partitions_table(
-    table: pa.Table,
-    path: str,
-    path_root: Optional[str],
-) -> pa.Table:
-    part = _extract_partitions_from_path(path_root, f"s3://{path}") if path_root else None
-    if part:
-        for col, value in part.items():
-            table = table.set_column(
-                table.schema.get_field_index(col),
-                col,
-                pa.array([value] * len(table)).dictionary_encode(),
-            )
-    return table
-
-
-def piece_to_table(
-    piece: pyarrow.parquet.ParquetDataset.pieces,
-    schema: pa.schema,
-    columns: Optional[List[str]],
-    path_root: Optional[str],
-    use_threads: Union[bool, int],
-) -> pa.Table:
-    """Create PyArrow Table from list of ParquetDataset batches."""
-    return _add_partitions_table(
-        table=piece.to_table(use_threads=use_threads, schema=schema, columns=columns),
-        path=piece.path,
-        path_root=path_root,
-    )
-
-
-def batches_to_table(
-    pieces: pyarrow.parquet.ParquetDataset.pieces,
-    schema: pa.schema,
-    columns: Optional[List[str]],
-    path_root: Optional[str],
-    use_threads: Union[bool, int],
-    batch_size: Optional[int],
-) -> Iterator[pa.Table]:
-    """Yield PyArrow Tables from list of ParquetDataset pieces."""
-    batch_kwargs = {
-        "use_threads": use_threads,
-        "columns": columns,
-        "schema": schema,
-    }
-    if batch_size:
-        batch_kwargs["batch_size"] = batch_size
-
-    for piece in pieces:
-        batches = piece.to_batches(**batch_kwargs)
-        for batch in batches:
-            table = _add_partitions_table(
-                table=pa.Table.from_batches([batch], schema=schema),
-                path=piece.path,
-                path_root=path_root,
-            )
-            # If the table is empty, drop it.
-            if table.num_rows > 0:
-                yield table

awswrangler/distributed/_distributed.py

Lines changed: 0 additions & 4 deletions
@@ -1,6 +1,5 @@
 """Distributed Module (PRIVATE)."""

-import inspect
 import multiprocessing
 import os
 import sys
@@ -48,9 +47,6 @@ def ray_remote(function: Callable[..., Any]) -> Callable[..., Any]:
     if config.distributed:

         def wrapper(*args: Any, **kwargs: Any) -> Any:
-            first_arg = tuple(inspect.signature(function).parameters.keys())[0]
-            if first_arg != "boto3_session":
-                args = args[1:]
             return ray.remote(function).remote(*args, **kwargs)

         return wrapper
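With the argument-slicing logic gone, ray_remote reduces to the usual conditional-decorator pattern: when distributed mode is on, the call is submitted through Ray unchanged; otherwise the plain function runs inline. A rough standalone sketch follows, where the DISTRIBUTED flag stands in for the library's config.distributed lookup and the undecorated fallback is an assumption about surrounding code not shown in this diff.

from typing import Any, Callable

DISTRIBUTED = False  # stand-in for awswrangler's config.distributed flag (assumption)


def ray_remote(function: Callable[..., Any]) -> Callable[..., Any]:
    if DISTRIBUTED:
        import ray  # requires the optional Ray dependency

        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # All positional arguments are forwarded as-is; no signature inspection needed.
            return ray.remote(function).remote(*args, **kwargs)

        return wrapper
    return function


@ray_remote
def square(x: int) -> int:
    return x * x


print(square(4))  # 16 when DISTRIBUTED is False; an ObjectRef when running under Ray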

awswrangler/distributed/datasources/parquet_datasource.py

Lines changed: 35 additions & 15 deletions
@@ -5,20 +5,22 @@

 import numpy as np
 import pyarrow as pa
+import pyarrow.parquet as pq

 # fs required to implicitly trigger S3 subsystem initialization
 import pyarrow.fs  # noqa: F401 pylint: disable=unused-import
 from ray import cloudpickle
 from ray.data.context import DatasetContext
 from ray.data.datasource.datasource import ReadTask
+from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
 from ray.data.datasource.file_meta_provider import DefaultParquetMetadataProvider, ParquetMetadataProvider
 from ray.data.datasource.parquet_datasource import (
     _deregister_parquet_file_fragment_serialization,
     _register_parquet_file_fragment_serialization,
 )
 from ray.data.impl.output_buffer import BlockOutputBuffer

-from awswrangler._utils import batches_to_table
+from awswrangler._arrow import _add_table_partitions

 _logger: logging.Logger = logging.getLogger(__name__)

@@ -34,15 +36,27 @@ def prepare_read(
         self,
         parallelism: int,
         use_threads: Union[bool, int],
-        parquet_dataset: pa.parquet.ParquetDataset,
-        schema: pa.Schema,
+        filesystem: "pyarrow.fs.FileSystem",
+        paths: Union[str, List[str]],
+        schema: "pyarrow.lib.Schema",
         columns: Optional[List[str]] = None,
+        coerce_int96_timestamp_unit: Optional[str] = None,
        path_root: Optional[str] = None,
         meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
         _block_udf: Optional[Callable[..., Any]] = None,
     ) -> List[ReadTask]:
         """Create and return read tasks for a Parquet file-based datasource."""

+        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
+
+        parquet_dataset = pq.ParquetDataset(
+            path_or_paths=paths,
+            filesystem=filesystem,
+            partitioning=None,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            use_legacy_dataset=False,
+        )
+
         def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
             # Deserialize after loading the filesystem class.
             try:
@@ -58,18 +72,24 @@ def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
             output_buffer = BlockOutputBuffer(block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size)

             _logger.debug("Reading %s parquet pieces", len(pieces))
-            tables = batches_to_table(
-                pieces=pieces,
-                schema=schema,
-                columns=columns,
-                path_root=path_root,
-                use_threads=use_threads,
-                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
-            )
-            for table in tables:
-                output_buffer.add_block(table)
-                if output_buffer.has_next():
-                    yield output_buffer.next()
+            for piece in pieces:
+                batches = piece.to_batches(
+                    use_threads=use_threads,
+                    columns=columns,
+                    schema=schema,
+                    batch_size=PARQUET_READER_ROW_BATCH_SIZE,
+                )
+                for batch in batches:
+                    table = _add_table_partitions(
+                        table=pa.Table.from_batches([batch], schema=schema),
+                        path=piece.path,
+                        path_root=path_root,
+                    )
+                    # If the table is empty, drop it.
+                    if table.num_rows > 0:
+                        output_buffer.add_block(table)
+                        if output_buffer.has_next():
+                            yield output_buffer.next()

             output_buffer.finalize()
             if output_buffer.has_next():
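The rewritten read path follows the standard PyArrow dataset flow: build the ParquetDataset with partitioning=None, stream each fragment as record batches, and re-attach partition values recovered from the fragment path via _add_table_partitions. Below is a rough, Ray-free sketch of that loop against a hypothetical local directory (data/), assuming a recent PyArrow; the commit additionally passes filesystem, coerce_int96_timestamp_unit, and use_legacy_dataset=False when building the dataset.

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical local layout mirroring the S3 dataset: data/year=2022/month=01/part-0.parquet, ...
dataset = pq.ParquetDataset("data/", partitioning=None)

for piece in dataset.fragments:
    # Stream the fragment in bounded record batches instead of materializing it in one piece.
    for batch in piece.to_batches(batch_size=128 * 1024):
        table = pa.Table.from_batches([batch])
        # In the datasource above, _add_table_partitions(table, piece.path, path_root)
        # would append dictionary-encoded partition columns recovered from piece.path here.
        if table.num_rows > 0:  # empty tables are dropped, as in read_pieces
            print(piece.path, table.num_rows)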

awswrangler/s3/_read.py

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,8 @@
 from pandas.api.types import union_categoricals

 from awswrangler import exceptions
-from awswrangler._utils import _extract_partitions_from_path, boto3_to_primitives, ensure_cpu_count
+from awswrangler._arrow import _extract_partitions_from_path
+from awswrangler._utils import boto3_to_primitives, ensure_cpu_count
 from awswrangler.s3._list import _prefix_cleanup

 _logger: logging.Logger = logging.getLogger(__name__)

0 commit comments