Commit 0745e6a

Add ignore_null to read_parquet_metadata
1 parent bac44ea commit 0745e6a

3 files changed: +30 additions, -5 deletions

awswrangler/_data_types.py

Lines changed: 6 additions & 4 deletions
@@ -16,7 +16,7 @@
 _logger: logging.Logger = logging.getLogger(__name__)
 
 
-def pyarrow2athena(dtype: pa.DataType) -> str:  # pylint: disable=too-many-branches,too-many-return-statements
+def pyarrow2athena(dtype: pa.DataType, ignore_null: bool) -> Optional[str]:  # pylint: disable=too-many-branches,too-many-return-statements
     """Pyarrow to Athena data types conversion."""
     if pa.types.is_int8(dtype):
         return "tinyint"
@@ -53,6 +53,8 @@ def pyarrow2athena(dtype: pa.DataType) -> str:  # pylint: disable=too-many-branc
     if pa.types.is_map(dtype):
         return f"map<{pyarrow2athena(dtype=dtype.key_type)}, {pyarrow2athena(dtype=dtype.item_type)}>"
     if dtype == pa.null():
+        if ignore_null:
+            return None
         raise exceptions.UndetectedType("We can not infer the data type from an entire null object column")
     raise exceptions.UnsupportedType(f"Unsupported Pyarrow type: {dtype}")
 
@@ -585,14 +587,14 @@ def pyarrow_schema_from_pandas(
 
 
 def athena_types_from_pyarrow_schema(
-    schema: pa.Schema, partitions: Optional[pyarrow.parquet.ParquetPartitions]
+    schema: pa.Schema, partitions: Optional[pyarrow.parquet.ParquetPartitions], ignore_null: bool,
 ) -> Tuple[Dict[str, str], Optional[Dict[str, str]]]:
     """Extract the related Athena data types from any PyArrow Schema considering possible partitions."""
-    columns_types: Dict[str, str] = {str(f.name): pyarrow2athena(dtype=f.type) for f in schema}
+    columns_types: Dict[str, str] = {str(f.name): pyarrow2athena(dtype=f.type, ignore_null=ignore_null) for f in schema}
     _logger.debug("columns_types: %s", columns_types)
     partitions_types: Optional[Dict[str, str]] = None
     if partitions is not None:
-        partitions_types = {p.name: pyarrow2athena(p.dictionary.type) for p in partitions}
+        partitions_types = {p.name: pyarrow2athena(p.dictionary.type, ignore_null=ignore_null) for p in partitions}
         _logger.debug("partitions_types: %s", partitions_types)
     return columns_types, partitions_types
 
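With ignore_null=True, pyarrow2athena now returns None for an all-null column instead of raising UndetectedType, and athena_types_from_pyarrow_schema forwards the flag for both columns and partitions. A minimal sketch of the resulting behaviour (the schema is illustrative and assumes a build of awswrangler that includes this commit):

import pyarrow as pa
from awswrangler import _data_types

# An all-null column next to an int64 column.
schema = pa.schema([("c0", pa.null()), ("c1", pa.int64())])

# ignore_null=True maps the null column to None instead of raising UndetectedType.
columns_types, partitions_types = _data_types.athena_types_from_pyarrow_schema(
    schema=schema, partitions=None, ignore_null=True
)
# columns_types == {"c0": None, "c1": "bigint"}; partitions_types is None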

awswrangler/s3/_read_parquet.py

Lines changed: 11 additions & 1 deletion
@@ -60,6 +60,7 @@ def _read_parquet_metadata_file(
     s3_additional_kwargs: Optional[Dict[str, str]],
     use_threads: Union[bool, int],
     version_id: Optional[str] = None,
+    ignore_null=False,
     pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Optional[Dict[str, str]]:
     pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
@@ -77,7 +78,7 @@ def _read_parquet_metadata_file(
     )
     if pq_file is None:
         return None
-    return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
+    return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None, ignore_null=ignore_null)[0]
 
 
 def _read_schemas_from_files(
@@ -87,6 +88,7 @@ def _read_schemas_from_files(
     boto3_session: boto3.Session,
     s3_additional_kwargs: Optional[Dict[str, str]],
     version_ids: Optional[Dict[str, str]] = None,
+    ignore_null: bool = False,
     pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Tuple[Dict[str, str], ...]:
 
@@ -102,6 +104,7 @@ def _read_schemas_from_files(
                 s3_additional_kwargs=s3_additional_kwargs,
                 use_threads=use_threads,
                 version_id=version_ids.get(p) if isinstance(version_ids, dict) else None,
+                ignore_null=ignore_null,
                 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
             )
             for p in paths
@@ -117,6 +120,7 @@ def _read_schemas_from_files(
                 itertools.repeat(s3_additional_kwargs),
                 itertools.repeat(use_threads),
                 versions,
+                itertools.repeat(ignore_null),
                 itertools.repeat(pyarrow_additional_kwargs),
             )
         )
@@ -175,6 +179,7 @@ def _read_parquet_metadata(
     path_suffix: Optional[str],
     path_ignore_suffix: Optional[str],
     ignore_empty: bool,
+    ignore_null: bool,
     dtype: Optional[Dict[str, str]],
     sampling: float,
     dataset: bool,
@@ -207,6 +212,7 @@ def _read_parquet_metadata(
         else {paths[0]: version_id}
         if isinstance(version_id, str)
        else None,
+        ignore_null=ignore_null,
         pyarrow_additional_kwargs=pyarrow_additional_kwargs,
     )
     columns_types: Dict[str, str] = _merge_schemas(schemas=schemas)
@@ -990,6 +996,7 @@ def read_parquet_metadata(
     path_suffix: Optional[str] = None,
     path_ignore_suffix: Optional[str] = None,
     ignore_empty: bool = True,
+    ignore_null: bool = False,
     dtype: Optional[Dict[str, str]] = None,
     sampling: float = 1.0,
     dataset: bool = False,
@@ -1030,6 +1037,8 @@ def read_parquet_metadata(
         If None, will try to read all files. (default)
     ignore_empty: bool
         Ignore files with 0 bytes.
+    ignore_null: bool
+        Ignore columns with null type.
     dtype : Dict[str, str], optional
         Dictionary of columns names and Athena/Glue types to be casted.
         Useful when you have columns with undetermined data types as partitions columns.
@@ -1083,6 +1092,7 @@ def read_parquet_metadata(
         path_suffix=path_suffix,
         path_ignore_suffix=path_ignore_suffix,
         ignore_empty=ignore_empty,
+        ignore_null=ignore_null,
         dtype=dtype,
         sampling=sampling,
         dataset=dataset,
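
From the user-facing API, the new flag is simply forwarded from read_parquet_metadata down to the per-file schema readers. A hedged usage sketch (the S3 path below is a placeholder):

import awswrangler as wr

# Default behaviour (ignore_null=False) raises UndetectedType when a column is entirely null;
# with ignore_null=True that column is kept in the result with a None type instead.
columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://bucket/prefix/",  # placeholder path
    ignore_null=True,
)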

tests/test_s3_parquet.py

Lines changed: 13 additions & 0 deletions
@@ -28,6 +28,19 @@ def test_parquet_metadata_partitions_dataset(path, partition_cols):
     assert (columns_types.get("c1") == "bigint") or (partitions_types.get("c1") == "string")
 
 
+def test_read_parquet_metadata_nulls(path):
+    df = pd.DataFrame({"c0": [None, None, None], "c1": [1, 2, 3], "c2": ["a", "b", "c"]})
+    path = f"{path}df.parquet"
+    wr.s3.to_parquet(df, path)
+    with pytest.raises(wr.exceptions.UndetectedType):
+        wr.s3.read_parquet_metadata(path)
+    columns_types, _ = wr.s3.read_parquet_metadata(path, ignore_null=True)
+    assert len(columns_types) == len(df.columns)
+    assert columns_types.get("c0") == None
+    assert columns_types.get("c1") == "bigint"
+    assert columns_types.get("c2") == "string"
+
+
 @pytest.mark.parametrize("partition_cols", [None, ["c2"], ["value", "c2"]])
 def test_parquet_cast_string_dataset(path, partition_cols):
     df = pd.DataFrame({"id": [1, 2, 3], "value": ["foo", "boo", "bar"], "c2": [4, 5, 6], "c3": [7.0, 8.0, 9.0]})
