@@ -60,6 +60,7 @@ def _read_parquet_metadata_file(
     s3_additional_kwargs: Optional[Dict[str, str]],
     use_threads: Union[bool, int],
     version_id: Optional[str] = None,
+    ignore_null: bool = False,
     pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Optional[Dict[str, str]]:
     pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
@@ -77,7 +78,7 @@ def _read_parquet_metadata_file(
     )
     if pq_file is None:
         return None
-    return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
+    return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None, ignore_null=ignore_null)[0]


 def _read_schemas_from_files(
@@ -87,6 +88,7 @@ def _read_schemas_from_files(
     boto3_session: boto3.Session,
     s3_additional_kwargs: Optional[Dict[str, str]],
     version_ids: Optional[Dict[str, str]] = None,
+    ignore_null: bool = False,
     pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Tuple[Dict[str, str], ...]:

@@ -102,6 +104,7 @@ def _read_schemas_from_files(
                 s3_additional_kwargs=s3_additional_kwargs,
                 use_threads=use_threads,
                 version_id=version_ids.get(p) if isinstance(version_ids, dict) else None,
+                ignore_null=ignore_null,
                 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
             )
             for p in paths
@@ -117,6 +120,7 @@ def _read_schemas_from_files(
                     itertools.repeat(s3_additional_kwargs),
                     itertools.repeat(use_threads),
                     versions,
+                    itertools.repeat(ignore_null),
                     itertools.repeat(pyarrow_additional_kwargs),
                 )
             )
@@ -175,6 +179,7 @@ def _read_parquet_metadata(
     path_suffix: Optional[str],
     path_ignore_suffix: Optional[str],
     ignore_empty: bool,
+    ignore_null: bool,
     dtype: Optional[Dict[str, str]],
     sampling: float,
     dataset: bool,
@@ -207,6 +212,7 @@ def _read_parquet_metadata(
         else {paths[0]: version_id}
         if isinstance(version_id, str)
         else None,
+        ignore_null=ignore_null,
         pyarrow_additional_kwargs=pyarrow_additional_kwargs,
     )
     columns_types: Dict[str, str] = _merge_schemas(schemas=schemas)
@@ -990,6 +996,7 @@ def read_parquet_metadata(
     path_suffix: Optional[str] = None,
     path_ignore_suffix: Optional[str] = None,
     ignore_empty: bool = True,
+    ignore_null: bool = False,
     dtype: Optional[Dict[str, str]] = None,
     sampling: float = 1.0,
     dataset: bool = False,
@@ -1030,6 +1037,8 @@ def read_parquet_metadata(
         If None, will try to read all files. (default)
     ignore_empty: bool
         Ignore files with 0 bytes.
+    ignore_null: bool
+        Ignore columns with null type.
     dtype : Dict[str, str], optional
         Dictionary of columns names and Athena/Glue types to be casted.
         Useful when you have columns with undetermined data types as partitions columns.
@@ -1083,6 +1092,7 @@ def read_parquet_metadata(
         path_suffix=path_suffix,
         path_ignore_suffix=path_ignore_suffix,
         ignore_empty=ignore_empty,
+        ignore_null=ignore_null,
         dtype=dtype,
         sampling=sampling,
         dataset=dataset,
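
For reference, a minimal usage sketch of the new flag through the public wr.s3.read_parquet_metadata API (the S3 path below is hypothetical):

import awswrangler as wr

# Infer Athena/Glue column types from the Parquet files under the prefix.
# With ignore_null=True, columns whose Arrow type is null are ignored when
# building the returned column-type dictionary (per the docstring entry above).
columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://my-bucket/my-prefix/",
    dataset=True,
    ignore_null=True,
)
print(columns_types)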