Skip to content

Commit 4ea3662

Browse files
committed
Rearranged imports and formatted
1 parent b8b0573 commit 4ea3662

File tree

3 files changed

+24
-14
lines changed

3 files changed

+24
-14
lines changed

awswrangler/catalog/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,8 @@
4242
from awswrangler.catalog._utils import ( # noqa
4343
does_table_exist,
4444
drop_duplicated_columns,
45-
rename_duplicated_columns,
4645
extract_athena_types,
46+
rename_duplicated_columns,
4747
sanitize_column_name,
4848
sanitize_dataframe_columns_names,
4949
sanitize_table_name,
@@ -58,6 +58,7 @@
5858
"delete_column",
5959
"drop_duplicated_columns",
6060
"extract_athena_types",
61+
"rename_duplicated_columns",
6162
"sanitize_column_name",
6263
"sanitize_dataframe_columns_names",
6364
"sanitize_table_name",

awswrangler/catalog/_utils.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def sanitize_column_name(column: str) -> str:
127127

128128
def rename_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame:
129129
"""Append an incremental number to duplicate column names to conform with Amazon Athena.
130-
130+
131131
Also handles potential new duplicated conflicts by appending another `_n`
132132
to the end of the column name if it conflicts.
133133
@@ -137,16 +137,18 @@ def rename_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame:
137137
set_names = set(names)
138138
if len(names) == len(set_names):
139139
return df
140-
d = {key: [name + f"_{i}" if i > 0 else name for i, name in enumerate(names[names==key])] for key in set_names}
140+
d = {key: [name + f"_{i}" if i > 0 else name for i, name in enumerate(names[names == key])] for key in set_names}
141141
df.rename(columns=lambda c: d[c].pop(0), inplace=True)
142142
while df.columns.duplicated().any():
143-
# Catches edge cases where pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [5, 6]})
144-
df = rename_duplicated_columns(df)
145-
143+
# Catches edge cases where pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [5, 6]})
144+
df = rename_duplicated_columns(df)
145+
146146
return df
147147

148148

149-
def sanitize_dataframe_columns_names(df: pd.DataFrame, handle_duplicate_columns: Optional[str] = "warn") -> pd.DataFrame:
149+
def sanitize_dataframe_columns_names(
150+
df: pd.DataFrame, handle_duplicate_columns: Optional[str] = "warn"
151+
) -> pd.DataFrame:
150152
"""Normalize all columns names to be compatible with Amazon Athena.
151153
152154
https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
@@ -185,15 +187,16 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame, handle_duplicate_columns:
185187
df.index.names = [None if x is None else sanitize_column_name(x) for x in df.index.names]
186188
if df.columns.duplicated().any():
187189
if handle_duplicate_columns == "warn":
188-
warnings.warn("Some columns names are duplicated, consider using "+
189-
"`handle_duplicate_columns='[drop|rename]'`")
190+
warnings.warn(
191+
"Some columns names are duplicated, consider using " + "`handle_duplicate_columns='[drop|rename]'`"
192+
)
190193

191194
elif handle_duplicate_columns == "drop":
192195
df = drop_duplicated_columns(df)
193-
196+
194197
elif handle_duplicate_columns == "rename":
195198
df = rename_duplicated_columns(df)
196-
199+
197200
else:
198201
raise ValueError("handle_duplicate_columns must be one of ['warn', 'drop', 'rename']")
199202

tests/test_athena.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -247,9 +247,15 @@ def test_athena_read_list(glue_database):
247247

248248

249249
def test_sanitize_dataframe_column_names():
250-
assert wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({'A': [1, 2]})).equals(pd.DataFrame({'a': [1, 2]})) # Unsure how to test for warnings
251-
assert wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({'A': [1, 2], 'a': [3, 4]}), handle_duplicate_columns="drop").equals(pd.DataFrame({'a': [1, 2]}))
252-
assert wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({'A': [1, 2], 'a': [3, 4], 'a_1': [5, 6]}), handle_duplicate_columns="rename").equals(pd.DataFrame({'a': [1, 2], 'a_1': [3, 4], 'a_1_1': [5, 6]}))
250+
assert wr.catalog.sanitize_dataframe_columns_names(df=pd.DataFrame({"A": [1, 2]})).equals(
251+
pd.DataFrame({"a": [1, 2]})
252+
) # Unsure how to test for warnings
253+
assert wr.catalog.sanitize_dataframe_columns_names(
254+
df=pd.DataFrame({"A": [1, 2], "a": [3, 4]}), handle_duplicate_columns="drop"
255+
).equals(pd.DataFrame({"a": [1, 2]}))
256+
assert wr.catalog.sanitize_dataframe_columns_names(
257+
df=pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [5, 6]}), handle_duplicate_columns="rename"
258+
).equals(pd.DataFrame({"a": [1, 2], "a_1": [3, 4], "a_1_1": [5, 6]}))
253259

254260

255261
def test_sanitize_names():

0 commit comments

Comments
 (0)