@@ -127,7 +127,7 @@ def sanitize_column_name(column: str) -> str:
127
127
128
128
def rename_duplicated_columns (df : pd .DataFrame ) -> pd .DataFrame :
129
129
"""Append an incremental number to duplicate column names to conform with Amazon Athena.
130
-
130
+
131
131
Also handles potential new duplicated conflicts by appending another `_n`
132
132
to the end of the column name if it conflicts.
133
133
@@ -137,16 +137,18 @@ def rename_duplicated_columns(df: pd.DataFrame) -> pd.DataFrame:
137
137
set_names = set (names )
138
138
if len (names ) == len (set_names ):
139
139
return df
140
- d = {key : [name + f"_{ i } " if i > 0 else name for i , name in enumerate (names [names == key ])] for key in set_names }
140
+ d = {key : [name + f"_{ i } " if i > 0 else name for i , name in enumerate (names [names == key ])] for key in set_names }
141
141
df .rename (columns = lambda c : d [c ].pop (0 ), inplace = True )
142
142
while df .columns .duplicated ().any ():
143
- # Catches edge cases where pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [5, 6]})
144
- df = rename_duplicated_columns (df )
145
-
143
+ # Catches edge cases where pd.DataFrame({"A": [1, 2], "a": [3, 4], "a_1": [5, 6]})
144
+ df = rename_duplicated_columns (df )
145
+
146
146
return df
147
147
148
148
149
- def sanitize_dataframe_columns_names (df : pd .DataFrame , handle_duplicate_columns : Optional [str ] = "warn" ) -> pd .DataFrame :
149
+ def sanitize_dataframe_columns_names (
150
+ df : pd .DataFrame , handle_duplicate_columns : Optional [str ] = "warn"
151
+ ) -> pd .DataFrame :
150
152
"""Normalize all columns names to be compatible with Amazon Athena.
151
153
152
154
https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html
@@ -185,15 +187,16 @@ def sanitize_dataframe_columns_names(df: pd.DataFrame, handle_duplicate_columns:
185
187
df .index .names = [None if x is None else sanitize_column_name (x ) for x in df .index .names ]
186
188
if df .columns .duplicated ().any ():
187
189
if handle_duplicate_columns == "warn" :
188
- warnings .warn ("Some columns names are duplicated, consider using " +
189
- "`handle_duplicate_columns='[drop|rename]'`" )
190
+ warnings .warn (
191
+ "Some columns names are duplicated, consider using " + "`handle_duplicate_columns='[drop|rename]'`"
192
+ )
190
193
191
194
elif handle_duplicate_columns == "drop" :
192
195
df = drop_duplicated_columns (df )
193
-
196
+
194
197
elif handle_duplicate_columns == "rename" :
195
198
df = rename_duplicated_columns (df )
196
-
199
+
197
200
else :
198
201
raise ValueError ("handle_duplicate_columns must be one of ['warn', 'drop', 'rename']" )
199
202
0 commit comments