Skip to content

Commit 8dccc8b

Browse files
fix: Athena to_iceberg fails with non-lowercase column names (#2736)
Co-authored-by: jaidisido <[email protected]>
1 parent 5e140ff commit 8dccc8b

File tree

2 files changed

+57
-0
lines changed

2 files changed

+57
-0
lines changed

awswrangler/athena/_write_iceberg.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ def _determine_differences(
108108
)
109109
frame_columns_types.update(frame_partitions_types)
110110

111+
# lowercase DataFrame columns, as all the column names from Athena will be lowercased
112+
frame_columns_types = {k.lower(): v for k, v in frame_columns_types.items()}
113+
111114
catalog_column_types = typing.cast(
112115
Dict[str, str],
113116
catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),

tests/unit/test_athena_iceberg.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,3 +870,57 @@ def test_athena_iceberg_use_partition_function(
870870

871871
assert len(df_out) == len(df) + len(df2)
872872
assert len(df_out.columns) == len(df.columns)
873+
874+
875+
def test_to_iceberg_uppercase_columns(
876+
path: str,
877+
path2: str,
878+
path3: str,
879+
glue_database: str,
880+
glue_table: str,
881+
) -> None:
882+
df = pd.DataFrame(
883+
{
884+
"ID": [1, 2, 3, 4, 5],
885+
"TS": [
886+
ts("2020-01-01 00:00:00.0"),
887+
ts("2020-01-02 00:00:01.0"),
888+
ts("2020-01-03 00:00:00.0"),
889+
ts("2020-01-03 12:30:00.0"),
890+
ts("2020-01-03 16:45:00.0"),
891+
],
892+
}
893+
)
894+
df["ID"] = df["ID"].astype("Int64") # Cast as nullable int64 type
895+
896+
split_index = 4
897+
898+
wr.athena.to_iceberg(
899+
df=df.iloc[:split_index],
900+
database=glue_database,
901+
table=glue_table,
902+
table_location=path,
903+
temp_path=path2,
904+
keep_files=False,
905+
)
906+
907+
wr.athena.to_iceberg(
908+
df=df.iloc[split_index:],
909+
database=glue_database,
910+
table=glue_table,
911+
table_location=path,
912+
temp_path=path2,
913+
s3_output=path3,
914+
keep_files=False,
915+
mode="append",
916+
schema_evolution=True,
917+
)
918+
919+
df_output = wr.athena.read_sql_query(
920+
sql=f'SELECT ID, TS FROM "{glue_table}" ORDER BY ID',
921+
database=glue_database,
922+
ctas_approach=False,
923+
unload_approach=False,
924+
)
925+
926+
assert_pandas_equals(df, df_output)

0 commit comments

Comments
 (0)