fix: Athena to_iceberg fails with non-lowercase column names (#2736)

LeonLuttenberger · jaidisido · web-flow · commit 8dccc8b4b526 · 2024-03-19T10:52:45.000Z
Co-authored-by: jaidisido &lt;jaidisido@gmail.com&gt;
diff --git a/awswrangler/athena/_write_iceberg.py b/awswrangler/athena/_write_iceberg.py
@@ -108,6 +108,9 @@ def _determine_differences(
     )
     frame_columns_types.update(frame_partitions_types)
 
+    # lowercase DataFrame columns, as all the column names from Athena will be lowercased
+    frame_columns_types = {k.lower(): v for k, v in frame_columns_types.items()}
+
     catalog_column_types = typing.cast(
         Dict[str, str],
         catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),
diff --git a/tests/unit/test_athena_iceberg.py b/tests/unit/test_athena_iceberg.py
@@ -870,3 +870,57 @@ def test_athena_iceberg_use_partition_function(
 
     assert len(df_out) == len(df) + len(df2)
     assert len(df_out.columns) == len(df.columns)
+
+
+def test_to_iceberg_uppercase_columns(
+    path: str,
+    path2: str,
+    path3: str,
+    glue_database: str,
+    glue_table: str,
+) -> None:
+    df = pd.DataFrame(
+        {
+            "ID": [1, 2, 3, 4, 5],
+            "TS": [
+                ts("2020-01-01 00:00:00.0"),
+                ts("2020-01-02 00:00:01.0"),
+                ts("2020-01-03 00:00:00.0"),
+                ts("2020-01-03 12:30:00.0"),
+                ts("2020-01-03 16:45:00.0"),
+            ],
+        }
+    )
+    df["ID"] = df["ID"].astype("Int64")  # Cast as nullable int64 type
+
+    split_index = 4
+
+    wr.athena.to_iceberg(
+        df=df.iloc[:split_index],
+        database=glue_database,
+        table=glue_table,
+        table_location=path,
+        temp_path=path2,
+        keep_files=False,
+    )
+
+    wr.athena.to_iceberg(
+        df=df.iloc[split_index:],
+        database=glue_database,
+        table=glue_table,
+        table_location=path,
+        temp_path=path2,
+        s3_output=path3,
+        keep_files=False,
+        mode="append",
+        schema_evolution=True,
+    )
+
+    df_output = wr.athena.read_sql_query(
+        sql=f'SELECT ID, TS FROM "{glue_table}" ORDER BY ID',
+        database=glue_database,
+        ctas_approach=False,
+        unload_approach=False,
+    )
+
+    assert_pandas_equals(df, df_output)

Original file line number	Diff line number	Diff line change
`@@ -108,6 +108,9 @@ def _determine_differences(`
`108`	`108`	`)`
`109`	`109`	`frame_columns_types.update(frame_partitions_types)`
`110`	`110`
	`111`	`+ # lowercase DataFrame columns, as all the column names from Athena will be lowercased`
	`112`	`+ frame_columns_types = {k.lower(): v for k, v in frame_columns_types.items()}`
	`113`	`+`
`111`	`114`	`catalog_column_types = typing.cast(`
`112`	`115`	`Dict[str, str],`
`113`	`116`	`catalog.get_table_types(database=database, table=table, catalog_id=catalog_id, boto3_session=boto3_session),`