Skip to content

Support polars Datetime with timezone types in cudf_polars #19155

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: branch-25.08
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ DLManagedTensor* to_dlpack(
*/
struct column_metadata {
std::string name; ///< Name of the column
std::string timezone; ///< Timezone of the column (used for timestamp types; empty if the column has no timezone)
std::vector<column_metadata> children_meta; ///< Metadata of children of the column

/**
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/interop/to_arrow_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,23 @@ struct dispatch_to_arrow_type {
}

template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
int operator()(column_view input_view, column_metadata const&, ArrowSchema* out)
int operator()(column_view input_view, column_metadata const& metadata, ArrowSchema* out)
{
cudf::type_id const id = input_view.type().id();
auto timezone = metadata.timezone.c_str();
switch (id) {
case cudf::type_id::TIMESTAMP_SECONDS:
return ArrowSchemaSetTypeDateTime(
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, nullptr);
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, timezone);
case cudf::type_id::TIMESTAMP_MILLISECONDS:
return ArrowSchemaSetTypeDateTime(
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, timezone);
case cudf::type_id::TIMESTAMP_MICROSECONDS:
return ArrowSchemaSetTypeDateTime(
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, nullptr);
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, timezone);
case cudf::type_id::TIMESTAMP_NANOSECONDS:
return ArrowSchemaSetTypeDateTime(
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, nullptr);
out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, timezone);
case cudf::type_id::DURATION_SECONDS:
return ArrowSchemaSetTypeDateTime(
out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_SECOND, nullptr);
Expand Down
5 changes: 4 additions & 1 deletion python/cudf_polars/cudf_polars/containers/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ def _create_polars_column_metadata(
]
else:
children_meta = []
return plc.interop.ColumnMetadata(name=name, children_meta=children_meta)
timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None
return plc.interop.ColumnMetadata(
name=name, timezone=timezone or "", children_meta=children_meta
)


# This is also defined in pylibcudf.interop
Expand Down
2 changes: 0 additions & 2 deletions python/cudf_polars/cudf_polars/containers/datatype.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType:
elif isinstance(dtype, pl.Time):
raise NotImplementedError("Time of day dtype not implemented")
elif isinstance(dtype, pl.Datetime):
if dtype.time_zone is not None:
raise NotImplementedError("Time zone support")
if dtype.time_unit == "ms":
return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS)
elif dtype.time_unit == "us":
Expand Down
8 changes: 8 additions & 0 deletions python/cudf_polars/cudf_polars/testing/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,14 @@ def pytest_configure(config: pytest.Config) -> None:
"tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "https://github.com/pola-rs/polars/issues/23214",
"tests/unit/datatypes/test_struct.py::test_struct_null_cast": "pylibcudf.Scalar does not support struct scalars",
"tests/unit/datatypes/test_struct.py::test_struct_outer_nullability_zip_18119": "pylibcudf.Scalar does not support struct scalars",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-columns]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-none]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR",
}


Expand Down
3 changes: 3 additions & 0 deletions python/cudf_polars/tests/expressions/test_datetime_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
pl.Datetime("ms"),
pl.Datetime("us"),
pl.Datetime("ns"),
pl.Datetime("ms", time_zone="UTC"),
pl.Datetime("us", time_zone="Europe/Dublin"),
pl.Datetime("ns", time_zone="US/Pacific"),
pl.Duration("ms"),
pl.Duration("us"),
pl.Duration("ns"),
Expand Down
3 changes: 1 addition & 2 deletions python/cudf_polars/tests/utils/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
[
pl.Time(),
pl.Struct({"a": pl.Binary(), "b": pl.Float32}),
pl.Datetime("ms", time_zone="US/Pacific"),
pl.List(pl.Datetime("ms", time_zone="US/Pacific")),
pl.List(pl.Object()),
pl.Array(pl.Int8, 2),
pl.Binary(),
pl.Categorical(),
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/_interop_helpers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class ColumnMetadata:
This is the Python representation of :cpp:class:`cudf::column_metadata`.
"""
name: str = ""
timezone: str = ""
children_meta: list[ColumnMetadata] = field(default_factory=list)


Expand Down Expand Up @@ -76,6 +77,7 @@ cdef column_metadata _metadata_to_libcudf(metadata):
"""
cdef column_metadata c_metadata
c_metadata.name = metadata.name.encode()
c_metadata.timezone = metadata.timezone.encode()
for child_meta in metadata.children_meta:
c_metadata.children_meta.push_back(_metadata_to_libcudf(child_meta))
return c_metadata
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/libcudf/interop.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
column_metadata() except +libcudf_exception_handler
column_metadata(string name_) except +libcudf_exception_handler
string name
string timezone
vector[column_metadata] children_meta


Expand Down
4 changes: 2 additions & 2 deletions python/pylibcudf/pylibcudf/tests/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def metadata_from_arrow_type(
pa_type.field(i).type, pa_type.field(i).name
)
child_meta.append(field_meta)
metadata = plc.interop.ColumnMetadata(name, child_meta)
metadata = plc.interop.ColumnMetadata(name, children_meta=child_meta)
elif pa.types.is_struct(pa_type):
child_meta = []
for i in range(pa_type.num_fields):
Expand All @@ -38,7 +38,7 @@ def metadata_from_arrow_type(
metadata = plc.interop.ColumnMetadata(
name,
# libcudf does not store field names, so just match pyarrow's.
child_meta,
children_meta=child_meta,
)
return metadata

Expand Down