Skip to content

Commit

Permalink
Merge pull request #1041 from lsst/tickets/DM-45386
Browse files Browse the repository at this point in the history
DM-45386: Fix serialization of datetime64 to parquet via numpy and astropy.
  • Loading branch information
erykoff committed Jul 25, 2024
2 parents 0ea5e73 + e246ec6 commit 88f4f2d
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 0 deletions.
2 changes: 2 additions & 0 deletions doc/changes/DM-45386.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix bug where datetime columns would serialize to parquet from pandas but not
from astropy or numpy.
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/formatters/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,6 +1248,11 @@ def _numpy_dtype_to_arrow_types(dtype: np.dtype) -> list[Any]:
pa.from_numpy_dtype(cast(tuple[np.dtype, tuple[int, ...]], dt.subdtype)[0].type),
prod(dt.shape),
)
elif dt.type == np.datetime64:
time_unit = "ns" if "ns" in dt.str else "us"
# The pa.timestamp() is the correct datatype to round-trip
# a numpy datetime64[ns] or datetime64[us] array.
arrow_type = pa.timestamp(time_unit)
else:
try:
arrow_type = pa.from_numpy_dtype(dt.type)
Expand Down
9 changes: 9 additions & 0 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import datetime
import os
import unittest

Expand Down Expand Up @@ -138,6 +139,8 @@ def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
("f", "i8"),
("strcol", "U10"),
("bytecol", "a10"),
("dtn", "datetime64[ns]"),
("dtu", "datetime64[us]"),
]

if include_multidim:
Expand All @@ -161,6 +164,8 @@ def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
data["f"] = np.arange(nrow) * 10
data["strcol"][:] = "teststring"
data["bytecol"][:] = "teststring"
data["dtn"] = datetime.datetime.fromisoformat("2024-07-23")
data["dtu"] = datetime.datetime.fromisoformat("2024-07-23")

if include_multidim:
data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
Expand Down Expand Up @@ -901,6 +906,10 @@ def testArrowAstropySchema(self):
def testAstropyParquet(self):
tab1 = _makeSimpleAstropyTable()

# Remove the datetime columns, which don't work with astropy currently.
del tab1["dtn"]
del tab1["dtu"]

fname = os.path.join(self.root, "test_astropy.parq")
tab1.write(fname)

Expand Down

0 comments on commit 88f4f2d

Please sign in to comment.