Skip to content

Commit

Permalink
feat(python): Add DataFrame.serialize/deserialize (#16545)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored May 28, 2024
1 parent 51bf030 commit 067cef8
Show file tree
Hide file tree
Showing 10 changed files with 454 additions and 252 deletions.
9 changes: 9 additions & 0 deletions py-polars/docs/source/reference/dataframe/miscellaneous.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ Miscellaneous
DataFrame.frame_equal
DataFrame.lazy
DataFrame.map_rows

Serialization
-------------

.. autosummary::
:toctree: api/

DataFrame.deserialize
DataFrame.serialize
4 changes: 2 additions & 2 deletions py-polars/docs/source/reference/lazyframe/miscellaneous.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ Miscellaneous
LazyFrame.pipe
LazyFrame.profile

Read/write logical plan
-----------------------
Serialization
-------------

.. autosummary::
:toctree: api/
Expand Down
178 changes: 147 additions & 31 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
from polars.type_aliases import DbWriteMode, JaxExportType, TorchExportType

with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PyDataFrame
from polars.polars import dtype_str_repr as _dtype_str_repr
from polars.polars import write_clipboard_string as _write_clipboard_string

Expand All @@ -121,7 +122,6 @@
from polars import DataType, Expr, LazyFrame, Series
from polars.interchange.dataframe import PolarsDataFrame
from polars.ml.torch import PolarsDataset
from polars.polars import PyDataFrame
from polars.type_aliases import (
AsofJoinStrategy,
AvroCompression,
Expand Down Expand Up @@ -418,6 +418,46 @@ def __init__(
)
raise TypeError(msg)

@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Self:
    """
    Read a serialized DataFrame from a file.

    Parameters
    ----------
    source
        Path to a file or a file-like object (by file-like object, we refer to
        objects that have a `read()` method, such as a file handler (e.g.
        via builtin `open` function) or `BytesIO`).

    See Also
    --------
    DataFrame.serialize

    Examples
    --------
    >>> import io
    >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
    >>> json = df.serialize()
    >>> pl.DataFrame.deserialize(io.StringIO(json))
    shape: (3, 2)
    ┌─────┬─────┐
    │ a   ┆ b   │
    │ --- ┆ --- │
    │ i64 ┆ f64 │
    ╞═════╪═════╡
    │ 1   ┆ 4.0 │
    │ 2   ┆ 5.0 │
    │ 3   ┆ 6.0 │
    └─────┴─────┘
    """
    if isinstance(source, (str, Path)):
        # Path-like input: normalize it before handing it to PyDataFrame.
        source = normalize_filepath(source)
    elif isinstance(source, StringIO):
        # Text buffer: pass its full contents on as an in-memory byte stream.
        text = source.getvalue()
        source = BytesIO(text.encode())

    py_df = PyDataFrame.deserialize(source)
    return cls._from_pydf(py_df)

@classmethod
def _from_pydf(cls, py_df: PyDataFrame) -> Self:
"""Construct Polars DataFrame from FFI PyDataFrame object."""
Expand Down Expand Up @@ -2174,30 +2214,78 @@ def to_init_repr(self, n: int = 1000) -> str:

return output.getvalue()

@overload
def serialize(self, file: None = ...) -> str: ...


@overload
def serialize(self, file: IOBase | str | Path) -> None: ...


def serialize(self, file: IOBase | str | Path | None = None) -> str | None:
    """
    Serialize this DataFrame to a file or string in JSON format.

    Parameters
    ----------
    file
        File path or writable file-like object to which the result will be written.
        If set to `None` (default), the output is returned as a string instead.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "foo": [1, 2, 3],
    ...         "bar": [6, 7, 8],
    ...     }
    ... )
    >>> df.serialize()
    '{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}'
    """
    if file is None or isinstance(file, StringIO):
        # Both cases need the output as text: serialize into a scratch byte
        # buffer first, then decode it.
        with BytesIO() as scratch:
            self._df.serialize(scratch)
            raw = scratch.getvalue()
        text = raw.decode("utf8")

        if file is None:
            return text
        file.write(text)
        return None

    # Remaining targets are passed to the underlying frame directly;
    # path-like targets are normalized first.
    if isinstance(file, (str, Path)):
        file = normalize_filepath(file)
    self._df.serialize(file)
    return None

@overload
def write_json(
self,
file: None = ...,
*,
pretty: bool = ...,
row_oriented: bool = ...,
pretty: bool | None = ...,
) -> str: ...

@overload
def write_json(
self,
file: IOBase | str | Path,
*,
pretty: bool = ...,
row_oriented: bool = ...,
pretty: bool | None = ...,
) -> None: ...

def write_json(
self,
file: IOBase | str | Path | None = None,
*,
pretty: bool = False,
row_oriented: bool = False,
pretty: bool | None = None,
) -> str | None:
"""
Serialize to JSON representation.
Expand All @@ -2207,11 +2295,18 @@ def write_json(
file
File path or writable file-like object to which the result will be written.
If set to `None` (default), the output is returned as a string instead.
pretty
Pretty serialize json.
row_oriented
Write to row oriented json. This is slower, but more common.
pretty
Pretty serialize json.
.. deprecated:: 0.20.31
The `pretty` functionality for `write_json` will be removed in the next
breaking release. Use :meth:`serialize` to serialize the DataFrame in
the regular JSON format.
See Also
--------
DataFrame.write_ndjson
Expand All @@ -2224,27 +2319,44 @@ def write_json(
... "bar": [6, 7, 8],
... }
... )
>>> df.write_json()
'{"columns":[{"name":"foo","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"bar","datatype":"Int64","bit_settings":"","values":[6,7,8]}]}'
>>> df.write_json(row_oriented=True)
'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]'
"""
if isinstance(file, (str, Path)):
file = normalize_filepath(file)
to_string_io = (file is not None) and isinstance(file, StringIO)
if file is None or to_string_io:
if pretty is not None:
issue_deprecation_warning(
"The `pretty` functionality for `write_json` will be removed in the next breaking release."
" Use `DataFrame.serialize` to serialize the DataFrame in the regular JSON format.",
version="0.20.31",
)
else:
pretty = False

if not row_oriented:
issue_deprecation_warning(
"`DataFrame.write_json` will only write row-oriented JSON in the next breaking release."
" Use `DataFrame.serialize` instead.",
version="0.20.31",
)

def write_json_to_string(*, pretty: bool, row_oriented: bool) -> str:
with BytesIO() as buf:
self._df.write_json(buf, pretty, row_oriented)
self._df.write_json_old(buf, pretty=pretty, row_oriented=row_oriented)
json_bytes = buf.getvalue()
return json_bytes.decode("utf8")

json_str = json_bytes.decode("utf8")
if to_string_io:
file.write(json_str) # type: ignore[union-attr]
else:
return json_str
if file is None:
return write_json_to_string(pretty=pretty, row_oriented=row_oriented)
elif isinstance(file, StringIO):
json_str = write_json_to_string(pretty=pretty, row_oriented=row_oriented)
file.write(json_str)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
self._df.write_json_old(file, pretty=pretty, row_oriented=row_oriented)
return None
else:
self._df.write_json(file, pretty, row_oriented)
return None
self._df.write_json_old(file, pretty=pretty, row_oriented=row_oriented)
return None

@overload
def write_ndjson(self, file: None = None) -> str: ...
Expand Down Expand Up @@ -2273,22 +2385,26 @@ def write_ndjson(self, file: IOBase | str | Path | None = None) -> str | None:
>>> df.write_ndjson()
'{"foo":1,"bar":6}\n{"foo":2,"bar":7}\n{"foo":3,"bar":8}\n'
"""
if isinstance(file, (str, Path)):
file = normalize_filepath(file)
to_string_io = (file is not None) and isinstance(file, StringIO)
if file is None or to_string_io:

def write_ndjson_to_string() -> str:
with BytesIO() as buf:
self._df.write_ndjson(buf)
json_bytes = buf.getvalue()
ndjson_bytes = buf.getvalue()
return ndjson_bytes.decode("utf8")

json_str = json_bytes.decode("utf8")
if to_string_io:
file.write(json_str) # type: ignore[union-attr]
else:
return json_str
if file is None:
return write_ndjson_to_string()
elif isinstance(file, StringIO):
ndjson_str = write_ndjson_to_string()
file.write(ndjson_str)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
self._df.write_ndjson(file)
return None
else:
self._df.write_ndjson(file)
return None
return None

@overload
def write_csv(
Expand Down
9 changes: 4 additions & 5 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ def function(s: Series) -> Series: # pragma: no cover
@classmethod
def deserialize(cls, source: str | Path | IOBase) -> Self:
"""
Read an expression from a JSON file.
Read a serialized expression from a file.
Parameters
----------
Expand All @@ -351,10 +351,9 @@ def deserialize(cls, source: str | Path | IOBase) -> Self:
Warnings
--------
This function uses :mod:`pickle` under some circumstances, and as
such inherits the security implications. Deserializing can execute
arbitrary code so it should only be attempted on trusted data.
pickle is only used when the logical plan contains python UDFs.
This function uses :mod:`pickle` when the logical plan contains Python UDFs,
and as such inherits the security implications. Deserializing can execute
arbitrary code, so it should only be attempted on trusted data.
See Also
--------
Expand Down
24 changes: 14 additions & 10 deletions py-polars/polars/expr/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,22 +291,26 @@ def serialize(self, file: IOBase | str | Path | None = None) -> str | None:
>>> pl.Expr.deserialize(StringIO(json)) # doctest: +ELLIPSIS
<Expr ['col("foo").sum().over([col("ba…'] at ...>
"""
if isinstance(file, (str, Path)):
file = normalize_filepath(file)
to_string_io = (file is not None) and isinstance(file, StringIO)
if file is None or to_string_io:

def serialize_to_string() -> str:
with BytesIO() as buf:
self._pyexpr.serialize(buf)
json_bytes = buf.getvalue()
return json_bytes.decode("utf8")

json_str = json_bytes.decode("utf8")
if to_string_io:
file.write(json_str) # type: ignore[union-attr]
else:
return json_str
if file is None:
return serialize_to_string()
elif isinstance(file, StringIO):
json_str = serialize_to_string()
file.write(json_str)
return None
elif isinstance(file, (str, Path)):
file = normalize_filepath(file)
self._pyexpr.serialize(file)
return None
else:
self._pyexpr.serialize(file)
return None
return None

@overload
def write_json(self, file: None = ...) -> str: ...
Expand Down
Loading

0 comments on commit 067cef8

Please sign in to comment.