Skip to content

Commit

Permalink
Visualization cleanup (1/n): Use Table for repr (#1011)
Browse files Browse the repository at this point in the history
This PR has DataFrame repr call into Table repr and removes the existing
DataFrame specific codepath.

For now, we also deprecate the table sizing options (height and width).
It will be fixed to a max of 1x20. (heads up @jaychia)

Also fixes an off-by-one error in Table repr and adds length truncation in
Table repr.

This PR:

- [x] DataFrame repr uses Table repr

Future PRs:

- [ ] Add default html_value method to Series
- [ ] Special implementation of html_value for images
- [ ] Manually implement html_repr for Table
- [ ] DataFrame repr calls Table html_repr

---------

Co-authored-by: Xiayue Charles Lin <[email protected]>
  • Loading branch information
xcharleslin and Xiayue Charles Lin committed Jun 7, 2023
1 parent f6e52b3 commit 122b8ed
Show file tree
Hide file tree
Showing 9 changed files with 68 additions and 91 deletions.
2 changes: 1 addition & 1 deletion daft/logical/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def to_name_set(self) -> set[str]:
return set(self.column_names())

def __repr__(self) -> str:
return repr([(field.name, field.dtype) for field in self])
return repr(self._schema)

def union(self, other: Schema) -> Schema:
if not isinstance(other, Schema):
Expand Down
22 changes: 13 additions & 9 deletions daft/viz/dataframe_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from daft.dataframe.preview import DataFramePreview
from daft.logical.schema import Schema
from daft.viz.repr import vpartition_repr, vpartition_repr_html
from daft.viz.repr import vpartition_repr_html

HAS_PILLOW = False
try:
Expand All @@ -22,6 +22,7 @@ class DataFrameDisplay:

preview: DataFramePreview
schema: Schema
# These formatting options are deprecated for now and not guaranteed to be supported.
column_char_width: int = 20
max_col_rows: int = 3
num_rows: int = 10
Expand All @@ -46,11 +47,14 @@ def _repr_html_(self) -> str:
)

def __repr__(self) -> str:
return vpartition_repr(
self.preview.preview_partition,
self.schema,
self.num_rows,
self._get_user_message(),
max_col_width=self.column_char_width,
max_lines=self.max_col_rows,
)
if len(self.schema) == 0:
return "(No data to display: Dataframe has no columns)"

if self.preview.preview_partition is not None:
res = repr(self.preview.preview_partition)
else:
res = repr(self.schema)

res += f"\n{self._get_user_message()}"

return res
63 changes: 0 additions & 63 deletions daft/viz/repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,6 @@ def _stringify_object_html(val: Any, max_col_width: int, max_lines: int):
return html.escape(_truncate(str(val), max_col_width, max_lines))


def _stringify_vpartition(
    data: dict[str, list[Any]],
    daft_schema: Schema,
    max_col_width: int = DEFAULT_MAX_COL_WIDTH,
    max_lines: int = DEFAULT_MAX_LINES,
) -> dict[str, Iterable[str]]:
    """Render every value of a vPartition as a truncated, display-ready string.

    Each column named in ``daft_schema`` is looked up in ``data``; its values are converted
    with ``str`` and passed through ``_truncate`` together with ``max_col_width`` and
    ``max_lines``. Values of boolean columns that are not Python-typed are lowercased first.

    Returns a mapping from column name to the list of rendered values.
    Raises ``AssertionError`` when ``data`` lacks a column that the schema names.
    """
    assert all(
        colname in data for colname in daft_schema.column_names()
    ), f"Data does not contain columns: {set(daft_schema.column_names()) - set(data.keys())}"

    rendered: dict[str, Iterable[str]] = {}
    for colname in daft_schema.column_names():
        dtype = daft_schema[colname].dtype
        # BUG: tabulate library does not handle string literal values "True" and "False" correctly, so we lowercase them.
        # Python-typed columns are checked first and are never lowercased.
        lowercase = not dtype._is_python_type() and dtype == DataType.bool()
        rendered[colname] = [
            _truncate(str(val).lower() if lowercase else str(val), max_col_width, max_lines)
            for val in data[colname]
        ]

    return rendered


def _stringify_vpartition_html(
data: dict[str, list[Any]],
daft_schema: Schema,
Expand Down Expand Up @@ -137,41 +112,3 @@ def vpartition_repr_html(
{tabulate_html_string}
<small>{user_message}</small>
</div>"""


def vpartition_repr(
    vpartition: Table | None,
    daft_schema: Schema,
    num_rows: int,
    user_message: str,
    max_col_width: int = DEFAULT_MAX_COL_WIDTH,
    max_lines: int = DEFAULT_MAX_LINES,
) -> str:
    """Build the plain-text grid shown for a vPartition in a REPL.

    The first ``num_rows`` values of each column are stringified and laid out with
    ``tabulate`` under two-line headers (column name, then dtype). ``user_message`` is
    appended on its own line after the grid. When ``vpartition`` is ``None`` the grid
    contains only the headers derived from ``daft_schema``.
    """
    if len(daft_schema) == 0:
        return "(No data to display: Dataframe has no columns)"

    if vpartition is not None:
        data = {k: v[:num_rows] for k, v in vpartition.to_pydict().items()}
    else:
        data = {colname: [] for colname in daft_schema.column_names()}

    data_stringified = _stringify_vpartition(
        data,
        daft_schema,
        max_col_width=max_col_width,
        max_lines=max_lines,
    )
    headers = [f"{name}\n{daft_schema[name].dtype}" for name in daft_schema.column_names()]

    # Workaround for https://github.com/astanin/python-tabulate/issues/223
    # If table has no rows, specifying maxcolwidths always raises error.
    has_rows = vpartition is not None and len(vpartition)

    grid = tabulate(
        data_stringified,
        headers=headers,
        tablefmt="grid",
        missingval="None",
        maxcolwidths=max_col_width if has_rows else None,
    )
    return grid + f"\n{user_message}"
2 changes: 1 addition & 1 deletion src/array/ops/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ impl Utf8Array {
let val = self.get(idx);
match val {
None => Ok("None".to_string()),
Some(v) => Ok(format!("\"{v}\"")),
Some(v) => Ok(v.to_string()),
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/python/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ impl PySchema {
pub fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
Ok(PyBytes::new(py, &bincode::serialize(&self.schema).unwrap()).to_object(py))
}

/// Returns the `Display` rendering of the wrapped schema, used as the Python `repr()` text.
pub fn __repr__(&self) -> PyResult<String> {
    Ok(self.schema.to_string())
}
}

impl From<schema::SchemaRef> for PySchema {
Expand Down
13 changes: 9 additions & 4 deletions src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,8 @@ impl Series {
self.inner.cast(&physical_dtype)
}
}
}

impl Display for Series {
// `f` is a buffer, and this method must write the formatted string into it
fn fmt(&self, f: &mut Formatter) -> Result {
pub fn to_prettytable(&self) -> prettytable::Table {
let mut table = prettytable::Table::new();

let header =
Expand Down Expand Up @@ -87,6 +84,14 @@ impl Display for Series {
table.add_row(row.into());
}

table
}
}

impl Display for Series {
    /// Writes the table produced by `to_prettytable` into the formatter buffer `f`.
    fn fmt(&self, f: &mut Formatter) -> Result {
        write!(f, "{}", self.to_prettytable())
    }
}
Expand Down
42 changes: 31 additions & 11 deletions src/table/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -363,11 +363,8 @@ impl Table {
let new_series: DaftResult<Vec<_>> = self.columns.iter().map(|s| s.as_physical()).collect();
Table::from_columns(new_series?)
}
}

impl Display for Table {
// `f` is a buffer, and this method must write the formatted string into it
fn fmt(&self, f: &mut Formatter) -> Result {
pub fn to_prettytable(&self, max_col_width: Option<usize>) -> prettytable::Table {
let mut table = prettytable::Table::new();
let header = self
.schema
Expand Down Expand Up @@ -395,26 +392,49 @@ impl Display for Table {
let row = self
.columns
.iter()
.map(|s| s.str_value(i))
.collect::<DaftResult<Vec<String>>>()
.unwrap();
.map(|s| {
    let mut str_val = s.str_value(i).unwrap();
    if let Some(max_col_width) = max_col_width {
        if str_val.len() > max_col_width {
            // Leave room for the "..." suffix; saturating_sub keeps widths
            // below 3 from underflowing.
            let mut end = max_col_width.saturating_sub(3);
            // `len()` counts bytes, so back off to a char boundary: cutting
            // inside a multi-byte UTF-8 character would panic. Index 0 is
            // always a boundary, so this loop terminates.
            while !str_val.is_char_boundary(end) {
                end -= 1;
            }
            str_val.truncate(end);
            str_val.push_str("...");
        }
    }
    str_val
})
.collect::<Vec<String>>();
table.add_row(row.into());
}
if tail_rows != 0 {
let row: prettytable::Row = (0..self.num_columns()).map(|_| "...").collect();
table.add_row(row);
}

for i in 0..tail_rows {
for i in (self.len() - tail_rows)..(self.len()) {
let row = self
.columns
.iter()
.map(|s| s.str_value(self.len() - tail_rows - 1 + i))
.collect::<DaftResult<Vec<String>>>()
.unwrap();
.map(|s| {
    let mut str_val = s.str_value(i).unwrap();
    if let Some(max_col_width) = max_col_width {
        // Compare the rendered string's length, not the column's row count
        // (`s.len()`), to decide whether this cell needs truncating.
        if str_val.len() > max_col_width {
            // Leave room for the "..." suffix; saturating_sub keeps widths
            // below 3 from underflowing.
            let mut end = max_col_width.saturating_sub(3);
            // `len()` counts bytes, so back off to a char boundary: cutting
            // inside a multi-byte UTF-8 character would panic. Index 0 is
            // always a boundary, so this loop terminates.
            while !str_val.is_char_boundary(end) {
                end -= 1;
            }
            str_val.truncate(end);
            str_val.push_str("...");
        }
    }
    str_val
})
table.add_row(row.into());
}

table
}
}

impl Display for Table {
    /// Writes the table produced by `to_prettytable` into the formatter buffer `f`,
    /// passing a maximum column width of 20.
    fn fmt(&self, f: &mut Formatter) -> Result {
        write!(f, "{}", self.to_prettytable(Some(20)))
    }
}
Expand Down
2 changes: 1 addition & 1 deletion tests/dataframe/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _split_table_row(row: str) -> list[str]:
column_types = _split_table_row(lines[2])

data = []
for line in lines[4:-1]:
for line in lines[4:-2]:
if ROW_DIVIDER_REGEX.match(line):
continue
data.append(_split_table_row(line))
Expand Down
9 changes: 8 additions & 1 deletion tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,14 @@ def test_schema_to_name_set():

def test_repr():
schema = TABLE.schema()
assert repr(schema) == "[('int', Int64), ('float', Float64), ('string', Utf8), ('bool', Boolean)]"
assert (
repr(schema)
== """+-------+---------+--------+---------+
| int | float | string | bool |
| Int64 | Float64 | Utf8 | Boolean |
+-------+---------+--------+---------+
"""
)


def test_to_col_expr():
Expand Down

0 comments on commit 122b8ed

Please sign in to comment.