Skip to content

Commit

Permalink
FIX: fix empty rows being deleted in read_excel by introducing drop_e…
Browse files Browse the repository at this point in the history
…mpty_rows flag
  • Loading branch information
Rashikraj Shrestha authored and Rashikraj Shrestha committed Aug 18, 2024
1 parent a284174 commit 7c8ebe3
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 8 deletions.
45 changes: 38 additions & 7 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -72,6 +73,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -89,6 +91,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> NoReturn: ...


Expand All @@ -108,6 +111,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> dict[str, pl.DataFrame]: ...


Expand All @@ -125,6 +129,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -142,6 +147,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> dict[str, pl.DataFrame]: ...


Expand All @@ -160,6 +166,7 @@ def read_excel(
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
raise_if_empty: bool = True,
drop_empty_rows: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
"""
Read Excel spreadsheet data into a DataFrame.
Expand Down Expand Up @@ -231,6 +238,9 @@ def read_excel(
raise_if_empty
When there is no data in the sheet,`NoDataError` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
drop_empty_rows
A boolean flag whether to drop empty rows or not from the dataframe. Default
is True.
Returns
-------
Expand Down Expand Up @@ -298,6 +308,7 @@ def read_excel(
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
drop_empty_rows=drop_empty_rows,
)


Expand Down Expand Up @@ -529,6 +540,7 @@ def _read_spreadsheet(
columns: Sequence[int] | Sequence[str] | None = None,
has_header: bool = True,
raise_if_empty: bool = True,
drop_empty_rows: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
Expand Down Expand Up @@ -560,6 +572,7 @@ def _read_spreadsheet(
read_options=read_options,
raise_if_empty=raise_if_empty,
columns=columns,
drop_empty_rows=drop_empty_rows,
)
for name in sheet_names
}
Expand Down Expand Up @@ -748,6 +761,7 @@ def _csv_buffer_to_frame(
read_options: dict[str, Any],
schema_overrides: SchemaDict | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Translate StringIO buffer containing delimited data as a DataFrame."""
# handle (completely) empty sheet data
Expand Down Expand Up @@ -781,11 +795,19 @@ def _csv_buffer_to_frame(
separator=separator,
**read_options,
)
return _drop_null_data(df, raise_if_empty=raise_if_empty)
return _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)


def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:
"""If DataFrame contains columns/rows that contain only nulls, drop them."""
def _drop_null_data(
df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True
) -> pl.DataFrame:
"""
If DataFrame contains columns/rows that contain only nulls, drop them.
If `drop_empty_rows` is set to `False`, empty rows are not dropped.
"""
null_cols = []
for col_name in df.columns:
# note that if multiple unnamed columns are found then all but the first one
Expand All @@ -806,8 +828,9 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:

if len(df) == 0 and len(df.columns) == 0:
return _empty_frame(raise_if_empty)

return df.filter(~F.all_horizontal(F.all().is_null()))
if drop_empty_rows:
return df.filter(~F.all_horizontal(F.all().is_null()))
return df


def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
Expand Down Expand Up @@ -839,6 +862,7 @@ def _read_spreadsheet_openpyxl(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Use the 'openpyxl' library to read data from the given worksheet."""
infer_schema_length = read_options.pop("infer_schema_length", None)
Expand Down Expand Up @@ -895,7 +919,9 @@ def _read_spreadsheet_openpyxl(
strict=False,
)

df = _drop_null_data(df, raise_if_empty=raise_if_empty)
df = _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)
df = _reorder_columns(df, columns)
return df

Expand All @@ -908,6 +934,7 @@ def _read_spreadsheet_calamine(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
# if we have 'schema_overrides' and a more recent version of `fastexcel`
# we can pass translated dtypes to the engine to refine the initial parse
Expand Down Expand Up @@ -965,7 +992,9 @@ def _read_spreadsheet_calamine(
if schema_overrides:
df = df.cast(dtypes=schema_overrides)

df = _drop_null_data(df, raise_if_empty=raise_if_empty)
df = _drop_null_data(
df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows
)

# standardise on string dtype for null columns in empty frame
if df.is_empty():
Expand Down Expand Up @@ -1008,6 +1037,7 @@ def _read_spreadsheet_xlsx2csv(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Use the 'xlsx2csv' library to read data from the given worksheet."""
csv_buffer = StringIO()
Expand All @@ -1030,6 +1060,7 @@ def _read_spreadsheet_xlsx2csv(
read_options=read_options,
schema_overrides=schema_overrides,
raise_if_empty=raise_if_empty,
drop_empty_rows=drop_empty_rows,
)
if cast_to_boolean:
df = df.with_columns(*cast_to_boolean)
Expand Down
Binary file not shown.
42 changes: 41 additions & 1 deletion py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
if TYPE_CHECKING:
from polars._typing import ExcelSpreadsheetEngine, SelectorType

pytestmark = pytest.mark.slow()
# pytestmark = pytest.mark.slow()


@pytest.fixture()
Expand Down Expand Up @@ -81,6 +81,11 @@ def path_ods_mixed(io_files_path: Path) -> Path:
return io_files_path / "mixed.ods"


@pytest.fixture()
def path_empty_rows_excel(io_files_path: Path) -> Path:
return io_files_path / "test_empty_rows.xlsx"


@pytest.mark.parametrize(
("read_spreadsheet", "source", "engine_params"),
[
Expand Down Expand Up @@ -1058,3 +1063,38 @@ def test_identify_workbook(
bytesio_data = BytesIO(f.read())
assert _identify_workbook(bytesio_data) == file_type
assert isinstance(pl.read_excel(bytesio_data, engine="calamine"), pl.DataFrame)


def test_drop_empty_rows(path_empty_rows_excel: Path) -> None:
df1 = pl.read_excel(source=path_empty_rows_excel, engine="xlsx2csv")
assert df1.shape == (8, 4)
df2 = pl.read_excel(
source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=True
)
assert df2.shape == (8, 4)
df3 = pl.read_excel(
source=path_empty_rows_excel, engine="xlsx2csv", drop_empty_rows=False
)
assert df3.shape == (10, 4)

df4 = pl.read_excel(source=path_empty_rows_excel, engine="openpyxl")
assert df4.shape == (8, 4)
df5 = pl.read_excel(
source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=True
)
assert df5.shape == (8, 4)
df6 = pl.read_excel(
source=path_empty_rows_excel, engine="openpyxl", drop_empty_rows=False
)
assert df6.shape == (10, 4)

df7 = pl.read_excel(source=path_empty_rows_excel, engine="calamine")
assert df7.shape == (8, 4)
df8 = pl.read_excel(
source=path_empty_rows_excel, engine="calamine", drop_empty_rows=True
)
assert df8.shape == (8, 4)
df9 = pl.read_excel(
source=path_empty_rows_excel, engine="calamine", drop_empty_rows=False
)
assert df9.shape == (10, 4)

0 comments on commit 7c8ebe3

Please sign in to comment.