Skip to content

Commit

Permalink
fix: Add drop_empty_rows flag in read_excel to have control whether t…
Browse files Browse the repository at this point in the history
…o drop empty rows or not
  • Loading branch information
Rashikraj Shrestha authored and Rashikraj Shrestha committed Aug 18, 2024
1 parent a284174 commit a68fca9
Showing 1 changed file with 30 additions and 7 deletions.
37 changes: 30 additions & 7 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -72,6 +73,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -89,6 +91,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> NoReturn: ...


Expand All @@ -108,6 +111,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> dict[str, pl.DataFrame]: ...


Expand All @@ -125,6 +129,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> pl.DataFrame: ...


Expand All @@ -142,6 +147,7 @@ def read_excel(
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
raise_if_empty: bool = ...,
drop_empty_rows: bool = ...,
) -> dict[str, pl.DataFrame]: ...


Expand All @@ -160,6 +166,7 @@ def read_excel(
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
raise_if_empty: bool = True,
drop_empty_rows: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
"""
Read Excel spreadsheet data into a DataFrame.
Expand Down Expand Up @@ -231,6 +238,9 @@ def read_excel(
raise_if_empty
When there is no data in the sheet,`NoDataError` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
drop_empty_rows
A boolean flag whether to drop empty rows or not from the dataframe. Default
is True.
Returns
-------
Expand Down Expand Up @@ -298,6 +308,7 @@ def read_excel(
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
drop_empty_rows=drop_empty_rows,
)


Expand Down Expand Up @@ -529,6 +540,7 @@ def _read_spreadsheet(
columns: Sequence[int] | Sequence[str] | None = None,
has_header: bool = True,
raise_if_empty: bool = True,
drop_empty_rows=True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
Expand Down Expand Up @@ -560,6 +572,7 @@ def _read_spreadsheet(
read_options=read_options,
raise_if_empty=raise_if_empty,
columns=columns,
drop_empty_rows=drop_empty_rows,
)
for name in sheet_names
}
Expand Down Expand Up @@ -748,6 +761,7 @@ def _csv_buffer_to_frame(
read_options: dict[str, Any],
schema_overrides: SchemaDict | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Translate StringIO buffer containing delimited data as a DataFrame."""
# handle (completely) empty sheet data
Expand Down Expand Up @@ -781,11 +795,15 @@ def _csv_buffer_to_frame(
separator=separator,
**read_options,
)
return _drop_null_data(df, raise_if_empty=raise_if_empty)
return _drop_null_data(df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows)


def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:
"""If DataFrame contains columns/rows that contain only nulls, drop them."""
def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True) -> pl.DataFrame:
"""
If DataFrame contains columns/rows that contain only nulls, drop them.
If `drop_empty_rows` is set to `False`, empty rows are not dropped.
"""
null_cols = []
for col_name in df.columns:
# note that if multiple unnamed columns are found then all but the first one
Expand All @@ -806,8 +824,9 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:

if len(df) == 0 and len(df.columns) == 0:
return _empty_frame(raise_if_empty)

return df.filter(~F.all_horizontal(F.all().is_null()))
if drop_empty_rows:
return df.filter(~F.all_horizontal(F.all().is_null()))
return df


def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
Expand Down Expand Up @@ -839,6 +858,7 @@ def _read_spreadsheet_openpyxl(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Use the 'openpyxl' library to read data from the given worksheet."""
infer_schema_length = read_options.pop("infer_schema_length", None)
Expand Down Expand Up @@ -895,7 +915,7 @@ def _read_spreadsheet_openpyxl(
strict=False,
)

df = _drop_null_data(df, raise_if_empty=raise_if_empty)
df = _drop_null_data(df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows)
df = _reorder_columns(df, columns)
return df

Expand All @@ -908,6 +928,7 @@ def _read_spreadsheet_calamine(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
# if we have 'schema_overrides' and a more recent version of `fastexcel`
# we can pass translated dtypes to the engine to refine the initial parse
Expand Down Expand Up @@ -965,7 +986,7 @@ def _read_spreadsheet_calamine(
if schema_overrides:
df = df.cast(dtypes=schema_overrides)

df = _drop_null_data(df, raise_if_empty=raise_if_empty)
df = _drop_null_data(df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows)

# standardise on string dtype for null columns in empty frame
if df.is_empty():
Expand Down Expand Up @@ -1008,6 +1029,7 @@ def _read_spreadsheet_xlsx2csv(
schema_overrides: SchemaDict | None,
columns: Sequence[int] | Sequence[str] | None,
raise_if_empty: bool,
drop_empty_rows: bool,
) -> pl.DataFrame:
"""Use the 'xlsx2csv' library to read data from the given worksheet."""
csv_buffer = StringIO()
Expand All @@ -1030,6 +1052,7 @@ def _read_spreadsheet_xlsx2csv(
read_options=read_options,
schema_overrides=schema_overrides,
raise_if_empty=raise_if_empty,
drop_empty_rows=drop_empty_rows,
)
if cast_to_boolean:
df = df.with_columns(*cast_to_boolean)
Expand Down

0 comments on commit a68fca9

Please sign in to comment.