diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 1fd25cc1417ad..ca7873b9f85c2 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -55,6 +55,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., raise_if_empty: bool = ..., + drop_empty_rows: bool = ..., ) -> pl.DataFrame: ... @@ -72,6 +73,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., raise_if_empty: bool = ..., + drop_empty_rows: bool = ..., ) -> pl.DataFrame: ... @@ -89,6 +91,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., raise_if_empty: bool = ..., + drop_empty_rows: bool = ..., ) -> NoReturn: ... @@ -108,6 +111,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., raise_if_empty: bool = ..., + drop_empty_rows: bool = ..., ) -> dict[str, pl.DataFrame]: ... @@ -125,6 +129,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., raise_if_empty: bool = ..., + drop_empty_rows: bool = ..., ) -> pl.DataFrame: ... @@ -142,6 +147,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., raise_if_empty: bool = ..., + drop_empty_rows: bool = ..., ) -> dict[str, pl.DataFrame]: ... @@ -160,6 +166,7 @@ def read_excel( schema_overrides: SchemaDict | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, raise_if_empty: bool = True, + drop_empty_rows: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: """ Read Excel spreadsheet data into a DataFrame. @@ -231,6 +238,9 @@ def read_excel( raise_if_empty When there is no data in the sheet,`NoDataError` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. + drop_empty_rows + A boolean flag whether to drop empty rows or not from the dataframe. Default + is True. Returns ------- @@ -298,6 +308,7 @@ def read_excel( raise_if_empty=raise_if_empty, has_header=has_header, columns=columns, + drop_empty_rows=drop_empty_rows, ) @@ -529,6 +540,7 @@ def _read_spreadsheet( columns: Sequence[int] | Sequence[str] | None = None, has_header: bool = True, raise_if_empty: bool = True, + drop_empty_rows=True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: if isinstance(source, (str, Path)): source = normalize_filepath(source) @@ -560,6 +572,7 @@ def _read_spreadsheet( read_options=read_options, raise_if_empty=raise_if_empty, columns=columns, + drop_empty_rows=drop_empty_rows, ) for name in sheet_names } @@ -748,6 +761,7 @@ def _csv_buffer_to_frame( read_options: dict[str, Any], schema_overrides: SchemaDict | None, raise_if_empty: bool, + drop_empty_rows: bool, ) -> pl.DataFrame: """Translate StringIO buffer containing delimited data as a DataFrame.""" # handle (completely) empty sheet data @@ -781,11 +795,15 @@ def _csv_buffer_to_frame( separator=separator, **read_options, ) - return _drop_null_data(df, raise_if_empty=raise_if_empty) + return _drop_null_data(df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows) -def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame: - """If DataFrame contains columns/rows that contain only nulls, drop them.""" +def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True) -> pl.DataFrame: + """ + If DataFrame contains columns/rows that contain only nulls, drop them. + + If `drop_empty_rows` is set to `False`, empty rows are not dropped. + """ null_cols = [] for col_name in df.columns: # note that if multiple unnamed columns are found then all but the first one @@ -806,8 +824,9 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame: if len(df) == 0 and len(df.columns) == 0: return _empty_frame(raise_if_empty) - - return df.filter(~F.all_horizontal(F.all().is_null())) + if drop_empty_rows: + return df.filter(~F.all_horizontal(F.all().is_null())) + return df def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001 @@ -839,6 +858,7 @@ def _read_spreadsheet_openpyxl( schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, raise_if_empty: bool, + drop_empty_rows: bool, ) -> pl.DataFrame: """Use the 'openpyxl' library to read data from the given worksheet.""" infer_schema_length = read_options.pop("infer_schema_length", None) @@ -895,7 +915,7 @@ def _read_spreadsheet_openpyxl( strict=False, ) - df = _drop_null_data(df, raise_if_empty=raise_if_empty) + df = _drop_null_data(df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows) df = _reorder_columns(df, columns) return df @@ -908,6 +928,7 @@ def _read_spreadsheet_calamine( schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, raise_if_empty: bool, + drop_empty_rows: bool, ) -> pl.DataFrame: # if we have 'schema_overrides' and a more recent version of `fastexcel` # we can pass translated dtypes to the engine to refine the initial parse @@ -965,7 +986,7 @@ def _read_spreadsheet_calamine( if schema_overrides: df = df.cast(dtypes=schema_overrides) - df = _drop_null_data(df, raise_if_empty=raise_if_empty) + df = _drop_null_data(df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows) # standardise on string dtype for null columns in empty frame if df.is_empty(): @@ -1008,6 +1029,7 @@ def _read_spreadsheet_xlsx2csv( schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, raise_if_empty: bool, + drop_empty_rows: bool, ) -> pl.DataFrame: """Use the 'xlsx2csv' library to read data from the given worksheet.""" csv_buffer = StringIO() @@ -1030,6 +1052,7 @@ def _read_spreadsheet_xlsx2csv( read_options=read_options, schema_overrides=schema_overrides, raise_if_empty=raise_if_empty, + drop_empty_rows=drop_empty_rows, ) if cast_to_boolean: df = df.with_columns(*cast_to_boolean)