Skip to content

Commit

Permalink
REF: Use more lazy iterators (pandas-dev#58808)
Browse files: browse the repository at this point in the history
  • Loading branch information
mroeschke authored May 22, 2024
1 parent 2aa155a commit 7868a58
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 44 deletions.
2 changes: 1 addition & 1 deletion pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs):
# people may aggregate on a non-callable attribute
# but don't let them think they can pass args to it
assert len(args) == 0
assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
assert not any(kwarg == "axis" for kwarg in kwargs)
return f
elif hasattr(np, func) and hasattr(obj, "__array__"):
# in particular exclude Window
Expand Down
12 changes: 9 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,19 +1750,25 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike
if `key` matches multiple labels
"""
axis = self._get_axis_number(axis)
other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
first_other_axes = next(
(ax for ax in range(self._AXIS_LEN) if ax != axis), None
)

if self._is_label_reference(key, axis=axis):
self._check_label_or_level_ambiguity(key, axis=axis)
values = self.xs(key, axis=other_axes[0])._values
if first_other_axes is None:
raise ValueError("axis matched all axes")
values = self.xs(key, axis=first_other_axes)._values
elif self._is_level_reference(key, axis=axis):
values = self.axes[axis].get_level_values(key)._values
else:
raise KeyError(key)

# Check for duplicates
if values.ndim > 1:
if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
if first_other_axes is not None and isinstance(
self._get_axis(first_other_axes), MultiIndex
):
multi_message = (
"\n"
"For a multi-index, the label must be a "
Expand Down
19 changes: 9 additions & 10 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,24 +857,23 @@ def _parse_sheet(
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_list: Sequence[int]
index_col_set: set[int]
if isinstance(index_col, int):
index_col_list = [index_col]
index_col_set = {index_col}
else:
assert isinstance(index_col, Sequence)
index_col_list = index_col
index_col_set = set(index_col)

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
potential_data = [
x
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_list
]
has_index_names = all(x == "" or x is None for x in potential_data)
if not control_row[i] and i not in index_col_set
)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
Expand Down Expand Up @@ -1457,9 +1456,9 @@ def inspect_excel_format(
with zipfile.ZipFile(stream) as zf:
# Workaround for some third party files that use forward slashes and
# lower case names.
component_names = [
component_names = {
name.replace("\\", "/").lower() for name in zf.namelist()
]
}

if "xl/workbook.xml" in component_names:
return "xlsx"
Expand Down
36 changes: 16 additions & 20 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,29 +122,25 @@ def get_sheet_data(
table: list[list[Scalar | NaTType]] = []

for sheet_row in sheet_rows:
sheet_cells = [
x
for x in sheet_row.childNodes
if hasattr(x, "qname") and x.qname in cell_names
]
empty_cells = 0
table_row: list[Scalar | NaTType] = []

for sheet_cell in sheet_cells:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

column_repeat = self._get_column_repeat(sheet_cell)

# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
for sheet_cell in sheet_row.childNodes:
if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

column_repeat = self._get_column_repeat(sheet_cell)

# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)

if max_row_len < len(table_row):
max_row_len = len(table_row)
Expand Down
11 changes: 4 additions & 7 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = val
return cell_contents

data = []

nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
return [
[
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)

return data
for i in range(nrows)
]
7 changes: 4 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe(
dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
) -> DataFrame:
content = lib.to_object_array_tuples(data)
idx_len = content.shape[0]
arrays = convert_object_array(
list(content.T),
dtype=None,
Expand All @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe(
result_arrays.append(ArrowExtensionArray(pa_array))
arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(range(len(columns)), arrays)))
df.columns = columns
return df
return DataFrame._from_arrays(
arrays, columns=columns, index=range(idx_len), verify_integrity=False
)
else:
return DataFrame(columns=columns)

Expand Down

0 comments on commit 7868a58

Please sign in to comment.