REF: Use more lazy iterators (pandas-dev#58808)

Aloqeely · May 22, 2024 · 7868a58 · 7868a58
1 parent 2aa155a
commit 7868a58
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 44 deletions.
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs):
             # people may aggregate on a non-callable attribute
             # but don't let them think they can pass args to it
             assert len(args) == 0
-            assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
+            assert not any(kwarg == "axis" for kwarg in kwargs)
             return f
         elif hasattr(np, func) and hasattr(obj, "__array__"):
             # in particular exclude Window

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1750,19 +1750,25 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike
             if `key` matches multiple labels
         """
         axis = self._get_axis_number(axis)
-        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
+        first_other_axes = next(
+            (ax for ax in range(self._AXIS_LEN) if ax != axis), None
+        )
 
         if self._is_label_reference(key, axis=axis):
             self._check_label_or_level_ambiguity(key, axis=axis)
-            values = self.xs(key, axis=other_axes[0])._values
+            if first_other_axes is None:
+                raise ValueError("axis matched all axes")
+            values = self.xs(key, axis=first_other_axes)._values
         elif self._is_level_reference(key, axis=axis):
             values = self.axes[axis].get_level_values(key)._values
         else:
             raise KeyError(key)
 
         # Check for duplicates
         if values.ndim > 1:
-            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
+            if first_other_axes is not None and isinstance(
+                self._get_axis(first_other_axes), MultiIndex
+            ):
                 multi_message = (
                     "\n"
                     "For a multi-index, the label must be a "

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -857,24 +857,23 @@ def _parse_sheet(
         # a row containing just the index name(s)
         has_index_names = False
         if is_list_header and not is_len_one_list_header and index_col is not None:
-            index_col_list: Sequence[int]
+            index_col_set: set[int]
             if isinstance(index_col, int):
-                index_col_list = [index_col]
+                index_col_set = {index_col}
             else:
                 assert isinstance(index_col, Sequence)
-                index_col_list = index_col
+                index_col_set = set(index_col)
 
             # We have to handle mi without names. If any of the entries in the data
             # columns are not empty, this is a regular row
             assert isinstance(header, Sequence)
             if len(header) < len(data):
                 potential_index_names = data[len(header)]
-                potential_data = [
-                    x
+                has_index_names = all(
+                    x == "" or x is None
                     for i, x in enumerate(potential_index_names)
-                    if not control_row[i] and i not in index_col_list
-                ]
-                has_index_names = all(x == "" or x is None for x in potential_data)
+                    if not control_row[i] and i not in index_col_set
+                )
 
         if is_list_like(index_col):
             # Forward fill values for MultiIndex index.
@@ -1457,9 +1456,9 @@ def inspect_excel_format(
         with zipfile.ZipFile(stream) as zf:
             # Workaround for some third party files that use forward slashes and
             # lower case names.
-            component_names = [
+            component_names = {
                 name.replace("\\", "/").lower() for name in zf.namelist()
-            ]
+            }
 
         if "xl/workbook.xml" in component_names:
             return "xlsx"

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -122,29 +122,25 @@ def get_sheet_data(
         table: list[list[Scalar | NaTType]] = []
 
         for sheet_row in sheet_rows:
-            sheet_cells = [
-                x
-                for x in sheet_row.childNodes
-                if hasattr(x, "qname") and x.qname in cell_names
-            ]
             empty_cells = 0
             table_row: list[Scalar | NaTType] = []
 
-            for sheet_cell in sheet_cells:
-                if sheet_cell.qname == table_cell_name:
-                    value = self._get_cell_value(sheet_cell)
-                else:
-                    value = self.empty_value
-
-                column_repeat = self._get_column_repeat(sheet_cell)
-
-                # Queue up empty values, writing only if content succeeds them
-                if value == self.empty_value:
-                    empty_cells += column_repeat
-                else:
-                    table_row.extend([self.empty_value] * empty_cells)
-                    empty_cells = 0
-                    table_row.extend([value] * column_repeat)
+            for sheet_cell in sheet_row.childNodes:
+                if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names:
+                    if sheet_cell.qname == table_cell_name:
+                        value = self._get_cell_value(sheet_cell)
+                    else:
+                        value = self.empty_value
+
+                    column_repeat = self._get_column_repeat(sheet_cell)
+
+                    # Queue up empty values, writing only if content succeeds them
+                    if value == self.empty_value:
+                        empty_cells += column_repeat
+                    else:
+                        table_row.extend([self.empty_value] * empty_cells)
+                        empty_cells = 0
+                        table_row.extend([value] * column_repeat)
 
             if max_row_len < len(table_row):
                 max_row_len = len(table_row)

diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
@@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ):
                         cell_contents = val
             return cell_contents
 
-        data = []
-
         nrows = sheet.nrows
         if file_rows_needed is not None:
             nrows = min(nrows, file_rows_needed)
-        for i in range(nrows):
-            row = [
+        return [
+            [
                 _parse_cell(value, typ)
                 for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
             ]
-            data.append(row)
-
-        return data
+            for i in range(nrows)
+        ]
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe(
     dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
 ) -> DataFrame:
     content = lib.to_object_array_tuples(data)
+    idx_len = content.shape[0]
     arrays = convert_object_array(
         list(content.T),
         dtype=None,
@@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe(
             result_arrays.append(ArrowExtensionArray(pa_array))
         arrays = result_arrays  # type: ignore[assignment]
     if arrays:
-        df = DataFrame(dict(zip(range(len(columns)), arrays)))
-        df.columns = columns
-        return df
+        return DataFrame._from_arrays(
+            arrays, columns=columns, index=range(idx_len), verify_integrity=False
+        )
     else:
         return DataFrame(columns=columns)