diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py
index 1fa610f35f56b..03c73489bd3d8 100644
--- a/pandas/core/arraylike.py
+++ b/pandas/core/arraylike.py
@@ -329,8 +329,8 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any)
     reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
 
     if self.ndim == 1:
-        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
-        name = names[0] if len(set(names)) == 1 else None
+        names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
+        name = names.pop() if len(names) == 1 else None
         reconstruct_kwargs = {"name": name}
     else:
         reconstruct_kwargs = {}
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 77e986a26fbe9..96291991227d9 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -335,11 +335,12 @@ def is_empty_slice(obj) -> bool:
     )
 
 
-def is_true_slices(line) -> list[bool]:
+def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]:
     """
-    Find non-trivial slices in "line": return a list of booleans with same length.
+    Find non-trivial slices in "line": yields a bool for each element.
     """
-    return [isinstance(k, slice) and not is_null_slice(k) for k in line]
+    for k in line:
+        yield isinstance(k, slice) and not is_null_slice(k)
 
 
 # TODO: used only once in indexing; belongs elsewhere?
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index e10080604260a..45814ca77b70f 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -680,10 +680,11 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
             return None
 
         # categorical is aware of Sparse -> extract sparse subdtypes
-        dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
+        subtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
         # extract the categories' dtype
         non_cat_dtypes = [
-            x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
+            x.categories.dtype if isinstance(x, CategoricalDtype) else x
+            for x in subtypes
         ]
         # TODO should categorical always give an answer?
         from pandas.core.dtypes.cast import find_common_type
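Note (illustrative, not part of the patch): the arraylike.py hunk replaces a throwaway list plus `set(...)` with a single set comprehension; when the set holds exactly one element, `pop()` retrieves the unanimous name directly. A minimal standalone sketch of that rule:

    import pandas as pd

    def unanimous_name(inputs):
        # Same logic as the arraylike.py hunk: keep a name only if
        # every named input agrees on it, otherwise drop it.
        names = {getattr(x, "name") for x in inputs if hasattr(x, "name")}
        return names.pop() if len(names) == 1 else None

    assert unanimous_name([pd.Series([1], name="a"), pd.Series([2], name="a")]) == "a"
    assert unanimous_name([pd.Series([1], name="a"), pd.Series([2], name="b")]) is None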
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f72a214f120a0..c875ec78891d6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -6999,19 +6999,19 @@ def sort_values(
                 f" != length of by ({len(by)})"
             )
         if len(by) > 1:
-            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+            keys = (self._get_label_or_level_values(x, axis=axis) for x in by)
 
             # need to rewrap columns in Series to apply key function
             if key is not None:
-                # error: List comprehension has incompatible type List[Series];
-                # expected List[ndarray]
-                keys = [
-                    Series(k, name=name)  # type: ignore[misc]
-                    for (k, name) in zip(keys, by)
-                ]
+                keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)]
+            else:
+                # error: Argument 1 to "list" has incompatible type
+                # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
+                # expected "Iterable[Series]"
+                keys_data = list(keys)  # type: ignore[arg-type]
 
             indexer = lexsort_indexer(
-                keys, orders=ascending, na_position=na_position, key=key
+                keys_data, orders=ascending, na_position=na_position, key=key
             )
         elif len(by):
             # len(by) == 1
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 0a048d11d0b4d..a20577e8d3df9 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -387,7 +387,7 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
             raise SpecificationError("nested renamer is not supported")
 
         if any(isinstance(x, (tuple, list)) for x in arg):
-            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
+            arg = ((x, x) if not isinstance(x, (tuple, list)) else x for x in arg)
         else:
             # list of functions / function names
             columns = (com.get_callable_name(f) or f for f in arg)
@@ -2077,7 +2077,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
 
         obj = self._obj_with_exclusions
         columns = obj.columns
-        sgbs = [
+        sgbs = (
             SeriesGroupBy(
                 obj.iloc[:, i],
                 selection=colname,
@@ -2086,7 +2086,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
                 observed=self.observed,
             )
             for i, colname in enumerate(obj.columns)
-        ]
+        )
         results = [func(sgb) for sgb in sgbs]
 
         if not len(results):
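Note (illustrative, not part of the patch): in the frame.py hunk, `keys` becomes a generator, so both branches materialize it into `keys_data` before it reaches `lexsort_indexer`; a generator can be consumed only once. A standalone sketch of that pitfall:

    keys = (x * 2 for x in [1, 2, 3])  # lazy, single-use
    assert list(keys) == [2, 4, 6]     # first pass drains the generator
    assert list(keys) == []            # a second pass sees nothing

The generic.py changes are safe for the same reason in reverse: `arg` and `sgbs` are each iterated exactly once downstream, so no list ever needs to exist.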
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 4ebc149256336..1b58317c08736 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -11,6 +11,7 @@ class providing the base-class of operations.
 
 from collections.abc import (
     Hashable,
+    Iterable,
     Iterator,
     Mapping,
     Sequence,
@@ -758,7 +759,7 @@ def get_converter(s):
                     )
                     raise ValueError(msg) from err
 
-            converters = [get_converter(s) for s in index_sample]
+            converters = (get_converter(s) for s in index_sample)
             names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
 
         else:
@@ -2645,7 +2646,7 @@ def _value_counts(
         }
         if isinstance(obj, Series):
             _name = obj.name
-            keys = [] if _name in in_axis_names else [obj]
+            keys: Iterable[Series] = [] if _name in in_axis_names else [obj]
         else:
             unique_cols = set(obj.columns)
             if subset is not None:
@@ -2665,12 +2666,12 @@ def _value_counts(
             else:
                 subsetted = unique_cols
 
-            keys = [
+            keys = (
                 # Can't use .values because the column label needs to be preserved
                 obj.iloc[:, idx]
                 for idx, _name in enumerate(obj.columns)
                 if _name not in in_axis_names and _name in subsetted
-            ]
+            )
 
         groupings = list(self._grouper.groupings)
         for key in keys:
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index c5e3f3a50e10d..83e8df5072b92 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -212,20 +212,25 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
 
     if kind == "special":
         result = indexes[0]
 
-        dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
-        dti_tzs = [x for x in dtis if x.tz is not None]
-        if len(dti_tzs) not in [0, len(dtis)]:
+        num_dtis = 0
+        num_dti_tzs = 0
+        for idx in indexes:
+            if isinstance(idx, DatetimeIndex):
+                num_dtis += 1
+                if idx.tz is not None:
+                    num_dti_tzs += 1
+        if num_dti_tzs not in [0, num_dtis]:
             # TODO: this behavior is not tested (so may not be desired),
             #  but is kept in order to keep behavior the same when
             #  deprecating union_many
             #  test_frame_from_dict_with_mixed_indexes
             raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
 
-        if len(dtis) == len(indexes):
+        if num_dtis == len(indexes):
             sort = True
             result = indexes[0]
-        elif len(dtis) > 1:
+        elif num_dtis > 1:
             # If we have mixed timezones, our casting behavior may depend on
             # the order of indexes, which we don't want.
             sort = False
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index ebbd85be44009..6a3fb8bc851df 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3140,7 +3140,7 @@ def _union(self, other: Index, sort: bool | None):
 
             # worth making this faster? a very unusual case
             value_set = set(lvals)
-            value_list.extend([x for x in rvals if x not in value_set])
+            value_list.extend(x for x in rvals if x not in value_set)
 
             # If objects are unorderable, we must have object dtype.
             return np.array(value_list, dtype=object)
@@ -7620,8 +7620,8 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
     list
         A list representing the unanimous 'names' found.
     """
-    name_tups = [tuple(i.names) for i in indexes]
-    name_sets = [{*ns} for ns in zip_longest(*name_tups)]
+    name_tups = (tuple(i.names) for i in indexes)
+    name_sets = ({*ns} for ns in zip_longest(*name_tups))
     names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
     return names
 
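Note (illustrative, not part of the patch): the api.py hunk only ever used the lengths of `dtis` and `dti_tzs`, so the two intermediate lists collapse into one counting pass. A standalone sketch of the equivalence:

    import pandas as pd

    indexes = [
        pd.date_range("2024-01-01", periods=2, tz="UTC"),
        pd.date_range("2024-01-01", periods=2),
        pd.Index([1, 2]),
    ]

    # Old shape: two throwaway lists, used only for their lengths.
    dtis = [x for x in indexes if isinstance(x, pd.DatetimeIndex)]
    dti_tzs = [x for x in dtis if x.tz is not None]

    # New shape: one pass, two counters, no intermediate lists.
    num_dtis = num_dti_tzs = 0
    for idx in indexes:
        if isinstance(idx, pd.DatetimeIndex):
            num_dtis += 1
            if idx.tz is not None:
                num_dti_tzs += 1

    assert (len(dtis), len(dti_tzs)) == (num_dtis, num_dti_tzs) == (2, 1)

The `keys: Iterable[Series]` annotation in groupby.py exists because the two branches now assign different concrete types (a list or a generator), and the downstream `for key in keys:` loop needs only iterability.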
""" - formatter_funcs = [level._formatter_func for level in self.levels] + formatter_funcs = (level._formatter_func for level in self.levels) return tuple(func(val) for func, val in zip(formatter_funcs, tup)) def _get_values_for_csv( @@ -1537,7 +1537,7 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None: if level is None: level = range(self.nlevels) else: - level = [self._get_level_number(lev) for lev in level] + level = (self._get_level_number(lev) for lev in level) # set the name for lev, name in zip(level, names): diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index de6c5416e08c9..7055201b5a1ee 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -560,7 +560,7 @@ def get_result(self): # combine as columns in a frame else: - data = dict(zip(range(len(self.objs)), self.objs)) + data = dict(enumerate(self.objs)) # GH28330 Preserves subclassed objects through concat cons = sample._constructor_expanddim @@ -874,7 +874,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) - new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) + new_codes.extend(np.tile(lab, kpieces) for lab in new_index.codes) else: new_levels.append(new_index.unique()) single_codes = new_index.unique().get_indexer(new_index) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 87b2f97503b6f..7cf2e360a1d01 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -137,24 +137,24 @@ def __init__( self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) - # Bug fix GH 20601 - # If the data frame is too big, the number of unique index combination - # will cause int32 overflow on windows environments. - # We want to check and raise an warning before this happens - num_rows = np.max([index_level.size for index_level in self.new_index_levels]) - num_columns = self.removed_level.size - - # GH20601: This forces an overflow if the number of cells is too high. - num_cells = num_rows * num_columns - - # GH 26314: Previous ValueError raised was too restrictive for many users. - if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max: - warnings.warn( - f"The following operation may generate {num_cells} cells " - f"in the resulting pandas object.", - PerformanceWarning, - stacklevel=find_stack_level(), - ) + if get_option("performance_warnings"): + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combination + # will cause int32 overflow on windows environments. + # We want to check and raise an warning before this happens + num_rows = max(index_level.size for index_level in self.new_index_levels) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + # GH 26314: Previous ValueError raised was too restrictive for many users. 
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 87b2f97503b6f..7cf2e360a1d01 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -137,24 +137,24 @@ def __init__(
         self.removed_level = self.removed_level.take(unique_codes)
         self.removed_level_full = self.removed_level_full.take(unique_codes)
 
-        # Bug fix GH 20601
-        # If the data frame is too big, the number of unique index combination
-        # will cause int32 overflow on windows environments.
-        # We want to check and raise an warning before this happens
-        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
-        num_columns = self.removed_level.size
-
-        # GH20601: This forces an overflow if the number of cells is too high.
-        num_cells = num_rows * num_columns
-
-        # GH 26314: Previous ValueError raised was too restrictive for many users.
-        if get_option("performance_warnings") and num_cells > np.iinfo(np.int32).max:
-            warnings.warn(
-                f"The following operation may generate {num_cells} cells "
-                f"in the resulting pandas object.",
-                PerformanceWarning,
-                stacklevel=find_stack_level(),
-            )
+        if get_option("performance_warnings"):
+            # Bug fix GH 20601
+            # If the data frame is too big, the number of unique index combinations
+            # will cause int32 overflow on windows environments.
+            # We want to check and raise a warning before this happens
+            num_rows = max(index_level.size for index_level in self.new_index_levels)
+            num_columns = self.removed_level.size
+
+            # GH20601: This forces an overflow if the number of cells is too high.
+            # GH 26314: Previous ValueError raised was too restrictive for many users.
+            num_cells = num_rows * num_columns
+            if num_cells > np.iinfo(np.int32).max:
+                warnings.warn(
+                    f"The following operation may generate {num_cells} cells "
+                    f"in the resulting pandas object.",
+                    PerformanceWarning,
+                    stacklevel=find_stack_level(),
+                )
 
         self._make_selectors()
@@ -731,10 +731,10 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex | Index:
     if len(columns.levels) <= 2:
         return columns.levels[0]._rename(name=columns.names[0])
 
-    levs = [
+    levs = (
         [lev[c] if c >= 0 else None for c in codes]
         for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
-    ]
+    )
 
     # Remove duplicate tuples in the MultiIndex.
     tuples = zip(*levs)
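Note (illustrative, not part of the patch): hoisting the `get_option("performance_warnings")` check means the row and column sizes are not computed at all when the option is off, and `builtins.max` over a generator avoids the intermediate list that `np.max([...])` required. The guard itself is plain arithmetic against the int32 ceiling:

    import numpy as np

    num_rows, num_columns = 100_000, 50_000
    num_cells = num_rows * num_columns          # 5_000_000_000 prospective cells
    assert np.iinfo(np.int32).max == 2_147_483_647
    # Python ints never overflow; the warning exists because downstream
    # code may compute this product in int32 (GH 20601), notably on Windows.
    assert num_cells > np.iinfo(np.int32).max   # would trigger the PerformanceWarning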