From 3d90bc83be20d9ab60ba0878543fdccde8ca6749 Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha
Date: Fri, 18 Nov 2022 03:12:29 -0800
Subject: [PATCH 01/13] FIX-#5234: Use query compiler str_repeat. (#5235)

Signed-off-by: mvashishtha
---
 modin/pandas/series_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/pandas/series_utils.py b/modin/pandas/series_utils.py
index ae01e884835..368f77684fd 100644
--- a/modin/pandas/series_utils.py
+++ b/modin/pandas/series_utils.py
@@ -331,7 +331,7 @@ def partition(self, sep=" ", expand=True):
         )
 
     def repeat(self, repeats):
-        return self._default_to_pandas(pandas.Series.str.repeat, repeats)
+        return Series(query_compiler=self._query_compiler.str_repeat(repeats))
 
     def rpartition(self, sep=" ", expand=True):
         if sep is not None and len(sep) == 0:

From c51ab405efec920dbb4baa2e2389409df04e8d43 Mon Sep 17 00:00:00 2001
From: Andrey
Date: Fri, 18 Nov 2022 17:27:27 +0100
Subject: [PATCH 02/13] FIX-#5187: Fixed RecursionError in OmnisciLaunchParameters.get() (#5199)

Signed-off-by: Andrey Pavlenko
---
 modin/config/envvars.py           | 16 ++++++++++++++--
 modin/config/test/test_envvars.py | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/modin/config/envvars.py b/modin/config/envvars.py
index 2c38f0f283a..e287bf41187 100644
--- a/modin/config/envvars.py
+++ b/modin/config/envvars.py
@@ -491,7 +491,7 @@ class PersistentPickle(EnvironmentVariable, type=bool):
 
 class HdkLaunchParameters(EnvironmentVariable, type=dict):
     """
-    Additional command line options for the OmniSci engine.
+    Additional command line options for the HDK engine.
 
    Please visit OmniSci documentation for the description of available parameters:
     https://docs.omnisci.com/installation-and-configuration/config-parameters#configuration-parameters-for-omniscidb
@@ -523,8 +523,20 @@ def get(cls) -> dict:
         if (
             OmnisciLaunchParameters.varname in os.environ
             and HdkLaunchParameters.varname not in os.environ
         ):
-            return OmnisciLaunchParameters.get()
+            return OmnisciLaunchParameters._get()
+        else:
+            return HdkLaunchParameters._get()
+
+    @classmethod
+    def _get(cls) -> dict:
+        """
+        Get the resulting command-line options.
+
+        Returns
+        -------
+        dict
+            Decoded and verified config value.
+        """
         custom_parameters = super().get()
         result = cls.default.copy()
         result.update(
diff --git a/modin/config/test/test_envvars.py b/modin/config/test/test_envvars.py
index b01864afd63..01ed1c9304f 100644
--- a/modin/config/test/test_envvars.py
+++ b/modin/config/test/test_envvars.py
@@ -13,7 +13,7 @@
 
 import os
 
 import pytest
-
+import modin.config as cfg
 from modin.config.envvars import EnvironmentVariable, _check_vars, ExactStr
 
 
@@ -60,3 +60,32 @@ def test_custom_set(make_custom_envvar, set_custom_envvar):
 def test_custom_help(make_custom_envvar):
     assert "MODIN_CUSTOM" in make_custom_envvar.get_help()
     assert "custom var" in make_custom_envvar.get_help()
+
+
+def test_hdk_envvar():
+    os.environ[
+        cfg.OmnisciLaunchParameters.varname
+    ] = "enable_union=2,enable_thrift_logs=3"
+    params = cfg.OmnisciLaunchParameters.get()
+    assert params["enable_union"] == 2
+    assert params["enable_thrift_logs"] == 3
+
+    params = cfg.HdkLaunchParameters.get()
+    assert params["enable_union"] == 2
+    assert params["enable_thrift_logs"] == 3
+
+    os.environ[cfg.HdkLaunchParameters.varname] = "enable_union=4,enable_thrift_logs=5"
+    del cfg.HdkLaunchParameters._value
+    params = cfg.HdkLaunchParameters.get()
+    assert params["enable_union"] == 4
+    assert params["enable_thrift_logs"] == 5
+
+    params = cfg.OmnisciLaunchParameters.get()
+    assert params["enable_union"] == 2
+    assert params["enable_thrift_logs"] == 3
+
+    del os.environ[cfg.OmnisciLaunchParameters.varname]
+    del cfg.OmnisciLaunchParameters._value
+    params = cfg.OmnisciLaunchParameters.get()
+    assert params["enable_union"] == 4
+    assert params["enable_thrift_logs"] == 5

From 7af1b9d2569fb74e4b934dfb0ceafad7be565ea1 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Sat, 19 Nov 2022 10:55:17 +0100
Subject: [PATCH 03/13] FIX-#5240: fix dask[complete] syntax in conda environment files (#5241)

Signed-off-by: Myachev
---
 environment-dev.yml               | 2 +-
 requirements/environment-py36.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/environment-dev.yml b/environment-dev.yml
index d10d7b351ba..1a3623f4377 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -5,7 +5,7 @@ dependencies:
   - pandas==1.5.1
   - numpy>=1.18.5
   - pyarrow>=4.0.1
-  - dask[complete]>=2.22.0
+  - dask>=2.22.0
   - distributed>=2.22.0
   - fsspec
   - xarray
diff --git a/requirements/environment-py36.yml b/requirements/environment-py36.yml
index b5f88a4bf89..3303ad03dd6 100644
--- a/requirements/environment-py36.yml
+++ b/requirements/environment-py36.yml
@@ -5,7 +5,7 @@ dependencies:
   - pandas
   - numpy>=1.18.5
   - pyarrow>=4.0.1
-  - dask[complete]>=2.22.0,<2021.3.0
+  - dask>=2.22.0,<2021.3.0
   - distributed>=2.22.0,<2021.3.0
   - pickle5 # for dask to correctly serialize nested functions
   - fsspec
From 47794730fdb6abdc29b308aa3e9cdbe9cc19ee8d Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha
Date: Mon, 21 Nov 2022 00:30:43 -0800
Subject: [PATCH 04/13] TEST-#5123: Add CodeQL workflow for GitHub code scanning (#5222)

Signed-off-by: mvashishtha
Co-authored-by: LGTM Migrator
Co-authored-by: Karthik Velayutham
---
 .github/workflows/codeql.yml               | 40 ++++++++++++++++++++++
 .github/workflows/codeql/codeql-config.yml |  7 ++++
 modin/pandas/test/dataframe/test_binary.py |  2 ++
 modin/pandas/test/test_io.py               |  3 +-
 modin/pandas/test/test_series.py           | 25 ++++++++------
 modin/test/test_partition_api.py           |  2 +-
 6 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/codeql.yml
 create mode 100644 .github/workflows/codeql/codeql-config.yml

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 00000000000..cf55eb2f384
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,40 @@
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ python ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v2
+        with:
+          languages: ${{ matrix.language }}
+          queries: +security-and-quality
+          config-file: ./.github/workflows/codeql/codeql-config.yml
+
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@v2
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v2
+        with:
+          category: "/language:${{ matrix.language }}"
diff --git a/.github/workflows/codeql/codeql-config.yml b/.github/workflows/codeql/codeql-config.yml
new file mode 100644
index 00000000000..ea1991a4d82
--- /dev/null
+++ b/.github/workflows/codeql/codeql-config.yml
@@ -0,0 +1,7 @@
+name: "Modin CodeQL config"
+
+paths:
+  - modin/**
+paths-ignore:
+  - modin/experimental/cloud/** # TODO: fix module-level cyclic error, see #5228
+  - modin/experimental/core/execution/native/implementations/hdk_on_native/test/** # TODO: fix unhashable list error, see #5227
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py
index 23e0e90a4b3..c86d47e0a3f 100644
--- a/modin/pandas/test/dataframe/test_binary.py
+++ b/modin/pandas/test/dataframe/test_binary.py
@@ -280,6 +280,8 @@ def test_mismatched_row_partitions(is_idx_aligned, op_type, is_more_other_partit
     elif op_type == "ser_ser_different_name":
         modin_res = modin_df2.a / modin_df1.b
         pandas_res = pandas_df2.a / pandas_df1.b
+    else:
+        raise Exception(f"op_type: {op_type} not supported in test")
     df_equals(modin_res, pandas_res)
 
 
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 768a3ca4ebf..aa84983c21c 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -201,7 +201,8 @@ def eval_to_file(modin_obj, pandas_obj, fn, extension, **fn_kwargs):
                 last_exception = err
                 continue
             break
-    else:
+    # If we do have an exception that's valid let's raise it
+    if last_exception:
         raise last_exception
 
     getattr(pandas_obj, fn)(unique_filename_pandas, **fn_kwargs)
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 22e551d6a7a..6f086e6e6e6 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -1135,8 +1135,9 @@ def test_array(data):
 
 
 @pytest.mark.xfail(reason="Using pandas Series.")
-def test_between():
-    modin_series = create_test_series()
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_between(data):
+    modin_series = create_test_series(data)
 
     with pytest.raises(NotImplementedError):
         modin_series.between(None, None)
@@ -1577,8 +1578,9 @@ def test_matmul(data):
 
 
 @pytest.mark.xfail(reason="Using pandas Series.")
-def test_drop():
-    modin_series = create_test_series()
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_drop(data):
+    modin_series = create_test_series(data)
 
     with pytest.raises(NotImplementedError):
         modin_series.drop(None, None, None, None)
@@ -1879,8 +1881,9 @@ def test_fillna(data, reindex, limit):
 
 
 @pytest.mark.xfail(reason="Using pandas Series.")
-def test_filter():
-    modin_series = create_test_series()
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_filter(data):
+    modin_series = create_test_series(data)
 
     with pytest.raises(NotImplementedError):
         modin_series.filter(None, None, None)
@@ -2400,8 +2403,9 @@ def test_ne(data):
 
 
 @pytest.mark.xfail(reason="Using pandas Series.")
-def test_nlargest():
-    modin_series = create_test_series()
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_nlargest(data):
+    modin_series = create_test_series(data)
 
     with pytest.raises(NotImplementedError):
         modin_series.nlargest(None)
@@ -2877,8 +2881,9 @@ def test_reset_index(data, drop, name, inplace):
 
 
 @pytest.mark.xfail(reason="Using pandas Series.")
-def test_reshape():
-    modin_series = create_test_series()
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_reshape(data):
+    modin_series = create_test_series(data)
 
     with pytest.raises(NotImplementedError):
         modin_series.reshape(None)
diff --git a/modin/test/test_partition_api.py b/modin/test/test_partition_api.py
index 5f4d8a44bf6..0b5e7edf7fd 100644
--- a/modin/test/test_partition_api.py
+++ b/modin/test/test_partition_api.py
@@ -130,7 +130,7 @@ def test_from_partitions(axis, index, columns, row_lengths, column_widths):
         if axis == 0
         else [num_cols, num_cols]
     )
-
+    futures = []
     if Engine.get() == "Ray":
         if axis is None:
             futures = [[put_func(df1), put_func(df2)]]
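One change in PATCH 04 above replaces a `for ... else` in `eval_to_file` with an explicit check of the captured exception. The subtlety is that a loop's `else` suite runs only when the loop finishes without hitting `break`, which code scanners often flag as error-prone. A toy demonstration of that semantics (not Modin code):

    for n in [1, 3, 5]:
        if n % 2 == 0:
            break
    else:
        # Runs here because no `break` fired; with an even number present
        # this suite would be skipped entirely.
        print("no even number found")
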
From 5acf539fa4f205e383b91ca3baeed8cd94217392 Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha
Date: Mon, 21 Nov 2022 01:40:57 -0800
Subject: [PATCH 05/13] FIX-#5236: Allow binary operations with custom classes. (#5237)

Signed-off-by: mvashishtha
---
 modin/pandas/base.py                       | 68 +++++++++++-----------
 modin/pandas/test/dataframe/test_binary.py | 17 ++++++
 modin/pandas/test/test_series.py           | 11 ++++
 modin/pandas/test/utils.py                 | 11 ++++
 4 files changed, 72 insertions(+), 35 deletions(-)

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index ce517932c7b..2771d105483 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -245,47 +245,42 @@ def _validate_other(
         TypeError
             If any validation checks fail.
         """
-        # We skip dtype checking if the other is a scalar.
-        if is_scalar(other):
+        if isinstance(other, BasePandasDataset):
+            return other._query_compiler
+        if not is_list_like(other):
+            # We skip dtype checking if the other is a scalar. Note that pandas
+            # is_scalar can be misleading as it is False for almost all objects,
+            # even when those objects should be treated as scalars. See e.g.
+            # https://github.com/modin-project/modin/issues/5236. Therefore, we
+            # detect scalars by checking that `other` is neither a list-like nor
+            # another BasePandasDataset.
             return other
         axis = self._get_axis_number(axis) if axis is not None else 1
         result = other
-        if isinstance(other, BasePandasDataset):
-            return other._query_compiler
-        elif is_list_like(other):
-            if axis == 0:
-                if len(other) != len(self._query_compiler.index):
-                    raise ValueError(
-                        f"Unable to coerce to Series, length must be {len(self._query_compiler.index)}: "
-                        + f"given {len(other)}"
-                    )
-            else:
-                if len(other) != len(self._query_compiler.columns):
-                    raise ValueError(
-                        f"Unable to coerce to Series, length must be {len(self._query_compiler.columns)}: "
-                        + f"given {len(other)}"
-                    )
-            if hasattr(other, "dtype"):
-                other_dtypes = [other.dtype] * len(other)
-            elif is_dict_like(other):
-                other_dtypes = [
-                    type(other[label])
-                    for label in self._query_compiler.get_axis(axis)
-                    # The binary operation is applied for intersection of axis labels
-                    # and dictionary keys. So filtering out extra keys.
-                    if label in other
-                ]
-            else:
-                other_dtypes = [type(x) for x in other]
+        if axis == 0:
+            if len(other) != len(self._query_compiler.index):
+                raise ValueError(
+                    f"Unable to coerce to Series, length must be {len(self._query_compiler.index)}: "
+                    + f"given {len(other)}"
+                )
         else:
-            other_dtypes = [
-                type(other)
-                for _ in range(
-                    len(self._query_compiler.index)
-                    if axis
-                    else len(self._query_compiler.columns)
+            if len(other) != len(self._query_compiler.columns):
+                raise ValueError(
+                    f"Unable to coerce to Series, length must be {len(self._query_compiler.columns)}: "
+                    + f"given {len(other)}"
                 )
+        if hasattr(other, "dtype"):
+            other_dtypes = [other.dtype] * len(other)
+        elif is_dict_like(other):
+            other_dtypes = [
+                type(other[label])
+                for label in self._query_compiler.get_axis(axis)
+                # The binary operation is applied for intersection of axis labels
+                # and dictionary keys. So filtering out extra keys.
+                if label in other
             ]
+        else:
+            other_dtypes = [type(x) for x in other]
         if compare_index:
             if not self.index.equals(other.index):
                 raise TypeError("Cannot perform operation with non-equal index")
@@ -304,6 +299,9 @@ def _validate_other(
             if label in other
         ]
 
+        # TODO(https://github.com/modin-project/modin/issues/5239):
+        # this spuriously rejects other that is a list including some
+        # custom type that can be added to self's elements.
         if not all(
             (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype))
             or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype))
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py
index c86d47e0a3f..1334941895e 100644
--- a/modin/pandas/test/dataframe/test_binary.py
+++ b/modin/pandas/test/dataframe/test_binary.py
@@ -16,6 +16,7 @@
 import matplotlib
 
 import modin.pandas as pd
+from modin._compat import PandasCompatVersion
 from modin.core.dataframe.pandas.partitioning.axis_partition import (
     PandasDataframeAxisPartition,
 )
@@ -27,6 +28,7 @@
     test_data,
     create_test_dfs,
     default_to_pandas_ignore_string,
+    CustomIntegerForAddition,
 )
 from modin.config import Engine, NPartitions
 from modin.test.test_utils import warns_that_defaulting_to_pandas
@@ -336,3 +338,18 @@ def test_add_string_to_df():
     modin_df, pandas_df = create_test_dfs(["a", "b"])
     eval_general(modin_df, pandas_df, lambda df: "string" + df)
     eval_general(modin_df, pandas_df, lambda df: df + "string")
+
+
+@pytest.mark.xfail(
+    PandasCompatVersion.CURRENT == PandasCompatVersion.PY36,
+    reason="Seems to be a bug in pandas 1.1.5. pandas throws ValueError "
+    + "for this particular dataframe.",
+)
+def test_add_custom_class():
+    # see https://github.com/modin-project/modin/issues/5236
+    # Test that we can add any object that is addable to pandas object data
+    # via "+".
+    eval_general(
+        *create_test_dfs(test_data["int_data"]),
+        lambda df: df + CustomIntegerForAddition(4),
+    )
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 6f086e6e6e6..39bf634ef34 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -74,6 +74,7 @@
     test_data_large_categorical_series_keys,
     test_data_large_categorical_series_values,
     default_to_pandas_ignore_string,
+    CustomIntegerForAddition,
 )
 from modin.config import NPartitions
 
@@ -635,6 +636,16 @@ def test_add_suffix(data):
     )
 
 
+def test_add_custom_class():
+    # see https://github.com/modin-project/modin/issues/5236
+    # Test that we can add any object that is addable to pandas object data
+    # via "+".
+    eval_general(
+        *create_test_series(test_data["int_data"]),
+        lambda df: df + CustomIntegerForAddition(4),
+    )
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys)
 def test_agg(data, func):
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 21565286309..54774ed95ef 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -479,6 +479,17 @@
 time_parsing_csv_path = "modin/pandas/test/data/test_time_parsing.csv"
 
 
+class CustomIntegerForAddition:
+    def __init__(self, value: int):
+        self.value = value
+
+    def __add__(self, other):
+        return self.value + other
+
+    def __radd__(self, other):
+        return other + self.value
+
+
 def categories_equals(left, right):
     assert (left.ordered and right.ordered) or (not left.ordered and not right.ordered)
     assert_extension_array_equal(left, right)
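A rough usage sketch of what PATCH 05 above enables; it mirrors the new tests and assumes a working Modin installation with an engine configured. `CustomIntegerForAddition` is the test helper added to `modin/pandas/test/utils.py`:

    import modin.pandas as pd
    from modin.pandas.test.utils import CustomIntegerForAddition

    s = pd.Series([1, 2, 3])
    # Previously this path could fail because pandas' is_scalar() returns
    # False for arbitrary objects; now anything that is neither list-like
    # nor a Modin object is treated as a scalar, so __radd__ applies.
    print(s + CustomIntegerForAddition(4))
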
From 073dffc31a628978e4dd09b40a6a0ac648f20426 Mon Sep 17 00:00:00 2001
From: Billy2551
Date: Mon, 21 Nov 2022 09:19:01 -0800
Subject: [PATCH 06/13] FIX-#4636: allows `read_parquet` to detect column partitioning in non-local filesystems (#5192)

Signed-off-by: Bill Wang
---
 .../core/io/column_stores/parquet_dispatcher.py | 16 +++++++++++++---
 modin/core/storage_formats/pandas/parsers.py    |  2 +-
 modin/pandas/test/test_io.py                    | 15 +++++++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py
index 29f959ab54e..e89f15918bd 100644
--- a/modin/core/io/column_stores/parquet_dispatcher.py
+++ b/modin/core/io/column_stores/parquet_dispatcher.py
@@ -609,18 +609,28 @@ def _read(cls, path, engine, columns, **kwargs):
         )
         from modin.pandas.io import PQ_INDEX_REGEX
 
-        if isinstance(path, str) and os.path.isdir(path):
+        if isinstance(path, str):
+            if os.path.isdir(path):
+                path_generator = os.walk(path)
+            else:
+                storage_options = kwargs.get("storage_options")
+                if storage_options is not None:
+                    fs, fs_path = url_to_fs(path, **storage_options)
+                else:
+                    fs, fs_path = url_to_fs(path)
+                path_generator = fs.walk(fs_path)
             partitioned_columns = set()
             # We do a tree walk of the path directory because partitioned
             # parquet directories have a unique column at each directory level.
             # Thus, we can use os.walk(), which does a dfs search, to walk
             # through the different columns that the data is partitioned on
-            for (_, dir_names, files) in os.walk(path):
+            for (_, dir_names, files) in path_generator:
                 if dir_names:
                     partitioned_columns.add(dir_names[0].split("=")[0])
                 if files:
                     # Metadata files, git files, .DSStore
-                    if files[0][0] == ".":
+                    # TODO: fix conditional for column partitioning, see issue #4637
+                    if len(files[0]) > 0 and files[0][0] == ".":
                         continue
                     break
             partitioned_columns = list(partitioned_columns)
diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py
index e08f4da8451..2dc4a12adb1 100644
--- a/modin/core/storage_formats/pandas/parsers.py
+++ b/modin/core/storage_formats/pandas/parsers.py
@@ -705,7 +705,7 @@ def _read_row_group_chunk(
         )
 
     def parse(files_for_parser, engine, **kwargs):
        columns = kwargs.get("columns", None)
-        storage_options = kwargs.pop("storage_options", {}) or {}
+        storage_options = kwargs.get("storage_options", {})
         chunks = []
         # `single_worker_read` just passes in a string path
         if isinstance(files_for_parser, str):
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index aa84983c21c..aab496bbb63 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -1729,6 +1729,21 @@ def test_read_parquet_2462(self, engine):
 
         df_equals(test_df, read_df)
 
+    @pytest.mark.skipif(
+        PandasCompatVersion.CURRENT == PandasCompatVersion.PY36,
+        reason="storage_options not supported for older pandas",
+    )
+    def test_read_parquet_s3_with_column_partitioning(self, engine):
+        # This test case comes from
+        # https://github.com/modin-project/modin/issues/4636
+        dataset_url = "s3://modin-datasets/modin-bugs/modin_bug_5159_parquet/df.parquet"
+        eval_io(
+            fn_name="read_parquet",
+            path=dataset_url,
+            engine=engine,
+            storage_options={"anon": True},
+        )
+
 
 class TestJson:
     @pytest.mark.parametrize("lines", [False, True])
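PATCH 06 above makes partition-column discovery work on remote filesystems by walking the path through fsspec when it is not a local directory. A hedged sketch of that dispatch, assuming fsspec (plus a driver such as s3fs for s3:// URLs) is installed; `iter_partition_dirs` is an illustrative name, not Modin's API:

    import os
    from fsspec.core import url_to_fs

    def iter_partition_dirs(path, storage_options=None):
        if os.path.isdir(path):
            return os.walk(path)
        fs, fs_path = url_to_fs(path, **(storage_options or {}))
        # fs.walk yields (root, dirs, files) tuples like os.walk, so
        # partition directories such as "col=value" are detected the same way.
        return fs.walk(fs_path)
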
From e639418584d45d571126ceba29847e278bec943b Mon Sep 17 00:00:00 2001
From: Andrey
Date: Tue, 22 Nov 2022 19:05:03 +0100
Subject: [PATCH 07/13] FIX-#4100: Fall back to Pandas on row drop (#4937)

Co-authored-by: Iaroslav Igoshev
Signed-off-by: Andrey Pavlenko
---
 .../hdk_on_native/test/test_dataframe.py  | 22 +++++++++++++++++++
 .../storage_formats/hdk/query_compiler.py  |  3 ++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
index fb1f44fa164..cd604f2cd55 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
@@ -2197,6 +2197,28 @@ def drop_rename_concat(df1, df2, lib, **kwargs):
             force_arrow_execute=True,
         )
 
+    def test_drop_row(self):
+        def drop_row(df, **kwargs):
+            return df.drop(labels=1)
+
+        run_and_compare(
+            drop_row,
+            data=self.data1,
+            force_lazy=False,
+        )
+
+    def test_series_pop(self):
+        def pop(df, **kwargs):
+            col = df["a"]
+            col.pop(0)
+            return col
+
+        run_and_compare(
+            pop,
+            data=self.data1,
+            force_lazy=False,
+        )
+
     def test_empty_transform(self):
         def apply(df, **kwargs):
             return df + 1
diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py
index 7ac735b4a02..cbf0d95b2b5 100644
--- a/modin/experimental/core/storage_formats/hdk/query_compiler.py
+++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py
@@ -556,7 +556,8 @@ def concat(self, axis, other, **kwargs):
         return self.__constructor__(new_modin_frame)
 
     def drop(self, index=None, columns=None):
-        assert index is None, "Only column drop is supported"
+        if index is not None:
+            raise NotImplementedError("Row drop")
         return self.__constructor__(
             self._modin_frame.take_2d_labels_or_positional(
                 row_labels=index, col_labels=self.columns.drop(columns)

From 7b1ae5a17c81b248b40359bc95acd41d778cce6a Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Wed, 23 Nov 2022 00:15:12 -0800
Subject: [PATCH 08/13] FIX-#5138: df_categories_equals typo (#5250)

Signed-off-by: Jonathan Shi
---
 modin/pandas/test/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index 54774ed95ef..412ebed29bd 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -500,7 +500,7 @@ def df_categories_equals(df1, df2):
     if isinstance(df1, pandas.CategoricalDtype):
         return categories_equals(df1, df2)
     elif isinstance(getattr(df1, "dtype"), pandas.CategoricalDtype) and isinstance(
-        getattr(df1, "dtype"), pandas.CategoricalDtype
+        getattr(df2, "dtype"), pandas.CategoricalDtype
     ):
         return categories_equals(df1.dtype, df2.dtype)
     else:

From a0e5a650b54cc14451dddc8edaf40731ecc212e9 Mon Sep 17 00:00:00 2001
From: Andrey
Date: Wed, 23 Nov 2022 14:49:47 +0100
Subject: [PATCH 09/13] FIX-#4859: Add support for PyArrow Dictionary Arrays to type mapping (#4864)

Signed-off-by: Andrey Pavlenko
---
 modin/core/dataframe/pandas/dataframe/dataframe.py |  2 ++
 .../hdk_on_native/test/test_dataframe.py           | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
index 6b7292a46a9..3f31942162f 100644
--- a/modin/core/dataframe/pandas/dataframe/dataframe.py
+++ b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -3095,6 +3095,8 @@ def _arrow_type_to_dtype(cls, arrow_type):
         except NotImplementedError:
             if pyarrow.types.is_time(arrow_type):
                 res = np.dtype(datetime.time)
+            elif pyarrow.types.is_dictionary(arrow_type):
+                res = pandas.CategoricalDtype
             else:
                 raise
 
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
index cd604f2cd55..21999c86d28 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
@@ -2388,5 +2388,16 @@ def set_axis(df, **kwargs):
     )
 
 
+class TestFromArrow:
+    def test_dict(self):
+        indices = pyarrow.array([0, 1, 0, 1, 2, 0, None, 2])
+        dictionary = pyarrow.array(["first", "second", "third"])
+        dict_array = pyarrow.DictionaryArray.from_arrays(indices, dictionary)
+        at = pyarrow.table({"col": dict_array})
+        pdf = at.to_pandas()
+        mdf = from_arrow(at)
+        df_equals(mdf, pdf)
+
+
 if __name__ == "__main__":
     pytest.main(["-v", __file__])
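The mapping added in PATCH 09 above is easy to see in plain pyarrow: dictionary-encoded columns are exactly the ones that convert to pandas Categorical, which is why `is_dictionary` types are reported as `pandas.CategoricalDtype`. A standalone illustration:

    import pyarrow

    indices = pyarrow.array([0, 1, 0, None])
    dictionary = pyarrow.array(["low", "high"])
    col = pyarrow.DictionaryArray.from_arrays(indices, dictionary)
    table = pyarrow.table({"col": col})
    print(pyarrow.types.is_dictionary(table.schema.field("col").type))  # True
    print(table.to_pandas()["col"].dtype)  # category
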
From 317bd75d0677c0ae1a400f7313953ee6638185ac Mon Sep 17 00:00:00 2001
From: Rehan Sohail Durrani
Date: Wed, 23 Nov 2022 15:42:09 -0800
Subject: [PATCH 10/13] FIX-#5252: Disable notebook tests until access control issues are resolved for `modin-test` bucket (#5257)

Signed-off-by: Rehan Durrani
---
 .../jupyter/execution/hdk_on_native/test/test_notebooks.py | 4 ++++
 .../execution/pandas_on_dask/test/test_notebooks.py        | 7 +++++++
 .../jupyter/execution/pandas_on_ray/test/test_notebooks.py | 7 +++++++
 3 files changed, 18 insertions(+)

diff --git a/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py b/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py
index 01e928f6bb1..fd755504955 100644
--- a/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py
+++ b/examples/tutorial/jupyter/execution/hdk_on_native/test/test_notebooks.py
@@ -13,6 +13,7 @@
 
 import os
 import sys
+import pytest
 
 import nbformat
 
@@ -44,6 +45,9 @@ def test_exercise_1():
 
 
 # this notebook works "as is"
+# GH #5252: Access to the modin-test bucket has changed, so we cannot currently run this test.
+# We will need to come back and unskip this test once the access control issue is resolved.
+@pytest.mark.skip(reason="Bucket cannot currently be accessed.")
 def test_exercise_2():
     modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb")
     nb = nbformat.read(
diff --git a/examples/tutorial/jupyter/execution/pandas_on_dask/test/test_notebooks.py b/examples/tutorial/jupyter/execution/pandas_on_dask/test/test_notebooks.py
index 1d723e0c6b5..effc8d37adf 100644
--- a/examples/tutorial/jupyter/execution/pandas_on_dask/test/test_notebooks.py
+++ b/examples/tutorial/jupyter/execution/pandas_on_dask/test/test_notebooks.py
@@ -13,6 +13,7 @@
 
 import os
 import sys
+import pytest
 
 import nbformat
 
@@ -46,6 +47,9 @@ def test_exercise_1():
 
 
 # this notebook works "as is" but for testing purposes we can use smaller dataset
+# GH #5252: Access to the modin-test bucket has changed, so we cannot currently run this test.
+# We will need to come back and unskip this test once the access control issue is resolved.
+@pytest.mark.skip(reason="Bucket cannot currently be accessed.")
 def test_exercise_2():
     modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb")
     nb = nbformat.read(
@@ -99,6 +103,9 @@ def sq_mad_func(self, axis=None, skipna=True, level=None, **kwargs):
 
 
 # this notebook works "as is" but for testing purposes we can use smaller dataset
+# GH #5252: Access to the modin-test bucket has changed, so we cannot currently run this test.
+# We will need to come back and unskip this test once the access control issue is resolved.
+@pytest.mark.skip(reason="Bucket cannot currently be accessed.")
 def test_exercise_4():
     modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_4_test.ipynb")
     nb = nbformat.read(
diff --git a/examples/tutorial/jupyter/execution/pandas_on_ray/test/test_notebooks.py b/examples/tutorial/jupyter/execution/pandas_on_ray/test/test_notebooks.py
index 1504143e486..8c6803254e1 100644
--- a/examples/tutorial/jupyter/execution/pandas_on_ray/test/test_notebooks.py
+++ b/examples/tutorial/jupyter/execution/pandas_on_ray/test/test_notebooks.py
@@ -13,6 +13,7 @@
 
 import os
 import sys
+import pytest
 
 import nbformat
 
@@ -47,6 +48,9 @@ def test_exercise_1():
 
 
 # this notebook works "as is" but for testing purposes we can use smaller dataset
+# GH #5252: Access to the modin-test bucket has changed, so we cannot currently run this test.
+# We will need to come back and unskip this test once the access control issue is resolved.
+@pytest.mark.skip(reason="Bucket cannot currently be accessed.")
 def test_exercise_2():
     modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_2_test.ipynb")
     nb = nbformat.read(
@@ -103,6 +107,9 @@ def sq_mad_func(self, axis=None, skipna=True, level=None, **kwargs):
 
 
 # this notebook works "as is" but for testing purposes we can use smaller dataset
+# GH #5252: Access to the modin-test bucket has changed, so we cannot currently run this test.
+# We will need to come back and unskip this test once the access control issue is resolved.
+@pytest.mark.skip(reason="Bucket cannot currently be accessed.")
 def test_exercise_4():
     modified_notebook_path = os.path.join(local_notebooks_dir, "exercise_4_test.ipynb")
     nb = nbformat.read(
From 6d5545f4a132f0efce02db66a6f5d515d4000812 Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha
Date: Thu, 24 Nov 2022 09:35:58 -0800
Subject: [PATCH 11/13] FIX-#5232: Stop changing original series names during binary ops. (#5249)

Signed-off-by: mvashishtha
Co-authored-by: Anatoly Myachev
---
 modin/pandas/series.py           | 15 +++++++++------
 modin/pandas/test/test_series.py | 11 +++++++++++
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index c378ff1bc44..75fa19cc3a8 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -2409,12 +2409,15 @@ def _prepare_inter_op(self, other):
             Prepared `other`.
         """
         if isinstance(other, Series):
-            # NB: deep=False is important for performance bc it retains obj.index._id
-            new_self = self.copy(deep=False)
-            new_other = other.copy(deep=False)
-            if self.name == other.name:
-                new_self.name = new_other.name = self.name
-            else:
+            names_different = self.name != other.name
+            # NB: if we don't need a rename, do the interaction with shallow
+            # copies so that we preserve obj.index._id. It's fine to work
+            # with shallow copies because we'll discard the copies but keep
+            # the result after the interaction operation. We can't do a rename
+            # on shallow copies because we'll mutate the original objects.
+            new_self = self.copy(deep=names_different)
+            new_other = other.copy(deep=names_different)
+            if names_different:
                 new_self.name = new_other.name = MODIN_UNNAMED_SERIES_LABEL
         else:
             new_self = self
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 39bf634ef34..6d2860c83e7 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -620,6 +620,17 @@ def test_add(data):
     inter_df_math_helper(modin_series, pandas_series, "add")
 
 
+def test_add_does_not_change_original_series_name():
+    # See https://github.com/modin-project/modin/issues/5232
+    s1 = pd.Series(1, name=1)
+    s2 = pd.Series(2, name=2)
+    original_s1 = s1.copy(deep=True)
+    original_s2 = s2.copy(deep=True)
+    _ = s1 + s2
+    df_equals(s1, original_s1)
+    df_equals(s2, original_s2)
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_add_prefix(data):
     modin_series, pandas_series = create_test_series(data)
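A usage sketch mirroring the regression test in PATCH 11 above (requires a configured Modin engine): because the rename now happens on deep copies, a binary operation no longer leaks the unnamed-series label back into its operands.

    import modin.pandas as pd

    s1 = pd.Series([1], name="left")
    s2 = pd.Series([2], name="right")
    _ = s1 + s2                # differently named operands
    print(s1.name, s2.name)    # still: left right
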
From 8f6e642d8738d627c48eea3937b1baefab9de991 Mon Sep 17 00:00:00 2001
From: Iaroslav Igoshev
Date: Fri, 25 Nov 2022 11:41:44 +0100
Subject: [PATCH 12/13] FEAT-#5253: Upgrade pandas to 1.5.2 (#5254)

* Set use-only-tar-bz2 to false

Signed-off-by: Igoshev, Iaroslav
---
 .github/workflows/ci-notebooks.yml      |  4 +-
 .github/workflows/ci.yml                | 53 ++++++++++++++++++-------
 .github/workflows/fuzzydata-test.yml    |  4 +-
 .github/workflows/push-to-master.yml    | 12 ++++--
 .github/workflows/push.yml              | 29 ++++++++++----
 environment-dev.yml                     |  2 +-
 modin/pandas/__init__.py                |  2 +-
 requirements-dev.txt                    |  2 +-
 requirements/env_hdk.yml                |  2 +-
 requirements/requirements-no-engine.yml |  2 +-
 setup.py                                |  2 +-
 11 files changed, 81 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml
index 18ef4b6101d..04ef8dfa813 100644
--- a/.github/workflows/ci-notebooks.yml
+++ b/.github/workflows/ci-notebooks.yml
@@ -44,7 +44,9 @@ jobs:
           environment-file: requirements/env_hdk.yml
           python-version: 3.8
          channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
         if: matrix.execution == 'hdk_on_native'
       - name: Cache datasets
         uses: actions/cache@v2
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7e03bf16cef..65d87928426 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -192,7 +192,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -230,7 +232,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -308,7 +312,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -360,7 +366,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -417,7 +425,9 @@ jobs:
           activate-environment: modin_on_hdk
           environment-file: requirements/env_hdk.yml
           python-version: 3.8
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -549,7 +559,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -628,7 +640,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -675,7 +689,9 @@ jobs:
           environment-file: requirements/environment-py36.yml
           python-version: 3.6
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -736,7 +752,9 @@ jobs:
           environment-file: requirements/environment-py36.yml
           python-version: 3.6
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -805,7 +823,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -873,8 +893,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
-          auto-update-conda: true # this enable `use-only-tar-bz2` feature on Windows
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -920,7 +941,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -963,7 +986,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
diff --git a/.github/workflows/fuzzydata-test.yml b/.github/workflows/fuzzydata-test.yml
index 35579e2f6b1..6d481990315 100644
--- a/.github/workflows/fuzzydata-test.yml
+++ b/.github/workflows/fuzzydata-test.yml
@@ -45,7 +45,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml
index 9df09f0aa1a..6bc2fde5a72 100644
--- a/.github/workflows/push-to-master.yml
+++ b/.github/workflows/push-to-master.yml
@@ -20,7 +20,9 @@ jobs:
           environment-file: requirements/requirements-no-engine.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: install Ray nightly build
        run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
      - name: Conda environment
@@ -76,7 +78,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -125,7 +129,9 @@ jobs:
           activate-environment: modin
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - run: pip install -r requirements-dev.txt
      # Use a ray master commit that includes the fix here: https://github.com/ray-project/ray/pull/16278
      # Can be changed after a Ray version > 1.4 is released.
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index eb3ea952f2e..4990927826e 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -28,7 +28,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -76,7 +78,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: 3.8
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -122,7 +126,9 @@ jobs:
           activate-environment: modin_on_hdk
           environment-file: requirements/env_hdk.yml
           python-version: 3.8
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -177,7 +183,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -276,8 +284,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
-          auto-update-conda: true # this enable `use-only-tar-bz2` feature on Windows
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -322,7 +331,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
@@ -364,7 +375,9 @@ jobs:
           environment-file: environment-dev.yml
           python-version: ${{matrix.python-version}}
           channel-priority: strict
-          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
+          # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+          # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+          use-only-tar-bz2: false
      - name: Conda environment
        run: |
          conda info
diff --git a/environment-dev.yml b/environment-dev.yml
index 1a3623f4377..4e7d94c748b 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -2,7 +2,7 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.5.1
+  - pandas==1.5.2
   - numpy>=1.18.5
   - pyarrow>=4.0.1
   - dask>=2.22.0
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index 55181935aa9..f2b6ee66e97 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -30,7 +30,7 @@
         f"Starting Modin in compatibility mode to support legacy pandas version {__pandas_version__}"
     )
 elif PandasCompatVersion.CURRENT == PandasCompatVersion.LATEST:
-    __pandas_version__ = "1.5.1"
+    __pandas_version__ = "1.5.2"
 
     if pandas.__version__ != __pandas_version__:
         warnings.warn(
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 3ae9aa82427..8ece14557f5 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,4 @@
-pandas==1.5.1
+pandas==1.5.2
 numpy>=1.18.5
 pyarrow>=4.0.1
 dask[complete]>=2.22.0
diff --git a/requirements/env_hdk.yml b/requirements/env_hdk.yml
index 434d00690bf..b8be4a82fd9 100644
--- a/requirements/env_hdk.yml
+++ b/requirements/env_hdk.yml
@@ -2,7 +2,7 @@ name: modin_on_hdk
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.5.1
+  - pandas==1.5.2
   - pyarrow=6
   - numpy>=1.18.5
   - fsspec
diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml
index 9fcfd23c661..fad72dc9da8 100644
--- a/requirements/requirements-no-engine.yml
+++ b/requirements/requirements-no-engine.yml
@@ -1,7 +1,7 @@
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.5.1
+  - pandas==1.5.2
   - numpy>=1.18.5
   - pyarrow>=4.0.1
   - fsspec
diff --git a/setup.py b/setup.py
index ec355e46f82..dd6c464f153 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 import versioneer
 import sys
 
-PANDAS_VERSION = "1.5.1" if sys.version_info >= (3, 8) else "1.1.5"
+PANDAS_VERSION = "1.5.2" if sys.version_info >= (3, 8) else "1.1.5"
 
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
""" + for idx, part in enumerate(partitions): + if hasattr(part, "force_materialization"): + partitions[idx] = part.force_materialization() assert all( [len(partition.list_of_blocks) == 1 for partition in partitions] ), "Implementation assumes that each partition contains a signle block."