modin-project · anmyachev · Mar 1, 2024 · Mar 2, 2024 · Apr 5, 2024
@@ -16,7 +16,6 @@
 
 import os
 import platform
-import shutil
 import subprocess
 import sys
 import time
@@ -340,16 +339,15 @@ def fixture(tmp_path):
 
 
 @pytest.fixture
-def make_parquet_file():
+def make_parquet_file(tmp_path):
     """Pytest fixture factory that makes a parquet file/dir for testing.
 
     Yields:
         Function that generates a parquet file/dir
     """
-    filenames = []
 
     def _make_parquet_file(
-        filename,
+        filename=None,
         nrows=NROWS,
         ncols=2,
         force=True,
@@ -369,6 +367,8 @@ def _make_parquet_file(
             partitioned_columns: Create a partitioned directory using pandas.
             row_group_size: Maximum size of each row group.
         """
+        if filename is None:
+            filename = get_unique_filename(extension="parquet", data_dir=tmp_path)
         if force or not os.path.exists(filename):
             df = pandas.DataFrame(
                 {f"col{x + 1}": np.arange(nrows) for x in range(ncols)}
@@ -395,19 +395,11 @@ def _make_parquet_file(
                 )
             else:
                 df.to_parquet(filename, row_group_size=row_group_size)
-            filenames.append(filename)
+        return filename
 
     # Return function that generates parquet files
     yield _make_parquet_file
 
-    # Delete parquet file that was created
-    for path in filenames:
-        if os.path.exists(path):
-            if os.path.isdir(path):
-                shutil.rmtree(path)
-            else:
-                os.remove(path)
-
 
 @pytest.fixture
 def make_sql_connection():

@@ -18,14 +18,14 @@
 import pandas
 import pyarrow
 import pytest
-from pandas._testing import ensure_clean
 from pandas.core.dtypes.common import is_list_like
 from pyhdk import __version__ as hdk_version
 
 from modin.config import StorageFormat
 from modin.pandas.test.utils import (
     create_test_dfs,
     default_to_pandas_ignore_string,
+    get_unique_filename,
     random_state,
     test_data,
 )
@@ -341,17 +341,17 @@ def test_read_csv_datetime(
 
     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize("parse_dates", [None, True, False])
-    def test_read_csv_datetime_tz(self, engine, parse_dates):
-        with ensure_clean(".csv") as file:
-            with open(file, "w") as f:
-                f.write("test\n2023-01-01T00:00:00.000-07:00")
+    def test_read_csv_datetime_tz(self, engine, parse_dates, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        with open(unique_filename, "w") as f:
+            f.write("test\n2023-01-01T00:00:00.000-07:00")
 
-            eval_io(
-                fn_name="read_csv",
-                filepath_or_buffer=file,
-                md_extra_kwargs={"engine": engine},
-                parse_dates=parse_dates,
-            )
+        eval_io(
+            fn_name="read_csv",
+            filepath_or_buffer=unique_filename,
+            md_extra_kwargs={"engine": engine},
+            parse_dates=parse_dates,
+        )
 
     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize(
@@ -399,26 +399,26 @@ def test_read_csv_col_handling(
             "c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2",
         ],
     )
-    def test_read_csv_duplicate_cols(self, cols):
+    def test_read_csv_duplicate_cols(self, cols, tmp_path):
         def test(df, lib, **kwargs):
             data = f"{cols}\n"
-            with ensure_clean(".csv") as fname:
-                with open(fname, "w") as f:
-                    f.write(data)
-                return lib.read_csv(fname)
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write(data)
+            return lib.read_csv(unique_filename)
 
         run_and_compare(test, data={})
 
-    def test_read_csv_dtype_object(self):
+    def test_read_csv_dtype_object(self, tmp_path):
         with pytest.warns(UserWarning) as warns:
-            with ensure_clean(".csv") as file:
-                with open(file, "w") as f:
-                    f.write("test\ntest")
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write("test\ntest")
 
-                def test(**kwargs):
-                    return pd.read_csv(file, dtype={"test": "object"})
+            def test(**kwargs):
+                return pd.read_csv(unique_filename, dtype={"test": "object"})
 
-                run_and_compare(test, data={})
+            run_and_compare(test, data={})
             for warn in warns.list:
                 assert not re.match(r".*defaulting to pandas.*", str(warn))
 
@@ -892,30 +892,30 @@ def concat(df1, df2, lib, **kwargs):
     @pytest.mark.parametrize("transform", [True, False])
     @pytest.mark.parametrize("sort_last", [True, False])
     # RecursionError in case of concatenation of big number of frames
-    def test_issue_5889(self, transform, sort_last):
-        with ensure_clean(".csv") as file:
-            data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
-            pandas.DataFrame(data).to_csv(file, index=False)
+    def test_issue_5889(self, transform, sort_last, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
+        pandas.DataFrame(data).to_csv(unique_filename, index=False)
 
-            def test_concat(lib, **kwargs):
-                if transform:
+        def test_concat(lib, **kwargs):
+            if transform:
 
-                    def read_csv():
-                        return lib.read_csv(file)["b"]
+                def read_csv():
+                    return lib.read_csv(unique_filename)["b"]
 
-                else:
+            else:
 
-                    def read_csv():
-                        return lib.read_csv(file)
+                def read_csv():
+                    return lib.read_csv(unique_filename)
 
-                df = read_csv()
-                for _ in range(100):
-                    df = lib.concat([df, read_csv()])
-                if sort_last:
-                    df = lib.concat([df, read_csv()], sort=True)
-                return df
+            df = read_csv()
+            for _ in range(100):
+                df = lib.concat([df, read_csv()])
+            if sort_last:
+                df = lib.concat([df, read_csv()], sort=True)
+            return df
 
-            run_and_compare(test_concat, data={})
+        run_and_compare(test_concat, data={})
 
 
 class TestGroupby:

@@ -18,13 +18,13 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean
 
 import modin.experimental.pandas as pd
-from modin.config import AsyncReadMode, Engine
+from modin.config import Engine
 from modin.pandas.test.utils import (
     df_equals,
     eval_general,
+    get_unique_filename,
     parse_dates_values_by_id,
     test_data,
     time_parsing_csv_path,
@@ -359,7 +359,7 @@ def test_xml_glob(tmp_path, filename):
     reason=f"{Engine.get()} does not have experimental read_custom_text API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_custom_json_text(set_async_read_mode):
+def test_read_custom_json_text(set_async_read_mode, tmp_path):
     def _generate_json(file_name, nrows, ncols):
         data = np.random.rand(nrows, ncols)
         df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
@@ -378,33 +378,27 @@ def _custom_parser(io_input, **kwargs):
                 result[key].append(obj[key])
         return pandas.DataFrame(result).rename(columns={"col0": "testID"})
 
-    with ensure_clean() as filename:
-        _generate_json(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_json(unique_filename, 64, 8)
 
-        df1 = pd.read_custom_text(
-            filename,
-            columns=["testID", "col1", "col3"],
-            custom_parser=_custom_parser,
-            is_quoting=False,
-        )
-        df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
-            columns={"col0": "testID"}
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["testID", "col1", "col3"],
+        custom_parser=_custom_parser,
+        is_quoting=False,
+    )
+    df2 = pd.read_json(unique_filename, lines=True)[["col0", "col1", "col3"]].rename(
+        columns={"col0": "testID"}
+    )
+    df_equals(df1, df2)
 
 
 @pytest.mark.skipif(
     Engine.get() not in ("Ray", "Unidist", "Dask"),
     reason=f"{Engine.get()} does not have experimental API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_evaluated_dict(set_async_read_mode):
+def test_read_evaluated_dict(set_async_read_mode, tmp_path):
     def _generate_evaluated_dict(file_name, nrows, ncols):
         result = {}
         keys = [f"col{x}" for x in range(ncols)]
@@ -434,23 +428,17 @@ def columns_callback(io_input, **kwargs):
             break
         return columns
 
-    with ensure_clean() as filename:
-        _generate_evaluated_dict(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_evaluated_dict(unique_filename, 64, 8)
 
-        df1 = pd.read_custom_text(
-            filename,
-            columns=["col1", "col2"],
-            custom_parser=_custom_parser,
-        )
-        assert df1.shape == (64, 2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["col1", "col2"],
+        custom_parser=_custom_parser,
+    )
+    assert df1.shape == (64, 2)
 
-        df2 = pd.read_custom_text(
-            filename, columns=columns_callback, custom_parser=_custom_parser
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df2 = pd.read_custom_text(
+        unique_filename, columns=columns_callback, custom_parser=_custom_parser
+    )
+    df_equals(df1, df2)
@@ -18,7 +18,6 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean
 
 import modin.pandas as pd
 from modin.config import MinPartitionSize, NPartitions, StorageFormat
@@ -35,6 +34,7 @@
     df_equals,
     eval_general,
     generate_multiindex,
+    get_unique_filename,
     int_arg_keys,
     int_arg_values,
     name_contains,
@@ -2243,14 +2243,16 @@ def test___setitem__partitions_aligning():
     df_equals(md_df, pd_df)
 
 
-def test___setitem__with_mismatched_partitions():
-    with ensure_clean(".csv") as fname:
-        np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",")
-        modin_df = pd.read_csv(fname)
-        pandas_df = pandas.read_csv(fname)
-        modin_df["new"] = pd.Series(list(range(len(modin_df))))
-        pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
-        df_equals(modin_df, pandas_df)
+def test___setitem__with_mismatched_partitions(tmp_path):
+    unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+    np.savetxt(
+        unique_filename, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
+    )
+    modin_df = pd.read_csv(unique_filename)
+    pandas_df = pandas.read_csv(unique_filename)
+    modin_df["new"] = pd.Series(list(range(len(modin_df))))
+    pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
+    df_equals(modin_df, pandas_df)
 
 
 def test___setitem__mask():