Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#7151: Remove usage of pandas._testing private module #6988

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 5 additions & 13 deletions modin/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

import os
import platform
import shutil
import subprocess
import sys
import time
Expand Down Expand Up @@ -340,16 +339,15 @@ def fixture(tmp_path):


@pytest.fixture
def make_parquet_file():
def make_parquet_file(tmp_path):
"""Pytest fixture factory that makes a parquet file/dir for testing.

Yields:
Function that generates a parquet file/dir
"""
filenames = []

def _make_parquet_file(
filename,
filename=None,
nrows=NROWS,
ncols=2,
force=True,
Expand All @@ -369,6 +367,8 @@ def _make_parquet_file(
partitioned_columns: Create a partitioned directory using pandas.
row_group_size: Maximum size of each row group.
"""
if filename is None:
filename = get_unique_filename(extension=".parquet", data_dir=tmp_path)
if force or not os.path.exists(filename):
df = pandas.DataFrame(
{f"col{x + 1}": np.arange(nrows) for x in range(ncols)}
Expand All @@ -395,19 +395,11 @@ def _make_parquet_file(
)
else:
df.to_parquet(filename, row_group_size=row_group_size)
filenames.append(filename)
return filename

# Return function that generates parquet files
yield _make_parquet_file

# Delete parquet file that was created
for path in filenames:
if os.path.exists(path):
if os.path.isdir(path):
shutil.rmtree(path)
else:
os.remove(path)


@pytest.fixture
def make_sql_connection():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
import pandas
import pyarrow
import pytest
from pandas._testing import ensure_clean
from pandas.core.dtypes.common import is_list_like
from pyhdk import __version__ as hdk_version

from modin.config import StorageFormat
from modin.pandas.test.utils import (
create_test_dfs,
default_to_pandas_ignore_string,
get_unique_filename,
random_state,
test_data,
)
Expand Down Expand Up @@ -341,17 +341,17 @@ def test_read_csv_datetime(

@pytest.mark.parametrize("engine", [None, "arrow"])
@pytest.mark.parametrize("parse_dates", [None, True, False])
def test_read_csv_datetime_tz(self, engine, parse_dates):
with ensure_clean(".csv") as file:
with open(file, "w") as f:
f.write("test\n2023-01-01T00:00:00.000-07:00")
def test_read_csv_datetime_tz(self, engine, parse_dates, tmp_path):
unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
with open(unique_filename, "w") as f:
f.write("test\n2023-01-01T00:00:00.000-07:00")

eval_io(
fn_name="read_csv",
filepath_or_buffer=file,
md_extra_kwargs={"engine": engine},
parse_dates=parse_dates,
)
eval_io(
fn_name="read_csv",
filepath_or_buffer=unique_filename,
md_extra_kwargs={"engine": engine},
parse_dates=parse_dates,
)

@pytest.mark.parametrize("engine", [None, "arrow"])
@pytest.mark.parametrize(
Expand Down Expand Up @@ -399,26 +399,26 @@ def test_read_csv_col_handling(
"c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2",
],
)
def test_read_csv_duplicate_cols(self, cols):
def test_read_csv_duplicate_cols(self, cols, tmp_path):
def test(df, lib, **kwargs):
data = f"{cols}\n"
with ensure_clean(".csv") as fname:
with open(fname, "w") as f:
f.write(data)
return lib.read_csv(fname)
unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
with open(unique_filename, "w") as f:
f.write(data)
return lib.read_csv(unique_filename)

run_and_compare(test, data={})

def test_read_csv_dtype_object(self):
def test_read_csv_dtype_object(self, tmp_path):
with pytest.warns(UserWarning) as warns:
with ensure_clean(".csv") as file:
with open(file, "w") as f:
f.write("test\ntest")
unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
with open(unique_filename, "w") as f:
f.write("test\ntest")

def test(**kwargs):
return pd.read_csv(file, dtype={"test": "object"})
def test(**kwargs):
return pd.read_csv(unique_filename, dtype={"test": "object"})

run_and_compare(test, data={})
run_and_compare(test, data={})
for warn in warns.list:
assert not re.match(r".*defaulting to pandas.*", str(warn))

Expand Down Expand Up @@ -892,30 +892,30 @@ def concat(df1, df2, lib, **kwargs):
@pytest.mark.parametrize("transform", [True, False])
@pytest.mark.parametrize("sort_last", [True, False])
# RecursionError in case of concatenation of big number of frames
def test_issue_5889(self, transform, sort_last):
with ensure_clean(".csv") as file:
data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
pandas.DataFrame(data).to_csv(file, index=False)
def test_issue_5889(self, transform, sort_last, tmp_path):
unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
pandas.DataFrame(data).to_csv(unique_filename, index=False)

def test_concat(lib, **kwargs):
if transform:
def test_concat(lib, **kwargs):
if transform:

def read_csv():
return lib.read_csv(file)["b"]
def read_csv():
return lib.read_csv(unique_filename)["b"]

else:
else:

def read_csv():
return lib.read_csv(file)
def read_csv():
return lib.read_csv(unique_filename)

df = read_csv()
for _ in range(100):
df = lib.concat([df, read_csv()])
if sort_last:
df = lib.concat([df, read_csv()], sort=True)
return df
df = read_csv()
for _ in range(100):
df = lib.concat([df, read_csv()])
if sort_last:
df = lib.concat([df, read_csv()], sort=True)
return df

run_and_compare(test_concat, data={})
run_and_compare(test_concat, data={})


class TestGroupby:
Expand Down
68 changes: 28 additions & 40 deletions modin/experimental/pandas/test/test_io_exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
import numpy as np
import pandas
import pytest
from pandas._testing import ensure_clean

import modin.experimental.pandas as pd
from modin.config import AsyncReadMode, Engine
from modin.config import Engine
from modin.pandas.test.utils import (
df_equals,
eval_general,
get_unique_filename,
parse_dates_values_by_id,
test_data,
time_parsing_csv_path,
Expand Down Expand Up @@ -359,7 +359,7 @@ def test_xml_glob(tmp_path, filename):
reason=f"{Engine.get()} does not have experimental read_custom_text API",
)
@pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
def test_read_custom_json_text(set_async_read_mode):
def test_read_custom_json_text(set_async_read_mode, tmp_path):
def _generate_json(file_name, nrows, ncols):
data = np.random.rand(nrows, ncols)
df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
Expand All @@ -378,33 +378,27 @@ def _custom_parser(io_input, **kwargs):
result[key].append(obj[key])
return pandas.DataFrame(result).rename(columns={"col0": "testID"})

with ensure_clean() as filename:
_generate_json(filename, 64, 8)
unique_filename = get_unique_filename(data_dir=tmp_path)
_generate_json(unique_filename, 64, 8)

df1 = pd.read_custom_text(
filename,
columns=["testID", "col1", "col3"],
custom_parser=_custom_parser,
is_quoting=False,
)
df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
columns={"col0": "testID"}
)
if AsyncReadMode.get():
# If read operations are asynchronous, then the dataframes
# check should be inside `ensure_clean` context
# because the file may be deleted before actual reading starts
df_equals(df1, df2)
if not AsyncReadMode.get():
df_equals(df1, df2)
df1 = pd.read_custom_text(
unique_filename,
columns=["testID", "col1", "col3"],
custom_parser=_custom_parser,
is_quoting=False,
)
df2 = pd.read_json(unique_filename, lines=True)[["col0", "col1", "col3"]].rename(
columns={"col0": "testID"}
)
df_equals(df1, df2)


@pytest.mark.skipif(
Engine.get() not in ("Ray", "Unidist", "Dask"),
reason=f"{Engine.get()} does not have experimental API",
)
@pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
def test_read_evaluated_dict(set_async_read_mode):
def test_read_evaluated_dict(set_async_read_mode, tmp_path):
def _generate_evaluated_dict(file_name, nrows, ncols):
result = {}
keys = [f"col{x}" for x in range(ncols)]
Expand Down Expand Up @@ -434,23 +428,17 @@ def columns_callback(io_input, **kwargs):
break
return columns

with ensure_clean() as filename:
_generate_evaluated_dict(filename, 64, 8)
unique_filename = get_unique_filename(data_dir=tmp_path)
_generate_evaluated_dict(unique_filename, 64, 8)

df1 = pd.read_custom_text(
filename,
columns=["col1", "col2"],
custom_parser=_custom_parser,
)
assert df1.shape == (64, 2)
df1 = pd.read_custom_text(
unique_filename,
columns=["col1", "col2"],
custom_parser=_custom_parser,
)
assert df1.shape == (64, 2)

df2 = pd.read_custom_text(
filename, columns=columns_callback, custom_parser=_custom_parser
)
if AsyncReadMode.get():
# If read operations are asynchronous, then the dataframes
# check should be inside `ensure_clean` context
# because the file may be deleted before actual reading starts
df_equals(df1, df2)
if not AsyncReadMode.get():
df_equals(df1, df2)
df2 = pd.read_custom_text(
unique_filename, columns=columns_callback, custom_parser=_custom_parser
)
df_equals(df1, df2)
20 changes: 11 additions & 9 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import numpy as np
import pandas
import pytest
from pandas._testing import ensure_clean

import modin.pandas as pd
from modin.config import MinPartitionSize, NPartitions, StorageFormat
Expand All @@ -35,6 +34,7 @@
df_equals,
eval_general,
generate_multiindex,
get_unique_filename,
int_arg_keys,
int_arg_values,
name_contains,
Expand Down Expand Up @@ -2243,14 +2243,16 @@ def test___setitem__partitions_aligning():
df_equals(md_df, pd_df)


def test___setitem__with_mismatched_partitions():
with ensure_clean(".csv") as fname:
np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",")
modin_df = pd.read_csv(fname)
pandas_df = pandas.read_csv(fname)
modin_df["new"] = pd.Series(list(range(len(modin_df))))
pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
df_equals(modin_df, pandas_df)
def test___setitem__with_mismatched_partitions(tmp_path):
unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
np.savetxt(
unique_filename, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
)
modin_df = pd.read_csv(unique_filename)
pandas_df = pandas.read_csv(unique_filename)
modin_df["new"] = pd.Series(list(range(len(modin_df))))
pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
df_equals(modin_df, pandas_df)


def test___setitem__mask():
Expand Down
Loading
Loading