Commit 3b8fa14
fixes
Signed-off-by: Anatoly Myachev <[email protected]>
anmyachev committed Mar 2, 2024
1 parent ac8c9cb commit 3b8fa14
Showing 5 changed files with 165 additions and 190 deletions.
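
All of the hunks below apply the same refactoring: the `ensure_clean` context manager from `pandas._testing` is replaced by pytest's `tmp_path` fixture plus Modin's `get_unique_filename` helper, so test files live in a per-test temporary directory instead of being deleted the moment a context manager exits. A minimal sketch of the new pattern, for illustration only (not part of the diff; the test name and data are hypothetical):

import pandas

from modin.pandas.test.utils import get_unique_filename


def test_example(tmp_path):
    # pytest's tmp_path fixture provides a per-test temporary directory;
    # get_unique_filename builds a unique file path inside it.
    unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
    pandas.DataFrame({"a": [1, 2, 3]}).to_csv(unique_filename, index=False)
    # The file persists for the rest of the test, so deferred or asynchronous
    # readers can still open it; pytest cleans the directory up later.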
@@ -18,14 +18,14 @@
 import pandas
 import pyarrow
 import pytest
-from pandas._testing import ensure_clean
 from pandas.core.dtypes.common import is_list_like
 from pyhdk import __version__ as hdk_version

 from modin.config import StorageFormat
 from modin.pandas.test.utils import (
     create_test_dfs,
     default_to_pandas_ignore_string,
+    get_unique_filename,
     io_ops_bad_exc,
     random_state,
     test_data,
@@ -324,17 +324,17 @@ def test_read_csv_datetime(

     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize("parse_dates", [None, True, False])
-    def test_read_csv_datetime_tz(self, engine, parse_dates):
-        with ensure_clean(".csv") as file:
-            with open(file, "w") as f:
-                f.write("test\n2023-01-01T00:00:00.000-07:00")
+    def test_read_csv_datetime_tz(self, engine, parse_dates, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        with open(unique_filename, "w") as f:
+            f.write("test\n2023-01-01T00:00:00.000-07:00")

-            eval_io(
-                fn_name="read_csv",
-                filepath_or_buffer=file,
-                md_extra_kwargs={"engine": engine},
-                parse_dates=parse_dates,
-            )
+        eval_io(
+            fn_name="read_csv",
+            filepath_or_buffer=unique_filename,
+            md_extra_kwargs={"engine": engine},
+            parse_dates=parse_dates,
+        )

     @pytest.mark.parametrize("engine", [None, "arrow"])
     @pytest.mark.parametrize(
@@ -382,26 +382,26 @@ def test_read_csv_col_handling(
             "c1.1,c1,c1.1,c1,c1.1,c1.2,c1.2,c2",
         ],
     )
-    def test_read_csv_duplicate_cols(self, cols):
+    def test_read_csv_duplicate_cols(self, cols, tmp_path):
         def test(df, lib, **kwargs):
             data = f"{cols}\n"
-            with ensure_clean(".csv") as fname:
-                with open(fname, "w") as f:
-                    f.write(data)
-                return lib.read_csv(fname)
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write(data)
+            return lib.read_csv(unique_filename)

         run_and_compare(test, data={})

-    def test_read_csv_dtype_object(self):
+    def test_read_csv_dtype_object(self, tmp_path):
         with pytest.warns(UserWarning) as warns:
-            with ensure_clean(".csv") as file:
-                with open(file, "w") as f:
-                    f.write("test\ntest")
+            unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+            with open(unique_filename, "w") as f:
+                f.write("test\ntest")

-                def test(**kwargs):
-                    return pd.read_csv(file, dtype={"test": "object"})
+            def test(**kwargs):
+                return pd.read_csv(unique_filename, dtype={"test": "object"})

-                run_and_compare(test, data={})
+            run_and_compare(test, data={})
         for warn in warns.list:
             assert not re.match(r".*defaulting to pandas.*", str(warn))

@@ -870,30 +870,30 @@ def concat(df1, df2, lib, **kwargs):
     @pytest.mark.parametrize("transform", [True, False])
     @pytest.mark.parametrize("sort_last", [True, False])
     # RecursionError in case of concatenation of big number of frames
-    def test_issue_5889(self, transform, sort_last):
-        with ensure_clean(".csv") as file:
-            data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
-            pandas.DataFrame(data).to_csv(file, index=False)
+    def test_issue_5889(self, transform, sort_last, tmp_path):
+        unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+        data = {"a": [1, 2, 3], "b": [1, 2, 3]} if transform else {"a": [1, 2, 3]}
+        pandas.DataFrame(data).to_csv(unique_filename, index=False)

-            def test_concat(lib, **kwargs):
-                if transform:
+        def test_concat(lib, **kwargs):
+            if transform:

-                    def read_csv():
-                        return lib.read_csv(file)["b"]
+                def read_csv():
+                    return lib.read_csv(unique_filename)["b"]

-                else:
+            else:

-                    def read_csv():
-                        return lib.read_csv(file)
+                def read_csv():
+                    return lib.read_csv(unique_filename)

-                df = read_csv()
-                for _ in range(100):
-                    df = lib.concat([df, read_csv()])
-                if sort_last:
-                    df = lib.concat([df, read_csv()], sort=True)
-                return df
+            df = read_csv()
+            for _ in range(100):
+                df = lib.concat([df, read_csv()])
+            if sort_last:
+                df = lib.concat([df, read_csv()], sort=True)
+            return df

-            run_and_compare(test_concat, data={})
+        run_and_compare(test_concat, data={})


 class TestGroupby:
68 changes: 28 additions & 40 deletions modin/experimental/pandas/test/test_io_exp.py
@@ -18,13 +18,13 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean

 import modin.experimental.pandas as pd
-from modin.config import AsyncReadMode, Engine
+from modin.config import Engine
 from modin.pandas.test.utils import (
     df_equals,
     eval_general,
+    get_unique_filename,
     parse_dates_values_by_id,
     test_data,
     time_parsing_csv_path,
@@ -355,7 +355,7 @@ def test_xml_glob(tmp_path, filename):
     reason=f"{Engine.get()} does not have experimental read_custom_text API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_custom_json_text(set_async_read_mode):
+def test_read_custom_json_text(set_async_read_mode, tmp_path):
     def _generate_json(file_name, nrows, ncols):
         data = np.random.rand(nrows, ncols)
         df = pandas.DataFrame(data, columns=[f"col{x}" for x in range(ncols)])
@@ -374,33 +374,27 @@ def _custom_parser(io_input, **kwargs):
                 result[key].append(obj[key])
         return pandas.DataFrame(result).rename(columns={"col0": "testID"})

-    with ensure_clean() as filename:
-        _generate_json(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_json(unique_filename, 64, 8)

-        df1 = pd.read_custom_text(
-            filename,
-            columns=["testID", "col1", "col3"],
-            custom_parser=_custom_parser,
-            is_quoting=False,
-        )
-        df2 = pd.read_json(filename, lines=True)[["col0", "col1", "col3"]].rename(
-            columns={"col0": "testID"}
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["testID", "col1", "col3"],
+        custom_parser=_custom_parser,
+        is_quoting=False,
+    )
+    df2 = pd.read_json(unique_filename, lines=True)[["col0", "col1", "col3"]].rename(
+        columns={"col0": "testID"}
+    )
+    df_equals(df1, df2)


 @pytest.mark.skipif(
     Engine.get() not in ("Ray", "Unidist", "Dask"),
     reason=f"{Engine.get()} does not have experimental API",
 )
 @pytest.mark.parametrize("set_async_read_mode", [False, True], indirect=True)
-def test_read_evaluated_dict(set_async_read_mode):
+def test_read_evaluated_dict(set_async_read_mode, tmp_path):
     def _generate_evaluated_dict(file_name, nrows, ncols):
         result = {}
         keys = [f"col{x}" for x in range(ncols)]
@@ -430,23 +424,17 @@ def columns_callback(io_input, **kwargs):
                 break
         return columns

-    with ensure_clean() as filename:
-        _generate_evaluated_dict(filename, 64, 8)
+    unique_filename = get_unique_filename(data_dir=tmp_path)
+    _generate_evaluated_dict(unique_filename, 64, 8)

-        df1 = pd.read_custom_text(
-            filename,
-            columns=["col1", "col2"],
-            custom_parser=_custom_parser,
-        )
-        assert df1.shape == (64, 2)
+    df1 = pd.read_custom_text(
+        unique_filename,
+        columns=["col1", "col2"],
+        custom_parser=_custom_parser,
+    )
+    assert df1.shape == (64, 2)

-        df2 = pd.read_custom_text(
-            filename, columns=columns_callback, custom_parser=_custom_parser
-        )
-        if AsyncReadMode.get():
-            # If read operations are asynchronous, then the dataframes
-            # check should be inside `ensure_clean` context
-            # because the file may be deleted before actual reading starts
-            df_equals(df1, df2)
-    if not AsyncReadMode.get():
-        df_equals(df1, df2)
+    df2 = pd.read_custom_text(
+        unique_filename, columns=columns_callback, custom_parser=_custom_parser
+    )
+    df_equals(df1, df2)
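
The `AsyncReadMode` branches removed above existed because `ensure_clean` deleted the file as soon as its context exited, so an asynchronous read had to be compared inside that context; with `tmp_path` the file outlives the read calls and a single unconditional comparison suffices. A rough sketch of the lifetime difference, assuming the documented behavior of `pandas._testing.ensure_clean` and pytest's `tmp_path` (the test name and file name are hypothetical, not from the diff):

from pandas._testing import ensure_clean


def test_lifetimes(tmp_path):
    with ensure_clean() as filename:
        pass  # any file created at `filename` is removed here, when the block exits

    path = tmp_path / "data.json"
    path.write_text("{}")
    # `path` still exists for the rest of the test; pytest prunes old tmp_path
    # directories on later runs, so no manual cleanup is needed.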
20 changes: 11 additions & 9 deletions modin/pandas/test/dataframe/test_indexing.py
@@ -17,7 +17,6 @@
 import numpy as np
 import pandas
 import pytest
-from pandas._testing import ensure_clean
 from pandas.testing import assert_index_equal

 import modin.pandas as pd
@@ -35,6 +34,7 @@
     df_equals,
     eval_general,
     generate_multiindex,
+    get_unique_filename,
     int_arg_keys,
     int_arg_values,
     name_contains,
@@ -2207,14 +2207,16 @@ def test___setitem__partitions_aligning():
     df_equals(md_df, pd_df)


-def test___setitem__with_mismatched_partitions():
-    with ensure_clean(".csv") as fname:
-        np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",")
-        modin_df = pd.read_csv(fname)
-        pandas_df = pandas.read_csv(fname)
-        modin_df["new"] = pd.Series(list(range(len(modin_df))))
-        pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
-        df_equals(modin_df, pandas_df)
+def test___setitem__with_mismatched_partitions(tmp_path):
+    unique_filename = get_unique_filename(extension="csv", data_dir=tmp_path)
+    np.savetxt(
+        unique_filename, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
+    )
+    modin_df = pd.read_csv(unique_filename)
+    pandas_df = pandas.read_csv(unique_filename)
+    modin_df["new"] = pd.Series(list(range(len(modin_df))))
+    pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
+    df_equals(modin_df, pandas_df)


 def test___setitem__mask():