REFACTOR: Add mypy checks for modin.core.dataframe.algebra #5167

Open · wants to merge 3 commits into base: master
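The diff annotates modin.core.dataframe.algebra and its default-to-pandas builders; the remaining files only pick up formatting churn. The PR page does not show how the new checks are wired into CI, so below is a minimal sketch of running mypy over just the annotated package via its documented Python API; the --disallow-untyped-defs flag is an assumption of the sketch, not something the PR confirms.

# Hedged sketch: programmatic mypy run scoped to the annotated package.
from mypy import api

report, errors, exit_status = api.run(
    ["--disallow-untyped-defs", "modin/core/dataframe/algebra"]
)
print(report)
assert exit_status == 0, "mypy reported typing errors"

The command-line equivalent is `mypy --disallow-untyped-defs modin/core/dataframe/algebra` from the repository root.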
7 changes: 1 addition & 6 deletions asv_bench/benchmarks/benchmarks.py
@@ -43,12 +43,7 @@ class BaseTimeGroupBy:
     def setup(self, shape, ngroups=5, groupby_ncols=1):
         ngroups = translator_groupby_ngroups(ngroups, shape)
         self.df, self.groupby_columns = generate_dataframe(
-            "int",
-            *shape,
-            RAND_LOW,
-            RAND_HIGH,
-            groupby_ncols,
-            count_groups=ngroups,
+            "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups,
         )


7 changes: 1 addition & 6 deletions asv_bench/benchmarks/hdk/benchmarks.py
@@ -442,12 +442,7 @@ class BaseTimeGroupBy:
     def setup(self, shape, ngroups=5, groupby_ncols=1):
         ngroups = translator_groupby_ngroups(ngroups, shape)
         self.df, self.groupby_columns = generate_dataframe(
-            "int",
-            *shape,
-            RAND_LOW,
-            RAND_HIGH,
-            groupby_ncols,
-            count_groups=ngroups,
+            "int", *shape, RAND_LOW, RAND_HIGH, groupby_ncols, count_groups=ngroups,
         )
         # correct while we use 'col*' like name for non-groupby columns
         # and 'groupby_col*' like name for groupby columns
7 changes: 1 addition & 6 deletions asv_bench/benchmarks/hdk/io.py
@@ -54,10 +54,5 @@ def setup(self, cache, shape):
         self.filename, self.names, self.dtype = cache[file_id]

     def time_read_csv_names(self, cache, shape):
-        df = IMPL.read_csv(
-            self.filename,
-            names=self.names,
-            header=0,
-            dtype=self.dtype,
-        )
+        df = IMPL.read_csv(self.filename, names=self.names, header=0, dtype=self.dtype,)
         trigger_import(df)
6 changes: 1 addition & 5 deletions asv_bench/benchmarks/io/parquet.py
@@ -44,8 +44,4 @@ def setup(self, test_filenames, shape):
         self.shape_id = get_shape_id(shape)

     def time_read_parquet(self, test_filenames, shape):
-        execute(
-            IMPL.read_parquet(
-                test_filenames[self.shape_id],
-            )
-        )
+        execute(IMPL.read_parquet(test_filenames[self.shape_id],))
13 changes: 2 additions & 11 deletions asv_bench/benchmarks/utils/common.py
@@ -244,11 +244,7 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):


 def gen_data(
-    data_type: str,
-    nrows: int,
-    ncols: int,
-    rand_low: int,
-    rand_high: int,
+    data_type: str, nrows: int, ncols: int, rand_low: int, rand_high: int,
 ) -> dict:
     """
     Generate data with caching.
@@ -501,12 +497,7 @@ def execute(
         return

     # compatibility with old Modin versions
-    all(
-        map(
-            lambda partition: partition.drain_call_queue() or True,
-            partitions,
-        )
-    )
+    all(map(lambda partition: partition.drain_call_queue() or True, partitions,))
     if ASV_USE_ENGINE == "ray":
         from ray import wait
5 changes: 1 addition & 4 deletions asv_bench/benchmarks/utils/data_shapes.py
@@ -156,10 +156,7 @@
             "hdk.TimeReadCsvNames",
         ],
     ),
-    (
-        HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE],
-        ["hdk.TimeMerge", "hdk.TimeAppend"],
-    ),
+    (HDK_BINARY_OP_DATA_SIZE[ASV_DATASET_SIZE], ["hdk.TimeMerge", "hdk.TimeAppend"],),
     (
         HDK_SERIES_DATA_SIZE[ASV_DATASET_SIZE],
         ["hdk.TimeBinaryOpSeries", "hdk.TimeValueCountsSeries"],
23 changes: 5 additions & 18 deletions modin/_compat/core/py36/base_io.py
@@ -206,16 +206,9 @@ def read_json(
         returns=_doc_returns_qc,
     )
     def read_feather(
-        cls,
-        path,
-        columns=None,
-        use_threads=True,
+        cls, path, columns=None, use_threads=True,
     ):  # noqa: PR01
-        return cls._read_feather(
-            path=path,
-            columns=columns,
-            use_threads=use_threads,
-        )
+        return cls._read_feather(path=path, columns=columns, use_threads=use_threads,)

     @classmethod
     @_inherit_docstrings(pandas.read_stata, apilink="pandas.read_stata")
@@ -258,13 +251,10 @@ def read_stata(
         returns=_doc_returns_qc,
     )
     def read_pickle(
-        cls,
-        filepath_or_buffer,
-        compression="infer",
+        cls, filepath_or_buffer, compression="infer",
     ):  # noqa: PR01
         return cls._read_pickle(
-            filepath_or_buffer=filepath_or_buffer,
-            compression=compression,
+            filepath_or_buffer=filepath_or_buffer, compression=compression,
         )

     @classmethod
@@ -306,8 +296,5 @@ def to_pickle(
         protocol: int = 4,  # older pandas only supports protocol <= 4
     ):  # noqa: PR01, D200
         return cls._to_pickle(
-            obj,
-            filepath_or_buffer,
-            compression=compression,
-            protocol=protocol,
+            obj, filepath_or_buffer, compression=compression, protocol=protocol,
         )
8 changes: 2 additions & 6 deletions modin/_compat/pandas_api/classes.py
@@ -16,9 +16,7 @@
 from modin._compat import PandasCompatVersion

 if PandasCompatVersion.CURRENT == PandasCompatVersion.PY36:
-    from .py36 import (
-        Python36CompatibleBasePandasDataset as BasePandasDatasetCompat,
-    )
+    from .py36 import Python36CompatibleBasePandasDataset as BasePandasDatasetCompat
     from .py36 import Python36CompatibleDataFrame as DataFrameCompat
     from .py36 import Python36CompatibleSeries as SeriesCompat
     from .py36 import Python36CompatibleDataFrameGroupBy as DataFrameGroupByCompat
@@ -27,9 +25,7 @@
     from .py36 import Python36CompatibleRolling as RollingCompat
     from .py36 import Python36CompatibleResampler as ResamplerCompat
 elif PandasCompatVersion.CURRENT == PandasCompatVersion.LATEST:
-    from .latest import (
-        LatestCompatibleBasePandasDataset as BasePandasDatasetCompat,
-    )
+    from .latest import LatestCompatibleBasePandasDataset as BasePandasDatasetCompat
     from .latest import LatestCompatibleDataFrame as DataFrameCompat
     from .latest import LatestCompatibleSeries as SeriesCompat
     from .latest import LatestCompatibleDataFrameGroupBy as DataFrameGroupByCompat
22 changes: 3 additions & 19 deletions modin/_compat/pandas_api/latest/base.py
@@ -224,11 +224,7 @@ def rank(
         )

     def reindex(
-        self,
-        index=None,
-        columns=None,
-        copy=True,
-        **kwargs,
+        self, index=None, columns=None, copy=True, **kwargs,
     ):
         return self._reindex(index=index, columns=columns, copy=copy, **kwargs)

@@ -345,13 +341,7 @@ def set_axis(self, labels, axis=0, inplace=no_default, *, copy=no_default):
         )

     def sem(
-        self,
-        axis=None,
-        skipna=True,
-        level=None,
-        ddof=1,
-        numeric_only=None,
-        **kwargs,
+        self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs,
     ):
         return self._sem(
             axis=axis,
@@ -402,13 +392,7 @@ def skew(
         )

     def std(
-        self,
-        axis=None,
-        skipna=True,
-        level=None,
-        ddof=1,
-        numeric_only=None,
-        **kwargs,
+        self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs,
     ):
         return self._std(
             axis=axis,
13 changes: 2 additions & 11 deletions modin/_compat/pandas_api/py36/base.py
@@ -160,18 +160,9 @@ def rank(
         )

     def reindex(
-        self,
-        index=None,
-        columns=None,
-        copy=True,
-        **kwargs,
+        self, index=None, columns=None, copy=True, **kwargs,
     ):
-        return self._reindex(
-            index=index,
-            columns=columns,
-            copy=copy,
-            **kwargs,
-        )
+        return self._reindex(index=index, columns=columns, copy=copy, **kwargs,)

     def resample(
         self,
5 changes: 1 addition & 4 deletions modin/_compat/pandas_api/py36/io.py
@@ -155,10 +155,7 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):

     return DataFrame(
         query_compiler=FactoryDispatcher.read_parquet(
-            path=path,
-            engine=engine,
-            columns=columns,
-            **kwargs,
+            path=path, engine=engine, columns=columns, **kwargs,
         )
     )
7 changes: 1 addition & 6 deletions modin/_compat/pandas_api/py36/series.py
@@ -61,12 +61,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs):
         return self._idxmin(axis=axis, skipna=skipna)

     def kurt(
-        self,
-        axis=None,
-        skipna=None,
-        level=None,
-        numeric_only=None,
-        **kwargs,
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs,
     ):  # noqa: PR01, RT01, D200
         if axis not in (None, 0, "index", "rows"):
             raise ValueError(f"No axis named {axis} for object type Series")
7 changes: 1 addition & 6 deletions modin/_compat/pandas_api/py36/utils.py
@@ -30,12 +30,7 @@ def create_stat_method(name):
     """

     def stat_method(
-        self,
-        axis=None,
-        skipna=None,
-        level=None,
-        numeric_only=None,
-        **kwargs,
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs,
     ):
         return self._stat_operation(name, axis, skipna, level, numeric_only, **kwargs)
1 change: 0 additions & 1 deletion modin/_compat/pandas_api/py36/window.py
@@ -39,7 +39,6 @@ def __init__(
             axis,
         )

-
 @append_to_docstring("Compatibility layer for 'Python 3.6 pandas' for Rolling.")
 @_inherit_docstrings(pandas.core.window.rolling.Rolling)
 class Python36CompatibleRolling(BaseCompatibleRolling):
10 changes: 3 additions & 7 deletions modin/conftest.py
@@ -372,18 +372,15 @@ def TestReadCSVFixture():
     # each xdist worker spawned in separate process with separate namespace and dataset
     pytest.csvs_names = {file_id: get_unique_filename() for file_id in files_ids}
     # test_read_csv_col_handling, test_read_csv_parsing
-    _make_csv_file(filenames)(
-        filename=pytest.csvs_names["test_read_csv_regular"],
-    )
+    _make_csv_file(filenames)(filename=pytest.csvs_names["test_read_csv_regular"],)
     # test_read_csv_parsing
     _make_csv_file(filenames)(
         filename=pytest.csvs_names["test_read_csv_yes_no"],
         additional_col_values=["Yes", "true", "No", "false"],
     )
     # test_read_csv_col_handling
     _make_csv_file(filenames)(
-        filename=pytest.csvs_names["test_read_csv_blank_lines"],
-        add_blank_lines=True,
+        filename=pytest.csvs_names["test_read_csv_blank_lines"], add_blank_lines=True,
     )
     # test_read_csv_nans_handling
     _make_csv_file(filenames)(
@@ -393,8 +390,7 @@ def TestReadCSVFixture():
     )
     # test_read_csv_error_handling
     _make_csv_file(filenames)(
-        filename=pytest.csvs_names["test_read_csv_bad_lines"],
-        add_bad_lines=True,
+        filename=pytest.csvs_names["test_read_csv_bad_lines"], add_bad_lines=True,
     )

     yield
17 changes: 12 additions & 5 deletions modin/core/dataframe/algebra/binary.py
@@ -13,6 +13,7 @@

 """Module houses builder class for Binary operator."""

+from typing import Any, Callable
 import numpy as np
 import pandas

@@ -23,7 +24,9 @@ class Binary(Operator):
     """Builder class for Binary operator."""

     @classmethod
-    def register(cls, func, join_type="outer", labels="replace"):
+    def register(
+        cls, func: Callable, join_type: str = "outer", labels: str = "replace"
+    ) -> Callable:
         """
         Build template binary operator.

@@ -44,8 +47,13 @@ def caller(
         """

         def caller(
-            query_compiler, other, broadcast=False, *args, dtypes=None, **kwargs
-        ):
+            query_compiler: Any,
+            other: Any,
+            broadcast: bool = False,
+            *args: Any,
+            dtypes: Any | None = None,
+            **kwargs: Any
+        ) -> Any:
             """
             Apply binary `func` to passed operands.

@@ -112,8 +120,7 @@ def caller(
                 )
             else:
                 new_modin_frame = query_compiler._modin_frame.map(
-                    lambda df: func(df, other, *args, **kwargs),
-                    dtypes=dtypes,
+                    lambda df: func(df, other, *args, **kwargs), dtypes=dtypes,
                 )
             return query_compiler.__constructor__(new_modin_frame)
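A portability note on the annotated `caller` above: `dtypes: Any | None` is PEP 604 union syntax, and since annotations in a def are evaluated at definition time unless evaluation is deferred, the interpreter only accepts it on Python 3.10+. A sketch of the two portable spellings (my illustration, not part of the diff):

# With deferred evaluation (PEP 563, Python 3.7+) the `Any | None`
# spelling merely parses as a string, so it is safe pre-3.10;
# typing.Optional[Any] needs no deferral at all.
from __future__ import annotations

from typing import Any, Optional


def caller_pep604(dtypes: Any | None = None) -> Any:
    """Needs Python 3.10+, or the __future__ import above on 3.7-3.9."""
    return dtypes


def caller_portable(dtypes: Optional[Any] = None) -> Any:
    """Evaluates on any supported interpreter, no deferral required."""
    return dtypes

Since this same PR touches the modin/_compat/pandas_api/py36 modules, Optional[Any] may be the safer spelling here.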
5 changes: 3 additions & 2 deletions modin/core/dataframe/algebra/default2pandas/binary.py
@@ -13,6 +13,7 @@

 """Module houses default binary functions builder class."""

+from typing import Any, Callable
 from .default import DefaultMethod

 import pandas
@@ -23,7 +24,7 @@ class BinaryDefault(DefaultMethod):
     """Build default-to-pandas methods which executes binary functions."""

     @classmethod
-    def build_default_to_pandas(cls, fn, fn_name):
+    def build_default_to_pandas(cls, fn: Callable, fn_name: str) -> Callable:
         """
         Build function that do fallback to pandas for passed binary `fn`.

@@ -41,7 +42,7 @@ def build_default_to_pandas(cls, fn, fn_name):
         to the casted to pandas frame.
         """

-        def bin_ops_wrapper(df, other, *args, **kwargs):
+        def bin_ops_wrapper(df: Any, other: Any, *args: Any, **kwargs: Any) -> None:
             """Apply specified binary function to the passed operands."""
             squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop(
                 "squeeze_other", False
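One further observation on `build_default_to_pandas`: both `fn: Callable` and the `-> Callable` return use the bare generic, which mypy reads as Callable[..., Any] and rejects once disallow_any_generics is enabled. A hypothetical tightened signature, with a stand-in body rather than the real fallback machinery:

from typing import Any, Callable


# Sketch only: explicit Callable parameters survive stricter mypy flags.
def build_default_to_pandas_typed(
    fn: Callable[..., Any], fn_name: str
) -> Callable[..., Any]:
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        return fn(*args, **kwargs)

    wrapper.__name__ = fn_name
    return wrapper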
5 changes: 4 additions & 1 deletion modin/core/dataframe/algebra/default2pandas/cat.py
@@ -14,13 +14,16 @@
 """Module houses default applied-on-category functions builder class."""

 from .series import SeriesDefault
+import pandas


 class CatDefault(SeriesDefault):
     """Builder for default-to-pandas methods which is executed under category accessor."""

     @classmethod
-    def frame_wrapper(cls, df):
+    def frame_wrapper(
+        cls, df: pandas.DataFrame
+    ) -> pandas.core.arrays.categorical.CategoricalAccessor:
         """
         Get category accessor of the passed frame.
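For context on the new return annotation: pandas.core.arrays.categorical.CategoricalAccessor is the type behind the `.cat` namespace of a categorical Series, which is what `frame_wrapper` produces from the passed frame. A small standalone illustration, not taken from the diff:

import pandas as pd
from pandas.core.arrays.categorical import CategoricalAccessor

s = pd.Series(["a", "b", "a"], dtype="category")
accessor = s.cat  # the .cat namespace is a CategoricalAccessor
assert isinstance(accessor, CategoricalAccessor)
print(accessor.categories)      # Index(['a', 'b'], dtype='object')
print(accessor.codes.tolist())  # [0, 1, 0]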