Skip to content

Commit

Permalink
[SPARK-43476][SPARK-43477][SPARK-43478][PS] Support StringMethods f…
Browse files Browse the repository at this point in the history
…or pandas 2.0.0 and above

### What changes were proposed in this pull request?

This PR proposes to support `StringMethods` for pandas 2.0.0 and above.

### Why are the changes needed?

Support the latest pandas for pandas API on Spark.

### Does this PR introduce _any_ user-facing change?

`StringMethods.split`, `StringMethods.rsplit` and `StringMethods.replace` are available with the latest pandas.

### How was this patch tested?

Enabling the existing UTs.

Closes #42312 from itholic/pandas_str_split.

Lead-authored-by: itholic <[email protected]>
Co-authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
  • Loading branch information
2 people authored and zhengruifeng committed Aug 11, 2023
1 parent 73b0376 commit 6729f49
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 28 deletions.
1 change: 1 addition & 0 deletions python/docs/source/migration_guide/pyspark_upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead.
* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark.
* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and ``Series.plot`` has been removed from pandas API on Spark.
* In Spark 4.0, the default value of ``regex`` parameter for ``Series.str.replace`` has been changed from ``True`` to ``False`` from pandas API on Spark. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal.


Upgrading from PySpark 3.3 to 3.4
Expand Down
21 changes: 8 additions & 13 deletions python/pyspark/pandas/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"""
String functions on pandas-on-Spark Series
"""
import warnings
from typing import (
Any,
Callable,
Expand Down Expand Up @@ -1516,7 +1515,7 @@ def replace(
n: int = -1,
case: Optional[bool] = None,
flags: int = 0,
regex: bool = True,
regex: bool = False,
) -> "ps.Series":
"""
Replace occurrences of pattern/regex in the Series with some other
Expand Down Expand Up @@ -1580,35 +1579,31 @@ def replace(
Reverse every lowercase alphabetic word:
>>> repl = lambda m: m.group(0)[::-1]
>>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
>>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace('[a-z]+', repl, regex=True)
0 oof 123
1 rab zab
2 None
dtype: object
Using regex groups (extract second group and swap case):
>>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> pat = "(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> repl = lambda m: m.group('two').swapcase()
>>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
>>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl, regex=True)
0 tWO
1 bAR
dtype: object
Using a compiled regex with flags:
>>> import re
>>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
>>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
>>> regex_pat = re.compile('FUZ', flags=re.IGNORECASE)
>>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
0 foo
1 bar
2 None
dtype: object
"""
warnings.warn(
"Default value of `regex` will be changed to `False` instead of `True` in 4.0.0.",
FutureWarning,
)

def pandas_replace(s) -> ps.Series[str]: # type: ignore[no-untyped-def]
return s.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)
Expand Down Expand Up @@ -2027,7 +2022,7 @@ def split(

@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
return s.str.split(pat, n)
return s.str.split(pat, n=n)

psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
Expand Down Expand Up @@ -2174,7 +2169,7 @@ def rsplit(

@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
return s.str.rsplit(pat, n)
return s.str.rsplit(pat, n=n)

psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
Expand Down
18 changes: 3 additions & 15 deletions python/pyspark/pandas/tests/test_series_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,6 @@ def test_string_repeat(self):
with self.assertRaises(TypeError):
self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43476): Enable SeriesStringTests.test_string_replace for pandas 2.0.0.",
)
def test_string_replace(self):
self.check_func(lambda x: x.str.replace("a.", "xx", regex=True))
self.check_func(lambda x: x.str.replace("a.", "xx", regex=False))
Expand All @@ -259,10 +255,10 @@ def test_string_replace(self):
def repl(m):
return m.group(0)[::-1]

self.check_func(lambda x: x.str.replace(r"[a-z]+", repl))
self.check_func(lambda x: x.str.replace("[a-z]+", repl, regex=True))
# compiled regex with flags
regex_pat = re.compile(r"WHITESPACE", flags=re.IGNORECASE)
self.check_func(lambda x: x.str.replace(regex_pat, "---"))
regex_pat = re.compile("WHITESPACE", flags=re.IGNORECASE)
self.check_func(lambda x: x.str.replace(regex_pat, "---", regex=True))

def test_string_rfind(self):
self.check_func(lambda x: x.str.rfind("a"))
Expand Down Expand Up @@ -297,10 +293,6 @@ def test_string_slice_replace(self):
self.check_func(lambda x: x.str.slice_replace(stop=2, repl="X"))
self.check_func(lambda x: x.str.slice_replace(start=1, stop=3, repl="X"))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43478): Enable SeriesStringTests.test_string_split for pandas 2.0.0.",
)
def test_string_split(self):
self.check_func_on_series(lambda x: repr(x.str.split()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.split(r"p*")), self.pser[:-1])
Expand All @@ -311,10 +303,6 @@ def test_string_split(self):
with self.assertRaises(NotImplementedError):
self.check_func(lambda x: x.str.split(expand=True))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43477): Enable SeriesStringTests.test_string_rsplit for pandas 2.0.0.",
)
def test_string_rsplit(self):
self.check_func_on_series(lambda x: repr(x.str.rsplit()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.rsplit(r"p*")), self.pser[:-1])
Expand Down

0 comments on commit 6729f49

Please sign in to comment.