Skip to content

Commit

Permalink
[SPARK-43476][SPARK-43477][SPARK-43478][PS] Support StringMethods f…
Browse files Browse the repository at this point in the history
…or pandas 2.0.0 and above

### What changes were proposed in this pull request?

This PR proposes to support `StringMethods` for pandas 2.0.0 and above.

### Why are the changes needed?

Support the latest pandas for pandas API on Spark.

### Does this PR introduce _any_ user-facing change?

`StringMethods.split`, `StringMethods.rsplit` and `StringMethods.replace` are available with the latest pandas.

### How was this patch tested?

Enabling the existing UTs.

Closes #42312 from itholic/pandas_str_split.

Lead-authored-by: itholic <[email protected]>
Co-authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
  • Loading branch information
2 people authored and zhengruifeng committed Aug 11, 2023
1 parent 73b0376 commit 6729f49
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 28 deletions.
1 change: 1 addition & 0 deletions python/docs/source/migration_guide/pyspark_upgrade.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Upgrading from PySpark 3.5 to 4.0
* In Spark 4.0, ``include_start`` and ``include_end`` parameters from ``Series.between_time`` have been removed from pandas API on Spark, use ``inclusive`` instead.
* In Spark 4.0, the various datetime attributes of ``DatetimeIndex`` (``day``, ``month``, ``year`` etc.) are now ``int32`` instead of ``int64`` from pandas API on Spark.
* In Spark 4.0, ``sort_columns`` parameter from ``DataFrame.plot`` and ``Series.plot`` has been removed from pandas API on Spark.
* In Spark 4.0, the default value of ``regex`` parameter for ``Series.str.replace`` has been changed from ``True`` to ``False`` from pandas API on Spark. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal.


Upgrading from PySpark 3.3 to 3.4
Expand Down
21 changes: 8 additions & 13 deletions python/pyspark/pandas/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"""
String functions on pandas-on-Spark Series
"""
import warnings
from typing import (
Any,
Callable,
Expand Down Expand Up @@ -1516,7 +1515,7 @@ def replace(
n: int = -1,
case: Optional[bool] = None,
flags: int = 0,
regex: bool = True,
regex: bool = False,
) -> "ps.Series":
"""
Replace occurrences of pattern/regex in the Series with some other
Expand Down Expand Up @@ -1580,35 +1579,31 @@ def replace(
Reverse every lowercase alphabetic word:
>>> repl = lambda m: m.group(0)[::-1]
>>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
>>> ps.Series(['foo 123', 'bar baz', np.nan]).str.replace('[a-z]+', repl, regex=True)
0 oof 123
1 rab zab
2 None
dtype: object
Using regex groups (extract second group and swap case):
>>> pat = r"(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> pat = "(?P<one>\\w+) (?P<two>\\w+) (?P<three>\\w+)"
>>> repl = lambda m: m.group('two').swapcase()
>>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
>>> ps.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl, regex=True)
0 tWO
1 bAR
dtype: object
Using a compiled regex with flags:
>>> import re
>>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
>>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
>>> regex_pat = re.compile('FUZ', flags=re.IGNORECASE)
>>> ps.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
0 foo
1 bar
2 None
dtype: object
"""
warnings.warn(
"Default value of `regex` will be changed to `False` instead of `True` in 4.0.0.",
FutureWarning,
)

def pandas_replace(s) -> ps.Series[str]: # type: ignore[no-untyped-def]
return s.str.replace(pat, repl, n=n, case=case, flags=flags, regex=regex)
Expand Down Expand Up @@ -2027,7 +2022,7 @@ def split(

@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
return s.str.split(pat, n)
return s.str.split(pat, n=n)

psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
Expand Down Expand Up @@ -2174,7 +2169,7 @@ def rsplit(

@pandas_udf(returnType=return_type) # type: ignore[call-overload]
def pudf(s: pd.Series) -> pd.Series:
return s.str.rsplit(pat, n)
return s.str.rsplit(pat, n=n)

psser = self._data._with_new_scol(
pudf(self._data.spark.column).alias(self._data._internal.data_spark_column_names[0]),
Expand Down
18 changes: 3 additions & 15 deletions python/pyspark/pandas/tests/test_series_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,6 @@ def test_string_repeat(self):
with self.assertRaises(TypeError):
self.check_func(lambda x: x.str.repeat(repeats=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43476): Enable SeriesStringTests.test_string_replace for pandas 2.0.0.",
)
def test_string_replace(self):
self.check_func(lambda x: x.str.replace("a.", "xx", regex=True))
self.check_func(lambda x: x.str.replace("a.", "xx", regex=False))
Expand All @@ -259,10 +255,10 @@ def test_string_replace(self):
def repl(m):
return m.group(0)[::-1]

self.check_func(lambda x: x.str.replace(r"[a-z]+", repl))
self.check_func(lambda x: x.str.replace("[a-z]+", repl, regex=True))
# compiled regex with flags
regex_pat = re.compile(r"WHITESPACE", flags=re.IGNORECASE)
self.check_func(lambda x: x.str.replace(regex_pat, "---"))
regex_pat = re.compile("WHITESPACE", flags=re.IGNORECASE)
self.check_func(lambda x: x.str.replace(regex_pat, "---", regex=True))

def test_string_rfind(self):
self.check_func(lambda x: x.str.rfind("a"))
Expand Down Expand Up @@ -297,10 +293,6 @@ def test_string_slice_replace(self):
self.check_func(lambda x: x.str.slice_replace(stop=2, repl="X"))
self.check_func(lambda x: x.str.slice_replace(start=1, stop=3, repl="X"))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43478): Enable SeriesStringTests.test_string_split for pandas 2.0.0.",
)
def test_string_split(self):
self.check_func_on_series(lambda x: repr(x.str.split()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.split(r"p*")), self.pser[:-1])
Expand All @@ -311,10 +303,6 @@ def test_string_split(self):
with self.assertRaises(NotImplementedError):
self.check_func(lambda x: x.str.split(expand=True))

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43477): Enable SeriesStringTests.test_string_rsplit for pandas 2.0.0.",
)
def test_string_rsplit(self):
self.check_func_on_series(lambda x: repr(x.str.rsplit()), self.pser[:-1])
self.check_func_on_series(lambda x: repr(x.str.rsplit(r"p*")), self.pser[:-1])
Expand Down

0 comments on commit 6729f49

Please sign in to comment.