Skip to content

Commit

Permalink
[CHORE]: move utf8 functions from daft-dsl to daft-functions (#3101)
Browse files Browse the repository at this point in the history
This relates to #2854: it moves the UTF-8 functions from daft-dsl to
daft-functions.
  • Loading branch information
ConeyLiu authored Nov 15, 2024
1 parent a271c78 commit 0709691
Show file tree
Hide file tree
Showing 52 changed files with 2,197 additions and 1,635 deletions.
62 changes: 34 additions & 28 deletions daft/daft/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1095,34 +1095,6 @@ class PyExpr:
def __repr__(self) -> str: ...
def __hash__(self) -> int: ...
def __reduce__(self) -> tuple: ...
def utf8_endswith(self, pattern: PyExpr) -> PyExpr: ...
def utf8_startswith(self, pattern: PyExpr) -> PyExpr: ...
def utf8_contains(self, pattern: PyExpr) -> PyExpr: ...
def utf8_match(self, pattern: PyExpr) -> PyExpr: ...
def utf8_split(self, pattern: PyExpr, regex: bool) -> PyExpr: ...
def utf8_extract(self, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_extract_all(self, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_replace(self, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
def utf8_length(self) -> PyExpr: ...
def utf8_length_bytes(self) -> PyExpr: ...
def utf8_lower(self) -> PyExpr: ...
def utf8_upper(self) -> PyExpr: ...
def utf8_lstrip(self) -> PyExpr: ...
def utf8_rstrip(self) -> PyExpr: ...
def utf8_reverse(self) -> PyExpr: ...
def utf8_capitalize(self) -> PyExpr: ...
def utf8_left(self, nchars: PyExpr) -> PyExpr: ...
def utf8_right(self, nchars: PyExpr) -> PyExpr: ...
def utf8_find(self, substr: PyExpr) -> PyExpr: ...
def utf8_rpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_lpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_repeat(self, n: PyExpr) -> PyExpr: ...
def utf8_like(self, pattern: PyExpr) -> PyExpr: ...
def utf8_ilike(self, pattern: PyExpr) -> PyExpr: ...
def utf8_substr(self, start: PyExpr, length: PyExpr) -> PyExpr: ...
def utf8_to_date(self, format: str) -> PyExpr: ...
def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PyExpr: ...
def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PyExpr: ...
def struct_get(self, name: str) -> PyExpr: ...
def map_get(self, key: PyExpr) -> PyExpr: ...
def partitioning_days(self) -> PyExpr: ...
Expand Down Expand Up @@ -1320,6 +1292,40 @@ def list_max(expr: PyExpr) -> PyExpr: ...
def list_slice(expr: PyExpr, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ...
def list_chunk(expr: PyExpr, size: int) -> PyExpr: ...

# ---
# expr.utf8 namespace
# ---
# Module-level UTF-8 string functions. Each one takes the target expression as
# the explicit first argument `expr` and returns a new PyExpr.
# Arguments annotated as PyExpr are themselves expressions; `regex`, `index`,
# `format`, `timezone` and the `utf8_normalize` flags are plain Python values.
def utf8_endswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_startswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_contains(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_match(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_split(expr: PyExpr, pattern: PyExpr, regex: bool) -> PyExpr: ...
def utf8_extract(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_extract_all(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
def utf8_replace(expr: PyExpr, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
def utf8_length(expr: PyExpr) -> PyExpr: ...
def utf8_length_bytes(expr: PyExpr) -> PyExpr: ...
def utf8_lower(expr: PyExpr) -> PyExpr: ...
def utf8_upper(expr: PyExpr) -> PyExpr: ...
def utf8_lstrip(expr: PyExpr) -> PyExpr: ...
def utf8_rstrip(expr: PyExpr) -> PyExpr: ...
def utf8_reverse(expr: PyExpr) -> PyExpr: ...
def utf8_capitalize(expr: PyExpr) -> PyExpr: ...
def utf8_left(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
def utf8_right(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
def utf8_find(expr: PyExpr, substr: PyExpr) -> PyExpr: ...
def utf8_rpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_lpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
def utf8_repeat(expr: PyExpr, n: PyExpr) -> PyExpr: ...
def utf8_like(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_ilike(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
def utf8_substr(expr: PyExpr, start: PyExpr, length: PyExpr) -> PyExpr: ...
def utf8_to_date(expr: PyExpr, format: str) -> PyExpr: ...
# `timezone` is optional and defaults to None.
def utf8_to_datetime(expr: PyExpr, format: str, timezone: str | None = None) -> PyExpr: ...
def utf8_normalize(
expr: PyExpr, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool
) -> PyExpr: ...

class PyCatalog:
@staticmethod
def new() -> PyCatalog: ...
Expand Down
60 changes: 32 additions & 28 deletions daft/expressions/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1887,7 +1887,7 @@ def contains(self, substr: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value contains the provided pattern
"""
substr_expr = Expression._to_expression(substr)
return Expression._from_pyexpr(self._expr.utf8_contains(substr_expr._expr))
return Expression._from_pyexpr(native.utf8_contains(self._expr, substr_expr._expr))

def match(self, pattern: str | Expression) -> Expression:
"""Checks whether each string matches the given regular expression pattern in a string column
Expand Down Expand Up @@ -1917,7 +1917,7 @@ def match(self, pattern: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value matches the provided pattern
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_match(pattern_expr._expr))
return Expression._from_pyexpr(native.utf8_match(self._expr, pattern_expr._expr))

def endswith(self, suffix: str | Expression) -> Expression:
"""Checks whether each string ends with the given pattern in a string column
Expand Down Expand Up @@ -1947,7 +1947,7 @@ def endswith(self, suffix: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value ends with the provided pattern
"""
suffix_expr = Expression._to_expression(suffix)
return Expression._from_pyexpr(self._expr.utf8_endswith(suffix_expr._expr))
return Expression._from_pyexpr(native.utf8_endswith(self._expr, suffix_expr._expr))

def startswith(self, prefix: str | Expression) -> Expression:
"""Checks whether each string starts with the given pattern in a string column
Expand Down Expand Up @@ -1977,7 +1977,7 @@ def startswith(self, prefix: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value starts with the provided pattern
"""
prefix_expr = Expression._to_expression(prefix)
return Expression._from_pyexpr(self._expr.utf8_startswith(prefix_expr._expr))
return Expression._from_pyexpr(native.utf8_startswith(self._expr, prefix_expr._expr))

def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
r"""Splits each string on the given literal or regex pattern, into a list of strings.
Expand Down Expand Up @@ -2028,7 +2028,7 @@ def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
Expression: A List[Utf8] expression containing the string splits for each string in the column.
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_split(pattern_expr._expr, regex))
return Expression._from_pyexpr(native.utf8_split(self._expr, pattern_expr._expr, regex))

def concat(self, other: str | Expression) -> Expression:
"""Concatenates two string expressions together
Expand Down Expand Up @@ -2119,7 +2119,7 @@ def extract(self, pattern: str | Expression, index: int = 0) -> Expression:
`extract_all`
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_extract(pattern_expr._expr, index))
return Expression._from_pyexpr(native.utf8_extract(self._expr, pattern_expr._expr, index))

def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
r"""Extracts the specified match group from all regex matches in each string in a string column.
Expand Down Expand Up @@ -2175,7 +2175,7 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
`extract`
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_extract_all(pattern_expr._expr, index))
return Expression._from_pyexpr(native.utf8_extract_all(self._expr, pattern_expr._expr, index))

def replace(
self,
Expand Down Expand Up @@ -2232,7 +2232,9 @@ def replace(
"""
pattern_expr = Expression._to_expression(pattern)
replacement_expr = Expression._to_expression(replacement)
return Expression._from_pyexpr(self._expr.utf8_replace(pattern_expr._expr, replacement_expr._expr, regex))
return Expression._from_pyexpr(
native.utf8_replace(self._expr, pattern_expr._expr, replacement_expr._expr, regex)
)

def length(self) -> Expression:
"""Retrieves the length for a UTF-8 string column
Expand All @@ -2259,7 +2261,7 @@ def length(self) -> Expression:
Returns:
Expression: an UInt64 expression with the length of each string
"""
return Expression._from_pyexpr(self._expr.utf8_length())
return Expression._from_pyexpr(native.utf8_length(self._expr))

def length_bytes(self) -> Expression:
"""Retrieves the length for a UTF-8 string column in bytes.
Expand All @@ -2286,7 +2288,7 @@ def length_bytes(self) -> Expression:
Returns:
Expression: an UInt64 expression with the length of each string
"""
return Expression._from_pyexpr(self._expr.utf8_length_bytes())
return Expression._from_pyexpr(native.utf8_length_bytes(self._expr))

def lower(self) -> Expression:
"""Convert UTF-8 string to all lowercase
Expand All @@ -2313,7 +2315,7 @@ def lower(self) -> Expression:
Returns:
Expression: a String expression which is `self` lowercased
"""
return Expression._from_pyexpr(self._expr.utf8_lower())
return Expression._from_pyexpr(native.utf8_lower(self._expr))

def upper(self) -> Expression:
"""Convert UTF-8 string to all upper
Expand All @@ -2340,7 +2342,7 @@ def upper(self) -> Expression:
Returns:
Expression: a String expression which is `self` uppercased
"""
return Expression._from_pyexpr(self._expr.utf8_upper())
return Expression._from_pyexpr(native.utf8_upper(self._expr))

def lstrip(self) -> Expression:
"""Strip whitespace from the left side of a UTF-8 string
Expand All @@ -2367,7 +2369,7 @@ def lstrip(self) -> Expression:
Returns:
Expression: a String expression which is `self` with leading whitespace stripped
"""
return Expression._from_pyexpr(self._expr.utf8_lstrip())
return Expression._from_pyexpr(native.utf8_lstrip(self._expr))

def rstrip(self) -> Expression:
"""Strip whitespace from the right side of a UTF-8 string
Expand All @@ -2394,7 +2396,7 @@ def rstrip(self) -> Expression:
Returns:
Expression: a String expression which is `self` with trailing whitespace stripped
"""
return Expression._from_pyexpr(self._expr.utf8_rstrip())
return Expression._from_pyexpr(native.utf8_rstrip(self._expr))

def reverse(self) -> Expression:
"""Reverse a UTF-8 string
Expand All @@ -2421,7 +2423,7 @@ def reverse(self) -> Expression:
Returns:
Expression: a String expression which is `self` reversed
"""
return Expression._from_pyexpr(self._expr.utf8_reverse())
return Expression._from_pyexpr(native.utf8_reverse(self._expr))

def capitalize(self) -> Expression:
"""Capitalize a UTF-8 string
Expand All @@ -2448,7 +2450,7 @@ def capitalize(self) -> Expression:
Returns:
Expression: a String expression which is `self` uppercased with the first character and lowercased the rest
"""
return Expression._from_pyexpr(self._expr.utf8_capitalize())
return Expression._from_pyexpr(native.utf8_capitalize(self._expr))

def left(self, nchars: int | Expression) -> Expression:
"""Gets the n (from nchars) left-most characters of each string
Expand Down Expand Up @@ -2476,7 +2478,7 @@ def left(self, nchars: int | Expression) -> Expression:
Expression: a String expression which is the `n` left-most characters of `self`
"""
nchars_expr = Expression._to_expression(nchars)
return Expression._from_pyexpr(self._expr.utf8_left(nchars_expr._expr))
return Expression._from_pyexpr(native.utf8_left(self._expr, nchars_expr._expr))

def right(self, nchars: int | Expression) -> Expression:
"""Gets the n (from nchars) right-most characters of each string
Expand Down Expand Up @@ -2504,7 +2506,7 @@ def right(self, nchars: int | Expression) -> Expression:
Expression: a String expression which is the `n` right-most characters of `self`
"""
nchars_expr = Expression._to_expression(nchars)
return Expression._from_pyexpr(self._expr.utf8_right(nchars_expr._expr))
return Expression._from_pyexpr(native.utf8_right(self._expr, nchars_expr._expr))

def find(self, substr: str | Expression) -> Expression:
"""Returns the index of the first occurrence of the substring in each string
Expand Down Expand Up @@ -2536,7 +2538,7 @@ def find(self, substr: str | Expression) -> Expression:
Expression: an Int64 expression with the index of the first occurrence of the substring in each string
"""
substr_expr = Expression._to_expression(substr)
return Expression._from_pyexpr(self._expr.utf8_find(substr_expr._expr))
return Expression._from_pyexpr(native.utf8_find(self._expr, substr_expr._expr))

def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""Right-pads each string by truncating or padding with the character
Expand Down Expand Up @@ -2569,7 +2571,7 @@ def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""
length_expr = Expression._to_expression(length)
pad_expr = Expression._to_expression(pad)
return Expression._from_pyexpr(self._expr.utf8_rpad(length_expr._expr, pad_expr._expr))
return Expression._from_pyexpr(native.utf8_rpad(self._expr, length_expr._expr, pad_expr._expr))

def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""Left-pads each string by truncating on the right or padding with the character
Expand Down Expand Up @@ -2602,7 +2604,7 @@ def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
"""
length_expr = Expression._to_expression(length)
pad_expr = Expression._to_expression(pad)
return Expression._from_pyexpr(self._expr.utf8_lpad(length_expr._expr, pad_expr._expr))
return Expression._from_pyexpr(native.utf8_lpad(self._expr, length_expr._expr, pad_expr._expr))

def repeat(self, n: int | Expression) -> Expression:
"""Repeats each string n times
Expand Down Expand Up @@ -2630,7 +2632,7 @@ def repeat(self, n: int | Expression) -> Expression:
Expression: a String expression which is `self` repeated `n` times
"""
n_expr = Expression._to_expression(n)
return Expression._from_pyexpr(self._expr.utf8_repeat(n_expr._expr))
return Expression._from_pyexpr(native.utf8_repeat(self._expr, n_expr._expr))

def like(self, pattern: str | Expression) -> Expression:
"""Checks whether each string matches the given SQL LIKE pattern, case sensitive
Expand Down Expand Up @@ -2661,7 +2663,7 @@ def like(self, pattern: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value matches the provided pattern
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_like(pattern_expr._expr))
return Expression._from_pyexpr(native.utf8_like(self._expr, pattern_expr._expr))

def ilike(self, pattern: str | Expression) -> Expression:
"""Checks whether each string matches the given SQL LIKE pattern, case insensitive
Expand Down Expand Up @@ -2692,7 +2694,7 @@ def ilike(self, pattern: str | Expression) -> Expression:
Expression: a Boolean expression indicating whether each value matches the provided pattern
"""
pattern_expr = Expression._to_expression(pattern)
return Expression._from_pyexpr(self._expr.utf8_ilike(pattern_expr._expr))
return Expression._from_pyexpr(native.utf8_ilike(self._expr, pattern_expr._expr))

def substr(self, start: int | Expression, length: int | Expression | None = None) -> Expression:
"""Extract a substring from a string, starting at a specified index and extending for a given length.
Expand Down Expand Up @@ -2724,7 +2726,7 @@ def substr(self, start: int | Expression, length: int | Expression | None = None
"""
start_expr = Expression._to_expression(start)
length_expr = Expression._to_expression(length)
return Expression._from_pyexpr(self._expr.utf8_substr(start_expr._expr, length_expr._expr))
return Expression._from_pyexpr(native.utf8_substr(self._expr, start_expr._expr, length_expr._expr))

def to_date(self, format: str) -> Expression:
"""Converts a string to a date using the specified format
Expand Down Expand Up @@ -2755,7 +2757,7 @@ def to_date(self, format: str) -> Expression:
Returns:
Expression: a Date expression which is parsed by given format
"""
return Expression._from_pyexpr(self._expr.utf8_to_date(format))
return Expression._from_pyexpr(native.utf8_to_date(self._expr, format))

def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
"""Converts a string to a datetime using the specified format and timezone
Expand Down Expand Up @@ -2805,7 +2807,7 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
Returns:
Expression: a DateTime expression which is parsed by given format and timezone
"""
return Expression._from_pyexpr(self._expr.utf8_to_datetime(format, timezone))
return Expression._from_pyexpr(native.utf8_to_datetime(self._expr, format, timezone))

def normalize(
self,
Expand Down Expand Up @@ -2849,7 +2851,9 @@ def normalize(
Returns:
Expression: a String expression which is normalized.
"""
return Expression._from_pyexpr(self._expr.utf8_normalize(remove_punct, lowercase, nfd_unicode, white_space))
return Expression._from_pyexpr(
native.utf8_normalize(self._expr, remove_punct, lowercase, nfd_unicode, white_space)
)

def tokenize_encode(
self,
Expand Down
8 changes: 1 addition & 7 deletions src/daft-dsl/src/functions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ pub mod python;
pub mod scalar;
pub mod sketch;
pub mod struct_;
pub mod utf8;

use std::{
fmt::{Display, Formatter, Result, Write},
Expand All @@ -18,15 +17,11 @@ use python::PythonUDF;
pub use scalar::*;
use serde::{Deserialize, Serialize};

use self::{
map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr,
utf8::Utf8Expr,
};
use self::{map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr};
use crate::{Expr, ExprRef, Operator};

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum FunctionExpr {
Utf8(Utf8Expr),
Map(MapExpr),
Sketch(SketchExpr),
Struct(StructExpr),
Expand All @@ -49,7 +44,6 @@ impl FunctionExpr {
#[inline]
fn get_evaluator(&self) -> &dyn FunctionEvaluator {
match self {
Self::Utf8(expr) => expr.get_evaluator(),
Self::Map(expr) => expr.get_evaluator(),
Self::Sketch(expr) => expr.get_evaluator(),
Self::Struct(expr) => expr.get_evaluator(),
Expand Down
Loading

0 comments on commit 0709691

Please sign in to comment.