Skip to content

Commit 0709691

Browse files
authored
[CHORE]: move utf8 functions from daft-dsl to daft-functions (Eventual-Inc#3101)
This addresses Eventual-Inc#2854 by moving the UTF-8 functions from daft-dsl to daft-functions.
1 parent a271c78 commit 0709691

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+2197
-1635
lines changed

daft/daft/__init__.pyi

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,34 +1095,6 @@ class PyExpr:
10951095
def __repr__(self) -> str: ...
10961096
def __hash__(self) -> int: ...
10971097
def __reduce__(self) -> tuple: ...
1098-
def utf8_endswith(self, pattern: PyExpr) -> PyExpr: ...
1099-
def utf8_startswith(self, pattern: PyExpr) -> PyExpr: ...
1100-
def utf8_contains(self, pattern: PyExpr) -> PyExpr: ...
1101-
def utf8_match(self, pattern: PyExpr) -> PyExpr: ...
1102-
def utf8_split(self, pattern: PyExpr, regex: bool) -> PyExpr: ...
1103-
def utf8_extract(self, pattern: PyExpr, index: int) -> PyExpr: ...
1104-
def utf8_extract_all(self, pattern: PyExpr, index: int) -> PyExpr: ...
1105-
def utf8_replace(self, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
1106-
def utf8_length(self) -> PyExpr: ...
1107-
def utf8_length_bytes(self) -> PyExpr: ...
1108-
def utf8_lower(self) -> PyExpr: ...
1109-
def utf8_upper(self) -> PyExpr: ...
1110-
def utf8_lstrip(self) -> PyExpr: ...
1111-
def utf8_rstrip(self) -> PyExpr: ...
1112-
def utf8_reverse(self) -> PyExpr: ...
1113-
def utf8_capitalize(self) -> PyExpr: ...
1114-
def utf8_left(self, nchars: PyExpr) -> PyExpr: ...
1115-
def utf8_right(self, nchars: PyExpr) -> PyExpr: ...
1116-
def utf8_find(self, substr: PyExpr) -> PyExpr: ...
1117-
def utf8_rpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
1118-
def utf8_lpad(self, length: PyExpr, pad: PyExpr) -> PyExpr: ...
1119-
def utf8_repeat(self, n: PyExpr) -> PyExpr: ...
1120-
def utf8_like(self, pattern: PyExpr) -> PyExpr: ...
1121-
def utf8_ilike(self, pattern: PyExpr) -> PyExpr: ...
1122-
def utf8_substr(self, start: PyExpr, length: PyExpr) -> PyExpr: ...
1123-
def utf8_to_date(self, format: str) -> PyExpr: ...
1124-
def utf8_to_datetime(self, format: str, timezone: str | None = None) -> PyExpr: ...
1125-
def utf8_normalize(self, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool) -> PyExpr: ...
11261098
def struct_get(self, name: str) -> PyExpr: ...
11271099
def map_get(self, key: PyExpr) -> PyExpr: ...
11281100
def partitioning_days(self) -> PyExpr: ...
@@ -1320,6 +1292,40 @@ def list_max(expr: PyExpr) -> PyExpr: ...
13201292
def list_slice(expr: PyExpr, start: PyExpr, end: PyExpr | None = None) -> PyExpr: ...
13211293
def list_chunk(expr: PyExpr, size: int) -> PyExpr: ...
13221294

1295+
# ---
1296+
# expr.utf8 namespace
1297+
# ---
1298+
def utf8_endswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
1299+
def utf8_startswith(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
1300+
def utf8_contains(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
1301+
def utf8_match(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
1302+
def utf8_split(expr: PyExpr, pattern: PyExpr, regex: bool) -> PyExpr: ...
1303+
def utf8_extract(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
1304+
def utf8_extract_all(expr: PyExpr, pattern: PyExpr, index: int) -> PyExpr: ...
1305+
def utf8_replace(expr: PyExpr, pattern: PyExpr, replacement: PyExpr, regex: bool) -> PyExpr: ...
1306+
def utf8_length(expr: PyExpr) -> PyExpr: ...
1307+
def utf8_length_bytes(expr: PyExpr) -> PyExpr: ...
1308+
def utf8_lower(expr: PyExpr) -> PyExpr: ...
1309+
def utf8_upper(expr: PyExpr) -> PyExpr: ...
1310+
def utf8_lstrip(expr: PyExpr) -> PyExpr: ...
1311+
def utf8_rstrip(expr: PyExpr) -> PyExpr: ...
1312+
def utf8_reverse(expr: PyExpr) -> PyExpr: ...
1313+
def utf8_capitalize(expr: PyExpr) -> PyExpr: ...
1314+
def utf8_left(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
1315+
def utf8_right(expr: PyExpr, nchars: PyExpr) -> PyExpr: ...
1316+
def utf8_find(expr: PyExpr, substr: PyExpr) -> PyExpr: ...
1317+
def utf8_rpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
1318+
def utf8_lpad(expr: PyExpr, length: PyExpr, pad: PyExpr) -> PyExpr: ...
1319+
def utf8_repeat(expr: PyExpr, n: PyExpr) -> PyExpr: ...
1320+
def utf8_like(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
1321+
def utf8_ilike(expr: PyExpr, pattern: PyExpr) -> PyExpr: ...
1322+
def utf8_substr(expr: PyExpr, start: PyExpr, length: PyExpr) -> PyExpr: ...
1323+
def utf8_to_date(expr: PyExpr, format: str) -> PyExpr: ...
1324+
def utf8_to_datetime(expr: PyExpr, format: str, timezone: str | None = None) -> PyExpr: ...
1325+
def utf8_normalize(
1326+
expr: PyExpr, remove_punct: bool, lowercase: bool, nfd_unicode: bool, white_space: bool
1327+
) -> PyExpr: ...
1328+
13231329
class PyCatalog:
13241330
@staticmethod
13251331
def new() -> PyCatalog: ...

daft/expressions/expressions.py

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,7 +1887,7 @@ def contains(self, substr: str | Expression) -> Expression:
18871887
Expression: a Boolean expression indicating whether each value contains the provided pattern
18881888
"""
18891889
substr_expr = Expression._to_expression(substr)
1890-
return Expression._from_pyexpr(self._expr.utf8_contains(substr_expr._expr))
1890+
return Expression._from_pyexpr(native.utf8_contains(self._expr, substr_expr._expr))
18911891

18921892
def match(self, pattern: str | Expression) -> Expression:
18931893
"""Checks whether each string matches the given regular expression pattern in a string column
@@ -1917,7 +1917,7 @@ def match(self, pattern: str | Expression) -> Expression:
19171917
Expression: a Boolean expression indicating whether each value matches the provided pattern
19181918
"""
19191919
pattern_expr = Expression._to_expression(pattern)
1920-
return Expression._from_pyexpr(self._expr.utf8_match(pattern_expr._expr))
1920+
return Expression._from_pyexpr(native.utf8_match(self._expr, pattern_expr._expr))
19211921

19221922
def endswith(self, suffix: str | Expression) -> Expression:
19231923
"""Checks whether each string ends with the given pattern in a string column
@@ -1947,7 +1947,7 @@ def endswith(self, suffix: str | Expression) -> Expression:
19471947
Expression: a Boolean expression indicating whether each value ends with the provided pattern
19481948
"""
19491949
suffix_expr = Expression._to_expression(suffix)
1950-
return Expression._from_pyexpr(self._expr.utf8_endswith(suffix_expr._expr))
1950+
return Expression._from_pyexpr(native.utf8_endswith(self._expr, suffix_expr._expr))
19511951

19521952
def startswith(self, prefix: str | Expression) -> Expression:
19531953
"""Checks whether each string starts with the given pattern in a string column
@@ -1977,7 +1977,7 @@ def startswith(self, prefix: str | Expression) -> Expression:
19771977
Expression: a Boolean expression indicating whether each value starts with the provided pattern
19781978
"""
19791979
prefix_expr = Expression._to_expression(prefix)
1980-
return Expression._from_pyexpr(self._expr.utf8_startswith(prefix_expr._expr))
1980+
return Expression._from_pyexpr(native.utf8_startswith(self._expr, prefix_expr._expr))
19811981

19821982
def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
19831983
r"""Splits each string on the given literal or regex pattern, into a list of strings.
@@ -2028,7 +2028,7 @@ def split(self, pattern: str | Expression, regex: bool = False) -> Expression:
20282028
Expression: A List[Utf8] expression containing the string splits for each string in the column.
20292029
"""
20302030
pattern_expr = Expression._to_expression(pattern)
2031-
return Expression._from_pyexpr(self._expr.utf8_split(pattern_expr._expr, regex))
2031+
return Expression._from_pyexpr(native.utf8_split(self._expr, pattern_expr._expr, regex))
20322032

20332033
def concat(self, other: str | Expression) -> Expression:
20342034
"""Concatenates two string expressions together
@@ -2119,7 +2119,7 @@ def extract(self, pattern: str | Expression, index: int = 0) -> Expression:
21192119
`extract_all`
21202120
"""
21212121
pattern_expr = Expression._to_expression(pattern)
2122-
return Expression._from_pyexpr(self._expr.utf8_extract(pattern_expr._expr, index))
2122+
return Expression._from_pyexpr(native.utf8_extract(self._expr, pattern_expr._expr, index))
21232123

21242124
def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
21252125
r"""Extracts the specified match group from all regex matches in each string in a string column.
@@ -2175,7 +2175,7 @@ def extract_all(self, pattern: str | Expression, index: int = 0) -> Expression:
21752175
`extract`
21762176
"""
21772177
pattern_expr = Expression._to_expression(pattern)
2178-
return Expression._from_pyexpr(self._expr.utf8_extract_all(pattern_expr._expr, index))
2178+
return Expression._from_pyexpr(native.utf8_extract_all(self._expr, pattern_expr._expr, index))
21792179

21802180
def replace(
21812181
self,
@@ -2232,7 +2232,9 @@ def replace(
22322232
"""
22332233
pattern_expr = Expression._to_expression(pattern)
22342234
replacement_expr = Expression._to_expression(replacement)
2235-
return Expression._from_pyexpr(self._expr.utf8_replace(pattern_expr._expr, replacement_expr._expr, regex))
2235+
return Expression._from_pyexpr(
2236+
native.utf8_replace(self._expr, pattern_expr._expr, replacement_expr._expr, regex)
2237+
)
22362238

22372239
def length(self) -> Expression:
22382240
"""Retrieves the length for a UTF-8 string column
@@ -2259,7 +2261,7 @@ def length(self) -> Expression:
22592261
Returns:
22602262
Expression: a UInt64 expression with the length of each string
22612263
"""
2262-
return Expression._from_pyexpr(self._expr.utf8_length())
2264+
return Expression._from_pyexpr(native.utf8_length(self._expr))
22632265

22642266
def length_bytes(self) -> Expression:
22652267
"""Retrieves the length for a UTF-8 string column in bytes.
@@ -2286,7 +2288,7 @@ def length_bytes(self) -> Expression:
22862288
Returns:
22872289
Expression: a UInt64 expression with the length of each string
22882290
"""
2289-
return Expression._from_pyexpr(self._expr.utf8_length_bytes())
2291+
return Expression._from_pyexpr(native.utf8_length_bytes(self._expr))
22902292

22912293
def lower(self) -> Expression:
22922294
"""Convert UTF-8 string to all lowercase
@@ -2313,7 +2315,7 @@ def lower(self) -> Expression:
23132315
Returns:
23142316
Expression: a String expression which is `self` lowercased
23152317
"""
2316-
return Expression._from_pyexpr(self._expr.utf8_lower())
2318+
return Expression._from_pyexpr(native.utf8_lower(self._expr))
23172319

23182320
def upper(self) -> Expression:
23192321
"""Convert UTF-8 string to all upper
@@ -2340,7 +2342,7 @@ def upper(self) -> Expression:
23402342
Returns:
23412343
Expression: a String expression which is `self` uppercased
23422344
"""
2343-
return Expression._from_pyexpr(self._expr.utf8_upper())
2345+
return Expression._from_pyexpr(native.utf8_upper(self._expr))
23442346

23452347
def lstrip(self) -> Expression:
23462348
"""Strip whitespace from the left side of a UTF-8 string
@@ -2367,7 +2369,7 @@ def lstrip(self) -> Expression:
23672369
Returns:
23682370
Expression: a String expression which is `self` with leading whitespace stripped
23692371
"""
2370-
return Expression._from_pyexpr(self._expr.utf8_lstrip())
2372+
return Expression._from_pyexpr(native.utf8_lstrip(self._expr))
23712373

23722374
def rstrip(self) -> Expression:
23732375
"""Strip whitespace from the right side of a UTF-8 string
@@ -2394,7 +2396,7 @@ def rstrip(self) -> Expression:
23942396
Returns:
23952397
Expression: a String expression which is `self` with trailing whitespace stripped
23962398
"""
2397-
return Expression._from_pyexpr(self._expr.utf8_rstrip())
2399+
return Expression._from_pyexpr(native.utf8_rstrip(self._expr))
23982400

23992401
def reverse(self) -> Expression:
24002402
"""Reverse a UTF-8 string
@@ -2421,7 +2423,7 @@ def reverse(self) -> Expression:
24212423
Returns:
24222424
Expression: a String expression which is `self` reversed
24232425
"""
2424-
return Expression._from_pyexpr(self._expr.utf8_reverse())
2426+
return Expression._from_pyexpr(native.utf8_reverse(self._expr))
24252427

24262428
def capitalize(self) -> Expression:
24272429
"""Capitalize a UTF-8 string
@@ -2448,7 +2450,7 @@ def capitalize(self) -> Expression:
24482450
Returns:
24492451
Expression: a String expression which is `self` with the first character uppercased and the rest lowercased
24502452
"""
2451-
return Expression._from_pyexpr(self._expr.utf8_capitalize())
2453+
return Expression._from_pyexpr(native.utf8_capitalize(self._expr))
24522454

24532455
def left(self, nchars: int | Expression) -> Expression:
24542456
"""Gets the n (from nchars) left-most characters of each string
@@ -2476,7 +2478,7 @@ def left(self, nchars: int | Expression) -> Expression:
24762478
Expression: a String expression which is the `n` left-most characters of `self`
24772479
"""
24782480
nchars_expr = Expression._to_expression(nchars)
2479-
return Expression._from_pyexpr(self._expr.utf8_left(nchars_expr._expr))
2481+
return Expression._from_pyexpr(native.utf8_left(self._expr, nchars_expr._expr))
24802482

24812483
def right(self, nchars: int | Expression) -> Expression:
24822484
"""Gets the n (from nchars) right-most characters of each string
@@ -2504,7 +2506,7 @@ def right(self, nchars: int | Expression) -> Expression:
25042506
Expression: a String expression which is the `n` right-most characters of `self`
25052507
"""
25062508
nchars_expr = Expression._to_expression(nchars)
2507-
return Expression._from_pyexpr(self._expr.utf8_right(nchars_expr._expr))
2509+
return Expression._from_pyexpr(native.utf8_right(self._expr, nchars_expr._expr))
25082510

25092511
def find(self, substr: str | Expression) -> Expression:
25102512
"""Returns the index of the first occurrence of the substring in each string
@@ -2536,7 +2538,7 @@ def find(self, substr: str | Expression) -> Expression:
25362538
Expression: an Int64 expression with the index of the first occurrence of the substring in each string
25372539
"""
25382540
substr_expr = Expression._to_expression(substr)
2539-
return Expression._from_pyexpr(self._expr.utf8_find(substr_expr._expr))
2541+
return Expression._from_pyexpr(native.utf8_find(self._expr, substr_expr._expr))
25402542

25412543
def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
25422544
"""Right-pads each string by truncating or padding with the character
@@ -2569,7 +2571,7 @@ def rpad(self, length: int | Expression, pad: str | Expression) -> Expression:
25692571
"""
25702572
length_expr = Expression._to_expression(length)
25712573
pad_expr = Expression._to_expression(pad)
2572-
return Expression._from_pyexpr(self._expr.utf8_rpad(length_expr._expr, pad_expr._expr))
2574+
return Expression._from_pyexpr(native.utf8_rpad(self._expr, length_expr._expr, pad_expr._expr))
25732575

25742576
def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
25752577
"""Left-pads each string by truncating on the right or padding with the character
@@ -2602,7 +2604,7 @@ def lpad(self, length: int | Expression, pad: str | Expression) -> Expression:
26022604
"""
26032605
length_expr = Expression._to_expression(length)
26042606
pad_expr = Expression._to_expression(pad)
2605-
return Expression._from_pyexpr(self._expr.utf8_lpad(length_expr._expr, pad_expr._expr))
2607+
return Expression._from_pyexpr(native.utf8_lpad(self._expr, length_expr._expr, pad_expr._expr))
26062608

26072609
def repeat(self, n: int | Expression) -> Expression:
26082610
"""Repeats each string n times
@@ -2630,7 +2632,7 @@ def repeat(self, n: int | Expression) -> Expression:
26302632
Expression: a String expression which is `self` repeated `n` times
26312633
"""
26322634
n_expr = Expression._to_expression(n)
2633-
return Expression._from_pyexpr(self._expr.utf8_repeat(n_expr._expr))
2635+
return Expression._from_pyexpr(native.utf8_repeat(self._expr, n_expr._expr))
26342636

26352637
def like(self, pattern: str | Expression) -> Expression:
26362638
"""Checks whether each string matches the given SQL LIKE pattern, case sensitive
@@ -2661,7 +2663,7 @@ def like(self, pattern: str | Expression) -> Expression:
26612663
Expression: a Boolean expression indicating whether each value matches the provided pattern
26622664
"""
26632665
pattern_expr = Expression._to_expression(pattern)
2664-
return Expression._from_pyexpr(self._expr.utf8_like(pattern_expr._expr))
2666+
return Expression._from_pyexpr(native.utf8_like(self._expr, pattern_expr._expr))
26652667

26662668
def ilike(self, pattern: str | Expression) -> Expression:
26672669
"""Checks whether each string matches the given SQL LIKE pattern, case insensitive
@@ -2692,7 +2694,7 @@ def ilike(self, pattern: str | Expression) -> Expression:
26922694
Expression: a Boolean expression indicating whether each value matches the provided pattern
26932695
"""
26942696
pattern_expr = Expression._to_expression(pattern)
2695-
return Expression._from_pyexpr(self._expr.utf8_ilike(pattern_expr._expr))
2697+
return Expression._from_pyexpr(native.utf8_ilike(self._expr, pattern_expr._expr))
26962698

26972699
def substr(self, start: int | Expression, length: int | Expression | None = None) -> Expression:
26982700
"""Extract a substring from a string, starting at a specified index and extending for a given length.
@@ -2724,7 +2726,7 @@ def substr(self, start: int | Expression, length: int | Expression | None = None
27242726
"""
27252727
start_expr = Expression._to_expression(start)
27262728
length_expr = Expression._to_expression(length)
2727-
return Expression._from_pyexpr(self._expr.utf8_substr(start_expr._expr, length_expr._expr))
2729+
return Expression._from_pyexpr(native.utf8_substr(self._expr, start_expr._expr, length_expr._expr))
27282730

27292731
def to_date(self, format: str) -> Expression:
27302732
"""Converts a string to a date using the specified format
@@ -2755,7 +2757,7 @@ def to_date(self, format: str) -> Expression:
27552757
Returns:
27562758
Expression: a Date expression which is parsed by given format
27572759
"""
2758-
return Expression._from_pyexpr(self._expr.utf8_to_date(format))
2760+
return Expression._from_pyexpr(native.utf8_to_date(self._expr, format))
27592761

27602762
def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
27612763
"""Converts a string to a datetime using the specified format and timezone
@@ -2805,7 +2807,7 @@ def to_datetime(self, format: str, timezone: str | None = None) -> Expression:
28052807
Returns:
28062808
Expression: a DateTime expression which is parsed by given format and timezone
28072809
"""
2808-
return Expression._from_pyexpr(self._expr.utf8_to_datetime(format, timezone))
2810+
return Expression._from_pyexpr(native.utf8_to_datetime(self._expr, format, timezone))
28092811

28102812
def normalize(
28112813
self,
@@ -2849,7 +2851,9 @@ def normalize(
28492851
Returns:
28502852
Expression: a String expression which is normalized.
28512853
"""
2852-
return Expression._from_pyexpr(self._expr.utf8_normalize(remove_punct, lowercase, nfd_unicode, white_space))
2854+
return Expression._from_pyexpr(
2855+
native.utf8_normalize(self._expr, remove_punct, lowercase, nfd_unicode, white_space)
2856+
)
28532857

28542858
def tokenize_encode(
28552859
self,

src/daft-dsl/src/functions/mod.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ pub mod python;
55
pub mod scalar;
66
pub mod sketch;
77
pub mod struct_;
8-
pub mod utf8;
98

109
use std::{
1110
fmt::{Display, Formatter, Result, Write},
@@ -18,15 +17,11 @@ use python::PythonUDF;
1817
pub use scalar::*;
1918
use serde::{Deserialize, Serialize};
2019

21-
use self::{
22-
map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr,
23-
utf8::Utf8Expr,
24-
};
20+
use self::{map::MapExpr, partitioning::PartitioningExpr, sketch::SketchExpr, struct_::StructExpr};
2521
use crate::{Expr, ExprRef, Operator};
2622

2723
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
2824
pub enum FunctionExpr {
29-
Utf8(Utf8Expr),
3025
Map(MapExpr),
3126
Sketch(SketchExpr),
3227
Struct(StructExpr),
@@ -49,7 +44,6 @@ impl FunctionExpr {
4944
#[inline]
5045
fn get_evaluator(&self) -> &dyn FunctionEvaluator {
5146
match self {
52-
Self::Utf8(expr) => expr.get_evaluator(),
5347
Self::Map(expr) => expr.get_evaluator(),
5448
Self::Sketch(expr) => expr.get_evaluator(),
5549
Self::Struct(expr) => expr.get_evaluator(),

0 commit comments

Comments
 (0)