Skip to content

Commit 5693973

Browse files
authored
Merge branch 'main' into ignore_corrupt
2 parents db5b237 + bf3fb1c commit 5693973

50 files changed

Lines changed: 560 additions & 220 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ contact_links:
44
url: https://github.com/Eventual-Inc/Daft/discussions/new/choose
55
about: Please ask questions here.
66
- name: Slack
7-
url: https://dist-data.slack.com/archives/C052CA6Q9N1
7+
url: https://daft.ai/slack
88
about: Or the `#daft-dev` channel in the Daft Slack.

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ Daft has an Apache 2.0 license - please see the LICENSE file.
125125
:alt: Coverage
126126

127127
.. |Slack| image:: https://img.shields.io/badge/slack-@distdata-purple.svg?logo=slack
128-
:target: https://join.slack.com/t/dist-data/shared_invite/zt-3rh9jr9iv-tmmTNOlQpfvhEy2NTMWS_w
128+
:target: https://daft.ai/slack
129129
:alt: slack community
130130

131131
.. |TrendShift| image:: https://trendshift.io/api/badge/repositories/8239

daft/daft/__init__.pyi

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,6 +1536,7 @@ class PySchema:
15361536
class PyExpr:
15371537
def alias(self, name: str) -> PyExpr: ...
15381538
def cast(self, dtype: PyDataType) -> PyExpr: ...
1539+
def try_cast(self, dtype: PyDataType) -> PyExpr: ...
15391540
def if_else(self, if_true: PyExpr, if_false: PyExpr) -> PyExpr: ...
15401541
def count(self, mode: CountMode) -> PyExpr: ...
15411542
def count_distinct(self) -> PyExpr: ...
@@ -1830,6 +1831,7 @@ class PySeries:
18301831
def agg_list(self) -> PySeries: ...
18311832
def agg_set(self) -> PySeries: ...
18321833
def cast(self, dtype: PyDataType) -> PySeries: ...
1834+
def try_cast(self, dtype: PyDataType) -> PySeries: ...
18331835
def pow(self, exp: float) -> PySeries: ...
18341836
def log2(self) -> PySeries: ...
18351837
def log10(self) -> PySeries: ...
@@ -2808,8 +2810,8 @@ class PyFileReference:
28082810
def writable(self) -> bool: ...
28092811
def path(self) -> str: ...
28102812
def name(self) -> str: ...
2811-
def offset(self) -> int | None: ...
2812-
def length(self) -> int | None: ...
2813+
def position(self) -> int | None: ...
2814+
def size(self) -> int | None: ...
28132815

28142816
class PyDaftFile:
28152817
def __init__(self, path: str | None = None, data: bytes | None = None) -> None: ...

daft/expressions/expressions.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,19 @@ def cast(self, dtype: DataTypeLike) -> Expression:
497497

498498
return cast(self, dtype)
499499

500+
def try_cast(self, dtype: DataTypeLike) -> Expression:
    """Cast this expression to ``dtype``, producing null where the conversion fails.

    Method form of `daft.functions.try_cast`. In contrast to `cast`, a value that
    cannot be converted does not raise an error; it becomes null instead.

    Tip: See Also
        [`daft.functions.try_cast`](https://docs.daft.ai/en/stable/api/functions/try_cast/)
    """
    # Imported inside the method (presumably to avoid a circular import between
    # the expressions and functions modules -- confirm before hoisting).
    from daft.functions import try_cast as _try_cast_fn

    return _try_cast_fn(self, dtype)
512+
500513
if TYPE_CHECKING:
501514

502515
def as_int8(self) -> Expression: ...

daft/expressions/pyarrow_visitor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ def visit_cast(self, expr: Expression, dtype: DataType) -> pc.Expression:
3838
pc_type = dtype.to_arrow_dtype()
3939
return pc_expr.cast(pc_type)
4040

41+
def visit_try_cast(self, expr: Expression, dtype: DataType) -> pc.Expression:
    """Translate a try_cast into a pyarrow cast issued with ``safe=True``.

    The child expression is visited first, the target dtype is converted with
    ``to_arrow_dtype()``, and the visited expression is then cast to that type.

    NOTE(review): try_cast is documented as returning null on failure; confirm
    that a pyarrow cast with ``safe=True`` gives that outcome rather than raising.
    """
    operand = self.visit(expr)
    target_type = dtype.to_arrow_dtype()
    return operand.cast(target_type, safe=True)
46+
4147
def visit_list(self, items: list[Expression]) -> pc.Expression:
4248
raise ValueError("pyarrow.compute does not have a make_list function.")
4349

daft/expressions/visitor.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ class ExpressionVisitor(ABC, Generic[R]):
3232
... print(f"Cast: {dtype}")
3333
... self.visit(expr)
3434
...
35+
... def visit_try_cast(self, expr: Expression, dtype: DataType) -> None:
36+
... print(f"TryCast: {dtype}")
37+
... self.visit(expr)
38+
...
3539
... def visit_function(self, name: str, args: list[Expression]) -> None:
3640
... print(f"Function: {name}")
3741
... for arg in args:
@@ -78,6 +82,13 @@ def visit_cast(self, expr: Expression, dtype: DataType) -> R:
7882
"""Visit a cast expression."""
7983
...
8084

85+
def visit_try_cast(self, expr: Expression, dtype: DataType) -> R:
    """Visit a try_cast expression.

    This hook is not abstract: unless a subclass overrides it, a try_cast is
    handled exactly like a cast by forwarding to ``visit_cast``, which keeps
    existing visitors working (backwards compatibility).
    """
    handle_as_cast = self.visit_cast
    return handle_as_cast(expr, dtype)
91+
8192
@abstractmethod
8293
def visit_function(self, name: str, args: list[Expression]) -> R:
8394
"""Visit a function call expression."""
@@ -173,6 +184,9 @@ def visit_alias(self, expr: Expression, alias: str) -> set[str]:
173184
def visit_cast(self, expr: Expression, dtype: DataType) -> set[str]:
174185
return self.visit(expr)
175186

187+
def visit_try_cast(self, expr: Expression, dtype: DataType) -> set[str]:
    """Return whatever visiting the child expression yields; ``dtype`` is not used."""
    visited = self.visit(expr)
    return visited
189+
176190
def visit_function(self, name: str, args: list[Expression]) -> set[str]:
177191
return set().union(*(self.visit(arg) for arg in args))
178192

daft/file/file.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import shutil
44
import tempfile
5+
import warnings
56
from typing import TYPE_CHECKING
67

78
from daft.daft import PyDaftFile, PyFileReference
@@ -57,10 +58,30 @@ def __init__(
5758
url: str,
5859
io_config: IOConfig | None = None,
5960
media_type: MediaType = MediaType.unknown(),
61+
position: int | None = None,
62+
size: int | None = None,
6063
offset: int | None = None,
6164
length: int | None = None,
6265
) -> None:
63-
self._inner = PyFileReference._from_tuple((media_type._media_type, url, io_config, offset, length)) # type: ignore
66+
if offset is not None:
67+
warnings.warn(
68+
"`offset` is deprecated; use `position` instead.",
69+
DeprecationWarning,
70+
stacklevel=2,
71+
)
72+
if position is None:
73+
position = offset
74+
75+
if length is not None:
76+
warnings.warn(
77+
"`length` is deprecated; use `size` instead.",
78+
DeprecationWarning,
79+
stacklevel=2,
80+
)
81+
if size is None:
82+
size = length
83+
84+
self._inner = PyFileReference._from_tuple((media_type._media_type, url, io_config, position, size)) # type: ignore
6485

6586
def open(self, buffer_size: int | None = None) -> PyDaftFile:
    """Build a `PyDaftFile` from this object's underlying file reference.

    Args:
        buffer_size: Forwarded unchanged to ``PyDaftFile._from_file_reference``.
    """
    file_handle = PyDaftFile._from_file_reference(self._inner, buffer_size=buffer_size)
    return file_handle
@@ -110,17 +131,37 @@ def name(self) -> str:
110131
"""
111132
return self._inner.name()
112133

134+
@property
def position(self) -> int | None:
    """Starting byte position used for range reads; None means a full-file read."""
    start = self._inner.position()
    return start
138+
113139
@property
def offset(self) -> int | None:
    """Deprecated alias for `position`.

    Returns the byte offset for range reads, or None for full-file reads, and
    emits a `DeprecationWarning` on every access.
    """
    deprecation_message = "`File.offset` is deprecated; use `File.position` instead."
    # stacklevel=2 attributes the warning to the code that accessed the property.
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return self._inner.position()
117148

118149
@property
def length(self) -> int | None:
    """Deprecated accessor for the byte-range read window size (None for full-file reads).

    Note: the value is the requested range size (caller intent), not the file size
    derived from the underlying file. Use `File.size()` for the actual file size.
    """
    # NOTE(review): the warning text below points callers at `File.size()`, which
    # the docstring describes as a different quantity (actual file size rather
    # than requested range size) -- confirm the intended replacement.
    deprecation_message = "`File.length` is deprecated; use `File.size()` instead."
    # stacklevel=2 attributes the warning to the code that accessed the property.
    warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return self._inner.size()
122162

123163
def size(self) -> int:
    """Return the file size in bytes as reported by the opened underlying file.

    A `PyDaftFile` is created from the file reference with the ``BUFFER_SNIFF``
    buffer size and asked for its size.
    """
    sniff_handle = PyDaftFile._from_file_reference(self._inner, buffer_size=BUFFER_SNIFF)
    return sniff_handle.size()
125166

126167
def mime_type(self) -> str:

daft/functions/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@
147147
random_int,
148148
eq_null_safe,
149149
cast,
150+
try_cast,
150151
is_null,
151152
not_null,
152153
fill_null,
@@ -570,6 +571,7 @@
570571
"total_nanoseconds",
571572
"total_seconds",
572573
"trunc",
574+
"try_cast",
573575
"try_compress",
574576
"try_decode",
575577
"try_decompress",

daft/functions/misc.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,45 @@ def cast(expr: Expression, dtype: DataTypeLike) -> Expression:
189189
return Expression._from_pyexpr(expr._expr.cast(dtype._dtype))
190190

191191

192+
def try_cast(expr: Expression, dtype: DataTypeLike) -> Expression:
    """Cast an expression to the given datatype, yielding null where the conversion fails.

    In contrast to `cast`, a value that cannot be converted does not raise an
    error; the result holds null for that value instead.

    Args:
        expr: The expression to cast; it is normalized with ``Expression._to_expression``.
        dtype: The target datatype; it is resolved with ``DataType._infer``.

    Returns:
        Expression: Expression with the specified new datatype, with null for failed conversions

    Note:
        - If a string is provided, it will use the sql engine to parse the string into a data type.
        - A python `type` can also be provided, in which case the corresponding Daft data type will be used.

    Examples:
        >>> import daft
        >>> df = daft.from_pydict({"str_val": ["1", "2", "abc", None]})
        >>> df = df.select(df["str_val"].try_cast(daft.DataType.int64()))
        >>> df.show()
        ╭─────────╮
        │ str_val │
        │ ---     │
        │ Int64   │
        ╞═════════╡
        │ 1       │
        ├╌╌╌╌╌╌╌╌╌┤
        │ 2       │
        ├╌╌╌╌╌╌╌╌╌┤
        │ None    │
        ├╌╌╌╌╌╌╌╌╌┤
        │ None    │
        ╰─────────╯
        <BLANKLINE>
        (Showing first 4 of 4 rows)
    """
    # Resolve the dtype before normalizing the expression (same order as before).
    target_dtype = DataType._infer(dtype)
    source_expr = Expression._to_expression(expr)
    casted = source_expr._expr.try_cast(target_dtype._dtype)
    return Expression._from_pyexpr(casted)
229+
230+
192231
def is_null(expr: Expression) -> Expression:
193232
"""Checks if values in the Expression are Null (a special value indicating missing data).
194233

daft/io/iceberg/_visitors.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ def visit_alias(self, expr: Expression, alias: str) -> BooleanExpression:
6969
def visit_cast(self, expr: Expression, dtype: DataType) -> BooleanExpression:
7070
return self.visit(expr)
7171

72+
def visit_try_cast(self, expr: Expression, dtype: DataType) -> BooleanExpression:
    """Return the result of visiting the child expression; ``dtype`` is not used."""
    inner_result = self.visit(expr)
    return inner_result
74+
7275
def visit_function(self, name: str, args: list[Expression]) -> BooleanExpression:
7376
raise ValueError(f"Iceberg does not support function '{name}' in filter expressions")
7477

0 commit comments

Comments
 (0)