-
-
Notifications
You must be signed in to change notification settings - Fork 27
Expr as singleton #798
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Expr as singleton #798
Changes from 4 commits
0a9617f
8433deb
ed93aa7
a24c6c9
f8b9f4e
129aa7a
cb85ffc
4f8e0e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,6 @@ | ||
from __future__ import annotations | ||
|
||
import contextlib | ||
import functools | ||
import itertools | ||
import operator | ||
import warnings | ||
|
@@ -26,8 +25,9 @@ | |
from dask.dataframe.io.parquet.utils import _split_user_options | ||
from dask.dataframe.io.utils import _is_local_fs | ||
from dask.delayed import delayed | ||
from dask.utils import apply, natural_sort_key, typename | ||
from dask.utils import apply, funcname, natural_sort_key, typename | ||
from fsspec.utils import stringify_path | ||
from toolz import identity | ||
|
||
from dask_expr._expr import ( | ||
EQ, | ||
|
@@ -47,26 +47,15 @@ | |
determine_column_projection, | ||
) | ||
from dask_expr._reductions import Len | ||
from dask_expr._util import _convert_to_list | ||
from dask_expr._util import _convert_to_list, _tokenize_deterministic | ||
from dask_expr.io import BlockwiseIO, PartitionsFiltered | ||
|
||
NONE_LABEL = "__null_dask_index__" | ||
|
||
_cached_dataset_info = {} | ||
_CACHED_DATASET_SIZE = 10 | ||
_CACHED_PLAN_SIZE = 10 | ||
_cached_plan = {} | ||
|
||
|
||
def _control_cached_dataset_info(key): | ||
if ( | ||
len(_cached_dataset_info) > _CACHED_DATASET_SIZE | ||
and key not in _cached_dataset_info | ||
): | ||
key_to_pop = list(_cached_dataset_info.keys())[0] | ||
_cached_dataset_info.pop(key_to_pop) | ||
|
||
|
||
def _control_cached_plan(key): | ||
if len(_cached_plan) > _CACHED_PLAN_SIZE and key not in _cached_plan: | ||
key_to_pop = list(_cached_plan.keys())[0] | ||
|
@@ -121,7 +110,7 @@ def _lower(self): | |
class ToParquetData(Blockwise): | ||
_parameters = ToParquet._parameters | ||
|
||
@cached_property | ||
@property | ||
def io_func(self): | ||
return ToParquetFunctionWrapper( | ||
self.engine, | ||
|
@@ -257,7 +246,6 @@ def to_parquet( | |
|
||
# Clear read_parquet caches in case we are | ||
# also reading from the overwritten path | ||
_cached_dataset_info.clear() | ||
_cached_plan.clear() | ||
|
||
# Always skip divisions checks if divisions are unknown | ||
|
@@ -383,11 +371,6 @@ def to_parquet( | |
if compute: | ||
out = out.compute(**compute_kwargs) | ||
|
||
# Invalidate the filesystem listing cache for the output path after write. | ||
# We do this before returning, even if `compute=False`. This helps ensure | ||
# that reading files that were just written succeeds. | ||
fs.invalidate_cache(path) | ||
|
||
return out | ||
|
||
|
||
|
@@ -413,6 +396,7 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO): | |
"kwargs", | ||
"_partitions", | ||
"_series", | ||
"_dataset_info_cache", | ||
] | ||
_defaults = { | ||
"columns": None, | ||
|
@@ -432,6 +416,7 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO): | |
"kwargs": None, | ||
"_partitions": None, | ||
"_series": False, | ||
"_dataset_info_cache": list, | ||
} | ||
_pq_length_stats = None | ||
_absorb_projections = True | ||
|
@@ -474,7 +459,21 @@ def _simplify_up(self, parent, dependents): | |
return Literal(sum(_lengths)) | ||
|
||
@cached_property | ||
def _name(self): | ||
return ( | ||
funcname(type(self)).lower() | ||
+ "-" | ||
+ _tokenize_deterministic(self.checksum, *self.operands) | ||
) | ||
Comment on lines
+467
to
+472
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this checksum is part of the |
||
|
||
@property | ||
def checksum(self): | ||
return self._dataset_info["checksum"] | ||
|
||
@property | ||
def _dataset_info(self): | ||
if rv := self.operand("_dataset_info_cache"): | ||
return rv[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When the Subsequent expressions that are derived which are inheriting the cache will just access this making the |
||
# Process and split user options | ||
( | ||
dataset_options, | ||
|
@@ -536,13 +535,20 @@ def _dataset_info(self): | |
**other_options, | ||
}, | ||
) | ||
dataset_token = tokenize(*args) | ||
if dataset_token not in _cached_dataset_info: | ||
_control_cached_dataset_info(dataset_token) | ||
_cached_dataset_info[dataset_token] = self.engine._collect_dataset_info( | ||
*args | ||
) | ||
dataset_info = _cached_dataset_info[dataset_token].copy() | ||
dataset_info = self.engine._collect_dataset_info(*args) | ||
checksum = [] | ||
files_for_checksum = [] | ||
if dataset_info["has_metadata_file"]: | ||
files_for_checksum = [self.path + fs.sep + "_metadata"] | ||
else: | ||
files_for_checksum = dataset_info["ds"].files | ||
|
||
for file in files_for_checksum: | ||
# The checksum / file info is usually already cached by the fsspec | ||
# FileSystem dir_cache since this info was already asked for in | ||
# _collect_dataset_info | ||
checksum.append(fs.checksum(file)) | ||
dataset_info["checksum"] = tokenize(checksum) | ||
Comment on lines
+556
to
+561
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To deal with the cache consistency problem described in #800, I am calculating a checksum here. For S3 this falls back to using the ETag provided in the listdir response. This should not add any overhead since this information is already cached by fsspec. |
||
|
||
# Infer meta, accounting for index and columns arguments. | ||
meta = self.engine._create_dd_meta(dataset_info) | ||
|
@@ -558,6 +564,7 @@ def _dataset_info(self): | |
dataset_info["all_columns"] = all_columns | ||
dataset_info["calculate_divisions"] = self.calculate_divisions | ||
|
||
self._dataset_info_cache.append(dataset_info) | ||
return dataset_info | ||
|
||
@property | ||
|
@@ -571,10 +578,10 @@ def _meta(self): | |
return meta[columns] | ||
return meta | ||
|
||
@cached_property | ||
@property | ||
def _io_func(self): | ||
if self._plan["empty"]: | ||
return lambda x: x | ||
return identity | ||
dataset_info = self._dataset_info | ||
return ParquetFunctionWrapper( | ||
self.engine, | ||
|
@@ -662,7 +669,7 @@ def _update_length_statistics(self): | |
stat["num-rows"] for stat in _collect_pq_statistics(self) | ||
) | ||
|
||
@functools.cached_property | ||
@property | ||
def _fusion_compression_factor(self): | ||
if self.operand("columns") is None: | ||
return 1 | ||
|
@@ -767,9 +774,11 @@ def _maybe_list(val): | |
return [val] | ||
|
||
return [ | ||
_maybe_list(val.to_list_tuple()) | ||
if hasattr(val, "to_list_tuple") | ||
else _maybe_list(val) | ||
( | ||
_maybe_list(val.to_list_tuple()) | ||
if hasattr(val, "to_list_tuple") | ||
else _maybe_list(val) | ||
) | ||
for val in self | ||
] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new concept here is that I am moving off a global cache. The dataset_info is always calculated whenever a user calls
read_parquet(
foo)
and will therefore always receive an accurate representation of the dataset at the time this is called. This dataset_info is cached in this parameter. I am choosing a list as a container but this could be anything. I could also just set the operand and mutate the expression in place.
The benefit of using a parameter for this cache is that the cache will naturally propagate to all derived instances, e.g. whenever we rewrite the expression using
Expr.substitute_parameters
. This allows us to maintain the cache during optimization, and it ties the lifetime of the cache to the lifetime of the expression ancestry, removing any need for us to ever invalidate the cache.