Skip to content

Commit a151320

Browse files
committed
Add optional file-based listings caching
1 parent 7b2e9b2 commit a151320

File tree

10 files changed

+204
-28
lines changed

10 files changed

+204
-28
lines changed

ci/environment-win.yml

+2
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@ dependencies:
2525
- nomkl
2626
- s3fs
2727
- tqdm
28+
- diskcache
29+
- platformdirs

docs/source/api.rst

+6-2
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ Base Classes
4747
fsspec.core.OpenFiles
4848
fsspec.core.get_fs_token_paths
4949
fsspec.core.url_to_fs
50-
fsspec.dircache.DirCache
50+
fsspec.dircache.MemDirCache
51+
fsspec.dircache.FileDirCache
5152
fsspec.FSMap
5253
fsspec.generic.GenericFileSystem
5354
fsspec.registry.register_implementation
@@ -82,7 +83,10 @@ Base Classes
8283

8384
.. autofunction:: fsspec.core.url_to_fs
8485

85-
.. autoclass:: fsspec.dircache.DirCache
86+
.. autoclass:: fsspec.dircache.MemDirCache
87+
:members: __init__
88+
89+
.. autoclass:: fsspec.dircache.FileDirCache
8690
:members: __init__
8791

8892
.. autoclass:: fsspec.FSMap

docs/source/changelog.rst

+7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
Changelog
22
=========
33

4+
Dev
5+
---------
6+
7+
Enhancements
8+
9+
- add file-based listings cache using diskcache
10+
411
2023.9.2
512
--------
613

docs/source/features.rst

+12-9
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,18 @@ Listings Caching
181181
----------------
182182

183183
For some implementations, getting file listings (i.e., ``ls`` and anything that
184-
depends on it) is expensive. These implementations use dict-like instances of
185-
:class:`fsspec.dircache.DirCache` to manage the listings.
186-
187-
The cache allows for time-based expiry of entries with the ``listings_expiry_time``
188-
parameter, or LRU expiry with the ``max_paths`` parameter. These can be
189-
set on any implementation instance that uses listings caching; or to skip the
190-
caching altogether, use ``use_listings_cache=False``. That would be appropriate
191-
when the target location is known to be volatile because it is being written
192-
to from other sources.
184+
depends on it) is expensive. These implementations use either dict-like instances of
185+
:class:`fsspec.dircache.MemDirCache` or file-based caching with instances of
186+
:class:`fsspec.dircache.FileDirCache` to manage the listings.
187+
188+
The type of cache that is used can be controlled via the keyword ``listings_cache_type``
189+
that has to be one of ``memdircache`` or ``filedircache``. The cache allows for time-based expiry
190+
of entries with the ``listings_expiry_time`` parameter, or LRU expiry with the ``max_paths``
191+
parameter. These can be set on any implementation instance that uses listings caching; or to
192+
skip the caching altogether, use ``use_listings_cache=False``. That would be appropriate
193+
when the target location is known to be volatile because it is being written to from other
194+
sources. If you want to use the file-based caching, you can also provide the argument
195+
``listings_cache_location`` to determine where the cache file is stored.
193196

194197
When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache``
195198
is called, so that subsequent listing of the given paths will force a refresh. In

fsspec/dircache.py

+95-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
import logging
12
import time
23
from collections.abc import MutableMapping
34
from functools import lru_cache
5+
from pathlib import Path
46

7+
logger = logging.getLogger("fsspec")
58

6-
class DirCache(MutableMapping):
9+
10+
class MemDirCache(MutableMapping):
711
"""
812
Caching of directory listings, in a structure like::
913
@@ -93,6 +97,95 @@ def __iter__(self):
9397

9498
def __reduce__(self):
9599
return (
96-
DirCache,
100+
MemDirCache,
97101
(self.use_listings_cache, self.listings_expiry_time, self.max_paths),
98102
)
103+
104+
105+
class FileDirCache(MutableMapping):
    """
    Caching of directory listings in an on-disk store backed by ``diskcache``.

    Entries are written with an optional time-to-live, so that listings
    expire after ``listings_expiry_time`` seconds. Each distinct expiry time
    gets its own sub-directory below the configured cache location, so
    instances created with different expiry times do not share entries.
    """

    # Sentinel used to tell "no (valid) entry" apart from falsy cached values
    # such as an empty listing ``[]``.
    _MISSING = object()

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        listings_cache_location=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        listings_cache_location: str (optional)
            Directory path below which the listings cache is stored. If None,
            the per-user cache directory reported by ``platformdirs`` is used.
        **kwargs:
            Accepted and ignored, so that a full set of storage options can
            be passed through unchanged.
        """
        # Imported lazily: both packages are optional dependencies.
        import platformdirs
        from diskcache import Cache

        listings_expiry_time = listings_expiry_time and float(listings_expiry_time)

        if listings_cache_location:
            base_location = Path(listings_cache_location)
        else:
            base_location = Path(
                platformdirs.user_cache_dir(appname="fsspec_dircache")
            )
        # One sub-directory per expiry time keeps differently configured
        # instances from seeing each other's entries.
        listings_cache_location = base_location / str(listings_expiry_time)

        try:
            listings_cache_location.mkdir(exist_ok=True, parents=True)
        except OSError:
            logger.error(
                f"folder for dircache could not be created at {listings_cache_location}"
            )

        # The un-suffixed base directory is what has to be handed back to
        # ``__init__`` on unpickling; ``cache_location`` already carries the
        # expiry-time suffix and would otherwise get nested a second time.
        self._base_location = base_location
        self.cache_location = listings_cache_location

        logger.info(f"Dircache located at {listings_cache_location}")

        self._cache = Cache(directory=listings_cache_location)
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time

    def __getitem__(self, item):
        """Return the cached listing for ``item``, retrying on timeout.

        Raises
        ------
        KeyError
            If caching is disabled, or no unexpired entry exists for ``item``.
        """
        if not self.use_listings_cache:
            raise KeyError(item)
        value = self._cache.get(item, default=self._MISSING, retry=True)
        if value is self._MISSING:
            raise KeyError(item)
        return value

    def clear(self):
        """Remove all entries from the on-disk cache."""
        self._cache.clear()

    def __len__(self):
        """Number of keys currently held in the on-disk store."""
        return sum(1 for _ in self._cache.iterkeys())

    def __contains__(self, item):
        """True if caching is enabled and an unexpired entry exists for ``item``."""
        if not self.use_listings_cache:
            return False
        # Compare against the sentinel rather than truthiness, so that a
        # cached empty listing still counts as present.
        found = self._cache.get(item, default=self._MISSING, retry=True)
        return found is not self._MISSING

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``; a no-op when caching is disabled."""
        if not self.use_listings_cache:
            return
        self._cache.set(
            key=key, value=value, expire=self.listings_expiry_time, retry=True
        )

    def __delitem__(self, key):
        """Delete the entry for ``key`` from the on-disk store."""
        del self._cache[key]

    def __iter__(self):
        """Iterate over the keys that are currently considered present."""
        return (k for k in self._cache.iterkeys() if k in self)

    def __reduce__(self):
        """Pickle as a constructor call that re-opens the same cache directory."""
        return (
            FileDirCache,
            (self.use_listings_cache, self.listings_expiry_time, self._base_location),
        )

fsspec/implementations/http.py

+2
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ def __init__(
103103
request_options.pop("listings_expiry_time", None)
104104
request_options.pop("max_paths", None)
105105
request_options.pop("skip_instance_cache", None)
106+
request_options.pop("listings_cache_type", None)
107+
request_options.pop("listings_cache_location", None)
106108
self.kwargs = request_options
107109

108110
@property

fsspec/implementations/tests/test_http.py

+67-12
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,30 @@ def test_list_invalid_args(server):
2626
h.glob(server + "/index/*")
2727

2828

29-
def test_list_cache(server):
30-
h = fsspec.filesystem("http", use_listings_cache=True)
29+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
30+
def test_list_cache(server, listings_cache_type):
31+
h = fsspec.filesystem(
32+
"http", use_listings_cache=True, listings_cache_type=listings_cache_type
33+
)
34+
35+
h.dircache.clear() # Needed for filedircache
36+
3137
out = h.glob(server + "/index/*")
3238
assert out == [server + "/index/realfile"]
3339

40+
h.dircache.clear() # clean up
41+
42+
43+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
44+
def test_list_cache_with_expiry_time_cached(server, listings_cache_type):
45+
h = fsspec.filesystem(
46+
"http",
47+
use_listings_cache=True,
48+
listings_expiry_time=30,
49+
listings_cache_type=listings_cache_type,
50+
)
3451

35-
def test_list_cache_with_expiry_time_cached(server):
36-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=30)
52+
h.dircache.clear() # Needed for filedircache
3753

3854
# First, the directory cache is not initialized.
3955
assert not h.dircache
@@ -49,9 +65,19 @@ def test_list_cache_with_expiry_time_cached(server):
4965
out = h.glob(server + "/index/*")
5066
assert out == [server + "/index/realfile"]
5167

68+
h.dircache.clear() # clean up
69+
70+
71+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
72+
def test_list_cache_with_expiry_time_purged(server, listings_cache_type):
73+
h = fsspec.filesystem(
74+
"http",
75+
use_listings_cache=True,
76+
listings_expiry_time=0.3,
77+
listings_cache_type=listings_cache_type,
78+
)
5279

53-
def test_list_cache_with_expiry_time_purged(server):
54-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=0.3)
80+
h.dircache.clear() # Needed for filedircache
5581

5682
# First, the directory cache is not initialized.
5783
assert not h.dircache
@@ -80,9 +106,20 @@ def test_list_cache_with_expiry_time_purged(server):
80106
cached_items = h.dircache.get(server + "/index/")
81107
assert len(cached_items) == 1
82108

109+
h.dircache.clear() # clean up
83110

84-
def test_list_cache_reuse(server):
85-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
111+
112+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
113+
def test_list_cache_reuse(server, listings_cache_type):
114+
h = fsspec.filesystem(
115+
"http",
116+
use_listings_cache=True,
117+
listings_expiry_time=5,
118+
listings_cache_type=listings_cache_type,
119+
)
120+
121+
# Needed for filedircache
122+
h.dircache.clear()
86123

87124
# First, the directory cache is not initialized.
88125
assert not h.dircache
@@ -101,14 +138,26 @@ def test_list_cache_reuse(server):
101138

102139
# Verify that yet another new instance, with caching enabled,
103140
# will see the same cache content again.
104-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
141+
h = fsspec.filesystem(
142+
"http",
143+
use_listings_cache=True,
144+
listings_expiry_time=5,
145+
listings_cache_type=listings_cache_type,
146+
)
105147
assert len(h.dircache) == 1
106148

107149
# However, yet another instance with a different expiry time will also not have
108150
# any valid cache content.
109-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=666)
151+
h = fsspec.filesystem(
152+
"http",
153+
use_listings_cache=True,
154+
listings_expiry_time=666,
155+
listings_cache_type=listings_cache_type,
156+
)
110157
assert len(h.dircache) == 0
111158

159+
h.dircache.clear() # clean up
160+
112161

113162
def test_ls_raises_filenotfound(server):
114163
h = fsspec.filesystem("http")
@@ -123,8 +172,14 @@ def test_list_cache_with_max_paths(server):
123172
assert out == [server + "/index/realfile"]
124173

125174

126-
def test_list_cache_with_skip_instance_cache(server):
127-
h = fsspec.filesystem("http", use_listings_cache=True, skip_instance_cache=True)
175+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
176+
def test_list_cache_with_skip_instance_cache(server, listings_cache_type):
177+
h = fsspec.filesystem(
178+
"http",
179+
use_listings_cache=True,
180+
skip_instance_cache=True,
181+
listings_cache_type=listings_cache_type,
182+
)
128183
out = h.glob(server + "/index/*")
129184
assert out == [server + "/index/realfile"]
130185

fsspec/spec.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from .callbacks import _DEFAULT_CALLBACK
1515
from .config import apply_config, conf
16-
from .dircache import DirCache
16+
from .dircache import FileDirCache, MemDirCache
1717
from .transaction import Transaction
1818
from .utils import (
1919
_unstrip_protocol,
@@ -127,7 +127,7 @@ def __init__(self, *args, **storage_options):
127127
Parameters
128128
----------
129129
use_listings_cache, listings_expiry_time, max_paths:
130-
passed to ``DirCache``, if the implementation supports
130+
passed to ``MemDirCache``, if the implementation supports
131131
directory listing caching. Pass use_listings_cache=False
132132
to disable such caching.
133133
skip_instance_cache: bool
@@ -144,7 +144,16 @@ def __init__(self, *args, **storage_options):
144144
self._intrans = False
145145
self._transaction = None
146146
self._invalidated_caches_in_transaction = []
147-
self.dircache = DirCache(**storage_options)
147+
148+
listings_cache_type = storage_options.get("listings_cache_type", "memdircache")
149+
if listings_cache_type not in ("memdircache", "filedircache"):
150+
raise ValueError(
151+
"'listings_cache_type' has to be one of ('memdircache', 'filedircache')"
152+
)
153+
if listings_cache_type == "memdircache":
154+
self.dircache = MemDirCache(**storage_options)
155+
else:
156+
self.dircache = FileDirCache(**storage_options)
148157

149158
if storage_options.pop("add_docs", None):
150159
warnings.warn("add_docs is no longer supported.", FutureWarning)

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"abfs": ["adlfs"],
1515
"adl": ["adlfs"],
1616
"dask": ["dask", "distributed"],
17+
"dircache": ["diskcache", "platformdirs"],
1718
"dropbox": ["dropboxdrivefs", "requests", "dropbox"],
1819
"gcs": ["gcsfs"],
1920
"git": ["pygit2"],

tox.ini

Whitespace-only changes.

0 commit comments

Comments
 (0)