Skip to content

Commit f990b3e

Browse files
committed
Add optional file-based listings caching
1 parent 3cc5b48 commit f990b3e

File tree

10 files changed

+195
-26
lines changed

10 files changed

+195
-26
lines changed

ci/environment-win.yml

+2
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ dependencies:
2020
- python-libarchive-c
2121
- numpy
2222
- nomkl
23+
- diskcache
24+
- appdirs

docs/source/api.rst

+6-2
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ Base Classes
4141
fsspec.core.BaseCache
4242
fsspec.core.get_fs_token_paths
4343
fsspec.core.url_to_fs
44-
fsspec.dircache.DirCache
44+
fsspec.dircache.MemDirCache
45+
fsspec.dircache.FileDirCache
4546
fsspec.registry.ReadOnlyRegistry
4647
fsspec.registry.register_implementation
4748
fsspec.callbacks.Callback
@@ -75,7 +76,10 @@ Base Classes
7576

7677
.. autofunction:: fsspec.core.url_to_fs
7778

78-
.. autoclass:: fsspec.dircache.DirCache
79+
.. autoclass:: fsspec.dircache.MemDirCache
80+
:members: __init__
81+
82+
.. autoclass:: fsspec.dircache.FileDirCache
7983
:members: __init__
8084

8185
.. autoclass:: fsspec.registry.ReadOnlyRegistry

docs/source/changelog.rst

+8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
Changelog
22
=========
33

4+
Dev
5+
---------
6+
7+
Enhancements
8+
9+
- add file-based listings cache using diskcache
10+
11+
412
2022.01.0
513
---------
614

docs/source/features.rst

+12-9
Original file line numberDiff line numberDiff line change
@@ -179,15 +179,18 @@ Listings Caching
179179
----------------
180180

181181
For some implementations, getting file listings (i.e., ``ls`` and anything that
182-
depends on it) is expensive. These implementations use dict-like instances of
183-
:class:`fsspec.dircache.DirCache` to manage the listings.
184-
185-
The cache allows for time-based expiry of entries with the ``listings_expiry_time``
186-
parameter, or LRU expiry with the ``max_paths`` parameter. These can be
187-
set on any implementation instance that uses listings caching; or to skip the
188-
caching altogether, use ``use_listings_cache=False``. That would be appropriate
189-
when the target location is known to be volatile because it is being written
190-
to from other sources.
182+
depends on it) is expensive. These implementations use either dict-like instances of
183+
:class:`fsspec.dircache.MemDirCache` or file-based caching with instances of
184+
:class:`fsspec.dircache.FileDirCache` to manage the listings.
185+
186+
The type of cache that is used can be controlled via the keyword ``listings_cache_type``
187+
that has to be one of ``memdircache`` or ``filedircache``. The cache allows for time-based expiry
188+
of entries with the ``listings_expiry_time`` parameter, or LRU expiry with the ``max_paths``
189+
parameter. These can be set on any implementation instance that uses listings caching; or to
190+
skip the caching altogether, use ``use_listings_cache=False``. That would be appropriate
191+
when the target location is known to be volatile because it is being written to from other
192+
sources. If you want to use the file-based caching, you can also provide the argument
193+
``listings_cache_location`` to determine where the cache file is stored.
191194

192195
When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache``
193196
is called, so that subsequent listing of the given paths will force a refresh. In

fsspec/dircache.py

+95-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
import logging
12
import time
23
from collections.abc import MutableMapping
34
from functools import lru_cache
5+
from pathlib import Path
46

7+
logger = logging.getLogger("fsspec")
58

6-
class DirCache(MutableMapping):
9+
10+
class MemDirCache(MutableMapping):
711
"""
812
Caching of directory listings, in a structure like::
913
@@ -91,6 +95,95 @@ def __iter__(self):
9195

9296
def __reduce__(self):
9397
return (
94-
DirCache,
98+
MemDirCache,
9599
(self.use_listings_cache, self.listings_expiry_time, self.max_paths),
96100
)
101+
102+
103+
class FileDirCache(MutableMapping):
    """
    File-based caching of directory listings, backed by ``diskcache.Cache``.

    Offers the same mapping interface as the in-memory listings cache
    (path -> listing), but entries are persisted on disk so they can be
    shared between filesystem instances and survive the process.
    Time-based expiry is delegated to ``diskcache``.
    """

    # Sentinel used to tell "key absent/expired" apart from falsy cached
    # values such as an empty listing ``[]``.
    _MISSING = object()

    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        listings_cache_location=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        listings_cache_location: str (optional)
            Directory path at which the listings cache file is stored. If None,
            an autogenerated path in the user cache folder is used. In both
            cases a sub-folder named after ``listings_expiry_time`` is
            created below that path.
        **kwargs:
            Ignored; accepted so that a full set of storage options can be
            passed through unchanged.
        """
        # Optional dependencies: imported lazily so that the module can be
        # imported without them when only the in-memory cache is used.
        import appdirs
        from diskcache import Cache

        if listings_expiry_time:
            listings_expiry_time = float(listings_expiry_time)

        # Remember the location exactly as given. ``__reduce__`` must hand
        # this back (not the expanded directory), otherwise every pickle
        # round trip would append another ``<expiry>`` sub-folder.
        self._listings_cache_location = listings_cache_location

        if listings_cache_location:
            base_dir = Path(listings_cache_location)
        else:
            base_dir = Path(appdirs.user_cache_dir(appname="fsspec_dircache"))
        cache_dir = base_dir / str(listings_expiry_time)

        try:
            cache_dir.mkdir(exist_ok=True, parents=True)
        except OSError:
            # Only log here; the Cache constructor below is still attempted.
            logger.error(
                "folder for dircache could not be created at %s", cache_dir
            )

        self.cache_location = cache_dir

        logger.info("Dircache located at %s", cache_dir)

        self._cache = Cache(directory=cache_dir)
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time

    def __getitem__(self, item):
        """Return the cached listing for ``item``, retrying on timeout.

        Raises
        ------
        KeyError
            If caching is disabled, or the key is absent or has expired.
        """
        if not self.use_listings_cache:
            raise KeyError(item)
        value = self._cache.get(item, default=self._MISSING, retry=True)
        if value is self._MISSING:
            raise KeyError(item)
        return value

    def clear(self):
        """Remove all entries from the on-disk cache."""
        self._cache.clear()

    def __len__(self):
        """Number of entries that ``__iter__`` would yield."""
        return sum(1 for _ in self)

    def __contains__(self, item):
        """True if ``item`` is cached and not expired (even if its value is empty)."""
        if not self.use_listings_cache:
            return False
        found = self._cache.get(item, default=self._MISSING, retry=True)
        return found is not self._MISSING

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` with the configured expiry; no-op if disabled."""
        if not self.use_listings_cache:
            return
        self._cache.set(
            key=key, value=value, expire=self.listings_expiry_time, retry=True
        )

    def __delitem__(self, key):
        """Delete ``key`` from the underlying cache."""
        del self._cache[key]

    def __iter__(self):
        """Iterate over the keys for which ``key in self`` is true."""
        if not self.use_listings_cache:
            return
        for key in self._cache.iterkeys():
            # Re-check membership so that keys ``__contains__`` rejects
            # (e.g. expired ones) are not yielded.
            if key in self:
                yield key

    def __reduce__(self):
        """Pickle as the constructor arguments; the cache content stays on disk."""
        return (
            FileDirCache,
            (
                self.use_listings_cache,
                self.listings_expiry_time,
                self._listings_cache_location,
            ),
        )

fsspec/implementations/http.py

+2
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ def __init__(
102102
request_options.pop("listings_expiry_time", None)
103103
request_options.pop("max_paths", None)
104104
request_options.pop("skip_instance_cache", None)
105+
request_options.pop("listings_cache_type", None)
106+
request_options.pop("listings_cache_location", None)
105107
self.kwargs = request_options
106108

107109
if not asynchronous:

fsspec/implementations/tests/test_http.py

+55-10
Original file line numberDiff line numberDiff line change
@@ -170,14 +170,30 @@ def test_list_invalid_args(server):
170170
h.glob(server + "/index/*")
171171

172172

173-
def test_list_cache(server):
174-
h = fsspec.filesystem("http", use_listings_cache=True)
173+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
174+
def test_list_cache(server, listings_cache_type):
175+
h = fsspec.filesystem(
176+
"http", use_listings_cache=True, listings_cache_type=listings_cache_type
177+
)
178+
179+
h.dircache.clear() # Needed for filedircache
180+
175181
out = h.glob(server + "/index/*")
176182
assert out == [server + "/index/realfile"]
177183

184+
h.dircache.clear() # clean up
185+
178186

179-
def test_list_cache_with_expiry_time_cached(server):
180-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=30)
187+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
188+
def test_list_cache_with_expiry_time_cached(server, listings_cache_type):
189+
h = fsspec.filesystem(
190+
"http",
191+
use_listings_cache=True,
192+
listings_expiry_time=30,
193+
listings_cache_type=listings_cache_type,
194+
)
195+
196+
h.dircache.clear() # Needed for filedircache
181197

182198
# First, the directory cache is not initialized.
183199
assert not h.dircache
@@ -193,9 +209,19 @@ def test_list_cache_with_expiry_time_cached(server):
193209
out = h.glob(server + "/index/*")
194210
assert out == [server + "/index/realfile"]
195211

212+
h.dircache.clear() # clean up
213+
214+
215+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
216+
def test_list_cache_with_expiry_time_purged(server, listings_cache_type):
217+
h = fsspec.filesystem(
218+
"http",
219+
use_listings_cache=True,
220+
listings_expiry_time=0.3,
221+
listings_cache_type=listings_cache_type,
222+
)
196223

197-
def test_list_cache_with_expiry_time_purged(server):
198-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=0.3)
224+
h.dircache.clear() # Needed for filedircache
199225

200226
# First, the directory cache is not initialized.
201227
assert not h.dircache
@@ -224,9 +250,20 @@ def test_list_cache_with_expiry_time_purged(server):
224250
cached_items = h.dircache.get(server + "/index/")
225251
assert len(cached_items) == 1
226252

253+
h.dircache.clear() # clean up
227254

228-
def test_list_cache_reuse(server):
229-
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
255+
256+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
257+
def test_list_cache_reuse(server, listings_cache_type):
258+
h = fsspec.filesystem(
259+
"http",
260+
use_listings_cache=True,
261+
listings_expiry_time=5,
262+
listings_cache_type=listings_cache_type,
263+
)
264+
265+
# Needed for filedircache
266+
h.dircache.clear()
230267

231268
# First, the directory cache is not initialized.
232269
assert not h.dircache
@@ -253,6 +290,8 @@ def test_list_cache_reuse(server):
253290
h = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=666)
254291
assert len(h.dircache) == 0
255292

293+
h.dircache.clear() # clean up
294+
256295

257296
def test_ls_raises_filenotfound(server):
258297
h = fsspec.filesystem("http")
@@ -267,8 +306,14 @@ def test_list_cache_with_max_paths(server):
267306
assert out == [server + "/index/realfile"]
268307

269308

270-
def test_list_cache_with_skip_instance_cache(server):
271-
h = fsspec.filesystem("http", use_listings_cache=True, skip_instance_cache=True)
309+
@pytest.mark.parametrize("listings_cache_type", ["memdircache", "filedircache"])
310+
def test_list_cache_with_skip_instance_cache(server, listings_cache_type):
311+
h = fsspec.filesystem(
312+
"http",
313+
use_listings_cache=True,
314+
skip_instance_cache=True,
315+
listings_cache_type=listings_cache_type,
316+
)
272317
out = h.glob(server + "/index/*")
273318
assert out == [server + "/index/realfile"]
274319

fsspec/spec.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from .callbacks import _DEFAULT_CALLBACK
1212
from .config import apply_config, conf
13-
from .dircache import DirCache
13+
from .dircache import FileDirCache, MemDirCache
1414
from .transaction import Transaction
1515
from .utils import _unstrip_protocol, other_paths, read_block, stringify_path, tokenize
1616

@@ -113,7 +113,7 @@ def __init__(self, *args, **storage_options):
113113
Parameters
114114
----------
115115
use_listings_cache, listings_expiry_time, max_paths:
116-
passed to ``DirCache``, if the implementation supports
116+
passed to ``MemDirCache``, if the implementation supports
117117
directory listing caching. Pass use_listings_cache=False
118118
to disable such caching.
119119
skip_instance_cache: bool
@@ -130,7 +130,16 @@ def __init__(self, *args, **storage_options):
130130
self._intrans = False
131131
self._transaction = None
132132
self._invalidated_caches_in_transaction = []
133-
self.dircache = DirCache(**storage_options)
133+
134+
listings_cache_type = storage_options.get("listings_cache_type", "memdircache")
135+
if listings_cache_type not in ("memdircache", "filedircache"):
136+
raise ValueError(
137+
"'listings_cache_type' has to be one of ('memdircache', 'filedircache')"
138+
)
139+
if listings_cache_type == "memdircache":
140+
self.dircache = MemDirCache(**storage_options)
141+
else:
142+
self.dircache = FileDirCache(**storage_options)
134143

135144
if storage_options.pop("add_docs", None):
136145
warnings.warn("add_docs is no longer supported.", FutureWarning)

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
"fuse": ["fusepy"],
5555
"libarchive": ["libarchive-c"],
5656
"gui": ["panel"],
57+
"dircache": ["diskcache", "appdirs"],
5758
},
5859
zip_safe=False,
5960
)

tox.ini

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ conda_deps=
3737
numpy
3838
nomkl
3939
jinja2
40+
diskcache
41+
appdirs
4042
deps=
4143
hadoop-test-cluster==0.1.0
4244
smbprotocol

0 commit comments

Comments
 (0)