Skip to content

Commit 4dad114

Browse files
committed
Add optional file-based listings caching
1 parent c2ab54d commit 4dad114

File tree

12 files changed

+248
-57
lines changed

12 files changed

+248
-57
lines changed

ci/environment-win.yml

+2
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@ dependencies:
2525
- nomkl
2626
- s3fs
2727
- tqdm
28+
- diskcache
29+
- platformdirs

docs/source/api.rst

+6-2
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ Base Classes
4747
fsspec.core.OpenFiles
4848
fsspec.core.get_fs_token_paths
4949
fsspec.core.url_to_fs
50-
fsspec.dircache.DirCache
50+
fsspec.dircache.MemoryDirCache
51+
fsspec.dircache.FileDirCache
5152
fsspec.FSMap
5253
fsspec.generic.GenericFileSystem
5354
fsspec.registry.register_implementation
@@ -82,7 +83,10 @@ Base Classes
8283

8384
.. autofunction:: fsspec.core.url_to_fs
8485

85-
.. autoclass:: fsspec.dircache.DirCache
86+
.. autoclass:: fsspec.dircache.MemoryDirCache
87+
:members: __init__
88+
89+
.. autoclass:: fsspec.dircache.FileDirCache
8690
:members: __init__
8791

8892
.. autoclass:: fsspec.FSMap

docs/source/changelog.rst

+7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
Changelog
22
=========
33

4+
Dev
5+
---------
6+
7+
Enhancements
8+
9+
- add file-based listing cache using diskcache
10+
411
2024.2.0
512
--------
613

docs/source/features.rst

+12-9
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,18 @@ Listings Caching
181181
----------------
182182

183183
For some implementations, getting file listings (i.e., ``ls`` and anything that
184-
depends on it) is expensive. These implementations use dict-like instances of
185-
:class:`fsspec.dircache.DirCache` to manage the listings.
186-
187-
The cache allows for time-based expiry of entries with the ``listings_expiry_time``
188-
parameter, or LRU expiry with the ``max_paths`` parameter. These can be
189-
set on any implementation instance that uses listings caching; or to skip the
190-
caching altogether, use ``use_listings_cache=False``. That would be appropriate
191-
when the target location is known to be volatile because it is being written
192-
to from other sources.
184+
depends on it) is expensive. These implementations use either dict-like instances of
185+
:class:`fsspec.dircache.MemoryDirCache` or file-based caching with instances of
186+
:class:`fsspec.dircache.FileDirCache` to manage the listings.
187+
188+
The type of cache that is used, can be controlled via the keyword ``listings_cache_type``
189+
that has to be one of `memdircache` or `filedircache`. The cache allows for time-based expiry
190+
of entries with the ``listings_expiry_time`` parameter, or LRU expiry with the ``max_paths``
191+
parameter. These can be set on any implementation instance that uses listings caching; or to
192+
skip the caching altogether, use ``use_listings_cache=False``. That would be appropriate
193+
when the target location is known to be volatile because it is being written to from other
194+
sources. If you want to use the file-based caching, you can also provide the argument
195+
``listings_cache_location`` to determine where the cache file is stored.
193196

194197
When the ``fsspec`` instance writes to the backend, the method ``invalidate_cache``
195198
is called, so that subsequent listing of the given paths will force a refresh. In

fsspec/asyn.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -297,15 +297,15 @@ class AsyncFileSystem(AbstractFileSystem):
297297
mirror_sync_methods = True
298298
disable_throttling = False
299299

300-
def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, **kwargs):
300+
def __init__(self, *args, asynchronous=False, loop=None, batch_size=None, listings_cache_options=None, **kwargs):
301301
self.asynchronous = asynchronous
302302
self._pid = os.getpid()
303303
if not asynchronous:
304304
self._loop = loop or get_loop()
305305
else:
306306
self._loop = None
307307
self.batch_size = batch_size
308-
super().__init__(*args, **kwargs)
308+
super().__init__(listings_cache_options, *args, **kwargs)
309309

310310
@property
311311
def loop(self):

fsspec/dircache.py

+121-16
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1+
import logging
12
import time
23
from collections.abc import MutableMapping
4+
from enum import Enum
35
from functools import lru_cache
6+
from pathlib import Path
7+
from typing import Union, Optional
48

9+
logger = logging.getLogger(__name__)
510

6-
class DirCache(MutableMapping):
11+
12+
class MemoryDirCache(MutableMapping):
713
"""
814
Caching of directory listings, in a structure like::
915
@@ -26,19 +32,15 @@ class DirCache(MutableMapping):
2632

2733
def __init__(
2834
self,
29-
use_listings_cache=True,
30-
listings_expiry_time=None,
35+
expiry_time=None,
3136
max_paths=None,
3237
**kwargs,
3338
):
3439
"""
3540
3641
Parameters
3742
----------
38-
use_listings_cache: bool
39-
If False, this cache never returns items, but always reports KeyError,
40-
and setting items has no effect
41-
listings_expiry_time: int or float (optional)
43+
expiry_time: int or float (optional)
4244
Time in seconds that a listing is considered valid. If None,
4345
listings do not expire.
4446
max_paths: int (optional)
@@ -49,13 +51,12 @@ def __init__(
4951
self._times = {}
5052
if max_paths:
5153
self._q = lru_cache(max_paths + 1)(lambda key: self._cache.pop(key, None))
52-
self.use_listings_cache = use_listings_cache
53-
self.listings_expiry_time = listings_expiry_time
54+
self.expiry_time = expiry_time
5455
self.max_paths = max_paths
5556

5657
def __getitem__(self, item):
57-
if self.listings_expiry_time is not None:
58-
if self._times.get(item, 0) - time.time() < -self.listings_expiry_time:
58+
if self.expiry_time is not None:
59+
if self._times.get(item, 0) - time.time() < -self.expiry_time:
5960
del self._cache[item]
6061
if self.max_paths:
6162
self._q(item)
@@ -75,12 +76,10 @@ def __contains__(self, item):
7576
return False
7677

7778
def __setitem__(self, key, value):
78-
if not self.use_listings_cache:
79-
return
8079
if self.max_paths:
8180
self._q(key)
8281
self._cache[key] = value
83-
if self.listings_expiry_time is not None:
82+
if self.expiry_time is not None:
8483
self._times[key] = time.time()
8584

8685
def __delitem__(self, key):
@@ -93,6 +92,112 @@ def __iter__(self):
9392

9493
def __reduce__(self):
9594
return (
96-
DirCache,
97-
(self.use_listings_cache, self.listings_expiry_time, self.max_paths),
95+
MemoryDirCache,
96+
(self.expiry_time, self.max_paths),
97+
)
98+
99+
100+
class FileDirCache(MutableMapping):
101+
def __init__(
102+
self,
103+
expiry_time=None,
104+
directory=None,
105+
**kwargs,
106+
):
107+
"""
108+
109+
Parameters
110+
----------
111+
expiry_time: int or float (optional)
112+
Time in seconds that a listing is considered valid. If None,
113+
listings do not expire.
114+
directory: str (optional)
115+
Directory path at which the listings cache file is stored. If None,
116+
an autogenerated path at the user folder is created.
117+
118+
"""
119+
import platformdirs
120+
from diskcache import Cache
121+
122+
if not directory:
123+
directory = platformdirs.user_cache_dir(appname="fsspec")
124+
directory = Path(directory) / "dircache" / str(expiry_time)
125+
126+
try:
127+
directory.mkdir(exist_ok=True, parents=True)
128+
except OSError as e:
129+
logger.error(
130+
f"folder for dircache could not be created at {directory}"
131+
)
132+
raise e
133+
else:
134+
logger.info(f"Dircache located at {directory}")
135+
136+
self.directory = directory
137+
self._cache = Cache(directory=str(directory))
138+
self.expiry_time = expiry_time
139+
140+
def __getitem__(self, item):
141+
"""Draw item as fileobject from cache, retry if timeout occurs"""
142+
return self._cache.get(key=item, read=True, retry=True)
143+
144+
def clear(self):
145+
self._cache.clear()
146+
147+
def __len__(self):
148+
return len(list(self._cache.iterkeys()))
149+
150+
def __contains__(self, item):
151+
value = self._cache.get(item, retry=True) # None, if expired
152+
if value:
153+
return True
154+
return False
155+
156+
def __setitem__(self, key, value):
157+
self._cache.set(
158+
key=key, value=value, expire=self.expiry_time, retry=True
159+
)
160+
161+
def __delitem__(self, key):
162+
del self._cache[key]
163+
164+
def __iter__(self):
165+
return (k for k in self._cache.iterkeys() if k in self)
166+
167+
def __reduce__(self):
168+
return (
169+
FileDirCache,
170+
(self.expiry_time, self.directory),
98171
)
172+
173+
174+
class CacheType(Enum):
175+
MEMORY = MemoryDirCache
176+
FILE = FileDirCache
177+
178+
179+
def create_dircache(
180+
cache_type: Union[str, CacheType] = None,
181+
expiry_time: Optional[Union[int, float]] = None,
182+
**kwargs,
183+
) -> Optional[Union[MemoryDirCache, FileDirCache]]:
184+
if not cache_type:
185+
return
186+
cache_map = {
187+
CacheType.MEMORY: MemoryDirCache,
188+
CacheType.FILE: FileDirCache,
189+
}
190+
if isinstance(cache_type, str):
191+
try:
192+
cache_type = CacheType[cache_type.upper()]
193+
except KeyError as e:
194+
raise ValueError(f"Cache type must be one of {', '.join(ct.name.lower() for ct in CacheType)}") from e
195+
expiry_time = expiry_time and float(expiry_time)
196+
if expiry_time == 0.0:
197+
return
198+
return cache_map[cache_type](expiry_time, **kwargs)
199+
200+
201+
if __name__ == "__main__":
202+
d = create_dircache(cache_type="memory")
203+
print(d)

fsspec/fuse.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def main(args):
224224
/historic/packages/RPMS /tmp/ftp \\
225225
-o 'simplecache-cache_storage=/tmp/simplecache' \\
226226
-o 'simplecache-check_files=false[bool]' \\
227-
-o 'ftp-listings_expiry_time=60[int]' \\
227+
-o 'ftp-expiry_time=60[int]' \\
228228
-o 'ftp-username=anonymous' \\
229229
-o 'ftp-password=xieyanbo'
230230
"""

fsspec/implementations/http.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
)
2323

2424
from ..caching import AllBytes
25+
from ..dircache import CacheType
2526

2627
# https://stackoverflow.com/a/15926317/3821154
2728
ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
@@ -58,6 +59,7 @@ def __init__(
5859
client_kwargs=None,
5960
get_client=get_client,
6061
encoded=False,
62+
listings_cache_options=None,
6163
**storage_options,
6264
):
6365
"""
@@ -83,11 +85,13 @@ def __init__(
8385
A callable which takes keyword arguments and constructs
8486
an aiohttp.ClientSession. It's state will be managed by
8587
the HTTPFileSystem class.
88+
listings_cache_options: dict
89+
Options for the listings cache.
8690
storage_options: key-value
8791
Any other parameters passed on to requests
8892
cache_type, cache_options: defaults used in open
8993
"""
90-
super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
94+
super().__init__(self, asynchronous=asynchronous, loop=loop, listings_cache_options=listings_cache_options, **storage_options)
9195
self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
9296
self.simple_links = simple_links
9397
self.same_schema = same_scheme
@@ -104,10 +108,12 @@ def __init__(
104108
# TODO: Maybe rename `self.kwargs` to `self.request_options` to make
105109
# it clearer.
106110
request_options = copy(storage_options)
107-
self.use_listings_cache = request_options.pop("use_listings_cache", False)
108-
request_options.pop("listings_expiry_time", None)
109-
request_options.pop("max_paths", None)
110-
request_options.pop("skip_instance_cache", None)
111+
# self.use_listings_cache = request_options.pop("use_listings_cache", False)
112+
# request_options.pop("expiry_time", None)
113+
# request_options.pop("max_paths", None)
114+
# request_options.pop("skip_instance_cache", None)
115+
# request_options.pop("listings_cache_type", None)
116+
# request_options.pop("listings_cache_location", None)
111117
self.kwargs = request_options
112118

113119
@property
@@ -201,11 +207,13 @@ async def _ls_real(self, url, detail=True, **kwargs):
201207
return sorted(out)
202208

203209
async def _ls(self, url, detail=True, **kwargs):
204-
if self.use_listings_cache and url in self.dircache:
210+
listings_cache_disabled = self.dircache is None
211+
if not listings_cache_disabled and url in self.dircache:
205212
out = self.dircache[url]
206213
else:
207214
out = await self._ls_real(url, detail=detail, **kwargs)
208-
self.dircache[url] = out
215+
if not listings_cache_disabled:
216+
self.dircache[url] = out
209217
return out
210218

211219
ls = sync_wrapper(_ls)

0 commit comments

Comments
 (0)