Skip to content

Commit 7ec27c1

Browse files
committed
Adjust for our purposes
1 parent fa0b29e commit 7ec27c1

File tree

3 files changed

+78
-104
lines changed

3 files changed

+78
-104
lines changed

src/fscacher/cache.py

Lines changed: 8 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
from collections import deque, namedtuple
21
from functools import wraps
32
from inspect import Parameter, signature
43
import logging
54
import os
65
import os.path as op
76
import shutil
8-
import time
97
import appdirs
108
import joblib
9+
from .fastio import walk
10+
from .util import DirFingerprint, FileFingerprint
1111

1212
lgr = logging.getLogger(__name__)
1313

@@ -114,7 +114,7 @@ def fingerprinter(*args, **kwargs):
114114
if op.isdir(path):
115115
fprint = self._get_dir_fingerprint(path)
116116
else:
117-
fprint = self._get_file_fingerprint(path)
117+
fprint = FileFingerprint.for_file(path)
118118
if fprint is None:
119119
lgr.debug("Calling %s directly since no fingerprint for %r", f, path)
120120
# just call the function -- we have no fingerprint,
@@ -143,67 +143,8 @@ def fingerprinter(*args, **kwargs):
143143
return fingerprinter
144144

145145
@staticmethod
146-
def _get_file_fingerprint(path):
147-
"""Simplistic generic file fingerprinting based on ctime, mtime, and size
148-
"""
149-
try:
150-
# we can't take everything, since atime can change, etc.
151-
# So let's take some
152-
s = os.stat(path, follow_symlinks=True)
153-
fprint = FileFingerprint.from_stat(s)
154-
lgr.log(5, "Fingerprint for %s: %s", path, fprint)
155-
return fprint
156-
except Exception as exc:
157-
lgr.debug(f"Cannot fingerprint {path}: {exc}")
158-
159-
@staticmethod
160-
def _get_dir_fingerprint(path):
161-
fprint = DirFingerprint()
162-
dirqueue = deque([path])
163-
try:
164-
while dirqueue:
165-
d = dirqueue.popleft()
166-
with os.scandir(d) as entries:
167-
for e in entries:
168-
if e.is_dir(follow_symlinks=True):
169-
dirqueue.append(e.path)
170-
else:
171-
s = e.stat(follow_symlinks=True)
172-
fprint.add_file(e.path, FileFingerprint.from_stat(s))
173-
except Exception as exc:
174-
lgr.debug(f"Cannot fingerprint {path}: {exc}")
175-
return None
176-
else:
177-
return fprint
178-
179-
180-
class FileFingerprint(namedtuple("FileFingerprint", "mtime_ns ctime_ns size inode")):
181-
@classmethod
182-
def from_stat(cls, s):
183-
return cls(s.st_mtime_ns, s.st_ctime_ns, s.st_size, s.st_ino)
184-
185-
def modified_in_window(self, min_dtime):
186-
return abs(time.time() - self.mtime_ns * 1e-9) < min_dtime
187-
188-
def to_tuple(self):
189-
return tuple(self)
190-
191-
192-
class DirFingerprint:
193-
def __init__(self):
194-
self.last_modified = None
195-
self.tree_fprints = {}
196-
197-
def add_file(self, path, fprint: FileFingerprint):
198-
self.tree_fprints[path] = fprint
199-
if self.last_modified is None or self.last_modified < fprint.mtime_ns:
200-
self.last_modified = fprint.mtime_ns
201-
202-
def modified_in_window(self, min_dtime):
203-
if self.last_modified is None:
204-
return False
205-
else:
206-
return abs(time.time() - self.last_modified * 1e-9) < min_dtime
207-
208-
def to_tuple(self):
209-
return sum(sorted(self.tree_fprints.items()), ())
146+
def _get_dir_fingerprint(dirpath):
    """Build a fingerprint of all files found under ``dirpath``.

    Each ``(path, fingerprint)`` pair produced by ``walk(dirpath)`` is added
    to a fresh ``DirFingerprint``.

    Returns the ``DirFingerprint``, or ``None`` if any file could not be
    fingerprinted; the caller treats ``None`` as "no fingerprint" and calls
    the wrapped function directly.
    """
    dprint = DirFingerprint()
    complete = True
    for path, fprint in walk(dirpath):
        if fprint is None:
            # FileFingerprint.for_file() yields None when the file cannot be
            # stat'ed; passing that to add_file() would fail on
            # ``fprint.mtime_ns``.  Keep consuming walk() rather than
            # abandoning the generator mid-way.
            # NOTE(review): draining assumes walk() is cheap enough to finish;
            # confirm an early return would also be safe for its workers.
            lgr.debug("Cannot fingerprint %s: no fingerprint for %s", dirpath, path)
            complete = False
            continue
        dprint.add_file(path, fprint)
    return dprint if complete else None

src/fscacher/fastio.py

Lines changed: 18 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
# ==============================================================================
1515
"""Routines for multi-threaded i/o."""
1616

17-
from hashlib import md5
17+
import logging
1818
import os
19-
import sys
2019
import threading
20+
from .util import FileFingerprint
21+
22+
lgr = logging.getLogger(__name__)
2123

2224

2325
def walk(top, threads=60):
@@ -38,24 +40,24 @@ def walk(top, threads=60):
3840
threads: Size of fixed thread pool.
3941
4042
Yields:
41-
A (path, subdirs, files) tuple for each directory within top, including
42-
itself. These tuples come in no particular order; however, the contents
43-
of each tuple itself is sorted.
43+
A (path, fingerprint) pair for each file within top. These pairs come in
44+
no particular order.
4445
"""
4546
if not os.path.isdir(top):
4647
return
4748
lock = threading.Lock()
4849
on_input = threading.Condition(lock)
4950
on_output = threading.Condition(lock)
50-
state = {"tasks": 1}
51+
tasks = 1
5152
paths = [top]
5253
output = []
5354

5455
def worker():
56+
nonlocal tasks
5557
while True:
5658
with lock:
5759
while True:
58-
if not state["tasks"]:
60+
if not tasks:
5961
output.append(None)
6062
on_output.notify()
6163
return
@@ -65,34 +67,27 @@ def worker():
6567
path = paths.pop()
6668
break
6769
try:
68-
dirs = []
69-
files = []
7070
for item in sorted(os.listdir(path)):
7171
subpath = os.path.join(path, item)
7272
if os.path.isdir(subpath):
73-
dirs.append(item)
7473
with lock:
75-
state["tasks"] += 1
74+
tasks += 1
7675
paths.append(subpath)
7776
on_input.notify()
7877
else:
79-
with open(subpath, "rb") as fp:
80-
digest = md5()
81-
digest.update(fp.read())
82-
files.append((item, digest.hexdigest()))
83-
with lock:
84-
output.append((path, dirs, files))
85-
on_output.notify()
86-
except OSError as e:
87-
print(e, file=sys.stderr)
78+
with lock:
79+
output.append((subpath, FileFingerprint.for_file(subpath)))
80+
on_output.notify()
81+
except OSError:
82+
lgr.exception("Error scanning directory %s", path)
8883
finally:
8984
with lock:
90-
state["tasks"] -= 1
91-
if not state["tasks"]:
85+
tasks -= 1
86+
if not tasks:
9287
on_input.notify_all()
9388

9489
workers = [
95-
threading.Thread(target=worker, name="fastio.walk %d %s" % (i, top))
90+
threading.Thread(target=worker, name=f"fastio.walk {i} {top}", daemon=True)
9691
for i in range(threads)
9792
]
9893
for w in workers:
@@ -106,17 +101,3 @@ def worker():
106101
yield item
107102
else:
108103
threads -= 1
109-
110-
111-
if __name__ == "__main__":
112-
loc = sys.argv[1]
113-
if len(sys.argv) > 2:
114-
nthreads = int(sys.argv[2])
115-
gen = walk(loc, threads=nthreads)
116-
else:
117-
gen = walk(loc)
118-
filecount = 0
119-
for val in gen:
120-
filecount += len(val[2])
121-
print(val)
122-
print(f"Total: {filecount}")

src/fscacher/util.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from collections import namedtuple
2+
import logging
3+
import os
4+
import time
5+
6+
lgr = logging.getLogger(__name__)
7+
8+
9+
class FileFingerprint(namedtuple("FileFingerprint", "mtime_ns ctime_ns size inode")):
    """Fingerprint of a file: mtime and ctime (in ns), size, and inode number."""

    @classmethod
    def for_file(cls, path):
        """Simplistic generic file fingerprinting based on stat information.

        The fingerprint is built from the mtime, ctime, size and inode of
        ``path`` (symlinks are followed).

        Returns the fingerprint, or ``None`` if it could not be obtained
        (the failure is logged at debug level).
        """
        try:
            # we can't take everything, since atime can change, etc.
            # So let's take some
            s = os.stat(path, follow_symlinks=True)
            fprint = cls.from_stat(s)
            lgr.log(5, "Fingerprint for %s: %s", path, fprint)
            return fprint
        except Exception as exc:
            # Deliberately best-effort: callers treat None as "no fingerprint".
            lgr.debug("Cannot fingerprint %s: %s", path, exc)
            return None

    @classmethod
    def from_stat(cls, s):
        """Build a fingerprint from an ``os.stat_result``-like object ``s``."""
        return cls(s.st_mtime_ns, s.st_ctime_ns, s.st_size, s.st_ino)

    def modified_in_window(self, min_dtime):
        """Return True if mtime is less than ``min_dtime`` seconds from now."""
        return abs(time.time() - self.mtime_ns * 1e-9) < min_dtime

    def to_tuple(self):
        """Return the fingerprint fields as a plain tuple."""
        return tuple(self)
33+
34+
35+
class DirFingerprint:
    """Aggregate of per-file fingerprints for the files of a directory tree."""

    def __init__(self):
        # Largest mtime_ns among the added files; None until one is added.
        self.last_modified = None
        # Mapping of file path -> fingerprint of that file.
        self.tree_fprints = {}

    def add_file(self, path, fprint: "FileFingerprint"):
        """Record ``fprint`` for ``path`` and update ``last_modified``.

        ``fprint`` may be ``None`` (``FileFingerprint.for_file`` returns None
        when a file cannot be fingerprinted); the entry is then recorded but
        ``last_modified`` is left untouched instead of failing on
        ``fprint.mtime_ns``.
        """
        self.tree_fprints[path] = fprint
        if fprint is None:
            return
        if self.last_modified is None or self.last_modified < fprint.mtime_ns:
            self.last_modified = fprint.mtime_ns

    def modified_in_window(self, min_dtime):
        """Return True if the newest mtime is within ``min_dtime`` seconds of now.

        Returns False when no file with a fingerprint has been added.
        """
        if self.last_modified is None:
            return False
        return abs(time.time() - self.last_modified * 1e-9) < min_dtime

    def to_tuple(self):
        """Return a flat tuple ``(path, fprint, path, fprint, ...)`` sorted by path."""
        # Flatten in a single pass; summing tuples would re-copy the
        # accumulated result for every entry.
        return tuple(
            item for entry in sorted(self.tree_fprints.items()) for item in entry
        )

0 commit comments

Comments
 (0)