Skip to content

Commit cc37237

Browse files
committed
Fix add_file
1 parent 2caba4e commit cc37237

File tree

1 file changed

+34
-25
lines changed

1 file changed

+34
-25
lines changed

zstash/hpss_utils.py

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import _hashlib
1313

1414
from .hpss import hpss_put
15-
from .settings import BLOCK_SIZE, TupleFilesRowNoId, TupleTarsRowNoId, config, logger
15+
from .settings import TupleFilesRowNoId, TupleTarsRowNoId, config, logger
1616
from .utils import create_tars_table, tars_table_exists, ts_utc
1717

1818

@@ -269,40 +269,49 @@ def add_files(
269269
return failures
270270

271271

272+
# Create a wrapper that computes hash while data passes through
273+
class HashingFileWrapper:
    """Read-only file-object proxy that hashes every chunk passing through it.

    Handing an instance to ``tarfile.TarFile.addfile`` lets the archive writer
    and the checksum share a single pass over the file, instead of reading the
    file once to archive it and a second time to hash it.

    Only ``read`` is provided; no other file-object method is forwarded.
    """

    def __init__(self, fileobj, hasher: "_hashlib.HASH") -> None:
        """Wrap *fileobj* and feed everything read from it into *hasher*.

        fileobj: object with a ``read(size)`` method returning bytes.
        hasher: object with an ``update(data)`` method, e.g. ``hashlib.md5()``.
        """
        self.fileobj = fileobj
        self.hasher = hasher

    def read(self, size: int = -1) -> bytes:
        """Read up to *size* bytes from the wrapped file, hashing them first.

        The data is returned unchanged. An empty result (end of file) is
        passed straight through without touching the hasher.
        """
        data = self.fileobj.read(size)
        if data:
            self.hasher.update(data)
        return data
283+
284+
272285
# Add file to tar archive while computing its hash
273286
# Return file offset (in tar archive), size and md5 hash
274287
def add_file(
    tar: tarfile.TarFile, file_name: str, follow_symlinks: bool
) -> Tuple[int, int, datetime, Optional[str]]:
    """Append *file_name* to *tar*, hashing its contents in the same pass.

    tar: archive opened for writing; the new member is appended to it.
    file_name: path of the file, directory or link to add.
    follow_symlinks: not read by this function; kept so existing callers
        continue to work unchanged.

    Returns ``(offset, size, mtime, md5)``:
        offset: value of ``tar.offset`` before the member was written.
        size: ``tarinfo.size`` of the member (for hardlinks, the size of the
            file on disk).
        mtime: member modification time as a naive ``datetime`` in UTC.
        md5: hex MD5 digest for regular files and hardlinks, ``None`` for
            every other member type (directories, symlinks, ...).
    """
    # Imported locally so this function does not depend on how the module
    # header imports the datetime module.
    from datetime import timezone

    offset: int = tar.offset
    tarinfo: tarfile.TarInfo = tar.gettarinfo(file_name)

    # Change the size of any hardlinks from 0 to the size of the actual file
    if tarinfo.islnk():
        tarinfo.size = os.path.getsize(file_name)

    md5: Optional[str] = None
    if tarinfo.isfile() or tarinfo.islnk():
        hash_md5 = hashlib.md5()
        if tarinfo.size > 0:
            # Stream the data through the hashing wrapper so the file is read
            # only once: the bytes that get hashed are exactly the bytes that
            # get archived.
            with open(file_name, "rb") as f:
                tar.addfile(tarinfo, HashingFileWrapper(f, hash_md5))
        else:
            # Nothing to stream; the untouched hasher yields the MD5 of b"".
            tar.addfile(tarinfo)
        md5 = hash_md5.hexdigest()
    else:
        # Directories, symlinks, etc. carry no data and get no hash.
        tar.addfile(tarinfo)

    size: int = tarinfo.size
    # Same naive-UTC value as datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12.
    mtime: datetime = datetime.fromtimestamp(
        tarinfo.mtime, tz=timezone.utc
    ).replace(tzinfo=None)
    return offset, size, mtime, md5

0 commit comments

Comments
 (0)