|
12 | 12 | import _hashlib |
13 | 13 |
|
14 | 14 | from .hpss import hpss_put |
15 | | -from .settings import BLOCK_SIZE, TupleFilesRowNoId, TupleTarsRowNoId, config, logger |
| 15 | +from .settings import TupleFilesRowNoId, TupleTarsRowNoId, config, logger |
16 | 16 | from .utils import create_tars_table, tars_table_exists, ts_utc |
17 | 17 |
|
18 | 18 |
|
@@ -269,40 +269,49 @@ def add_files( |
269 | 269 | return failures |
270 | 270 |
|
271 | 271 |
|
# Read-only file wrapper that feeds every chunk it returns into a hash object
class HashingFileWrapper:
    """Pass-through reader that updates a hash with the bytes it hands out.

    Only ``read`` is provided. Each call is forwarded to the wrapped file
    object and any non-empty result is fed to ``hasher`` before being
    returned, so the digest covers exactly the bytes the consumer has read.
    """

    def __init__(self, fileobj, hasher):
        # fileobj: object exposing read(size); hasher: object exposing update(data)
        self.fileobj = fileobj
        self.hasher = hasher

    def read(self, size=-1):
        """Read via the wrapped file object, hashing whatever comes back."""
        chunk = self.fileobj.read(size)
        if not chunk:
            # Nothing was read (e.g. end of data): hand the falsy result
            # back unchanged without touching the hasher.
            return chunk
        self.hasher.update(chunk)
        return chunk
| 283 | + |
| 284 | + |
# Add file to tar archive while computing its hash
# Return file offset (in tar archive), size and md5 hash
def add_file(
    tar: tarfile.TarFile, file_name: str, follow_symlinks: bool
) -> Tuple[int, int, datetime, Optional[str]]:
    """Append ``file_name`` to ``tar`` and hash its contents in the same pass.

    Args:
        tar: Open, writable tar archive the entry is appended to.
        file_name: Path of the file system object to add.
        follow_symlinks: Accepted for interface compatibility; it is not
            consulted anywhere in this function body.

    Returns:
        A tuple ``(offset, size, mtime, md5)``:
          * ``offset``: value of ``tar.offset`` before the entry was added.
          * ``size``: ``tarinfo.size`` of the entry (for hard links this is
            the size of the file on disk).
          * ``mtime``: entry modification time as a naive UTC ``datetime``.
          * ``md5``: hex MD5 digest for regular files and hard links, or
            ``None`` for every other entry type.
    """
    # Function-scope import: only this function needs ``timezone``.
    from datetime import timezone

    # Record where this member starts before anything is written.
    offset: int = tar.offset
    tarinfo: tarfile.TarInfo = tar.gettarinfo(file_name)

    # Change the size of any hardlinks from 0 to the size of the actual file,
    # so that their data is stored (and hashed) like a regular file's.
    if tarinfo.islnk():
        tarinfo.size = os.path.getsize(file_name)

    md5: Optional[str] = None
    if tarinfo.isfile() or tarinfo.islnk():
        hash_md5 = hashlib.md5()
        if tarinfo.size > 0:
            # Stream the data through the hashing wrapper so the file is read
            # only once and the digest covers exactly the bytes archived.
            with open(file_name, "rb") as source:
                tar.addfile(tarinfo, HashingFileWrapper(source, hash_md5))
        else:
            # Empty file: header only. The untouched hasher yields the MD5 of
            # empty input.
            tar.addfile(tarinfo)
        md5 = hash_md5.hexdigest()
    else:
        # Directories, symlinks, etc. carry no data and get no hash.
        tar.addfile(tarinfo)

    size: int = tarinfo.size
    # Equivalent to the deprecated datetime.utcfromtimestamp(): build an aware
    # UTC value, then drop tzinfo so callers still receive a naive datetime.
    mtime: datetime = datetime.fromtimestamp(
        tarinfo.mtime, tz=timezone.utc
    ).replace(tzinfo=None)
    return offset, size, mtime, md5
0 commit comments