2 changes: 1 addition & 1 deletion .github/workflows/build_workflow.yml

@@ -119,7 +119,7 @@ jobs:
           environment-file: conda/dev.yml
           channel-priority: flexible # Changed from strict to flexible
           auto-update-conda: true
-          python-version: "3.11" # Use stable Python version for docs
+          python-version: "3.13" # Use stable Python version for docs

       # sphinx-multiversion allows for version docs.
       - name: Build Sphinx Docs
57 changes: 9 additions & 48 deletions zstash/hpss_utils.py

@@ -4,7 +4,6 @@
 import os
 import os.path
 import sqlite3
-import sys
 import tarfile
 import traceback
 from datetime import datetime

@@ -283,17 +282,11 @@ def add_file(
     if tarinfo.islnk():
         tarinfo.size = os.path.getsize(file_name)

-    # Add the file to the tar - ONLY change this line for Python 3.13+
-    if (
-        sys.version_info >= (3, 13)
-        and (tarinfo.isfile() or tarinfo.islnk())
-        and tarinfo.size > 0
-    ):
-        # Python 3.13+ requires fileobj for non-empty regular files
+    # Add the file to the tar
+    if (tarinfo.isfile() or tarinfo.islnk()) and tarinfo.size > 0:
         with open(file_name, "rb") as fileobj:
             tar.addfile(tarinfo, fileobj)
     else:
-        # Original code - unchanged
         tar.addfile(tarinfo)

     md5: Optional[str] = None
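
For context, a minimal sketch of the behavior difference that motivated the original version check (my own illustration, not project code; the exact exception type on 3.13+ is my understanding of the stdlib change the removed comment describes):

```python
import io
import sys
import tarfile

payload = b"abc"
tarinfo = tarfile.TarInfo(name="data.bin")
tarinfo.size = len(payload)  # non-zero-size regular file

with tarfile.open(fileobj=io.BytesIO(), mode="w") as tar:
    if sys.version_info >= (3, 13):
        try:
            tar.addfile(tarinfo)  # header only, no data supplied
        except ValueError as err:
            print("3.13+ rejects a sized regular file without fileobj:", err)
    # Passing the data explicitly is accepted on all supported versions,
    # which is why the sys.version_info branch can be dropped.
    tar.addfile(tarinfo, io.BytesIO(payload))
```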
@@ -303,45 +296,13 @@ def add_file(
         f: _io.TextIOWrapper = open(file_name, "rb")
         hash_md5: _hashlib.HASH = hashlib.md5()

-        # For Python 3.13+, addfile() already wrote the content, so we only calculate MD5
-        if sys.version_info >= (3, 13) and tarinfo.size > 0:
-            # Just calculate MD5, don't write to tar (already done by addfile)
-            while True:
-                data: bytes = f.read(BLOCK_SIZE)
-                if len(data) > 0:
-                    hash_md5.update(data)
-                if len(data) < BLOCK_SIZE:
-                    break
-            md5 = hash_md5.hexdigest()
-        else:
-            # Original logic for Python < 3.13 or empty files
-            if tar.fileobj is not None:
-                tar_fileobj: _io.BufferedWriter = tar.fileobj
-            else:
-                raise TypeError("Invalid tar.fileobj={}".format(tar.fileobj))
-            while True:
-                s: bytes = f.read(BLOCK_SIZE)
-                if len(s) > 0:
-                    # If the block read in is non-empty, write it to fileobj and update the hash
-                    tar_fileobj.write(s)
-                    hash_md5.update(s)
-                if len(s) < BLOCK_SIZE:
-                    # If the block read in is smaller than BLOCK_SIZE,
-                    # then we have reached the end of the file.
-                    # blocks = how many blocks of tarfile.BLOCKSIZE fit in tarinfo.size
-                    # remainder = how much more content is required to reach tarinfo.size
-                    blocks: int
-                    remainder: int
-                    blocks, remainder = divmod(tarinfo.size, tarfile.BLOCKSIZE)
-                    if remainder > 0:
-                        null_bytes: bytes = tarfile.NUL
-                        # Write null_bytes to get the last block to tarfile.BLOCKSIZE
-                        tar_fileobj.write(null_bytes * (tarfile.BLOCKSIZE - remainder))
-                        blocks += 1
-                    # Increase the offset by the amount already saved to the tar
-                    tar.offset += blocks * tarfile.BLOCKSIZE
-                    break
Collaborator Author commented on the removed block above:

@golaz Can you please review this diff? I can't tell whether this tracking of tar.offset is actually needed. From what Claude tells me, tar.addfile(tarinfo, fileobj) should take care of everything.
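
For what it's worth, a standalone sketch (my own check against the stdlib, not zstash code) suggesting that addfile() with a file object already writes the data and the NUL padding and advances tar.offset to the next 512-byte boundary:

```python
import io
import tarfile

payload = b"x" * 1000  # deliberately not a multiple of tarfile.BLOCKSIZE (512)

buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w") as tar:
    tarinfo = tarfile.TarInfo(name="example.txt")
    tarinfo.size = len(payload)
    before = tar.offset
    tar.addfile(tarinfo, io.BytesIO(payload))
    after = tar.offset

# One 512-byte header block plus 1000 bytes of data padded to 1024:
print(after - before)  # expected 1536
```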

Collaborator Author:

I should note that the tests pass on GitHub Actions using Python 3.11, 3.12, and 3.13.

-            md5 = hash_md5.hexdigest()
+        while True:
+            data: bytes = f.read(BLOCK_SIZE)
+            if len(data) > 0:
+                hash_md5.update(data)
+            if len(data) < BLOCK_SIZE:
+                break
+        md5 = hash_md5.hexdigest()
         f.close()
         size: int = tarinfo.size
         mtime: datetime = datetime.utcfromtimestamp(tarinfo.mtime)
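
The retained loop above is the chunked-MD5 idea in isolation. An equivalent standalone helper, shown only for clarity (the BLOCK_SIZE value here is hypothetical; zstash defines its own):

```python
import hashlib

BLOCK_SIZE = 1024 * 1024  # illustrative chunk size, not necessarily zstash's value

def md5_of_file(path: str) -> str:
    """Hash a file in fixed-size chunks so large files never load into memory at once."""
    hash_md5 = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a b"" sentinel stops once read() hits end-of-file.
        for chunk in iter(lambda: f.read(BLOCK_SIZE), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
```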