diff --git a/.github/workflows/build_workflow.yml b/.github/workflows/build_workflow.yml
index 72c0f9f1..b215c46f 100644
--- a/.github/workflows/build_workflow.yml
+++ b/.github/workflows/build_workflow.yml
@@ -19,10 +19,10 @@ jobs:
       - name: Checkout Code Repository
         uses: actions/checkout@v3

-      - name: Set up Python 3.9
+      - name: Set up Python 3.13
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9
+          python-version: "3.13"

       # Run all pre-commit hooks on all the files.
       # Getting only staged files can be tricky in case a new PR is opened
@@ -30,10 +30,13 @@
       # This is the equivalent of running "pre-commit run --all-files" locally.
       # If you commit with the `--no-verify` flag, this check may fail.
       - name: Install and Run Pre-commit
-        uses: pre-commit/action@v3.0.0
+        uses: pre-commit/action@v3.0.1

   build:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11", "3.12", "3.13"]
     defaults:
       run:
         shell: bash -l {0}
@@ -44,11 +47,11 @@
       - name: Cache Conda
         uses: actions/cache@v3
         env:
-          CACHE_NUMBER: 0
+          CACHE_NUMBER: 1  # Increment this to invalidate cache
         with:
           path: ~/conda_pkgs_dir
           key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
-            hashFiles('conda/dev.yml') }}
+            hashFiles('conda/dev.yml') }}-python${{ matrix.python-version }}

       - name: Build Conda Environment
         uses: conda-incubator/setup-miniconda@v3
@@ -57,13 +60,27 @@ with:
           activate-environment: zstash_dev
           miniforge-variant: Miniforge3
           miniforge-version: latest
           environment-file: conda/dev.yml
-          channel-priority: strict
+          channel-priority: flexible  # Changed from strict to flexible
           auto-update-conda: true
+          python-version: ${{ matrix.python-version }}
+          channels: conda-forge
+          use-only-tar-bz2: true
+
+      - name: Verify Environment and Fix Dependencies
+        run: |
+          conda info
+          conda list
+          # Ensure we have the right Python version
+          python --version
+          # Fix pip issues for Python 3.12+
+          if [[ "${{ matrix.python-version }}" == "3.12" ]] || [[ "${{ matrix.python-version }}" == "3.13" ]]; then
+            python -m ensurepip --upgrade || true
+            python -m pip install --upgrade --force-reinstall pip setuptools wheel
+          fi

       - name: Install `zstash` Package
         run: |
-          python -m pip install --upgrade pip
-          pip install .
+          python -m pip install .
       - name: Run Tests
         run: |
@@ -77,7 +94,7 @@ defaults:
       run:
         shell: bash -l {0}
-    timeout-minutes: 5
+    timeout-minutes: 10  # Increased timeout for docs
     steps:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
@@ -87,11 +104,11 @@
       - name: Cache Conda
         uses: actions/cache@v3
         env:
-          CACHE_NUMBER: 0
+          CACHE_NUMBER: 1  # Match the build job cache number
         with:
           path: ~/conda_pkgs_dir
           key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
-            hashFiles('conda/dev.yml') }}
+            hashFiles('conda/dev.yml') }}-docs

       - name: Build Conda Environment
         uses: conda-incubator/setup-miniconda@v3
@@ -100,8 +117,9 @@ with:
           activate-environment: zstash_dev
           miniforge-variant: Miniforge3
           miniforge-version: latest
           environment-file: conda/dev.yml
-          channel-priority: strict
+          channel-priority: flexible  # Changed from strict to flexible
           auto-update-conda: true
+          python-version: "3.13"  # Use stable Python version for docs
       # sphinx-multiversion allows for version docs.
       - name: Build Sphinx Docs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 482e6413..bf0d71b3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,19 +12,19 @@ repos:
       exclude: conda/meta.yaml

   - repo: https://github.com/psf/black
-    rev: 24.10.0
+    rev: 25.1.0
     hooks:
       - id: black

   - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
+    rev: 6.0.1
     hooks:
       - id: isort

   # Need to use flake8 GitHub mirror due to CentOS git issue with GitLab
   # https://github.com/pre-commit/pre-commit/issues/1206
   - repo: https://github.com/pycqa/flake8
-    rev: 7.1.1
+    rev: 7.3.0
     hooks:
       - id: flake8
         args: ["--config=setup.cfg"]
@@ -32,7 +32,7 @@ repos:
         exclude: analysis_data_preprocess

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.11.2
+    rev: v1.18.2
     hooks:
       - id: mypy
         args: ["--config=setup.cfg", "--install-types", "--non-interactive"]
diff --git a/conda/dev.yml b/conda/dev.yml
index 5a19a9c4..6dd327e2 100644
--- a/conda/dev.yml
+++ b/conda/dev.yml
@@ -1,31 +1,30 @@
 name: zstash_dev
 channels:
   - conda-forge
-  - defaults
 dependencies:
   # Base
   # =================
-  - pip=22.2.2
-  - python=3.9.13
-  - six=1.16.0
-  - globus-sdk=3.15.0
+  - pip
+  - python >=3.11,<3.14
+  - sqlite
+  - six >=1.16.0
+  - globus-sdk >=3.15.0
   # Developer Tools
   # =================
   # If versions are updated, also update 'rev' in `.pre-commit-config.yaml`
-  - black=24.10.0
-  - flake8=7.1.1
-  - flake8-isort=6.1.1
-  - mypy=1.11.2
-  - pre-commit=4.0.1
-  - tbump=6.9.0
+  - black ==25.1.0
+  - flake8 ==7.3.0
+  - isort ==6.0.1
+  - mypy ==1.18.2
+  - pre-commit ==4.3.0
+  - tbump >=6.9.0
   # Documentation
   # =================
   # If versions are updated, also update in `.github/workflows/workflow.yml`
-  - jinja2<3.1
-  - sphinx=5.2.3
-  - sphinx-multiversion=0.2.4
-  - sphinx_rtd_theme=1.0.0
+  - jinja2 <3.1
+  - sphinx >=5.2.0
+  - sphinx-multiversion >=0.2.4
+  - sphinx_rtd_theme >=1.0.0
   # Need to pin docutils because 0.17 has a bug with unordered lists
   # https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
-  - docutils=0.16
-prefix: /opt/miniconda3/envs/zstash_dev
+  - docutils >=0.16,<0.17
diff --git a/conda/meta.yaml b/conda/meta.yaml
deleted file mode 100644
index b5b5bd98..00000000
--- a/conda/meta.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-{% set name = "zstash" %}
-{% set version = "1.4.4" %}
-
-package:
-  name: {{ name|lower }}
-  version: {{ version }}
-
-source:
-  git_url: https://github.com/E3SM-Project/zstash.git
-  git_rev: v{{ version }}
-
-build:
-  number: 0
-  script: "{{ PYTHON }} -m pip install . --no-deps -vv"
-  noarch: python
-
-requirements:
-  host:
-    - python >=3.9
-    - pip
-
-  run:
-    - python >=3.9
-    - fair-research-login >=0.2.6,<0.3.0
-    - globus-sdk >=3.0.0,<4.0.0
-    - six
-
-test:
-  imports:
-    - zstash
-  commands:
-    - zstash --help
-
-about:
-  home: https://github.com/E3SM-Project/zstash
diff --git a/setup.cfg b/setup.cfg
index 22201a95..4e6c6cb4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -46,7 +46,7 @@ exclude =
     venv

 [mypy]
-python_version = 3.9
+python_version = 3.13
 check_untyped_defs = True
 ignore_missing_imports = True
 warn_unused_ignores = True
diff --git a/setup.py b/setup.py
index 98e77b1d..e440a2d9 100644
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,6 @@
     author_email="forsyth2@llnl.gov, golaz1@llnl.gov, shaheen2@llnl.gov",
     description="Long term HPSS archiving software for E3SM",
     packages=find_packages(include=["zstash", "zstash.*"]),
-    python_requires=">=3.9",
+    python_requires=">=3.11,<3.14",
     entry_points={"console_scripts": ["zstash=zstash.main:main"]},
 )
diff --git a/zstash/extract.py b/zstash/extract.py
index 20007a12..64977aef 100644
--- a/zstash/extract.py
+++ b/zstash/extract.py
@@ -591,7 +591,7 @@ def extractFiles(  # noqa: C901
         try:
             # Seek file position
             if tar.fileobj is not None:
-                fileobj: _io.BufferedReader = tar.fileobj
+                fileobj = tar.fileobj
             else:
                 raise TypeError("Invalid tar.fileobj={}".format(tar.fileobj))
             fileobj.seek(files_row.offset)
@@ -665,7 +665,7 @@ def extractFiles(  # noqa: C901
             # relying here on 'touch'. This is not the prettiest solution.
             # Maybe a better one can be implemented later.
             if tarinfo.issym():
-                tmp1: int = tarinfo.mtime
+                tmp1 = tarinfo.mtime
                 tmp2: datetime = datetime.fromtimestamp(tmp1)
                 tmp3: str = tmp2.strftime("%Y%m%d%H%M.%S")
                 os.system("touch -h -t %s %s" % (tmp3, tarinfo.name))
diff --git a/zstash/globus.py b/zstash/globus.py
index 0d66223e..2cacad5f 100644
--- a/zstash/globus.py
+++ b/zstash/globus.py
@@ -57,7 +57,6 @@ def globus_activate(hpss: str):


 def file_exists(name: str) -> bool:
-    global archive_directory_listing

     for entry in archive_directory_listing:
         if entry.get("name") == name:
@@ -72,9 +71,6 @@ def file_exists(name: str) -> bool:
 def globus_transfer(  # noqa: C901
     remote_ep: str, remote_path: str, name: str, transfer_type: str, non_blocking: bool
 ):
-    global transfer_client
-    global local_endpoint
-    global remote_endpoint
     global transfer_data
     global task_id
     global archive_directory_listing
@@ -199,7 +195,6 @@ def globus_transfer(  # noqa: C901
 def globus_block_wait(
     task_id: str, wait_timeout: int, polling_interval: int, max_retries: int
 ):
-    global transfer_client

     # poll every "polling_interval" seconds to speed up small transfers. Report every 2 hours, stop waiting after 5*2 = 10 hours
     logger.info(
@@ -211,7 +206,7 @@ def globus_block_wait(
     try:
         # Wait for the task to complete
         logger.info(
-            f"{ts_utc()}: on task_wait try {retry_count+1} out of {max_retries}"
+            f"{ts_utc()}: on task_wait try {retry_count + 1} out of {max_retries}"
         )
         transfer_client.task_wait(
             task_id, timeout=wait_timeout, polling_interval=10
         )
@@ -244,7 +239,6 @@


 def globus_wait(task_id: str):
-    global transfer_client

     try:
         """
@@ -288,9 +282,6 @@ def globus_wait(task_id: str):


 def globus_finalize(non_blocking: bool = False):
-    global transfer_client
-    global transfer_data
-    global task_id
     global global_variable_tarfiles_pushed

     last_task_id = None
diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py
index 7f873e47..2f1158bc 100644
--- a/zstash/hpss_utils.py
+++ b/zstash/hpss_utils.py
@@ -10,17 +10,16 @@
 from typing import List, Optional, Tuple

 import _hashlib
-import _io

 from .hpss import hpss_put
-from .settings import BLOCK_SIZE, TupleFilesRowNoId, TupleTarsRowNoId, config, logger
+from .settings import TupleFilesRowNoId, TupleTarsRowNoId, config, logger
 from .utils import create_tars_table, tars_table_exists, ts_utc


 # Minimum output file object
 class HashIO(object):
     def __init__(self, name: str, mode: str, do_hash: bool):
-        self.f: _io.BufferedWriter = open(name, mode)
+        self.f = open(name, mode)
         self.hash: Optional[_hashlib.HASH]
         if do_hash:
             self.hash = hashlib.md5()
@@ -270,54 +269,49 @@ def add_files(
     return failures


+# Create a wrapper that computes hash while data passes through
+class HashingFileWrapper:
+    def __init__(self, fileobj, hasher):
+        self.fileobj = fileobj
+        self.hasher = hasher
+
+    def read(self, size=-1):
+        data = self.fileobj.read(size)
+        if data:
+            self.hasher.update(data)
+        return data
+
+
 # Add file to tar archive while computing its hash
 # Return file offset (in tar archive), size and md5 hash
 def add_file(
     tar: tarfile.TarFile, file_name: str, follow_symlinks: bool
 ) -> Tuple[int, int, datetime, Optional[str]]:
+    offset = tar.offset
+    tarinfo = tar.gettarinfo(file_name)
-    offset: int = tar.offset
-    tarinfo: tarfile.TarInfo = tar.gettarinfo(file_name)
-
     # Change the size of any hardlinks from 0 to the size of the actual file
     if tarinfo.islnk():
         tarinfo.size = os.path.getsize(file_name)
-    # Add the file to the tar
-    tar.addfile(tarinfo)
-    md5: Optional[str] = None
-    # Only add files or hardlinks.
-    # (So don't add directories or softlinks.)
+    md5 = None
+
+    # For files/hardlinks
     if tarinfo.isfile() or tarinfo.islnk():
-        f: _io.TextIOWrapper = open(file_name, "rb")
-        hash_md5: _hashlib.HASH = hashlib.md5()
-        if tar.fileobj is not None:
-            fileobj: _io.BufferedWriter = tar.fileobj
+        if tarinfo.size > 0:
+            # Non-empty files: stream with hash computation
+            hash_md5 = hashlib.md5()
+            with open(file_name, "rb") as f:
+                wrapper = HashingFileWrapper(f, hash_md5)
+                tar.addfile(tarinfo, wrapper)
+            md5 = hash_md5.hexdigest()
         else:
-            raise TypeError("Invalid tar.fileobj={}".format(tar.fileobj))
-        while True:
-            s: str = f.read(BLOCK_SIZE)
-            if len(s) > 0:
-                # If the block read in is non-empty, write it to fileobj and update the hash
-                fileobj.write(s)
-                hash_md5.update(s)
-            if len(s) < BLOCK_SIZE:
-                # If the block read in is smaller than BLOCK_SIZE,
-                # then we have reached the end of the file.
-                # blocks = how many blocks of tarfile.BLOCKSIZE fit in tarinfo.size
-                # remainder = how much more content is required to reach tarinfo.size
-                blocks: int
-                remainder: int
-                blocks, remainder = divmod(tarinfo.size, tarfile.BLOCKSIZE)
-                if remainder > 0:
-                    null_bytes: bytes = tarfile.NUL
-                    # Write null_bytes to get the last block to tarfile.BLOCKSIZE
-                    fileobj.write(null_bytes * (tarfile.BLOCKSIZE - remainder))
-                    blocks += 1
-                # Increase the offset by the amount already saved to the tar
-                tar.offset += blocks * tarfile.BLOCKSIZE
-                break
-        f.close()
-        md5 = hash_md5.hexdigest()
-    size: int = tarinfo.size
-    mtime: datetime = datetime.utcfromtimestamp(tarinfo.mtime)
+            # Empty files: just add to tar, compute hash of empty data
+            tar.addfile(tarinfo)
+            md5 = hashlib.md5(b"").hexdigest()  # MD5 of empty bytes
+    else:
+        # Directories, symlinks, etc.
+        tar.addfile(tarinfo)
+
+    size = tarinfo.size
+    mtime = datetime.utcfromtimestamp(tarinfo.mtime)
     return offset, size, mtime, md5
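
Note on the `HashingFileWrapper` refactor in `zstash/hpss_utils.py`: `tarfile.TarFile.addfile()` accepts an optional file object and reads exactly `tarinfo.size` bytes from it, performing the 512-byte block padding and the `tar.offset` bookkeeping internally. That is what makes the hand-rolled `BLOCK_SIZE` read/write loop removable. A minimal self-contained sketch of the same pattern (the tar and input file names here are placeholders, not part of this change):

```python
import hashlib
import tarfile


class HashingFileWrapper:
    """File-like object that updates a hash as tarfile reads through it."""

    def __init__(self, fileobj, hasher):
        self.fileobj = fileobj
        self.hasher = hasher

    def read(self, size=-1):
        data = self.fileobj.read(size)
        if data:
            self.hasher.update(data)
        return data


# Write one file into a tar while computing its MD5 in a single pass.
hash_md5 = hashlib.md5()
with tarfile.open("example.tar", "w") as tar:
    tarinfo = tar.gettarinfo("example.txt")
    with open("example.txt", "rb") as f:
        tar.addfile(tarinfo, HashingFileWrapper(f, hash_md5))
print(hash_md5.hexdigest())
```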
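For reference, the arithmetic the deleted loop performed by hand, as a worked sketch (the 1300-byte size is a hypothetical example, not zstash code):

```python
import tarfile

size = 1300  # hypothetical file size in bytes
blocks, remainder = divmod(size, tarfile.BLOCKSIZE)  # BLOCKSIZE == 512 -> (2, 276)
if remainder > 0:
    blocks += 1  # the final partial block is padded with NUL bytes
print(blocks * tarfile.BLOCKSIZE)  # 1536 bytes of tar data for a 1300-byte file
```

`tar.addfile()` now performs this padding and advances `tar.offset` itself, so the manual version was redundant.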
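On the `globus_block_wait` changes in `zstash/globus.py`: `TransferClient.task_wait()` returns `False` if the timeout elapses before the task reaches a terminal state, which is what the surrounding retry loop relies on. A hedged sketch of that pattern (`transfer_client`, `task_id`, `wait_timeout`, and `max_retries` are assumed to come from the surrounding code; this illustrates the shape of the loop, not the exact zstash implementation):

```python
retry_count = 0
while retry_count < max_retries:
    # Returns True when the task completes, False when wait_timeout expires.
    if transfer_client.task_wait(task_id, timeout=wait_timeout, polling_interval=10):
        break
    retry_count += 1  # log progress between waits, e.g. every couple of hours
```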