From e0b79e9dc2b2066b284b179bcff0d42252522924 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 2 Jan 2026 18:32:36 -0600 Subject: [PATCH 01/34] Initial refactor to delete tars properly --- zstash/create.py | 48 ++++--- zstash/globus.py | 262 ++++++++++++++++++++++-------------- zstash/hpss.py | 73 +++++----- zstash/hpss_utils.py | 11 +- zstash/transfer_tracking.py | 58 ++++++++ zstash/update.py | 57 ++++---- 6 files changed, 321 insertions(+), 188 deletions(-) create mode 100644 zstash/transfer_tracking.py diff --git a/zstash/create.py b/zstash/create.py index b502f1e6..3eb9e602 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -14,6 +14,7 @@ from .hpss import hpss_put from .hpss_utils import add_files from .settings import DEFAULT_CACHE, config, get_db_filename, logger +from .transfer_tracking import TransferManager from .utils import ( create_tars_table, get_files_to_archive, @@ -52,12 +53,13 @@ def create(): logger.error(input_path_error_str) raise NotADirectoryError(input_path_error_str) + transfer_manager: TransferManager = TransferManager() if hpss != "none": url = urlparse(hpss) if url.scheme == "globus": # identify globus endpoints logger.debug(f"{ts_utc()}:Calling globus_activate(hpss)") - globus_activate(hpss) + transfer_manager.globus_config = globus_activate(hpss) else: # config.hpss is not "none", so we need to # create target HPSS directory @@ -88,14 +90,21 @@ def create(): # Create and set up the database logger.debug(f"{ts_utc()}: Calling create_database()") - failures: List[str] = create_database(cache, args) + failures: List[str] = create_database(cache, args, transfer_manager) # Transfer to HPSS. Always keep a local copy. 
logger.debug(f"{ts_utc()}: calling hpss_put() for {get_db_filename(cache)}") - hpss_put(hpss, get_db_filename(cache), cache, keep=args.keep, is_index=True) + hpss_put( + hpss, + get_db_filename(cache), + cache, + keep=args.keep, + is_index=True, + transfer_manager=transfer_manager, + ) logger.debug(f"{ts_utc()}: calling globus_finalize()") - globus_finalize(non_blocking=args.non_blocking) + globus_finalize(transfer_manager, non_blocking=args.non_blocking) if len(failures) > 0: # List the failures @@ -204,7 +213,9 @@ def setup_create() -> Tuple[str, argparse.Namespace]: return cache, args -def create_database(cache: str, args: argparse.Namespace) -> List[str]: +def create_database( + cache: str, args: argparse.Namespace, transfer_manager: TransferManager +) -> List[str]: # Create new database logger.debug(f"{ts_utc()}:Creating index database") if os.path.exists(get_db_filename(cache)): @@ -263,26 +274,7 @@ def create_database(cache: str, args: argparse.Namespace) -> List[str]: files: List[str] = get_files_to_archive(cache, args.include, args.exclude) failures: List[str] - if args.follow_symlinks: - try: - # Add files to archive - failures = add_files( - cur, - con, - -1, - files, - cache, - args.keep, - args.follow_symlinks, - skip_tars_md5=args.no_tars_md5, - non_blocking=args.non_blocking, - error_on_duplicate_tar=args.error_on_duplicate_tar, - overwrite_duplicate_tars=args.overwrite_duplicate_tars, - force_database_corruption=args.for_developers_force_database_corruption, - ) - except FileNotFoundError: - raise Exception("Archive creation failed due to broken symlink.") - else: + try: # Add files to archive failures = add_files( cur, @@ -297,7 +289,13 @@ def create_database(cache: str, args: argparse.Namespace) -> List[str]: error_on_duplicate_tar=args.error_on_duplicate_tar, overwrite_duplicate_tars=args.overwrite_duplicate_tars, force_database_corruption=args.for_developers_force_database_corruption, + transfer_manager=transfer_manager, ) + except 
FileNotFoundError as e: + if args.follow_symlinks: + raise Exception("Archive creation failed due to broken symlink.") + else: + raise e # Close database con.commit() diff --git a/zstash/globus.py b/zstash/globus.py index 2cacad5f..0ff56731 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -3,9 +3,8 @@ import sys from typing import List, Optional -from globus_sdk import TransferAPIError, TransferClient, TransferData +from globus_sdk import TransferAPIError, TransferClient from globus_sdk.response import GlobusHTTPResponse -from globus_sdk.services.transfer.response.iterable import IterableTransferResponse from six.moves.urllib.parse import urlparse from .globus_utils import ( @@ -17,78 +16,85 @@ submit_transfer_with_checks, ) from .settings import logger +from .transfer_tracking import GlobusConfig, TransferBatch, TransferManager from .utils import ts_utc -remote_endpoint = None -local_endpoint = None -transfer_client: TransferClient = None -transfer_data: TransferData = None -task_id = None -archive_directory_listing: IterableTransferResponse = None - -def globus_activate(hpss: str): +def globus_activate( + hpss: str, globus_config: Optional[GlobusConfig] = None +) -> Optional[GlobusConfig]: """ Read the local globus endpoint UUID from ~/.zstash.ini. 
If the ini file does not exist, create an ini file with empty values, and try to find the local endpoint UUID based on the FQDN """ - global transfer_client - global local_endpoint - global remote_endpoint url = urlparse(hpss) if url.scheme != "globus": - return + return None + if globus_config is None: + globus_config = GlobusConfig() check_state_files() - remote_endpoint = url.netloc - local_endpoint = get_local_endpoint_id(local_endpoint) - if remote_endpoint.upper() in HPSS_ENDPOINT_MAP.keys(): - remote_endpoint = HPSS_ENDPOINT_MAP.get(remote_endpoint.upper()) - both_endpoints: List[Optional[str]] = [local_endpoint, remote_endpoint] - transfer_client = get_transfer_client_with_auth(both_endpoints) + globus_config.remote_endpoint = url.netloc + globus_config.local_endpoint = get_local_endpoint_id(globus_config.local_endpoint) + upper_remote_ep = globus_config.remote_endpoint.upper() + if upper_remote_ep in HPSS_ENDPOINT_MAP.keys(): + globus_config.remote_endpoint = HPSS_ENDPOINT_MAP[upper_remote_ep] + both_endpoints: List[Optional[str]] = [ + globus_config.local_endpoint, + globus_config.remote_endpoint, + ] + globus_config.transfer_client = get_transfer_client_with_auth(both_endpoints) for ep_id in both_endpoints: - r = transfer_client.endpoint_autoactivate(ep_id, if_expires_in=600) + r = globus_config.transfer_client.endpoint_autoactivate( + ep_id, if_expires_in=600 + ) if r.get("code") == "AutoActivationFailed": logger.error( f"The {ep_id} endpoint is not activated or the current activation expires soon. Please go to https://app.globus.org/file-manager/collections/{ep_id} and (re)activate the endpoint." 
) sys.exit(1) + return globus_config -def file_exists(name: str) -> bool: - +def file_exists(archive_directory_listing, name: str) -> bool: for entry in archive_directory_listing: if entry.get("name") == name: return True return False -global_variable_tarfiles_pushed = 0 - - # C901 'globus_transfer' is too complex (20) def globus_transfer( # noqa: C901 - remote_ep: str, remote_path: str, name: str, transfer_type: str, non_blocking: bool + transfer_manager: TransferManager, + remote_ep: str, + remote_path: str, + name: str, + transfer_type: str, + non_blocking: bool, ): - global transfer_data - global task_id - global archive_directory_listing - global global_variable_tarfiles_pushed logger.info(f"{ts_utc()}: Entered globus_transfer() for name = {name}") logger.debug(f"{ts_utc()}: non_blocking = {non_blocking}") - if not transfer_client: - globus_activate("globus://" + remote_ep) - if not transfer_client: + if (not transfer_manager.globus_config) or ( + not transfer_manager.globus_config.transfer_client + ): + transfer_manager.globus_config = globus_activate("globus://" + remote_ep) + if (not transfer_manager.globus_config) or ( + not transfer_manager.globus_config.transfer_client + ): sys.exit(1) if transfer_type == "get": - if not archive_directory_listing: - archive_directory_listing = transfer_client.operation_ls( - remote_endpoint, remote_path + if not transfer_manager.globus_config.archive_directory_listing: + transfer_manager.globus_config.archive_directory_listing = ( + transfer_manager.globus_config.transfer_client.operation_ls( + transfer_manager.globus_config.remote_endpoint, remote_path + ) ) - if not file_exists(name): + if not file_exists( + transfer_manager.globus_config.archive_directory_listing, name + ): logger.error( "Remote file globus://{}{}/{} does not exist".format( remote_ep, remote_path, name @@ -96,45 +102,45 @@ def globus_transfer( # noqa: C901 ) sys.exit(1) + mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() 
transfer_data = set_up_TransferData( transfer_type, - local_endpoint, # Global - remote_endpoint, # Global + transfer_manager.globus_config.local_endpoint, + transfer_manager.globus_config.remote_endpoint, remote_path, name, - transfer_client, # Global - transfer_data, # Global + transfer_manager.globus_config.transfer_client, + mrt.transfer_data if mrt else None, ) task: GlobusHTTPResponse try: - if task_id: - task = transfer_client.get_task(task_id) - prev_task_status = task["status"] + if mrt and mrt.task_id: + task = transfer_manager.globus_config.transfer_client.get_task(mrt.task_id) + mrt.task_status = task["status"] # one of {ACTIVE, SUCCEEDED, FAILED, CANCELED, PENDING, INACTIVE} # NOTE: How we behave here depends upon whether we want to support mutliple active transfers. # Presently, we do not, except inadvertantly (if status == PENDING) - if prev_task_status == "ACTIVE": + if mrt.task_status == "ACTIVE": logger.info( - f"{ts_utc()}: Previous task_id {task_id} Still Active. Returning ACTIVE." + f"{ts_utc()}: Previous task_id {mrt.task_id} Still Active. Returning ACTIVE." ) - return "ACTIVE" - elif prev_task_status == "SUCCEEDED": + # Don't return early - continue to submit the new transfer + # The previous transfer will complete asynchronously + elif mrt.task_status == "SUCCEEDED": logger.info( - f"{ts_utc()}: Previous task_id {task_id} status = SUCCEEDED." + f"{ts_utc()}: Previous task_id {mrt.task_id} status = SUCCEEDED." ) src_ep = task["source_endpoint_id"] dst_ep = task["destination_endpoint_id"] label = task["label"] ts = ts_utc() logger.info( - "{}:Globus transfer {}, from {} to {}: {} succeeded".format( - ts, task_id, src_ep, dst_ep, label - ) + f"{ts}:Globus transfer {mrt.task_id}, from {src_ep} to {dst_ep}: {label} succeeded" ) else: logger.error( - f"{ts_utc()}: Previous task_id {task_id} status = {prev_task_status}." + f"{ts_utc()}: Previous task_id {mrt.task_id} status = {mrt.task_status}." 
) # DEBUG: review accumulated items in TransferData @@ -142,15 +148,17 @@ def globus_transfer( # noqa: C901 attribs = transfer_data.__dict__ for item in attribs["data"]["DATA"]: if item["DATA_TYPE"] == "transfer_item": - global_variable_tarfiles_pushed += 1 + transfer_manager.cumulative_tarfiles_pushed += 1 print( - f" (routine) PUSHING (#{global_variable_tarfiles_pushed}) STORED source item: {item['source_path']}", + f" (routine) PUSHING (#{transfer_manager.cumulative_tarfiles_pushed}) STORED source item: {item['source_path']}", flush=True, ) - # SUBMIT new transfer here + # Submit the current transfer_data logger.info(f"{ts_utc()}: DIVING: Submit Transfer for {transfer_data['label']}") - task = submit_transfer_with_checks(transfer_client, transfer_data) + task = submit_transfer_with_checks( + transfer_manager.globus_config.transfer_client, transfer_data + ) task_id = task.get("task_id") # NOTE: This log message is misleading. If we have accumulated multiple tar files for transfer, # the "lable" given here refers only to the LAST tarfile in the TransferData list. @@ -158,7 +166,19 @@ def globus_transfer( # noqa: C901 f"{ts_utc()}: SURFACE Submit Transfer returned new task_id = {task_id} for label {transfer_data['label']}" ) + # Create new transfer record with the task info + new_transfer = TransferBatch() + new_transfer.file_paths = [] # TODO: is this right? + new_transfer.task_id = task_id + new_transfer.task_status = "UNKNOWN" + new_transfer.is_globus = True + new_transfer.transfer_data = ( + None # This batch was submitted # TODO: is this right? + ) + transfer_manager.batches.append(new_transfer) + # Nullify the submitted transfer data structure so that a new one will be created on next call. 
+ # Reset for building the next batch transfer_data = None except TransferAPIError as e: if e.code == "NoCredException": @@ -174,26 +194,36 @@ def globus_transfer( # noqa: C901 logger.error("Exception: {}".format(e)) sys.exit(1) + new_mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() # test for blocking on new task_id task_status = "UNKNOWN" - if not non_blocking: - task_status = globus_block_wait( - task_id=task_id, wait_timeout=7200, polling_interval=10, max_retries=5 - ) - else: - logger.info(f"{ts_utc()}: NO BLOCKING (task_wait) for task_id {task_id}") + if new_mrt and new_mrt.task_id: + if not non_blocking: + new_mrt.task_status = globus_block_wait( + transfer_manager.globus_config.transfer_client, + task_id=new_mrt.task_id, + wait_timeout=7200, + max_retries=5, + ) + else: + logger.info( + f"{ts_utc()}: NO BLOCKING (task_wait) for task_id {new_mrt.task_id}" + ) if transfer_type == "put": return task_status if transfer_type == "get" and task_id: - globus_wait(task_id) + globus_wait(transfer_manager.globus_config.transfer_client, task_id) return task_status def globus_block_wait( - task_id: str, wait_timeout: int, polling_interval: int, max_retries: int + transfer_client: TransferClient, + task_id: str, + wait_timeout: int, + max_retries: int, ): # poll every "polling_interval" seconds to speed up small transfers. 
Report every 2 hours, stop waiting aftert 5*2 = 10 hours @@ -238,8 +268,7 @@ def globus_block_wait( return task_status -def globus_wait(task_id: str): - +def globus_wait(transfer_client: TransferClient, task_id: str): try: """ A Globus transfer job (task) can be in one of the three states: @@ -281,44 +310,73 @@ def globus_wait(task_id: str): sys.exit(1) -def globus_finalize(non_blocking: bool = False): - global global_variable_tarfiles_pushed +def globus_finalize(transfer_manager: TransferManager, non_blocking: bool = False): last_task_id = None - if transfer_data: - # DEBUG: review accumulated items in TransferData - logger.info(f"{ts_utc()}: FINAL TransferData: accumulated items:") - attribs = transfer_data.__dict__ - for item in attribs["data"]["DATA"]: - if item["DATA_TYPE"] == "transfer_item": - global_variable_tarfiles_pushed += 1 - print( - f" (finalize) PUSHING ({global_variable_tarfiles_pushed}) source item: {item['source_path']}", - flush=True, - ) + if transfer_manager.globus_config is None: + logger.warning("No GlobusConfig object provided for finalization") + return - # SUBMIT new transfer here - logger.info(f"{ts_utc()}: DIVING: Submit Transfer for {transfer_data['label']}") - try: - last_task = submit_transfer_with_checks(transfer_client, transfer_data) - last_task_id = last_task.get("task_id") - except TransferAPIError as e: - if e.code == "NoCredException": - logger.error( - "{}. 
Please go to https://app.globus.org/endpoints and activate the endpoint.".format( - e.message + transfer: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() + if transfer: + # Check if there's any pending transfer data that hasn't been submitted yet + if transfer.transfer_data: + # DEBUG: review accumulated items in TransferData + logger.info(f"{ts_utc()}: FINAL TransferData: accumulated items:") + attribs = transfer.transfer_data.__dict__ + for item in attribs["data"]["DATA"]: + if item["DATA_TYPE"] == "transfer_item": + transfer_manager.cumulative_tarfiles_pushed += 1 + print( + f" (finalize) PUSHING ({transfer_manager.cumulative_tarfiles_pushed}) source item: {item['source_path']}", + flush=True, ) + + # SUBMIT new transfer here + logger.info( + f"{ts_utc()}: DIVING: Submit Transfer for {transfer.transfer_data['label']}" + ) + try: + last_task = submit_transfer_with_checks( + transfer_manager.globus_config.transfer_client, + transfer.transfer_data, ) - else: - logger.error(e) - sys.exit(1) - except Exception as e: - logger.error("Exception: {}".format(e)) - sys.exit(1) + last_task_id = last_task.get("task_id") + except TransferAPIError as e: + if e.code == "NoCredException": + logger.error( + "{}. 
Please go to https://app.globus.org/endpoints and activate the endpoint.".format( + e.message + ) + ) + else: + logger.error(e) + sys.exit(1) + except Exception as e: + logger.error("Exception: {}".format(e)) + sys.exit(1) + + # Wait for any submitted transfers to complete + # In non-blocking mode, this ensures index.db and any accumulated tar files complete + # In blocking mode, this is redundant but harmless + skip_last_wait: bool = False + if transfer and transfer.task_id: + if transfer.task_id == last_task_id: + skip_last_wait = ( + True # No reason to call globus_wait twice on the same task_id + ) + logger.info( + f"{ts_utc()}: Waiting for transfer task_id={transfer.task_id} to complete" + ) + globus_wait( + transfer_manager.globus_config.transfer_client, transfer.task_id + ) + if last_task_id and (not skip_last_wait): + logger.info( + f"{ts_utc()}: Waiting for last transfer task_id={last_task_id} to complete" + ) + globus_wait(transfer_manager.globus_config.transfer_client, last_task_id) - if not non_blocking: - if task_id: - globus_wait(task_id) - if last_task_id: - globus_wait(last_task_id) + # Clean up tar files that were queued for deletion + transfer_manager.delete_successfully_transferred_files() diff --git a/zstash/hpss.py b/zstash/hpss.py index 24603388..ef41210b 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -2,12 +2,13 @@ import os.path import subprocess -from typing import List +from typing import List, Optional from six.moves.urllib.parse import urlparse from .globus import globus_transfer from .settings import get_db_filename, logger +from .transfer_tracking import GlobusConfig, TransferBatch, TransferManager from .utils import run_command, ts_utc prev_transfers: List[str] = list() @@ -22,16 +23,10 @@ def hpss_transfer( keep: bool = False, non_blocking: bool = False, is_index: bool = False, + transfer_manager: Optional[TransferManager] = None, ): - global prev_transfers - global curr_transfers - - logger.info( - f"{ts_utc()}: in 
hpss_transfer, prev_transfers is starting as {prev_transfers}" - ) - # logger.debug( - # f"{ts_utc()}: in hpss_transfer, curr_transfers is starting as {curr_transfers}" - # ) + if not transfer_manager: + transfer_manager = TransferManager() if hpss == "none": logger.info("{}: HPSS is unavailable".format(transfer_type)) @@ -85,7 +80,8 @@ def hpss_transfer( endpoint = url.netloc url_path = url.path - curr_transfers.append(file_path) + # Append to curr_transfers: + transfer_manager.batches[-1].file_paths.append(file_path) # logger.debug( # f"{ts_utc()}: curr_transfers has been appended to, is now {curr_transfers}" # ) @@ -104,16 +100,21 @@ def hpss_transfer( # For `get`, this directory is where the file we get from HPSS will go. os.chdir(path) + globus_status: str = "UNKNOWN" if scheme == "globus": - globus_status = "UNKNOWN" + if not transfer_manager.globus_config: + raise RuntimeError("Scheme is 'globus' but no GlobusConfig provided") # Transfer file using the Globus Transfer Service logger.info(f"{ts_utc()}: DIVING: hpss calls globus_transfer(name={name})") - globus_status = globus_transfer( - endpoint, url_path, name, transfer_type, non_blocking - ) - logger.info( - f"{ts_utc()}: SURFACE hpss globus_transfer(name={name}) returns {globus_status}" + globus_transfer( + transfer_manager, endpoint, url_path, name, transfer_type, non_blocking ) + mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() + if mrt and mrt.task_status: + globus_status = mrt.task_status + logger.info( + f"{ts_utc()}: SURFACE hpss globus_transfer(name={name}) returns {globus_status}" + ) # NOTE: Here, the status could be "EXHAUSTED_TIMEOUT_RETRIES", meaning a very long transfer # or perhaps transfer is hanging. We should decide whether to ignore it, or cancel it, but # we'd need the task_id to issue a cancellation. 
Perhaps we should have globus_transfer @@ -129,20 +130,10 @@ def hpss_transfer( os.chdir(cwd) if transfer_type == "put": - if not keep: - if (scheme != "globus") or (globus_status == "SUCCEEDED"): - # Note: This is intended to fulfill the default removal of successfully-transfered - # tar files when keep=False, irrespective of non-blocking status - logger.debug( - f"{ts_utc()}: deleting transfered files {prev_transfers}" - ) - for src_path in prev_transfers: - os.remove(src_path) - prev_transfers = curr_transfers - curr_transfers = list() - logger.info( - f"{ts_utc()}: prev_transfers has been set to {prev_transfers}" - ) + if (not keep) and (not is_index): + # We never delete if `--keep` is set. + # We also never delete `index.db`. + transfer_manager.delete_successfully_transferred_files() def hpss_put( @@ -152,18 +143,34 @@ def hpss_put( keep: bool = True, non_blocking: bool = False, is_index=False, + transfer_manager: Optional[TransferManager] = None, ): """ Put a file to the HPSS archive. """ - hpss_transfer(hpss, file_path, "put", cache, keep, non_blocking, is_index) + hpss_transfer( + hpss, + file_path, + "put", + cache, + keep, + non_blocking, + is_index, + transfer_manager, + ) def hpss_get(hpss: str, file_path: str, cache: str): """ Get a file from the HPSS archive. 
""" - hpss_transfer(hpss, file_path, "get", cache, False) + url = urlparse(hpss) + transfer_manager: TransferManager = TransferManager() + if url.scheme == "globus": + transfer_manager.globus_config = GlobusConfig() + hpss_transfer( + hpss, file_path, "get", cache, False, transfer_manager=transfer_manager + ) def hpss_chgrp(hpss: str, group: str, recurse: bool = False): diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 2f1158bc..76326d9b 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -13,6 +13,7 @@ from .hpss import hpss_put from .settings import TupleFilesRowNoId, TupleTarsRowNoId, config, logger +from .transfer_tracking import TransferManager from .utils import create_tars_table, tars_table_exists, ts_utc @@ -66,6 +67,7 @@ def add_files( error_on_duplicate_tar: bool = False, overwrite_duplicate_tars: bool = False, force_database_corruption: str = "", + transfer_manager: Optional[TransferManager] = None, ) -> List[str]: # Now, perform the actual archiving @@ -160,7 +162,14 @@ def add_files( logger.info( f"{ts_utc()}: DIVING: (add_files): Calling hpss_put to dispatch archive file {tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" ) - hpss_put(hpss, os.path.join(cache, tfname), cache, keep, non_blocking) + hpss_put( + hpss, + os.path.join(cache, tfname), + cache, + keep, + non_blocking, + transfer_manager, + ) logger.info( f"{ts_utc()}: SURFACE (add_files): Called hpss_put to dispatch archive file {tfname}" ) diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py new file mode 100644 index 00000000..f7d6b5c5 --- /dev/null +++ b/zstash/transfer_tracking.py @@ -0,0 +1,58 @@ +import os +from typing import List, Optional + +from globus_sdk import TransferClient, TransferData +from globus_sdk.services.transfer.response.iterable import IterableTransferResponse + + +class GlobusConfig: + """Globus connection configuration""" + + def __init__(self): + self.remote_endpoint: str + self.local_endpoint: str + 
self.transfer_client: TransferClient + self.archive_directory_listing: Optional[IterableTransferResponse] = None + + +class TransferBatch: + """Represents one batch of files being transferred""" + + def __init__(self): + self.file_paths: List[str] = [] + self.task_id: Optional[str] = None + self.task_status: Optional[str] = None + self.is_globus: bool = False + self.transfer_data: Optional[TransferData] = None # Only for Globus + + def delete_files(self): + for src_path in self.file_paths: + if os.path.exists(src_path): + os.remove(src_path) + + +class TransferManager: + def __init__(self): + # All transfer batches (Globus or HPSS) + self.batches: List[TransferBatch] = [] + self.cumulative_tarfiles_pushed: int = 0 + + # Connection state (Globus-specific, None if not using Globus) + self.globus_config: Optional[GlobusConfig] = None + + def get_most_recent_transfer(self) -> Optional[TransferBatch]: + return self.batches[-1] if self.batches else None + + def delete_successfully_transferred_files(self): + new_batch_list: List[TransferBatch] = [] + for batch in self.batches: + if (not batch.is_globus) or (batch.task_status == "SUCCEEDED"): + # The files were transferred successfully. + # So, we can delete them. + batch.delete_files() + else: + new_batch_list.append(batch) + # We don't need to keep tracking batches that have been both: + # 1. transferred successfully + # 2. 
been deleted + self.batches = new_batch_list diff --git a/zstash/update.py b/zstash/update.py index ebc4f84c..a20a912c 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -11,7 +11,16 @@ from .globus import globus_activate, globus_finalize from .hpss import hpss_get, hpss_put from .hpss_utils import add_files -from .settings import DEFAULT_CACHE, TIME_TOL, config, get_db_filename, logger +from .settings import ( + DEFAULT_CACHE, + TIME_TOL, + FilesRow, + TupleFilesRow, + config, + get_db_filename, + logger, +) +from .transfer_tracking import TransferManager from .utils import get_files_to_archive_with_stats, update_config @@ -21,7 +30,8 @@ def update(): cache: str args, cache = setup_update() - result: Optional[List[str]] = update_database(args, cache) + transfer_manager = TransferManager() + result: Optional[List[str]] = update_database(args, cache, transfer_manager) if result is None: # There was either nothing to update or `--dry-run` was set. @@ -34,9 +44,16 @@ def update(): hpss = config.hpss else: raise TypeError("Invalid config.hpss={}".format(config.hpss)) - hpss_put(hpss, get_db_filename(cache), cache, keep=args.keep, is_index=True) + hpss_put( + hpss, + get_db_filename(cache), + cache, + keep=args.keep, + is_index=True, + transfer_manager=transfer_manager, + ) - globus_finalize(non_blocking=args.non_blocking) + globus_finalize(transfer_manager, non_blocking=args.non_blocking) # List failures if len(failures) > 0: @@ -138,7 +155,7 @@ def setup_update() -> Tuple[argparse.Namespace, str]: # C901 'update_database' is too complex (20) def update_database( # noqa: C901 - args: argparse.Namespace, cache: str + args: argparse.Namespace, cache: str, transfer_manager: TransferManager ) -> Optional[List[str]]: # Open database logger.debug("Opening index database") @@ -151,7 +168,7 @@ def update_database( # noqa: C901 hpss: str = config.hpss else: raise TypeError("Invalid config.hpss={}".format(config.hpss)) - globus_activate(hpss) + 
transfer_manager.globus_config = globus_activate(hpss) hpss_get(hpss, get_db_filename(cache), cache) else: error_str: str = ( @@ -266,27 +283,7 @@ def update_database( # noqa: C901 for tfile in tfiles: tfile_string: str = tfile[0] itar = max(itar, int(tfile_string[0:6], 16)) - - # Add files - failures: List[str] - if args.follow_symlinks: - try: - # Add files - failures = add_files( - cur, - con, - itar, - newfiles, - cache, - keep, - args.follow_symlinks, - non_blocking=args.non_blocking, - error_on_duplicate_tar=args.error_on_duplicate_tar, - overwrite_duplicate_tars=args.overwrite_duplicate_tars, - ) - except FileNotFoundError: - raise Exception("Archive update failed due to broken symlink.") - else: + try: # Add files failures = add_files( cur, @@ -299,7 +296,13 @@ def update_database( # noqa: C901 non_blocking=args.non_blocking, error_on_duplicate_tar=args.error_on_duplicate_tar, overwrite_duplicate_tars=args.overwrite_duplicate_tars, + transfer_manager=transfer_manager, ) + except FileNotFoundError as e: + if args.follow_symlinks: + raise Exception("Archive update failed due to broken symlink.") + else: + raise e # Close database con.commit() From 8a4e74e65e23c1aa30c4d216b18920b4a085a2d6 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 2 Jan 2026 18:47:19 -0600 Subject: [PATCH 02/34] Fixes from Claude --- zstash/globus.py | 7 +++---- zstash/hpss.py | 33 +++++++++++++++++++++------------ zstash/transfer_tracking.py | 22 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index 0ff56731..cec7a5c5 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -168,13 +168,12 @@ def globus_transfer( # noqa: C901 # Create new transfer record with the task info new_transfer = TransferBatch() - new_transfer.file_paths = [] # TODO: is this right? 
+ new_transfer.file_paths = ( + mrt.file_paths if mrt else [] + ) # Copy from the batch being submitted new_transfer.task_id = task_id new_transfer.task_status = "UNKNOWN" new_transfer.is_globus = True - new_transfer.transfer_data = ( - None # This batch was submitted # TODO: is this right? - ) transfer_manager.batches.append(new_transfer) # Nullify the submitted transfer data structure so that a new one will be created on next call. diff --git a/zstash/hpss.py b/zstash/hpss.py index ef41210b..dc47c5e3 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -28,6 +28,19 @@ def hpss_transfer( if not transfer_manager: transfer_manager = TransferManager() + url = urlparse(hpss) + scheme = url.scheme + + # Create a new batch if needed (before we start adding files) + if not transfer_manager.batches or transfer_manager.batches[-1].task_id: + # Either no batches exist, or the last batch was already submitted + new_batch = TransferBatch() + new_batch.is_globus = scheme == "globus" + transfer_manager.batches.append(new_batch) + logger.debug( + f"{ts_utc()}: Created new TransferBatch, total batches: {len(transfer_manager.batches)}" + ) + if hpss == "none": logger.info("{}: HPSS is unavailable".format(transfer_type)) if transfer_type == "put" and file_path != get_db_filename(cache): @@ -70,22 +83,18 @@ def hpss_transfer( else: raise ValueError("Invalid transfer_type={}".format(transfer_type)) logger.info("Transferring file {} HPSS: {}".format(transfer_word, file_path)) - scheme: str - endpoint: str - path: str - name: str - url = urlparse(hpss) - scheme = url.scheme - endpoint = url.netloc + endpoint: str = url.netloc url_path = url.path + path: str + name: str + path, name = os.path.split(file_path) - # Append to curr_transfers: + # Add this file to the current batch transfer_manager.batches[-1].file_paths.append(file_path) - # logger.debug( - # f"{ts_utc()}: curr_transfers has been appended to, is now {curr_transfers}" - # ) - path, name = os.path.split(file_path) + 
logger.debug( + f"{ts_utc()}: Added {file_path} to current batch, batch now has {len(transfer_manager.batches[-1].file_paths)} files" + ) # Need to be in local directory for `hsi` to work cwd = os.getcwd() diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index f7d6b5c5..5434039d 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -4,6 +4,9 @@ from globus_sdk import TransferClient, TransferData from globus_sdk.services.transfer.response.iterable import IterableTransferResponse +from .settings import logger +from .utils import ts_utc + class GlobusConfig: """Globus connection configuration""" @@ -44,13 +47,32 @@ def get_most_recent_transfer(self) -> Optional[TransferBatch]: return self.batches[-1] if self.batches else None def delete_successfully_transferred_files(self): + """Check transfer status and delete files from successful transfers""" new_batch_list: List[TransferBatch] = [] for batch in self.batches: + # Check if this is a Globus transfer that needs status update + if batch.is_globus and batch.task_id and (batch.task_status != "SUCCEEDED"): + if self.globus_config and self.globus_config.transfer_client: + # Non-blocking status check + logger.debug( + f"{ts_utc()}: Checking status of task_id={batch.task_id}" + ) + task = self.globus_config.transfer_client.get_task(batch.task_id) + batch.task_status = task["status"] + logger.debug( + f"{ts_utc()}: task_id={batch.task_id} status={batch.task_status}" + ) + + # Now delete if successful if (not batch.is_globus) or (batch.task_status == "SUCCEEDED"): # The files were transferred successfully. # So, we can delete them. + logger.info( + f"{ts_utc()}: Deleting {len(batch.file_paths)} files from successful transfer" + ) batch.delete_files() else: + # Keep tracking this batch - not yet successful new_batch_list.append(batch) # We don't need to keep tracking batches that have been both: # 1. 
transferred successfully From 43963ee0113651d9406da99b72c5f4593845f4eb Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 5 Jan 2026 19:00:24 -0600 Subject: [PATCH 03/34] Further fixes --- .../test_globus_tar_deletion.bash | 3 +- zstash/globus.py | 17 ++++------ zstash/hpss.py | 14 ++++---- zstash/hpss_utils.py | 3 +- zstash/transfer_tracking.py | 32 +++++++++++-------- 5 files changed, 37 insertions(+), 32 deletions(-) diff --git a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash index 4b76f4c5..ed75a69d 100755 --- a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash +++ b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash @@ -137,7 +137,8 @@ test_globus_tar_deletion() keep_flag="" fi - zstash create ${blocking_flag} ${keep_flag} --hpss=${globus_path}/${case_name} --maxsize 128 zstash_demo 2>&1 | tee ${case_name}.log + # Use -v so debug logs show up. + zstash create ${blocking_flag} ${keep_flag} --hpss=${globus_path}/${case_name} --maxsize 128 -v zstash_demo 2>&1 | tee ${case_name}.log if [ $? != 0 ]; then echo "${case_name} failed. Check ${case_name}_create.log for details. Cannot continue." 
return 1 diff --git a/zstash/globus.py b/zstash/globus.py index cec7a5c5..d74965c5 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -166,18 +166,15 @@ def globus_transfer( # noqa: C901 f"{ts_utc()}: SURFACE Submit Transfer returned new task_id = {task_id} for label {transfer_data['label']}" ) - # Create new transfer record with the task info - new_transfer = TransferBatch() - new_transfer.file_paths = ( - mrt.file_paths if mrt else [] - ) # Copy from the batch being submitted - new_transfer.task_id = task_id - new_transfer.task_status = "UNKNOWN" - new_transfer.is_globus = True - transfer_manager.batches.append(new_transfer) + # Update the current batch with the task info + # The batch was already created in hpss_transfer with files added to it + # We just need to mark it as submitted + if transfer_manager.batches: + transfer_manager.batches[-1].task_id = task_id + transfer_manager.batches[-1].task_status = "UNKNOWN" + transfer_manager.batches[-1].transfer_data = None # Was just submitted # Nullify the submitted transfer data structure so that a new one will be created on next call. 
- # Reset for building the next batch transfer_data = None except TransferAPIError as e: if e.code == "NoCredException": diff --git a/zstash/hpss.py b/zstash/hpss.py index dc47c5e3..683a01b4 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -91,10 +91,11 @@ def hpss_transfer( path, name = os.path.split(file_path) # Add this file to the current batch - transfer_manager.batches[-1].file_paths.append(file_path) - logger.debug( - f"{ts_utc()}: Added {file_path} to current batch, batch now has {len(transfer_manager.batches[-1].file_paths)} files" - ) + if (not keep) and (not is_index): # Never track index.db for deletion + transfer_manager.batches[-1].file_paths.append(file_path) + logger.debug( + f"{ts_utc()}: Added {file_path} to current batch, batch now has {len(transfer_manager.batches[-1].file_paths)} files" + ) # Need to be in local directory for `hsi` to work cwd = os.getcwd() @@ -112,7 +113,7 @@ def hpss_transfer( globus_status: str = "UNKNOWN" if scheme == "globus": if not transfer_manager.globus_config: - raise RuntimeError("Scheme is 'globus' but no GlobusConfig provided") + transfer_manager.globus_config = GlobusConfig() # Transfer file using the Globus Transfer Service logger.info(f"{ts_utc()}: DIVING: hpss calls globus_transfer(name={name})") globus_transfer( @@ -139,9 +140,8 @@ def hpss_transfer( os.chdir(cwd) if transfer_type == "put": - if (not keep) and (not is_index): + if not keep: # We never delete if `--keep` is set. - # We also never delete `index.db`. 
transfer_manager.delete_successfully_transferred_files() diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 76326d9b..573640c9 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -168,7 +168,8 @@ def add_files( cache, keep, non_blocking, - transfer_manager, + is_index=False, + transfer_manager=transfer_manager, ) logger.info( f"{ts_utc()}: SURFACE (add_files): Called hpss_put to dispatch archive file {tfname}" diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index 5434039d..47e1ee61 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -12,9 +12,9 @@ class GlobusConfig: """Globus connection configuration""" def __init__(self): - self.remote_endpoint: str - self.local_endpoint: str - self.transfer_client: TransferClient + self.remote_endpoint: Optional[str] = None + self.local_endpoint: Optional[str] = None + self.transfer_client: Optional[TransferClient] = None self.archive_directory_listing: Optional[IterableTransferResponse] = None @@ -48,10 +48,18 @@ def get_most_recent_transfer(self) -> Optional[TransferBatch]: def delete_successfully_transferred_files(self): """Check transfer status and delete files from successful transfers""" - new_batch_list: List[TransferBatch] = [] + logger.info( + f"{ts_utc()}: Checking for successfully transferred files to delete" + ) for batch in self.batches: + # Skip if already processed + if not batch.file_paths: + logger.debug(f"{ts_utc()}: batch was already processed, skipping") + continue + # Check if this is a Globus transfer that needs status update if batch.is_globus and batch.task_id and (batch.task_status != "SUCCEEDED"): + logger.debug(f"{ts_utc()}: batch is globus AND is not yet successful") if self.globus_config and self.globus_config.transfer_client: # Non-blocking status check logger.debug( @@ -62,19 +70,17 @@ def delete_successfully_transferred_files(self): logger.debug( f"{ts_utc()}: task_id={batch.task_id} status={batch.task_status}" ) + else: + 
logger.debug( + f"{ts_utc()}: globus_config is not set up with a transfer client" + ) # Now delete if successful if (not batch.is_globus) or (batch.task_status == "SUCCEEDED"): - # The files were transferred successfully. - # So, we can delete them. + # The files were transferred successfully, so delete them logger.info( f"{ts_utc()}: Deleting {len(batch.file_paths)} files from successful transfer" ) batch.delete_files() - else: - # Keep tracking this batch - not yet successful - new_batch_list.append(batch) - # We don't need to keep tracking batches that have been both: - # 1. transferred successfully - # 2. been deleted - self.batches = new_batch_list + logger.debug("Deletion completed") + batch.file_paths = [] # Mark as processed From 25b606bf0f3b1854cf75a11904c180261341204d Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 6 Jan 2026 13:16:36 -0600 Subject: [PATCH 04/34] Add progressive deletion tests --- .../test_globus_tar_deletion.bash | 121 +++++++++++++++++- 1 file changed, 117 insertions(+), 4 deletions(-) diff --git a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash index ed75a69d..8de806e6 100755 --- a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash +++ b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash @@ -147,7 +147,7 @@ test_globus_tar_deletion() check_log_has "Creating new tar archive 000000.tar" ${case_name}.log || return 2 echo "" - echo "Checking directory status after 'zstash create' has completed. src should only have index.db. dst should have tar and index.db." + echo "Checking directory status after 'zstash create' has completed." 
echo "Checking logs in current directory: ${PWD}" echo "" @@ -182,6 +182,98 @@ test_globus_tar_deletion() return 0 # Success } +test_globus_progressive_deletion() +{ + local path_to_repo=$1 + local dst_endpoint=$2 + local dst_dir=$3 + local blocking_str=$4 + + src_dir=${path_to_repo}/tests/utils/globus_tar_deletion + rm -rf ${src_dir} + mkdir -p ${src_dir} + dst_endpoint_uuid=$(get_endpoint ${dst_endpoint}) + globus_path=globus://${dst_endpoint_uuid}/${dst_dir} + + case_name=${blocking_str}_progressive_deletion + echo "Running test_globus_progressive_deletion on case=${case_name}" + echo "Exit codes: 0 -- success, 1 -- zstash failed, 2 -- grep check failed" + + setup ${case_name} "${src_dir}" + + # Create files totaling >2 GB to trigger multiple tars with maxsize=1 GB + # Each file is ~700 MB, so we'll get 3 tars + echo "Creating large test files (this may take a minute)..." + dd if=/dev/zero of=zstash_demo/file1.dat bs=1M count=700 2>/dev/null # 700 MB + dd if=/dev/zero of=zstash_demo/file2.dat bs=1M count=700 2>/dev/null # 700 MB + dd if=/dev/zero of=zstash_demo/file3.dat bs=1M count=700 2>/dev/null # 700 MB + echo "✓ Test files created" + + if [ "$blocking_str" == "non-blocking" ]; then + blocking_flag="--non-blocking" + else + blocking_flag="" + fi + + # Run with maxsize=1 GB to create multiple tars + echo "Running zstash create (this may take several minutes due to file size and transfers)..." + zstash create ${blocking_flag} --hpss=${globus_path}/${case_name} --maxsize 1 -v zstash_demo 2>&1 | tee ${case_name}.log + if [ $? != 0 ]; then + echo "${case_name} failed." 
+ return 1 + fi + + # Check that multiple tar files were created + tar_count=$(grep -c "Creating new tar archive" ${case_name}.log) + if [ ${tar_count} -lt 2 ]; then + echo "Expected at least 2 tar archives to be created, found ${tar_count}" + return 2 + fi + echo "✓ Created ${tar_count} tar archives" + + # Check that files were deleted progressively + deletion_count=$(grep -c "Deleting .* files from successful transfer" ${case_name}.log) + + if [ "$blocking_str" == "blocking" ]; then + # In blocking mode, we should see deletion after each tar transfer + if [ ${deletion_count} -lt $((tar_count - 1)) ]; then + echo "Expected at least $((tar_count - 1)) deletion events in blocking mode, found ${deletion_count}" + return 2 + fi + echo "✓ Files deleted progressively (${deletion_count} deletion events)" + else + # In non-blocking mode, deletions happen when we check status + if [ ${deletion_count} -lt 1 ]; then + echo "Expected at least 1 deletion event in non-blocking mode, found ${deletion_count}" + return 2 + fi + echo "✓ Files deleted (${deletion_count} deletion events in non-blocking mode)" + fi + + # Verify that NO tar files remain in source after completion + echo "Checking that no tar files remain in source" + ls ${src_dir}/${case_name}/zstash_demo/zstash/*.tar 2>&1 | tee ls_tar_check.log + if grep -q "\.tar" ls_tar_check.log && ! grep -q "No such file" ls_tar_check.log; then + echo "Found tar files that should have been deleted!" 
+ return 2 + fi + echo "✓ All tar files successfully deleted from source" + + # Verify tar files exist in destination + if [ "$blocking_str" == "non-blocking" ]; then + wait_for_directory "${dst_dir}/${case_name}" || return 1 + fi + + dst_tar_count=$(ls ${dst_dir}/${case_name}/*.tar 2>/dev/null | wc -l) + if [ ${dst_tar_count} -ne ${tar_count} ]; then + echo "Expected ${tar_count} tar files in destination, found ${dst_tar_count}" + return 2 + fi + echo "✓ All ${tar_count} tar files present in destination" + + return 0 +} + # Follow these directions ##################################################### # Example usage: @@ -233,7 +325,14 @@ run_test_with_tracking() { echo "Running: ${test_name}" echo "==========================================" - if test_globus_tar_deletion "${args[@]}"; then + # Determine which test function to call based on test name + if [[ "${test_name}" == *"progressive"* ]]; then + test_func=test_globus_progressive_deletion + else + test_func=test_globus_tar_deletion + fi + + if ${test_func} "${args[@]}"; then # Print test result in the output block AND at the end echo "✓ ${test_name} PASSED" test_results+=("✓ ${test_name} PASSED") # Uses Global variable @@ -253,15 +352,29 @@ tests_passed=0 tests_failed=0 test_results=() # Global variable to hold test results -echo "Primary tests: single authentication code tests for each endpoint" +echo "Primary tests: basic functionality tests" echo "If a test hangs, check if https://app.globus.org/activity reports any errors on your transfers." -# Run all tests independently +# Run basic tests +# These check that AT THE END of the run, +# we either still have the files (keep) or the files are deleted (non-keep). 
run_test_with_tracking "blocking_non-keep" ${path_to_repo} ${endpoint_str} ${machine_dst_dir} "blocking" "non-keep" || true run_test_with_tracking "non-blocking_non-keep" ${path_to_repo} ${endpoint_str} ${machine_dst_dir} "non-blocking" "non-keep" || true run_test_with_tracking "blocking_keep" ${path_to_repo} ${endpoint_str} ${machine_dst_dir} "blocking" "keep" || true run_test_with_tracking "non-blocking_keep" ${path_to_repo} ${endpoint_str} ${machine_dst_dir} "non-blocking" "keep" || true +echo "" +echo "Progressive deletion tests: verify files are deleted as transfers complete" +echo "WARNING: These tests create ~2GB of data and will take several minutes" + +# Run progressive deletion tests +# Thes check that DURING the run, +# files are deleted after successful transfers (non-keep only). +# Blocking -- get files, transfer files, delete at src, start next transfer. +# Non-blocking -- get files, transfer files, get next set of files, transfer those files, check if previous transfer is done (and if so, delete at src). 
+run_test_with_tracking "blocking_progressive_deletion" ${path_to_repo} ${endpoint_str} ${machine_dst_dir} "blocking" || true +run_test_with_tracking "non-blocking_progressive_deletion" ${path_to_repo} ${endpoint_str} ${machine_dst_dir} "non-blocking" || true + # Print summary echo "" echo "==========================================" From ed3248d17a6209dcd1a3467d6853478d4312e6b8 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 4 Mar 2026 15:29:59 -0600 Subject: [PATCH 05/34] Pre-commit fixes --- zstash/update.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/zstash/update.py b/zstash/update.py index a20a912c..a151a0bc 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -11,15 +11,7 @@ from .globus import globus_activate, globus_finalize from .hpss import hpss_get, hpss_put from .hpss_utils import add_files -from .settings import ( - DEFAULT_CACHE, - TIME_TOL, - FilesRow, - TupleFilesRow, - config, - get_db_filename, - logger, -) +from .settings import DEFAULT_CACHE, TIME_TOL, config, get_db_filename, logger from .transfer_tracking import TransferManager from .utils import get_files_to_archive_with_stats, update_config From 864abe249817475a80856b65bc14afb668c28a9b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 4 Mar 2026 19:27:38 -0600 Subject: [PATCH 06/34] Address code review comments --- zstash/create.py | 6 +++--- zstash/globus.py | 5 +++-- zstash/hpss.py | 3 --- zstash/update.py | 6 +++--- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index 3eb9e602..4671a2b9 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -104,7 +104,7 @@ def create(): ) logger.debug(f"{ts_utc()}: calling globus_finalize()") - globus_finalize(transfer_manager, non_blocking=args.non_blocking) + globus_finalize(transfer_manager) if len(failures) > 0: # List the failures @@ -291,11 +291,11 @@ def create_database( force_database_corruption=args.for_developers_force_database_corruption, 
transfer_manager=transfer_manager, ) - except FileNotFoundError as e: + except FileNotFoundError: if args.follow_symlinks: raise Exception("Archive creation failed due to broken symlink.") else: - raise e + raise # Close database con.commit() diff --git a/zstash/globus.py b/zstash/globus.py index d74965c5..da87369d 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -72,7 +72,7 @@ def globus_transfer( # noqa: C901 name: str, transfer_type: str, non_blocking: bool, -): +) -> str: logger.info(f"{ts_utc()}: Entered globus_transfer() for name = {name}") logger.debug(f"{ts_utc()}: non_blocking = {non_blocking}") @@ -201,6 +201,7 @@ def globus_transfer( # noqa: C901 wait_timeout=7200, max_retries=5, ) + task_status = new_mrt.task_status else: logger.info( f"{ts_utc()}: NO BLOCKING (task_wait) for task_id {new_mrt.task_id}" @@ -306,7 +307,7 @@ def globus_wait(transfer_client: TransferClient, task_id: str): sys.exit(1) -def globus_finalize(transfer_manager: TransferManager, non_blocking: bool = False): +def globus_finalize(transfer_manager: TransferManager): last_task_id = None diff --git a/zstash/hpss.py b/zstash/hpss.py index 683a01b4..3806ed08 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -11,9 +11,6 @@ from .transfer_tracking import GlobusConfig, TransferBatch, TransferManager from .utils import run_command, ts_utc -prev_transfers: List[str] = list() -curr_transfers: List[str] = list() - def hpss_transfer( hpss: str, diff --git a/zstash/update.py b/zstash/update.py index a151a0bc..270f3536 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -45,7 +45,7 @@ def update(): transfer_manager=transfer_manager, ) - globus_finalize(transfer_manager, non_blocking=args.non_blocking) + globus_finalize(transfer_manager) # List failures if len(failures) > 0: @@ -290,11 +290,11 @@ def update_database( # noqa: C901 overwrite_duplicate_tars=args.overwrite_duplicate_tars, transfer_manager=transfer_manager, ) - except FileNotFoundError as e: + except FileNotFoundError: if 
args.follow_symlinks: raise Exception("Archive update failed due to broken symlink.") else: - raise e + raise # Close database con.commit() From 67e8a000d00f5beb9f2933a6832603ccc045bd15 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 5 Mar 2026 15:41:18 -0600 Subject: [PATCH 07/34] Minor performance improvements --- zstash/transfer_tracking.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index 47e1ee61..74bab2bb 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -30,8 +30,10 @@ def __init__(self): def delete_files(self): for src_path in self.file_paths: - if os.path.exists(src_path): + try: os.remove(src_path) + except FileNotFoundError: + logger.warning(f"File already deleted: {src_path}") class TransferManager: @@ -51,12 +53,10 @@ def delete_successfully_transferred_files(self): logger.info( f"{ts_utc()}: Checking for successfully transferred files to delete" ) + self.batches = [ + batch for batch in self.batches if batch.file_paths + ] # Clean up empty batches for batch in self.batches: - # Skip if already processed - if not batch.file_paths: - logger.debug(f"{ts_utc()}: batch was already processed, skipping") - continue - # Check if this is a Globus transfer that needs status update if batch.is_globus and batch.task_id and (batch.task_status != "SUCCEEDED"): logger.debug(f"{ts_utc()}: batch is globus AND is not yet successful") From e6f07e66d32480f70baede6cfebb47a4e5a02fb5 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 5 Mar 2026 16:48:30 -0600 Subject: [PATCH 08/34] Add type annotation --- zstash/globus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zstash/globus.py b/zstash/globus.py index da87369d..dd0027a5 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -5,6 +5,7 @@ from globus_sdk import TransferAPIError, TransferClient from globus_sdk.response import GlobusHTTPResponse +from 
globus_sdk.services.transfer.response.iterable import IterableTransferResponse from six.moves.urllib.parse import urlparse from .globus_utils import ( @@ -57,7 +58,7 @@ def globus_activate( return globus_config -def file_exists(archive_directory_listing, name: str) -> bool: +def file_exists(archive_directory_listing: IterableTransferResponse, name: str) -> bool: for entry in archive_directory_listing: if entry.get("name") == name: return True From d8cf91954ed683440846636b457ae2abe7f40547 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 5 Mar 2026 17:09:40 -0600 Subject: [PATCH 09/34] Fix typo --- .../bash_tests/run_from_any/test_globus_tar_deletion.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash index 8de806e6..8649dacb 100755 --- a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash +++ b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash @@ -368,7 +368,7 @@ echo "Progressive deletion tests: verify files are deleted as transfers complete echo "WARNING: These tests create ~2GB of data and will take several minutes" # Run progressive deletion tests -# Thes check that DURING the run, +# These check that DURING the run, # files are deleted after successful transfers (non-keep only). # Blocking -- get files, transfer files, delete at src, start next transfer. # Non-blocking -- get files, transfer files, get next set of files, transfer those files, check if previous transfer is done (and if so, delete at src). 
From cf05c5350fee3edef1b0b872a8f344ffd156f78f Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 6 Mar 2026 17:59:50 -0600 Subject: [PATCH 10/34] Refactor add_files --- zstash/create.py | 4 +- zstash/hpss_utils.py | 510 +++++++++++++++++++++++-------------------- zstash/update.py | 4 +- 3 files changed, 277 insertions(+), 241 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index 4671a2b9..74c7e583 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -12,7 +12,7 @@ from .globus import globus_activate, globus_finalize from .hpss import hpss_put -from .hpss_utils import add_files +from .hpss_utils import construct_tars from .settings import DEFAULT_CACHE, config, get_db_filename, logger from .transfer_tracking import TransferManager from .utils import ( @@ -276,7 +276,7 @@ def create_database( failures: List[str] try: # Add files to archive - failures = add_files( + failures = construct_tars( cur, con, -1, diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 573640c9..c45a0285 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -17,6 +17,193 @@ from .utils import create_tars_table, tars_table_exists, ts_utc +class TarWrapper(object): + def __init__(self, tar_num: int, cache: str, do_hash: bool, follow_symlinks: bool): + # Create a hex value at least 6 digits long + tname: str = "{0:0{1}x}".format(tar_num, 6) + # Create the tar file name by adding ".tar" + self.tfname: str = f"{tname}.tar" + logger.info(f"{ts_utc()}: Creating new tar archive {self.tfname}") + # Open that tar file in the cache + self.tarFileObject = HashIO(os.path.join(cache, self.tfname), "wb", do_hash) + # FIXME: error: Argument "fileobj" to "open" has incompatible type "HashIO"; expected "Optional[IO[bytes]]" + self.tar = tarfile.open(mode="w", fileobj=self.tarFileObject, dereference=follow_symlinks) # type: ignore + + def process_file( + self, current_file: str, archived: List[TupleFilesRowNoId], failures: List[str] + ) -> int: + logger.info(f"Archiving 
{current_file}") + tar_size: int = 0 + try: + offset: int + size: int + mtime: datetime + md5: Optional[str] + offset, size, mtime, md5 = add_file_to_tar_archive(self.tar, current_file) + t: TupleFilesRowNoId = ( + current_file, + size, + mtime, + md5, + self.tfname, + offset, + ) + archived.append(t) + # Increase tarsize by the size of the current file. + # Use `tell()` to also include the tar's metadata in the size. + tar_size = self.tarFileObject.tell() + except Exception: + # Catch all exceptions here. + traceback.print_exc() + logger.error(f"Archiving {current_file}") + failures.append(current_file) + return tar_size + + def process_tar( + self, + cache: str, + keep: bool, + non_blocking: bool, + transfer_manager: Optional[TransferManager], + skip_tars_md5: bool, + cur: sqlite3.Cursor, + con: sqlite3.Connection, + force_database_corruption: str, + error_on_duplicate_tar: bool, + overwrite_duplicate_tars: bool, + archived: List[TupleFilesRowNoId], + ): + logger.debug(f"{ts_utc()}: Closing tar archive {self.tfname}") + self.tar.close() + + tarsize = self.tarFileObject.tell() + tar_md5: Optional[str] = self.tarFileObject.md5() + self.tarFileObject.close() + logger.info(f"{ts_utc()}: (add_files): Completed archive file {self.tfname}") + + # Transfer tar to HPSS + if config.hpss is not None: + hpss: str = config.hpss + else: + raise TypeError("Invalid config.hpss={}".format(config.hpss)) + + logger.info(f"Contents of the cache prior to `hpss_put`: {os.listdir(cache)}") + + logger.info( + f"{ts_utc()}: DIVING: (add_files): Calling hpss_put to dispatch archive file {self.tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" + ) + hpss_put( + hpss, + os.path.join(cache, self.tfname), + cache, + keep, + non_blocking, + is_index=False, + transfer_manager=transfer_manager, + ) + logger.info( + f"{ts_utc()}: SURFACE (add_files): Called hpss_put to dispatch archive file {self.tfname}" + ) + + if not skip_tars_md5: + tar_tuple: TupleTarsRowNoId = (self.tfname, tarsize, 
tar_md5) + logger.info("tar name={}, tar size={}, tar md5={}".format(*tar_tuple)) + if not tars_table_exists(cur): + # Need to create tars table + create_tars_table(cur, con) + + # For developers only! For debugging purposes only! + if force_database_corruption == "simulate_row_existing": + # Tested by database_corruption.bash Cases 3, 5 + logger.info( + f"TESTING/DEBUGGING ONLY: Simulating row existing for {self.tfname}." + ) + cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) + elif force_database_corruption == "simulate_row_existing_bad_size": + # Tested by database_corruption.bash CaseS 4, 7 + logger.info( + f"TESTING/DEBUGGING ONLY: Simulating row existing with bad size for {self.tfname}." + ) + cur.execute( + "INSERT INTO tars VALUES (NULL,?,?,?)", + (self.tfname, tarsize + 1000, tar_md5), + ) + + # We're done adding files to the tar. + # And we've transferred it to HPSS. + # Now we can insert the tar into the database. + cur.execute("SELECT COUNT(*) FROM tars WHERE name = ?", (self.tfname,)) + tar_count: int = cur.fetchone()[0] + if tar_count != 0: + error_str: str = ( + f"Database corruption detected! {self.tfname} is already in the database." + ) + if error_on_duplicate_tar: + # Tested by database_corruption.bash Case 3 + # Exists - error out + logger.error(error_str) + raise RuntimeError(error_str) + elif overwrite_duplicate_tars: + # Tested by database_corruption.bash Case 4 + # Exists - update with new size and md5 + logger.warning(error_str) + logger.warning(f"Updating existing tar {self.tfname} to proceed.") + cur.execute( + "UPDATE tars SET size = ?, md5 = ? 
WHERE name = ?", + (tarsize, tar_md5, self.tfname), + ) + else: + # Tested by database_corruption.bash Cases 5,7 + # Proceed as if we're in the typical case -- insert new + logger.warning(error_str) + logger.warning(f"Adding a new entry for {self.tfname}.") + cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) + elif force_database_corruption == "simulate_no_correct_size": + # Tested by database_corruption.bash Case 6 + # For developers only! For debugging purposes only! + # Add this tar twice, with different sizes. + logger.info( + f"TESTING/DEBUGGING ONLY: Simulating no correct size for {self.tfname}." + ) + cur.execute( + "INSERT INTO tars VALUES (NULL,?,?,?)", + (self.tfname, tarsize + 1000, tar_md5), + ) + cur.execute( + "INSERT INTO tars VALUES (NULL,?,?,?)", + (self.tfname, tarsize + 2000, tar_md5), + ) + elif force_database_corruption == "simulate_bad_size_for_most_recent": + # Tested by database_corruption.bash Case 8 + # For developers only! For debugging purposes only! + # Add this tar twice, second time with bad size. + logger.info( + f"TESTING/DEBUGGING ONLY: Simulating bad size for most recent entry for {self.tfname}." 
+ ) + cur.execute( + "INSERT INTO tars VALUES (NULL,?,?,?)", + (self.tfname, tarsize, tar_md5), + ) + cur.execute( + "INSERT INTO tars VALUES (NULL,?,?,?)", + (self.tfname, tarsize + 2000, tar_md5), + ) + else: + # Tested by database_corruption.bash Cases 1,2 + # Typical case + # Doesn't exist - insert new + logger.info(f"Adding {self.tfname} to the database.") + cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) + + con.commit() + + # Update database with the individual files that have been archived + # Add a row to the "files" table, + # the last 6 columns matching the values of `archived` + cur.executemany("insert into files values (NULL,?,?,?,?,?,?)", archived) + con.commit() + + # Minimum output file object class HashIO(object): def __init__(self, name: str, mode: str, do_hash: bool): @@ -54,7 +241,42 @@ def close(self): self.closed = True -def add_files( +# Add file to tar archive while computing its hash +# Return file offset (in tar archive), size and md5 hash +def add_file_to_tar_archive( + tar: tarfile.TarFile, file_name: str +) -> Tuple[int, int, datetime, Optional[str]]: + offset = tar.offset + tarinfo = tar.gettarinfo(file_name) + + if tarinfo.islnk(): + tarinfo.size = os.path.getsize(file_name) + + md5 = None + + # For files/hardlinks + if tarinfo.isfile() or tarinfo.islnk(): + if tarinfo.size > 0: + # Non-empty files: stream with hash computation + hash_md5 = hashlib.md5() + with open(file_name, "rb") as f: + wrapper = HashingFileWrapper(f, hash_md5) + tar.addfile(tarinfo, wrapper) + md5 = hash_md5.hexdigest() + else: + # Empty files: just add to tar, compute hash of empty data + tar.addfile(tarinfo) + md5 = hashlib.md5(b"").hexdigest() # MD5 of empty bytes + else: + # Directories, symlinks, etc. 
+ tar.addfile(tarinfo) + + size = tarinfo.size + mtime = datetime.utcfromtimestamp(tarinfo.mtime) + return offset, size, mtime, md5 + + +def construct_tars( cur: sqlite3.Cursor, con: sqlite3.Connection, itar: int, @@ -70,211 +292,60 @@ def add_files( transfer_manager: Optional[TransferManager] = None, ) -> List[str]: - # Now, perform the actual archiving failures: List[str] = [] - create_new_tar: bool = True nfiles: int = len(files) - archived: List[TupleFilesRowNoId] - tarsize: int - tname: str - tfname: str - tarFileObject: HashIO - tar: tarfile.TarFile - for i in range(nfiles): - - # New tar in the local cache - if create_new_tar: - create_new_tar = False - archived = [] - tarsize = 0 - itar += 1 - # Create a hex value at least 6 digits long - tname = "{0:0{1}x}".format(itar, 6) - # Create the tar file name by adding ".tar" - tfname = "{}.tar".format(tname) - logger.info(f"{ts_utc()}: Creating new tar archive {tfname}") - # Open that tar file in the cache - do_hash: bool - if not skip_tars_md5: - # If we're not skipping tars, we want to calculate the hash of the tars. - do_hash = True - else: - do_hash = False - tarFileObject = HashIO(os.path.join(cache, tfname), "wb", do_hash) - # FIXME: error: Argument "fileobj" to "open" has incompatible type "HashIO"; expected "Optional[IO[bytes]]" - tar = tarfile.open(mode="w", fileobj=tarFileObject, dereference=follow_symlinks) # type: ignore - - # Add current file to tar archive - current_file: str = files[i] - logger.info("Archiving {}".format(current_file)) - try: - offset: int - size: int - mtime: datetime - md5: Optional[str] - offset, size, mtime, md5 = add_file(tar, current_file, follow_symlinks) - t: TupleFilesRowNoId = ( - current_file, - size, - mtime, - md5, - tfname, - offset, - ) - archived.append(t) - # Increase tarsize by the size of the current file. - # Use `tell()` to also include the tar's metadata in the size. - tarsize = tarFileObject.tell() - except Exception: - # Catch all exceptions here. 
- traceback.print_exc() - logger.error("Archiving {}".format(current_file)) - failures.append(current_file) - - # Close tar archive if current file is the last one or - # if adding one more would push us over the limit. - next_file_size: int = tar.gettarinfo(current_file).size - if config.maxsize is not None: - maxsize: int = config.maxsize - else: - raise TypeError("Invalid config.maxsize={}".format(config.maxsize)) - if i == nfiles - 1 or tarsize + next_file_size > maxsize: - - # Close current temporary file - logger.debug(f"{ts_utc()}: Closing tar archive {tfname}") - tar.close() - tarsize = tarFileObject.tell() - tar_md5: Optional[str] = tarFileObject.md5() - tarFileObject.close() - logger.info(f"{ts_utc()}: (add_files): Completed archive file {tfname}") - - # Transfer tar to HPSS - if config.hpss is not None: - hpss: str = config.hpss + if config.maxsize is not None: + max_size: int = config.maxsize + else: + raise TypeError(f"Invalid config.maxsize={config.maxsize}") + + i_file: int = 0 + while i_file < nfiles: + # Each iteration of this loop constructs one tar + + # `create` passes in itar=-1, so the first tar will be 000001.tar + # `update` passes in itar=max existing tar number, so the first tar will be max+1 + itar += 1 + tar_size: int = 0 + archived: List[TupleFilesRowNoId] = [] + + # Open a new tar + # Note: if we're not skipping tars, we want to calculate the hash of the tars. + tar_wrapper = TarWrapper( + tar_num=itar, + cache=cache, + do_hash=not skip_tars_md5, + follow_symlinks=follow_symlinks, + ) + + # Add files to the tar until we reach the max size + while i_file < nfiles: + current_file: str = files[i_file] + current_file_size: int = os.path.getsize(current_file) + # Check if adding this file would send us over the max size. 
+ if tar_size + current_file_size < max_size: + # Add current file to tar archive + tar_size += tar_wrapper.process_file(current_file, archived, failures) + i_file += 1 else: - raise TypeError("Invalid config.hpss={}".format(config.hpss)) - - logger.info( - f"Contents of the cache prior to `hpss_put`: {os.listdir(cache)}" - ) - - logger.info( - f"{ts_utc()}: DIVING: (add_files): Calling hpss_put to dispatch archive file {tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" - ) - hpss_put( - hpss, - os.path.join(cache, tfname), - cache, - keep, - non_blocking, - is_index=False, - transfer_manager=transfer_manager, - ) - logger.info( - f"{ts_utc()}: SURFACE (add_files): Called hpss_put to dispatch archive file {tfname}" - ) - - if not skip_tars_md5: - tar_tuple: TupleTarsRowNoId = (tfname, tarsize, tar_md5) - logger.info("tar name={}, tar size={}, tar md5={}".format(*tar_tuple)) - if not tars_table_exists(cur): - # Need to create tars table - create_tars_table(cur, con) - - # For developers only! For debugging purposes only! - if force_database_corruption == "simulate_row_existing": - # Tested by database_corruption.bash Cases 3, 5 - logger.info( - f"TESTING/DEBUGGING ONLY: Simulating row existing for {tfname}." - ) - cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) - elif force_database_corruption == "simulate_row_existing_bad_size": - # Tested by database_corruption.bash CaseS 4, 7 - logger.info( - f"TESTING/DEBUGGING ONLY: Simulating row existing with bad size for {tfname}." - ) - cur.execute( - "INSERT INTO tars VALUES (NULL,?,?,?)", - (tfname, tarsize + 1000, tar_md5), - ) - - # We're done adding files to the tar. - # And we've transferred it to HPSS. - # Now we can insert the tar into the database. - cur.execute("SELECT COUNT(*) FROM tars WHERE name = ?", (tfname,)) - tar_count: int = cur.fetchone()[0] - if tar_count != 0: - error_str: str = ( - f"Database corruption detected! {tfname} is already in the database." 
- ) - if error_on_duplicate_tar: - # Tested by database_corruption.bash Case 3 - # Exists - error out - logger.error(error_str) - raise RuntimeError(error_str) - elif overwrite_duplicate_tars: - # Tested by database_corruption.bash Case 4 - # Exists - update with new size and md5 - logger.warning(error_str) - logger.warning(f"Updating existing tar {tfname} to proceed.") - cur.execute( - "UPDATE tars SET size = ?, md5 = ? WHERE name = ?", - (tarsize, tar_md5, tfname), - ) - else: - # Tested by database_corruption.bash Cases 5,7 - # Proceed as if we're in the typical case -- insert new - logger.warning(error_str) - logger.warning(f"Adding a new entry for {tfname}.") - cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) - elif force_database_corruption == "simulate_no_correct_size": - # Tested by database_corruption.bash Case 6 - # For developers only! For debugging purposes only! - # Add this tar twice, with different sizes. - logger.info( - f"TESTING/DEBUGGING ONLY: Simulating no correct size for {tfname}." - ) - cur.execute( - "INSERT INTO tars VALUES (NULL,?,?,?)", - (tfname, tarsize + 1000, tar_md5), - ) - cur.execute( - "INSERT INTO tars VALUES (NULL,?,?,?)", - (tfname, tarsize + 2000, tar_md5), - ) - elif force_database_corruption == "simulate_bad_size_for_most_recent": - # Tested by database_corruption.bash Case 8 - # For developers only! For debugging purposes only! - # Add this tar twice, second time with bad size. - logger.info( - f"TESTING/DEBUGGING ONLY: Simulating bad size for most recent entry for {tfname}." 
- ) - cur.execute( - "INSERT INTO tars VALUES (NULL,?,?,?)", - (tfname, tarsize, tar_md5), - ) - cur.execute( - "INSERT INTO tars VALUES (NULL,?,?,?)", - (tfname, tarsize + 2000, tar_md5), - ) - else: - # Tested by database_corruption.bash Cases 1,2 - # Typical case - # Doesn't exist - insert new - logger.info(f"Adding {tfname} to the database.") - cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) - - con.commit() - - # Update database with the individual files that have been archived - # Add a row to the "files" table, - # the last 6 columns matching the values of `archived` - cur.executemany("insert into files values (NULL,?,?,?,?,?,?)", archived) - con.commit() - - # Open new tar next time - create_new_tar = True + # Time to close and transfer this tar archive. + break + + # Close and transfer this tar archive, and update the database with the archived files. + tar_wrapper.process_tar( + cache, + keep, + non_blocking, + transfer_manager, + skip_tars_md5, + cur, + con, + force_database_corruption, + error_on_duplicate_tar, + overwrite_duplicate_tars, + archived, + ) return failures @@ -290,38 +361,3 @@ def read(self, size=-1): if data: self.hasher.update(data) return data - - -# Add file to tar archive while computing its hash -# Return file offset (in tar archive), size and md5 hash -def add_file( - tar: tarfile.TarFile, file_name: str, follow_symlinks: bool -) -> Tuple[int, int, datetime, Optional[str]]: - offset = tar.offset - tarinfo = tar.gettarinfo(file_name) - - if tarinfo.islnk(): - tarinfo.size = os.path.getsize(file_name) - - md5 = None - - # For files/hardlinks - if tarinfo.isfile() or tarinfo.islnk(): - if tarinfo.size > 0: - # Non-empty files: stream with hash computation - hash_md5 = hashlib.md5() - with open(file_name, "rb") as f: - wrapper = HashingFileWrapper(f, hash_md5) - tar.addfile(tarinfo, wrapper) - md5 = hash_md5.hexdigest() - else: - # Empty files: just add to tar, compute hash of empty data - tar.addfile(tarinfo) - md5 = 
hashlib.md5(b"").hexdigest() # MD5 of empty bytes - else: - # Directories, symlinks, etc. - tar.addfile(tarinfo) - - size = tarinfo.size - mtime = datetime.utcfromtimestamp(tarinfo.mtime) - return offset, size, mtime, md5 diff --git a/zstash/update.py b/zstash/update.py index 270f3536..38516a4d 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -10,7 +10,7 @@ from .globus import globus_activate, globus_finalize from .hpss import hpss_get, hpss_put -from .hpss_utils import add_files +from .hpss_utils import construct_tars from .settings import DEFAULT_CACHE, TIME_TOL, config, get_db_filename, logger from .transfer_tracking import TransferManager from .utils import get_files_to_archive_with_stats, update_config @@ -277,7 +277,7 @@ def update_database( # noqa: C901 itar = max(itar, int(tfile_string[0:6], 16)) try: # Add files - failures = add_files( + failures = construct_tars( cur, con, itar, From 42a8ea28b930c7cb56d9e2940bd819c347a1a00c Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 6 Mar 2026 18:52:06 -0600 Subject: [PATCH 11/34] Refactor process_tar --- zstash/create.py | 11 +++-- zstash/hpss_utils.py | 106 +++++++++++++++++++++++++++---------------- zstash/update.py | 10 ++-- 3 files changed, 81 insertions(+), 46 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index 74c7e583..5916d82a 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -12,7 +12,7 @@ from .globus import globus_activate, globus_finalize from .hpss import hpss_put -from .hpss_utils import construct_tars +from .hpss_utils import DevOptions, construct_tars from .settings import DEFAULT_CACHE, config, get_db_filename, logger from .transfer_tracking import TransferManager from .utils import ( @@ -274,6 +274,11 @@ def create_database( files: List[str] = get_files_to_archive(cache, args.include, args.exclude) failures: List[str] + dev_options: DevOptions = DevOptions( + error_on_duplicate_tar=args.error_on_duplicate_tar, + 
overwrite_duplicate_tars=args.overwrite_duplicate_tars, + force_database_corruption=args.for_developers_force_database_corruption, + ) try: # Add files to archive failures = construct_tars( @@ -284,11 +289,9 @@ def create_database( cache, args.keep, args.follow_symlinks, + dev_options, skip_tars_md5=args.no_tars_md5, non_blocking=args.non_blocking, - error_on_duplicate_tar=args.error_on_duplicate_tar, - overwrite_duplicate_tars=args.overwrite_duplicate_tars, - force_database_corruption=args.for_developers_force_database_corruption, transfer_manager=transfer_manager, ) except FileNotFoundError: diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index c45a0285..95a39913 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -17,6 +17,44 @@ from .utils import create_tars_table, tars_table_exists, ts_utc +# This class holds parameters for developer options. +# I.e., these parameters should only ever be activated by developers during debugging and/or testing. +class DevOptions(object): + def __init__( + self, + error_on_duplicate_tar: bool, + overwrite_duplicate_tars: bool, + force_database_corruption: str, + ): + self.error_on_duplicate_tar: bool = error_on_duplicate_tar + self.overwrite_duplicate_tars: bool = overwrite_duplicate_tars + self.force_database_corruption: str = force_database_corruption + + def simulate_row_existing( + self, + tfname: str, + cur: sqlite3.Cursor, + tar_tuple: TupleTarsRowNoId, + tar_size: int, + tar_md5: Optional[str], + ): + if self.force_database_corruption == "simulate_row_existing": + # Tested by database_corruption.bash Cases 3, 5 + logger.info( + f"TESTING/DEBUGGING ONLY: Simulating row existing for {tfname}." + ) + cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) + elif self.force_database_corruption == "simulate_row_existing_bad_size": + # Tested by database_corruption.bash Cases 4, 7 + logger.info( + f"TESTING/DEBUGGING ONLY: Simulating row existing with bad size for {tfname}." 
+ ) + cur.execute( + "INSERT INTO tars VALUES (NULL,?,?,?)", + (tfname, tar_size + 1000, tar_md5), + ) + + class TarWrapper(object): def __init__(self, tar_num: int, cache: str, do_hash: bool, follow_symlinks: bool): # Create a hex value at least 6 digits long @@ -49,7 +87,7 @@ def process_file( offset, ) archived.append(t) - # Increase tarsize by the size of the current file. + # Increase tar_size by the size of the current file. # Use `tell()` to also include the tar's metadata in the size. tar_size = self.tarFileObject.tell() except Exception: @@ -68,20 +106,19 @@ def process_tar( skip_tars_md5: bool, cur: sqlite3.Cursor, con: sqlite3.Connection, - force_database_corruption: str, - error_on_duplicate_tar: bool, - overwrite_duplicate_tars: bool, + dev_options: DevOptions, archived: List[TupleFilesRowNoId], ): + # 1. Close the tar #################################################### logger.debug(f"{ts_utc()}: Closing tar archive {self.tfname}") self.tar.close() - tarsize = self.tarFileObject.tell() + tar_size = self.tarFileObject.tell() tar_md5: Optional[str] = self.tarFileObject.md5() self.tarFileObject.close() logger.info(f"{ts_utc()}: (add_files): Completed archive file {self.tfname}") - # Transfer tar to HPSS + # 2. Transfer the tar to HPSS ######################################### if config.hpss is not None: hpss: str = config.hpss else: @@ -92,6 +129,7 @@ def process_tar( logger.info( f"{ts_utc()}: DIVING: (add_files): Calling hpss_put to dispatch archive file {self.tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" ) + # Actually transfer the tar file hpss_put( hpss, os.path.join(cache, self.tfname), @@ -105,29 +143,18 @@ def process_tar( f"{ts_utc()}: SURFACE (add_files): Called hpss_put to dispatch archive file {self.tfname}" ) + # 3. 
Add the tar itself to the tars table ############################# if not skip_tars_md5: - tar_tuple: TupleTarsRowNoId = (self.tfname, tarsize, tar_md5) + tar_tuple: TupleTarsRowNoId = (self.tfname, tar_size, tar_md5) logger.info("tar name={}, tar size={}, tar md5={}".format(*tar_tuple)) if not tars_table_exists(cur): # Need to create tars table create_tars_table(cur, con) - # For developers only! For debugging purposes only! - if force_database_corruption == "simulate_row_existing": - # Tested by database_corruption.bash Cases 3, 5 - logger.info( - f"TESTING/DEBUGGING ONLY: Simulating row existing for {self.tfname}." - ) - cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) - elif force_database_corruption == "simulate_row_existing_bad_size": - # Tested by database_corruption.bash CaseS 4, 7 - logger.info( - f"TESTING/DEBUGGING ONLY: Simulating row existing with bad size for {self.tfname}." - ) - cur.execute( - "INSERT INTO tars VALUES (NULL,?,?,?)", - (self.tfname, tarsize + 1000, tar_md5), - ) + # For developers only! For debugging/testing purposes only! + dev_options.simulate_row_existing( + self.tfname, cur, tar_tuple, tar_size, tar_md5 + ) # We're done adding files to the tar. # And we've transferred it to HPSS. @@ -138,19 +165,19 @@ def process_tar( error_str: str = ( f"Database corruption detected! {self.tfname} is already in the database." ) - if error_on_duplicate_tar: + if dev_options.error_on_duplicate_tar: # Tested by database_corruption.bash Case 3 # Exists - error out logger.error(error_str) raise RuntimeError(error_str) - elif overwrite_duplicate_tars: + elif dev_options.overwrite_duplicate_tars: # Tested by database_corruption.bash Case 4 # Exists - update with new size and md5 logger.warning(error_str) logger.warning(f"Updating existing tar {self.tfname} to proceed.") cur.execute( "UPDATE tars SET size = ?, md5 = ? 
WHERE name = ?", - (tarsize, tar_md5, self.tfname), + (tar_size, tar_md5, self.tfname), ) else: # Tested by database_corruption.bash Cases 5,7 @@ -158,7 +185,7 @@ def process_tar( logger.warning(error_str) logger.warning(f"Adding a new entry for {self.tfname}.") cur.execute("INSERT INTO tars VALUES (NULL,?,?,?)", tar_tuple) - elif force_database_corruption == "simulate_no_correct_size": + elif dev_options.force_database_corruption == "simulate_no_correct_size": # Tested by database_corruption.bash Case 6 # For developers only! For debugging purposes only! # Add this tar twice, with different sizes. @@ -167,13 +194,16 @@ def process_tar( ) cur.execute( "INSERT INTO tars VALUES (NULL,?,?,?)", - (self.tfname, tarsize + 1000, tar_md5), + (self.tfname, tar_size + 1000, tar_md5), ) cur.execute( "INSERT INTO tars VALUES (NULL,?,?,?)", - (self.tfname, tarsize + 2000, tar_md5), + (self.tfname, tar_size + 2000, tar_md5), ) - elif force_database_corruption == "simulate_bad_size_for_most_recent": + elif ( + dev_options.force_database_corruption + == "simulate_bad_size_for_most_recent" + ): # Tested by database_corruption.bash Case 8 # For developers only! For debugging purposes only! # Add this tar twice, second time with bad size. @@ -182,11 +212,11 @@ def process_tar( ) cur.execute( "INSERT INTO tars VALUES (NULL,?,?,?)", - (self.tfname, tarsize, tar_md5), + (self.tfname, tar_size, tar_md5), ) cur.execute( "INSERT INTO tars VALUES (NULL,?,?,?)", - (self.tfname, tarsize + 2000, tar_md5), + (self.tfname, tar_size + 2000, tar_md5), ) else: # Tested by database_corruption.bash Cases 1,2 @@ -197,6 +227,7 @@ def process_tar( con.commit() + # 4. 
Add the files included in this tar to the files table ############ # Update database with the individual files that have been archived # Add a row to the "files" table, # the last 6 columns matching the values of `archived` @@ -252,7 +283,7 @@ def add_file_to_tar_archive( if tarinfo.islnk(): tarinfo.size = os.path.getsize(file_name) - md5 = None + md5: Optional[str] = None # For files/hardlinks if tarinfo.isfile() or tarinfo.islnk(): @@ -269,6 +300,7 @@ def add_file_to_tar_archive( md5 = hashlib.md5(b"").hexdigest() # MD5 of empty bytes else: # Directories, symlinks, etc. + # md5 will be None in these cases. tar.addfile(tarinfo) size = tarinfo.size @@ -284,11 +316,9 @@ def construct_tars( cache: str, keep: bool, follow_symlinks: bool, + dev_options: DevOptions, skip_tars_md5: bool = False, non_blocking: bool = False, - error_on_duplicate_tar: bool = False, - overwrite_duplicate_tars: bool = False, - force_database_corruption: str = "", transfer_manager: Optional[TransferManager] = None, ) -> List[str]: @@ -341,9 +371,7 @@ def construct_tars( skip_tars_md5, cur, con, - force_database_corruption, - error_on_duplicate_tar, - overwrite_duplicate_tars, + dev_options, archived, ) diff --git a/zstash/update.py b/zstash/update.py index 38516a4d..ae96696a 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -10,7 +10,7 @@ from .globus import globus_activate, globus_finalize from .hpss import hpss_get, hpss_put -from .hpss_utils import construct_tars +from .hpss_utils import DevOptions, construct_tars from .settings import DEFAULT_CACHE, TIME_TOL, config, get_db_filename, logger from .transfer_tracking import TransferManager from .utils import get_files_to_archive_with_stats, update_config @@ -275,6 +275,11 @@ def update_database( # noqa: C901 for tfile in tfiles: tfile_string: str = tfile[0] itar = max(itar, int(tfile_string[0:6], 16)) + dev_options: DevOptions = DevOptions( + error_on_duplicate_tar=args.error_on_duplicate_tar, + 
overwrite_duplicate_tars=args.overwrite_duplicate_tars, + force_database_corruption="", + ) try: # Add files failures = construct_tars( @@ -285,9 +290,8 @@ def update_database( # noqa: C901 cache, keep, args.follow_symlinks, + dev_options, non_blocking=args.non_blocking, - error_on_duplicate_tar=args.error_on_duplicate_tar, - overwrite_duplicate_tars=args.overwrite_duplicate_tars, transfer_manager=transfer_manager, ) except FileNotFoundError: From f6fcb3f6b9be8f89ee29040c535f14c0a9df516e Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 6 Mar 2026 19:00:10 -0600 Subject: [PATCH 12/34] Better structure file size logic --- zstash/hpss_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 95a39913..072fdc79 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -352,15 +352,16 @@ def construct_tars( # Add files to the tar until we reach the max size while i_file < nfiles: current_file: str = files[i_file] + # TODO: check if the extra stat call (via getsize) impacts performance current_file_size: int = os.path.getsize(current_file) # Check if adding this file would send us over the max size. - if tar_size + current_file_size < max_size: - # Add current file to tar archive + if tar_size + current_file_size > max_size: + # If so, time to close and transfer this tar archive. + break + else: + # If not, add current file to tar archive. tar_size += tar_wrapper.process_file(current_file, archived, failures) i_file += 1 - else: - # Time to close and transfer this tar archive. - break # Close and transfer this tar archive, and update the database with the archived files. 
tar_wrapper.process_tar( From 0c3cd7c73ca9db061bca0661d4a94120e1778a73 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 6 Mar 2026 20:28:36 -0600 Subject: [PATCH 13/34] Clarify and improve Globus logic --- zstash/globus.py | 55 ++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index dd0027a5..bf1f03ef 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -120,14 +120,13 @@ def globus_transfer( # noqa: C901 task = transfer_manager.globus_config.transfer_client.get_task(mrt.task_id) mrt.task_status = task["status"] # one of {ACTIVE, SUCCEEDED, FAILED, CANCELED, PENDING, INACTIVE} - # NOTE: How we behave here depends upon whether we want to support mutliple active transfers. - # Presently, we do not, except inadvertantly (if status == PENDING) if mrt.task_status == "ACTIVE": logger.info( f"{ts_utc()}: Previous task_id {mrt.task_id} Still Active. Returning ACTIVE." ) - # Don't return early - continue to submit the new transfer - # The previous transfer will complete asynchronously + # Don't return early -- continue to submit the new transfer. + # The previous transfer will complete asynchronously. + # That is, we will permit multiple active transfers. elif mrt.task_status == "SUCCEEDED": logger.info( f"{ts_utc()}: Previous task_id {mrt.task_id} status = SUCCEEDED." @@ -155,8 +154,9 @@ def globus_transfer( # noqa: C901 flush=True, ) - # Submit the current transfer_data logger.info(f"{ts_utc()}: DIVING: Submit Transfer for {transfer_data['label']}") + # Submit the current transfer_data + # ALWAYS submit. If we've gotten to this point, we're ready to submit. 
task = submit_transfer_with_checks( transfer_manager.globus_config.transfer_client, transfer_data ) @@ -196,11 +196,11 @@ def globus_transfer( # noqa: C901 task_status = "UNKNOWN" if new_mrt and new_mrt.task_id: if not non_blocking: + # If blocking, wait for the task to complete and get the final status, + # before we proceed with any more transfers. new_mrt.task_status = globus_block_wait( transfer_manager.globus_config.transfer_client, task_id=new_mrt.task_id, - wait_timeout=7200, - max_retries=5, ) task_status = new_mrt.task_status else: @@ -220,33 +220,48 @@ def globus_transfer( # noqa: C901 def globus_block_wait( transfer_client: TransferClient, task_id: str, - wait_timeout: int, - max_retries: int, + wait_timeout: int = 7200, # 7200/3600 = 2 hours + max_retries: int = 5, ): - - # poll every "polling_interval" seconds to speed up small transfers. Report every 2 hours, stop waiting aftert 5*2 = 10 hours + # Poll every "polling_interval" seconds to speed up small transfers. + # Report every "wait_timeout" seconds, and stop waiting after "max_retries" reports. + # By default: report every 2 hours, stop waiting after 5*2 = 10 hours logger.info( f"{ts_utc()}: BLOCKING START: invoking task_wait for task_id = {task_id}" ) - task_status = "UNKNOWN" - retry_count = 0 + task_status: str = "UNKNOWN" + retry_count: int = 0 while retry_count < max_retries: try: - # Wait for the task to complete logger.info( f"{ts_utc()}: on task_wait try {retry_count + 1} out of {max_retries}" ) - transfer_client.task_wait( + # Wait for the task to complete. This is what makes this function BLOCKING. + # From https://globus-sdk-python.readthedocs.io/en/stable/services/transfer.html#globus_sdk.TransferClient.task_wait: Wait until a Task is complete or fails, with a time limit. If the task is “ACTIVE” after time runs out, returns False. Otherwise returns True. 
+ task_is_not_active: bool = transfer_client.task_wait( task_id, timeout=wait_timeout, polling_interval=10 ) + if task_is_not_active: + curr_task = transfer_client.get_task(task_id) + task_status = curr_task["status"] + if task_status == "SUCCEEDED": + break # Break out of the while-loop. The transfer already succeeded, so no need to retry. + elif task_status in ["FAILED", "CANCELED"]: + error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. No reason to keep retrying now." + logger.warning(error_str) + # We still need to break, because no matter how long we wait now, nothing will change with the transfer status. + break + elif task_status in ["PENDING", "INACTIVE"]: + error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. Will retry waiting until max_retries is reached." + logger.warning(error_str) + # Don't break -- continue retries + else: + error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} is unexpected for task_id {task_id}. Will retry waiting until max_retries is reached." 
+ logger.warning(error_str) + # Don't break -- continue retries logger.info(f"{ts_utc()}: done with wait") except Exception as e: logger.error(f"Unexpected Exception: {e}") - else: - curr_task = transfer_client.get_task(task_id) - task_status = curr_task["status"] - if task_status == "SUCCEEDED": - break finally: retry_count += 1 logger.info( From 5f32decd5715236bca7875323a1a7ffc9143dfc6 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 9 Mar 2026 13:51:49 -0700 Subject: [PATCH 14/34] Fix Python tests --- zstash/hpss_utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 072fdc79..096d989f 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -352,16 +352,25 @@ def construct_tars( # Add files to the tar until we reach the max size while i_file < nfiles: current_file: str = files[i_file] - # TODO: check if the extra stat call (via getsize) impacts performance - current_file_size: int = os.path.getsize(current_file) + # It's ok that current_file isn't in the tar yet. + tarinfo = tar_wrapper.tar.gettarinfo(current_file) + current_file_size = tarinfo.size + # Check if adding this file would send us over the max size. - if tar_size + current_file_size > max_size: - # If so, time to close and transfer this tar archive. - break - else: - # If not, add current file to tar archive. - tar_size += tar_wrapper.process_file(current_file, archived, failures) + if (tar_size == 0) or (tar_size + current_file_size <= max_size): + # Case 1: if the tar is empty, always add the file, even if it's over the max size. + # Case 2: if the tar is nonempty, only add the file if it won't put us over the max size. + new_tar_size = tar_wrapper.process_file( + current_file, archived, failures + ) + if new_tar_size != 0: + tar_size = new_tar_size + # Else: process_file failed, so we should keep the original tar_size i_file += 1 + else: + # Over the size limit. 
+ # Time to close and transfer this tar archive. + break # Close and transfer this tar archive, and update the database with the archived files. tar_wrapper.process_tar( From f03160cac59561fda69c65180812825b9e1a7243 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 9 Mar 2026 16:46:05 -0700 Subject: [PATCH 15/34] Perlmutter tests passing --- zstash/globus.py | 21 ++++++++++++--- zstash/hpss.py | 5 +++- zstash/hpss_utils.py | 63 +++++++++++++++++++++++++++++++------------- 3 files changed, 66 insertions(+), 23 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index bf1f03ef..c9acffba 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -124,9 +124,22 @@ def globus_transfer( # noqa: C901 logger.info( f"{ts_utc()}: Previous task_id {mrt.task_id} Still Active. Returning ACTIVE." ) - # Don't return early -- continue to submit the new transfer. - # The previous transfer will complete asynchronously. - # That is, we will permit multiple active transfers. + # There is currently an active Globus transfer. + # Globus allows 3 simultaneous transfers, but right now, we only do one at a time. + # So, return for now, skipping the rest of `globus_transfer`. + # What happens next? + # We return to constructing tars. + # We won't need to create a new batch because the current batch exists. + # Once the transfer status is finally SUCCEEDED, + # we'll submit the current batch for Globus transfer. + if non_blocking: + return "ACTIVE" + else: + error_str: str = ( + "task_status='ACTIVE', but in blocking mode, the previous transfer should have waited through globus_block_wait" + ) + logger.error(error_str) + raise RuntimeError(error_str) elif mrt.task_status == "SUCCEEDED": logger.info( f"{ts_utc()}: Previous task_id {mrt.task_id} status = SUCCEEDED." @@ -138,6 +151,8 @@ def globus_transfer( # noqa: C901 logger.info( f"{ts}:Globus transfer {mrt.task_id}, from {src_ep} to {dst_ep}: {label} succeeded" ) + # The previous transfer succeeded. 
+ # That means we can transfer the current batch now. else: logger.error( f"{ts_utc()}: Previous task_id {mrt.task_id} status = {mrt.task_status}." diff --git a/zstash/hpss.py b/zstash/hpss.py index 3806ed08..00b2c8f0 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -113,9 +113,12 @@ def hpss_transfer( transfer_manager.globus_config = GlobusConfig() # Transfer file using the Globus Transfer Service logger.info(f"{ts_utc()}: DIVING: hpss calls globus_transfer(name={name})") - globus_transfer( + task_status: str = globus_transfer( transfer_manager, endpoint, url_path, name, transfer_type, non_blocking ) + logger.info( + f"{ts_utc()}: globus_transfer(name={name}) returned task_status={task_status}" + ) mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() if mrt and mrt.task_status: globus_status = mrt.task_status diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 096d989f..0967d87e 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -68,7 +68,11 @@ def __init__(self, tar_num: int, cache: str, do_hash: bool, follow_symlinks: boo self.tar = tarfile.open(mode="w", fileobj=self.tarFileObject, dereference=follow_symlinks) # type: ignore def process_file( - self, current_file: str, archived: List[TupleFilesRowNoId], failures: List[str] + self, + current_file: str, + tar_info: tarfile.TarInfo, + archived: List[TupleFilesRowNoId], + failures: List[str], ) -> int: logger.info(f"Archiving {current_file}") tar_size: int = 0 @@ -77,7 +81,9 @@ def process_file( size: int mtime: datetime md5: Optional[str] - offset, size, mtime, md5 = add_file_to_tar_archive(self.tar, current_file) + offset, size, mtime, md5 = add_file_to_tar_archive( + self.tar, current_file, tar_info + ) t: TupleFilesRowNoId = ( current_file, size, @@ -275,36 +281,32 @@ def close(self): # Add file to tar archive while computing its hash # Return file offset (in tar archive), size and md5 hash def add_file_to_tar_archive( - tar: tarfile.TarFile, file_name: str + 
tar: tarfile.TarFile, file_name: str, tar_info: tarfile.TarInfo ) -> Tuple[int, int, datetime, Optional[str]]: offset = tar.offset - tarinfo = tar.gettarinfo(file_name) - - if tarinfo.islnk(): - tarinfo.size = os.path.getsize(file_name) md5: Optional[str] = None # For files/hardlinks - if tarinfo.isfile() or tarinfo.islnk(): - if tarinfo.size > 0: + if tar_info.isfile() or tar_info.islnk(): + if tar_info.size > 0: # Non-empty files: stream with hash computation hash_md5 = hashlib.md5() with open(file_name, "rb") as f: wrapper = HashingFileWrapper(f, hash_md5) - tar.addfile(tarinfo, wrapper) + tar.addfile(tar_info, wrapper) md5 = hash_md5.hexdigest() else: # Empty files: just add to tar, compute hash of empty data - tar.addfile(tarinfo) + tar.addfile(tar_info) md5 = hashlib.md5(b"").hexdigest() # MD5 of empty bytes else: # Directories, symlinks, etc. # md5 will be None in these cases. - tar.addfile(tarinfo) + tar.addfile(tar_info) - size = tarinfo.size - mtime = datetime.utcfromtimestamp(tarinfo.mtime) + size = tar_info.size + mtime = datetime.utcfromtimestamp(tar_info.mtime) return offset, size, mtime, md5 @@ -331,6 +333,7 @@ def construct_tars( raise TypeError(f"Invalid config.maxsize={config.maxsize}") i_file: int = 0 + carried_over_tar_info: Optional[tarfile.TarInfo] = None while i_file < nfiles: # Each iteration of this loop constructs one tar @@ -352,24 +355,46 @@ def construct_tars( # Add files to the tar until we reach the max size while i_file < nfiles: current_file: str = files[i_file] - # It's ok that current_file isn't in the tar yet. - tarinfo = tar_wrapper.tar.gettarinfo(current_file) - current_file_size = tarinfo.size + if carried_over_tar_info: + # If current_file wasn't added to the last tar, + # then we can immediately add it now. + # No need to repeat the size calculations. + tar_info = carried_over_tar_info + current_file_size = tar_info.size + else: + try: + # It's ok that current_file isn't in the tar yet. 
+ tar_info = tar_wrapper.tar.gettarinfo(current_file) + if tar_info.islnk(): + tar_info.size = os.path.getsize(current_file) + current_file_size = tar_info.size + except Exception: + # We couldn't even get the size of this file. + # So let's continue to the next file. + # (Likely cause: broken symlink) + traceback.print_exc() + logger.error(f"Archiving {current_file}") + # failures.append(current_file) # Mark this file as an error + # i_file += 1 # Go to the next file + raise # Check if adding this file would send us over the max size. if (tar_size == 0) or (tar_size + current_file_size <= max_size): # Case 1: if the tar is empty, always add the file, even if it's over the max size. # Case 2: if the tar is nonempty, only add the file if it won't put us over the max size. new_tar_size = tar_wrapper.process_file( - current_file, archived, failures + current_file, tar_info, archived, failures ) if new_tar_size != 0: tar_size = new_tar_size # Else: process_file failed, so we should keep the original tar_size i_file += 1 else: - # Over the size limit. + # Over the size limit: # Time to close and transfer this tar archive. + carried_over_tar_info = tar_info # Carry over this info to the next tar + # Break out of the inner while-loop: + # Done adding files to this particular tar. break # Close and transfer this tar archive, and update the database with the archived files. 
From 221a1b6ee52fe53e395b493cd42ab75e2ae60e3d Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 11:15:21 -0500 Subject: [PATCH 16/34] Address simple code review comments --- zstash/create.py | 34 ++++++++++++++-------------------- zstash/hpss_utils.py | 24 +++++++++++++++--------- zstash/update.py | 32 +++++++++++++------------------- 3 files changed, 42 insertions(+), 48 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index 5916d82a..7cd76231 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -279,26 +279,20 @@ def create_database( overwrite_duplicate_tars=args.overwrite_duplicate_tars, force_database_corruption=args.for_developers_force_database_corruption, ) - try: - # Add files to archive - failures = construct_tars( - cur, - con, - -1, - files, - cache, - args.keep, - args.follow_symlinks, - dev_options, - skip_tars_md5=args.no_tars_md5, - non_blocking=args.non_blocking, - transfer_manager=transfer_manager, - ) - except FileNotFoundError: - if args.follow_symlinks: - raise Exception("Archive creation failed due to broken symlink.") - else: - raise + # Add files to archive + failures = construct_tars( + cur, + con, + -1, + files, + cache, + args.keep, + args.follow_symlinks, + dev_options, + skip_tars_md5=args.no_tars_md5, + non_blocking=args.non_blocking, + transfer_manager=transfer_manager, + ) # Close database con.commit() diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 0967d87e..0cd8ba81 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -332,12 +332,18 @@ def construct_tars( else: raise TypeError(f"Invalid config.maxsize={config.maxsize}") + operation: str + if itar == -1: + operation = "create" + else: + operation = "update" + i_file: int = 0 carried_over_tar_info: Optional[tarfile.TarInfo] = None while i_file < nfiles: # Each iteration of this loop constructs one tar - # `create` passes in itar=-1, so the first tar will be 000001.tar + # `create` passes in itar=-1, so the first tar will be 
000000.tar # `update` passes in itar=max existing tar number, so the first tar will be max+1 itar += 1 tar_size: int = 0 @@ -361,6 +367,7 @@ def construct_tars( # No need to repeat the size calculations. tar_info = carried_over_tar_info current_file_size = tar_info.size + carried_over_tar_info = None # Reset for next iteration else: try: # It's ok that current_file isn't in the tar yet. @@ -368,15 +375,14 @@ def construct_tars( if tar_info.islnk(): tar_info.size = os.path.getsize(current_file) current_file_size = tar_info.size - except Exception: - # We couldn't even get the size of this file. - # So let's continue to the next file. - # (Likely cause: broken symlink) - traceback.print_exc() + except FileNotFoundError: logger.error(f"Archiving {current_file}") - # failures.append(current_file) # Mark this file as an error - # i_file += 1 # Go to the next file - raise + if follow_symlinks: + raise Exception( + f"Archive {operation} failed due to broken symlink." + ) + else: + raise # Check if adding this file would send us over the max size. 
if (tar_size == 0) or (tar_size + current_file_size <= max_size): diff --git a/zstash/update.py b/zstash/update.py index ae96696a..5c1db6f5 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -280,25 +280,19 @@ def update_database( # noqa: C901 overwrite_duplicate_tars=args.overwrite_duplicate_tars, force_database_corruption="", ) - try: - # Add files - failures = construct_tars( - cur, - con, - itar, - newfiles, - cache, - keep, - args.follow_symlinks, - dev_options, - non_blocking=args.non_blocking, - transfer_manager=transfer_manager, - ) - except FileNotFoundError: - if args.follow_symlinks: - raise Exception("Archive update failed due to broken symlink.") - else: - raise + # Add files + failures = construct_tars( + cur, + con, + itar, + newfiles, + cache, + keep, + args.follow_symlinks, + dev_options, + non_blocking=args.non_blocking, + transfer_manager=transfer_manager, + ) # Close database con.commit() From f7e02750f0bc19d0aad4a28883497a31d148fc62 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 11:44:50 -0500 Subject: [PATCH 17/34] Address globus_finalize comment --- zstash/globus.py | 235 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 172 insertions(+), 63 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index c9acffba..4c1f10b0 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, print_function import sys -from typing import List, Optional +from typing import Dict, List, Optional, Set, Tuple from globus_sdk import TransferAPIError, TransferClient from globus_sdk.response import GlobusHTTPResponse @@ -338,73 +338,182 @@ def globus_wait(transfer_client: TransferClient, task_id: str): sys.exit(1) -def globus_finalize(transfer_manager: TransferManager): +def _submit_pending_transfer_data( + transfer_client: TransferClient, + transfer_manager: TransferManager, +) -> Optional[str]: + """ + If the most recent batch has unsubmitted TransferData, submit it 
and return task_id. + Otherwise return None. + """ + transfer: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() + if not transfer or not transfer.transfer_data: + return None + + logger.info(f"{ts_utc()}: FINAL TransferData: accumulated items:") + attribs = transfer.transfer_data.__dict__ + for item in attribs["data"]["DATA"]: + if item["DATA_TYPE"] == "transfer_item": + transfer_manager.cumulative_tarfiles_pushed += 1 + print( + f" (finalize) PUSHING ({transfer_manager.cumulative_tarfiles_pushed}) source item: {item['source_path']}", + flush=True, + ) + + logger.info( + f"{ts_utc()}: DIVING: Submit Transfer for {transfer.transfer_data['label']}" + ) + try: + last_task = submit_transfer_with_checks(transfer_client, transfer.transfer_data) + task_id = last_task.get("task_id") + + # Best-effort: if this batch represents the submission, store the task_id. + if task_id and transfer.is_globus and not transfer.task_id: + transfer.task_id = task_id + + return task_id + + except TransferAPIError as e: + if e.code == "NoCredException": + logger.error( + "{}. Please go to https://app.globus.org/endpoints and activate the endpoint.".format( + e.message + ) + ) + else: + logger.error(e) + sys.exit(1) + except Exception as e: + logger.error("Exception: {}".format(e)) + sys.exit(1) + + +def _collect_globus_task_ids( + transfer_manager: TransferManager, + extra_task_id: Optional[str], +) -> Tuple[List[str], Dict[str, TransferBatch]]: + """ + Return (ordered unique task_ids, task_id->batch mapping for first occurrence). 
- last_task_id = None + Includes: + - All Globus batch.task_id values where batch.file_paths is non-empty + (i.e., still pending deletion) + - extra_task_id if provided + Skips: + - Non-Globus batches + - Batches without task_id + - Batches whose file_paths is empty (already processed/deleted) + """ + task_ids: List[str] = [] + seen: Set[str] = set() + task_to_batch: Dict[str, TransferBatch] = {} + + for batch in transfer_manager.batches: + already_deleted: bool = not batch.file_paths + + if (not batch.is_globus) or (not batch.task_id) or (already_deleted): + continue + + # By this point, we know batch.task_id is not None + tid: str = batch.task_id + if tid in seen: + continue + + seen.add(tid) + task_ids.append(tid) + task_to_batch[tid] = batch + + # Always include extra_task_id (e.g., just-submitted transfer), + # even if not yet reflected in batches. + if extra_task_id and (extra_task_id not in seen): + task_ids.append(extra_task_id) + + return task_ids, task_to_batch + + +def _refresh_batch_status( + transfer_client: TransferClient, + task_id: str, + task_to_batch: Dict[str, TransferBatch], +) -> Optional[str]: + """ + Fetch Globus task status and update corresponding batch.task_status if present. + Returns status, or None if fetch fails. + """ + try: + task: GlobusHTTPResponse = transfer_client.get_task(task_id) + status = task["status"] + batch: Optional[TransferBatch] = task_to_batch.get(task_id) + if batch: + batch.task_status = status + return status + except Exception as e: + logger.warning( + f"{ts_utc()}: Could not fetch status for task_id={task_id}; will wait anyway. ({e})" + ) + return None + + +def _wait_for_all_tasks( + transfer_client: TransferClient, + task_ids: List[str], + task_to_batch: Dict[str, TransferBatch], +) -> None: + """ + For each task_id, refresh status; if not SUCCEEDED, block via globus_wait; + then refresh status again for deletion logic. 
+ """ + for tid in task_ids: + status = _refresh_batch_status(transfer_client, tid, task_to_batch) + if status == "SUCCEEDED": + logger.info(f"{ts_utc()}: task_id={tid} already SUCCEEDED; skipping wait") + continue + + logger.info( + f"{ts_utc()}: Waiting for transfer task_id={tid} to complete (status={status})" + ) + globus_wait(transfer_client, tid) + + # After wait returns, task is terminal; refresh once more. + _refresh_batch_status(transfer_client, tid, task_to_batch) + + +def _prune_empty_batches(transfer_manager: TransferManager) -> None: + """ + Remove batches which have no remaining files to manage. + + Note: we only prune batches whose file_paths is empty, regardless of Globus/HPSS. + That matches current semantics where file_paths=[] means "processed". + """ + before = len(transfer_manager.batches) + transfer_manager.batches = [b for b in transfer_manager.batches if b.file_paths] + after = len(transfer_manager.batches) + if after != before: + logger.debug(f"{ts_utc()}: Pruned {before - after} empty transfer batches") + + +def globus_finalize(transfer_manager: TransferManager) -> None: if transfer_manager.globus_config is None: logger.warning("No GlobusConfig object provided for finalization") return + if transfer_manager.globus_config.transfer_client is None: + logger.warning("GlobusConfig provided but transfer_client is None") + return - transfer: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() - if transfer: - # Check if there's any pending transfer data that hasn't been submitted yet - if transfer.transfer_data: - # DEBUG: review accumulated items in TransferData - logger.info(f"{ts_utc()}: FINAL TransferData: accumulated items:") - attribs = transfer.transfer_data.__dict__ - for item in attribs["data"]["DATA"]: - if item["DATA_TYPE"] == "transfer_item": - transfer_manager.cumulative_tarfiles_pushed += 1 - print( - f" (finalize) PUSHING ({transfer_manager.cumulative_tarfiles_pushed}) source item: {item['source_path']}", - 
flush=True, - ) + # By this point, we know transfer_client is not None + transfer_client: TransferClient = transfer_manager.globus_config.transfer_client - # SUBMIT new transfer here - logger.info( - f"{ts_utc()}: DIVING: Submit Transfer for {transfer.transfer_data['label']}" - ) - try: - last_task = submit_transfer_with_checks( - transfer_manager.globus_config.transfer_client, - transfer.transfer_data, - ) - last_task_id = last_task.get("task_id") - except TransferAPIError as e: - if e.code == "NoCredException": - logger.error( - "{}. Please go to https://app.globus.org/endpoints and activate the endpoint.".format( - e.message - ) - ) - else: - logger.error(e) - sys.exit(1) - except Exception as e: - logger.error("Exception: {}".format(e)) - sys.exit(1) - - # Wait for any submitted transfers to complete - # In non-blocking mode, this ensures index.db and any accumulated tar files complete - # In blocking mode, this is redundant but harmless - skip_last_wait: bool = False - if transfer and transfer.task_id: - if transfer.task_id == last_task_id: - skip_last_wait = ( - True # No reason to call globus_wait twice on the same task_id - ) - logger.info( - f"{ts_utc()}: Waiting for transfer task_id={transfer.task_id} to complete" - ) - globus_wait( - transfer_manager.globus_config.transfer_client, transfer.task_id - ) - if last_task_id and (not skip_last_wait): - logger.info( - f"{ts_utc()}: Waiting for last transfer task_id={last_task_id} to complete" - ) - globus_wait(transfer_manager.globus_config.transfer_client, last_task_id) + last_task_id: Optional[str] = _submit_pending_transfer_data( + transfer_client, transfer_manager + ) + + task_ids: List[str] + task_to_batch: Dict[str, TransferBatch] + task_ids, task_to_batch = _collect_globus_task_ids(transfer_manager, last_task_id) + + _wait_for_all_tasks(transfer_client, task_ids, task_to_batch) + + transfer_manager.delete_successfully_transferred_files() - # Clean up tar files that were queued for deletion - 
transfer_manager.delete_successfully_transferred_files() + _prune_empty_batches(transfer_manager) From c64d0593d5490ef70edb9a1e9e1fd3c6713aabee Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 10:20:50 -0700 Subject: [PATCH 18/34] Make fix to pass follow_symlinks.sh --- zstash/hpss_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 0cd8ba81..64154821 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -334,7 +334,7 @@ def construct_tars( operation: str if itar == -1: - operation = "create" + operation = "creation" else: operation = "update" From 5a7274247bb5439f429a14986a57bc4b74c7d240 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 14:02:52 -0500 Subject: [PATCH 19/34] Address simple code review comments --- .../bash_tests/run_from_any/test_globus_tar_deletion.bash | 2 +- zstash/globus.py | 4 ++-- zstash/hpss_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash index 8649dacb..cd5ac041 100755 --- a/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash +++ b/tests/integration/bash_tests/run_from_any/test_globus_tar_deletion.bash @@ -140,7 +140,7 @@ test_globus_tar_deletion() # Use -v so debug logs show up. zstash create ${blocking_flag} ${keep_flag} --hpss=${globus_path}/${case_name} --maxsize 128 -v zstash_demo 2>&1 | tee ${case_name}.log if [ $? != 0 ]; then - echo "${case_name} failed. Check ${case_name}_create.log for details. Cannot continue." + echo "${case_name} failed. Check ${case_name}.log for details. Cannot continue." return 1 fi echo "${case_name} completed successfully. Checking ${case_name}.log now." 
diff --git a/zstash/globus.py b/zstash/globus.py index 4c1f10b0..6fb27bf6 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -495,10 +495,10 @@ def _prune_empty_batches(transfer_manager: TransferManager) -> None: def globus_finalize(transfer_manager: TransferManager) -> None: if transfer_manager.globus_config is None: - logger.warning("No GlobusConfig object provided for finalization") + logger.debug("No GlobusConfig object provided for finalization") return if transfer_manager.globus_config.transfer_client is None: - logger.warning("GlobusConfig provided but transfer_client is None") + logger.debug("GlobusConfig provided but transfer_client is None") return # By this point, we know transfer_client is not None diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 64154821..c7b01e41 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -130,7 +130,7 @@ def process_tar( else: raise TypeError("Invalid config.hpss={}".format(config.hpss)) - logger.info(f"Contents of the cache prior to `hpss_put`: {os.listdir(cache)}") + logger.debug(f"Contents of the cache prior to `hpss_put`: {os.listdir(cache)}") logger.info( f"{ts_utc()}: DIVING: (add_files): Calling hpss_put to dispatch archive file {self.tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" From 35a5a43abb1f175e89e5b78dfcc0906f51d514c7 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 14:20:07 -0500 Subject: [PATCH 20/34] Address remaining comments --- zstash/globus.py | 14 ++++------ zstash/hpss.py | 12 ++++++-- zstash/transfer_tracking.py | 56 ++++++++++++++++++++++--------------- zstash/update.py | 2 +- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index 6fb27bf6..fe37b4aa 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -124,14 +124,12 @@ def globus_transfer( # noqa: C901 logger.info( f"{ts_utc()}: Previous task_id {mrt.task_id} Still Active. Returning ACTIVE." 
) - # There is currently an active Globus transfer. - # Globus allows 3 simultaneous transfers, but right now, we only do one at a time. - # So, return for now, skipping the rest of `globus_transfer`. - # What happens next? - # We return to constructing tars. - # We won't need to create a new batch because the current batch exists. - # Once the transfer status is finally SUCCEEDED, - # we'll submit the current batch for Globus transfer. + # There is currently an active Globus transfer associated with this + # managed transfer record. For this call, we skip submitting a new + # transfer for this record and return early. + # In non-blocking mode, callers may have multiple Globus transfers + # in flight overall; this branch only ensures we do not resubmit + # the same task while it is still ACTIVE. if non_blocking: return "ACTIVE" else: diff --git a/zstash/hpss.py b/zstash/hpss.py index 00b2c8f0..6163b33e 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -169,13 +169,19 @@ def hpss_put( ) -def hpss_get(hpss: str, file_path: str, cache: str): +def hpss_get( + hpss: str, + file_path: str, + cache: str, + transfer_manager: Optional[TransferManager] = None, +): """ Get a file from the HPSS archive. 
""" url = urlparse(hpss) - transfer_manager: TransferManager = TransferManager() - if url.scheme == "globus": + if not transfer_manager: + transfer_manager = TransferManager() + if (url.scheme == "globus") and not (transfer_manager.globus_config): transfer_manager.globus_config = GlobusConfig() hpss_transfer( hpss, file_path, "get", cache, False, transfer_manager=transfer_manager diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index 74bab2bb..2ab8c6c5 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -53,29 +53,41 @@ def delete_successfully_transferred_files(self): logger.info( f"{ts_utc()}: Checking for successfully transferred files to delete" ) - self.batches = [ - batch for batch in self.batches if batch.file_paths - ] # Clean up empty batches + # Clean up empty batches first + self.batches = [batch for batch in self.batches if batch.file_paths] + # Identify pending Globus batches (not yet succeeded) + pending_globus_batches = [ + batch + for batch in self.batches + if batch.is_globus and batch.task_id and batch.task_status != "SUCCEEDED" + ] + # To avoid excessive Globus API calls, only poll a small subset of + # pending batches on each invocation (here: at most one). 
+ if pending_globus_batches: + batch_to_poll = pending_globus_batches[0] + logger.debug( + f"{ts_utc()}: batch is globus AND is not yet successful " + f"(task_id={batch_to_poll.task_id})" + ) + if self.globus_config and self.globus_config.transfer_client: + # Non-blocking status check + logger.debug( + f"{ts_utc()}: Checking status of task_id={batch_to_poll.task_id}" + ) + task = self.globus_config.transfer_client.get_task( + batch_to_poll.task_id + ) + batch_to_poll.task_status = task["status"] + logger.debug( + f"{ts_utc()}: task_id={batch_to_poll.task_id} " + f"status={batch_to_poll.task_status}" + ) + else: + logger.debug( + f"{ts_utc()}: globus_config is not set up with a transfer client" + ) + # Now delete files for successful transfers for batch in self.batches: - # Check if this is a Globus transfer that needs status update - if batch.is_globus and batch.task_id and (batch.task_status != "SUCCEEDED"): - logger.debug(f"{ts_utc()}: batch is globus AND is not yet successful") - if self.globus_config and self.globus_config.transfer_client: - # Non-blocking status check - logger.debug( - f"{ts_utc()}: Checking status of task_id={batch.task_id}" - ) - task = self.globus_config.transfer_client.get_task(batch.task_id) - batch.task_status = task["status"] - logger.debug( - f"{ts_utc()}: task_id={batch.task_id} status={batch.task_status}" - ) - else: - logger.debug( - f"{ts_utc()}: globus_config is not set up with a transfer client" - ) - - # Now delete if successful if (not batch.is_globus) or (batch.task_status == "SUCCEEDED"): # The files were transferred successfully, so delete them logger.info( diff --git a/zstash/update.py b/zstash/update.py index 5c1db6f5..dcb4c790 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -161,7 +161,7 @@ def update_database( # noqa: C901 else: raise TypeError("Invalid config.hpss={}".format(config.hpss)) transfer_manager.globus_config = globus_activate(hpss) - hpss_get(hpss, get_db_filename(cache), cache) + hpss_get(hpss, 
get_db_filename(cache), cache, transfer_manager) else: error_str: str = ( "--hpss argument is required when local copy of database is unavailable" From c408e5bf6e81a3dcbdd951ac00256279f141c4c9 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 15:30:43 -0500 Subject: [PATCH 21/34] Improve batch status handling and comments --- zstash/globus.py | 32 +++++++++++++++++++++++--------- zstash/hpss.py | 5 +++-- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index fe37b4aa..242b4c68 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -117,22 +117,29 @@ def globus_transfer( # noqa: C901 task: GlobusHTTPResponse try: if mrt and mrt.task_id: + # This is the current transfer task associated with the most recent batch. task = transfer_manager.globus_config.transfer_client.get_task(mrt.task_id) + # Update the most recent batch's task_status based on the current status from Globus API. mrt.task_status = task["status"] - # one of {ACTIVE, SUCCEEDED, FAILED, CANCELED, PENDING, INACTIVE} + # According to https://docs.globus.org/api/transfer/task/#task_fields, + # this will be one of {ACTIVE, INACTIVE, SUCCEEDED, FAILED} if mrt.task_status == "ACTIVE": + # The most recent transfer (mrt) is still active. logger.info( f"{ts_utc()}: Previous task_id {mrt.task_id} Still Active. Returning ACTIVE." ) - # There is currently an active Globus transfer associated with this - # managed transfer record. For this call, we skip submitting a new - # transfer for this record and return early. - # In non-blocking mode, callers may have multiple Globus transfers - # in flight overall; this branch only ensures we do not resubmit - # the same task while it is still ACTIVE. if non_blocking: + # Globus allows up to 3 simultaneous transfers, + # but zstash is currently configured to only ever allow 1. + # If we're in this block, then we're already at 1 active transfer.
+ # We will therefore wait to submit a new transfer until it's done. + # So, we'll simply return and the next run of globus_transfer + # (i.e., on the next tar) will evaluate if the active transfer has finished. return "ACTIVE" else: + # If we're in this block, then the blocking wait + # for the previous transfer to finish was unsuccessful. + # This is an unexpected state and so we raise an error. error_str: str = ( "task_status='ACTIVE', but in blocking mode, the previous transfer should have waited through globus_block_wait" ) @@ -259,16 +266,23 @@ def globus_block_wait( task_status = curr_task["status"] if task_status == "SUCCEEDED": break # Break out of the while-loop. The transfer already succeeded, so no need to retry. - elif task_status in ["FAILED", "CANCELED"]: + elif task_status == "FAILED": error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. No reason to keep retrying now." logger.warning(error_str) # We still need to break, because no matter how long we wait now, nothing will change with the transfer status. break - elif task_status in ["PENDING", "INACTIVE"]: + elif task_status in [ + "INACTIVE", + "UNKNOWN", + "EXHAUSTED_TIMEOUT_RETRIES", + ]: + # The latter two options here are ones we assign manually and aren't included on + # https://docs.globus.org/api/transfer/task/#task_fields error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. Will retry waiting until max_retries is reached." logger.warning(error_str) # Don't break -- continue retries else: + # If we're in this block, then somehow an unexpected task_status was returned. error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} is unexpected for task_id {task_id}. Will retry waiting until max_retries is reached." 
logger.warning(error_str) # Don't break -- continue retries diff --git a/zstash/hpss.py b/zstash/hpss.py index 6163b33e..19a961b7 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -87,8 +87,9 @@ def hpss_transfer( name: str path, name = os.path.split(file_path) - # Add this file to the current batch - if (not keep) and (not is_index): # Never track index.db for deletion + # Never track index.db for deletion, only the tar files + if (not keep) and (not is_index): + # Add this tar file to the current batch transfer_manager.batches[-1].file_paths.append(file_path) logger.debug( f"{ts_utc()}: Added {file_path} to current batch, batch now has {len(transfer_manager.batches[-1].file_paths)} files" From b2fe7c695a4362ad6dd231464f6d5bf680cbd51a Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 15:44:00 -0500 Subject: [PATCH 22/34] Remove pending_globus_batches check --- zstash/transfer_tracking.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index 2ab8c6c5..72ee5a4c 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -55,37 +55,6 @@ def delete_successfully_transferred_files(self): ) # Clean up empty batches first self.batches = [batch for batch in self.batches if batch.file_paths] - # Identify pending Globus batches (not yet succeeded) - pending_globus_batches = [ - batch - for batch in self.batches - if batch.is_globus and batch.task_id and batch.task_status != "SUCCEEDED" - ] - # To avoid excessive Globus API calls, only poll a small subset of - # pending batches on each invocation (here: at most one). 
- if pending_globus_batches: - batch_to_poll = pending_globus_batches[0] - logger.debug( - f"{ts_utc()}: batch is globus AND is not yet successful " - f"(task_id={batch_to_poll.task_id})" - ) - if self.globus_config and self.globus_config.transfer_client: - # Non-blocking status check - logger.debug( - f"{ts_utc()}: Checking status of task_id={batch_to_poll.task_id}" - ) - task = self.globus_config.transfer_client.get_task( - batch_to_poll.task_id - ) - batch_to_poll.task_status = task["status"] - logger.debug( - f"{ts_utc()}: task_id={batch_to_poll.task_id} " - f"status={batch_to_poll.task_status}" - ) - else: - logger.debug( - f"{ts_utc()}: globus_config is not set up with a transfer client" - ) # Now delete files for successful transfers for batch in self.batches: if (not batch.is_globus) or (batch.task_status == "SUCCEEDED"): From ee135f68e9fd7db79e4b3b2802356aa7ee8cca6b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 10 Mar 2026 16:33:13 -0500 Subject: [PATCH 23/34] Address comment on keep --- zstash/create.py | 2 +- zstash/globus.py | 31 ++++++++++++++----------------- zstash/update.py | 2 +- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index 7cd76231..ad48bd4f 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -104,7 +104,7 @@ def create(): ) logger.debug(f"{ts_utc()}: calling globus_finalize()") - globus_finalize(transfer_manager) + globus_finalize(transfer_manager, args.keep) if len(failures) > 0: # List the failures diff --git a/zstash/globus.py b/zstash/globus.py index 242b4c68..0e2a3f71 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -401,30 +401,25 @@ def _submit_pending_transfer_data( def _collect_globus_task_ids( - transfer_manager: TransferManager, - extra_task_id: Optional[str], + transfer_manager: TransferManager, extra_task_id: Optional[str], keep: bool ) -> Tuple[List[str], Dict[str, TransferBatch]]: """ Return (ordered unique task_ids, task_id->batch mapping for first 
occurrence). - - Includes: - - All Globus batch.task_id values where batch.file_paths is non-empty - (i.e., still pending deletion) - - extra_task_id if provided - - Skips: - - Non-Globus batches - - Batches without task_id - - Batches whose file_paths is empty (already processed/deleted) """ task_ids: List[str] = [] seen: Set[str] = set() task_to_batch: Dict[str, TransferBatch] = {} for batch in transfer_manager.batches: - already_deleted: bool = not batch.file_paths - - if (not batch.is_globus) or (not batch.task_id) or (already_deleted): + if not keep: + # NOTE: This is always true if `keep` is set, + # since we never track files for deletion if `keep` is set. + already_deleted: bool = not batch.file_paths + if already_deleted: + # This batch has already been processed and files deleted, so we can skip it. + continue + + if (not batch.is_globus) or (not batch.task_id): continue # By this point, we know batch.task_id is not None @@ -505,7 +500,7 @@ def _prune_empty_batches(transfer_manager: TransferManager) -> None: logger.debug(f"{ts_utc()}: Pruned {before - after} empty transfer batches") -def globus_finalize(transfer_manager: TransferManager) -> None: +def globus_finalize(transfer_manager: TransferManager, keep: bool) -> None: if transfer_manager.globus_config is None: logger.debug("No GlobusConfig object provided for finalization") return @@ -522,7 +517,9 @@ def globus_finalize(transfer_manager: TransferManager) -> None: task_ids: List[str] task_to_batch: Dict[str, TransferBatch] - task_ids, task_to_batch = _collect_globus_task_ids(transfer_manager, last_task_id) + task_ids, task_to_batch = _collect_globus_task_ids( + transfer_manager, last_task_id, keep + ) _wait_for_all_tasks(transfer_client, task_ids, task_to_batch) diff --git a/zstash/update.py b/zstash/update.py index dcb4c790..1d27d799 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -45,7 +45,7 @@ def update(): transfer_manager=transfer_manager, ) - globus_finalize(transfer_manager) + 
globus_finalize(transfer_manager, args.keep) # List failures if len(failures) > 0: From 75efad0379ec54b4c99f5764f866b55a97f76992 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 23 Mar 2026 13:10:06 -0500 Subject: [PATCH 24/34] Update logger calls Reviewed globus.py, hpss.py, hpss_utils.py, transfer_tracking.py --- zstash/hpss.py | 4 ++-- zstash/hpss_utils.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/zstash/hpss.py b/zstash/hpss.py index 19a961b7..10f449a4 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -118,13 +118,13 @@ def hpss_transfer( transfer_manager, endpoint, url_path, name, transfer_type, non_blocking ) logger.info( - f"{ts_utc()}: globus_transfer(name={name}) returned task_status={task_status}" + f"{ts_utc()}: SURFACE: hpss globus_transfer(name={name}) returned task_status={task_status}" ) mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() if mrt and mrt.task_status: globus_status = mrt.task_status logger.info( - f"{ts_utc()}: SURFACE hpss globus_transfer(name={name}) returns {globus_status}" + f"{ts_utc()}: Most recent globus_transfer returned task_status={globus_status}" ) # NOTE: Here, the status could be "EXHAUSTED_TIMEOUT_RETRIES", meaning a very long transfer # or perhaps transfer is hanging. We should decide whether to ignore it, or cancel it, but diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index c7b01e41..35125a9c 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -122,7 +122,7 @@ def process_tar( tar_size = self.tarFileObject.tell() tar_md5: Optional[str] = self.tarFileObject.md5() self.tarFileObject.close() - logger.info(f"{ts_utc()}: (add_files): Completed archive file {self.tfname}") + logger.info(f"{ts_utc()}: (process_tar): Completed archive file {self.tfname}") # 2. 
Transfer the tar to HPSS ######################################### if config.hpss is not None: @@ -133,7 +133,7 @@ def process_tar( logger.debug(f"Contents of the cache prior to `hpss_put`: {os.listdir(cache)}") logger.info( - f"{ts_utc()}: DIVING: (add_files): Calling hpss_put to dispatch archive file {self.tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" + f"{ts_utc()}: DIVING: (process_tar): Calling hpss_put to dispatch archive file {self.tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" ) # Actually transfer the tar file hpss_put( @@ -146,7 +146,7 @@ def process_tar( transfer_manager=transfer_manager, ) logger.info( - f"{ts_utc()}: SURFACE (add_files): Called hpss_put to dispatch archive file {self.tfname}" + f"{ts_utc()}: SURFACE (process_tar): Called hpss_put to dispatch archive file {self.tfname}" ) # 3. Add the tar itself to the tars table ############################# From 985f94ee0ecfb337b7cb85ccbeef357f859d9dc3 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 23 Mar 2026 13:19:21 -0500 Subject: [PATCH 25/34] Rename tar table skip parameter --- zstash/create.py | 2 +- zstash/hpss_utils.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index ad48bd4f..db851b99 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -289,7 +289,7 @@ def create_database( args.keep, args.follow_symlinks, dev_options, - skip_tars_md5=args.no_tars_md5, + skip_tars_table=args.no_tars_md5, non_blocking=args.non_blocking, transfer_manager=transfer_manager, ) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 35125a9c..bc6b88a7 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -109,7 +109,7 @@ def process_tar( keep: bool, non_blocking: bool, transfer_manager: Optional[TransferManager], - skip_tars_md5: bool, + skip_tars_table: bool, cur: sqlite3.Cursor, con: sqlite3.Connection, dev_options: DevOptions, @@ -150,7 +150,7 @@ def process_tar( ) # 3. 
Add the tar itself to the tars table ############################# - if not skip_tars_md5: + if not skip_tars_table: tar_tuple: TupleTarsRowNoId = (self.tfname, tar_size, tar_md5) logger.info("tar name={}, tar size={}, tar md5={}".format(*tar_tuple)) if not tars_table_exists(cur): @@ -319,7 +319,7 @@ def construct_tars( keep: bool, follow_symlinks: bool, dev_options: DevOptions, - skip_tars_md5: bool = False, + skip_tars_table: bool = False, non_blocking: bool = False, transfer_manager: Optional[TransferManager] = None, ) -> List[str]: @@ -354,7 +354,7 @@ def construct_tars( tar_wrapper = TarWrapper( tar_num=itar, cache=cache, - do_hash=not skip_tars_md5, + do_hash=not skip_tars_table, follow_symlinks=follow_symlinks, ) @@ -409,7 +409,7 @@ def construct_tars( keep, non_blocking, transfer_manager, - skip_tars_md5, + skip_tars_table, cur, con, dev_options, From bb11a8d659c3379ae650714e77d3adb1595293c9 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 23 Mar 2026 13:28:43 -0500 Subject: [PATCH 26/34] Add comment explaining write method --- zstash/hpss_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index bc6b88a7..a6c761cf 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -257,6 +257,18 @@ def tell(self) -> int: return self.position def write(self, s): + """ + This is called implicitly. + In TarWrapper.__init__: + + ``` + self.tarFileObject = HashIO(os.path.join(cache, self.tfname), "wb", do_hash) + self.tar = tarfile.open(mode="w", fileobj=self.tarFileObject, dereference=follow_symlinks) + ``` + + tarfile.open requires that the fileobj argument has a write() method. + It calls that method to write data to the tar file. 
+ """ self.f.write(s) if self.hash: self.hash.update(s) From 2931d9e73087f3dd97424fb2243237e4875f84c5 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 23 Mar 2026 13:37:43 -0500 Subject: [PATCH 27/34] Add comment explaining tars table --- zstash/hpss_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index a6c761cf..7310ef8b 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -362,7 +362,12 @@ def construct_tars( archived: List[TupleFilesRowNoId] = [] # Open a new tar - # Note: if we're not skipping tars, we want to calculate the hash of the tars. + # Note: if we're not skipping the tars table, then we DO want to calculate the hash of the tars. + # That is, we DO want to add the tar to the tars table in the database. + # That means we need to calculate the hash of the tar file as well. + # + # We ALWAYS want to calculate the hashes of the individual files, regardless of skip_tars_table, + # because we need to add those to the files table. tar_wrapper = TarWrapper( tar_num=itar, cache=cache, From 357b26499785a9ab5a7f6aa04a66aec130c7553d Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 23 Mar 2026 13:44:10 -0500 Subject: [PATCH 28/34] Update comments on tar submission --- zstash/hpss_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 7310ef8b..3a603f48 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -124,7 +124,7 @@ def process_tar( self.tarFileObject.close() logger.info(f"{ts_utc()}: (process_tar): Completed archive file {self.tfname}") - # 2. Transfer the tar to HPSS ######################################### + # 2. 
Submit the tar to the transfer manager's batch transfer system ### if config.hpss is not None: hpss: str = config.hpss else: @@ -135,7 +135,7 @@ def process_tar( logger.info( f"{ts_utc()}: DIVING: (process_tar): Calling hpss_put to dispatch archive file {self.tfname} [keep, non_blocking] = [{keep}, {non_blocking}]" ) - # Actually transfer the tar file + # Actually submit the tar file hpss_put( hpss, os.path.join(cache, self.tfname), @@ -420,7 +420,7 @@ def construct_tars( # Done adding files to this particular tar. break - # Close and transfer this tar archive, and update the database with the archived files. + # Close the tar, submit it to the batch transfer system, and update the database with the archived files (and optionally the tar as well, depending on skip_tars_table) tar_wrapper.process_tar( cache, keep, From 902df88b6e0d58894ff2dd0b3aa4d3e1ecd81285 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 23 Mar 2026 18:26:47 -0500 Subject: [PATCH 29/34] Reduce complexity of construct_tars --- tests/unit/test_optimized_update.py | 34 ----------- zstash/create.py | 11 ++-- zstash/hpss_utils.py | 92 +++++++++++++++-------------- zstash/update.py | 8 +-- zstash/utils.py | 9 --- 5 files changed, 60 insertions(+), 94 deletions(-) diff --git a/tests/unit/test_optimized_update.py b/tests/unit/test_optimized_update.py index 59b5d78d..5934209b 100644 --- a/tests/unit/test_optimized_update.py +++ b/tests/unit/test_optimized_update.py @@ -563,40 +563,6 @@ def test_time_tolerance_check(self): assert is_within_tolerance == should_match -class TestBackwardCompatibility: - """Tests to ensure backward compatibility with existing code.""" - - def test_get_files_to_archive_still_works(self, tmp_path): - """Test that legacy get_files_to_archive function still works.""" - from zstash.utils import get_files_to_archive - - (tmp_path / "file.txt").write_text("content") - - os.chdir(tmp_path) - result = get_files_to_archive("cache", None, None) - - # Should return list of strings - 
assert isinstance(result, list) - assert len(result) > 0 - assert all(isinstance(item, str) for item in result) - - def test_output_format_matches_original(self, tmp_path): - """Test that file paths are normalized the same way as original.""" - subdir = tmp_path / "subdir" - subdir.mkdir() - (subdir / "file.txt").write_text("content") - - os.chdir(tmp_path) - - from zstash.utils import get_files_to_archive - - legacy_result = get_files_to_archive("cache", None, None) - new_result = list(get_files_to_archive_with_stats("cache", None, None).keys()) - - # Should produce same file list - assert legacy_result == new_result - - @pytest.fixture def mock_database(): """Fixture providing a mock database cursor.""" diff --git a/zstash/create.py b/zstash/create.py index db851b99..1c956a72 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -6,7 +6,8 @@ import os.path import sqlite3 import sys -from typing import Any, List, Tuple +from datetime import datetime +from typing import Any, Dict, List, Tuple from six.moves.urllib.parse import urlparse @@ -17,7 +18,7 @@ from .transfer_tracking import TransferManager from .utils import ( create_tars_table, - get_files_to_archive, + get_files_to_archive_with_stats, run_command, tars_table_exists, ts_utc, @@ -271,7 +272,9 @@ def create_database( cur.execute("insert into config values (?,?)", (attr, value)) con.commit() - files: List[str] = get_files_to_archive(cache, args.include, args.exclude) + file_stats: Dict[str, Tuple[int, datetime]] = get_files_to_archive_with_stats( + cache, args.include, args.exclude + ) failures: List[str] dev_options: DevOptions = DevOptions( @@ -284,7 +287,7 @@ def create_database( cur, con, -1, - files, + file_stats, cache, args.keep, args.follow_symlinks, diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index 3a603f48..aa25c6af 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -7,7 +7,7 @@ import tarfile import traceback from datetime import datetime -from typing import List, 
Optional, Tuple +from typing import Dict, List, Optional, Tuple import _hashlib @@ -290,6 +290,19 @@ def close(self): self.closed = True +def estimate_tar_entry_size(file_size: int) -> int: + """ + Estimate how much space a file of a given size would take in the tar archive, + including metadata and padding. + """ + TAR_BLOCK_SIZE = 512 + TAR_HEADER_SIZE = 512 # per file header + # This formula computes: ceil(file_size / TAR_BLOCK_SIZE) + # But faster and avoiding floats. + data_blocks = (file_size + TAR_BLOCK_SIZE - 1) // TAR_BLOCK_SIZE + return TAR_HEADER_SIZE + (data_blocks * TAR_BLOCK_SIZE) + + # Add file to tar archive while computing its hash # Return file offset (in tar archive), size and md5 hash def add_file_to_tar_archive( @@ -326,7 +339,7 @@ def construct_tars( cur: sqlite3.Cursor, con: sqlite3.Connection, itar: int, - files: List[str], + file_stats: Dict[str, Tuple[int, datetime]], cache: str, keep: bool, follow_symlinks: bool, @@ -337,6 +350,7 @@ def construct_tars( ) -> List[str]: failures: List[str] = [] + files: List[str] = list(file_stats.keys()) nfiles: int = len(files) if config.maxsize is not None: @@ -351,14 +365,13 @@ def construct_tars( operation = "update" i_file: int = 0 - carried_over_tar_info: Optional[tarfile.TarInfo] = None while i_file < nfiles: # Each iteration of this loop constructs one tar # `create` passes in itar=-1, so the first tar will be 000000.tar # `update` passes in itar=max existing tar number, so the first tar will be max+1 itar += 1 - tar_size: int = 0 + cumulative_tar_size: int = 0 archived: List[TupleFilesRowNoId] = [] # Open a new tar @@ -378,47 +391,40 @@ def construct_tars( # Add files to the tar until we reach the max size while i_file < nfiles: current_file: str = files[i_file] - if carried_over_tar_info: - # If current_file wasn't added to the last tar, - # then we can immediately add it now. - # No need to repeat the size calculations. 
- tar_info = carried_over_tar_info - current_file_size = tar_info.size - carried_over_tar_info = None # Reset for next iteration - else: - try: - # It's ok that current_file isn't in the tar yet. - tar_info = tar_wrapper.tar.gettarinfo(current_file) - if tar_info.islnk(): - tar_info.size = os.path.getsize(current_file) - current_file_size = tar_info.size - except FileNotFoundError: - logger.error(f"Archiving {current_file}") - if follow_symlinks: - raise Exception( - f"Archive {operation} failed due to broken symlink." - ) - else: - raise - - # Check if adding this file would send us over the max size. - if (tar_size == 0) or (tar_size + current_file_size <= max_size): - # Case 1: if the tar is empty, always add the file, even if it's over the max size. - # Case 2: if the tar is nonempty, only add the file if it won't put us over the max size. - new_tar_size = tar_wrapper.process_file( - current_file, tar_info, archived, failures - ) - if new_tar_size != 0: - tar_size = new_tar_size - # Else: process_file failed, so we should keep the original tar_size - i_file += 1 - else: - # Over the size limit: - # Time to close and transfer this tar archive. - carried_over_tar_info = tar_info # Carry over this info to the next tar - # Break out of the inner while-loop: + current_file_size: int + current_file_size, _ = file_stats[current_file] + estimated_entry_size: int = estimate_tar_entry_size(current_file_size) + if (cumulative_tar_size != 0) and ( + cumulative_tar_size + estimated_entry_size > max_size + ): + # Over the size limit: time to close and transfer this tar archive. # Done adding files to this particular tar. + # Break out of the inner while-loop break + # If we make it this far, + # we know we can add the current file without going over the max size. + # (Either that, or the tar is currently empty, + # in which case we add the file even if it's over the max size.) 
+ try: + tar_info = tar_wrapper.tar.gettarinfo(current_file) + if tar_info.islnk(): + tar_info.size = os.path.getsize(current_file) + except FileNotFoundError: + logger.error(f"Archiving {current_file}") + if follow_symlinks: + raise Exception( + f"Archive {operation} failed due to broken symlink." + ) + else: + raise + new_cumulative_tar_size = tar_wrapper.process_file( + current_file, tar_info, archived, failures + ) + if new_cumulative_tar_size != 0: + # Update the cumulative tar size with the new tar size returned by process_file. + cumulative_tar_size = new_cumulative_tar_size + # Else: process_file failed, so we should keep the original cumulative_tar_size + i_file += 1 # Close the tar, submit it to the batch transfer system, and update the database with the archived files (and optionally the tar as well, depending on skip_tars_table) tar_wrapper.process_tar( diff --git a/zstash/update.py b/zstash/update.py index 1d27d799..817da424 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -226,7 +226,7 @@ def update_database( # noqa: C901 else: archived_files[file_path] = (size, mtime) - newfiles: List[str] = [] + newfiles: Dict[str, Tuple[int, datetime]] = {} files_checked = 0 for file_path in files: @@ -236,7 +236,7 @@ def update_database( # noqa: C901 # Check if file exists in database if file_path not in archived_files: # File not in database - it's new - newfiles.append(file_path) + newfiles[file_path] = (size_new, mdtime_new) else: # File exists in database - check if it changed archived_size, archived_mtime = archived_files[file_path] @@ -246,7 +246,7 @@ def update_database( # noqa: C901 and (abs((mdtime_new - archived_mtime).total_seconds()) <= TIME_TOL) ): # File has changed - newfiles.append(file_path) + newfiles[file_path] = (size_new, mdtime_new) files_checked += 1 @@ -261,7 +261,7 @@ def update_database( # noqa: C901 # --dry-run option if args.dry_run: print("List of files to be updated") - for file_path in newfiles: + for file_path in 
newfiles.keys(): print(file_path) # Close database con.commit() diff --git a/zstash/utils.py b/zstash/utils.py index 1bec4f3e..04154136 100644 --- a/zstash/utils.py +++ b/zstash/utils.py @@ -220,15 +220,6 @@ def get_files_to_archive_with_stats( return file_stats -def get_files_to_archive(cache: str, include: str, exclude: str) -> List[str]: - """ - LEGACY VERSION: Still used for `zstash create`. - Uses the optimized version but returns only the file list. - """ - file_stats = get_files_to_archive_with_stats(cache, include, exclude) - return list(file_stats.keys()) - - def update_config(cur: sqlite3.Cursor): # Retrieve some configuration settings from database # Loop through all attributes of config. From 898e95e12c01b51517cea9e71b3b9eeb65f3a82e Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 26 Mar 2026 15:35:54 -0500 Subject: [PATCH 30/34] Address comments --- zstash/globus.py | 112 +++++++++++++++++++++--------------- zstash/globus_utils.py | 3 + zstash/hpss.py | 6 +- zstash/transfer_tracking.py | 3 +- 4 files changed, 74 insertions(+), 50 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index 0e2a3f71..b20d01e5 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -3,7 +3,7 @@ import sys from typing import Dict, List, Optional, Set, Tuple -from globus_sdk import TransferAPIError, TransferClient +from globus_sdk import TransferAPIError, TransferClient, TransferData from globus_sdk.response import GlobusHTTPResponse from globus_sdk.services.transfer.response.iterable import IterableTransferResponse from six.moves.urllib.parse import urlparse @@ -65,6 +65,20 @@ def file_exists(archive_directory_listing: IterableTransferResponse, name: str) return False +def update_cumulative_tarfiles_pushed( + transfer_manager: TransferManager, transfer_data: TransferData +) -> None: + logger.info(f"{ts_utc()}: TransferData: accumulated items:") + attribs = transfer_data.__dict__ + for item in attribs["data"]["DATA"]: + if item["DATA_TYPE"] == "transfer_item": 
+ transfer_manager.cumulative_tarfiles_pushed += 1 + print( + f"PUSHED (#{transfer_manager.cumulative_tarfiles_pushed}) tars, STORED source item: {item['source_path']}", + flush=True, + ) + + # C901 'globus_transfer' is too complex (20) def globus_transfer( # noqa: C901 transfer_manager: TransferManager, @@ -103,7 +117,11 @@ def globus_transfer( # noqa: C901 ) sys.exit(1) - mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() + mrb: Optional[TransferBatch] = transfer_manager.get_most_recent_batch() + if not mrb: + raise RuntimeError( + "The transfer manager should always have at least one batch by the time globus_transfer is called, however, the batch list is empty." + ) transfer_data = set_up_TransferData( transfer_type, transfer_manager.globus_config.local_endpoint, @@ -111,22 +129,22 @@ def globus_transfer( # noqa: C901 remote_path, name, transfer_manager.globus_config.transfer_client, - mrt.transfer_data if mrt else None, + mrb.transfer_data, ) task: GlobusHTTPResponse try: - if mrt and mrt.task_id: + if mrb.task_id: # This the current transfer task associated with the most recent batch. - task = transfer_manager.globus_config.transfer_client.get_task(mrt.task_id) + task = transfer_manager.globus_config.transfer_client.get_task(mrb.task_id) # Update the most recent batch's task_status based on the current status from Globus API. - mrt.task_status = task["status"] + mrb.task_status = task["status"] # According to https://docs.globus.org/api/transfer/task/#task_fields, # this will be one of {ACTIVE, INACTIVE, SUCCEEDED, FAILED} - if mrt.task_status == "ACTIVE": - # The most recent transfer (mrt) is still active. + if mrb.task_status == "ACTIVE": + # The most recent transfer (mrb) is still active. logger.info( - f"{ts_utc()}: Previous task_id {mrt.task_id} Still Active. Returning ACTIVE." + f"{ts_utc()}: Previous task_id {mrb.task_id} Still Active. Returning ACTIVE." 
) if non_blocking: # Globus allows up to 3 simulataneous transfers, @@ -145,34 +163,32 @@ def globus_transfer( # noqa: C901 ) logger.error(error_str) raise RuntimeError(error_str) - elif mrt.task_status == "SUCCEEDED": + elif mrb.task_status == "SUCCEEDED": logger.info( - f"{ts_utc()}: Previous task_id {mrt.task_id} status = SUCCEEDED." + f"{ts_utc()}: Previous task_id {mrb.task_id} status = SUCCEEDED." ) src_ep = task["source_endpoint_id"] dst_ep = task["destination_endpoint_id"] label = task["label"] ts = ts_utc() logger.info( - f"{ts}:Globus transfer {mrt.task_id}, from {src_ep} to {dst_ep}: {label} succeeded" + f"{ts}:Globus transfer {mrb.task_id}, from {src_ep} to {dst_ep}: {label} succeeded" ) # The previous transfer succeeded. # That means we can transfer the current batch now. else: + # The previous transfer is in an unexpected state (i.e., "INACTIVE", "FAILED"). + # Either way, the previous transfer is effectively terminated, + # so we will proceed with the current transfer attempt. + # (I.e., we will not return yet). + # Note: any status we manually set + # (I.e., "UNKNOWN", "SUBMITTED", "EXHAUSTED_TIMEOUT_RETRIES") is NOT possible here, + # because we're using `task["status"]` from the globus_sdk TransferClient. logger.error( - f"{ts_utc()}: Previous task_id {mrt.task_id} status = {mrt.task_status}." + f"{ts_utc()}: Previous task_id {mrb.task_id} status = {mrb.task_status}." 
) - # DEBUG: review accumulated items in TransferData - logger.info(f"{ts_utc()}: TransferData: accumulated items:") - attribs = transfer_data.__dict__ - for item in attribs["data"]["DATA"]: - if item["DATA_TYPE"] == "transfer_item": - transfer_manager.cumulative_tarfiles_pushed += 1 - print( - f" (routine) PUSHING (#{transfer_manager.cumulative_tarfiles_pushed}) STORED source item: {item['source_path']}", - flush=True, - ) + update_cumulative_tarfiles_pushed(transfer_manager, transfer_data) logger.info(f"{ts_utc()}: DIVING: Submit Transfer for {transfer_data['label']}") # Submit the current transfer_data @@ -181,19 +197,26 @@ def globus_transfer( # noqa: C901 transfer_manager.globus_config.transfer_client, transfer_data ) task_id = task.get("task_id") - # NOTE: This log message is misleading. If we have accumulated multiple tar files for transfer, - # the "lable" given here refers only to the LAST tarfile in the TransferData list. logger.info( - f"{ts_utc()}: SURFACE Submit Transfer returned new task_id = {task_id} for label {transfer_data['label']}" + f"{ts_utc()}: SURFACE Submit Transfer returned new task_id = {task_id}, with last tarfile having label: {transfer_data['label']}" ) # Update the current batch with the task info # The batch was already created in hpss_transfer with files added to it # We just need to mark it as submitted if transfer_manager.batches: + # Update these two fields of the most recent batch + # (which is still available in this function as `mrb`). transfer_manager.batches[-1].task_id = task_id - transfer_manager.batches[-1].task_status = "UNKNOWN" - transfer_manager.batches[-1].transfer_data = None # Was just submitted + transfer_manager.batches[-1].task_status = "SUBMITTED" + else: + # This block should be impossible to reach. + # By now, we've ensured that `get_most_recent_batch()` returns a batch, + # and we haven't removed any batches since then, + # so there should always be at least one batch in `batches`. 
+ error_str = "transfer_manager has no batches" + logger.error(error_str) + raise RuntimeError(error_str) # Nullify the submitted transfer data structure so that a new one will be created on next call. transfer_data = None @@ -211,22 +234,26 @@ def globus_transfer( # noqa: C901 logger.error("Exception: {}".format(e)) sys.exit(1) - new_mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() - # test for blocking on new task_id task_status = "UNKNOWN" - if new_mrt and new_mrt.task_id: + if mrb.task_id: if not non_blocking: # If blocking, wait for the task to complete and get the final status, # before we proceed with any more transfers. - new_mrt.task_status = globus_block_wait( + mrb.task_status = globus_block_wait( transfer_manager.globus_config.transfer_client, - task_id=new_mrt.task_id, + task_id=mrb.task_id, ) - task_status = new_mrt.task_status + task_status = mrb.task_status else: logger.info( - f"{ts_utc()}: NO BLOCKING (task_wait) for task_id {new_mrt.task_id}" + f"{ts_utc()}: NO BLOCKING (task_wait) for task_id {mrb.task_id}" ) + else: + # This block should be impossible to reach. + # By now, we've set `transfer_manager.batches[-1].task_id = task_id` or else raised an error, so `mrb.task_id` should always be set. + error_str = "No task_id found for most recent batch after submission" + logger.error(f"{ts_utc()}: {error_str}") + raise RuntimeError(error_str) if transfer_type == "put": return task_status @@ -274,9 +301,10 @@ def globus_block_wait( elif task_status in [ "INACTIVE", "UNKNOWN", + "SUBMITTED", "EXHAUSTED_TIMEOUT_RETRIES", ]: - # The latter two options here are ones we assign manually and aren't included on + # The latter three options here are ones we assign manually and aren't included on # https://docs.globus.org/api/transfer/task/#task_fields error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. Will retry waiting until max_retries is reached." 
logger.warning(error_str) @@ -358,19 +386,11 @@ def _submit_pending_transfer_data( If the most recent batch has unsubmitted TransferData, submit it and return task_id. Otherwise return None. """ - transfer: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() + transfer: Optional[TransferBatch] = transfer_manager.get_most_recent_batch() if not transfer or not transfer.transfer_data: return None - logger.info(f"{ts_utc()}: FINAL TransferData: accumulated items:") - attribs = transfer.transfer_data.__dict__ - for item in attribs["data"]["DATA"]: - if item["DATA_TYPE"] == "transfer_item": - transfer_manager.cumulative_tarfiles_pushed += 1 - print( - f" (finalize) PUSHING ({transfer_manager.cumulative_tarfiles_pushed}) source item: {item['source_path']}", - flush=True, - ) + update_cumulative_tarfiles_pushed(transfer_manager, transfer.transfer_data) logger.info( f"{ts_utc()}: DIVING: Submit Transfer for {transfer.transfer_data['label']}" diff --git a/zstash/globus_utils.py b/zstash/globus_utils.py index e5346f69..669b6831 100644 --- a/zstash/globus_utils.py +++ b/zstash/globus_utils.py @@ -226,6 +226,9 @@ def set_up_TransferData( transfer_client: TransferClient, transfer_data: Optional[TransferData] = None, ) -> TransferData: + """ + Set up the TransferData object, creating one if not provided. 
+ """ if not local_endpoint: raise ValueError("Local endpoint ID is not set.") if not remote_endpoint: diff --git a/zstash/hpss.py b/zstash/hpss.py index 10f449a4..e8693e4c 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -120,9 +120,9 @@ def hpss_transfer( logger.info( f"{ts_utc()}: SURFACE: hpss globus_transfer(name={name}) returned task_status={task_status}" ) - mrt: Optional[TransferBatch] = transfer_manager.get_most_recent_transfer() - if mrt and mrt.task_status: - globus_status = mrt.task_status + mrb: Optional[TransferBatch] = transfer_manager.get_most_recent_batch() + if mrb and mrb.task_status: + globus_status = mrb.task_status logger.info( f"{ts_utc()}: Most recent globus_transfer returned task_status={globus_status}" ) diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index 72ee5a4c..84f74a11 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -45,7 +45,8 @@ def __init__(self): # Connection state (Globus-specific, None if not using Globus) self.globus_config: Optional[GlobusConfig] = None - def get_most_recent_transfer(self) -> Optional[TransferBatch]: + def get_most_recent_batch(self) -> Optional[TransferBatch]: + """Get the last batch added to the manager, or None if no batches exist""" return self.batches[-1] if self.batches else None def delete_successfully_transferred_files(self): From 5a6c7f0f38c10b82271cbab17e9a05a6b7ce4f5a Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 26 Mar 2026 17:08:03 -0500 Subject: [PATCH 31/34] Add TaskStatus Enum --- zstash/globus.py | 62 +++++++++++++++---------------------- zstash/hpss.py | 6 ++-- zstash/transfer_tracking.py | 43 +++++++++++++++++++++++-- 3 files changed, 69 insertions(+), 42 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index b20d01e5..16f22b80 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -17,7 +17,7 @@ submit_transfer_with_checks, ) from .settings import logger -from .transfer_tracking import GlobusConfig, 
TransferBatch, TransferManager +from .transfer_tracking import GlobusConfig, TaskStatus, TransferBatch, TransferManager from .utils import ts_utc @@ -87,7 +87,7 @@ def globus_transfer( # noqa: C901 name: str, transfer_type: str, non_blocking: bool, -) -> str: +) -> TaskStatus: logger.info(f"{ts_utc()}: Entered globus_transfer() for name = {name}") logger.debug(f"{ts_utc()}: non_blocking = {non_blocking}") @@ -138,10 +138,8 @@ def globus_transfer( # noqa: C901 # This the current transfer task associated with the most recent batch. task = transfer_manager.globus_config.transfer_client.get_task(mrb.task_id) # Update the most recent batch's task_status based on the current status from Globus API. - mrb.task_status = task["status"] - # According to https://docs.globus.org/api/transfer/task/#task_fields, - # this will be one of {ACTIVE, INACTIVE, SUCCEEDED, FAILED} - if mrb.task_status == "ACTIVE": + mrb.task_status = TaskStatus.convert_from_status_from_globus_sdk(task) + if mrb.task_status == TaskStatus.ACTIVE: # The most recent transfer (mrb) is still active. logger.info( f"{ts_utc()}: Previous task_id {mrb.task_id} Still Active. Returning ACTIVE." @@ -153,7 +151,7 @@ def globus_transfer( # noqa: C901 # We will therefore wait to submit a new transfer until it's done. # So, we'll simply return and the next run of globus_transfer # (i.e., on the next tar) will evaluate if the active transfer has finished. - return "ACTIVE" + return TaskStatus.ACTIVE else: # If we're in this block, then the blocking wait # for the previous transfer to finish was unsuccessful. @@ -163,7 +161,7 @@ def globus_transfer( # noqa: C901 ) logger.error(error_str) raise RuntimeError(error_str) - elif mrb.task_status == "SUCCEEDED": + elif mrb.task_status == TaskStatus.SUCCEEDED: logger.info( f"{ts_utc()}: Previous task_id {mrb.task_id} status = SUCCEEDED." 
) @@ -208,7 +206,7 @@ def globus_transfer( # noqa: C901 # Update these two fields of the most recent batch # (which is still available in this function as `mrb`). transfer_manager.batches[-1].task_id = task_id - transfer_manager.batches[-1].task_status = "SUBMITTED" + transfer_manager.batches[-1].task_status = TaskStatus.SUBMITTED else: # This block should be impossible to reach. # By now, we've ensured that `get_most_recent_batch()` returns a batch, @@ -234,7 +232,7 @@ def globus_transfer( # noqa: C901 logger.error("Exception: {}".format(e)) sys.exit(1) - task_status = "UNKNOWN" + task_status: TaskStatus = TaskStatus.UNKNOWN if mrb.task_id: if not non_blocking: # If blocking, wait for the task to complete and get the final status, @@ -269,14 +267,14 @@ def globus_block_wait( task_id: str, wait_timeout: int = 7200, # 7200/3600 = 2 hours max_retries: int = 5, -): +) -> TaskStatus: # Poll every "polling_interval" seconds to speed up small transfers. # Report every "wait_timeout" seconds, and stop waiting after "max_retries" reports. # By default: report every 2 hours, stop waiting after 5*2 = 10 hours logger.info( f"{ts_utc()}: BLOCKING START: invoking task_wait for task_id = {task_id}" ) - task_status: str = "UNKNOWN" + task_status: TaskStatus = TaskStatus.UNKNOWN retry_count: int = 0 while retry_count < max_retries: try: @@ -289,29 +287,17 @@ def globus_block_wait( task_id, timeout=wait_timeout, polling_interval=10 ) if task_is_not_active: - curr_task = transfer_client.get_task(task_id) - task_status = curr_task["status"] - if task_status == "SUCCEEDED": + curr_task: GlobusHTTPResponse = transfer_client.get_task(task_id) + task_status = TaskStatus.convert_from_status_from_globus_sdk(curr_task) + if task_status == TaskStatus.SUCCEEDED: break # Break out of the while-loop. The transfer already succeeded, so no need to retry. 
- elif task_status == "FAILED": + elif task_status == TaskStatus.FAILED: error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. No reason to keep retrying now." logger.warning(error_str) # We still need to break, because no matter how long we wait now, nothing will change with the transfer status. break - elif task_status in [ - "INACTIVE", - "UNKNOWN", - "SUBMITTED", - "EXHAUSTED_TIMEOUT_RETRIES", - ]: - # The latter three options here are ones we assign manually and aren't included on - # https://docs.globus.org/api/transfer/task/#task_fields - error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. Will retry waiting until max_retries is reached." - logger.warning(error_str) - # Don't break -- continue retries else: - # If we're in this block, then somehow an unexpected task_status was returned. - error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} is unexpected for task_id {task_id}. Will retry waiting until max_retries is reached." + error_str = f"{ts_utc()}: task_wait returned True, but task_status={task_status} for task_id {task_id}. Will retry waiting until max_retries is reached." logger.warning(error_str) # Don't break -- continue retries logger.info(f"{ts_utc()}: done with wait") @@ -327,7 +313,7 @@ def globus_block_wait( logger.info( f"{ts_utc()}: BLOCKING EXHAUSTED {max_retries} of timeout {wait_timeout} seconds" ) - task_status = "EXHAUSTED_TIMEOUT_RETRIES" + task_status = TaskStatus.EXHAUSTED_TIMEOUT_RETRIES logger.info( f"{ts_utc()}: BLOCKING ENDS: task_id {task_id} returned from task_wait with status {task_status}" @@ -339,8 +325,10 @@ def globus_block_wait( def globus_wait(transfer_client: TransferClient, task_id: str): try: """ - A Globus transfer job (task) can be in one of the three states: - ACTIVE, SUCCEEDED, FAILED. 
The script every 20 seconds polls a + A Globus transfer job (task) can be in one of the four states: + {ACTIVE, SUCCEEDED, FAILED, INACTIVE} + according to https://docs.globus.org/api/transfer/task/#task_fields. + The script every 20 seconds polls a status of the transfer job (task) from the Globus Transfer service, with 20 second timeout limit. If the task is ACTIVE after time runs out 'task_wait' returns False, and True otherwise. @@ -351,8 +339,8 @@ def globus_wait(transfer_client: TransferClient, task_id: str): The Globus transfer job (task) has been finished (SUCCEEDED or FAILED). Check if the transfer SUCCEEDED or FAILED. """ - task = transfer_client.get_task(task_id) - if task["status"] == "SUCCEEDED": + task: GlobusHTTPResponse = transfer_client.get_task(task_id) + if TaskStatus.convert_from_status_from_globus_sdk(task) == TaskStatus.SUCCEEDED: src_ep = task["source_endpoint_id"] dst_ep = task["destination_endpoint_id"] label = task["label"] @@ -463,14 +451,14 @@ def _refresh_batch_status( transfer_client: TransferClient, task_id: str, task_to_batch: Dict[str, TransferBatch], -) -> Optional[str]: +) -> Optional[TaskStatus]: """ Fetch Globus task status and update corresponding batch.task_status if present. Returns status, or None if fetch fails. 
""" try: task: GlobusHTTPResponse = transfer_client.get_task(task_id) - status = task["status"] + status: TaskStatus = TaskStatus.convert_from_status_from_globus_sdk(task) batch: Optional[TransferBatch] = task_to_batch.get(task_id) if batch: batch.task_status = status @@ -493,7 +481,7 @@ def _wait_for_all_tasks( """ for tid in task_ids: status = _refresh_batch_status(transfer_client, tid, task_to_batch) - if status == "SUCCEEDED": + if status == TaskStatus.SUCCEEDED: logger.info(f"{ts_utc()}: task_id={tid} already SUCCEEDED; skipping wait") continue diff --git a/zstash/hpss.py b/zstash/hpss.py index e8693e4c..fac368f8 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -8,7 +8,7 @@ from .globus import globus_transfer from .settings import get_db_filename, logger -from .transfer_tracking import GlobusConfig, TransferBatch, TransferManager +from .transfer_tracking import GlobusConfig, TaskStatus, TransferBatch, TransferManager from .utils import run_command, ts_utc @@ -108,13 +108,13 @@ def hpss_transfer( # For `get`, this directory is where the file we get from HPSS will go. 
os.chdir(path) - globus_status: str = "UNKNOWN" + globus_status: TaskStatus = TaskStatus.UNKNOWN if scheme == "globus": if not transfer_manager.globus_config: transfer_manager.globus_config = GlobusConfig() # Transfer file using the Globus Transfer Service logger.info(f"{ts_utc()}: DIVING: hpss calls globus_transfer(name={name})") - task_status: str = globus_transfer( + task_status: TaskStatus = globus_transfer( transfer_manager, endpoint, url_path, name, transfer_type, non_blocking ) logger.info( diff --git a/zstash/transfer_tracking.py b/zstash/transfer_tracking.py index 84f74a11..1cac18d5 100644 --- a/zstash/transfer_tracking.py +++ b/zstash/transfer_tracking.py @@ -1,7 +1,9 @@ import os +from enum import Enum, auto from typing import List, Optional from globus_sdk import TransferClient, TransferData +from globus_sdk.response import GlobusHTTPResponse from globus_sdk.services.transfer.response.iterable import IterableTransferResponse from .settings import logger @@ -18,13 +20,50 @@ def __init__(self): self.archive_directory_listing: Optional[IterableTransferResponse] = None +class TaskStatus(Enum): + """Enum for Globus transfer task status""" + + # The first 4 values are defined by the Globus API: + # https://docs.globus.org/api/transfer/task/#task_fields + SUCCEEDED = auto() + ACTIVE = auto() + INACTIVE = auto() + FAILED = auto() + # The last 3 values are custom statuses we add on. 
+ UNKNOWN = auto() + SUBMITTED = auto() + EXHAUSTED_TIMEOUT_RETRIES = auto() + + @classmethod + def convert_from_status_from_globus_sdk(cls, globus_task: GlobusHTTPResponse): + """Convert a Globus API status string to a TaskStatus enum value""" + status_from_globus_sdk: str = globus_task["status"] + status_from_globus_sdk = status_from_globus_sdk.upper() + if status_from_globus_sdk == "SUCCEEDED": + return TaskStatus.SUCCEEDED + elif status_from_globus_sdk == "ACTIVE": + return TaskStatus.ACTIVE + elif status_from_globus_sdk == "INACTIVE": + return TaskStatus.INACTIVE + elif status_from_globus_sdk == "FAILED": + return TaskStatus.FAILED + else: + logger.warning( + f"Received unrecognized Globus status: {status_from_globus_sdk}" + ) + return TaskStatus.UNKNOWN + + def __str__(self) -> str: + return self.name + + class TransferBatch: """Represents one batch of files being transferred""" def __init__(self): self.file_paths: List[str] = [] self.task_id: Optional[str] = None - self.task_status: Optional[str] = None + self.task_status: Optional[TaskStatus] = None self.is_globus: bool = False self.transfer_data: Optional[TransferData] = None # Only for Globus @@ -58,7 +97,7 @@ def delete_successfully_transferred_files(self): self.batches = [batch for batch in self.batches if batch.file_paths] # Now delete files for successful transfers for batch in self.batches: - if (not batch.is_globus) or (batch.task_status == "SUCCEEDED"): + if (not batch.is_globus) or (batch.task_status == TaskStatus.SUCCEEDED): # The files were transferred successfully, so delete them logger.info( f"{ts_utc()}: Deleting {len(batch.file_paths)} files from successful transfer" From f7570944af33951275e0b813538ca1fdfc14a563 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 30 Mar 2026 14:59:56 -0500 Subject: [PATCH 32/34] Improve handling of TransferData --- zstash/globus.py | 40 ++++++++++++++++++++------ zstash/globus_utils.py | 64 ++++++++++++++++++++++++------------------ 2 files changed, 68 
insertions(+), 36 deletions(-) diff --git a/zstash/globus.py b/zstash/globus.py index 16f22b80..dbf46f6e 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -10,10 +10,12 @@ from .globus_utils import ( HPSS_ENDPOINT_MAP, + add_file_to_TransferData, check_state_files, + create_TransferData, + get_label, get_local_endpoint_id, get_transfer_client_with_auth, - set_up_TransferData, submit_transfer_with_checks, ) from .settings import logger @@ -122,14 +124,37 @@ def globus_transfer( # noqa: C901 raise RuntimeError( "The transfer manager should always have at least one batch by the time globus_transfer is called, however, the batch list is empty." ) - transfer_data = set_up_TransferData( + + if transfer_manager.globus_config.local_endpoint: + local_endpoint: str = transfer_manager.globus_config.local_endpoint + else: + raise ValueError("Local endpoint ID is not set.") + if transfer_manager.globus_config.remote_endpoint: + remote_endpoint: str = transfer_manager.globus_config.remote_endpoint + else: + raise ValueError("Remote endpoint ID is not set.") + label: str = get_label(remote_path, name) + transfer_data: TransferData + if mrb.transfer_data: + # We already have a TransferData for this batch. + transfer_data = mrb.transfer_data + else: + # We need to create a new TransferData for this batch. 
+ transfer_data = create_TransferData( + transfer_type, + local_endpoint, + remote_endpoint, + transfer_manager.globus_config.transfer_client, + label, + ) + add_file_to_TransferData( transfer_type, - transfer_manager.globus_config.local_endpoint, - transfer_manager.globus_config.remote_endpoint, + local_endpoint, + remote_endpoint, remote_path, name, - transfer_manager.globus_config.transfer_client, - mrb.transfer_data, + transfer_data, + label, ) task: GlobusHTTPResponse @@ -215,9 +240,6 @@ def globus_transfer( # noqa: C901 error_str = "transfer_manager has no batches" logger.error(error_str) raise RuntimeError(error_str) - - # Nullify the submitted transfer data structure so that a new one will be created on next call. - transfer_data = None except TransferAPIError as e: if e.code == "NoCredException": logger.error( diff --git a/zstash/globus_utils.py b/zstash/globus_utils.py index 669b6831..9c6f4e2a 100644 --- a/zstash/globus_utils.py +++ b/zstash/globus_utils.py @@ -217,51 +217,61 @@ def save_tokens(token_response): # Primarily used by globus_transfer ########################################### -def set_up_TransferData( +def get_label(remote_path: str, name: str) -> str: + subdir = os.path.basename(os.path.normpath(remote_path)) + subdir_label = re.sub("[^A-Za-z0-9_ -]", "", subdir) + filename = name.split(".")[0] + label = subdir_label + " " + filename + return label + + +def create_TransferData( + transfer_type: str, + local_endpoint: str, + remote_endpoint: str, + transfer_client: TransferClient, + label: str, +) -> TransferData: + if transfer_type == "get": + src_ep = remote_endpoint + dst_ep = local_endpoint + else: + src_ep = local_endpoint + dst_ep = remote_endpoint + transfer_data = TransferData( + transfer_client, + src_ep, + dst_ep, + label=label, + verify_checksum=True, + preserve_timestamp=True, + fail_on_quota_errors=True, + ) + return transfer_data + + +def add_file_to_TransferData( transfer_type: str, local_endpoint: Optional[str], 
remote_endpoint: Optional[str], remote_path: str, name: str, - transfer_client: TransferClient, - transfer_data: Optional[TransferData] = None, -) -> TransferData: - """ - Set up the TransferData object, creating one if not provided. - """ + transfer_data: TransferData, + label: str, +): if not local_endpoint: raise ValueError("Local endpoint ID is not set.") if not remote_endpoint: raise ValueError("Remote endpoint ID is not set.") if transfer_type == "get": - src_ep = remote_endpoint src_path = os.path.join(remote_path, name) - dst_ep = local_endpoint dst_path = os.path.join(os.getcwd(), name) else: - src_ep = local_endpoint src_path = os.path.join(os.getcwd(), name) - dst_ep = remote_endpoint dst_path = os.path.join(remote_path, name) - subdir = os.path.basename(os.path.normpath(remote_path)) - subdir_label = re.sub("[^A-Za-z0-9_ -]", "", subdir) - filename = name.split(".")[0] - label = subdir_label + " " + filename - - if not transfer_data: - transfer_data = TransferData( - transfer_client, - src_ep, - dst_ep, - label=label, - verify_checksum=True, - preserve_timestamp=True, - fail_on_quota_errors=True, - ) transfer_data.add_item(src_path, dst_path) transfer_data["label"] = label - return transfer_data def submit_transfer_with_checks(transfer_client, transfer_data) -> GlobusHTTPResponse: From d57c3118e6b7e7a6651b36d91f8e4445d14639d5 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 3 Apr 2026 13:20:36 -0500 Subject: [PATCH 33/34] Address comments --- zstash/create.py | 4 ++-- zstash/globus.py | 2 +- zstash/hpss.py | 2 +- zstash/hpss_utils.py | 6 +++--- zstash/update.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/zstash/create.py b/zstash/create.py index 1c956a72..872b91e4 100644 --- a/zstash/create.py +++ b/zstash/create.py @@ -99,9 +99,9 @@ def create(): hpss, get_db_filename(cache), cache, + transfer_manager, keep=args.keep, is_index=True, - transfer_manager=transfer_manager, ) logger.debug(f"{ts_utc()}: calling 
globus_finalize()") @@ -292,9 +292,9 @@ def create_database( args.keep, args.follow_symlinks, dev_options, + transfer_manager, skip_tars_table=args.no_tars_md5, non_blocking=args.non_blocking, - transfer_manager=transfer_manager, ) # Close database diff --git a/zstash/globus.py b/zstash/globus.py index dbf46f6e..b99a6392 100644 --- a/zstash/globus.py +++ b/zstash/globus.py @@ -207,7 +207,7 @@ def globus_transfer( # noqa: C901 # Note: any status we manually set # (I.e., "UNKNOWN", "SUBMITTED", "EXHAUSTED_TIMEOUT_RETRIES") is NOT possible here, # because we're using `task["status"]` from the globus_sdk TransferClient. - logger.error( + logger.warning( f"{ts_utc()}: Previous task_id {mrb.task_id} status = {mrb.task_status}." ) diff --git a/zstash/hpss.py b/zstash/hpss.py index fac368f8..2c59145a 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -150,10 +150,10 @@ def hpss_put( hpss: str, file_path: str, cache: str, + transfer_manager: TransferManager, keep: bool = True, non_blocking: bool = False, is_index=False, - transfer_manager: Optional[TransferManager] = None, ): """ Put a file to the HPSS archive. 
diff --git a/zstash/hpss_utils.py b/zstash/hpss_utils.py index aa25c6af..0956cd89 100644 --- a/zstash/hpss_utils.py +++ b/zstash/hpss_utils.py @@ -108,7 +108,7 @@ def process_tar( cache: str, keep: bool, non_blocking: bool, - transfer_manager: Optional[TransferManager], + transfer_manager: TransferManager, skip_tars_table: bool, cur: sqlite3.Cursor, con: sqlite3.Connection, @@ -140,10 +140,10 @@ def process_tar( hpss, os.path.join(cache, self.tfname), cache, + transfer_manager, keep, non_blocking, is_index=False, - transfer_manager=transfer_manager, ) logger.info( f"{ts_utc()}: SURFACE (process_tar): Called hpss_put to dispatch archive file {self.tfname}" @@ -344,9 +344,9 @@ def construct_tars( keep: bool, follow_symlinks: bool, dev_options: DevOptions, + transfer_manager: TransferManager, skip_tars_table: bool = False, non_blocking: bool = False, - transfer_manager: Optional[TransferManager] = None, ) -> List[str]: failures: List[str] = [] diff --git a/zstash/update.py b/zstash/update.py index 817da424..69f72e0f 100644 --- a/zstash/update.py +++ b/zstash/update.py @@ -40,9 +40,9 @@ def update(): hpss, get_db_filename(cache), cache, + transfer_manager, keep=args.keep, is_index=True, - transfer_manager=transfer_manager, ) globus_finalize(transfer_manager, args.keep) @@ -290,8 +290,8 @@ def update_database( # noqa: C901 keep, args.follow_symlinks, dev_options, + transfer_manager, non_blocking=args.non_blocking, - transfer_manager=transfer_manager, ) # Close database From 3d8b908c6baa8abc8c9734e17d0ccb16a85dfa7d Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 3 Apr 2026 14:41:07 -0500 Subject: [PATCH 34/34] Preliminary refactor of hpss_transfer() --- zstash/hpss.py | 148 +++++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 61 deletions(-) diff --git a/zstash/hpss.py b/zstash/hpss.py index 2c59145a..19855488 100644 --- a/zstash/hpss.py +++ b/zstash/hpss.py @@ -12,6 +12,85 @@ from .utils import run_command, ts_utc +def 
ensure_transfer_batch(transfer_manager: TransferManager, scheme: str): + # Create a new batch if needed (before we start adding files) + if not transfer_manager.batches or transfer_manager.batches[-1].task_id: + # Either no batches exist, or the last batch was already submitted + new_batch = TransferBatch() + new_batch.is_globus = scheme == "globus" + transfer_manager.batches.append(new_batch) + logger.debug( + f"{ts_utc()}: Created new TransferBatch, total batches: {len(transfer_manager.batches)}" + ) + + +def local_put(file_path: str, cache: str): + if file_path != get_db_filename(cache): + # We are adding a file (that is not the cache) to the local non-HPSS archive + logger.info("put: Keeping tar files locally and removing write permissions") + # https://unix.stackexchange.com/questions/46915/get-the-chmod-numerical-value-for-a-file + display_mode_command: List[str] = "stat --format '%a' {}".format( + file_path + ).split() + display_mode_output: bytes = subprocess.check_output( + display_mode_command + ).strip() + logger.info("{!r} original mode={!r}".format(file_path, display_mode_output)) + # https://www.washington.edu/doit/technology-tips-chmod-overview + # Remove write-permission from user, group, and others, + # without changing read or execute permissions for any. + change_mode_command: List[str] = "chmod ugo-w {}".format(file_path).split() + # An error will be raised if this line fails. 
+ subprocess.check_output(change_mode_command) + new_display_mode_output: bytes = subprocess.check_output( + display_mode_command + ).strip() + logger.info("{!r} new mode={!r}".format(file_path, new_display_mode_output)) + # else: no action needed + + +def run_hpss_put(hpss: str, name: str): + command: str = f'hsi -q "cd {hpss}; put {name}"' + error_str: str = f"Transferring file to HPSS: {name}" + run_command(command, error_str) + + +def run_hpss_get(hpss: str, name: str): + command: str = f'hsi -q "cd {hpss}; get {name}"' + error_str: str = f"Transferring file from HPSS: {name}" + run_command(command, error_str) + + +def globus_transfer_wrapper( + transfer_manager: TransferManager, + endpoint: str, + url_path: str, + name: str, + transfer_type: str, + non_blocking: bool, +): + if not transfer_manager.globus_config: + transfer_manager.globus_config = GlobusConfig() + # Transfer file using the Globus Transfer Service + logger.info(f"{ts_utc()}: DIVING: hpss calls globus_transfer(name={name})") + task_status: TaskStatus = globus_transfer( + transfer_manager, endpoint, url_path, name, transfer_type, non_blocking + ) + logger.info( + f"{ts_utc()}: SURFACE: hpss globus_transfer(name={name}) returned task_status={task_status}" + ) + mrb: Optional[TransferBatch] = transfer_manager.get_most_recent_batch() + if mrb and mrb.task_status: + globus_status: TaskStatus = mrb.task_status + logger.info( + f"{ts_utc()}: Most recent globus_transfer returned task_status={globus_status}" + ) + # NOTE: Here, the status could be "EXHAUSTED_TIMEOUT_RETRIES", meaning a very long transfer + # or perhaps transfer is hanging. We should decide whether to ignore it, or cancel it, but + # we'd need the task_id to issue a cancellation. Perhaps we should have globus_transfer + # return a tuple (task_id, status). 
+ + def hpss_transfer( hpss: str, file_path: str, @@ -28,55 +107,19 @@ def hpss_transfer( url = urlparse(hpss) scheme = url.scheme - # Create a new batch if needed (before we start adding files) - if not transfer_manager.batches or transfer_manager.batches[-1].task_id: - # Either no batches exist, or the last batch was already submitted - new_batch = TransferBatch() - new_batch.is_globus = scheme == "globus" - transfer_manager.batches.append(new_batch) - logger.debug( - f"{ts_utc()}: Created new TransferBatch, total batches: {len(transfer_manager.batches)}" - ) + ensure_transfer_batch(transfer_manager, scheme) if hpss == "none": logger.info("{}: HPSS is unavailable".format(transfer_type)) - if transfer_type == "put" and file_path != get_db_filename(cache): - # We are adding a file (that is not the cache) to the local non-HPSS archive - logger.info( - "{}: Keeping tar files locally and removing write permissions".format( - transfer_type - ) - ) - # https://unix.stackexchange.com/questions/46915/get-the-chmod-numerical-value-for-a-file - display_mode_command: List[str] = "stat --format '%a' {}".format( - file_path - ).split() - display_mode_output: bytes = subprocess.check_output( - display_mode_command - ).strip() - logger.info( - "{!r} original mode={!r}".format(file_path, display_mode_output) - ) - # https://www.washington.edu/doit/technology-tips-chmod-overview - # Remove write-permission from user, group, and others, - # without changing read or execute permissions for any. - change_mode_command: List[str] = "chmod ugo-w {}".format(file_path).split() - # An error will be raised if this line fails. 
- subprocess.check_output(change_mode_command) - new_display_mode_output: bytes = subprocess.check_output( - display_mode_command - ).strip() - logger.info("{!r} new mode={!r}".format(file_path, new_display_mode_output)) + if transfer_type == "put": + local_put(file_path, cache) # else: no action needed else: transfer_word: str - transfer_command: str if transfer_type == "put": transfer_word = "to" - transfer_command = "put" elif transfer_type == "get": transfer_word = "from" - transfer_command = "get" else: raise ValueError("Invalid transfer_type={}".format(transfer_type)) logger.info("Transferring file {} HPSS: {}".format(transfer_word, file_path)) @@ -108,33 +151,16 @@ def hpss_transfer( # For `get`, this directory is where the file we get from HPSS will go. os.chdir(path) - globus_status: TaskStatus = TaskStatus.UNKNOWN if scheme == "globus": - if not transfer_manager.globus_config: - transfer_manager.globus_config = GlobusConfig() - # Transfer file using the Globus Transfer Service - logger.info(f"{ts_utc()}: DIVING: hpss calls globus_transfer(name={name})") - task_status: TaskStatus = globus_transfer( + globus_transfer_wrapper( transfer_manager, endpoint, url_path, name, transfer_type, non_blocking ) - logger.info( - f"{ts_utc()}: SURFACE: hpss globus_transfer(name={name}) returned task_status={task_status}" - ) - mrb: Optional[TransferBatch] = transfer_manager.get_most_recent_batch() - if mrb and mrb.task_status: - globus_status = mrb.task_status - logger.info( - f"{ts_utc()}: Most recent globus_transfer returned task_status={globus_status}" - ) - # NOTE: Here, the status could be "EXHAUSTED_TIMEOUT_RETRIES", meaning a very long transfer - # or perhaps transfer is hanging. We should decide whether to ignore it, or cancel it, but - # we'd need the task_id to issue a cancellation. Perhaps we should have globus_transfer - # return a tuple (task_id, status). 
else: # Transfer file using `hsi` - command: str = 'hsi -q "cd {}; {} {}"'.format(hpss, transfer_command, name) - error_str: str = "Transferring file {} HPSS: {}".format(transfer_word, name) - run_command(command, error_str) + if transfer_type == "put": + run_hpss_put(hpss, name) + else: + run_hpss_get(hpss, name) # Return to original working directory if path != "":