Skip to content
Open
25 changes: 21 additions & 4 deletions WorkbenchConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,26 @@ def get_config(self):
"csv_id_to_node_id_map_path"
]
else:
config["csv_id_to_node_id_map_path"] = os.path.join(
config["csv_id_to_node_id_map_dir"],
config["csv_id_to_node_id_map_filename"],
)
# Check if we're in recovery mode with a specific session suffix
if (config["recovery_mode_starting_from_node_id"] is not False and
config["recovery_mode_session_suffix"] is not False):
# Use the provided recovery mode session suffix
base_filename, ext = os.path.splitext(config["csv_id_to_node_id_map_filename"])
unique_filename = f"{base_filename}.{config['recovery_mode_session_suffix']}{ext}"
config["csv_id_to_node_id_map_path"] = os.path.join(
config["csv_id_to_node_id_map_dir"],
unique_filename,
)
else:
# Add unique identifier to prevent conflicts between multiple workbench instances
from workbench_utils import get_config_file_identifier_shortened
config_file_id = get_config_file_identifier_shortened(config)
base_filename, ext = os.path.splitext(config["csv_id_to_node_id_map_filename"])
unique_filename = f"{base_filename}.{config_file_id}{ext}"
config["csv_id_to_node_id_map_path"] = os.path.join(
config["csv_id_to_node_id_map_dir"],
unique_filename,
)

if "path_to_python" in user_mods:
config["path_to_python"] = user_mods["path_to_python"]
Expand Down Expand Up @@ -378,6 +394,7 @@ def get_default_config(self):
"include_password_in_rollback_config_file": False,
"remove_password_from_config_file": False,
"recovery_mode_starting_from_node_id": False,
"recovery_mode_session_suffix": False,
"viewer_override_fieldname": "field_viewer_override",
"check_for_workbench_updates": True,
}
Expand Down
132 changes: 86 additions & 46 deletions workbench_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import mimetypes
import collections
import urllib.parse
import tempfile
from pathlib import Path
from ruamel.yaml import YAML, YAMLError
from unidecode import unidecode
Expand All @@ -47,6 +48,8 @@
# Module-level term caches, kept so the same terms are not queried from Drupal repeatedly.
checked_terms = []
newly_created_terms = []
# Session-specific identifier used in temp file names; None until it is first generated.
_session_temp_identifier = None
# These are the Drupal field names on the standard types of media.
file_fields = [
"field_media_file",
Expand Down Expand Up @@ -6293,9 +6296,11 @@ def remove_media_and_file(config, media_id):


def get_preprocessed_input_csv_file_path(config):
    """Gets the path to the preprocessed copy of the input CSV file.

    The file is placed in config["temp_dir"] and named
    "<input CSV basename>.<identifier>.preprocessed", where the identifier is the
    value returned by get_config_file_identifier_shortened(). Including the
    identifier prevents conflicts between multiple workbench instances.

    Parameters
    ----------
    config : dict
        The configuration settings defined by workbench_config.get_config().
    Returns
    -------
    string
        The path to the preprocessed input CSV file.
    """
    config_file_id = get_config_file_identifier_shortened(config)
    input_csv_basename = os.path.basename(config["input_csv"])
    # Append ".preprocessed" exactly once, after the identifier. Appending it both
    # before and after the identifier produced "<csv>.preprocessed.<id>.preprocessed".
    return (
        os.path.join(config["temp_dir"], input_csv_basename)
        + "."
        + config_file_id
        + ".preprocessed"
    )


Expand Down Expand Up @@ -9828,11 +9833,9 @@ def get_rollback_csv_filepath(config):
else:
rollback_csv_filename = f"{rollback_csv_filename_basename}.csv"

if os.environ.get("ISLANDORA_WORKBENCH_SECONDARY_TASKS") is not None:
secondary_tasks = json.loads(os.environ["ISLANDORA_WORKBENCH_SECONDARY_TASKS"])
if os.path.abspath(config["current_config_file_path"]) in secondary_tasks:
config_file_id = get_config_file_identifier(config)
rollback_csv_filename = rollback_csv_filename + "." + config_file_id
# Always use unique naming to prevent conflicts between multiple workbench instances
config_file_id = get_config_file_identifier_shortened(config)
rollback_csv_filename = rollback_csv_filename + "." + config_file_id

if "rollback_csv_file_path" in config and len(config["rollback_csv_file_path"]) > 0:
if config["timestamp_rollback"] is True:
Expand Down Expand Up @@ -9913,21 +9916,27 @@ def get_rollback_config_filepath(config):
now_string = EXECUTION_START_TIME.strftime("%Y_%m_%d_%H_%M_%S")

if config["timestamp_rollback"] is True:
# Always use unique naming to prevent conflicts between multiple workbench instances
config_file_id = get_config_file_identifier_shortened(config)
rollback_config_filepath = os.path.join(
f"{rb_config_file_dir}",
f"{rollback_config_filename_basename}.{now_string}.yml",
f"{rollback_config_filename_basename}.{now_string}.{config_file_id}.yml",
)
elif (
config["recovery_mode_starting_from_node_id"] is not False
and value_is_numeric(config["recovery_mode_starting_from_node_id"]) is True
):
# Always use unique naming to prevent conflicts between multiple workbench instances
config_file_id = get_config_file_identifier_shortened(config)
rollback_config_filepath = os.path.join(
f"{rb_config_file_dir}",
f"{rollback_config_filename_basename}.{now_string}.recovery_mode.yml",
f"{rollback_config_filename_basename}.{now_string}.{config_file_id}.recovery_mode.yml",
)
else:
# Always use unique naming to prevent conflicts between multiple workbench instances
config_file_id = get_config_file_identifier_shortened(config)
rollback_config_filepath = os.path.join(
f"{rb_config_file_dir}", f"{rollback_config_filename_basename}.yml"
f"{rb_config_file_dir}", f"{rollback_config_filename_basename}.{config_file_id}.yml"
)

if (
Expand All @@ -9947,9 +9956,19 @@ def get_rollback_config_filepath(config):
)
return os.path.abspath(rollback_config_file_path)
else:
rollback_config_filepath = os.path.abspath(
# Always use unique naming to prevent conflicts between multiple workbench instances
config_file_id = get_config_file_identifier_shortened(config)
rollback_config_file_path_head, rollback_config_file_path_tail = os.path.split(
config["rollback_config_file_path"]
)
rollback_config_file_basename, rollback_config_file_ext = os.path.splitext(
rollback_config_file_path_tail
)
rollback_config_filepath = os.path.join(
rollback_config_file_path_head,
f"{rollback_config_file_basename}.{config_file_id}{rollback_config_file_ext}",
)
rollback_config_filepath = os.path.abspath(rollback_config_filepath)

return rollback_config_filepath

Expand Down Expand Up @@ -10113,23 +10132,18 @@ def get_csv_from_google_sheet(config):
logging.error(message)
sys.exit("Error: " + message)

if os.environ.get("ISLANDORA_WORKBENCH_SECONDARY_TASKS") is not None:
secondary_tasks = json.loads(os.environ["ISLANDORA_WORKBENCH_SECONDARY_TASKS"])
config_file_id = get_config_file_identifier(config)
if os.path.abspath(config["current_config_file_path"]) in secondary_tasks:
config_file_id = get_config_file_identifier(config)
exported_csv_path = os.path.join(
config["temp_dir"],
config["google_sheets_csv_filename"] + "." + config_file_id,
)
else:
exported_csv_path = os.path.join(
config["temp_dir"], config["google_sheets_csv_filename"]
)
# Always use unique naming to prevent conflicts between multiple workbench instances
# In recovery mode with session suffix, use that suffix; otherwise generate new unique ID
if (config["recovery_mode_starting_from_node_id"] is not False and
config["recovery_mode_session_suffix"] is not False):
config_file_id = config["recovery_mode_session_suffix"]
else:
exported_csv_path = os.path.join(
config["temp_dir"], config["google_sheets_csv_filename"]
)
config_file_id = get_config_file_identifier_shortened(config)

exported_csv_path = os.path.join(
config["temp_dir"],
config["google_sheets_csv_filename"] + "." + config_file_id,
)

open(exported_csv_path, "wb+").write(response.content)

Expand Down Expand Up @@ -10164,22 +10178,17 @@ def get_csv_from_excel(config):
record[headers[x]] = row[x].value
records.append(record)

if os.environ.get("ISLANDORA_WORKBENCH_SECONDARY_TASKS") is not None:
secondary_tasks = json.loads(os.environ["ISLANDORA_WORKBENCH_SECONDARY_TASKS"])
config_file_id = get_config_file_identifier(config)
if os.path.abspath(config["current_config_file_path"]) in secondary_tasks:
config_file_id = get_config_file_identifier(config)
exported_csv_path = os.path.join(
config["temp_dir"], config["excel_csv_filename"] + "." + config_file_id
)
else:
exported_csv_path = os.path.join(
config["temp_dir"], config["excel_csv_filename"]
)
# Always use unique naming to prevent conflicts between multiple workbench instances
# In recovery mode with session suffix, use that suffix; otherwise generate new unique ID
if (config["recovery_mode_starting_from_node_id"] is not False and
config["recovery_mode_session_suffix"] is not False):
config_file_id = config["recovery_mode_session_suffix"]
else:
exported_csv_path = os.path.join(
config["temp_dir"], config["excel_csv_filename"]
)
config_file_id = get_config_file_identifier_shortened(config)

exported_csv_path = os.path.join(
config["temp_dir"], config["excel_csv_filename"] + "." + config_file_id
)

csv_writer_file_handle = open(exported_csv_path, "w+", newline="", encoding="utf-8")
csv_writer = csv.DictWriter(csv_writer_file_handle, fieldnames=headers)
Expand Down Expand Up @@ -10213,11 +10222,15 @@ def get_extracted_csv_file_path(config):
else:
return False

if os.environ.get("ISLANDORA_WORKBENCH_SECONDARY_TASKS") is not None:
secondary_tasks = json.loads(os.environ["ISLANDORA_WORKBENCH_SECONDARY_TASKS"])
if os.path.abspath(config["current_config_file_path"]) in secondary_tasks:
config_file_id = get_config_file_identifier(config)
exported_csv_filename = exported_csv_filename + "." + config_file_id
# Always use unique naming to prevent conflicts between multiple workbench instances
# In recovery mode with session suffix, use that suffix; otherwise generate new unique ID
if (config["recovery_mode_starting_from_node_id"] is not False and
config["recovery_mode_session_suffix"] is not False):
config_file_id = config["recovery_mode_session_suffix"]
else:
config_file_id = get_config_file_identifier_shortened(config)

exported_csv_filename = exported_csv_filename + "." + config_file_id

return os.path.join(config["temp_dir"], exported_csv_filename)

Expand Down Expand Up @@ -11582,6 +11595,33 @@ def get_config_file_identifier(config):

return config_file_id

def get_config_file_identifier_shortened(config):
    """Gets a short, session-specific identifier. Used in names of temp files, etc.

    The identifier has the form
    "pid<process ID>_<8 random hex characters>_<config file name without path or extension>".
    It is generated on the first call and cached in the module-level
    _session_temp_identifier, so every later call in the same process returns
    the same string regardless of the config passed in.

    Parameters
    ----------
    config : dict
        The configuration settings defined by workbench_config.get_config().
        Only config["current_config_file_path"] is read, and only on the
        first call.
    Returns
    -------
    string
        The session-specific identifier.
    """
    global _session_temp_identifier

    if _session_temp_identifier is None:
        # Imported here so this function does not depend on a file-level import.
        import uuid

        # Extract just the filename without path and extension.
        config_file_name = os.path.basename(config["current_config_file_path"])
        config_file_id = os.path.splitext(config_file_name)[0]

        # Use a random token rather than creating and deleting a real temporary file
        # just to borrow its name: no filesystem I/O, and no failure on platforms
        # that refuse to unlink a file that is still open.
        random_token = uuid.uuid4().hex[:8]
        _session_temp_identifier = f"pid{os.getpid()}_{random_token}_{config_file_id}"

    return _session_temp_identifier


def calculate_response_time_trend(config, response_time):
"""Gets the average response time from the most recent 20 HTTP requests."""
Expand Down