Skip to content

Commit 4cf7236

Browse files
fix: always produce adapter.tar.gz placeholder so transfer doesn't hold the job
1 parent bfa2308 commit 4cf7236

1 file changed

Lines changed: 17 additions & 8 deletions

File tree

chtc/train_sft.sh

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -59,18 +59,27 @@ if [ ! -f "${DATASET_PATH}" ]; then
5959
fi
6060
echo "==> dataset: ${DATASET_PATH} ($(wc -l < "${DATASET_PATH}") lines)"
6161

62+
# Placeholder tarball so HTCondor's transfer_output_files never fails
63+
# (even if training crashes below). Overwritten later if the output dir exists.
64+
tar -czf "${INITIAL_PWD}/adapter.tar.gz" --files-from /dev/null
65+
66+
# Run training WITHOUT set -e so a crash still reaches the tar step.
67+
set +e
6268
echo "==> python scripts/train_sft.py --config ${CONFIG}"
6369
python scripts/train_sft.py --config "${CONFIG}"
70+
TRAIN_EXIT=$?
71+
set -e
72+
73+
if [ "${TRAIN_EXIT}" -ne 0 ]; then
74+
echo "ERROR: training exited ${TRAIN_EXIT}. Placeholder tarball will be transferred."
75+
fi
6476

65-
# Tar the LoRA adapter + tokenizer for transfer back. HTCondor's
66-
# transfer_output_files expects the file at the job cwd (initial_pwd).
67-
OUTPUT_DIR=$(python -c "import yaml; c = yaml.safe_load(open('${CONFIG}')); print(c['output_dir'])")
68-
if [ -d "${OUTPUT_DIR}" ]; then
77+
# Tar the LoRA adapter + tokenizer. Overwrites the placeholder whenever output_dir exists (even after a failed run).
78+
OUTPUT_DIR=$(python -c "import yaml; c = yaml.safe_load(open('${CONFIG}')); print(c['output_dir'])" 2>/dev/null || echo "")
79+
if [ -n "${OUTPUT_DIR}" ] && [ -d "${OUTPUT_DIR}" ]; then
6980
tar -czf "${INITIAL_PWD}/adapter.tar.gz" -C "$(dirname "${OUTPUT_DIR}")" "$(basename "${OUTPUT_DIR}")"
7081
echo "==> wrote adapter tarball: ${INITIAL_PWD}/adapter.tar.gz"
7182
ls -lh "${INITIAL_PWD}/adapter.tar.gz"
72-
else
73-
echo "WARNING: output_dir ${OUTPUT_DIR} not found — training may have failed silently"
74-
# Create an empty tarball so HTCondor's transfer_output_files doesn't error.
75-
tar -czf "${INITIAL_PWD}/adapter.tar.gz" --files-from /dev/null
7683
fi
84+
85+
exit "${TRAIN_EXIT}"

0 commit comments

Comments
 (0)