@@ -59,18 +59,27 @@ if [ ! -f "${DATASET_PATH}" ]; then
 fi
 echo "==> dataset: ${DATASET_PATH} ($(wc -l < "${DATASET_PATH}") lines)"
 
+# Placeholder tarball so HTCondor's transfer_output_files never fails
+# (even if training crashes below). Overwritten on success.
+tar -czf "${INITIAL_PWD}/adapter.tar.gz" --files-from /dev/null
+
+# Run training WITHOUT set -e so a crash still reaches the tar step.
+set +e
 echo "==> python scripts/train_sft.py --config ${CONFIG}"
 python scripts/train_sft.py --config "${CONFIG}"
+TRAIN_EXIT=$?
+set -e
+
+if [ "${TRAIN_EXIT}" -ne 0 ]; then
+    echo "ERROR: training exited ${TRAIN_EXIT}. Placeholder tarball will be transferred."
+fi
 
-# Tar the LoRA adapter + tokenizer for transfer back. HTCondor's
-# transfer_output_files expects the file at the job cwd (initial_pwd).
-OUTPUT_DIR=$(python -c "import yaml; c = yaml.safe_load(open('${CONFIG}')); print(c['output_dir'])")
-if [ -d "${OUTPUT_DIR}" ]; then
+# Tar the LoRA adapter + tokenizer. Overwrites the placeholder on success.
+OUTPUT_DIR=$(python -c "import yaml; c = yaml.safe_load(open('${CONFIG}')); print(c['output_dir'])" 2>/dev/null || echo "")
+if [ -n "${OUTPUT_DIR}" ] && [ -d "${OUTPUT_DIR}" ]; then
     tar -czf "${INITIAL_PWD}/adapter.tar.gz" -C "$(dirname "${OUTPUT_DIR}")" "$(basename "${OUTPUT_DIR}")"
     echo "==> wrote adapter tarball: ${INITIAL_PWD}/adapter.tar.gz"
     ls -lh "${INITIAL_PWD}/adapter.tar.gz"
-else
-    echo "WARNING: output_dir ${OUTPUT_DIR} not found — training may have failed silently"
-    # Create an empty tarball so HTCondor's transfer_output_files doesn't error.
-    tar -czf "${INITIAL_PWD}/adapter.tar.gz" --files-from /dev/null
 fi
+
+exit "${TRAIN_EXIT}"
0 commit comments