diff --git a/unsloth_zoo/dataset_utils.py b/unsloth_zoo/dataset_utils.py index 39d5825b..1adf2118 100644 --- a/unsloth_zoo/dataset_utils.py +++ b/unsloth_zoo/dataset_utils.py @@ -370,6 +370,7 @@ def standardize_data_formats( aliases_for_system = ["system",], aliases_for_user = ["user", "human", "input",], aliases_for_assistant = ["gpt", "assistant", "output",], + num_proc = None, ): """ Standardizes ShareGPT and other formats to user/assistant Hugging Face format. @@ -456,14 +457,20 @@ def _standardize_dataset(examples): pass from multiprocessing import cpu_count - num_proc = cpu_count() - - return dataset.map( - _standardize_dataset, - batched = True, - desc = "Unsloth: Standardizing formats", - num_proc = num_proc, - ) + if num_proc is None or not isinstance(num_proc, int): + num_proc = cpu_count() + + try: + return dataset.map( + _standardize_dataset, + batched=True, + desc="Unsloth: Standardizing formats", + num_proc=num_proc, + ) + except RuntimeError as e: + raise RuntimeError( + f"Unsloth: Process crashed: {str(e)}\nTry reducing num_proc (currently {num_proc}) to a lower value." + ) from e pass