#!/bin/bash
###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################

# examples
# bash ./examples/scripts/prepare_dataset.sh ./data_path DeepSeekV2Tokenizer deepseek-ai/DeepSeek-V2

export DATA_PATH=$1
# Note: models that share a tokenizer use the same tokenizer model and the same
# `tokenizer_type`, and therefore produce the same tokenized data path.
# For example, `deepseek-ai/DeepSeek-V2` and `deepseek-ai/DeepSeek-V2-Lite`
# both use `DeepSeekV2Tokenizer`, so their preprocessed data ends up in the
# same location. If you have already preprocessed the data with the same
# tokenizer model, there is no need to run this script again.
#
# tokenizer_type, tokenizer_model
# DeepSeekV2Tokenizer, deepseek-ai/DeepSeek-V2
# DeepSeekV2Tokenizer, deepseek-ai/DeepSeek-V2-Lite
# DeepSeekV3Tokenizer, deepseek-ai/DeepSeek-V3
# DeepSeekV3Tokenizer, deepseek-ai/DeepSeek-V3-base
#
# available tokenizer types: Primus/primus/backends/megatron/training/tokenizer/tokenizer.py@build_tokenizer
# available tokenizer models: https://huggingface.co
export TOKENIZER_TYPE=$2  # e.g. DeepSeekV2Tokenizer
export TOKENIZER_MODEL=$3 # e.g. deepseek-ai/DeepSeek-V2-Lite
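
# Sanity check: all three positional arguments are required.
if [[ -z "${DATA_PATH}" || -z "${TOKENIZER_TYPE}" || -z "${TOKENIZER_MODEL}" ]]; then
    echo "Usage: bash ./examples/scripts/prepare_dataset.sh <data_path> <tokenizer_type> <tokenizer_model>"
    exit 1
fi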

# framework path
PRIMUS_PATH=$(realpath "$(dirname "$0")/../..")
export PRIMUS_PATH
# MEGATRON_PATH can be overridden from the environment; by default a sibling
# Megatron-LM checkout next to Primus is assumed.
export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
export PYTHONPATH=${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
[[ ! -d "${MEGATRON_PATH}" ]] && {
    echo "Error: MEGATRON_PATH (${MEGATRON_PATH}) does not exist"
    exit 1
}
# Default the Hugging Face cache into DATA_PATH so the echo below reports the
# effective value.
export HF_HOME=${HF_HOME:-"${DATA_PATH}"/data/huggingface}
echo "HF_HOME: $HF_HOME"
echo "PRIMUS_PATH: $PRIMUS_PATH"
echo "MEGATRON_PATH: $MEGATRON_PATH"

# bookcorpus dataset
export DATASET=bookcorpus
DATASET_PATH="${DATA_PATH}/${DATASET}"
OUTPUT_PATH="${DATASET_PATH}/${TOKENIZER_TYPE}"
mkdir -p "$OUTPUT_PATH"

export TOKENIZED_DATA_PATH=${TOKENIZED_DATA_PATH:-"${OUTPUT_PATH}"/bookcorpus_text_sentence}
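# Megatron-style indexed datasets are stored as a .bin/.idx pair; if both
# already exist for this tokenizer, the preprocessing step can be skipped.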
if [[ -f "${TOKENIZED_DATA_PATH}.bin" && -f "${TOKENIZED_DATA_PATH}.idx" ]]; then
    echo "Tokenized data files ${TOKENIZED_DATA_PATH}.bin and ${TOKENIZED_DATA_PATH}.idx exist, skip data preprocess"
    exit 0
fi

START_TIME=$(date +%s)
# Download the raw bookcorpus JSON only if it is not already cached locally.
if [[ -f "${DATASET_PATH}/bookcorpus_megatron.json" ]]; then
    echo "Found the '${DATASET}' dataset: '${DATASET_PATH}/bookcorpus_megatron.json', skip download."
else
    echo "Downloading '${DATASET}' dataset to '${DATASET_PATH}'..."
    python3 "${PRIMUS_PATH}"/examples/scripts/prepare_bookcorpus_megatron_dataset.py --out-dir "${DATASET_PATH}"
fi

END_TIME=$(date +%s)
ELAPSED_TIME=$((END_TIME - START_TIME))
echo "Download '${DATASET}' completed. Time: '${ELAPSED_TIME}' s"

START_TIME=$(date +%s)
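# Tokenize the JSON into a Megatron indexed dataset. With --split-sentences the
# outputs land at ${OUTPUT_PATH}/bookcorpus_text_sentence.{bin,idx}, which is
# what TOKENIZED_DATA_PATH above points to; --partitions 2 splits the input so
# it can be processed in parallel before being merged.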
python "${PRIMUS_PATH}"/examples/scripts/preprocess_data.py \
    --input "${DATASET_PATH}"/bookcorpus_megatron.json \
    --extra-tokenizer-type "${TOKENIZER_TYPE}" \
    --tokenizer-model "${TOKENIZER_MODEL}" \
    --output-prefix "${OUTPUT_PATH}"/bookcorpus \
    --workers "$(nproc)" --split-sentences --partitions 2

END_TIME=$(date +%s)
ELAPSED_TIME=$((END_TIME - START_TIME))
echo "Dataset '${DATASET}' preprocess completed. Time: '${ELAPSED_TIME}' s"