Commit fd63893

data preprocess
1 parent 33edbc2 commit fd63893

File tree: 8 files changed, +577 −12 lines


examples/megatron/exp_pretrain.yaml

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ modules:
     overlap_param_gather: true

     # data
-    train_data_path: ${DATA_PATH:/home/azureuser/tas-public/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document}
+    train_data_path: ${TOKENIZED_DATA_PATH:null}
     valid_data_path: null
     test_data_path: null
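With this change the training data location is no longer hard-coded: train_data_path is resolved from the TOKENIZED_DATA_PATH environment variable, falling back to null. Assuming a dataset has already been tokenized, a run can point at it like so (the path below is illustrative, following the layout used by run_pretrain.sh):

export TOKENIZED_DATA_PATH=/apps/tas/0_public/data/bookcorpus/DeepSeekV2Tokenizer/bookcorpus_text_sentence
bash ./examples/megatron/run_pretrain.sh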

examples/megatron/run_pretrain.sh

Lines changed: 32 additions & 11 deletions
@@ -18,13 +18,31 @@ export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
     exit 1
 }

-# data
-mkdir -p "${PRIMUS_PATH}"/data/deepseek-datasets
-export HF_HOME="${PRIMUS_PATH}"/data/huggingface
-export DATA_PATH="${PRIMUS_PATH}"/data/deepseek-datasets/mmap_deepseekv2_datasets_text_document
-if [[ ! -f "${DATA_PATH}.bin" || ! -f "${DATA_PATH}.idx" ]]; then
-    echo "Error: Missing required deepseek files. \
-        Please follow the README.md and download ${DATA_PATH}.bin and ${DATA_PATH}.idx."
+# model config
+export MODEL_CONFIG_FILE=$PRIMUS_PATH/primus/configs/models/megatron/${MODEL_CONFIG}.yaml
+if [[ ! -f "${MODEL_CONFIG_FILE}" ]]; then
+    echo "Error: Missing model config file: $MODEL_CONFIG_FILE."
+    exit 1
+fi
+EXTRA_TOKENIZER_TYPE=$(grep "^extra_tokenizer_type:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}')
+TOKENIZER_TYPE=$(grep "^tokenizer_type:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}')
+if [ -n "$EXTRA_TOKENIZER_TYPE" ]; then
+    TOKENIZER_TYPE=$EXTRA_TOKENIZER_TYPE
+fi
+export TOKENIZER_TYPE
+TOKENIZER_MODEL=$(grep "^tokenizer_model:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}')
+export TOKENIZER_MODEL
+
+# dataset
+DATASET=bookcorpus
+export DATA_PATH=${DATA_PATH:-"/apps/tas/0_public/data"}
+export HF_HOME=${HF_HOME:-"${DATA_PATH}"/huggingface}
+export TOKENIZED_DATA_PATH=${TOKENIZED_DATA_PATH:-${DATA_PATH}/${DATASET}/${TOKENIZER_TYPE}/bookcorpus_text_sentence}
+if [[ ! -f "${TOKENIZED_DATA_PATH}.bin" || ! -f "${TOKENIZED_DATA_PATH}.idx" ]]; then
+    echo "Error: Missing required tokenized dataset files. \
+        Please prepare the data with command: \
+        bash ./examples/scripts/prepare_dataset.sh ${DATA_PATH} ${TOKENIZER_TYPE} ${TOKENIZER_MODEL}"
+
     exit 1
 fi
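The tokenizer settings are scraped straight out of the model config YAML with grep/awk, so the config is expected to carry top-level `key: value` lines with a ": " separator, and extra_tokenizer_type (when present) overrides tokenizer_type. A minimal sketch of the extraction, with illustrative config values taken from the tokenizer table in prepare_dataset.sh below:

# Suppose $MODEL_CONFIG_FILE contains, among other keys:
#   extra_tokenizer_type: DeepSeekV2Tokenizer
#   tokenizer_model: deepseek-ai/DeepSeek-V2-Lite
grep "^tokenizer_model:" "$MODEL_CONFIG_FILE" | awk -F ': ' '{print $2}'
# prints: deepseek-ai/DeepSeek-V2-Lite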

@@ -79,7 +97,10 @@ if [ "$NODE_RANK" = "0" ]; then
     echo "[NODE-$NODE_RANK] PRIMUS_PATH: $PRIMUS_PATH"
     echo "[NODE-$NODE_RANK] MEGATRON_PATH: $MEGATRON_PATH"
     echo "[NODE-$NODE_RANK] HF_HOME: $HF_HOME"
-    echo "[NODE-$NODE_RANK] DATA_PATH: $DATA_PATH"
+    echo "[NODE-$NODE_RANK] TOKENIZED_DATA_PATH: $TOKENIZED_DATA_PATH"
+    echo "[NODE-$NODE_RANK] MODEL_CONFIG_FILE: $MODEL_CONFIG_FILE"
+    echo "[NODE-$NODE_RANK] TOKENIZER_TYPE: $TOKENIZER_TYPE"
+    echo "[NODE-$NODE_RANK] TOKENIZER_MODEL: $TOKENIZER_MODEL"
     echo "[NODE-$NODE_RANK] RUN_ENV: $RUN_ENV"
     echo ""
 fi
@@ -161,8 +182,8 @@ if [ "$RUN_ENV" = "torchrun" ]; then
     # build helper_cpp of megatron
     pushd "${MEGATRON_PATH}/megatron/core/datasets" && make && popd || exit 1

-    torchrun "${DISTRIBUTED_ARGS[@]}" examples/deepseek/pretrain.py \
-        --exp examples/deepseek/exp_pretrain.yaml \
+    torchrun "${DISTRIBUTED_ARGS[@]}" examples/megatron/pretrain.py \
+        --exp examples/megatron/exp_pretrain.yaml \
         2>&1 | tee $TRAIN_LOG

 elif [ "$RUN_ENV" = "slurm" ]; then
@@ -195,7 +216,7 @@ elif [ "$RUN_ENV" = "slurm" ]; then
         --env NCCL_PROTO=$NCCL_PROTO \
         --env RCCL_MSCCL_ENABLE=$RCCL_MSCCL_ENABLE \
         --env HF_HOME=$HF_HOME \
-        --env DATA_PATH=$DATA_PATH \
+        --env TOKENIZED_DATA_PATH=$TOKENIZED_DATA_PATH \
         --env MODEL_CONFIG=$MODEL_CONFIG \
         --env TE_HIPBLASLT_TUNING_RUN_COUNT=$TE_HIPBLASLT_TUNING_RUN_COUNT \
         --env TE_HIPBLASLT_TUNING_ALGO_COUNT=$TE_HIPBLASLT_TUNING_ALGO_COUNT \
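Taken together, a hypothetical single-node invocation of the updated script looks like this (the config name is illustrative; the script expects a YAML of that name under primus/configs/models/megatron/, and RUN_ENV selects the torchrun or slurm branch):

export MODEL_CONFIG=deepseek_v2_lite  # resolves primus/configs/models/megatron/${MODEL_CONFIG}.yaml
export RUN_ENV=torchrun
bash ./examples/megatron/run_pretrain.sh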

examples/scripts/get_ip_interface.sh

File mode changed: 100644 → 100755 (now executable).
examples/scripts/prepare_bookcorpus_megatron_dataset.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+import argparse
+from pathlib import Path
+
+import nltk
+from datasets import load_dataset
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out-dir", type=str, required=False, default="tmp/data", help="Output directory for the JSON dump")
+    args = parser.parse_args()
+    out_dir = Path(args.out_dir)
+    out_dir.mkdir(exist_ok=True, parents=True)
+    nltk.download("punkt_tab")
+
+    dataset = load_dataset("bookcorpus", split="train", trust_remote_code=True)
+    dataset.to_json(out_dir / "bookcorpus_megatron.json")
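For context (an assumption about library defaults, not part of the diff): Hugging Face's Dataset.to_json writes JSON Lines, one record per row, so the dump should contain one {"text": ...} object per line, which is the shape the Megatron-style preprocessing below consumes. A quick sanity check:

# Inspect the first records of the dump (default --out-dir shown)
head -n 2 tmp/data/bookcorpus_megatron.json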
examples/scripts/prepare_dataset.sh

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+#!/bin/bash
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+# example:
+# bash ./examples/scripts/prepare_dataset.sh ./data_path DeepSeekV2Tokenizer deepseek-ai/DeepSeek-V2
+
+export DATA_PATH=$1
+# Note: tokenizers of the same type share the same tokenizer model. For example,
+# `deepseek-ai/DeepSeek-V2` and `deepseek-ai/DeepSeek-V2-Lite` use the same
+# tokenizer model, so their `tokenizer_type` is the same (`DeepSeekV2Tokenizer`)
+# and their tokenized data path is also the same. If you have already
+# preprocessed the data with the same tokenizer model, you do not need to run
+# this script again.
+#
+# tokenizer_type, tokenizer_model
+# DeepSeekV2Tokenizer, deepseek-ai/DeepSeek-V2
+# DeepSeekV2Tokenizer, deepseek-ai/DeepSeek-V2-Lite
+# DeepSeekV3Tokenizer, deepseek-ai/DeepSeek-V3
+# DeepSeekV3Tokenizer, deepseek-ai/DeepSeek-V3-base
+#
+# available tokenizer types: Primus/primus/backends/megatron/training/tokenizer/tokenizer.py@build_tokenizer
+# available tokenizer models: https://huggingface.co
+export TOKENIZER_TYPE=$2  # e.g. DeepSeekV2Tokenizer
+export TOKENIZER_MODEL=$3 # e.g. deepseek-ai/DeepSeek-V2-Lite
+
+# framework path
+PRIMUS_PATH=$(realpath "$(dirname "$0")/../..")
+export PRIMUS_PATH
+export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
+export PYTHONPATH=${MEGATRON_PATH}:${PRIMUS_PATH}:${PYTHONPATH}
+[[ ! -d "${MEGATRON_PATH}" ]] && {
+    echo "Error: MEGATRON_PATH (${MEGATRON_PATH}) does not exist"
+    exit 1
+}
+export HF_HOME=${HF_HOME:-"${DATA_PATH}"/huggingface}
+echo "HF_HOME: $HF_HOME"
+echo "PRIMUS_PATH: $PRIMUS_PATH"
+echo "MEGATRON_PATH: $MEGATRON_PATH"
+
+# bookcorpus dataset
+export DATASET=bookcorpus
+DATASET_PATH="${DATA_PATH}/${DATASET}"
+OUTPUT_PATH="$DATASET_PATH/${TOKENIZER_TYPE}"
+mkdir -p "$OUTPUT_PATH"
+
+export TOKENIZED_DATA_PATH=${TOKENIZED_DATA_PATH:-"${OUTPUT_PATH}"/bookcorpus_text_sentence}
+if [[ -f "${TOKENIZED_DATA_PATH}.bin" && -f "${TOKENIZED_DATA_PATH}.idx" ]]; then
+    echo "Tokenized data files ${TOKENIZED_DATA_PATH}.bin and ${TOKENIZED_DATA_PATH}.idx exist, skipping data preprocessing"
+    exit 0
+fi
+
+START_TIME=$(date +%s)
+if [[ -f "${DATASET_PATH}/bookcorpus_megatron.json" ]]; then
+    echo "Found the '${DATASET}' dataset at '${DATASET_PATH}/bookcorpus_megatron.json', skipping download."
+else
+    echo "Downloading '${DATASET}' dataset to '${DATASET_PATH}'..."
+    python3 "${PRIMUS_PATH}"/examples/scripts/prepare_bookcorpus_megatron_dataset.py --out-dir "${DATASET_PATH}"
+fi
+
+END_TIME=$(date +%s)
+ELAPSED_TIME=$((END_TIME - START_TIME))
+echo "Download '${DATASET}' completed. Time: ${ELAPSED_TIME} s"
+
+START_TIME=$(date +%s)
+python "${PRIMUS_PATH}"/examples/scripts/preprocess_data.py \
+    --input "${DATASET_PATH}"/bookcorpus_megatron.json \
+    --extra-tokenizer-type "${TOKENIZER_TYPE}" \
+    --tokenizer-model "${TOKENIZER_MODEL}" \
+    --output-prefix "${OUTPUT_PATH}"/bookcorpus \
+    --workers "$(nproc)" --split-sentences --partitions 2
+
+END_TIME=$(date +%s)
+ELAPSED_TIME=$((END_TIME - START_TIME))
+echo "Dataset '${DATASET}' preprocess completed. Time: ${ELAPSED_TIME} s"
