
Commit c7868c9

Author: maxtext authors (committed)

Merge pull request #1792 from AI-Hypercomputer:jacobplatin/fix-misc-inference-benchmark-issues

PiperOrigin-RevId: 765181154

2 parents (910888a + 2da3f75), commit c7868c9

File tree: 7 files changed, +45 -19 lines


MaxText/inference_mlperf/README.md

Lines changed: 15 additions & 5 deletions

@@ -13,14 +13,24 @@ source .env/bin/activate
 ```
 
 ### Install loadgen
+Note: this is taken from the MLCommons inference [README](https://github.com/mlcommons/inference/blob/master/loadgen/README_BUILD.md#quick-start) (as of May 2025).
 ```
-sudo apt-get install python3-dev
-sudo apt-get install build-essential -y
-git clone git@github.com:mlcommons/inference.git
-cd inference/
-cd loadgen/ && python3 -m pip install .
+pip install absl-py numpy
+git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
+cd mlperf_inference/loadgen
+CFLAGS="-std=c++14 -O3" python -m pip install .
 ```
 
+If you run into an issue like the following:
+
+```
+ImportError: venv/lib/libstdc++.so.6: version `GLIBCXX_3.4.30'
+not found (required by venv/lib/python3.10/site-packages/mlperf_loadgen.cpython-310-x86_64-linux-gnu.so)
+```
+
+Please try running `conda install -c conda-forge gcc_linux-64 gxx_linux-64 libstdcxx-ng` if you are using Conda, or `sudo apt install build-essential` if you are using venv, and then reinstalling `loadgen`.
+
+
 ### Download datasets
 
 ```
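
For convenience, the recovery steps described in the new README text can be run as one sequence; a minimal sketch assuming a Conda environment and the `mlperf_inference` checkout from the install step (`--force-reinstall` is an illustrative way to redo the `loadgen` install, not something the README prescribes):

```
# Workaround sketch for the GLIBCXX_3.4.30 import error (Conda case):
# pull a newer libstdc++ into the environment, then rebuild loadgen against it.
conda install -c conda-forge gcc_linux-64 gxx_linux-64 libstdcxx-ng
cd mlperf_inference/loadgen
CFLAGS="-std=c++14 -O3" python -m pip install --force-reinstall .
```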

MaxText/inference_mlperf/llama_offline_run.sh

Lines changed: 2 additions & 2 deletions

@@ -92,7 +92,7 @@ if [ -z "$MAXENGINE_ARGS" ];
 then
 CHECKPOINT="gs://msingh-bkt/checkpoints/quant_${MODEL_NAME}-chat/mlperf_070924/int8_"
 BASE_CFG="model_name=${MODEL_NAME} tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${CHECKPOINT}"
-QUANT_CFG="quantization=int8 quantize_kvcache=True checkpoint_is_quantized=True"
+QUANT_CFG="quantization=int8 quantize_kvcache=True checkpoint_is_quantized=True skip_jax_distributed_system=true"
 MAXENGINE_ARGS="${BASE_CFG} ${QUANT_CFG}"
 fi

@@ -117,7 +117,7 @@ else
 export DATASET_TYPE=full
 export DATASET_PATH=${DATA_DISK_DIR}/processed-data.pkl
 export TOTAL_SAMPLE_COUNT=24576
-export USER_CONFIG=user.conf
+export USER_CONFIG=user.conf # NOTE: you may need to change this path (e.g. `MaxText/inference_mlperf/user.conf`)
 fi

 # LIBTPU_INIT_ARGS="--xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
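
Since the script only fills in `MAXENGINE_ARGS` when it is unset (the `if [ -z "$MAXENGINE_ARGS" ]` guard above), the new `skip_jax_distributed_system` flag can also be supplied from the caller's environment; a hedged sketch, with an illustrative run name and `TOKENIZER_PATH`/`CHECKPOINT` assumed to be set already:

```
# Sketch: provide engine args externally instead of relying on the script defaults.
export MAXENGINE_ARGS="model_name=llama2-70b tokenizer_path=${TOKENIZER_PATH} \
  load_parameters_path=${CHECKPOINT} quantization=int8 quantize_kvcache=True \
  checkpoint_is_quantized=True skip_jax_distributed_system=true"
bash llama_offline_run.sh -r my_int8_offline_run   # -r run name, as used by the benchmark wrappers
```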

MaxText/inference_mlperf/offline_inference.py

Lines changed: 16 additions & 3 deletions

@@ -40,6 +40,7 @@
 
 DecodeState = Any
 Params = Any
+PRNGKeyType = Any
 
 log = logging.getLogger(__name__)
 

@@ -130,19 +131,20 @@ def process(
 input_true_length: int,
 max_length: int,
 prefill_done: Callable[[List[Tuple[engine_api.ResultTokens, int]], List[int], DecodeState], None],
+rng: PRNGKeyType,
 ) -> None:
 """Prefill helper process runner"""
 padded_length = len(input_tokens_padded)
 if self._type == "default":
 first_token, decode_state = self._processor.process(
-model_params, decode_state, decode_slot, input_tokens_padded, input_true_length
+model_params, decode_state, decode_slot, input_tokens_padded, input_true_length, rng
 )
 prefill_done([(first_token, decode_slot)], [input_id], decode_state)
 elif self._type == "batch":
 if padded_length == max_length:
 # fallback to default mode
 first_token, decode_state = self._processor.process(
-model_params, decode_state, decode_slot, input_tokens_padded, input_true_length
+model_params, decode_state, decode_slot, input_tokens_padded, input_true_length, rng
 )
 prefill_done([(first_token, decode_slot)], [input_id], decode_state)
 else:

@@ -249,6 +251,9 @@ def batch_inference_with_callback(
 counter = EventCounter(input=0, prefill=0, decode=0, detokenize=0)
 dummy_length = 1
 
+rng = jax.random.PRNGKey(1234)
+rng, _ = jax.random.split(rng)
+
 def prefill_done(prefill_result, ids, decode_state):
 nonlocal self
 nonlocal counter

@@ -345,7 +350,15 @@ def detokenize():
 
 # Do prefill when there are free slots
 self.prefill.process(
-self.params, self.decode_state, slot, row.id, row.tokens, row.true_length, self.max_prefill_length, prefill_done
+self.params,
+self.decode_state,
+slot,
+row.id,
+row.tokens,
+row.true_length,
+self.max_prefill_length,
+prefill_done,
+rng,
 )
 self.prefill.finalize(self.params, self.decode_state, prefill_done)
 
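
For context on the new `rng` plumbing: `jax.random` has no global RNG state, so a key has to be created explicitly and passed to any function that samples from it, which is why `batch_inference_with_callback` now builds a key and threads it down to the prefill helper's `process()`. A minimal standalone sketch of the same pattern (the `prefill` stand-in below is illustrative, not the MaxText engine API):

```
import jax

rng = jax.random.PRNGKey(1234)  # deterministic seed, as in the change above
rng, _ = jax.random.split(rng)  # split once; keep a fresh key to thread through calls

def prefill(params, tokens, rng):
  # Stand-in for the real prefill call; only the key plumbing is being shown.
  del params, tokens
  return jax.random.uniform(rng, (1,))

first_token = prefill(None, None, rng)
```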

MaxText/inference_mlperf/trillium/benchmarks_llama2-70b-trillium_2x4.sh

Lines changed: 7 additions & 5 deletions

@@ -1,8 +1,10 @@
 #!/usr/bin/env bash
 
+# NOTE: please check the README located at MaxText/inference_mlperf/README.md for instructions on how
+# to set up the environment before running this script.
 # Run command:
 # bash benchmarks_llama2-70b-trillium_2x4.sh [-b benchmark_type]
-# benchmark_type can be: performance, audit, accuracy, or all (default)
+# benchmark_type can be: performance (default), audit, accuracy, or all
 
 run_name="trillium_llama2-70b"
 dry_run=false

@@ -84,21 +86,21 @@ if [[ -z ${CHECKPOINT} ]] ; then
 fi
 
 if [[ -z ${TOKENIZER_PATH} ]] ; then
-export TOKENIZER_PATH="/home/${USER}/maxtext/assets/tokenizer.llama2"
+export TOKENIZER_PATH="/home/${USER}/maxtext/assets/tokenizer.llama2" # NOTE: you may need to change this path for your VM
 fi
 
 if [ -z "$PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES" ];
 then
 PREFILL_LEN="1024"
-BATCH_SIZE_PER_DEVICE="64"
+BATCH_SIZE_PER_DEVICE="64"
 export PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES="${PREFILL_LEN},${BATCH_SIZE_PER_DEVICE}"
 fi
 
 
 BASE_CFG="model_name=llama2-70b tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${CHECKPOINT}"
 QUANT_CFG="quantization=${QUANTIZATION} quant_cfg_path=${QUANT_PATH} checkpoint_is_quantized=True"
 KV_QUANT_CFG="quantize_kvcache=True kv_quant_dtype=${KV_QUANT_DTYPE}"
-export MAXENGINE_ARGS="${BASE_CFG} ${QUANT_CFG} ${KV_QUANT_CFG} optimize_mesh_for_tpu_v6e=True"
+export MAXENGINE_ARGS="${BASE_CFG} ${QUANT_CFG} ${KV_QUANT_CFG} optimize_mesh_for_tpu_v6e=True skip_jax_distributed_system=True"
 echo
 echo $MAXENGINE_ARGS
 echo

@@ -117,7 +119,7 @@ run_benchmark() {
 ;;
 "accuracy")
 export HF_CKPT="meta-llama/Llama-2-70b-chat-hf"
-$cmd bash llama_offline_run.sh ${RUN_OPTIONS} -r benchmarks_accuracy_${RUN_DESC} -a
+$cmd bash llama_offline_run.sh ${RUN_OPTIONS} -r benchmarks_accuracy_${RUN_DESC} -a
 ;;
 esac
 }
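
For reference, the invocation documented at the top of the script, with the now-default benchmark type spelled out (the `-b` values come from the updated header comment):

```
# Runs the performance benchmark (the default when -b is omitted).
bash benchmarks_llama2-70b-trillium_2x4.sh -b performance
# Other options per the header comment: -b audit, -b accuracy, -b all
```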

MaxText/inference_mlperf/trillium/microbenchmarks_llama2-70b-trillium_2x4.sh

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 # Run command:
 # bash microbenchmarks_llama2-70b-trillium_2x4.sh
-# Look at profiles:
-# tensorboard --logdir /tmp/mb/profiles/trillium_llama2_70b/tensorboard/prefill_insert_1024
+# Look at profiles:
+# tensorboard --logdir /tmp/mb/profiles/trillium_llama2_70b/tensorboard/prefill_insert_1024
 
 
 run_name="trillium_llama2-70b"

MaxText/prefill_packing.py

Lines changed: 2 additions & 1 deletion

@@ -114,11 +114,12 @@ def process(
 decode_slot: int,
 input_tokens_padded: jax.Array,
 input_true_length: int,
+rng: PRNGKeyType,
 ) -> Tuple[engine_api.ResultTokens, DecodeState]:
 """Process a new input."""
 
 process_fn = self._process_compiled(model_params, len(input_tokens_padded))
-return process_fn(model_params, input_tokens_padded, decode_slot, input_true_length, decode_state)
+return process_fn(model_params, input_tokens_padded, decode_slot, input_true_length, decode_state, rng)
 
 def _process_compiled(self, params: Params, padded_length: int):
 """Ahead-of-time compilation wrapper of _process()."""

requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ pyink
 pre-commit
 pytype
 pillow>=11.1.0
-sentencepiece==0.1.97
+sentencepiece==0.2.0
 tensorflow-text>=2.13.0
 tensorflow>=2.13.0
 tensorflow-datasets
