amd
diff --git a/‎script/README.md‎
Lines changed: 1 addition & 1 deletion b/‎script/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/README.md‎
Lines changed: 100 additions & 0 deletions b/‎script/app-mlperf-inference-mlcommons-cpp/README.md‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/customize.py‎
Lines changed: 57 additions & 5 deletions b/‎script/app-mlperf-inference-mlcommons-cpp/customize.py‎
Lines changed: 57 additions & 5 deletions
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/inc/backend.h‎
Lines changed: 9 additions & 8 deletions b/‎script/app-mlperf-inference-mlcommons-cpp/inc/backend.h‎
Lines changed: 9 additions & 8 deletions
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/inc/device.h‎
Lines changed: 2 additions & 2 deletions b/‎script/app-mlperf-inference-mlcommons-cpp/inc/device.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/inc/gpu_device.h‎
Lines changed: 1 addition & 1 deletion b/‎script/app-mlperf-inference-mlcommons-cpp/inc/gpu_device.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/inc/model.h‎
Lines changed: 30 additions & 0 deletions b/‎script/app-mlperf-inference-mlcommons-cpp/inc/model.h‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎script/app-mlperf-inference-mlcommons-cpp/inc/onnxruntime_backend.h‎
Lines changed: 3 additions & 3 deletions b/‎script/app-mlperf-inference-mlcommons-cpp/inc/onnxruntime_backend.h‎
Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 # MLCommons Automation Scripts
 
-*Last updated: 2026-04-20 21:26:30*
+*Last updated: 2026-04-21 04:12:49*
 
 This directory contains automation scripts for MLPerf benchmarks, AI/ML workflows, and development operations.
 
 
@@ -0,0 +1,100 @@
+# README for app-mlperf-inference-mlcommons-cpp
+This README is automatically generated. Create and add custom content in info.md. Please follow the [script execution document](https://docs.mlcommons.org/mlcflow/targets/script/execution-flow/) to understand more about the MLC script execution.
+
+`mlcflow` stores all local data under `$HOME/MLC` by default. So, if there is a space constraint on the home directory and you have more space elsewhere, say on `/mnt/$USER`, you can do
+```
+mkdir /mnt/$USER/MLC
+ln -s /mnt/$USER/MLC $HOME/MLC
+```
+You can also use the environment variable `MLC_REPOS` to control this location, but it will need to be set again after every system reboot.
+
+## Setup
+
+If you do not already have a Python development environment set up, please refer to the [official docs](https://docs.mlcommons.org/mlcflow/install/) for the installation.
+
+```bash
+python3 -m venv mlcflow
+. mlcflow/bin/activate
+pip install mlcflow
+```
+
+- Using a virtual environment is recommended (per `pip` best practices), but you may skip it or use `--break-system-packages` if needed.
+
+### Pull mlperf-automations
+
+Once `mlcflow` is installed:
+
+```bash
+mlc pull repo mlcommons@mlperf-automations --pat=<Your Private Access Token>
+```
+- `--pat` or `--ssh` is only needed if the repo is PRIVATE
+- If `--pat` is omitted, you'll be prompted for a password; you can enter your Private Access Token there
+- `--ssh` option can be used instead of `--pat=<>` option if you prefer to use SSH for accessing the github repository.
+## Run Commands
+
+```bash
+mlcr app,mlcommons,mlperf,inference,cpp
+```
+
+### Script Inputs
+
+| Name | Description | Choices | Default |
+|------|-------------|---------|------|
+| `--count` |  |  | `` |
+| `--max_batchsize` |  |  | `` |
+| `--mlperf_conf` |  |  | `` |
+| `--mode` |  |  | `` |
+| `--output_dir` |  |  | `` |
+| `--performance_sample_count` |  |  | `` |
+| `--scenario` |  |  | `` |
+| `--user_conf` |  |  | `` |
+### Generic Script Inputs
+
+| Name | Description | Choices | Default |
+|------|-------------|---------|------|
+| `--input` | Input to the script passed using the env key `MLC_INPUT` |  | `` |
+| `--output` | Output from the script passed using the env key `MLC_OUTPUT` |  | `` |
+| `--outdirname` | The directory to store the script output |  | `cache directory ($HOME/MLC/repos/local/cache/<>) if the script is cacheable or else the current directory` |
+| `--outbasename` | The output file/folder name |  | `` |
+| `--search_folder_path` | The folder path where executables of a given script need to be searched. Search is done recursively up to 4 levels. |  | `` |
+| `--name` |  |  | `` |
+| `--extra_cache_tags` | Extra cache tags to be added to the cached entry when the script results are saved |  | `` |
+| `--skip_compile` | Skip compilation |  | `False` |
+| `--skip_run` | Skip run |  | `False` |
+| `--skip_sudo` | Skip SUDO detection |  | `False` |
+| `--accept_license` | Accept the required license requirement to run the script |  | `False` |
+| `--skip_system_deps` | Skip installing any system dependencies |  | `False` |
+| `--git_ssh` | Use SSH for git repos |  | `False` |
+| `--gh_token` | Github Token |  | `` |
+| `--hf_token` | Huggingface Token |  | `` |
+| `--verify_ssl` | Verify SSL |  | `False` |
+## Variations
+
+### Batch-size
+
+- `batch-size.#` _(# can be substituted dynamically)_
+
+### Device
+
+- `cpu` (default)
+- `cuda`
+
+### Framework
+
+- `onnxruntime` (default)
+- `pytorch`
+- `tf`
+- `tflite`
+- `tvm-onnx`
+
+### Loadgen-scenario
+
+- `multistream`
+- `offline` (default)
+- `server`
+- `singlestream`
+
+### Model
+
+- `resnet50` (default)
+- `retinanet`
@@ -44,6 +44,11 @@ def preprocess(i):
     script_path = i['run_script_input']['path']
     if env['MLC_MODEL'] == "retinanet":
         env['MLC_DATASET_LIST'] = env['MLC_DATASET_ANNOTATIONS_FILE_PATH']
+    elif 'bert' in env['MLC_MODEL']:
+        env['MLC_DATASET_SQUAD_TOKENIZED_ROOT'] = env.get(
+            'MLC_DATASET_SQUAD_TOKENIZED_ROOT', '')
+        env['MLC_DATASET_MAX_SEQ_LENGTH'] = env.get(
+            'MLC_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH', '384')
     env['MLC_SOURCE_FOLDER_PATH'] = os.path.join(script_path, "src")
 
     for file in os.listdir(env['MLC_SOURCE_FOLDER_PATH']):
@@ -66,26 +71,51 @@ def preprocess(i):
 
     if '+ CXXFLAGS' not in env:
         env['+ CXXFLAGS'] = []
-    env['+ CXXFLAGS'].append("-std=c++14")
+    if env.get('MLC_MLPERF_BACKEND', '') == 'pytorch':
+        env['+ CXXFLAGS'].append("-std=c++17")
+    else:
+        env['+ CXXFLAGS'].append("-std=c++14")
 
     # add preprocessor flag like "#define MLC_MODEL_RESNET50"
-    env['+ CXXFLAGS'].append('-DMLC_MODEL_' + env['MLC_MODEL'].upper())
+    env['+ CXXFLAGS'].append('-DMLC_MODEL_' + env['MLC_MODEL'].upper().replace('-', '_').replace('.', '_'))
     # add preprocessor flag like "#define MLC_MLPERF_BACKEND_ONNXRUNTIME"
     env['+ CXXFLAGS'].append('-DMLC_MLPERF_BACKEND_' +
                              env['MLC_MLPERF_BACKEND'].upper())
     # add preprocessor flag like "#define MLC_MLPERF_DEVICE_CPU"
     env['+ CXXFLAGS'].append('-DMLC_MLPERF_DEVICE_' +
                              env['MLC_MLPERF_DEVICE'].upper())
 
+    # For PyTorch backend, detect LibTorch include/lib paths from pip torch
+    if env.get('MLC_MLPERF_BACKEND', '') == 'pytorch':
+        import torch as _torch
+        torch_path = os.path.dirname(_torch.__file__)
+        torch_inc = os.path.join(torch_path, 'include')
+        torch_inc_csrc = os.path.join(torch_path, 'include', 'torch', 'csrc', 'api', 'include')
+        torch_lib = os.path.join(torch_path, 'lib')
+        env['+CPLUS_INCLUDE_PATH'].append(torch_inc)
+        env['+CPLUS_INCLUDE_PATH'].append(torch_inc_csrc)
+        env['+C_INCLUDE_PATH'].append(torch_inc)
+        env['+C_INCLUDE_PATH'].append(torch_inc_csrc)
+        env['+LD_LIBRARY_PATH'].append(torch_lib)
+        env['+DYLD_FALLBACK_LIBRARY_PATH'].append(torch_lib)
+        if not _torch.compiled_with_cxx11_abi():
+            env['+ CXXFLAGS'].append('-D_GLIBCXX_USE_CXX11_ABI=0')
+
     if '+ LDCXXFLAGS' not in env:
         env['+ LDCXXFLAGS'] = []
 
     env['+ LDCXXFLAGS'] += [
         "-lmlperf_loadgen",
         "-lpthread"
     ]
+
+    # For PyTorch, link against torch, torch_cpu, and c10
+    if env.get('MLC_MLPERF_BACKEND', '') == 'pytorch':
+        env['+ LDCXXFLAGS'] += ['-ltorch', '-ltorch_cpu', '-lc10']
+        if env.get('MLC_MLPERF_DEVICE', '') == 'gpu':
+            env['+ LDCXXFLAGS'] += ['-ltorch_cuda', '-lc10_cuda']
     # e.g. -lonnxruntime
-    if 'MLC_MLPERF_BACKEND_LIB_NAMESPEC' in env:
+    elif 'MLC_MLPERF_BACKEND_LIB_NAMESPEC' in env:
         env['+ LDCXXFLAGS'].append('-l' +
                                    env['MLC_MLPERF_BACKEND_LIB_NAMESPEC'])
     # e.g. -lcudart
@@ -96,9 +126,31 @@ def preprocess(i):
     env['MLC_LINKER_LANG'] = 'CXX'
     env['MLC_RUN_DIR'] = os.getcwd()
 
+
+    # For PyTorch backend, convert .pth weights to TorchScript .pt if needed
+    if env.get('MLC_MLPERF_BACKEND', '') == 'pytorch':
+        model_path = env.get('MLC_ML_MODEL_FILE_WITH_PATH', '')
+        if model_path.endswith('.pth'):
+            torchscript_path = model_path.replace('.pth', '_torchscript.pt')
+            if not os.path.exists(torchscript_path):
+                import torch
+                import torchvision.models as models
+                logger.info(f"Converting {model_path} to TorchScript at {torchscript_path}")
+                model = models.resnet50()
+                model.load_state_dict(torch.load(model_path, map_location='cpu', weights_only=False))
+                model.eval()
+                traced = torch.jit.trace(model, torch.randn(1, 3, 224, 224))
+                traced.save(torchscript_path)
+                logger.info("TorchScript conversion done")
+            env['MLC_ML_MODEL_FILE_WITH_PATH'] = torchscript_path
+
     if 'MLC_MLPERF_USER_CONF' not in env:
-        env['MLC_MLPERF_USER_CONF'] = os.path.join(
-            env['MLC_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "user.conf")
+        if 'bert' in env['MLC_MODEL']:
+            env['MLC_MLPERF_USER_CONF'] = os.path.join(
+                env.get('MLC_MLPERF_INFERENCE_BERT_PATH', ''), "user.conf")
+        else:
+            env['MLC_MLPERF_USER_CONF'] = os.path.join(
+                env['MLC_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "user.conf")
 
     return {'return': 0}
 
 
@@ -28,9 +28,9 @@
  * the location in memory of this batch, and passes this to RunInference implemented by
  * derived classes (e.g. OnnxRuntimeBackend).
  */
-class Backend {
+class MlcBackend {
 public:
-    Backend(std::shared_ptr<Model> &model, std::shared_ptr<Device> &device,
+    MlcBackend(std::shared_ptr<Model> &model, std::shared_ptr<MlcDevice> &device,
             size_t performance_sample_count, size_t batch_size)
             : model(model), device(device)
             , performance_sample_count(performance_sample_count), batch_size(batch_size)
@@ -60,7 +60,7 @@ class Backend {
             std::cerr << "warning: performance sample count = 0" << std::endl;
     }
 
-    virtual ~Backend() {
+    virtual ~MlcBackend() {
         for (size_t i = 0; i < num_inputs; i++) {
             for (size_t j = 0; j < num_memory; j++) {
                 device->Free(j, sample_memory[i][j]);
@@ -175,14 +175,15 @@ class Backend {
         size_t memory_index = device->GetMemoryIndex(concurrency_index);
         // might use batch_memory
         std::unique_lock<std::mutex> batch_memory_lock{batch_memory_mutex[memory_index], std::defer_lock};
+        if (!contiguous)
+            batch_memory_lock.lock();
         for (size_t i = 0; i < num_inputs; i++) {
             // if input is contiguous, use input directly as batch address
             // otherwise, gather a batch to batch_memory
             if (contiguous) {
                 batch_data[i] = GetMemoryAddress(i, memory_index, node->index_in_memory);
             } else {
                 // copy data if not contiguous
-                batch_memory_lock.lock();
                 for (size_t k = 0; k < batch.size(); k++) {
                     const mlperf::QuerySample &sample = batch[k];
                     void *sample_address = GetMemoryAddress(i, memory_index, sample_map[sample.index].index_in_memory);
@@ -232,7 +233,7 @@ class Backend {
 
 protected:
     std::shared_ptr<Model> model;
-    std::shared_ptr<Device> device;
+    std::shared_ptr<MlcDevice> device;
     size_t performance_sample_count;
     size_t batch_size;
     size_t num_memory;
@@ -275,12 +276,12 @@ class Backend {
     Trie batches;
 };
 
-class DummyBackend : public Backend {
+class DummyBackend : public MlcBackend {
 public:
     DummyBackend(
-        std::shared_ptr<Model> &model, std::shared_ptr<Device> &device,
+        std::shared_ptr<Model> &model, std::shared_ptr<MlcDevice> &device,
         size_t performance_sample_count, size_t batch_size)
-        : Backend(model, device, performance_sample_count, batch_size) {}
+        : MlcBackend(model, device, performance_sample_count, batch_size) {}
 
     void RunInference(
             size_t concurrency_index,
 
@@ -19,7 +19,7 @@
  *
  * The Alloc, Free, Read, Write, Copy operations are for the corresponding device memory.
  */
-class Device {
+class MlcDevice {
 public:
     virtual size_t NumConcurrency() const = 0;
     virtual size_t NumMemory() const = 0;
@@ -33,7 +33,7 @@ class Device {
     virtual void SetConcurrencyIndex(size_t concurrency_index) {}
 };
 
-class CPUDevice : public Device {
+class CPUDevice : public MlcDevice {
     size_t NumConcurrency() const override {
         return 2;//std::thread::hardware_concurrency();
     }
 
@@ -10,7 +10,7 @@
 
 #define CHECK_CUDA_SUCCESS(x) if ((x) != cudaSuccess) std::cerr << "encountered CUDA error" << std::endl;
 
-class GPUDevice : public Device {
+class GPUDevice : public MlcDevice {
     size_t NumConcurrency() const override {
         return NumMemory();
     }
 
@@ -121,4 +121,34 @@ class Retinanet : public Model {
     float score_threshold;
 };
 
+
+class BertLarge : public Model {  // Model subclass whose buffers are all sized from max_seq_length; PostProcess packs both output buffers into one response.
+public:
+    BertLarge(std::string model_path, size_t max_seq_length) :  // model_path is forwarded to Model; max_seq_length sizes every buffer below and is stored for PostProcess.
+        Model(
+            model_path,
+            3, {"input_ids", "input_mask", "segment_ids"},  // presumably the input count and input names -- confirm against Model's constructor
+            {max_seq_length * sizeof(int64_t), max_seq_length * sizeof(int64_t), max_seq_length * sizeof(int64_t)},  // one byte size per input: max_seq_length int64_t values each
+            {{max_seq_length}, {max_seq_length}, {max_seq_length}},  // one single-dimension entry per input
+            2, {"output_start_logits", "output_end_logits"},  // presumably the output count and output names -- confirm against Model's constructor
+            {max_seq_length * sizeof(float), max_seq_length * sizeof(float)},  // one byte size per output: max_seq_length floats each
+            {{max_seq_length}, {max_seq_length}}),  // one single-dimension entry per output
+        max_seq_length(max_seq_length) {}  // keep the length so PostProcess can compute the per-output byte count
+
+    void PostProcess(  // Writes raw[0] followed by raw[1] into response_buffer (2 * max_seq_length * sizeof(float) bytes in total).
+            mlperf::QuerySampleIndex index,  // not used in this body
+            const std::vector<void *> &raw,  // must hold at least two pointers; at() throws std::out_of_range otherwise
+            const std::vector<std::vector<size_t>> &raw_shapes,  // not used in this body
+            std::vector<uint8_t> &response_buffer) override {  // resized and overwritten; any previous contents are discarded
+        // Lay the two output buffers back to back in the response: raw[0] first, then raw[1].
+        size_t logits_bytes = max_seq_length * sizeof(float);  // bytes copied from each raw buffer
+        response_buffer.resize(2 * logits_bytes);
+        std::memcpy(response_buffer.data(), raw.at(0), logits_bytes);  // assumes raw[0] points to at least logits_bytes readable bytes -- not checked here
+        std::memcpy(response_buffer.data() + logits_bytes, raw.at(1), logits_bytes);  // same assumption for raw[1]
+    }
+
+private:
+    size_t max_seq_length;  // element count per input/output buffer, as passed to the constructor
+};
+
 #endif // MODEL_H_
@@ -11,13 +11,13 @@
 
 #include "backend.h"
 
-class OnnxRuntimeBackend : public Backend {
+class OnnxRuntimeBackend : public MlcBackend {
 public:
     OnnxRuntimeBackend(
-            std::shared_ptr<Model> &model, std::shared_ptr<Device> &device,
+            std::shared_ptr<Model> &model, std::shared_ptr<MlcDevice> &device,
             size_t performance_sample_count, size_t batch_size,
             bool use_cuda)
-            : Backend(model, device, performance_sample_count, batch_size)
+            : MlcBackend(model, device, performance_sample_count, batch_size)
             , env(ORT_LOGGING_LEVEL_WARNING, "env") {
         for (size_t i = 0; i < device->NumMemory(); i++) {
             memory_infos.emplace_back(
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@`
`10`	`10`
`11`	`11`	`#define CHECK_CUDA_SUCCESS(x) if ((x) != cudaSuccess) std::cerr << "encountered CUDA error" << std::endl;`
`12`	`12`
`13`		`-class GPUDevice : public Device {`
	`13`	`+class GPUDevice : public MlcDevice {`
`14`	`14`	`size_t NumConcurrency() const override {`
`15`	`15`	`return NumMemory();`
`16`	`16`	`}`