# Source: amd/mlperf-automations (branch: dev)
# Path: .github/workflows/test-nvidia-mlperf-inference-implementations.yml
name: MLPerf Inference Nvidia implementations

on:
  # Runs for pull requests that target `main` or `dev` and that touch either
  # this workflow file or the NVIDIA MLPerf inference app script directory.
  # Markdown files are excluded from the path filter.
  #
  # NOTE(review): per the GitHub docs, `pull_request_target` runs in the
  # context of the base repository and can read repository secrets. A later
  # step in this workflow pulls the PR head repository - confirm that the
  # trust model is intended.
  pull_request_target:
    branches:
      - "main"
      - "dev"
    paths:
      - '.github/workflows/test-nvidia-mlperf-inference-implementations.yml'
      - 'script/app-mlperf-inference-nvidia/**'
      - '!**.md'

jobs:
  # Drives the NVIDIA MLPerf inference run, one job per matrix combination.
  run_nvidia:
      # Skip the job entirely unless the repository owner is `mlcommons`.
      if: github.repository_owner == 'mlcommons'
      runs-on: ubuntu-latest
      strategy:
        # Let the remaining matrix jobs finish even if one of them fails.
        fail-fast: false
        matrix:
          system:
            - "GO-i9"
          # system: [ "mlc-server" ]
          python-version:
            - "3.12"
          model:
            - "resnet50"
            - "bert-99"
          # None of these entries match a value in the matrix above, so they
          # currently exclude nothing.
          exclude:
            - model: gptj-99.9
            - system: phoenix1
            - system: GO-i91

      steps:
      # Installs the SSH private key that the test step hands to the remote
      # run via --remote_ssh_key_file.
      - name: Set up SSH
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the key contents are never parsed as shell code.
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
        run: |
          # Create everything below with owner-only permissions from the
          # start, so the key file is never readable by others, even briefly.
          umask 077
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh

          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519

          # Host-key pinning is not enabled here (the test step passes
          # --remote_skip_host_verify). To enable it, also expose SSH_PORT and
          # SSH_HOST through `env:` and run:
          #   ssh-keyscan -p "$SSH_PORT" -H "$SSH_HOST" >> ~/.ssh/known_hosts
      # Check out the repository; `fetch-depth: 0` requests the full history
      # instead of a shallow clone.
      # NOTE(review): no `ref` is given. Per the GitHub docs, under
      # `pull_request_target` the default checkout is the base branch, not the
      # PR head - confirm that is what this job expects.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Install the interpreter version selected by the matrix.
      # Uses the v5 major of setup-python; v3 is several majors behind and
      # targets a Node runtime that GitHub has deprecated.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      # Install the mlcflow CLI (provides the `mlc` command used below) and
      # tabulate into the runner's Python environment.
      - name: Install mlcflow
        run: |
          pip install mlcflow tabulate
      # Pull the pull request's head repository and branch with `mlc`.
      # The PR-controlled values reach the script only as quoted environment
      # variables, never as text pasted into the command line.
      - name: Pull MLOps repo
        env:
          PR_REPO_URL: ${{ github.event.pull_request.head.repo.html_url }}
          PR_BRANCH: ${{ github.event.pull_request.head.ref }}
        shell: bash
        run: |
          mlc pull repo "$PR_REPO_URL" --branch="$PR_BRANCH"

      # Runs the MLPerf inference submission flow for one matrix model on the
      # remote host, using the NVIDIA implementation with the TensorRT backend.
      - name: Test MLPerf Inference NVIDIA ${{ matrix.model }}
        env:
          # Default only; every branch of the system switch below reassigns it.
          gpu_name: rtx_4090
          # Matrix values and secrets are handed over as environment variables
          # so the script never contains their expanded text.
          MATRIX_SYSTEM: ${{ matrix.system }}
          MATRIX_MODEL: ${{ matrix.model }}
          REMOTE_SSH_HOST: ${{ secrets.SSH_HOST }}
          REMOTE_SSH_USER: ${{ secrets.SSH_USER }}
          REMOTE_SSH_PORT: ${{ secrets.SSH_PORT }}
        run: |
          # Select the hardware label, GPU name and extra Docker flags for the
          # system under test.
          if [ "$MATRIX_SYSTEM" = "GO-spr" ]; then
            hw_name="RTX4090x2"
            gpu_name=rtx_4090
            docker_string=" --docker --docker_recreate=yes"
          elif [ "$MATRIX_SYSTEM" = "mlc-server" ]; then
            hw_name="H100x8"
            gpu_name=h100
            docker_string=" "
          else
            hw_name="RTX4090x1"
            gpu_name=rtx_4090
            docker_string=" --docker"
          fi

          # Only bert-99.9 gets the extra submission-preprocessor flag; the
          # current matrix does not contain that model, so this stays empty.
          if [ "$MATRIX_MODEL" = "bert-99.9" ]; then
            submission_preprocessor_args=" --noinfer-low-accuracy-results"
          else
            submission_preprocessor_args=""
          fi
          category="datacenter,edge"

          # NOTE(review): a standard `python3 -m venv` environment does not
          # normally ship a bin/deactivate file, so this guard is expected to
          # be a no-op - confirm before relying on it.
          if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi
          python3 -m venv gh_action
          source gh_action/bin/activate
          export MLC_REPOS="$HOME/GH_MLC"
          pip install --upgrade mlcflow
          mlc pull repo mlcommons@mlperf-automations --branch=dev

          # NOTE(review): --docker is always passed below, so the empty
          # docker_string of the mlc-server branch does not disable Docker -
          # confirm that this is intended.
          # $docker_string and $submission_preprocessor_args are left unquoted
          # on purpose: they hold zero or more whitespace-separated flags.
          mlcrr run-mlperf,inference,_all-scenarios,_submission,_full,_r6.0-dev \
            --docker \
            --docker_copy_mlc_repos \
            --remote_host="$REMOTE_SSH_HOST" \
            --remote_user="$REMOTE_SSH_USER" \
            --remote_port="$REMOTE_SSH_PORT" \
            --remote_ssh_key_file="$HOME/.ssh/id_ed25519" \
            --remote_skip_host_verify \
            --preprocess_submission=yes \
            --pull_changes=yes \
            --pull_inference_changes=yes \
            --execution_mode=valid \
            --gpu_name="$gpu_name" \
            --model="$MATRIX_MODEL" \
            --submitter="GATEOverflow" \
            --hw_name="$hw_name" \
            --implementation=nvidia \
            --backend=tensorrt \
            --category="$category" \
            --division=closed \
            --docker_dt \
            --docker_mlc_repo=mlcommons@mlperf-automations \
            --docker_mlc_repo_branch=dev \
            --adr.compiler.tags=gcc \
            --device=cuda \
            --use_model_from_host=yes \
            --use_dataset_from_host=yes \
            --results_dir="$HOME/gh_action_results" \
            --submission_dir="$HOME/gh_action_submissions" \
            --clean \
            $docker_string $submission_preprocessor_args \
            --quiet
          #mlcr push,github,mlperf,inference,submission --repo_url=https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.1 --repo_branch=auto-update --commit_message="Results from GH action on NVIDIA_$hw_name" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=$hw_name
          #mlcr push,github,mlperf,inference,submission --repo_url=https://github.com/GATEOverflow/mlperf_inference_submissions_v5.0 --repo_branch=main --commit_message="Results from GH actions on NVIDIA_$hw_name" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=$hw_name