replayio
diff --git a/‎.github/workflows/eval-runner.yml
Lines changed: 160 additions & 0 deletions b/‎.github/workflows/eval-runner.yml
Lines changed: 160 additions & 0 deletions
diff --git a/‎.github/workflows/ghcr-build.yml
Lines changed: 47 additions & 1 deletion b/‎.github/workflows/ghcr-build.yml
Lines changed: 47 additions & 1 deletion
diff --git a/‎README.md
Lines changed: 12 additions & 16 deletions b/‎README.md
Lines changed: 12 additions & 16 deletions
diff --git a/‎containers/app/Dockerfile
Lines changed: 1 addition & 0 deletions b/‎containers/app/Dockerfile
Lines changed: 1 addition & 0 deletions
diff --git a/‎containers/app/entrypoint.sh
Lines changed: 5 additions & 0 deletions b/‎containers/app/entrypoint.sh
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/i18n/fr/docusaurus-plugin-content-docs/current/python/python.md
Lines changed: 3 additions & 1 deletion b/‎docs/i18n/fr/docusaurus-plugin-content-docs/current/python/python.md
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/about.md
Lines changed: 22 additions & 26 deletions b/‎docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/about.md
Lines changed: 22 additions & 26 deletions
@@ -0,0 +1,160 @@
+# Runs the integration-test and SWE-Bench evaluations and reports the results.
+name: Run Evaluation
+
+# Three triggers: a label being added to a PR, a daily cron, and a manual
+# dispatch that must be given a reason.
+on:
+  pull_request:
+    # Fires for any label; the job-level `if` narrows it to 'eval-this'.
+    types: [labeled]
+  schedule:
+    - cron: "0 1 * * *" # Run daily at 1 AM UTC
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: "Reason for manual trigger"
+        required: true
+        default: ""
+
+env:
+  N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation
+jobs:
+  run-evaluation:
+    # Scheduled and manual runs always proceed; pull_request runs proceed only
+    # when the label that triggered the event is 'eval-this'.
+    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
+    runs-on: ubuntu-latest
+    # Write access to pull-requests/issues is used by the comment steps below.
+    permissions:
+      contents: "read"
+      id-token: "write"
+      pull-requests: "write"
+      issues: "write"
+    strategy:
+      matrix:
+        # Quoted so the version stays a string rather than a float.
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # NOTE(review): poetry is installed before setup-python, presumably so
+      # that the `cache: "poetry"` option below can locate it - confirm.
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+
+      # Acknowledge label-triggered runs on the PR before the evaluation starts.
+      - name: Comment on PR if 'eval-this' label is present
+        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install
+
+      # Generates config.toml with a single [llm.eval] section; the section name
+      # is what the run_infer.sh invocations in later steps pass as `llm.eval`.
+      - name: Configure config.toml for evaluation
+        env:
+          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
+        run: |
+          # One grouped redirect truncates and writes the file in a single pass.
+          {
+            echo '[llm.eval]'
+            echo 'model = "deepseek/deepseek-chat"'
+            echo "api_key = \"$DEEPSEEK_API_KEY\""
+            echo 'temperature = 0.0'
+          } > config.toml
+
+      # Runs the integration-test evaluation and exports its report.md as the
+      # multi-line INTEGRATION_TEST_REPORT environment variable for later steps.
+      - name: Run integration test evaluation
+        env:
+          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
+          RUNTIME: remote
+          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
+          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
+
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
+
+          # get evaluation report
+          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE"
+
+          # Multi-line value syntax for $GITHUB_ENV: NAME<<DELIM ... DELIM.
+          # The bare `echo` guarantees the closing delimiter starts on its own
+          # line even when the report has no trailing newline.
+          {
+            echo "INTEGRATION_TEST_REPORT<<EOF"
+            if [ -n "$REPORT_FILE" ]; then
+              cat "$REPORT_FILE"
+            else
+              # Without this guard `cat` would run with no operand and read stdin.
+              echo "No integration test report (report.md) was found."
+            fi
+            echo
+            echo "EOF"
+          } >> "$GITHUB_ENV"
+
+      # Runs SWE-Bench Lite inference, evaluates the predictions remotely, and
+      # exports the summary as the multi-line SWEBENCH_REPORT environment variable.
+      - name: Run SWE-Bench evaluation
+        env:
+          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
+          RUNTIME: remote
+          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
+          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
+
+        run: |
+          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
+
+          # NOTE(review): the folder name used to be matched with a hard-coded
+          # `maxiter_50`, while run_infer.sh above is called with `300 30`. If the
+          # second number is the iteration cap the old pattern could never match,
+          # so the iteration count is wildcarded here - confirm against run_infer.sh.
+          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_*_N_*-no-hint-run_1" -type d | head -n 1)
+          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
+          # Fail with a clear message instead of evaluating the bogus path /output.jsonl.
+          if [ -z "$OUTPUT_FOLDER" ]; then
+            echo "::error::No SWE-bench inference output folder was found"
+            exit 1
+          fi
+          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh "$OUTPUT_FOLDER/output.jsonl" $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
+
+          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py "$OUTPUT_FOLDER/output.jsonl" > summarize_outputs.log 2>&1
+
+          # The bare `echo` keeps the closing delimiter on its own line even when
+          # the log has no trailing newline (same pattern as the integration step).
+          {
+            echo "SWEBENCH_REPORT<<EOF"
+            cat summarize_outputs.log
+            echo
+            echo "EOF"
+          } >> "$GITHUB_ENV"
+
+      # Bundle the raw outputs into a timestamped archive in the workspace root.
+      - name: Create tar.gz of evaluation outputs
+        run: |
+          ARCHIVE_NAME="evaluation_outputs_$(date +'%y-%m-%d-%H-%M').tar.gz"
+          tar -czvf "$ARCHIVE_NAME" evaluation/evaluation_outputs/outputs
+
+      # The step id lets the next step read this upload's `artifact-url` output.
+      - name: Upload evaluation results as artifact
+        id: upload_results_artifact
+        uses: actions/upload-artifact@v4
+        with:
+          path: evaluation_outputs_*.tar.gz
+          name: evaluation-outputs
+
+      # Expose the download link to later steps as env.ARTIFACT_URL.
+      - name: Get artifact URL
+        run: |
+          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
+
+      # Authenticates with a service-account key so the Cloud Storage upload
+      # step further down can run.
+      - name: Authenticate to Google Cloud
+        uses: 'google-github-actions/auth@v2'
+        with:
+          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}
+
+      # Exports TIMESTAMP and TRIGGER_REASON, which together form the Cloud
+      # Storage destination folder name used by the upload step.
+      - name: Set timestamp and trigger reason
+        env:
+          # Event data is passed through the environment rather than expanded
+          # into the script text: `inputs.reason` is free-form user input and
+          # would otherwise be executed by the shell (script injection).
+          EVENT_NAME: ${{ github.event_name }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          MANUAL_REASON: ${{ github.event.inputs.reason }}
+        run: |
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
+          if [[ "$EVENT_NAME" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
+          elif [[ "$EVENT_NAME" == "schedule" ]]; then
+            echo "TRIGGER_REASON=schedule" >> "$GITHUB_ENV"
+          else
+            # Strip CR/LF so the reason cannot append extra lines to $GITHUB_ENV.
+            echo "TRIGGER_REASON=manual-$(printf '%s' "$MANUAL_REASON" | tr -d '\r\n')" >> "$GITHUB_ENV"
+          fi
+
+      # Uploads the raw (un-archived) outputs directory. The destination folder
+      # is named from the TIMESTAMP and TRIGGER_REASON environment variables.
+      - name: Upload evaluation results to Google Cloud Storage
+        uses: 'google-github-actions/upload-cloud-storage@v2'
+        with:
+          path: 'evaluation/evaluation_outputs/outputs'
+          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'
+
+      # Posts both reports plus the artifact link. The step id exposes the
+      # created comment's id, which the Slack step uses to build a deep link.
+      - name: Comment with evaluation results and artifact link
+        id: create_comment
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          # PR-triggered runs comment on that PR; every other run comments on the
+          # hard-coded #4504 (presumably a results-tracking issue - confirm).
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
+          unique: false
+          comment: |
+              Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
+              Commit: ${{ github.sha }}
+              **SWE-Bench Evaluation Report**
+              ${{ env.SWEBENCH_REPORT }}
+              ---
+              **Integration Tests Evaluation Report**
+              ${{ env.INTEGRATION_TEST_REPORT }}
+              ---
+              You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).
+
+      # Notifies the Slack channel with the trigger description and a deep link
+      # to the results comment created by the `create_comment` step.
+      - name: Post to a Slack channel
+        id: slack
+        # NOTE(review): the action reference was mangled by e-mail obfuscation in
+        # this copy of the diff; v1.27.0 is a best-effort restoration - confirm.
+        uses: slackapi/slack-github-action@v1.27.0
+        with:
+          channel-id: 'C07SVQSCR6F'
+          # Slack mrkdwn links are written <url|label>; Markdown-style
+          # [label](url) would be shown literally in the channel.
+          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: <https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }}|here>"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}
@@ -1,5 +1,5 @@
 # Workflow that builds, tests and then pushes the OpenHands and runtime docker images to the ghcr.io repository
-name: Build, Test and Publish RT Image
+name: Docker
 
 # Always run on "main"
 # Always run on tags
@@ -399,3 +399,49 @@ jobs:
         run: |
           echo "Some runtime tests failed or were cancelled"
           exit 1
+  # Appends (or refreshes) a ready-to-paste `docker run` command in the PR
+  # description so reviewers can try the images tagged with the PR's head commit.
+  # Runs only for pull_request events, after the ghcr_build_runtime job.
+  # NOTE(review): the command also references the app image; if that image is
+  # built by a different job, consider adding it to `needs` - confirm.
+  update_pr_description:
+    name: Update PR Description
+    if: github.event_name == 'pull_request'
+    needs: [ghcr_build_runtime]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # Image tags use the first 7 characters of the PR head commit.
+      - name: Get short SHA
+        id: short_sha
+        run: echo "SHORT_SHA=$(echo ${{ github.event.pull_request.head.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT
+
+      - name: Update PR Description
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          REPO: ${{ github.repository }}
+          SHORT_SHA: ${{ steps.short_sha.outputs.SHORT_SHA }}
+        run: |
+          echo "updating PR description"
+          # The container to start is the OpenHands app image; the runtime image
+          # is only passed to it through SANDBOX_RUNTIME_CONTAINER_IMAGE.
+          # Backslash-newlines inside the quotes collapse this to one line, which
+          # the single-line sed replacement below relies on.
+          DOCKER_RUN_COMMAND="docker run -it --rm \
+            -p 3000:3000 \
+            -v /var/run/docker.sock:/var/run/docker.sock \
+            --add-host host.docker.internal:host-gateway \
+            -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:$SHORT_SHA-nikolaik \
+            --name openhands-app-$SHORT_SHA \
+            ghcr.io/all-hands-ai/openhands:$SHORT_SHA"
+
+          PR_BODY=$(gh pr view "$PR_NUMBER" --repo "$REPO" --json body --jq .body)
+
+          # If an earlier run already added the section, swap the old command for
+          # the new one; otherwise append the section to the end of the body.
+          if printf '%s\n' "$PR_BODY" | grep -q "To run this PR locally, use the following command:"; then
+            UPDATED_PR_BODY=$(printf '%s\n' "$PR_BODY" | sed -E "s|docker run -it --rm.*|$DOCKER_RUN_COMMAND|")
+          else
+            UPDATED_PR_BODY="${PR_BODY}
+
+          ---
+
+          To run this PR locally, use the following command:
+          \`\`\`
+          $DOCKER_RUN_COMMAND
+          \`\`\`"
+          fi
+
+          echo "updated body: $UPDATED_PR_BODY"
+          gh pr edit "$PR_NUMBER" --repo "$REPO" --body "$UPDATED_PR_BODY"
@@ -33,37 +33,33 @@ Learn more at [docs.all-hands.dev](https://docs.all-hands.dev), or jump to the [
 
 ## ⚡ Quick Start
 
-The easiest way to run OpenHands is in Docker. You can change `WORKSPACE_BASE` below to
-point OpenHands to existing code that you'd like to modify.
-
+The easiest way to run OpenHands is in Docker.
 See the [Installation](https://docs.all-hands.dev/modules/usage/installation) guide for
 system requirements and more information.
 
 ```bash
-export WORKSPACE_BASE=$(pwd)/workspace
-
-docker pull ghcr.io/all-hands-ai/runtime:0.11-nikolaik
+docker pull docker.all-hands.dev/all-hands-ai/runtime:0.12-nikolaik
 
-docker run -it --pull=always \
-    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.11-nikolaik \
-    -e SANDBOX_USER_ID=$(id -u) \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
-    -v $WORKSPACE_BASE:/opt/workspace_base \
+docker run -it --rm --pull=always \
+    -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.12-nikolaik \
     -v /var/run/docker.sock:/var/run/docker.sock \
     -p 3000:3000 \
     --add-host host.docker.internal:host-gateway \
-    --name openhands-app-$(date +%Y%m%d%H%M%S) \
-    ghcr.io/all-hands-ai/openhands:0.11
+    --name openhands-app \
+    docker.all-hands.dev/all-hands-ai/openhands:0.12
 ```
 
 You'll find OpenHands running at [http://localhost:3000](http://localhost:3000)!
 
-You'll need a model provider and API key. One option that works well: [Claude 3.5 Sonnet](https://www.anthropic.com/api), but you have [many options](https://docs.all-hands.dev/modules/usage/llms).
+Finally, you'll need a model provider and API key.
+[Anthropic's Claude 3.5 Sonnet](https://www.anthropic.com/api) (`anthropic/claude-3-5-sonnet-20241022`)
+works best, but you have [many options](https://docs.all-hands.dev/modules/usage/llms).
 
 ---
 
-You can also run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
-or as an [interactive CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
+You can also [connect OpenHands to your local filesystem](https://docs.all-hands.dev/modules/usage/runtimes),
+run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode),
+or interact with it via a [friendly CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode).
 
 Visit [Installation](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions.
 
 
@@ -41,6 +41,7 @@ ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
 ENV USE_HOST_NETWORK=false
 ENV WORKSPACE_BASE=/opt/workspace_base
 ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
+# Default user id when none is passed with `docker run -e SANDBOX_USER_ID=...`.
+# NOTE(review): the entrypoint appears to treat 0 as "run as root" - confirm.
+ENV SANDBOX_USER_ID=0
 RUN mkdir -p $WORKSPACE_BASE
 
 RUN apt-get update -y \
 
@@ -18,6 +18,11 @@ if [ -z "$SANDBOX_USER_ID" ]; then
   exit 1
 fi
 
+if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
+  # WORKSPACE_BASE is set to /opt/workspace_base in the Dockerfile. If the user
+  # did not set WORKSPACE_MOUNT_PATH (i.e. isn't mounting a workspace), unset it
+  # so that OpenHands doesn't mount at all.
+  unset WORKSPACE_BASE
+fi
+
 if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
   echo "Running OpenHands as root"
   export RUN_AS_OPENHANDS=false
 
@@ -1,3 +1,5 @@
+
+
 # Documentation Python
 
-Les documents apparaîtront ici après le déploiement.
+La documentation apparaîtra ici après le déploiement.
@@ -1,53 +1,49 @@
----
-sidebar_position: 7
----
+# 📚 Divers
 
-# 📚 Divers {#misc}
+## ⭐️ Stratégie de recherche
 
-## ⭐️ Stratégie de Recherche {#research-strategy}
+La réplication complète d'applications de niveau production avec des LLM est une entreprise complexe. Notre stratégie implique :
 
-La réalisation d'une réplication complète des applications de production avec les LLM est une entreprise complexe. Notre stratégie implique :
+1. **Recherche technique fondamentale :** Se concentrer sur la recherche fondamentale pour comprendre et améliorer les aspects techniques de la génération et de la gestion de code
+2. **Capacités spécialisées :** Améliorer l'efficacité des composants de base grâce à la curation de données, aux méthodes d'entraînement, etc.
+3. **Planification des tâches :** Développer des capacités de détection de bugs, de gestion de base de code et d'optimisation
+4. **Évaluation :** Établir des métriques d'évaluation complètes pour mieux comprendre et améliorer nos modèles
 
-1. **Recherche Technique de Base :** Se concentrer sur la recherche fondamentale pour comprendre et améliorer les aspects techniques de la génération et de la gestion de code.
-2. **Compétences Spécialisées :** Améliorer l'efficacité des composants de base grâce à la curation des données, aux méthodes de formation, et plus encore.
-3. **Planification des Tâches :** Développer des capacités pour la détection de bogues, la gestion du code source et l'optimisation.
-4. **Évaluation :** Établir des métriques d'évaluation complètes pour mieux comprendre et améliorer nos modèles.
+## 🚧 Agent par défaut
 
-## 🚧 Agent Par Défaut {#default-agent}
+Notre agent par défaut est actuellement le [CodeActAgent](agents), qui est capable de générer du code et de gérer des fichiers.
 
-- Notre agent par défaut est actuellement le CodeActAgent, capable de générer du code et de gérer des fichiers. Nous travaillons sur d'autres implémentations d'agents, y compris [SWE Agent](https://swe-agent.com/). Vous pouvez [lire à propos de notre ensemble actuel d'agents ici](./agents).
+## 🤝 Comment contribuer
 
-## 🤝 Comment Contribuer {#how-to-contribute}
+OpenHands est un projet communautaire et nous accueillons les contributions de tous. Que vous soyez développeur, chercheur ou simplement enthousiaste à l'idée de faire progresser le domaine de l'ingénierie logicielle avec l'IA, il existe de nombreuses façons de s'impliquer :
 
-OpenHands est un projet communautaire, et nous accueillons les contributions de tout le monde. Que vous soyez développeur, chercheur, ou simplement enthousiaste à l'idée de faire progresser le domaine de l'ingénierie logicielle avec l'IA, il existe de nombreuses façons de vous impliquer :
-
-- **Contributions de Code :** Aidez-nous à développer les fonctionnalités de base, l'interface frontend ou les solutions de sandboxing.
-- **Recherche et Évaluation :** Contribuez à notre compréhension des LLM en ingénierie logicielle, participez à l'évaluation des modèles ou suggérez des améliorations.
-- **Retour d'Information et Tests :** Utilisez l'ensemble d'outils OpenHands, signalez des bogues, suggérez des fonctionnalités ou fournissez des retours sur l'ergonomie.
+- **Contributions de code :** Aidez-nous à développer les fonctionnalités de base, l'interface frontend ou les solutions de sandboxing
+- **Recherche et évaluation :** Contribuez à notre compréhension des LLM dans l'ingénierie logicielle, participez à l'évaluation des modèles ou suggérez des améliorations
+- **Retours et tests :** Utilisez la boîte à outils OpenHands, signalez des bugs, suggérez des fonctionnalités ou donnez votre avis sur la facilité d'utilisation
 
 Pour plus de détails, veuillez consulter [ce document](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md).
 
-## 🤖 Rejoignez Notre Communauté {#join-our-community}
+## 🤖 Rejoignez notre communauté
 
-Nous avons maintenant à la fois un espace de travail Slack pour la collaboration sur la construction d'OpenHands et un serveur Discord pour discuter de tout ce qui est lié, par exemple, à ce projet, aux LLM, aux agents, etc.
+Nous avons à la fois un espace de travail Slack pour la collaboration sur la construction d'OpenHands et un serveur Discord pour discuter de tout ce qui est lié, par exemple, à ce projet, aux LLM, aux agents, etc.
 
 - [Espace de travail Slack](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA)
 - [Serveur Discord](https://discord.gg/ESHStjSjD4)
 
-Si vous souhaitez contribuer, n'hésitez pas à rejoindre notre communauté. Simplifions l'ingénierie logicielle ensemble !
+Si vous souhaitez contribuer, n'hésitez pas à rejoindre notre communauté. Simplifions ensemble l'ingénierie logicielle !
 
-🐚 **Codez moins, créez plus avec OpenHands.**
+🐚 **Codez moins, faites plus avec OpenHands.**
 
 [![Star History Chart](https://api.star-history.com/svg?repos=All-Hands-AI/OpenHands&type=Date)](https://star-history.com/#All-Hands-AI/OpenHands&Date)
 
-## 🛠️ Construit Avec {#built-with}
+## 🛠️ Construit avec
 
-OpenHands est construit en utilisant une combinaison de cadres et de bibliothèques puissants, offrant une base robuste pour son développement. Voici les technologies clés utilisées dans le projet :
+OpenHands est construit en utilisant une combinaison de frameworks et de bibliothèques puissants, fournissant une base solide pour son développement. Voici les principales technologies utilisées dans le projet :
 
 ![FastAPI](https://img.shields.io/badge/FastAPI-black?style=for-the-badge) ![uvicorn](https://img.shields.io/badge/uvicorn-black?style=for-the-badge) ![LiteLLM](https://img.shields.io/badge/LiteLLM-black?style=for-the-badge) ![Docker](https://img.shields.io/badge/Docker-black?style=for-the-badge) ![Ruff](https://img.shields.io/badge/Ruff-black?style=for-the-badge) ![MyPy](https://img.shields.io/badge/MyPy-black?style=for-the-badge) ![LlamaIndex](https://img.shields.io/badge/LlamaIndex-black?style=for-the-badge) ![React](https://img.shields.io/badge/React-black?style=for-the-badge)
 
-Veuillez noter que la sélection de ces technologies est en cours, et que des technologies supplémentaires peuvent être ajoutées ou des existantes supprimées au fur et à mesure de l'évolution du projet. Nous nous efforçons d'adopter les outils les plus adaptés et efficaces pour améliorer les capacités d'OpenHands.
+Veuillez noter que la sélection de ces technologies est en cours et que des technologies supplémentaires peuvent être ajoutées ou des technologies existantes peuvent être supprimées à mesure que le projet évolue. Nous nous efforçons d'adopter les outils les plus appropriés et les plus efficaces pour améliorer les capacités d'OpenHands.
 
-## 📜 Licence {#license}
+## 📜 Licence
 
 Distribué sous la licence MIT. Voir [notre licence](https://github.com/All-Hands-AI/OpenHands/blob/main/LICENSE) pour plus d'informations.