From 0eeb3895c778927735e27bc00726799130c4c1c2 Mon Sep 17 00:00:00 2001
From: Liz <91279165+lizradway@users.noreply.github.com>
Date: Tue, 19 May 2026 10:09:46 -0400
Subject: [PATCH 1/2] feat(hackathon): add strandly benchmarking script

---
 package-lock.json                             | 399 +++++++++---------
 strandly/package.json                         |   4 +-
 strandly/src/benchmark/README.md              | 158 +++++++
 strandly/src/benchmark/cloudwatch.ts          |  76 ++++
 strandly/src/benchmark/configs.ts             | 105 +++++
 strandly/src/benchmark/contextbench/loader.ts | 176 ++++++++
 .../src/benchmark/contextbench/trajectory.ts  |  81 ++++
 strandly/src/benchmark/evaluator.ts           | 111 +++++
 strandly/src/benchmark/index.ts               | 145 +++++++
 strandly/src/benchmark/reporter.ts            |  58 +++
 strandly/src/benchmark/runner.ts              |  75 ++++
 strandly/src/benchmark/types.ts               |  67 +++
 strandly/src/cli.ts                           |  17 +
 13 files changed, 1282 insertions(+), 190 deletions(-)
 create mode 100644 strandly/src/benchmark/README.md
 create mode 100644 strandly/src/benchmark/cloudwatch.ts
 create mode 100644 strandly/src/benchmark/configs.ts
 create mode 100644 strandly/src/benchmark/contextbench/loader.ts
 create mode 100644 strandly/src/benchmark/contextbench/trajectory.ts
 create mode 100644 strandly/src/benchmark/evaluator.ts
 create mode 100644 strandly/src/benchmark/index.ts
 create mode 100644 strandly/src/benchmark/reporter.ts
 create mode 100644 strandly/src/benchmark/runner.ts
 create mode 100644 strandly/src/benchmark/types.ts

diff --git a/package-lock.json b/package-lock.json
index eccc431fa..6a4fa0605 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -454,6 +454,28 @@
         "node": ">=20.0.0"
       }
     },
+    "node_modules/@aws-sdk/client-cloudwatch": {
+      "version": "3.1049.0",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/client-cloudwatch/-/client-cloudwatch-3.1049.0.tgz",
+      "integrity": "sha512-pxzt53Ch0luCqnSaWNI7vL8MjvF5WFTRV/VzyuHoWOydbaC3RQT2i5DcAW7hjaYQmmRItQnvazTeO+obsBR+4Q==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@aws-crypto/sha256-browser": "5.2.0",
+        "@aws-crypto/sha256-js": "5.2.0",
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/credential-provider-node": "^3.972.43",
+        "@aws-sdk/types": "^3.973.8",
+        "@smithy/core": "^3.24.2",
+        "@smithy/fetch-http-handler": "^5.4.2",
+        "@smithy/middleware-compression": "^4.4.2",
+        "@smithy/node-http-handler": "^4.7.2",
+        "@smithy/types": "^4.14.1",
+        "tslib": "^2.6.2"
+      },
+      "engines": {
+        "node": ">=20.0.0"
+      }
+    },
     "node_modules/@aws-sdk/client-cognito-identity": {
       "version": "3.1033.0",
       "dev": true,
@@ -668,24 +690,18 @@
       }
     },
     "node_modules/@aws-sdk/core": {
-      "version": "3.974.8",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.974.8.tgz",
-      "integrity": "sha512-njR2qoG6ZuB0kvAS2FyICsFZJ6gmCcf2X/7JcD14sUvGDm26wiZ5BrA6LOiUxKFEF+IVe7kdroxyE00YlkiYsw==",
+      "version": "3.974.12",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.974.12.tgz",
+      "integrity": "sha512-qrqgioqYFjwR6LatVNS1L2Vk++EwRIxqSQXPKNv5Ofux2D8UNgqMQ1znnMyEImXquVPTtbf71fc128pvmU6y9A==",
       "license": "Apache-2.0",
       "dependencies": {
         "@aws-sdk/types": "^3.973.8",
-        "@aws-sdk/xml-builder": "^3.972.22",
-        "@smithy/core": "^3.23.17",
-        "@smithy/node-config-provider": "^4.3.14",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/signature-v4": "^5.3.14",
-        "@smithy/smithy-client": "^4.12.13",
+        "@aws-sdk/xml-builder": "^3.972.24",
+        "@aws/lambda-invoke-store": "^0.2.2",
+        "@smithy/core": "^3.24.2",
+        "@smithy/signature-v4": "^5.4.2",
         "@smithy/types": "^4.14.1",
-        "@smithy/util-base64": "^4.3.2",
-        "@smithy/util-middleware": "^4.2.14",
-        "@smithy/util-retry": "^4.3.6",
-        "@smithy/util-utf8": "^4.2.2",
+        "bowser": "^2.11.0",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -720,14 +736,14 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-env": {
-      "version": "3.972.34",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.34.tgz",
-      "integrity": "sha512-XT0jtf8Fw9JE6ppsQeoNnZRiG+jqRixMT1v1ZR17G60UvVdsQmTG8nbEyHuEPfMxDXEhfdARaM/XiEhca4lGHQ==",
+      "version": "3.972.38",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.38.tgz",
+      "integrity": "sha512-m3WjZEgPtioMhPmwqUt+DhlTJ2i9ufR6DhfkyXojb9puEvfR+ur2U5shavu5/Cc9WHHsDCvALi6UFHgcqjhQ5w==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
+        "@aws-sdk/core": "^3.974.12",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/property-provider": "^4.2.14",
+        "@smithy/core": "^3.24.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -736,20 +752,17 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-http": {
-      "version": "3.972.36",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.36.tgz",
-      "integrity": "sha512-DPoGWfy7J7RKxvbf5kOKIGQkD2ek3dbKgzKIGrnLuvZBz5myU+Im/H6pmc14QcnFbqHMqxvtWSgRDSJW3qXLQg==",
+      "version": "3.972.40",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.40.tgz",
+      "integrity": "sha512-D78L/m2Dr6cJnnSvWoAudPhQmCwmJ7j6APXsPYmFpPaKfQTfCSu0rdm8j14Np+VmXF9z8Aj8HE3xFpsrwtfgeg==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
+        "@aws-sdk/core": "^3.974.12",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/fetch-http-handler": "^5.3.17",
-        "@smithy/node-http-handler": "^4.6.1",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/smithy-client": "^4.12.13",
+        "@smithy/core": "^3.24.2",
+        "@smithy/fetch-http-handler": "^5.4.2",
+        "@smithy/node-http-handler": "^4.7.2",
         "@smithy/types": "^4.14.1",
-        "@smithy/util-stream": "^4.5.25",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -757,23 +770,22 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-ini": {
-      "version": "3.972.38",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.38.tgz",
-      "integrity": "sha512-oDzUBu2MGJFgoar05sPMCwSrhw44ASyccrHzj66vO69OZqi7I6hZZxXfuPLC8OCzW7C+sU+bI73XHij41yekgQ==",
-      "license": "Apache-2.0",
-      "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
-        "@aws-sdk/credential-provider-env": "^3.972.34",
-        "@aws-sdk/credential-provider-http": "^3.972.36",
-        "@aws-sdk/credential-provider-login": "^3.972.38",
-        "@aws-sdk/credential-provider-process": "^3.972.34",
-        "@aws-sdk/credential-provider-sso": "^3.972.38",
-        "@aws-sdk/credential-provider-web-identity": "^3.972.38",
-        "@aws-sdk/nested-clients": "^3.997.6",
+      "version": "3.972.42",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.42.tgz",
+      "integrity": "sha512-Mu5ESvFXeinafVM8jTIvRqcvK2Ehj4kz3auT39yUcHwu1Vfxo6xRlmUafdKLW4tusjAJukQwK09sCSMgOm7OKg==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/credential-provider-env": "^3.972.38",
+        "@aws-sdk/credential-provider-http": "^3.972.40",
+        "@aws-sdk/credential-provider-login": "^3.972.42",
+        "@aws-sdk/credential-provider-process": "^3.972.38",
+        "@aws-sdk/credential-provider-sso": "^3.972.42",
+        "@aws-sdk/credential-provider-web-identity": "^3.972.42",
+        "@aws-sdk/nested-clients": "^3.997.10",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/credential-provider-imds": "^4.2.14",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
+        "@smithy/credential-provider-imds": "^4.3.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -782,17 +794,15 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-login": {
-      "version": "3.972.38",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.38.tgz",
-      "integrity": "sha512-g1NosS8qe4OF++G2UFCM5ovSkgipC7YYor5KCWatG0UoMSO5YFj9C8muePlyVmOBV/WTI16Jo3/s1NUo/o1Bww==",
+      "version": "3.972.42",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.42.tgz",
+      "integrity": "sha512-O6WkZga3kf0yqyJYd1dbeJqVhEgJx/x1UaLgtbR+XuL/YP+K5y6QTxQKL7ka9z3jnQASESKGAPnRyt4D5hQrxA==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
-        "@aws-sdk/nested-clients": "^3.997.6",
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/nested-clients": "^3.997.10",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -801,21 +811,20 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-node": {
-      "version": "3.972.39",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.39.tgz",
-      "integrity": "sha512-HEswDQyxUtadoZ/bJsPPENHg7R0Lzym5LuMksJeHvqhCOpP+rtkDLKI4/ZChH4w3cf5kG8n6bZuI8PzajoiqMg==",
+      "version": "3.972.43",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.43.tgz",
+      "integrity": "sha512-D/DJmbrWRP5BXEO3FH+ar4el+2n6OlGofiud7dQun2jES+AQEJjczenp1jBb4MBN7CpGpS8nsWGQLtuzc9tQbA==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/credential-provider-env": "^3.972.34",
-        "@aws-sdk/credential-provider-http": "^3.972.36",
-        "@aws-sdk/credential-provider-ini": "^3.972.38",
-        "@aws-sdk/credential-provider-process": "^3.972.34",
-        "@aws-sdk/credential-provider-sso": "^3.972.38",
-        "@aws-sdk/credential-provider-web-identity": "^3.972.38",
+        "@aws-sdk/credential-provider-env": "^3.972.38",
+        "@aws-sdk/credential-provider-http": "^3.972.40",
+        "@aws-sdk/credential-provider-ini": "^3.972.42",
+        "@aws-sdk/credential-provider-process": "^3.972.38",
+        "@aws-sdk/credential-provider-sso": "^3.972.42",
+        "@aws-sdk/credential-provider-web-identity": "^3.972.42",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/credential-provider-imds": "^4.2.14",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
+        "@smithy/credential-provider-imds": "^4.3.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -824,15 +833,14 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-process": {
-      "version": "3.972.34",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.34.tgz",
-      "integrity": "sha512-T3IFs4EVmVi1dVN5RciFnklCANSzvrQd/VuHY9ThHSQmYkTogjcGkoJEr+oNUPQZnso52183088NqysMPji1/Q==",
+      "version": "3.972.38",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.38.tgz",
+      "integrity": "sha512-EnbYVajGgbkb24s0K1eo4VNAPV5mHIET7LSvirTaFCwkfrfaOJxtSE+wY/tJdKDS21cEYkZs2ruCaAm+W4iblg==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
+        "@aws-sdk/core": "^3.974.12",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -841,17 +849,16 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-sso": {
-      "version": "3.972.38",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.38.tgz",
-      "integrity": "sha512-5ZxG+t0+3Q3QPh8KEjX6syskhgNf7I0MN7oGioTf6Lm1NTjfP7sIcYGNsthXC2qR8vcD3edNZwCr2ovfSSWuRA==",
+      "version": "3.972.42",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.42.tgz",
+      "integrity": "sha512-RVV/9NbFwI8ZHEH5dn39lGyFmSbSVj1+orZdr6QsOe1mW9DCglmlen0cFaNZmCcqkqc7erNRHNBduxbeZuHAnw==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
-        "@aws-sdk/nested-clients": "^3.997.6",
-        "@aws-sdk/token-providers": "3.1041.0",
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/nested-clients": "^3.997.10",
+        "@aws-sdk/token-providers": "3.1049.0",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -860,16 +867,15 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-sso/node_modules/@aws-sdk/token-providers": {
-      "version": "3.1041.0",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.1041.0.tgz",
-      "integrity": "sha512-Th7kPI6YPtvJUcdznooXJMy+9rQWjmEF81LxaJssngBzuysK4a/x+l8kjm1zb7nYsUPbndnBdUnwng/3PLvtGw==",
+      "version": "3.1049.0",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.1049.0.tgz",
+      "integrity": "sha512-r7+d0lQMTHKypkmaF5jRTBYLYHCUHzt3gaVoN9SidLhQeWhCmHk3AKrboDTpPF5b7Pt7vKu3+oeMjznM2Eu1ow==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
-        "@aws-sdk/nested-clients": "^3.997.6",
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/nested-clients": "^3.997.10",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -878,16 +884,15 @@
       }
     },
     "node_modules/@aws-sdk/credential-provider-web-identity": {
-      "version": "3.972.38",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.38.tgz",
-      "integrity": "sha512-lYHFF30DGI20jZcYX8cm6Ns0V7f1dDN6g/MBDLTyD/5iw+bXs3yBr2iAiHDkx4RFU5JgsnZvCHYKiRVPRdmOgw==",
+      "version": "3.972.42",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.42.tgz",
+      "integrity": "sha512-/67fXX0ddllD4u2Nujc5PvT4byHgpMUfz6+RxIKi/0nFIckeorm7JvXgzBuDyVKw0s58EbofmETDWUf9vTEuHQ==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/core": "^3.974.8",
-        "@aws-sdk/nested-clients": "^3.997.6",
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/nested-clients": "^3.997.10",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/shared-ini-file-loader": "^4.4.9",
+        "@smithy/core": "^3.24.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -1062,6 +1067,7 @@
       "version": "3.972.37",
       "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-sdk-s3/-/middleware-sdk-s3-3.972.37.tgz",
       "integrity": "sha512-Km7M+i8DrLArVzrid1gfxeGhYHBd3uxvE77g0s5a52zPSVosxzQBnJ0gwWb6NIp/DOk8gsBMhi7V+cpJG0ndTA==",
+      "dev": true,
       "license": "Apache-2.0",
       "dependencies": {
         "@aws-sdk/core": "^3.974.8",
@@ -1137,49 +1143,20 @@
       }
     },
     "node_modules/@aws-sdk/nested-clients": {
-      "version": "3.997.6",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.997.6.tgz",
-      "integrity": "sha512-WBDnqatJl+kGObpfmfSxqnXeYTu3Me8wx8WCtvoxX3pfWrrTv8I4WTMSSs7PZqcRcVh8WeUKMgGFjMG+52SR1w==",
+      "version": "3.997.10",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.997.10.tgz",
+      "integrity": "sha512-FtQ/Bt327peZJuyo4WZSOLVUTw9ujRxntepiC7L65FxA2P82Xlq0g14T22BuqBUeMjDoxa9nvwiMHjLIfP3eUg==",
       "license": "Apache-2.0",
       "dependencies": {
         "@aws-crypto/sha256-browser": "5.2.0",
         "@aws-crypto/sha256-js": "5.2.0",
-        "@aws-sdk/core": "^3.974.8",
-        "@aws-sdk/middleware-host-header": "^3.972.10",
-        "@aws-sdk/middleware-logger": "^3.972.10",
-        "@aws-sdk/middleware-recursion-detection": "^3.972.11",
-        "@aws-sdk/middleware-user-agent": "^3.972.38",
-        "@aws-sdk/region-config-resolver": "^3.972.13",
-        "@aws-sdk/signature-v4-multi-region": "^3.996.25",
+        "@aws-sdk/core": "^3.974.12",
+        "@aws-sdk/signature-v4-multi-region": "^3.996.27",
         "@aws-sdk/types": "^3.973.8",
-        "@aws-sdk/util-endpoints": "^3.996.8",
-        "@aws-sdk/util-user-agent-browser": "^3.972.10",
-        "@aws-sdk/util-user-agent-node": "^3.973.24",
-        "@smithy/config-resolver": "^4.4.17",
-        "@smithy/core": "^3.23.17",
-        "@smithy/fetch-http-handler": "^5.3.17",
-        "@smithy/hash-node": "^4.2.14",
-        "@smithy/invalid-dependency": "^4.2.14",
-        "@smithy/middleware-content-length": "^4.2.14",
-        "@smithy/middleware-endpoint": "^4.4.32",
-        "@smithy/middleware-retry": "^4.5.7",
-        "@smithy/middleware-serde": "^4.2.20",
-        "@smithy/middleware-stack": "^4.2.14",
-        "@smithy/node-config-provider": "^4.3.14",
-        "@smithy/node-http-handler": "^4.6.1",
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/smithy-client": "^4.12.13",
+        "@smithy/core": "^3.24.2",
+        "@smithy/fetch-http-handler": "^5.4.2",
+        "@smithy/node-http-handler": "^4.7.2",
         "@smithy/types": "^4.14.1",
-        "@smithy/url-parser": "^4.2.14",
-        "@smithy/util-base64": "^4.3.2",
-        "@smithy/util-body-length-browser": "^4.2.2",
-        "@smithy/util-body-length-node": "^4.2.3",
-        "@smithy/util-defaults-mode-browser": "^4.3.49",
-        "@smithy/util-defaults-mode-node": "^4.2.54",
-        "@smithy/util-endpoints": "^3.4.2",
-        "@smithy/util-middleware": "^4.2.14",
-        "@smithy/util-retry": "^4.3.6",
-        "@smithy/util-utf8": "^4.2.2",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -1203,15 +1180,14 @@
       }
     },
     "node_modules/@aws-sdk/signature-v4-multi-region": {
-      "version": "3.996.25",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.25.tgz",
-      "integrity": "sha512-+CMIt3e1VzlklAECmG+DtP1sV8iKq25FuA0OKpnJ4KA0kxUtd7CgClY7/RU6VzJBQwbN4EJ9Ue6plvqx1qGadw==",
+      "version": "3.996.27",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.27.tgz",
+      "integrity": "sha512-0Phbz4t6HI3D3skxvG2uI+VWU034/nSIw1T8d+FPzzQG9EQTrw94o9mOKO2Gv3n3Oc8P7JD7RAUxkoneLWv5Eg==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@aws-sdk/middleware-sdk-s3": "^3.972.37",
         "@aws-sdk/types": "^3.973.8",
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/signature-v4": "^5.3.14",
+        "@smithy/core": "^3.24.2",
+        "@smithy/signature-v4": "^5.4.2",
         "@smithy/types": "^4.14.1",
         "tslib": "^2.6.2"
       },
@@ -1249,6 +1225,7 @@
     },
     "node_modules/@aws-sdk/util-arn-parser": {
       "version": "3.972.3",
+      "dev": true,
       "license": "Apache-2.0",
       "dependencies": {
         "tslib": "^2.6.2"
@@ -1332,14 +1309,14 @@
       }
     },
     "node_modules/@aws-sdk/xml-builder": {
-      "version": "3.972.22",
-      "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.22.tgz",
-      "integrity": "sha512-PMYKKtJd70IsSG0yHrdAbxBr+ZWBKLvzFZfD3/urxgf6hXVMzuU5M+3MJ5G67RpOmLBu1fAUN65SbWuKUCOlAA==",
+      "version": "3.972.24",
+      "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.24.tgz",
+      "integrity": "sha512-V8z5YcDPfsvzrBlj0xR1vhRtocblhYbqdreCJB/voGd4Sr5zjNAeWxexbnqVtskTJe0vFb5KMqbSL++ePl+zRw==",
       "license": "Apache-2.0",
       "dependencies": {
         "@nodable/entities": "2.1.0",
         "@smithy/types": "^4.14.1",
-        "fast-xml-parser": "5.7.2",
+        "fast-xml-parser": "5.7.3",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -3113,20 +3090,13 @@
       }
     },
     "node_modules/@smithy/core": {
-      "version": "3.23.17",
-      "resolved": "https://registry.npmjs.org/@smithy/core/-/core-3.23.17.tgz",
-      "integrity": "sha512-x7BlLbUFL8NWCGjMF9C+1N5cVCxcPa7g6Tv9B4A2luWx3be3oU8hQ96wIwxe/s7OhIzvoJH73HAUSg5JXVlEtQ==",
+      "version": "3.24.3",
+      "resolved": "https://registry.npmjs.org/@smithy/core/-/core-3.24.3.tgz",
+      "integrity": "sha512-Ep/7tPamGY8mgESE3LyLKtxJyy6U52WWAqr/3wial47Sj4u3PiIF73AOGI27UyLy9duTkhZbgzodOfLV4TduZg==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/types": "^4.14.1",
-        "@smithy/url-parser": "^4.2.14",
-        "@smithy/util-base64": "^4.3.2",
-        "@smithy/util-body-length-browser": "^4.2.2",
-        "@smithy/util-middleware": "^4.2.14",
-        "@smithy/util-stream": "^4.5.25",
-        "@smithy/util-utf8": "^4.2.2",
-        "@smithy/uuid": "^1.1.2",
+        "@aws-crypto/crc32": "5.2.0",
+        "@smithy/types": "^4.14.2",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -3134,13 +3104,13 @@
       }
     },
     "node_modules/@smithy/credential-provider-imds": {
-      "version": "4.2.14",
+      "version": "4.3.3",
+      "resolved": "https://registry.npmjs.org/@smithy/credential-provider-imds/-/credential-provider-imds-4.3.3.tgz",
+      "integrity": "sha512-I2Bti0DKFo2IJyN28ijCsx51BAumEYR4/1yZ1FXyBygy9MqbnMqCev4JPth/MbpRfBSRAX35hITSnAdJRo1u5w==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@smithy/node-config-provider": "^4.3.14",
-        "@smithy/property-provider": "^4.2.14",
-        "@smithy/types": "^4.14.1",
-        "@smithy/url-parser": "^4.2.14",
+        "@smithy/core": "^3.24.3",
+        "@smithy/types": "^4.14.2",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -3208,13 +3178,13 @@
       }
     },
     "node_modules/@smithy/fetch-http-handler": {
-      "version": "5.3.17",
+      "version": "5.4.3",
+      "resolved": "https://registry.npmjs.org/@smithy/fetch-http-handler/-/fetch-http-handler-5.4.3.tgz",
+      "integrity": "sha512-F+DRf8IJazRJgYog2A/yJK7eYVc0rqTlRzO+5ZxjJd4WkZoKz0IJRncf7G6t1pdVT3kryJcwuTFhN1c5m6N47A==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/querystring-builder": "^4.2.14",
-        "@smithy/types": "^4.14.1",
-        "@smithy/util-base64": "^4.3.2",
+        "@smithy/core": "^3.24.3",
+        "@smithy/types": "^4.14.2",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -3295,6 +3265,21 @@
         "node": ">=18.0.0"
       }
     },
+    "node_modules/@smithy/middleware-compression": {
+      "version": "4.4.3",
+      "resolved": "https://registry.npmjs.org/@smithy/middleware-compression/-/middleware-compression-4.4.3.tgz",
+      "integrity": "sha512-IuZ+ebi3OteVFprY33vV7oLfZxRx0YACjoGhex59PX7+sHgG0f75wyb5FZuOZhJoQPnWaDD5piirEwWzyAmb3A==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@smithy/core": "^3.24.3",
+        "@smithy/types": "^4.14.2",
+        "fflate": "0.8.1",
+        "tslib": "^2.6.2"
+      },
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
     "node_modules/@smithy/middleware-content-length": {
       "version": "4.2.14",
       "license": "Apache-2.0",
@@ -3387,14 +3372,13 @@
       }
     },
     "node_modules/@smithy/node-http-handler": {
-      "version": "4.6.1",
-      "resolved": "https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-4.6.1.tgz",
-      "integrity": "sha512-iB+orM4x3xrr57X3YaXazfKnntl0LHlZB1kcXSGzMV1Tt0+YwEjGlbjk/44qEGtBzXAz6yFDzkYTKSV6Pj2HUg==",
+      "version": "4.7.3",
+      "resolved": "https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-4.7.3.tgz",
+      "integrity": "sha512-/jPhevcTFPMVl6KNjbaI47iOg1zxC7IsnX4PQDGVZKMFceOXtB8IEYaB7a9VvkP/3oC60WzTeKocvSI7vLT0vA==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/querystring-builder": "^4.2.14",
-        "@smithy/types": "^4.14.1",
+        "@smithy/core": "^3.24.3",
+        "@smithy/types": "^4.14.2",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -3470,16 +3454,13 @@
       }
     },
     "node_modules/@smithy/signature-v4": {
-      "version": "5.3.14",
+      "version": "5.4.3",
+      "resolved": "https://registry.npmjs.org/@smithy/signature-v4/-/signature-v4-5.4.3.tgz",
+      "integrity": "sha512-53+75QuPl6DL+ct6vVEB51FDO5oulXr20TPV46VvJZg76lIlXNWfxi8j+G2V/t0I2qxCBOa3vX/8bmjrpFVo9g==",
       "license": "Apache-2.0",
       "dependencies": {
-        "@smithy/is-array-buffer": "^4.2.2",
-        "@smithy/protocol-http": "^5.3.14",
-        "@smithy/types": "^4.14.1",
-        "@smithy/util-hex-encoding": "^4.2.2",
-        "@smithy/util-middleware": "^4.2.14",
-        "@smithy/util-uri-escape": "^4.2.2",
-        "@smithy/util-utf8": "^4.2.2",
+        "@smithy/core": "^3.24.3",
+        "@smithy/types": "^4.14.2",
         "tslib": "^2.6.2"
       },
       "engines": {
@@ -3505,7 +3486,9 @@
       }
     },
     "node_modules/@smithy/types": {
-      "version": "4.14.1",
+      "version": "4.14.2",
+      "resolved": "https://registry.npmjs.org/@smithy/types/-/types-4.14.2.tgz",
+      "integrity": "sha512-P+otAxbV4CqBybp7EkcJCrig63yE2E7PuNVOmilVMRcx/O+QDzGULTrKsq4DV13gSfak9ObPrWaHl/9bL5YcWw==",
       "license": "Apache-2.0",
       "dependencies": {
         "tslib": "^2.6.2"
@@ -5842,9 +5825,9 @@
       "license": "BSD-3-Clause"
     },
     "node_modules/fast-xml-builder": {
-      "version": "1.1.7",
-      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.7.tgz",
-      "integrity": "sha512-Yh7/7rQuMXICNr0oMYDR2yHP6oUvmQsTToFeOWj/kIDhAwQ+c4Ol/lbcwOmEM5OHYQmh6S6EQSQ1sljCKP36bQ==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.2.0.tgz",
+      "integrity": "sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==",
       "funding": [
         {
           "type": "github",
@@ -5853,13 +5836,14 @@
       ],
       "license": "MIT",
       "dependencies": {
-        "path-expression-matcher": "^1.1.3"
+        "path-expression-matcher": "^1.5.0",
+        "xml-naming": "^0.1.0"
       }
     },
     "node_modules/fast-xml-parser": {
-      "version": "5.7.2",
-      "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.2.tgz",
-      "integrity": "sha512-P7oW7tLbYnhOLQk/Gv7cZgzgMPP/XN03K02/Jy6Y/NHzyIAIpxuZIM/YqAkfiXFPxA2CTm7NtCijK9EDu09u2w==",
+      "version": "5.7.3",
+      "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.3.tgz",
+      "integrity": "sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==",
       "funding": [
         {
           "type": "github",
@@ -5869,7 +5853,7 @@
       "license": "MIT",
       "dependencies": {
         "@nodable/entities": "^2.1.0",
-        "fast-xml-builder": "^1.1.5",
+        "fast-xml-builder": "^1.1.7",
         "path-expression-matcher": "^1.5.0",
         "strnum": "^2.2.3"
       },
@@ -5923,6 +5907,12 @@
         "node": "^12.20 || >= 14.13"
       }
     },
+    "node_modules/fflate": {
+      "version": "0.8.1",
+      "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.1.tgz",
+      "integrity": "sha512-/exOvEuc+/iaUm105QIiOt4LpBdMTWsXxqR0HDF35vx3fmaKzw7354gTilCh5rkzEt8WYyG//ku3h3nRmd7CHQ==",
+      "license": "MIT"
+    },
     "node_modules/file-entry-cache": {
       "version": "8.0.0",
       "dev": true,
@@ -7869,9 +7859,9 @@
       "license": "MIT"
     },
     "node_modules/strnum": {
-      "version": "2.2.3",
-      "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.2.3.tgz",
-      "integrity": "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg==",
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.3.0.tgz",
+      "integrity": "sha512-ums3KNd42PGyx5xaoVTO1mjU1bH3NpY4vsrVlnv9PNGqQj8wd7rJ6nEypLrJ7z5vxK5RP0yMLo6J/Gsm62DI5Q==",
       "funding": [
         {
           "type": "github",
@@ -8535,6 +8525,21 @@
         }
       }
     },
+    "node_modules/xml-naming": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz",
+      "integrity": "sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "engines": {
+        "node": ">=16.0.0"
+      }
+    },
     "node_modules/xtend": {
       "version": "4.0.2",
       "dev": true,
@@ -8595,8 +8600,10 @@
       "name": "@strands-agents/strandly",
       "version": "0.0.1",
       "dependencies": {
+        "@aws-sdk/client-cloudwatch": "^3",
         "commander": "^14",
-        "tsx": "^4.21.0"
+        "tsx": "^4.21.0",
+        "zod": "^3.23"
       },
       "bin": {
         "strandly": "src/cli.ts"
@@ -8606,6 +8613,15 @@
         "typescript": "^5.5.0"
       }
     },
+    "strandly/node_modules/zod": {
+      "version": "3.25.76",
+      "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
+      "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/colinhacks"
+      }
+    },
     "strands-dev": {
       "name": "@strands-agents/dev",
       "version": "0.0.1",
@@ -8653,6 +8669,7 @@
         "@opentelemetry/sdk-metrics": "^2.6.1",
         "@opentelemetry/sdk-trace-base": "^2.6.1",
         "@opentelemetry/sdk-trace-node": "^2.6.1",
+        "@smithy/types": "^4.0.0",
         "@types/express": "^5.0.6",
         "@types/node": "^25.6.0",
         "@types/uuid": "^11.0.0",
@@ -8688,6 +8705,7 @@
         "@opentelemetry/sdk-metrics": "^2.6.1",
         "@opentelemetry/sdk-trace-base": "^2.6.1",
         "@opentelemetry/sdk-trace-node": "^2.6.1",
+        "@smithy/types": "^4.0.0",
         "express": "^5.1.0",
         "openai": "^6.7.0",
         "zod": "^4.1.12"
@@ -8729,6 +8747,9 @@
         "@opentelemetry/sdk-trace-node": {
           "optional": true
         },
+        "@smithy/types": {
+          "optional": true
+        },
         "express": {
           "optional": true
         },
diff --git a/strandly/package.json b/strandly/package.json
index 27baf2160..9c9b4f0c8 100644
--- a/strandly/package.json
+++ b/strandly/package.json
@@ -10,8 +10,10 @@
     "type-check": "tsc --noEmit"
   },
   "dependencies": {
+    "@aws-sdk/client-cloudwatch": "^3",
     "commander": "^14",
-    "tsx": "^4.21.0"
+    "tsx": "^4.21.0",
+    "zod": "^3.23"
   },
   "devDependencies": {
     "@types/node": "^22",
diff --git a/strandly/src/benchmark/README.md b/strandly/src/benchmark/README.md
new file mode 100644
index 000000000..6e5b9bd32
--- /dev/null
+++ b/strandly/src/benchmark/README.md
@@ -0,0 +1,158 @@
+# strandly benchmark
+
+Benchmarks Strands agents against [ContextBench](https://github.com/EuniAI/ContextBench) — a code investigation benchmark that measures how well an agent finds relevant code for GitHub issues.
+
+## Quick Start
+
+```bash
+# Run all built-in configs on the default task
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench
+
+# Run a specific built-in config
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --config offloader
+
+# Run with a custom agent file
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --agent-file ./my-agent.ts
+
+# Use a different model
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --model us.anthropic.claude-haiku-4-5-20251001-v1:0
+
+# Fail if file coverage drops below 10%
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --min-coverage 0.1
+
+# Save results to files
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --output results.json --output-md results.md
+
+# Emit metrics to CloudWatch
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --cloudwatch
+```
+
+## Prerequisites
+
+- **Node.js 20+**
+- **Python 3.x** with `pyarrow`, `pandas`, `tree-sitter`, `tree-sitter-languages`
+- **AWS credentials** configured (for Bedrock model access and optional CloudWatch)
+- **`AWS_REGION`** set to a region with Bedrock access (e.g. `us-east-1`)
+
+Install Python deps:
+```bash
+pip install pyarrow pandas tree-sitter 'tree-sitter-languages; python_version < "3.12"'
+```
+
+## Options
+
+| Flag | Description |
+|------|-------------|
+| `--suite <name>` | **(required)** Benchmark suite. Currently: `contextbench` |
+| `--config <name>` | Run only this built-in config |
+| `--agent-file <path>` | Path to a `.ts` file exporting a custom `BenchmarkConfig` |
+| `--task <id>` | ContextBench task ID (default: `django__django-15987`) |
+| `--model <id>` | Model ID for built-in configs (default: `us.anthropic.claude-sonnet-4-20250514-v1:0`) |
+| `--min-coverage <n>` | Minimum file coverage (0-1). Exit 1 if below. |
+| `--output <path>` | Write JSON results to file |
+| `--output-md <path>` | Write markdown summary to file |
+| `--cloudwatch` | Emit metrics to AWS CloudWatch |
+
+## Built-in Configs
+
+These will be updated once we have preset context management strategies.
+
+| Name | Strategy | Description |
+|------|----------|-------------|
+| `control` | SlidingWindow ws=40 | SDK default, no extras |
+| `offloader` | ContextOffloader | Offloads tool results >2500 tokens, keeps 1000 token preview |
+| `offloader-aggressive` | ContextOffloader | Offloads >500 tokens, keeps 200 token preview |
+| `summarizing` | SummarizingConversationManager | Summarizes oldest 30% of messages, proactive at 70% context |
+| `sliding-proactive` | SlidingWindow + proactive | Same ws=40 but proactively compresses at 70% context usage |
+| `offloader-summarizing` | Offloader + Summarizing | Combined: offload large results + summarize old messages |
+
+## Custom Agent File
+
+Create a `.ts` file that exports a `BenchmarkConfig`:
+
+```typescript
+import { Agent } from './strands-ts/src/agent/agent.js'
+import { BedrockModel } from './strands-ts/src/models/bedrock.js'
+import { bash } from './strands-ts/src/vended-tools/bash/bash.js'
+import { ContextOffloader } from './strands-ts/src/vended-plugins/context-offloader/plugin.js'
+import { InMemoryStorage } from './strands-ts/src/vended-plugins/context-offloader/storage.js'
+import type { BenchmarkConfig } from './strandly/src/benchmark/types.js'
+
+const config: BenchmarkConfig = {
+  name: 'my-experiment',
+  description: 'Testing new offloading thresholds',
+  createAgent(task) {
+    return new Agent({
+      model: new BedrockModel({ stream: false }),
+      tools: [bash],
+      plugins: [new ContextOffloader({ storage: new InMemoryStorage(), maxResultTokens: 1000, previewTokens: 500 })],
+      systemPrompt: task.prompt,
+      printer: false,
+    })
+  },
+}
+
+export default config
+```
+
+Run it:
+```bash
+AWS_REGION=us-east-1 strandly benchmark --suite contextbench --agent-file ./my-experiment.ts
+```
+
+## Metrics
+
+Each run produces:
+
+| Metric | What it measures |
+|--------|-----------------|
+| **File Coverage** | Fraction of gold files the agent found (recall) |
+| **File Precision** | Fraction of files the agent read that were relevant |
+| **Symbol Coverage/Precision** | Same at function/class granularity |
+| **Span Coverage/Precision** | Same at line-range granularity |
+| **EditLoc Recall/Precision** | Did the agent find the exact edit locations? |
+| **Input Tokens** | Total tokens consumed (cost proxy) |
+| **Cycles** | Number of agent loop iterations |
+| **Latency** | Wall-clock time |
+
+## CloudWatch
+
+Metrics go to namespace `StrandsSDK/Benchmarks` with dimensions `Config`, `Task`, `Branch`:
+
+- `FileCoverage`
+- `FilePrecision`
+- `TokenUsage`
+- `CycleCount`
+- `Latency`
+
+Requires AWS credentials with `cloudwatch:PutMetricData` permission.
+
+## How It Works
+
+1. Clones the ContextBench repo (cached at `.cache/contextbench/`)
+2. Loads a task from their gold parquet files (issue + gold file/span annotations)
+3. Clones the target repo at the correct commit
+4. Creates a Strands agent with the selected config
+5. Runs the agent — it uses `bash` to explore the repo and find relevant code
+6. Extracts which files the agent read from its tool call history
+7. Evaluates against ContextBench gold annotations (Python subprocess)
+8. Reports results
+
+## Runtime
+
+~5-10 minutes per config per task. Running all 6 built-in configs takes ~50 minutes.
+
+## Adding New Benchmark Suites
+
+The `--suite` flag supports multiple benchmarks. To add a new one, implement the `BenchmarkSuite` interface and register it in `index.ts`:
+
+```typescript
+import type { BenchmarkSuite } from './types.js'
+
+const myBench: BenchmarkSuite = {
+  name: 'mybench',
+  async run(opts) {
+    // Load tasks, run agent, evaluate, return results
+  },
+}
+```
diff --git a/strandly/src/benchmark/cloudwatch.ts b/strandly/src/benchmark/cloudwatch.ts
new file mode 100644
index 000000000..4affb4717
--- /dev/null
+++ b/strandly/src/benchmark/cloudwatch.ts
@@ -0,0 +1,76 @@
+import type { BenchmarkSuiteResult } from './types.js'
+
+const NAMESPACE = 'StrandsSDK/Benchmarks'
+
+export async function emitMetrics(result: BenchmarkSuiteResult): Promise<void> {
+  const { CloudWatchClient, PutMetricDataCommand } = await import('@aws-sdk/client-cloudwatch')
+  const client = new CloudWatchClient({})
+
+  const metricData = result.results
+    .filter((r) => !r.error)
+    .flatMap((r) => [
+      {
+        MetricName: 'FileCoverage',
+        Value: r.evaluation.fileCoverage,
+        Unit: 'None' as const,
+        Dimensions: [
+          { Name: 'Config', Value: r.config },
+          { Name: 'Task', Value: r.task },
+          { Name: 'Branch', Value: result.branch },
+        ],
+      },
+      {
+        MetricName: 'FilePrecision',
+        Value: r.evaluation.filePrecision,
+        Unit: 'None' as const,
+        Dimensions: [
+          { Name: 'Config', Value: r.config },
+          { Name: 'Task', Value: r.task },
+          { Name: 'Branch', Value: result.branch },
+        ],
+      },
+      {
+        MetricName: 'TokenUsage',
+        Value: r.metrics.inputTokens + r.metrics.outputTokens,
+        Unit: 'Count' as const,
+        Dimensions: [
+          { Name: 'Config', Value: r.config },
+          { Name: 'Task', Value: r.task },
+          { Name: 'Branch', Value: result.branch },
+        ],
+      },
+      {
+        MetricName: 'CycleCount',
+        Value: r.metrics.cycleCount,
+        Unit: 'Count' as const,
+        Dimensions: [
+          { Name: 'Config', Value: r.config },
+          { Name: 'Task', Value: r.task },
+          { Name: 'Branch', Value: result.branch },
+        ],
+      },
+      {
+        MetricName: 'Latency',
+        Value: r.metrics.latencyMs,
+        Unit: 'Milliseconds' as const,
+        Dimensions: [
+          { Name: 'Config', Value: r.config },
+          { Name: 'Task', Value: r.task },
+          { Name: 'Branch', Value: result.branch },
+        ],
+      },
+    ])
+
+  // CloudWatch accepts max 1000 metric data points per request
+  for (let i = 0; i < metricData.length; i += 1000) {
+    const batch = metricData.slice(i, i + 1000)
+    await client.send(
+      new PutMetricDataCommand({
+        Namespace: NAMESPACE,
+        MetricData: batch,
+      })
+    )
+  }
+
+  console.log(`Emitted ${metricData.length} metrics to CloudWatch namespace: ${NAMESPACE}`)
+}
diff --git a/strandly/src/benchmark/configs.ts b/strandly/src/benchmark/configs.ts
new file mode 100644
index 000000000..d1ff9e438
--- /dev/null
+++ b/strandly/src/benchmark/configs.ts
@@ -0,0 +1,105 @@
+import { Agent } from '../../../strands-ts/src/agent/agent.js'
+import { BedrockModel } from '../../../strands-ts/src/models/bedrock.js'
+import { bash } from '../../../strands-ts/src/vended-tools/bash/bash.js'
+import { SlidingWindowConversationManager } from '../../../strands-ts/src/conversation-manager/sliding-window-conversation-manager.js'
+import { SummarizingConversationManager } from '../../../strands-ts/src/conversation-manager/summarizing-conversation-manager.js'
+import { ContextOffloader } from '../../../strands-ts/src/vended-plugins/context-offloader/plugin.js'
+import { InMemoryStorage } from '../../../strands-ts/src/vended-plugins/context-offloader/storage.js'
+import type { BenchmarkConfig, ContextBenchTask } from './types.js'
+
+const DEFAULT_MODEL = 'us.anthropic.claude-sonnet-4-20250514-v1:0'
+
+// TODO: Update these configs once we have preset context management strategies
+export function getConfigs(modelId?: string): BenchmarkConfig[] {
+  const model = modelId ?? DEFAULT_MODEL
+
+  return [
+    {
+      name: 'control',
+      description: `SDK default (SlidingWindow ws=40, ${model})`,
+      createAgent(task: ContextBenchTask): Agent {
+        return new Agent({
+          model: new BedrockModel({ modelId: model, stream: false }),
+          tools: [bash],
+          systemPrompt: task.prompt,
+          printer: false,
+        })
+      },
+    },
+    {
+      name: 'offloader',
+      description: `Context offloading (maxResult=2500, preview=1000, ${model})`,
+      createAgent(task: ContextBenchTask): Agent {
+        return new Agent({
+          model: new BedrockModel({ modelId: model, stream: false }),
+          tools: [bash],
+          plugins: [new ContextOffloader({ storage: new InMemoryStorage() })],
+          systemPrompt: task.prompt,
+          printer: false,
+        })
+      },
+    },
+    {
+      name: 'offloader-aggressive',
+      description: `Aggressive offloading (maxResult=500, preview=200, ${model})`,
+      createAgent(task: ContextBenchTask): Agent {
+        return new Agent({
+          model: new BedrockModel({ modelId: model, stream: false }),
+          tools: [bash],
+          plugins: [new ContextOffloader({ storage: new InMemoryStorage(), maxResultTokens: 500, previewTokens: 200 })],
+          systemPrompt: task.prompt,
+          printer: false,
+        })
+      },
+    },
+    {
+      name: 'summarizing',
+      description: `Summarizing conversation manager (ratio=0.3, proactive, ${model})`,
+      createAgent(task: ContextBenchTask): Agent {
+        return new Agent({
+          model: new BedrockModel({ modelId: model, stream: false }),
+          tools: [bash],
+          conversationManager: new SummarizingConversationManager({
+            summaryRatio: 0.3,
+            proactiveCompression: true,
+          }),
+          systemPrompt: task.prompt,
+          printer: false,
+        })
+      },
+    },
+    {
+      name: 'sliding-proactive',
+      description: `Sliding window (ws=40) with proactive compression (${model})`,
+      createAgent(task: ContextBenchTask): Agent {
+        return new Agent({
+          model: new BedrockModel({ modelId: model, stream: false }),
+          tools: [bash],
+          conversationManager: new SlidingWindowConversationManager({
+            windowSize: 40,
+            proactiveCompression: true,
+          }),
+          systemPrompt: task.prompt,
+          printer: false,
+        })
+      },
+    },
+    {
+      name: 'offloader-summarizing',
+      description: `Offloading + summarizing combined (${model})`,
+      createAgent(task: ContextBenchTask): Agent {
+        return new Agent({
+          model: new BedrockModel({ modelId: model, stream: false }),
+          tools: [bash],
+          plugins: [new ContextOffloader({ storage: new InMemoryStorage() })],
+          conversationManager: new SummarizingConversationManager({
+            summaryRatio: 0.3,
+            proactiveCompression: true,
+          }),
+          systemPrompt: task.prompt,
+          printer: false,
+        })
+      },
+    },
+  ]
+}
diff --git a/strandly/src/benchmark/contextbench/loader.ts b/strandly/src/benchmark/contextbench/loader.ts
new file mode 100644
index 000000000..0fa14d529
--- /dev/null
+++ b/strandly/src/benchmark/contextbench/loader.ts
@@ -0,0 +1,176 @@
+import { execSync } from 'node:child_process'
+import { existsSync, writeFileSync, mkdirSync } from 'node:fs'
+import { join, resolve } from 'node:path'
+import type { ContextBenchTask, GoldAnnotation } from '../types.js'
+
+const CACHE_DIR = resolve(import.meta.dirname, '../../../../.cache/contextbench')
+const CONTEXTBENCH_REPO = 'https://github.com/EuniAI/ContextBench.git'
+
+// Returns the path of the local ContextBench checkout, shallow-cloning it into CACHE_DIR on first use.
+export function ensureContextBenchCloned(): string {
+  const repoDir = join(CACHE_DIR, 'contextbench-repo')
+  if (!existsSync(join(repoDir, '.git'))) {
+    mkdirSync(CACHE_DIR, { recursive: true })
+    console.log('Cloning ContextBench repository...')
+    execSync(`git clone --depth 1 ${CONTEXTBENCH_REPO} "${repoDir}"`, { stdio: 'inherit' }) // path may contain spaces
+  }
+
+  return repoDir
+}
+
+// Exits the process with code 1 (after printing install instructions) unless python3 can import
+// the modules used here; pandas is probed because loadTask's script calls Table.to_pandas().
+export function ensureDependencies(): void {
+  try {
+    execSync('python3 -c "import pyarrow; import pandas; import tree_sitter"', { stdio: 'pipe' })
+  } catch {
+    console.error('Missing Python dependencies for ContextBench evaluation.')
+    console.error('Install with: pip install pyarrow pandas tree-sitter tree-sitter-languages datasets')
+    process.exit(1)
+  }
+}
+
+// Loads one ContextBench task from the gold parquet by running a generated python3 script.
+// Throws when the parquet is missing or no row matches (exact original id, then literal substring).
+export function loadTask(taskId: string): ContextBenchTask {
+  const contextbenchDir = ensureContextBenchCloned()
+  const goldParquet = join(contextbenchDir, 'data', 'contextbench_verified.parquet')
+
+  if (!existsSync(goldParquet)) {
+    throw new Error(`Gold data not found at ${goldParquet}`)
+  }
+
+  const tmp = join(CACHE_DIR, 'tmp')
+  mkdirSync(tmp, { recursive: true })
+
+  const scriptFile = join(tmp, 'load_task.py')
+  const parquetPath = JSON.stringify(goldParquet)
+  const taskIdStr = JSON.stringify(taskId)
+  writeFileSync(
+    scriptFile,
+`import pyarrow.parquet as pq, json, sys
+
+df = pq.read_table(${parquetPath}).to_pandas()
+task_id = ${taskIdStr}
+
+row = df[df["original_inst_id"] == task_id]
+if row.empty:
+    row = df[df["instance_id"].str.contains(task_id, regex=False)]
+if row.empty:
+    print(json.dumps({"error": "Task not found: " + task_id}))
+    sys.exit(0)
+
+r = row.iloc[0]
+print(json.dumps({
+    "instance_id": str(r["instance_id"]),
+    "original_inst_id": str(r["original_inst_id"]),
+    "repo": str(r["repo"]),
+    "repo_url": str(r["repo_url"]),
+    "base_commit": str(r["base_commit"]),
+    "problem_statement": str(r["problem_statement"]),
+    "gold_context": str(r["gold_context"]),
+}))
+`)
+
+  const output = execSync(`python3 "${scriptFile}"`, { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] })
+  const data = JSON.parse(output.trim())
+
+  if (data.error) {
+    throw new Error(data.error)
+  }
+
+  const goldContext: Array<{ file: string; start_line?: number; end_line?: number }> = JSON.parse(data.gold_context)
+
+  return {
+    id: data.original_inst_id,
+    repo: data.repo,
+    issue: extractIssueNumber(data.original_inst_id),
+    baseCommit: data.base_commit,
+    prompt: buildPrompt(data.problem_statement, data.repo),
+    goldAnnotations: parseGoldContext(goldContext),
+  }
+}
+
+// Ensures the task's GitHub repo is cloned under CACHE_DIR/repos and checked out at task.baseCommit.
+// Throws if the commit is not hexadecimal or the repo is not of the form `owner/name`; returns the checkout path.
+export function ensureRepoCloned(task: ContextBenchTask): string {
+  if (!/^[0-9a-f]+$/i.test(task.baseCommit)) {
+    throw new Error(`Invalid base commit: ${task.baseCommit}`)
+  }
+  if (!/^[a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+$/.test(task.repo)) {
+    throw new Error(`Invalid repo format: ${task.repo}`)
+  }
+
+  const repoDir = join(CACHE_DIR, 'repos', task.repo.replace('/', '__'))
+
+  if (!existsSync(join(repoDir, '.git'))) {
+    mkdirSync(join(CACHE_DIR, 'repos'), { recursive: true })
+    console.log(`  Cloning ${task.repo}...`)
+    execSync(`git clone --depth 100 https://github.com/${task.repo}.git "${repoDir}"`, { stdio: 'inherit' })
+  }
+
+  // Braces make fetch+checkout the fallback; a bare `A || B && C` parses as `(A || B) && C` in sh.
+  const sha = task.baseCommit
+  const cmd = `git checkout ${sha} 2>/dev/null || { git fetch --depth 100 origin ${sha} && git checkout ${sha}; }`
+  execSync(cmd, { cwd: repoDir, stdio: 'pipe' })
+
+  return repoDir
+}
+
+// Returns the first 20 `original_inst_id` values from the gold parquet (the script slices ids[:20]).
+export function listTasks(): string[] {
+  const contextbenchDir = ensureContextBenchCloned()
+  const goldParquet = join(contextbenchDir, 'data', 'contextbench_verified.parquet')
+  const tmp = join(CACHE_DIR, 'tmp')
+  mkdirSync(tmp, { recursive: true })
+
+  const scriptFile = join(tmp, 'list_tasks.py')
+  const parquetPath = JSON.stringify(goldParquet)
+  writeFileSync(
+    scriptFile,
+`import pyarrow.parquet as pq, json
+
+t = pq.read_table(${parquetPath}, columns=["original_inst_id"])
+ids = t.column("original_inst_id").to_pylist()
+print(json.dumps(ids[:20]))
+`)
+
+  const output = execSync(`python3 "${scriptFile}"`, { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] })
+  return JSON.parse(output.trim())
+}
+
+function buildPrompt(problemStatement: string, repo: string): string {
+  return `You are a code investigation agent. Your task is to find all relevant code locations for the following GitHub issue in the ${repo} repository.
+
+## Issue
+
+${problemStatement}
+
+Investigate the repository to find all files, functions, and code spans relevant to this issue. Use the available tools to read files and search the codebase. Be thorough — find all relevant locations, not just the first match.
+
+When you are done, list all the files and specific line ranges you found to be relevant.`
+}
+
+// Groups gold entries by file, keeping first-seen file order. Every file gets an annotation;
+// an entry contributes a span only when both of its line bounds are present.
+function parseGoldContext(
+  goldContext: Array<{ file: string; start_line?: number; end_line?: number }>
+): GoldAnnotation[] {
+  const grouped = new Map<string, GoldAnnotation>()
+
+  for (const { file, start_line: startLine, end_line: endLine } of goldContext) {
+    if (!grouped.has(file)) {
+      grouped.set(file, { file, spans: [] })
+    }
+    if (startLine == null || endLine == null) continue
+    const annotation = grouped.get(file)!
+    annotation.spans!.push({ startLine, endLine })
+  }
+
+  return Array.from(grouped.values())
+}
+
+function extractIssueNumber(instanceId: string): number {
+  const digits = /-(\d+)$/.exec(instanceId)?.[1] // trailing "-<digits>" of the instance id, if any
+  return digits === undefined ? 0 : Number.parseInt(digits, 10)
+}
diff --git a/strandly/src/benchmark/contextbench/trajectory.ts b/strandly/src/benchmark/contextbench/trajectory.ts
new file mode 100644
index 000000000..b12a614a1
--- /dev/null
+++ b/strandly/src/benchmark/contextbench/trajectory.ts
@@ -0,0 +1,81 @@
+import type { Message } from '../../../../strands-ts/src/types/messages.js'
+
+export interface TrajectoryEntry {
+  file: string
+  content?: string
+  startLine?: number
+  endLine?: number
+}
+
+// Lists the files referenced by assistant tool-use blocks, de-duplicated in first-seen order.
+export function extractTrajectory(messages: Message[], repoDir?: string): TrajectoryEntry[] {
+  const visited = new Set<string>()
+  const trajectory: TrajectoryEntry[] = []
+
+  for (const message of messages) {
+    if (message.role !== 'assistant') continue
+
+    for (const block of message.content) {
+      if (block.type !== 'toolUseBlock') continue
+
+      const toolInput = block.input as Record<string, unknown>
+      for (const rawPath of extractFilePathsFromToolCall(block.name, toolInput)) {
+        const file = toRelativePath(rawPath, repoDir)
+        if (!file || visited.has(file)) continue
+        visited.add(file)
+        trajectory.push({ file })
+      }
+    }
+  }
+
+  return trajectory
+}
+
+export function trajectoryToFileList(entries: TrajectoryEntry[]): string[] {
+  return Array.from(entries, ({ file }) => file) // one path per entry, order preserved
+}
+
+// Makes `filePath` repo-relative when it can: strips a leading `<repoDir>/` or `./`.
+// Any other path (including absolute paths outside repoDir) is returned unchanged.
+function toRelativePath(filePath: string, repoDir?: string): string {
+  if (!repoDir) return filePath
+
+  const root = repoDir.endsWith('/') ? repoDir : repoDir + '/'
+  if (filePath.startsWith(root)) return filePath.slice(root.length)
+  // `./a.py` and `a.py` name the same file; normalising lets extractTrajectory's
+  // de-duplication treat them as one entry.
+  if (filePath.startsWith('./')) return filePath.slice(2)
+  return filePath
+}
+
+// Best-effort list of file paths a tool call reads: file_editor views, or bash cat/head/tail/less/more,
+// sed and python open()/read() invocations. Returns [] for other tools or when nothing matches.
+function extractFilePathsFromToolCall(toolName: string, input: Record<string, unknown>): string[] {
+  if (toolName === 'file_editor' || toolName === 'fileEditor') {
+    const path = (input.path ?? input.file_path ?? input.filePath) as string | undefined
+    if (path && (input.command === 'view' || !input.command)) return [path]
+  }
+
+  if (toolName === 'bash') {
+    const cmd = (input.command ?? input.cmd) as string | undefined
+    if (!cmd) return []
+
+    const paths: string[] = []
+    // Skip option tokens (and a numeric option argument) so `head -n 20 f.py` yields f.py, not `-n`.
+    const readCmd = /\b(?:cat|head|tail|less|more)\s+(?:-\S+\s+(?:\+?\d+\s+)?)*([^\s|;>&]+)/g
+    for (const m of cmd.matchAll(readCmd)) {
+      const file = m[1]!
+      // A dangling option or bare count (e.g. `| head -n 5`) is not a file.
+      if (!file.startsWith('-') && !/^\+?\d+$/.test(file)) paths.push(file)
+    }
+
+    const sedMatch = cmd.match(/sed\s+.*\s+([^\s|;>&]+\.\w+)/)
+    if (sedMatch) paths.push(sedMatch[1]!)
+
+    const pythonCat = cmd.match(/python.*(?:open|read)\s*\(\s*['"]([^'"]+)['"]\s*\)/)
+    if (pythonCat) paths.push(pythonCat[1]!)
+
+    return paths
+  }
+
+  return []
+}
diff --git a/strandly/src/benchmark/evaluator.ts b/strandly/src/benchmark/evaluator.ts
new file mode 100644
index 000000000..93ccb5bda
--- /dev/null
+++ b/strandly/src/benchmark/evaluator.ts
@@ -0,0 +1,111 @@
+import { execSync } from 'node:child_process'
+import { writeFileSync, mkdtempSync, rmSync } from 'node:fs'
+import { join } from 'node:path'
+import { tmpdir } from 'node:os'
+import type { ContextBenchTask, EvaluationMetrics } from './types.js'
+import { ensureContextBenchCloned } from './contextbench/loader.js'
+
+// Scores the agent's files/spans against gold via ContextBench's evaluate_instance (python3 subprocess).
+export async function evaluate(
+  task: ContextBenchTask,
+  filesRead: string[],
+  spans?: Record<string, Array<{ start: number; end: number }>>
+): Promise<EvaluationMetrics> {
+  const contextbenchDir = ensureContextBenchCloned()
+  const goldParquet = join(contextbenchDir, 'data', 'contextbench_verified.parquet')
+  const tmp = mkdtempSync(join(tmpdir(), 'bench-eval-'))
+
+  const predData = {
+    instance_id: task.id,
+    original_inst_id: task.id,
+    repo_url: `https://github.com/${task.repo}.git`,
+    commit: task.baseCommit,
+    traj_data: {
+      pred_steps: [{ files: filesRead, spans: spans ?? {}, symbols: {} }],
+      pred_files: filesRead,
+      pred_spans: spans ?? {},
+      pred_symbols: {},
+    },
+  }
+
+  const predFile = join(tmp, 'pred.jsonl')
+  writeFileSync(predFile, JSON.stringify(predData) + '\n')
+
+  const scriptFile = join(tmp, 'evaluate.py')
+  const cbDir = JSON.stringify(contextbenchDir)
+  const goldPath = JSON.stringify(goldParquet)
+  const predPath = JSON.stringify(predFile)
+  const reposDir = JSON.stringify(join(tmp, 'repos'))
+  writeFileSync(
+    scriptFile,
+`import sys, os, io, json
+
+sys.path.insert(0, ${cbDir})
+os.environ["PYTHONDONTWRITEBYTECODE"] = "1"
+
+_real_stdout = sys.stdout
+sys.stdout = io.StringIO()
+sys.stderr = open(os.devnull, "w")
+
+from contextbench.evaluate import evaluate_instance
+from contextbench.parsers import GoldLoader
+
+gold_loader = GoldLoader(${goldPath})
+pred_data = json.loads(open(${predPath}).readline())
+instance_id = pred_data["instance_id"]
+original_id = pred_data.get("original_inst_id", instance_id)
+
+gold = gold_loader.get(instance_id) or gold_loader.get(original_id)
+if not gold:
+    sys.stdout = _real_stdout
+    print(json.dumps({"error": "no_gold_found"}))
+    sys.exit(0)
+
+result = evaluate_instance(instance_id, gold, pred_data, ${reposDir})
+sys.stdout = _real_stdout
+print(json.dumps(result, default=str))
+`)
+
+  let output: string
+  try {
+    output = execSync(`python3 "${scriptFile}"`, { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 120_000 })
+  } finally {
+    // Nothing below reads tmp; it also holds the `repos` dir handed to evaluate_instance, so remove it all.
+    rmSync(tmp, { recursive: true, force: true })
+  }
+  const result = JSON.parse(output.trim())
+  if (result.error) {
+    console.warn(`  Evaluation error: ${result.error}`)
+    return emptyMetrics()
+  }
+  return extractMetrics(result)
+}
+
+function extractMetrics(result: Record<string, unknown>): EvaluationMetrics {
+  const levels = (result.final ?? {}) as Record<string, Record<string, number>>
+  const editLocation = (result.editloc ?? {}) as Record<string, number>
+  const score = (level: string, key: string): number => levels[level]?.[key] ?? 0 // absent scores count as 0
+  return {
+    fileCoverage: score('file', 'coverage'),
+    filePrecision: score('file', 'precision'),
+    symbolCoverage: score('symbol', 'coverage'),
+    symbolPrecision: score('symbol', 'precision'),
+    spanCoverage: score('span', 'coverage'),
+    spanPrecision: score('span', 'precision'),
+    editLocRecall: editLocation.recall ?? 0,
+    editLocPrecision: editLocation.precision ?? 0,
+  }
+}
+
+function emptyMetrics(): EvaluationMetrics {
+  return {
+    fileCoverage: 0,
+    filePrecision: 0,
+    symbolCoverage: 0,
+    symbolPrecision: 0,
+    spanCoverage: 0,
+    spanPrecision: 0,
+    editLocRecall: 0,
+    editLocPrecision: 0,
+  }
+}
diff --git a/strandly/src/benchmark/index.ts b/strandly/src/benchmark/index.ts
new file mode 100644
index 000000000..5bb4f21a8
--- /dev/null
+++ b/strandly/src/benchmark/index.ts
@@ -0,0 +1,145 @@
+import { execSync } from 'node:child_process'
+import { resolve } from 'node:path'
+import { pathToFileURL } from 'node:url'
+import type { BenchmarkConfig, BenchmarkRunOpts, BenchmarkSuite, BenchmarkSuiteResult } from './types.js'
+import { getConfigs } from './configs.js'
+import { loadTask, ensureDependencies } from './contextbench/loader.js'
+import { runBenchmark } from './runner.js'
+import { writeResults, generateMarkdown } from './reporter.js'
+import { emitMetrics } from './cloudwatch.js'
+
+const ROOT = resolve(import.meta.dirname, '../../..')
+
+const DEFAULT_TASK = 'django__django-15987'
+
+async function loadCustomConfig(agentFile: string): Promise<BenchmarkConfig> {
+  const absPath = resolve(agentFile)
+  const module = (await import(pathToFileURL(absPath).href)) as { default?: BenchmarkConfig; config?: BenchmarkConfig }
+  const config = module.default ?? module.config
+  if (!config || typeof config.createAgent !== 'function') {
+    throw new Error(
+      `Agent file must export a BenchmarkConfig (with name, description, createAgent). Got: ${Object.keys(module).join(', ')}`
+    )
+  }
+  return config
+}
+
+const contextbench: BenchmarkSuite = {
+  name: 'contextbench',
+  async run(opts: BenchmarkRunOpts): Promise<BenchmarkSuiteResult> {
+    ensureDependencies()
+
+    const taskId = opts.task ?? DEFAULT_TASK
+    console.log(`Loading task: ${taskId}`)
+    const task = loadTask(taskId)
+    console.log(`  Repo: ${task.repo}, commit: ${task.baseCommit.slice(0, 12)}`)
+
+    const configs = getConfigs(opts.model)
+    let selectedConfigs: BenchmarkConfig[]
+
+    if (opts.agentFile) {
+      const custom = await loadCustomConfig(opts.agentFile)
+      console.log(`Using custom agent: ${custom.name}`)
+      selectedConfigs = [custom]
+    } else if (opts.config) {
+      selectedConfigs = configs.filter((c) => c.name === opts.config)
+    } else {
+      selectedConfigs = configs
+    }
+
+    if (selectedConfigs.length === 0) {
+      const available = configs.map((c) => c.name).join(', ')
+      throw new Error(`Unknown config "${opts.config}". Available: ${available}`)
+    }
+
+    const results = []
+    for (const config of selectedConfigs) {
+      console.log(`\nRunning config: ${config.name} (${config.description})`)
+      const result = await runBenchmark(config, task)
+      results.push(result)
+
+      if (result.error) {
+        console.log(`  ✗ ${config.name}: ERROR — ${result.error}`)
+      } else {
+        console.log(
+          `  ✓ ${config.name}: coverage=${(result.evaluation.fileCoverage * 100).toFixed(0)}% ` +
+            `precision=${(result.evaluation.filePrecision * 100).toFixed(1)}% ` +
+            `tokens=${(result.metrics.inputTokens / 1000).toFixed(0)}K ` +
+            `cycles=${result.metrics.cycleCount}`
+        )
+      }
+    }
+
+    const gitSha = execSync('git rev-parse HEAD', { cwd: ROOT, encoding: 'utf-8' }).trim()
+    const branch = execSync('git rev-parse --abbrev-ref HEAD', { cwd: ROOT, encoding: 'utf-8' }).trim()
+
+    return {
+      suite: 'contextbench',
+      timestamp: new Date().toISOString(),
+      gitSha,
+      branch,
+      results,
+    }
+  },
+}
+
+const suites: Record<string, BenchmarkSuite> = { contextbench }
+
+export interface BenchmarkOpts {
+  suite: string
+  config?: string
+  agentFile?: string
+  task?: string
+  model?: string
+  minCoverage?: number
+  output?: string
+  outputMd?: string
+  cloudwatch?: boolean
+}
+
+export async function benchmark(opts: BenchmarkOpts): Promise<void> {
+  const suite = suites[opts.suite]
+  if (!suite) {
+    const available = Object.keys(suites).join(', ')
+    console.error(`Unknown benchmark suite: "${opts.suite}". Available: ${available}`)
+    process.exit(1)
+  }
+
+  console.log(`\nRunning benchmark suite: ${suite.name}\n`)
+
+  const result = await suite.run({ config: opts.config, agentFile: opts.agentFile, task: opts.task, model: opts.model })
+
+  writeResults(result, { output: opts.output, outputMd: opts.outputMd })
+
+  if (!opts.output && !opts.outputMd) {
+    console.log('\n' + generateMarkdown(result))
+  }
+
+  if (opts.cloudwatch) {
+    await emitMetrics(result)
+  }
+
+  const failed = result.results.filter((r) => r.error)
+  if (failed.length > 0) {
+    console.error(`\n${failed.length} benchmark(s) errored.`)
+    process.exit(1)
+  }
+
+  if (opts.minCoverage != null) {
+    const belowThreshold = result.results.filter(
+      (r) => !r.error && r.evaluation.fileCoverage < opts.minCoverage!
+    )
+    if (belowThreshold.length > 0) {
+      console.error(
+        `\nFAILED: ${belowThreshold.length} config(s) below minimum coverage of ${(opts.minCoverage * 100).toFixed(0)}%:`
+      )
+      for (const r of belowThreshold) {
+        console.error(`  ${r.config}: ${(r.evaluation.fileCoverage * 100).toFixed(1)}%`)
+      }
+      process.exit(1)
+    }
+    console.log(`\nAll configs above minimum coverage threshold (${(opts.minCoverage * 100).toFixed(0)}%)`)
+  }
+
+  process.exit(0)
+}
diff --git a/strandly/src/benchmark/reporter.ts b/strandly/src/benchmark/reporter.ts
new file mode 100644
index 000000000..f7ab60ee9
--- /dev/null
+++ b/strandly/src/benchmark/reporter.ts
@@ -0,0 +1,58 @@
+import { writeFileSync } from 'node:fs'
+import type { BenchmarkSuiteResult } from './types.js'
+
+export function generateMarkdown(result: BenchmarkSuiteResult): string { // renders the report: summary line, one <details> per completed config, an error list, and a footer
+  const passed = result.results.filter((r) => !r.error) // "completed" means no error string is set
+  const failed = result.results.filter((r) => r.error)
+
+  let md = `## Benchmark Results: ${result.suite}\n\n`
+  md += `**${passed.length}/${result.results.length}** configs completed`
+  if (failed.length > 0) md += ` | ${failed.length} errored`
+  md += `\n\n`
+
+  for (const r of passed) {
+    const tokens = formatTokens(r.metrics.inputTokens + r.metrics.outputTokens) // input + output combined, abbreviated
+    const coverage = (r.evaluation.fileCoverage * 100).toFixed(0) // scores are multiplied by 100 for display as percentages
+    const precision = (r.evaluation.filePrecision * 100).toFixed(1)
+
+    md += `<details>\n`
+    md += `<summary><b>${r.config}</b>: File Coverage ${coverage}% | Precision ${precision}% | ${tokens} tokens | ${r.metrics.cycleCount} cycles</summary>\n\n`
+    md += `| Metric | Coverage | Precision |\n`
+    md += `|--------|----------|----------|\n`
+    md += `| File | ${r.evaluation.fileCoverage.toFixed(3)} | ${r.evaluation.filePrecision.toFixed(3)} |\n`
+    md += `| Symbol | ${r.evaluation.symbolCoverage.toFixed(3)} | ${r.evaluation.symbolPrecision.toFixed(3)} |\n`
+    md += `| Span | ${r.evaluation.spanCoverage.toFixed(3)} | ${r.evaluation.spanPrecision.toFixed(3)} |\n`
+    md += `| EditLoc | ${r.evaluation.editLocRecall.toFixed(3)} (recall) | ${r.evaluation.editLocPrecision.toFixed(3)} |\n\n`
+    md += `**Metrics:** ${r.metrics.inputTokens.toLocaleString()} input tokens, ${r.metrics.outputTokens.toLocaleString()} output tokens, ${(r.metrics.latencyMs / 1000).toFixed(1)}s\n\n`
+    md += `**Files read:** ${r.trajectory.length}\n\n`
+    md += `</details>\n\n`
+  }
+
+  if (failed.length > 0) { // errored configs are listed together in one collapsed section
+    md += `<details>\n<summary>Errors (${failed.length})</summary>\n\n`
+    for (const r of failed) {
+      md += `- **${r.config}**: ${r.error}\n`
+    }
+    md += `\n</details>\n`
+  }
+
+  md += `\n---\n*Run at ${result.timestamp} on \`${result.branch}\` (${result.gitSha.slice(0, 7)})*\n` // footer: timestamp, branch, first 7 characters of the SHA
+  return md
+}
+
+export function writeResults(result: BenchmarkSuiteResult, opts: { output?: string; outputMd?: string }): void { // writes the JSON and/or markdown report; each file is written only when its path is set
+  if (opts.output) {
+    writeFileSync(opts.output, JSON.stringify(result, null, 2)) // full result object, pretty-printed with 2-space indentation
+    console.log(`JSON results written to: ${opts.output}`)
+  }
+  if (opts.outputMd) {
+    writeFileSync(opts.outputMd, generateMarkdown(result)) // same markdown that generateMarkdown() returns
+    console.log(`Markdown summary written to: ${opts.outputMd}`)
+  }
+}
+
+function formatTokens(tokens: number): string { // abbreviates a token count as "1.2M", "45K", or the plain number below 1,000
+  if (tokens >= 999_500) return `${(tokens / 1_000_000).toFixed(1)}M` // from 999_500 up the K branch would round to "1000K", so these render as "1.0M"
+  if (tokens >= 1_000) return `${(tokens / 1_000).toFixed(0)}K`
+  return String(tokens)
+}
diff --git a/strandly/src/benchmark/runner.ts b/strandly/src/benchmark/runner.ts
new file mode 100644
index 000000000..ff35cfd16
--- /dev/null
+++ b/strandly/src/benchmark/runner.ts
@@ -0,0 +1,75 @@
+import type { BenchmarkConfig, BenchmarkResult, ContextBenchTask } from './types.js'
+import { extractTrajectory, trajectoryToFileList } from './contextbench/trajectory.js'
+import { ensureRepoCloned } from './contextbench/loader.js'
+import { evaluate } from './evaluator.js'
+
+export async function runBenchmark(config: BenchmarkConfig, task: ContextBenchTask): Promise<BenchmarkResult> { // runs one config against one task; failures inside the try block are returned as a result with `error` set
+  const repoDir = ensureRepoCloned(task) // called outside the try block, so an exception here propagates to the caller
+  console.log(`  Repo at: ${repoDir}`)
+
+  const startTime = performance.now()
+  const heartbeat = setInterval(() => { // progress line, rewritten in place via \r every 5 seconds
+    const elapsed = ((performance.now() - startTime) / 1000).toFixed(0)
+    process.stdout.write(`\r  running... ${elapsed}s elapsed`)
+  }, 5_000)
+
+  try {
+    const agent = config.createAgent(task)
+
+    const result = await Promise.race([
+      agent.invoke(
+        `The repository is cloned at: ${repoDir}\n\nInvestigate the issue and find all relevant code locations.`
+      ),
+      new Promise<never>((_, reject) =>
+        setTimeout(() => reject(new Error('Benchmark timed out after 10 minutes')), 600_000)
+      ),
+    ])
+
+    const latencyMs = performance.now() - startTime
+    const trajectory = extractTrajectory(agent.messages, repoDir)
+    const fileList = trajectoryToFileList(trajectory)
+
+    const evaluation = await evaluate(task, fileList) // scores the extracted file list against the task
+
+    const usage = result.metrics?.accumulatedUsage
+    return {
+      config: config.name,
+      task: task.id,
+      metrics: {
+        inputTokens: usage?.inputTokens ?? 0, // missing usage data is reported as 0
+        outputTokens: usage?.outputTokens ?? 0,
+        cycleCount: result.metrics?.cycleCount ?? 0,
+        latencyMs: Math.round(latencyMs),
+      },
+      evaluation,
+      trajectory: fileList,
+    }
+  } catch (err) { // any failure in the try block (including the timeout) becomes a zeroed result carrying the message
+    const latencyMs = performance.now() - startTime
+    return {
+      config: config.name,
+      task: task.id,
+      metrics: {
+        inputTokens: 0,
+        outputTokens: 0,
+        cycleCount: 0,
+        latencyMs: Math.round(latencyMs),
+      },
+      evaluation: {
+        fileCoverage: 0,
+        filePrecision: 0,
+        symbolCoverage: 0,
+        symbolPrecision: 0,
+        spanCoverage: 0,
+        spanPrecision: 0,
+        editLocRecall: 0,
+        editLocPrecision: 0,
+      },
+      trajectory: [],
+      error: err instanceof Error ? err.message : String(err), // non-Error throwables are stringified
+    }
+  } finally {
+    clearInterval(heartbeat)
+    process.stdout.write('\n') // terminates the in-place heartbeat line
+  }
+}
diff --git a/strandly/src/benchmark/types.ts b/strandly/src/benchmark/types.ts
new file mode 100644
index 000000000..dc33f3597
--- /dev/null
+++ b/strandly/src/benchmark/types.ts
@@ -0,0 +1,67 @@
+import type { Agent } from '../../../strands-ts/src/agent/agent.js'
+
+export interface BenchmarkSuite { // a runnable benchmark suite, as registered in the `suites` map used by benchmark()
+  name: string // printed in the "Running benchmark suite" banner
+  run(opts: BenchmarkRunOpts): Promise<BenchmarkSuiteResult> // executes the suite and resolves with the collected results
+}
+
+export interface BenchmarkRunOpts { // selection options that benchmark() forwards to BenchmarkSuite.run()
+  config?: string // CLI --config: "Run specific config only"
+  agentFile?: string // CLI --agent-file: "Path to a .ts file exporting a BenchmarkConfig"
+  task?: string // CLI --task: "Task ID within the suite"
+  model?: string // CLI --model: "Model ID for built-in configs"
+}
+
+export interface BenchmarkConfig { // one agent setup to benchmark
+  name: string // recorded as BenchmarkResult.config
+  description: string
+  createAgent(task: ContextBenchTask): Agent // called by runBenchmark() to build the agent it then invokes
+}
+
+export interface ContextBenchTask { // one benchmark task; runBenchmark() passes it to createAgent(), ensureRepoCloned() and evaluate()
+  id: string // recorded as BenchmarkResult.task
+  repo: string
+  issue: number
+  baseCommit: string // presumably the commit the clone is checked out at; TODO confirm in loader.ts
+  prompt: string
+  goldAnnotations: GoldAnnotation[] // presumably the expected locations used for scoring; TODO confirm in evaluator.ts
+}
+
+export interface GoldAnnotation { // one per-file entry of ContextBenchTask.goldAnnotations
+  file: string
+  symbols?: string[]
+  spans?: { startLine: number; endLine: number }[] // NOTE(review): line base (0 or 1) and end inclusivity are not defined here; confirm against the evaluator
+}
+
+export interface EvaluationMetrics { // scores returned by evaluate(); runBenchmark() sets every field to 0 for an errored run
+  fileCoverage: number // compared against --min-coverage (documented as 0-1); shown as a percentage in reports
+  filePrecision: number // shown as a percentage in the report summary line
+  symbolCoverage: number
+  symbolPrecision: number
+  spanCoverage: number
+  spanPrecision: number
+  editLocRecall: number // rendered in the Coverage column of the report table, labelled "(recall)"
+  editLocPrecision: number
+}
+
+export interface BenchmarkResult { // outcome of running one config against one task
+  config: string // BenchmarkConfig.name
+  task: string // ContextBenchTask.id
+  metrics: {
+    inputTokens: number // from the agent's accumulated usage; 0 when unavailable or when the run errored
+    outputTokens: number // same source and fallback as inputTokens
+    cycleCount: number // 0 when unavailable or when the run errored
+    latencyMs: number // elapsed milliseconds measured inside runBenchmark(), rounded to an integer
+  }
+  evaluation: EvaluationMetrics
+  trajectory: string[] // file list derived from the agent's messages; empty for an errored run
+  error?: string // set only when the run failed; consumers treat its presence as "errored"
+}
+
+export interface BenchmarkSuiteResult { // everything produced by one suite execution; serialized as-is for the JSON output
+  suite: string // used in the report heading
+  timestamp: string // printed verbatim in the report footer
+  gitSha: string // the report footer shows its first 7 characters
+  branch: string // printed in the report footer
+  results: BenchmarkResult[] // one entry per run, including errored runs
+}
diff --git a/strandly/src/cli.ts b/strandly/src/cli.ts
index 02cd1da42..6ef3a6fa9 100755
--- a/strandly/src/cli.ts
+++ b/strandly/src/cli.ts
@@ -151,6 +151,23 @@ program
     }
   })
 
+program // `benchmark` subcommand: loads ./benchmark/index.js on demand and delegates to benchmark()
+  .command('benchmark')
+  .description('Run agent benchmarks')
+  .requiredOption('--suite <name>', 'Benchmark suite to run (contextbench)')
+  .option('--config <name>', 'Run specific config only')
+  .option('--agent-file <path>', 'Path to a .ts file exporting a BenchmarkConfig')
+  .option('--task <id>', 'Task ID within the suite')
+  .option('--model <id>', 'Model ID for built-in configs (default: us.anthropic.claude-sonnet-4-20250514-v1:0)')
+  .option('--min-coverage <n>', 'Minimum file coverage (0-1). Fails if below this.')
+  .option('--output <path>', 'Write JSON results to file')
+  .option('--output-md <path>', 'Write markdown summary to file')
+  .option('--cloudwatch', 'Emit metrics to CloudWatch')
+  .action(async (opts) => {
+    const { benchmark } = await import('./benchmark/index.js')
+    await benchmark({ ...opts, minCoverage: opts.minCoverage ? parseFloat(opts.minCoverage) : undefined, model: opts.model })
+  })
+
 program.parse()
 
 function run(cmd: string, opts?: { cwd?: string; env?: Record<string, string> }): void {

From cb20a77bcb3fa0e5427b1f481bb3fd5fef33fe44 Mon Sep 17 00:00:00 2001
From: Liz <91279165+lizradway@users.noreply.github.com>
Date: Thu, 21 May 2026 14:38:38 -0400
Subject: [PATCH 2/2] update from comments

---
 strandly/package.json                             | 3 +--
 strandly/src/benchmark/contextbench/loader.ts     | 3 +--
 strandly/src/benchmark/contextbench/trajectory.ts | 1 +
 strandly/src/benchmark/runner.ts                  | 8 +++++---
 strandly/src/cli.ts                               | 7 ++++++-
 5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/strandly/package.json b/strandly/package.json
index 9c9b4f0c8..c4223c920 100644
--- a/strandly/package.json
+++ b/strandly/package.json
@@ -12,8 +12,7 @@
   "dependencies": {
     "@aws-sdk/client-cloudwatch": "^3",
     "commander": "^14",
-    "tsx": "^4.21.0",
-    "zod": "^3.23"
+    "tsx": "^4.21.0"
   },
   "devDependencies": {
     "@types/node": "^22",
diff --git a/strandly/src/benchmark/contextbench/loader.ts b/strandly/src/benchmark/contextbench/loader.ts
index 0fa14d529..88636b651 100644
--- a/strandly/src/benchmark/contextbench/loader.ts
+++ b/strandly/src/benchmark/contextbench/loader.ts
@@ -22,11 +22,10 @@ export function ensureDependencies(): void {
   try {
     execSync('python3 -c "import pyarrow; import tree_sitter"', { stdio: 'pipe' })
   } catch {
-    console.error(
+    throw new Error(
       'Missing Python dependencies for ContextBench evaluation.\n' +
         'Install with: pip install pyarrow tree-sitter tree-sitter-languages datasets'
     )
-    process.exit(1)
   }
 }
 
diff --git a/strandly/src/benchmark/contextbench/trajectory.ts b/strandly/src/benchmark/contextbench/trajectory.ts
index b12a614a1..d8d9e38cc 100644
--- a/strandly/src/benchmark/contextbench/trajectory.ts
+++ b/strandly/src/benchmark/contextbench/trajectory.ts
@@ -48,6 +48,7 @@ function toRelativePath(filePath: string, repoDir?: string): string {
   return filePath
 }
 
+// Best-effort extraction — won't catch all patterns (e.g. grep -rn, find -exec, piped commands, quoted paths)
 function extractFilePathsFromToolCall(toolName: string, input: Record<string, unknown>): string[] {
   if (toolName === 'file_editor' || toolName === 'fileEditor') {
     const path = (input.path ?? input.file_path ?? input.filePath) as string | undefined
diff --git a/strandly/src/benchmark/runner.ts b/strandly/src/benchmark/runner.ts
index ff35cfd16..0c4cf8019 100644
--- a/strandly/src/benchmark/runner.ts
+++ b/strandly/src/benchmark/runner.ts
@@ -16,14 +16,16 @@ export async function runBenchmark(config: BenchmarkConfig, task: ContextBenchTa
   try {
     const agent = config.createAgent(task)
 
+    // Cleared in .finally() so the 10-minute timer is released whether the agent resolves or rejects
+    let timeoutId: ReturnType<typeof setTimeout> | undefined
     const result = await Promise.race([
       agent.invoke(
         `The repository is cloned at: ${repoDir}\n\nInvestigate the issue and find all relevant code locations.`
       ),
-      new Promise<never>((_, reject) =>
-        setTimeout(() => reject(new Error('Benchmark timed out after 10 minutes')), 600_000)
-      ),
-    ])
+      new Promise<never>((_, reject) => {
+        timeoutId = setTimeout(() => reject(new Error('Benchmark timed out after 10 minutes')), 600_000)
+      }),
+    ]).finally(() => clearTimeout(timeoutId))
 
     const latencyMs = performance.now() - startTime
     const trajectory = extractTrajectory(agent.messages, repoDir)
diff --git a/strandly/src/cli.ts b/strandly/src/cli.ts
index 6ef3a6fa9..98d35b4fd 100755
--- a/strandly/src/cli.ts
+++ b/strandly/src/cli.ts
@@ -164,8 +164,16 @@ program
   .option('--output-md <path>', 'Write markdown summary to file')
   .option('--cloudwatch', 'Emit metrics to CloudWatch')
   .action(async (opts) => {
+    // Number() rejects trailing junk ("0.5x") that parseFloat would accept; a blank value is rejected
+    // explicitly so an empty --min-coverage (e.g. an unset CI variable) cannot silently disable the gate.
+    const rawMinCoverage: string | undefined = opts.minCoverage
+    const minCoverage = rawMinCoverage === undefined ? undefined : Number(rawMinCoverage)
+    if (minCoverage !== undefined && (rawMinCoverage?.trim() === '' || isNaN(minCoverage) || minCoverage < 0 || minCoverage > 1)) {
+      console.error('--min-coverage must be a number between 0 and 1')
+      process.exit(1)
+    }
     const { benchmark } = await import('./benchmark/index.js')
-    await benchmark({ ...opts, minCoverage: opts.minCoverage ? parseFloat(opts.minCoverage) : undefined, model: opts.model })
+    await benchmark({ ...opts, minCoverage, model: opts.model })
   })
 
 program.parse()