From 0eeb3895c778927735e27bc00726799130c4c1c2 Mon Sep 17 00:00:00 2001 From: Liz <91279165+lizradway@users.noreply.github.com> Date: Tue, 19 May 2026 10:09:46 -0400 Subject: [PATCH 1/2] feat(hackathon): add benchmarking strandley script --- package-lock.json | 399 +++++++++--------- strandly/package.json | 4 +- strandly/src/benchmark/README.md | 158 +++++++ strandly/src/benchmark/cloudwatch.ts | 76 ++++ strandly/src/benchmark/configs.ts | 105 +++++ strandly/src/benchmark/contextbench/loader.ts | 176 ++++++++ .../src/benchmark/contextbench/trajectory.ts | 81 ++++ strandly/src/benchmark/evaluator.ts | 111 +++++ strandly/src/benchmark/index.ts | 145 +++++++ strandly/src/benchmark/reporter.ts | 58 +++ strandly/src/benchmark/runner.ts | 75 ++++ strandly/src/benchmark/types.ts | 67 +++ strandly/src/cli.ts | 17 + 13 files changed, 1282 insertions(+), 190 deletions(-) create mode 100644 strandly/src/benchmark/README.md create mode 100644 strandly/src/benchmark/cloudwatch.ts create mode 100644 strandly/src/benchmark/configs.ts create mode 100644 strandly/src/benchmark/contextbench/loader.ts create mode 100644 strandly/src/benchmark/contextbench/trajectory.ts create mode 100644 strandly/src/benchmark/evaluator.ts create mode 100644 strandly/src/benchmark/index.ts create mode 100644 strandly/src/benchmark/reporter.ts create mode 100644 strandly/src/benchmark/runner.ts create mode 100644 strandly/src/benchmark/types.ts diff --git a/package-lock.json b/package-lock.json index eccc431fa..6a4fa0605 100644 --- a/package-lock.json +++ b/package-lock.json @@ -454,6 +454,28 @@ "node": ">=20.0.0" } }, + "node_modules/@aws-sdk/client-cloudwatch": { + "version": "3.1049.0", + "resolved": "https://registry.npmjs.org/@aws-sdk/client-cloudwatch/-/client-cloudwatch-3.1049.0.tgz", + "integrity": "sha512-pxzt53Ch0luCqnSaWNI7vL8MjvF5WFTRV/VzyuHoWOydbaC3RQT2i5DcAW7hjaYQmmRItQnvazTeO+obsBR+4Q==", + "license": "Apache-2.0", + "dependencies": { + "@aws-crypto/sha256-browser": "5.2.0", + 
"@aws-crypto/sha256-js": "5.2.0", + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/credential-provider-node": "^3.972.43", + "@aws-sdk/types": "^3.973.8", + "@smithy/core": "^3.24.2", + "@smithy/fetch-http-handler": "^5.4.2", + "@smithy/middleware-compression": "^4.4.2", + "@smithy/node-http-handler": "^4.7.2", + "@smithy/types": "^4.14.1", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, "node_modules/@aws-sdk/client-cognito-identity": { "version": "3.1033.0", "dev": true, @@ -668,24 +690,18 @@ } }, "node_modules/@aws-sdk/core": { - "version": "3.974.8", - "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.974.8.tgz", - "integrity": "sha512-njR2qoG6ZuB0kvAS2FyICsFZJ6gmCcf2X/7JcD14sUvGDm26wiZ5BrA6LOiUxKFEF+IVe7kdroxyE00YlkiYsw==", + "version": "3.974.12", + "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.974.12.tgz", + "integrity": "sha512-qrqgioqYFjwR6LatVNS1L2Vk++EwRIxqSQXPKNv5Ofux2D8UNgqMQ1znnMyEImXquVPTtbf71fc128pvmU6y9A==", "license": "Apache-2.0", "dependencies": { "@aws-sdk/types": "^3.973.8", - "@aws-sdk/xml-builder": "^3.972.22", - "@smithy/core": "^3.23.17", - "@smithy/node-config-provider": "^4.3.14", - "@smithy/property-provider": "^4.2.14", - "@smithy/protocol-http": "^5.3.14", - "@smithy/signature-v4": "^5.3.14", - "@smithy/smithy-client": "^4.12.13", + "@aws-sdk/xml-builder": "^3.972.24", + "@aws/lambda-invoke-store": "^0.2.2", + "@smithy/core": "^3.24.2", + "@smithy/signature-v4": "^5.4.2", "@smithy/types": "^4.14.1", - "@smithy/util-base64": "^4.3.2", - "@smithy/util-middleware": "^4.2.14", - "@smithy/util-retry": "^4.3.6", - "@smithy/util-utf8": "^4.2.2", + "bowser": "^2.11.0", "tslib": "^2.6.2" }, "engines": { @@ -720,14 +736,14 @@ } }, "node_modules/@aws-sdk/credential-provider-env": { - "version": "3.972.34", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.34.tgz", - "integrity": 
"sha512-XT0jtf8Fw9JE6ppsQeoNnZRiG+jqRixMT1v1ZR17G60UvVdsQmTG8nbEyHuEPfMxDXEhfdARaM/XiEhca4lGHQ==", + "version": "3.972.38", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.38.tgz", + "integrity": "sha512-m3WjZEgPtioMhPmwqUt+DhlTJ2i9ufR6DhfkyXojb9puEvfR+ur2U5shavu5/Cc9WHHsDCvALi6UFHgcqjhQ5w==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", + "@aws-sdk/core": "^3.974.12", "@aws-sdk/types": "^3.973.8", - "@smithy/property-provider": "^4.2.14", + "@smithy/core": "^3.24.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -736,20 +752,17 @@ } }, "node_modules/@aws-sdk/credential-provider-http": { - "version": "3.972.36", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.36.tgz", - "integrity": "sha512-DPoGWfy7J7RKxvbf5kOKIGQkD2ek3dbKgzKIGrnLuvZBz5myU+Im/H6pmc14QcnFbqHMqxvtWSgRDSJW3qXLQg==", + "version": "3.972.40", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.40.tgz", + "integrity": "sha512-D78L/m2Dr6cJnnSvWoAudPhQmCwmJ7j6APXsPYmFpPaKfQTfCSu0rdm8j14Np+VmXF9z8Aj8HE3xFpsrwtfgeg==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", + "@aws-sdk/core": "^3.974.12", "@aws-sdk/types": "^3.973.8", - "@smithy/fetch-http-handler": "^5.3.17", - "@smithy/node-http-handler": "^4.6.1", - "@smithy/property-provider": "^4.2.14", - "@smithy/protocol-http": "^5.3.14", - "@smithy/smithy-client": "^4.12.13", + "@smithy/core": "^3.24.2", + "@smithy/fetch-http-handler": "^5.4.2", + "@smithy/node-http-handler": "^4.7.2", "@smithy/types": "^4.14.1", - "@smithy/util-stream": "^4.5.25", "tslib": "^2.6.2" }, "engines": { @@ -757,23 +770,22 @@ } }, "node_modules/@aws-sdk/credential-provider-ini": { - "version": "3.972.38", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.38.tgz", - "integrity": 
"sha512-oDzUBu2MGJFgoar05sPMCwSrhw44ASyccrHzj66vO69OZqi7I6hZZxXfuPLC8OCzW7C+sU+bI73XHij41yekgQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.974.8", - "@aws-sdk/credential-provider-env": "^3.972.34", - "@aws-sdk/credential-provider-http": "^3.972.36", - "@aws-sdk/credential-provider-login": "^3.972.38", - "@aws-sdk/credential-provider-process": "^3.972.34", - "@aws-sdk/credential-provider-sso": "^3.972.38", - "@aws-sdk/credential-provider-web-identity": "^3.972.38", - "@aws-sdk/nested-clients": "^3.997.6", + "version": "3.972.42", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.42.tgz", + "integrity": "sha512-Mu5ESvFXeinafVM8jTIvRqcvK2Ehj4kz3auT39yUcHwu1Vfxo6xRlmUafdKLW4tusjAJukQwK09sCSMgOm7OKg==", + "license": "Apache-2.0", + "dependencies": { + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/credential-provider-env": "^3.972.38", + "@aws-sdk/credential-provider-http": "^3.972.40", + "@aws-sdk/credential-provider-login": "^3.972.42", + "@aws-sdk/credential-provider-process": "^3.972.38", + "@aws-sdk/credential-provider-sso": "^3.972.42", + "@aws-sdk/credential-provider-web-identity": "^3.972.42", + "@aws-sdk/nested-clients": "^3.997.10", "@aws-sdk/types": "^3.973.8", - "@smithy/credential-provider-imds": "^4.2.14", - "@smithy/property-provider": "^4.2.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", + "@smithy/credential-provider-imds": "^4.3.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -782,17 +794,15 @@ } }, "node_modules/@aws-sdk/credential-provider-login": { - "version": "3.972.38", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.38.tgz", - "integrity": "sha512-g1NosS8qe4OF++G2UFCM5ovSkgipC7YYor5KCWatG0UoMSO5YFj9C8muePlyVmOBV/WTI16Jo3/s1NUo/o1Bww==", + "version": "3.972.42", + "resolved": 
"https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.42.tgz", + "integrity": "sha512-O6WkZga3kf0yqyJYd1dbeJqVhEgJx/x1UaLgtbR+XuL/YP+K5y6QTxQKL7ka9z3jnQASESKGAPnRyt4D5hQrxA==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", - "@aws-sdk/nested-clients": "^3.997.6", + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/nested-clients": "^3.997.10", "@aws-sdk/types": "^3.973.8", - "@smithy/property-provider": "^4.2.14", - "@smithy/protocol-http": "^5.3.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -801,21 +811,20 @@ } }, "node_modules/@aws-sdk/credential-provider-node": { - "version": "3.972.39", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.39.tgz", - "integrity": "sha512-HEswDQyxUtadoZ/bJsPPENHg7R0Lzym5LuMksJeHvqhCOpP+rtkDLKI4/ZChH4w3cf5kG8n6bZuI8PzajoiqMg==", + "version": "3.972.43", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.43.tgz", + "integrity": "sha512-D/DJmbrWRP5BXEO3FH+ar4el+2n6OlGofiud7dQun2jES+AQEJjczenp1jBb4MBN7CpGpS8nsWGQLtuzc9tQbA==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/credential-provider-env": "^3.972.34", - "@aws-sdk/credential-provider-http": "^3.972.36", - "@aws-sdk/credential-provider-ini": "^3.972.38", - "@aws-sdk/credential-provider-process": "^3.972.34", - "@aws-sdk/credential-provider-sso": "^3.972.38", - "@aws-sdk/credential-provider-web-identity": "^3.972.38", + "@aws-sdk/credential-provider-env": "^3.972.38", + "@aws-sdk/credential-provider-http": "^3.972.40", + "@aws-sdk/credential-provider-ini": "^3.972.42", + "@aws-sdk/credential-provider-process": "^3.972.38", + "@aws-sdk/credential-provider-sso": "^3.972.42", + "@aws-sdk/credential-provider-web-identity": "^3.972.42", "@aws-sdk/types": "^3.973.8", - "@smithy/credential-provider-imds": "^4.2.14", - 
"@smithy/property-provider": "^4.2.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", + "@smithy/credential-provider-imds": "^4.3.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -824,15 +833,14 @@ } }, "node_modules/@aws-sdk/credential-provider-process": { - "version": "3.972.34", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.34.tgz", - "integrity": "sha512-T3IFs4EVmVi1dVN5RciFnklCANSzvrQd/VuHY9ThHSQmYkTogjcGkoJEr+oNUPQZnso52183088NqysMPji1/Q==", + "version": "3.972.38", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.38.tgz", + "integrity": "sha512-EnbYVajGgbkb24s0K1eo4VNAPV5mHIET7LSvirTaFCwkfrfaOJxtSE+wY/tJdKDS21cEYkZs2ruCaAm+W4iblg==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", + "@aws-sdk/core": "^3.974.12", "@aws-sdk/types": "^3.973.8", - "@smithy/property-provider": "^4.2.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -841,17 +849,16 @@ } }, "node_modules/@aws-sdk/credential-provider-sso": { - "version": "3.972.38", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.38.tgz", - "integrity": "sha512-5ZxG+t0+3Q3QPh8KEjX6syskhgNf7I0MN7oGioTf6Lm1NTjfP7sIcYGNsthXC2qR8vcD3edNZwCr2ovfSSWuRA==", + "version": "3.972.42", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.42.tgz", + "integrity": "sha512-RVV/9NbFwI8ZHEH5dn39lGyFmSbSVj1+orZdr6QsOe1mW9DCglmlen0cFaNZmCcqkqc7erNRHNBduxbeZuHAnw==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", - "@aws-sdk/nested-clients": "^3.997.6", - "@aws-sdk/token-providers": "3.1041.0", + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/nested-clients": "^3.997.10", + "@aws-sdk/token-providers": "3.1049.0", 
"@aws-sdk/types": "^3.973.8", - "@smithy/property-provider": "^4.2.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -860,16 +867,15 @@ } }, "node_modules/@aws-sdk/credential-provider-sso/node_modules/@aws-sdk/token-providers": { - "version": "3.1041.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.1041.0.tgz", - "integrity": "sha512-Th7kPI6YPtvJUcdznooXJMy+9rQWjmEF81LxaJssngBzuysK4a/x+l8kjm1zb7nYsUPbndnBdUnwng/3PLvtGw==", + "version": "3.1049.0", + "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.1049.0.tgz", + "integrity": "sha512-r7+d0lQMTHKypkmaF5jRTBYLYHCUHzt3gaVoN9SidLhQeWhCmHk3AKrboDTpPF5b7Pt7vKu3+oeMjznM2Eu1ow==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", - "@aws-sdk/nested-clients": "^3.997.6", + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/nested-clients": "^3.997.10", "@aws-sdk/types": "^3.973.8", - "@smithy/property-provider": "^4.2.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -878,16 +884,15 @@ } }, "node_modules/@aws-sdk/credential-provider-web-identity": { - "version": "3.972.38", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.38.tgz", - "integrity": "sha512-lYHFF30DGI20jZcYX8cm6Ns0V7f1dDN6g/MBDLTyD/5iw+bXs3yBr2iAiHDkx4RFU5JgsnZvCHYKiRVPRdmOgw==", + "version": "3.972.42", + "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.42.tgz", + "integrity": "sha512-/67fXX0ddllD4u2Nujc5PvT4byHgpMUfz6+RxIKi/0nFIckeorm7JvXgzBuDyVKw0s58EbofmETDWUf9vTEuHQ==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/core": "^3.974.8", - "@aws-sdk/nested-clients": "^3.997.6", + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/nested-clients": "^3.997.10", 
"@aws-sdk/types": "^3.973.8", - "@smithy/property-provider": "^4.2.14", - "@smithy/shared-ini-file-loader": "^4.4.9", + "@smithy/core": "^3.24.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -1062,6 +1067,7 @@ "version": "3.972.37", "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-sdk-s3/-/middleware-sdk-s3-3.972.37.tgz", "integrity": "sha512-Km7M+i8DrLArVzrid1gfxeGhYHBd3uxvE77g0s5a52zPSVosxzQBnJ0gwWb6NIp/DOk8gsBMhi7V+cpJG0ndTA==", + "dev": true, "license": "Apache-2.0", "dependencies": { "@aws-sdk/core": "^3.974.8", @@ -1137,49 +1143,20 @@ } }, "node_modules/@aws-sdk/nested-clients": { - "version": "3.997.6", - "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.997.6.tgz", - "integrity": "sha512-WBDnqatJl+kGObpfmfSxqnXeYTu3Me8wx8WCtvoxX3pfWrrTv8I4WTMSSs7PZqcRcVh8WeUKMgGFjMG+52SR1w==", + "version": "3.997.10", + "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.997.10.tgz", + "integrity": "sha512-FtQ/Bt327peZJuyo4WZSOLVUTw9ujRxntepiC7L65FxA2P82Xlq0g14T22BuqBUeMjDoxa9nvwiMHjLIfP3eUg==", "license": "Apache-2.0", "dependencies": { "@aws-crypto/sha256-browser": "5.2.0", "@aws-crypto/sha256-js": "5.2.0", - "@aws-sdk/core": "^3.974.8", - "@aws-sdk/middleware-host-header": "^3.972.10", - "@aws-sdk/middleware-logger": "^3.972.10", - "@aws-sdk/middleware-recursion-detection": "^3.972.11", - "@aws-sdk/middleware-user-agent": "^3.972.38", - "@aws-sdk/region-config-resolver": "^3.972.13", - "@aws-sdk/signature-v4-multi-region": "^3.996.25", + "@aws-sdk/core": "^3.974.12", + "@aws-sdk/signature-v4-multi-region": "^3.996.27", "@aws-sdk/types": "^3.973.8", - "@aws-sdk/util-endpoints": "^3.996.8", - "@aws-sdk/util-user-agent-browser": "^3.972.10", - "@aws-sdk/util-user-agent-node": "^3.973.24", - "@smithy/config-resolver": "^4.4.17", - "@smithy/core": "^3.23.17", - "@smithy/fetch-http-handler": "^5.3.17", - "@smithy/hash-node": "^4.2.14", - "@smithy/invalid-dependency": "^4.2.14", - 
"@smithy/middleware-content-length": "^4.2.14", - "@smithy/middleware-endpoint": "^4.4.32", - "@smithy/middleware-retry": "^4.5.7", - "@smithy/middleware-serde": "^4.2.20", - "@smithy/middleware-stack": "^4.2.14", - "@smithy/node-config-provider": "^4.3.14", - "@smithy/node-http-handler": "^4.6.1", - "@smithy/protocol-http": "^5.3.14", - "@smithy/smithy-client": "^4.12.13", + "@smithy/core": "^3.24.2", + "@smithy/fetch-http-handler": "^5.4.2", + "@smithy/node-http-handler": "^4.7.2", "@smithy/types": "^4.14.1", - "@smithy/url-parser": "^4.2.14", - "@smithy/util-base64": "^4.3.2", - "@smithy/util-body-length-browser": "^4.2.2", - "@smithy/util-body-length-node": "^4.2.3", - "@smithy/util-defaults-mode-browser": "^4.3.49", - "@smithy/util-defaults-mode-node": "^4.2.54", - "@smithy/util-endpoints": "^3.4.2", - "@smithy/util-middleware": "^4.2.14", - "@smithy/util-retry": "^4.3.6", - "@smithy/util-utf8": "^4.2.2", "tslib": "^2.6.2" }, "engines": { @@ -1203,15 +1180,14 @@ } }, "node_modules/@aws-sdk/signature-v4-multi-region": { - "version": "3.996.25", - "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.25.tgz", - "integrity": "sha512-+CMIt3e1VzlklAECmG+DtP1sV8iKq25FuA0OKpnJ4KA0kxUtd7CgClY7/RU6VzJBQwbN4EJ9Ue6plvqx1qGadw==", + "version": "3.996.27", + "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.996.27.tgz", + "integrity": "sha512-0Phbz4t6HI3D3skxvG2uI+VWU034/nSIw1T8d+FPzzQG9EQTrw94o9mOKO2Gv3n3Oc8P7JD7RAUxkoneLWv5Eg==", "license": "Apache-2.0", "dependencies": { - "@aws-sdk/middleware-sdk-s3": "^3.972.37", "@aws-sdk/types": "^3.973.8", - "@smithy/protocol-http": "^5.3.14", - "@smithy/signature-v4": "^5.3.14", + "@smithy/core": "^3.24.2", + "@smithy/signature-v4": "^5.4.2", "@smithy/types": "^4.14.1", "tslib": "^2.6.2" }, @@ -1249,6 +1225,7 @@ }, "node_modules/@aws-sdk/util-arn-parser": { "version": "3.972.3", + "dev": true, "license": "Apache-2.0", 
"dependencies": { "tslib": "^2.6.2" @@ -1332,14 +1309,14 @@ } }, "node_modules/@aws-sdk/xml-builder": { - "version": "3.972.22", - "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.22.tgz", - "integrity": "sha512-PMYKKtJd70IsSG0yHrdAbxBr+ZWBKLvzFZfD3/urxgf6hXVMzuU5M+3MJ5G67RpOmLBu1fAUN65SbWuKUCOlAA==", + "version": "3.972.24", + "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.24.tgz", + "integrity": "sha512-V8z5YcDPfsvzrBlj0xR1vhRtocblhYbqdreCJB/voGd4Sr5zjNAeWxexbnqVtskTJe0vFb5KMqbSL++ePl+zRw==", "license": "Apache-2.0", "dependencies": { "@nodable/entities": "2.1.0", "@smithy/types": "^4.14.1", - "fast-xml-parser": "5.7.2", + "fast-xml-parser": "5.7.3", "tslib": "^2.6.2" }, "engines": { @@ -3113,20 +3090,13 @@ } }, "node_modules/@smithy/core": { - "version": "3.23.17", - "resolved": "https://registry.npmjs.org/@smithy/core/-/core-3.23.17.tgz", - "integrity": "sha512-x7BlLbUFL8NWCGjMF9C+1N5cVCxcPa7g6Tv9B4A2luWx3be3oU8hQ96wIwxe/s7OhIzvoJH73HAUSg5JXVlEtQ==", + "version": "3.24.3", + "resolved": "https://registry.npmjs.org/@smithy/core/-/core-3.24.3.tgz", + "integrity": "sha512-Ep/7tPamGY8mgESE3LyLKtxJyy6U52WWAqr/3wial47Sj4u3PiIF73AOGI27UyLy9duTkhZbgzodOfLV4TduZg==", "license": "Apache-2.0", "dependencies": { - "@smithy/protocol-http": "^5.3.14", - "@smithy/types": "^4.14.1", - "@smithy/url-parser": "^4.2.14", - "@smithy/util-base64": "^4.3.2", - "@smithy/util-body-length-browser": "^4.2.2", - "@smithy/util-middleware": "^4.2.14", - "@smithy/util-stream": "^4.5.25", - "@smithy/util-utf8": "^4.2.2", - "@smithy/uuid": "^1.1.2", + "@aws-crypto/crc32": "5.2.0", + "@smithy/types": "^4.14.2", "tslib": "^2.6.2" }, "engines": { @@ -3134,13 +3104,13 @@ } }, "node_modules/@smithy/credential-provider-imds": { - "version": "4.2.14", + "version": "4.3.3", + "resolved": "https://registry.npmjs.org/@smithy/credential-provider-imds/-/credential-provider-imds-4.3.3.tgz", + "integrity": 
"sha512-I2Bti0DKFo2IJyN28ijCsx51BAumEYR4/1yZ1FXyBygy9MqbnMqCev4JPth/MbpRfBSRAX35hITSnAdJRo1u5w==", "license": "Apache-2.0", "dependencies": { - "@smithy/node-config-provider": "^4.3.14", - "@smithy/property-provider": "^4.2.14", - "@smithy/types": "^4.14.1", - "@smithy/url-parser": "^4.2.14", + "@smithy/core": "^3.24.3", + "@smithy/types": "^4.14.2", "tslib": "^2.6.2" }, "engines": { @@ -3208,13 +3178,13 @@ } }, "node_modules/@smithy/fetch-http-handler": { - "version": "5.3.17", + "version": "5.4.3", + "resolved": "https://registry.npmjs.org/@smithy/fetch-http-handler/-/fetch-http-handler-5.4.3.tgz", + "integrity": "sha512-F+DRf8IJazRJgYog2A/yJK7eYVc0rqTlRzO+5ZxjJd4WkZoKz0IJRncf7G6t1pdVT3kryJcwuTFhN1c5m6N47A==", "license": "Apache-2.0", "dependencies": { - "@smithy/protocol-http": "^5.3.14", - "@smithy/querystring-builder": "^4.2.14", - "@smithy/types": "^4.14.1", - "@smithy/util-base64": "^4.3.2", + "@smithy/core": "^3.24.3", + "@smithy/types": "^4.14.2", "tslib": "^2.6.2" }, "engines": { @@ -3295,6 +3265,21 @@ "node": ">=18.0.0" } }, + "node_modules/@smithy/middleware-compression": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/@smithy/middleware-compression/-/middleware-compression-4.4.3.tgz", + "integrity": "sha512-IuZ+ebi3OteVFprY33vV7oLfZxRx0YACjoGhex59PX7+sHgG0f75wyb5FZuOZhJoQPnWaDD5piirEwWzyAmb3A==", + "license": "Apache-2.0", + "dependencies": { + "@smithy/core": "^3.24.3", + "@smithy/types": "^4.14.2", + "fflate": "0.8.1", + "tslib": "^2.6.2" + }, + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/@smithy/middleware-content-length": { "version": "4.2.14", "license": "Apache-2.0", @@ -3387,14 +3372,13 @@ } }, "node_modules/@smithy/node-http-handler": { - "version": "4.6.1", - "resolved": "https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-4.6.1.tgz", - "integrity": "sha512-iB+orM4x3xrr57X3YaXazfKnntl0LHlZB1kcXSGzMV1Tt0+YwEjGlbjk/44qEGtBzXAz6yFDzkYTKSV6Pj2HUg==", + "version": "4.7.3", + "resolved": 
"https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-4.7.3.tgz", + "integrity": "sha512-/jPhevcTFPMVl6KNjbaI47iOg1zxC7IsnX4PQDGVZKMFceOXtB8IEYaB7a9VvkP/3oC60WzTeKocvSI7vLT0vA==", "license": "Apache-2.0", "dependencies": { - "@smithy/protocol-http": "^5.3.14", - "@smithy/querystring-builder": "^4.2.14", - "@smithy/types": "^4.14.1", + "@smithy/core": "^3.24.3", + "@smithy/types": "^4.14.2", "tslib": "^2.6.2" }, "engines": { @@ -3470,16 +3454,13 @@ } }, "node_modules/@smithy/signature-v4": { - "version": "5.3.14", + "version": "5.4.3", + "resolved": "https://registry.npmjs.org/@smithy/signature-v4/-/signature-v4-5.4.3.tgz", + "integrity": "sha512-53+75QuPl6DL+ct6vVEB51FDO5oulXr20TPV46VvJZg76lIlXNWfxi8j+G2V/t0I2qxCBOa3vX/8bmjrpFVo9g==", "license": "Apache-2.0", "dependencies": { - "@smithy/is-array-buffer": "^4.2.2", - "@smithy/protocol-http": "^5.3.14", - "@smithy/types": "^4.14.1", - "@smithy/util-hex-encoding": "^4.2.2", - "@smithy/util-middleware": "^4.2.14", - "@smithy/util-uri-escape": "^4.2.2", - "@smithy/util-utf8": "^4.2.2", + "@smithy/core": "^3.24.3", + "@smithy/types": "^4.14.2", "tslib": "^2.6.2" }, "engines": { @@ -3505,7 +3486,9 @@ } }, "node_modules/@smithy/types": { - "version": "4.14.1", + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@smithy/types/-/types-4.14.2.tgz", + "integrity": "sha512-P+otAxbV4CqBybp7EkcJCrig63yE2E7PuNVOmilVMRcx/O+QDzGULTrKsq4DV13gSfak9ObPrWaHl/9bL5YcWw==", "license": "Apache-2.0", "dependencies": { "tslib": "^2.6.2" @@ -5842,9 +5825,9 @@ "license": "BSD-3-Clause" }, "node_modules/fast-xml-builder": { - "version": "1.1.7", - "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.7.tgz", - "integrity": "sha512-Yh7/7rQuMXICNr0oMYDR2yHP6oUvmQsTToFeOWj/kIDhAwQ+c4Ol/lbcwOmEM5OHYQmh6S6EQSQ1sljCKP36bQ==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.2.0.tgz", + "integrity": 
"sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==", "funding": [ { "type": "github", @@ -5853,13 +5836,14 @@ ], "license": "MIT", "dependencies": { - "path-expression-matcher": "^1.1.3" + "path-expression-matcher": "^1.5.0", + "xml-naming": "^0.1.0" } }, "node_modules/fast-xml-parser": { - "version": "5.7.2", - "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.2.tgz", - "integrity": "sha512-P7oW7tLbYnhOLQk/Gv7cZgzgMPP/XN03K02/Jy6Y/NHzyIAIpxuZIM/YqAkfiXFPxA2CTm7NtCijK9EDu09u2w==", + "version": "5.7.3", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.3.tgz", + "integrity": "sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==", "funding": [ { "type": "github", @@ -5869,7 +5853,7 @@ "license": "MIT", "dependencies": { "@nodable/entities": "^2.1.0", - "fast-xml-builder": "^1.1.5", + "fast-xml-builder": "^1.1.7", "path-expression-matcher": "^1.5.0", "strnum": "^2.2.3" }, @@ -5923,6 +5907,12 @@ "node": "^12.20 || >= 14.13" } }, + "node_modules/fflate": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.1.tgz", + "integrity": "sha512-/exOvEuc+/iaUm105QIiOt4LpBdMTWsXxqR0HDF35vx3fmaKzw7354gTilCh5rkzEt8WYyG//ku3h3nRmd7CHQ==", + "license": "MIT" + }, "node_modules/file-entry-cache": { "version": "8.0.0", "dev": true, @@ -7869,9 +7859,9 @@ "license": "MIT" }, "node_modules/strnum": { - "version": "2.2.3", - "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.2.3.tgz", - "integrity": "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg==", + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.3.0.tgz", + "integrity": "sha512-ums3KNd42PGyx5xaoVTO1mjU1bH3NpY4vsrVlnv9PNGqQj8wd7rJ6nEypLrJ7z5vxK5RP0yMLo6J/Gsm62DI5Q==", "funding": [ { "type": "github", @@ -8535,6 +8525,21 @@ } } }, + "node_modules/xml-naming": { + "version": 
"0.1.0", + "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz", + "integrity": "sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT", + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/xtend": { "version": "4.0.2", "dev": true, @@ -8595,8 +8600,10 @@ "name": "@strands-agents/strandly", "version": "0.0.1", "dependencies": { + "@aws-sdk/client-cloudwatch": "^3", "commander": "^14", - "tsx": "^4.21.0" + "tsx": "^4.21.0", + "zod": "^3.23" }, "bin": { "strandly": "src/cli.ts" @@ -8606,6 +8613,15 @@ "typescript": "^5.5.0" } }, + "strandly/node_modules/zod": { + "version": "3.25.76", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", + "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + }, "strands-dev": { "name": "@strands-agents/dev", "version": "0.0.1", @@ -8653,6 +8669,7 @@ "@opentelemetry/sdk-metrics": "^2.6.1", "@opentelemetry/sdk-trace-base": "^2.6.1", "@opentelemetry/sdk-trace-node": "^2.6.1", + "@smithy/types": "^4.0.0", "@types/express": "^5.0.6", "@types/node": "^25.6.0", "@types/uuid": "^11.0.0", @@ -8688,6 +8705,7 @@ "@opentelemetry/sdk-metrics": "^2.6.1", "@opentelemetry/sdk-trace-base": "^2.6.1", "@opentelemetry/sdk-trace-node": "^2.6.1", + "@smithy/types": "^4.0.0", "express": "^5.1.0", "openai": "^6.7.0", "zod": "^4.1.12" @@ -8729,6 +8747,9 @@ "@opentelemetry/sdk-trace-node": { "optional": true }, + "@smithy/types": { + "optional": true + }, "express": { "optional": true }, diff --git a/strandly/package.json b/strandly/package.json index 27baf2160..9c9b4f0c8 100644 --- a/strandly/package.json +++ b/strandly/package.json @@ -10,8 +10,10 @@ "type-check": "tsc --noEmit" }, "dependencies": { + 
"@aws-sdk/client-cloudwatch": "^3", "commander": "^14", - "tsx": "^4.21.0" + "tsx": "^4.21.0", + "zod": "^3.23" }, "devDependencies": { "@types/node": "^22", diff --git a/strandly/src/benchmark/README.md b/strandly/src/benchmark/README.md new file mode 100644 index 000000000..6e5b9bd32 --- /dev/null +++ b/strandly/src/benchmark/README.md @@ -0,0 +1,158 @@ +# strandly benchmark + +Benchmarks Strands agents against [ContextBench](https://github.com/EuniAI/ContextBench) — a code investigation benchmark that measures how well an agent finds relevant code for GitHub issues. + +## Quick Start + +```bash +# Run the default config (control) on the default task +AWS_REGION=us-east-1 strandly benchmark --suite contextbench + +# Run a specific built-in config +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --config offloader + +# Run with a custom agent file +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --agent-file ./my-agent.ts + +# Use a different model +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --model us.anthropic.claude-haiku-4-5-20251001-v1:0 + +# Fail if file coverage drops below 10% +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --min-coverage 0.1 + +# Save results to files +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --output results.json --output-md results.md + +# Emit metrics to CloudWatch +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --cloudwatch +``` + +## Prerequisites + +- **Node.js 20+** +- **Python 3.x** with `pyarrow`, `tree-sitter`, `tree-sitter-languages` +- **AWS credentials** configured (for Bedrock model access and optional CloudWatch) +- **`AWS_REGION`** set to a region with Bedrock access (e.g. `us-east-1`) + +Install Python deps: +```bash +pip install pyarrow tree-sitter 'tree-sitter-languages; python_version < "3.12"' +``` + +## Options + +| Flag | Description | +|------|-------------| +| `--suite ` | **(required)** Benchmark suite. 
Currently: `contextbench` | +| `--config <name>` | Run only this built-in config | +| `--agent-file <path>` | Path to a `.ts` file exporting a custom `BenchmarkConfig` | +| `--task <id>` | ContextBench task ID (default: `django__django-15987`) | +| `--model <id>` | Model ID for built-in configs (default: `us.anthropic.claude-sonnet-4-20250514-v1:0`) | +| `--min-coverage <n>` | Minimum file coverage (0-1). Exit 1 if below. | +| `--output <path>` | Write JSON results to file | +| `--output-md <path>` | Write markdown summary to file | +| `--cloudwatch` | Emit metrics to AWS CloudWatch | + +## Built-in Configs + +These will be updated once we have preset context management strategies. + +| Name | Strategy | Description | +|------|----------|-------------| +| `control` | SlidingWindow ws=40 | SDK default, no extras | +| `offloader` | ContextOffloader | Offloads tool results >2500 tokens, keeps 1000 token preview | +| `offloader-aggressive` | ContextOffloader | Offloads >500 tokens, keeps 200 token preview | +| `summarizing` | SummarizingConversationManager | Summarizes oldest 30% of messages, proactive at 70% context | +| `sliding-proactive` | SlidingWindow + proactive | Same ws=40 but proactively compresses at 70% context usage | +| `offloader-summarizing` | Offloader + Summarizing | Combined: offload large results + summarize old messages | + +## Custom Agent File + +Create a `.ts` file that exports a `BenchmarkConfig`: + +```typescript +import { Agent } from './strands-ts/src/agent/agent.js' +import { BedrockModel } from './strands-ts/src/models/bedrock.js' +import { bash } from './strands-ts/src/vended-tools/bash/bash.js' +import { ContextOffloader } from './strands-ts/src/vended-plugins/context-offloader/plugin.js' +import { InMemoryStorage } from './strands-ts/src/vended-plugins/context-offloader/storage.js' +import type { BenchmarkConfig } from './strandly/src/benchmark/types.js' + +const config: BenchmarkConfig = { + name: 'my-experiment', + description: 'Testing new offloading thresholds', + 
createAgent(task) { + return new Agent({ + model: new BedrockModel({ stream: false }), + tools: [bash], + plugins: [new ContextOffloader({ storage: new InMemoryStorage(), maxResultTokens: 1000, previewTokens: 500 })], + systemPrompt: task.prompt, + printer: false, + }) + }, +} + +export default config +``` + +Run it: +```bash +AWS_REGION=us-east-1 strandly benchmark --suite contextbench --agent-file ./my-experiment.ts +``` + +## Metrics + +Each run produces: + +| Metric | What it measures | +|--------|-----------------| +| **File Coverage** | Fraction of gold files the agent found (recall) | +| **File Precision** | Fraction of files the agent read that were relevant | +| **Symbol Coverage/Precision** | Same at function/class granularity | +| **Span Coverage/Precision** | Same at line-range granularity | +| **EditLoc Recall/Precision** | Did the agent find the exact edit locations? | +| **Input Tokens** | Total tokens consumed (cost proxy) | +| **Cycles** | Number of agent loop iterations | +| **Latency** | Wall-clock time | + +## CloudWatch + +Metrics go to namespace `StrandsSDK/Benchmarks` with dimensions `Config`, `Task`, `Branch`: + +- `FileCoverage` +- `FilePrecision` +- `TokenUsage` +- `CycleCount` +- `Latency` + +Requires AWS credentials with `cloudwatch:PutMetricData` permission. + +## How It Works + +1. Clones the ContextBench repo (cached at `.cache/contextbench/`) +2. Loads a task from their gold parquet files (issue + gold file/span annotations) +3. Clones the target repo at the correct commit +4. Creates a Strands agent with the selected config +5. Runs the agent — it uses `bash` to explore the repo and find relevant code +6. Extracts which files the agent read from its tool call history +7. Evaluates against ContextBench gold annotations (Python subprocess) +8. Reports results + +## Runtime + +~5-10 minutes per config per task. Running all 6 built-in configs takes ~50 minutes. 
+ +## Adding New Benchmark Suites + +The `--suite` flag supports multiple benchmarks. To add a new one, implement the `BenchmarkSuite` interface and register it in `index.ts`: + +```typescript +import type { BenchmarkSuite } from './types.js' + +const myBench: BenchmarkSuite = { + name: 'mybench', + async run(opts) { + // Load tasks, run agent, evaluate, return results + }, +} +``` diff --git a/strandly/src/benchmark/cloudwatch.ts b/strandly/src/benchmark/cloudwatch.ts new file mode 100644 index 000000000..4affb4717 --- /dev/null +++ b/strandly/src/benchmark/cloudwatch.ts @@ -0,0 +1,76 @@ +import type { BenchmarkSuiteResult } from './types.js' + +const NAMESPACE = 'StrandsSDK/Benchmarks' + +export async function emitMetrics(result: BenchmarkSuiteResult): Promise { + const { CloudWatchClient, PutMetricDataCommand } = await import('@aws-sdk/client-cloudwatch') + const client = new CloudWatchClient({}) + + const metricData = result.results + .filter((r) => !r.error) + .flatMap((r) => [ + { + MetricName: 'FileCoverage', + Value: r.evaluation.fileCoverage, + Unit: 'None' as const, + Dimensions: [ + { Name: 'Config', Value: r.config }, + { Name: 'Task', Value: r.task }, + { Name: 'Branch', Value: result.branch }, + ], + }, + { + MetricName: 'FilePrecision', + Value: r.evaluation.filePrecision, + Unit: 'None' as const, + Dimensions: [ + { Name: 'Config', Value: r.config }, + { Name: 'Task', Value: r.task }, + { Name: 'Branch', Value: result.branch }, + ], + }, + { + MetricName: 'TokenUsage', + Value: r.metrics.inputTokens + r.metrics.outputTokens, + Unit: 'Count' as const, + Dimensions: [ + { Name: 'Config', Value: r.config }, + { Name: 'Task', Value: r.task }, + { Name: 'Branch', Value: result.branch }, + ], + }, + { + MetricName: 'CycleCount', + Value: r.metrics.cycleCount, + Unit: 'Count' as const, + Dimensions: [ + { Name: 'Config', Value: r.config }, + { Name: 'Task', Value: r.task }, + { Name: 'Branch', Value: result.branch }, + ], + }, + { + MetricName: 'Latency', 
+ Value: r.metrics.latencyMs, + Unit: 'Milliseconds' as const, + Dimensions: [ + { Name: 'Config', Value: r.config }, + { Name: 'Task', Value: r.task }, + { Name: 'Branch', Value: result.branch }, + ], + }, + ]) + + // CloudWatch accepts max 1000 metric data points per request + for (let i = 0; i < metricData.length; i += 1000) { + const batch = metricData.slice(i, i + 1000) + await client.send( + new PutMetricDataCommand({ + Namespace: NAMESPACE, + MetricData: batch, + }) + ) + } + + console.log(`Emitted ${metricData.length} metrics to CloudWatch namespace: ${NAMESPACE}`) +} diff --git a/strandly/src/benchmark/configs.ts b/strandly/src/benchmark/configs.ts new file mode 100644 index 000000000..d1ff9e438 --- /dev/null +++ b/strandly/src/benchmark/configs.ts @@ -0,0 +1,105 @@ +import { Agent } from '../../../strands-ts/src/agent/agent.js' +import { BedrockModel } from '../../../strands-ts/src/models/bedrock.js' +import { bash } from '../../../strands-ts/src/vended-tools/bash/bash.js' +import { SlidingWindowConversationManager } from '../../../strands-ts/src/conversation-manager/sliding-window-conversation-manager.js' +import { SummarizingConversationManager } from '../../../strands-ts/src/conversation-manager/summarizing-conversation-manager.js' +import { ContextOffloader } from '../../../strands-ts/src/vended-plugins/context-offloader/plugin.js' +import { InMemoryStorage } from '../../../strands-ts/src/vended-plugins/context-offloader/storage.js' +import type { BenchmarkConfig, ContextBenchTask } from './types.js' + +const DEFAULT_MODEL = 'us.anthropic.claude-sonnet-4-20250514-v1:0' + +// TODO: Update these configs once we have preset context management strategies +export function getConfigs(modelId?: string): BenchmarkConfig[] { + const model = modelId ?? 
DEFAULT_MODEL + + return [ + { + name: 'control', + description: `SDK default (SlidingWindow ws=40, ${model})`, + createAgent(task: ContextBenchTask): Agent { + return new Agent({ + model: new BedrockModel({ modelId: model, stream: false }), + tools: [bash], + systemPrompt: task.prompt, + printer: false, + }) + }, + }, + { + name: 'offloader', + description: `Context offloading (maxResult=2500, preview=1000, ${model})`, + createAgent(task: ContextBenchTask): Agent { + return new Agent({ + model: new BedrockModel({ modelId: model, stream: false }), + tools: [bash], + plugins: [new ContextOffloader({ storage: new InMemoryStorage() })], + systemPrompt: task.prompt, + printer: false, + }) + }, + }, + { + name: 'offloader-aggressive', + description: `Aggressive offloading (maxResult=500, preview=200, ${model})`, + createAgent(task: ContextBenchTask): Agent { + return new Agent({ + model: new BedrockModel({ modelId: model, stream: false }), + tools: [bash], + plugins: [new ContextOffloader({ storage: new InMemoryStorage(), maxResultTokens: 500, previewTokens: 200 })], + systemPrompt: task.prompt, + printer: false, + }) + }, + }, + { + name: 'summarizing', + description: `Summarizing conversation manager (ratio=0.3, proactive, ${model})`, + createAgent(task: ContextBenchTask): Agent { + return new Agent({ + model: new BedrockModel({ modelId: model, stream: false }), + tools: [bash], + conversationManager: new SummarizingConversationManager({ + summaryRatio: 0.3, + proactiveCompression: true, + }), + systemPrompt: task.prompt, + printer: false, + }) + }, + }, + { + name: 'sliding-proactive', + description: `Sliding window (ws=40) with proactive compression (${model})`, + createAgent(task: ContextBenchTask): Agent { + return new Agent({ + model: new BedrockModel({ modelId: model, stream: false }), + tools: [bash], + conversationManager: new SlidingWindowConversationManager({ + windowSize: 40, + proactiveCompression: true, + }), + systemPrompt: task.prompt, + printer: false, 
+ }) + }, + }, + { + name: 'offloader-summarizing', + description: `Offloading + summarizing combined (${model})`, + createAgent(task: ContextBenchTask): Agent { + return new Agent({ + model: new BedrockModel({ modelId: model, stream: false }), + tools: [bash], + plugins: [new ContextOffloader({ storage: new InMemoryStorage() })], + conversationManager: new SummarizingConversationManager({ + summaryRatio: 0.3, + proactiveCompression: true, + }), + systemPrompt: task.prompt, + printer: false, + }) + }, + }, + ] +} diff --git a/strandly/src/benchmark/contextbench/loader.ts b/strandly/src/benchmark/contextbench/loader.ts new file mode 100644 index 000000000..0fa14d529 --- /dev/null +++ b/strandly/src/benchmark/contextbench/loader.ts @@ -0,0 +1,176 @@ +import { execSync } from 'node:child_process' +import { existsSync, writeFileSync, mkdirSync } from 'node:fs' +import { join, resolve } from 'node:path' +import type { ContextBenchTask, GoldAnnotation } from '../types.js' + +const CACHE_DIR = resolve(import.meta.dirname, '../../../../.cache/contextbench') +const CONTEXTBENCH_REPO = 'https://github.com/EuniAI/ContextBench.git' + +export function ensureContextBenchCloned(): string { + const repoDir = join(CACHE_DIR, 'contextbench-repo') + + if (!existsSync(join(repoDir, '.git'))) { + mkdirSync(CACHE_DIR, { recursive: true }) + console.log('Cloning ContextBench repository...') + execSync(`git clone --depth 1 ${CONTEXTBENCH_REPO} ${repoDir}`, { stdio: 'inherit' }) + } + + return repoDir +} + +export function ensureDependencies(): void { + try { + execSync('python3 -c "import pyarrow; import tree_sitter"', { stdio: 'pipe' }) + } catch { + console.error( + 'Missing Python dependencies for ContextBench evaluation.\n' + + 'Install with: pip install pyarrow tree-sitter tree-sitter-languages datasets' + ) + process.exit(1) + } +} + +export function loadTask(taskId: string): ContextBenchTask { + const contextbenchDir = ensureContextBenchCloned() + const goldParquet = 
join(contextbenchDir, 'data', 'contextbench_verified.parquet') + + if (!existsSync(goldParquet)) { + throw new Error(`Gold data not found at ${goldParquet}`) + } + + const tmp = join(CACHE_DIR, 'tmp') + mkdirSync(tmp, { recursive: true }) + + const scriptFile = join(tmp, 'load_task.py') + const parquetPath = JSON.stringify(goldParquet) + const taskIdStr = JSON.stringify(taskId) + writeFileSync( + scriptFile, +`import pyarrow.parquet as pq, json, sys + +df = pq.read_table(${parquetPath}).to_pandas() +task_id = ${taskIdStr} + +row = df[df["original_inst_id"] == task_id] +if row.empty: + row = df[df["instance_id"].str.contains(task_id)] +if row.empty: + print(json.dumps({"error": "Task not found: " + task_id})) + sys.exit(0) + +r = row.iloc[0] +print(json.dumps({ + "instance_id": str(r["instance_id"]), + "original_inst_id": str(r["original_inst_id"]), + "repo": str(r["repo"]), + "repo_url": str(r["repo_url"]), + "base_commit": str(r["base_commit"]), + "problem_statement": str(r["problem_statement"]), + "gold_context": str(r["gold_context"]), +})) +`) + + const output = execSync(`python3 ${scriptFile}`, { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }) + const data = JSON.parse(output.trim()) + + if (data.error) { + throw new Error(data.error) + } + + const goldContext: Array<{ file: string; start_line?: number; end_line?: number }> = JSON.parse( + data.gold_context + ) + + return { + id: data.original_inst_id, + repo: data.repo, + issue: extractIssueNumber(data.original_inst_id), + baseCommit: data.base_commit, + prompt: buildPrompt(data.problem_statement, data.repo), + goldAnnotations: parseGoldContext(goldContext), + } +} + +export function ensureRepoCloned(task: ContextBenchTask): string { + if (!/^[0-9a-f]+$/i.test(task.baseCommit)) { + throw new Error(`Invalid base commit: ${task.baseCommit}`) + } + if (!/^[a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+$/.test(task.repo)) { + throw new Error(`Invalid repo format: ${task.repo}`) + } + + const repoDir = join(CACHE_DIR, 
'repos', task.repo.replace('/', '__')) + + if (!existsSync(join(repoDir, '.git'))) { + mkdirSync(join(CACHE_DIR, 'repos'), { recursive: true }) + console.log(` Cloning ${task.repo}...`) + execSync(`git clone --depth 100 https://github.com/${task.repo}.git ${repoDir}`, { + stdio: 'inherit', + }) + } + + execSync(`git checkout ${task.baseCommit} 2>/dev/null || git fetch --depth 100 origin ${task.baseCommit} && git checkout ${task.baseCommit}`, { + cwd: repoDir, + stdio: 'pipe', + }) + + return repoDir +} + +export function listTasks(): string[] { + const contextbenchDir = ensureContextBenchCloned() + const goldParquet = join(contextbenchDir, 'data', 'contextbench_verified.parquet') + + const tmp = join(CACHE_DIR, 'tmp') + mkdirSync(tmp, { recursive: true }) + + const scriptFile = join(tmp, 'list_tasks.py') + const parquetPath = JSON.stringify(goldParquet) + writeFileSync( + scriptFile, +`import pyarrow.parquet as pq, json + +t = pq.read_table(${parquetPath}, columns=["original_inst_id"]) +ids = t.column("original_inst_id").to_pylist() +print(json.dumps(ids[:20])) +`) + + const output = execSync(`python3 ${scriptFile}`, { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }) + return JSON.parse(output.trim()) +} + +function buildPrompt(problemStatement: string, repo: string): string { + return `You are a code investigation agent. Your task is to find all relevant code locations for the following GitHub issue in the ${repo} repository. + +## Issue + +${problemStatement} + +Investigate the repository to find all files, functions, and code spans relevant to this issue. Use the available tools to read files and search the codebase. Be thorough — find all relevant locations, not just the first match. 
+ +When you are done, list all the files and specific line ranges you found to be relevant.` +} + +function parseGoldContext( + goldContext: Array<{ file: string; start_line?: number; end_line?: number }> +): GoldAnnotation[] { + const byFile = new Map() + + for (const entry of goldContext) { + let annotation = byFile.get(entry.file) + if (!annotation) { + annotation = { file: entry.file, spans: [] } + byFile.set(entry.file, annotation) + } + if (entry.start_line != null && entry.end_line != null) { + annotation.spans!.push({ startLine: entry.start_line, endLine: entry.end_line }) + } + } + + return [...byFile.values()] +} + +function extractIssueNumber(instanceId: string): number { + const match = instanceId.match(/-(\d+)$/) + return match ? parseInt(match[1]!, 10) : 0 +} diff --git a/strandly/src/benchmark/contextbench/trajectory.ts b/strandly/src/benchmark/contextbench/trajectory.ts new file mode 100644 index 000000000..b12a614a1 --- /dev/null +++ b/strandly/src/benchmark/contextbench/trajectory.ts @@ -0,0 +1,81 @@ +import type { Message } from '../../../../strands-ts/src/types/messages.js' + +export interface TrajectoryEntry { + file: string + content?: string + startLine?: number + endLine?: number +} + +export function extractTrajectory(messages: Message[], repoDir?: string): TrajectoryEntry[] { + const entries: TrajectoryEntry[] = [] + const seen = new Set() + + for (const msg of messages) { + if (msg.role !== 'assistant') continue + + for (const block of msg.content) { + if (block.type !== 'toolUseBlock') continue + + const paths = extractFilePathsFromToolCall(block.name, block.input as Record) + for (const filePath of paths) { + const relative = toRelativePath(filePath, repoDir) + if (relative && !seen.has(relative)) { + seen.add(relative) + entries.push({ file: relative }) + } + } + } + } + + return entries +} + +export function trajectoryToFileList(entries: TrajectoryEntry[]): string[] { + return entries.map((e) => e.file) +} + +function 
toRelativePath(filePath: string, repoDir?: string): string { + if (!repoDir) return filePath + + const normalized = repoDir.endsWith('/') ? repoDir : repoDir + '/' + if (filePath.startsWith(normalized)) { + return filePath.slice(normalized.length) + } + if (filePath.startsWith('/')) { + return filePath + } + return filePath +} + +function extractFilePathsFromToolCall(toolName: string, input: Record): string[] { + if (toolName === 'file_editor' || toolName === 'fileEditor') { + const path = (input.path ?? input.file_path ?? input.filePath) as string | undefined + if (path && (input.command === 'view' || !input.command)) return [path] + } + + if (toolName === 'bash') { + const cmd = (input.command ?? input.cmd) as string | undefined + if (!cmd) return [] + + const paths: string[] = [] + + const catMatch = cmd.match(/(?:cat|head|tail|less|more)\s+([^\s|;>&]+)/g) + if (catMatch) { + for (const m of catMatch) { + const file = m.replace(/^(?:cat|head|tail|less|more)\s+/, '') + if (file && !file.startsWith('-')) paths.push(file) + } + } + + const sedMatch = cmd.match(/sed\s+.*\s+([^\s|;>&]+\.\w+)/) + if (sedMatch) paths.push(sedMatch[1]!) + + const pythonCat = cmd.match(/python.*(?:open|read)\s*\(\s*['"]([^'"]+)['"]\s*\)/) + if (pythonCat) paths.push(pythonCat[1]!) 
+ + return paths + } + + return [] +} diff --git a/strandly/src/benchmark/evaluator.ts b/strandly/src/benchmark/evaluator.ts new file mode 100644 index 000000000..93ccb5bda --- /dev/null +++ b/strandly/src/benchmark/evaluator.ts @@ -0,0 +1,111 @@ +import { execSync } from 'node:child_process' +import { writeFileSync, mkdtempSync } from 'node:fs' +import { join } from 'node:path' +import { tmpdir } from 'node:os' +import type { ContextBenchTask, EvaluationMetrics } from './types.js' +import { ensureContextBenchCloned } from './contextbench/loader.js' + +export async function evaluate( + task: ContextBenchTask, + filesRead: string[], + spans?: Record> +): Promise { + const contextbenchDir = ensureContextBenchCloned() + const goldParquet = join(contextbenchDir, 'data', 'contextbench_verified.parquet') + const tmp = mkdtempSync(join(tmpdir(), 'bench-eval-')) + + const predData = { + instance_id: task.id, + original_inst_id: task.id, + repo_url: `https://github.com/${task.repo}.git`, + commit: task.baseCommit, + traj_data: { + pred_steps: [{ files: filesRead, spans: spans ?? {}, symbols: {} }], + pred_files: filesRead, + pred_spans: spans ?? 
{}, + pred_symbols: {}, + }, + } + + const predFile = join(tmp, 'pred.jsonl') + writeFileSync(predFile, JSON.stringify(predData) + '\n') + + const scriptFile = join(tmp, 'evaluate.py') + const cbDir = JSON.stringify(contextbenchDir) + const goldPath = JSON.stringify(goldParquet) + const predPath = JSON.stringify(predFile) + const reposDir = JSON.stringify(join(tmp, 'repos')) + writeFileSync( + scriptFile, +`import sys, os, io, json + +sys.path.insert(0, ${cbDir}) +os.environ["PYTHONDONTWRITEBYTECODE"] = "1" + +_real_stdout = sys.stdout +sys.stdout = io.StringIO() +sys.stderr = open(os.devnull, "w") + +from contextbench.evaluate import evaluate_instance +from contextbench.parsers import GoldLoader + +gold_loader = GoldLoader(${goldPath}) +pred_data = json.loads(open(${predPath}).readline()) +instance_id = pred_data["instance_id"] +original_id = pred_data.get("original_inst_id", instance_id) + +gold = gold_loader.get(instance_id) or gold_loader.get(original_id) +if not gold: + sys.stdout = _real_stdout + print(json.dumps({"error": "no_gold_found"})) + sys.exit(0) + +result = evaluate_instance(instance_id, gold, pred_data, ${reposDir}) +sys.stdout = _real_stdout +print(json.dumps(result, default=str)) +`) + + const output = execSync(`python3 ${scriptFile}`, { + encoding: 'utf-8', + stdio: ['pipe', 'pipe', 'pipe'], + timeout: 120_000, + }) + + const result = JSON.parse(output.trim()) + + if (result.error) { + console.warn(` Evaluation error: ${result.error}`) + return emptyMetrics() + } + + return extractMetrics(result) +} + +function extractMetrics(result: Record): EvaluationMetrics { + const final = (result.final ?? {}) as Record> + const editloc = (result.editloc ?? {}) as Record + + return { + fileCoverage: final.file?.coverage ?? 0, + filePrecision: final.file?.precision ?? 0, + symbolCoverage: final.symbol?.coverage ?? 0, + symbolPrecision: final.symbol?.precision ?? 0, + spanCoverage: final.span?.coverage ?? 0, + spanPrecision: final.span?.precision ?? 
0, + editLocRecall: editloc.recall ?? 0, + editLocPrecision: editloc.precision ?? 0, + } +} + +function emptyMetrics(): EvaluationMetrics { + return { + fileCoverage: 0, + filePrecision: 0, + symbolCoverage: 0, + symbolPrecision: 0, + spanCoverage: 0, + spanPrecision: 0, + editLocRecall: 0, + editLocPrecision: 0, + } +} diff --git a/strandly/src/benchmark/index.ts b/strandly/src/benchmark/index.ts new file mode 100644 index 000000000..5bb4f21a8 --- /dev/null +++ b/strandly/src/benchmark/index.ts @@ -0,0 +1,145 @@ +import { execSync } from 'node:child_process' +import { resolve } from 'node:path' +import { pathToFileURL } from 'node:url' +import type { BenchmarkConfig, BenchmarkRunOpts, BenchmarkSuite, BenchmarkSuiteResult } from './types.js' +import { getConfigs } from './configs.js' +import { loadTask, ensureDependencies } from './contextbench/loader.js' +import { runBenchmark } from './runner.js' +import { writeResults, generateMarkdown } from './reporter.js' +import { emitMetrics } from './cloudwatch.js' + +const ROOT = resolve(import.meta.dirname, '../../..') + +const DEFAULT_TASK = 'django__django-15987' + +async function loadCustomConfig(agentFile: string): Promise { + const absPath = resolve(agentFile) + const module = (await import(pathToFileURL(absPath).href)) as { default?: BenchmarkConfig; config?: BenchmarkConfig } + const config = module.default ?? module.config + if (!config || typeof config.createAgent !== 'function') { + throw new Error( + `Agent file must export a BenchmarkConfig (with name, description, createAgent). Got: ${Object.keys(module).join(', ')}` + ) + } + return config +} + +const contextbench: BenchmarkSuite = { + name: 'contextbench', + async run(opts: BenchmarkRunOpts): Promise { + ensureDependencies() + + const taskId = opts.task ?? 
DEFAULT_TASK + console.log(`Loading task: ${taskId}`) + const task = loadTask(taskId) + console.log(` Repo: ${task.repo}, commit: ${task.baseCommit.slice(0, 12)}`) + + const configs = getConfigs(opts.model) + let selectedConfigs: BenchmarkConfig[] + + if (opts.agentFile) { + const custom = await loadCustomConfig(opts.agentFile) + console.log(`Using custom agent: ${custom.name}`) + selectedConfigs = [custom] + } else if (opts.config) { + selectedConfigs = configs.filter((c) => c.name === opts.config) + } else { + selectedConfigs = configs + } + + if (selectedConfigs.length === 0) { + const available = configs.map((c) => c.name).join(', ') + throw new Error(`Unknown config "${opts.config}". Available: ${available}`) + } + + const results = [] + for (const config of selectedConfigs) { + console.log(`\nRunning config: ${config.name} (${config.description})`) + const result = await runBenchmark(config, task) + results.push(result) + + if (result.error) { + console.log(` ✗ ${config.name}: ERROR — ${result.error}`) + } else { + console.log( + ` ✓ ${config.name}: coverage=${(result.evaluation.fileCoverage * 100).toFixed(0)}% ` + + `precision=${(result.evaluation.filePrecision * 100).toFixed(1)}% ` + + `tokens=${(result.metrics.inputTokens / 1000).toFixed(0)}K ` + + `cycles=${result.metrics.cycleCount}` + ) + } + } + + const gitSha = execSync('git rev-parse HEAD', { cwd: ROOT, encoding: 'utf-8' }).trim() + const branch = execSync('git rev-parse --abbrev-ref HEAD', { cwd: ROOT, encoding: 'utf-8' }).trim() + + return { + suite: 'contextbench', + timestamp: new Date().toISOString(), + gitSha, + branch, + results, + } + }, +} + +const suites: Record = { contextbench } + +export interface BenchmarkOpts { + suite: string + config?: string + agentFile?: string + task?: string + model?: string + minCoverage?: number + output?: string + outputMd?: string + cloudwatch?: boolean +} + +export async function benchmark(opts: BenchmarkOpts): Promise { + const suite = suites[opts.suite] + 
if (!suite) { + const available = Object.keys(suites).join(', ') + console.error(`Unknown benchmark suite: "${opts.suite}". Available: ${available}`) + process.exit(1) + } + + console.log(`\nRunning benchmark suite: ${suite.name}\n`) + + const result = await suite.run({ config: opts.config, agentFile: opts.agentFile, task: opts.task, model: opts.model }) + + writeResults(result, { output: opts.output, outputMd: opts.outputMd }) + + if (!opts.output && !opts.outputMd) { + console.log('\n' + generateMarkdown(result)) + } + + if (opts.cloudwatch) { + await emitMetrics(result) + } + + const failed = result.results.filter((r) => r.error) + if (failed.length > 0) { + console.error(`\n${failed.length} benchmark(s) errored.`) + process.exit(1) + } + + if (opts.minCoverage != null) { + const belowThreshold = result.results.filter( + (r) => !r.error && r.evaluation.fileCoverage < opts.minCoverage! + ) + if (belowThreshold.length > 0) { + console.error( + `\nFAILED: ${belowThreshold.length} config(s) below minimum coverage of ${(opts.minCoverage * 100).toFixed(0)}%:` + ) + for (const r of belowThreshold) { + console.error(` ${r.config}: ${(r.evaluation.fileCoverage * 100).toFixed(1)}%`) + } + process.exit(1) + } + console.log(`\nAll configs above minimum coverage threshold (${(opts.minCoverage * 100).toFixed(0)}%)`) + } + + process.exit(0) +} diff --git a/strandly/src/benchmark/reporter.ts b/strandly/src/benchmark/reporter.ts new file mode 100644 index 000000000..f7ab60ee9 --- /dev/null +++ b/strandly/src/benchmark/reporter.ts @@ -0,0 +1,58 @@ +import { writeFileSync } from 'node:fs' +import type { BenchmarkSuiteResult } from './types.js' + +export function generateMarkdown(result: BenchmarkSuiteResult): string { + const passed = result.results.filter((r) => !r.error) + const failed = result.results.filter((r) => r.error) + + let md = `## Benchmark Results: ${result.suite}\n\n` + md += `**${passed.length}/${result.results.length}** configs completed` + if (failed.length > 0) 
md += ` | ${failed.length} errored` + md += `\n\n` + + for (const r of passed) { + const tokens = formatTokens(r.metrics.inputTokens + r.metrics.outputTokens) + const coverage = (r.evaluation.fileCoverage * 100).toFixed(0) + const precision = (r.evaluation.filePrecision * 100).toFixed(1) + + md += `
\n` + md += `${r.config}: File Coverage ${coverage}% | Precision ${precision}% | ${tokens} tokens | ${r.metrics.cycleCount} cycles\n\n` + md += `| Metric | Coverage | Precision |\n` + md += `|--------|----------|----------|\n` + md += `| File | ${r.evaluation.fileCoverage.toFixed(3)} | ${r.evaluation.filePrecision.toFixed(3)} |\n` + md += `| Symbol | ${r.evaluation.symbolCoverage.toFixed(3)} | ${r.evaluation.symbolPrecision.toFixed(3)} |\n` + md += `| Span | ${r.evaluation.spanCoverage.toFixed(3)} | ${r.evaluation.spanPrecision.toFixed(3)} |\n` + md += `| EditLoc | ${r.evaluation.editLocRecall.toFixed(3)} (recall) | ${r.evaluation.editLocPrecision.toFixed(3)} |\n\n` + md += `**Metrics:** ${r.metrics.inputTokens.toLocaleString()} input tokens, ${r.metrics.outputTokens.toLocaleString()} output tokens, ${(r.metrics.latencyMs / 1000).toFixed(1)}s\n\n` + md += `**Files read:** ${r.trajectory.length}\n\n` + md += `
\n\n` + } + + if (failed.length > 0) { + md += `
\nErrors (${failed.length})\n\n` + for (const r of failed) { + md += `- **${r.config}**: ${r.error}\n` + } + md += `\n
\n` + } + + md += `\n---\n*Run at ${result.timestamp} on \`${result.branch}\` (${result.gitSha.slice(0, 7)})*\n` + return md +} + +export function writeResults(result: BenchmarkSuiteResult, opts: { output?: string; outputMd?: string }): void { + if (opts.output) { + writeFileSync(opts.output, JSON.stringify(result, null, 2)) + console.log(`JSON results written to: ${opts.output}`) + } + if (opts.outputMd) { + writeFileSync(opts.outputMd, generateMarkdown(result)) + console.log(`Markdown summary written to: ${opts.outputMd}`) + } +} + +function formatTokens(tokens: number): string { + if (tokens >= 1_000_000) return `${(tokens / 1_000_000).toFixed(1)}M` + if (tokens >= 1_000) return `${(tokens / 1_000).toFixed(0)}K` + return String(tokens) +} diff --git a/strandly/src/benchmark/runner.ts b/strandly/src/benchmark/runner.ts new file mode 100644 index 000000000..ff35cfd16 --- /dev/null +++ b/strandly/src/benchmark/runner.ts @@ -0,0 +1,75 @@ +import type { BenchmarkConfig, BenchmarkResult, ContextBenchTask } from './types.js' +import { extractTrajectory, trajectoryToFileList } from './contextbench/trajectory.js' +import { ensureRepoCloned } from './contextbench/loader.js' +import { evaluate } from './evaluator.js' + +export async function runBenchmark(config: BenchmarkConfig, task: ContextBenchTask): Promise { + const repoDir = ensureRepoCloned(task) + console.log(` Repo at: ${repoDir}`) + + const startTime = performance.now() + const heartbeat = setInterval(() => { + const elapsed = ((performance.now() - startTime) / 1000).toFixed(0) + process.stdout.write(`\r running... 
${elapsed}s elapsed`) + }, 5_000) + + try { + const agent = config.createAgent(task) + + const result = await Promise.race([ + agent.invoke( + `The repository is cloned at: ${repoDir}\n\nInvestigate the issue and find all relevant code locations.` + ), + new Promise((_, reject) => + setTimeout(() => reject(new Error('Benchmark timed out after 10 minutes')), 600_000) + ), + ]) + + const latencyMs = performance.now() - startTime + const trajectory = extractTrajectory(agent.messages, repoDir) + const fileList = trajectoryToFileList(trajectory) + + const evaluation = await evaluate(task, fileList) + + const usage = result.metrics?.accumulatedUsage + return { + config: config.name, + task: task.id, + metrics: { + inputTokens: usage?.inputTokens ?? 0, + outputTokens: usage?.outputTokens ?? 0, + cycleCount: result.metrics?.cycleCount ?? 0, + latencyMs: Math.round(latencyMs), + }, + evaluation, + trajectory: fileList, + } + } catch (err) { + const latencyMs = performance.now() - startTime + return { + config: config.name, + task: task.id, + metrics: { + inputTokens: 0, + outputTokens: 0, + cycleCount: 0, + latencyMs: Math.round(latencyMs), + }, + evaluation: { + fileCoverage: 0, + filePrecision: 0, + symbolCoverage: 0, + symbolPrecision: 0, + spanCoverage: 0, + spanPrecision: 0, + editLocRecall: 0, + editLocPrecision: 0, + }, + trajectory: [], + error: err instanceof Error ? 
err.message : String(err), + } + } finally { + clearInterval(heartbeat) + process.stdout.write('\n') + } +} diff --git a/strandly/src/benchmark/types.ts b/strandly/src/benchmark/types.ts new file mode 100644 index 000000000..dc33f3597 --- /dev/null +++ b/strandly/src/benchmark/types.ts @@ -0,0 +1,67 @@ +import type { Agent } from '../../../strands-ts/src/agent/agent.js' + +export interface BenchmarkSuite { + name: string + run(opts: BenchmarkRunOpts): Promise +} + +export interface BenchmarkRunOpts { + config?: string + agentFile?: string + task?: string + model?: string +} + +export interface BenchmarkConfig { + name: string + description: string + createAgent(task: ContextBenchTask): Agent +} + +export interface ContextBenchTask { + id: string + repo: string + issue: number + baseCommit: string + prompt: string + goldAnnotations: GoldAnnotation[] +} + +export interface GoldAnnotation { + file: string + symbols?: string[] + spans?: { startLine: number; endLine: number }[] +} + +export interface EvaluationMetrics { + fileCoverage: number + filePrecision: number + symbolCoverage: number + symbolPrecision: number + spanCoverage: number + spanPrecision: number + editLocRecall: number + editLocPrecision: number +} + +export interface BenchmarkResult { + config: string + task: string + metrics: { + inputTokens: number + outputTokens: number + cycleCount: number + latencyMs: number + } + evaluation: EvaluationMetrics + trajectory: string[] + error?: string +} + +export interface BenchmarkSuiteResult { + suite: string + timestamp: string + gitSha: string + branch: string + results: BenchmarkResult[] +} diff --git a/strandly/src/cli.ts b/strandly/src/cli.ts index 02cd1da42..6ef3a6fa9 100755 --- a/strandly/src/cli.ts +++ b/strandly/src/cli.ts @@ -151,6 +151,23 @@ program } }) +program + .command('benchmark') + .description('Run agent benchmarks') + .requiredOption('--suite ', 'Benchmark suite to run (contextbench)') + .option('--config ', 'Run specific config only') + 
.option('--agent-file ', 'Path to a .ts file exporting a BenchmarkConfig') + .option('--task ', 'Task ID within the suite') + .option('--model ', 'Model ID for built-in configs (default: us.anthropic.claude-sonnet-4-20250514-v1:0)') + .option('--min-coverage ', 'Minimum file coverage (0-1). Fails if below this.') + .option('--output ', 'Write JSON results to file') + .option('--output-md ', 'Write markdown summary to file') + .option('--cloudwatch', 'Emit metrics to CloudWatch') + .action(async (opts) => { + const { benchmark } = await import('./benchmark/index.js') + await benchmark({ ...opts, minCoverage: opts.minCoverage ? parseFloat(opts.minCoverage) : undefined, model: opts.model }) + }) + program.parse() function run(cmd: string, opts?: { cwd?: string; env?: Record }): void { From cb20a77bcb3fa0e5427b1f481bb3fd5fef33fe44 Mon Sep 17 00:00:00 2001 From: Liz <91279165+lizradway@users.noreply.github.com> Date: Thu, 21 May 2026 14:38:38 -0400 Subject: [PATCH 2/2] update from comments --- strandly/package.json | 3 +-- strandly/src/benchmark/contextbench/loader.ts | 3 +-- strandly/src/benchmark/contextbench/trajectory.ts | 1 + strandly/src/benchmark/runner.ts | 10 ++++++---- strandly/src/cli.ts | 7 ++++++- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/strandly/package.json b/strandly/package.json index 9c9b4f0c8..c4223c920 100644 --- a/strandly/package.json +++ b/strandly/package.json @@ -12,8 +12,7 @@ "dependencies": { "@aws-sdk/client-cloudwatch": "^3", "commander": "^14", - "tsx": "^4.21.0", - "zod": "^3.23" + "tsx": "^4.21.0" }, "devDependencies": { "@types/node": "^22", diff --git a/strandly/src/benchmark/contextbench/loader.ts b/strandly/src/benchmark/contextbench/loader.ts index 0fa14d529..88636b651 100644 --- a/strandly/src/benchmark/contextbench/loader.ts +++ b/strandly/src/benchmark/contextbench/loader.ts @@ -22,11 +22,10 @@ export function ensureDependencies(): void { try { execSync('python3 -c "import pyarrow; import tree_sitter"', { stdio: 
'pipe' }) } catch { - console.error( + throw new Error( 'Missing Python dependencies for ContextBench evaluation.\n' + 'Install with: pip install pyarrow tree-sitter tree-sitter-languages datasets' ) - process.exit(1) } } diff --git a/strandly/src/benchmark/contextbench/trajectory.ts b/strandly/src/benchmark/contextbench/trajectory.ts index b12a614a1..d8d9e38cc 100644 --- a/strandly/src/benchmark/contextbench/trajectory.ts +++ b/strandly/src/benchmark/contextbench/trajectory.ts @@ -48,6 +48,7 @@ function toRelativePath(filePath: string, repoDir?: string): string { return filePath } +// Best-effort extraction — won't catch all patterns (e.g. grep -rn, find -exec, piped commands, quoted paths) function extractFilePathsFromToolCall(toolName: string, input: Record): string[] { if (toolName === 'file_editor' || toolName === 'fileEditor') { const path = (input.path ?? input.file_path ?? input.filePath) as string | undefined diff --git a/strandly/src/benchmark/runner.ts b/strandly/src/benchmark/runner.ts index ff35cfd16..0c4cf8019 100644 --- a/strandly/src/benchmark/runner.ts +++ b/strandly/src/benchmark/runner.ts @@ -16,14 +16,16 @@ export async function runBenchmark(config: BenchmarkConfig, task: ContextBenchTa try { const agent = config.createAgent(task) + // Cancel the timer however the race settles, so a rejected invoke() cannot leave it pending for 10 minutes + let timeoutId: ReturnType<typeof setTimeout> | undefined const result = await Promise.race([ agent.invoke( `The repository is cloned at: ${repoDir}\n\nInvestigate the issue and find all relevant code locations.` ), - new Promise((_, reject) => - setTimeout(() => reject(new Error('Benchmark timed out after 10 minutes')), 600_000) - ), + new Promise<never>((_, reject) => { + timeoutId = setTimeout(() => reject(new Error('Benchmark timed out after 10 minutes')), 600_000) + }), - ]) + ]).finally(() => clearTimeout(timeoutId)) 
const latencyMs = performance.now() - startTime const trajectory = extractTrajectory(agent.messages, repoDir) diff --git a/strandly/src/cli.ts b/strandly/src/cli.ts index 6ef3a6fa9..98d35b4fd 100755 --- a/strandly/src/cli.ts +++ b/strandly/src/cli.ts @@ -164,8 +164,13 @@ program .option('--output-md ', 'Write markdown summary to file') .option('--cloudwatch', 'Emit metrics to CloudWatch') .action(async (opts) => { + const minCoverage = opts.minCoverage !== undefined ? parseFloat(opts.minCoverage) : undefined // '' parses to NaN and is rejected below instead of silently disabling the gate + if (minCoverage !== undefined && (Number.isNaN(minCoverage) || minCoverage < 0 || minCoverage > 1)) { + console.error('--min-coverage must be a number between 0 and 1') + process.exit(1) + } const { benchmark } = await import('./benchmark/index.js') - await benchmark({ ...opts, minCoverage: opts.minCoverage ? parseFloat(opts.minCoverage) : undefined, model: opts.model }) + await benchmark({ ...opts, minCoverage, model: opts.model }) }) program.parse()