diff --git a/bench/HARNESS.md b/bench/HARNESS.md index 8d72346a..fa3661a1 100644 --- a/bench/HARNESS.md +++ b/bench/HARNESS.md @@ -202,7 +202,7 @@ in a `.venv`/Docker subprocess → parse its JSON report → `{resolved,score}`) copy of the process/venv/Docker/temp/report plumbing; commit0+appworld also share its stdin-piping runner (`runVenvScriptStdin`). - **Real, runnable with ZERO extra deps:** finsearchcomp (GitHub dataset + fixtures + LLM judge — the gate bench), hotpotqa + simpleqa + frames (HF/web QA + F1/LLM judge; `*_FIXTURES=1` offline), **aec-bench** (real GitHub task tree + fixtures; judge = the task's own `tests/verify.py` over python3 stdlib — **deterministic, graded per-field partial credit, no Docker, no LLM** → the candidate non-oracle correctable-middle-band bench for the open gate). -- **Real code, needs an external harness/tools to run (fail loud with the exact install/Docker fix; never a fabricated score):** swe-bench + terminal-bench (`bench/.venv` + Docker), **commit0** (ISOLATED `bench/.venv-commit0` via `python3 -m venv bench/.venv-commit0 && bench/.venv-commit0/bin/pip install commit0 datasets` — its deps conflict with the shared `.venv`; override dir with `COMMIT0_VENV` — plus Docker; judge = official pytest harness, graded (passed+xfail)/total; the rollout prompt stages in-box (clones `commit-0/` @ `base_commit`, emits `git diff`); `COMMIT0_FIXTURES=1` for offline listing), **programbench** (`pip install programbench` + Docker on linux/amd64 + HF blobs; judge = official cleanroom eval, graded passed/total; `PROGRAMBENCH_FIXTURES=1` offline), **appworld** (`pip install appworld` + `appworld install` + `appworld download data`; judge = AppWorld's own `world.evaluate()`, graded passes/num_tests — NO committed fixture: task data exists only after `download data`, so loadTasks fails loud rather than fabricate a task), mind2web, cad-design + cadbench + cadgenbench (openscad/blender/build123d). +- **Real code, needs an external harness/tools to run (fail loud with the exact install/Docker fix; never a fabricated score):** swe-bench + terminal-bench (`bench/.venv` + Docker), **commit0** (ISOLATED `bench/.venv-commit0` via `python3 -m venv bench/.venv-commit0 && bench/.venv-commit0/bin/pip install commit0 datasets` — its deps conflict with the shared `.venv`; override dir with `COMMIT0_VENV` — plus Docker; judge = official pytest harness, graded (passed+xfail)/total; the rollout prompt stages in-box (clones `commit-0/` @ `base_commit`, emits `git diff`); `COMMIT0_FIXTURES=1` for offline listing), **programbench** (`pip install programbench` + Docker on linux/amd64 + HF blobs; judge = official cleanroom eval, graded passed/total; `PROGRAMBENCH_FIXTURES=1` offline), **appworld** (`pip install appworld` + `appworld install` + `appworld download data`; judge = AppWorld's own `world.evaluate()`, graded passes/num_tests — NO committed fixture: task data exists only after `download data`, so loadTasks fails loud rather than fabricate a task), **dabstep** (`DABSTEP_DIR=/path/to/EnvCommons/DABStep` with the released `dataset.csv`, `splits/*.txt`, `files/*`, and `grade.py`; judge delegates to official `grade.py`; `DABSTEP_FIXTURES=1` only tests adapter plumbing and does not fabricate benchmark scores), mind2web, cad-design + cadbench + cadgenbench (openscad/blender/build123d). - **goldArtifact:** aec-bench returns the task's real `golden_pass.md` (verify-judge works fully offline). commit0 / programbench / appworld return `undefined` — the oracle is a git ref / stripped source / engine-bundled solution, not a portable string; judge correctness is proven by a real solve through the harness, not a synthetic gold (documented + fail-loud, not a fake). - **Absent (not built):** swe-gym, swe-bench-multimodal, and the rest of the survey set. Every unbuilt/scaffold adapter fails LOUD (throws with the integration step) rather than faking a score — no silent zeros in any corpus. Offline fixture tests: `benchmarks/{aec-bench,commit0,programbench,appworld}.test.mts` (`tsx --test`). diff --git a/bench/fixtures/dabstep.json b/bench/fixtures/dabstep.json new file mode 100644 index 00000000..59626579 --- /dev/null +++ b/bench/fixtures/dabstep.json @@ -0,0 +1,22 @@ +[ + { + "task_id": 1, + "instructions": "Using the payment files, answer this calibration task with the exact integer 42.", + "all_golds_by_task": [ + { + "kind": "number", + "value": 42.0 + } + ] + }, + { + "task_id": 2, + "instructions": "Using the payment files, answer this calibration task with the card scheme nexpay.", + "all_golds_by_task": [ + { + "kind": "scheme", + "value": "nexpay" + } + ] + } +] diff --git a/bench/package.json b/bench/package.json index f1d97625..f3fae32a 100644 --- a/bench/package.json +++ b/bench/package.json @@ -2,7 +2,7 @@ "name": "@tangle-network/agent-bench", "version": "0.1.0", "type": "module", - "description": "The unified benchmark suite for agent-runtime agents: 18 adapters (commit0, enterpriseops-gym, trata-hedge, finsearchcomp, swe-bench, humaneval, …) behind one resolveAdapter registry, each with a real deterministic judge. Score any profile/skill/prompt change against them. Map: bench/HARNESS.md.", + "description": "The unified benchmark suite for agent-runtime agents: 19 adapters (commit0, enterpriseops-gym, trata-hedge, finsearchcomp, dabstep, swe-bench, humaneval, …) behind one resolveAdapter registry, each with a real deterministic judge. Score any profile/skill/prompt change against them. Map: bench/HARNESS.md.", "main": "src/index.ts", "types": "src/index.ts", "exports": { @@ -18,7 +18,7 @@ }, "dependencies": { "@tangle-network/agent-eval": "^0.100.0", - "@tangle-network/agent-runtime": "^0.78.0", + "@tangle-network/agent-runtime": "^0.79.3", "@tangle-network/sandbox": "^0.9.3" }, "devDependencies": { @@ -27,6 +27,10 @@ }, "files": [ "src", + "fixtures", + "scripts", + "tb_agents/*.py", + "steerers", "README.md" ], "publishConfig": { diff --git a/bench/pnpm-lock.yaml b/bench/pnpm-lock.yaml index 618cf533..8526b7af 100644 --- a/bench/pnpm-lock.yaml +++ b/bench/pnpm-lock.yaml @@ -9,21 +9,21 @@ importers: .: dependencies: '@tangle-network/agent-eval': - specifier: ^0.89.0 - version: 0.89.0(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)))(typescript@5.9.3) + specifier: ^0.100.0 + version: 0.100.0(typescript@6.0.3) '@tangle-network/agent-runtime': - specifier: file:.. - version: file:..(@tangle-network/agent-eval@0.89.0(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)))(typescript@5.9.3))(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3))) + specifier: ^0.79.3 + version: 0.79.3(@tangle-network/agent-eval@0.100.0(typescript@6.0.3))(@tangle-network/agent-interface@0.14.0)(@tangle-network/sandbox@0.9.5(viem@2.52.0(typescript@6.0.3)(zod@4.4.3))) '@tangle-network/sandbox': - specifier: ^0.4.3 - version: 0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)) + specifier: ^0.9.3 + version: 0.9.5(viem@2.52.0(typescript@6.0.3)(zod@4.4.3)) devDependencies: tsx: specifier: ^4.19.0 version: 4.22.4 typescript: - specifier: ^5.7.0 - version: 5.9.3 + specifier: ^6.0.3 + version: 6.0.3 packages: @@ -248,32 +248,39 @@ packages: '@scure/bip39@2.2.0': resolution: {integrity: sha512-T/Bj/YvYMNkIPq6EENO6/rcs2e7qTNuyoUXf0KBFDmp0ZDu0H2X4Lq6yC3i0c8PcWkov5EbW+yQZZbdMmk154A==} - '@tangle-network/agent-eval@0.89.0': - resolution: {integrity: sha512-ifpIj1rjaE0KtL0yteX/5kreSDyDvygs6iCee+OVSrZmxZD6ZfW/0iIm+2e15elrIAiW10UJo+tUBqp1Zqu+Lg==} + '@tangle-network/agent-core@0.3.4': + resolution: {integrity: sha512-Hvz3ABRouNtBmRvGqPxifAO2yuILneJMylWH5jW/jeS2F03RvqkGYuXyGXWWLqosYbb3hVAvSEe4Ykm2FMGEDQ==} + + '@tangle-network/agent-eval@0.100.0': + resolution: {integrity: sha512-yBupVJJAqHozhe1BL5xBuDObjvNsoY+XmJo7qfpw/w7rehAXbKliBb4k3XS1G55+GaYPjFA+xwPzlEDQISpMRw==} engines: {node: '>=20'} hasBin: true - peerDependencies: - '@tangle-network/sandbox': '>=0.2.1 <0.5.0' - peerDependenciesMeta: - '@tangle-network/sandbox': - optional: true '@tangle-network/agent-integrations@0.29.0': resolution: {integrity: sha512-Avn4oBDTRP5v/3o1xq++uu/9+Rhl2hscIggeFPBGjtVYwhvbsSZL9pRrF3LfjqL9rjx9AocZOdsZC6MXrxKnkg==} engines: {node: '>=20'} hasBin: true - '@tangle-network/agent-runtime@file:..': - resolution: {directory: .., type: directory} + '@tangle-network/agent-interface@0.10.1': + resolution: {integrity: sha512-yehY/0EgKvu8lG6jIVoZCtMPLkj8VEWwasuAtuph2RaB9MKE5wuxRF647O6jw8KufNZ3aQ2UVVWpZ19dGCbs6w==} + + '@tangle-network/agent-interface@0.13.0': + resolution: {integrity: sha512-CeTPGRLoXqpt0h+BCyFgZPkfU1zyRpWmqfD+85i/uk+uvbqxkfI+JprfKVf3tBsQuCgJPSjPt5qjdW8n3h2BVg==} + + '@tangle-network/agent-interface@0.14.0': + resolution: {integrity: sha512-9CyGhIpl90E7v4MTm3b1ti3Bp7BfPigk2Nafgi21Lg0U+QxlNB656F2JmVpUuSbOo9aGZPtg5nXu5EBTlV5a1g==} + + '@tangle-network/agent-runtime@0.79.3': + resolution: {integrity: sha512-CIQ09F9zK8agXbPvilRySCX3QB8XssnYx95VHsonWs5D4M5kXn3v+dXzz1aPbnOCxveEHLyiE7zApUyj3WU1yA==} engines: {node: '>=20'} hasBin: true peerDependencies: - '@tangle-network/agent-eval': '>=0.83.0 <1.0.0' - '@tangle-network/agent-knowledge': '>=1.3.0 <2.0.0' - '@tangle-network/sandbox': '>=0.1.2 <0.7.0' + '@tangle-network/agent-eval': '>=0.97.0 <1.0.0' + '@tangle-network/agent-interface': '>=0.14.0 <1.0.0' + '@tangle-network/sandbox': '>=0.8.0 <1.0.0' playwright: ^1.40.0 peerDependenciesMeta: - '@tangle-network/agent-knowledge': + '@tangle-network/agent-interface': optional: true '@tangle-network/sandbox': optional: true @@ -300,8 +307,8 @@ packages: viem: optional: true - '@tangle-network/sandbox@0.4.3': - resolution: {integrity: sha512-6QE3Nuhkd8f+OlpRJbumHTAG4wKR+ESXT47UE0fjTf7ndRWLnhE4RZ7YRtHVo/Q9ZZr0FGH1mwM+6tW0NAT1bA==} + '@tangle-network/sandbox@0.9.5': + resolution: {integrity: sha512-yvX2OX6uISBVnMQ+v6Upkesa3u8yj6BHxsfcS6p8Vze+M4WBpyhkwA+onzFHuo9rti557ItZn8yDu4a/klljvQ==} peerDependencies: '@mastra/core': ^1.36.0 '@modelcontextprotocol/sdk': ^1.29.0 @@ -385,8 +392,8 @@ packages: engines: {node: '>=18.0.0'} hasBin: true - typescript@5.9.3: - resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} + typescript@6.0.3: + resolution: {integrity: sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==} engines: {node: '>=14.17'} hasBin: true @@ -558,16 +565,20 @@ snapshots: '@noble/hashes': 2.2.0 '@scure/base': 2.2.0 - '@tangle-network/agent-eval@0.89.0(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)))(typescript@5.9.3)': + '@tangle-network/agent-core@0.3.4': + dependencies: + '@tangle-network/agent-interface': 0.14.0 + zod: 4.4.3 + + '@tangle-network/agent-eval@0.100.0(typescript@6.0.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.3) '@ax-llm/ax': 19.0.45(zod@4.4.3) '@hono/node-server': 2.0.4(hono@4.12.23) - '@tangle-network/tcloud': 0.4.12(typescript@5.9.3)(zod@4.4.3) + '@tangle-network/agent-interface': 0.10.1 + '@tangle-network/tcloud': 0.4.12(typescript@6.0.3)(zod@4.4.3) hono: 4.12.23 zod: 4.4.3 - optionalDependencies: - '@tangle-network/sandbox': 0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)) transitivePeerDependencies: - '@mastra/core' - '@modelcontextprotocol/sdk' @@ -579,32 +590,48 @@ snapshots: '@tangle-network/agent-integrations@0.29.0': {} - '@tangle-network/agent-runtime@file:..(@tangle-network/agent-eval@0.89.0(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)))(typescript@5.9.3))(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)))': + '@tangle-network/agent-interface@0.10.1': + dependencies: + zod: 4.4.3 + + '@tangle-network/agent-interface@0.13.0': dependencies: - '@tangle-network/agent-eval': 0.89.0(@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)))(typescript@5.9.3) + zod: 4.4.3 + + '@tangle-network/agent-interface@0.14.0': + dependencies: + zod: 4.4.3 + + '@tangle-network/agent-runtime@0.79.3(@tangle-network/agent-eval@0.100.0(typescript@6.0.3))(@tangle-network/agent-interface@0.14.0)(@tangle-network/sandbox@0.9.5(viem@2.52.0(typescript@6.0.3)(zod@4.4.3)))': + dependencies: + '@tangle-network/agent-eval': 0.100.0(typescript@6.0.3) optionalDependencies: - '@tangle-network/sandbox': 0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)) + '@tangle-network/agent-interface': 0.14.0 + '@tangle-network/sandbox': 0.9.5(viem@2.52.0(typescript@6.0.3)(zod@4.4.3)) - '@tangle-network/sandbox@0.3.0(viem@2.52.0(typescript@5.9.3)(zod@4.4.3))': + '@tangle-network/sandbox@0.3.0(viem@2.52.0(typescript@6.0.3)(zod@4.4.3))': dependencies: '@tangle-network/agent-integrations': 0.29.0 optionalDependencies: - viem: 2.52.0(typescript@5.9.3)(zod@4.4.3) + viem: 2.52.0(typescript@6.0.3)(zod@4.4.3) - '@tangle-network/sandbox@0.4.3(viem@2.52.0(typescript@5.9.3)(zod@4.4.3))': + '@tangle-network/sandbox@0.9.5(viem@2.52.0(typescript@6.0.3)(zod@4.4.3))': + dependencies: + '@tangle-network/agent-core': 0.3.4 + '@tangle-network/agent-interface': 0.13.0 optionalDependencies: - viem: 2.52.0(typescript@5.9.3)(zod@4.4.3) + viem: 2.52.0(typescript@6.0.3)(zod@4.4.3) '@tangle-network/tcloud-attestation@0.1.1': {} - '@tangle-network/tcloud@0.4.12(typescript@5.9.3)(zod@4.4.3)': + '@tangle-network/tcloud@0.4.12(typescript@6.0.3)(zod@4.4.3)': dependencies: '@scure/bip32': 2.2.0 '@scure/bip39': 2.2.0 - '@tangle-network/sandbox': 0.3.0(viem@2.52.0(typescript@5.9.3)(zod@4.4.3)) + '@tangle-network/sandbox': 0.3.0(viem@2.52.0(typescript@6.0.3)(zod@4.4.3)) '@tangle-network/tcloud-attestation': 0.1.1 commander: 14.0.3 - viem: 2.52.0(typescript@5.9.3)(zod@4.4.3) + viem: 2.52.0(typescript@6.0.3)(zod@4.4.3) transitivePeerDependencies: - '@mastra/core' - '@modelcontextprotocol/sdk' @@ -615,9 +642,9 @@ snapshots: - utf-8-validate - zod - abitype@1.2.3(typescript@5.9.3)(zod@4.4.3): + abitype@1.2.3(typescript@6.0.3)(zod@4.4.3): optionalDependencies: - typescript: 5.9.3 + typescript: 6.0.3 zod: 4.4.3 commander@14.0.3: {} @@ -668,7 +695,7 @@ snapshots: dependencies: yaml: 2.9.0 - ox@0.14.27(typescript@5.9.3)(zod@4.4.3): + ox@0.14.27(typescript@6.0.3)(zod@4.4.3): dependencies: '@adraffy/ens-normalize': 1.11.1 '@noble/ciphers': 1.3.0 @@ -676,10 +703,10 @@ snapshots: '@noble/hashes': 1.8.0 '@scure/bip32': 1.7.0 '@scure/bip39': 1.6.0 - abitype: 1.2.3(typescript@5.9.3)(zod@4.4.3) + abitype: 1.2.3(typescript@6.0.3)(zod@4.4.3) eventemitter3: 5.0.1 optionalDependencies: - typescript: 5.9.3 + typescript: 6.0.3 transitivePeerDependencies: - zod @@ -689,20 +716,20 @@ snapshots: optionalDependencies: fsevents: 2.3.3 - typescript@5.9.3: {} + typescript@6.0.3: {} - viem@2.52.0(typescript@5.9.3)(zod@4.4.3): + viem@2.52.0(typescript@6.0.3)(zod@4.4.3): dependencies: '@noble/curves': 1.9.1 '@noble/hashes': 1.8.0 '@scure/bip32': 1.7.0 '@scure/bip39': 1.6.0 - abitype: 1.2.3(typescript@5.9.3)(zod@4.4.3) + abitype: 1.2.3(typescript@6.0.3)(zod@4.4.3) isows: 1.0.7(ws@8.20.1) - ox: 0.14.27(typescript@5.9.3)(zod@4.4.3) + ox: 0.14.27(typescript@6.0.3)(zod@4.4.3) ws: 8.20.1 optionalDependencies: - typescript: 5.9.3 + typescript: 6.0.3 transitivePeerDependencies: - bufferutil - utf-8-validate diff --git a/bench/scripts/dabstep_judge.py b/bench/scripts/dabstep_judge.py new file mode 100644 index 00000000..1d45acf1 --- /dev/null +++ b/bench/scripts/dabstep_judge.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +"""DABStep judge bridge. + +Reads {"prediction": str, "golds": list} from stdin and delegates scoring to +the official DABStep grade.py module. This script owns no grading semantics. +""" + +import argparse +import importlib.util +import json +import sys +from pathlib import Path + + +def load_grade(grade_file: Path): + spec = importlib.util.spec_from_file_location("dabstep_grade", grade_file) + if spec is None or spec.loader is None: + raise RuntimeError(f"could not import DABStep grade file: {grade_file}") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.grade + + +def main() -> int: + parser = argparse.ArgumentParser(description="Score one DABStep answer") + parser.add_argument("--grade-file", required=True) + args = parser.parse_args() + + try: + payload = json.loads(sys.stdin.read()) + prediction = payload["prediction"] + golds = payload["golds"] + correct = bool(load_grade(Path(args.grade_file))(prediction, golds)) + print(json.dumps({"correct": correct, "score": 1.0 if correct else 0.0})) + return 0 + except Exception as exc: + print(json.dumps({"error": str(exc)})) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/bench/src/adapters.ts b/bench/src/adapters.ts index 715f16ae..0f549d7a 100644 --- a/bench/src/adapters.ts +++ b/bench/src/adapters.ts @@ -11,6 +11,7 @@ import { createCadBenchAdapter } from './benchmarks/cadbench' import { createCadDesignAdapter } from './benchmarks/cad-design' import { createCadGenBenchAdapter } from './benchmarks/cadgenbench' import { createCommit0Adapter } from './benchmarks/commit0' +import { createDabstepAdapter } from './benchmarks/dabstep' import { createEnterpriseOpsGymAdapter } from './benchmarks/enterpriseops-gym' import { createFinsearchcompAdapter } from './benchmarks/finsearchcomp' import { createFramesAdapter } from './benchmarks/frames' @@ -32,6 +33,7 @@ export const ADAPTERS: Record BenchmarkAdapter> = { // delegates to the benchmark's own harness and fails loud when it/Docker is absent. 'aec-bench': createAecBenchAdapter, commit0: createCommit0Adapter, + dabstep: createDabstepAdapter, programbench: createProgrambenchAdapter, appworld: createAppWorldAdapter, // AppWorld's native interactive protocol — the worker is the in-engine ReAct diff --git a/bench/src/benchmarks/dabstep.test.mts b/bench/src/benchmarks/dabstep.test.mts new file mode 100644 index 00000000..09792e7d --- /dev/null +++ b/bench/src/benchmarks/dabstep.test.mts @@ -0,0 +1,70 @@ +/** + * Offline DABStep adapter test. Official live tasks need a DABStep checkout with + * the released dataset.csv. Fixture mode only exercises adapter plumbing; it + * never scores benchmark rows without the official grade.py. + */ +import assert from 'node:assert/strict' +import { mkdtemp, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { test } from 'node:test' +import { createDabstepAdapter, dabstepAnswerOutput } from './dabstep' + +process.env.DABSTEP_FIXTURES = '1' + +type Events = Parameters[0] +const stream = (text: string): Events => [{ data: { finalText: text } }] as unknown as Events + +test('loadTasks fixtures expose DABStep prompt and resource metadata shape', async () => { + const adapter = createDabstepAdapter() + const tasks = await adapter.loadTasks({ ids: ['1'] }) + assert.equal(tasks.length, 1) + assert.equal(tasks[0].id, '1') + assert.match(tasks[0].prompt, /DABStep data-analysis task/) + const meta = tasks[0].metadata as Record + assert.equal(meta.taskId, 1) + assert.equal(Array.isArray(meta.golds), true) +}) + +test('answer OutputAdapter extracts final fenced answer when present', () => { + assert.equal(dabstepAnswerOutput.parse(stream('work\n```answer\n42\n```')), '42') + assert.equal(dabstepAnswerOutput.parse(stream('Final Answer: 42')), 'Final Answer: 42') +}) + +test('goldArtifact exposes the fixture oracle without scoring it as a benchmark result', async () => { + const adapter = createDabstepAdapter() + const [task] = await adapter.loadTasks({ ids: ['1'] }) + const gold = await adapter.goldArtifact(task) + assert.equal(gold, '42') +}) + +test('judge fails loud without an official DABSTEP_DIR/grade.py', async () => { + const adapter = createDabstepAdapter() + const [task] = await adapter.loadTasks({ ids: ['1'] }) + delete process.env.DABSTEP_DIR + await assert.rejects(adapter.judge(task, '42'), /DABSTEP_DIR is required/) +}) + +test('preflight is fixture-safe and live mode fails loud without DABSTEP_DIR', async () => { + const fixtureAdapter = createDabstepAdapter() + await fixtureAdapter.preflight() + delete process.env.DABSTEP_FIXTURES + delete process.env.DABSTEP_DIR + const liveAdapter = createDabstepAdapter() + await assert.rejects(liveAdapter.preflight(), /DABSTEP_DIR is required/) + process.env.DABSTEP_FIXTURES = '1' +}) + +test('live preflight fails loud when the checkout is missing released dataset.csv', async () => { + delete process.env.DABSTEP_FIXTURES + const dir = await mkdtemp(join(tmpdir(), 'dabstep-missing-dataset-')) + process.env.DABSTEP_DIR = dir + try { + const adapter = createDabstepAdapter() + await assert.rejects(adapter.preflight(), /released dataset\.csv/) + } finally { + process.env.DABSTEP_FIXTURES = '1' + delete process.env.DABSTEP_DIR + await rm(dir, { recursive: true, force: true }) + } +}) diff --git a/bench/src/benchmarks/dabstep.ts b/bench/src/benchmarks/dabstep.ts new file mode 100644 index 00000000..0fb17932 --- /dev/null +++ b/bench/src/benchmarks/dabstep.ts @@ -0,0 +1,212 @@ +/** + * DABStep adapter (EnvCommons/DABStep) — data-analysis questions over synthetic + * payment files. Worker artifact = final answer text. Judge = the official + * DABStep `grade.py` normalization/matching function. No LLM judge. + * + * Live tasks require `DABSTEP_DIR` pointing at an official DABStep checkout that + * includes `dataset.csv`, `splits/*.txt`, `files/*`, and `grade.py`. The adapter + * exposes `metadata.resourceRoot` so runners can mount the benchmark files into + * AgentProfile.resources.files; it does not paste the dataset into prompt text. + */ + +import { join } from 'node:path' +import { access, readFile, stat } from 'node:fs/promises' +import type { OutputAdapter } from '@tangle-network/agent-runtime/loops' +import { benchRoot, runVenvPython, runVenvScriptStdin } from './_harness' +import type { BenchmarkAdapter, BenchScore, BenchTask, LoadOptions } from './types' + +const FIXTURES = join(benchRoot, 'fixtures', 'dabstep.json') +const DEFAULT_SPLIT = 'easy' + +interface DabstepFixtureRow { + task_id: number + instructions: string + all_golds_by_task: Array> +} + +interface DabstepMeta { + taskId: number + split: string + golds: Array> + resourceRoot?: string +} + +const dabstepDir = (): string | undefined => process.env.DABSTEP_DIR +const gradeFile = (dir: string): string => join(dir, 'grade.py') +const resourceRoot = (dir: string): string => join(dir, 'files') + +async function assertFile(path: string, label: string): Promise { + try { + await access(path) + } catch (err) { + throw new Error(`DABStep: missing ${label} at ${path} (${err instanceof Error ? err.message : err})`) + } +} + +async function assertOfficialFiles(dir: string, split: string): Promise { + await assertFile(join(dir, 'dataset.csv'), 'released dataset.csv') + await assertFile(join(dir, 'splits', `${split}.txt`), `${split} split file`) + await assertFile(gradeFile(dir), 'official grade.py') + const files = resourceRoot(dir) + try { + const s = await stat(files) + if (!s.isDirectory()) throw new Error('not a directory') + } catch (err) { + throw new Error(`DABStep: missing benchmark files directory at ${files} (${err instanceof Error ? err.message : err})`) + } +} + +export const dabstepAnswerOutput: OutputAdapter = { + parse(events) { + let text = '' + for (const ev of events) { + const d = (ev as { data?: Record })?.data + const t = d?.finalText ?? d?.text ?? d?.result + if (typeof t === 'string' && t.length > 0) text = t + } + const fences = [...text.matchAll(/```(?:text|answer)?\s*\n([\s\S]*?)```/g)] + return (fences.at(-1)?.[1] ?? text).trim() + }, +} + +function rowToTask(row: DabstepFixtureRow, split: string, dir?: string): BenchTask { + const meta: DabstepMeta = { + taskId: row.task_id, + split, + golds: row.all_golds_by_task, + ...(dir ? { resourceRoot: resourceRoot(dir) } : {}), + } + return { + id: String(row.task_id), + split, + prompt: [ + 'Solve this DABStep data-analysis task using the mounted payment files.', + 'Use code or shell commands as needed, then return only the final answer.', + '', + row.instructions, + ].join('\n'), + metadata: meta as unknown as Record, + } +} + +function readMeta(task: BenchTask): DabstepMeta { + const md = task.metadata + if (!md || typeof md.taskId !== 'number' || !Array.isArray(md.golds)) { + throw new Error(`dabstep task ${task.id} missing metadata — loadTasks did not populate it`) + } + return md as unknown as DabstepMeta +} + +function selectRows(rows: DabstepFixtureRow[], opts: LoadOptions, split: string, dir?: string): BenchTask[] { + let tasks = rows.map((row) => rowToTask(row, split, dir)) + if (opts.ids) { + const want = new Set(opts.ids) + tasks = tasks.filter((task) => want.has(task.id)) + } else if (opts.limit !== undefined) { + tasks = tasks.slice(0, opts.limit) + } + if (tasks.length === 0) throw new Error(`DABStep: no tasks matched ${JSON.stringify(opts)} for split=${split}`) + return tasks +} + +async function loadFixtures(opts: LoadOptions, split: string): Promise { + const rows = JSON.parse(await readFile(FIXTURES, 'utf8')) as DabstepFixtureRow[] + console.warn(`[dabstep] DABSTEP_FIXTURES=1 — loading ${rows.length} adapter fixtures from ${FIXTURES}`) + return selectRows(rows, opts, split) +} + +async function loadOfficialTasks(dir: string, opts: LoadOptions, split: string): Promise { + const script = ` +import ast, csv, json, sys +from pathlib import Path + +root = Path(sys.argv[1]) +split = sys.argv[2] +limit = None if sys.argv[3] == "" else int(sys.argv[3]) +ids = set(json.loads(sys.argv[4])) +dataset = root / "dataset.csv" +split_file = root / "splits" / f"{split}.txt" +if not dataset.exists(): + raise SystemExit(f"missing official DABStep dataset.csv at {dataset}") +if not split_file.exists(): + raise SystemExit(f"missing official DABStep split file at {split_file}") +split_ids = {int(line.strip()) for line in split_file.read_text().splitlines() if line.strip()} +out = [] +with dataset.open(newline="") as f: + for row in csv.DictReader(f): + task_id = int(row["task_id"]) + if task_id not in split_ids: + continue + if ids and str(task_id) not in ids: + continue + out.append({ + "task_id": task_id, + "instructions": f"{row['question']}\\n{row['guidelines']}", + "all_golds_by_task": ast.literal_eval(str(row["all_golds_by_task"])), + }) + if limit is not None and len(out) >= limit: + break +if not out: + raise SystemExit(f"no DABStep rows matched split={split} ids={sorted(ids)} limit={limit}") +print(json.dumps(out)) +` + const stdout = await runVenvPython(script, [dir, split, opts.limit === undefined ? '' : String(opts.limit), JSON.stringify(opts.ids ?? [])]) + return selectRows(JSON.parse(stdout) as DabstepFixtureRow[], opts, split, dir) +} + +export function createDabstepAdapter(): BenchmarkAdapter { + const fixturesMode = process.env.DABSTEP_FIXTURES === '1' + + return { + name: 'dabstep', + output: dabstepAnswerOutput, + + async preflight() { + if (fixturesMode) return + const dir = dabstepDir() + if (!dir) { + throw new Error( + 'DABSTEP_DIR is required. Fix: clone https://github.com/EnvCommons/DABStep, add the released dataset.csv under that checkout, then set DABSTEP_DIR=/path/to/DABStep.', + ) + } + await assertOfficialFiles(dir, DEFAULT_SPLIT) + await loadOfficialTasks(dir, { limit: 1 }, DEFAULT_SPLIT) + }, + + async loadTasks(opts: LoadOptions = {}) { + const split = opts.split ?? DEFAULT_SPLIT + if (fixturesMode) return loadFixtures(opts, split) + const dir = dabstepDir() + if (!dir) throw new Error('DABSTEP_DIR is required to load official DABStep tasks') + return loadOfficialTasks(dir, opts, split) + }, + + async goldArtifact(task: BenchTask) { + const meta = readMeta(task) + const first = meta.golds[0] + if (!first) return undefined + const value = first.value + return value === undefined ? undefined : String(value) + }, + + async judge(task: BenchTask, artifact: string): Promise { + const meta = readMeta(task) + const dir = dabstepDir() + if (!dir) throw new Error('DABSTEP_DIR is required to judge DABStep tasks with the official grade.py') + const stdout = await runVenvScriptStdin( + join(benchRoot, 'scripts', 'dabstep_judge.py'), + ['--grade-file', gradeFile(dir)], + JSON.stringify({ prediction: artifact, golds: meta.golds }), + { cwd: benchRoot }, + ) + const report = JSON.parse(stdout.trim().split('\n').at(-1) ?? '{}') as { correct?: boolean; score?: number; error?: string } + if (report.error) throw new Error(`DABStep judge error for ${task.id}: ${report.error}`) + const score = typeof report.score === 'number' ? report.score : 0 + return { + resolved: report.correct === true, + score, + detail: JSON.stringify({ taskId: meta.taskId, split: meta.split, correct: report.correct }), + } + }, + } +} diff --git a/bench/src/decoder-live.mts b/bench/src/decoder-live.mts index dc15bae7..8525694a 100644 --- a/bench/src/decoder-live.mts +++ b/bench/src/decoder-live.mts @@ -47,7 +47,7 @@ async function main(): Promise { }, profile: { name: 'decoder-live' }, }, - } as never)) as Record unknown> & { id?: string } + } as never)) as unknown as Record unknown> & { id?: string } try { console.error('[live] box', box.id, '— waiting for running…') diff --git a/bench/src/trata-gepa.mts b/bench/src/trata-gepa.mts index 7c2993c1..d760408d 100644 --- a/bench/src/trata-gepa.mts +++ b/bench/src/trata-gepa.mts @@ -367,7 +367,7 @@ async function main(): Promise { apiKey: routerKey, model: reflectModel, }, - driverTarget: + proposerTarget: 'a FINANCIAL ANALYST SYSTEM INSTRUCTION: the directive given to an agent that produces an investment memo from embedded earnings call transcripts, SEC filings, financial statements, and investor presentations. ' + 'The memo is scored by a rubric with 4-6 analytical themes, each requiring 2-4 specific analytical moves (quantitative claims, strategic conclusions, peer comparisons, or explicit calculations). ' + 'A theme is "hit" only when the agent makes the SPECIFIC move — not just gestures at the theme. ' + diff --git a/bench/tsconfig.json b/bench/tsconfig.json index eff42166..7501a77c 100644 --- a/bench/tsconfig.json +++ b/bench/tsconfig.json @@ -10,7 +10,11 @@ "skipLibCheck": true, "types": ["node"], "esModuleInterop": true, - "resolveJsonModule": true + "resolveJsonModule": true, + "paths": { + "@tangle-network/agent-runtime/loops": ["../src/runtime/index.ts"], + "@tangle-network/sandbox": ["../node_modules/@tangle-network/sandbox"] + } }, "include": ["src/**/*.ts", "src/**/*.mts"], "exclude": ["src/authored"] diff --git a/docs/api/primitive-catalog.md b/docs/api/primitive-catalog.md index 84e59f22..6ff0e4bc 100644 --- a/docs/api/primitive-catalog.md +++ b/docs/api/primitive-catalog.md @@ -337,7 +337,7 @@ Import from `@tangle-network/agent-runtime/intelligence` — 60 exports. ### Recursive atom + loop kernel (alias of ./runtime) -Import from `@tangle-network/agent-runtime/loops` — 383 exports. +Import from `@tangle-network/agent-runtime/loops` — 386 exports. | Symbol | Kind | Summary | |---|---|---| @@ -547,6 +547,7 @@ Import from `@tangle-network/agent-runtime/loops` — 383 exports. | `ExecResult` | interface | _(no summary — add a TSDoc line at the declaration)_ | | `Executor` | interface | The leaf runtime — ONE open interface, not a closed union. `execute` returns a | | `ExecutorContext` | interface | Construction context handed to a `ExecutorFactory` — the seams a built-in needs | +| `ExecutorRegistry` | interface | The OPEN resolver: maps an `AgentSpec` to a `ExecutorFactory`. The default | | `ExecutorResult` | interface | Terminal artifact of a one-shot `Executor.execute`. | | `FanoutOptions` | interface | `fanout(items, { synthesize? })` — N children spawned in one round (one per item, bounded by | | `FanoutSynthesis` | interface | How a fanout's synthesis child is built + read. `synthesisTask` projects the drained child | @@ -695,6 +696,7 @@ Import from `@tangle-network/agent-runtime/loops` — 383 exports. | `Environment` | type | A checkable task domain — implement these 5 hooks and the suite does the rest. The | | `EqualKOnCost` | type | `equalKOnCost(arms, opts)` — the cross-arm equal-compute check on conserved cost. | | `ExecutorConfig` | type | Config for {@link createExecutor}: the backend is DATA — the cost dial a profile, | +| `ExecutorFactory` | type | Builds a fresh `Executor` for one spawn from the resolved spec. Per-spawn (not | | `Fanout` | type | `fanout(items, opts)` — build the fanout combinator over a static item list. | | `FanoutWinnerSelector` | type | A winner-selection strategy: argmax/sort over the gathered child iterations (each output is the | | `FlatWidenGate` | type | The flat default `ScopeWidenGate` factory contract — never widens, keeping the R2 firewall | @@ -710,6 +712,7 @@ Import from `@tangle-network/agent-runtime/loops` — 383 exports. | `Pipeline` | type | `pipeline(stages)` — build the sequential combinator from an ordered stage list. The first | | `RenderCorpusToInstructions` | type | `renderCorpusToInstructions(opts)` — the flywheel read-back projection. Async (queries the | | `RunPersonified` | type | The composed run signature. | +| `Runtime` | type | The runtime tag of a `Executor` impl. Open by intent: custom runtimes use their own string name. | | `Settled` | type | A settled child, delivered by `scope.next()`. `seq` is the monotonic cursor order | | `Shell` | type | Command runner seam. Host code can use `localShell`; sandbox code can wrap `box.exec`. | | `SteeringDecision` | type | Terminal-or-continue decision shared by all three steering drivers. The | diff --git a/docs/api/runtime.md b/docs/api/runtime.md index d3114ed4..930a2af8 100644 --- a/docs/api/runtime.md +++ b/docs/api/runtime.md @@ -1748,7 +1748,7 @@ thread the seams onto each spawn. Exactly one is required — fail loud if neith ##### registry? -> `readonly` `optional` **registry?**: `ExecutorRegistry` +> `readonly` `optional` **registry?**: [`ExecutorRegistry`](#executorregistry) Defined in: [runtime/personify/types.ts:120](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/personify/types.ts#L120) @@ -8132,7 +8132,7 @@ Defined in: [runtime/supervise/run-context.ts:48](https://github.com/tangle-netw ##### executors -> `readonly` **executors**: `ExecutorRegistry` +> `readonly` **executors**: [`ExecutorRegistry`](#executorregistry) Defined in: [runtime/supervise/run-context.ts:49](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/run-context.ts#L49) @@ -8166,7 +8166,7 @@ Defined in: [runtime/environment-provider.ts:269](https://github.com/tangle-netw ##### runtime? -> `optional` **runtime?**: `Runtime` +> `optional` **runtime?**: [`Runtime`](#runtime-3) Defined in: [runtime/environment-provider.ts:270](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/environment-provider.ts#L270) @@ -8786,7 +8786,7 @@ own agent (mastra/agno/raw HTTP/anything) is first-class by implementing this in ##### runtime -> `readonly` **runtime**: `Runtime` +> `readonly` **runtime**: [`Runtime`](#runtime-3) Defined in: [runtime/supervise/types.ts:72](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L72) @@ -9023,6 +9023,74 @@ Opaque seams the registry threads through; a built-in narrows what it needs. *** +### ExecutorRegistry + +Defined in: [runtime/supervise/types.ts:182](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L182) + +The OPEN resolver: maps an `AgentSpec` to a `ExecutorFactory`. The default +registry resolves the three built-ins AND accepts a BYO `executor`/factory; callers +register more runtimes by name. NOT a closed switch — registration is the extension +point, mirroring the open `Executor` interface. + +#### Methods + +##### register() + +> **register**\<`Out`\>(`runtime`, `factory`): `void` + +Defined in: [runtime/supervise/types.ts:184](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L184) + +Register a factory for a named runtime. Throws on a duplicate name (fail loud). + +###### Type Parameters + +###### Out + +`Out` + +###### Parameters + +###### runtime + +[`Runtime`](#runtime-3) + +###### factory + +[`ExecutorFactory`](#executorfactory)\<`Out`\> + +###### Returns + +`void` + +##### resolve() + +> **resolve**\<`Out`\>(`spec`): \{ `succeeded`: `true`; `value`: [`ExecutorFactory`](#executorfactory)\<`Out`\>; \} \| \{ `succeeded`: `false`; `error`: `string`; \} + +Defined in: [runtime/supervise/types.ts:191](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L191) + +Resolve a spec to a factory. Precedence: a BYO `spec.executor` → a trivial factory +returning it; else `harness === null` → the `'router'` factory; else a registered +factory for the harness-derived runtime. Returns a typed outcome — the caller +inspects `succeeded` before `value` (no silent fallback). + +###### Type Parameters + +###### Out + +`Out` + +###### Parameters + +###### spec + +[`AgentSpec`](#agentspec) + +###### Returns + +\{ `succeeded`: `true`; `value`: [`ExecutorFactory`](#executorfactory)\<`Out`\>; \} \| \{ `succeeded`: `false`; `error`: `string`; \} + +*** + ### Budget Defined in: [runtime/supervise/types.ts:199](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L199) @@ -9424,7 +9492,7 @@ Result payload store backing `outRef` rehydration. ##### executors -> `readonly` **executors**: `ExecutorRegistry` +> `readonly` **executors**: [`ExecutorRegistry`](#executorregistry) Defined in: [runtime/supervise/types.ts:442](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L442) @@ -12967,6 +13035,49 @@ conserved pool meters all runtimes identically. `tokens` carries `LoopTokenUsage *** +### Runtime + +> **Runtime** = `"router"` \| `"inline"` \| `"sandbox"` \| `"cli"` \| `string` & `object` + +Defined in: [runtime/supervise/types.ts:137](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L137) + +The runtime tag of a `Executor` impl. Open by intent: custom runtimes use their own string name. +External executors can register additional runtime strings without widening this type. + +*** + +### ExecutorFactory + +> **ExecutorFactory**\<`Out`\> = (`spec`, `ctx`) => [`Executor`](#executor)\<`Out`\> + +Defined in: [runtime/supervise/types.ts:165](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/types.ts#L165) + +Builds a fresh `Executor` for one spawn from the resolved spec. Per-spawn (not +shared) so each child owns its own box/abort/teardown lifecycle. A BYO factory lets a +user supply construction args without pre-instantiating. + +#### Type Parameters + +##### Out + +`Out` + +#### Parameters + +##### spec + +[`AgentSpec`](#agentspec) + +##### ctx + +[`ExecutorContext`](#executorcontext) + +#### Returns + +[`Executor`](#executor)\<`Out`\> + +*** + ### Settled > **Settled**\<`Out`\> = \{ `kind`: `"done"`; `handle`: `Handle`\<`Out`\>; `out`: `Out`; `outRef`: `string`; `verdict?`: `DefaultVerdict`; `spent`: [`Spend`](#spend); `seq`: `number`; \} \| \{ `kind`: `"down"`; `handle`: `Handle`\<`Out`\>; `reason`: `string`; `infra`: `boolean`; `restartCount`: `number`; `seq`: `number`; \} @@ -13345,7 +13456,7 @@ The conserved pool a `delegate()` call applies when the caller does not pass its ### cliWorktreeExecutor -> `const` **cliWorktreeExecutor**: `ExecutorFactory`\<`unknown`\> +> `const` **cliWorktreeExecutor**: [`ExecutorFactory`](#executorfactory)\<`unknown`\> Defined in: [runtime/supervise/runtime.ts:1360](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1360) @@ -13622,7 +13733,7 @@ run once on the prompt, emit the terminal result event, tear down. ##### factory -`ExecutorFactory`\<`unknown`\> +[`ExecutorFactory`](#executorfactory)\<`unknown`\> #### Returns @@ -15819,7 +15930,7 @@ state between runs), so two runs never cross-contaminate their journals/blobs. ### createExecutor() -> **createExecutor**(`config`): `ExecutorFactory`\<`unknown`\> +> **createExecutor**(`config`): [`ExecutorFactory`](#executorfactory)\<`unknown`\> Defined in: [runtime/supervise/runtime.ts:1413](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1413) @@ -15838,13 +15949,13 @@ per-vendor adapter or a closed `inline|sandbox|cli` switch — those bypass the #### Returns -`ExecutorFactory`\<`unknown`\> +[`ExecutorFactory`](#executorfactory)\<`unknown`\> *** ### createExecutorRegistry() -> **createExecutorRegistry**(): `ExecutorRegistry` +> **createExecutorRegistry**(): [`ExecutorRegistry`](#executorregistry) Defined in: [runtime/supervise/runtime.ts:1459](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/runtime.ts#L1459) @@ -15860,7 +15971,7 @@ harness-derived runtime (`'sandbox'` for any `BackendType`); else fail loud. #### Returns -`ExecutorRegistry` +[`ExecutorRegistry`](#executorregistry) *** diff --git a/docs/api/runtime/environment-provider.md b/docs/api/runtime/environment-provider.md index 6a1b9c40..23b1761b 100644 --- a/docs/api/runtime/environment-provider.md +++ b/docs/api/runtime/environment-provider.md @@ -282,7 +282,7 @@ Defined in: [runtime/environment-provider.ts:269](https://github.com/tangle-netw ##### runtime? -> `optional` **runtime?**: `Runtime` +> `optional` **runtime?**: [`Runtime`](../runtime.md#runtime-3) Defined in: [runtime/environment-provider.ts:270](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/environment-provider.ts#L270) @@ -442,7 +442,7 @@ Adapt a `SandboxClient` into the shared `AgentEnvironmentProvider` contract. ### providerAsExecutor() -> **providerAsExecutor**(`provider`, `options?`): `ExecutorFactory`\<`unknown`\> +> **providerAsExecutor**(`provider`, `options?`): [`ExecutorFactory`](../runtime.md#executorfactory)\<`unknown`\> Defined in: [runtime/environment-provider.ts:278](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/environment-provider.ts#L278) @@ -462,4 +462,4 @@ Adapt an environment provider into an `ExecutorFactory` for `createExecutor`. #### Returns -`ExecutorFactory`\<`unknown`\> +[`ExecutorFactory`](../runtime.md#executorfactory)\<`unknown`\> diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index da847b6b..b9f061ff 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -19,7 +19,7 @@ importers: version: 0.14.0 '@tangle-network/sandbox': specifier: '>=0.8.0 <1.0.0' - version: 0.8.2(viem@2.52.2(typescript@5.9.3)(zod@4.4.3)) + version: 0.9.5(viem@2.52.2(typescript@5.9.3)(zod@4.4.3)) '@types/node': specifier: ^25.9.3 version: 25.9.3 @@ -633,6 +633,9 @@ packages: '@shikijs/vscode-textmate@10.0.2': resolution: {integrity: sha512-83yeghZ2xxin3Nj8z1NMd/NCuca+gsYXswywDy5bHvwlWL8tpTQmzGeUuHd9FC3E/SBEMvzJRwWEOz5gGes9Qg==} + '@tangle-network/agent-core@0.3.4': + resolution: {integrity: sha512-Hvz3ABRouNtBmRvGqPxifAO2yuILneJMylWH5jW/jeS2F03RvqkGYuXyGXWWLqosYbb3hVAvSEe4Ykm2FMGEDQ==} + '@tangle-network/agent-eval@0.100.0': resolution: {integrity: sha512-yBupVJJAqHozhe1BL5xBuDObjvNsoY+XmJo7qfpw/w7rehAXbKliBb4k3XS1G55+GaYPjFA+xwPzlEDQISpMRw==} engines: {node: '>=20'} @@ -646,12 +649,12 @@ packages: '@tangle-network/agent-interface@0.10.0': resolution: {integrity: sha512-oiREgihkeX/xcGEtFfi9AkAfU2VzuF7SSla2s0iliXPUXyHCIIx6jwzHiYdwb1ZGCfvC+T+0SWOIa6fN5u195g==} + '@tangle-network/agent-interface@0.13.0': + resolution: {integrity: sha512-CeTPGRLoXqpt0h+BCyFgZPkfU1zyRpWmqfD+85i/uk+uvbqxkfI+JprfKVf3tBsQuCgJPSjPt5qjdW8n3h2BVg==} + '@tangle-network/agent-interface@0.14.0': resolution: {integrity: sha512-9CyGhIpl90E7v4MTm3b1ti3Bp7BfPigk2Nafgi21Lg0U+QxlNB656F2JmVpUuSbOo9aGZPtg5nXu5EBTlV5a1g==} - '@tangle-network/agent-interface@0.8.0': - resolution: {integrity: sha512-okz9LGKwPNKODNyT9Y7+T+sQsJ4g6oTy/hpWpxR6r2BI7pS6WqIdgCOQcx98+WtlPoibkY3ewRRAb8YJMrPHog==} - '@tangle-network/sandbox@0.3.0': resolution: {integrity: sha512-KfgvKhsUaOpkJe3AD19w7s4hdQekBlXQGoNx0xS4u6vuQk5YnFzBgv+EQeHCkkgETpYOWS2AN+6u/JhSyWStMw==} peerDependencies: @@ -672,8 +675,8 @@ packages: viem: optional: true - '@tangle-network/sandbox@0.8.2': - resolution: {integrity: sha512-MG3dj7SnF7vI8CagW1OwpkJSUq3IREpADBWp6knOukKxSYYCMGwJ0nPZz+O2eotI+Nl2A2LIGiHqPB82jgOvjw==} + '@tangle-network/sandbox@0.9.5': + resolution: {integrity: sha512-yvX2OX6uISBVnMQ+v6Upkesa3u8yj6BHxsfcS6p8Vze+M4WBpyhkwA+onzFHuo9rti557ItZn8yDu4a/klljvQ==} peerDependencies: '@mastra/core': ^1.36.0 '@modelcontextprotocol/sdk': ^1.29.0 @@ -1621,6 +1624,11 @@ snapshots: '@shikijs/vscode-textmate@10.0.2': {} + '@tangle-network/agent-core@0.3.4': + dependencies: + '@tangle-network/agent-interface': 0.14.0 + zod: 4.4.3 + '@tangle-network/agent-eval@0.100.0(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.3) @@ -1645,11 +1653,11 @@ snapshots: dependencies: zod: 4.4.3 - '@tangle-network/agent-interface@0.14.0': + '@tangle-network/agent-interface@0.13.0': dependencies: zod: 4.4.3 - '@tangle-network/agent-interface@0.8.0': + '@tangle-network/agent-interface@0.14.0': dependencies: zod: 4.4.3 @@ -1659,9 +1667,10 @@ snapshots: optionalDependencies: viem: 2.52.2(typescript@5.9.3)(zod@4.4.3) - '@tangle-network/sandbox@0.8.2(viem@2.52.2(typescript@5.9.3)(zod@4.4.3))': + '@tangle-network/sandbox@0.9.5(viem@2.52.2(typescript@5.9.3)(zod@4.4.3))': dependencies: - '@tangle-network/agent-interface': 0.8.0 + '@tangle-network/agent-core': 0.3.4 + '@tangle-network/agent-interface': 0.13.0 optionalDependencies: viem: 2.52.2(typescript@5.9.3)(zod@4.4.3) diff --git a/src/runtime/environment-provider.ts b/src/runtime/environment-provider.ts index 66bf6a5d..0832c0ee 100644 --- a/src/runtime/environment-provider.ts +++ b/src/runtime/environment-provider.ts @@ -506,6 +506,7 @@ function environmentAsSandboxInstance( return { response: resultFromEvents(events, text).content, success: true, + status: 'success', durationMs: 0, ...(usage ? { usage } : {}), } @@ -908,6 +909,7 @@ function promptResultFromAgentTurnResult(result: AgentTurnResult): PromptResult return { response: result.text, success: result.success, + status: result.success ? 'success' : 'failed', durationMs: 0, ...(result.error ? { error: result.error } : {}), ...(result.usage ? { usage: result.usage } : {}), diff --git a/src/runtime/index.ts b/src/runtime/index.ts index 8a889a78..0e1b78b8 100644 --- a/src/runtime/index.ts +++ b/src/runtime/index.ts @@ -459,8 +459,11 @@ export type { Budget, Executor, ExecutorContext, + ExecutorFactory, + ExecutorRegistry, ExecutorResult, ResultBlobStore, + Runtime, Scope, Settled, Spend, diff --git a/src/runtime/supervise/types.ts b/src/runtime/supervise/types.ts index 8b45a98f..f2c7d4dd 100644 --- a/src/runtime/supervise/types.ts +++ b/src/runtime/supervise/types.ts @@ -132,8 +132,8 @@ export type UsageEvent = | { kind: 'cost'; usd: number } | { kind: 'iteration' } -/** The runtime tag of a `Executor` impl. Open by intent — `string` so a BYO executor - * names its own runtime; the built-ins use these literals. */ +/** The runtime tag of a `Executor` impl. Open by intent: custom runtimes use their own string name. + * External executors can register additional runtime strings without widening this type. */ export type Runtime = 'router' | 'inline' | 'sandbox' | 'cli' | (string & {}) // ── Executor resolution (OPEN registry, not a switch) ─────────────────────────