tangle-network · drewstone · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/bench/HARNESS.md b/bench/HARNESS.md
@@ -202,7 +202,7 @@ in a `.venv`/Docker subprocess → parse its JSON report → `{resolved,score}`)
 copy of the process/venv/Docker/temp/report plumbing; commit0+appworld also share its
 stdin-piping runner (`runVenvScriptStdin`).
 - **Real, runnable with ZERO extra deps:** finsearchcomp (GitHub dataset + fixtures + LLM judge — the gate bench), hotpotqa + simpleqa + frames (HF/web QA + F1/LLM judge; `*_FIXTURES=1` offline), **aec-bench** (real GitHub task tree + fixtures; judge = the task's own `tests/verify.py` over python3 stdlib — **deterministic, graded per-field partial credit, no Docker, no LLM** → the candidate non-oracle correctable-middle-band bench for the open gate).
-- **Real code, needs an external harness/tools to run (fail loud with the exact install/Docker fix; never a fabricated score):** swe-bench + terminal-bench (`bench/.venv` + Docker), **commit0** (ISOLATED `bench/.venv-commit0` via `python3 -m venv bench/.venv-commit0 && bench/.venv-commit0/bin/pip install commit0 datasets` — its deps conflict with the shared `.venv`; override dir with `COMMIT0_VENV` — plus Docker; judge = official pytest harness, graded (passed+xfail)/total; the rollout prompt stages in-box (clones `commit-0/<repo>` @ `base_commit`, emits `git diff`); `COMMIT0_FIXTURES=1` for offline listing), **programbench** (`pip install programbench` + Docker on linux/amd64 + HF blobs; judge = official cleanroom eval, graded passed/total; `PROGRAMBENCH_FIXTURES=1` offline), **appworld** (`pip install appworld` + `appworld install` + `appworld download data`; judge = AppWorld's own `world.evaluate()`, graded passes/num_tests — NO committed fixture: task data exists only after `download data`, so loadTasks fails loud rather than fabricate a task), mind2web, cad-design + cadbench + cadgenbench (openscad/blender/build123d).
+- **Real code, needs an external harness/tools to run (fail loud with the exact install/Docker fix; never a fabricated score):** swe-bench + terminal-bench (`bench/.venv` + Docker), **commit0** (ISOLATED `bench/.venv-commit0` via `python3 -m venv bench/.venv-commit0 && bench/.venv-commit0/bin/pip install commit0 datasets` — its deps conflict with the shared `.venv`; override dir with `COMMIT0_VENV` — plus Docker; judge = official pytest harness, graded (passed+xfail)/total; the rollout prompt stages in-box (clones `commit-0/<repo>` @ `base_commit`, emits `git diff`); `COMMIT0_FIXTURES=1` for offline listing), **programbench** (`pip install programbench` + Docker on linux/amd64 + HF blobs; judge = official cleanroom eval, graded passed/total; `PROGRAMBENCH_FIXTURES=1` offline), **appworld** (`pip install appworld` + `appworld install` + `appworld download data`; judge = AppWorld's own `world.evaluate()`, graded passes/num_tests — NO committed fixture: task data exists only after `download data`, so loadTasks fails loud rather than fabricate a task), **dabstep** (`DABSTEP_DIR=/path/to/EnvCommons/DABStep` with the released `dataset.csv`, `splits/*.txt`, `files/*`, and `grade.py`; judge = the official `grade.py` (delegated; no grading logic of our own); `DABSTEP_FIXTURES=1` only tests adapter plumbing and does not fabricate benchmark scores), mind2web, cad-design + cadbench + cadgenbench (openscad/blender/build123d).
 - **goldArtifact:** aec-bench returns the task's real `golden_pass.md` (verify-judge works fully offline). commit0 / programbench / appworld return `undefined` — the oracle is a git ref / stripped source / engine-bundled solution, not a portable string; judge correctness is proven by a real solve through the harness, not a synthetic gold (documented + fail-loud, not a fake).
 - **Absent (not built):** swe-gym, swe-bench-multimodal, and the rest of the survey set.
 Every unbuilt/scaffold adapter fails LOUD (throws with the integration step) rather than faking a score — no silent zeros in any corpus. Offline fixture tests: `benchmarks/{aec-bench,commit0,programbench,appworld}.test.mts` (`tsx --test`).

diff --git a/bench/fixtures/dabstep.json b/bench/fixtures/dabstep.json
@@ -0,0 +1,22 @@
+[
+  {
+    "task_id": 1,
+    "instructions": "Using the payment files, answer this calibration task with the exact integer 42.",
+    "all_golds_by_task": [
+      {
+        "kind": "number",
+        "value": 42.0
+      }
+    ]
+  },
+  {
+    "task_id": 2,
+    "instructions": "Using the payment files, answer this calibration task with the card scheme nexpay.",
+    "all_golds_by_task": [
+      {
+        "kind": "scheme",
+        "value": "nexpay"
+      }
+    ]
+  }
+]
diff --git a/bench/package.json b/bench/package.json
@@ -2,7 +2,7 @@
   "name": "@tangle-network/agent-bench",
   "version": "0.1.0",
   "type": "module",
-  "description": "The unified benchmark suite for agent-runtime agents: 18 adapters (commit0, enterpriseops-gym, trata-hedge, finsearchcomp, swe-bench, humaneval, …) behind one resolveAdapter registry, each with a real deterministic judge. Score any profile/skill/prompt change against them. Map: bench/HARNESS.md.",
+  "description": "The unified benchmark suite for agent-runtime agents: 19 adapters (commit0, enterpriseops-gym, trata-hedge, finsearchcomp, dabstep, swe-bench, humaneval, …) behind one resolveAdapter registry, each with a real deterministic judge. Score any profile/skill/prompt change against them. Map: bench/HARNESS.md.",
   "main": "src/index.ts",
   "types": "src/index.ts",
   "exports": {
@@ -18,7 +18,7 @@
   },
   "dependencies": {
     "@tangle-network/agent-eval": "^0.100.0",
-    "@tangle-network/agent-runtime": "^0.78.0",
+    "@tangle-network/agent-runtime": "^0.79.3",
     "@tangle-network/sandbox": "^0.9.3"
   },
   "devDependencies": {
@@ -27,6 +27,10 @@
   },
   "files": [
     "src",
+    "fixtures",
+    "scripts",
+    "tb_agents/*.py",
+    "steerers",
     "README.md"
   ],
   "publishConfig": {

diff --git a/bench/pnpm-lock.yaml b/bench/pnpm-lock.yaml
diff --git a/bench/scripts/dabstep_judge.py b/bench/scripts/dabstep_judge.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""DABStep judge bridge.
+
+Reads {"prediction": str, "golds": list} from stdin and delegates scoring to
+the official DABStep grade.py module. This script owns no grading semantics.
+"""
+
+import argparse
+import importlib.util
+import json
+import sys
+from pathlib import Path
+
+
+def load_grade(grade_file: Path):
+    spec = importlib.util.spec_from_file_location("dabstep_grade", grade_file)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"could not import DABStep grade file: {grade_file}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.grade
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Score one DABStep answer")
+    parser.add_argument("--grade-file", required=True)
+    args = parser.parse_args()
+
+    try:
+        payload = json.loads(sys.stdin.read())
+        prediction = payload["prediction"]
+        golds = payload["golds"]
+        correct = bool(load_grade(Path(args.grade_file))(prediction, golds))
+        print(json.dumps({"correct": correct, "score": 1.0 if correct else 0.0}))
+        return 0
+    except Exception as exc:
+        print(json.dumps({"error": str(exc)}))
+        return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/bench/src/adapters.ts b/bench/src/adapters.ts
@@ -11,6 +11,7 @@ import { createCadBenchAdapter } from './benchmarks/cadbench'
 import { createCadDesignAdapter } from './benchmarks/cad-design'
 import { createCadGenBenchAdapter } from './benchmarks/cadgenbench'
 import { createCommit0Adapter } from './benchmarks/commit0'
+import { createDabstepAdapter } from './benchmarks/dabstep'
 import { createEnterpriseOpsGymAdapter } from './benchmarks/enterpriseops-gym'
 import { createFinsearchcompAdapter } from './benchmarks/finsearchcomp'
 import { createFramesAdapter } from './benchmarks/frames'
@@ -32,6 +33,7 @@ export const ADAPTERS: Record<string, () => BenchmarkAdapter> = {
   // delegates to the benchmark's own harness and fails loud when it/Docker is absent.
   'aec-bench': createAecBenchAdapter,
   commit0: createCommit0Adapter,
+  dabstep: createDabstepAdapter,
   programbench: createProgrambenchAdapter,
   appworld: createAppWorldAdapter,
   // AppWorld's native interactive protocol — the worker is the in-engine ReAct