diff --git a/bench/src/swe-bench-env.test.ts b/bench/src/swe-bench-env.test.ts new file mode 100644 index 00000000..30dbdc02 --- /dev/null +++ b/bench/src/swe-bench-env.test.ts @@ -0,0 +1,71 @@ +import { mkdtempSync, realpathSync, rmSync, symlinkSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { afterAll, describe, expect, it } from 'vitest' +import { isInsideJail, isTestPath, jailPath } from './swe-bench-env' + +describe('isTestPath', () => { + it('flags test directories and test-named python files', () => { + expect(isTestPath('tests/test_models.py')).toBe(true) + expect(isTestPath('pkg/test/helpers.py')).toBe(true) + expect(isTestPath('pkg/tests/helpers.py')).toBe(true) + expect(isTestPath('test_models.py')).toBe(true) + expect(isTestPath('models_test.py')).toBe(true) + expect(isTestPath('conftest.py')).toBe(true) + expect(isTestPath('pkg/conftest.py')).toBe(true) + }) + + it('does not flag ordinary source files', () => { + expect(isTestPath('src/foo.py')).toBe(false) + expect(isTestPath('pkg/models.py')).toBe(false) + // `testing.py` is not a test file by the test_/_test/conftest rules. + expect(isTestPath('pkg/testing.py')).toBe(false) + // A `latest/` segment must not trip the `tests?/` directory rule. 
+ expect(isTestPath('latest/foo.py')).toBe(false) + }) +}) + +describe('jailPath', () => { + const root = '/work/repo' + + it('rejects `..` traversal and absolute paths', () => { + expect(jailPath(root, '../x')).toBeNull() + expect(jailPath(root, 'a/../../etc/passwd')).toBeNull() + expect(jailPath(root, '/etc/passwd')).toBeNull() + }) + + it('accepts in-repo relative paths and strips a leading `./`', () => { + expect(jailPath(root, 'src/a.py')).toBe('src/a.py') + expect(jailPath(root, './a.py')).toBe('a.py') + expect(jailPath(root, 'a.py')).toBe('a.py') + }) +}) + +describe('isInsideJail (realpath containment)', () => { + // Mirror the `resolveInJail` closure in `call()`: realpath-resolve a workspace-relative path, then + // assert containment. Offline — operates on a throwaway temp dir, no git clone, no network. + const dir = mkdtempSync(join(tmpdir(), 'swe-jail-')) + const jailRoot = realpathSync(dir) + afterAll(() => rmSync(dir, { recursive: true, force: true })) + + it('admits a real file inside the jail', () => { + const inside = join(dir, 'a.py') + writeFileSync(inside, 'x = 1\n') + expect(isInsideJail(jailRoot, realpathSync(inside))).toBe(true) + expect(isInsideJail(jailRoot, jailRoot)).toBe(true) + }) + + it('rejects reading through a symlink that escapes the jail', () => { + // A repo could ship `escape -> /etc`; following it must not let the agent read /etc/passwd. + const link = join(dir, 'escape') + symlinkSync('/etc', link) + // `resolveInJail` does `realpathSync(join(ws.dir, relPath))` then this containment check. 
+ const real = realpathSync(join(dir, 'escape/passwd')) + expect(real).toBe('/etc/passwd') + expect(isInsideJail(jailRoot, real)).toBe(false) + }) + + it('rejects a sibling dir that shares the jail-root prefix', () => { + expect(isInsideJail('/tmp/swe-x', '/tmp/swe-x-evil/secret')).toBe(false) + }) +}) diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts index ccfb0931..e82b2c1a 100644 --- a/bench/src/swe-bench-env.ts +++ b/bench/src/swe-bench-env.ts @@ -13,22 +13,40 @@ * memorization. Always report this; never claim a "clean" frontier number from this arena alone. */ import { execFile } from 'node:child_process' -import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, realpathSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' -import { join } from 'node:path' +import { join, sep } from 'node:path' import { promisify } from 'node:util' import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops' import { createSweBenchAdapter } from './benchmarks/swe-bench' import type { BenchTask } from './benchmarks/types' const exec = promisify(execFile) -const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /test_.*\.py$|_test\.py$|conftest\.py$/.test(p) +export const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /test_.*\.py$|_test\.py$|conftest\.py$/.test(p) + +/** + * Cheap string pre-filter for an agent-supplied repo-relative path, applied before the path is + * joined to a workspace root: rejects a leading `/` or any `..` substring (not just whole `..` segments), strips a leading `./`. + * Returns the cleaned relative path, or `null` if it must be refused.
Pure and side-effect-free — + * `root` is unused here (the symlink-following boundary is the realpath jail, not this filter) but + * is taken so call sites read symmetrically with the realpath check. + */ +export const jailPath = (_root: string, p: string): string | null => { + if (p.startsWith('/') || p.includes('..')) return null + return p.replace(/^\.\//, '') +} + +/** + * Containment predicate for the realpath jail: true iff `real` (an already-resolved absolute path) + * is `jailRoot` itself or lies strictly inside it. The `+ sep` guard stops a sibling like + * `/tmp/swe-x-evil` from matching the root `/tmp/swe-x`. Pure and side-effect-free. + */ +export const isInsideJail = (jailRoot: string, real: string): boolean => real === jailRoot || real.startsWith(jailRoot + sep) interface Ws { dir: string task: BenchTask } -const workspaces = new Map() /** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The * supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout @@ -41,6 +59,8 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ const adapter = createSweBenchAdapter() const pool = await adapter.loadTasks({ limit: poolN, split: 'test' }) const byId = new Map(pool.map((t) => [t.id, t])) + // Each environment owns its workspace registry so concurrent environments don't share state. + const workspaces = new Map() const environment: AgenticSurface = { name: 'swe-bench-verified', @@ -70,9 +90,18 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ async call(handle, name, args) { const ws = workspaces.get(handle.id) if (!ws) return 'ERROR: workspace closed' - const safe = (p: string): string | null => { - if (p.startsWith('/') || p.includes('..')) return null - return p.replace(/^\.\//, '') + // Cheap pre-filter: reject absolute paths and `..` traversal, strip a leading `./`. 
The real + // boundary is the realpath jail check below (resolveInJail) — `safe` only normalizes the string + // form. `ws.dir` is passed for signature symmetry; the filter itself is root-independent. + const safe = (p: string): string | null => jailPath(ws.dir, p) + // Resolve `relPath` to an absolute path and assert it stays inside the workspace AFTER following + // symlinks (a repo symlink targeting /etc/passwd would otherwise escape the string-only jail). + // The target must exist (both callers read it first); a missing path throws and the caller + // surfaces the error message, matching the previous read-then-fail behavior. + const jailRoot = realpathSync(ws.dir) + const resolveInJail = (relPath: string): string | null => { + const real = realpathSync(join(ws.dir, relPath)) + return isInsideJail(jailRoot, real) ? real : null } if (name === 'list_files') { const sub = safe(String(args.dir ?? '')) ?? '' @@ -106,8 +135,15 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ if (name === 'read_file') { const p = safe(String(args.path ?? '')) if (!p) return 'ERROR: invalid path' + let real: string | null try { - const c = readFileSync(join(ws.dir, p), 'utf8') + real = resolveInJail(p) + } catch (e) { + return `(error: ${(e as Error).message})` + } + if (!real) return `ERROR: path ${p} escapes the workspace` + try { + const c = readFileSync(real, 'utf8') return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c } catch (e) { return `(error: ${(e as Error).message})` @@ -119,9 +155,16 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).' const oldStr = String(args.old_string ?? '') const newStr = String(args.new_string ?? 
'') + let real: string | null + try { + real = resolveInJail(p) + } catch (e) { + return `(cannot read ${p}: ${(e as Error).message})` + } + if (!real) return `ERROR: path ${p} escapes the workspace` let content: string try { - content = readFileSync(join(ws.dir, p), 'utf8') + content = readFileSync(real, 'utf8') } catch (e) { return `(cannot read ${p}: ${(e as Error).message})` } @@ -129,7 +172,7 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ const count = content.split(oldStr).length - 1 if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.` if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.` - writeFileSync(join(ws.dir, p), content.replace(oldStr, newStr)) + writeFileSync(real, content.replace(oldStr, newStr)) return `edited ${p}: replaced 1 occurrence` } return `ERROR: unknown tool ${name}` diff --git a/bench/swe-self-improve.mts b/bench/src/swe-self-improve.mts similarity index 96% rename from bench/swe-self-improve.mts rename to bench/src/swe-self-improve.mts index a1d72cff..069f3b31 100644 --- a/bench/swe-self-improve.mts +++ b/bench/src/swe-self-improve.mts @@ -4,14 +4,14 @@ * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT * applies (public fixes may be memorized) — reported, never claimed clean. 
* - * CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts - * Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts + * CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/src/swe-self-improve.mts + * Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/src/swe-self-improve.mts */ import { mkdtempSync, rmSync } from 'node:fs' import { join } from 'node:path' import { createChatClient } from '@tangle-network/agent-eval' import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops' -import { createSweBenchEnvironment } from './src/swe-bench-env' +import { createSweBenchEnvironment } from './swe-bench-env' async function main(): Promise { const routerKey = process.env.TANGLE_API_KEY diff --git a/docs/api/mcp.md b/docs/api/mcp.md index 1cbb79b3..4892b9c4 100644 --- a/docs/api/mcp.md +++ b/docs/api/mcp.md @@ -3956,40 +3956,6 @@ Defined in: [mcp/tools/coordination.ts:56](https://github.com/tangle-network/age *** -### AnalystRegistry - -Defined in: [mcp/tools/coordination.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L62) - -#### Properties - -##### kinds - -> `readonly` **kinds**: readonly `object`[] - -Defined in: [mcp/tools/coordination.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L63) - -##### run - -> `readonly` **run**: (`kindId`, `trace`) => `Promise`\<`unknown`\> - -Defined in: [mcp/tools/coordination.ts:64](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L64) - -###### Parameters - -###### kindId - -`string` - -###### trace - -`unknown` - -###### Returns - -`Promise`\<`unknown`\> - -*** - ### CoordinationToolsOptions Defined in: [mcp/tools/coordination.ts:94](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L94) @@ -4010,7 
+3976,7 @@ Defined in: [mcp/tools/coordination.ts:96](https://github.com/tangle-network/age ##### makeWorkerAgent -> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](#makeworkeragent) +> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](runtime.md#makeworkeragent) Defined in: [mcp/tools/coordination.ts:97](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L97) @@ -4022,7 +3988,7 @@ Defined in: [mcp/tools/coordination.ts:98](https://github.com/tangle-network/age ##### analysts? -> `readonly` `optional` **analysts?**: [`AnalystRegistry`](#analystregistry) +> `readonly` `optional` **analysts?**: [`AnalystRegistry`](runtime.md#analystregistry) Defined in: [mcp/tools/coordination.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L99) @@ -5812,24 +5778,6 @@ Defined in: [mcp/tools/coordination.ts:60](https://github.com/tangle-network/age *** -### MakeWorkerAgent - -> **MakeWorkerAgent** = (`profile`) => [`Agent`](runtime.md#agent)\<`unknown`, `unknown`\> - -Defined in: [mcp/tools/coordination.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L92) - -#### Parameters - -##### profile - -`unknown` - -#### Returns - -[`Agent`](runtime.md#agent)\<`unknown`, `unknown`\> - -*** - ### DelegateResult > **DelegateResult** = \{ `status`: `"winner"`; `out`: `unknown`; `outRef`: `string`; `spentTotal`: [`Spend`](runtime.md#spend); \} \| \{ `status`: `"no-winner"`; `reason`: `string`; `spentTotal`: [`Spend`](runtime.md#spend); \} @@ -7748,6 +7696,18 @@ Re-exports [mcpToolsForRuntimeMcpSubset](index.md#mcptoolsforruntimemcpsubset) *** +### AnalystRegistry + +Re-exports [AnalystRegistry](runtime.md#analystregistry) + +*** + ### CoordinationEvent Re-exports [CoordinationEvent](runtime.md#coordinationevent) + +*** + +### MakeWorkerAgent + +Re-exports [MakeWorkerAgent](runtime.md#makeworkeragent) diff --git a/docs/api/primitive-catalog.md 
b/docs/api/primitive-catalog.md index 27526a6b..17362d32 100644 --- a/docs/api/primitive-catalog.md +++ b/docs/api/primitive-catalog.md @@ -337,7 +337,7 @@ Import from `@tangle-network/agent-runtime/intelligence` — 60 exports. ### Recursive atom + loop kernel (alias of ./runtime) -Import from `@tangle-network/agent-runtime/loops` — 381 exports. +Import from `@tangle-network/agent-runtime/loops` — 383 exports. | Symbol | Kind | Summary | |---|---|---| @@ -487,6 +487,7 @@ Import from `@tangle-network/agent-runtime/loops` — 381 exports. | `AgentTurnInput` | interface | _(no summary — add a TSDoc line at the declaration)_ | | `AgentTurnResult` | interface | _(no summary — add a TSDoc line at the declaration)_ | | `AnalystFinding` | interface | Unified envelope every analyst emits. Schema-versioned so renderers | +| `AnalystRegistry` | interface | _(no summary — add a TSDoc line at the declaration)_ | | `AnytimeReport` | interface | _(no summary — add a TSDoc line at the declaration)_ | | `AnytimeStrategySummary` | interface | _(no summary — add a TSDoc line at the declaration)_ | | `AnytimeTaskCurve` | interface | anytimeReport — time-to-satisfactory-output metrics, derived entirely from the | @@ -702,6 +703,7 @@ Import from `@tangle-network/agent-runtime/loops` — 381 exports. | `LoopShape` | type | A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it | | `LoopTraceEvent` | type | _(no summary — add a TSDoc line at the declaration)_ | | `LoopUntil` | type | `loopUntil(spec)` — build the iterative-deepening combinator. `seed` is the initial state. | +| `MakeWorkerAgent` | type | _(no summary — add a TSDoc line at the declaration)_ | | `MountRecorder` | type | Records a mounted resource into the run's provenance manifest. Passed to | | `Outcome` | type | The terminal contract Drew wants: a loop returns a FINISHED deliverable, or the concrete | | `Panel` | type | `panel(spec)` — build the M-judge write-only-merge combinator. 
| diff --git a/docs/api/runtime.md b/docs/api/runtime.md index eac1add0..4269b2e1 100644 --- a/docs/api/runtime.md +++ b/docs/api/runtime.md @@ -412,6 +412,40 @@ The last artifact read error, if the abort fired during the retry loop. ## Interfaces +### AnalystRegistry + +Defined in: [mcp/tools/coordination.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L62) + +#### Properties + +##### kinds + +> `readonly` **kinds**: readonly `object`[] + +Defined in: [mcp/tools/coordination.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L63) + +##### run + +> `readonly` **run**: (`kindId`, `trace`) => `Promise`\<`unknown`\> + +Defined in: [mcp/tools/coordination.ts:64](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L64) + +###### Parameters + +###### kindId + +`string` + +###### trace + +`unknown` + +###### Returns + +`Promise`\<`unknown`\> + +*** + ### WorktreeCommandResult Defined in: [mcp/worktree-harness.ts:39](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/worktree-harness.ts#L39) @@ -7231,7 +7265,7 @@ What the spawn was supposed to produce — surfaced in traces/reports. 
### DriverAgentOptions -Defined in: [runtime/supervise/coordination-driver.ts:45](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L45) +Defined in: [runtime/supervise/coordination-driver.ts:46](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L46) #### Properties @@ -7239,13 +7273,13 @@ Defined in: [runtime/supervise/coordination-driver.ts:45](https://github.com/tan > `readonly` **name**: `string` -Defined in: [runtime/supervise/coordination-driver.ts:46](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L46) +Defined in: [runtime/supervise/coordination-driver.ts:47](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L47) ##### brain > `readonly` **brain**: [`ToolLoopChat`](#toolloopchat) -Defined in: [runtime/supervise/coordination-driver.ts:50](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L50) +Defined in: [runtime/supervise/coordination-driver.ts:51](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L51) The driver-LLM seam — ONE inference turn over the conversation + the coordination tool specs (the canonical `ToolLoopChat`): a scripted mock offline, the router's tool-calling in @@ -7255,15 +7289,15 @@ The driver-LLM seam — ONE inference turn over the conversation + the coordinat > `readonly` **blobs**: [`ResultBlobStore`](#resultblobstore) -Defined in: [runtime/supervise/coordination-driver.ts:52](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L52) +Defined in: [runtime/supervise/coordination-driver.ts:53](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L53) Shared blob store — `observe_agent` reads settled outputs through it. 
##### makeWorkerAgent -> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](mcp.md#makeworkeragent) +> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](#makeworkeragent) -Defined in: [runtime/supervise/coordination-driver.ts:54](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L54) +Defined in: [runtime/supervise/coordination-driver.ts:55](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L55) Resolve a spawned `profile` to a worker LEAF or a driver child (the recursion seam). @@ -7271,7 +7305,7 @@ Resolve a spawned `profile` to a worker LEAF or a driver child (the recursion se > `readonly` **perWorker**: [`Budget`](#budget-10) -Defined in: [runtime/supervise/coordination-driver.ts:56](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L56) +Defined in: [runtime/supervise/coordination-driver.ts:57](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L57) Per-child budget reserved from the conserved pool on each spawn. @@ -7279,16 +7313,35 @@ Per-child budget reserved from the conserved pool on each spawn. > `readonly` `optional` **maxLiveWorkers?**: `number` -Defined in: [runtime/supervise/coordination-driver.ts:59](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L59) +Defined in: [runtime/supervise/coordination-driver.ts:60](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L60) Hard cap on simultaneously-LIVE workers — `spawn_agent` fails closed once this many are in flight (a concurrency fence on top of the conserved-pool fence). Omit/`<= 0` = no cap. +##### analysts? 
+ +> `readonly` `optional` **analysts?**: [`AnalystRegistry`](#analystregistry) + +Defined in: [runtime/supervise/coordination-driver.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L63) + +The analyst lenses available to the driver. Required for `analyzeOnSettle` (and `run_analyst`). + Unset → no analyst feed (status quo: the driver gets settled outputs, no findings). + +##### analyzeOnSettle? + +> `readonly` `optional` **analyzeOnSettle?**: readonly `string`[] + +Defined in: [runtime/supervise/coordination-driver.ts:67](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L67) + +Analyst kind ids run AUTOMATICALLY when a worker settles `done` — each result re-enters as a + `finding` the driver pulls and composes its next steer from. The UP-leg of the self-improving + loop. Omit/empty = no auto-analysis (status quo). Requires `analysts`. + ##### systemPrompt > `readonly` **systemPrompt**: `string` \| ((`task`) => `string`) -Defined in: [runtime/supervise/coordination-driver.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L62) +Defined in: [runtime/supervise/coordination-driver.ts:70](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L70) The driver's stance — a string, or built from the task (the worker-driver prompt / the generator). INJECTED so the prompt is a pluggable, optimizable role. 
@@ -7297,7 +7350,7 @@ The driver's stance — a string, or built from the task (the worker-driver prom > `readonly` `optional` **extraTools?**: readonly `object`[] -Defined in: [runtime/supervise/coordination-driver.ts:67](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L67) +Defined in: [runtime/supervise/coordination-driver.ts:75](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L75) WORK tools the driver may call DIRECTLY (alongside the coordination verbs) — so the driver is not a pure manager but a full agent that can ACT (do simple work itself) OR SPAWN (delegate). @@ -7308,7 +7361,7 @@ WORK tools the driver may call DIRECTLY (alongside the coordination verbs) — s > `readonly` `optional` **executeExtraTool?**: (`name`, `args`) => `Promise`\<`string` \| `null` \| `undefined`\> -Defined in: [runtime/supervise/coordination-driver.ts:74](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L74) +Defined in: [runtime/supervise/coordination-driver.ts:82](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L82) Runs an `extraTools` call. Returns a string result, or null/undefined to signal "not handled" so the call falls through to the coordination dispatch. Required iff `extraTools` is set. @@ -7331,7 +7384,7 @@ Runs an `extraTools` call. Returns a string result, or null/undefined to signal > `readonly` `optional` **maxTurns?**: `number` -Defined in: [runtime/supervise/coordination-driver.ts:82](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L82) +Defined in: [runtime/supervise/coordination-driver.ts:90](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L90) Max driver turns before the loop force-finalizes on the best settled child. Default 16. 
`0` lifts the turn-COUNT cap: the loop is bounded instead by the conserved budget pool, @@ -7342,7 +7395,7 @@ Max driver turns before the loop force-finalizes on the best settled child. Defa > `readonly` `optional` **now?**: () => `number` -Defined in: [runtime/supervise/coordination-driver.ts:85](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L85) +Defined in: [runtime/supervise/coordination-driver.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L93) Injected clock for the in-loop absolute-deadline guard — keeps the deadline check deterministic in tests. Defaults to `Date.now`. @@ -7355,7 +7408,7 @@ Injected clock for the in-loop absolute-deadline guard — keeps the deadline ch > `readonly` `optional` **compaction?**: [`ToolLoopCompactionOptions`](#toolloopcompactionoptions) -Defined in: [runtime/supervise/coordination-driver.ts:94](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L94) +Defined in: [runtime/supervise/coordination-driver.ts:102](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L102) Give the driver brain a chapter-lifecycle on its OWN context window. The LLM-brain front doors lose to a dumb-Ralph respawn because the brain re-bills its whole coordination transcript every @@ -8221,7 +8274,7 @@ The completion oracle for backend-derived workers (settled ⟺ delivered). Stron ##### makeWorkerAgent? -> `readonly` `optional` **makeWorkerAgent?**: [`MakeWorkerAgent`](mcp.md#makeworkeragent) +> `readonly` `optional` **makeWorkerAgent?**: [`MakeWorkerAgent`](#makeworkeragent) Defined in: [runtime/supervise/supervise.ts:56](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L56) @@ -8301,11 +8354,31 @@ Hard cap on simultaneously-LIVE workers — `spawn_agent` fails closed once this flight. 
The conserved pool bounds TOTAL work; this bounds SIMULTANEOUS work (live boxes/ sandboxes a real fleet runs at once). Omit/`<= 0` = no cap (the pool stays the only fence). +##### analysts? + +> `readonly` `optional` **analysts?**: [`AnalystRegistry`](#analystregistry) + +Defined in: [runtime/supervise/supervise.ts:84](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L84) + +Analyst lenses available to the driver. Required for `analyzeOnSettle`. Unset → status quo + (the driver receives settled worker outputs, no analyst findings). + +##### analyzeOnSettle? + +> `readonly` `optional` **analyzeOnSettle?**: readonly `string`[] + +Defined in: [runtime/supervise/supervise.ts:89](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L89) + +Analyst kind ids run AUTOMATICALLY when a worker settles `done` — each re-enters as a `finding` + the driver pulls (`await_event`) and composes its next steer from. The self-improving UP-leg, + threaded to the driver at this level (propagate to sub-drivers via a recursive `makeWorkerAgent`). + Omit/empty = status quo (no analyst feed). Requires `analysts`. + ##### blobs? > `readonly` `optional` **blobs?**: [`ResultBlobStore`](#resultblobstore) -Defined in: [runtime/supervise/supervise.ts:83](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L83) +Defined in: [runtime/supervise/supervise.ts:91](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L91) Worker output store. Defaults to in-memory. @@ -8313,19 +8386,19 @@ Worker output store. Defaults to in-memory. 
> `readonly` `optional` **maxDepth?**: `number` -Defined in: [runtime/supervise/supervise.ts:84](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L84) +Defined in: [runtime/supervise/supervise.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L92) ##### maxTurns? > `readonly` `optional` **maxTurns?**: `number` -Defined in: [runtime/supervise/supervise.ts:85](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L85) +Defined in: [runtime/supervise/supervise.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L93) ##### compaction? > `readonly` `optional` **compaction?**: [`ToolLoopCompactionOptions`](#toolloopcompactionoptions) -Defined in: [runtime/supervise/supervise.ts:91](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L91) +Defined in: [runtime/supervise/supervise.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L99) Give the supervisor brain a chapter-lifecycle on its OWN context window (router arm only): once its coordination transcript exceeds `thresholdTokens` it distills to a compact progress note and @@ -8337,13 +8410,13 @@ Give the supervisor brain a chapter-lifecycle on its OWN context window (router > `readonly` `optional` **runId?**: `string` -Defined in: [runtime/supervise/supervise.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L92) +Defined in: [runtime/supervise/supervise.ts:100](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L100) ##### now? 
> `readonly` `optional` **now?**: () => `number` -Defined in: [runtime/supervise/supervise.ts:93](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L93) +Defined in: [runtime/supervise/supervise.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L101) ###### Returns @@ -8353,7 +8426,7 @@ Defined in: [runtime/supervise/supervise.ts:93](https://github.com/tangle-networ > `readonly` `optional` **allowedModels?**: readonly `string`[] -Defined in: [runtime/supervise/supervise.ts:97](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L97) +Defined in: [runtime/supervise/supervise.ts:105](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L105) Restrict the run to this subset of models. When set, every configured model — the supervisor router model, the profile's model, and the backend's model — must be a member, @@ -8416,7 +8489,7 @@ Defined in: [runtime/supervise/supervisor-agent.ts:70](https://github.com/tangle ##### makeWorkerAgent -> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](mcp.md#makeworkeragent) +> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](#makeworkeragent) Defined in: [runtime/supervise/supervisor-agent.ts:72](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L72) @@ -8495,17 +8568,34 @@ Runs an `extraTools` call; null/undefined falls through to the coordination disp `Promise`\<`string` \| `null` \| `undefined`\> +##### analysts? + +> `readonly` `optional` **analysts?**: [`AnalystRegistry`](#analystregistry) + +Defined in: [runtime/supervise/supervisor-agent.ts:98](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L98) + +Analyst lenses available to the driver (both arms). Required for `analyzeOnSettle`. + +##### analyzeOnSettle? 
+ +> `readonly` `optional` **analyzeOnSettle?**: readonly `string`[] + +Defined in: [runtime/supervise/supervisor-agent.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L101) + +Analyst kinds run on each worker-settle → a `finding` the driver composes its next steer from + (the self-improving UP-leg). Unset/empty = status quo (no analyst feed). Requires `analysts`. + ##### maxTurns? > `readonly` `optional` **maxTurns?**: `number` -Defined in: [runtime/supervise/supervisor-agent.ts:97](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L97) +Defined in: [runtime/supervise/supervisor-agent.ts:102](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L102) ##### compaction? > `readonly` `optional` **compaction?**: [`ToolLoopCompactionOptions`](#toolloopcompactionoptions) -Defined in: [runtime/supervise/supervisor-agent.ts:101](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L101) +Defined in: [runtime/supervise/supervisor-agent.ts:106](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L106) Give the supervisor brain a chapter-lifecycle on its OWN context window (router arm only) — it distills its coordination transcript to a compact progress note once it exceeds the threshold, @@ -12109,6 +12199,24 @@ Every message on the one typed pipe. 
UP (child→parent): question / settled / f *** +### MakeWorkerAgent + +> **MakeWorkerAgent** = (`profile`) => [`Agent`](#agent)\<`unknown`, `unknown`\> + +Defined in: [mcp/tools/coordination.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L92) + +#### Parameters + +##### profile + +`unknown` + +#### Returns + +[`Agent`](#agent)\<`unknown`, `unknown`\> + +*** + ### InProcessOnPrompt > **InProcessOnPrompt** = (`prompt`, `ctx`) => `SandboxEvent`[] \| `AsyncIterable`\<`SandboxEvent`\> \| `Promise`\<`SandboxEvent`[]\> @@ -15394,7 +15502,7 @@ executor has produced its output. The inner `score` is preserved; only `valid` i > **driverAgent**(`opts`): [`Agent`](#agent)\<`unknown`, `unknown`\> -Defined in: [runtime/supervise/coordination-driver.ts:157](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L157) +Defined in: [runtime/supervise/coordination-driver.ts:165](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L165) Build the intelligent recursive driver. Its `act` is the LLM tool-loop; spawn it as a `driverChild` (`driver-executor.ts`) to run it inside a nested scope, recursively. @@ -15415,7 +15523,7 @@ Build the intelligent recursive driver. Its `act` is the LLM tool-loop; spawn it > **finalizeBestDelivered**(`settled`, `blobs`): `Promise`\<`unknown`\> -Defined in: [runtime/supervise/coordination-driver.ts:356](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L356) +Defined in: [runtime/supervise/coordination-driver.ts:373](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/coordination-driver.ts#L373) Keep-best finalize under the completion-oracle: return the highest-scoring DELIVERED child's output (settled `done` AND `valid` — its deliverable check passed). 
Returns undefined when no @@ -15463,7 +15571,7 @@ Stand up the coordination MCP over a live scope. The HOST address is `127.0.0.1` ###### makeWorkerAgent -[`MakeWorkerAgent`](mcp.md#makeworkeragent) +[`MakeWorkerAgent`](#makeworkeragent) ###### perWorker @@ -15486,7 +15594,7 @@ Hard cap on simultaneously-LIVE workers — `spawn_agent` fails closed once this ###### analysts? -[`AnalystRegistry`](mcp.md#analystregistry) +[`AnalystRegistry`](#analystregistry) Trace-analyst lenses the driver can run (`run_analyst`) or auto-fire on settle. @@ -15806,7 +15914,7 @@ Fail loud on a `down` settlement: only a `done` child is an iteration. ### workerFromBackend() -> **workerFromBackend**(`backend`, `deliverable?`): [`MakeWorkerAgent`](mcp.md#makeworkeragent) +> **workerFromBackend**(`backend`, `deliverable?`): [`MakeWorkerAgent`](#makeworkeragent) Defined in: [runtime/supervise/supervise.ts:26](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L26) @@ -15826,7 +15934,7 @@ Build the worker seam from a backend (WHERE workers run) + an optional completio #### Returns -[`MakeWorkerAgent`](mcp.md#makeworkeragent) +[`MakeWorkerAgent`](#makeworkeragent) *** @@ -15834,7 +15942,7 @@ Build the worker seam from a backend (WHERE workers run) + an optional completio > **supervise**(`profile`, `task`, `opts`): `Promise`\<[`SupervisedResult`](#supervisedresult)\<`unknown`\>\> -Defined in: [runtime/supervise/supervise.ts:108](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L108) +Defined in: [runtime/supervise/supervise.ts:116](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervise.ts#L116) #### Parameters @@ -15860,7 +15968,7 @@ Defined in: [runtime/supervise/supervise.ts:108](https://github.com/tangle-netwo > **supervisorAgent**(`profile`, `deps`): [`Agent`](#agent)\<`unknown`, `unknown`\> -Defined in: 
[runtime/supervise/supervisor-agent.ts:104](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L104) +Defined in: [runtime/supervise/supervisor-agent.ts:109](https://github.com/tangle-network/agent-runtime/blob/main/src/runtime/supervise/supervisor-agent.ts#L109) #### Parameters diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts index 9fe2837e..cf9ac757 100644 --- a/examples/ablation-suite/ablation.ts +++ b/examples/ablation-suite/ablation.ts @@ -7,10 +7,13 @@ * just burns tokens. One-knob-delta design (baseline + each single knob flipped) keeps it O(N), not 2^N. * * STATUS — honest: the framework + the cost autopsy are real; knobs are wired incrementally. WIRED: - * `topology` (single/fanout/fanout-refine = refine/sample/sampleThenRefine) + `budget`. The rest are - * DECLARED knobs that FAIL LOUD if set (no silent no-op — you must not think GEPA ran when it didn't); - * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the - * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench. + * `topology` (single/fanout/fanout-refine = refine/sample/sampleThenRefine) + `budget`; `driverSteer` + * (the supervisor brain spawns + steers a graded worker, analyst up-leg on — via `selfImprovingSupervisor`) + * and `optimize:'gepa'` (GEPA-tune the driver's compose-prompt on a DISJOINT train slice, freeze, then + * drive — via `optimizeDriverPrompt`; implies `driverSteer`). STILL DECLARED + FAIL LOUD: `halo`, + * `persistentArtifact` (no silent no-op — each names its substrate primitive in the throw). Note: the + * driverSteer/optimize arms report real resolve, $, tokens + latency, but NOT shot/completion counts (N/A for a + * multi-worker supervised run, not a real zero). Validate on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench. 
*/ import { pairedBootstrap } from '@tangle-network/agent-eval' import { @@ -23,6 +26,14 @@ import { sampleThenRefine, } from '@tangle-network/agent-runtime/loops' import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder' +import { optimizeDriverPrompt } from './gepa-driver-prompt' +import { selfImprovingSupervisor } from './self-improving-supervisor' + +/** The baseline driver/steerer standing instruction — the compose-next-prompt the GEPA pass mutates + * (its `baselinePrompt`) and the prompt the supervisor runs with when `optimize` is off. Kept terse: + * GEPA earns the lift, this is only the floor. */ +const baselineDriverPrompt = + 'You are a driver coordinating one worker on a coding task. Read the worker’s settled output and the analyst finding, then steer the next attempt: name the concrete next action, require the worker to verify the change took, and only stop once every required check passes.' export interface AblationKnobs { /** WIRED → strategy: single=`refine` (iterate one artifact), fanout=`sample` (N parallel, pick best), @@ -55,16 +66,6 @@ const unwiredKnobs: Array<{ isSet: (v: unknown) => boolean prim: string }> = [ - { - k: 'driverSteer', - isSet: (v) => v === true, - prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding', - }, - { - k: 'optimize', - isSet: (v) => !!v && v !== 'off', - prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen", - }, { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' }, { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' }, ] @@ -99,6 +100,16 @@ export async function runAblation(opts: { maxTokens?: number innerTurns?: number } + /** The DRIVER brain's own router substrate (used by the `driverSteer`/`optimize` arms). Defaults to + * the worker's router + model. 
The supervisor's inference is separate compute from the worker's, so + * it carries its own model knob. */ + supervisor?: { + routerBaseUrl?: string + routerKey?: string + model?: string + /** Reflection model for the GEPA optimize pass (defaults to the supervisor/worker model). */ + reflectionModel?: string + } onArm?: (r: ArmResult) => void }): Promise { // ONE held-out set, shared across all arms — the fair-comparison invariant. @@ -110,6 +121,13 @@ export async function runAblation(opts: { knobs: { ...opts.base, ...d.knob } as AblationKnobs, })), ] + // The driver brain's router substrate (the `driverSteer`/`optimize` arms) — defaults to the worker's + // router + model. The supervisor's inference is separate compute from the worker's. + const supervisorRouter = { + baseUrl: opts.supervisor?.routerBaseUrl ?? opts.worker.routerBaseUrl, + apiKey: opts.supervisor?.routerKey ?? opts.worker.routerKey, + model: opts.supervisor?.model ?? opts.worker.model, + } const results: ArmResult[] = [] for (const arm of arms) { for (const u of unwiredKnobs) { @@ -118,34 +136,108 @@ export async function runAblation(opts: { `ablation: knob '${u.k}'=${JSON.stringify(arm.knobs[u.k])} (arm "${arm.name}") is DECLARED but not yet wired — wire it over ${u.prim} before claiming it ran. (No silent no-op.)`, ) } + // `optimize` implies `driverSteer`: a tuned compose-prompt only has effect through the driver loop. + const driverSteer = arm.knobs.driverSteer === true || arm.knobs.optimize === 'gepa' + + // The `optimize:'gepa'` knob: BEFORE the held-out arm runs, GEPA-tune the driver's compose-prompt on + // a DISJOINT train slice (offset past the held-out window so train ∩ holdout = ∅), freeze the winner, + // and use it for this arm's driverSteer runs. Off → the baseline standing prompt drives the loop. 
+ let driverPrompt = baselineDriverPrompt + let gepaUsd = 0 + if (arm.knobs.optimize === 'gepa') { + const opt = await optimizeDriverPrompt({ + surface: opts.environment, + tasks: opts.tasks, + trainOffset: opts.holdoutOffset + opts.holdoutN, + trainN: opts.holdoutN, + baselinePrompt: baselineDriverPrompt, + worker: opts.worker, + ...(opts.supervisor?.reflectionModel !== undefined + ? { reflectionModel: opts.supervisor.reflectionModel } + : {}), + }) + driverPrompt = opt.systemPrompt + gepaUsd = opt.usd // the TRAIN-side GEPA cost, counted into this arm's $ (the fair-cost invariant) + console.log( + `ablation: arm "${arm.name}" GEPA driver-prompt ${opt.shipped ? 'SHIPPED' : 'kept-baseline'} (train lift ${(100 * opt.lift).toFixed(0)}pp)`, + ) + } + let resolved = 0 let ti = 0 let to = 0 - let usd = 0 + let usd = gepaUsd // seed with the TRAIN-side GEPA optimization cost so the arm's $ is honest let ms = 0 let shots = 0 let comps = 0 const perTask: number[] = [] for (const t of tasks) { - const r = await runAgentic({ - surface: opts.environment, - task: t, - strategy: topologyStrategy[arm.knobs.topology], - budget: arm.knobs.budget, - routerBaseUrl: opts.worker.routerBaseUrl, - routerKey: opts.worker.routerKey, - model: opts.worker.model, - ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}), - ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}), - }) - if (r.resolved) resolved++ - perTask.push(r.resolved ? 1 : 0) - ti += r.tokens.input - to += r.tokens.output - usd += r.usd - ms += r.ms - shots += r.shots - comps += r.completions + try { + if (driverSteer) { + // The driver-steered path: the supervisor brain spawns + steers a graded worker on a conserved + // pool, with the analyst up-leg on. `selfImprovingSupervisor` reports the deployable outcome + + // the FULL conserved spend (driver inference + all worker work: $, tokens, latency). 
`shots` + // stays 0 — a multi-worker supervised run has no single refine-shot count (N/A, not a real zero). + const sup = await selfImprovingSupervisor({ + surface: opts.environment, + task: t, + driverPrompt, + worker: { + routerBaseUrl: opts.worker.routerBaseUrl, + routerKey: opts.worker.routerKey, + model: opts.worker.model, + ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}), + ...(opts.worker.innerTurns !== undefined + ? { innerTurns: opts.worker.innerTurns } + : {}), + budget: arm.knobs.budget, + }, + budget: { + // Pool for the driver's turns PLUS several worker spawns (each reserves ~innerTurns+2 + // iterations) so the analyst up-leg can drive a spawn-refine loop, not stall after one + // worker. The autopsy measures the real cost; this is intentionally not equal-k. + maxIterations: arm.knobs.budget * ((opts.worker.innerTurns ?? 6) + 2) + 16, + maxTokens: (opts.worker.maxTokens ?? 4000) * Math.max(4, arm.knobs.budget * 3), + }, + analyze: true, + router: supervisorRouter, + }) + if (sup.resolved) resolved++ + perTask.push(sup.resolved ? 1 : 0) + usd += sup.usd + ti += sup.tokensIn + to += sup.tokensOut + ms += sup.ms + } else { + const r = await runAgentic({ + surface: opts.environment, + task: t, + strategy: topologyStrategy[arm.knobs.topology], + budget: arm.knobs.budget, + routerBaseUrl: opts.worker.routerBaseUrl, + routerKey: opts.worker.routerKey, + model: opts.worker.model, + ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}), + ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}), + }) + if (r.resolved) resolved++ + perTask.push(r.resolved ? 1 : 0) + ti += r.tokens.input + to += r.tokens.output + usd += r.usd + ms += r.ms + shots += r.shots + comps += r.completions + } + } catch (e) { + // One task throw (network/quota/etc.) 
must not lose the whole arm's accumulated data: + // count it as unresolved and keep going so the arm returns partial results. Warn loud. + const msg = e instanceof Error ? e.message : String(e) + console.warn( + `ablation: arm "${arm.name}" task "${t.id}" failed (counted unresolved): ${msg}`, + ) + perTask.push(0) + } } const n = tasks.length const res: ArmResult = { @@ -217,19 +309,29 @@ async function main(): Promise { maxTokens: 4000, innerTurns: Number(process.env.INNER_TURNS ?? 6), } - console.log(`═══ ABLATION (cheap contamination-proof task) — worker=${worker.model} ═══`) + const supervisor = { + model: process.env.SUPERVISOR_MODEL ?? worker.model, + reflectionModel: process.env.REFLECTION_MODEL ?? 'gemini-2.5-pro', + } + console.log( + `═══ ABLATION (cheap contamination-proof task) — worker=${worker.model} driver=${supervisor.model} ═══`, + ) const results = await runAblation({ environment: codingEnv, tasks: codingTasks, holdoutOffset: 100, // a fixed disjoint held-out slice holdoutN: Number(process.env.HOLDOUT_N ?? 6), base: { topology: 'single', budget: Number(process.env.BUDGET ?? 2) }, - // one-knob-delta: flip ONLY topology (the wired knob) vs baseline. + // one-knob-delta: flip ONLY one knob vs baseline. topology is the cheap free arm; driverSteer adds + // the driver brain; optimize tunes that brain's compose-prompt on a disjoint train slice first. 
deltas: [ { name: 'fanout', knob: { topology: 'fanout' } }, { name: 'fanout-refine', knob: { topology: 'fanout-refine' } }, + { name: 'driver-steer', knob: { driverSteer: true } }, + { name: 'driver-gepa', knob: { optimize: 'gepa' } }, ], worker, + supervisor, onArm: (r) => console.log( ` ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`, diff --git a/examples/ablation-suite/gepa-driver-prompt.ts b/examples/ablation-suite/gepa-driver-prompt.ts new file mode 100644 index 00000000..2ed130ac --- /dev/null +++ b/examples/ablation-suite/gepa-driver-prompt.ts @@ -0,0 +1,167 @@ +/** + * gepa-driver-prompt — GEPA-optimize the driver's compose-next-prompt on TRAIN, executable-graded, + * frozen, held-out-certified, and return the winner. + * + * This is the `optimize: 'gepa'` knob from the ablation board (ablation.ts), wired over the real + * substrate: agent-eval's `selfImprove` (the held-out-gated closed loop) driven by `gepaProposer` + * (the reflective prompt mutator). It is NOT `improve()` — `improve()` writes the winner back into an + * `AgentProfile` field, but the steerer prompt (what the driver composes the next round's instruction + * from) is not a profile field. So we call `selfImprove` directly with the steerer string as the + * `baselineSurface` the proposer mutates. + * + * The grading is EXECUTABLE, never an LLM judge: each candidate steerer runs a real `refine` rollout + * over the surface (its harness-verified `resolved`/`score`), and the `JudgeConfig` reads those + * outcomes straight off the artifact. A candidate's fitness IS the resolve it actually earned on the + * environment's own check — there is no model in the scoring loop to flatter it. 
+ * + * The candidate steerer reaches the run through `refine`'s built-in analyst steerer + * (`AgenticOptions.analystInstruction`): the closest in-strategy proxy for "the driver's + * compose-next-prompt", since `refine`'s between-shot analyst IS the thing that composes the next + * instruction from the trajectory. GEPA tunes that instruction; the held-out gate certifies it. + */ + +import { + type DispatchContext, + gepaProposer, + type JudgeConfig, + type MutableSurface, + type Scenario, + selfImprove, +} from '@tangle-network/agent-eval/contract' +import { + type AgenticRunResult, + type AgenticSurface, + type AgenticTask, + refine, + runAgentic, +} from '@tangle-network/agent-runtime/loops' + +/** One TRAIN scenario: the coding task carried as the scenario's domain payload. The agent reads + * `scenario.task` to run the rollout; the judge reads the artifact the rollout produced. */ +interface DriverPromptScenario extends Scenario { + task: AgenticTask +} + +/** The default reflection model — a model the Tangle router actually serves. The substrate default + * (`anthropic/claude-sonnet-4.6`) is NOT served by the router, so `gepaProposer` would fail every + * reflection call; callers should pass their own, but this keeps the zero-config path live. */ +const defaultReflectionModel = 'gemini-2.5-pro' + +/** The mutation levers offered to the reflective proposer — what a steerer-prompt rewrite may change. + * These orient the model toward the kinds of edits that move a compose-next-prompt's effectiveness. 
*/ +const steererMutationPrimitives = [ + 'sharpen what the reviewer must check on the trajectory before recommending an action', + 'make the recommended next actions more concrete and tool-grounded', + 'add an explicit verify-it-took step after each change', + 'tighten the COMPLETE / continue decision so it stops only when every required change is verified', +] + +export async function optimizeDriverPrompt(opts: { + surface: AgenticSurface + tasks: (offset: number, n: number) => Promise + trainOffset: number + trainN: number + baselinePrompt: string + worker: { + routerBaseUrl: string + routerKey: string + model: string + maxTokens?: number + innerTurns?: number + } + reflectionModel?: string +}): Promise<{ systemPrompt: string; lift: number; shipped: boolean; usd: number }> { + const { surface, worker } = opts + + // TRAIN scenarios — the disjoint training slice. `selfImprove` splits a held-out fraction off these + // for the gate, so the winner is certified on tasks the proposer never optimized against. + const trainTasks = await opts.tasks(opts.trainOffset, opts.trainN) + const scenarios: DriverPromptScenario[] = trainTasks.map((task) => ({ + id: task.id, + kind: 'coding', + task, + })) + + // The agent under improvement: it receives the CURRENT candidate steerer (the surface string) and + // runs a real `refine` rollout with that steerer as the analyst instruction. The returned artifact + // is the harness-verified `AgenticRunResult` — `resolved`/`score` come from `surface.score`, not a + // self-report, so the candidate cannot fabricate a win. + const agent = async ( + candidate: MutableSurface, + scenario: DriverPromptScenario, + _ctx: DispatchContext, + ): Promise => { + // The candidate is the steerer prompt. A `CodeSurface` is not a prompt — this loop only optimizes + // the string steerer, so a non-string candidate is a wiring error that must fail loud. 
+ if (typeof candidate !== 'string') { + throw new Error( + `optimizeDriverPrompt: candidate surface is a CodeSurface, not a steerer prompt — this loop optimizes the string steerer only`, + ) + } + return runAgentic({ + surface, + task: scenario.task, + strategy: refine, + budget: opts.worker.innerTurns ? Math.max(2, Math.ceil(opts.worker.innerTurns / 2)) : 2, + routerBaseUrl: worker.routerBaseUrl, + routerKey: worker.routerKey, + model: worker.model, + // The candidate steerer drives the run via refine's built-in between-shot analyst. + analystInstruction: candidate, + ...(worker.maxTokens !== undefined ? { maxTokens: worker.maxTokens } : {}), + ...(worker.innerTurns !== undefined ? { innerTurns: worker.innerTurns } : {}), + }) + } + + // The EXECUTABLE judge — no LLM in the scoring loop. Composite = the artifact's harness-verified + // resolve fraction; the `resolved` dimension is the binary deployable pass. A thrown judge would be + // recorded as a failed cell, so we read defensively-shaped numeric fields and never throw on shape. + const judge: JudgeConfig = { + name: 'surface-resolve', + dimensions: [ + { key: 'resolved', description: 'the surface verifier passed every check (1) or not (0)' }, + { key: 'score', description: 'the surface verifier pass fraction in [0,1]' }, + ], + score: ({ artifact }) => ({ + dimensions: { + resolved: artifact.resolved ? 1 : 0, + score: artifact.score, + }, + composite: artifact.score, + notes: `executable grade: resolved=${artifact.resolved} score=${artifact.score.toFixed(3)}`, + }), + } + + const reflectionModel = opts.reflectionModel ?? 
defaultReflectionModel + + const result = await selfImprove({ + agent, + scenarios, + judge, + baselineSurface: opts.baselinePrompt, + proposer: gepaProposer({ + llm: { baseUrl: worker.routerBaseUrl, apiKey: worker.routerKey }, + model: reflectionModel, + target: + 'the driver compose-next-prompt (the between-shot steerer that turns the trajectory into the next instruction)', + mutationPrimitives: steererMutationPrimitives, + }), + // One generation, two candidates, a third of TRAIN held out for the gate — the cheap proof shape; + // raise generations/populationSize for a deeper search once the cheap run is green. + budget: { generations: 1, populationSize: 2, holdoutFraction: 0.34 }, + }) + + // The winner surface is the promoted steerer. A `CodeSurface` winner is impossible here (the + // baseline + every mutation is a string), but guard the type so the return stays a clean string. + const winner = result.winner.surface + const systemPrompt = typeof winner === 'string' ? winner : opts.baselinePrompt + + return { + systemPrompt, + lift: result.lift, + shipped: result.gateDecision === 'ship', + // The TRAIN-side optimization cost (baseline + every generation) — counted into the arm's $ so the + // cost-aware ablation never hides the price of GEPA behind the held-out run alone. + usd: result.totalCostUsd, + } +} diff --git a/examples/ablation-suite/self-improving-supervisor.ts b/examples/ablation-suite/self-improving-supervisor.ts new file mode 100644 index 00000000..217bf423 --- /dev/null +++ b/examples/ablation-suite/self-improving-supervisor.ts @@ -0,0 +1,128 @@ +/** + * self-improving-supervisor — the one-call DX recipe for the driver-steered supervisor over a graded + * task. 
It composes three already-built seams instead of hand-wiring a loop: + * + * surfaceWorkerSeam → WHERE the worker runs + the completion oracle that makes "settled ⟺ delivered" + * supervise() → the LLM driver brain that spawns + steers the worker on a conserved budget + * analysts/onSettle → the self-improving UP-leg: when a worker settles, an analyst reads its output + * and re-enters a short `finding` the driver composes its next steer from + * + * `analyze` is the one knob that flips the up-leg on: off → the driver sees raw settled outputs; on → + * the driver also receives a one-line analyst read of each settled worker (the steer firewall stays in + * the analyst registry — the analyst summarizes, it never decides the verdict). + */ +import { + type AgenticSurface, + type AgenticTask, + type SupervisorProfile, + supervise, +} from '@tangle-network/agent-runtime/loops' +import { type SurfaceWorkerOut, surfaceWorkerSeam } from './surface-worker' + +export interface SelfImprovingSupervisorOptions { + /** The agentic surface the worker acts on (grading + task generation live here). */ + readonly surface: AgenticSurface + /** The single graded task the supervisor must resolve. */ + readonly task: AgenticTask + /** The driver brain's standing instruction — the optimized prompt from the GEPA pass, or a baseline. */ + readonly driverPrompt: string + /** WHERE the worker runs (router substrate + model + inner-loop bounds). Threaded to the seam. */ + readonly worker: { + readonly routerBaseUrl: string + readonly routerKey: string + readonly model: string + readonly maxTokens?: number + readonly innerTurns?: number + readonly budget?: number + } + /** The conserved compute pool for the whole supervised run. */ + readonly budget: { readonly maxIterations: number; readonly maxTokens: number } + /** Flip the self-improving up-leg on: feed the driver a one-line analyst read of each settled worker. 
*/ + readonly analyze?: boolean + /** The supervisor brain's router substrate (the driver's own inference). */ + readonly router: { readonly baseUrl: string; readonly apiKey: string; readonly model: string } +} + +/** The minimal one-lens registry used only when `analyze` is on: a single `progress` lens that reads + * the worker's settled output and hands the driver a short summary (the up-leg). It declares its kind + * so `analyzeOnSettle:['progress']` resolves, and its `run` returns the `{ summary }` read. The shape + * is validated structurally against `supervise`'s `analysts` option at the call site. */ +function progressAnalyst() { + return { + kinds: [ + { + id: 'progress', + description: "Summarize the worker's settled output for the driver's next steer.", + area: 'progress', + }, + ], + run: async (_kindId: string, trace: unknown) => { + // `trace` is the worker's settled blob — a SurfaceWorkerOut object. `String(obj)` yields the + // useless literal '[object Object]', so read the real fields into the driver's next-steer context. + const w = (trace ?? {}) as Partial + const summary = + typeof w === 'object' && w !== null && 'resolved' in w + ? `worker ${w.resolved ? 'RESOLVED' : 'did NOT resolve'} — score ${(100 * (w.score ?? 0)).toFixed(0)}%, ${w.shots ?? '?'} shot(s)${w.summary ? `: ${w.summary}` : ''}` + : `worker produced: ${JSON.stringify(trace).slice(0, 400)}` + return { summary } + }, + } +} + +/** Run the driver-steered supervisor over one graded task and report the deployable outcome: + * `resolved` (a winner delivered), `score` ([0,1] from the completion verdict), and `usd` (the real + * conserved spend — paid even on a no-winner). 
*/ +export async function selfImprovingSupervisor(opts: SelfImprovingSupervisorOptions): Promise<{ + resolved: boolean + score: number + usd: number + tokensIn: number + tokensOut: number + ms: number +}> { + const seam = surfaceWorkerSeam({ + surface: opts.surface, + task: opts.task, + worker: opts.worker, + }) + + const profile: SupervisorProfile = { name: 'driver', systemPrompt: opts.driverPrompt } + + // Size the per-worker reservation so MULTIPLE workers fit the conserved pool. The default reserves + // the WHOLE iteration pool per worker (supervise.defaultPerWorker forwards budget.maxIterations + // unchanged), so only one worker ever spawns — which would defeat the spawn-a-refined-worker steering + // the analyst up-leg exists to drive. A small per-worker iteration slice lets the driver re-spawn. + const perWorkerIters = (opts.worker.innerTurns ?? 6) + 2 + + const result = await supervise(profile, opts.task, { + makeWorkerAgent: seam.makeWorkerAgent, + deliverable: seam.deliverable, + budget: opts.budget, + perWorker: { maxIterations: perWorkerIters, maxTokens: opts.worker.maxTokens ?? 4000 }, + router: { + routerBaseUrl: opts.router.baseUrl, + routerKey: opts.router.apiKey, + model: opts.router.model, + }, + ...(opts.analyze + ? { analysts: progressAnalyst(), analyzeOnSettle: ['progress'] as const } + : {}), + }) + + // The supervise winner carries the driver's finalize output (the best-delivered worker's blob), NOT a + // verdict field — read the real surface-checked score/resolved off that SurfaceWorkerOut. + const out = result.kind === 'winner' ? (result.out as SurfaceWorkerOut | undefined) : undefined + const resolved = out?.resolved ?? false + const score = out?.score ?? 0 + // Report the FULL conserved spend (driver inference + all worker work) so the cost-aware ablation has + // real token + latency columns for this arm, not fake zeros. 
+ const sp = result.spentTotal + return { + resolved, + score, + usd: sp.usd, + tokensIn: sp.tokens.input, + tokensOut: sp.tokens.output, + ms: sp.ms, + } +} diff --git a/examples/ablation-suite/surface-worker.ts b/examples/ablation-suite/surface-worker.ts new file mode 100644 index 00000000..fbe84c38 --- /dev/null +++ b/examples/ablation-suite/surface-worker.ts @@ -0,0 +1,147 @@ +/** + * surface-worker — the GRADED-worker seam for the self-improving supervisor. + * + * `supervise()` spawns workers by resolving a profile through `makeWorkerAgent` to an `Agent` whose + * `executorSpec` carries a leaf `Executor`. This seam makes that worker actually WORK the + * `AgenticSurface` task: each spawned worker runs ONE `runAgentic({ surface, task, strategy: refine })` + * — the canonical depth tool loop over the surface — and settles with the surface's score as its + * verdict. So the driver can spawn/steer workers and read a real, surface-checked result, not a + * self-report. + * + * The paired `deliverable` is the completion oracle: settled ⟺ resolved. A worker that ran but + * didn't drive the artifact to its final checked state settles `valid:false`, so a keep-best driver + * never counts it as done (the Foreman 0/18 lesson — "done" means the check passed). + * + * v1 SIMPLIFICATION: the worker IGNORES the driver's brief — every spawn is a fresh `refine` attempt + * on the SAME task. The driver's intelligence in v1 is allocation (how many workers, when to stop), + * not per-worker instruction authoring; threading a per-worker brief into the surface tool loop is the + * next increment. 
+ */
+
+import type {
+  Agent,
+  AgentProfile,
+  AgentSpec,
+  Executor,
+  ExecutorResult,
+  Spend,
+} from '@tangle-network/agent-runtime/loops'
+import {
+  type AgenticSurface,
+  type AgenticTask,
+  type DeliverableSpec,
+  type MakeWorkerAgent,
+  refine,
+  runAgentic,
+} from '@tangle-network/agent-runtime/loops'
+
+/** What the worker executor settles with — the surface verdict the driver + deliverable read.
+ * `resolved` is the surface check's pass/fail (settled ⟺ resolved); `score` is the partial-credit
+ * fraction; the rest is a short human summary for traces/reports. */
+export interface SurfaceWorkerOut {
+  readonly resolved: boolean
+  readonly score: number
+  readonly shots: number
+  readonly summary: string
+}
+
+export interface SurfaceWorkerOptions {
+  readonly surface: AgenticSurface
+  readonly task: AgenticTask
+  readonly worker: {
+    readonly routerBaseUrl: string
+    readonly routerKey: string
+    readonly model: string
+    readonly maxTokens?: number
+    readonly innerTurns?: number
+    /** refine shot budget for ONE worker attempt (max steered shots). Defaults to 1. */
+    readonly budget?: number
+  }
+}
+
+/** One spawned worker = one `runAgentic` refine attempt over the surface task. `execute` caches its
+ * result; `resultArtifact()` returns it (the scope's journaled replay source) or throws if unset. */
+function surfaceWorkerExecutor(opts: SurfaceWorkerOptions): Executor<SurfaceWorkerOut> {
+  const { surface, task, worker } = opts
+  let artifact: ExecutorResult<SurfaceWorkerOut> | undefined
+  return {
+    runtime: 'surface-worker',
+    // v1: the worker ignores the spawn `task` (the driver's brief) — each spawn is a fresh refine
+    // attempt on the SAME surface task. `runAgentic` already stamps real tokens/usd/ms from its
+    // conserved pool, so we forward those as the worker's Spend (no re-pricing here).
+    async execute(): Promise<ExecutorResult<SurfaceWorkerOut>> {
+      const r = await runAgentic({
+        surface,
+        task,
+        strategy: refine,
+        budget: worker.budget ?? 1,
+        routerBaseUrl: worker.routerBaseUrl,
+        routerKey: worker.routerKey,
+        model: worker.model,
+        ...(worker.maxTokens !== undefined ? { maxTokens: worker.maxTokens } : {}),
+        ...(worker.innerTurns !== undefined ? { innerTurns: worker.innerTurns } : {}),
+      })
+      const out: SurfaceWorkerOut = {
+        resolved: r.resolved,
+        score: r.score,
+        shots: r.shots,
+        summary: `refine ${r.shots} shot(s) → ${(100 * r.score).toFixed(0)}% (${
+          r.resolved ? 'resolved' : 'unresolved'
+        })`,
+      }
+      const spent: Spend = {
+        iterations: r.completions,
+        tokens: r.tokens,
+        usd: r.usd,
+        ms: r.ms,
+      }
+      artifact = {
+        outRef: `surface-worker:${task.id}:${r.shots}:${r.resolved ? 'ok' : 'no'}`,
+        out,
+        verdict: { valid: r.resolved, score: r.score },
+        spent,
+      }
+      return artifact
+    },
+    teardown: () => Promise.resolve({ destroyed: true }),
+    resultArtifact() {
+      if (!artifact) throw new Error('surfaceWorkerExecutor: resultArtifact before execute')
+      return artifact
+    },
+  }
+}
+
+/**
+ * Build the graded-worker seam: a `makeWorkerAgent` `supervise()` spawns through, and the matching
+ * `deliverable` (settled ⟺ resolved). Hand both to `supervise(profile, intent, { makeWorkerAgent,
+ * deliverable, budget })` — every spawned worker then works the surface task and settles with the
+ * surface-checked verdict.
+ */
+export function surfaceWorkerSeam(opts: SurfaceWorkerOptions): {
+  makeWorkerAgent: MakeWorkerAgent
+  deliverable: DeliverableSpec
+} {
+  const makeWorkerAgent: MakeWorkerAgent = (rawProfile) => {
+    const p = (rawProfile ?? {}) as { name?: unknown }
+    const name = typeof p.name === 'string' && p.name.length > 0 ? p.name : 'surface-worker'
+    // harness:null is unused — the BYO `executor` overrides harness resolution entirely (the scope
+    // resolves a BYO `spec.executor` first). `act` is never called for a spawned child.
+    const spec: AgentSpec = {
+      profile: rawProfile as AgentProfile,
+      harness: null,
+      executor: surfaceWorkerExecutor(opts) as Executor<unknown>,
+    }
+    return { name, act: async () => '', executorSpec: spec } as Agent & {
+      executorSpec: AgentSpec
+    }
+  }
+
+  // The completion oracle: DELIVERED ⟺ the worker resolved the surface check. The driver's keep-best
+  // / stop decision rides on this `valid`, never on a worker self-report.
+  const deliverable: DeliverableSpec = {
+    describe: `resolve the surface task ${opts.task.id} (every required check passes)`,
+    check: (out) => (out as SurfaceWorkerOut | undefined)?.resolved === true,
+  }
+
+  return { makeWorkerAgent, deliverable }
+}
diff --git a/src/runtime/index.ts b/src/runtime/index.ts
index e4bad3e0..8a889a78 100644
--- a/src/runtime/index.ts
+++ b/src/runtime/index.ts
@@ -37,8 +37,14 @@ export {
 } from '../durable/spawn-journal'
 // The typed coordination-bus event (up: settled/question/finding; down: steer/answer) — surfaced
 // here so a host folding the bus onto its own timeline (the supervise-topology observability) can
-// type its `onEvent` subscriber without reaching into the `/mcp` subpath.
-export type { CoordinationEvent } from './../mcp/tools/coordination'
+// type its `onEvent` subscriber without reaching into the `/mcp` subpath. `MakeWorkerAgent` rides
+// alongside it: the worker-seam type `supervise`/`workerFromBackend` traffic in, so a host authoring
+// its own seam types it from the loop layer rather than the `/mcp` subpath.
+export type { + AnalystRegistry, + CoordinationEvent, + MakeWorkerAgent, +} from './../mcp/tools/coordination' export { type AnytimeReport, type AnytimeStrategySummary, diff --git a/src/runtime/supervise/coordination-driver.ts b/src/runtime/supervise/coordination-driver.ts index 219c5749..62b0aeca 100644 --- a/src/runtime/supervise/coordination-driver.ts +++ b/src/runtime/supervise/coordination-driver.ts @@ -28,6 +28,7 @@ import { ValidationError } from '../../errors' import type { McpToolDescriptor } from '../../mcp/server' import { + type AnalystRegistry, coordinationVerbNames, createCoordinationTools, type MakeWorkerAgent, @@ -57,6 +58,13 @@ export interface DriverAgentOptions { /** Hard cap on simultaneously-LIVE workers — `spawn_agent` fails closed once this many are in * flight (a concurrency fence on top of the conserved-pool fence). Omit/`<= 0` = no cap. */ readonly maxLiveWorkers?: number + /** The analyst lenses available to the driver. Required for `analyzeOnSettle` (and `run_analyst`). + * Unset → no analyst feed (status quo: the driver gets settled outputs, no findings). */ + readonly analysts?: AnalystRegistry + /** Analyst kind ids run AUTOMATICALLY when a worker settles `done` — each result re-enters as a + * `finding` the driver pulls and composes its next steer from. The UP-leg of the self-improving + * loop. Omit/empty = no auto-analysis (status quo). Requires `analysts`. */ + readonly analyzeOnSettle?: ReadonlyArray /** The driver's stance — a string, or built from the task (the worker-driver prompt / * the generator). INJECTED so the prompt is a pluggable, optimizable role. 
*/ readonly systemPrompt: string | ((task: unknown) => string) @@ -165,6 +173,13 @@ export function driverAgent(opts: DriverAgentOptions): Agent { 'driverAgent: extraTools requires executeExtraTool (how to run a work-tool call)', ) } + // Fail loud on a half-wired analyst seam (matches the extraTools pattern): analyze-on-settle with no + // lens registry is a silent no-op the house rules forbid — the driver would get no findings, no error. + if ((opts.analyzeOnSettle?.length ?? 0) > 0 && !opts.analysts) { + throw new ValidationError( + 'driverAgent: analyzeOnSettle requires analysts (the lens registry the kinds resolve against)', + ) + } // A work tool that shadows a coordination verb would leave the driver unable to coordinate. // Validate against the reserved verb set HERE (construction), so the conflict fails loud — not // buried inside act() where the supervisor would swallow the throw into a quiet no-winner. @@ -199,6 +214,8 @@ export function driverAgent(opts: DriverAgentOptions): Agent { makeWorkerAgent: opts.makeWorkerAgent, perWorker: opts.perWorker, ...(opts.maxLiveWorkers !== undefined ? { maxLiveWorkers: opts.maxLiveWorkers } : {}), + ...(opts.analysts ? { analysts: opts.analysts } : {}), + ...(opts.analyzeOnSettle ? 
{ analyzeOnSettle: opts.analyzeOnSettle } : {}), }) const byName = new Map(coord.tools.map((t) => [t.name, t])) const toolSpecs: ToolSpec[] = [ diff --git a/src/runtime/supervise/supervise.ts b/src/runtime/supervise/supervise.ts index 5760d73f..46fc8444 100644 --- a/src/runtime/supervise/supervise.ts +++ b/src/runtime/supervise/supervise.ts @@ -9,7 +9,7 @@ */ import type { AgentProfile } from '@tangle-network/sandbox' import { ValidationError } from '../../errors' -import type { MakeWorkerAgent } from '../../mcp/tools/coordination' +import type { AnalystRegistry, MakeWorkerAgent } from '../../mcp/tools/coordination' import type { RouterConfig } from '../router-client' import type { ToolLoopChat, ToolLoopCompactionOptions } from '../tool-loop' import { type DeliverableSpec, gateOnDeliverable } from './completion-gate' @@ -79,6 +79,14 @@ export interface SuperviseOptions { * flight. The conserved pool bounds TOTAL work; this bounds SIMULTANEOUS work (live boxes/ * sandboxes a real fleet runs at once). Omit/`<= 0` = no cap (the pool stays the only fence). */ readonly maxLiveWorkers?: number + /** Analyst lenses available to the driver. Required for `analyzeOnSettle`. Unset → status quo + * (the driver receives settled worker outputs, no analyst findings). */ + readonly analysts?: AnalystRegistry + /** Analyst kind ids run AUTOMATICALLY when a worker settles `done` — each re-enters as a `finding` + * the driver pulls (`await_event`) and composes its next steer from. The self-improving UP-leg, + * threaded to the driver at this level (propagate to sub-drivers via a recursive `makeWorkerAgent`). + * Omit/empty = status quo (no analyst feed). Requires `analysts`. */ + readonly analyzeOnSettle?: ReadonlyArray /** Worker output store. Defaults to in-memory. */ readonly blobs?: ResultBlobStore readonly maxDepth?: number @@ -140,6 +148,8 @@ export function supervise(profile: SupervisorProfile, task: unknown, opts: Super ...(opts.driveHarness ? 
{ driveHarness: opts.driveHarness } : {}), ...(opts.extraTools ? { extraTools: opts.extraTools } : {}), ...(opts.executeExtraTool ? { executeExtraTool: opts.executeExtraTool } : {}), + ...(opts.analysts ? { analysts: opts.analysts } : {}), + ...(opts.analyzeOnSettle ? { analyzeOnSettle: opts.analyzeOnSettle } : {}), ...(opts.maxTurns !== undefined ? { maxTurns: opts.maxTurns } : {}), ...(opts.compaction ? { compaction: opts.compaction } : {}), }) diff --git a/src/runtime/supervise/supervisor-agent.ts b/src/runtime/supervise/supervisor-agent.ts index 5b4a5fc6..86691e86 100644 --- a/src/runtime/supervise/supervisor-agent.ts +++ b/src/runtime/supervise/supervisor-agent.ts @@ -14,7 +14,7 @@ * oracle (`finalizeBestDelivered` — the best DELIVERED child, never the driver's own prose). */ import { ValidationError } from '../../errors' -import type { MakeWorkerAgent } from '../../mcp/tools/coordination' +import type { AnalystRegistry, MakeWorkerAgent } from '../../mcp/tools/coordination' import { type RouterConfig, routerBrain } from '../router-client' import type { ToolLoopChat, ToolLoopCompactionOptions } from '../tool-loop' import { driverAgent, finalizeBestDelivered } from './coordination-driver' @@ -94,6 +94,11 @@ export interface SupervisorAgentDeps { name: string, args: Record, ) => Promise + /** Analyst lenses available to the driver (both arms). Required for `analyzeOnSettle`. */ + readonly analysts?: AnalystRegistry + /** Analyst kinds run on each worker-settle → a `finding` the driver composes its next steer from + * (the self-improving UP-leg). Unset/empty = status quo (no analyst feed). Requires `analysts`. 
*/ + readonly analyzeOnSettle?: ReadonlyArray readonly maxTurns?: number /** Give the supervisor brain a chapter-lifecycle on its OWN context window (router arm only) — it * distills its coordination transcript to a compact progress note once it exceeds the threshold, @@ -129,6 +134,8 @@ export function supervisorAgent( ...(deps.maxLiveWorkers !== undefined ? { maxLiveWorkers: deps.maxLiveWorkers } : {}), ...(deps.extraTools ? { extraTools: deps.extraTools } : {}), ...(deps.executeExtraTool ? { executeExtraTool: deps.executeExtraTool } : {}), + ...(deps.analysts ? { analysts: deps.analysts } : {}), + ...(deps.analyzeOnSettle ? { analyzeOnSettle: deps.analyzeOnSettle } : {}), ...(deps.maxTurns !== undefined ? { maxTurns: deps.maxTurns } : {}), ...(deps.compaction ? { compaction: deps.compaction } : {}), }) @@ -150,6 +157,8 @@ export function supervisorAgent( makeWorkerAgent: deps.makeWorkerAgent, perWorker: deps.perWorker, ...(deps.maxLiveWorkers !== undefined ? { maxLiveWorkers: deps.maxLiveWorkers } : {}), + ...(deps.analysts ? { analysts: deps.analysts } : {}), + ...(deps.analyzeOnSettle ? 
{ analyzeOnSettle: deps.analyzeOnSettle } : {}),
       })
       try {
         await driveHarness({ profile, task, scope, coordinationMcpUrl: mcp.url })
diff --git a/tests/loops/coordination-driver.test.ts b/tests/loops/coordination-driver.test.ts
index fe6de46d..8d0e1eeb 100644
--- a/tests/loops/coordination-driver.test.ts
+++ b/tests/loops/coordination-driver.test.ts
@@ -559,3 +559,35 @@ describe('driverAgent — the driver can ACT (call work tools itself), not only
     expect(() => driverAgent(opts)).toThrow(/collides with a coordination verb/)
   })
 })
+
+describe('driverAgent — the analyst up-leg (analysts + analyzeOnSettle pass-through)', () => {
+  // Stub worker factory handed to `driverOpts`; its `executorSpec` carries a null harness.
+  const stubWorker = (_profile: unknown): Agent =>
+    ({
+      name: 'w',
+      act: async () => '',
+      executorSpec: { profile: { name: 'w' } as AgentProfile, harness: null },
+    }) as Agent & { executorSpec: AgentSpec }
+  // Minimal analyst registry: a single `progress` kind and a canned `run` result.
+  const lensRegistry = {
+    kinds: [{ id: 'progress', description: 'read the settled output', area: 'progress' }],
+    run: async () => ({ note: 'ok' }),
+  }
+  // Base driver options, rebuilt on each call (the thunks below invoke it at assertion time).
+  const baseOpts = () => driverOpts('x', scriptedBrain([]), stubWorker)
+
+  it('fails loud when analyzeOnSettle is set without analysts (matches the extraTools guard)', () => {
+    const halfWired = () =>
+      driverAgent({
+        ...baseOpts(),
+        analyzeOnSettle: ['progress'],
+      })
+    expect(halfWired).toThrow(/analyzeOnSettle requires analysts/)
+  })
+
+  it('constructs when both analysts and analyzeOnSettle are provided (the up-leg wired)', () => {
+    const fullyWired = () =>
+      driverAgent({ ...baseOpts(), analysts: lensRegistry, analyzeOnSettle: ['progress'] })
+    expect(fullyWired).not.toThrow()
+  })
+})