From d4f535148b4e3c9c9e8485a2293977ff499d8049 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 27 Jun 2026 17:34:43 -0600 Subject: [PATCH 1/6] =?UTF-8?q?feat(examples):=20self-improving-coder=20?= =?UTF-8?q?=E2=80=94=20the=20RSI=20spine,=20composed=20cleanly,=20on=20a?= =?UTF-8?q?=20contamination-proof=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pristine self-improvement loop with NOTHING hand-rolled: an AgentProfile-shaped worker over an AgenticSurface (the task, real tools), gated by runStrategyEvolution — which authors strategies from TRAIN losses then makes ONE promotion decision on a FRESH holdout slice via promotionGate. Adaptive data analysis is structurally impossible (disjoint task offsets, holdout read once). The only new code is the Environment: a contamination-proof generated coding task (constants derived per-seed, so no model could have memorized it), graded by real pytest. $0 calibration self-check (reference->100%, stub->0%) gates spend. The bundled task is deliberately simple — a capable model aces it, so the gate correctly returns no-promotion; swap a harder Environment (or SWE-bench) for a discriminating run. --- .../self-improving-coder.ts | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 examples/self-improving-coder/self-improving-coder.ts diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts new file mode 100644 index 00000000..736ed05b --- /dev/null +++ b/examples/self-improving-coder/self-improving-coder.ts @@ -0,0 +1,286 @@ +/** + * Self-improving coder — the substrate's self-improvement spine, composed cleanly, on a + * CONTAMINATION-PROOF coding task. 
NOTHING here is hand-rolled: the genome is an `AgentProfile`-shaped + * worker, the task is an `AgenticSurface` (open/tools/call/score/close), and the held-out-gated + * flywheel is `runStrategyEvolution` — which authors candidate strategies from TRAIN losses, then + * makes ONE promotion decision on a FRESH holdout slice the search never touched (`promotionGate`, + * a seeded paired-bootstrap CI). Adaptive data analysis is structurally impossible: the holdout is + * disjoint by task offset and read exactly once. + * + * Why contamination-proof: each task is a small wire-protocol library whose constants (version, + * separators, checksum modulus, opcode) are DERIVED FROM THE SEED and specified ONLY by the test file. + * A frontier model cannot have memorized the fix — the exact contract is generated per task. Graded by + * REAL pytest (a deployable check, never an LLM judge). + * + * IMPORTANT — the bundled task is DELIBERATELY SIMPLE (a few functions fully pinned by their tests). + * A capable model aces it (every strategy scores 1.0), so the gate CORRECTLY returns no-promotion: + * you cannot demonstrate self-improvement where there is no headroom — and this harness refuses to + * pretend otherwise (calibrate-before-measure, enforced). To get a DISCRIMINATING run, swap in a task + * with a correctable middle band (algorithmically hard generated tasks, or a real benchmark below). + * + * To run frontier SWE-bench instead, swap `environment`/`tasks` for the SWE-bench `Environment` + * (bench/src/benchmarks/swe-bench.ts) — everything else is identical. (That arena is contamination- + * SUSPECT: its bugs are public GitHub fixes a model may have memorized — report it, never claim clean.) 
+ * + * Run: TANGLE_API_KEY= pnpm tsx examples/self-improving-coder/self-improving-coder.ts + */ +import { execFileSync } from 'node:child_process' +import { mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { createChatClient } from '@tangle-network/agent-eval' +import { + type AgenticSurface, + type AgenticTask, + type AgenticTool, + type ArtifactHandle, + refine, + runStrategyEvolution, + sample, + type SurfaceScore, +} from '@tangle-network/agent-runtime/loops' + +// ── The contamination-proof task generator (deterministic per seed) ────────────── +/** A small wire-protocol library, fully specified by its tests, with seed-derived constants. The + * agent must READ the tests to infer the exact contract — it cannot recall it. Returns the stub the + * agent edits + the hidden-ish test file (the agent may read it; grading runs it). */ +function constsFor(seed: number): { VER: string; SEP: string; MOD: number } { + const r = (m: number) => ((seed * 2654435761) >>> 0) % m + return { VER: `v${(r(900) + 100).toString(36)}`, SEP: ['-', '|', ':', '/', '#'][r(5)]!, MOD: [97, 101, 103, 107, 109][r(5)]! 
} +} +function genTask(seed: number): { stub: string; test: string; total: number } { + const { VER, SEP, MOD } = constsFor(seed) + const t = (id: number, text: string) => `${VER}${SEP}${id}${SEP}${text}` + const tests = [ + 'import pytest', + 'from lib import encode, decode, checksum, valid', + '', + `def test_encode(): assert encode(3, "hi") == ${JSON.stringify(t(3, 'hi'))}`, + `def test_encode_zero(): assert encode(0, "") == ${JSON.stringify(t(0, ''))}`, + `def test_decode(): assert decode(${JSON.stringify(t(9, 'ab'))}) == {"id": 9, "text": "ab"}`, + 'def test_roundtrip(): assert decode(encode(42, "yo")) == {"id": 42, "text": "yo"}', + `def test_checksum(): assert checksum("abc") == sum(b for b in b"abc") % ${MOD}`, + `def test_checksum_empty(): assert checksum("") == 0`, + `def test_valid_true(): assert valid(${JSON.stringify(t(1, 'x'))}) is True`, + `def test_valid_bad_version(): assert valid("zz${SEP}1${SEP}x") is False`, + `def test_valid_bad_shape(): assert valid("not a token") is False`, + '', + ].join('\n') + const stub = [ + '# Implement these so test_lib.py passes. Infer the exact format from the tests.', + 'def encode(id, text):', + ' raise NotImplementedError', + 'def decode(s):', + ' raise NotImplementedError', + 'def checksum(text):', + ' raise NotImplementedError', + 'def valid(s):', + ' raise NotImplementedError', + '', + ].join('\n') + return { stub, test: tests, total: 9 } +} + +// ── The Environment (AgenticSurface) — host pytest, no Docker. (Docker is a swap for untrusted code.) ── +interface Ws { + dir: string + total: number +} +const workspaces = new Map() + +function pytestPassed(dir: string): { passed: number; total: number } { + let out = '' + try { + out = execFileSync('python3', ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'], { + cwd: dir, + encoding: 'utf8', + timeout: 60_000, + stdio: ['ignore', 'pipe', 'pipe'], + }) + } catch (e) { + out = (e as { stdout?: string }).stdout ?? 
'' + } + const passed = Number(out.match(/(\d+) passed/)?.[1] ?? 0) + const failed = Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0) + return { passed, total: passed + failed } +} + +const codingEnv: AgenticSurface = { + name: 'generated-coding', + async open(task) { + const seed = Number((task.meta as { seed?: number })?.seed ?? 0) + const { stub, test, total } = genTask(seed) + const dir = mkdtempSync(join(tmpdir(), 'sic-')) + writeFileSync(join(dir, 'lib.py'), stub) + writeFileSync(join(dir, 'test_lib.py'), test) + const handle: ArtifactHandle = { id: dir, surface: 'generated-coding' } + workspaces.set(dir, { dir, total }) + return handle + }, + async tools() { + return [ + { type: 'function', function: { name: 'list_files', description: 'List the files in the workspace.', parameters: { type: 'object', properties: {} } } }, + { type: 'function', function: { name: 'read_file', description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } }, + { type: 'function', function: { name: 'write_file', description: 'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.', parameters: { type: 'object', properties: { path: { type: 'string' }, content: { type: 'string' } }, required: ['path', 'content'] } } }, + // NO run_tests: the agent cannot iterate-until-green. It must implement correctly from READING the + // tests — which creates real headroom and makes the STRATEGY (planning, multiple attempts) matter. + ] satisfies AgenticTool[] + }, + async call(handle, name, args) { + const ws = workspaces.get(handle.id) + if (!ws) return 'ERROR: workspace closed' + if (name === 'list_files') return readdirSync(ws.dir).join('\n') + if (name === 'read_file') { + try { + return readFileSync(join(ws.dir, String(args.path ?? 
'')), 'utf8').slice(0, 8000) + } catch (e) { + return `ERROR: ${(e as Error).message}` + } + } + if (name === 'write_file') { + const p = String(args.path ?? '').replace(/^\.\//, '') + if (p !== 'lib.py') return 'ERROR: only lib.py is writable' // exact match: endsWith('lib.py') also accepted test_lib.py and then overwrote lib.py + try { + mkdirSync(ws.dir, { recursive: true }) + writeFileSync(join(ws.dir, 'lib.py'), String(args.content ?? '')) + return 'wrote lib.py' + } catch (e) { + return `ERROR: ${(e as Error).message}` + } + } + if (name === 'run_tests') { + const { passed, total } = pytestPassed(ws.dir) + return `pytest: ${passed}/${total} passed` + } + return `ERROR: unknown tool ${name}` + }, + async score(_task, handle): Promise { + const ws = workspaces.get(handle.id) + if (!ws) return { passes: 0, total: 0, errored: 1 } + const { passed, total } = pytestPassed(ws.dir) + return total > 0 ? { passes: passed, total, errored: 0 } : { passes: 0, total: ws.total, errored: 1 } + }, + async close(handle) { + const ws = workspaces.get(handle.id) + if (!ws) return + workspaces.delete(handle.id) + rmSync(ws.dir, { recursive: true, force: true }) + }, +} + +// ── The disjoint task supplier (train [0,trainN); holdout drawn past it) ────────── +const tasks = async (offset: number, n: number): Promise => + Array.from({ length: n }, (_, i) => { + const seed = offset + i + return { + id: `gen-${seed}`, + systemPrompt: + 'You are a Python engineer. The library lib.py has stub functions; its exact contract is defined ONLY by ' + + 'test_lib.py. You CANNOT run the tests — read test_lib.py CAREFULLY (every assertion, every edge case) and ' + + 'implement lib.py correctly in one pass with write_file. Get the edge cases right (empty inputs, malformed ' + + 'inputs, exact formats). Do not edit test_lib.py.', + userPrompt: 'Read test_lib.py to learn the exact contract, then write a correct lib.py.
You cannot run the tests — reason carefully.', + meta: { seed }, + } satisfies AgenticTask + }) + +/** The correct lib.py for a seed — used ONLY by the $0 calibration self-check (never by the agent). */ +function referenceLib(seed: number): string { + const { VER, SEP, MOD } = constsFor(seed) + return [ + `VER, SEP, MOD = ${JSON.stringify(VER)}, ${JSON.stringify(SEP)}, ${MOD}`, + 'def encode(id, text): return f"{VER}{SEP}{id}{SEP}{text}"', + 'def decode(s):', + ' v, i, t = s.split(SEP, 2)', + ' return {"id": int(i), "text": t}', + 'def checksum(text): return sum(text.encode()) % MOD if text else 0', + 'def valid(s):', + ' p = s.split(SEP)', + ' return len(p) == 3 and p[0] == VER and p[1].isdigit()', + '', + ].join('\n') +} + +/** calibrate-before-measure: prove the task is SOLVABLE (reference → all pass) and the grader + * DISCRIMINATES (stub → 0). $0, no router. A reference that doesn't clear means the task/grader is + * broken — fix it before spending. */ +async function calibrate(): Promise { + console.log('═══ CALIBRATION ($0) — task solvable + grader discriminates? ═══') + let ok = true + for (const seed of [0, 1, 2, 7, 11]) { + const task = (await tasks(seed, 1))[0]! + const h = await codingEnv.open(task) + const stub = await codingEnv.score(task, h) + // write the reference, re-score + await codingEnv.call(h, 'write_file', { path: 'lib.py', content: referenceLib(seed) }) + const ref = await codingEnv.score(task, h) + await codingEnv.close(h) + const pass = ref.passes === ref.total && ref.total > 0 && stub.passes === 0 + ok &&= pass + console.log(` seed ${seed}: stub ${stub.passes}/${stub.total} → reference ${ref.passes}/${ref.total} ${pass ? '✓' : '✗ BROKEN'}`) + } + console.log(ok ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.' 
: '\n>>> BROKEN — fix the task/grader before spending.') + if (!ok) process.exit(1) +} + +async function main(): Promise { + if (process.env.CALIBRATE) return calibrate() + const routerKey = process.env.TANGLE_API_KEY + if (!routerKey) throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)') + const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' + const workerModel = process.env.WORKER_MODEL ?? 'deepseek-v4-flash' + // The author WRITES strategy code (a `defineStrategy` module) — it needs a strong coder + a token + // budget (thinking models return empty content without one) + a fallback. deepseek-flash can't. + const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro' + + // The author writes candidate-strategy .mts files into outDir, then dynamically imports them — they + // `import '@tangle-network/agent-runtime/loops'`, which only resolves UNDER the package (self-reference). + // A /tmp outDir would fail to resolve it; keep it under the project root. + const outDir = mkdtempSync(join(process.cwd(), '.sic-run-')) + const report = await runStrategyEvolution({ + environment: codingEnv, + tasks, + trainN: Number(process.env.TRAIN_N ?? 8), + holdoutN: Number(process.env.HOLDOUT_N ?? 12), + worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 8), maxTokens: 4000 }, + author: { + chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), + model: authorModel, + maxTokens: 8000, + fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', + }, + baselines: [sample, refine], + budget: Number(process.env.BUDGET ?? 3), + generations: Number(process.env.GENERATIONS ?? 2), + populationSize: Number(process.env.POP ?? 
2), + outDir, + }) + rmSync(outDir, { recursive: true, force: true }) + + const v = report.verdict + if (process.env.DUMP) { + // Autopsy: gen0 baseline scores (headroom) + every authored candidate's score/error (did they + // lose on a saturated task, or error at runtime?). + const r = report as unknown as Record + const slim = (x: unknown) => + JSON.stringify(x, (_k, val) => (typeof val === 'function' ? '[fn]' : val), 1) + console.log('--- gen0 ---', slim(r.gen0 ?? r.gen0Champion)) + console.log('--- generations ---', slim(r.generations)?.slice(0, 3000)) + } + console.log('\n═══ SELF-IMPROVING CODER — certified on a FROZEN holdout (no adaptive reuse) ═══') + console.log(`worker=${workerModel} author=${authorModel}`) + console.log(`gen0 champion: ${report.gen0Champion.name}`) + console.log(`final champion: ${report.finalChampion.name}`) + console.log(`PROMOTED: ${v.promoted} (${v.reason})`) + console.log(`held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`) + console.log( + v.promoted + ? '\n>>> The search taught the agent a strategy that fixes MORE on tasks it never trained on, beyond luck. Self-improvement CERTIFIED.' 
+ : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).', + ) +} + +main().catch((e) => { + console.error(e) + process.exit(1) +}) From 7f7d93dd24d877e1fb7d4eb1323ee81099339076 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 27 Jun 2026 20:19:51 -0600 Subject: [PATCH 2/6] =?UTF-8?q?feat(bench):=20SWE-bench=20Verified=20as=20?= =?UTF-8?q?an=20AgenticSurface=20=E2=80=94=20the=20proper,=20no-cheating?= =?UTF-8?q?=20frontier=20run?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit createSweBenchEnvironment: the agent clones the repo at base_commit, explores + makes SURGICAL edits via tools (edit_file, source-only, test files path-jailed), and score() grades the git diff with the official swebench Docker harness. The substrate drives the agentic loop (runAgentic / runStrategyEvolution) — no hand-rolled tool-loop. Never sees the hidden tests or the gold patch. swe-self-improve.mts wires it into runStrategyEvolution with a disjoint train/holdout split (the substrate enforces freeze + one holdout decision — no adaptive reuse). CALIBRATE mode runs the baseline on a few bugs first (cost gate). CONTAMINATION CAVEAT documented: public fixes may be memorized; report it, never claim a clean frontier number from this arena alone. 
--- bench/src/swe-bench-env.ts | 173 +++++++++++++++++++++++++++++++++++++ bench/swe-self-improve.mts | 79 +++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 bench/src/swe-bench-env.ts create mode 100644 bench/swe-self-improve.mts diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts new file mode 100644 index 00000000..44961efc --- /dev/null +++ b/bench/src/swe-bench-env.ts @@ -0,0 +1,173 @@ +/** + * SWE-bench Verified as an `AgenticSurface` — the PROPER, no-cheating way to run a coding agent on real + * GitHub bugs through the substrate (`runAgentic`/`runBenchmark`/`runStrategyEvolution` drive the loop; + * we only provide tools + a deployable score). The agent clones the repo at base_commit, explores + + * edits SOURCE via tools (never tests — path-jailed), and `score()` grades the resulting `git diff` + * with the OFFICIAL swebench Docker harness (apply patch → FAIL_TO_PASS + PASS_TO_PASS → resolved). + * + * No cheating by construction: the agent never sees the hidden tests or the gold patch (the adapter's + * prompt is the issue only); `edit_file` refuses test files; the score is a real test run, not a judge. + * + * CONTAMINATION CAVEAT: SWE-bench bugs are public GitHub fixes a frontier model may have MEMORIZED. + * A clean train→holdout split (disjoint instances) rules out adaptive-reuse, but NOT training-data + * memorization. Always report this; never claim a "clean" frontier number from this arena alone. 
+ */ +import { execFile } from 'node:child_process' +import { existsSync, mkdtempSync, readdirSync, readFileSync, rmSync, statSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { promisify } from 'node:util' +import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops' +import { createSweBenchAdapter } from './benchmarks/swe-bench' +import type { BenchTask } from './benchmarks/types' + +const exec = promisify(execFile) +const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /(^|\/)test_.*\.py$|_test\.py$|(^|\/)conftest\.py$/.test(p) // test_/conftest anchored at a path segment so a source file like latest_utils.py is not mistaken for a test + +interface Ws { + dir: string + task: BenchTask +} +const workspaces = new Map() + +/** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The + * supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout + * [trainN+off,…) never overlap. Verified is loaded once; instances carry their repo/base_commit.
*/ +export async function createSweBenchEnvironment(poolN = 80): Promise<{ + environment: AgenticSurface + tasks: (offset: number, n: number) => Promise + adapter: ReturnType +}> { + const adapter = createSweBenchAdapter() + const pool = await adapter.loadTasks({ limit: poolN, split: 'test' }) + const byId = new Map(pool.map((t) => [t.id, t])) + + const environment: AgenticSurface = { + name: 'swe-bench-verified', + async open(task) { + const bt = byId.get(task.id) + if (!bt) throw new Error(`swe-bench-env: unknown task ${task.id}`) + const md = bt.metadata as Record + const dir = mkdtempSync(join(tmpdir(), 'swe-')) + await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 }) + await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 }) + const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' } + workspaces.set(dir, { dir, task: bt }) + return handle + }, + async tools() { + return [ + { type: 'function', function: { name: 'list_files', description: 'List source files under a repo subdirectory (recursive, bounded). "" = repo root.', parameters: { type: 'object', properties: { dir: { type: 'string' } }, required: ['dir'] } } }, + { type: 'function', function: { name: 'read_file', description: 'Read a repo file by path.', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } }, + { type: 'function', function: { name: 'edit_file', description: 'Surgical fix: replace the EXACT old_string (must occur once — copy whitespace precisely) with new_string in a SOURCE file. Minimal changes, never whole-file rewrites. 
Test files are rejected.', parameters: { type: 'object', properties: { path: { type: 'string' }, old_string: { type: 'string' }, new_string: { type: 'string' } }, required: ['path', 'old_string', 'new_string'] } } }, + ] satisfies AgenticTool[] + }, + async call(handle, name, args) { + const ws = workspaces.get(handle.id) + if (!ws) return 'ERROR: workspace closed' + const safe = (p: string): string | null => { + const n = p.replace(/^\.?\//, '') + return n.includes('..') || n.startsWith('/') ? null : n + } + if (name === 'list_files') { + const sub = safe(String(args.dir ?? '')) ?? '' + const root = join(ws.dir, sub) + if (!existsSync(root)) return `(no such path: ${sub})` + const out: string[] = [] + const walk = (d: string, depth: number) => { + if (depth > 2 || out.length > 240) return + let entries: string[] = [] + try { + entries = readdirSync(d) + } catch { + return + } + for (const e of entries) { + if (e.startsWith('.') || e === 'node_modules' || e === '__pycache__') continue + const p = join(d, e) + let isDir = false + try { + isDir = statSync(p).isDirectory() + } catch { + continue + } + out.push(p.slice(ws.dir.length + 1) + (isDir ? '/' : '')) + if (isDir) walk(p, depth + 1) + } + } + walk(root, 0) + return out.slice(0, 240).join('\n') || '(empty)' + } + if (name === 'read_file') { + const p = safe(String(args.path ?? '')) + if (!p) return 'ERROR: invalid path' + try { + const c = readFileSync(join(ws.dir, p), 'utf8') + return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c + } catch (e) { + return `(error: ${(e as Error).message})` + } + } + if (name === 'edit_file') { + const p = safe(String(args.path ?? '')) + if (!p) return 'ERROR: invalid path' + if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).' + const oldStr = String(args.old_string ?? '') + const newStr = String(args.new_string ?? 
'') + let content: string + try { + content = readFileSync(join(ws.dir, p), 'utf8') + } catch (e) { + return `(cannot read ${p}: ${(e as Error).message})` + } + if (!oldStr) return 'ERROR: old_string is empty.' + const count = content.split(oldStr).length - 1 + if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.` + if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.` + writeFileSync(join(ws.dir, p), content.replace(oldStr, () => newStr)) // replacer fn inserts new_string literally; a string replacement would expand $&, $', $1 and $$ + return `edited ${p}: replaced 1 occurrence` + } + return `ERROR: unknown tool ${name}` + }, + async score(_task, handle): Promise { + const ws = workspaces.get(handle.id) + if (!ws) return { passes: 0, total: 1, errored: 1 } + let patch = '' + try { + const r = await exec('git', ['-C', ws.dir, 'diff'], { maxBuffer: 20_000_000, timeout: 60_000 }) + patch = r.stdout + } catch { + return { passes: 0, total: 1, errored: 1 } // a failing git diff is a harness error, not an empty patch from the agent + } + if (!patch.trim()) return { passes: 0, total: 1, errored: 0 } + try { + const s = await adapter.judge(ws.task, patch) + return { passes: s.resolved ? 1 : 0, total: 1, errored: 0 } + } catch { + return { passes: 0, total: 1, errored: 1 } + } + }, + async close(handle) { + const ws = workspaces.get(handle.id) + if (!ws) return + workspaces.delete(handle.id) + rmSync(ws.dir, { recursive: true, force: true }) + }, + } + + const tasks = async (offset: number, n: number): Promise => { + const slice = pool.slice(offset, offset + n) + if (slice.length < n) throw new Error(`swe-bench-env: pool exhausted at offset ${offset} (need ${n}, have ${slice.length}; raise poolN)`) + return slice.map((bt) => ({ + id: bt.id, + systemPrompt: + 'You are a senior engineer fixing a real bug in the checked-out repository. Use list_files + read_file to ' + + 'locate and fully read the relevant source, diagnose the root cause from the issue, then fix it with edit_file — ' + + 'a MINIMAL surgical change (a few lines, like a real PR), source only (test files are rejected).
Do not rewrite whole files.', + userPrompt: bt.prompt, + meta: { instanceId: bt.id }, + })) + } + + return { environment, tasks, adapter } +} diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts new file mode 100644 index 00000000..0e857c43 --- /dev/null +++ b/bench/swe-self-improve.mts @@ -0,0 +1,79 @@ +/** + * SWE-bench self-improvement — the PROPER, no-cheating run: a frontier worker over the SWE-bench + * `Environment`, with `runStrategyEvolution` enforcing the train→freeze→holdout split (the substrate + * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT + * applies (public fixes may be memorized) — reported, never claimed clean. + * + * CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts + * Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts + */ +import { mkdtempSync, rmSync } from 'node:fs' +import { join } from 'node:path' +import { createChatClient } from '@tangle-network/agent-eval' +import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops' +import { createSweBenchEnvironment } from './src/swe-bench-env' + +async function main(): Promise { + const routerKey = process.env.TANGLE_API_KEY + if (!routerKey) throw new Error('TANGLE_API_KEY required (worker + author call the router)') + const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' + const workerModel = process.env.WORKER_MODEL ?? 'gemini-2.5-pro' + const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro' + const innerTurns = Number(process.env.INNER_TURNS ?? 40) + const { environment, tasks } = await createSweBenchEnvironment(Number(process.env.POOL_N ?? 80)) + + if (process.env.CALIBRATE) { + const n = Number(process.env.N ?? 
3) + const ts = await tasks(0, n) + console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`) + let resolved = 0 + for (const t of ts) { + const t0 = Date.now() + const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, innerTurns, budget: 1 }) + if (r.resolved) resolved++ + console.log(` ${t.id.padEnd(32)} resolved=${r.resolved} (${Math.round((Date.now() - t0) / 1000)}s)`) + } + const band = resolved > 0 && resolved < n + console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`) + return + } + + const outDir = mkdtempSync(join(process.cwd(), '.swe-run-')) + const report = await runStrategyEvolution({ + environment, + tasks, + trainN: Number(process.env.TRAIN_N ?? 6), + holdoutN: Number(process.env.HOLDOUT_N ?? 8), + worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns }, + author: { + chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), + model: authorModel, + maxTokens: 8000, + fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', + }, + baselines: [sample, refine], + budget: Number(process.env.BUDGET ?? 2), + generations: Number(process.env.GENERATIONS ?? 2), + populationSize: Number(process.env.POP ?? 
2), + outDir, + }) + rmSync(outDir, { recursive: true, force: true }) + + const v = report.verdict + console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══') + console.log(`worker=${workerModel} author=${authorModel}`) + console.log(`gen0 champion: ${report.gen0Champion.name}`) + console.log(`final champion: ${report.finalChampion.name}`) + console.log(`PROMOTED: ${v.promoted} (${v.reason})`) + console.log(`held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`) + console.log( + v.promoted + ? '\n>>> The search taught the agent a strategy that resolves MORE real bugs it never trained on, beyond luck. (Report the contamination caveat: public fixes may be memorized.)' + : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).', + ) +} + +main().catch((e) => { + console.error(e) + process.exit(1) +}) From ed844764b1849ee0b8ebf93233f1d8e669ad4170 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 27 Jun 2026 20:25:12 -0600 Subject: [PATCH 3/6] fix(bench): SWE-bench worker needs maxTokens (thinking model) + a persist-and-edit prompt Calibration showed gemini-2.5-pro returning empty (no tool calls) without a maxTokens cap, then stopping after ~3 turns without editing. Set worker maxTokens=8000 and a prompt that forces broad exploration + at least one edit_file attempt. Log completions/shots in CALIBRATE mode for headroom diagnosis. 
--- bench/src/swe-bench-env.ts | 9 ++++++--- bench/swe-self-improve.mts | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts index 44961efc..98b48909 100644 --- a/bench/src/swe-bench-env.ts +++ b/bench/src/swe-bench-env.ts @@ -161,9 +161,12 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ return slice.map((bt) => ({ id: bt.id, systemPrompt: - 'You are a senior engineer fixing a real bug in the checked-out repository. Use list_files + read_file to ' + - 'locate and fully read the relevant source, diagnose the root cause from the issue, then fix it with edit_file — ' + - 'a MINIMAL surgical change (a few lines, like a real PR), source only (test files are rejected). Do not rewrite whole files.', + 'You are a senior engineer fixing a real bug in the checked-out repository. Work PERSISTENTLY and do not ' + + 'stop early: use list_files + read_file to explore BROADLY (read many candidate files — the bug is rarely in ' + + 'the first file you open), trace the issue to its root cause, then fix it with edit_file. You MUST make at ' + + 'least one edit_file call — never finish with prose alone or without attempting a fix. Make a MINIMAL surgical ' + + 'change (a few lines, like a real PR), source only (test files are rejected). If an edit_file fails (old_string ' + + 'not unique/found), read the file again and retry with exact text. 
Keep going until you have made your best fix.', userPrompt: bt.prompt, meta: { instanceId: bt.id }, })) diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts index 0e857c43..270a3f68 100644 --- a/bench/swe-self-improve.mts +++ b/bench/swe-self-improve.mts @@ -29,9 +29,9 @@ async function main(): Promise { let resolved = 0 for (const t of ts) { const t0 = Date.now() - const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, innerTurns, budget: 1 }) + const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns, budget: 1 }) if (r.resolved) resolved++ - console.log(` ${t.id.padEnd(32)} resolved=${r.resolved} (${Math.round((Date.now() - t0) / 1000)}s)`) + console.log(` ${t.id.padEnd(32)} resolved=${r.resolved} completions=${r.completions} shots=${r.shots} (${Math.round((Date.now() - t0) / 1000)}s)`) } const band = resolved > 0 && resolved < n console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`) @@ -44,7 +44,7 @@ async function main(): Promise { tasks, trainN: Number(process.env.TRAIN_N ?? 6), holdoutN: Number(process.env.HOLDOUT_N ?? 
8), - worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns }, + worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns }, author: { chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), model: authorModel, From fb6f682a712b0badfde7b68669a41443477cead9 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 27 Jun 2026 21:31:25 -0600 Subject: [PATCH 4/6] feat(examples): ablation knob-board + cost-aware one-knob-delta runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The instrument for 'what actually helps': a configurable agent where each self-improvement technique is a knob (topology/trace-analysis/steering/GEPA-skillopt/persistent-artifact), swept one-knob-at-a-time (O(N) not 2^N) at equal compute, with a full autopsy — resolve AND token/$/latency per arm — so we see what helps vs what just burns tokens. WIRED: topology (refine/sample/sampleThenRefine) + budget. The rest are DECLARED knobs that FAIL LOUD if set (no silent no-op — names the substrate primitive to wire). Exports codingEnv/codingTasks from self-improving-coder (guarded main) for the cheap validation fixture. --- examples/ablation-suite/ablation.ts | 199 ++++++++++++++++++ .../self-improving-coder.ts | 11 +- 2 files changed, 205 insertions(+), 5 deletions(-) create mode 100644 examples/ablation-suite/ablation.ts diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts new file mode 100644 index 00000000..46a3c0e9 --- /dev/null +++ b/examples/ablation-suite/ablation.ts @@ -0,0 +1,199 @@ +/** + * ablation — the cost-aware knob-board + one-knob-delta runner for agent self-improvement techniques. 
+ * + * THE VISION: a single configurable agent where every technique is a knob (topology, trace-analysis, + * steering, GEPA/skill optimization, persistent artifacts), swept across arms at EQUAL COMPUTE, with a + * full autopsy — resolve rate AND token/$/latency cost per arm — so we see what really helps vs what + * just burns tokens. One-knob-delta design (baseline + each single knob flipped) keeps it O(N), not 2^N. + * + * STATUS — honest: the framework + the cost autopsy are real; knobs are wired incrementally. WIRED: + * `topology` (single/fanout/fanout-refine = refine/sample/sampleThenRefine) + `budget`. The rest are + * DECLARED knobs that FAIL LOUD if set (no silent no-op — you must not think GEPA ran when it didn't); + * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the + * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench. + */ +import { + type AgenticSurface, + type AgenticTask, + refine, + runAgentic, + sample, + sampleThenRefine, + type Strategy, +} from '@tangle-network/agent-runtime/loops' +import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder' + +export interface AblationKnobs { + /** WIRED → strategy: single=`refine` (iterate one artifact), fanout=`sample` (N parallel, pick best), + * fanout-refine=`sampleThenRefine`. The coordination shape. */ + topology: 'single' | 'fanout' | 'fanout-refine' + /** WIRED → equal-compute unit (refine: max shots; fanout: rollout width). 
*/ + budget: number + // ── DECLARED knobs — fail loud until wired (each over a named substrate primitive) ── + optimize?: 'off' | 'gepa' | 'skillOpt' // gepaProposer / skillOptProposer on TRAIN, frozen, then run + traceAnalysis?: 'off' | 'settle' | 'live' // analyzeOnSettle / watchTrace (agent-eval analysts) + halo?: boolean + steering?: boolean // trace finding → steer_worker (event-bus) + persistentArtifact?: boolean // multi-round persistent artifact (openSandboxRun resume) +} + +const topologyStrategy: Record = { + single: refine, + fanout: sample, + 'fanout-refine': sampleThenRefine, +} + +/** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. */ +const unwiredKnobs: Array<{ k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }> = [ + { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: 'gepaProposer/skillOptProposer + improve() on TRAIN, frozen' }, + { k: 'traceAnalysis', isSet: (v) => !!v && v !== 'off', prim: 'analyzeOnSettle / watchTrace (agent-eval analysts)' }, + { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' }, + { k: 'steering', isSet: (v) => v === true, prim: 'event-bus finding → steer_worker' }, + { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' }, +] + +export interface ArmResult { + name: string + knobs: AblationKnobs + n: number + resolve: number // mean resolved (0..1) on the held-out set + tokensIn: number + tokensOut: number + costUsd: number + latencyMs: number + shotsMean: number + completionsMean: number +} + +export async function runAblation(opts: { + environment: AgenticSurface + tasks: (offset: number, n: number) => Promise + holdoutOffset: number + holdoutN: number + base: AblationKnobs + /** Each delta = a ONE-KNOB change vs base (the one-knob-delta design). 
*/ + deltas: Array<{ name: string; knob: Partial }> + worker: { routerBaseUrl: string; routerKey: string; model: string; maxTokens?: number; innerTurns?: number } + onArm?: (r: ArmResult) => void +}): Promise { + // ONE held-out set, shared across all arms — the fair-comparison invariant. + const tasks = await opts.tasks(opts.holdoutOffset, opts.holdoutN) + const arms = [ + { name: 'baseline', knobs: opts.base }, + ...opts.deltas.map((d) => ({ name: d.name, knobs: { ...opts.base, ...d.knob } as AblationKnobs })), + ] + const results: ArmResult[] = [] + for (const arm of arms) { + for (const u of unwiredKnobs) { + if (u.isSet(arm.knobs[u.k])) + throw new Error( + `ablation: knob '${u.k}'=${JSON.stringify(arm.knobs[u.k])} (arm "${arm.name}") is DECLARED but not yet wired — wire it over ${u.prim} before claiming it ran. (No silent no-op.)`, + ) + } + let resolved = 0 + let ti = 0 + let to = 0 + let usd = 0 + let ms = 0 + let shots = 0 + let comps = 0 + for (const t of tasks) { + const r = await runAgentic({ + surface: opts.environment, + task: t, + strategy: topologyStrategy[arm.knobs.topology], + budget: arm.knobs.budget, + routerBaseUrl: opts.worker.routerBaseUrl, + routerKey: opts.worker.routerKey, + model: opts.worker.model, + ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}), + ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}), + }) + if (r.resolved) resolved++ + ti += r.tokens.input + to += r.tokens.output + usd += r.usd + ms += r.ms + shots += r.shots + comps += r.completions + } + const n = tasks.length + const res: ArmResult = { + name: arm.name, + knobs: arm.knobs, + n, + resolve: resolved / n, + tokensIn: ti, + tokensOut: to, + costUsd: usd, + latencyMs: ms, + shotsMean: shots / n, + completionsMean: comps / n, + } + results.push(res) + opts.onArm?.(res) + } + return results +} + +/** The cost-aware autopsy: per-arm resolve + tokens + $ + latency, and Δ vs baseline (lift AND cost). 
*/ +export function printAutopsy(results: ArmResult[]): void { + const base = results[0] + const pad = (s: string, n: number) => s.padEnd(n) + console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`) + console.log( + pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('tok(in/out)', 16) + pad('$', 9) + pad('lat(s)', 9) + pad('shots', 7) + pad('Δresolve', 10) + 'Δ$', + ) + for (const r of results) { + const dR = base ? r.resolve - base.resolve : 0 + const dC = base ? r.costUsd - base.costUsd : 0 + console.log( + pad(r.name, 16) + + pad(r.knobs.topology, 14) + + pad(`${(100 * r.resolve).toFixed(0)}%`, 9) + + pad(`${r.tokensIn}/${r.tokensOut}`, 16) + + pad(`$${r.costUsd.toFixed(4)}`, 9) + + pad((r.latencyMs / 1000).toFixed(0), 9) + + pad(r.shotsMean.toFixed(1), 7) + + pad(`${dR >= 0 ? '+' : ''}${(100 * dR).toFixed(0)}pp`, 10) + + `${dC >= 0 ? '+' : ''}$${dC.toFixed(4)}`, + ) + } + console.log( + '\n>>> Read it cost-aware: a +resolve that costs +$$ may be worse than baseline. The whole point is to see what HELPS vs what just BURNS.', + ) +} + +async function main(): Promise { + const routerKey = process.env.TANGLE_API_KEY + if (!routerKey) throw new Error('TANGLE_API_KEY required') + const worker = { + routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', + routerKey, + model: process.env.WORKER_MODEL ?? 'deepseek-v4-flash', + maxTokens: 4000, + innerTurns: Number(process.env.INNER_TURNS ?? 6), + } + console.log(`═══ ABLATION (cheap contamination-proof task) — worker=${worker.model} ═══`) + const results = await runAblation({ + environment: codingEnv, + tasks: codingTasks, + holdoutOffset: 100, // a fixed disjoint held-out slice + holdoutN: Number(process.env.HOLDOUT_N ?? 6), + base: { topology: 'single', budget: Number(process.env.BUDGET ?? 2) }, + // one-knob-delta: flip ONLY topology (the wired knob) vs baseline. 
+ deltas: [ + { name: 'fanout', knob: { topology: 'fanout' } }, + { name: 'fanout-refine', knob: { topology: 'fanout-refine' } }, + ], + worker, + onArm: (r) => console.log(` ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`), + }) + printAutopsy(results) +} + +if (import.meta.url === `file://${process.argv[1]}`) + main().catch((e) => { + console.error(e) + process.exit(1) + }) diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts index 736ed05b..eaaa206d 100644 --- a/examples/self-improving-coder/self-improving-coder.ts +++ b/examples/self-improving-coder/self-improving-coder.ts @@ -105,7 +105,7 @@ function pytestPassed(dir: string): { passed: number; total: number } { return { passed, total: passed + failed } } -const codingEnv: AgenticSurface = { +export const codingEnv: AgenticSurface = { name: 'generated-coding', async open(task) { const seed = Number((task.meta as { seed?: number })?.seed ?? 0) @@ -169,7 +169,7 @@ const codingEnv: AgenticSurface = { } // ── The disjoint task supplier (train [0,trainN); holdout drawn past it) ────────── -const tasks = async (offset: number, n: number): Promise => +export const codingTasks = async (offset: number, n: number): Promise => Array.from({ length: n }, (_, i) => { const seed = offset + i return { @@ -208,7 +208,7 @@ async function calibrate(): Promise { console.log('═══ CALIBRATION ($0) — task solvable + grader discriminates? ═══') let ok = true for (const seed of [0, 1, 2, 7, 11]) { - const task = (await tasks(seed, 1))[0]! + const task = (await codingTasks(seed, 1))[0]! 
const h = await codingEnv.open(task) const stub = await codingEnv.score(task, h) // write the reference, re-score @@ -239,7 +239,7 @@ async function main(): Promise { const outDir = mkdtempSync(join(process.cwd(), '.sic-run-')) const report = await runStrategyEvolution({ environment: codingEnv, - tasks, + tasks: codingTasks, trainN: Number(process.env.TRAIN_N ?? 8), holdoutN: Number(process.env.HOLDOUT_N ?? 12), worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 8), maxTokens: 4000 }, @@ -280,7 +280,8 @@ async function main(): Promise { ) } -main().catch((e) => { +if (import.meta.url === `file://${process.argv[1]}`) + main().catch((e) => { console.error(e) process.exit(1) }) From bd127783c37cd478da3b34ccf85ef8c2a1d02f31 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 27 Jun 2026 22:15:02 -0600 Subject: [PATCH 5/6] feat(examples): ablation significance (paired bootstrap CI) + point steering knob at the driver-steers-worker loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds task-aligned per-task resolve vectors + pairedBootstrap 95% CI on every arm's Δresolve (✓ = CI excludes 0 = real lift) — no more point lifts. Reframes the rich knobs to the RIGHT primitives: the steering knob is the supervise() driver-steers-worker loop (driver composes the next prompt from the analyst's analyzeOnSettle finding — a driver brain in the loop, not the inline analyst-steerer); the optimize knob is selfImprove() with an executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen. Both fail loud until wired. 
--- examples/ablation-suite/ablation.ts | 41 ++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts index 46a3c0e9..889c7e3a 100644 --- a/examples/ablation-suite/ablation.ts +++ b/examples/ablation-suite/ablation.ts @@ -12,6 +12,7 @@ * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench. */ +import { pairedBootstrap } from '@tangle-network/agent-eval' import { type AgenticSurface, type AgenticTask, @@ -30,10 +31,15 @@ export interface AblationKnobs { /** WIRED → equal-compute unit (refine: max shots; fanout: rollout width). */ budget: number // ── DECLARED knobs — fail loud until wired (each over a named substrate primitive) ── - optimize?: 'off' | 'gepa' | 'skillOpt' // gepaProposer / skillOptProposer on TRAIN, frozen, then run - traceAnalysis?: 'off' | 'settle' | 'live' // analyzeOnSettle / watchTrace (agent-eval analysts) - halo?: boolean - steering?: boolean // trace finding → steer_worker (event-bus) + /** The DRIVER-steers-WORKER loop: supervise() drives the worker, analyzeOnSettle fires the analyst on + * each settled round → a `finding` the driver pulls and composes the next prompt from. (NOT the + * refine analyst-steerer — that's the degenerate inline version; this is a driver brain in the loop.) */ + driverSteer?: boolean // supervise(driverProfile,{backend,analyzeOnSettle}) + steer_agent + /** GEPA-optimize the DRIVER's compose-next-prompt system prompt on TRAIN (executable-graded via the + * surface score), frozen, then run — selfImprove() with an executable JudgeConfig (NOT improve(): the + * steerer prompt is not a profile field). 
*/ + optimize?: 'off' | 'gepa' + halo?: boolean // HALO analyst option persistentArtifact?: boolean // multi-round persistent artifact (openSandboxRun resume) } @@ -45,10 +51,9 @@ const topologyStrategy: Record = { /** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. */ const unwiredKnobs: Array<{ k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }> = [ - { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: 'gepaProposer/skillOptProposer + improve() on TRAIN, frozen' }, - { k: 'traceAnalysis', isSet: (v) => !!v && v !== 'off', prim: 'analyzeOnSettle / watchTrace (agent-eval analysts)' }, + { k: 'driverSteer', isSet: (v) => v === true, prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding' }, + { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen" }, { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' }, - { k: 'steering', isSet: (v) => v === true, prim: 'event-bus finding → steer_worker' }, { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' }, ] @@ -63,6 +68,8 @@ export interface ArmResult { latencyMs: number shotsMean: number completionsMean: number + /** Per-task resolved (0/1), task-aligned across arms — the paired vector for significance. */ + perTask: number[] } export async function runAblation(opts: { @@ -97,6 +104,7 @@ export async function runAblation(opts: { let ms = 0 let shots = 0 let comps = 0 + const perTask: number[] = [] for (const t of tasks) { const r = await runAgentic({ surface: opts.environment, @@ -110,6 +118,7 @@ export async function runAblation(opts: { ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}), }) if (r.resolved) resolved++ + perTask.push(r.resolved ? 
1 : 0) ti += r.tokens.input to += r.tokens.output usd += r.usd @@ -129,6 +138,7 @@ export async function runAblation(opts: { latencyMs: ms, shotsMean: shots / n, completionsMean: comps / n, + perTask, } results.push(res) opts.onArm?.(res) @@ -142,25 +152,30 @@ export function printAutopsy(results: ArmResult[]): void { const pad = (s: string, n: number) => s.padEnd(n) console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`) console.log( - pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('tok(in/out)', 16) + pad('$', 9) + pad('lat(s)', 9) + pad('shots', 7) + pad('Δresolve', 10) + 'Δ$', + pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('$', 9) + pad('lat(s)', 8) + pad('shots', 7) + pad('Δresolve [95% CI]', 24) + 'Δ$', ) for (const r of results) { - const dR = base ? r.resolve - base.resolve : 0 const dC = base ? r.costUsd - base.costUsd : 0 + // Significance: paired bootstrap of this arm's per-task resolve vs baseline's (task-aligned). + let lift = '+0pp' + if (base && r !== base) { + const b = pairedBootstrap(base.perTask, r.perTask, { confidence: 0.95, statistic: 'mean' }) + const sig = b.low > 0 || b.high < 0 ? '✓' : '·' // CI excludes 0 ⇒ real + lift = `${b.median >= 0 ? '+' : ''}${(100 * b.median).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}` + } console.log( pad(r.name, 16) + pad(r.knobs.topology, 14) + pad(`${(100 * r.resolve).toFixed(0)}%`, 9) + - pad(`${r.tokensIn}/${r.tokensOut}`, 16) + pad(`$${r.costUsd.toFixed(4)}`, 9) + - pad((r.latencyMs / 1000).toFixed(0), 9) + + pad((r.latencyMs / 1000).toFixed(0), 8) + pad(r.shotsMean.toFixed(1), 7) + - pad(`${dR >= 0 ? '+' : ''}${(100 * dR).toFixed(0)}pp`, 10) + + pad(lift, 24) + `${dC >= 0 ? '+' : ''}$${dC.toFixed(4)}`, ) } console.log( - '\n>>> Read it cost-aware: a +resolve that costs +$$ may be worse than baseline. 
The whole point is to see what HELPS vs what just BURNS.', + '\n>>> Read it cost-aware: ✓ = CI excludes 0 (real lift). A +resolve that costs +$$ or is not ✓ may be worse than baseline. The point is to see what HELPS vs what just BURNS.', ) } From 6e747231dd5001e6b3d576d33be7f2b70b44c946 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sat, 27 Jun 2026 23:18:02 -0600 Subject: [PATCH 6/6] fix(examples): address self-improving coder review --- .gitignore | 2 + bench/src/swe-bench-env.ts | 23 ++- bench/swe-self-improve.mts | 49 ++--- examples/ablation-suite/ablation.ts | 51 ++++-- .../self-improving-coder.ts | 167 ++++++++++++------ 5 files changed, 200 insertions(+), 92 deletions(-) diff --git a/.gitignore b/.gitignore index d8334027..ed4bec97 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ bench/scripts/__pycache__/ # local rollout-corpus scratch (raw jsonl, per work-line) corpus/ test_repo/ +.sic-run-*/ +.swe-run-*/ diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts index 98b48909..ccfb0931 100644 --- a/bench/src/swe-bench-env.ts +++ b/bench/src/swe-bench-env.ts @@ -13,7 +13,7 @@ * memorization. Always report this; never claim a "clean" frontier number from this arena alone. 
*/ import { execFile } from 'node:child_process' -import { existsSync, mkdtempSync, readdirSync, readFileSync, rmSync, statSync, writeFileSync } from 'node:fs' +import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' import { promisify } from 'node:util' @@ -49,11 +49,16 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ if (!bt) throw new Error(`swe-bench-env: unknown task ${task.id}`) const md = bt.metadata as Record const dir = mkdtempSync(join(tmpdir(), 'swe-')) - await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 }) - await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 }) - const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' } - workspaces.set(dir, { dir, task: bt }) - return handle + try { + await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 }) + await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 }) + const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' } + workspaces.set(dir, { dir, task: bt }) + return handle + } catch (error) { + rmSync(dir, { recursive: true, force: true }) + throw error + } }, async tools() { return [ @@ -66,8 +71,8 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ const ws = workspaces.get(handle.id) if (!ws) return 'ERROR: workspace closed' const safe = (p: string): string | null => { - const n = p.replace(/^\.?\//, '') - return n.includes('..') || n.startsWith('/') ? null : n + if (p.startsWith('/') || p.includes('..')) return null + return p.replace(/^\.\//, '') } if (name === 'list_files') { const sub = safe(String(args.dir ?? '')) ?? 
'' @@ -87,7 +92,7 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{ const p = join(d, e) let isDir = false try { - isDir = statSync(p).isDirectory() + isDir = lstatSync(p).isDirectory() } catch { continue } diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts index 270a3f68..a1d72cff 100644 --- a/bench/swe-self-improve.mts +++ b/bench/swe-self-improve.mts @@ -22,7 +22,7 @@ async function main(): Promise { const innerTurns = Number(process.env.INNER_TURNS ?? 40) const { environment, tasks } = await createSweBenchEnvironment(Number(process.env.POOL_N ?? 80)) - if (process.env.CALIBRATE) { + if (process.env.CALIBRATE === '1') { const n = Number(process.env.N ?? 3) const ts = await tasks(0, n) console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`) @@ -38,26 +38,31 @@ async function main(): Promise { return } - const outDir = mkdtempSync(join(process.cwd(), '.swe-run-')) - const report = await runStrategyEvolution({ - environment, - tasks, - trainN: Number(process.env.TRAIN_N ?? 6), - holdoutN: Number(process.env.HOLDOUT_N ?? 8), - worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns }, - author: { - chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), - model: authorModel, - maxTokens: 8000, - fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', - }, - baselines: [sample, refine], - budget: Number(process.env.BUDGET ?? 2), - generations: Number(process.env.GENERATIONS ?? 2), - populationSize: Number(process.env.POP ?? 2), - outDir, - }) - rmSync(outDir, { recursive: true, force: true }) + const report = await (async () => { + const outDir = mkdtempSync(join(process.cwd(), '.swe-run-')) + try { + return await runStrategyEvolution({ + environment, + tasks, + trainN: Number(process.env.TRAIN_N ?? 6), + holdoutN: Number(process.env.HOLDOUT_N ?? 
8), + worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns }, + author: { + chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), + model: authorModel, + maxTokens: 8000, + fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', + }, + baselines: [sample, refine], + budget: Number(process.env.BUDGET ?? 2), + generations: Number(process.env.GENERATIONS ?? 2), + populationSize: Number(process.env.POP ?? 2), + outDir, + }) + } finally { + rmSync(outDir, { recursive: true, force: true }) + } + })() const v = report.verdict console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══') @@ -74,6 +79,6 @@ async function main(): Promise { } main().catch((e) => { - console.error(e) + console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) process.exit(1) }) diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts index 889c7e3a..9fe2837e 100644 --- a/examples/ablation-suite/ablation.ts +++ b/examples/ablation-suite/ablation.ts @@ -18,9 +18,9 @@ import { type AgenticTask, refine, runAgentic, + type Strategy, sample, sampleThenRefine, - type Strategy, } from '@tangle-network/agent-runtime/loops' import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder' @@ -50,9 +50,21 @@ const topologyStrategy: Record = { } /** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. 
*/ -const unwiredKnobs: Array<{ k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }> = [ - { k: 'driverSteer', isSet: (v) => v === true, prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding' }, - { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen" }, +const unwiredKnobs: Array<{ + k: keyof AblationKnobs + isSet: (v: unknown) => boolean + prim: string +}> = [ + { + k: 'driverSteer', + isSet: (v) => v === true, + prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding', + }, + { + k: 'optimize', + isSet: (v) => !!v && v !== 'off', + prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen", + }, { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' }, { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' }, ] @@ -80,14 +92,23 @@ export async function runAblation(opts: { base: AblationKnobs /** Each delta = a ONE-KNOB change vs base (the one-knob-delta design). */ deltas: Array<{ name: string; knob: Partial }> - worker: { routerBaseUrl: string; routerKey: string; model: string; maxTokens?: number; innerTurns?: number } + worker: { + routerBaseUrl: string + routerKey: string + model: string + maxTokens?: number + innerTurns?: number + } onArm?: (r: ArmResult) => void }): Promise { // ONE held-out set, shared across all arms — the fair-comparison invariant. 
const tasks = await opts.tasks(opts.holdoutOffset, opts.holdoutN) const arms = [ { name: 'baseline', knobs: opts.base }, - ...opts.deltas.map((d) => ({ name: d.name, knobs: { ...opts.base, ...d.knob } as AblationKnobs })), + ...opts.deltas.map((d) => ({ + name: d.name, + knobs: { ...opts.base, ...d.knob } as AblationKnobs, + })), ] const results: ArmResult[] = [] for (const arm of arms) { @@ -152,7 +173,14 @@ export function printAutopsy(results: ArmResult[]): void { const pad = (s: string, n: number) => s.padEnd(n) console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`) console.log( - pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('$', 9) + pad('lat(s)', 8) + pad('shots', 7) + pad('Δresolve [95% CI]', 24) + 'Δ$', + pad('arm', 16) + + pad('topology', 14) + + pad('resolve', 9) + + pad('$', 9) + + pad('lat(s)', 8) + + pad('shots', 7) + + pad('Δresolve [95% CI]', 24) + + 'Δ$', ) for (const r of results) { const dC = base ? r.costUsd - base.costUsd : 0 @@ -161,7 +189,7 @@ export function printAutopsy(results: ArmResult[]): void { if (base && r !== base) { const b = pairedBootstrap(base.perTask, r.perTask, { confidence: 0.95, statistic: 'mean' }) const sig = b.low > 0 || b.high < 0 ? '✓' : '·' // CI excludes 0 ⇒ real - lift = `${b.median >= 0 ? '+' : ''}${(100 * b.median).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}` + lift = `${b.mean >= 0 ? 
'+' : ''}${(100 * b.mean).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}` } console.log( pad(r.name, 16) + @@ -202,13 +230,16 @@ async function main(): Promise { { name: 'fanout-refine', knob: { topology: 'fanout-refine' } }, ], worker, - onArm: (r) => console.log(` ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`), + onArm: (r) => + console.log( + ` ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`, + ), }) printAutopsy(results) } if (import.meta.url === `file://${process.argv[1]}`) main().catch((e) => { - console.error(e) + console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) process.exit(1) }) diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts index eaaa206d..d799cffc 100644 --- a/examples/self-improving-coder/self-improving-coder.ts +++ b/examples/self-improving-coder/self-improving-coder.ts @@ -25,7 +25,7 @@ * Run: TANGLE_API_KEY= pnpm tsx examples/self-improving-coder/self-improving-coder.ts */ import { execFileSync } from 'node:child_process' -import { mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' import { createChatClient } from '@tangle-network/agent-eval' @@ -36,8 +36,8 @@ import { type ArtifactHandle, refine, runStrategyEvolution, - sample, type SurfaceScore, + sample, } from '@tangle-network/agent-runtime/loops' // ── The contamination-proof task generator (deterministic per seed) ────────────── @@ -46,7 +46,11 @@ import { * agent edits + the hidden-ish test file (the agent may read it; grading runs it). 
*/ function constsFor(seed: number): { VER: string; SEP: string; MOD: number } { const r = (m: number) => ((seed * 2654435761) >>> 0) % m - return { VER: `v${(r(900) + 100).toString(36)}`, SEP: ['-', '|', ':', '/', '#'][r(5)]!, MOD: [97, 101, 103, 107, 109][r(5)]! } + return { + VER: `v${(r(900) + 100).toString(36)}`, + SEP: ['-', '|', ':', '/', '#'][r(5)]!, + MOD: [97, 101, 103, 107, 109][r(5)]!, + } } function genTask(seed: number): { stub: string; test: string; total: number } { const { VER, SEP, MOD } = constsFor(seed) @@ -91,17 +95,22 @@ const workspaces = new Map() function pytestPassed(dir: string): { passed: number; total: number } { let out = '' try { - out = execFileSync('python3', ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'], { - cwd: dir, - encoding: 'utf8', - timeout: 60_000, - stdio: ['ignore', 'pipe', 'pipe'], - }) + out = execFileSync( + 'python3', + ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'], + { + cwd: dir, + encoding: 'utf8', + timeout: 60_000, + stdio: ['ignore', 'pipe', 'pipe'], + }, + ) } catch (e) { out = (e as { stdout?: string }).stdout ?? '' } const passed = Number(out.match(/(\d+) passed/)?.[1] ?? 0) - const failed = Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0) + const failed = + Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0) return { passed, total: passed + failed } } @@ -119,9 +128,39 @@ export const codingEnv: AgenticSurface = { }, async tools() { return [ - { type: 'function', function: { name: 'list_files', description: 'List the files in the workspace.', parameters: { type: 'object', properties: {} } } }, - { type: 'function', function: { name: 'read_file', description: 'Read a file (e.g. 
test_lib.py to learn the contract, or lib.py).', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } }, - { type: 'function', function: { name: 'write_file', description: 'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.', parameters: { type: 'object', properties: { path: { type: 'string' }, content: { type: 'string' } }, required: ['path', 'content'] } } }, + { + type: 'function', + function: { + name: 'list_files', + description: 'List the files in the workspace.', + parameters: { type: 'object', properties: {} }, + }, + }, + { + type: 'function', + function: { + name: 'read_file', + description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).', + parameters: { + type: 'object', + properties: { path: { type: 'string' } }, + required: ['path'], + }, + }, + }, + { + type: 'function', + function: { + name: 'write_file', + description: + 'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.', + parameters: { + type: 'object', + properties: { path: { type: 'string' }, content: { type: 'string' } }, + required: ['path', 'content'], + }, + }, + }, // NO run_tests: the agent cannot iterate-until-green. It must implement correctly from READING the // tests — which creates real headroom and makes the STRATEGY (planning, multiple attempts) matter. ] satisfies AgenticTool[] @@ -131,34 +170,34 @@ export const codingEnv: AgenticSurface = { if (!ws) return 'ERROR: workspace closed' if (name === 'list_files') return readdirSync(ws.dir).join('\n') if (name === 'read_file') { + const p = String(args.path ?? '') + if (p !== 'lib.py' && p !== 'test_lib.py') + return 'ERROR: only lib.py and test_lib.py are readable' try { - return readFileSync(join(ws.dir, String(args.path ?? 
'')), 'utf8').slice(0, 8000) + return readFileSync(join(ws.dir, p), 'utf8').slice(0, 8000) } catch (e) { return `ERROR: ${(e as Error).message}` } } if (name === 'write_file') { const p = String(args.path ?? '') - if (!p.endsWith('lib.py') || p.includes('..') || p.startsWith('/')) return 'ERROR: only lib.py is writable' + if (p !== 'lib.py') return 'ERROR: only lib.py is writable' try { - mkdirSync(ws.dir, { recursive: true }) writeFileSync(join(ws.dir, 'lib.py'), String(args.content ?? '')) return 'wrote lib.py' } catch (e) { return `ERROR: ${(e as Error).message}` } } - if (name === 'run_tests') { - const { passed, total } = pytestPassed(ws.dir) - return `pytest: ${passed}/${total} passed` - } return `ERROR: unknown tool ${name}` }, async score(_task, handle): Promise { const ws = workspaces.get(handle.id) if (!ws) return { passes: 0, total: 0, errored: 1 } const { passed, total } = pytestPassed(ws.dir) - return total > 0 ? { passes: passed, total, errored: 0 } : { passes: 0, total: ws.total, errored: 1 } + return total > 0 + ? { passes: passed, total, errored: 0 } + : { passes: 0, total: ws.total, errored: 1 } }, async close(handle) { const ws = workspaces.get(handle.id) @@ -179,7 +218,8 @@ export const codingTasks = async (offset: number, n: number): Promise { await codingEnv.close(h) const pass = ref.passes === ref.total && ref.total > 0 && stub.passes === 0 ok &&= pass - console.log(` seed ${seed}: stub ${stub.passes}/${stub.total} → reference ${ref.passes}/${ref.total} ${pass ? '✓' : '✗ BROKEN'}`) + console.log( + ` seed ${seed}: stub ${stub.passes}/${stub.total} → reference ${ref.passes}/${ref.total} ${pass ? '✓' : '✗ BROKEN'}`, + ) } - console.log(ok ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.' : '\n>>> BROKEN — fix the task/grader before spending.') + console.log( + ok + ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.' 
+ : '\n>>> BROKEN — fix the task/grader before spending.', + ) if (!ok) process.exit(1) } async function main(): Promise { - if (process.env.CALIBRATE) return calibrate() + if (process.env.CALIBRATE === '1') return calibrate() const routerKey = process.env.TANGLE_API_KEY - if (!routerKey) throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)') + if (!routerKey) + throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)') const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' const workerModel = process.env.WORKER_MODEL ?? 'deepseek-v4-flash' // The author WRITES strategy code (a `defineStrategy` module) — it needs a strong coder + a token @@ -236,29 +283,45 @@ async function main(): Promise { // The author writes candidate-strategy .mts files into outDir, then dynamically imports them — they // `import '@tangle-network/agent-runtime/loops'`, which only resolves UNDER the package (self-reference). // A /tmp outDir would fail to resolve it; keep it under the project root. - const outDir = mkdtempSync(join(process.cwd(), '.sic-run-')) - const report = await runStrategyEvolution({ - environment: codingEnv, - tasks: codingTasks, - trainN: Number(process.env.TRAIN_N ?? 8), - holdoutN: Number(process.env.HOLDOUT_N ?? 12), - worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 8), maxTokens: 4000 }, - author: { - chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), - model: authorModel, - maxTokens: 8000, - fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', - }, - baselines: [sample, refine], - budget: Number(process.env.BUDGET ?? 3), - generations: Number(process.env.GENERATIONS ?? 2), - populationSize: Number(process.env.POP ?? 
2), - outDir, - }) - rmSync(outDir, { recursive: true, force: true }) + const report = await (async () => { + const outDir = mkdtempSync(join(process.cwd(), '.sic-run-')) + try { + return await runStrategyEvolution({ + environment: codingEnv, + tasks: codingTasks, + trainN: Number(process.env.TRAIN_N ?? 8), + holdoutN: Number(process.env.HOLDOUT_N ?? 12), + worker: { + routerBaseUrl, + routerKey, + model: workerModel, + innerTurns: Number(process.env.INNER_TURNS ?? 8), + maxTokens: 4000, + }, + author: { + chat: createChatClient({ + transport: 'router', + baseUrl: routerBaseUrl, + apiKey: routerKey, + defaultModel: authorModel, + }), + model: authorModel, + maxTokens: 8000, + fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', + }, + baselines: [sample, refine], + budget: Number(process.env.BUDGET ?? 3), + generations: Number(process.env.GENERATIONS ?? 2), + populationSize: Number(process.env.POP ?? 2), + outDir, + }) + } finally { + rmSync(outDir, { recursive: true, force: true }) + } + })() const v = report.verdict - if (process.env.DUMP) { + if (process.env.DUMP === '1') { // Autopsy: gen0 baseline scores (headroom) + every authored candidate's score/error (did they // lose on a saturated task, or error at runtime?). const r = report as unknown as Record @@ -272,7 +335,9 @@ async function main(): Promise { console.log(`gen0 champion: ${report.gen0Champion.name}`) console.log(`final champion: ${report.finalChampion.name}`) console.log(`PROMOTED: ${v.promoted} (${v.reason})`) - console.log(`held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`) + console.log( + `held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`, + ) console.log( v.promoted ? '\n>>> The search taught the agent a strategy that fixes MORE on tasks it never trained on, beyond luck. Self-improvement CERTIFIED.' 
@@ -282,6 +347,6 @@ async function main(): Promise { if (import.meta.url === `file://${process.argv[1]}`) main().catch((e) => { - console.error(e) - process.exit(1) -}) + console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) + process.exit(1) + })