diff --git a/.gitignore b/.gitignore
index d8334027..ed4bec97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ bench/scripts/__pycache__/
 # local rollout-corpus scratch (raw jsonl, per work-line)
 corpus/
 test_repo/
+.sic-run-*/
+.swe-run-*/
diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
new file mode 100644
index 00000000..ccfb0931
--- /dev/null
+++ b/bench/src/swe-bench-env.ts
@@ -0,0 +1,187 @@
+/**
+ * SWE-bench Verified as an `AgenticSurface` — the PROPER, no-cheating way to run a coding agent on real
+ * GitHub bugs through the substrate (`runAgentic`/`runBenchmark`/`runStrategyEvolution` drive the loop;
+ * we only provide tools + a deployable score). The agent clones the repo at base_commit, explores +
+ edits SOURCE via tools (never tests — path-jailed), and `score()` grades the resulting `git diff`
+ * with the OFFICIAL swebench Docker harness (apply patch → FAIL_TO_PASS + PASS_TO_PASS → resolved).
+ *
+ * No cheating by construction: the agent never sees the hidden tests or the gold patch (the adapter's
+ * prompt is the issue only); `edit_file` refuses test files; the score is a real test run, not a judge.
+ *
+ * CONTAMINATION CAVEAT: SWE-bench bugs are public GitHub fixes a frontier model may have MEMORIZED.
+ * A clean train→holdout split (disjoint instances) rules out adaptive-reuse, but NOT training-data
+ * memorization. Always report this; never claim a "clean" frontier number from this arena alone.
+ */
+import { execFile } from 'node:child_process'
+import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { promisify } from 'node:util'
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchAdapter } from './benchmarks/swe-bench'
+import type { BenchTask } from './benchmarks/types'
+
+const exec = promisify(execFile)
+/**
+ * True when a repo-relative path is a test file the agent must not edit: anything under a `test/` or
+ * `tests/` directory, or a Python path with a segment starting `test_`, a basename ending `_test.py`,
+ * or a `conftest.py`. `test_` and `conftest.py` are anchored to a path-segment start so source files
+ * that merely contain those letters (e.g. `latest_release.py`) are not rejected by accident.
+ */
+const isTestPath = (p: string) => /(^|\/)tests?\//.test(p) || /(^|\/)test_.*\.py$|_test\.py$|(^|\/)conftest\.py$/.test(p)
+
+interface Ws {
+  dir: string
+  task: BenchTask
+}
+const workspaces = new Map<string, Ws>()
+
+/** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The
+ * supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout
+ * [trainN+off,…) never overlap. Verified is loaded once; instances carry their repo/base_commit.
*/
+export async function createSweBenchEnvironment(poolN = 80): Promise<{
+  environment: AgenticSurface
+  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
+  adapter: ReturnType<typeof createSweBenchAdapter>
+}> {
+  const adapter = createSweBenchAdapter()
+  const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
+  const byId = new Map(pool.map((t) => [t.id, t]))
+
+  const environment: AgenticSurface = {
+    name: 'swe-bench-verified',
+    async open(task) {
+      const bt = byId.get(task.id)
+      if (!bt) throw new Error(`swe-bench-env: unknown task ${task.id}`)
+      const md = bt.metadata as Record<string, string>
+      const dir = mkdtempSync(join(tmpdir(), 'swe-'))
+      try {
+        await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 })
+        await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 })
+        const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' }
+        workspaces.set(dir, { dir, task: bt })
+        return handle
+      } catch (error) {
+        rmSync(dir, { recursive: true, force: true }) // never leak a half-cloned workspace
+        throw error
+      }
+    },
+    async tools() {
+      return [
+        { type: 'function', function: { name: 'list_files', description: 'List source files under a repo subdirectory (recursive, bounded). "" = repo root.', parameters: { type: 'object', properties: { dir: { type: 'string' } }, required: ['dir'] } } },
+        { type: 'function', function: { name: 'read_file', description: 'Read a repo file by path.', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
+        { type: 'function', function: { name: 'edit_file', description: 'Surgical fix: replace the EXACT old_string (must occur once — copy whitespace precisely) with new_string in a SOURCE file. Minimal changes, never whole-file rewrites. Test files are rejected.', parameters: { type: 'object', properties: { path: { type: 'string' }, old_string: { type: 'string' }, new_string: { type: 'string' } }, required: ['path', 'old_string', 'new_string'] } } },
+      ] satisfies AgenticTool[]
+    },
+    async call(handle, name, args) {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return 'ERROR: workspace closed'
+      const safe = (p: string): string | null => { // path jail: repo-relative only; null = rejected
+        if (p.startsWith('/') || p.includes('..')) return null
+        return p.replace(/^\.\//, '')
+      }
+      if (name === 'list_files') {
+        const sub = safe(String(args.dir ?? '')) ?? '' // a rejected dir falls back to the repo root
+        const root = join(ws.dir, sub)
+        if (!existsSync(root)) return `(no such path: ${sub})`
+        const out: string[] = []
+        const walk = (d: string, depth: number) => {
+          if (depth > 2 || out.length > 240) return // bounded: at most 3 levels / ~240 entries
+          let entries: string[] = []
+          try {
+            entries = readdirSync(d)
+          } catch {
+            return
+          }
+          for (const e of entries) {
+            if (e.startsWith('.') || e === 'node_modules' || e === '__pycache__') continue
+            const p = join(d, e)
+            let isDir = false
+            try {
+              isDir = lstatSync(p).isDirectory()
+            } catch {
+              continue
+            }
+            out.push(p.slice(ws.dir.length + 1) + (isDir ? '/' : ''))
+            if (isDir) walk(p, depth + 1)
+          }
+        }
+        walk(root, 0)
+        return out.slice(0, 240).join('\n') || '(empty)'
+      }
+      if (name === 'read_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        try {
+          const c = readFileSync(join(ws.dir, p), 'utf8')
+          return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c
+        } catch (e) {
+          return `(error: ${(e as Error).message})`
+        }
+      }
+      if (name === 'edit_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
+        const oldStr = String(args.old_string ?? '')
+        const newStr = String(args.new_string ?? '')
+        let content: string
+        try {
+          content = readFileSync(join(ws.dir, p), 'utf8')
+        } catch (e) {
+          return `(cannot read ${p}: ${(e as Error).message})`
+        }
+        if (!oldStr) return 'ERROR: old_string is empty.'
+        const count = content.split(oldStr).length - 1
+        if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
+        if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
+        writeFileSync(join(ws.dir, p), content.replace(oldStr, () => newStr)) // replacer fn: insert newStr literally, no `$&`/`$1`/`$'` expansion
+        return `edited ${p}: replaced 1 occurrence`
+      }
+      return `ERROR: unknown tool ${name}`
+    },
+    async score(_task, handle): Promise<SurfaceScore> {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return { passes: 0, total: 1, errored: 1 }
+      let patch = ''
+      try {
+        const r = await exec('git', ['-C', ws.dir, 'diff'], { maxBuffer: 20_000_000, timeout: 60_000 })
+        patch = r.stdout
+      } catch {
+        return { passes: 0, total: 1, errored: 1 } // git itself failed: an infra error, not "the agent made no edit"
+      }
+      if (!patch.trim()) return { passes: 0, total: 1, errored: 0 } // no edit made: a genuine miss
+      try {
+        const s = await adapter.judge(ws.task, patch)
+        return { passes: s.resolved ? 1 : 0, total: 1, errored: 0 }
+      } catch {
+        return { passes: 0, total: 1, errored: 1 }
+      }
+    },
+    async close(handle) {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return
+      workspaces.delete(handle.id)
+      rmSync(ws.dir, { recursive: true, force: true })
+    },
+  }
+
+  const tasks = async (offset: number, n: number): Promise<AgenticTask[]> => {
+    const slice = pool.slice(offset, offset + n)
+    if (slice.length < n) throw new Error(`swe-bench-env: pool exhausted at offset ${offset} (need ${n}, have ${slice.length}; raise poolN)`)
+    return slice.map((bt) => ({
+      id: bt.id,
+      systemPrompt:
+        'You are a senior engineer fixing a real bug in the checked-out repository. Work PERSISTENTLY and do not ' +
+        'stop early: use list_files + read_file to explore BROADLY (read many candidate files — the bug is rarely in ' +
+        'the first file you open), trace the issue to its root cause, then fix it with edit_file. You MUST make at ' +
+        'least one edit_file call — never finish with prose alone or without attempting a fix. Make a MINIMAL surgical ' +
+        'change (a few lines, like a real PR), source only (test files are rejected). If an edit_file fails (old_string ' +
+        'not unique/found), read the file again and retry with exact text. Keep going until you have made your best fix.',
+      userPrompt: bt.prompt,
+      meta: { instanceId: bt.id },
+    }))
+  }
+
+  return { environment, tasks, adapter }
+}
diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts
new file mode 100644
index 00000000..a1d72cff
--- /dev/null
+++ b/bench/swe-self-improve.mts
@@ -0,0 +1,84 @@
+/**
+ * SWE-bench self-improvement — the PROPER, no-cheating run: a frontier worker over the SWE-bench
+ * `Environment`, with `runStrategyEvolution` enforcing the train→freeze→holdout split (the substrate
+ * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
+ * applies (public fixes may be memorized) — reported, never claimed clean.
+ *
+ * CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
+ * Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
+ */
+import { mkdtempSync, rmSync } from 'node:fs'
+import { join } from 'node:path'
+import { createChatClient } from '@tangle-network/agent-eval'
+import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchEnvironment } from './src/swe-bench-env'
+
+async function main(): Promise {
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('TANGLE_API_KEY required (worker + author call the router)')
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const workerModel = process.env.WORKER_MODEL ?? 'gemini-2.5-pro'
+  const authorModel = process.env.AUTHOR_MODEL ?? 
'gemini-2.5-pro' + const innerTurns = Number(process.env.INNER_TURNS ?? 40) + const { environment, tasks } = await createSweBenchEnvironment(Number(process.env.POOL_N ?? 80)) + + if (process.env.CALIBRATE === '1') { + const n = Number(process.env.N ?? 3) + const ts = await tasks(0, n) + console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`) + let resolved = 0 + for (const t of ts) { + const t0 = Date.now() + const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns, budget: 1 }) + if (r.resolved) resolved++ + console.log(` ${t.id.padEnd(32)} resolved=${r.resolved} completions=${r.completions} shots=${r.shots} (${Math.round((Date.now() - t0) / 1000)}s)`) + } + const band = resolved > 0 && resolved < n + console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`) + return + } + + const report = await (async () => { + const outDir = mkdtempSync(join(process.cwd(), '.swe-run-')) + try { + return await runStrategyEvolution({ + environment, + tasks, + trainN: Number(process.env.TRAIN_N ?? 6), + holdoutN: Number(process.env.HOLDOUT_N ?? 8), + worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns }, + author: { + chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }), + model: authorModel, + maxTokens: 8000, + fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', + }, + baselines: [sample, refine], + budget: Number(process.env.BUDGET ?? 2), + generations: Number(process.env.GENERATIONS ?? 2), + populationSize: Number(process.env.POP ?? 
2), + outDir, + }) + } finally { + rmSync(outDir, { recursive: true, force: true }) + } + })() + + const v = report.verdict + console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══') + console.log(`worker=${workerModel} author=${authorModel}`) + console.log(`gen0 champion: ${report.gen0Champion.name}`) + console.log(`final champion: ${report.finalChampion.name}`) + console.log(`PROMOTED: ${v.promoted} (${v.reason})`) + console.log(`held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`) + console.log( + v.promoted + ? '\n>>> The search taught the agent a strategy that resolves MORE real bugs it never trained on, beyond luck. (Report the contamination caveat: public fixes may be memorized.)' + : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).', + ) +} + +main().catch((e) => { + console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) + process.exit(1) +}) diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts new file mode 100644 index 00000000..9fe2837e --- /dev/null +++ b/examples/ablation-suite/ablation.ts @@ -0,0 +1,245 @@ +/** + * ablation — the cost-aware knob-board + one-knob-delta runner for agent self-improvement techniques. + * + * THE VISION: a single configurable agent where every technique is a knob (topology, trace-analysis, + * steering, GEPA/skill optimization, persistent artifacts), swept across arms at EQUAL COMPUTE, with a + * full autopsy — resolve rate AND token/$/latency cost per arm — so we see what really helps vs what + * just burns tokens. One-knob-delta design (baseline + each single knob flipped) keeps it O(N), not 2^N. + * + * STATUS — honest: the framework + the cost autopsy are real; knobs are wired incrementally. WIRED: + * `topology` (single/fanout/fanout-refine = refine/sample/sampleThenRefine) + `budget`. 
The rest are + * DECLARED knobs that FAIL LOUD if set (no silent no-op — you must not think GEPA ran when it didn't); + * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the + * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench. + */ +import { pairedBootstrap } from '@tangle-network/agent-eval' +import { + type AgenticSurface, + type AgenticTask, + refine, + runAgentic, + type Strategy, + sample, + sampleThenRefine, +} from '@tangle-network/agent-runtime/loops' +import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder' + +export interface AblationKnobs { + /** WIRED → strategy: single=`refine` (iterate one artifact), fanout=`sample` (N parallel, pick best), + * fanout-refine=`sampleThenRefine`. The coordination shape. */ + topology: 'single' | 'fanout' | 'fanout-refine' + /** WIRED → equal-compute unit (refine: max shots; fanout: rollout width). */ + budget: number + // ── DECLARED knobs — fail loud until wired (each over a named substrate primitive) ── + /** The DRIVER-steers-WORKER loop: supervise() drives the worker, analyzeOnSettle fires the analyst on + * each settled round → a `finding` the driver pulls and composes the next prompt from. (NOT the + * refine analyst-steerer — that's the degenerate inline version; this is a driver brain in the loop.) */ + driverSteer?: boolean // supervise(driverProfile,{backend,analyzeOnSettle}) + steer_agent + /** GEPA-optimize the DRIVER's compose-next-prompt system prompt on TRAIN (executable-graded via the + * surface score), frozen, then run — selfImprove() with an executable JudgeConfig (NOT improve(): the + * steerer prompt is not a profile field). 
*/ + optimize?: 'off' | 'gepa' + halo?: boolean // HALO analyst option + persistentArtifact?: boolean // multi-round persistent artifact (openSandboxRun resume) +} + +const topologyStrategy: Record = { + single: refine, + fanout: sample, + 'fanout-refine': sampleThenRefine, +} + +/** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. */ +const unwiredKnobs: Array<{ + k: keyof AblationKnobs + isSet: (v: unknown) => boolean + prim: string +}> = [ + { + k: 'driverSteer', + isSet: (v) => v === true, + prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding', + }, + { + k: 'optimize', + isSet: (v) => !!v && v !== 'off', + prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen", + }, + { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' }, + { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' }, +] + +export interface ArmResult { + name: string + knobs: AblationKnobs + n: number + resolve: number // mean resolved (0..1) on the held-out set + tokensIn: number + tokensOut: number + costUsd: number + latencyMs: number + shotsMean: number + completionsMean: number + /** Per-task resolved (0/1), task-aligned across arms — the paired vector for significance. */ + perTask: number[] +} + +export async function runAblation(opts: { + environment: AgenticSurface + tasks: (offset: number, n: number) => Promise + holdoutOffset: number + holdoutN: number + base: AblationKnobs + /** Each delta = a ONE-KNOB change vs base (the one-knob-delta design). */ + deltas: Array<{ name: string; knob: Partial }> + worker: { + routerBaseUrl: string + routerKey: string + model: string + maxTokens?: number + innerTurns?: number + } + onArm?: (r: ArmResult) => void +}): Promise { + // ONE held-out set, shared across all arms — the fair-comparison invariant. 
+ const tasks = await opts.tasks(opts.holdoutOffset, opts.holdoutN) + const arms = [ + { name: 'baseline', knobs: opts.base }, + ...opts.deltas.map((d) => ({ + name: d.name, + knobs: { ...opts.base, ...d.knob } as AblationKnobs, + })), + ] + const results: ArmResult[] = [] + for (const arm of arms) { + for (const u of unwiredKnobs) { + if (u.isSet(arm.knobs[u.k])) + throw new Error( + `ablation: knob '${u.k}'=${JSON.stringify(arm.knobs[u.k])} (arm "${arm.name}") is DECLARED but not yet wired — wire it over ${u.prim} before claiming it ran. (No silent no-op.)`, + ) + } + let resolved = 0 + let ti = 0 + let to = 0 + let usd = 0 + let ms = 0 + let shots = 0 + let comps = 0 + const perTask: number[] = [] + for (const t of tasks) { + const r = await runAgentic({ + surface: opts.environment, + task: t, + strategy: topologyStrategy[arm.knobs.topology], + budget: arm.knobs.budget, + routerBaseUrl: opts.worker.routerBaseUrl, + routerKey: opts.worker.routerKey, + model: opts.worker.model, + ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}), + ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}), + }) + if (r.resolved) resolved++ + perTask.push(r.resolved ? 1 : 0) + ti += r.tokens.input + to += r.tokens.output + usd += r.usd + ms += r.ms + shots += r.shots + comps += r.completions + } + const n = tasks.length + const res: ArmResult = { + name: arm.name, + knobs: arm.knobs, + n, + resolve: resolved / n, + tokensIn: ti, + tokensOut: to, + costUsd: usd, + latencyMs: ms, + shotsMean: shots / n, + completionsMean: comps / n, + perTask, + } + results.push(res) + opts.onArm?.(res) + } + return results +} + +/** The cost-aware autopsy: per-arm resolve + tokens + $ + latency, and Δ vs baseline (lift AND cost). 
*/ +export function printAutopsy(results: ArmResult[]): void { + const base = results[0] + const pad = (s: string, n: number) => s.padEnd(n) + console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`) + console.log( + pad('arm', 16) + + pad('topology', 14) + + pad('resolve', 9) + + pad('$', 9) + + pad('lat(s)', 8) + + pad('shots', 7) + + pad('Δresolve [95% CI]', 24) + + 'Δ$', + ) + for (const r of results) { + const dC = base ? r.costUsd - base.costUsd : 0 + // Significance: paired bootstrap of this arm's per-task resolve vs baseline's (task-aligned). + let lift = '+0pp' + if (base && r !== base) { + const b = pairedBootstrap(base.perTask, r.perTask, { confidence: 0.95, statistic: 'mean' }) + const sig = b.low > 0 || b.high < 0 ? '✓' : '·' // CI excludes 0 ⇒ real + lift = `${b.mean >= 0 ? '+' : ''}${(100 * b.mean).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}` + } + console.log( + pad(r.name, 16) + + pad(r.knobs.topology, 14) + + pad(`${(100 * r.resolve).toFixed(0)}%`, 9) + + pad(`$${r.costUsd.toFixed(4)}`, 9) + + pad((r.latencyMs / 1000).toFixed(0), 8) + + pad(r.shotsMean.toFixed(1), 7) + + pad(lift, 24) + + `${dC >= 0 ? '+' : ''}$${dC.toFixed(4)}`, + ) + } + console.log( + '\n>>> Read it cost-aware: ✓ = CI excludes 0 (real lift). A +resolve that costs +$$ or is not ✓ may be worse than baseline. The point is to see what HELPS vs what just BURNS.', + ) +} + +async function main(): Promise { + const routerKey = process.env.TANGLE_API_KEY + if (!routerKey) throw new Error('TANGLE_API_KEY required') + const worker = { + routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', + routerKey, + model: process.env.WORKER_MODEL ?? 'deepseek-v4-flash', + maxTokens: 4000, + innerTurns: Number(process.env.INNER_TURNS ?? 
6), + } + console.log(`═══ ABLATION (cheap contamination-proof task) — worker=${worker.model} ═══`) + const results = await runAblation({ + environment: codingEnv, + tasks: codingTasks, + holdoutOffset: 100, // a fixed disjoint held-out slice + holdoutN: Number(process.env.HOLDOUT_N ?? 6), + base: { topology: 'single', budget: Number(process.env.BUDGET ?? 2) }, + // one-knob-delta: flip ONLY topology (the wired knob) vs baseline. + deltas: [ + { name: 'fanout', knob: { topology: 'fanout' } }, + { name: 'fanout-refine', knob: { topology: 'fanout-refine' } }, + ], + worker, + onArm: (r) => + console.log( + ` ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`, + ), + }) + printAutopsy(results) +} + +if (import.meta.url === `file://${process.argv[1]}`) + main().catch((e) => { + console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) + process.exit(1) + }) diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts new file mode 100644 index 00000000..d799cffc --- /dev/null +++ b/examples/self-improving-coder/self-improving-coder.ts @@ -0,0 +1,352 @@ +/** + * Self-improving coder — the substrate's self-improvement spine, composed cleanly, on a + * CONTAMINATION-PROOF coding task. NOTHING here is hand-rolled: the genome is an `AgentProfile`-shaped + * worker, the task is an `AgenticSurface` (open/tools/call/score/close), and the held-out-gated + * flywheel is `runStrategyEvolution` — which authors candidate strategies from TRAIN losses, then + * makes ONE promotion decision on a FRESH holdout slice the search never touched (`promotionGate`, + * a seeded paired-bootstrap CI). Adaptive data analysis is structurally impossible: the holdout is + * disjoint by task offset and read exactly once. 
+ * + * Why contamination-proof: each task is a small wire-protocol library whose constants (version, + * separators, checksum modulus, opcode) are DERIVED FROM THE SEED and specified ONLY by the test file. + * A frontier model cannot have memorized the fix — the exact contract is generated per task. Graded by + * REAL pytest (a deployable check, never an LLM judge). + * + * IMPORTANT — the bundled task is DELIBERATELY SIMPLE (a few functions fully pinned by their tests). + * A capable model aces it (every strategy scores 1.0), so the gate CORRECTLY returns no-promotion: + * you cannot demonstrate self-improvement where there is no headroom — and this harness refuses to + * pretend otherwise (calibrate-before-measure, enforced). To get a DISCRIMINATING run, swap in a task + * with a correctable middle band (algorithmically hard generated tasks, or a real benchmark below). + * + * To run frontier SWE-bench instead, swap `environment`/`tasks` for the SWE-bench `Environment` + * (bench/src/benchmarks/swe-bench.ts) — everything else is identical. (That arena is contamination- + * SUSPECT: its bugs are public GitHub fixes a model may have memorized — report it, never claim clean.) + * + * Run: TANGLE_API_KEY= pnpm tsx examples/self-improving-coder/self-improving-coder.ts + */ +import { execFileSync } from 'node:child_process' +import { mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { createChatClient } from '@tangle-network/agent-eval' +import { + type AgenticSurface, + type AgenticTask, + type AgenticTool, + type ArtifactHandle, + refine, + runStrategyEvolution, + type SurfaceScore, + sample, +} from '@tangle-network/agent-runtime/loops' + +// ── The contamination-proof task generator (deterministic per seed) ────────────── +/** A small wire-protocol library, fully specified by its tests, with seed-derived constants. 
The + * agent must READ the tests to infer the exact contract — it cannot recall it. Returns the stub the + * agent edits + the hidden-ish test file (the agent may read it; grading runs it). */ +function constsFor(seed: number): { VER: string; SEP: string; MOD: number } { + const r = (m: number) => ((seed * 2654435761) >>> 0) % m + return { + VER: `v${(r(900) + 100).toString(36)}`, + SEP: ['-', '|', ':', '/', '#'][r(5)]!, + MOD: [97, 101, 103, 107, 109][r(5)]!, + } +} +function genTask(seed: number): { stub: string; test: string; total: number } { + const { VER, SEP, MOD } = constsFor(seed) + const t = (id: number, text: string) => `${VER}${SEP}${id}${SEP}${text}` + const tests = [ + 'import pytest', + 'from lib import encode, decode, checksum, valid', + '', + `def test_encode(): assert encode(3, "hi") == ${JSON.stringify(t(3, 'hi'))}`, + `def test_encode_zero(): assert encode(0, "") == ${JSON.stringify(t(0, ''))}`, + `def test_decode(): assert decode(${JSON.stringify(t(9, 'ab'))}) == {"id": 9, "text": "ab"}`, + 'def test_roundtrip(): assert decode(encode(42, "yo")) == {"id": 42, "text": "yo"}', + `def test_checksum(): assert checksum("abc") == sum(b for b in b"abc") % ${MOD}`, + `def test_checksum_empty(): assert checksum("") == 0`, + `def test_valid_true(): assert valid(${JSON.stringify(t(1, 'x'))}) is True`, + `def test_valid_bad_version(): assert valid("zz${SEP}1${SEP}x") is False`, + `def test_valid_bad_shape(): assert valid("not a token") is False`, + '', + ].join('\n') + const stub = [ + '# Implement these so test_lib.py passes. Infer the exact format from the tests.', + 'def encode(id, text):', + ' raise NotImplementedError', + 'def decode(s):', + ' raise NotImplementedError', + 'def checksum(text):', + ' raise NotImplementedError', + 'def valid(s):', + ' raise NotImplementedError', + '', + ].join('\n') + return { stub, test: tests, total: 9 } +} + +// ── The Environment (AgenticSurface) — host pytest, no Docker. (Docker is a swap for untrusted code.) 
── +interface Ws { + dir: string + total: number +} +const workspaces = new Map() + +function pytestPassed(dir: string): { passed: number; total: number } { + let out = '' + try { + out = execFileSync( + 'python3', + ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'], + { + cwd: dir, + encoding: 'utf8', + timeout: 60_000, + stdio: ['ignore', 'pipe', 'pipe'], + }, + ) + } catch (e) { + out = (e as { stdout?: string }).stdout ?? '' + } + const passed = Number(out.match(/(\d+) passed/)?.[1] ?? 0) + const failed = + Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0) + return { passed, total: passed + failed } +} + +export const codingEnv: AgenticSurface = { + name: 'generated-coding', + async open(task) { + const seed = Number((task.meta as { seed?: number })?.seed ?? 0) + const { stub, test, total } = genTask(seed) + const dir = mkdtempSync(join(tmpdir(), 'sic-')) + writeFileSync(join(dir, 'lib.py'), stub) + writeFileSync(join(dir, 'test_lib.py'), test) + const handle: ArtifactHandle = { id: dir, surface: 'generated-coding' } + workspaces.set(dir, { dir, total }) + return handle + }, + async tools() { + return [ + { + type: 'function', + function: { + name: 'list_files', + description: 'List the files in the workspace.', + parameters: { type: 'object', properties: {} }, + }, + }, + { + type: 'function', + function: { + name: 'read_file', + description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).', + parameters: { + type: 'object', + properties: { path: { type: 'string' } }, + required: ['path'], + }, + }, + }, + { + type: 'function', + function: { + name: 'write_file', + description: + 'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.', + parameters: { + type: 'object', + properties: { path: { type: 'string' }, content: { type: 'string' } }, + required: ['path', 'content'], + }, + }, + }, + // NO run_tests: the agent cannot iterate-until-green. 
It must implement correctly from READING the + // tests — which creates real headroom and makes the STRATEGY (planning, multiple attempts) matter. + ] satisfies AgenticTool[] + }, + async call(handle, name, args) { + const ws = workspaces.get(handle.id) + if (!ws) return 'ERROR: workspace closed' + if (name === 'list_files') return readdirSync(ws.dir).join('\n') + if (name === 'read_file') { + const p = String(args.path ?? '') + if (p !== 'lib.py' && p !== 'test_lib.py') + return 'ERROR: only lib.py and test_lib.py are readable' + try { + return readFileSync(join(ws.dir, p), 'utf8').slice(0, 8000) + } catch (e) { + return `ERROR: ${(e as Error).message}` + } + } + if (name === 'write_file') { + const p = String(args.path ?? '') + if (p !== 'lib.py') return 'ERROR: only lib.py is writable' + try { + writeFileSync(join(ws.dir, 'lib.py'), String(args.content ?? '')) + return 'wrote lib.py' + } catch (e) { + return `ERROR: ${(e as Error).message}` + } + } + return `ERROR: unknown tool ${name}` + }, + async score(_task, handle): Promise { + const ws = workspaces.get(handle.id) + if (!ws) return { passes: 0, total: 0, errored: 1 } + const { passed, total } = pytestPassed(ws.dir) + return total > 0 + ? { passes: passed, total, errored: 0 } + : { passes: 0, total: ws.total, errored: 1 } + }, + async close(handle) { + const ws = workspaces.get(handle.id) + if (!ws) return + workspaces.delete(handle.id) + rmSync(ws.dir, { recursive: true, force: true }) + }, +} + +// ── The disjoint task supplier (train [0,trainN); holdout drawn past it) ────────── +export const codingTasks = async (offset: number, n: number): Promise => + Array.from({ length: n }, (_, i) => { + const seed = offset + i + return { + id: `gen-${seed}`, + systemPrompt: + 'You are a Python engineer. The library lib.py has stub functions; its exact contract is defined ONLY by ' + + 'test_lib.py. 
You CANNOT run the tests — read test_lib.py CAREFULLY (every assertion, every edge case) and ' + + 'implement lib.py correctly in one pass with write_file. Get the edge cases right (empty inputs, malformed ' + + 'inputs, exact formats). Do not edit test_lib.py.', + userPrompt: + 'Read test_lib.py to learn the exact contract, then write a correct lib.py. You cannot run the tests — reason carefully.', + meta: { seed }, + } satisfies AgenticTask + }) + +/** The correct lib.py for a seed — used ONLY by the $0 calibration self-check (never by the agent). */ +function referenceLib(seed: number): string { + const { VER, SEP, MOD } = constsFor(seed) + return [ + `VER, SEP, MOD = ${JSON.stringify(VER)}, ${JSON.stringify(SEP)}, ${MOD}`, + 'def encode(id, text): return f"{VER}{SEP}{id}{SEP}{text}"', + 'def decode(s):', + ' v, i, t = s.split(SEP, 2)', + ' return {"id": int(i), "text": t}', + 'def checksum(text): return sum(text.encode()) % MOD if text else 0', + 'def valid(s):', + ' p = s.split(SEP)', + ' return len(p) == 3 and p[0] == VER and p[1].isdigit()', + '', + ].join('\n') +} + +/** calibrate-before-measure: prove the task is SOLVABLE (reference → all pass) and the grader + * DISCRIMINATES (stub → 0). $0, no router. A reference that doesn't clear means the task/grader is + * broken — fix it before spending. */ +async function calibrate(): Promise { + console.log('═══ CALIBRATION ($0) — task solvable + grader discriminates? ═══') + let ok = true + for (const seed of [0, 1, 2, 7, 11]) { + const task = (await codingTasks(seed, 1))[0]! 
+ const h = await codingEnv.open(task) + const stub = await codingEnv.score(task, h) + // write the reference, re-score + await codingEnv.call(h, 'write_file', { path: 'lib.py', content: referenceLib(seed) }) + const ref = await codingEnv.score(task, h) + await codingEnv.close(h) + const pass = ref.passes === ref.total && ref.total > 0 && stub.passes === 0 + ok &&= pass + console.log( + ` seed ${seed}: stub ${stub.passes}/${stub.total} → reference ${ref.passes}/${ref.total} ${pass ? '✓' : '✗ BROKEN'}`, + ) + } + console.log( + ok + ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.' + : '\n>>> BROKEN — fix the task/grader before spending.', + ) + if (!ok) process.exit(1) +} + +async function main(): Promise { + if (process.env.CALIBRATE === '1') return calibrate() + const routerKey = process.env.TANGLE_API_KEY + if (!routerKey) + throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)') + const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' + const workerModel = process.env.WORKER_MODEL ?? 'deepseek-v4-flash' + // The author WRITES strategy code (a `defineStrategy` module) — it needs a strong coder + a token + // budget (thinking models return empty content without one) + a fallback. deepseek-flash can't. + const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro' + + // The author writes candidate-strategy .mts files into outDir, then dynamically imports them — they + // `import '@tangle-network/agent-runtime/loops'`, which only resolves UNDER the package (self-reference). + // A /tmp outDir would fail to resolve it; keep it under the project root. + const report = await (async () => { + const outDir = mkdtempSync(join(process.cwd(), '.sic-run-')) + try { + return await runStrategyEvolution({ + environment: codingEnv, + tasks: codingTasks, + trainN: Number(process.env.TRAIN_N ?? 8), + holdoutN: Number(process.env.HOLDOUT_N ?? 
12), + worker: { + routerBaseUrl, + routerKey, + model: workerModel, + innerTurns: Number(process.env.INNER_TURNS ?? 8), + maxTokens: 4000, + }, + author: { + chat: createChatClient({ + transport: 'router', + baseUrl: routerBaseUrl, + apiKey: routerKey, + defaultModel: authorModel, + }), + model: authorModel, + maxTokens: 8000, + fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash', + }, + baselines: [sample, refine], + budget: Number(process.env.BUDGET ?? 3), + generations: Number(process.env.GENERATIONS ?? 2), + populationSize: Number(process.env.POP ?? 2), + outDir, + }) + } finally { + rmSync(outDir, { recursive: true, force: true }) + } + })() + + const v = report.verdict + if (process.env.DUMP === '1') { + // Autopsy: gen0 baseline scores (headroom) + every authored candidate's score/error (did they + // lose on a saturated task, or error at runtime?). + const r = report as unknown as Record + const slim = (x: unknown) => + JSON.stringify(x, (_k, val) => (typeof val === 'function' ? '[fn]' : val), 1) + console.log('--- gen0 ---', slim(r.gen0 ?? r.gen0Champion)) + console.log('--- generations ---', slim(r.generations)?.slice(0, 3000)) + } + console.log('\n═══ SELF-IMPROVING CODER — certified on a FROZEN holdout (no adaptive reuse) ═══') + console.log(`worker=${workerModel} author=${authorModel}`) + console.log(`gen0 champion: ${report.gen0Champion.name}`) + console.log(`final champion: ${report.finalChampion.name}`) + console.log(`PROMOTED: ${v.promoted} (${v.reason})`) + console.log( + `held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`, + ) + console.log( + v.promoted + ? '\n>>> The search taught the agent a strategy that fixes MORE on tasks it never trained on, beyond luck. Self-improvement CERTIFIED.' 
+ : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).', + ) +} + +if (import.meta.url === `file://${process.argv[1]}`) + main().catch((e) => { + console.error(e instanceof Error ? (e.stack ?? e.message) : String(e)) + process.exit(1) + })