From d4f535148b4e3c9c9e8485a2293977ff499d8049 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 27 Jun 2026 17:34:43 -0600
Subject: [PATCH 1/6] =?UTF-8?q?feat(examples):=20self-improving-coder=20?=
 =?UTF-8?q?=E2=80=94=20the=20RSI=20spine,=20composed=20cleanly,=20on=20a?=
 =?UTF-8?q?=20contamination-proof=20task?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pristine self-improvement loop with NOTHING hand-rolled: an AgentProfile-shaped worker over an
AgenticSurface (the task, real tools), gated by runStrategyEvolution — which authors strategies from
TRAIN losses then makes ONE promotion decision on a FRESH holdout slice via promotionGate. Adaptive
data analysis is structurally impossible (disjoint task offsets, holdout read once). The only new code
is the Environment: a contamination-proof generated coding task (constants derived per-seed, so no
model could have memorized it), graded by real pytest. $0 calibration self-check (reference->100%,
stub->0%) gates spend. The bundled task is deliberately simple — a capable model aces it, so the gate
correctly returns no-promotion; swap a harder Environment (or SWE-bench) for a discriminating run.
---
 .../self-improving-coder.ts                   | 286 ++++++++++++++++++
 1 file changed, 286 insertions(+)
 create mode 100644 examples/self-improving-coder/self-improving-coder.ts
diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts
new file mode 100644
index 00000000..736ed05b
--- /dev/null
+++ b/examples/self-improving-coder/self-improving-coder.ts
@@ -0,0 +1,286 @@
+/**
+ * Self-improving coder — the substrate's self-improvement spine, composed cleanly, on a
+ * CONTAMINATION-PROOF coding task. NOTHING here is hand-rolled: the genome is an `AgentProfile`-shaped
+ * worker, the task is an `AgenticSurface` (open/tools/call/score/close), and the held-out-gated
+ * flywheel is `runStrategyEvolution` — which authors candidate strategies from TRAIN losses, then
+ * makes ONE promotion decision on a FRESH holdout slice the search never touched (`promotionGate`,
+ * a seeded paired-bootstrap CI). Adaptive data analysis is structurally impossible: the holdout is
+ * disjoint by task offset and read exactly once.
+ *
+ * Why contamination-proof: each task is a small wire-protocol library whose constants (version,
+ * separators, checksum modulus, opcode) are DERIVED FROM THE SEED and specified ONLY by the test file.
+ * A frontier model cannot have memorized the fix — the exact contract is generated per task. Graded by
+ * REAL pytest (a deployable check, never an LLM judge).
+ *
+ * IMPORTANT — the bundled task is DELIBERATELY SIMPLE (a few functions fully pinned by their tests).
+ * A capable model aces it (every strategy scores 1.0), so the gate CORRECTLY returns no-promotion:
+ * you cannot demonstrate self-improvement where there is no headroom — and this harness refuses to
+ * pretend otherwise (calibrate-before-measure, enforced). To get a DISCRIMINATING run, swap in a task
+ * with a correctable middle band (algorithmically hard generated tasks, or a real benchmark below).
+ *
+ * To run frontier SWE-bench instead, swap `environment`/`tasks` for the SWE-bench `Environment`
+ * (bench/src/benchmarks/swe-bench.ts) — everything else is identical. (That arena is contamination-
+ * SUSPECT: its bugs are public GitHub fixes a model may have memorized — report it, never claim clean.)
+ *
+ * Run:  TANGLE_API_KEY=<router key>  pnpm tsx examples/self-improving-coder/self-improving-coder.ts
+ */
+import { execFileSync } from 'node:child_process'
+import { mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { createChatClient } from '@tangle-network/agent-eval'
+import {
+  type AgenticSurface,
+  type AgenticTask,
+  type AgenticTool,
+  type ArtifactHandle,
+  refine,
+  runStrategyEvolution,
+  sample,
+  type SurfaceScore,
+} from '@tangle-network/agent-runtime/loops'
+
+// ── The contamination-proof task generator (deterministic per seed) ──────────────
+/** A small wire-protocol library, fully specified by its tests, with seed-derived constants. The
+ *  agent must READ the tests to infer the exact contract — it cannot recall it. Returns the stub the
+ *  agent edits + the hidden-ish test file (the agent may read it; grading runs it). */
+function constsFor(seed: number): { VER: string; SEP: string; MOD: number } {
+  const h = Math.imul(seed, 2654435761) >>> 0 // Knuth hash via an exact 32-bit multiply; SEP/MOD below read different bits so they vary independently
+  return { VER: `v${((h % 900) + 100).toString(36)}`, SEP: ['-', '|', ':', '/', '#'][(h >>> 8) % 5]!, MOD: [97, 101, 103, 107, 109][(h >>> 16) % 5]! }
+}
+function genTask(seed: number): { stub: string; test: string; total: number } { // builds the lib.py stub + the pytest file for this seed
+  const { VER, SEP, MOD } = constsFor(seed)
+  const t = (id: number, text: string) => `${VER}${SEP}${id}${SEP}${text}` // expected wire token: VER, id, text joined by SEP
+  const tests = [ // contents of test_lib.py; expected values are baked in from the seed's constants
+    'import pytest',
+    'from lib import encode, decode, checksum, valid',
+    '',
+    `def test_encode(): assert encode(3, "hi") == ${JSON.stringify(t(3, 'hi'))}`,
+    `def test_encode_zero(): assert encode(0, "") == ${JSON.stringify(t(0, ''))}`,
+    `def test_decode(): assert decode(${JSON.stringify(t(9, 'ab'))}) == {"id": 9, "text": "ab"}`,
+    'def test_roundtrip(): assert decode(encode(42, "yo")) == {"id": 42, "text": "yo"}',
+    `def test_checksum(): assert checksum("abc") == sum(b for b in b"abc") % ${MOD}`,
+    `def test_checksum_empty(): assert checksum("") == 0`,
+    `def test_valid_true(): assert valid(${JSON.stringify(t(1, 'x'))}) is True`,
+    `def test_valid_bad_version(): assert valid("zz${SEP}1${SEP}x") is False`,
+    `def test_valid_bad_shape(): assert valid("not a token") is False`,
+    '',
+  ].join('\n')
+  const stub = [ // initial lib.py: every function raises NotImplementedError
+    '# Implement these so test_lib.py passes. Infer the exact format from the tests.',
+    'def encode(id, text):',
+    '    raise NotImplementedError',
+    'def decode(s):',
+    '    raise NotImplementedError',
+    'def checksum(text):',
+    '    raise NotImplementedError',
+    'def valid(s):',
+    '    raise NotImplementedError',
+    '',
+  ].join('\n')
+  return { stub, test: tests, total: 9 } // 9 = number of test_ functions listed in `tests` above
+}
+
+// ── The Environment (AgenticSurface) — host pytest, no Docker. (Docker is a swap for untrusted code.) ──
+interface Ws {
+  dir: string // temp workspace directory (open() also uses it as the handle id)
+  total: number // test count reported by genTask for this task
+}
+const workspaces = new Map<string, Ws>() // open workspaces, keyed by handle id
+
+function pytestPassed(dir: string): { passed: number; total: number } { // run test_lib.py in `dir` and parse pytest's summary counts
+  let out = ''
+  try {
+    out = execFileSync('python3', ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'], {
+      cwd: dir,
+      encoding: 'utf8',
+      timeout: 60_000,
+      stdio: ['ignore', 'pipe', 'pipe'],
+    })
+  } catch (e) {
+    out = (e as { stdout?: string }).stdout ?? '' // execFileSync threw: fall back to whatever stdout the error carries
+  }
+  const passed = Number(out.match(/(\d+) passed/)?.[1] ?? 0)
+  const failed = Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0) // failures and errors both count as not-passed
+  return { passed, total: passed + failed } // total is 0 when no summary counts were found in the output
+}
+
+const codingEnv: AgenticSurface = { // one temp dir per task: lib.py (agent-writable) + test_lib.py (read-only)
+  name: 'generated-coding',
+  async open(task) {
+    const seed = Number((task.meta as { seed?: number })?.seed ?? 0)
+    const { stub, test, total } = genTask(seed)
+    const dir = mkdtempSync(join(tmpdir(), 'sic-'))
+    writeFileSync(join(dir, 'lib.py'), stub)
+    writeFileSync(join(dir, 'test_lib.py'), test)
+    const handle: ArtifactHandle = { id: dir, surface: 'generated-coding' }
+    workspaces.set(dir, { dir, total })
+    return handle
+  },
+  async tools() {
+    return [
+      { type: 'function', function: { name: 'list_files', description: 'List the files in the workspace.', parameters: { type: 'object', properties: {} } } },
+      { type: 'function', function: { name: 'read_file', description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
+      { type: 'function', function: { name: 'write_file', description: 'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.', parameters: { type: 'object', properties: { path: { type: 'string' }, content: { type: 'string' } }, required: ['path', 'content'] } } },
+      // NO run_tests: the agent cannot iterate-until-green. It must implement correctly from READING the
+      // tests — which creates real headroom and makes the STRATEGY (planning, multiple attempts) matter.
+    ] satisfies AgenticTool[]
+  },
+  async call(handle, name, args) {
+    const ws = workspaces.get(handle.id)
+    if (!ws) return 'ERROR: workspace closed'
+    if (name === 'list_files') return readdirSync(ws.dir).join('\n')
+    if (name === 'read_file') {
+      const rel = String(args.path ?? '')
+      if (rel.includes('..')) return 'ERROR: invalid path' // jail reads to the workspace (no `..` escape to host files)
+      try {
+        return readFileSync(join(ws.dir, rel), 'utf8').slice(0, 8000)
+      } catch (e) {
+        return `ERROR: ${(e as Error).message}`
+      }
+    }
+    if (name === 'write_file') {
+      // Exact match (modulo a leading ./): a suffix check would also accept test_lib.py and then overwrite lib.py.
+      if (String(args.path ?? '').replace(/^\.\//, '') !== 'lib.py') return 'ERROR: only lib.py is writable'
+      try {
+        mkdirSync(ws.dir, { recursive: true })
+        writeFileSync(join(ws.dir, 'lib.py'), String(args.content ?? ''))
+        return 'wrote lib.py'
+      } catch (e) {
+        return `ERROR: ${(e as Error).message}`
+      }
+    }
+    // run_tests is intentionally unhandled (it is not offered by tools()): it falls through to the error below.
+    return `ERROR: unknown tool ${name}`
+  },
+  async score(_task, handle): Promise<SurfaceScore> {
+    const ws = workspaces.get(handle.id)
+    if (!ws) return { passes: 0, total: 0, errored: 1 }
+    const { passed, total } = pytestPassed(ws.dir)
+    // pytest may report fewer results than the task has tests (e.g. one collection error); keep the real denominator.
+    return total > 0 ? { passes: passed, total: Math.max(total, ws.total), errored: 0 } : { passes: 0, total: ws.total, errored: 1 }
+  },
+  async close(handle) {
+    const ws = workspaces.get(handle.id)
+    if (!ws) return
+    workspaces.delete(handle.id)
+    rmSync(ws.dir, { recursive: true, force: true })
+  },
+}
+
+// ── The disjoint task supplier (train [0,trainN); holdout drawn past it) ──────────
+const tasks = async (offset: number, n: number): Promise<AgenticTask[]> => // n tasks with seeds offset … offset+n-1
+  Array.from({ length: n }, (_, i) => {
+    const seed = offset + i // the seed is the task's identity: it names the task and selects its constants
+    return {
+      id: `gen-${seed}`,
+      systemPrompt:
+        'You are a Python engineer. The library lib.py has stub functions; its exact contract is defined ONLY by ' +
+        'test_lib.py. You CANNOT run the tests — read test_lib.py CAREFULLY (every assertion, every edge case) and ' +
+        'implement lib.py correctly in one pass with write_file. Get the edge cases right (empty inputs, malformed ' +
+        'inputs, exact formats). Do not edit test_lib.py.',
+      userPrompt: 'Read test_lib.py to learn the exact contract, then write a correct lib.py. You cannot run the tests — reason carefully.',
+      meta: { seed }, // read back by codingEnv.open to regenerate the task files
+    } satisfies AgenticTask
+  })
+
+/** The correct lib.py for a seed — used ONLY by the $0 calibration self-check (never by the agent). */
+function referenceLib(seed: number): string { // returns Python source text for a passing lib.py
+  const { VER, SEP, MOD } = constsFor(seed) // the same constants genTask bakes into the tests for this seed
+  return [
+    `VER, SEP, MOD = ${JSON.stringify(VER)}, ${JSON.stringify(SEP)}, ${MOD}`, // JSON string literals are emitted straight into the Python source
+    'def encode(id, text): return f"{VER}{SEP}{id}{SEP}{text}"',
+    'def decode(s):',
+    '    v, i, t = s.split(SEP, 2)', // at most two splits, so the text part keeps any SEP it contains
+    '    return {"id": int(i), "text": t}',
+    'def checksum(text): return sum(text.encode()) % MOD if text else 0',
+    'def valid(s):',
+    '    p = s.split(SEP)',
+    '    return len(p) == 3 and p[0] == VER and p[1].isdigit()',
+    '',
+  ].join('\n')
+}
+
+/** calibrate-before-measure: prove the task is SOLVABLE (reference → all pass) and the grader
+ *  DISCRIMINATES (stub → 0). $0, no router. A reference that doesn't clear means the task/grader is
+ *  broken — fix it before spending. */
+async function calibrate(): Promise<void> {
+  console.log('═══ CALIBRATION ($0) — task solvable + grader discriminates? ═══')
+  let ok = true
+  for (const seed of [0, 1, 2, 7, 11]) { // fixed sample of seeds
+    const task = (await tasks(seed, 1))[0]!
+    const h = await codingEnv.open(task)
+    const stub = await codingEnv.score(task, h) // score the untouched stub first
+    // write the reference, re-score
+    await codingEnv.call(h, 'write_file', { path: 'lib.py', content: referenceLib(seed) })
+    const ref = await codingEnv.score(task, h)
+    await codingEnv.close(h)
+    const pass = ref.passes === ref.total && ref.total > 0 && stub.passes === 0 // reference passes everything, stub passes nothing
+    ok &&= pass
+    console.log(`  seed ${seed}: stub ${stub.passes}/${stub.total}  →  reference ${ref.passes}/${ref.total}  ${pass ? '✓' : '✗ BROKEN'}`)
+  }
+  console.log(ok ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.' : '\n>>> BROKEN — fix the task/grader before spending.')
+  if (!ok) process.exit(1) // non-zero exit when any seed failed the check
+}
+
+async function main(): Promise<void> { // CALIBRATE set → $0 self-check only; otherwise run the evolution loop and print the verdict
+  if (process.env.CALIBRATE) return calibrate()
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)')
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const workerModel = process.env.WORKER_MODEL ?? 'deepseek-v4-flash'
+  // The author WRITES strategy code (a `defineStrategy` module) — it needs a strong coder + a token
+  // budget (thinking models return empty content without one) + a fallback. deepseek-flash can't.
+  const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro'
+
+  // The author writes candidate-strategy .mts files into outDir, then dynamically imports them — they
+  // `import '@tangle-network/agent-runtime/loops'`, which only resolves UNDER the package (self-reference).
+  // A /tmp outDir would fail to resolve it; keep it under the project root.
+  const outDir = mkdtempSync(join(process.cwd(), '.sic-run-'))
+  const report = await runStrategyEvolution({
+    environment: codingEnv,
+    tasks,
+    trainN: Number(process.env.TRAIN_N ?? 8),
+    holdoutN: Number(process.env.HOLDOUT_N ?? 12),
+    worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 8), maxTokens: 4000 },
+    author: {
+      chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
+      model: authorModel,
+      maxTokens: 8000,
+      fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+    },
+    baselines: [sample, refine],
+    budget: Number(process.env.BUDGET ?? 3),
+    generations: Number(process.env.GENERATIONS ?? 2),
+    populationSize: Number(process.env.POP ?? 2),
+    outDir,
+    // Remove the run directory even when the evolution throws, so failed runs do not litter the project root.
+  }).finally(() => rmSync(outDir, { recursive: true, force: true }))
+
+  const v = report.verdict
+  if (process.env.DUMP) {
+    // Autopsy: gen0 baseline scores (headroom) + every authored candidate's score/error (did they
+    // lose on a saturated task, or error at runtime?).
+    const r = report as unknown as Record<string, unknown>
+    const slim = (x: unknown) =>
+      JSON.stringify(x, (_k, val) => (typeof val === 'function' ? '[fn]' : val), 1)
+    console.log('--- gen0 ---', slim(r.gen0 ?? r.gen0Champion))
+    console.log('--- generations ---', slim(r.generations)?.slice(0, 3000))
+  }
+  console.log('\n═══ SELF-IMPROVING CODER — certified on a FROZEN holdout (no adaptive reuse) ═══')
+  console.log(`worker=${workerModel}  author=${authorModel}`)
+  console.log(`gen0 champion:   ${report.gen0Champion.name}`)
+  console.log(`final champion:  ${report.finalChampion.name}`)
+  console.log(`PROMOTED:        ${v.promoted}  (${v.reason})`)
+  console.log(`held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`)
+  console.log(
+    v.promoted
+      ? '\n>>> The search taught the agent a strategy that fixes MORE on tasks it never trained on, beyond luck. Self-improvement CERTIFIED.'
+      : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).',
+  )
+}
+
+main().catch((e) => { // top-level failure: log it and exit non-zero
+  console.error(e)
+  process.exit(1)
+})

From 7f7d93dd24d877e1fb7d4eb1323ee81099339076 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 27 Jun 2026 20:19:51 -0600
Subject: [PATCH 2/6] =?UTF-8?q?feat(bench):=20SWE-bench=20Verified=20as=20?=
 =?UTF-8?q?an=20AgenticSurface=20=E2=80=94=20the=20proper,=20no-cheating?=
 =?UTF-8?q?=20frontier=20run?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

createSweBenchEnvironment: the agent clones the repo at base_commit, explores + makes SURGICAL
edits via tools (edit_file, source-only, test files path-jailed), and score() grades the git diff
with the official swebench Docker harness. The substrate drives the agentic loop (runAgentic /
runStrategyEvolution) — no hand-rolled tool-loop. Never sees the hidden tests or the gold patch.

swe-self-improve.mts wires it into runStrategyEvolution with a disjoint train/holdout split (the
substrate enforces freeze + one holdout decision — no adaptive reuse). CALIBRATE mode runs the
baseline on a few bugs first (cost gate). CONTAMINATION CAVEAT documented: public fixes may be
memorized; report it, never claim a clean frontier number from this arena alone.
---
 bench/src/swe-bench-env.ts | 173 +++++++++++++++++++++++++++++++++++++
 bench/swe-self-improve.mts |  79 +++++++++++++++++
 2 files changed, 252 insertions(+)
 create mode 100644 bench/src/swe-bench-env.ts
 create mode 100644 bench/swe-self-improve.mts

diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
new file mode 100644
index 00000000..44961efc
--- /dev/null
+++ b/bench/src/swe-bench-env.ts
@@ -0,0 +1,173 @@
+/**
+ * SWE-bench Verified as an `AgenticSurface` — the PROPER, no-cheating way to run a coding agent on real
+ * GitHub bugs through the substrate (`runAgentic`/`runBenchmark`/`runStrategyEvolution` drive the loop;
+ * we only provide tools + a deployable score). The agent clones the repo at base_commit, explores +
+ * edits SOURCE via tools (never tests — path-jailed), and `score()` grades the resulting `git diff`
+ * with the OFFICIAL swebench Docker harness (apply patch → FAIL_TO_PASS + PASS_TO_PASS → resolved).
+ *
+ * No cheating by construction: the agent never sees the hidden tests or the gold patch (the adapter's
+ * prompt is the issue only); `edit_file` refuses test files; the score is a real test run, not a judge.
+ *
+ * CONTAMINATION CAVEAT: SWE-bench bugs are public GitHub fixes a frontier model may have MEMORIZED.
+ * A clean train→holdout split (disjoint instances) rules out adaptive-reuse, but NOT training-data
+ * memorization. Always report this; never claim a "clean" frontier number from this arena alone.
+ */
+import { execFile } from 'node:child_process'
+import { existsSync, mkdtempSync, readdirSync, readFileSync, rmSync, statSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { promisify } from 'node:util'
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchAdapter } from './benchmarks/swe-bench'
+import type { BenchTask } from './benchmarks/types'
+
+const exec = promisify(execFile)
+const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /(^|\/)test_.*\.py$|_test\.py$|(^|\/)conftest\.py$/.test(p) // `test_`/`conftest` must start a path segment, so e.g. latest_utils.py stays editable
+
+interface Ws {
+  dir: string // temp dir holding the cloned repo (open() also uses it as the handle id)
+  task: BenchTask // the pool entry this workspace was opened for; passed to adapter.judge when scoring
+}
+const workspaces = new Map<string, Ws>() // open workspaces, keyed by handle id
+
+/** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The
+ *  supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout
+ *  [trainN+off,…) never overlap. Verified is loaded once; instances carry their repo/base_commit. */
+export async function createSweBenchEnvironment(poolN = 80): Promise<{
+  environment: AgenticSurface
+  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
+  adapter: ReturnType<typeof createSweBenchAdapter>
+}> {
+  const adapter = createSweBenchAdapter()
+  const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
+  const byId = new Map(pool.map((t) => [t.id, t]))
+
+  const environment: AgenticSurface = {
+    name: 'swe-bench-verified',
+    async open(task) {
+      const bt = byId.get(task.id)
+      if (!bt) throw new Error(`swe-bench-env: unknown task ${task.id}`)
+      const md = bt.metadata as Record<string, string>
+      const dir = mkdtempSync(join(tmpdir(), 'swe-'))
+      try {
+        await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 })
+        await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 })
+      } catch (e) {
+        rmSync(dir, { recursive: true, force: true }) // a failed clone/checkout must not leak its temp dir
+        throw e
+      }
+      const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' }
+      workspaces.set(dir, { dir, task: bt })
+      return handle
+    },
+    async tools() {
+      return [
+        { type: 'function', function: { name: 'list_files', description: 'List source files under a repo subdirectory (recursive, bounded). "" = repo root.', parameters: { type: 'object', properties: { dir: { type: 'string' } }, required: ['dir'] } } },
+        { type: 'function', function: { name: 'read_file', description: 'Read a repo file by path.', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
+        { type: 'function', function: { name: 'edit_file', description: 'Surgical fix: replace the EXACT old_string (must occur once — copy whitespace precisely) with new_string in a SOURCE file. Minimal changes, never whole-file rewrites. Test files are rejected.', parameters: { type: 'object', properties: { path: { type: 'string' }, old_string: { type: 'string' }, new_string: { type: 'string' } }, required: ['path', 'old_string', 'new_string'] } } },
+      ] satisfies AgenticTool[]
+    },
+    async call(handle, name, args) {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return 'ERROR: workspace closed'
+      const safe = (p: string): string | null => {
+        const n = p.replace(/^\.?\//, '')
+        return n.includes('..') || n.startsWith('/') ? null : n
+      }
+      if (name === 'list_files') {
+        const sub = safe(String(args.dir ?? '')) ?? ''
+        const root = join(ws.dir, sub)
+        if (!existsSync(root)) return `(no such path: ${sub})`
+        const out: string[] = []
+        const walk = (d: string, depth: number) => {
+          if (depth > 2 || out.length > 240) return
+          let entries: string[] = []
+          try {
+            entries = readdirSync(d)
+          } catch {
+            return
+          }
+          for (const e of entries) {
+            if (e.startsWith('.') || e === 'node_modules' || e === '__pycache__') continue
+            const p = join(d, e)
+            let isDir = false
+            try {
+              isDir = statSync(p).isDirectory()
+            } catch {
+              continue
+            }
+            out.push(p.slice(ws.dir.length + 1) + (isDir ? '/' : ''))
+            if (isDir) walk(p, depth + 1)
+          }
+        }
+        walk(root, 0)
+        return out.slice(0, 240).join('\n') || '(empty)'
+      }
+      if (name === 'read_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        try {
+          const c = readFileSync(join(ws.dir, p), 'utf8')
+          return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c
+        } catch (e) {
+          return `(error: ${(e as Error).message})`
+        }
+      }
+      if (name === 'edit_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
+        const oldStr = String(args.old_string ?? '')
+        const newStr = String(args.new_string ?? '')
+        let content: string
+        try {
+          content = readFileSync(join(ws.dir, p), 'utf8')
+        } catch (e) {
+          return `(cannot read ${p}: ${(e as Error).message})`
+        }
+        if (!oldStr) return 'ERROR: old_string is empty.'
+        const count = content.split(oldStr).length - 1
+        if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
+        if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
+        writeFileSync(join(ws.dir, p), content.replace(oldStr, () => newStr)) // function replacer: `$&`, `$'`, `$1` in new_string stay literal
+        return `edited ${p}: replaced 1 occurrence`
+      }
+      return `ERROR: unknown tool ${name}`
+    },
+    async score(_task, handle): Promise<SurfaceScore> {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return { passes: 0, total: 1, errored: 1 }
+      try {
+        // A failing `git diff` is an infrastructure error, not an empty patch: it lands in the catch as errored.
+        const { stdout: patch } = await exec('git', ['-C', ws.dir, 'diff'], { maxBuffer: 20_000_000, timeout: 60_000 })
+        if (!patch.trim()) return { passes: 0, total: 1, errored: 0 }
+        const s = await adapter.judge(ws.task, patch)
+        return { passes: s.resolved ? 1 : 0, total: 1, errored: 0 }
+      } catch {
+        return { passes: 0, total: 1, errored: 1 }
+      }
+    },
+    async close(handle) {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return
+      workspaces.delete(handle.id)
+      rmSync(ws.dir, { recursive: true, force: true })
+    },
+  }
+
+  const tasks = async (offset: number, n: number): Promise<AgenticTask[]> => {
+    const slice = pool.slice(offset, offset + n)
+    if (slice.length < n) throw new Error(`swe-bench-env: pool exhausted at offset ${offset} (need ${n}, have ${slice.length}; raise poolN)`)
+    return slice.map((bt) => ({
+      id: bt.id,
+      systemPrompt:
+        'You are a senior engineer fixing a real bug in the checked-out repository. Use list_files + read_file to ' +
+        'locate and fully read the relevant source, diagnose the root cause from the issue, then fix it with edit_file — ' +
+        'a MINIMAL surgical change (a few lines, like a real PR), source only (test files are rejected). Do not rewrite whole files.',
+      userPrompt: bt.prompt,
+      meta: { instanceId: bt.id },
+    }))
+  }
+
+  return { environment, tasks, adapter }
+}
diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts
new file mode 100644
index 00000000..0e857c43
--- /dev/null
+++ b/bench/swe-self-improve.mts
@@ -0,0 +1,79 @@
+/**
+ * SWE-bench self-improvement — the PROPER, no-cheating run: a frontier worker over the SWE-bench
+ * `Environment`, with `runStrategyEvolution` enforcing the train→freeze→holdout split (the substrate
+ * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
+ * applies (public fixes may be memorized) — reported, never claimed clean.
+ *
+ *   CALIBRATE first (cost gate):  TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
+ *   Full run:                     TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
+ */
+import { mkdtempSync, rmSync } from 'node:fs'
+import { join } from 'node:path'
+import { createChatClient } from '@tangle-network/agent-eval'
+import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchEnvironment } from './src/swe-bench-env'
+
+async function main(): Promise<void> { // CALIBRATE set → baseline-only cost gate; otherwise run the evolution loop and print the verdict
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('TANGLE_API_KEY required (worker + author call the router)')
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const workerModel = process.env.WORKER_MODEL ?? 'gemini-2.5-pro'
+  const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro'
+  const innerTurns = Number(process.env.INNER_TURNS ?? 40)
+  const { environment, tasks } = await createSweBenchEnvironment(Number(process.env.POOL_N ?? 80))
+
+  if (process.env.CALIBRATE) {
+    const n = Number(process.env.N ?? 3)
+    const ts = await tasks(0, n)
+    console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`)
+    let resolved = 0
+    for (const t of ts) {
+      const t0 = Date.now()
+      const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, innerTurns, budget: 1 })
+      if (r.resolved) resolved++
+      console.log(`  ${t.id.padEnd(32)} resolved=${r.resolved} (${Math.round((Date.now() - t0) / 1000)}s)`)
+    }
+    const band = resolved > 0 && resolved < n
+    console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`)
+    return
+  }
+
+  const outDir = mkdtempSync(join(process.cwd(), '.swe-run-'))
+  const report = await runStrategyEvolution({
+    environment,
+    tasks,
+    trainN: Number(process.env.TRAIN_N ?? 6),
+    holdoutN: Number(process.env.HOLDOUT_N ?? 8),
+    worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns },
+    author: {
+      chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
+      model: authorModel,
+      maxTokens: 8000,
+      fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+    },
+    baselines: [sample, refine],
+    budget: Number(process.env.BUDGET ?? 2),
+    generations: Number(process.env.GENERATIONS ?? 2),
+    populationSize: Number(process.env.POP ?? 2),
+    outDir,
+    // Remove the run directory even when the evolution throws, so failed runs do not litter the project root.
+  }).finally(() => rmSync(outDir, { recursive: true, force: true }))
+
+  const v = report.verdict
+  console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══')
+  console.log(`worker=${workerModel}  author=${authorModel}`)
+  console.log(`gen0 champion:   ${report.gen0Champion.name}`)
+  console.log(`final champion:  ${report.finalChampion.name}`)
+  console.log(`PROMOTED:        ${v.promoted}  (${v.reason})`)
+  console.log(`held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`)
+  console.log(
+    v.promoted
+      ? '\n>>> The search taught the agent a strategy that resolves MORE real bugs it never trained on, beyond luck. (Report the contamination caveat: public fixes may be memorized.)'
+      : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).',
+  )
+}
+
+main().catch((e) => { // top-level failure: log it and exit non-zero
+  console.error(e)
+  process.exit(1)
+})

From ed844764b1849ee0b8ebf93233f1d8e669ad4170 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 27 Jun 2026 20:25:12 -0600
Subject: [PATCH 3/6] fix(bench): SWE-bench worker needs maxTokens (thinking
 model) + a persist-and-edit prompt

Calibration showed gemini-2.5-pro returning empty (no tool calls) without a maxTokens cap, then stopping
after ~3 turns without editing. Set worker maxTokens=8000 and a prompt that forces broad exploration +
at least one edit_file attempt. Log completions/shots in CALIBRATE mode for headroom diagnosis.
---
 bench/src/swe-bench-env.ts | 9 ++++++---
 bench/swe-self-improve.mts | 6 +++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
index 44961efc..98b48909 100644
--- a/bench/src/swe-bench-env.ts
+++ b/bench/src/swe-bench-env.ts
@@ -161,9 +161,12 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
     return slice.map((bt) => ({
       id: bt.id,
       systemPrompt:
-        'You are a senior engineer fixing a real bug in the checked-out repository. Use list_files + read_file to ' +
-        'locate and fully read the relevant source, diagnose the root cause from the issue, then fix it with edit_file — ' +
-        'a MINIMAL surgical change (a few lines, like a real PR), source only (test files are rejected). Do not rewrite whole files.',
+        'You are a senior engineer fixing a real bug in the checked-out repository. Work PERSISTENTLY and do not ' +
+        'stop early: use list_files + read_file to explore BROADLY (read many candidate files — the bug is rarely in ' +
+        'the first file you open), trace the issue to its root cause, then fix it with edit_file. You MUST make at ' +
+        'least one edit_file call — never finish with prose alone or without attempting a fix. Make a MINIMAL surgical ' +
+        'change (a few lines, like a real PR), source only (test files are rejected). If an edit_file fails (old_string ' +
+        'not unique/found), read the file again and retry with exact text. Keep going until you have made your best fix.',
       userPrompt: bt.prompt,
       meta: { instanceId: bt.id },
     }))
diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts
index 0e857c43..270a3f68 100644
--- a/bench/swe-self-improve.mts
+++ b/bench/swe-self-improve.mts
@@ -29,9 +29,9 @@ async function main(): Promise<void> {
     let resolved = 0
     for (const t of ts) {
       const t0 = Date.now()
-      const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, innerTurns, budget: 1 })
+      const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns, budget: 1 })
       if (r.resolved) resolved++
-      console.log(`  ${t.id.padEnd(32)} resolved=${r.resolved} (${Math.round((Date.now() - t0) / 1000)}s)`)
+      console.log(`  ${t.id.padEnd(32)} resolved=${r.resolved} completions=${r.completions} shots=${r.shots} (${Math.round((Date.now() - t0) / 1000)}s)`)
     }
     const band = resolved > 0 && resolved < n
     console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`)
@@ -44,7 +44,7 @@ async function main(): Promise<void> {
     tasks,
     trainN: Number(process.env.TRAIN_N ?? 6),
     holdoutN: Number(process.env.HOLDOUT_N ?? 8),
-    worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns },
+    worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns },
     author: {
       chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
       model: authorModel,

From fb6f682a712b0badfde7b68669a41443477cead9 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 27 Jun 2026 21:31:25 -0600
Subject: [PATCH 4/6] feat(examples): ablation knob-board + cost-aware
 one-knob-delta runner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The instrument for 'what actually helps': a configurable agent where each self-improvement technique is
a knob (topology/trace-analysis/steering/GEPA-skillopt/persistent-artifact), swept one-knob-at-a-time
(O(N) not 2^N) at equal compute, with a full autopsy — resolve AND token/$/latency per arm — so we see
what helps vs what just burns tokens. WIRED: topology (refine/sample/sampleThenRefine) + budget. The
rest are DECLARED knobs that FAIL LOUD if set (no silent no-op — names the substrate primitive to wire).
Exports codingEnv/codingTasks from self-improving-coder (guarded main) for the cheap validation fixture.
---
 examples/ablation-suite/ablation.ts           | 199 ++++++++++++++++++
 .../self-improving-coder.ts                   |  11 +-
 2 files changed, 205 insertions(+), 5 deletions(-)
 create mode 100644 examples/ablation-suite/ablation.ts

diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts
new file mode 100644
index 00000000..46a3c0e9
--- /dev/null
+++ b/examples/ablation-suite/ablation.ts
@@ -0,0 +1,199 @@
+/**
+ * ablation — the cost-aware knob-board + one-knob-delta runner for agent self-improvement techniques.
+ *
+ * THE VISION: a single configurable agent where every technique is a knob (topology, trace-analysis,
+ * steering, GEPA/skill optimization, persistent artifacts), swept across arms at EQUAL COMPUTE, with a
+ * full autopsy — resolve rate AND token/$/latency cost per arm — so we see what really helps vs what
+ * just burns tokens. One-knob-delta design (baseline + each single knob flipped) keeps it O(N), not 2^N.
+ *
+ * STATUS — honest: the framework + the cost autopsy are real; knobs are wired incrementally. WIRED:
+ * `topology` (single/fanout/fanout-refine = refine/sample/sampleThenRefine) + `budget`. The rest are
+ * DECLARED knobs that FAIL LOUD if set (no silent no-op — you must not think GEPA ran when it didn't);
+ * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the
+ * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench.
+ */
+import {
+  type AgenticSurface,
+  type AgenticTask,
+  refine,
+  runAgentic,
+  sample,
+  sampleThenRefine,
+  type Strategy,
+} from '@tangle-network/agent-runtime/loops'
+import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder'
+
+export interface AblationKnobs {
+  /** WIRED → strategy: single=`refine` (iterate one artifact), fanout=`sample` (N parallel, pick best),
+   *  fanout-refine=`sampleThenRefine`. The coordination shape. */
+  topology: 'single' | 'fanout' | 'fanout-refine'
+  /** WIRED → equal-compute unit (refine: max shots; fanout: rollout width). */
+  budget: number
+  // ── DECLARED knobs — fail loud until wired (each over a named substrate primitive) ──
+  optimize?: 'off' | 'gepa' | 'skillOpt' // gepaProposer / skillOptProposer on TRAIN, frozen, then run
+  traceAnalysis?: 'off' | 'settle' | 'live' // analyzeOnSettle / watchTrace (agent-eval analysts)
+  halo?: boolean
+  steering?: boolean // trace finding → steer_worker (event-bus)
+  persistentArtifact?: boolean // multi-round persistent artifact (openSandboxRun resume)
+}
+
+const topologyStrategy: Record<AblationKnobs['topology'], Strategy> = {
+  single: refine,
+  fanout: sample,
+  'fanout-refine': sampleThenRefine,
+}
+
+/** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. */
+const unwiredKnobs: Array<{ k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }> = [
+  { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: 'gepaProposer/skillOptProposer + improve() on TRAIN, frozen' },
+  { k: 'traceAnalysis', isSet: (v) => !!v && v !== 'off', prim: 'analyzeOnSettle / watchTrace (agent-eval analysts)' },
+  { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' },
+  { k: 'steering', isSet: (v) => v === true, prim: 'event-bus finding → steer_worker' },
+  { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' },
+]
+
+export interface ArmResult {
+  name: string
+  knobs: AblationKnobs
+  n: number
+  resolve: number // mean resolved (0..1) on the held-out set
+  tokensIn: number
+  tokensOut: number
+  costUsd: number
+  latencyMs: number
+  shotsMean: number
+  completionsMean: number
+}
+
+export async function runAblation(opts: {
+  environment: AgenticSurface
+  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
+  holdoutOffset: number
+  holdoutN: number
+  base: AblationKnobs
+  /** Each delta = a ONE-KNOB change vs base (the one-knob-delta design). */
+  deltas: Array<{ name: string; knob: Partial<AblationKnobs> }>
+  worker: { routerBaseUrl: string; routerKey: string; model: string; maxTokens?: number; innerTurns?: number }
+  onArm?: (r: ArmResult) => void
+}): Promise<ArmResult[]> {
+  // ONE held-out set, shared across all arms — the fair-comparison invariant.
+  const tasks = await opts.tasks(opts.holdoutOffset, opts.holdoutN)
+  const arms = [
+    { name: 'baseline', knobs: opts.base },
+    ...opts.deltas.map((d) => ({ name: d.name, knobs: { ...opts.base, ...d.knob } as AblationKnobs })),
+  ]
+  const results: ArmResult[] = []
+  for (const arm of arms) {
+    for (const u of unwiredKnobs) {
+      if (u.isSet(arm.knobs[u.k]))
+        throw new Error(
+          `ablation: knob '${u.k}'=${JSON.stringify(arm.knobs[u.k])} (arm "${arm.name}") is DECLARED but not yet wired — wire it over ${u.prim} before claiming it ran. (No silent no-op.)`,
+        )
+    }
+    let resolved = 0
+    let ti = 0
+    let to = 0
+    let usd = 0
+    let ms = 0
+    let shots = 0
+    let comps = 0
+    for (const t of tasks) {
+      const r = await runAgentic({
+        surface: opts.environment,
+        task: t,
+        strategy: topologyStrategy[arm.knobs.topology],
+        budget: arm.knobs.budget,
+        routerBaseUrl: opts.worker.routerBaseUrl,
+        routerKey: opts.worker.routerKey,
+        model: opts.worker.model,
+        ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}),
+        ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}),
+      })
+      if (r.resolved) resolved++
+      ti += r.tokens.input
+      to += r.tokens.output
+      usd += r.usd
+      ms += r.ms
+      shots += r.shots
+      comps += r.completions
+    }
+    const n = tasks.length
+    const res: ArmResult = {
+      name: arm.name,
+      knobs: arm.knobs,
+      n,
+      resolve: resolved / n,
+      tokensIn: ti,
+      tokensOut: to,
+      costUsd: usd,
+      latencyMs: ms,
+      shotsMean: shots / n,
+      completionsMean: comps / n,
+    }
+    results.push(res)
+    opts.onArm?.(res)
+  }
+  return results
+}
+
+/** The cost-aware autopsy: per-arm resolve + tokens + $ + latency, and Δ vs baseline (lift AND cost). */
+export function printAutopsy(results: ArmResult[]): void {
+  const base = results[0]
+  const pad = (s: string, n: number) => s.padEnd(n)
+  console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`)
+  console.log(
+    pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('tok(in/out)', 16) + pad('$', 9) + pad('lat(s)', 9) + pad('shots', 7) + pad('Δresolve', 10) + 'Δ$',
+  )
+  for (const r of results) {
+    const dR = base ? r.resolve - base.resolve : 0
+    const dC = base ? r.costUsd - base.costUsd : 0
+    console.log(
+      pad(r.name, 16) +
+        pad(r.knobs.topology, 14) +
+        pad(`${(100 * r.resolve).toFixed(0)}%`, 9) +
+        pad(`${r.tokensIn}/${r.tokensOut}`, 16) +
+        pad(`$${r.costUsd.toFixed(4)}`, 9) +
+        pad((r.latencyMs / 1000).toFixed(0), 9) +
+        pad(r.shotsMean.toFixed(1), 7) +
+        pad(`${dR >= 0 ? '+' : ''}${(100 * dR).toFixed(0)}pp`, 10) +
+        `${dC >= 0 ? '+' : ''}$${dC.toFixed(4)}`,
+    )
+  }
+  console.log(
+    '\n>>> Read it cost-aware: a +resolve that costs +$$ may be worse than baseline. The whole point is to see what HELPS vs what just BURNS.',
+  )
+}
+
+async function main(): Promise<void> {
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('TANGLE_API_KEY required')
+  const worker = {
+    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+    routerKey,
+    model: process.env.WORKER_MODEL ?? 'deepseek-v4-flash',
+    maxTokens: 4000,
+    innerTurns: Number(process.env.INNER_TURNS ?? 6),
+  }
+  console.log(`═══ ABLATION (cheap contamination-proof task) — worker=${worker.model} ═══`)
+  const results = await runAblation({
+    environment: codingEnv,
+    tasks: codingTasks,
+    holdoutOffset: 100, // a fixed disjoint held-out slice
+    holdoutN: Number(process.env.HOLDOUT_N ?? 6),
+    base: { topology: 'single', budget: Number(process.env.BUDGET ?? 2) },
+    // one-knob-delta: flip ONLY topology (the wired knob) vs baseline.
+    deltas: [
+      { name: 'fanout', knob: { topology: 'fanout' } },
+      { name: 'fanout-refine', knob: { topology: 'fanout-refine' } },
+    ],
+    worker,
+    onArm: (r) => console.log(`  ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`),
+  })
+  printAutopsy(results)
+}
+
+if (import.meta.url === `file://${process.argv[1]}`)
+  main().catch((e) => {
+    console.error(e)
+    process.exit(1)
+  })
diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts
index 736ed05b..eaaa206d 100644
--- a/examples/self-improving-coder/self-improving-coder.ts
+++ b/examples/self-improving-coder/self-improving-coder.ts
@@ -105,7 +105,7 @@ function pytestPassed(dir: string): { passed: number; total: number } {
   return { passed, total: passed + failed }
 }
 
-const codingEnv: AgenticSurface = {
+export const codingEnv: AgenticSurface = {
   name: 'generated-coding',
   async open(task) {
     const seed = Number((task.meta as { seed?: number })?.seed ?? 0)
@@ -169,7 +169,7 @@ const codingEnv: AgenticSurface = {
 }
 
 // ── The disjoint task supplier (train [0,trainN); holdout drawn past it) ──────────
-const tasks = async (offset: number, n: number): Promise<AgenticTask[]> =>
+export const codingTasks = async (offset: number, n: number): Promise<AgenticTask[]> =>
   Array.from({ length: n }, (_, i) => {
     const seed = offset + i
     return {
@@ -208,7 +208,7 @@ async function calibrate(): Promise<void> {
   console.log('═══ CALIBRATION ($0) — task solvable + grader discriminates? ═══')
   let ok = true
   for (const seed of [0, 1, 2, 7, 11]) {
-    const task = (await tasks(seed, 1))[0]!
+    const task = (await codingTasks(seed, 1))[0]!
     const h = await codingEnv.open(task)
     const stub = await codingEnv.score(task, h)
     // write the reference, re-score
@@ -239,7 +239,7 @@ async function main(): Promise<void> {
   const outDir = mkdtempSync(join(process.cwd(), '.sic-run-'))
   const report = await runStrategyEvolution({
     environment: codingEnv,
-    tasks,
+    tasks: codingTasks,
     trainN: Number(process.env.TRAIN_N ?? 8),
     holdoutN: Number(process.env.HOLDOUT_N ?? 12),
     worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 8), maxTokens: 4000 },
@@ -280,7 +280,8 @@ async function main(): Promise<void> {
   )
 }
 
-main().catch((e) => {
+if (import.meta.url === `file://${process.argv[1]}`)
+  main().catch((e) => {
   console.error(e)
   process.exit(1)
 })

From bd127783c37cd478da3b34ccf85ef8c2a1d02f31 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 27 Jun 2026 22:15:02 -0600
Subject: [PATCH 5/6] feat(examples): ablation significance (paired bootstrap
 CI) + point steering knob at the driver-steers-worker loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds task-aligned per-task resolve vectors + pairedBootstrap 95% CI on every arm's Δresolve (✓ = CI
excludes 0 = a real difference, lift OR regression — read the sign) — no more bare point lifts. Reframes the rich knobs to the RIGHT primitives: the
steering knob is the supervise() driver-steers-worker loop (driver composes the next prompt from the
analyst's analyzeOnSettle finding — a driver brain in the loop, not the inline analyst-steerer); the
optimize knob is selfImprove() with an executable JudgeConfig optimizing the driver's compose-prompt
on TRAIN, frozen. Both fail loud until wired.
---
 examples/ablation-suite/ablation.ts | 41 ++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts
index 46a3c0e9..889c7e3a 100644
--- a/examples/ablation-suite/ablation.ts
+++ b/examples/ablation-suite/ablation.ts
@@ -12,6 +12,7 @@
  * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the
  * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench.
  */
+import { pairedBootstrap } from '@tangle-network/agent-eval'
 import {
   type AgenticSurface,
   type AgenticTask,
@@ -30,10 +31,15 @@ export interface AblationKnobs {
   /** WIRED → equal-compute unit (refine: max shots; fanout: rollout width). */
   budget: number
   // ── DECLARED knobs — fail loud until wired (each over a named substrate primitive) ──
-  optimize?: 'off' | 'gepa' | 'skillOpt' // gepaProposer / skillOptProposer on TRAIN, frozen, then run
-  traceAnalysis?: 'off' | 'settle' | 'live' // analyzeOnSettle / watchTrace (agent-eval analysts)
-  halo?: boolean
-  steering?: boolean // trace finding → steer_worker (event-bus)
+  /** The DRIVER-steers-WORKER loop: supervise() drives the worker, analyzeOnSettle fires the analyst on
+   *  each settled round → a `finding` the driver pulls and composes the next prompt from. (NOT the
+   *  refine analyst-steerer — that's the degenerate inline version; this is a driver brain in the loop.) */
+  driverSteer?: boolean // supervise(driverProfile,{backend,analyzeOnSettle}) + steer_agent
+  /** GEPA-optimize the DRIVER's compose-next-prompt system prompt on TRAIN (executable-graded via the
+   *  surface score), frozen, then run — selfImprove() with an executable JudgeConfig (NOT improve(): the
+   *  steerer prompt is not a profile field). */
+  optimize?: 'off' | 'gepa'
+  halo?: boolean // HALO analyst option
   persistentArtifact?: boolean // multi-round persistent artifact (openSandboxRun resume)
 }
 
@@ -45,10 +51,9 @@ const topologyStrategy: Record<AblationKnobs['topology'], Strategy> = {
 
 /** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. */
 const unwiredKnobs: Array<{ k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }> = [
-  { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: 'gepaProposer/skillOptProposer + improve() on TRAIN, frozen' },
-  { k: 'traceAnalysis', isSet: (v) => !!v && v !== 'off', prim: 'analyzeOnSettle / watchTrace (agent-eval analysts)' },
+  { k: 'driverSteer', isSet: (v) => v === true, prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding' },
+  { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen" },
   { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' },
-  { k: 'steering', isSet: (v) => v === true, prim: 'event-bus finding → steer_worker' },
   { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' },
 ]
 
@@ -63,6 +68,8 @@ export interface ArmResult {
   latencyMs: number
   shotsMean: number
   completionsMean: number
+  /** Per-task resolved (0/1), task-aligned across arms — the paired vector for significance. */
+  perTask: number[]
 }
 
 export async function runAblation(opts: {
@@ -97,6 +104,7 @@ export async function runAblation(opts: {
     let ms = 0
     let shots = 0
     let comps = 0
+    const perTask: number[] = []
     for (const t of tasks) {
       const r = await runAgentic({
         surface: opts.environment,
@@ -110,6 +118,7 @@ export async function runAblation(opts: {
         ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}),
       })
       if (r.resolved) resolved++
+      perTask.push(r.resolved ? 1 : 0)
       ti += r.tokens.input
       to += r.tokens.output
       usd += r.usd
@@ -129,6 +138,7 @@ export async function runAblation(opts: {
       latencyMs: ms,
       shotsMean: shots / n,
       completionsMean: comps / n,
+      perTask,
     }
     results.push(res)
     opts.onArm?.(res)
@@ -142,25 +152,30 @@ export function printAutopsy(results: ArmResult[]): void {
   const pad = (s: string, n: number) => s.padEnd(n)
   console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`)
   console.log(
-    pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('tok(in/out)', 16) + pad('$', 9) + pad('lat(s)', 9) + pad('shots', 7) + pad('Δresolve', 10) + 'Δ$',
+    pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('$', 9) + pad('lat(s)', 8) + pad('shots', 7) + pad('Δresolve [95% CI]', 24) + 'Δ$',
   )
   for (const r of results) {
-    const dR = base ? r.resolve - base.resolve : 0
     const dC = base ? r.costUsd - base.costUsd : 0
+    // Significance: paired bootstrap of this arm's per-task resolve vs baseline's (task-aligned).
+    let lift = '+0pp'
+    if (base && r !== base) {
+      const b = pairedBootstrap(base.perTask, r.perTask, { confidence: 0.95, statistic: 'mean' })
+      const sig = b.low > 0 || b.high < 0 ? '✓' : '·' // CI excludes 0 ⇒ real
+      lift = `${b.median >= 0 ? '+' : ''}${(100 * b.median).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}`
+    }
     console.log(
       pad(r.name, 16) +
         pad(r.knobs.topology, 14) +
         pad(`${(100 * r.resolve).toFixed(0)}%`, 9) +
-        pad(`${r.tokensIn}/${r.tokensOut}`, 16) +
         pad(`$${r.costUsd.toFixed(4)}`, 9) +
-        pad((r.latencyMs / 1000).toFixed(0), 9) +
+        pad((r.latencyMs / 1000).toFixed(0), 8) +
         pad(r.shotsMean.toFixed(1), 7) +
-        pad(`${dR >= 0 ? '+' : ''}${(100 * dR).toFixed(0)}pp`, 10) +
+        pad(lift, 24) +
         `${dC >= 0 ? '+' : ''}$${dC.toFixed(4)}`,
     )
   }
   console.log(
-    '\n>>> Read it cost-aware: a +resolve that costs +$$ may be worse than baseline. The whole point is to see what HELPS vs what just BURNS.',
+    '\n>>> Read it cost-aware: ✓ = CI excludes 0 (a real Δ — lift OR regression; check the sign). A +resolve that costs +$$ or is not ✓ may be worse than baseline. The point is to see what HELPS vs what just BURNS.',
   )
 }
 

From 6e747231dd5001e6b3d576d33be7f2b70b44c946 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Sat, 27 Jun 2026 23:18:02 -0600
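Subject: [PATCH 6/6] fix(examples): address self-improving coder review

Review follow-ups, among others: ignore the .sic-run-*/ and .swe-run-*/ scratch dirs; remove the temp
SWE-bench workspace when clone/checkout fails; reject absolute and '..' paths before stripping a leading
'./'; use lstatSync when walking the repo; require CALIBRATE=1 exactly; clean up outDir in a finally
block; report the paired-bootstrap mean (not median) in the ablation autopsy; print error stacks on
failure.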
---
 .gitignore                                    |   2 +
 bench/src/swe-bench-env.ts                    |  23 ++-
 bench/swe-self-improve.mts                    |  49 ++---
 examples/ablation-suite/ablation.ts           |  51 ++++--
 .../self-improving-coder.ts                   | 167 ++++++++++++------
 5 files changed, 200 insertions(+), 92 deletions(-)

diff --git a/.gitignore b/.gitignore
index d8334027..ed4bec97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ bench/scripts/__pycache__/
 # local rollout-corpus scratch (raw jsonl, per work-line)
 corpus/
 test_repo/
+.sic-run-*/
+.swe-run-*/
diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
index 98b48909..ccfb0931 100644
--- a/bench/src/swe-bench-env.ts
+++ b/bench/src/swe-bench-env.ts
@@ -13,7 +13,7 @@
  * memorization. Always report this; never claim a "clean" frontier number from this arena alone.
  */
 import { execFile } from 'node:child_process'
-import { existsSync, mkdtempSync, readdirSync, readFileSync, rmSync, statSync, writeFileSync } from 'node:fs'
+import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
 import { tmpdir } from 'node:os'
 import { join } from 'node:path'
 import { promisify } from 'node:util'
@@ -49,11 +49,16 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
       if (!bt) throw new Error(`swe-bench-env: unknown task ${task.id}`)
       const md = bt.metadata as Record<string, string>
       const dir = mkdtempSync(join(tmpdir(), 'swe-'))
-      await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 })
-      await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 })
-      const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' }
-      workspaces.set(dir, { dir, task: bt })
-      return handle
+      try {
+        await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 })
+        await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 })
+        const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' }
+        workspaces.set(dir, { dir, task: bt })
+        return handle
+      } catch (error) {
+        rmSync(dir, { recursive: true, force: true })
+        throw error
+      }
     },
     async tools() {
       return [
@@ -66,8 +71,8 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
       const ws = workspaces.get(handle.id)
       if (!ws) return 'ERROR: workspace closed'
       const safe = (p: string): string | null => {
-        const n = p.replace(/^\.?\//, '')
-        return n.includes('..') || n.startsWith('/') ? null : n
+        if (p.startsWith('/') || p.includes('..')) return null
+        return p.replace(/^\.\//, '')
       }
       if (name === 'list_files') {
         const sub = safe(String(args.dir ?? '')) ?? ''
@@ -87,7 +92,7 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
             const p = join(d, e)
             let isDir = false
             try {
-              isDir = statSync(p).isDirectory()
+              isDir = lstatSync(p).isDirectory()
             } catch {
               continue
             }
diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts
index 270a3f68..a1d72cff 100644
--- a/bench/swe-self-improve.mts
+++ b/bench/swe-self-improve.mts
@@ -22,7 +22,7 @@ async function main(): Promise<void> {
   const innerTurns = Number(process.env.INNER_TURNS ?? 40)
   const { environment, tasks } = await createSweBenchEnvironment(Number(process.env.POOL_N ?? 80))
 
-  if (process.env.CALIBRATE) {
+  if (process.env.CALIBRATE === '1') {
     const n = Number(process.env.N ?? 3)
     const ts = await tasks(0, n)
     console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`)
@@ -38,26 +38,31 @@ async function main(): Promise<void> {
     return
   }
 
-  const outDir = mkdtempSync(join(process.cwd(), '.swe-run-'))
-  const report = await runStrategyEvolution({
-    environment,
-    tasks,
-    trainN: Number(process.env.TRAIN_N ?? 6),
-    holdoutN: Number(process.env.HOLDOUT_N ?? 8),
-    worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns },
-    author: {
-      chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
-      model: authorModel,
-      maxTokens: 8000,
-      fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
-    },
-    baselines: [sample, refine],
-    budget: Number(process.env.BUDGET ?? 2),
-    generations: Number(process.env.GENERATIONS ?? 2),
-    populationSize: Number(process.env.POP ?? 2),
-    outDir,
-  })
-  rmSync(outDir, { recursive: true, force: true })
+  const report = await (async () => {
+    const outDir = mkdtempSync(join(process.cwd(), '.swe-run-'))
+    try {
+      return await runStrategyEvolution({
+        environment,
+        tasks,
+        trainN: Number(process.env.TRAIN_N ?? 6),
+        holdoutN: Number(process.env.HOLDOUT_N ?? 8),
+        worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns },
+        author: {
+          chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
+          model: authorModel,
+          maxTokens: 8000,
+          fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+        },
+        baselines: [sample, refine],
+        budget: Number(process.env.BUDGET ?? 2),
+        generations: Number(process.env.GENERATIONS ?? 2),
+        populationSize: Number(process.env.POP ?? 2),
+        outDir,
+      })
+    } finally {
+      rmSync(outDir, { recursive: true, force: true })
+    }
+  })()
 
   const v = report.verdict
   console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══')
@@ -74,6 +79,6 @@ async function main(): Promise<void> {
 }
 
 main().catch((e) => {
-  console.error(e)
+  console.error(e instanceof Error ? (e.stack ?? e.message) : String(e))
   process.exit(1)
 })
diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts
index 889c7e3a..9fe2837e 100644
--- a/examples/ablation-suite/ablation.ts
+++ b/examples/ablation-suite/ablation.ts
@@ -18,9 +18,9 @@ import {
   type AgenticTask,
   refine,
   runAgentic,
+  type Strategy,
   sample,
   sampleThenRefine,
-  type Strategy,
 } from '@tangle-network/agent-runtime/loops'
 import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder'
 
@@ -50,9 +50,21 @@ const topologyStrategy: Record<AblationKnobs['topology'], Strategy> = {
 }
 
 /** Fail loud on a set-but-unwired knob — the house rule (no silent no-op). Names the primitive to wire. */
-const unwiredKnobs: Array<{ k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }> = [
-  { k: 'driverSteer', isSet: (v) => v === true, prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding' },
-  { k: 'optimize', isSet: (v) => !!v && v !== 'off', prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen" },
+const unwiredKnobs: Array<{
+  k: keyof AblationKnobs
+  isSet: (v: unknown) => boolean
+  prim: string
+}> = [
+  {
+    k: 'driverSteer',
+    isSet: (v) => v === true,
+    prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding',
+  },
+  {
+    k: 'optimize',
+    isSet: (v) => !!v && v !== 'off',
+    prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen",
+  },
   { k: 'halo', isSet: (v) => v === true, prim: 'HALO analyst option' },
   { k: 'persistentArtifact', isSet: (v) => v === true, prim: 'openSandboxRun resume' },
 ]
@@ -80,14 +92,23 @@ export async function runAblation(opts: {
   base: AblationKnobs
   /** Each delta = a ONE-KNOB change vs base (the one-knob-delta design). */
   deltas: Array<{ name: string; knob: Partial<AblationKnobs> }>
-  worker: { routerBaseUrl: string; routerKey: string; model: string; maxTokens?: number; innerTurns?: number }
+  worker: {
+    routerBaseUrl: string
+    routerKey: string
+    model: string
+    maxTokens?: number
+    innerTurns?: number
+  }
   onArm?: (r: ArmResult) => void
 }): Promise<ArmResult[]> {
   // ONE held-out set, shared across all arms — the fair-comparison invariant.
   const tasks = await opts.tasks(opts.holdoutOffset, opts.holdoutN)
   const arms = [
     { name: 'baseline', knobs: opts.base },
-    ...opts.deltas.map((d) => ({ name: d.name, knobs: { ...opts.base, ...d.knob } as AblationKnobs })),
+    ...opts.deltas.map((d) => ({
+      name: d.name,
+      knobs: { ...opts.base, ...d.knob } as AblationKnobs,
+    })),
   ]
   const results: ArmResult[] = []
   for (const arm of arms) {
@@ -152,7 +173,14 @@ export function printAutopsy(results: ArmResult[]): void {
   const pad = (s: string, n: number) => s.padEnd(n)
   console.log(`\n═══ ABLATION AUTOPSY (n=${base?.n} held-out, one-knob-delta vs baseline) ═══`)
   console.log(
-    pad('arm', 16) + pad('topology', 14) + pad('resolve', 9) + pad('$', 9) + pad('lat(s)', 8) + pad('shots', 7) + pad('Δresolve [95% CI]', 24) + 'Δ$',
+    pad('arm', 16) +
+      pad('topology', 14) +
+      pad('resolve', 9) +
+      pad('$', 9) +
+      pad('lat(s)', 8) +
+      pad('shots', 7) +
+      pad('Δresolve [95% CI]', 24) +
+      'Δ$',
   )
   for (const r of results) {
     const dC = base ? r.costUsd - base.costUsd : 0
@@ -161,7 +189,7 @@ export function printAutopsy(results: ArmResult[]): void {
     if (base && r !== base) {
       const b = pairedBootstrap(base.perTask, r.perTask, { confidence: 0.95, statistic: 'mean' })
       const sig = b.low > 0 || b.high < 0 ? '✓' : '·' // CI excludes 0 ⇒ real
-      lift = `${b.median >= 0 ? '+' : ''}${(100 * b.median).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}`
+      lift = `${b.mean >= 0 ? '+' : ''}${(100 * b.mean).toFixed(0)}pp [${(100 * b.low).toFixed(0)},${(100 * b.high).toFixed(0)}] ${sig}`
     }
     console.log(
       pad(r.name, 16) +
@@ -202,13 +230,16 @@ async function main(): Promise<void> {
       { name: 'fanout-refine', knob: { topology: 'fanout-refine' } },
     ],
     worker,
-    onArm: (r) => console.log(`  ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`),
+    onArm: (r) =>
+      console.log(
+        `  ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`,
+      ),
   })
   printAutopsy(results)
 }
 
 if (import.meta.url === `file://${process.argv[1]}`)
   main().catch((e) => {
-    console.error(e)
+    console.error(e instanceof Error ? (e.stack ?? e.message) : String(e))
     process.exit(1)
   })
diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts
index eaaa206d..d799cffc 100644
--- a/examples/self-improving-coder/self-improving-coder.ts
+++ b/examples/self-improving-coder/self-improving-coder.ts
@@ -25,7 +25,7 @@
  * Run:  TANGLE_API_KEY=<router key>  pnpm tsx examples/self-improving-coder/self-improving-coder.ts
  */
 import { execFileSync } from 'node:child_process'
-import { mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
 import { tmpdir } from 'node:os'
 import { join } from 'node:path'
 import { createChatClient } from '@tangle-network/agent-eval'
@@ -36,8 +36,8 @@ import {
   type ArtifactHandle,
   refine,
   runStrategyEvolution,
-  sample,
   type SurfaceScore,
+  sample,
 } from '@tangle-network/agent-runtime/loops'
 
 // ── The contamination-proof task generator (deterministic per seed) ──────────────
@@ -46,7 +46,11 @@ import {
  *  agent edits + the hidden-ish test file (the agent may read it; grading runs it). */
 function constsFor(seed: number): { VER: string; SEP: string; MOD: number } {
   const r = (m: number) => ((seed * 2654435761) >>> 0) % m
-  return { VER: `v${(r(900) + 100).toString(36)}`, SEP: ['-', '|', ':', '/', '#'][r(5)]!, MOD: [97, 101, 103, 107, 109][r(5)]! }
+  return {
+    VER: `v${(r(900) + 100).toString(36)}`,
+    SEP: ['-', '|', ':', '/', '#'][r(5)]!,
+    MOD: [97, 101, 103, 107, 109][r(5)]!,
+  }
 }
 function genTask(seed: number): { stub: string; test: string; total: number } {
   const { VER, SEP, MOD } = constsFor(seed)
@@ -91,17 +95,22 @@ const workspaces = new Map<string, Ws>()
 function pytestPassed(dir: string): { passed: number; total: number } {
   let out = ''
   try {
-    out = execFileSync('python3', ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'], {
-      cwd: dir,
-      encoding: 'utf8',
-      timeout: 60_000,
-      stdio: ['ignore', 'pipe', 'pipe'],
-    })
+    out = execFileSync(
+      'python3',
+      ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py'],
+      {
+        cwd: dir,
+        encoding: 'utf8',
+        timeout: 60_000,
+        stdio: ['ignore', 'pipe', 'pipe'],
+      },
+    )
   } catch (e) {
     out = (e as { stdout?: string }).stdout ?? ''
   }
   const passed = Number(out.match(/(\d+) passed/)?.[1] ?? 0)
-  const failed = Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0)
+  const failed =
+    Number(out.match(/(\d+) failed/)?.[1] ?? 0) + Number(out.match(/(\d+) error/)?.[1] ?? 0)
   return { passed, total: passed + failed }
 }
 
@@ -119,9 +128,39 @@ export const codingEnv: AgenticSurface = {
   },
   async tools() {
     return [
-      { type: 'function', function: { name: 'list_files', description: 'List the files in the workspace.', parameters: { type: 'object', properties: {} } } },
-      { type: 'function', function: { name: 'read_file', description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
-      { type: 'function', function: { name: 'write_file', description: 'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.', parameters: { type: 'object', properties: { path: { type: 'string' }, content: { type: 'string' } }, required: ['path', 'content'] } } },
+      {
+        type: 'function',
+        function: {
+          name: 'list_files',
+          description: 'List the files in the workspace.',
+          parameters: { type: 'object', properties: {} },
+        },
+      },
+      {
+        type: 'function',
+        function: {
+          name: 'read_file',
+          description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).',
+          parameters: {
+            type: 'object',
+            properties: { path: { type: 'string' } },
+            required: ['path'],
+          },
+        },
+      },
+      {
+        type: 'function',
+        function: {
+          name: 'write_file',
+          description:
+            'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.',
+          parameters: {
+            type: 'object',
+            properties: { path: { type: 'string' }, content: { type: 'string' } },
+            required: ['path', 'content'],
+          },
+        },
+      },
       // NO run_tests: the agent cannot iterate-until-green. It must implement correctly from READING the
       // tests — which creates real headroom and makes the STRATEGY (planning, multiple attempts) matter.
     ] satisfies AgenticTool[]
@@ -131,34 +170,34 @@ export const codingEnv: AgenticSurface = {
     if (!ws) return 'ERROR: workspace closed'
     if (name === 'list_files') return readdirSync(ws.dir).join('\n')
     if (name === 'read_file') {
+      const p = String(args.path ?? '')
+      if (p !== 'lib.py' && p !== 'test_lib.py')
+        return 'ERROR: only lib.py and test_lib.py are readable'
       try {
-        return readFileSync(join(ws.dir, String(args.path ?? '')), 'utf8').slice(0, 8000)
+        return readFileSync(join(ws.dir, p), 'utf8').slice(0, 8000)
       } catch (e) {
         return `ERROR: ${(e as Error).message}`
       }
     }
     if (name === 'write_file') {
       const p = String(args.path ?? '')
-      if (!p.endsWith('lib.py') || p.includes('..') || p.startsWith('/')) return 'ERROR: only lib.py is writable'
+      if (p !== 'lib.py') return 'ERROR: only lib.py is writable'
       try {
-        mkdirSync(ws.dir, { recursive: true })
         writeFileSync(join(ws.dir, 'lib.py'), String(args.content ?? ''))
         return 'wrote lib.py'
       } catch (e) {
         return `ERROR: ${(e as Error).message}`
       }
     }
-    if (name === 'run_tests') {
-      const { passed, total } = pytestPassed(ws.dir)
-      return `pytest: ${passed}/${total} passed`
-    }
     return `ERROR: unknown tool ${name}`
   },
   async score(_task, handle): Promise<SurfaceScore> {
     const ws = workspaces.get(handle.id)
     if (!ws) return { passes: 0, total: 0, errored: 1 }
     const { passed, total } = pytestPassed(ws.dir)
-    return total > 0 ? { passes: passed, total, errored: 0 } : { passes: 0, total: ws.total, errored: 1 }
+    return total > 0
+      ? { passes: passed, total, errored: 0 }
+      : { passes: 0, total: ws.total, errored: 1 }
   },
   async close(handle) {
     const ws = workspaces.get(handle.id)
@@ -179,7 +218,8 @@ export const codingTasks = async (offset: number, n: number): Promise<AgenticTas
         'test_lib.py. You CANNOT run the tests — read test_lib.py CAREFULLY (every assertion, every edge case) and ' +
         'implement lib.py correctly in one pass with write_file. Get the edge cases right (empty inputs, malformed ' +
         'inputs, exact formats). Do not edit test_lib.py.',
-      userPrompt: 'Read test_lib.py to learn the exact contract, then write a correct lib.py. You cannot run the tests — reason carefully.',
+      userPrompt:
+        'Read test_lib.py to learn the exact contract, then write a correct lib.py. You cannot run the tests — reason carefully.',
       meta: { seed },
     } satisfies AgenticTask
   })
@@ -217,16 +257,23 @@ async function calibrate(): Promise<void> {
     await codingEnv.close(h)
     const pass = ref.passes === ref.total && ref.total > 0 && stub.passes === 0
     ok &&= pass
-    console.log(`  seed ${seed}: stub ${stub.passes}/${stub.total}  →  reference ${ref.passes}/${ref.total}  ${pass ? '✓' : '✗ BROKEN'}`)
+    console.log(
+      `  seed ${seed}: stub ${stub.passes}/${stub.total}  →  reference ${ref.passes}/${ref.total}  ${pass ? '✓' : '✗ BROKEN'}`,
+    )
   }
-  console.log(ok ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.' : '\n>>> BROKEN — fix the task/grader before spending.')
+  console.log(
+    ok
+      ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.'
+      : '\n>>> BROKEN — fix the task/grader before spending.',
+  )
   if (!ok) process.exit(1)
 }
 
 async function main(): Promise<void> {
-  if (process.env.CALIBRATE) return calibrate()
+  if (process.env.CALIBRATE === '1') return calibrate()
   const routerKey = process.env.TANGLE_API_KEY
-  if (!routerKey) throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)')
+  if (!routerKey)
+    throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)')
   const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
   const workerModel = process.env.WORKER_MODEL ?? 'deepseek-v4-flash'
   // The author WRITES strategy code (a `defineStrategy` module) — it needs a strong coder + a token
@@ -236,29 +283,45 @@ async function main(): Promise<void> {
   // The author writes candidate-strategy .mts files into outDir, then dynamically imports them — they
   // `import '@tangle-network/agent-runtime/loops'`, which only resolves UNDER the package (self-reference).
   // A /tmp outDir would fail to resolve it; keep it under the project root.
-  const outDir = mkdtempSync(join(process.cwd(), '.sic-run-'))
-  const report = await runStrategyEvolution({
-    environment: codingEnv,
-    tasks: codingTasks,
-    trainN: Number(process.env.TRAIN_N ?? 8),
-    holdoutN: Number(process.env.HOLDOUT_N ?? 12),
-    worker: { routerBaseUrl, routerKey, model: workerModel, innerTurns: Number(process.env.INNER_TURNS ?? 8), maxTokens: 4000 },
-    author: {
-      chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
-      model: authorModel,
-      maxTokens: 8000,
-      fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
-    },
-    baselines: [sample, refine],
-    budget: Number(process.env.BUDGET ?? 3),
-    generations: Number(process.env.GENERATIONS ?? 2),
-    populationSize: Number(process.env.POP ?? 2),
-    outDir,
-  })
-  rmSync(outDir, { recursive: true, force: true })
+  const report = await (async () => {
+    const outDir = mkdtempSync(join(process.cwd(), '.sic-run-'))
+    try {
+      return await runStrategyEvolution({
+        environment: codingEnv,
+        tasks: codingTasks,
+        trainN: Number(process.env.TRAIN_N ?? 8),
+        holdoutN: Number(process.env.HOLDOUT_N ?? 12),
+        worker: {
+          routerBaseUrl,
+          routerKey,
+          model: workerModel,
+          innerTurns: Number(process.env.INNER_TURNS ?? 8),
+          maxTokens: 4000,
+        },
+        author: {
+          chat: createChatClient({
+            transport: 'router',
+            baseUrl: routerBaseUrl,
+            apiKey: routerKey,
+            defaultModel: authorModel,
+          }),
+          model: authorModel,
+          maxTokens: 8000,
+          fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+        },
+        baselines: [sample, refine],
+        budget: Number(process.env.BUDGET ?? 3),
+        generations: Number(process.env.GENERATIONS ?? 2),
+        populationSize: Number(process.env.POP ?? 2),
+        outDir,
+      })
+    } finally {
+      rmSync(outDir, { recursive: true, force: true })
+    }
+  })()
 
   const v = report.verdict
-  if (process.env.DUMP) {
+  if (process.env.DUMP === '1') {
     // Autopsy: gen0 baseline scores (headroom) + every authored candidate's score/error (did they
     // lose on a saturated task, or error at runtime?).
     const r = report as unknown as Record<string, unknown>
@@ -272,7 +335,9 @@ async function main(): Promise<void> {
   console.log(`gen0 champion:   ${report.gen0Champion.name}`)
   console.log(`final champion:  ${report.finalChampion.name}`)
   console.log(`PROMOTED:        ${v.promoted}  (${v.reason})`)
-  console.log(`held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`)
+  console.log(
+    `held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`,
+  )
   console.log(
     v.promoted
       ? '\n>>> The search taught the agent a strategy that fixes MORE on tasks it never trained on, beyond luck. Self-improvement CERTIFIED.'
@@ -282,6 +347,6 @@ async function main(): Promise<void> {
 
 if (import.meta.url === `file://${process.argv[1]}`)
   main().catch((e) => {
-  console.error(e)
-  process.exit(1)
-})
+    console.error(e instanceof Error ? (e.stack ?? e.message) : String(e))
+    process.exit(1)
+  })