diff --git a/.gitignore b/.gitignore
index d8334027..ed4bec97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ bench/scripts/__pycache__/
 # local rollout-corpus scratch (raw jsonl, per work-line)
 corpus/
 test_repo/
+.sic-run-*/
+.swe-run-*/
diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
new file mode 100644
index 00000000..ccfb0931
--- /dev/null
+++ b/bench/src/swe-bench-env.ts
@@ -0,0 +1,181 @@
+/**
+ * SWE-bench Verified as an `AgenticSurface` — the PROPER, no-cheating way to run a coding agent on real
+ * GitHub bugs through the substrate (`runAgentic`/`runBenchmark`/`runStrategyEvolution` drive the loop;
+ * we only provide tools + a deployable score). The agent clones the repo at base_commit, explores +
+ * edits SOURCE via tools (never tests — path-jailed), and `score()` grades the resulting `git diff`
+ * with the OFFICIAL swebench Docker harness (apply patch → FAIL_TO_PASS + PASS_TO_PASS → resolved).
+ *
+ * No cheating by construction: the agent never sees the hidden tests or the gold patch (the adapter's
+ * prompt is the issue only); `edit_file` refuses test files; the score is a real test run, not a judge.
+ *
+ * CONTAMINATION CAVEAT: SWE-bench bugs are public GitHub fixes a frontier model may have MEMORIZED.
+ * A clean train→holdout split (disjoint instances) rules out adaptive-reuse, but NOT training-data
+ * memorization. Always report this; never claim a "clean" frontier number from this arena alone.
+ */
+import { execFile } from 'node:child_process'
+import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { promisify } from 'node:util'
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchAdapter } from './benchmarks/swe-bench'
+import type { BenchTask } from './benchmarks/types'
+
+const exec = promisify(execFile)
+const isTestPath = (p: string) => /(^|\/)tests?\//.test(p) || /(^|\/)(test_.*|conftest)\.py$|_test\.py$/.test(p) // segment-anchored: contest_x.py / pytest_plugins.py are source, not tests
+
+/** A live checkout: the temp clone directory plus the benchmark task it was opened for. */
+interface Ws { dir: string; task: BenchTask }
+
+/** Open workspaces keyed by handle id; open() uses the clone directory path as that id. */
+const workspaces: Map<string, Ws> = new Map()
+
+/** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The
+ *  supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout
+ *  [trainN+off,…) never overlap. Verified is loaded once; instances carry their repo/base_commit. */
+export async function createSweBenchEnvironment(poolN = 80): Promise<{
+  environment: AgenticSurface
+  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
+  adapter: ReturnType<typeof createSweBenchAdapter>
+}> {
+  const adapter = createSweBenchAdapter()
+  const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
+  const byId = new Map(pool.map((t) => [t.id, t]))
+
+  const environment: AgenticSurface = {
+    name: 'swe-bench-verified',
+    /** Clone the task's repo into a fresh temp dir, check out its base commit, and register the workspace. */
+    async open(task) {
+      const benchTask = byId.get(task.id)
+      if (benchTask === undefined) throw new Error(`swe-bench-env: unknown task ${task.id}`)
+      const meta = benchTask.metadata as Record<string, string>
+      const checkout = mkdtempSync(join(tmpdir(), 'swe-'))
+      try {
+        await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${meta.repo}.git`, checkout], { timeout: 420_000 })
+        await exec('git', ['-C', checkout, 'checkout', '--quiet', meta.base_commit], { timeout: 300_000 })
+        workspaces.set(checkout, { dir: checkout, task: benchTask })
+        return { id: checkout, surface: 'swe-bench-verified' } satisfies ArtifactHandle
+      } catch (cause) {
+        rmSync(checkout, { recursive: true, force: true }) // never leave a half-cloned directory behind
+        throw cause
+      }
+    },
+    async tools() { // schemas for the three tools the agent may call: list_files, read_file, edit_file
+      return [
+        { type: 'function', function: { name: 'list_files', description: 'List source files under a repo subdirectory (recursive, bounded). "" = repo root.', parameters: { type: 'object', properties: { dir: { type: 'string' } }, required: ['dir'] } } },
+        { type: 'function', function: { name: 'read_file', description: 'Read a repo file by path.', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
+        { type: 'function', function: { name: 'edit_file', description: 'Surgical fix: replace the EXACT old_string (must occur once — copy whitespace precisely) with new_string in a SOURCE file. Minimal changes, never whole-file rewrites. Test files are rejected.', parameters: { type: 'object', properties: { path: { type: 'string' }, old_string: { type: 'string' }, new_string: { type: 'string' } }, required: ['path', 'old_string', 'new_string'] } } },
+      ] satisfies AgenticTool[] // compile-time shape check; the array is returned as written
+    },
+    /**
+     * Dispatch one tool call (`list_files`, `read_file`, `edit_file`) against the workspace behind
+     * `handle`. Results and tool-level errors are both returned as strings for the agent to read.
+     */
+    async call(handle, name, args) {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return 'ERROR: workspace closed'
+      // Path jail: reject absolute paths and anything containing '..'; strip one leading './'.
+      const safe = (p: string): string | null => {
+        if (p.startsWith('/') || p.includes('..')) return null
+        return p.replace(/^\.\//, '')
+      }
+      if (name === 'list_files') {
+        const sub = safe(String(args.dir ?? '')) ?? '' // a rejected dir falls back to the repo root
+        const root = join(ws.dir, sub)
+        if (!existsSync(root)) return `(no such path: ${sub})`
+        const out: string[] = []
+        // Bounded walk: stops descending once depth exceeds 2; the result is capped at 240 entries.
+        const walk = (d: string, depth: number) => {
+          if (depth > 2 || out.length > 240) return
+          let entries: string[] = []
+          try { entries = readdirSync(d) } catch { return } // unreadable directory: skip it
+          for (const e of entries) {
+            if (e.startsWith('.') || e === 'node_modules' || e === '__pycache__') continue
+            const p = join(d, e)
+            let isDir = false
+            try { isDir = lstatSync(p).isDirectory() } catch { continue } // entry cannot be stat'ed: skip it
+            out.push(p.slice(ws.dir.length + 1) + (isDir ? '/' : ''))
+            if (isDir) walk(p, depth + 1) // lstat does not follow symlinks, so linked dirs are not entered
+          }
+        }
+        walk(root, 0)
+        return out.slice(0, 240).join('\n') || '(empty)'
+      }
+      if (name === 'read_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        try {
+          const c = readFileSync(join(ws.dir, p), 'utf8')
+          return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c
+        } catch (e) {
+          return `(error: ${(e as Error).message})`
+        }
+      }
+      if (name === 'edit_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
+        const oldStr = String(args.old_string ?? '')
+        const newStr = String(args.new_string ?? '')
+        let content: string
+        try {
+          content = readFileSync(join(ws.dir, p), 'utf8')
+        } catch (e) {
+          return `(cannot read ${p}: ${(e as Error).message})`
+        }
+        if (!oldStr) return 'ERROR: old_string is empty.'
+        // split() counts literal, non-overlapping occurrences — no regex semantics involved.
+        const count = content.split(oldStr).length - 1
+        if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
+        if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
+        // A function replacer inserts new_string verbatim; a plain string would expand `$&`, `$1`, `$$`.
+        writeFileSync(join(ws.dir, p), content.replace(oldStr, () => newStr))
+        return `edited ${p}: replaced 1 occurrence`
+      }
+      return `ERROR: unknown tool ${name}`
+    },
+    /** Grade the workspace: its `git diff` is the patch handed to `adapter.judge`; 1 pass iff resolved. */
+    async score(_task, handle): Promise<SurfaceScore> {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return { passes: 0, total: 1, errored: 1 }
+      let patch: string
+      try {
+        patch = (await exec('git', ['-C', ws.dir, 'diff'], { maxBuffer: 20_000_000, timeout: 60_000 })).stdout
+      } catch {
+        return { passes: 0, total: 1, errored: 1 } // git itself failed: an infrastructure error, not an empty patch
+      }
+      if (!patch.trim()) return { passes: 0, total: 1, errored: 0 } // empty diff: nothing to grade
+      try {
+        const verdict = await adapter.judge(ws.task, patch)
+        return { passes: verdict.resolved ? 1 : 0, total: 1, errored: 0 }
+      } catch {
+        return { passes: 0, total: 1, errored: 1 } // the judge threw, so no verdict is available
+      }
+    },
+    /** Unregister the workspace and delete its clone; unknown or already-closed handles are ignored. */
+    async close(handle) {
+      const entry = workspaces.get(handle.id)
+      if (!entry || !workspaces.delete(handle.id)) return
+      rmSync(entry.dir, { recursive: true, force: true })
+    },
+  }
+
+  /** Task supplier: the `n` pool instances starting at `offset`; throws rather than return a short slice. */
+  const tasks = async (offset: number, n: number): Promise<AgenticTask[]> => {
+    const systemPrompt =
+      'You are a senior engineer fixing a real bug in the checked-out repository. Work PERSISTENTLY and do not ' +
+      'stop early: use list_files + read_file to explore BROADLY (read many candidate files — the bug is rarely in ' +
+      'the first file you open), trace the issue to its root cause, then fix it with edit_file. You MUST make at ' +
+      'least one edit_file call — never finish with prose alone or without attempting a fix. Make a MINIMAL surgical ' +
+      'change (a few lines, like a real PR), source only (test files are rejected). If an edit_file fails (old_string ' +
+      'not unique/found), read the file again and retry with exact text. Keep going until you have made your best fix.'
+    const picked = pool.slice(offset, offset + n)
+    if (picked.length < n) throw new Error(`swe-bench-env: pool exhausted at offset ${offset} (need ${n}, have ${picked.length}; raise poolN)`)
+    return picked.map(({ id, prompt }) => ({
+      id, systemPrompt,
+      userPrompt: prompt, meta: { instanceId: id },
+    }))
+  }
+
+  return { environment, tasks, adapter }
+}
diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts
new file mode 100644
index 00000000..a1d72cff
--- /dev/null
+++ b/bench/swe-self-improve.mts
@@ -0,0 +1,84 @@
+/**
+ * SWE-bench self-improvement — the PROPER, no-cheating run: a frontier worker over the SWE-bench
+ * `Environment`, with `runStrategyEvolution` enforcing the train→freeze→holdout split (the substrate
+ * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
+ * applies (public fixes may be memorized) — reported, never claimed clean.
+ *
+ *   CALIBRATE first (cost gate):  TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
+ *   Full run:                     TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
+ */
+import { mkdtempSync, rmSync } from 'node:fs'
+import { join } from 'node:path'
+import { createChatClient } from '@tangle-network/agent-eval'
+import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchEnvironment } from './src/swe-bench-env'
+
+/** Entry point: CALIBRATE=1 runs the `refine` baseline on N tasks; otherwise the full evolution run. Numeric env knobs are validated before any work starts. */
+async function main(): Promise<void> {
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('TANGLE_API_KEY required (worker + author call the router)')
+  // Number('') is 0 and Number('six') is NaN; either would flow silently into the run. Fail loud instead.
+  const envInt = (name: string, fallback: number): number => {
+    const raw = process.env[name]
+    const value = raw === undefined ? fallback : Number(raw)
+    if (raw?.trim() === '' || !Number.isInteger(value) || value < 0) throw new Error(`${name} must be a non-negative integer, got ${JSON.stringify(raw)}`)
+    return value
+  }
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const workerModel = process.env.WORKER_MODEL ?? 'gemini-2.5-pro'
+  const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro'
+  const { innerTurns, poolN, n } = { innerTurns: envInt('INNER_TURNS', 40), poolN: envInt('POOL_N', 80), n: envInt('N', 3) }
+  const evo = { trainN: envInt('TRAIN_N', 6), holdoutN: envInt('HOLDOUT_N', 8), budget: envInt('BUDGET', 2), generations: envInt('GENERATIONS', 2), populationSize: envInt('POP', 2) }
+  const { environment, tasks } = await createSweBenchEnvironment(poolN)
+
+  if (process.env.CALIBRATE === '1') {
+    const ts = await tasks(0, n)
+    console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`)
+    let resolved = 0
+    for (const t of ts) {
+      const t0 = Date.now()
+      const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns, budget: 1 })
+      if (r.resolved) resolved++
+      console.log(`  ${t.id.padEnd(32)} resolved=${r.resolved} completions=${r.completions} shots=${r.shots} (${Math.round((Date.now() - t0) / 1000)}s)`)
+    }
+    const band = resolved > 0 && resolved < n
+    console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`)
+    return
+  }
+
+  const report = await (async () => {
+    const outDir = mkdtempSync(join(process.cwd(), '.swe-run-'))
+    try {
+      return await runStrategyEvolution({
+        environment, tasks, ...evo,
+        worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns },
+        author: {
+          chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
+          model: authorModel, maxTokens: 8000,
+          fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+        },
+        baselines: [sample, refine],
+        outDir,
+      })
+    } finally {
+      rmSync(outDir, { recursive: true, force: true }) // the scratch run directory is always removed
+    }
+  })()
+
+  const v = report.verdict
+  console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══')
+  console.log(`worker=${workerModel}  author=${authorModel}`)
+  console.log(`gen0 champion:   ${report.gen0Champion.name}`)
+  console.log(`final champion:  ${report.finalChampion.name}`)
+  console.log(`PROMOTED:        ${v.promoted}  (${v.reason})`)
+  console.log(`held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`)
+  const moral = v.promoted
+    ? '\n>>> The search taught the agent a strategy that resolves MORE real bugs it never trained on, beyond luck. (Report the contamination caveat: public fixes may be memorized.)'
+    : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).'
+  console.log(moral)
+}
+
+main().catch((failure: unknown) => {
+  console.error(failure instanceof Error ? (failure.stack ?? failure.message) : String(failure)) // stack when there is one
+  process.exit(1) // any uncaught failure ends the run with a non-zero status
+})
diff --git a/examples/ablation-suite/ablation.ts b/examples/ablation-suite/ablation.ts
new file mode 100644
index 00000000..9fe2837e
--- /dev/null
+++ b/examples/ablation-suite/ablation.ts
@@ -0,0 +1,245 @@
+/**
+ * ablation — the cost-aware knob-board + one-knob-delta runner for agent self-improvement techniques.
+ *
+ * THE VISION: a single configurable agent where every technique is a knob (topology, trace-analysis,
+ * steering, GEPA/skill optimization, persistent artifacts), swept across arms at EQUAL COMPUTE, with a
+ * full autopsy — resolve rate AND token/$/latency cost per arm — so we see what really helps vs what
+ * just burns tokens. One-knob-delta design (baseline + each single knob flipped) keeps it O(N), not 2^N.
+ *
+ * STATUS — honest: the framework + the cost autopsy are real; knobs are wired incrementally. WIRED:
+ * `topology` (single/fanout/fanout-refine = refine/sample/sampleThenRefine) + `budget`. The rest are
+ * DECLARED knobs that FAIL LOUD if set (no silent no-op — you must not think GEPA ran when it didn't);
+ * each is a tracked next-increment over a real substrate primitive (named in the throw). Validate the
+ * framework on the cheap contamination-proof task, THEN point `environment`/`tasks` at SWE-bench.
+ */
+import { pairedBootstrap } from '@tangle-network/agent-eval'
+import {
+  type AgenticSurface,
+  type AgenticTask,
+  refine,
+  runAgentic,
+  type Strategy,
+  sample,
+  sampleThenRefine,
+} from '@tangle-network/agent-runtime/loops'
+import { codingEnv, codingTasks } from '../self-improving-coder/self-improving-coder'
+
+export interface AblationKnobs { // one arm's configuration; runAblation builds each arm as { ...base, ...delta.knob }
+  /** WIRED → strategy: single=`refine` (iterate one artifact), fanout=`sample` (N parallel, pick best),
+   *  fanout-refine=`sampleThenRefine`. The coordination shape. */
+  topology: 'single' | 'fanout' | 'fanout-refine'
+  /** WIRED → equal-compute unit (refine: max shots; fanout: rollout width). */
+  budget: number
+  // ── DECLARED knobs — fail loud until wired (each over a named substrate primitive) ──
+  /** The DRIVER-steers-WORKER loop: supervise() drives the worker, analyzeOnSettle fires the analyst on
+   *  each settled round → a `finding` the driver pulls and composes the next prompt from. (NOT the
+   *  refine analyst-steerer — that's the degenerate inline version; this is a driver brain in the loop.) */
+  driverSteer?: boolean // supervise(driverProfile,{backend,analyzeOnSettle}) + steer_agent; `true` makes runAblation throw
+  /** GEPA-optimize the DRIVER's compose-next-prompt system prompt on TRAIN (executable-graded via the
+   *  surface score), frozen, then run — selfImprove() with an executable JudgeConfig (NOT improve(): the
+   *  steerer prompt is not a profile field). */
+  optimize?: 'off' | 'gepa' // any set value other than 'off' makes runAblation throw until wired
+  halo?: boolean // DECLARED, unwired: HALO analyst option — `true` makes runAblation throw
+  persistentArtifact?: boolean // DECLARED, unwired: multi-round persistent artifact (openSandboxRun resume) — `true` throws
+}
+
+/** Maps the `topology` knob onto the substrate strategy that implements that coordination shape. */
+const topologyStrategy: Record<AblationKnobs['topology'], Strategy> = {
+  'fanout-refine': sampleThenRefine,
+  fanout: sample, single: refine,
+}
+
+/** Declared-but-unimplemented knobs: runAblation throws if one is set; `prim` names what to wire it over. */
+const unwiredKnobs: { k: keyof AblationKnobs; isSet: (v: unknown) => boolean; prim: string }[] = [
+  {
+    prim: 'supervise(driverProfile,{backend,analyzeOnSettle}) — driver composes the steer from the analyst finding',
+    isSet: (value) => value === true,
+    k: 'driverSteer',
+  },
+  {
+    prim: "selfImprove() w/ executable JudgeConfig optimizing the driver's compose-prompt on TRAIN, frozen",
+    isSet: (value) => Boolean(value) && value !== 'off',
+    k: 'optimize',
+  },
+  { prim: 'HALO analyst option', isSet: (value) => value === true, k: 'halo' },
+  {
+    prim: 'openSandboxRun resume',
+    isSet: (value) => value === true,
+    k: 'persistentArtifact',
+  },
+]
+
+export interface ArmResult { // one row of the autopsy: totals and means for a single arm
+  name: string
+  knobs: AblationKnobs
+  n: number // number of held-out tasks this arm ran
+  resolve: number // mean resolved (0..1) on the held-out set
+  tokensIn: number // summed `tokens.input` over the arm's tasks
+  tokensOut: number // summed `tokens.output` over the arm's tasks
+  costUsd: number // summed `usd` over the arm's tasks
+  latencyMs: number // summed `ms` over the arm's tasks (a total, not a mean)
+  shotsMean: number // total shots / n
+  completionsMean: number // total completions / n
+  /** Per-task resolved (0/1), task-aligned across arms — the paired vector for significance. */
+  perTask: number[]
+}
+
+/**
+ * Run the baseline arm plus one arm per delta over ONE shared held-out slice, sequentially.
+ * Every arm's knobs are validated BEFORE any task is fetched or run, so an unwired knob in a later
+ * arm fails loud without first spending compute on the arms ahead of it.
+ * @returns one ArmResult per arm, baseline first; `onArm` is called as each arm finishes.
+ * @throws Error if any arm sets a declared-but-unwired knob.
+ */
+export async function runAblation(opts: {
+  environment: AgenticSurface
+  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
+  holdoutOffset: number
+  holdoutN: number
+  base: AblationKnobs
+  /** Each delta = a ONE-KNOB change vs base (the one-knob-delta design). */
+  deltas: Array<{ name: string; knob: Partial<AblationKnobs> }>
+  worker: {
+    routerBaseUrl: string
+    routerKey: string
+    model: string
+    maxTokens?: number
+    innerTurns?: number
+  }
+  onArm?: (r: ArmResult) => void
+}): Promise<ArmResult[]> {
+  const arms = [
+    { name: 'baseline', knobs: opts.base },
+    ...opts.deltas.map((d) => ({ name: d.name, knobs: { ...opts.base, ...d.knob } as AblationKnobs })),
+  ]
+  // Validate every arm before any task runs, so a bad knob never fails after earlier arms spent compute.
+  for (const arm of arms) {
+    for (const u of unwiredKnobs) {
+      if (!u.isSet(arm.knobs[u.k])) continue
+      throw new Error(`ablation: knob '${u.k}'=${JSON.stringify(arm.knobs[u.k])} (arm "${arm.name}") is DECLARED but not yet wired — wire it over ${u.prim} before claiming it ran. (No silent no-op.)`)
+    }
+  }
+  // ONE held-out set, shared across all arms — the fair-comparison invariant.
+  const tasks = await opts.tasks(opts.holdoutOffset, opts.holdoutN)
+  const n = tasks.length
+  const mean = (total: number) => (n > 0 ? total / n : 0) // an empty slice yields 0 rather than NaN
+  const results: ArmResult[] = []
+  for (const arm of arms) {
+    const sum = { resolved: 0, ti: 0, to: 0, usd: 0, ms: 0, shots: 0, comps: 0 }
+    const perTask: number[] = []
+    for (const t of tasks) {
+      const r = await runAgentic({
+        surface: opts.environment,
+        task: t,
+        strategy: topologyStrategy[arm.knobs.topology],
+        budget: arm.knobs.budget,
+        routerBaseUrl: opts.worker.routerBaseUrl,
+        routerKey: opts.worker.routerKey,
+        model: opts.worker.model,
+        ...(opts.worker.maxTokens !== undefined ? { maxTokens: opts.worker.maxTokens } : {}),
+        ...(opts.worker.innerTurns !== undefined ? { innerTurns: opts.worker.innerTurns } : {}),
+      })
+      if (r.resolved) sum.resolved++
+      perTask.push(r.resolved ? 1 : 0)
+      sum.ti += r.tokens.input
+      sum.to += r.tokens.output
+      sum.usd += r.usd
+      sum.ms += r.ms
+      sum.shots += r.shots
+      sum.comps += r.completions
+    }
+    const res: ArmResult = {
+      name: arm.name,
+      knobs: arm.knobs,
+      n,
+      resolve: mean(sum.resolved),
+      tokensIn: sum.ti,
+      tokensOut: sum.to,
+      costUsd: sum.usd,
+      latencyMs: sum.ms,
+      shotsMean: mean(sum.shots),
+      completionsMean: mean(sum.comps),
+      perTask,
+    }
+    results.push(res)
+    opts.onArm?.(res)
+  }
+  return results
+}
+
+/**
+ * Print the cost-aware autopsy table: one row per arm (resolve rate, $, latency, mean shots) plus
+ * Δresolve — a paired bootstrap against the first result, which is treated as the baseline — and Δ$.
+ */
+export function printAutopsy(results: ArmResult[]): void {
+  const baseline = results[0]
+  const widths = [16, 14, 9, 9, 8, 7, 24]
+  // Pad each cell to its column width; the final cell has no width entry and is left unpadded.
+  const row = (cells: string[]) => cells.map((cell, i) => cell.padEnd(widths[i] ?? 0)).join('')
+  console.log(`\n═══ ABLATION AUTOPSY (n=${baseline?.n} held-out, one-knob-delta vs baseline) ═══`)
+  console.log(row(['arm', 'topology', 'resolve', '$', 'lat(s)', 'shots', 'Δresolve [95% CI]', 'Δ$']))
+  for (const arm of results) {
+    const deltaCost = baseline ? arm.costUsd - baseline.costUsd : 0
+    const costSign = deltaCost >= 0 ? '+' : ''
+    // Significance: paired bootstrap of this arm's per-task resolve vs baseline's (task-aligned).
+    let lift = '+0pp'
+    if (baseline && arm !== baseline) {
+      const boot = pairedBootstrap(baseline.perTask, arm.perTask, { confidence: 0.95, statistic: 'mean' })
+      const mark = boot.low > 0 || boot.high < 0 ? '✓' : '·' // CI excludes 0 ⇒ real
+      const pp = (x: number) => (100 * x).toFixed(0)
+      lift = `${boot.mean >= 0 ? '+' : ''}${pp(boot.mean)}pp [${pp(boot.low)},${pp(boot.high)}] ${mark}`
+    }
+    console.log(
+      row([
+        arm.name,
+        arm.knobs.topology,
+        `${(100 * arm.resolve).toFixed(0)}%`,
+        `$${arm.costUsd.toFixed(4)}`,
+        (arm.latencyMs / 1000).toFixed(0),
+        arm.shotsMean.toFixed(1),
+        lift,
+        `${costSign}$${deltaCost.toFixed(4)}`,
+      ]),
+    )
+  }
+  console.log(
+    '\n>>> Read it cost-aware: ✓ = CI excludes 0 (real lift). A +resolve that costs +$$ or is not ✓ may be worse than baseline. The point is to see what HELPS vs what just BURNS.',
+  )
+}
+
+/** CLI entry: topology ablation (single vs fanout vs fanout-refine) on the generated coding task; numeric env knobs are validated before any arm runs. */
+async function main(): Promise<void> {
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('TANGLE_API_KEY required')
+  // Number('') is 0 and Number('six') is NaN; reject both rather than run a degenerate ablation.
+  const envInt = (name: string, fallback: number): number => {
+    const raw = process.env[name]
+    const value = raw === undefined ? fallback : Number(raw)
+    if (raw?.trim() === '' || !Number.isInteger(value) || value < 0) throw new Error(`${name} must be a non-negative integer, got ${JSON.stringify(raw)}`)
+    return value
+  }
+  const worker = {
+    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+    routerKey,
+    model: process.env.WORKER_MODEL ?? 'deepseek-v4-flash',
+    maxTokens: 4000,
+    innerTurns: envInt('INNER_TURNS', 6),
+  }
+  console.log(`═══ ABLATION (cheap contamination-proof task) — worker=${worker.model} ═══`)
+  const results = await runAblation({
+    environment: codingEnv, tasks: codingTasks,
+    holdoutOffset: 100, // a fixed disjoint held-out slice
+    holdoutN: envInt('HOLDOUT_N', 6), worker,
+    base: { topology: 'single', budget: envInt('BUDGET', 2) },
+    // one-knob-delta: flip ONLY topology (the wired knob) vs baseline.
+    deltas: [{ name: 'fanout', knob: { topology: 'fanout' } }, { name: 'fanout-refine', knob: { topology: 'fanout-refine' } }],
+    onArm: (r) => console.log(`  ${r.name}: ${(100 * r.resolve).toFixed(0)}% resolve, $${r.costUsd.toFixed(4)}, ${(r.latencyMs / 1000).toFixed(0)}s`),
+  })
+  printAutopsy(results)
+}
+
+if (`file://${process.argv[1]}` === import.meta.url) // run only when executed directly; NOTE(review): raw string compare, so paths needing URL-encoding will not match
+  main().catch((failure: unknown) => {
+    console.error(failure instanceof Error ? (failure.stack ?? failure.message) : String(failure))
+    process.exit(1)
+  })
diff --git a/examples/self-improving-coder/self-improving-coder.ts b/examples/self-improving-coder/self-improving-coder.ts
new file mode 100644
index 00000000..d799cffc
--- /dev/null
+++ b/examples/self-improving-coder/self-improving-coder.ts
@@ -0,0 +1,352 @@
+/**
+ * Self-improving coder — the substrate's self-improvement spine, composed cleanly, on a
+ * CONTAMINATION-PROOF coding task. NOTHING here is hand-rolled: the genome is an `AgentProfile`-shaped
+ * worker, the task is an `AgenticSurface` (open/tools/call/score/close), and the held-out-gated
+ * flywheel is `runStrategyEvolution` — which authors candidate strategies from TRAIN losses, then
+ * makes ONE promotion decision on a FRESH holdout slice the search never touched (`promotionGate`,
+ * a seeded paired-bootstrap CI). Adaptive data analysis is structurally impossible: the holdout is
+ * disjoint by task offset and read exactly once.
+ *
+ * Why contamination-proof: each task is a small wire-protocol library whose constants (version,
+ * separators, checksum modulus, opcode) are DERIVED FROM THE SEED and specified ONLY by the test file.
+ * A frontier model cannot have memorized the fix — the exact contract is generated per task. Graded by
+ * REAL pytest (a deployable check, never an LLM judge).
+ *
+ * IMPORTANT — the bundled task is DELIBERATELY SIMPLE (a few functions fully pinned by their tests).
+ * A capable model aces it (every strategy scores 1.0), so the gate CORRECTLY returns no-promotion:
+ * you cannot demonstrate self-improvement where there is no headroom — and this harness refuses to
+ * pretend otherwise (calibrate-before-measure, enforced). To get a DISCRIMINATING run, swap in a task
+ * with a correctable middle band (algorithmically hard generated tasks, or a real benchmark below).
+ *
+ * To run frontier SWE-bench instead, swap `environment`/`tasks` for the pair returned by
+ * `createSweBenchEnvironment()` (bench/src/swe-bench-env.ts) — everything else is identical. (That arena is
+ * contamination-SUSPECT: its bugs are public GitHub fixes a model may have memorized — report it, never claim clean.)
+ *
+ * Run:  TANGLE_API_KEY=<router key>  pnpm tsx examples/self-improving-coder/self-improving-coder.ts
+ */
+import { execFileSync } from 'node:child_process'
+import { mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { createChatClient } from '@tangle-network/agent-eval'
+import {
+  type AgenticSurface,
+  type AgenticTask,
+  type AgenticTool,
+  type ArtifactHandle,
+  refine,
+  runStrategyEvolution,
+  type SurfaceScore,
+  sample,
+} from '@tangle-network/agent-runtime/loops'
+
+// ── The contamination-proof task generator (deterministic per seed) ──────────────
+/** A small wire-protocol library, fully specified by its tests, with seed-derived constants. The
+ *  agent must READ the tests to infer the exact contract — it cannot recall it. Returns the stub the
+ *  agent edits + the hidden-ish test file (the agent may read it; grading runs it). */
+function constsFor(seed: number): { VER: string; SEP: string; MOD: number } {
+  // One multiplicative hash of the seed feeds all three constants, so SEP and MOD always share an
+  // index (5 combinations). NOTE(review): confirm that correlation is intended.
+  const hash = (seed * 2654435761) >>> 0
+  const pick = hash % 5
+  const VER = `v${((hash % 900) + 100).toString(36)}`
+  return { VER, SEP: ['-', '|', ':', '/', '#'][pick]!, MOD: [97, 101, 103, 107, 109][pick]! }
+}
+/** Build one task from `seed`: the `lib.py` stub the agent must fill in, the pytest file that pins the
+ *  contract (expected tokens are rendered from the seed's constants), and the number of tests (9). */
+function genTask(seed: number): { stub: string; test: string; total: number } {
+  const { VER, SEP, MOD } = constsFor(seed)
+  const raise = '    raise NotImplementedError'
+  const stub = [
+    '# Implement these so test_lib.py passes. Infer the exact format from the tests.',
+    'def encode(id, text):', raise,
+    'def decode(s):', raise,
+    'def checksum(text):', raise,
+    'def valid(s):', raise,
+    '',
+  ].join('\n')
+  // Render the wire token VER+SEP+id+SEP+text as a JSON-quoted string for embedding in the test source.
+  const token = (id: number, text: string) => JSON.stringify([VER, id, text].join(SEP))
+  const test = [
+    'import pytest',
+    'from lib import encode, decode, checksum, valid',
+    '',
+    `def test_encode(): assert encode(3, "hi") == ${token(3, 'hi')}`,
+    `def test_encode_zero(): assert encode(0, "") == ${token(0, '')}`,
+    `def test_decode(): assert decode(${token(9, 'ab')}) == {"id": 9, "text": "ab"}`,
+    'def test_roundtrip(): assert decode(encode(42, "yo")) == {"id": 42, "text": "yo"}',
+    `def test_checksum(): assert checksum("abc") == sum(b for b in b"abc") % ${MOD}`,
+    `def test_checksum_empty(): assert checksum("") == 0`,
+    `def test_valid_true(): assert valid(${token(1, 'x')}) is True`,
+    `def test_valid_bad_version(): assert valid("zz${SEP}1${SEP}x") is False`,
+    `def test_valid_bad_shape(): assert valid("not a token") is False`,
+    '',
+  ].join('\n')
+  return { stub, test, total: 9 }
+}
+
+// ── The Environment (AgenticSurface) — host pytest, no Docker. (Docker is a swap for untrusted code.) ──
+/** A live task workspace: its temp directory and the test count recorded when it was opened. */
+interface Ws { dir: string; total: number }
+
+/** Open workspaces keyed by handle id (codingEnv.open uses the temp directory path as the id). */
+const workspaces: Map<string, Ws> = new Map()
+
+/** Run `python3 -m pytest` on test_lib.py inside `dir` and tally its summary output. A throwing
+ *  execFileSync (e.g. a non-zero exit) is tolerated: whatever stdout was captured is parsed instead.
+ *  `total` is passed + failed + errored as reported, so it is 0 when no counts could be parsed. */
+function pytestPassed(dir: string): { passed: number; total: number } {
+  const args = ['-m', 'pytest', '-q', '--tb=no', '-p', 'no:cacheprovider', 'test_lib.py']
+  let summary: string
+  try {
+    summary = execFileSync('python3', args, {
+      cwd: dir,
+      encoding: 'utf8', timeout: 60_000,
+      stdio: ['ignore', 'pipe', 'pipe'],
+    })
+  } catch (failure) {
+    summary = (failure as { stdout?: string }).stdout ?? ''
+  }
+  // First integer captured by `pattern`, or 0 when the output has no such count.
+  const tally = (pattern: RegExp) => Number(summary.match(pattern)?.[1] ?? 0)
+  const passed = tally(/(\d+) passed/)
+  const failed = tally(/(\d+) failed/) + tally(/(\d+) error/)
+  return { passed, total: passed + failed }
+}
+
+export const codingEnv: AgenticSurface = {
+  name: 'generated-coding',
+  /** Materialise the seed's task (lib.py stub + test_lib.py) in a fresh temp dir and register it. */
+  async open(task) {
+    const meta = task.meta as { seed?: number }
+    const generated = genTask(Number(meta?.seed ?? 0))
+    const dir = mkdtempSync(join(tmpdir(), 'sic-'))
+    const files: Array<[string, string]> = [['lib.py', generated.stub], ['test_lib.py', generated.test]]
+    for (const [fileName, body] of files) writeFileSync(join(dir, fileName), body)
+    workspaces.set(dir, { dir, total: generated.total })
+    return { id: dir, surface: 'generated-coding' } satisfies ArtifactHandle
+  },
+  async tools() { // schemas for the three tools the agent may call: list_files, read_file, write_file
+    return [
+      {
+        type: 'function',
+        function: {
+          name: 'list_files',
+          description: 'List the files in the workspace.',
+          parameters: { type: 'object', properties: {} },
+        },
+      },
+      {
+        type: 'function',
+        function: {
+          name: 'read_file',
+          description: 'Read a file (e.g. test_lib.py to learn the contract, or lib.py).',
+          parameters: {
+            type: 'object',
+            properties: { path: { type: 'string' } },
+            required: ['path'],
+          },
+        },
+      },
+      {
+        type: 'function',
+        function: {
+          name: 'write_file',
+          description:
+            'Write COMPLETE contents of lib.py (the implementation). test_lib.py is read-only.',
+          parameters: {
+            type: 'object',
+            properties: { path: { type: 'string' }, content: { type: 'string' } },
+            required: ['path', 'content'],
+          },
+        },
+      },
+      // Deliberately NO run_tests tool: the agent cannot iterate-until-green. It must implement correctly from
+      // READING the tests — which creates real headroom and makes the STRATEGY (planning, multiple attempts) matter.
+    ] satisfies AgenticTool[]
+  },
+  /** Execute one tool call; results and errors alike are returned as strings. Reads: lib.py / test_lib.py (first 8000 chars). Writes: lib.py only. */
+  async call(handle, name, args) {
+    const ws = workspaces.get(handle.id)
+    if (!ws) return 'ERROR: workspace closed'
+    const attempt = (action: () => string): string => {
+      try {
+        return action()
+      } catch (e) {
+        return `ERROR: ${(e as Error).message}`
+      }
+    }
+    switch (name) {
+      case 'list_files':
+        return readdirSync(ws.dir).join('\n')
+      case 'read_file': {
+        const p = String(args.path ?? '')
+        if (p !== 'lib.py' && p !== 'test_lib.py') return 'ERROR: only lib.py and test_lib.py are readable'
+        return attempt(() => readFileSync(join(ws.dir, p), 'utf8').slice(0, 8000))
+      }
+      case 'write_file':
+        if (String(args.path ?? '') !== 'lib.py') return 'ERROR: only lib.py is writable'
+        return attempt(() => { writeFileSync(join(ws.dir, 'lib.py'), String(args.content ?? '')); return 'wrote lib.py' })
+      default:
+        return `ERROR: unknown tool ${name}`
+    }
+  },
+  async score(_task, handle): Promise<SurfaceScore> {
+    const ws = workspaces.get(handle.id)
+    // No workspace registered under this handle: nothing to grade, so report a bare error.
+    if (!ws) return { passes: 0, total: 0, errored: 1 }
+    const outcome = pytestPassed(ws.dir)
+    if (outcome.total > 0) return { passes: outcome.passed, total: outcome.total, errored: 0 }
+    // Nothing was counted: flag an error against the `total` recorded when the workspace was opened.
+    return { passes: 0, total: ws.total, errored: 1 }
+  },
+  async close(handle) {
+    const ws = workspaces.get(handle.id)
+    // A handle with no registered workspace (unknown or already closed) is a no-op.
+    if (ws) {
+      // Unregister first, then remove the scratch directory and everything in it.
+      workspaces.delete(handle.id)
+      rmSync(ws.dir, { recursive: true, force: true })
+    }
+  },
+}
+
+// ── The disjoint task supplier (train [0,trainN); holdout drawn past it) ──────────
+/**
+ * Build `n` tasks whose seeds run consecutively from `offset`. Each task is identified by its seed
+ * (`gen-<seed>`, also stored in `meta.seed`); the prompts are identical across tasks.
+ */
+export const codingTasks = async (offset: number, n: number): Promise<AgenticTask[]> => {
+  const systemPrompt =
+    'You are a Python engineer. The library lib.py has stub functions; its exact contract is defined ONLY by ' +
+    'test_lib.py. You CANNOT run the tests — read test_lib.py CAREFULLY (every assertion, every edge case) and ' +
+    'implement lib.py correctly in one pass with write_file. Get the edge cases right (empty inputs, malformed ' +
+    'inputs, exact formats). Do not edit test_lib.py.'
+  const userPrompt =
+    'Read test_lib.py to learn the exact contract, then write a correct lib.py. You cannot run the tests — reason carefully.'
+  return Array.from({ length: n }).map((_, i) => {
+    const seed = offset + i
+    return { id: `gen-${seed}`, systemPrompt, userPrompt, meta: { seed } } satisfies AgenticTask
+  })
+}
+
+/**
+ * Render the correct lib.py for a seed. Used ONLY by the $0 calibration self-check — never by the
+ * agent.
+ */
+function referenceLib(seed: number): string {
+  const consts = constsFor(seed)
+  // VER and SEP go through JSON.stringify so they land in the Python source as quoted literals;
+  // MOD is interpolated as-is.
+  const header = `VER, SEP, MOD = ${JSON.stringify(consts.VER)}, ${JSON.stringify(consts.SEP)}, ${consts.MOD}`
+  const definitions = [
+    'def encode(id, text): return f"{VER}{SEP}{id}{SEP}{text}"',
+    'def decode(s):',
+    '    v, i, t = s.split(SEP, 2)',
+    '    return {"id": int(i), "text": t}',
+    'def checksum(text): return sum(text.encode()) % MOD if text else 0',
+    'def valid(s):',
+    '    p = s.split(SEP)',
+    '    return len(p) == 3 and p[0] == VER and p[1].isdigit()',
+  ]
+  // One line per entry, with a newline after the last one.
+  return `${[header, ...definitions].join('\n')}\n`
+}
+
+/**
+ * calibrate-before-measure: prove the task is SOLVABLE (reference → all pass) and the grader
+ * DISCRIMINATES (stub → 0). $0, no router. A reference that doesn't clear means the task/grader is
+ * broken — fix it before spending.
+ *
+ * For each of a fixed set of seeds: open a workspace, score the untouched stub, overwrite lib.py
+ * with the reference, score again. Prints one line per seed plus a verdict, and exits the process
+ * with status 1 if any seed fails the check.
+ */
+async function calibrate(): Promise<void> {
+  console.log('═══ CALIBRATION ($0) — task solvable + grader discriminates? ═══')
+  let ok = true
+  for (const seed of [0, 1, 2, 7, 11]) {
+    const task = (await codingTasks(seed, 1))[0]!
+    const h = await codingEnv.open(task)
+    const { stub, ref } = await (async () => {
+      try {
+        const stubScore = await codingEnv.score(task, h)
+        // write the reference, re-score
+        await codingEnv.call(h, 'write_file', { path: 'lib.py', content: referenceLib(seed) })
+        const refScore = await codingEnv.score(task, h)
+        return { stub: stubScore, ref: refScore }
+      } finally {
+        // Close even when scoring throws, so the workspace's temp directory is not left behind.
+        await codingEnv.close(h)
+      }
+    })()
+    // Solvable: the reference passes every test. Discriminating: the stub passes none.
+    const pass = ref.passes === ref.total && ref.total > 0 && stub.passes === 0
+    ok &&= pass
+    console.log(
+      `  seed ${seed}: stub ${stub.passes}/${stub.total}  →  reference ${ref.passes}/${ref.total}  ${pass ? '✓' : '✗ BROKEN'}`,
+    )
+  }
+  console.log(
+    ok
+      ? '\n>>> CALIBRATED — task is solvable + the grader discriminates. Safe to run the loop.'
+      : '\n>>> BROKEN — fix the task/grader before spending.',
+  )
+  if (!ok) process.exit(1)
+}
+
+/**
+ * CLI entry point. With `CALIBRATE=1` it runs the $0 calibration self-check and returns; otherwise
+ * it runs the strategy evolution through the router and prints the verdict.
+ *
+ * Environment: `TANGLE_API_KEY` (required), `ROUTER_BASE`, `WORKER_MODEL`, `AUTHOR_MODEL`,
+ * `AUTHOR_FALLBACK`, `TRAIN_N`, `HOLDOUT_N`, `INNER_TURNS`, `BUDGET`, `GENERATIONS`, `POP`, and
+ * `DUMP=1` for an extra autopsy dump of the report.
+ *
+ * @throws Error when `TANGLE_API_KEY` is unset, or when a numeric variable is set to a value that
+ *   is blank, not a finite number, or negative.
+ */
+async function main(): Promise<void> {
+  if (process.env.CALIBRATE === '1') return calibrate()
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey)
+    throw new Error('set TANGLE_API_KEY (the worker + the author both call the router)')
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const workerModel = process.env.WORKER_MODEL ?? 'deepseek-v4-flash'
+  // The author WRITES strategy code (a `defineStrategy` module) — it needs a strong coder + a token
+  // budget (thinking models return empty content without one) + a fallback. deepseek-flash can't.
+  const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro'
+
+  // Numeric knobs: an unset variable takes its default. A variable that is set but unusable is
+  // rejected here — before the out-dir exists and before the run starts — instead of reaching the
+  // run as NaN (non-numeric), 0 (blank) or a negative value.
+  const envNumber = (name: string, fallback: number): number => {
+    const raw = process.env[name]
+    if (raw === undefined) return fallback
+    const value = Number(raw)
+    if (raw.trim() === '' || !Number.isFinite(value) || value < 0)
+      throw new Error(`${name} must be a non-negative number, got ${JSON.stringify(raw)}`)
+    return value
+  }
+  const trainN = envNumber('TRAIN_N', 8)
+  const holdoutN = envNumber('HOLDOUT_N', 12)
+  const innerTurns = envNumber('INNER_TURNS', 8)
+  const budget = envNumber('BUDGET', 3)
+  const generations = envNumber('GENERATIONS', 2)
+  const populationSize = envNumber('POP', 2)
+
+  // The author writes candidate-strategy .mts files into outDir, then dynamically imports them — they
+  // `import '@tangle-network/agent-runtime/loops'`, which only resolves UNDER the package (self-reference).
+  // A /tmp outDir would fail to resolve it; keep it under the project root.
+  const report = await (async () => {
+    const outDir = mkdtempSync(join(process.cwd(), '.sic-run-'))
+    try {
+      return await runStrategyEvolution({
+        environment: codingEnv,
+        tasks: codingTasks,
+        trainN,
+        holdoutN,
+        worker: {
+          routerBaseUrl,
+          routerKey,
+          model: workerModel,
+          innerTurns,
+          maxTokens: 4000,
+        },
+        author: {
+          chat: createChatClient({
+            transport: 'router',
+            baseUrl: routerBaseUrl,
+            apiKey: routerKey,
+            defaultModel: authorModel,
+          }),
+          model: authorModel,
+          maxTokens: 8000,
+          fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+        },
+        baselines: [sample, refine],
+        budget,
+        generations,
+        populationSize,
+        outDir,
+      })
+    } finally {
+      // The candidate files are scratch: remove the directory whether the run succeeded or threw.
+      rmSync(outDir, { recursive: true, force: true })
+    }
+  })()
+
+  const v = report.verdict
+  if (process.env.DUMP === '1') {
+    // Autopsy: gen0 baseline scores (headroom) + every authored candidate's score/error (did they
+    // lose on a saturated task, or error at runtime?).
+    const r = report as unknown as Record<string, unknown>
+    // Functions are not JSON-serialisable; print a placeholder so their presence stays visible.
+    const slim = (x: unknown) =>
+      JSON.stringify(x, (_k, val) => (typeof val === 'function' ? '[fn]' : val), 1)
+    console.log('--- gen0 ---', slim(r.gen0 ?? r.gen0Champion))
+    console.log('--- generations ---', slim(r.generations)?.slice(0, 3000))
+  }
+  console.log('\n═══ SELF-IMPROVING CODER — certified on a FROZEN holdout (no adaptive reuse) ═══')
+  console.log(`worker=${workerModel}  author=${authorModel}`)
+  console.log(`gen0 champion:   ${report.gen0Champion.name}`)
+  console.log(`final champion:  ${report.finalChampion.name}`)
+  console.log(`PROMOTED:        ${v.promoted}  (${v.reason})`)
+  console.log(
+    `held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`,
+  )
+  console.log(
+    v.promoted
+      ? '\n>>> The search taught the agent a strategy that fixes MORE on tasks it never trained on, beyond luck. Self-improvement CERTIFIED.'
+      : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).',
+  )
+}
+
+// Start main() only when this module's URL string-matches `file://` + argv[1], i.e. when the file
+// itself is the script that was launched.
+if (import.meta.url === `file://${process.argv[1]}`) {
+  main().catch((err: unknown) => {
+    // Prefer the stack trace; fall back to the message, or to the stringified non-Error value.
+    const detail = err instanceof Error ? (err.stack ?? err.message) : String(err)
+    console.error(detail)
+    process.exit(1)
+  })
+}