tangle-network · drewstone · Jun 28, 2026 · Jun 27, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,5 @@ bench/scripts/__pycache__/
 # local rollout-corpus scratch (raw jsonl, per work-line)
 corpus/
 test_repo/
+.sic-run-*/
+.swe-run-*/
diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
@@ -0,0 +1,181 @@
+/**
+ * SWE-bench Verified as an `AgenticSurface` — the PROPER, no-cheating way to run a coding agent on real
+ * GitHub bugs through the substrate (`runAgentic`/`runBenchmark`/`runStrategyEvolution` drive the loop;
+ * we only provide tools + a deployable score). The agent clones the repo at base_commit, explores +
+ * edits SOURCE via tools (never tests — path-jailed), and `score()` grades the resulting `git diff`
+ * with the OFFICIAL swebench Docker harness (apply patch → FAIL_TO_PASS + PASS_TO_PASS → resolved).
+ *
+ * No cheating by construction: the agent never sees the hidden tests or the gold patch (the adapter's
+ * prompt is the issue only); `edit_file` refuses test files; the score is a real test run, not a judge.
+ *
+ * CONTAMINATION CAVEAT: SWE-bench bugs are public GitHub fixes a frontier model may have MEMORIZED.
+ * A clean train→holdout split (disjoint instances) rules out adaptive-reuse, but NOT training-data
+ * memorization. Always report this; never claim a "clean" frontier number from this arena alone.
+ */
+import { execFile } from 'node:child_process'
+import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { promisify } from 'node:util'
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchAdapter } from './benchmarks/swe-bench'
+import type { BenchTask } from './benchmarks/types'
+
+const exec = promisify(execFile)
+/** True for paths `edit_file` must refuse: anything under a `test/` or `tests/` directory, a `.py` path with a segment starting `test_`, `*_test.py`, and `conftest.py`. Name patterns are anchored at a path-segment start so source files such as `latest_x.py` or `contest_x.py` stay editable. */
+const isTestPath = (p: string) => /(^|\/)tests?\//.test(p) || /(^|\/)test_.*\.py$|_test\.py$|(^|\/)conftest\.py$/.test(p)
+type Ws = {
+  dir: string // temp checkout directory; `open` also uses it as the handle id
+  task: BenchTask // the benchmark task this checkout was opened for
+}
+const workspaces: Map<string, Ws> = new Map() // live workspaces, keyed by handle id
+
+/** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The
+ *  supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout
+ *  [trainN+off,…) never overlap. Verified is loaded once; instances carry their repo/base_commit. */
+export async function createSweBenchEnvironment(poolN = 80): Promise<{
+  environment: AgenticSurface
+  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
+  adapter: ReturnType<typeof createSweBenchAdapter>
+}> {
+  const adapter = createSweBenchAdapter()
+  const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
+  const byId = new Map(pool.map((t) => [t.id, t]))
+
+  const environment: AgenticSurface = {
+    name: 'swe-bench-verified',
+    // Materialize a workspace: clone the task's GitHub repo into a fresh temp dir, check out its base_commit, and register it under the handle id.
+    async open(task) {
+      const benchTask = byId.get(task.id)
+      if (benchTask === undefined) throw new Error(`swe-bench-env: unknown task ${task.id}`)
+      const meta = benchTask.metadata as Record<string, string>
+      const checkoutDir = mkdtempSync(join(tmpdir(), 'swe-'))
+      try {
+        await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${meta.repo}.git`, checkoutDir], { timeout: 420_000 })
+        await exec('git', ['-C', checkoutDir, 'checkout', '--quiet', meta.base_commit], { timeout: 300_000 })
+        workspaces.set(checkoutDir, { dir: checkoutDir, task: benchTask })
+        return { id: checkoutDir, surface: 'swe-bench-verified' } satisfies ArtifactHandle
+      } catch (error) {
+        rmSync(checkoutDir, { recursive: true, force: true }) // a failed clone/checkout must not leave the temp dir behind
+        throw error
+      }
+    },
+    async tools() { // The three tools offered to the agent (all arguments are strings); `call` implements each one by name.
+      return [
+        { type: 'function', function: { name: 'list_files', description: 'List source files under a repo subdirectory (recursive, bounded). "" = repo root.', parameters: { type: 'object', properties: { dir: { type: 'string' } }, required: ['dir'] } } },
+        { type: 'function', function: { name: 'read_file', description: 'Read a repo file by path.', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
+        { type: 'function', function: { name: 'edit_file', description: 'Surgical fix: replace the EXACT old_string (must occur once — copy whitespace precisely) with new_string in a SOURCE file. Minimal changes, never whole-file rewrites. Test files are rejected.', parameters: { type: 'object', properties: { path: { type: 'string' }, old_string: { type: 'string' }, new_string: { type: 'string' } }, required: ['path', 'old_string', 'new_string'] } } },
+      ] satisfies AgenticTool[] // type-checked against AgenticTool at compile time
+    },
+    // Run one tool call in the workspace behind `handle`; results and tool-level errors come back as strings for the model.
+    async call(handle, name, args) {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return 'ERROR: workspace closed'
+      // Path jail: absolute paths and anything containing `..` are refused (null); one leading `./` is stripped.
+      const safe = (p: string): string | null => (p.startsWith('/') || p.includes('..') ? null : p.replace(/^\.\//, ''))
+      if (name === 'list_files') {
+        const sub = safe(String(args.dir ?? '')) ?? '' // a rejected dir falls back to the repo root
+        const root = join(ws.dir, sub)
+        if (!existsSync(root)) return `(no such path: ${sub})`
+        const out: string[] = []
+        const walk = (d: string, depth: number) => {
+          if (depth > 2 || out.length > 240) return // bounded: three directory levels, output capped at 240 entries below
+          let entries: string[] = []
+          try {
+            entries = readdirSync(d)
+          } catch {
+            return
+          }
+          for (const e of entries) {
+            if (e.startsWith('.') || e === 'node_modules' || e === '__pycache__') continue
+            const p = join(d, e)
+            let isDir = false
+            try {
+              isDir = lstatSync(p).isDirectory() // lstat: a symlink to a directory is listed as a file and never descended
+            } catch {
+              continue
+            }
+            out.push(p.slice(ws.dir.length + 1) + (isDir ? '/' : '')) // repo-relative; directories get a trailing slash
+            if (isDir) walk(p, depth + 1)
+          }
+        }
+        walk(root, 0)
+        return out.slice(0, 240).join('\n') || '(empty)'
+      }
+      if (name === 'read_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        try {
+          const c = readFileSync(join(ws.dir, p), 'utf8')
+          return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c // cap what the model sees at 24k characters
+        } catch (e) {
+          return `(error: ${(e as Error).message})`
+        }
+      }
+      if (name === 'edit_file') {
+        const p = safe(String(args.path ?? ''))
+        if (!p) return 'ERROR: invalid path'
+        if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
+        const oldStr = String(args.old_string ?? '')
+        const newStr = String(args.new_string ?? '')
+        let content: string
+        try {
+          content = readFileSync(join(ws.dir, p), 'utf8')
+        } catch (e) {
+          return `(cannot read ${p}: ${(e as Error).message})`
+        }
+        if (!oldStr) return 'ERROR: old_string is empty.'
+        const count = content.split(oldStr).length - 1
+        if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
+        if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
+        // Replacer FUNCTION, not a string: a replacement string would expand `$$`, `$&`, `` $` `` and `$'` found in new_string and corrupt the edit.
+        writeFileSync(join(ws.dir, p), content.replace(oldStr, () => newStr))
+        return `edited ${p}: replaced 1 occurrence`
+      }
+      return `ERROR: unknown tool ${name}`
+    },
+    // Grade the workspace behind `handle`: its `git diff` is the candidate patch and the adapter's
+    // judge decides `resolved`. Each task is a single unit (`total: 1`):
+    //   passes  — 1 only when the judge reports the patch as resolved
+    //   errored — 1 when the workspace is gone or `git diff` / the judge threw; 0 otherwise
+    async score(_task, handle): Promise<SurfaceScore> {
+      const ws = workspaces.get(handle.id)
+      if (!ws) return { passes: 0, total: 1, errored: 1 }
+      try {
+        const { stdout: patch } = await exec('git', ['-C', ws.dir, 'diff'], { maxBuffer: 20_000_000, timeout: 60_000 })
+        if (!patch.trim()) return { passes: 0, total: 1, errored: 0 } // the agent changed nothing: a clean miss
+        const verdict = await adapter.judge(ws.task, patch)
+        return { passes: verdict.resolved ? 1 : 0, total: 1, errored: 0 }
+      } catch {
+        // A failed `git diff` is an environment error just like a failed judge run; it must not be
+        // scored as "the agent made no change".
+        return { passes: 0, total: 1, errored: 1 }
+      }
+    },
+    async close(handle) { // drop the workspace registered for `handle` and delete its checkout; unknown handles are a no-op
+      const dir = workspaces.get(handle.id)?.dir
+      if (dir === undefined) return
+      workspaces.delete(handle.id)
+      rmSync(dir, { recursive: true, force: true })
+    },
+  }
+
+  // Task supplier: the `n` pool entries starting at dataset `offset`, wrapped as AgenticTasks.
+  // Throws when the pool cannot supply `n` tasks from `offset`, so a short slice is never returned.
+  const tasks = async (offset: number, n: number): Promise<AgenticTask[]> => {
+    const picked = pool.slice(offset, offset + n)
+    if (picked.length < n) throw new Error(`swe-bench-env: pool exhausted at offset ${offset} (need ${n}, have ${picked.length}; raise poolN)`)
+    // Every task shares this system prompt; only the user prompt (the adapter's per-task prompt) differs.
+    const systemPrompt = [
+      'You are a senior engineer fixing a real bug in the checked-out repository. Work PERSISTENTLY and do not ',
+      'stop early: use list_files + read_file to explore BROADLY (read many candidate files — the bug is rarely in ',
+      'the first file you open), trace the issue to its root cause, then fix it with edit_file. You MUST make at ',
+      'least one edit_file call — never finish with prose alone or without attempting a fix. Make a MINIMAL surgical ',
+      'change (a few lines, like a real PR), source only (test files are rejected). If an edit_file fails (old_string ',
+      'not unique/found), read the file again and retry with exact text. Keep going until you have made your best fix.',
+    ].join('')
+    return picked.map((bt) => ({ id: bt.id, systemPrompt, userPrompt: bt.prompt, meta: { instanceId: bt.id } }))
+  }
+
+  return { environment, tasks, adapter }
+}
diff --git a/bench/swe-self-improve.mts b/bench/swe-self-improve.mts
@@ -0,0 +1,84 @@
+/**
+ * SWE-bench self-improvement — the PROPER, no-cheating run: a frontier worker over the SWE-bench
+ * `Environment`, with `runStrategyEvolution` enforcing the train→freeze→holdout split (the substrate
+ * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
+ * applies (public fixes may be memorized) — reported, never claimed clean.
+ *
+ *   CALIBRATE first (cost gate):  TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
+ *   Full run:                     TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
+ */
+import { mkdtempSync, rmSync } from 'node:fs'
+import { join } from 'node:path'
+import { createChatClient } from '@tangle-network/agent-eval'
+import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
+import { createSweBenchEnvironment } from './src/swe-bench-env'
+
+/** Entry point: read the env knobs, build the SWE-bench environment, then run either the CALIBRATE=1 baseline pass or the full strategy evolution and print its verdict. */
+async function main(): Promise<void> {
+  const routerKey = process.env.TANGLE_API_KEY
+  if (!routerKey) throw new Error('TANGLE_API_KEY required (worker + author call the router)')
+  // Integer knob: unset/blank → fallback; anything else must be a non-negative integer, so a typo fails here instead of reaching the run as NaN.
+  const envInt = (name: string, fallback: number): number => {
+    const value = Number(process.env[name]?.trim() || fallback)
+    if (Number.isInteger(value) && value >= 0) return value
+    throw new Error(`${name} must be a non-negative integer (got ${JSON.stringify(process.env[name])})`)
+  }
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const workerModel = process.env.WORKER_MODEL ?? 'gemini-2.5-pro'
+  const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro'
+  const innerTurns = envInt('INNER_TURNS', 40)
+  const { environment, tasks } = await createSweBenchEnvironment(envInt('POOL_N', 80))
+  // CALIBRATE=1: cost gate — one baseline (refine) pass over the first N tasks, then stop.
+  if (process.env.CALIBRATE === '1') {
+    const n = envInt('N', 3)
+    const ts = await tasks(0, n)
+    console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`)
+    let resolved = 0
+    for (const t of ts) {
+      const t0 = Date.now()
+      const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns, budget: 1 })
+      if (r.resolved) resolved++
+      console.log(`  ${t.id.padEnd(32)} resolved=${r.resolved} completions=${r.completions} shots=${r.shots} (${Math.round((Date.now() - t0) / 1000)}s)`)
+    }
+    const band = resolved > 0 && resolved < n
+    console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`)
+    return
+  }
+  // Full run: artifacts go to a scratch dir under the cwd, removed once the run settles (success or failure).
+  const outDir = mkdtempSync(join(process.cwd(), '.swe-run-'))
+  let report: Awaited<ReturnType<typeof runStrategyEvolution>>
+  try {
+    report = await runStrategyEvolution({
+      environment, tasks,
+      trainN: envInt('TRAIN_N', 6), holdoutN: envInt('HOLDOUT_N', 8),
+      worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns },
+      author: {
+        chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
+        model: authorModel,
+        maxTokens: 8000,
+        fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
+      },
+      baselines: [sample, refine],
+      budget: envInt('BUDGET', 2),
+      generations: envInt('GENERATIONS', 2),
+      populationSize: envInt('POP', 2),
+      outDir,
+    })
+  } finally {
+    rmSync(outDir, { recursive: true, force: true })
+  }
+
+  const v = report.verdict
+  console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══')
+  console.log(`worker=${workerModel}  author=${authorModel}`)
+  console.log(`gen0 champion:   ${report.gen0Champion.name}`)
+  console.log(`final champion:  ${report.finalChampion.name}`)
+  console.log(`PROMOTED:        ${v.promoted}  (${v.reason})`)
+  console.log(`held-out lift:   mean ${v.lift.mean.toFixed(3)}  95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}]  n=${v.n}`)
+  console.log(v.promoted ? '\n>>> The search taught the agent a strategy that resolves MORE real bugs it never trained on, beyond luck. (Report the contamination caveat: public fixes may be memorized.)' : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).')
+}
+
+main().catch((failure: unknown) => {
+  console.error(failure instanceof Error ? (failure.stack ?? failure.message) : String(failure)) // stack if present, else message, else the raw value
+  process.exit(1) // non-zero exit so shells and CI see the failure
+})