Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ bench/scripts/__pycache__/
# local rollout-corpus scratch (raw jsonl, per work-line)
corpus/
test_repo/
.sic-run-*/
.swe-run-*/
181 changes: 181 additions & 0 deletions bench/src/swe-bench-env.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/**
* SWE-bench Verified as an `AgenticSurface` — the PROPER, no-cheating way to run a coding agent on real
* GitHub bugs through the substrate (`runAgentic`/`runBenchmark`/`runStrategyEvolution` drive the loop;
* we only provide tools + a deployable score). The agent clones the repo at base_commit, explores +
* edits SOURCE via tools (never tests — path-jailed), and `score()` grades the resulting `git diff`
* with the OFFICIAL swebench Docker harness (apply patch → FAIL_TO_PASS + PASS_TO_PASS → resolved).
*
* No cheating by construction: the agent never sees the hidden tests or the gold patch (the adapter's
* prompt is the issue only); `edit_file` refuses test files; the score is a real test run, not a judge.
*
* CONTAMINATION CAVEAT: SWE-bench bugs are public GitHub fixes a frontier model may have MEMORIZED.
* A clean train→holdout split (disjoint instances) rules out adaptive-reuse, but NOT training-data
* memorization. Always report this; never claim a "clean" frontier number from this arena alone.
*/
import { execFile } from 'node:child_process'
import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { promisify } from 'node:util'
import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
import { createSweBenchAdapter } from './benchmarks/swe-bench'
import type { BenchTask } from './benchmarks/types'

// Promise-returning form of `execFile`; this file awaits it to run `git` (clone, checkout, diff).
const exec = promisify(execFile)
/**
 * Decide whether a repo-relative, '/'-separated path names a test artifact that must not be edited.
 *
 * A path is treated as a test path when any of these hold:
 *  - it passes through a directory named `test` or `tests`;
 *  - some path segment starts with `test_` and the path ends in `.py`;
 *  - the file name ends in `_test.py`;
 *  - the file is named exactly `conftest.py`.
 *
 * The `test_` and `conftest.py` checks are anchored to the start of a path segment so that ordinary
 * source files whose names merely CONTAIN those substrings (e.g. `latest_utils.py`, `myconftest.py`)
 * are not rejected as tests.
 *
 * @param p - Repo-relative path using '/' separators.
 * @returns `true` when the path matches one of the test-path patterns above.
 */
const isTestPath = (p: string): boolean =>
  /(^|\/)tests?\//.test(p) || /(^|\/)test_.*\.py$|_test\.py$|(^|\/)conftest\.py$/.test(p)

/** One open workspace: a cloned repository directory paired with the benchmark task it was opened for. */
interface Ws {
// Path of the temporary clone directory; `open()` also uses this value as the artifact handle id.
dir: string
// The benchmark task this workspace belongs to; `score()` passes it to the adapter's judge.
task: BenchTask
}
// Registry of currently open workspaces, keyed by handle id. Entries are added in `open()` and
// removed in `close()`; `call()` and `score()` look workspaces up here.
const workspaces = new Map<string, Ws>()

/**
 * Build the SWE-bench environment plus a DISJOINT-slice task supplier.
 *
 * The adapter's task pool is loaded once (`limit: poolN`, `split: 'test'`) and indexed by task id.
 * `tasks(offset, n)` returns exactly the pool entries in `[offset, offset + n)`, so callers that ask
 * for non-overlapping offset ranges (e.g. a train slice and a later holdout slice) receive disjoint
 * instances. Each instance's metadata is read for `repo` and `base_commit` when a workspace is opened.
 *
 * @param poolN - Maximum number of tasks to load into the pool (default 80).
 * @returns `environment` (the `AgenticSurface`), `tasks` (the slice supplier) and the `adapter`.
 */
export async function createSweBenchEnvironment(poolN = 80): Promise<{
  environment: AgenticSurface
  tasks: (offset: number, n: number) => Promise<AgenticTask[]>
  adapter: ReturnType<typeof createSweBenchAdapter>
}> {
  const adapter = createSweBenchAdapter()
  const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
  const byId = new Map(pool.map((t) => [t.id, t]))

  const environment: AgenticSurface = {
    name: 'swe-bench-verified',

    /** Clone the task's repository into a fresh temp dir, check out `base_commit`, and register the workspace. */
    async open(task) {
      const bt = byId.get(task.id)
      if (!bt) throw new Error(`swe-bench-env: unknown task ${task.id}`)
      const md = bt.metadata as Record<string, string>
      const dir = mkdtempSync(join(tmpdir(), 'swe-'))
      try {
        await exec('git', ['clone', '--filter=blob:none', '--no-checkout', '--quiet', `https://github.com/${md.repo}.git`, dir], { timeout: 420_000 })
        await exec('git', ['-C', dir, 'checkout', '--quiet', md.base_commit], { timeout: 300_000 })
        const handle: ArtifactHandle = { id: dir, surface: 'swe-bench-verified' }
        workspaces.set(dir, { dir, task: bt })
        return handle
      } catch (error) {
        // Do not leave a half-cloned directory behind when clone/checkout fails.
        rmSync(dir, { recursive: true, force: true })
        throw error
      }
    },

    /** Describe the three tools exposed to the agent: `list_files`, `read_file`, `edit_file`. */
    async tools() {
      return [
        { type: 'function', function: { name: 'list_files', description: 'List source files under a repo subdirectory (recursive, bounded). "" = repo root.', parameters: { type: 'object', properties: { dir: { type: 'string' } }, required: ['dir'] } } },
        { type: 'function', function: { name: 'read_file', description: 'Read a repo file by path.', parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] } } },
        { type: 'function', function: { name: 'edit_file', description: 'Surgical fix: replace the EXACT old_string (must occur once — copy whitespace precisely) with new_string in a SOURCE file. Minimal changes, never whole-file rewrites. Test files are rejected.', parameters: { type: 'object', properties: { path: { type: 'string' }, old_string: { type: 'string' }, new_string: { type: 'string' } }, required: ['path', 'old_string', 'new_string'] } } },
      ] satisfies AgenticTool[]
    },

    /**
     * Execute one tool call against the workspace behind `handle`.
     * Failures are reported to the agent as strings (never thrown) so it can retry.
     */
    async call(handle, name, args) {
      const ws = workspaces.get(handle.id)
      if (!ws) return 'ERROR: workspace closed'

      // Path jail: reject absolute paths and anything containing '..'; strip one leading './'.
      const safe = (p: string): string | null => {
        if (p.startsWith('/') || p.includes('..')) return null
        return p.replace(/^\.\//, '')
      }

      if (name === 'list_files') {
        // A rejected `dir` falls back to the repo root.
        const sub = safe(String(args.dir ?? '')) ?? ''
        const root = join(ws.dir, sub)
        if (!existsSync(root)) return `(no such path: ${sub})`
        const out: string[] = []
        // Bounded recursive walk: stops descending past depth 2 and once more than 240 entries are collected.
        const walk = (d: string, depth: number) => {
          if (depth > 2 || out.length > 240) return
          let entries: string[] = []
          try {
            entries = readdirSync(d)
          } catch {
            return
          }
          for (const e of entries) {
            // Skip dot-entries and dependency/bytecode caches.
            if (e.startsWith('.') || e === 'node_modules' || e === '__pycache__') continue
            const p = join(d, e)
            let isDir = false
            try {
              // lstat (not stat): a symlink to a directory is listed as an entry but not descended into.
              isDir = lstatSync(p).isDirectory()
            } catch {
              continue
            }
            // Report paths relative to the workspace root; directories get a trailing '/'.
            out.push(p.slice(ws.dir.length + 1) + (isDir ? '/' : ''))
            if (isDir) walk(p, depth + 1)
          }
        }
        walk(root, 0)
        return out.slice(0, 240).join('\n') || '(empty)'
      }

      if (name === 'read_file') {
        const p = safe(String(args.path ?? ''))
        if (!p) return 'ERROR: invalid path'
        try {
          const c = readFileSync(join(ws.dir, p), 'utf8')
          // Cap the returned text at 24,000 characters.
          return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c
        } catch (e) {
          return `(error: ${(e as Error).message})`
        }
      }

      if (name === 'edit_file') {
        const p = safe(String(args.path ?? ''))
        if (!p) return 'ERROR: invalid path'
        if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
        const oldStr = String(args.old_string ?? '')
        const newStr = String(args.new_string ?? '')
        let content: string
        try {
          content = readFileSync(join(ws.dir, p), 'utf8')
        } catch (e) {
          return `(cannot read ${p}: ${(e as Error).message})`
        }
        if (!oldStr) return 'ERROR: old_string is empty.'
        const count = content.split(oldStr).length - 1
        if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
        if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
        // Exactly one occurrence. Use a replacer FUNCTION so new_string is inserted verbatim: with a
        // string replacement, sequences such as `$&`, `$$` or `$1` inside new_string would be treated
        // as substitution patterns and silently corrupt the written code.
        writeFileSync(join(ws.dir, p), content.replace(oldStr, () => newStr))
        return `edited ${p}: replaced 1 occurrence`
      }

      return `ERROR: unknown tool ${name}`
    },

    /**
     * Grade the workspace: take `git diff` of the working tree and hand it to the adapter's judge.
     * Returns one pass when the judge reports `resolved`; `errored: 1` marks infrastructure failures
     * (unknown workspace, diff could not be produced, judge threw) as opposed to a genuine miss.
     */
    async score(_task, handle): Promise<SurfaceScore> {
      const ws = workspaces.get(handle.id)
      if (!ws) return { passes: 0, total: 1, errored: 1 }
      let patch: string
      try {
        const r = await exec('git', ['-C', ws.dir, 'diff'], { maxBuffer: 20_000_000, timeout: 60_000 })
        patch = r.stdout
      } catch {
        // The diff itself failed (timeout, buffer overflow, git error): that is an infrastructure
        // error, not an unresolved attempt, so report it like a judge failure instead of a clean 0.
        return { passes: 0, total: 1, errored: 1 }
      }
      // No changes made: a legitimate, non-errored failure.
      if (!patch.trim()) return { passes: 0, total: 1, errored: 0 }
      try {
        const s = await adapter.judge(ws.task, patch)
        return { passes: s.resolved ? 1 : 0, total: 1, errored: 0 }
      } catch {
        return { passes: 0, total: 1, errored: 1 }
      }
    },

    /** Unregister the workspace and delete its directory; a no-op for unknown handles. */
    async close(handle) {
      const ws = workspaces.get(handle.id)
      if (!ws) return
      workspaces.delete(handle.id)
      rmSync(ws.dir, { recursive: true, force: true })
    },
  }

  /**
   * Return the pool tasks in `[offset, offset + n)` as `AgenticTask`s.
   * Throws when the arguments are not non-negative integers or the pool cannot supply `n` tasks.
   */
  const tasks = async (offset: number, n: number): Promise<AgenticTask[]> => {
    // Guard against NaN/negative/fractional inputs: `slice.length < NaN` is false, so without this
    // check a NaN count would slip past the exhaustion test below and silently yield no tasks.
    if (!Number.isInteger(offset) || !Number.isInteger(n) || offset < 0 || n < 0) {
      throw new Error(`swe-bench-env: invalid slice request (offset=${offset}, n=${n}); both must be non-negative integers`)
    }
    const slice = pool.slice(offset, offset + n)
    if (slice.length < n) throw new Error(`swe-bench-env: pool exhausted at offset ${offset} (need ${n}, have ${slice.length}; raise poolN)`)
    return slice.map((bt) => ({
      id: bt.id,
      systemPrompt:
        'You are a senior engineer fixing a real bug in the checked-out repository. Work PERSISTENTLY and do not ' +
        'stop early: use list_files + read_file to explore BROADLY (read many candidate files — the bug is rarely in ' +
        'the first file you open), trace the issue to its root cause, then fix it with edit_file. You MUST make at ' +
        'least one edit_file call — never finish with prose alone or without attempting a fix. Make a MINIMAL surgical ' +
        'change (a few lines, like a real PR), source only (test files are rejected). If an edit_file fails (old_string ' +
        'not unique/found), read the file again and retry with exact text. Keep going until you have made your best fix.',
      userPrompt: bt.prompt,
      meta: { instanceId: bt.id },
    }))
  }

  return { environment, tasks, adapter }
}
84 changes: 84 additions & 0 deletions bench/swe-self-improve.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/**
* SWE-bench self-improvement — the PROPER, no-cheating run: a frontier worker over the SWE-bench
* `Environment`, with `runStrategyEvolution` enforcing the train→freeze→holdout split (the substrate
* draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
* applies (public fixes may be memorized) — reported, never claimed clean.
*
* CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
* Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
*/
import { mkdtempSync, rmSync } from 'node:fs'
import { join } from 'node:path'
import { createChatClient } from '@tangle-network/agent-eval'
import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
import { createSweBenchEnvironment } from './src/swe-bench-env'

/**
 * Read a numeric setting from the environment, validating it instead of letting `NaN` or `0` leak
 * into the run. An unset or blank variable yields `fallback`.
 *
 * @param name - Environment variable name.
 * @param fallback - Value used when the variable is unset or blank.
 * @param opts - `integer` (default true) requires a whole number; `min` (default 1) is the smallest accepted value.
 * @returns The parsed value, or `fallback`.
 * @throws Error when the variable is set but is not a finite number satisfying the constraints.
 */
function envNumber(name: string, fallback: number, opts: { integer?: boolean; min?: number } = {}): number {
  const { integer = true, min = 1 } = opts
  const raw = process.env[name]
  if (raw === undefined || raw.trim() === '') return fallback
  const value = Number(raw)
  if (!Number.isFinite(value) || (integer && !Number.isInteger(value)) || value < min) {
    throw new Error(`${name} must be ${integer ? 'an integer' : 'a number'} >= ${min} (got ${JSON.stringify(raw)})`)
  }
  return value
}

/**
 * Entry point. With `CALIBRATE=1`, runs the `refine` baseline over the first `N` tasks and prints a
 * resolved count with a proceed/inspect hint. Otherwise runs `runStrategyEvolution` (train → holdout)
 * and prints the verdict. Requires `TANGLE_API_KEY`; all other settings have defaults.
 *
 * @throws Error when `TANGLE_API_KEY` is missing or a numeric environment variable is invalid.
 */
async function main(): Promise<void> {
  const routerKey = process.env.TANGLE_API_KEY
  if (!routerKey) throw new Error('TANGLE_API_KEY required (worker + author call the router)')
  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
  const workerModel = process.env.WORKER_MODEL ?? 'gemini-2.5-pro'
  const authorModel = process.env.AUTHOR_MODEL ?? 'gemini-2.5-pro'
  const innerTurns = envNumber('INNER_TURNS', 40)
  const { environment, tasks } = await createSweBenchEnvironment(envNumber('POOL_N', 80))

  if (process.env.CALIBRATE === '1') {
    const n = envNumber('N', 3)
    const ts = await tasks(0, n)
    console.log(`═══ SWE-bench CALIBRATION — ${workerModel}, baseline=refine, ${n} real bugs ═══`)
    let resolved = 0
    // Tasks run one at a time so each line reports that task's own wall-clock time.
    for (const t of ts) {
      const t0 = Date.now()
      const r = await runAgentic({ surface: environment, task: t, strategy: refine, routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns, budget: 1 })
      if (r.resolved) resolved++
      console.log(` ${t.id.padEnd(32)} resolved=${r.resolved} completions=${r.completions} shots=${r.shots} (${Math.round((Date.now() - t0) / 1000)}s)`)
    }
    // "Headroom" means the baseline solved some but not all tasks.
    const band = resolved > 0 && resolved < n
    console.log(`\n>>> baseline resolved ${resolved}/${n}. ${band ? 'HEADROOM — the loop has room to improve. PROCEED.' : resolved === 0 ? 'TOO HARD / env issue — inspect before the loop.' : 'saturated at this small n — raise N.'}`)
    return
  }

  const report = await (async () => {
    // Scratch directory for the run, created under the current working directory.
    const outDir = mkdtempSync(join(process.cwd(), '.swe-run-'))
    try {
      return await runStrategyEvolution({
        environment,
        tasks,
        trainN: envNumber('TRAIN_N', 6),
        holdoutN: envNumber('HOLDOUT_N', 8),
        worker: { routerBaseUrl, routerKey, model: workerModel, maxTokens: 8000, innerTurns },
        author: {
          chat: createChatClient({ transport: 'router', baseUrl: routerBaseUrl, apiKey: routerKey, defaultModel: authorModel }),
          model: authorModel,
          maxTokens: 8000,
          fallbackModel: process.env.AUTHOR_FALLBACK ?? 'deepseek-v4-flash',
        },
        baselines: [sample, refine],
        // NOTE(review): budget's unit is not visible here, so only "finite and non-negative" is enforced — confirm.
        budget: envNumber('BUDGET', 2, { integer: false, min: 0 }),
        // NOTE(review): 0 generations is allowed through to the library — confirm it is meaningful there.
        generations: envNumber('GENERATIONS', 2, { min: 0 }),
        populationSize: envNumber('POP', 2),
        outDir,
      })
    } finally {
      // The scratch directory is removed whether the run succeeds or throws.
      rmSync(outDir, { recursive: true, force: true })
    }
  })()

  const v = report.verdict
  console.log('\n═══ SWE-bench SELF-IMPROVEMENT — certified on a FROZEN holdout (CONTAMINATION-flagged) ═══')
  console.log(`worker=${workerModel} author=${authorModel}`)
  console.log(`gen0 champion: ${report.gen0Champion.name}`)
  console.log(`final champion: ${report.finalChampion.name}`)
  console.log(`PROMOTED: ${v.promoted} (${v.reason})`)
  console.log(`held-out lift: mean ${v.lift.mean.toFixed(3)} 95% CI [${v.lift.low.toFixed(3)}, ${v.lift.high.toFixed(3)}] n=${v.n}`)
  console.log(
    v.promoted
      ? '\n>>> The search taught the agent a strategy that resolves MORE real bugs it never trained on, beyond luck. (Report the contamination caveat: public fixes may be memorized.)'
      : '\n>>> No promotion: the evolved strategy did not beat gen0 on the fresh holdout beyond noise (honest null).',
  )
}

// Script entry: run main; on failure print the stack (or message, or stringified value) and exit with code 1.
main().catch((err) => {
  let detail: string
  if (err instanceof Error) {
    detail = err.stack ?? err.message
  } else {
    detail = String(err)
  }
  console.error(detail)
  process.exit(1)
})
Loading
Loading