Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions bench/src/swe-bench-env.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import { mkdtempSync, realpathSync, rmSync, symlinkSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { afterAll, describe, expect, it } from 'vitest'
import { isInsideJail, isTestPath, jailPath } from './swe-bench-env'

describe('isTestPath', () => {
  it('flags test directories and test-named python files', () => {
    // Directory rule (`test/`, `tests/`) first, then the filename rules (test_*, *_test, conftest).
    const flagged = [
      'tests/test_models.py',
      'pkg/test/helpers.py',
      'pkg/tests/helpers.py',
      'test_models.py',
      'models_test.py',
      'conftest.py',
      'pkg/conftest.py',
    ]
    for (const candidate of flagged) {
      expect(isTestPath(candidate)).toBe(true)
    }
  })

  it('does not flag ordinary source files', () => {
    const ordinary = [
      'src/foo.py',
      'pkg/models.py',
      // `testing.py` is not a test file by the test_/_test/conftest rules.
      'pkg/testing.py',
      // A `latest/` segment must not trip the `tests?/` directory rule.
      'latest/foo.py',
    ]
    for (const candidate of ordinary) {
      expect(isTestPath(candidate)).toBe(false)
    }
  })
})

describe('jailPath', () => {
  const root = '/work/repo'

  it('rejects `..` traversal and absolute paths', () => {
    // Each of these must be refused outright (null), never cleaned up and passed through.
    for (const hostile of ['../x', 'a/../../etc/passwd', '/etc/passwd']) {
      expect(jailPath(root, hostile)).toBeNull()
    }
  })

  it('accepts in-repo relative paths and strips a leading `./`', () => {
    // [agent-supplied path, expected cleaned relative path]
    const accepted: Array<[string, string]> = [
      ['src/a.py', 'src/a.py'],
      ['./a.py', 'a.py'],
      ['a.py', 'a.py'],
    ]
    for (const [given, cleaned] of accepted) {
      expect(jailPath(root, given)).toBe(cleaned)
    }
  })
})

describe('isInsideJail (realpath containment)', () => {
  // Mirror the `resolveInJail` closure in `call()`: realpath-resolve a workspace-relative path, then
  // assert containment. Offline — operates on throwaway temp dirs, no git clone, no network.
  const dir = mkdtempSync(join(tmpdir(), 'swe-jail-'))
  const jailRoot = realpathSync(dir)
  // A second temp dir plays the role of "somewhere outside the workspace". Using a directory we
  // create ourselves (instead of a system path such as /etc) keeps the test portable: on macOS
  // /etc is itself a symlink to /private/etc, so a hard-coded expected realpath would not match.
  const outside = mkdtempSync(join(tmpdir(), 'swe-outside-'))
  afterAll(() => {
    rmSync(dir, { recursive: true, force: true })
    rmSync(outside, { recursive: true, force: true })
  })

  it('admits a real file inside the jail', () => {
    const inside = join(dir, 'a.py')
    writeFileSync(inside, 'x = 1\n')
    expect(isInsideJail(jailRoot, realpathSync(inside))).toBe(true)
    expect(isInsideJail(jailRoot, jailRoot)).toBe(true)
  })

  it('rejects reading through a symlink that escapes the jail', () => {
    // A repo could ship `escape -> <dir outside the workspace>`; following it must not let the
    // agent read files that live outside the jail.
    const secret = join(outside, 'secret.txt')
    writeFileSync(secret, 'top secret\n')
    const link = join(dir, 'escape')
    symlinkSync(outside, link)
    // `resolveInJail` does `realpathSync(join(ws.dir, relPath))` then this containment check.
    const real = realpathSync(join(dir, 'escape', 'secret.txt'))
    // Sanity check: the symlink really was followed to the outside file.
    expect(real).toBe(realpathSync(secret))
    expect(isInsideJail(jailRoot, real)).toBe(false)
  })

  it('rejects a sibling dir that shares the jail-root prefix', () => {
    expect(isInsideJail('/tmp/swe-x', '/tmp/swe-x-evil/secret')).toBe(false)
  })
})
63 changes: 53 additions & 10 deletions bench/src/swe-bench-env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,40 @@
* memorization. Always report this; never claim a "clean" frontier number from this arena alone.
*/
import { execFile } from 'node:child_process'
import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, realpathSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { join, sep } from 'node:path'
import { promisify } from 'node:util'
import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
import { createSweBenchAdapter } from './benchmarks/swe-bench'
import type { BenchTask } from './benchmarks/types'

const exec = promisify(execFile)
const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /test_.*\.py$|_test\.py$|conftest\.py$/.test(p)
/**
 * True when the repo-relative, `/`-separated path `p` looks like a Python test artifact:
 *  - it passes through a directory named `test` or `tests`, or
 *  - some path segment starts with `test_` and the path ends in `.py`, or
 *  - the path ends in `_test.py`, or
 *  - the file is named exactly `conftest.py`.
 *
 * Every rule is anchored at a path-segment boundary (`^` or `/`), so look-alikes such as
 * `latest_models.py`, `contest_data/loader.py` or `myconftest.py` are NOT treated as tests.
 */
export const isTestPath = (p: string): boolean =>
  /(^|\/)tests?\//.test(p) || /(^|\/)test_.*\.py$|_test\.py$|(^|\/)conftest\.py$/.test(p)

/**
 * Cheap string pre-filter for an agent-supplied repo-relative path, applied before the path is
 * joined to a workspace root.
 *
 * Refuses (returns `null`) when the path starts with `/` or when any path segment is exactly
 * `..`. Segments are split on both `/` and `\`, so `..\x` is refused as well. Names that merely
 * contain two dots (`a..b.py`) are not traversal and are accepted. Otherwise returns the path
 * with one leading `./` stripped. Pure and side-effect-free.
 *
 * @param _root Unused; taken so call sites read symmetrically with the realpath check. This
 *   filter does not follow symlinks — that boundary is the realpath jail, not this function.
 * @param p Agent-supplied path, expected to be relative to the workspace root.
 * @returns The cleaned relative path, or `null` if it must be refused.
 */
export const jailPath = (_root: string, p: string): string | null => {
  if (p.startsWith('/')) return null
  // Segment-wise check: only a whole `..` component can climb out of the root.
  if (p.split(/[\\/]/).includes('..')) return null
  return p.replace(/^\.\//, '')
}

/**
 * Containment predicate for the realpath jail: true iff `real` (an already-resolved absolute path)
 * is `jailRoot` itself or lies strictly inside it. Pure and side-effect-free.
 *
 * The comparison is made against `root + sep`, which stops a sibling like `/tmp/swe-x-evil` from
 * matching the root `/tmp/swe-x`. A single trailing separator on `jailRoot` is tolerated
 * (`/tmp/swe-x/` behaves like `/tmp/swe-x`), and a root that is only a separator (the filesystem
 * root) contains every path beneath it instead of producing an unmatchable doubled separator.
 *
 * @param jailRoot Absolute path of the jail root.
 * @param real Absolute, already-resolved path to test; this function does no filesystem access.
 * @returns Whether `real` is the root or a descendant of it.
 */
export const isInsideJail = (jailRoot: string, real: string): boolean => {
  // Drop one trailing separator unless the root is nothing but the separator itself.
  const root = jailRoot.length > sep.length && jailRoot.endsWith(sep) ? jailRoot.slice(0, -sep.length) : jailRoot
  if (real === root) return true
  return real.startsWith(root.endsWith(sep) ? root : root + sep)
}

/** One open workspace, stored in the `workspaces` map and looked up by artifact handle id. */
interface Ws {
/** Workspace root directory: tool paths are joined onto it, and its realpath is the jail root. */
dir: string
/** Benchmark task associated with this workspace — presumably the task it was opened for; confirm at the call site that populates the map. */
task: BenchTask
}
const workspaces = new Map<string, Ws>()

/** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The
* supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout
Expand All @@ -41,6 +59,8 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
const adapter = createSweBenchAdapter()
const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
const byId = new Map(pool.map((t) => [t.id, t]))
// Each environment owns its workspace registry so concurrent environments don't share state.
const workspaces = new Map<string, Ws>()

const environment: AgenticSurface = {
name: 'swe-bench-verified',
Expand Down Expand Up @@ -70,9 +90,18 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
async call(handle, name, args) {
const ws = workspaces.get(handle.id)
if (!ws) return 'ERROR: workspace closed'
const safe = (p: string): string | null => {
if (p.startsWith('/') || p.includes('..')) return null
return p.replace(/^\.\//, '')
// Cheap pre-filter: reject absolute paths and `..` traversal, strip a leading `./`. The real
// boundary is the realpath jail check below (resolveInJail) — `safe` only normalizes the string
// form. `ws.dir` is passed for signature symmetry; the filter itself is root-independent.
const safe = (p: string): string | null => jailPath(ws.dir, p)
// Resolve `relPath` to an absolute path and assert it stays inside the workspace AFTER following
// symlinks (a repo symlink targeting /etc/passwd would otherwise escape the string-only jail).
// The target must exist (both callers read it first); a missing path throws and the caller
// surfaces the error message, matching the previous read-then-fail behavior.
const jailRoot = realpathSync(ws.dir)
const resolveInJail = (relPath: string): string | null => {
const real = realpathSync(join(ws.dir, relPath))
return isInsideJail(jailRoot, real) ? real : null
}
if (name === 'list_files') {
const sub = safe(String(args.dir ?? '')) ?? ''
Expand Down Expand Up @@ -106,8 +135,15 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
if (name === 'read_file') {
const p = safe(String(args.path ?? ''))
if (!p) return 'ERROR: invalid path'
let real: string | null
try {
const c = readFileSync(join(ws.dir, p), 'utf8')
real = resolveInJail(p)
} catch (e) {
return `(error: ${(e as Error).message})`
}
if (!real) return `ERROR: path ${p} escapes the workspace`
try {
const c = readFileSync(real, 'utf8')
return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c
} catch (e) {
return `(error: ${(e as Error).message})`
Expand All @@ -119,17 +155,24 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
const oldStr = String(args.old_string ?? '')
const newStr = String(args.new_string ?? '')
let real: string | null
try {
real = resolveInJail(p)
} catch (e) {
return `(cannot read ${p}: ${(e as Error).message})`
}
if (!real) return `ERROR: path ${p} escapes the workspace`
let content: string
try {
content = readFileSync(join(ws.dir, p), 'utf8')
content = readFileSync(real, 'utf8')
} catch (e) {
return `(cannot read ${p}: ${(e as Error).message})`
}
if (!oldStr) return 'ERROR: old_string is empty.'
const count = content.split(oldStr).length - 1
if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
writeFileSync(join(ws.dir, p), content.replace(oldStr, newStr))
writeFileSync(real, content.replace(oldStr, newStr))
return `edited ${p}: replaced 1 occurrence`
}
return `ERROR: unknown tool ${name}`
Expand Down
6 changes: 3 additions & 3 deletions bench/swe-self-improve.mts → bench/src/swe-self-improve.mts
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
* draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
* applies (public fixes may be memorized) — reported, never claimed clean.
*
* CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
* Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
* CALIBRATE first (cost gate): TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/src/swe-self-improve.mts
* Full run: TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/src/swe-self-improve.mts
*/
import { mkdtempSync, rmSync } from 'node:fs'
import { join } from 'node:path'
import { createChatClient } from '@tangle-network/agent-eval'
import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
import { createSweBenchEnvironment } from './src/swe-bench-env'
import { createSweBenchEnvironment } from './swe-bench-env'

async function main(): Promise<void> {
const routerKey = process.env.TANGLE_API_KEY
Expand Down
68 changes: 14 additions & 54 deletions docs/api/mcp.md
Original file line number Diff line number Diff line change
Expand Up @@ -3956,40 +3956,6 @@ Defined in: [mcp/tools/coordination.ts:56](https://github.com/tangle-network/age

***

### AnalystRegistry

Defined in: [mcp/tools/coordination.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L62)

#### Properties

##### kinds

> `readonly` **kinds**: readonly `object`[]

Defined in: [mcp/tools/coordination.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L63)

##### run

> `readonly` **run**: (`kindId`, `trace`) => `Promise`\<`unknown`\>

Defined in: [mcp/tools/coordination.ts:64](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L64)

###### Parameters

###### kindId

`string`

###### trace

`unknown`

###### Returns

`Promise`\<`unknown`\>

***

### CoordinationToolsOptions

Defined in: [mcp/tools/coordination.ts:94](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L94)
Expand All @@ -4010,7 +3976,7 @@ Defined in: [mcp/tools/coordination.ts:96](https://github.com/tangle-network/age

##### makeWorkerAgent

> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](#makeworkeragent)
> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](runtime.md#makeworkeragent)

Defined in: [mcp/tools/coordination.ts:97](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L97)

Expand All @@ -4022,7 +3988,7 @@ Defined in: [mcp/tools/coordination.ts:98](https://github.com/tangle-network/age

##### analysts?

> `readonly` `optional` **analysts?**: [`AnalystRegistry`](#analystregistry)
> `readonly` `optional` **analysts?**: [`AnalystRegistry`](runtime.md#analystregistry)

Defined in: [mcp/tools/coordination.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L99)

Expand Down Expand Up @@ -5812,24 +5778,6 @@ Defined in: [mcp/tools/coordination.ts:60](https://github.com/tangle-network/age

***

### MakeWorkerAgent

> **MakeWorkerAgent** = (`profile`) => [`Agent`](runtime.md#agent)\<`unknown`, `unknown`\>

Defined in: [mcp/tools/coordination.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L92)

#### Parameters

##### profile

`unknown`

#### Returns

[`Agent`](runtime.md#agent)\<`unknown`, `unknown`\>

***

### DelegateResult

> **DelegateResult** = \{ `status`: `"winner"`; `out`: `unknown`; `outRef`: `string`; `spentTotal`: [`Spend`](runtime.md#spend); \} \| \{ `status`: `"no-winner"`; `reason`: `string`; `spentTotal`: [`Spend`](runtime.md#spend); \}
Expand Down Expand Up @@ -7748,6 +7696,18 @@ Re-exports [mcpToolsForRuntimeMcpSubset](index.md#mcptoolsforruntimemcpsubset)

***

### AnalystRegistry

Re-exports [AnalystRegistry](runtime.md#analystregistry)

***

### CoordinationEvent

Re-exports [CoordinationEvent](runtime.md#coordinationevent)

***

### MakeWorkerAgent

Re-exports [MakeWorkerAgent](runtime.md#makeworkeragent)
4 changes: 3 additions & 1 deletion docs/api/primitive-catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ Import from `@tangle-network/agent-runtime/intelligence` — 60 exports.

### Recursive atom + loop kernel (alias of ./runtime)

Import from `@tangle-network/agent-runtime/loops` — 381 exports.
Import from `@tangle-network/agent-runtime/loops` — 383 exports.

| Symbol | Kind | Summary |
|---|---|---|
Expand Down Expand Up @@ -487,6 +487,7 @@ Import from `@tangle-network/agent-runtime/loops` — 381 exports.
| `AgentTurnInput` | interface | _(no summary — add a TSDoc line at the declaration)_ |
| `AgentTurnResult` | interface | _(no summary — add a TSDoc line at the declaration)_ |
| `AnalystFinding` | interface | Unified envelope every analyst emits. Schema-versioned so renderers |
| `AnalystRegistry` | interface | _(no summary — add a TSDoc line at the declaration)_ |
| `AnytimeReport` | interface | _(no summary — add a TSDoc line at the declaration)_ |
| `AnytimeStrategySummary` | interface | _(no summary — add a TSDoc line at the declaration)_ |
| `AnytimeTaskCurve` | interface | anytimeReport — time-to-satisfactory-output metrics, derived entirely from the |
Expand Down Expand Up @@ -702,6 +703,7 @@ Import from `@tangle-network/agent-runtime/loops` — 381 exports.
| `LoopShape` | type | A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it |
| `LoopTraceEvent` | type | _(no summary — add a TSDoc line at the declaration)_ |
| `LoopUntil` | type | `loopUntil(spec)` — build the iterative-deepening combinator. `seed` is the initial state. |
| `MakeWorkerAgent` | type | _(no summary — add a TSDoc line at the declaration)_ |
| `MountRecorder` | type | Records a mounted resource into the run's provenance manifest. Passed to |
| `Outcome` | type | The terminal contract Drew wants: a loop returns a FINISHED deliverable, or the concrete |
| `Panel` | type | `panel(spec)` — build the M-judge write-only-merge combinator. |
Expand Down
Loading
Loading