tangle-network · drewstone · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/bench/src/swe-bench-env.test.ts b/bench/src/swe-bench-env.test.ts
@@ -0,0 +1,71 @@
+import { mkdtempSync, realpathSync, rmSync, symlinkSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { afterAll, describe, expect, it } from 'vitest'
+import { isInsideJail, isTestPath, jailPath } from './swe-bench-env'
+
+describe('isTestPath', () => {
+  it('flags test directories and test-named python files', () => {
+    expect(isTestPath('tests/test_models.py')).toBe(true)
+    expect(isTestPath('pkg/test/helpers.py')).toBe(true)
+    expect(isTestPath('pkg/tests/helpers.py')).toBe(true)
+    expect(isTestPath('test_models.py')).toBe(true)
+    expect(isTestPath('models_test.py')).toBe(true)
+    expect(isTestPath('conftest.py')).toBe(true)
+    expect(isTestPath('pkg/conftest.py')).toBe(true)
+  })
+
+  it('does not flag ordinary source files', () => {
+    expect(isTestPath('src/foo.py')).toBe(false)
+    expect(isTestPath('pkg/models.py')).toBe(false)
+    // `testing.py` is not a test file by the test_/_test/conftest rules.
+    expect(isTestPath('pkg/testing.py')).toBe(false)
+    // A `latest/` segment must not trip the `tests?/` directory rule.
+    expect(isTestPath('latest/foo.py')).toBe(false)
+  })
+})
+
+describe('jailPath', () => {
+  const root = '/work/repo'
+
+  it('rejects `..` traversal and absolute paths', () => {
+    expect(jailPath(root, '../x')).toBeNull()
+    expect(jailPath(root, 'a/../../etc/passwd')).toBeNull()
+    expect(jailPath(root, '/etc/passwd')).toBeNull()
+  })
+
+  it('accepts in-repo relative paths and strips a leading `./`', () => {
+    expect(jailPath(root, 'src/a.py')).toBe('src/a.py')
+    expect(jailPath(root, './a.py')).toBe('a.py')
+    expect(jailPath(root, 'a.py')).toBe('a.py')
+  })
+})
+
+describe('isInsideJail (realpath containment)', () => {
+  // Mirror the `resolveInJail` closure in `call()`: realpath-resolve a workspace-relative path, then
+  // assert containment. Offline — operates on a throwaway temp dir, no git clone, no network.
+  const dir = mkdtempSync(join(tmpdir(), 'swe-jail-'))
+  const jailRoot = realpathSync(dir)
+  afterAll(() => rmSync(dir, { recursive: true, force: true }))
+
+  it('admits a real file inside the jail', () => {
+    const inside = join(dir, 'a.py')
+    writeFileSync(inside, 'x = 1\n')
+    expect(isInsideJail(jailRoot, realpathSync(inside))).toBe(true)
+    expect(isInsideJail(jailRoot, jailRoot)).toBe(true)
+  })
+
+  it('rejects reading through a symlink that escapes the jail', () => {
+    // A repo could ship `escape -> /etc`; following it must not let the agent read /etc/passwd.
+    const link = join(dir, 'escape')
+    symlinkSync('/etc', link)
+    // Same steps as `resolveInJail`; compare via realpath because `/etc` may itself be a symlink (macOS).
+    const real = realpathSync(join(dir, 'escape/passwd'))
+    expect(real).toBe(realpathSync('/etc/passwd'))
+    expect(isInsideJail(jailRoot, real)).toBe(false)
+  })
+
+  it('rejects a sibling dir that shares the jail-root prefix', () => {
+    expect(isInsideJail('/tmp/swe-x', '/tmp/swe-x-evil/secret')).toBe(false)
+  })
+})
diff --git a/bench/src/swe-bench-env.ts b/bench/src/swe-bench-env.ts
@@ -13,22 +13,40 @@
  * memorization. Always report this; never claim a "clean" frontier number from this arena alone.
  */
 import { execFile } from 'node:child_process'
-import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { existsSync, lstatSync, mkdtempSync, readdirSync, readFileSync, realpathSync, rmSync, writeFileSync } from 'node:fs'
 import { tmpdir } from 'node:os'
-import { join } from 'node:path'
+import { join, sep } from 'node:path'
 import { promisify } from 'node:util'
 import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
 import { createSweBenchAdapter } from './benchmarks/swe-bench'
 import type { BenchTask } from './benchmarks/types'
 
 const exec = promisify(execFile)
-const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /test_.*\.py$|_test\.py$|conftest\.py$/.test(p)
+export const isTestPath = (p: string) => /(^|\/)(tests?)\//.test(p) || /test_.*\.py$|_test\.py$|conftest\.py$/.test(p) // name patterns are unanchored: `latest_x.py` also matches
+
+/**
+ * Cheap string pre-filter for an agent-supplied repo-relative path, applied before the path is
+ * joined to a workspace root. Returns `null` for any path that starts with `/` or contains the
+ * substring `..` anywhere — broader than a per-segment check, so a name such as `a..b` is refused
+ * too. Otherwise returns the path with one leading `./` removed. Pure and side-effect-free.
+ * `_root` is unused: symlink escapes are caught by the realpath check, not by this filter.
+ */
+export const jailPath = (_root: string, p: string): string | null => {
+  if (p.startsWith('/') || p.includes('..')) return null
+  return p.replace(/^\.\//, '')
+}
+
+/**
+ * True iff `real` equals `jailRoot` or starts with `jailRoot + sep`; the separator stops a sibling
+ * like `/tmp/swe-x-evil` from matching `/tmp/swe-x`. Pure string comparison with no filesystem
+ * access: pass realpath-resolved paths, and a `jailRoot` without a trailing separator.
+ */
+export const isInsideJail = (jailRoot: string, real: string): boolean => real === jailRoot || real.startsWith(jailRoot + sep)
 
 interface Ws {
   dir: string
   task: BenchTask
 }
-const workspaces = new Map<string, Ws>()
 
 /** Build the SWE-bench Environment + a DISJOINT-slice task supplier over the Verified split. The
  *  supplier keys tasks by dataset offset so `runStrategyEvolution`'s train [0,trainN) and holdout
@@ -41,6 +59,8 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
   const adapter = createSweBenchAdapter()
   const pool = await adapter.loadTasks({ limit: poolN, split: 'test' })
   const byId = new Map(pool.map((t) => [t.id, t]))
+  // Each environment owns its workspace registry so concurrent environments don't share state.
+  const workspaces = new Map<string, Ws>()
 
   const environment: AgenticSurface = {
     name: 'swe-bench-verified',
@@ -70,9 +90,18 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
     async call(handle, name, args) {
       const ws = workspaces.get(handle.id)
       if (!ws) return 'ERROR: workspace closed'
-      const safe = (p: string): string | null => {
-        if (p.startsWith('/') || p.includes('..')) return null
-        return p.replace(/^\.\//, '')
+      // Cheap pre-filter: reject absolute paths and `..` traversal, strip a leading `./`. The real
+      // boundary is the realpath jail check below (resolveInJail) — `safe` only normalizes the string
+      // form. `ws.dir` is passed for signature symmetry; the filter itself is root-independent.
+      const safe = (p: string): string | null => jailPath(ws.dir, p)
+      // Resolve `relPath` to an absolute path and assert it stays inside the workspace AFTER following
+      // symlinks (a repo symlink targeting /etc/passwd would otherwise escape the string-only jail).
+      // The target must exist (both callers read it first); a missing path throws and the caller
+      // surfaces the error message, matching the previous read-then-fail behavior.
+      const jailRoot = realpathSync(ws.dir)
+      const resolveInJail = (relPath: string): string | null => {
+        const real = realpathSync(join(ws.dir, relPath))
+        return isInsideJail(jailRoot, real) ? real : null
       }
       if (name === 'list_files') {
         const sub = safe(String(args.dir ?? '')) ?? ''
@@ -106,8 +135,15 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
       if (name === 'read_file') {
         const p = safe(String(args.path ?? ''))
         if (!p) return 'ERROR: invalid path'
+        let real: string | null
         try {
-          const c = readFileSync(join(ws.dir, p), 'utf8')
+          real = resolveInJail(p)
+        } catch (e) {
+          return `(error: ${(e as Error).message})`
+        }
+        if (!real) return `ERROR: path ${p} escapes the workspace`
+        try {
+          const c = readFileSync(real, 'utf8')
           return c.length > 24_000 ? `${c.slice(0, 24_000)}\n...[truncated]` : c
         } catch (e) {
           return `(error: ${(e as Error).message})`
@@ -119,17 +155,24 @@ export async function createSweBenchEnvironment(poolN = 80): Promise<{
         if (isTestPath(p)) return 'REJECTED: editing test files is forbidden (the evaluation runs hidden tests).'
         const oldStr = String(args.old_string ?? '')
         const newStr = String(args.new_string ?? '')
+        let real: string | null
+        try {
+          real = resolveInJail(p)
+        } catch (e) {
+          return `(cannot read ${p}: ${(e as Error).message})`
+        }
+        if (!real) return `ERROR: path ${p} escapes the workspace`
         let content: string
         try {
-          content = readFileSync(join(ws.dir, p), 'utf8')
+          content = readFileSync(real, 'utf8')
         } catch (e) {
           return `(cannot read ${p}: ${(e as Error).message})`
         }
         if (!oldStr) return 'ERROR: old_string is empty.'
         const count = content.split(oldStr).length - 1
         if (count === 0) return `ERROR: old_string not found in ${p}. read_file it and copy EXACT text.`
         if (count > 1) return `ERROR: old_string appears ${count}× in ${p} — add surrounding context to make it unique.`
-        writeFileSync(join(ws.dir, p), content.replace(oldStr, newStr))
+        writeFileSync(real, content.replace(oldStr, newStr))
         return `edited ${p}: replaced 1 occurrence`
       }
       return `ERROR: unknown tool ${name}`

diff --git a/bench/swe-self-improve.mts → bench/src/swe-self-improve.mts b/bench/swe-self-improve.mts → bench/src/swe-self-improve.mts
@@ -4,14 +4,14 @@
  * draws a disjoint holdout slice and gates once — adaptive reuse is impossible). CONTAMINATION CAVEAT
  * applies (public fixes may be memorized) — reported, never claimed clean.
  *
- *   CALIBRATE first (cost gate):  TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/swe-self-improve.mts
- *   Full run:                     TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/swe-self-improve.mts
+ *   CALIBRATE first (cost gate):  TANGLE_API_KEY=… CALIBRATE=1 N=3 tsx bench/src/swe-self-improve.mts
+ *   Full run:                     TANGLE_API_KEY=… TRAIN_N=6 HOLDOUT_N=8 GENERATIONS=2 tsx bench/src/swe-self-improve.mts
  */
 import { mkdtempSync, rmSync } from 'node:fs'
 import { join } from 'node:path'
 import { createChatClient } from '@tangle-network/agent-eval'
 import { refine, runAgentic, runStrategyEvolution, sample } from '@tangle-network/agent-runtime/loops'
-import { createSweBenchEnvironment } from './src/swe-bench-env'
+import { createSweBenchEnvironment } from './swe-bench-env'
 
 async function main(): Promise<void> {
   const routerKey = process.env.TANGLE_API_KEY

diff --git a/docs/api/mcp.md b/docs/api/mcp.md
@@ -3956,40 +3956,6 @@ Defined in: [mcp/tools/coordination.ts:56](https://github.com/tangle-network/age
 
 ***
 
-### AnalystRegistry
-
-Defined in: [mcp/tools/coordination.ts:62](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L62)
-
-#### Properties
-
-##### kinds
-
-> `readonly` **kinds**: readonly `object`[]
-
-Defined in: [mcp/tools/coordination.ts:63](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L63)
-
-##### run
-
-> `readonly` **run**: (`kindId`, `trace`) => `Promise`\<`unknown`\>
-
-Defined in: [mcp/tools/coordination.ts:64](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L64)
-
-###### Parameters
-
-###### kindId
-
-`string`
-
-###### trace
-
-`unknown`
-
-###### Returns
-
-`Promise`\<`unknown`\>
-
-***
-
 ### CoordinationToolsOptions
 
 Defined in: [mcp/tools/coordination.ts:94](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L94)
@@ -4010,7 +3976,7 @@ Defined in: [mcp/tools/coordination.ts:96](https://github.com/tangle-network/age
 
 ##### makeWorkerAgent
 
-> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](#makeworkeragent)
+> `readonly` **makeWorkerAgent**: [`MakeWorkerAgent`](runtime.md#makeworkeragent)
 
 Defined in: [mcp/tools/coordination.ts:97](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L97)
 
@@ -4022,7 +3988,7 @@ Defined in: [mcp/tools/coordination.ts:98](https://github.com/tangle-network/age
 
 ##### analysts?
 
-> `readonly` `optional` **analysts?**: [`AnalystRegistry`](#analystregistry)
+> `readonly` `optional` **analysts?**: [`AnalystRegistry`](runtime.md#analystregistry)
 
 Defined in: [mcp/tools/coordination.ts:99](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L99)
 
@@ -5812,24 +5778,6 @@ Defined in: [mcp/tools/coordination.ts:60](https://github.com/tangle-network/age
 
 ***
 
-### MakeWorkerAgent
-
-> **MakeWorkerAgent** = (`profile`) => [`Agent`](runtime.md#agent)\<`unknown`, `unknown`\>
-
-Defined in: [mcp/tools/coordination.ts:92](https://github.com/tangle-network/agent-runtime/blob/main/src/mcp/tools/coordination.ts#L92)
-
-#### Parameters
-
-##### profile
-
-`unknown`
-
-#### Returns
-
-[`Agent`](runtime.md#agent)\<`unknown`, `unknown`\>
-
-***
-
 ### DelegateResult
 
 > **DelegateResult** = \{ `status`: `"winner"`; `out`: `unknown`; `outRef`: `string`; `spentTotal`: [`Spend`](runtime.md#spend); \} \| \{ `status`: `"no-winner"`; `reason`: `string`; `spentTotal`: [`Spend`](runtime.md#spend); \}
@@ -7748,6 +7696,18 @@ Re-exports [mcpToolsForRuntimeMcpSubset](index.md#mcptoolsforruntimemcpsubset)
 
 ***
 
+### AnalystRegistry
+
+Re-exports [AnalystRegistry](runtime.md#analystregistry)
+
+***
+
 ### CoordinationEvent
 
 Re-exports [CoordinationEvent](runtime.md#coordinationevent)
+
+***
+
+### MakeWorkerAgent
+
+Re-exports [MakeWorkerAgent](runtime.md#makeworkeragent)
diff --git a/docs/api/primitive-catalog.md b/docs/api/primitive-catalog.md
@@ -337,7 +337,7 @@ Import from `@tangle-network/agent-runtime/intelligence` — 60 exports.
 
 ### Recursive atom + loop kernel (alias of ./runtime)
 
-Import from `@tangle-network/agent-runtime/loops` — 381 exports.
+Import from `@tangle-network/agent-runtime/loops` — 383 exports.
 
 | Symbol | Kind | Summary |
 |---|---|---|
@@ -487,6 +487,7 @@ Import from `@tangle-network/agent-runtime/loops` — 381 exports.
 | `AgentTurnInput` | interface | _(no summary — add a TSDoc line at the declaration)_ |
 | `AgentTurnResult` | interface | _(no summary — add a TSDoc line at the declaration)_ |
 | `AnalystFinding` | interface | Unified envelope every analyst emits. Schema-versioned so renderers |
+| `AnalystRegistry` | interface | _(no summary — add a TSDoc line at the declaration)_ |
 | `AnytimeReport` | interface | _(no summary — add a TSDoc line at the declaration)_ |
 | `AnytimeStrategySummary` | interface | _(no summary — add a TSDoc line at the declaration)_ |
 | `AnytimeTaskCurve` | interface | anytimeReport — time-to-satisfactory-output metrics, derived entirely from the |
@@ -702,6 +703,7 @@ Import from `@tangle-network/agent-runtime/loops` — 381 exports.
 | `LoopShape` | type | A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it |
 | `LoopTraceEvent` | type | _(no summary — add a TSDoc line at the declaration)_ |
 | `LoopUntil` | type | `loopUntil(spec)` — build the iterative-deepening combinator. `seed` is the initial state. |
+| `MakeWorkerAgent` | type | _(no summary — add a TSDoc line at the declaration)_ |
 | `MountRecorder` | type | Records a mounted resource into the run's provenance manifest. Passed to |
 | `Outcome` | type | The terminal contract Drew wants: a loop returns a FINISHED deliverable, or the concrete |
 | `Panel` | type | `panel(spec)` — build the M-judge write-only-merge combinator. |