diff --git a/docs/results/autodata-live.md b/docs/results/autodata-live.md new file mode 100644 index 0000000..3746b36 --- /dev/null +++ b/docs/results/autodata-live.md @@ -0,0 +1,58 @@ +# Autodata live result: a false null, autopsied, then a real (clean) null + +Running the agentic data-creation loop (`src/autodata/`) on a real arXiv doc with real two-tier +solver models, to manufacture training examples that separate a strong solver from a weak one +(the discriminative reward). The headline is a null — but the path to it is the result. + +## What happened, in order + +1. **First runs looked like a null with a *negative* gap.** Across two tier pairs — + `glm-4.5-air` vs `glm-5.2`, then `groq/llama-3.1-8b-instant` vs `gemini-2.5-pro` — every run + reported 0 accepted and a strong−weak gap *below zero* (plain −0.47, then −1.00). A frontier + model scoring *below* an 8B on reasoning questions is not credible. + +2. **Autopsy (a direct probe on the real judge) found an artifact, not a finding.** At the solver's + `maxTokens: 1024`, the strong **reasoning** model (`gemini-2.5-pro`, and `glm-5.2` before it) + spent its whole budget on hidden reasoning and returned **empty visible content** on hard + prompts — which the judge scored 0. So "strong" was being scored 0 for *answering nothing*, + manufacturing a false negative gap. The trivial cost-gate smoke test ("reply ok") didn't trigger it, + so it slipped through. (Confirmed: the same prompt at `maxTokens: 8000` → gemini answers in + 956 chars and scores 1.00.) + +3. **Fix (this PR).** The solver now uses a reasoning-safe `maxTokens` (8000) **and fails loud on + empty content** — an empty answer is a measurement failure, never a silent 0 that corrupts the + gap (the repo's no-silent-zeros rule). The model tier is now an env knob + (`AUTODATA_WEAK_MODEL` / `AUTODATA_STRONG_MODEL` / `AUTODATA_CHALLENGER_MODEL` / `AUTODATA_JUDGE_MODEL`), and + the price table covers the wide tier. + +4. 
**The clean result.** Re-run with the fix, `llama-3.1-8b` vs `gemini-2.5-pro`: + + | metric | value | + |---|---| + | accepted (discriminating) examples | **0 / 3** | + | plain gap (n=1) | 0.000 | + | refined best-gap per slot (n=3) | 0.006 | + | Δ (refined − plain) | **+0.006 — no meaningful widening** | + | spend | $0.09 | + + The gap is now **~0, not negative** — `gemini-2.5-pro` and `llama-3.1-8b` score about **equally**. + +## The finding + +On these auto-generated, doc-grounded questions, a small model performs as well as a frontier one, +because **the answer is extractable from the provided context** — reading beats reasoning, so model +capability does not separate and no example clears the discriminative bar. This is *not* a +model-tier problem (we used a genuine 8B-vs-frontier gap); it is a **question-difficulty** problem. + +The lever is therefore the **challenger**, not the model tier: to open a real gap the challenger must +generate **non-extractive, reasoning-heavy** questions (multi-step derivations, numerical claims that +require following the paper's argument) — which is exactly the move the Autodata paper relies on +("the agent's initial attempt was usually a high-level summary question… subsequent rounds moved the +questions toward specific algorithmic steps the paper's actual argument required"). Our challenger, +on a single section, mostly produces extractable questions. Making it harder is the next experiment. + +## Status + +Mechanism: proven end-to-end on real frontier models, cost-tracked, fail-loud. Empirical +discrimination: a clean null on extractive questions. The harness is now trustworthy (no empty→0 +artifact); the open lever is challenger difficulty. 
diff --git a/src/autodata/router-roles.ts b/src/autodata/router-roles.ts index 6ba7413..39801ef 100644 --- a/src/autodata/router-roles.ts +++ b/src/autodata/router-roles.ts @@ -37,10 +37,13 @@ export const DEFAULT_BASE_URL = 'https://router.tangle.tools/v1' // GLM family IS served, so the real tier here is the smallest GLM (`glm-4.5-air`) as the weak solver // vs the latest (`glm-5.2`) as the strong solver. Same family, a real generational/size gap; swap // these constants back to the Qwen ids once the router provisions that upstream. -export const WEAK_SOLVER_MODEL = 'glm-4.5-air' -export const STRONG_SOLVER_MODEL = 'glm-5.2' -export const CHALLENGER_MODEL = 'glm-5.2' -export const JUDGE_MODEL = 'glm-5.2' +// The solver tier is the experiment's load-bearing knob — a real strong>weak capability gap is +// required for any example to clear the discriminative bar. Overridable by env so the tier can be +// swept without a code change (e.g. AUTODATA_STRONG_MODEL=gemini-2.5-pro AUTODATA_WEAK_MODEL=groq/llama-3.1-8b-instant). +export const WEAK_SOLVER_MODEL = process.env.AUTODATA_WEAK_MODEL ?? 'glm-4.5-air' +export const STRONG_SOLVER_MODEL = process.env.AUTODATA_STRONG_MODEL ?? 'glm-5.2' +export const CHALLENGER_MODEL = process.env.AUTODATA_CHALLENGER_MODEL ?? 'glm-5.2' +export const JUDGE_MODEL = process.env.AUTODATA_JUDGE_MODEL ?? 'glm-5.2' interface ModelPrice { /** USD per 1M input tokens. */ @@ -58,6 +61,10 @@ interface ModelPrice { const PRICE_TABLE: Record<string, ModelPrice> = { 'glm-4.5-air': { inputPerM: 0.2, outputPerM: 0.6 }, 'glm-5.2': { inputPerM: 0.95, outputPerM: 3.0 }, + // Wide-tier solver pair (a genuine small-vs-frontier capability gap). Approximate router rates. + 'groq/llama-3.1-8b-instant': { inputPerM: 0.05, outputPerM: 0.08 }, + 'gemini-2.5-pro': { inputPerM: 1.25, outputPerM: 10.0 }, + 'gemini-2.5-flash': { inputPerM: 0.3, outputPerM: 2.5 }, } /** Per-call usage record surfaced to an optional sink for cost-provenance reporting. 
*/ @@ -268,10 +275,22 @@ function solverClient(cfg: RouterRolesConfig, model: string): SandboxClient { baseUrl: cfg.baseUrl, model, messages: [{ role: 'user', content: prompt }], - maxTokens: 1024, + // Reasoning models (gemini-2.5-pro, glm-5.2, …) spend their budget on hidden reasoning and + // emit EMPTY visible content when it is too low — at 1024 a "strong" solver returned nothing + // and was scored 0, manufacturing a false negative strong−weak gap. Give every solver room + // for reasoning + a full answer. + maxTokens: 8000, signal: ctx.signal, onCall: cfg.onCall, }) + // Fail loud: an empty answer is a measurement failure, not a score of 0. Letting empty → 0 + // silently corrupts the strong/weak gap (the whole signal), so refuse to score it. + if (r.content.trim() === '') { + throw new Error( + `solver '${model}' returned empty visible content (likely all tokens spent on hidden ` + + `reasoning) — raise maxTokens or pick a non-reasoning solver; refusing to score it as 0`, + ) + } return [ { type: 'llm_call',