From 3bdf319bbf7a179c8c3e6c97ba692707976c7626 Mon Sep 17 00:00:00 2001 From: Danlin Yu Date: Fri, 8 May 2026 21:44:49 -0400 Subject: [PATCH 1/2] Add posterior uncertainty and OpenClaw roadmap --- bayesian_agent/core/belief.py | 17 +++++++++ bayesian_agent/core/context.py | 1 + docs/openclaw-integration-roadmap.md | 54 ++++++++++++++++++++++++++++ schemas/skill_belief.schema.json | 2 ++ tests/test_core.py | 3 ++ tests/test_registry_context.py | 1 + 6 files changed, 78 insertions(+) create mode 100644 docs/openclaw-integration-roadmap.md diff --git a/bayesian_agent/core/belief.py b/bayesian_agent/core/belief.py index d5264b6..88df495 100644 --- a/bayesian_agent/core/belief.py +++ b/bayesian_agent/core/belief.py @@ -43,6 +43,21 @@ def success_probability(self) -> float: denom = self.alpha + self.beta return self.alpha / denom if denom else 0.0 + @property + def posterior_variance(self) -> float: + """Variance of the Beta posterior over Skill success probability.""" + + denom = self.alpha + self.beta + if denom <= 0: + return 0.0 + return (self.alpha * self.beta) / ((denom**2) * (denom + 1.0)) + + @property + def posterior_std(self) -> float: + """Standard deviation of the Beta posterior.""" + + return self.posterior_variance**0.5 + def update(self, event: TrajectoryEvidence) -> "SkillBelief": outcome = event.outcome.strip().lower() if outcome == "success": @@ -72,6 +87,8 @@ def to_dict(self) -> Dict[str, Any]: "alpha": self.alpha, "beta": self.beta, "posterior_success": self.success_probability, + "posterior_variance": self.posterior_variance, + "posterior_std": self.posterior_std, "contexts": self.contexts, "failure_modes": self.failure_modes, "evidence": self.evidence[-MAX_EVIDENCE:], diff --git a/bayesian_agent/core/context.py b/bayesian_agent/core/context.py index 70509b6..403979c 100644 --- a/bayesian_agent/core/context.py +++ b/bayesian_agent/core/context.py @@ -27,6 +27,7 @@ def render(self, task_context: str = "", limit: int = 5) -> str: lines.append( "- " f"{belief.skill_id}: posterior_success={belief.success_probability:.3f}, " + f"posterior_std={belief.posterior_std:.3f}, " f"alpha={belief.alpha:.1f}, beta={belief.beta:.1f}, " f"observations={belief.observations}, mean_tokens={belief.mean_tokens:.1f}, " f"rewrite={decision.action}, failures={failures}" diff --git a/docs/openclaw-integration-roadmap.md b/docs/openclaw-integration-roadmap.md new file mode 100644 index 0000000..30c5ed8 --- /dev/null +++ b/docs/openclaw-integration-roadmap.md @@ -0,0 +1,54 @@ +# OpenClaw Integration Roadmap + +This roadmap sketches how Bayesian-Agent can become useful for persistent assistant systems such as OpenClaw while staying framework-agnostic. + +## Why OpenClaw Is a Good Testbed + +OpenClaw-style assistants repeatedly execute durable workflows: coding repair, research review, browser tasks, grading support, inbox triage, and project maintenance. These workflows naturally produce trajectories, verifier signals, and reusable procedures. + +Bayesian-Agent can help by turning those repeated procedures into evidence-weighted Skills instead of relying on unfiltered memory or anecdotal prompt edits. + +## Target Use Cases + +1. **Workflow reliability** + - Track which SOPs actually succeed for recurring assistant tasks. + - Prefer high-posterior, low-cost procedures for similar future contexts. + +2. **Failure-mode learning** + - Count recurring errors such as missing tests, stale browser refs, premature grading, or unsafe external actions. + - Route clustered failures to `patch`, `split`, or `retire` decisions. + +3. **Incremental repair** + - When a batch task partially fails, rerun only the failed units with posterior-weighted context. + - Useful for grading batches, code-review batches, literature-screening batches, and benchmark tasks. + +4. **Cross-harness memory discipline** + - Keep the Bayesian registry separate from the execution harness. + - Let OpenClaw, CLI agents, and future harnesses share the same evidence format. + +## Minimal Adapter Shape + +An OpenClaw adapter should export verified run records into `TrajectoryEvidence` fields: + +- `task_id`: stable id for the workflow unit, such as `grading/assignment/student_id` or `repo/pr/check_id`. +- `skill_id`: SOP or skill used, such as `openclaw/grading/rubric_feedback`. +- `context`: coarse task family, such as `grading`, `coding`, `browser`, `research`. +- `outcome`: verifier result: `success`, `failure`, or `error`. +- `failure_mode`: short normalized failure label. +- token/runtime metadata where available. + +## First Contribution Thread + +The first practical improvement is uncertainty visibility. Posterior mean alone can over-rank sparse evidence. Rendering posterior standard deviation helps downstream harnesses distinguish: + +- high-confidence successful skills; +- promising but under-tested skills; +- unstable skills that need exploration or splitting. + +## Suggested Next PRs + +1. Add posterior uncertainty fields and render them in context. *(small, self-contained)* +2. Add configurable ranking strategies: exploit, explore, cost-aware, and context-match. +3. Add an `openclaw` example exporter that converts a small JSONL task log into `TrajectoryEvidence`. +4. Add richer repair-plan output with failure-mode clusters, not only failed task ids. +5. Add reproducible benchmark runner scripts for published artifacts. diff --git a/schemas/skill_belief.schema.json b/schemas/skill_belief.schema.json index 8886ad0..0365d75 100644 --- a/schemas/skill_belief.schema.json +++ b/schemas/skill_belief.schema.json @@ -9,6 +9,8 @@ "alpha": {"type": "number", "minimum": 0}, "beta": {"type": "number", "minimum": 0}, "posterior_success": {"type": "number", "minimum": 0, "maximum": 1}, + "posterior_variance": {"type": "number", "minimum": 0}, + "posterior_std": {"type": "number", "minimum": 0}, "contexts": {"type": "object", "additionalProperties": {"type": "integer", "minimum": 0}}, "failure_modes": {"type": "object", "additionalProperties": {"type": "integer", "minimum": 0}}, "evidence": {"type": "array", "items": {"$ref": "trajectory.schema.json"}}, diff --git a/tests/test_core.py b/tests/test_core.py index 6effa53..da29724 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -34,6 +34,9 @@ def test_skill_belief_updates_beta_posterior_and_cost(self): self.assertEqual(belief.failure_modes["blank_cell"], 1) self.assertEqual(belief.mean_tokens, 20.0) self.assertAlmostEqual(belief.success_probability, 0.5) + self.assertAlmostEqual(belief.posterior_variance, 0.05) + self.assertAlmostEqual(belief.posterior_std, 0.05**0.5) + self.assertIn("posterior_std", belief.to_dict()) def test_rewrite_decision_is_serializable(self): decision = RewriteDecision(action="patch", reason="failures cluster", confidence=0.75) diff --git a/tests/test_registry_context.py b/tests/test_registry_context.py index 185775a..7c50790 100644 --- a/tests/test_registry_context.py +++ b/tests/test_registry_context.py @@ -30,6 +30,7 @@ def test_context_builder_orders_by_posterior_and_cost(self): self.assertIn("Bayesian Skill Context", context) self.assertLess(context.find("skill/a"), context.find("skill/b")) self.assertIn("posterior_success", context) + self.assertIn("posterior_std", context) def test_rewrite_policy_selects_actions(self): registry = BayesianSkillRegistry.in_memory() From b1e5dfbd421c45f17b299f9f19aff6572d67dc61 Mon Sep 17 00:00:00 2001 From: Danlin Yu Date: Fri, 8 May 2026 21:52:11 -0400 Subject: [PATCH 2/2] Add workflow learning utilities for agent harnesses --- bayesian_agent/__init__.py | 7 +++ bayesian_agent/adapters/__init__.py | 3 +- bayesian_agent/adapters/workflow_log.py | 65 ++++++++++++++++++++ bayesian_agent/cli.py | 32 +++++++++- bayesian_agent/core/context.py | 5 +- bayesian_agent/core/ranking.py | 75 ++++++++++++++++++++++++ bayesian_agent/core/registry.py | 7 ++- bayesian_agent/core/repair.py | 36 +++++++++++- bayesian_agent/core/standards.py | 56 ++++++++++++++++++ docs/openclaw-integration-roadmap.md | 18 ++++-- examples/openclaw_workflow_log/README.md | 31 ++++++++++ tests/test_registry_context.py | 10 ++++ tests/test_repair_cli.py | 16 ++++- tests/test_workflow_log_and_standards.py | 65 ++++++++++++++++++++ 14 files changed, 412 insertions(+), 14 deletions(-) create mode 100644 bayesian_agent/adapters/workflow_log.py create mode 100644 bayesian_agent/core/ranking.py create mode 100644 bayesian_agent/core/standards.py create mode 100644 examples/openclaw_workflow_log/README.md create mode 100644 tests/test_workflow_log_and_standards.py diff --git a/bayesian_agent/__init__.py b/bayesian_agent/__init__.py index 4441a20..8491f72 100644 --- a/bayesian_agent/__init__.py +++ b/bayesian_agent/__init__.py @@ -4,13 +4,20 @@ from bayesian_agent.core.context import SkillContextBuilder from bayesian_agent.core.evidence import TrajectoryEvidence from bayesian_agent.core.policy import RewritePolicy +from bayesian_agent.core.ranking import RankingStrategy, get_strategy from bayesian_agent.core.registry import BayesianSkillRegistry +from bayesian_agent.core.standards import DEFAULT_AGENTIC_STANDARDS, WorkflowStandard, evaluate_standards __all__ = [ "BayesianSkillRegistry", "RewriteDecision", + "DEFAULT_AGENTIC_STANDARDS", + "RankingStrategy", "RewritePolicy", "SkillBelief", "SkillContextBuilder", "TrajectoryEvidence", + "WorkflowStandard", + "evaluate_standards", + "get_strategy", ] diff --git a/bayesian_agent/adapters/__init__.py b/bayesian_agent/adapters/__init__.py index 022c5e0..6e69092 100644 --- a/bayesian_agent/adapters/__init__.py +++ b/bayesian_agent/adapters/__init__.py @@ -2,5 +2,6 @@ from bayesian_agent.adapters.base import AgentAdapter from bayesian_agent.adapters.generic_agent import GenericAgentAdapter +from bayesian_agent.adapters.workflow_log import evidence_from_jsonl, workflow_record_to_evidence -__all__ = ["AgentAdapter", "GenericAgentAdapter"] +__all__ = ["AgentAdapter", "GenericAgentAdapter", "evidence_from_jsonl", "workflow_record_to_evidence"] diff --git a/bayesian_agent/adapters/workflow_log.py b/bayesian_agent/adapters/workflow_log.py new file mode 100644 index 0000000..825bb96 --- /dev/null +++ b/bayesian_agent/adapters/workflow_log.py @@ -0,0 +1,65 @@ +"""Utilities for converting generic assistant workflow logs into trajectory evidence.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Iterable, Iterator, Mapping, Optional + +from bayesian_agent.core.evidence import TrajectoryEvidence + + +SUCCESS_VALUES = {"success", "succeeded", "ok", "passed", "complete", "completed", True} + + +def workflow_record_to_evidence( + record: Mapping[str, Any], + *, + default_skill_id: str = "workflow/default", + default_context: str = "workflow", +) -> TrajectoryEvidence: + """Convert an OpenClaw/Hermes-like workflow record into `TrajectoryEvidence`. + + The function intentionally accepts several common field names so external + harnesses can integrate without adopting Bayesian-Agent internals first. + """ + + task_id = str(record.get("task_id") or record.get("id") or record.get("run_id") or "") + skill_id = str(record.get("skill_id") or record.get("sop_id") or record.get("workflow_id") or default_skill_id) + context = str(record.get("context") or record.get("task_family") or record.get("workflow") or default_context) + raw_outcome = record.get("outcome", record.get("status", record.get("success"))) + outcome = "success" if raw_outcome in SUCCESS_VALUES else "failure" + return TrajectoryEvidence( + task_id=task_id, + skill_id=skill_id, + context=context, + outcome=outcome, + input_tokens=int(record.get("input_tokens") or record.get("prompt_tokens") or 0), + output_tokens=int(record.get("output_tokens") or record.get("completion_tokens") or 0), + total_tokens=int(record.get("total_tokens") or 0), + turns=int(record.get("turns") or record.get("steps") or 0), + elapsed_seconds=float(record.get("elapsed_seconds") or record.get("duration_seconds") or 0.0), + failure_mode=str(record.get("failure_mode") or record.get("error_type") or record.get("error") or ""), + summary=str(record.get("summary") or record.get("title") or task_id), + metadata={k: v for k, v in record.items() if k not in {"transcript", "messages"}}, + ) + + +def iter_jsonl(path: str | Path) -> Iterator[Mapping[str, Any]]: + """Yield JSON objects from a JSONL file, skipping blank lines.""" + + for line in Path(path).read_text(encoding="utf-8").splitlines(): + if line.strip(): + yield json.loads(line) + + +def evidence_from_jsonl( + path: str | Path, + *, + default_skill_id: str = "workflow/default", + default_context: str = "workflow", +) -> Iterable[TrajectoryEvidence]: + """Read assistant workflow records from JSONL and yield trajectory evidence.""" + + for record in iter_jsonl(path): + yield workflow_record_to_evidence(record, default_skill_id=default_skill_id, default_context=default_context) diff --git a/bayesian_agent/cli.py b/bayesian_agent/cli.py index 0879b92..3aeeaad 100644 --- a/bayesian_agent/cli.py +++ b/bayesian_agent/cli.py @@ -10,7 +10,8 @@ from bayesian_agent.core.context import SkillContextBuilder from bayesian_agent.core.evidence import TrajectoryEvidence from bayesian_agent.core.registry import BayesianSkillRegistry -from bayesian_agent.core.repair import failed_task_ids, normalize_results, summarize, summarize_incremental_lift +from bayesian_agent.core.repair import failed_task_ids, normalize_results, repair_report, summarize, summarize_incremental_lift +from bayesian_agent.adapters.workflow_log import evidence_from_jsonl def _read_json(path: str) -> Mapping[str, Any]: @@ -44,6 +45,14 @@ def build_parser() -> argparse.ArgumentParser: evolve.add_argument("--registry", required=True, help="Output registry JSON path.") evolve.add_argument("--context-out", default="", help="Optional rendered Skill context path.") + evolve_log = sub.add_parser("evolve-workflow-log", help="Update a registry from generic assistant workflow JSONL records.") + evolve_log.add_argument("--jsonl", action="append", required=True, help="Path to a workflow JSONL file.") + evolve_log.add_argument("--registry", required=True, help="Output registry JSON path.") + evolve_log.add_argument("--context-out", default="", help="Optional rendered Skill context path.") + evolve_log.add_argument("--default-skill-id", default="workflow/default") + evolve_log.add_argument("--default-context", default="workflow") + evolve_log.add_argument("--strategy", default="exploit", help="Context ranking strategy.") + summarize_cmd = sub.add_parser("summarize", help="Summarize a results JSON file.") summarize_cmd.add_argument("--results", required=True) summarize_cmd.add_argument("--out", required=True) @@ -52,6 +61,10 @@ def build_parser() -> argparse.ArgumentParser: repair.add_argument("--baseline", required=True) repair.add_argument("--out", required=True) + repair_report_cmd = sub.add_parser("repair-report", help="Summarize failed task ids and failure-mode clusters.") + repair_report_cmd.add_argument("--baseline", required=True) + repair_report_cmd.add_argument("--out", required=True) + lift = sub.add_parser("incremental-summary", help="Summarize baseline plus repair traces.") lift.add_argument("--baseline", required=True) lift.add_argument("--repairs", required=True) @@ -70,6 +83,20 @@ def main(argv: Sequence[str] = None) -> int: if args.context_out: Path(args.context_out).write_text(SkillContextBuilder(registry).render(), encoding="utf-8") return 0 + if args.command == "evolve-workflow-log": + registry = BayesianSkillRegistry(args.registry) + for jsonl_path in args.jsonl: + registry.record_many( + evidence_from_jsonl( + jsonl_path, + default_skill_id=args.default_skill_id, + default_context=args.default_context, + ) + ) + registry.save() + if args.context_out: + Path(args.context_out).write_text(SkillContextBuilder(registry).render(strategy=args.strategy), encoding="utf-8") + return 0 if args.command == "summarize": _write_json(args.out, summarize(normalize_results(_read_json(args.results)))) return 0 @@ -77,6 +104,9 @@ def main(argv: Sequence[str] = None) -> int: failures = {k: sorted(v) for k, v in failed_task_ids(normalize_results(_read_json(args.baseline))).items()} _write_json(args.out, failures) return 0 + if args.command == "repair-report": + _write_json(args.out, repair_report(normalize_results(_read_json(args.baseline)))) + return 0 if args.command == "incremental-summary": baseline = normalize_results(_read_json(args.baseline)) repairs = normalize_results(_read_json(args.repairs)) diff --git a/bayesian_agent/core/context.py b/bayesian_agent/core/context.py index 403979c..44b6007 100644 --- a/bayesian_agent/core/context.py +++ b/bayesian_agent/core/context.py @@ -13,13 +13,14 @@ def __init__(self, registry: BayesianSkillRegistry, policy: RewritePolicy = None self.registry = registry self.policy = policy or RewritePolicy() - def render(self, task_context: str = "", limit: int = 5) -> str: - beliefs = self.registry.top(limit=limit, context=task_context) + def render(self, task_context: str = "", limit: int = 5, strategy: str = "exploit") -> str: + beliefs = self.registry.top(limit=limit, context=task_context, strategy=strategy) if not beliefs: return "" lines = [ "### Bayesian Skill Context", "Use these posterior-weighted Skills/SOPs as hypotheses, not as unquestioned instructions.", + f"Ranking strategy: {strategy}.", ] for belief in beliefs: decision = self.policy.decide(belief) diff --git a/bayesian_agent/core/ranking.py b/bayesian_agent/core/ranking.py new file mode 100644 index 0000000..d9615f8 --- /dev/null +++ b/bayesian_agent/core/ranking.py @@ -0,0 +1,75 @@ +"""Skill ranking strategies for posterior-weighted context selection.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable, Dict + +from bayesian_agent.core.belief import SkillBelief + + +@dataclass(frozen=True) +class RankingStrategy: + """A named strategy for ranking Skill beliefs.""" + + name: str + description: str + scorer: Callable[[SkillBelief, str], float] + + def score(self, belief: SkillBelief, context: str = "") -> float: + return float(self.scorer(belief, context)) + + +def _context_bonus(belief: SkillBelief, context: str) -> float: + if not context: + return 0.0 + if context in belief.contexts: + return 1.0 + # Lightweight partial match for hierarchical contexts such as "openclaw/grading". + return 0.25 if any(context in known or known in context for known in belief.contexts) else 0.0 + + +def _safe_mean_tokens(belief: SkillBelief) -> float: + return max(float(belief.mean_tokens or 0.0), 1.0) + + +def exploit_score(belief: SkillBelief, context: str = "") -> float: + """Prefer proven, context-matching, low-uncertainty Skills.""" + + return belief.success_probability + (0.15 * _context_bonus(belief, context)) - (0.25 * belief.posterior_std) + + +def explore_score(belief: SkillBelief, context: str = "") -> float: + """Prefer uncertain Skills with some contextual relevance.""" + + return belief.posterior_std + (0.10 * _context_bonus(belief, context)) + min(belief.observations, 3) * 0.01 + + +def cost_aware_score(belief: SkillBelief, context: str = "") -> float: + """Prefer success per token, while retaining a small context bonus.""" + + return (belief.success_probability / _safe_mean_tokens(belief)) * 1000.0 + (0.10 * _context_bonus(belief, context)) + + +def context_aware_score(belief: SkillBelief, context: str = "") -> float: + """Prefer Skills proven in the same or nearby task context.""" + + return belief.success_probability + (0.35 * _context_bonus(belief, context)) - (0.10 * belief.posterior_std) + + +STRATEGIES: Dict[str, RankingStrategy] = { + "exploit": RankingStrategy("exploit", "Prefer proven, low-uncertainty Skills.", exploit_score), + "explore": RankingStrategy("explore", "Prefer Skills that need more evidence.", explore_score), + "cost_aware": RankingStrategy("cost_aware", "Prefer high-success, low-token Skills.", cost_aware_score), + "context_aware": RankingStrategy("context_aware", "Prefer Skills proven in similar contexts.", context_aware_score), +} + + +def get_strategy(name: str = "exploit") -> RankingStrategy: + """Return a ranking strategy by name.""" + + normalized = (name or "exploit").strip().lower().replace("-", "_") + if normalized not in STRATEGIES: + available = ", ".join(sorted(STRATEGIES)) + raise ValueError(f"Unknown ranking strategy '{name}'. Available: {available}") + return STRATEGIES[normalized] diff --git a/bayesian_agent/core/registry.py b/bayesian_agent/core/registry.py index 047881f..e36a5a5 100644 --- a/bayesian_agent/core/registry.py +++ b/bayesian_agent/core/registry.py @@ -8,6 +8,7 @@ from bayesian_agent.core.belief import SkillBelief from bayesian_agent.core.evidence import TrajectoryEvidence, utc_now +from bayesian_agent.core.ranking import get_strategy class BayesianSkillRegistry: @@ -60,11 +61,11 @@ def record_many(self, events: Iterable[TrajectoryEvidence]) -> List[SkillBelief] def beliefs(self) -> List[SkillBelief]: return [SkillBelief.from_dict(skill_id, raw) for skill_id, raw in self.data.get("skills", {}).items()] - def top(self, limit: int = 5, context: str = "") -> List[SkillBelief]: + def top(self, limit: int = 5, context: str = "", strategy: str = "exploit") -> List[SkillBelief]: beliefs = self.beliefs() + ranking = get_strategy(strategy) def score(belief: SkillBelief): - context_bonus = 1 if context and context in belief.contexts else 0 - return (context_bonus, belief.success_probability, belief.observations, -belief.mean_tokens) + return (ranking.score(belief, context), belief.observations, -belief.mean_tokens, belief.skill_id) return sorted(beliefs, key=score, reverse=True)[:limit] diff --git a/bayesian_agent/core/repair.py b/bayesian_agent/core/repair.py index 7a5e32d..eb60503 100644 --- a/bayesian_agent/core/repair.py +++ b/bayesian_agent/core/repair.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Set +from typing import Any, Dict, Iterable, List, Mapping, Set BenchmarkResults = Dict[str, List[Dict[str, Any]]] @@ -23,6 +23,40 @@ def failed_task_ids(results: Mapping[str, Iterable[Mapping[str, Any]]]) -> Dict[ return failed +def failure_mode_clusters(results: Mapping[str, Iterable[Mapping[str, Any]]]) -> Dict[str, Dict[str, List[str]]]: + """Group failed task ids by normalized failure mode for targeted repair.""" + + clusters: Dict[str, Dict[str, List[str]]] = {} + for benchmark, runs in results.items(): + for run in runs: + task_id = run.get("task_id") + if not task_id or run.get("success"): + continue + mode = str(run.get("failure_mode") or run.get("error") or "unknown_failure") + clusters.setdefault(str(benchmark), {}).setdefault(mode, []).append(str(task_id)) + return clusters + + +def repair_report(results: Mapping[str, Iterable[Mapping[str, Any]]]) -> Dict[str, Dict[str, Any]]: + """Create a repair-oriented summary with failed ids and failure clusters.""" + + normalized = normalize_results(results) + failed = failed_task_ids(normalized) + clusters = failure_mode_clusters(normalized) + report: Dict[str, Dict[str, Any]] = {} + for benchmark, runs in normalized.items(): + runs = list(runs) + failed_ids = sorted(failed.get(benchmark, set())) + report[benchmark] = { + "tasks": len(runs), + "failed_tasks": failed_ids, + "failure_count": len(failed_ids), + "failure_modes": {mode: sorted(ids) for mode, ids in sorted(clusters.get(benchmark, {}).items())}, + "recommended_action": "clustered_repair" if clusters.get(benchmark) else "none", + } + return report + + def dedupe_by_task_id(runs: Iterable[Mapping[str, Any]]) -> List[Dict[str, Any]]: order: List[str] = [] by_id: Dict[str, Dict[str, Any]] = {} diff --git a/bayesian_agent/core/standards.py b/bayesian_agent/core/standards.py new file mode 100644 index 0000000..ec3eb54 --- /dev/null +++ b/bayesian_agent/core/standards.py @@ -0,0 +1,56 @@ +"""Reusable working-standard checks for agentic workflows.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, Iterable, List, Mapping + + +@dataclass(frozen=True) +class WorkflowStandard: + """A lightweight checklist that can be attached to a Skill or workflow context.""" + + standard_id: str + description: str + required_signals: List[str] = field(default_factory=list) + forbidden_failure_modes: List[str] = field(default_factory=list) + + def evaluate(self, run: Mapping[str, Any]) -> Dict[str, Any]: + signals = set(str(signal) for signal in run.get("signals", []) or []) + failure_mode = str(run.get("failure_mode") or run.get("error") or "") + missing = [signal for signal in self.required_signals if signal not in signals] + forbidden = failure_mode in set(self.forbidden_failure_modes) + passed = not missing and not forbidden + return { + "standard_id": self.standard_id, + "passed": passed, + "missing_signals": missing, + "forbidden_failure_mode": failure_mode if forbidden else "", + "description": self.description, + } + + +def evaluate_standards(run: Mapping[str, Any], standards: Iterable[WorkflowStandard]) -> List[Dict[str, Any]]: + """Evaluate a trajectory-like run against multiple workflow standards.""" + + return [standard.evaluate(run) for standard in standards] + + +DEFAULT_AGENTIC_STANDARDS: List[WorkflowStandard] = [ + WorkflowStandard( + standard_id="verify_before_done", + description="The agent should verify the result before declaring the task complete.", + required_signals=["verified"], + forbidden_failure_modes=["premature_done", "unverified_completion"], + ), + WorkflowStandard( + standard_id="respect_external_action_boundary", + description="External sends, purchases, grading posts, and destructive actions require explicit approval.", + forbidden_failure_modes=["unauthorized_external_action", "unsafe_destructive_action"], + ), + WorkflowStandard( + standard_id="record_failure_mode", + description="Failed runs should carry a normalized failure mode for repair learning.", + required_signals=["failure_mode_recorded"], + ), +] diff --git a/docs/openclaw-integration-roadmap.md b/docs/openclaw-integration-roadmap.md index 30c5ed8..45070cc 100644 --- a/docs/openclaw-integration-roadmap.md +++ b/docs/openclaw-integration-roadmap.md @@ -45,10 +45,18 @@ The first practical improvement is uncertainty visibility. Posterior mean alone - promising but under-tested skills; - unstable skills that need exploration or splitting. +## Implemented in This Branch + +- Posterior uncertainty fields: `posterior_variance` and `posterior_std`. +- Configurable ranking strategies: `exploit`, `explore`, `cost_aware`, and `context_aware`. +- Workflow JSONL ingestion via `evolve-workflow-log`. +- Failure-mode repair reports via `repair-report`. +- Lightweight workflow standards for agentic quality gates. + ## Suggested Next PRs -1. Add posterior uncertainty fields and render them in context. *(small, self-contained)* -2. Add configurable ranking strategies: exploit, explore, cost-aware, and context-match. -3. Add an `openclaw` example exporter that converts a small JSONL task log into `TrajectoryEvidence`. -4. Add richer repair-plan output with failure-mode clusters, not only failed task ids. -5. Add reproducible benchmark runner scripts for published artifacts. +1. Add reproducible benchmark runner scripts for published artifacts. +2. Add a first-party continuous-evaluation loop that periodically ingests workflow logs and writes ranked Skill context. +3. Add richer Bayesian policies, such as Thompson sampling or lower-confidence-bound ranking. +4. Add a concrete OpenClaw exporter once the stable log location/schema is known. +5. Add adapters for one additional non-GenericAgent harness to demonstrate real cross-harness transfer. diff --git a/examples/openclaw_workflow_log/README.md b/examples/openclaw_workflow_log/README.md new file mode 100644 index 0000000..44283fe --- /dev/null +++ b/examples/openclaw_workflow_log/README.md @@ -0,0 +1,31 @@ +# OpenClaw / Hermes Workflow Log Example + +This example shows how a persistent assistant harness can feed workflow outcomes into Bayesian-Agent without adopting a new runtime. + +## JSONL Input + +Each line is one workflow run. Field names are intentionally permissive. + +```jsonl +{"id":"grade-001","workflow":"grading","sop_id":"openclaw/grading/rubric_feedback","success":true,"total_tokens":2400,"signals":["verified"]} +{"id":"grade-002","workflow":"grading","sop_id":"openclaw/grading/rubric_feedback","success":false,"failure_mode":"rubric_mismatch","total_tokens":2100,"signals":["failure_mode_recorded"]} +``` + +## Evolve a Registry + +```bash +bayesian-agent evolve-workflow-log \ + --jsonl runs.jsonl \ + --registry temp/openclaw_beliefs.json \ + --context-out temp/openclaw_context.md \ + --strategy context_aware +``` + +## Why This Matters + +OpenClaw and Hermes-style agents repeatedly execute workflows such as grading, coding repair, browser tasks, project reviews, and inbox triage. Bayesian-Agent can turn those runs into evidence-weighted Skills: + +- successful workflows become reusable context; +- recurring failure modes become repair targets; +- uncertain workflows can be explored instead of trusted blindly; +- low-token/high-success workflows can be preferred when cost matters. diff --git a/tests/test_registry_context.py b/tests/test_registry_context.py index 7c50790..413a0b0 100644 --- a/tests/test_registry_context.py +++ b/tests/test_registry_context.py @@ -5,6 +5,7 @@ from bayesian_agent import BayesianSkillRegistry, TrajectoryEvidence from bayesian_agent.core.context import SkillContextBuilder from bayesian_agent.core.policy import RewritePolicy +from bayesian_agent.core.ranking import get_strategy class RegistryContextTests(unittest.TestCase): @@ -32,6 +33,15 @@ def test_context_builder_orders_by_posterior_and_cost(self): self.assertIn("posterior_success", context) self.assertIn("posterior_std", context) + def test_ranking_strategies_can_change_selection_pressure(self): + registry = BayesianSkillRegistry.in_memory() + registry.record(TrajectoryEvidence(task_id="a1", skill_id="skill/proven", context="ctx", outcome="success", total_tokens=1000)) + registry.record(TrajectoryEvidence(task_id="a2", skill_id="skill/proven", context="ctx", outcome="success", total_tokens=1000)) + registry.record(TrajectoryEvidence(task_id="b1", skill_id="skill/cheap", context="ctx", outcome="success", total_tokens=1)) + + self.assertEqual(get_strategy("cost-aware").name, "cost_aware") + self.assertEqual(registry.top(context="ctx", strategy="cost_aware")[0].skill_id, "skill/cheap") + def test_rewrite_policy_selects_actions(self): registry = BayesianSkillRegistry.in_memory() for i in range(4): diff --git a/tests/test_repair_cli.py b/tests/test_repair_cli.py index 4b32dc8..25acd87 100644 --- a/tests/test_repair_cli.py +++ b/tests/test_repair_cli.py @@ -3,7 +3,7 @@ import unittest from pathlib import Path -from bayesian_agent.core.repair import failed_task_ids, merge_repairs, summarize_incremental_lift +from bayesian_agent.core.repair import failed_task_ids, merge_repairs, repair_report, summarize_incremental_lift from bayesian_agent.cli import main @@ -21,10 +21,24 @@ def test_failed_task_ids_and_merge_repairs(self): ] } + baseline["sop_bench"][1]["failure_mode"] = "wrong_cell" + self.assertEqual(failed_task_ids(baseline), {"sop_bench": {"sop_02"}}) + self.assertEqual(repair_report(baseline)["sop_bench"]["failure_modes"], {"wrong_cell": ["sop_02"]}) self.assertTrue(all(run["success"] for run in merge_repairs(baseline, repairs)["sop_bench"])) self.assertEqual(summarize_incremental_lift(baseline, repairs)["sop_bench"]["accuracy"], 1.0) + def test_cli_repair_report_writes_clusters(self): + with tempfile.TemporaryDirectory() as td: + src = Path(td) / "results.json" + out = Path(td) / "repair_report.json" + src.write_text(json.dumps({"results": {"bench": [{"task_id": "a", "success": False, "failure_mode": "missing_verify"}]}}), encoding="utf-8") + + code = main(["repair-report", "--baseline", str(src), "--out", str(out)]) + + self.assertEqual(code, 0) + self.assertEqual(json.loads(out.read_text())["bench"]["failure_modes"], {"missing_verify": ["a"]}) + def test_cli_summarize_writes_json(self): with tempfile.TemporaryDirectory() as td: src = Path(td) / "results.json" diff --git a/tests/test_workflow_log_and_standards.py b/tests/test_workflow_log_and_standards.py new file mode 100644 index 0000000..ea75c72 --- /dev/null +++ b/tests/test_workflow_log_and_standards.py @@ -0,0 +1,65 @@ +import json +import tempfile +import unittest +from pathlib import Path + +from bayesian_agent.adapters.workflow_log import evidence_from_jsonl, workflow_record_to_evidence +from bayesian_agent.core.standards import DEFAULT_AGENTIC_STANDARDS, WorkflowStandard, evaluate_standards +from bayesian_agent.cli import main + + +class WorkflowLogAndStandardsTests(unittest.TestCase): + def test_workflow_record_to_evidence_accepts_agent_log_aliases(self): + event = workflow_record_to_evidence( + { + "run_id": "grade-1", + "workflow_id": "openclaw/grading/rubric_feedback", + "task_family": "grading", + "status": "completed", + "prompt_tokens": 100, + "completion_tokens": 25, + "steps": 4, + "duration_seconds": 3.5, + "summary": "drafted rubric feedback", + } + ) + + self.assertEqual(event.task_id, "grade-1") + self.assertEqual(event.skill_id, "openclaw/grading/rubric_feedback") + self.assertEqual(event.context, "grading") + self.assertTrue(event.success) + self.assertEqual(event.total_tokens, 125) + self.assertEqual(event.turns, 4) + + def test_evolve_workflow_log_cli_updates_registry(self): + with tempfile.TemporaryDirectory() as td: + src = Path(td) / "runs.jsonl" + registry = Path(td) / "beliefs.json" + context = Path(td) / "context.md" + records = [ + {"id": "a", "workflow": "coding", "sop_id": "openclaw/coding/test_first", "success": True, "total_tokens": 20}, + {"id": "b", "workflow": "coding", "sop_id": "openclaw/coding/test_first", "success": False, "failure_mode": "missing_test", "total_tokens": 30}, + ] + src.write_text("\n".join(json.dumps(record) for record in records), encoding="utf-8") + + code = main(["evolve-workflow-log", "--jsonl", str(src), "--registry", str(registry), "--context-out", str(context)]) + + self.assertEqual(code, 0) + raw = json.loads(registry.read_text()) + belief = raw["skills"]["openclaw/coding/test_first"] + self.assertEqual(belief["observations"], 2) + self.assertIn("posterior_std", context.read_text()) + + def test_workflow_standard_evaluation(self): + standard = WorkflowStandard("must_verify", "verification required", required_signals=["verified"]) + + failed = standard.evaluate({"signals": []}) + passed = standard.evaluate({"signals": ["verified"]}) + + self.assertFalse(failed["passed"]) + self.assertTrue(passed["passed"]) + self.assertTrue(evaluate_standards({"signals": ["verified", "failure_mode_recorded"]}, DEFAULT_AGENTIC_STANDARDS)) + + +if __name__ == "__main__": + unittest.main()