DataArcTech · danlinyu · May 9, 2026 · May 9, 2026
diff --git a/bayesian_agent/__init__.py b/bayesian_agent/__init__.py
@@ -4,13 +4,20 @@
 from bayesian_agent.core.context import SkillContextBuilder
 from bayesian_agent.core.evidence import TrajectoryEvidence
 from bayesian_agent.core.policy import RewritePolicy
+from bayesian_agent.core.ranking import RankingStrategy, get_strategy
 from bayesian_agent.core.registry import BayesianSkillRegistry
+from bayesian_agent.core.standards import DEFAULT_AGENTIC_STANDARDS, WorkflowStandard, evaluate_standards
 
 __all__ = [
     "BayesianSkillRegistry",
     "RewriteDecision",
+    "DEFAULT_AGENTIC_STANDARDS",
+    "RankingStrategy",
     "RewritePolicy",
     "SkillBelief",
     "SkillContextBuilder",
     "TrajectoryEvidence",
+    "WorkflowStandard",
+    "evaluate_standards",
+    "get_strategy",
 ]
diff --git a/bayesian_agent/adapters/__init__.py b/bayesian_agent/adapters/__init__.py
@@ -2,5 +2,6 @@
 
 from bayesian_agent.adapters.base import AgentAdapter
 from bayesian_agent.adapters.generic_agent import GenericAgentAdapter
+from bayesian_agent.adapters.workflow_log import evidence_from_jsonl, workflow_record_to_evidence
 
-__all__ = ["AgentAdapter", "GenericAgentAdapter"]
+__all__ = ["AgentAdapter", "GenericAgentAdapter", "evidence_from_jsonl", "workflow_record_to_evidence"]
diff --git a/bayesian_agent/adapters/workflow_log.py b/bayesian_agent/adapters/workflow_log.py
@@ -0,0 +1,65 @@
+"""Utilities for converting generic assistant workflow logs into trajectory evidence."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Iterable, Iterator, Mapping, Optional
+
+from bayesian_agent.core.evidence import TrajectoryEvidence
+
+
+SUCCESS_VALUES = {"success", "succeeded", "ok", "passed", "complete", "completed", True}
+
+
+def workflow_record_to_evidence(
+    record: Mapping[str, Any],
+    *,
+    default_skill_id: str = "workflow/default",
+    default_context: str = "workflow",
+) -> TrajectoryEvidence:
+    """Convert an OpenClaw/Hermes-like workflow record into `TrajectoryEvidence`.
+
+    The function intentionally accepts several common field names so external
+    harnesses can integrate without adopting Bayesian-Agent internals first.
+
+    Args:
+        record: One workflow log entry. For every evidence field the first
+            truthy alias wins (for example `task_id`, then `id`, then `run_id`).
+        default_skill_id: Skill id used when the record has no truthy
+            `skill_id`, `sop_id` or `workflow_id`.
+        default_context: Context used when the record has no truthy
+            `context`, `task_family` or `workflow`.
+
+    Returns:
+        Evidence whose `outcome` is "success" when the record's outcome value
+        is one of `SUCCESS_VALUES` (strings compared case-insensitively with
+        surrounding whitespace ignored) and "failure" otherwise. The whole
+        record except `transcript` and `messages` is kept as metadata.
+
+    Raises:
+        ValueError: If a numeric field holds text that `int`/`float` cannot parse.
+    """
+
+    task_id = str(record.get("task_id") or record.get("id") or record.get("run_id") or "")
+    skill_id = str(record.get("skill_id") or record.get("sop_id") or record.get("workflow_id") or default_skill_id)
+    context = str(record.get("context") or record.get("task_family") or record.get("workflow") or default_context)
+
+    # Take the first outcome alias that is present and not null. The test is
+    # `is not None` rather than truthiness so an explicit `success: false`
+    # is still honoured, while `"outcome": null` no longer hides `status`.
+    raw_outcome = next(
+        (record[key] for key in ("outcome", "status", "success") if record.get(key) is not None),
+        None,
+    )
+    if isinstance(raw_outcome, str):
+        # Harnesses disagree on casing ("Success", "OK", "PASSED ").
+        raw_outcome = raw_outcome.strip().lower()
+    try:
+        succeeded = raw_outcome in SUCCESS_VALUES
+    except TypeError:
+        # Unhashable values (lists, dicts) cannot be success markers.
+        succeeded = False
+    outcome = "success" if succeeded else "failure"
+
+    return TrajectoryEvidence(
+        task_id=task_id,
+        skill_id=skill_id,
+        context=context,
+        outcome=outcome,
+        input_tokens=int(record.get("input_tokens") or record.get("prompt_tokens") or 0),
+        output_tokens=int(record.get("output_tokens") or record.get("completion_tokens") or 0),
+        total_tokens=int(record.get("total_tokens") or 0),
+        turns=int(record.get("turns") or record.get("steps") or 0),
+        elapsed_seconds=float(record.get("elapsed_seconds") or record.get("duration_seconds") or 0.0),
+        failure_mode=str(record.get("failure_mode") or record.get("error_type") or record.get("error") or ""),
+        summary=str(record.get("summary") or record.get("title") or task_id),
+        # Bulky conversation payloads are dropped; everything else is preserved.
+        metadata={k: v for k, v in record.items() if k not in {"transcript", "messages"}},
+    )
+
+
+def iter_jsonl(path: str | Path) -> Iterator[Mapping[str, Any]]:
+    """Yield JSON objects from a JSONL file, skipping blank lines.
+
+    The file is decoded as UTF-8 and streamed one line at a time, so memory
+    use does not grow with file size.
+
+    Args:
+        path: Location of the JSONL file.
+
+    Yields:
+        The decoded JSON value of each non-blank line, in file order.
+
+    Raises:
+        OSError: If the file cannot be opened (raised on first iteration).
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+
+    # Iterating the handle splits on real newlines only. `str.splitlines()`
+    # additionally splits on U+2028/U+2029 and other separators that JSON
+    # permits unescaped inside strings, which would cut a valid record in half.
+    with Path(path).open(encoding="utf-8") as handle:
+        for line in handle:
+            if line.strip():
+                yield json.loads(line)
+
+
+def evidence_from_jsonl(
+    path: str | Path,
+    *,
+    default_skill_id: str = "workflow/default",
+    default_context: str = "workflow",
+) -> Iterable[TrajectoryEvidence]:
+    """Lazily turn each record of a workflow JSONL file into trajectory evidence.
+
+    Args:
+        path: JSONL file to read via `iter_jsonl`.
+        default_skill_id: Forwarded to `workflow_record_to_evidence`.
+        default_context: Forwarded to `workflow_record_to_evidence`.
+
+    Yields:
+        One `TrajectoryEvidence` per non-blank line, in file order.
+    """
+
+    fallbacks = {"default_skill_id": default_skill_id, "default_context": default_context}
+    yield from (workflow_record_to_evidence(entry, **fallbacks) for entry in iter_jsonl(path))
diff --git a/bayesian_agent/cli.py b/bayesian_agent/cli.py
@@ -10,7 +10,8 @@
 from bayesian_agent.core.context import SkillContextBuilder
 from bayesian_agent.core.evidence import TrajectoryEvidence
 from bayesian_agent.core.registry import BayesianSkillRegistry
-from bayesian_agent.core.repair import failed_task_ids, normalize_results, summarize, summarize_incremental_lift
+from bayesian_agent.core.repair import failed_task_ids, normalize_results, repair_report, summarize, summarize_incremental_lift
+from bayesian_agent.adapters.workflow_log import evidence_from_jsonl
 
 
 def _read_json(path: str) -> Mapping[str, Any]:
@@ -44,6 +45,14 @@ def build_parser() -> argparse.ArgumentParser:
     evolve.add_argument("--registry", required=True, help="Output registry JSON path.")
     evolve.add_argument("--context-out", default="", help="Optional rendered Skill context path.")
 
+    evolve_log = sub.add_parser("evolve-workflow-log", help="Update a registry from generic assistant workflow JSONL records.")
+    evolve_log.add_argument("--jsonl", action="append", required=True, help="Path to a workflow JSONL file.")
+    evolve_log.add_argument("--registry", required=True, help="Output registry JSON path.")
+    evolve_log.add_argument("--context-out", default="", help="Optional rendered Skill context path.")
+    evolve_log.add_argument("--default-skill-id", default="workflow/default")
+    evolve_log.add_argument("--default-context", default="workflow")
+    evolve_log.add_argument("--strategy", default="exploit", help="Context ranking strategy.")
+
     summarize_cmd = sub.add_parser("summarize", help="Summarize a results JSON file.")
     summarize_cmd.add_argument("--results", required=True)
     summarize_cmd.add_argument("--out", required=True)
@@ -52,6 +61,10 @@ def build_parser() -> argparse.ArgumentParser:
     repair.add_argument("--baseline", required=True)
     repair.add_argument("--out", required=True)
 
+    repair_report_cmd = sub.add_parser("repair-report", help="Summarize failed task ids and failure-mode clusters.")
+    repair_report_cmd.add_argument("--baseline", required=True)
+    repair_report_cmd.add_argument("--out", required=True)
+
     lift = sub.add_parser("incremental-summary", help="Summarize baseline plus repair traces.")
     lift.add_argument("--baseline", required=True)
     lift.add_argument("--repairs", required=True)
@@ -70,13 +83,30 @@ def main(argv: Sequence[str] = None) -> int:
         if args.context_out:
             Path(args.context_out).write_text(SkillContextBuilder(registry).render(), encoding="utf-8")
         return 0
+    if args.command == "evolve-workflow-log":
+        registry = BayesianSkillRegistry(args.registry)
+        for jsonl_path in args.jsonl:
+            registry.record_many(
+                evidence_from_jsonl(
+                    jsonl_path,
+                    default_skill_id=args.default_skill_id,
+                    default_context=args.default_context,
+                )
+            )
+        registry.save()
+        if args.context_out:
+            Path(args.context_out).write_text(SkillContextBuilder(registry).render(strategy=args.strategy), encoding="utf-8")
+        return 0
     if args.command == "summarize":
         _write_json(args.out, summarize(normalize_results(_read_json(args.results))))
         return 0
     if args.command == "repair-plan":
         failures = {k: sorted(v) for k, v in failed_task_ids(normalize_results(_read_json(args.baseline))).items()}
         _write_json(args.out, failures)
         return 0
+    if args.command == "repair-report":
+        _write_json(args.out, repair_report(normalize_results(_read_json(args.baseline))))
+        return 0
     if args.command == "incremental-summary":
         baseline = normalize_results(_read_json(args.baseline))
         repairs = normalize_results(_read_json(args.repairs))

diff --git a/bayesian_agent/core/belief.py b/bayesian_agent/core/belief.py
@@ -43,6 +43,21 @@ def success_probability(self) -> float:
         denom = self.alpha + self.beta
         return self.alpha / denom if denom else 0.0
 
+    @property
+    def posterior_variance(self) -> float:
+        """Return the variance of the Beta(alpha, beta) posterior.
+
+        Uses alpha * beta / ((alpha + beta)^2 * (alpha + beta + 1)) and falls
+        back to 0.0 when alpha + beta is not positive.
+        """
+
+        total = self.alpha + self.beta
+        if total <= 0:
+            return 0.0
+        numerator = self.alpha * self.beta
+        spread = (total**2) * (total + 1.0)
+        return numerator / spread
+
+    @property
+    def posterior_std(self) -> float:
+        """Return the square root of `posterior_variance`."""
+
+        variance = self.posterior_variance
+        return pow(variance, 0.5)
+
     def update(self, event: TrajectoryEvidence) -> "SkillBelief":
         outcome = event.outcome.strip().lower()
         if outcome == "success":
@@ -72,6 +87,8 @@ def to_dict(self) -> Dict[str, Any]:
             "alpha": self.alpha,
             "beta": self.beta,
             "posterior_success": self.success_probability,
+            "posterior_variance": self.posterior_variance,
+            "posterior_std": self.posterior_std,
             "contexts": self.contexts,
             "failure_modes": self.failure_modes,
             "evidence": self.evidence[-MAX_EVIDENCE:],

diff --git a/bayesian_agent/core/context.py b/bayesian_agent/core/context.py
@@ -13,20 +13,22 @@ def __init__(self, registry: BayesianSkillRegistry, policy: RewritePolicy = None
         self.registry = registry
         self.policy = policy or RewritePolicy()
 
-    def render(self, task_context: str = "", limit: int = 5) -> str:
-        beliefs = self.registry.top(limit=limit, context=task_context)
+    def render(self, task_context: str = "", limit: int = 5, strategy: str = "exploit") -> str:
+        beliefs = self.registry.top(limit=limit, context=task_context, strategy=strategy)
         if not beliefs:
             return ""
         lines = [
             "### Bayesian Skill Context",
             "Use these posterior-weighted Skills/SOPs as hypotheses, not as unquestioned instructions.",
+            f"Ranking strategy: {strategy}.",
         ]
         for belief in beliefs:
             decision = self.policy.decide(belief)
             failures = ", ".join(f"{k}={v}" for k, v in sorted(belief.failure_modes.items())[:3]) or "none"
             lines.append(
                 "- "
                 f"{belief.skill_id}: posterior_success={belief.success_probability:.3f}, "
+                f"posterior_std={belief.posterior_std:.3f}, "
                 f"alpha={belief.alpha:.1f}, beta={belief.beta:.1f}, "
                 f"observations={belief.observations}, mean_tokens={belief.mean_tokens:.1f}, "
                 f"rewrite={decision.action}, failures={failures}"

diff --git a/bayesian_agent/core/ranking.py b/bayesian_agent/core/ranking.py
@@ -0,0 +1,75 @@
+"""Skill ranking strategies for posterior-weighted context selection."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Callable, Dict
+
+from bayesian_agent.core.belief import SkillBelief
+
+
+@dataclass(frozen=True)
+class RankingStrategy:
+    """Immutable pairing of a strategy name with the function that scores beliefs."""
+
+    # Identifier of the strategy.
+    name: str
+    # Human-readable summary of what the strategy prefers.
+    description: str
+    # Callable taking (belief, context) and returning the ranking score.
+    scorer: Callable[[SkillBelief, str], float]
+
+    def score(self, belief: SkillBelief, context: str = "") -> float:
+        """Apply `scorer` to `belief` and `context`, coercing the result to float."""
+
+        raw_score = self.scorer(belief, context)
+        return float(raw_score)
+
+
+def _context_bonus(belief: SkillBelief, context: str) -> float:
+    """Score how well `context` matches the contexts recorded on `belief`.
+
+    Returns:
+        1.0 for an exact match, 0.25 when `context` and a recorded context
+        contain one another as substrings, and 0.0 otherwise or when
+        `context` is empty.
+    """
+
+    if not context:
+        return 0.0
+    if context in belief.contexts:
+        return 1.0
+    # Lightweight partial match for hierarchical contexts such as "openclaw/grading".
+    # Empty recorded contexts are skipped: "" is a substring of every string and
+    # would otherwise hand the partial bonus to any query.
+    partial = any(known and (context in known or known in context) for known in belief.contexts)
+    return 0.25 if partial else 0.0
+
+
+def _safe_mean_tokens(belief: SkillBelief) -> float:
+    """Return the belief's mean token count as a float, clamped to at least 1.0."""
+
+    observed = belief.mean_tokens or 0.0
+    return max(float(observed), 1.0)
+
+
+def exploit_score(belief: SkillBelief, context: str = "") -> float:
+    """Score for the "exploit" strategy.
+
+    Posterior success probability, plus 0.15 times the context bonus, minus
+    0.25 times the posterior standard deviation.
+    """
+
+    expected = belief.success_probability
+    relevance = 0.15 * _context_bonus(belief, context)
+    uncertainty_penalty = 0.25 * belief.posterior_std
+    return expected + relevance - uncertainty_penalty
+
+
+def explore_score(belief: SkillBelief, context: str = "") -> float:
+    """Score for the "explore" strategy.
+
+    Posterior standard deviation, plus 0.10 times the context bonus, plus
+    0.01 per observation for at most three observations.
+    """
+
+    uncertainty = belief.posterior_std
+    relevance = 0.10 * _context_bonus(belief, context)
+    seen_bonus = min(belief.observations, 3) * 0.01
+    return uncertainty + relevance + seen_bonus
+
+
+def cost_aware_score(belief: SkillBelief, context: str = "") -> float:
+    """Score for the "cost_aware" strategy.
+
+    Success probability per 1000 mean tokens, plus 0.10 times the context bonus.
+
+    NOTE(review): `_safe_mean_tokens` clamps missing or zero token data to 1,
+    so a belief with no recorded token usage can score up to 1000 and outrank
+    every measured Skill. Confirm this is intended.
+    """
+
+    success_per_token = belief.success_probability / _safe_mean_tokens(belief)
+    relevance = 0.10 * _context_bonus(belief, context)
+    return success_per_token * 1000.0 + relevance
+
+
+def context_aware_score(belief: SkillBelief, context: str = "") -> float:
+    """Score for the "context_aware" strategy.
+
+    Posterior success probability, plus 0.35 times the context bonus, minus
+    0.10 times the posterior standard deviation.
+    """
+
+    expected = belief.success_probability
+    relevance = 0.35 * _context_bonus(belief, context)
+    uncertainty_penalty = 0.10 * belief.posterior_std
+    return expected + relevance - uncertainty_penalty
+
+
+# Registry of the built-in ranking strategies, keyed by each strategy's own name.
+STRATEGIES: Dict[str, RankingStrategy] = {
+    strategy.name: strategy
+    for strategy in (
+        RankingStrategy("exploit", "Prefer proven, low-uncertainty Skills.", exploit_score),
+        RankingStrategy("explore", "Prefer Skills that need more evidence.", explore_score),
+        RankingStrategy("cost_aware", "Prefer high-success, low-token Skills.", cost_aware_score),
+        RankingStrategy("context_aware", "Prefer Skills proven in similar contexts.", context_aware_score),
+    )
+}
+
+
+def get_strategy(name: str = "exploit") -> RankingStrategy:
+    """Look up a ranking strategy by name.
+
+    The name is stripped, lower-cased and has hyphens turned into underscores
+    before lookup; an empty or None name selects "exploit".
+
+    Raises:
+        ValueError: If the normalized name is not a key of `STRATEGIES`.
+    """
+
+    key = (name or "exploit").strip().lower().replace("-", "_")
+    if key in STRATEGIES:
+        return STRATEGIES[key]
+    available = ", ".join(sorted(STRATEGIES))
+    raise ValueError(f"Unknown ranking strategy '{name}'. Available: {available}")
diff --git a/bayesian_agent/core/registry.py b/bayesian_agent/core/registry.py
@@ -8,6 +8,7 @@
 
 from bayesian_agent.core.belief import SkillBelief
 from bayesian_agent.core.evidence import TrajectoryEvidence, utc_now
+from bayesian_agent.core.ranking import get_strategy
 
 
 class BayesianSkillRegistry:
@@ -60,11 +61,11 @@ def record_many(self, events: Iterable[TrajectoryEvidence]) -> List[SkillBelief]
     def beliefs(self) -> List[SkillBelief]:
         return [SkillBelief.from_dict(skill_id, raw) for skill_id, raw in self.data.get("skills", {}).items()]
 
-    def top(self, limit: int = 5, context: str = "") -> List[SkillBelief]:
+    def top(self, limit: int = 5, context: str = "", strategy: str = "exploit") -> List[SkillBelief]:
         beliefs = self.beliefs()
+        ranking = get_strategy(strategy)
 
         def score(belief: SkillBelief):
-            context_bonus = 1 if context and context in belief.contexts else 0
-            return (context_bonus, belief.success_probability, belief.observations, -belief.mean_tokens)
+            return (ranking.score(belief, context), belief.observations, -belief.mean_tokens, belief.skill_id)
 
         return sorted(beliefs, key=score, reverse=True)[:limit]
diff --git a/bayesian_agent/core/repair.py b/bayesian_agent/core/repair.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Set
+from typing import Any, Dict, Iterable, List, Mapping, Set
 
 
 BenchmarkResults = Dict[str, List[Dict[str, Any]]]
@@ -23,6 +23,40 @@ def failed_task_ids(results: Mapping[str, Iterable[Mapping[str, Any]]]) -> Dict[
     return failed
 
 
+def failure_mode_clusters(results: Mapping[str, Iterable[Mapping[str, Any]]]) -> Dict[str, Dict[str, List[str]]]:
+    """Group failed task ids by normalized failure mode for targeted repair.
+
+    Args:
+        results: Mapping of benchmark name to its run records. A run counts as
+            failed when it has a truthy `task_id` and a falsy `success`.
+
+    Returns:
+        `{benchmark: {failure_mode: [task_id, ...]}}` with ids in first-seen
+        order and listed once per mode. The mode is `failure_mode`, else
+        `error`, with surrounding whitespace removed; a missing or blank mode
+        becomes "unknown_failure". Benchmarks without failures are omitted.
+    """
+
+    clusters: Dict[str, Dict[str, List[str]]] = {}
+    seen = set()
+    for benchmark, runs in results.items():
+        bench_key = str(benchmark)
+        for run in runs:
+            task_id = run.get("task_id")
+            if not task_id or run.get("success"):
+                continue
+            raw_mode = run.get("failure_mode") or run.get("error") or ""
+            # Strip so " timeout " and "timeout" land in the same cluster.
+            mode = str(raw_mode).strip() or "unknown_failure"
+            task_key = str(task_id)
+            # Repeated failing runs of one task must not inflate its cluster.
+            marker = (bench_key, mode, task_key)
+            if marker in seen:
+                continue
+            seen.add(marker)
+            clusters.setdefault(bench_key, {}).setdefault(mode, []).append(task_key)
+    return clusters
+
+
+def repair_report(results: Mapping[str, Iterable[Mapping[str, Any]]]) -> Dict[str, Dict[str, Any]]:
+    """Build a per-benchmark repair summary.
+
+    The input is passed through `normalize_results`; each benchmark then gets
+    its run count, sorted failed task ids, their count, the failure-mode
+    clusters (modes and ids sorted) and a recommended action, which is
+    "clustered_repair" when any cluster exists and "none" otherwise.
+    """
+
+    normalized = normalize_results(results)
+    failed = failed_task_ids(normalized)
+    clusters = failure_mode_clusters(normalized)
+
+    def _entry(benchmark: str, runs: Iterable[Mapping[str, Any]]) -> Dict[str, Any]:
+        # One report row; key order matches the emitted JSON layout.
+        failed_ids = sorted(failed.get(benchmark, set()))
+        modes = clusters.get(benchmark, {})
+        return {
+            "tasks": len(list(runs)),
+            "failed_tasks": failed_ids,
+            "failure_count": len(failed_ids),
+            "failure_modes": {mode: sorted(ids) for mode, ids in sorted(modes.items())},
+            "recommended_action": "clustered_repair" if clusters.get(benchmark) else "none",
+        }
+
+    return {benchmark: _entry(benchmark, runs) for benchmark, runs in normalized.items()}
+
+
 def dedupe_by_task_id(runs: Iterable[Mapping[str, Any]]) -> List[Dict[str, Any]]:
     order: List[str] = []
     by_id: Dict[str, Dict[str, Any]] = {}