Add bandit priors for shadow meta-controller

2026-03-21 07:44:20 +00:00 · 2026-03-21 07:44:20 +00:00 · 1e1bd0dc4a
commit 1e1bd0dc4a
parent 6bba85fe9a
3 changed files with 90 additions and 1 deletion
--- a/syncpatch/bandit_policy.py
+++ b/syncpatch/bandit_policy.py
@ -0,0 +1,69 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+# Default location of the policy-candidate JSON read by load_policy_candidate().
+POLICY_CANDIDATE = Path('/home/openclaw/.openclaw/workspace/data/policy_candidate.json')
+
+
+def load_policy_candidate(path: Path = POLICY_CANDIDATE) -> dict[str, Any]:
+    """Load the policy-candidate document, falling back to an empty policy.
+
+    Args:
+        path: JSON file to read; defaults to ``POLICY_CANDIDATE``. A plain
+            string path is accepted as well.
+
+    Returns:
+        The decoded JSON object. An empty policy (no plans, no families,
+        blank ``generated_at``) is returned instead when the file is missing
+        or unreadable, is not valid UTF-8 JSON, or does not decode to a JSON
+        object.
+    """
+    empty: dict[str, Any] = {'plans': {}, 'families': {}, 'generated_at': ''}
+    try:
+        loaded = json.loads(Path(path).read_text(encoding='utf-8'))
+    except (OSError, ValueError):
+        # OSError: missing/unreadable file. ValueError: covers both
+        # json.JSONDecodeError and UnicodeDecodeError. Loading is best-effort,
+        # so none of these may propagate to the caller.
+        return empty
+    # A top-level list/string/number would break the .get() lookups that
+    # consumers perform on the policy, so treat it like a missing file.
+    return loaded if isinstance(loaded, dict) else empty
+
+
+
+def _as_number(raw: Any, cast: type, default: Any) -> Any:
+    """Return ``cast(raw)``, or *default* when *raw* is None or cannot be cast."""
+    if raw is None:
+        return default
+    try:
+        return cast(raw)
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+def _policy_section(policy: Any, key: str) -> dict[str, Any]:
+    """Return ``policy[key]`` when it is a dict, otherwise an empty dict."""
+    section = policy.get(key) if isinstance(policy, dict) else None
+    return section if isinstance(section, dict) else {}
+
+
+def _prior_from_row(row: Any) -> dict[str, Any]:
+    """Normalise one stored policy row into a prior with safe defaults."""
+    if not isinstance(row, dict):
+        row = {}
+    return {
+        'mode': row.get('mode') or 'observe',
+        # A stored mean of exactly 0.0 is kept as-is; only a missing or
+        # malformed value falls back to the neutral 0.5.
+        'beta_mean': _as_number(row.get('beta_mean'), float, 0.5),
+        'count': _as_number(row.get('count'), int, 0),
+        # A zero alpha/beta still falls back to 1.0, as before.
+        'alpha': _as_number(row.get('alpha'), float, 1.0) or 1.0,
+        'beta': _as_number(row.get('beta'), float, 1.0) or 1.0,
+    }
+
+
+def plan_prior(policy: dict[str, Any], chosen_plan: str) -> dict[str, Any]:
+    """Return the prior recorded for *chosen_plan* under ``policy['plans']``.
+
+    Args:
+        policy: Policy document; ``None`` or a malformed value is treated as
+            an empty policy.
+        chosen_plan: Plan identifier to look up; ``None`` is looked up as ``''``.
+
+    Returns:
+        A dict with keys ``mode``, ``beta_mean``, ``count``, ``alpha`` and
+        ``beta``. Missing or malformed entries fall back to ``'observe'``,
+        ``0.5``, ``0``, ``1.0`` and ``1.0`` respectively.
+    """
+    plans = _policy_section(policy, 'plans')
+    return _prior_from_row(plans.get(chosen_plan or ''))
+
+
+def family_priors(policy: dict[str, Any], families: list[str]) -> list[dict[str, Any]]:
+    """Return one prior per entry of *families*, in the order given.
+
+    Args:
+        policy: Policy document; ``None`` or a malformed value is treated as
+            an empty policy.
+        families: Family names to look up under ``policy['families']``;
+            ``None`` is treated as an empty list.
+
+    Returns:
+        A list of dicts shaped like the result of ``plan_prior`` with an
+        extra leading ``family`` key holding the looked-up name.
+    """
+    bucket = _policy_section(policy, 'families')
+    return [
+        {'family': fam, **_prior_from_row(bucket.get(fam))}
+        for fam in families or []
+    ]
+
+
+
+def apply_bandit_bias(
+    *,
+    base_decision: str,
+    base_reason: str,
+    chosen_plan: str,
+    families: list[str],
+    uncertainty: dict[str, Any],
+    policy: dict[str, Any],
+) -> dict[str, Any]:
+    """Adjust a base decision using the stored priors for the chosen plan.
+
+    Rules, evaluated in order:
+
+    * plan mode ``'prefer'`` with ``beta_mean >= 0.65``: decision becomes
+      ``'run_plan'``.
+    * plan mode ``'avoid'`` while the uncertainty level is ``'medium'`` or
+      ``'high'``: decision becomes ``'ask_clarification'`` when
+      ``'ambiguous_access'`` is among *families*; otherwise a base decision
+      of ``'run_plan'`` is replaced by ``'answer_direct'``.
+    * anything else: the base decision and reason pass through unchanged.
+
+    Args:
+        base_decision: Decision made before the policy is consulted.
+        base_reason: Reason accompanying *base_decision*.
+        chosen_plan: Plan identifier looked up in the policy.
+        families: Candidate family names; ``None`` is treated as empty.
+        uncertainty: Mapping whose ``'level'`` entry is read; ``None`` or a
+            non-dict value is treated as having no level.
+        policy: Policy document passed to ``plan_prior`` / ``family_priors``.
+
+    Returns:
+        A dict with the final ``decision`` and ``reason`` plus the
+        ``plan_prior`` and ``family_priors`` that were consulted.
+    """
+    # Normalise optional inputs once, so the membership test and the level
+    # lookup below cannot fail on None the way the prior lookups already don't.
+    families = families or []
+    level = uncertainty.get('level') if isinstance(uncertainty, dict) else None
+
+    plan = plan_prior(policy, chosen_plan)
+    fams = family_priors(policy, families)
+    decision, reason = base_decision, base_reason
+
+    if plan['mode'] == 'prefer' and plan['beta_mean'] >= 0.65:
+        decision = 'run_plan'
+        reason = f'policy_prefers_plan:{chosen_plan}'
+    elif plan['mode'] == 'avoid' and level in {'medium', 'high'}:
+        if 'ambiguous_access' in families:
+            decision = 'ask_clarification'
+            reason = f'policy_avoids_plan:{chosen_plan}'
+        elif base_decision == 'run_plan':
+            decision = 'answer_direct'
+            reason = f'policy_deescalates_plan:{chosen_plan}'
+
+    return {
+        'decision': decision,
+        'reason': reason,
+        'plan_prior': plan,
+        'family_priors': fams,
+    }
--- a/syncpatch/meta_controller.py
+++ b/syncpatch/meta_controller.py
@ -6,6 +6,7 @@ from typing import Any

 from tool_graph import build_tool_graph
 from uncertainty_model import estimate_uncertainty
+from bandit_policy import load_policy_candidate, apply_bandit_bias



@ -37,6 +38,19 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
        decision = 'run_plan'
        reason = 'weak_grounding_under_uncertainty'

+    chosen_plan = str(analysis.get('composition_reason') or 'single_tool')
+    policy = load_policy_candidate()
+    bandit = apply_bandit_bias(
+        base_decision=decision,
+        base_reason=reason,
+        chosen_plan=chosen_plan,
+        families=families,
+        uncertainty=uncertainty,
+        policy=policy,
+    )
+    decision = bandit.get('decision', decision)
+    reason = bandit.get('reason', reason)
+
    return {
        'ts': datetime.now(timezone.utc).isoformat(),
        'message': message,
@ -47,7 +61,11 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
        'uncertainty': uncertainty,
        'family_candidates': families,
        'normalized_task': f"{analysis.get('role','')}:{analysis.get('task_type','')}",
-        'chosen_plan': str(analysis.get('composition_reason') or 'single_tool'),
+        'chosen_plan': chosen_plan,
+        'policy_hint': {
+            'plan_prior': bandit.get('plan_prior', {}),
+            'family_priors': bandit.get('family_priors', []),
+        },
    }


--- a/syncpatch/train-policy-offline
+++ b/syncpatch/train-policy-offline
@ -52,6 +52,8 @@ def main() -> int:
                'success': success,
                'failure_like': failure,
                'avg_reward': (reward_sum / count if count else 0.0),
+                'alpha': success + 1.0,
+                'beta': failure + 1.0,
                'beta_mean': beta_mean(success, failure),
                'mode': classify_mode(row),
            }