diff --git a/syncpatch/bandit_policy.py b/syncpatch/bandit_policy.py
new file mode 100644
index 0000000..9e0f73c
--- /dev/null
+++ b/syncpatch/bandit_policy.py
@@ -0,0 +1,135 @@
+"""Bias shadow decisions with an offline-trained policy candidate."""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+POLICY_CANDIDATE = Path('/home/openclaw/.openclaw/workspace/data/policy_candidate.json')
+
+
+def _empty_policy() -> dict[str, Any]:
+    """Return a fresh neutral policy so callers never share mutable state."""
+    return {'plans': {}, 'families': {}, 'generated_at': ''}
+
+
+def load_policy_candidate(path: Path = POLICY_CANDIDATE) -> dict[str, Any]:
+    """Load the policy candidate JSON, or a neutral policy if it is unusable.
+
+    A missing, unreadable or malformed file degrades to the empty policy
+    instead of raising, so the caller can always proceed.
+    """
+    try:
+        loaded = json.loads(path.read_text(encoding='utf-8'))
+    except (OSError, ValueError):
+        # OSError: missing/unreadable file; ValueError: bad JSON or encoding.
+        return _empty_policy()
+    # Valid JSON that is not an object (list, null, ...) would fail later on
+    # ``.get``, so treat it like a missing policy.
+    return loaded if isinstance(loaded, dict) else _empty_policy()
+
+
+def _to_float(value: Any, default: float, *, keep_zero: bool = False) -> float:
+    """Coerce *value* to float, using *default* when missing or invalid.
+
+    Zero is replaced by *default* unless *keep_zero* is set.
+    """
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+    if number == 0.0 and not keep_zero:
+        return default
+    return number
+
+
+def _to_int(value: Any, default: int = 0) -> int:
+    """Coerce *value* to int, using *default* when missing or invalid."""
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+def _section(policy: Any, key: str) -> dict[str, Any]:
+    """Return the mapping stored under *key*, or an empty one if absent."""
+    table = policy.get(key) if isinstance(policy, dict) else None
+    return table if isinstance(table, dict) else {}
+
+
+def _prior_from_row(row: Any) -> dict[str, Any]:
+    """Normalise one stored plan/family row into a prior with defaults."""
+    if not isinstance(row, dict):
+        row = {}
+    return {
+        'mode': row.get('mode', 'observe'),
+        # A stored mean of 0.0 is real data; only absent/invalid values
+        # fall back to the neutral 0.5.
+        'beta_mean': _to_float(row.get('beta_mean'), 0.5, keep_zero=True),
+        'count': _to_int(row.get('count')),
+        'alpha': _to_float(row.get('alpha'), 1.0),
+        'beta': _to_float(row.get('beta'), 1.0),
+    }
+
+
+def plan_prior(policy: dict[str, Any], chosen_plan: str) -> dict[str, Any]:
+    """Return the prior stored for *chosen_plan*, or defaults if unknown."""
+    return _prior_from_row(_section(policy, 'plans').get(chosen_plan or ''))
+
+
+def family_priors(policy: dict[str, Any], families: list[str]) -> list[dict[str, Any]]:
+    """Return one prior per entry of *families*, in the same order."""
+    bucket = _section(policy, 'families')
+    return [
+        {'family': fam, **_prior_from_row(bucket.get(fam))}
+        for fam in families or []
+    ]
+
+
+def apply_bandit_bias(
+    *,
+    base_decision: str,
+    base_reason: str,
+    chosen_plan: str,
+    families: list[str],
+    uncertainty: dict[str, Any],
+    policy: dict[str, Any],
+) -> dict[str, Any]:
+    """Adjust a base decision using the stored prior for *chosen_plan*.
+
+    A plan in ``prefer`` mode with ``beta_mean >= 0.65`` forces ``run_plan``.
+    A plan in ``avoid`` mode is only overridden when the uncertainty level is
+    ``medium`` or ``high``: with the ``ambiguous_access`` family present the
+    decision becomes ``ask_clarification``; otherwise a base ``run_plan`` is
+    de-escalated to ``answer_direct``. In every other case the base decision
+    and reason are returned unchanged.
+
+    Returns a dict with ``decision``, ``reason``, ``plan_prior`` and
+    ``family_priors``.
+    """
+    plan = plan_prior(policy, chosen_plan)
+    fams = family_priors(policy, families)
+    decision = base_decision
+    reason = base_reason
+    # Tolerate ``None`` for either container, as ``family_priors`` already
+    # does for ``families``.
+    level = (uncertainty or {}).get('level')
+    family_names = families or []
+
+    if plan['mode'] == 'prefer' and plan['beta_mean'] >= 0.65:
+        decision = 'run_plan'
+        reason = f'policy_prefers_plan:{chosen_plan}'
+    elif plan['mode'] == 'avoid' and level in {'medium', 'high'}:
+        if 'ambiguous_access' in family_names:
+            decision = 'ask_clarification'
+            reason = f'policy_avoids_plan:{chosen_plan}'
+        elif base_decision == 'run_plan':
+            decision = 'answer_direct'
+            reason = f'policy_deescalates_plan:{chosen_plan}'
+
+    return {
+        'decision': decision,
+        'reason': reason,
+        'plan_prior': plan,
+        'family_priors': fams,
+    }
diff --git a/syncpatch/meta_controller.py b/syncpatch/meta_controller.py
index c4b00e6..6ee337f 100644
--- a/syncpatch/meta_controller.py
+++ b/syncpatch/meta_controller.py
@@ -6,6 +6,7 @@ from typing import Any
 
 from tool_graph import build_tool_graph
 from uncertainty_model import estimate_uncertainty
+from bandit_policy import load_policy_candidate, apply_bandit_bias
 
 
 
@@ -37,6 +38,19 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
         decision = 'run_plan'
         reason = 'weak_grounding_under_uncertainty'
 
+    chosen_plan = str(analysis.get('composition_reason') or 'single_tool')
+    policy = load_policy_candidate()
+    bandit = apply_bandit_bias(
+        base_decision=decision,
+        base_reason=reason,
+        chosen_plan=chosen_plan,
+        families=families,
+        uncertainty=uncertainty,
+        policy=policy,
+    )
+    decision = bandit.get('decision', decision)
+    reason = bandit.get('reason', reason)
+
     return {
         'ts': datetime.now(timezone.utc).isoformat(),
         'message': message,
@@ -47,7 +61,11 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
         'uncertainty': uncertainty,
         'family_candidates': families,
         'normalized_task': f"{analysis.get('role','')}:{analysis.get('task_type','')}",
-        'chosen_plan': str(analysis.get('composition_reason') or 'single_tool'),
+        'chosen_plan': chosen_plan,
+        'policy_hint': {
+            'plan_prior': bandit.get('plan_prior', {}),
+            'family_priors': bandit.get('family_priors', []),
+        },
     }
 
 
diff --git a/syncpatch/train-policy-offline b/syncpatch/train-policy-offline
index 4333c6f..bff7535 100644
--- a/syncpatch/train-policy-offline
+++ b/syncpatch/train-policy-offline
@@ -52,6 +52,8 @@ def main() -> int:
             'success': success,
             'failure_like': failure,
             'avg_reward': (reward_sum / count if count else 0.0),
+            'alpha': success + 1.0,
+            'beta': failure + 1.0,
             'beta_mean': beta_mean(success, failure),
             'mode': classify_mode(row),
         }