Add bandit priors for shadow meta-controller
This commit is contained in:
parent
6bba85fe9a
commit
1e1bd0dc4a
3 changed files with 90 additions and 1 deletion
69
syncpatch/bandit_policy.py
Normal file
69
syncpatch/bandit_policy.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Default location of the policy-candidate JSON file; used as the default
# ``path`` argument of load_policy_candidate().
# NOTE(review): absolute path under one user's home directory -- confirm this
# is intended for every environment that imports this module.
POLICY_CANDIDATE = Path('/home/openclaw/.openclaw/workspace/data/policy_candidate.json')
|
||||
|
||||
|
||||
def load_policy_candidate(path: Path = POLICY_CANDIDATE) -> dict[str, Any]:
    """Load the policy candidate from a JSON file, best-effort.

    Args:
        path: Location of the policy JSON file. A plain string path is
            accepted as well as a ``Path``.

    Returns:
        The decoded top-level JSON object. When the file cannot be read or
        decoded, or when its top-level value is not a JSON object, an empty
        policy ``{'plans': {}, 'families': {}, 'generated_at': ''}`` is
        returned instead. This function never raises for those cases.
    """
    empty_policy: dict[str, Any] = {'plans': {}, 'families': {}, 'generated_at': ''}
    try:
        loaded = json.loads(Path(path).read_text(encoding='utf-8'))
    except (OSError, ValueError, TypeError, RecursionError):
        # OSError: missing or unreadable file.
        # ValueError: invalid JSON (JSONDecodeError) or invalid UTF-8.
        # TypeError: ``path`` is not path-like.
        # RecursionError: pathologically nested JSON.
        return empty_policy
    # Callers use ``.get`` on the result, so a list/str/number at the top
    # level must not leak through.
    if not isinstance(loaded, dict):
        return empty_policy
    return loaded
|
||||
|
||||
|
||||
|
||||
def _prior_number(row: dict[str, Any], key: str, default: Any, cast: type = float) -> Any:
    """Return ``cast(row[key])``, or *default* when the value is unusable.

    Missing, falsy (``None``, ``0``, ``''``) and non-numeric values all yield
    *default*, so a malformed policy file cannot raise here.
    """
    try:
        return cast(row.get(key) or default)
    except (TypeError, ValueError, OverflowError):
        return default


def _policy_bucket(policy: Any, name: str) -> dict[str, Any]:
    """Return ``policy[name]`` when both are dicts, else an empty dict."""
    bucket = policy.get(name) if isinstance(policy, dict) else None
    return bucket if isinstance(bucket, dict) else {}


def _normalize_prior(row: Any) -> dict[str, Any]:
    """Turn one raw policy row into a prior dict with all fields present."""
    if not isinstance(row, dict):
        row = {}
    return {
        # A null/empty mode falls back like every other field does.
        'mode': row.get('mode') or 'observe',
        'beta_mean': _prior_number(row, 'beta_mean', 0.5),
        'count': _prior_number(row, 'count', 0, int),
        'alpha': _prior_number(row, 'alpha', 1.0),
        'beta': _prior_number(row, 'beta', 1.0),
    }


def plan_prior(policy: dict[str, Any], chosen_plan: str) -> dict[str, Any]:
    """Look up the prior recorded for *chosen_plan* under ``policy['plans']``.

    Args:
        policy: Policy mapping; ``None`` or a malformed value is treated as
            an empty policy.
        chosen_plan: Plan key to look up; a falsy value is looked up as ``''``.

    Returns:
        A dict with keys ``mode``, ``beta_mean``, ``count``, ``alpha`` and
        ``beta``. Fields that are missing, falsy or malformed take the
        defaults ``'observe'``, ``0.5``, ``0``, ``1.0`` and ``1.0``.
    """
    return _normalize_prior(_policy_bucket(policy, 'plans').get(chosen_plan or ''))


def family_priors(policy: dict[str, Any], families: list[str]) -> list[dict[str, Any]]:
    """Look up the prior for each family under ``policy['families']``.

    Args:
        policy: Policy mapping; ``None`` or a malformed value is treated as
            an empty policy.
        families: Family keys to look up; ``None`` is treated as empty.

    Returns:
        One dict per entry of *families*, in input order. Each carries the
        ``family`` key followed by the same fields and defaults that
        ``plan_prior`` produces.
    """
    bucket = _policy_bucket(policy, 'families')
    return [
        {'family': fam, **_normalize_prior(bucket.get(fam))}
        for fam in families or []
    ]
|
||||
|
||||
|
||||
|
||||
def apply_bandit_bias(
    *,
    base_decision: str,
    base_reason: str,
    chosen_plan: str,
    families: list[str],
    uncertainty: dict[str, Any],
    policy: dict[str, Any],
) -> dict[str, Any]:
    """Adjust a base decision using the policy priors for the chosen plan.

    Rules, checked in order:

    * Plan mode ``'prefer'`` with ``beta_mean >= 0.65``: the decision becomes
      ``'run_plan'``.
    * Plan mode ``'avoid'`` while the uncertainty level is ``'medium'`` or
      ``'high'``: the decision becomes ``'ask_clarification'`` when
      ``'ambiguous_access'`` is among *families*; otherwise a base decision
      of ``'run_plan'`` is changed to ``'answer_direct'``.
    * In every other case the base decision and reason pass through.

    Args:
        base_decision: Decision made before the policy is consulted.
        base_reason: Reason accompanying *base_decision*.
        chosen_plan: Plan key looked up in the policy.
        families: Family keys for the request; ``None`` is treated as empty.
        uncertainty: Mapping whose ``'level'`` entry is consulted; ``None``
            is treated as an empty mapping.
        policy: Policy mapping passed to ``plan_prior`` / ``family_priors``.

    Returns:
        A dict with ``decision``, ``reason``, ``plan_prior`` and
        ``family_priors``.
    """
    plan = plan_prior(policy, chosen_plan)
    fams = family_priors(policy, families)

    # family_priors() already accepts families=None; tolerate None here too so
    # the 'avoid' branch cannot raise on a membership test or a .get call.
    family_names = families or []
    level = (uncertainty or {}).get('level')

    decision, reason = base_decision, base_reason
    mode = plan['mode']

    if mode == 'prefer' and plan['beta_mean'] >= 0.65:
        decision = 'run_plan'
        reason = f'policy_prefers_plan:{chosen_plan}'
    elif mode == 'avoid' and level in ('medium', 'high'):
        if 'ambiguous_access' in family_names:
            decision = 'ask_clarification'
            reason = f'policy_avoids_plan:{chosen_plan}'
        elif base_decision == 'run_plan':
            decision = 'answer_direct'
            reason = f'policy_deescalates_plan:{chosen_plan}'

    return {
        'decision': decision,
        'reason': reason,
        'plan_prior': plan,
        'family_priors': fams,
    }
|
||||
|
|
@ -6,6 +6,7 @@ from typing import Any
|
|||
|
||||
from tool_graph import build_tool_graph
|
||||
from uncertainty_model import estimate_uncertainty
|
||||
from bandit_policy import load_policy_candidate, apply_bandit_bias
|
||||
|
||||
|
||||
|
||||
|
|
@ -37,6 +38,19 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
|
|||
decision = 'run_plan'
|
||||
reason = 'weak_grounding_under_uncertainty'
|
||||
|
||||
chosen_plan = str(analysis.get('composition_reason') or 'single_tool')
|
||||
policy = load_policy_candidate()
|
||||
bandit = apply_bandit_bias(
|
||||
base_decision=decision,
|
||||
base_reason=reason,
|
||||
chosen_plan=chosen_plan,
|
||||
families=families,
|
||||
uncertainty=uncertainty,
|
||||
policy=policy,
|
||||
)
|
||||
decision = bandit.get('decision', decision)
|
||||
reason = bandit.get('reason', reason)
|
||||
|
||||
return {
|
||||
'ts': datetime.now(timezone.utc).isoformat(),
|
||||
'message': message,
|
||||
|
|
@ -47,7 +61,11 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
|
|||
'uncertainty': uncertainty,
|
||||
'family_candidates': families,
|
||||
'normalized_task': f"{analysis.get('role','')}:{analysis.get('task_type','')}",
|
||||
'chosen_plan': str(analysis.get('composition_reason') or 'single_tool'),
|
||||
'chosen_plan': chosen_plan,
|
||||
'policy_hint': {
|
||||
'plan_prior': bandit.get('plan_prior', {}),
|
||||
'family_priors': bandit.get('family_priors', []),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ def main() -> int:
|
|||
'success': success,
|
||||
'failure_like': failure,
|
||||
'avg_reward': (reward_sum / count if count else 0.0),
|
||||
'alpha': success + 1.0,
|
||||
'beta': failure + 1.0,
|
||||
'beta_mean': beta_mean(success, failure),
|
||||
'mode': classify_mode(row),
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue