Add bandit priors for shadow meta-controller

Openclaw 2026-03-21 07:44:20 +00:00
parent 6bba85fe9a
commit 1e1bd0dc4a
3 changed files with 90 additions and 1 deletion

bandit_policy.py (new file)

@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+POLICY_CANDIDATE = Path('/home/openclaw/.openclaw/workspace/data/policy_candidate.json')
+
+
+def load_policy_candidate(path: Path = POLICY_CANDIDATE) -> dict[str, Any]:
+    try:
+        return json.loads(path.read_text(encoding='utf-8'))
+    except Exception:
+        return {'plans': {}, 'families': {}, 'generated_at': ''}
+
+
+def plan_prior(policy: dict[str, Any], chosen_plan: str) -> dict[str, Any]:
+    row = ((policy or {}).get('plans') or {}).get(chosen_plan or '', {}) or {}
+    return {
+        'mode': row.get('mode', 'observe'),
+        'beta_mean': float(row.get('beta_mean', 0.5) or 0.5),
+        'count': int(row.get('count', 0) or 0),
+        'alpha': float(row.get('alpha', 1.0) or 1.0),
+        'beta': float(row.get('beta', 1.0) or 1.0),
+    }
+
+
+def family_priors(policy: dict[str, Any], families: list[str]) -> list[dict[str, Any]]:
+    out = []
+    bucket = (policy or {}).get('families') or {}
+    for fam in families or []:
+        row = bucket.get(fam, {}) or {}
+        out.append({
+            'family': fam,
+            'mode': row.get('mode', 'observe'),
+            'beta_mean': float(row.get('beta_mean', 0.5) or 0.5),
+            'count': int(row.get('count', 0) or 0),
+            'alpha': float(row.get('alpha', 1.0) or 1.0),
+            'beta': float(row.get('beta', 1.0) or 1.0),
+        })
+    return out
+
+
+def apply_bandit_bias(*, base_decision: str, base_reason: str, chosen_plan: str, families: list[str], uncertainty: dict[str, Any], policy: dict[str, Any]) -> dict[str, Any]:
+    plan = plan_prior(policy, chosen_plan)
+    fams = family_priors(policy, families)
+    decision = base_decision
+    reason = base_reason
+    if plan['mode'] == 'prefer' and plan['beta_mean'] >= 0.65:
+        decision = 'run_plan'
+        reason = f'policy_prefers_plan:{chosen_plan}'
+    elif plan['mode'] == 'avoid' and uncertainty.get('level') in {'medium', 'high'}:
+        if 'ambiguous_access' in families:
+            decision = 'ask_clarification'
+            reason = f'policy_avoids_plan:{chosen_plan}'
+        elif base_decision == 'run_plan':
+            decision = 'answer_direct'
+            reason = f'policy_deescalates_plan:{chosen_plan}'
+    return {
+        'decision': decision,
+        'reason': reason,
+        'plan_prior': plan,
+        'family_priors': fams,
+    }
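A quick sanity check of the biasing logic: the prior only escalates when mode is 'prefer' and the posterior mean clears 0.65. A minimal sketch, assuming a hand-built policy dict in place of a real policy_candidate.json (all values and the base_reason string are illustrative):

from bandit_policy import apply_bandit_bias

policy = {
    'plans': {
        'single_tool': {'mode': 'prefer', 'beta_mean': 0.8, 'count': 24, 'alpha': 20.0, 'beta': 5.0},
    },
    'families': {},
}
result = apply_bandit_bias(
    base_decision='answer_direct',
    base_reason='baseline_reason',  # illustrative placeholder
    chosen_plan='single_tool',
    families=[],
    uncertainty={'level': 'low'},
    policy=policy,
)
# mode == 'prefer' and beta_mean >= 0.65, so the decision escalates:
assert result['decision'] == 'run_plan'
assert result['reason'] == 'policy_prefers_plan:single_tool'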


@@ -6,6 +6,7 @@ from typing import Any
 from tool_graph import build_tool_graph
 from uncertainty_model import estimate_uncertainty
+from bandit_policy import load_policy_candidate, apply_bandit_bias
@@ -37,6 +38,19 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
         decision = 'run_plan'
         reason = 'weak_grounding_under_uncertainty'
 
+    chosen_plan = str(analysis.get('composition_reason') or 'single_tool')
+    policy = load_policy_candidate()
+    bandit = apply_bandit_bias(
+        base_decision=decision,
+        base_reason=reason,
+        chosen_plan=chosen_plan,
+        families=families,
+        uncertainty=uncertainty,
+        policy=policy,
+    )
+    decision = bandit.get('decision', decision)
+    reason = bandit.get('reason', reason)
+
     return {
         'ts': datetime.now(timezone.utc).isoformat(),
         'message': message,
@@ -47,7 +61,11 @@ def shadow_decision(message: str, analysis: dict[str, Any], family_candidates: l
         'uncertainty': uncertainty,
         'family_candidates': families,
         'normalized_task': f"{analysis.get('role','')}:{analysis.get('task_type','')}",
-        'chosen_plan': str(analysis.get('composition_reason') or 'single_tool'),
+        'chosen_plan': chosen_plan,
+        'policy_hint': {
+            'plan_prior': bandit.get('plan_prior', {}),
+            'family_priors': bandit.get('family_priors', []),
+        },
     }
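For reference, when policy_candidate.json has no row for the chosen plan, plan_prior falls back to its observe/Beta(1, 1) defaults, so the new fields in the shadow record come out like this (illustrative values):

    'chosen_plan': 'single_tool',
    'policy_hint': {
        'plan_prior': {'mode': 'observe', 'beta_mean': 0.5, 'count': 0, 'alpha': 1.0, 'beta': 1.0},
        'family_priors': [],
    },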


@@ -52,6 +52,8 @@ def main() -> int:
         'success': success,
         'failure_like': failure,
         'avg_reward': (reward_sum / count if count else 0.0),
+        'alpha': success + 1.0,
+        'beta': failure + 1.0,
         'beta_mean': beta_mean(success, failure),
         'mode': classify_mode(row),
     }
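The exported alpha and beta are the parameters of a Beta posterior under a uniform Beta(1, 1) prior: each success increments alpha, each failure-like outcome increments beta. beta_mean itself is not part of this diff; a definition consistent with those fields would be the posterior mean (an assumption about the existing helper, not confirmed by the diff):

def beta_mean(success: float, failure: float) -> float:
    # Posterior mean of Beta(success + 1, failure + 1):
    # (success + 1) / (success + failure + 2), which is 0.5 with no
    # observations, matching the default used in bandit_policy.plan_prior.
    return (success + 1.0) / (success + failure + 2.0)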