Add offline policy builder and sacred eval gate
This commit is contained in:
parent
94eae8ceba
commit
6bba85fe9a
3 changed files with 141 additions and 0 deletions
6
configs/sacred_eval_gate.json
Normal file
6
configs/sacred_eval_gate.json
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
{
  "min_route_accuracy_percent": 95.0,
  "min_memory_signal_accuracy_percent": 90.0,
  "max_timeouts": 0,
  "require_pass": true
}
|
||||||
69
syncpatch/run-sacred-evals
Normal file
69
syncpatch/run-sacred-evals
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Workspace root; every other path in this script is resolved relative to it.
ROOT = Path('/home/openclaw/.openclaw/workspace')
# Gate thresholds (JSON object); load_config() falls back to built-in defaults
# when this file cannot be used.
CONFIG = ROOT / 'configs' / 'sacred_eval_gate.json'
# Metrics file that main() reads after the runner exits.
# NOTE(review): presumably written by RUNNER -- confirm against the runner.
RESULT = ROOT / 'evals' / 'results' / 'regression_results.json'
# Regression script launched by run_regression().
RUNNER = ROOT / 'evals' / 'run_regression.py'
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(path: Path | None = None) -> dict:
    """Load the gate thresholds, falling back to the built-in defaults.

    Args:
        path: Config file to read. Defaults to the module-level ``CONFIG``.

    Returns:
        A dict holding ``min_route_accuracy_percent``,
        ``min_memory_signal_accuracy_percent``, ``max_timeouts`` and
        ``require_pass``. Keys present in the file override the defaults;
        keys missing from the file keep their default value.

    A missing file yields the defaults silently. An unreadable file,
    malformed JSON, or a JSON value that is not an object also yields the
    defaults, but a warning is written to stderr so that a broken config is
    not mistaken for an intentional one.
    """
    import sys  # local import: the module header does not import sys

    defaults = {
        'min_route_accuracy_percent': 95.0,
        'min_memory_signal_accuracy_percent': 90.0,
        'max_timeouts': 0,
        'require_pass': True,
    }
    source = Path(CONFIG if path is None else path)
    try:
        loaded = json.loads(source.read_text(encoding='utf-8'))
    except FileNotFoundError:
        return defaults
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f'sacred-eval-gate: ignoring unreadable config {source}: {exc}', file=sys.stderr)
        return defaults
    if not isinstance(loaded, dict):
        # Callers use cfg.get(...); anything but an object would crash there.
        print(f'sacred-eval-gate: ignoring config {source}: top-level JSON value is not an object', file=sys.stderr)
        return defaults
    return {**defaults, **loaded}
|
||||||
|
|
||||||
|
|
||||||
|
def run_regression(runner: Path | None = None) -> int:
    """Run the regression script in a child interpreter and return its exit code.

    Args:
        runner: Script to execute. Defaults to the module-level ``RUNNER``.

    Returns:
        The child's return code, or 127 when the child process could not be
        started at all.

    The child's output is captured so it cannot mix with the JSON this script
    prints on stdout. When the child fails, its stderr is forwarded to our
    stderr so the failure can be diagnosed.
    """
    import sys  # local import: the module header does not import sys

    script = RUNNER if runner is None else runner
    # Prefer the interpreter running this script over whatever "python3"
    # resolves to on PATH; sys.executable may be empty in embedded setups.
    interpreter = sys.executable or 'python3'
    try:
        proc = subprocess.run(
            [interpreter, str(script)],
            capture_output=True,
            text=True,
            errors='replace',  # undecodable runner output must not crash the gate
        )
    except OSError as exc:
        print(f'sacred-eval-gate: cannot start runner {script}: {exc}', file=sys.stderr)
        return 127
    if proc.returncode != 0 and proc.stderr:
        sys.stderr.write(proc.stderr)
    return proc.returncode
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run the regression suite and print a promote/hold decision as JSON.

    Returns:
        0 when every gate passes (decision ``promote``), 1 otherwise
        (decision ``hold``).

    Exactly one JSON object is printed to stdout on every path, including
    the failure paths: runner failure, unusable results file, and invalid
    threshold values.
    """
    cfg = load_config()
    if not isinstance(cfg, dict):
        # Defensive: thresholds are looked up with cfg.get(...) below.
        cfg = {}

    rc = run_regression()
    if rc != 0:
        print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'runner_failed', 'runner_rc': rc}, ensure_ascii=False))
        return 1

    try:
        data = json.loads(RESULT.read_text(encoding='utf-8'))
        if not isinstance(data, dict):
            raise ValueError('results file does not contain a JSON object')
        # "or 0" maps null/empty values onto the same default as a missing key.
        route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
        memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
        timeouts = int(data.get('timeouts', 0) or 0)
    except (OSError, ValueError, TypeError) as exc:
        # A missing, corrupt or mistyped results file must hold the promotion
        # with a machine-readable reason instead of crashing with a traceback.
        print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'results_unreadable', 'error': str(exc)}, ensure_ascii=False))
        return 1
    passed = bool(data.get('pass'))

    try:
        min_route = float(cfg.get('min_route_accuracy_percent', 95.0))
        min_memory = float(cfg.get('min_memory_signal_accuracy_percent', 90.0))
        max_timeouts = int(cfg.get('max_timeouts', 0))
    except (ValueError, TypeError) as exc:
        print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'config_invalid', 'error': str(exc)}, ensure_ascii=False))
        return 1
    require_pass = bool(cfg.get('require_pass', True))

    reasons = []
    if route < min_route:
        reasons.append('route_accuracy_below_threshold')
    if memory < min_memory:
        reasons.append('memory_accuracy_below_threshold')
    if timeouts > max_timeouts:
        reasons.append('timeouts_above_threshold')
    if require_pass and not passed:
        reasons.append('regression_not_pass')
    ok = not reasons

    print(json.dumps({
        'ok': ok,
        'decision': 'promote' if ok else 'hold',
        'reasons': reasons,
        'route_accuracy_percent': route,
        'memory_signal_accuracy_percent': memory,
        'timeouts': timeouts,
        'pass': passed,
    }, ensure_ascii=False))
    return 0 if ok else 1
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: the process exit status is main()'s return value.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||||
66
syncpatch/train-policy-offline
Normal file
66
syncpatch/train-policy-offline
Normal file
|
|
@ -0,0 +1,66 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Workspace root; every other path in this script is resolved relative to it.
ROOT = Path('/home/openclaw/.openclaw/workspace')
# Input: JSON object read by main(); its 'plans' and 'families' buckets are used.
POLICY_STATS = ROOT / 'data' / 'policy_stats.json'
# Output: candidate policy written by main().
OUT_PATH = ROOT / 'data' / 'policy_candidate.json'
|
||||||
|
|
||||||
|
|
||||||
|
def beta_mean(success: int, failure: int, alpha: float = 1.0, beta: float = 1.0) -> float:
    """Return the smoothed success rate ``(success + alpha) / (total + alpha + beta)``.

    Args:
        success: Number of successful outcomes.
        failure: Number of unsuccessful outcomes.
        alpha: Pseudo-count added to the successes.
        beta: Pseudo-count added to the failures.

    With the default pseudo-counts of 1.0 each, a row with no observations
    evaluates to 0.5.
    """
    return (success + alpha) / (success + failure + alpha + beta)
|
||||||
|
|
||||||
|
|
||||||
|
def classify_mode(
    row: dict,
    *,
    min_count: int = 3,
    min_success: int = 2,
    prefer_min_avg_reward: float = 3.0,
    avoid_max_avg_reward: float = 0.5,
) -> str:
    """Classify one stats row as ``'prefer'``, ``'avoid'`` or ``'observe'``.

    Args:
        row: Mapping with optional ``count``, ``success``, ``failure``,
            ``clarification`` and ``reward_sum`` entries. Missing or null
            entries count as zero.
        min_count: Minimum ``count`` before a row may leave ``'observe'``.
        min_success: Floor on the number of successes required for ``'prefer'``.
        prefer_min_avg_reward: Average reward at or above which a row may be
            ``'prefer'``.
        avoid_max_avg_reward: Average reward strictly below which a row may be
            ``'avoid'``.

    Returns:
        ``'prefer'`` when there are enough samples, successes are at least
        ``max(min_success, failure + clarification)`` and the average reward
        reaches ``prefer_min_avg_reward``. ``'avoid'`` when there are enough
        samples, failures outnumber successes and the average reward is below
        ``avoid_max_avg_reward``. ``'observe'`` otherwise.
    """
    count = int(row.get('count', 0) or 0)
    success = int(row.get('success', 0) or 0)
    failure = int(row.get('failure', 0) or 0)
    clar = int(row.get('clarification', 0) or 0)
    reward_sum = float(row.get('reward_sum', 0.0) or 0.0)
    # A row without observations has no meaningful average; treat it as 0.0.
    avg_reward = reward_sum / count if count else 0.0

    if count < min_count:
        return 'observe'
    if success >= max(min_success, failure + clar) and avg_reward >= prefer_min_avg_reward:
        return 'prefer'
    # NOTE(review): clarifications count against 'prefer' above but are not
    # counted as failures here -- confirm this asymmetry is intended.
    if failure > success and avg_reward < avoid_max_avg_reward:
        return 'avoid'
    return 'observe'
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Build the candidate policy file from the accumulated policy statistics.

    Reads ``POLICY_STATS``, summarises every row of its ``plans`` and
    ``families`` buckets (counts, average reward, smoothed success rate and
    mode) and writes the result to ``OUT_PATH``.

    Returns:
        0 on success, 1 when the statistics cannot be read, are not shaped as
        expected, or the candidate file cannot be written.

    Exactly one JSON status object is printed to stdout on every path. Rows
    that are not objects or hold non-numeric values are skipped and counted
    in the ``skipped`` field of the success report.
    """
    try:
        stats = json.loads(POLICY_STATS.read_text(encoding='utf-8'))
    except (OSError, ValueError) as exc:
        print(json.dumps({'ok': False, 'error': f'cannot_read_policy_stats: {exc}'}, ensure_ascii=False))
        return 1
    if not isinstance(stats, dict):
        print(json.dumps({'ok': False, 'error': 'cannot_read_policy_stats: top-level JSON value is not an object'}, ensure_ascii=False))
        return 1

    candidate = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'plans': {},
        'families': {},
    }
    skipped = 0

    for bucket in ('plans', 'families'):
        rows = stats.get(bucket) or {}
        if not isinstance(rows, dict):
            print(json.dumps({'ok': False, 'error': f'cannot_read_policy_stats: bucket {bucket} is not an object'}, ensure_ascii=False))
            return 1
        for key, row in rows.items():
            if not isinstance(row, dict):
                skipped += 1
                continue
            try:
                success = int(row.get('success', 0) or 0)
                # Clarifications are folded into the failure count for the
                # summary ("failure_like") and for the smoothed success rate.
                failure = int(row.get('failure', 0) or 0) + int(row.get('clarification', 0) or 0)
                count = int(row.get('count', 0) or 0)
                reward_sum = float(row.get('reward_sum', 0.0) or 0.0)
                mode = classify_mode(row)
            except (TypeError, ValueError):
                # One malformed row must not abort the whole build.
                skipped += 1
                continue
            candidate[bucket][key] = {
                'count': count,
                'success': success,
                'failure_like': failure,
                'avg_reward': (reward_sum / count if count else 0.0),
                'beta_mean': beta_mean(success, failure),
                'mode': mode,
            }

    try:
        OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename it over the target so a
        # crash mid-write cannot leave a truncated candidate file behind.
        tmp_path = OUT_PATH.with_name(OUT_PATH.name + '.tmp')
        tmp_path.write_text(json.dumps(candidate, ensure_ascii=False, indent=2), encoding='utf-8')
        tmp_path.replace(OUT_PATH)
    except OSError as exc:
        print(json.dumps({'ok': False, 'error': f'cannot_write_policy_candidate: {exc}'}, ensure_ascii=False))
        return 1

    print(json.dumps({'ok': True, 'path': str(OUT_PATH), 'plans': len(candidate['plans']), 'families': len(candidate['families']), 'skipped': skipped}, ensure_ascii=False))
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: the process exit status is main()'s return value.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||||
Loading…
Add table
Add a link
Reference in a new issue