diff --git a/configs/sacred_eval_gate.json b/configs/sacred_eval_gate.json
new file mode 100644
index 0000000..7475374
--- /dev/null
+++ b/configs/sacred_eval_gate.json
@@ -0,0 +1,6 @@
+{
+  "min_route_accuracy_percent": 95.0,
+  "min_memory_signal_accuracy_percent": 90.0,
+  "max_timeouts": 0,
+  "require_pass": true
+}
diff --git a/syncpatch/run-sacred-evals b/syncpatch/run-sacred-evals
new file mode 100644
index 0000000..8f9c769
--- /dev/null
+++ b/syncpatch/run-sacred-evals
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import subprocess
+from pathlib import Path
+
+ROOT = Path('/home/openclaw/.openclaw/workspace')
+CONFIG = ROOT / 'configs' / 'sacred_eval_gate.json'
+RESULT = ROOT / 'evals' / 'results' / 'regression_results.json'
+RUNNER = ROOT / 'evals' / 'run_regression.py'
+
+
+def load_config() -> dict:
+    try:
+        return json.loads(CONFIG.read_text(encoding='utf-8'))
+    except Exception:
+        return {
+            'min_route_accuracy_percent': 95.0,
+            'min_memory_signal_accuracy_percent': 90.0,
+            'max_timeouts': 0,
+            'require_pass': True,
+        }
+
+
+def run_regression() -> int:
+    proc = subprocess.run(['python3', str(RUNNER)], capture_output=True, text=True)
+    return proc.returncode
+
+
+def main() -> int:
+    cfg = load_config()
+    rc = run_regression()
+    if rc != 0:
+        print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'runner_failed', 'runner_rc': rc}, ensure_ascii=False))
+        return 1
+    data = json.loads(RESULT.read_text(encoding='utf-8'))
+    route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
+    memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
+    timeouts = int(data.get('timeouts', 0) or 0)
+    passed = bool(data.get('pass'))
+    ok = True
+    reasons = []
+    if route < float(cfg.get('min_route_accuracy_percent', 95.0)):
+        ok = False
+        reasons.append('route_accuracy_below_threshold')
+    if memory < float(cfg.get('min_memory_signal_accuracy_percent', 90.0)):
+        ok = False
+        reasons.append('memory_accuracy_below_threshold')
+    if timeouts > int(cfg.get('max_timeouts', 0)):
+        ok = False
+        reasons.append('timeouts_above_threshold')
+    if bool(cfg.get('require_pass', True)) and not passed:
+        ok = False
+        reasons.append('regression_not_pass')
+    print(json.dumps({
+        'ok': ok,
+        'decision': 'promote' if ok else 'hold',
+        'reasons': reasons,
+        'route_accuracy_percent': route,
+        'memory_signal_accuracy_percent': memory,
+        'timeouts': timeouts,
+        'pass': passed,
+    }, ensure_ascii=False))
+    return 0 if ok else 1
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/syncpatch/train-policy-offline b/syncpatch/train-policy-offline
new file mode 100644
index 0000000..4333c6f
--- /dev/null
+++ b/syncpatch/train-policy-offline
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+ROOT = Path('/home/openclaw/.openclaw/workspace')
+POLICY_STATS = ROOT / 'data' / 'policy_stats.json'
+OUT_PATH = ROOT / 'data' / 'policy_candidate.json'
+
+
+def beta_mean(success: int, failure: int, alpha: float = 1.0, beta: float = 1.0) -> float:
+    return (success + alpha) / (success + failure + alpha + beta)
+
+
+def classify_mode(row: dict) -> str:
+    count = int(row.get('count', 0) or 0)
+    success = int(row.get('success', 0) or 0)
+    failure = int(row.get('failure', 0) or 0)
+    clar = int(row.get('clarification', 0) or 0)
+    reward_sum = float(row.get('reward_sum', 0.0) or 0.0)
+    avg_reward = reward_sum / count if count else 0.0
+    if count >= 3 and success >= max(2, failure + clar) and avg_reward >= 3.0:
+        return 'prefer'
+    if count >= 3 and failure > success and avg_reward < 0.5:
+        return 'avoid'
+    return 'observe'
+
+
+def main() -> int:
+    try:
+        stats = json.loads(POLICY_STATS.read_text(encoding='utf-8'))
+    except Exception as exc:
+        print(json.dumps({'ok': False, 'error': f'cannot_read_policy_stats: {exc}'}, ensure_ascii=False))
+        return 1
+
+    candidate = {
+        'generated_at': datetime.now(timezone.utc).isoformat(),
+        'plans': {},
+        'families': {},
+    }
+
+    for bucket in ('plans', 'families'):
+        for key, row in (stats.get(bucket, {}) or {}).items():
+            success = int(row.get('success', 0) or 0)
+            failure = int(row.get('failure', 0) or 0) + int(row.get('clarification', 0) or 0)
+            count = int(row.get('count', 0) or 0)
+            reward_sum = float(row.get('reward_sum', 0.0) or 0.0)
+            candidate[bucket][key] = {
+                'count': count,
+                'success': success,
+                'failure_like': failure,
+                'avg_reward': (reward_sum / count if count else 0.0),
+                'beta_mean': beta_mean(success, failure),
+                'mode': classify_mode(row),
+            }
+
+    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    OUT_PATH.write_text(json.dumps(candidate, ensure_ascii=False, indent=2), encoding='utf-8')
+    print(json.dumps({'ok': True, 'path': str(OUT_PATH), 'plans': len(candidate['plans']), 'families': len(candidate['families'])}, ensure_ascii=False))
+    return 0
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())