#!/usr/bin/env python3
"""Promotion gate driven by the regression runner's results.

Runs ``evals/run_regression.py`` as a subprocess, reads the JSON result it
produced, compares it against the thresholds from
``configs/sacred_eval_gate.json`` and prints exactly one JSON decision on
stdout.  The process exit code is 0 for ``promote`` and 1 for ``hold``.
"""
from __future__ import annotations

import json
import os
import signal
import subprocess
from datetime import datetime, timezone
from pathlib import Path

ROOT = Path('/home/openclaw/.openclaw/workspace')
CONFIG = ROOT / 'configs' / 'sacred_eval_gate.json'
RESULT = ROOT / 'evals' / 'results' / 'regression_results.json'
RUNNER = ROOT / 'evals' / 'run_regression.py'


def _default_config() -> dict:
    """Return a fresh copy of the built-in gate configuration."""
    return {
        'min_route_accuracy_percent': 95.0,
        'min_memory_signal_accuracy_percent': 90.0,
        'max_timeouts': 0,
        'require_pass': True,
        'runner_timeout_seconds': 90,
        'runner_start': 0,
        'runner_limit': 5,
        'runner_per_case_timeout': 8,
        'runner_progress_every': 5,
    }


def load_config() -> dict:
    """Load the gate configuration from ``CONFIG``.

    Returns:
        The parsed JSON object.  The built-in defaults are returned instead
        when the file cannot be read, is not valid JSON, or does not contain
        a JSON object (every caller uses ``cfg.get``).
    """
    try:
        loaded = json.loads(CONFIG.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return _default_config()
    if not isinstance(loaded, dict):
        return _default_config()
    return loaded


def gate_output_path() -> Path:
    """Return a UTC-timestamped result path under ``evals/results``."""
    ts = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    return ROOT / 'evals' / 'results' / f'sacred_gate_{ts}.json'


def _terminate_process_group(proc: subprocess.Popen, grace_seconds: float = 5.0) -> None:
    """Stop *proc* and its process group, then reap it.

    Sends SIGTERM first and escalates to SIGKILL if the child has not exited
    within *grace_seconds*.  ``communicate`` is called after each signal so
    the child is reaped and its pipes are closed.  Gives up silently if the
    child still has not exited after SIGKILL plus one more grace period.
    """
    for sig_name in ('SIGTERM', 'SIGKILL'):
        try:
            os.killpg(proc.pid, getattr(signal, sig_name))
        except Exception:
            # Best effort: the group may already be gone, or killpg/the
            # signal may be unavailable; fall back to killing the child.
            proc.kill()
        try:
            proc.communicate(timeout=grace_seconds)
        except subprocess.TimeoutExpired:
            continue
        return


def run_regression(cfg: dict) -> tuple[int, str, str, Path]:
    """Run the regression runner and capture its output.

    Args:
        cfg: Gate configuration.  The optional ``runner_start``,
            ``runner_limit``, ``runner_per_case_timeout`` and
            ``runner_progress_every`` keys are forwarded as command-line
            flags when they are not ``None``; ``runner_timeout_seconds``
            bounds the whole run (90 when missing or falsy).

    Returns:
        ``(returncode, stdout, stderr, output_path)`` where *output_path* is
        the path passed to the runner via ``--output``.

    Raises:
        subprocess.TimeoutExpired: The runner exceeded the timeout.  Its
            process group is terminated before the exception propagates.
    """
    output = gate_output_path()
    cmd = ['python3', str(RUNNER), '--output', str(output)]
    optional_flags = (
        ('runner_start', '--start'),
        ('runner_limit', '--limit'),
        ('runner_per_case_timeout', '--per-case-timeout'),
        ('runner_progress_every', '--progress-every'),
    )
    for key, flag in optional_flags:
        value = cfg.get(key)
        if value is not None:
            cmd.extend([flag, str(value)])

    # Parse the timeout before spawning so a bad value cannot orphan a child.
    timeout = int(cfg.get('runner_timeout_seconds', 90) or 90)

    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        start_new_session=True,  # own process group, so killpg reaches children
    )
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        _terminate_process_group(proc)
        raise
    return proc.returncode, stdout, stderr, output


def evaluate_gate(data: object, cfg: dict) -> dict:
    """Compare a regression result against the configured thresholds.

    Args:
        data: Parsed regression result; expected to be a JSON object.
        cfg: Gate configuration providing the thresholds.

    Returns:
        The decision dictionary.  For a well-formed result it contains
        ``ok``, ``decision``, ``reasons`` and the measured values.  For a
        result that is not an object or holds non-numeric metrics it is a
        ``hold`` with ``reason`` set to ``result_invalid``.
    """
    invalid = {'ok': False, 'decision': 'hold', 'reason': 'result_invalid'}
    if not isinstance(data, dict):
        return invalid
    try:
        route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
        memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
        timeouts = int(data.get('timeouts', 0) or 0)
    except (TypeError, ValueError, OverflowError):
        return invalid
    passed = bool(data.get('pass'))

    reasons = []
    # "not x >= limit" rather than "x < limit": json.loads accepts NaN, and
    # every comparison with NaN is False, so "<" would let NaN through.
    if not route >= float(cfg.get('min_route_accuracy_percent', 95.0)):
        reasons.append('route_accuracy_below_threshold')
    if not memory >= float(cfg.get('min_memory_signal_accuracy_percent', 90.0)):
        reasons.append('memory_accuracy_below_threshold')
    if timeouts > int(cfg.get('max_timeouts', 0)):
        reasons.append('timeouts_above_threshold')
    if bool(cfg.get('require_pass', True)) and not passed:
        reasons.append('regression_not_pass')

    ok = not reasons
    return {
        'ok': ok,
        'decision': 'promote' if ok else 'hold',
        'reasons': reasons,
        'route_accuracy_percent': route,
        'memory_signal_accuracy_percent': memory,
        'timeouts': timeouts,
        'pass': passed,
    }


def _emit(payload: dict) -> None:
    """Print *payload* as a single JSON line on stdout."""
    print(json.dumps(payload, ensure_ascii=False))


def main() -> int:
    """Run the gate and print the decision.

    Returns:
        0 when the decision is ``promote``, 1 for every ``hold``.
    """
    cfg = load_config()
    try:
        rc, _stdout, stderr, output_path = run_regression(cfg)
    except subprocess.TimeoutExpired:
        _emit({'ok': False, 'decision': 'hold', 'reason': 'runner_timeout'})
        return 1
    except OSError as exc:
        # The runner could not be started at all (e.g. python3 not on PATH).
        _emit({'ok': False, 'decision': 'hold', 'reason': 'runner_failed', 'runner_rc': None, 'stderr': str(exc)[:300]})
        return 1
    if rc != 0:
        _emit({'ok': False, 'decision': 'hold', 'reason': 'runner_failed', 'runner_rc': rc, 'stderr': (stderr or '')[:300]})
        return 1

    # NOTE(review): when the runner did not write the timestamped file this
    # falls back to the shared RESULT file, which may predate this run;
    # confirm that reading it is intended.
    result_path = output_path if output_path.exists() else RESULT
    try:
        data = json.loads(result_path.read_text(encoding='utf-8'))
    except (OSError, ValueError) as exc:
        _emit({
            'ok': False,
            'decision': 'hold',
            'reason': 'result_unreadable',
            'result_path': str(result_path),
            'error': str(exc)[:300],
        })
        return 1

    verdict = evaluate_gate(data, cfg)
    verdict['result_path'] = str(result_path)
    _emit(verdict)
    return 0 if verdict['ok'] else 1


if __name__ == '__main__':
    raise SystemExit(main())