def gate_output_path() -> Path:
    """Return a timestamped result path under ``ROOT/evals/results``.

    The file name embeds the current UTC time at one-second resolution, so
    two calls within the same second yield the same path.
    """
    ts = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    return ROOT / 'evals' / 'results' / f'sacred_gate_{ts}.json'


def _stop_process_group(proc: subprocess.Popen, grace_seconds: float = 5.0) -> None:
    """Terminate *proc*'s process group and reap the child.

    Sends SIGTERM first, then SIGKILL if the group has not exited within
    *grace_seconds*. Finishing ``communicate()`` afterwards reaps the child
    and closes its pipes, so no zombie or open descriptor is left behind.
    """
    for sig in (signal.SIGTERM, signal.SIGKILL):
        try:
            # The child was started with start_new_session=True, so its pid
            # doubles as the process-group id.
            os.killpg(proc.pid, sig)
        except ProcessLookupError:
            pass  # group already gone; still reap below
        except OSError:
            proc.kill()  # fall back to signalling only the direct child
        try:
            proc.communicate(timeout=grace_seconds)
        except subprocess.TimeoutExpired:
            continue
        return
    # A descendant that left the process group may still hold the pipes
    # open; close our ends instead of blocking on them indefinitely.
    for stream in (proc.stdout, proc.stderr):
        if stream is not None:
            stream.close()
    try:
        proc.wait(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        pass


def run_regression(cfg: dict) -> tuple[int, str, str, Path]:
    """Run the regression runner as a subprocess and collect its output.

    Args:
        cfg: Gate configuration. ``runner_start``, ``runner_limit``,
            ``runner_per_case_timeout`` and ``runner_progress_every`` are
            forwarded as CLI flags when not ``None``;
            ``runner_timeout_seconds`` (default 90) bounds the whole run.

    Returns:
        ``(returncode, stdout, stderr, output_path)`` where ``output_path``
        is the path passed to the runner via ``--output``.

    Raises:
        subprocess.TimeoutExpired: The runner exceeded the overall timeout.
            Its process group is terminated and reaped before re-raising.
    """
    output = gate_output_path()
    cmd = ['python3', str(RUNNER), '--output', str(output)]
    forwarded_options = (
        ('--start', 'runner_start'),
        ('--limit', 'runner_limit'),
        ('--per-case-timeout', 'runner_per_case_timeout'),
        ('--progress-every', 'runner_progress_every'),
    )
    for flag, key in forwarded_options:
        value = cfg.get(key)
        if value is not None:
            cmd.extend([flag, str(value)])

    # A falsy configured timeout (0, None, '') falls back to 90 seconds.
    timeout = int(cfg.get('runner_timeout_seconds', 90) or 90)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Own session => own process group, so a timeout can signal the
        # runner together with anything it spawned.
        start_new_session=True,
    )
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # communicate() does not kill the child on timeout; stop it and
        # finish communication so it is reaped before reporting the timeout.
        _stop_process_group(proc)
        raise
    return proc.returncode, stdout, stderr, output