openclaw-intelligence-core-.../syncpatch/run-sacred-evals
2026-03-21 07:55:21 +00:00

116 lines
3.9 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import signal
import subprocess
from datetime import datetime, timezone
from pathlib import Path
# Workspace root; every other path in this script is derived from it.
ROOT = Path('/home/openclaw/.openclaw/workspace')
# JSON file holding gate thresholds and runner options (see load_config).
CONFIG = ROOT.joinpath('configs', 'sacred_eval_gate.json')
# Results file that main() falls back to when the per-run output is absent.
RESULT = ROOT.joinpath('evals', 'results', 'regression_results.json')
# Regression runner script launched as a subprocess by run_regression.
RUNNER = ROOT.joinpath('evals', 'run_regression.py')
def load_config() -> dict:
    """Load the gate configuration, falling back to built-in defaults.

    Returns:
        The JSON object parsed from ``CONFIG``. When the file cannot be
        read, is not valid UTF-8/JSON, or its top-level value is not a JSON
        object, the built-in defaults defined below are returned instead.

    A successfully loaded config is returned as-is (defaults are NOT merged
    into it), so keys it omits stay absent for the callers' ``cfg.get``.
    """
    defaults = {
        'min_route_accuracy_percent': 95.0,
        'min_memory_signal_accuracy_percent': 90.0,
        'max_timeouts': 0,
        'require_pass': True,
        'runner_timeout_seconds': 90,
        'runner_start': 0,
        'runner_limit': 5,
        'runner_per_case_timeout': 8,
        'runner_progress_every': 5,
    }
    try:
        loaded = json.loads(CONFIG.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return defaults
    # Callers access the config with ``cfg.get(...)``; a list, string or
    # null at the top level would crash them, so treat it like a bad file.
    if not isinstance(loaded, dict):
        return defaults
    return loaded
def gate_output_path() -> Path:
    """Return a results-file path stamped with the current UTC time.

    The file name has the form ``sacred_gate_<YYYYmmddTHHMMSSZ>.json`` and
    lives under ``ROOT/evals/results``. Nothing is created on disk here.
    """
    now_utc = datetime.now(timezone.utc)
    file_name = 'sacred_gate_' + now_utc.strftime('%Y%m%dT%H%M%SZ') + '.json'
    results_dir = ROOT / 'evals' / 'results'
    return results_dir / file_name
def _terminate_process_group(proc: subprocess.Popen, grace_seconds: float = 5.0) -> None:
    """Stop a timed-out runner and reap it so no zombie or open pipe remains.

    Sends SIGTERM to the child's process group, waits up to
    ``grace_seconds`` for it to exit, then escalates to SIGKILL. Falls back
    to ``proc.kill()`` when the group cannot be signalled.
    """
    try:
        os.killpg(proc.pid, signal.SIGTERM)
    except (OSError, AttributeError):
        # Group already gone / not permitted, or os.killpg is unavailable
        # on this platform: target the direct child instead.
        proc.kill()
    try:
        # communicate() both drains the pipes and reaps the child.
        proc.communicate(timeout=grace_seconds)
        return
    except subprocess.TimeoutExpired:
        pass
    try:
        os.killpg(proc.pid, signal.SIGKILL)
    except (OSError, AttributeError):
        proc.kill()
    try:
        proc.communicate(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        # Best effort: something outside the process group still holds the
        # pipes open. Give up rather than block the gate forever; the
        # caller reports the original timeout.
        pass


def run_regression(cfg: dict) -> tuple[int, str, str, Path]:
    """Run the regression runner as a subprocess and collect its output.

    Args:
        cfg: Gate configuration. The optional keys ``runner_start``,
            ``runner_limit``, ``runner_per_case_timeout`` and
            ``runner_progress_every`` are forwarded as command-line flags
            when present and not ``None``. ``runner_timeout_seconds``
            (default 90; falsy values also mean 90) bounds the whole run.

    Returns:
        ``(returncode, stdout, stderr, output_path)`` where ``output_path``
        is the file passed to the runner via ``--output``.

    Raises:
        subprocess.TimeoutExpired: the runner exceeded the timeout. Its
            process group is terminated and reaped before re-raising.
        OSError: the runner process could not be started.
    """
    output = gate_output_path()
    cmd = ['python3', str(RUNNER), '--output', str(output)]
    optional_flags = (
        ('--start', 'runner_start'),
        ('--limit', 'runner_limit'),
        ('--per-case-timeout', 'runner_per_case_timeout'),
        ('--progress-every', 'runner_progress_every'),
    )
    for flag, key in optional_flags:
        value = cfg.get(key)
        if value is not None:
            cmd.extend([flag, str(value)])
    # Parse the timeout BEFORE spawning: a malformed value must not leave
    # an already-started child running unattended.
    timeout = int(cfg.get('runner_timeout_seconds', 90) or 90)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Own session => own process group, so the runner and anything it
        # spawned can be signalled together on timeout.
        start_new_session=True,
    )
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        _terminate_process_group(proc)
        raise
    return proc.returncode, stdout, stderr, output
def _print_decision(payload: dict) -> None:
    """Write one gate decision to stdout as a single JSON line."""
    print(json.dumps(payload, ensure_ascii=False))


def main() -> int:
    """Run the regression suite and decide whether to promote or hold.

    Prints exactly one JSON object to stdout describing the decision.

    Returns:
        0 when every threshold is met (decision ``promote``); 1 otherwise
        (decision ``hold``), including when the runner times out, cannot be
        started, exits non-zero, or produces an unreadable results file.
    """
    cfg = load_config()
    try:
        rc, _stdout, stderr, output_path = run_regression(cfg)
    except subprocess.TimeoutExpired:
        _print_decision({'ok': False, 'decision': 'hold', 'reason': 'runner_timeout'})
        return 1
    except OSError as exc:
        # The runner process could not be launched at all (e.g. interpreter
        # missing). Report it as a hold instead of dying with a traceback.
        _print_decision({
            'ok': False,
            'decision': 'hold',
            'reason': 'runner_failed',
            'runner_rc': None,
            'stderr': str(exc)[:300],
        })
        return 1
    if rc != 0:
        _print_decision({
            'ok': False,
            'decision': 'hold',
            'reason': 'runner_failed',
            'runner_rc': rc,
            'stderr': (stderr or '')[:300],
        })
        return 1

    # NOTE(review): when the per-run output file is absent this falls back
    # to the shared RESULT file, which may have been written by an earlier
    # run -- confirm that reading possibly stale results is intended.
    result_path = output_path if output_path.exists() else RESULT
    try:
        data = json.loads(result_path.read_text(encoding='utf-8'))
        if not isinstance(data, dict):
            raise ValueError('results file does not contain a JSON object')
        # ``or 0`` maps null/empty metric values to zero, which fails the
        # thresholds below rather than crashing.
        route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
        memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
        timeouts = int(data.get('timeouts', 0) or 0)
        passed = bool(data.get('pass'))
    except (OSError, ValueError, TypeError) as exc:
        # Missing/unreadable file, invalid JSON, or non-numeric metrics:
        # the gate cannot vouch for the run, so hold.
        _print_decision({
            'ok': False,
            'decision': 'hold',
            'reason': 'result_unreadable',
            'result_path': str(result_path),
            'error': str(exc)[:300],
        })
        return 1

    reasons = []
    if route < float(cfg.get('min_route_accuracy_percent', 95.0)):
        reasons.append('route_accuracy_below_threshold')
    if memory < float(cfg.get('min_memory_signal_accuracy_percent', 90.0)):
        reasons.append('memory_accuracy_below_threshold')
    if timeouts > int(cfg.get('max_timeouts', 0)):
        reasons.append('timeouts_above_threshold')
    if bool(cfg.get('require_pass', True)) and not passed:
        reasons.append('regression_not_pass')
    # Every failed check appends exactly one reason, so "no reasons" is
    # equivalent to "all checks passed".
    ok = not reasons

    _print_decision({
        'ok': ok,
        'decision': 'promote' if ok else 'hold',
        'reasons': reasons,
        'route_accuracy_percent': route,
        'memory_signal_accuracy_percent': memory,
        'timeouts': timeouts,
        'pass': passed,
        'result_path': str(result_path),
    })
    return 0 if ok else 1
# Script entry point: propagate main()'s result as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    raise SystemExit(exit_status)