88 lines
2.9 KiB
Python
88 lines
2.9 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
# Base directory for the paths below.
ROOT = Path('/home/openclaw/.openclaw/workspace')
# JSON file read by load_config().
CONFIG = ROOT.joinpath('configs', 'sacred_eval_gate.json')
# JSON file read by main() after the runner exits successfully.
RESULT = ROOT.joinpath('evals', 'results', 'regression_results.json')
# Script launched by run_regression().
RUNNER = ROOT.joinpath('evals', 'run_regression.py')
|
|
|
|
|
|
def load_config(path: Path | str | None = None) -> dict:
    """Load the eval-gate settings, falling back to built-in defaults.

    Args:
        path: JSON file to read. When omitted, the module-level ``CONFIG``
            path is used, which is what existing callers get.

    Returns:
        The parsed JSON object when the file holds one. Its contents are
        returned as-is; they are not merged with the defaults. If the file
        cannot be read, is not valid JSON, or its top-level value is not an
        object, a fresh dict of default settings is returned instead.
    """
    source = CONFIG if path is None else Path(path)
    try:
        loaded = json.loads(source.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: invalid JSON or
        # invalid UTF-8 (JSONDecodeError and UnicodeDecodeError derive from it).
        loaded = None

    # Callers use ``cfg.get(...)``, so a non-object JSON value (list, number,
    # null, ...) is treated like a missing config rather than handed back to
    # fail later with AttributeError.
    if isinstance(loaded, dict):
        return loaded

    return {
        'min_route_accuracy_percent': 95.0,
        'min_memory_signal_accuracy_percent': 90.0,
        'max_timeouts': 0,
        'require_pass': True,
        'runner_timeout_seconds': 180,
        'runner_start': 0,
        'runner_limit': 14,
    }
|
|
|
|
|
|
def run_regression(cfg: dict) -> tuple[int, str, str]:
    """Run the regression runner script and collect its outcome.

    Args:
        cfg: Gate settings. ``runner_start`` and ``runner_limit`` are passed
            to the runner as ``--start`` / ``--limit`` when they are not
            ``None``; ``runner_timeout_seconds`` bounds the run.

    Returns:
        ``(returncode, stdout, stderr)`` of the finished runner process, with
        both streams captured as text.

    Raises:
        subprocess.TimeoutExpired: If the runner does not finish in time.
    """
    argv = ['python3', str(RUNNER)]

    # A flag is added only for a non-None setting; 0 is a real value and is
    # still forwarded.
    for flag, key in (('--start', 'runner_start'), ('--limit', 'runner_limit')):
        value = cfg.get(key)
        if value is not None:
            argv += [flag, str(value)]

    # A missing, None or 0 timeout falls back to 180 seconds.
    timeout_s = int(cfg.get('runner_timeout_seconds', 180) or 180)

    completed = subprocess.run(argv, capture_output=True, text=True, timeout=timeout_s)
    return completed.returncode, completed.stdout, completed.stderr
|
|
|
|
|
|
def _emit_hold(reason: str, **details) -> int:
    """Print a one-line JSON 'hold' verdict and return the failing exit code 1."""
    print(json.dumps({'ok': False, 'decision': 'hold', 'reason': reason, **details}, ensure_ascii=False))
    return 1


def main() -> int:
    """Run the regression suite and decide whether to promote or hold.

    Loads the gate settings, launches the runner, reads the results file and
    compares its metrics with the configured thresholds. Exactly one JSON
    object describing the verdict is printed to stdout.

    Returns:
        0 when every check passes (decision ``promote``); 1 otherwise
        (decision ``hold``), including when the runner times out, exits
        non-zero, or leaves a results file that cannot be read or parsed.
    """
    cfg = load_config()

    try:
        rc, _stdout, stderr = run_regression(cfg)
    except subprocess.TimeoutExpired:
        return _emit_hold('runner_timeout')
    if rc != 0:
        # Only the first 300 characters of stderr are reported.
        return _emit_hold('runner_failed', runner_rc=rc, stderr=(stderr or '')[:300])

    # NOTE(review): nothing here verifies that the results file was written by
    # this run -- confirm the runner always rewrites it on success.
    try:
        data = json.loads(RESULT.read_text(encoding='utf-8'))
        if not isinstance(data, dict):
            raise ValueError('regression results must be a JSON object')
        # Missing, null or zero metrics all count as 0.
        route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
        memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
        timeouts = int(data.get('timeouts', 0) or 0)
    except (OSError, ValueError, TypeError) as exc:
        # A missing, malformed or non-numeric results file is reported as a
        # structured hold, like the other failure paths, not as a traceback.
        return _emit_hold('result_unreadable', detail=str(exc)[:300])
    passed = bool(data.get('pass'))

    reasons = []
    if route < float(cfg.get('min_route_accuracy_percent', 95.0)):
        reasons.append('route_accuracy_below_threshold')
    if memory < float(cfg.get('min_memory_signal_accuracy_percent', 90.0)):
        reasons.append('memory_accuracy_below_threshold')
    if timeouts > int(cfg.get('max_timeouts', 0)):
        reasons.append('timeouts_above_threshold')
    if bool(cfg.get('require_pass', True)) and not passed:
        reasons.append('regression_not_pass')

    # The gate passes only when no check recorded a reason.
    ok = not reasons
    print(json.dumps({
        'ok': ok,
        'decision': 'promote' if ok else 'hold',
        'reasons': reasons,
        'route_accuracy_percent': route,
        'memory_signal_accuracy_percent': memory,
        'timeouts': timeouts,
        'pass': passed,
    }, ensure_ascii=False))
    return 0 if ok else 1
|
|
|
|
|
|
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|