openclaw-intelligence-core-.../syncpatch/run-sacred-evals

#!/usr/bin/env python3
from __future__ import annotations

import json
import subprocess
from pathlib import Path

# Workspace root; every other path in this script is derived from it.
ROOT = Path('/home/openclaw/.openclaw/workspace')
# JSON file holding the gate thresholds and runner options.
CONFIG = ROOT.joinpath('configs', 'sacred_eval_gate.json')
# JSON results file that the gate reads once the runner has exited with code 0.
RESULT = ROOT.joinpath('evals', 'results', 'regression_results.json')
# Regression runner script that is launched as a subprocess.
RUNNER = ROOT.joinpath('evals', 'run_regression.py')


def load_config(path: Path | None = None) -> dict:
    """Load the gate configuration, falling back to built-in defaults.

    Args:
        path: JSON file to read. When omitted, the module-level ``CONFIG``
            path is used.

    Returns:
        The parsed JSON object when the file can be read and its top level is
        a JSON object. A file that only sets some keys is returned as-is (it
        is not merged with the defaults). In every other case a fresh dict of
        default settings is returned.
    """
    defaults = {
        'min_route_accuracy_percent': 95.0,
        'min_memory_signal_accuracy_percent': 90.0,
        'max_timeouts': 0,
        'require_pass': True,
        'runner_timeout_seconds': 180,
        'runner_start': 0,
        'runner_limit': 14,
    }
    source = CONFIG if path is None else path
    try:
        loaded = json.loads(source.read_text(encoding='utf-8'))
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable file. ValueError: undecodable bytes or
        # invalid JSON (JSONDecodeError and UnicodeDecodeError subclass it).
        # RecursionError: pathologically nested JSON. All are best-effort
        # fallbacks; genuine programming errors are no longer swallowed.
        return defaults
    # Callers access the config with ``cfg.get(...)``, so a syntactically
    # valid but non-object document (list, number, string, null) is unusable.
    if not isinstance(loaded, dict):
        return defaults
    return loaded


def run_regression(cfg: dict) -> tuple[int, str, str]:
    """Run the regression runner script as a subprocess.

    ``--start`` / ``--limit`` are appended only when ``runner_start`` /
    ``runner_limit`` are present in *cfg* and not ``None``. The timeout comes
    from ``runner_timeout_seconds``; a missing or falsy value means 180.

    Returns:
        ``(returncode, stdout, stderr)`` of the finished process, with both
        streams captured as text.

    Raises:
        subprocess.TimeoutExpired: the runner exceeded the timeout.
    """
    argv = ['python3', str(RUNNER)]
    optional_flags = (('--start', 'runner_start'), ('--limit', 'runner_limit'))
    for flag, key in optional_flags:
        value = cfg.get(key, None)
        if value is None:
            continue
        argv += [flag, str(value)]
    seconds = int(cfg.get('runner_timeout_seconds', 180) or 180)
    completed = subprocess.run(argv, capture_output=True, text=True, timeout=seconds)
    return completed.returncode, completed.stdout, completed.stderr


def _emit(payload: dict) -> None:
    """Print *payload* as a single JSON line on stdout."""
    print(json.dumps(payload, ensure_ascii=False))


def _hold(reason: str, **details) -> int:
    """Emit a 'hold' decision carrying *reason* and return exit code 1."""
    _emit({'ok': False, 'decision': 'hold', 'reason': reason, **details})
    return 1


def main() -> int:
    """Run the regression suite and print a promote/hold decision as JSON.

    The runner is executed with the loaded configuration. Once it exits with
    code 0 the results file is read and its metrics are compared with the
    configured thresholds. Every failure path (runner timeout, runner could
    not be started, non-zero exit, unreadable or malformed results, malformed
    thresholds) prints a JSON 'hold' object instead of raising.

    Returns:
        0 when every check passes ('promote'), 1 otherwise ('hold').
    """
    cfg = load_config()
    try:
        rc, _, stderr = run_regression(cfg)
    except subprocess.TimeoutExpired:
        return _hold('runner_timeout')
    except OSError as exc:
        # e.g. the interpreter could not be launched at all.
        return _hold('runner_unavailable', error=str(exc)[:300])
    if rc != 0:
        return _hold('runner_failed', runner_rc=rc, stderr=(stderr or '')[:300])

    # A missing or corrupt results file must yield a 'hold', not a traceback.
    try:
        data = json.loads(RESULT.read_text(encoding='utf-8'))
    except (OSError, ValueError) as exc:
        return _hold('result_unreadable', error=str(exc)[:300])
    if not isinstance(data, dict):
        return _hold('result_malformed')

    try:
        route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
        memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
        timeouts = int(data.get('timeouts', 0) or 0)
    except (TypeError, ValueError, OverflowError) as exc:
        return _hold('result_malformed', error=str(exc)[:300])
    passed = bool(data.get('pass'))

    try:
        min_route = float(cfg.get('min_route_accuracy_percent', 95.0))
        min_memory = float(cfg.get('min_memory_signal_accuracy_percent', 90.0))
        max_timeouts = int(cfg.get('max_timeouts', 0))
    except (TypeError, ValueError, OverflowError) as exc:
        return _hold('config_malformed', error=str(exc)[:300])

    reasons = []
    # Written as ``not (x >= t)`` rather than ``x < t`` so that a NaN metric
    # or threshold fails the check: every comparison with NaN is False, and
    # ``json.loads`` accepts the literal ``NaN``.
    if not route >= min_route:
        reasons.append('route_accuracy_below_threshold')
    if not memory >= min_memory:
        reasons.append('memory_accuracy_below_threshold')
    if timeouts > max_timeouts:
        reasons.append('timeouts_above_threshold')
    if bool(cfg.get('require_pass', True)) and not passed:
        reasons.append('regression_not_pass')

    ok = not reasons
    _emit({
        'ok': ok,
        'decision': 'promote' if ok else 'hold',
        'reasons': reasons,
        'route_accuracy_percent': route,
        'memory_signal_accuracy_percent': memory,
        'timeouts': timeouts,
        'pass': passed,
    })
    return 0 if ok else 1


# Script entry point: the value returned by main() becomes the process exit
# status (0 when the decision is 'promote', 1 when it is 'hold').
if __name__ == '__main__':
    raise SystemExit(main())