openclaw-intelligence-core-.../syncpatch/run-sacred-evals

#!/usr/bin/env python3
from __future__ import annotations

import json
import os
import signal
import subprocess
from datetime import datetime, timezone
from pathlib import Path

# Workspace root; every other path in this script is derived from it.
ROOT = Path('/home/openclaw/.openclaw/workspace')
# JSON file holding the gate thresholds and runner options (read by load_config).
CONFIG = ROOT / 'configs' / 'sacred_eval_gate.json'
# Fallback results file, read by main() when the per-run output file is absent.
RESULT = ROOT / 'evals' / 'results' / 'regression_results.json'
# Regression runner script launched as a subprocess by run_regression.
RUNNER = ROOT / 'evals' / 'run_regression.py'


def load_config() -> dict:
    """Load the gate configuration from ``CONFIG``.

    Returns:
        The parsed JSON object from ``CONFIG``. If the file cannot be read,
        is not valid JSON, or does not contain a JSON object at the top
        level, the built-in defaults are returned instead, so callers can
        always rely on ``dict.get``.
    """
    defaults = {
        'min_route_accuracy_percent': 95.0,
        'min_memory_signal_accuracy_percent': 90.0,
        'max_timeouts': 0,
        'require_pass': True,
        'runner_timeout_seconds': 90,
        'runner_start': 0,
        'runner_limit': 5,
        'runner_per_case_timeout': 8,
        'runner_progress_every': 5,
    }
    try:
        loaded = json.loads(CONFIG.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return defaults
    # Valid JSON that is not an object (e.g. `[]` or `null`) would break
    # every later `cfg.get(...)` call, so treat it like a missing config.
    if not isinstance(loaded, dict):
        return defaults
    return loaded


def gate_output_path() -> Path:
    """Return a results path named after the current UTC time.

    The file name has the form ``sacred_gate_<YYYYmmddTHHMMSSZ>.json`` and
    lives under ``ROOT/evals/results``.
    """
    now_utc = datetime.now(timezone.utc)
    stamp = now_utc.strftime('%Y%m%dT%H%M%SZ')
    results_dir = ROOT / 'evals' / 'results'
    return results_dir / f'sacred_gate_{stamp}.json'


def _terminate_process_group(proc: subprocess.Popen, grace_seconds: float = 5.0) -> None:
    """Stop the runner's process group and reap the child.

    Sends SIGTERM to the group first, then escalates to SIGKILL if the child
    has not exited within ``grace_seconds``. ``communicate`` is used to wait
    so the pipes are drained and the child does not linger as a zombie.
    """
    try:
        os.killpg(proc.pid, signal.SIGTERM)
    except OSError:
        # Group already gone or not signalable; fall back to the child itself.
        proc.kill()
    try:
        proc.communicate(timeout=grace_seconds)
        return
    except subprocess.TimeoutExpired:
        pass
    try:
        os.killpg(proc.pid, signal.SIGKILL)
    except OSError:
        proc.kill()
    try:
        # Bounded wait: a descendant that left the group could still hold
        # the pipes open, and the gate must not hang on it.
        proc.communicate(timeout=grace_seconds)
    except subprocess.TimeoutExpired:
        pass


def run_regression(cfg: dict) -> tuple[int, str, str, Path]:
    """Run the regression runner as a subprocess and collect its output.

    Args:
        cfg: Gate configuration. The optional keys ``runner_start``,
            ``runner_limit``, ``runner_per_case_timeout`` and
            ``runner_progress_every`` are forwarded as command-line flags
            when present and not ``None``. ``runner_timeout_seconds``
            (default 90) bounds the whole run.

    Returns:
        ``(returncode, stdout, stderr, output_path)`` where ``output_path``
        is the file the runner was asked to write via ``--output``.

    Raises:
        subprocess.TimeoutExpired: The runner exceeded the timeout. Its
            process group is terminated and reaped before re-raising.
    """
    output = gate_output_path()
    cmd = ['python3', str(RUNNER), '--output', str(output)]
    optional_flags = (
        ('--start', 'runner_start'),
        ('--limit', 'runner_limit'),
        ('--per-case-timeout', 'runner_per_case_timeout'),
        ('--progress-every', 'runner_progress_every'),
    )
    for flag, key in optional_flags:
        value = cfg.get(key, None)
        if value is not None:
            cmd.extend([flag, str(value)])
    # Parse the timeout before spawning so a bad config value cannot leave
    # an orphaned runner behind.
    timeout = int(cfg.get('runner_timeout_seconds', 90) or 90)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Own session/process group so the whole runner tree can be signalled.
        start_new_session=True,
    )
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        _terminate_process_group(proc)
        raise
    return proc.returncode, stdout, stderr, output


def _emit_hold(reason: str, **extra) -> int:
    """Print a JSON ``hold`` decision with ``reason`` and return exit code 1."""
    payload = {'ok': False, 'decision': 'hold', 'reason': reason, **extra}
    print(json.dumps(payload, ensure_ascii=False))
    return 1


def main() -> int:
    """Run the regression suite and print a promote/hold decision as JSON.

    The decision is ``hold`` when the runner times out, exits non-zero,
    produces no readable/valid result file, or when any metric in the result
    misses the thresholds from the configuration.

    Returns:
        0 when the decision is ``promote``, 1 when it is ``hold``.
    """
    cfg = load_config()
    try:
        rc, _stdout, stderr, output_path = run_regression(cfg)
    except subprocess.TimeoutExpired:
        return _emit_hold('runner_timeout')
    if rc != 0:
        return _emit_hold('runner_failed', runner_rc=rc, stderr=(stderr or '')[:300])

    # Prefer the per-run output file; fall back to the shared results file.
    result_path = output_path if output_path.exists() else RESULT
    try:
        data = json.loads(result_path.read_text(encoding='utf-8'))
    except (OSError, ValueError) as exc:
        # Missing/unreadable file or invalid JSON: report a structured hold
        # instead of crashing with a traceback and no JSON on stdout.
        return _emit_hold('result_unreadable', result_path=str(result_path), error=str(exc)[:300])
    if not isinstance(data, dict):
        return _emit_hold('result_malformed', result_path=str(result_path))
    try:
        route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
        memory = float(data.get('memory_signal_accuracy_percent', 0.0) or 0.0)
        timeouts = int(data.get('timeouts', 0) or 0)
    except (TypeError, ValueError) as exc:
        return _emit_hold('result_malformed', result_path=str(result_path), error=str(exc)[:300])
    passed = bool(data.get('pass'))

    reasons = []
    if route < float(cfg.get('min_route_accuracy_percent', 95.0)):
        reasons.append('route_accuracy_below_threshold')
    if memory < float(cfg.get('min_memory_signal_accuracy_percent', 90.0)):
        reasons.append('memory_accuracy_below_threshold')
    if timeouts > int(cfg.get('max_timeouts', 0)):
        reasons.append('timeouts_above_threshold')
    if bool(cfg.get('require_pass', True)) and not passed:
        reasons.append('regression_not_pass')
    # The gate passes exactly when no threshold check recorded a reason.
    ok = not reasons
    print(json.dumps({
        'ok': ok,
        'decision': 'promote' if ok else 'hold',
        'reasons': reasons,
        'route_accuracy_percent': route,
        'memory_signal_accuracy_percent': memory,
        'timeouts': timeouts,
        'pass': passed,
        'result_path': str(result_path),
    }, ensure_ascii=False))
    return 0 if ok else 1


# Script entry point: main()'s return value becomes the process exit status
# (0 when the gate decides to promote, 1 when it decides to hold).
if __name__ == '__main__':
    raise SystemExit(main())