Stabilize sacred gate and policy thresholds
This commit is contained in:
parent
1e1bd0dc4a
commit
b1320a65f0
2 changed files with 27 additions and 7 deletions
|
|
@ -20,19 +20,38 @@ def load_config() -> dict:
|
|||
'min_memory_signal_accuracy_percent': 90.0,
|
||||
'max_timeouts': 0,
|
||||
'require_pass': True,
|
||||
'runner_timeout_seconds': 180,
|
||||
'runner_start': 0,
|
||||
'runner_limit': 14,
|
||||
}
|
||||
|
||||
|
||||
def run_regression() -> int:
    """Run the regression runner script and return its exit code.

    The runner is launched with ``python3`` on the path held in ``RUNNER``;
    its stdout and stderr are captured as text and then discarded.
    """
    completed = subprocess.run(
        ['python3', str(RUNNER)],
        capture_output=True,
        text=True,
    )
    return completed.returncode
|
||||
def run_regression(cfg: dict) -> tuple[int, str, str]:
    """Run the regression runner script and capture its outcome.

    Args:
        cfg: Gate configuration. ``runner_start`` and ``runner_limit`` are
            forwarded as ``--start`` / ``--limit`` when they are not ``None``;
            ``runner_timeout_seconds`` bounds the run and falls back to 180
            when missing or falsy.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple from the runner process.

    Raises:
        subprocess.TimeoutExpired: If the runner does not finish in time.
    """
    argv = ['python3', str(RUNNER)]

    # Forward only the options that are actually configured; 0 is a valid
    # value, so the check is against None rather than truthiness.
    for flag, key in (('--start', 'runner_start'), ('--limit', 'runner_limit')):
        value = cfg.get(key, None)
        if value is not None:
            argv += [flag, str(value)]

    timeout_seconds = int(cfg.get('runner_timeout_seconds', 180) or 180)
    completed = subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout_seconds,
    )
    return completed.returncode, completed.stdout, completed.stderr
|
||||
|
||||
|
||||
def main() -> int:
|
||||
cfg = load_config()
|
||||
rc = run_regression()
|
||||
try:
|
||||
rc, stdout, stderr = run_regression(cfg)
|
||||
except subprocess.TimeoutExpired:
|
||||
print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'runner_timeout'}, ensure_ascii=False))
|
||||
return 1
|
||||
if rc != 0:
|
||||
print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'runner_failed', 'runner_rc': rc}, ensure_ascii=False))
|
||||
print(json.dumps({'ok': False, 'decision': 'hold', 'reason': 'runner_failed', 'runner_rc': rc, 'stderr': (stderr or '')[:300]}, ensure_ascii=False))
|
||||
return 1
|
||||
data = json.loads(RESULT.read_text(encoding='utf-8'))
|
||||
route = float(data.get('route_accuracy_percent', 0.0) or 0.0)
|
||||
|
|
|
|||
|
|
@ -21,9 +21,10 @@ def classify_mode(row: dict) -> str:
|
|||
clar = int(row.get('clarification', 0) or 0)
|
||||
reward_sum = float(row.get('reward_sum', 0.0) or 0.0)
|
||||
avg_reward = reward_sum / count if count else 0.0
|
||||
if count >= 3 and success >= max(2, failure + clar) and avg_reward >= 3.0:
|
||||
failure_like = failure + clar
|
||||
if count >= 3 and success >= 3 and success >= (failure_like + 2) and avg_reward >= 3.0:
|
||||
return 'prefer'
|
||||
if count >= 3 and failure > success and avg_reward < 0.5:
|
||||
if count >= 3 and failure_like >= 2 and failure_like > success and avg_reward < 1.0:
|
||||
return 'avoid'
|
||||
return 'observe'
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue