#!/usr/bin/env python3
"""
Skill eval runner with regression protection.

Usage:
  python scripts/run_evals.py                        # run all skills
  python scripts/run_evals.py codereview             # run single skill
  python scripts/run_evals.py codereview --iter 2    # specify iteration
  python scripts/run_evals.py --update-baseline      # save results as baseline
  python scripts/run_evals.py --check-regression     # fail if any skill regressed
"""

import argparse
import json
import re
import subprocess
import time
from pathlib import Path

SKILLS_DIR = Path("skills")
WORKSPACE_ROOT = Path("evals-workspace")
BASELINE_FILE = Path("baselines.json")

# Escape sequences of the form ESC [ <digits/semicolons> <letter>, stripped
# from the CLI output before grading.
_ANSI_ESCAPE = re.compile(r'\x1b\[[0-9;]*[A-Za-z]')
# Runs of five or more ASCII letters in the expected output become keywords.
_KEYWORD = re.compile(r'[a-zA-Z]{5,}')
# Seconds a single kiro-cli invocation may take before it is abandoned.
_PROMPT_TIMEOUT = 90


def run_prompt(prompt: str, with_skill: bool) -> tuple[str, float]:
    """Send *prompt* to ``kiro-cli chat`` and return ``(response, seconds)``.

    Args:
        prompt: Text passed to the CLI as the chat message.
        with_skill: Selects the ``main`` agent when true, ``default`` otherwise.

    Returns:
        The CLI's stdout with ANSI escape sequences removed and surrounding
        whitespace stripped, plus the wall-clock duration rounded to 2 places.
        If the CLI exceeds the timeout, whatever output was captured so far
        (possibly empty) is returned instead of raising, so a single hung
        prompt is graded as a normal (likely failing) case rather than
        aborting the whole run.
    """
    agent = "main" if with_skill else "default"
    start = time.perf_counter()  # monotonic: unaffected by system clock changes
    try:
        result = subprocess.run(
            ["kiro-cli", "chat", "--agent", agent, "--no-interactive", prompt],
            capture_output=True,
            text=True,
            timeout=_PROMPT_TIMEOUT,
        )
        raw = result.stdout
    except subprocess.TimeoutExpired as exc:
        # Partial output may be None, and may be bytes even with text=True.
        raw = exc.stdout or ""
        if isinstance(raw, bytes):
            raw = raw.decode("utf-8", errors="replace")
        print(f" ⚠️ kiro-cli timed out after {_PROMPT_TIMEOUT}s (agent={agent})")
    elapsed = round(time.perf_counter() - start, 2)
    response = _ANSI_ESCAPE.sub('', raw).strip()
    return response, elapsed


def grade(response: str, expected_output: str, threshold: float = 0.4) -> dict:
    """Score *response* by keyword overlap with *expected_output*.

    Keywords are the distinct lower-cased words of five or more ASCII letters
    found in *expected_output*; a keyword matches when it occurs anywhere in
    the lower-cased response (substring match).

    Args:
        response: Text to grade.
        expected_output: Reference text the keywords are extracted from.
        threshold: Minimum score (inclusive) for the case to pass.

    Returns:
        A dict with ``score`` (matched / total keywords, rounded to 2 places,
        ``0.0`` when there are no keywords), ``passed`` and
        ``matched_keywords`` (at most the first 10 matches, in the order the
        keywords first appear in *expected_output*).
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # reported keywords are stable across runs (a set is not).
    keywords = list(
        dict.fromkeys(w.lower() for w in _KEYWORD.findall(expected_output))
    )
    haystack = response.lower()
    matched = [kw for kw in keywords if kw in haystack]
    score = round(len(matched) / len(keywords), 2) if keywords else 0.0
    return {
        "score": score,
        "passed": score >= threshold,
        "matched_keywords": matched[:10],
    }


def load_baselines() -> dict:
    """Return the contents of ``BASELINE_FILE``, or ``{}`` if it is absent."""
    if not BASELINE_FILE.exists():
        return {}
    return json.loads(BASELINE_FILE.read_text(encoding="utf-8"))


def save_baselines(baselines: dict) -> None:
    """Write *baselines* to ``BASELINE_FILE`` as indented JSON."""
    BASELINE_FILE.write_text(json.dumps(baselines, indent=2), encoding="utf-8")


def run_skill_evals(skill_name: str, iteration: int) -> dict:
    """Run every eval case of one skill, with and without the skill.

    Cases are read from ``SKILLS_DIR/<skill>/evals/evals.json`` (key
    ``"evals"``); each case must provide ``id``, ``prompt`` and
    ``expected_output``. For each case and mode, ``response.txt``,
    ``timing.json`` and ``grading.json`` are written under
    ``WORKSPACE_ROOT/<skill>/iteration-<n>/eval-<skill>-<id>/<mode>/``, and a
    ``benchmark.json`` summary is written to the iteration directory.

    Args:
        skill_name: Directory name of the skill under ``SKILLS_DIR``.
        iteration: Iteration number used in the output directory name.

    Returns:
        The benchmark summary dict, or ``{}`` when the skill has no
        ``evals.json``.
    """
    evals_file = SKILLS_DIR / skill_name / "evals" / "evals.json"
    if not evals_file.exists():
        print(f" ⚠️ No evals.json: {evals_file}")
        return {}

    evals = json.loads(evals_file.read_text(encoding="utf-8")).get("evals", [])
    iter_dir = WORKSPACE_ROOT / skill_name / f"iteration-{iteration}"
    # Create the directory up front so benchmark.json can be written even
    # when the evals list is empty.
    iter_dir.mkdir(parents=True, exist_ok=True)
    results = []

    print(f"\n[Skill: {skill_name}] iteration-{iteration}")

    for case in evals:
        case_id = case["id"]
        prompt = case["prompt"]
        expected = case["expected_output"]
        case_dir = iter_dir / f"eval-{skill_name}-{case_id}"

        grades = {}
        for mode in ("with_skill", "without_skill"):
            use_skill = mode == "with_skill"
            out_dir = case_dir / mode
            out_dir.mkdir(parents=True, exist_ok=True)

            response, elapsed = run_prompt(prompt, use_skill)
            grading = grade(response, expected)
            grades[mode] = grading  # kept in memory; no need to re-read the file

            # Explicit UTF-8: responses may contain non-ASCII text.
            (out_dir / "response.txt").write_text(response, encoding="utf-8")
            (out_dir / "timing.json").write_text(
                json.dumps(
                    {"duration_seconds": elapsed, "with_skill": use_skill},
                    indent=2,
                ),
                encoding="utf-8",
            )
            (out_dir / "grading.json").write_text(
                json.dumps(grading, indent=2), encoding="utf-8"
            )

        with_grade = grades["with_skill"]
        without_grade = grades["without_skill"]
        delta = round(with_grade["score"] - without_grade["score"], 2)
        status = "✅" if with_grade["passed"] else "❌"
        print(f" Case {case_id}: {status} score={with_grade['score']} (Δ{delta:+.2f} vs baseline)")
        results.append(
            {"id": case_id, "with_skill": with_grade, "without_skill": without_grade}
        )

    passed = sum(1 for r in results if r["with_skill"]["passed"])
    pass_rate = round(passed / len(results), 2) if results else 0
    benchmark = {
        "skill": skill_name,
        "iteration": iteration,
        "total": len(results),
        "passed": passed,
        "pass_rate": pass_rate,
        "cases": results,
    }
    (iter_dir / "benchmark.json").write_text(
        json.dumps(benchmark, indent=2), encoding="utf-8"
    )
    print(f" Pass rate: {passed}/{len(results)}")
    return benchmark


def check_regression(benchmark: dict, baselines: dict) -> list[str]:
    """Return list of regression messages, empty if no regression.

    A regression is reported when the benchmark's ``pass_rate`` (treated as 0
    if missing) is strictly below the stored baseline ``pass_rate`` for the
    same skill. Benchmarks without a ``skill`` key, or for a skill that has
    no baseline entry, never regress.
    """
    skill = benchmark.get("skill")
    if not skill or skill not in baselines:
        return []
    baseline_rate = baselines[skill]["pass_rate"]
    current_rate = benchmark.get("pass_rate", 0)
    if current_rate < baseline_rate:
        return [f"{skill}: pass_rate dropped {baseline_rate} → {current_rate}"]
    return []


def main():
    """Parse CLI arguments, run the selected evals and report the outcome.

    Exits with status 1 when ``--check-regression`` is given and at least one
    skill's pass rate fell below its baseline. In that case the baselines are
    left untouched even if ``--update-baseline`` was also given, so a
    regressed result cannot silently become the new baseline.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("skill", nargs="?", help="Skill name (default: all)")
    parser.add_argument("--iter", type=int, default=1)
    parser.add_argument("--update-baseline", action="store_true",
                        help="Save current results as new baseline")
    parser.add_argument("--check-regression", action="store_true",
                        help="Exit non-zero if any skill regressed")
    args = parser.parse_args()

    if args.skill:
        skills = [args.skill]
    else:
        if not SKILLS_DIR.is_dir():
            raise SystemExit(f"Skills directory not found: {SKILLS_DIR}")
        # Sorted so skills run in a reproducible order.
        skills = sorted(d.name for d in SKILLS_DIR.iterdir() if d.is_dir())

    baselines = load_baselines()
    regressions = []
    all_results = []

    for skill in filter(None, skills):
        result = run_skill_evals(skill, args.iter)
        if result:
            all_results.append(result)
            regressions.extend(check_regression(result, baselines))

    total = sum(r.get("total", 0) for r in all_results)
    passed = sum(r.get("passed", 0) for r in all_results)
    print(f"\n{'='*40}")
    print(f"Overall: {passed}/{total} cases passed")

    # Check for regressions before touching the baseline file.
    if args.check_regression and regressions:
        print("\n🚨 Regressions detected:")
        for msg in regressions:
            print(f" ❌ {msg}")
        raise SystemExit(1)

    if args.update_baseline:
        for r in all_results:
            baselines[r["skill"]] = {
                "pass_rate": r["pass_rate"],
                "iteration": r["iteration"],
            }
        save_baselines(baselines)
        print(f"Baselines updated → {BASELINE_FILE}")

    print(f"Results saved to {WORKSPACE_ROOT}/")


if __name__ == "__main__":
    main()