143 lines
5.6 KiB
Python
143 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Skill eval runner with regression protection.
|
|
|
|
Usage:
|
|
python scripts/run_evals.py # run all skills
|
|
python scripts/run_evals.py codereview # run single skill
|
|
python scripts/run_evals.py codereview --iter 2 # specify iteration
|
|
python scripts/run_evals.py --check-regression # fail if any skill regressed
|
|
"""
|
|
import argparse
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Root directory containing one subdirectory per skill.
SKILLS_DIR = Path("skills")
# Where per-iteration eval artifacts (responses, timings, gradings) are written.
WORKSPACE_ROOT = Path("evals-workspace")
# JSON file mapping skill name -> {"pass_rate": ..., "iteration": ...}.
BASELINE_FILE = Path("baselines.json")
|
|
|
|
|
|
def run_prompt(prompt: str, with_skill: bool) -> tuple[str, float]:
    """Send *prompt* to kiro-cli and return (cleaned response, elapsed seconds).

    Uses the "main" agent when with_skill is True, otherwise "default".
    Raises subprocess.TimeoutExpired if the CLI takes longer than 90s.
    """
    if with_skill:
        agent = "main"
    else:
        agent = "default"
    cmd = ["kiro-cli", "chat", "--agent", agent, "--no-interactive", "--message", prompt]
    started_at = time.time()
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=90)
    duration = round(time.time() - started_at, 2)
    # Strip ANSI color/control sequences so grading sees plain text.
    cleaned = re.sub(r'\x1b\[[0-9;]*[A-Za-z]', '', proc.stdout)
    return cleaned.strip(), duration
|
|
|
|
|
|
def grade(response: str, expected_output: str) -> dict:
    """Keyword-overlap grade of *response* against *expected_output*.

    Extracts words of >= 5 ASCII letters from the expected output, dedupes
    them case-insensitively, and scores by the fraction found (as substrings)
    in the lowercased response.

    Returns a dict with:
        score: matched fraction, rounded to 2 decimals (0.0 if no keywords).
        passed: True when score >= 0.4.
        matched_keywords: up to 10 matched keywords, in first-seen order.
    """
    words = re.findall(r'[a-zA-Z]{5,}', expected_output)
    # dict.fromkeys dedupes while preserving first-seen order; the original
    # set-based dedup made keyword order — and thus matched_keywords[:10] —
    # nondeterministic across runs (hash randomization).
    keywords = list(dict.fromkeys(w.lower() for w in words))
    haystack = response.lower()  # lowercase once, not per keyword
    matched = [kw for kw in keywords if kw in haystack]
    score = round(len(matched) / len(keywords), 2) if keywords else 0.0
    return {"score": score, "passed": score >= 0.4, "matched_keywords": matched[:10]}
|
|
|
|
|
|
def load_baselines() -> dict:
    """Load saved baselines from BASELINE_FILE; empty dict when it is absent."""
    if not BASELINE_FILE.exists():
        return {}
    return json.loads(BASELINE_FILE.read_text())
|
|
|
|
|
|
def save_baselines(baselines: dict) -> None:
    """Persist *baselines* to BASELINE_FILE as pretty-printed JSON."""
    serialized = json.dumps(baselines, indent=2)
    BASELINE_FILE.write_text(serialized)
|
|
|
|
|
|
def run_skill_evals(skill_name: str, iteration: int) -> dict:
    """Run every eval case for *skill_name*, grading with vs. without the skill.

    Reads skills/<skill>/evals/evals.json; for each case runs the prompt in
    both modes, writes per-case artifacts (response.txt, timing.json,
    grading.json) under WORKSPACE_ROOT, then writes an iteration-level
    benchmark.json summary.

    Returns the benchmark dict, or {} when the skill has no evals.json.
    """
    evals_file = SKILLS_DIR / skill_name / "evals" / "evals.json"
    if not evals_file.exists():
        print(f" ⚠️ No evals.json: {evals_file}")
        return {}

    evals = json.loads(evals_file.read_text()).get("evals", [])
    iter_dir = WORKSPACE_ROOT / skill_name / f"iteration-{iteration}"
    results = []
    print(f"\n[Skill: {skill_name}] iteration-{iteration}")

    for case in evals:
        case_id, prompt, expected = case["id"], case["prompt"], case["expected_output"]
        case_dir = iter_dir / f"eval-{skill_name}-{case_id}"
        # Keep gradings in memory; the original re-read its own freshly
        # written grading.json files from disk just below.
        gradings: dict[str, dict] = {}

        for mode in ("with_skill", "without_skill"):
            out_dir = case_dir / mode
            out_dir.mkdir(parents=True, exist_ok=True)
            response, elapsed = run_prompt(prompt, mode == "with_skill")
            grading = grade(response, expected)
            gradings[mode] = grading
            (out_dir / "response.txt").write_text(response)
            (out_dir / "timing.json").write_text(json.dumps({"duration_seconds": elapsed, "with_skill": mode == "with_skill"}, indent=2))
            (out_dir / "grading.json").write_text(json.dumps(grading, indent=2))

        with_grade = gradings["with_skill"]
        without_grade = gradings["without_skill"]
        delta = round(with_grade["score"] - without_grade["score"], 2)
        status = "✅" if with_grade["passed"] else "❌"
        # Δ compares with-skill vs without-skill; the original message
        # incorrectly labeled it "vs baseline".
        print(f" Case {case_id}: {status} score={with_grade['score']} (Δ{delta:+.2f} vs without skill)")
        results.append({"id": case_id, "with_skill": with_grade, "without_skill": without_grade})

    passed = sum(1 for r in results if r["with_skill"]["passed"])
    pass_rate = round(passed / len(results), 2) if results else 0
    benchmark = {"skill": skill_name, "iteration": iteration, "total": len(results), "passed": passed, "pass_rate": pass_rate, "cases": results}
    # Ensure the directory exists even when there were zero eval cases
    # (the per-case mkdirs never ran); the original crashed here on an
    # empty evals list.
    iter_dir.mkdir(parents=True, exist_ok=True)
    (iter_dir / "benchmark.json").write_text(json.dumps(benchmark, indent=2))
    print(f" Pass rate: {passed}/{len(results)}")
    return benchmark
|
|
|
|
|
|
def check_regression(benchmark: dict, baselines: dict) -> list[str]:
    """Return list of regression messages, empty if no regression."""
    skill = benchmark.get("skill")
    if not skill:
        return []
    baseline = baselines.get(skill)
    if baseline is None:
        # No recorded baseline for this skill — nothing to compare against.
        return []
    previous_rate = baseline["pass_rate"]
    current_rate = benchmark.get("pass_rate", 0)
    if current_rate >= previous_rate:
        return []
    return [f"{skill}: pass_rate dropped {previous_rate} → {current_rate}"]
|
|
|
|
|
|
def main():
    """CLI entry point: run evals, optionally update baselines / gate on regressions."""
    parser = argparse.ArgumentParser()
    # Positional skill name is optional; omitting it runs every skill directory.
    parser.add_argument("skill", nargs="?", help="Skill name (default: all)")
    parser.add_argument("--iter", type=int, default=1)
    parser.add_argument("--update-baseline", action="store_true", help="Save current results as new baseline")
    parser.add_argument("--check-regression", action="store_true", help="Exit non-zero if any skill regressed")
    args = parser.parse_args()

    # Single named skill, or every subdirectory of SKILLS_DIR.
    skills = [args.skill] if args.skill else [d.name for d in SKILLS_DIR.iterdir() if d.is_dir()]
    baselines = load_baselines()
    regressions = []
    all_results = []

    # filter(None, ...) skips falsy entries (e.g. an empty-string skill arg).
    for skill in filter(None, skills):
        result = run_skill_evals(skill, args.iter)
        if result:
            all_results.append(result)
            # Collect regressions now; exit decision is deferred until after
            # the summary (and any baseline update) is printed.
            regressions.extend(check_regression(result, baselines))

    total = sum(r.get("total", 0) for r in all_results)
    passed = sum(r.get("passed", 0) for r in all_results)
    print(f"\n{'='*40}")
    print(f"Overall: {passed}/{total} cases passed")

    if args.update_baseline:
        for r in all_results:
            baselines[r["skill"]] = {"pass_rate": r["pass_rate"], "iteration": r["iteration"]}
        save_baselines(baselines)
        print(f"Baselines updated → {BASELINE_FILE}")

    # NOTE: regression check runs against the baselines loaded BEFORE any
    # --update-baseline write, so both flags can be combined meaningfully.
    if args.check_regression and regressions:
        print("\n🚨 Regressions detected:")
        for msg in regressions:
            print(f" ❌ {msg}")
        # Non-zero exit so CI fails the build on regression.
        raise SystemExit(1)

    print(f"Results saved to {WORKSPACE_ROOT}/")
|
|
|
|
|
|
# Script entry point (skipped when the module is imported).
if __name__ == "__main__":
    main()
|