Team c0d14c6ac1 chore: restructure skills repo with new agents and skill bundles
- Add new skills: deep-dive, docs-rag, meta-creator, ppt-maker, sdlc
- Add agent configs: g-assistent, meta-creator, sdlc with prompt files
- Add reference docs for custom agents and skills specification
- Add utility scripts: install-agents.sh, orchestrate.py, puml2svg.sh
- Update README and commit-message skill config
- Remove deprecated skills: codereview, python, testing, typescript
- Add .gitignore
2026-04-18 13:07:46 +08:00

#!/usr/bin/env python3
"""
Skill eval runner with regression protection.
Usage:
python scripts/run_evals.py # run all skills
python scripts/run_evals.py codereview # run single skill
python scripts/run_evals.py codereview --iter 2 # specify iteration
python scripts/run_evals.py --check-regression # fail if any skill regressed
"""
import argparse
import json
import re
import subprocess
import time
from pathlib import Path

SKILLS_DIR = Path("skills")
WORKSPACE_ROOT = Path("evals-workspace")
BASELINE_FILE = Path("baselines.json")
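
# Each skill is expected to ship an evals file at skills/<skill>/evals/evals.json.
# Illustrative shape (field names taken from the parsing in run_skill_evals
# below; the values here are made up):
# {
#   "evals": [
#     {"id": "01", "prompt": "...", "expected_output": "..."}
#   ]
# }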


def run_prompt(prompt: str, with_skill: bool) -> tuple[str, float]:
    """Send a prompt to the CLI and return (cleaned response, elapsed seconds)."""
    agent = "main" if with_skill else "default"
    start = time.time()
    result = subprocess.run(
        ["kiro-cli", "chat", "--agent", agent, "--no-interactive", prompt],
        capture_output=True, text=True, timeout=90,
    )
    elapsed = round(time.time() - start, 2)
    # Strip ANSI escape sequences (colors, cursor movement) from the CLI output.
    response = re.sub(r'\x1b\[[0-9;]*[A-Za-z]', '', result.stdout).strip()
    return response, elapsed


def grade(response: str, expected_output: str) -> dict:
    """Keyword-overlap heuristic: what fraction of the expected output's
    longer words (5+ letters) show up in the response?"""
    words = re.findall(r'[a-zA-Z]{5,}', expected_output)
    keywords = list({w.lower() for w in words})
    matched = [kw for kw in keywords if kw in response.lower()]
    score = round(len(matched) / len(keywords), 2) if keywords else 0.0
    return {"score": score, "passed": score >= 0.4, "matched_keywords": matched[:10]}


def load_baselines() -> dict:
    return json.loads(BASELINE_FILE.read_text()) if BASELINE_FILE.exists() else {}


def save_baselines(baselines: dict) -> None:
    BASELINE_FILE.write_text(json.dumps(baselines, indent=2))
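
# baselines.json maps skill name -> last accepted result. The shape mirrors
# the --update-baseline write in main(); the values here are illustrative:
# {"codereview": {"pass_rate": 0.8, "iteration": 2}}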


def run_skill_evals(skill_name: str, iteration: int) -> dict:
    evals_file = SKILLS_DIR / skill_name / "evals" / "evals.json"
    if not evals_file.exists():
        print(f"  ⚠️ No evals.json: {evals_file}")
        return {}
    evals = json.loads(evals_file.read_text()).get("evals", [])
    iter_dir = WORKSPACE_ROOT / skill_name / f"iteration-{iteration}"
    iter_dir.mkdir(parents=True, exist_ok=True)  # ensure benchmark.json is writable even with zero cases
    results = []
    print(f"\n[Skill: {skill_name}] iteration-{iteration}")
    for case in evals:
        case_id, prompt, expected = case["id"], case["prompt"], case["expected_output"]
        case_dir = iter_dir / f"eval-{skill_name}-{case_id}"
        # Run each case twice: once through the skill agent, once without it,
        # so the score delta isolates what the skill contributes.
        for mode in ("with_skill", "without_skill"):
            out_dir = case_dir / mode
            out_dir.mkdir(parents=True, exist_ok=True)
            response, elapsed = run_prompt(prompt, mode == "with_skill")
            grading = grade(response, expected)
            (out_dir / "response.txt").write_text(response)
            (out_dir / "timing.json").write_text(json.dumps(
                {"duration_seconds": elapsed, "with_skill": mode == "with_skill"}, indent=2))
            (out_dir / "grading.json").write_text(json.dumps(grading, indent=2))
        with_grade = json.loads((case_dir / "with_skill" / "grading.json").read_text())
        without_grade = json.loads((case_dir / "without_skill" / "grading.json").read_text())
        delta = round(with_grade["score"] - without_grade["score"], 2)
        status = "✅" if with_grade["passed"] else "❌"
        print(f"  Case {case_id}: {status} score={with_grade['score']} ({delta:+.2f} vs no-skill)")
        results.append({"id": case_id, "with_skill": with_grade, "without_skill": without_grade})
    passed = sum(1 for r in results if r["with_skill"]["passed"])
    pass_rate = round(passed / len(results), 2) if results else 0
    benchmark = {"skill": skill_name, "iteration": iteration, "total": len(results),
                 "passed": passed, "pass_rate": pass_rate, "cases": results}
    (iter_dir / "benchmark.json").write_text(json.dumps(benchmark, indent=2))
    print(f"  Pass rate: {passed}/{len(results)}")
    return benchmark
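
# On-disk layout produced by run_skill_evals (paths follow the code above;
# the skill and case names are illustrative):
#   evals-workspace/<skill>/iteration-<N>/eval-<skill>-<case_id>/
#     with_skill/{response.txt, timing.json, grading.json}
#     without_skill/{response.txt, timing.json, grading.json}
#   evals-workspace/<skill>/iteration-<N>/benchmark.json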


def check_regression(benchmark: dict, baselines: dict) -> list[str]:
    """Return a list of regression messages, empty if no regression."""
    skill = benchmark.get("skill")
    if not skill or skill not in baselines:
        return []
    baseline_rate = baselines[skill]["pass_rate"]
    current_rate = benchmark.get("pass_rate", 0)
    if current_rate < baseline_rate:
        return [f"{skill}: pass_rate dropped {baseline_rate} → {current_rate}"]
    return []


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("skill", nargs="?", help="Skill name (default: all)")
    parser.add_argument("--iter", type=int, default=1)
    parser.add_argument("--update-baseline", action="store_true",
                        help="Save current results as new baseline")
    parser.add_argument("--check-regression", action="store_true",
                        help="Exit non-zero if any skill regressed")
    args = parser.parse_args()
    skills = [args.skill] if args.skill else [d.name for d in SKILLS_DIR.iterdir() if d.is_dir()]
    baselines = load_baselines()
    regressions = []
    all_results = []
    for skill in filter(None, skills):
        result = run_skill_evals(skill, args.iter)
        if result:
            all_results.append(result)
            regressions.extend(check_regression(result, baselines))
    total = sum(r.get("total", 0) for r in all_results)
    passed = sum(r.get("passed", 0) for r in all_results)
    print(f"\n{'=' * 40}")
    print(f"Overall: {passed}/{total} cases passed")
    if args.update_baseline:
        for r in all_results:
            baselines[r["skill"]] = {"pass_rate": r["pass_rate"], "iteration": r["iteration"]}
        save_baselines(baselines)
        print(f"Baselines updated → {BASELINE_FILE}")
    if args.check_regression and regressions:
        print("\n🚨 Regressions detected:")
        for msg in regressions:
            print(f"  - {msg}")
        raise SystemExit(1)
    print(f"Results saved to {WORKSPACE_ROOT}/")


if __name__ == "__main__":
    main()