from __future__ import annotations

import argparse
import json
from typing import Any

from common_json import load_answer_from_artifact, extract_first_json
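
# Example invocation (the script and file names below are illustrative):
#   python score_envvars.py --gold gold.json --pred run_artifacts/ --out score.json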


def _as_set(parsed: Any) -> set[str]:
    """Coerce a parsed answer into a set of env var name strings.

    Accepts a bare list[str], or a dict carrying the list under one of
    several common keys ("env_vars", "vars", "env", "environment_variables").
    Anything else yields the empty set.
    """
    if isinstance(parsed, list):
        return {x for x in parsed if isinstance(x, str)}
    if isinstance(parsed, dict):
        for k in ("env_vars", "vars", "env", "environment_variables"):
            v = parsed.get(k)
            if isinstance(v, list):
                # Keep only string entries; drop anything else silently.
                return {x for x in v if isinstance(x, str)}
    return set()
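
# A quick sketch of the shapes _as_set() handles (values are illustrative):
#   _as_set(["HOME", "PATH"])               -> {"HOME", "PATH"}
#   _as_set({"env_vars": ["HOME", "PATH"]}) -> {"HOME", "PATH"}
#   _as_set({"vars": ["HOME", 42]})         -> {"HOME"}   (non-strings dropped)
#   _as_set("unrecognized")                 -> set()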


def f1(p: float, r: float) -> float:
    """Harmonic mean of precision and recall; 0.0 when both are zero."""
    return 0.0 if (p + r) == 0 else 2 * p * r / (p + r)
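
# Worked example: precision = 0.5, recall = 1.0
#   f1 = 2 * 0.5 * 1.0 / (0.5 + 1.0) = 1.0 / 1.5 ≈ 0.667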


def main() -> None:
    ap = argparse.ArgumentParser(description="Score env var extraction answers vs gold set.")
    ap.add_argument("--gold", required=True, help="Gold JSON from envvar_gold.py")
    ap.add_argument("--pred", required=True, help="Model output artifact (.txt/.json or a directory)")
    ap.add_argument("--out", default=None, help="Write score JSON here")
    args = ap.parse_args()

    # Load the gold set of environment variable names.
    with open(args.gold, "r", encoding="utf-8") as f:
        gold = json.load(f)
    gold_set = set(gold.get("env_vars", []))

    # Load the model's answer; if the artifact didn't parse directly,
    # fall back to extracting the first JSON value from the raw text.
    raw, parsed = load_answer_from_artifact(args.pred)
    if parsed is None:
        try:
            parsed = extract_first_json(raw)
        except Exception:
            parsed = None
    pred_set = _as_set(parsed)

    # Set-based precision/recall over exact env var names.
    tp = len(pred_set & gold_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)
    precision = 0.0 if (tp + fp) == 0 else tp / (tp + fp)
    recall = 0.0 if (tp + fn) == 0 else tp / (tp + fn)
    score = {
        "parse_ok": 1 if parsed is not None else 0,
        "expected_type_ok": 1 if isinstance(parsed, (list, dict)) else 0,
        "counts": {"tp": tp, "fp": fp, "fn": fn},
        "precision": precision,
        "recall": recall,
        "f1": f1(precision, recall),
        "missing": sorted(gold_set - pred_set),
        "extra": sorted(pred_set - gold_set),
    }
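    # For example, with gold {"env_vars": ["HOME", "PATH"]} and a predicted
    # list ["HOME", "USER"], this yields (illustrative):
    #   {"parse_ok": 1, "expected_type_ok": 1,
    #    "counts": {"tp": 1, "fp": 1, "fn": 1},
    #    "precision": 0.5, "recall": 0.5, "f1": 0.5,
    #    "missing": ["PATH"], "extra": ["USER"]}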

    if args.out:
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(score, f, indent=2, ensure_ascii=False)
    print(json.dumps(score, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()