#!/usr/bin/env python3
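"""Build ai/context_index.json from the repo's markdown docs.

Every doc (except repo-root AGENTS.md/README.md) must open with a [LEGEND]
block followed by exactly one [CONTENT] block. This script validates that
structure, collects global tokens from LEGEND.md and local tokens from each
doc's legend, extracts [TOKEN] references from content, and writes a sorted
JSON index including a token -> defined_in/used_in reverse map. Returns 0 on
success and 2 on any validation error.
"""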
from __future__ import annotations
import json
import re
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
def _repo_root() -> Path:
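    """Return the repo root; assumes this file sits one directory below it."""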
return Path(__file__).resolve().parents[1]
# A token is uppercase: first char A-Z, then A-Z, digits, '_', '/', or '-'.
TOKEN_RE = re.compile(r"^[A-Z][A-Z0-9_/-]*$")
# A bracketed reference like [TOKEN] or [TOKEN|LEGEND.md]; no nested brackets.
TOKEN_REF_RE = re.compile(r"\[([^\[\]]+)\]")
# Versioned contract files: <stem>_v<N>.md under docs/contracts/.
CONTRACT_FILE_RE = re.compile(r"^(?P<stem>.+)_v(?P<ver>[0-9]+)\.md$")
@dataclass(frozen=True)
class DocBlocks:
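    """The [LEGEND] and [CONTENT] blocks of a structured markdown doc."""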
legend: str
content: str
def _strip_inline_code(text: str) -> str:
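    """Drop `inline code` spans from a single line.

    Splitting on backticks puts code spans at odd indices, so keeping only
    the even-indexed parts removes them. Note that text after an unbalanced
    trailing backtick is dropped as well.
    """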
parts = text.split("`")
if len(parts) < 3:
return text
out: list[str] = []
for i, part in enumerate(parts):
if i % 2 == 0:
out.append(part)
return "".join(out)
def _parse_blocks(md: str) -> DocBlocks:
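    """Split a doc into its [LEGEND] and [CONTENT] blocks.

    The first non-empty line must be [LEGEND]; exactly one [CONTENT] header
    must follow it, where headers inside fenced code blocks are ignored.
    """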
lines = md.splitlines()
i = 0
while i < len(lines) and not lines[i].strip():
i += 1
if i >= len(lines) or lines[i].strip() != "[LEGEND]":
raise ValueError("first non-empty line must be [LEGEND]")
legend_start = i + 1
content_header_idx = None
in_fence = False
for j in range(legend_start, len(lines)):
line = lines[j]
if line.strip().startswith("```"):
in_fence = not in_fence
continue
if in_fence:
continue
if line.strip() == "[CONTENT]":
content_header_idx = j
break
if content_header_idx is None:
raise ValueError("missing [CONTENT] header")
# Disallow multiple [CONTENT] headers (outside fences).
in_fence = False
for j in range(content_header_idx + 1, len(lines)):
line = lines[j]
if line.strip().startswith("```"):
in_fence = not in_fence
continue
if in_fence:
continue
if line.strip() == "[CONTENT]":
raise ValueError("multiple [CONTENT] headers found; use exactly one")
legend = "\n".join(lines[legend_start:content_header_idx]).strip("\n")
content = "\n".join(lines[content_header_idx + 1 :]).strip("\n")
return DocBlocks(legend=legend, content=content)
def _parse_legend_tokens(legend_block: str) -> dict[str, str]:
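    """Parse 'TOKEN = Meaning' lines from a [LEGEND] block.

    Blank lines and '#' comment lines are skipped; malformed lines and
    duplicate tokens raise ValueError.
    """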
tokens: dict[str, str] = {}
for raw in legend_block.splitlines():
line = raw.strip()
if not line or line.startswith("#"):
continue
if "=" not in line:
raise ValueError(f"legend line must be 'TOKEN = Meaning' (got: {raw!r})")
left, right = line.split("=", 1)
token = left.strip()
meaning = right.strip()
if not token:
raise ValueError(f"empty token in legend line: {raw!r}")
if not TOKEN_RE.match(token):
raise ValueError(f"invalid token name: {token!r} (expected uppercase TOKEN like FOO_BAR)")
if token in tokens:
raise ValueError(f"duplicate token in legend: {token}")
tokens[token] = meaning
return tokens
def _extract_token_refs(content_block: str) -> set[str]:
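    """Collect token names referenced as [TOKEN] or [TOKEN|LEGEND.md].

    Fenced code blocks and inline code are skipped, and bracketed text that
    does not match TOKEN_RE (e.g. ordinary markdown link text) is ignored.
    """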
refs: set[str] = set()
in_fence = False
    for line in content_block.splitlines():
if line.strip().startswith("```"):
in_fence = not in_fence
continue
if in_fence:
continue
line = _strip_inline_code(line)
for inner in TOKEN_REF_RE.findall(line):
candidate = inner.strip()
if "|" in candidate:
token, src = candidate.split("|", 1)
token = token.strip()
src = src.strip()
if src != "LEGEND.md":
continue
candidate = token
if TOKEN_RE.match(candidate):
refs.add(candidate)
return refs
def _should_skip_dir(name: str) -> bool:
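    """Return True for VCS, virtualenv, dependency, and build-output dirs."""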
return name in {
".git",
".hg",
".svn",
".venv",
"node_modules",
"target",
"dist",
"build",
".next",
".cache",
"__pycache__",
}
def _iter_markdown_files(root: Path) -> list[Path]:
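    """Return all *.md files under root, sorted, pruning skipped directories."""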
files: list[Path] = []
for path in sorted(root.rglob("*.md")):
if any(_should_skip_dir(p) for p in path.parts):
continue
files.append(path)
return files
def _is_repo_root_freeform(path: Path, root: Path) -> bool:
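    """True for repo-root AGENTS.md/README.md, which are exempt from the block format."""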
try:
rel = path.relative_to(root)
except ValueError:
return False
if rel.parent != Path("."):
return False
return rel.name in {"AGENTS.md", "README.md"}
def _doc_kind(rel: Path) -> str:
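    """Classify a doc by its repo-relative path.

    docs/contracts/<stem>_v<N>.md is a "contract" (other files there are
    "contracts-meta"); LEGEND.md is "legend"; a few well-known root docs are
    "context"; everything else is a generic "doc".
    """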
if len(rel.parts) >= 3 and rel.parts[0] == "docs" and rel.parts[1] == "contracts":
if CONTRACT_FILE_RE.match(rel.name):
return "contract"
return "contracts-meta"
if rel.name == "LEGEND.md":
return "legend"
if rel.name in {"MAP.md", "PHILOSOPHY.md", "GOALS.md", "ARCHITECTURE.md"}:
return "context"
    # Anything else, under docs/ or elsewhere, is a generic doc.
    return "doc"
def _sorted_dict(d: dict[str, str]) -> dict[str, str]:
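    """Return a copy of d with keys in sorted order."""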
return {k: d[k] for k in sorted(d.keys())}
def main() -> int:
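    """Validate all docs and write ai/context_index.json; return 0 or 2."""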
root = _repo_root()
legend_path = root / "LEGEND.md"
if not legend_path.exists():
print("ERROR: missing LEGEND.md", file=sys.stderr)
return 2
errors: list[str] = []
try:
global_blocks = _parse_blocks(legend_path.read_text(encoding="utf-8"))
global_tokens = _parse_legend_tokens(global_blocks.legend)
except Exception as e:
print(f"ERROR: LEGEND.md is invalid: {e}", file=sys.stderr)
return 2
md_files = _iter_markdown_files(root)
docs: list[dict[str, str]] = []
local_tokens_by_doc: dict[str, dict[str, str]] = {}
refs_by_doc: dict[str, list[str]] = {}
for path in md_files:
if _is_repo_root_freeform(path, root):
continue
rel = path.relative_to(root)
rel_str = str(rel)
docs.append({"path": rel_str, "kind": _doc_kind(rel)})
try:
blocks = _parse_blocks(path.read_text(encoding="utf-8"))
except Exception as e:
errors.append(f"{rel_str}: {e}")
continue
try:
local_tokens = _parse_legend_tokens(blocks.legend)
except Exception as e:
errors.append(f"{rel_str}: invalid [LEGEND]: {e}")
continue
        if path == legend_path:
            # LEGEND.md's tokens are already indexed as global; don't re-count them as local.
            local_tokens = {}
local_tokens_by_doc[rel_str] = _sorted_dict(local_tokens)
refs_by_doc[rel_str] = sorted(_extract_token_refs(blocks.content))
if errors:
print("== context index errors ==", file=sys.stderr)
for e in errors:
print(f"- {e}", file=sys.stderr)
print(f"\nFAIL: context index ({len(errors)} error(s)).", file=sys.stderr)
return 2
# Build reverse index for quick navigation.
reverse_index: dict[str, dict[str, object]] = {}
for token, meaning in global_tokens.items():
reverse_index[token] = {
"meaning": meaning,
"defined_in": "LEGEND.md",
"used_in": [],
}
for doc, local_tokens in local_tokens_by_doc.items():
for token, meaning in local_tokens.items():
reverse_index[token] = {
"meaning": meaning,
"defined_in": doc,
"used_in": [],
}
for doc, refs in refs_by_doc.items():
for token in refs:
if token not in reverse_index:
# Gate should catch this; keep index robust anyway.
reverse_index[token] = {"meaning": "", "defined_in": "", "used_in": []}
used_in = reverse_index[token].get("used_in")
if isinstance(used_in, list):
used_in.append(doc)
for token in list(reverse_index.keys()):
used_in = reverse_index[token].get("used_in")
if isinstance(used_in, list):
reverse_index[token]["used_in"] = sorted(set(str(x) for x in used_in))
out_dir = root / "ai"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "context_index.json"
now = datetime.now(timezone.utc).isoformat(timespec="seconds")
payload = {
"version": 1,
"generated_at_utc": now,
"docs": sorted(docs, key=lambda d: (d.get("kind", ""), d.get("path", ""))),
"tokens_global": _sorted_dict(global_tokens),
"tokens_local": {k: local_tokens_by_doc[k] for k in sorted(local_tokens_by_doc.keys())},
"refs": {k: refs_by_doc[k] for k in sorted(refs_by_doc.keys())},
"reverse_index": {k: reverse_index[k] for k in sorted(reverse_index.keys())},
}
out_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
print(f"Wrote: {out_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())