Skip to main content
Glama
brockwebb

Open Census MCP Server

by brockwebb
build_mini_corpus.py (6.17 kB)
#!/usr/bin/env python3
"""
build_mini_corpus.py
--------------------
* Scans   knowledge-base/source-docs/OtherACS/
          knowledge-base/source-docs/acs_table_shells/
* Builds  knowledge-base/data/mini_manifest.json
* Saves   plain-text for every file under
          knowledge-base/data/mini_corpus/<sha1>.txt

Extraction strategy (PDF):
    1. pdfminer.six                  – super-fast if text embedded
    2. poppler `pdftotext` fallback  – better with odd encodings
    3. OCR (pdf2image + pytesseract) – last-resort for scanned / “print-to-PDF”

All failures are logged; the script never crashes on a bad file.
© 2025 – tweak / reuse freely
"""
# Stdlib imports: one per line, grouped; the duplicate `import logging`
# from the original has been removed.
import hashlib
import html
import json
import logging
import re
import shutil
import subprocess
import tempfile
import warnings
from datetime import datetime
from pathlib import Path

# Third-party extractors (all three PDF stages)
import pdfminer.high_level as pdfminer
import pytesseract
from pdf2image import convert_from_path

# ------------------------------------------------------------------
# QUIET DOWN pdfminer + pillow font warnings
logging.getLogger("pdfminer").setLevel(logging.ERROR)
warnings.filterwarnings(
    "ignore",
    # NOTE: "Could get" (sic) matches pdfminer's own message text verbatim —
    # the typo is upstream; do not "fix" it here or the filter stops matching.
    message=r"Could get FontBBox from font descriptor",
    module="pdfminer",
)
# pdf2image / pillow sometimes shout about DPI; silence those too
logging.getLogger("PIL").setLevel(logging.ERROR)
# ------------------------------------------------------------------

# --------------------------------------------------------------------------- #
# CONFIG – adjust only if your paths differ
ROOT = Path(__file__).resolve().parent          # knowledge-base/
SRC_DIRS = [
    ROOT / "source-docs" / "OtherACS",
    ROOT / "source-docs" / "acs_table_shells",
]
OUT_DIR = ROOT / "data"
TEXT_DIR = OUT_DIR / "mini_corpus"
MANIFEST = OUT_DIR / "mini_manifest.json"
MIN_SIZE = 200                                  # bytes – ignore truly empty extractions
# --------------------------------------------------------------------------- #

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("mini-corpus")
# --------------------------------------------------------------------------- #
# Helper – robust PDF→text
def pdf_to_text(file: Path) -> str:
    """Return extracted text or ''.  3-stage fallback.

    Stages, in order, first non-blank result wins:
      1) pdfminer.six        – fast when text is embedded
      2) poppler `pdftotext` – better with odd encodings (skipped if binary absent)
      3) OCR (pdf2image + pytesseract) – last resort for scanned PDFs
    Never raises: every stage's failure is logged at DEBUG and the next tried.
    """
    # 1) pdfminer
    try:
        text = pdfminer.extract_text(str(file)) or ""
        if text.strip():
            return text
    except Exception as e:
        log.debug(f"pdfminer failed on {file.name}: {e}")

    # 2) pdftotext
    if shutil.which("pdftotext"):
        try:
            # NamedTemporaryFile gives pdftotext a writable target; we read it
            # back before the context closes (file is unlinked on exit).
            with tempfile.NamedTemporaryFile(suffix=".txt") as tmp:
                subprocess.run(
                    ["pdftotext", "-layout", "-enc", "UTF-8", str(file), tmp.name],
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
                text = Path(tmp.name).read_text(encoding="utf-8", errors="ignore")
                if text.strip():
                    return text
        except Exception as e:
            log.debug(f"pdftotext failed on {file.name}: {e}")
    else:
        log.warning("pdftotext not found – skipping stage-2 fallback")

    # 3) OCR
    try:
        pages = convert_from_path(str(file), dpi=300, fmt="png")
        text = "\n".join(
            pytesseract.image_to_string(img, lang="eng") for img in pages
        ).strip()
        if text:
            log.info(f"OCR succeeded on {file.name}")
            return text
    except Exception as e:
        log.debug(f"OCR failed on {file.name}: {e}")

    return ""


# --------------------------------------------------------------------------- #
def plain_text(file: Path) -> str:
    """Dispatcher: pick extractor based on extension.

    .pdf        → 3-stage pdf_to_text()
    .txt / .md  → raw UTF-8 read (decode errors ignored)
    .html/.htm  → tags stripped, then HTML entities decoded
    anything else (xlsx/csv/tsv …) → '' (still listed in the manifest)
    """
    ext = file.suffix.lower()
    if ext == ".pdf":
        return pdf_to_text(file)
    if ext in {".txt", ".md"}:
        return file.read_text("utf-8", errors="ignore")
    if ext in {".html", ".htm"}:
        raw = file.read_text("utf-8", errors="ignore")
        # Strip tags first, then decode entities (&amp; → &) so entity text
        # inside attributes never survives as content.
        return html.unescape(re.sub(r"<[^>]+>", " ", raw))
    # Leave xlsx/csv/tsv etc. alone – store empty text, still list in manifest
    return ""


# --------------------------------------------------------------------------- #
def sha1_bytes(data: bytes) -> str:
    """Hex SHA-1 digest of *data* (content addressing, not security)."""
    return hashlib.sha1(data).hexdigest()


def save_text(text: str) -> str:
    """Write deduplicated text file => returns sha1.

    File name is the SHA-1 of the UTF-8 bytes, so identical extractions
    collapse to a single corpus file.
    """
    h = sha1_bytes(text.encode("utf-8"))
    out_path = TEXT_DIR / f"{h}.txt"
    if not out_path.exists():
        out_path.write_text(text, "utf-8")
    return h


# --------------------------------------------------------------------------- #
def main() -> None:
    """Walk SRC_DIRS, extract text for each file, write corpus + manifest."""
    # Local import keeps this fix self-contained; datetime.utcnow() is
    # deprecated since Python 3.12 in favour of tz-aware now().
    from datetime import timezone

    TEXT_DIR.mkdir(parents=True, exist_ok=True)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    manifest = []
    processed = 0

    for src_root in SRC_DIRS:
        if not src_root.exists():
            # A missing source dir is a config problem, not a crash.
            log.warning(f"source dir missing – skipped: {src_root}")
            continue
        for path in sorted(src_root.rglob("*")):
            if path.is_dir():
                continue
            processed += 1
            rel = path.relative_to(ROOT)

            # hash of file bytes for manifest
            file_bytes = path.read_bytes()
            file_sha1 = sha1_bytes(file_bytes)

            text = plain_text(path)
            # Only persist non-trivial extractions; tiny/empty ones get None.
            text_sha1 = (
                save_text(text) if len(text.encode("utf-8")) > MIN_SIZE else None
            )

            manifest.append(
                {
                    "path": str(rel),
                    "bytes": len(file_bytes),
                    "file_sha1": file_sha1,
                    "text_sha1": text_sha1,
                    "ext": path.suffix.lower(),
                    # tz-aware replacement for deprecated utcnow(); the
                    # replace() keeps the original trailing-"Z" format.
                    "timestamp": datetime.now(timezone.utc)
                    .isoformat(timespec="seconds")
                    .replace("+00:00", "Z"),
                }
            )

            if processed % 50 == 0:
                log.info(f"…{processed} files processed")

    MANIFEST.write_text(json.dumps(manifest, indent=2))
    log.info(f"✅ Wrote {len(manifest):,} entries → {MANIFEST.relative_to(ROOT)}")
    log.info(f"✅ Text corpus saved in {TEXT_DIR.relative_to(ROOT)}")


if __name__ == "__main__":
    main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.