Open Census MCP Server

open-census-mcp-server
knowledge-base

build_mini_corpus.py•6.02 KiB

#!/usr/bin/env python3
"""
build_mini_corpus.py
--------------------
* Scans   knowledge-base/source-docs/OtherACS/
          knowledge-base/source-docs/acs_table_shells/
* Builds  knowledge-base/data/mini_manifest.json
* Saves   plain-text for every file under
          knowledge-base/data/mini_corpus/<sha1>.txt

Extraction strategy (PDF):
    1. pdfminer.six                  – super-fast if text embedded
    2. poppler `pdftotext` fallback  – better with odd encodings
    3. OCR (pdf2image + pytesseract) – last-resort for scanned / “print-to-PDF”

Every extraction back-end is optional: a stage whose dependency is not
installed is skipped with a single warning instead of aborting the run.

All failures are logged; the script never crashes on a bad file.

© 2025 – tweak / reuse freely
"""

import hashlib
import html
import json
import logging
import re
import shutil
import subprocess
import tempfile
import warnings
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Third-party extraction back-ends.  Each one only powers a single stage of
# the PDF fallback chain, so a missing package disables that stage rather
# than the whole script (mirrors how a missing `pdftotext` binary is handled).
try:
    import pdfminer.high_level as pdfminer
except ImportError:
    pdfminer = None

try:
    from pdf2image import convert_from_path
except ImportError:
    convert_from_path = None

try:
    import pytesseract
except ImportError:
    pytesseract = None

# ------------------------------------------------------------------
# QUIET DOWN pdfminer + pillow font warnings
logging.getLogger("pdfminer").setLevel(logging.ERROR)
warnings.filterwarnings(
    "ignore",
    message=r"Could get FontBBox from font descriptor",
    module="pdfminer",
)
# pdf2image / pillow sometimes shout about DPI; silence those too
logging.getLogger("PIL").setLevel(logging.ERROR)
# ------------------------------------------------------------------

# --------------------------------------------------------------------------- #
# CONFIG – adjust only if your paths differ
ROOT = Path(__file__).resolve().parent  # knowledge-base/
SRC_DIRS = [
    ROOT / "source-docs" / "OtherACS",
    ROOT / "source-docs" / "acs_table_shells",
]
OUT_DIR = ROOT / "data"
TEXT_DIR = OUT_DIR / "mini_corpus"
MANIFEST = OUT_DIR / "mini_manifest.json"
MIN_SIZE = 200  # bytes – ignore truly empty extractions
# --------------------------------------------------------------------------- #

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("mini-corpus")

# Compiled once; used to strip markup from HTML sources.
_TAG_RE = re.compile(r"<[^>]+>")

# Messages already emitted through _warn_once().
_WARNED = set()


def _warn_once(message: str) -> None:
    """Log *message* at WARNING level the first time it is seen only."""
    if message not in _WARNED:
        _WARNED.add(message)
        log.warning(message)


# --------------------------------------------------------------------------- #
# Helper – robust PDF→text
def pdf_to_text(file: Path) -> str:
    """Return the text extracted from a PDF, or '' when nothing worked.

    Three stages are tried in order and the first non-blank result wins:
    pdfminer, the poppler `pdftotext` binary, then OCR.  A stage whose
    dependency is unavailable is skipped; a stage that raises is logged at
    DEBUG level and the next one is tried.  The function itself never raises
    for a bad input file.
    """
    # 1) pdfminer
    if pdfminer is None:
        _warn_once("pdfminer.six not installed – skipping stage-1 extraction")
    else:
        try:
            text = pdfminer.extract_text(str(file)) or ""
            if text.strip():
                return text
        except Exception as e:  # best-effort: any parser error → next stage
            log.debug(f"pdfminer failed on {file.name}: {e}")

    # 2) pdftotext
    if shutil.which("pdftotext"):
        try:
            # A temporary *directory* is used so that pdftotext creates the
            # output file itself; writing into an already-open
            # NamedTemporaryFile is not possible on Windows.
            with tempfile.TemporaryDirectory() as tmp_dir:
                out_file = Path(tmp_dir) / "out.txt"
                subprocess.run(
                    [
                        "pdftotext",
                        "-layout",
                        "-enc",
                        "UTF-8",
                        str(file),
                        str(out_file),
                    ],
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
                text = out_file.read_text(encoding="utf-8", errors="ignore")
            if text.strip():
                return text
        except Exception as e:  # best-effort: fall through to OCR
            log.debug(f"pdftotext failed on {file.name}: {e}")
    else:
        _warn_once("pdftotext not found – skipping stage-2 fallback")

    # 3) OCR
    if convert_from_path is None or pytesseract is None:
        _warn_once("pdf2image / pytesseract not installed – skipping OCR stage")
    else:
        try:
            pages = convert_from_path(str(file), dpi=300, fmt="png")
            text = "\n".join(
                pytesseract.image_to_string(img, lang="eng") for img in pages
            ).strip()
            if text:
                log.info(f"OCR succeeded on {file.name}")
                return text
        except Exception as e:  # best-effort: give up on this file
            log.debug(f"OCR failed on {file.name}: {e}")

    # Stage failures above are DEBUG-only (hidden at the default INFO level),
    # so make the overall failure visible.
    log.warning(f"no text could be extracted from {file.name}")
    return ""


# --------------------------------------------------------------------------- #
def plain_text(file: Path) -> str:
    """Dispatcher: pick an extractor based on the file extension.

    PDFs go through pdf_to_text(); .txt/.md are read as UTF-8; .html/.htm
    have their tags replaced by spaces and entities unescaped.  Any other
    extension yields '' so the file is still listed in the manifest.
    """
    ext = file.suffix.lower()
    if ext == ".pdf":
        return pdf_to_text(file)
    if ext in {".txt", ".md"}:
        return file.read_text("utf-8", errors="ignore")
    if ext in {".html", ".htm"}:
        raw = file.read_text("utf-8", errors="ignore")
        return html.unescape(_TAG_RE.sub(" ", raw))
    # Leave xlsx/csv/tsv etc. alone – store empty text, still list in manifest
    return ""


# --------------------------------------------------------------------------- #
def sha1_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-1 digest of *data*."""
    return hashlib.sha1(data).hexdigest()


def save_text(text: str) -> str:
    """Write *text* to TEXT_DIR/<sha1>.txt (deduplicated) and return the sha1.

    The file is written as raw UTF-8 bytes so that its content always hashes
    to its own name; text-mode writing would translate newlines on Windows.
    """
    data = text.encode("utf-8")
    digest = sha1_bytes(data)
    TEXT_DIR.mkdir(parents=True, exist_ok=True)
    out_path = TEXT_DIR / f"{digest}.txt"
    if not out_path.exists():
        out_path.write_bytes(data)
    return digest


# --------------------------------------------------------------------------- #
def _manifest_entry(path: Path) -> Optional[dict]:
    """Build the manifest record for one source file.

    Returns None (after logging) when the file cannot be read at all.  An
    extraction or save failure is logged and recorded as text_sha1 = None so
    the file still appears in the manifest.
    """
    try:
        file_bytes = path.read_bytes()
    except OSError as e:
        log.error(f"cannot read {path}: {e}")
        return None

    text_sha1 = None
    try:
        text = plain_text(path)
        if len(text.encode("utf-8")) > MIN_SIZE:
            text_sha1 = save_text(text)
    except Exception as e:  # boundary: one bad file must not abort the run
        log.error(f"text extraction failed on {path.name}: {e}")

    return {
        "path": str(path.relative_to(ROOT)),
        "bytes": len(file_bytes),
        "file_sha1": sha1_bytes(file_bytes),
        "text_sha1": text_sha1,
        "ext": path.suffix.lower(),
        # Same "YYYY-MM-DDTHH:MM:SSZ" shape as before, without the
        # deprecated datetime.utcnow().
        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }


def main() -> None:
    """Scan SRC_DIRS, save extracted text, and write the JSON manifest."""
    TEXT_DIR.mkdir(parents=True, exist_ok=True)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    manifest = []
    processed = 0

    for src_root in SRC_DIRS:
        if not src_root.is_dir():
            log.warning(f"source directory not found – skipping {src_root}")
            continue
        for path in sorted(src_root.rglob("*")):
            if path.is_dir():
                continue
            processed += 1

            entry = _manifest_entry(path)
            if entry is not None:
                manifest.append(entry)

            if processed % 50 == 0:
                log.info(f"…{processed} files processed")

    MANIFEST.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    log.info(f"✅ Wrote {len(manifest):,} entries → {MANIFEST.relative_to(ROOT)}")
    log.info(f"✅ Text corpus saved in {TEXT_DIR.relative_to(ROOT)}")


if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brockwebb/open-census-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

build_mini_corpus.py•6.02 KiB