"""Raw extraction for pdf file type"""
from typing import List, Optional
import json
import os
from datetime import datetime
import pdfplumber
VERTICAL_GAP = 7
def extract_pdf_into_lines(path: str) -> List[dict]:
    """Extract the text lines of a PDF with their vertical positions.

    Args:
        path: Path to the PDF file.

    Returns:
        A list of dicts, one per extracted line, each with keys
        "text" (stripped line text), "top" and "bottom" (vertical
        page coordinates, later used for paragraph chunking).
        Note: the original annotation said List[str], but the function
        has always returned dicts.
    """
    lines: List[dict] = []
    with pdfplumber.open(path) as pdf:
        nr_pages = len(pdf.pages)
        # enumerate replaces the manual `i` page counter
        for page_no, page in enumerate(pdf.pages, start=1):
            page_lines = page.extract_text_lines(
                keep_blank_chars=False, use_text_flow=True
            )
            for line in page_lines:
                lines.append(
                    {
                        "text": line["text"].strip(),
                        "top": line["top"],
                        "bottom": line["bottom"],
                    }
                )
            print(f"Page {page_no}/{nr_pages} complete")
    return lines
def chunk_lines_to_paragraphs(
    lines: List[dict], gap: Optional[float] = None
) -> List[str]:
    """Group extracted lines into paragraphs based on vertical spacing.

    Consecutive non-blank lines belong to the same paragraph unless the
    vertical distance between the previous line's bottom and the current
    line's top exceeds the gap threshold.

    Args:
        lines: Line dicts with "text", "top" and "bottom" keys, as
            produced by extract_pdf_into_lines.
        gap: Paragraph-break threshold in page units. Defaults to the
            module-level VERTICAL_GAP when None, preserving the original
            behavior.

    Returns:
        Paragraph strings, each the space-joined text of its lines.
    """
    threshold = VERTICAL_GAP if gap is None else gap
    paragraphs: List[str] = []
    current: List[str] = []
    prev_bottom = None
    for line in lines:
        if not line["text"]:
            # Blank lines neither break nor extend a paragraph.
            continue
        # A vertical jump larger than the threshold marks a paragraph break.
        if (
            prev_bottom is not None
            and abs(line["top"] - prev_bottom) > threshold
            and current
        ):
            paragraphs.append(" ".join(current))
            current = []
        current.append(line["text"])
        prev_bottom = line["bottom"]
    if current:
        paragraphs.append(" ".join(current))
    return paragraphs
def save_paragraphs_to_file(out_path: str, paragraphs: List[str]):
    """Saves paragraphs to a text file where each new line contains a paragraph."""
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{paragraph}\n" for paragraph in paragraphs)
def extra_pdf_metadata(path: str, out_path: str, extra_key_words: Optional[List[str]] = None):
    """
    Extract PDF metadata as a JSON file at out_path, optimized for Haystack RAG.

    Args:
        path: Path to the source PDF.
        out_path: Destination path for the metadata JSON.
        extra_key_words: Optional extra keywords, merged (space-separated)
            with the PDF's own "Keywords" metadata field.

    Returns:
        The metadata dict that was written to out_path.
    """
    with pdfplumber.open(path) as pdf:
        # Extract basic PDF metadata
        pdf_meta = pdf.metadata or {}
        # Get file system metadata
        file_stats = os.stat(path)
        file_size = file_stats.st_size
        file_modified = datetime.fromtimestamp(file_stats.st_mtime).isoformat()
        file_created = datetime.fromtimestamp(file_stats.st_ctime).isoformat()
        # Calculate text statistics
        total_chars = 0
        total_words = 0
        for page in pdf.pages:
            text = page.extract_text() or ""
            total_chars += len(text)
            total_words += len(text.split())
        # Merge the PDF's own keywords with any caller-supplied extras.
        # Bug fix: previously the PDF "Keywords" field was read but never
        # written to the output, so it was silently dropped.
        keywords = pdf_meta.get("Keywords", "")
        if extra_key_words:
            if keywords:
                keywords += " "
            keywords += " ".join(extra_key_words)
        # Build comprehensive metadata for RAG
        metadata = {
            # Document identification
            "file_path": os.path.abspath(path),
            "file_name": os.path.basename(path),
            "file_size_bytes": file_size,
            "file_modified": file_modified,
            "file_created": file_created,
            # PDF-specific metadata
            "title": pdf_meta.get("Title", ""),
            "author": pdf_meta.get("Author", ""),
            "subject": pdf_meta.get("Subject", ""),
            "creator": pdf_meta.get("Creator", ""),
            "producer": pdf_meta.get("Producer", ""),
            "creation_date": pdf_meta.get("CreationDate", ""),
            "modification_date": pdf_meta.get("ModDate", ""),
            "keywords": keywords,
            # Document statistics
            "page_count": len(pdf.pages),
            "total_characters": total_chars,
            "total_words": total_words,
            "avg_words_per_page": round(total_words / len(pdf.pages), 2)
            if pdf.pages
            else 0,
        }
        # Add extraction timestamp
        metadata["extracted_at"] = datetime.now().isoformat()
        # Save metadata as JSON
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        return metadata
if __name__ == "__main__":
    # Single source of truth for the document paths (previously the
    # document stem was repeated in three separate literals).
    DOC_PATH = "./documents/ISO-IEC-29500-2.pdf"
    TEXT_OUT = "./ExtractedText/ISO-IEC-29500-2.txt"
    META_OUT = "ExtractedText/ISO-IEC-29500-2.meta.json"

    # Ensure the output directory exists; open(..., "w") would otherwise
    # raise FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(TEXT_OUT), exist_ok=True)

    lines = extract_pdf_into_lines(DOC_PATH)
    paragraphs = chunk_lines_to_paragraphs(lines)
    print(f"Extracted {len(paragraphs)} paragraphs")
    save_paragraphs_to_file(TEXT_OUT, paragraphs)
    extra_pdf_metadata(DOC_PATH, META_OUT, ["Microsoft", "Word", "xml"])