"""Raw extraction for pdf file type"""
from typing import List, Optional
import json
import os
from datetime import datetime
import pdfplumber
VERTICAL_GAP = 7
def extract_pdf_into_lines(path: str) -> List[dict]:
    """Extract the text lines of a PDF with their vertical positions.

    Args:
        path: Path to the PDF file.

    Returns:
        A list of dicts, one per extracted line, each with keys
        "text" (stripped line text), "top" and "bottom" (vertical
        page coordinates, later used for paragraph chunking).
        Note: the original annotation said List[str], but the function
        has always returned dicts.
    """
    lines: List[dict] = []
    with pdfplumber.open(path) as pdf:
        nr_pages = len(pdf.pages)
        # enumerate replaces the manual `i` page counter
        for page_no, page in enumerate(pdf.pages, start=1):
            page_lines = page.extract_text_lines(
                keep_blank_chars=False, use_text_flow=True
            )
            for line in page_lines:
                lines.append(
                    {
                        "text": line["text"].strip(),
                        "top": line["top"],
                        "bottom": line["bottom"],
                    }
                )
            print(f"Page {page_no}/{nr_pages} complete")
    return lines
def chunk_lines_to_paragraphs(
    lines: List[dict], gap: Optional[float] = None
) -> List[str]:
    """Group extracted lines into paragraphs based on vertical spacing.

    Consecutive non-blank lines belong to the same paragraph unless the
    vertical distance between the previous line's bottom and the current
    line's top exceeds the gap threshold.

    Args:
        lines: Line dicts with "text", "top" and "bottom" keys, as
            produced by extract_pdf_into_lines.
        gap: Paragraph-break threshold in page units. Defaults to the
            module-level VERTICAL_GAP when None, preserving the original
            behavior.

    Returns:
        Paragraph strings, each the space-joined text of its lines.
    """
    threshold = VERTICAL_GAP if gap is None else gap
    paragraphs: List[str] = []
    current: List[str] = []
    prev_bottom = None
    for line in lines:
        if not line["text"]:
            # Blank lines neither break nor extend a paragraph.
            continue
        # A vertical jump larger than the threshold marks a paragraph break.
        if (
            prev_bottom is not None
            and abs(line["top"] - prev_bottom) > threshold
            and current
        ):
            paragraphs.append(" ".join(current))
            current = []
        current.append(line["text"])
        prev_bottom = line["bottom"]
    if current:
        paragraphs.append(" ".join(current))
    return paragraphs
def save_paragraphs_to_file(out_path: str, paragraphs: List[str]):
    """Saves paragraphs to a text file where each new line contains a paragraph."""
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{paragraph}\n" for paragraph in paragraphs)
def extra_pdf_metadata(path: str, out_path: str, extra_key_words: Optional[List[str]] = None):
    """
    Extract PDF metadata as a JSON file at out_path, optimized for Haystack RAG.

    Args:
        path: Path to the source PDF.
        out_path: Destination path for the metadata JSON.
        extra_key_words: Optional extra keywords, merged (space-separated)
            with the PDF's own "Keywords" metadata field.

    Returns:
        The metadata dict that was written to out_path.
    """
    with pdfplumber.open(path) as pdf:
        # Extract basic PDF metadata
        pdf_meta = pdf.metadata or {}
        # Get file system metadata
        file_stats = os.stat(path)
        file_size = file_stats.st_size
        file_modified = datetime.fromtimestamp(file_stats.st_mtime).isoformat()
        file_created = datetime.fromtimestamp(file_stats.st_ctime).isoformat()
        # Calculate text statistics
        total_chars = 0
        total_words = 0
        for page in pdf.pages:
            text = page.extract_text() or ""
            total_chars += len(text)
            total_words += len(text.split())
        # Merge the PDF's own keywords with any caller-supplied extras.
        # Bug fix: previously the PDF "Keywords" field was read but never
        # written to the output, so it was silently dropped.
        keywords = pdf_meta.get("Keywords", "")
        if extra_key_words:
            if keywords:
                keywords += " "
            keywords += " ".join(extra_key_words)
        # Build comprehensive metadata for RAG
        metadata = {
            # Document identification
            "file_path": os.path.abspath(path),
            "file_name": os.path.basename(path),
            "file_size_bytes": file_size,
            "file_modified": file_modified,
            "file_created": file_created,
            # PDF-specific metadata
            "title": pdf_meta.get("Title", ""),
            "author": pdf_meta.get("Author", ""),
            "subject": pdf_meta.get("Subject", ""),
            "creator": pdf_meta.get("Creator", ""),
            "producer": pdf_meta.get("Producer", ""),
            "creation_date": pdf_meta.get("CreationDate", ""),
            "modification_date": pdf_meta.get("ModDate", ""),
            "keywords": keywords,
            # Document statistics
            "page_count": len(pdf.pages),
            "total_characters": total_chars,
            "total_words": total_words,
            "avg_words_per_page": round(total_words / len(pdf.pages), 2)
            if pdf.pages
            else 0,
        }
        # Add extraction timestamp
        metadata["extracted_at"] = datetime.now().isoformat()
        # Save metadata as JSON
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        return metadata
if __name__ == "__main__":
    # Single source of truth for the document paths (previously the
    # document stem was repeated in three separate literals).
    DOC_PATH = "./documents/ISO-IEC-29500-2.pdf"
    TEXT_OUT = "./ExtractedText/ISO-IEC-29500-2.txt"
    META_OUT = "ExtractedText/ISO-IEC-29500-2.meta.json"

    # Ensure the output directory exists; open(..., "w") would otherwise
    # raise FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(TEXT_OUT), exist_ok=True)

    lines = extract_pdf_into_lines(DOC_PATH)
    paragraphs = chunk_lines_to_paragraphs(lines)
    print(f"Extracted {len(paragraphs)} paragraphs")
    save_paragraphs_to_file(TEXT_OUT, paragraphs)
    extra_pdf_metadata(DOC_PATH, META_OUT, ["Microsoft", "Word", "xml"])