from typing import List, Optional

import pdfplumber
class TableExtractor:
    """Extract tables from a PDF with pdfplumber and render them as Markdown.

    The extractor holds no state, so a single instance can be reused for
    any number of files.
    """

    def __init__(self):
        """Create an extractor; there is nothing to configure."""

    def extract_tables(
        self, source_path: str, page_range: Optional[range] = None
    ) -> List[str]:
        """Extract tables from a PDF and convert them to Markdown.

        Args:
            source_path: Path to the local PDF file.
            page_range: Zero-based page indices to process. ``None`` means
                every page. Indices outside the document are skipped.

        Returns:
            One Markdown string per non-empty table, each prefixed with a
            ``**Table (Page N)**`` caption where ``N`` is the one-based
            page number. If extraction fails, a warning is printed and the
            tables collected up to that point are returned.
        """
        tables_md: List[str] = []
        try:
            with pdfplumber.open(source_path) as pdf:
                page_count = len(pdf.pages)
                if page_range is None:
                    page_range = range(page_count)
                for page_num in page_range:
                    if not 0 <= page_num < page_count:
                        continue
                    for table_data in pdf.pages[page_num].extract_tables():
                        md_table = self._convert_to_markdown(table_data)
                        if md_table:
                            tables_md.append(
                                f"**Table (Page {page_num + 1})**\n{md_table}"
                            )
        except Exception as e:
            # Table extraction is an optional enhancement, so any failure is
            # reported and swallowed instead of aborting the caller.
            print(f"Warning: Table extraction failed: {e}")
        return tables_md

    def _convert_to_markdown(self, data: List[List[str]]) -> str:
        """Convert table data (a list of rows of cells) to a Markdown table.

        The first row becomes the header. Body rows shorter than the header
        are padded with empty cells; longer rows are truncated to the header
        width. ``None`` cells become empty strings.

        Returns:
            The Markdown table, or ``""`` when there are no rows or the
            header row has no columns.
        """
        if not data:
            return ""

        cleaned_data = [
            [self._clean_cell(cell) for cell in (row or [])] for row in data
        ]

        header = cleaned_data[0]
        if not header:
            # A table with zero columns cannot be expressed in Markdown.
            return ""

        width = len(header)
        lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]
        for row in cleaned_data[1:]:
            # Build a new list so the cleaned rows are never mutated; a
            # negative pad count simply yields no padding.
            fitted = row[:width] + [""] * (width - len(row))
            lines.append("| " + " | ".join(fitted) + " |")
        return "\n".join(lines)

    @staticmethod
    def _clean_cell(cell: object) -> str:
        """Return *cell* as single-line text that is safe inside a table row."""
        if cell is None:
            return ""
        text = str(cell).replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        # An unescaped pipe would be read as a column delimiter.
        return text.strip().replace("|", "\\|")