MCPunk
by jurasofish
- mcpunk
import logging
from abc import abstractmethod
from pathlib import Path
from bs4 import ( # type: ignore[attr-defined]
BeautifulSoup,
NavigableString,
Tag,
)
from mcpunk.file_chunk import Chunk, ChunkCategory
from mcpunk.python_file_analysis import Callable, extract_imports, extract_module_statements
logger = logging.getLogger(__name__)
class BaseChunker:
"""Base class for file chunkers."""
def __init__(self, source_code: str, file_path: Path) -> None:
self.source_code = source_code
self.file_path = file_path
@staticmethod
@abstractmethod
def can_chunk(source_code: str, file_path: Path) -> bool:
"""Return True if the file can likely be chunked by this class.
This should be a very quick cheap check, quite possibly just using the file
extension. Do not assume that the file exists on disk.
Users of file chunks should handle gracefully the case where this returns
True but `chunk_file` fails. For example, the file may appear to be Python
but could contain invalid syntax.
"""
raise NotImplementedError
@abstractmethod
def chunk_file(self) -> list[Chunk]:
"""Chunk the given file."""
raise NotImplementedError
class WholeFileChunker(BaseChunker):
"""Unconditionally chunk the whole file in a single chunk."""
@staticmethod
def can_chunk(source_code: str, file_path: Path) -> bool: # noqa: ARG004
return True
def chunk_file(self) -> list[Chunk]:
return [
Chunk(
category=ChunkCategory.whole_file,
name="whole_file",
content=self.source_code,
line=1,
),
]
class PythonChunker(BaseChunker):
@staticmethod
def can_chunk(source_code: str, file_path: Path) -> bool: # noqa: ARG004
return str(file_path).endswith(".py")
def chunk_file(self) -> list[Chunk]:
callables = Callable.from_source_code(self.source_code)
imports = "\n".join(extract_imports(self.source_code))
module_level_statements = "\n".join(extract_module_statements(self.source_code))
chunks: list[Chunk] = []
if imports.strip() != "":
chunks.append(
Chunk(category=ChunkCategory.imports, name="imports", line=None, content=imports),
)
if module_level_statements.strip() != "":
chunks.append(
Chunk(
category=ChunkCategory.module_level,
name="module_level_statements",
line=None,
content=module_level_statements,
),
)
chunks.extend(
Chunk(
category=ChunkCategory.callable,
name=callable_.name,
line=callable_.line,
content=callable_.code,
)
for callable_ in callables
)
return chunks
class MarkdownChunker(BaseChunker):
@staticmethod
def can_chunk(source_code: str, file_path: Path) -> bool: # noqa: ARG004
return str(file_path).endswith(".md")
def chunk_file(self) -> list[Chunk]:
chunks: list[Chunk] = []
current_section: list[str] = []
current_heading: str | None = None
current_line = 1
start_of_section = 1
for line in self.source_code.split("\n"):
if line.startswith("#"):
# If we have a previous section, save it
if current_section:
chunks.append(
Chunk(
category=ChunkCategory.markdown_section,
name=current_heading.replace("#", "").strip()
if current_heading is not None
else "(no heading)",
line=start_of_section,
content="\n".join(current_section),
),
)
current_heading = line
current_section = [line]
start_of_section = current_line
else:
current_section.append(line)
current_line += 1
# Add the last section
if current_section:
chunks.append(
Chunk(
category=ChunkCategory.markdown_section,
name=current_heading.replace("#", "").strip()
if current_heading is not None
else "(no heading)",
line=start_of_section,
content="\n".join(current_section),
),
)
return chunks
class VueChunker(BaseChunker):
"""Chunks Vue Single File Components into their constituent blocks.
Intention is to put template, script, style (and other custom) blocks
into their own chunks.
See https://vuejs.org/api/sfc-spec
"""
@staticmethod
def can_chunk(source_code: str, file_path: Path) -> bool: # noqa: ARG004
return str(file_path).endswith(".vue")
def chunk_file(self) -> list[Chunk]:
chunks: list[Chunk] = []
# To preserve whitespace, we wrap the source code in a <pre> tag.
# Without this, BeautifulSoup will strip/fiddle whitespace.
# See https://stackoverflow.com/a/33788712
soup_with_pre = BeautifulSoup(
"<pre>" + self.source_code + "</pre>",
"html.parser",
)
soup_within_pre: Tag | None = soup_with_pre.pre
if soup_within_pre is None:
logger.error(f"soup_within_pre is None for {self.source_code}")
return chunks
# Find all top-level blocks. These are typcially template, script, style,
# plus any custom blocks.
top_level_elements = soup_within_pre.find_all(recursive=False)
if top_level_elements is not None:
for element in top_level_elements:
if isinstance(element, Tag): # Only process Tag elements
chunks.append(
Chunk(
category=ChunkCategory.other,
name=str(element.name), # Convert to str to satisfy type checker
content=str(element),
line=element.sourceline,
),
)
# Get content not in any tag, aggressively stripping whitespace.
# I'm not sure if it's actually valid to have content outside a <something>
# tag but whatever doesn't hurt to grab it.
outer_content_items: list[str] = []
for outer_content_item in soup_within_pre.find_all(string=True, recursive=False):
if outer_content_item and isinstance(outer_content_item, NavigableString):
if stripped := str(outer_content_item).strip():
outer_content_items.append(stripped)
if outer_content_items:
chunks.append(
Chunk(
category=ChunkCategory.module_level,
name="outer_content",
content="\n".join(outer_content_items),
line=None,
),
)
return chunks