process_examples.py•7.19 kB
import os
import glob
import fnmatch
import urllib.parse
import nbformat
from nbconvert import MarkdownExporter
import yaml
import context.utils as utils  # Your custom LLM utilities
def format_url(url: str) -> str:
    return urllib.parse.quote(url, safe=":/?=&")
def simplify_content_with_llm(prompt_text, text, llm):
    """
    Simplify Markdown content using an LLM.
    Parameters:
        prompt_text (str): The prompt to guide the simplification.
        text (str): The original Markdown content.
        llm (str): The language model identifier to use.
    Returns:
        str: The simplified Markdown content.
    """
    if not prompt_text:
        prompt_text = """
        Simplify the following Markdown content.
        Remove fluff and keep only key technical details.
        Remove any extraneous buttons or sections.
        """
    llm_output = utils.get_llm_output(prompt_text, text, llm=llm)
    return llm_output["response"], llm_output["usage"]
def load_config_yaml():
    with open("config.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
class IPYNBHandler:
    """
    Handles the conversion and processing of Jupyter Notebook (.ipynb) files.
    """
    def __init__(
        self,
        include_patterns,
        exclude_patterns,
        prompt_config,
        output_fragments,
        clone_dir,
        clone_url,
        llm,
    ):
        self.include_patterns = include_patterns
        self.exclude_patterns = exclude_patterns
        self.prompt_config = prompt_config
        self.output_fragments = output_fragments
        self.clone_dir = clone_dir
        self.clone_url = clone_url
        self.llm = llm
    def convert_ipynb_to_md(self, ipynb_file):
        """
        Convert a Jupyter Notebook to Markdown using nbconvert.
        """
        with open(ipynb_file, "r", encoding="utf-8") as f:
            notebook = nbformat.read(f, as_version=4)
        exporter = MarkdownExporter()
        md_content, _ = exporter.from_notebook_node(notebook)
        return md_content
    def get_prompt_for_ipynb(self, file_path):
        """
        Determine which prompt to use based on the file path and prompt configuration.
        """
        default_prompt = self.prompt_config.get("default_prompt", "")
        prompt_folder = self.prompt_config.get("prompt_folder", "")
        custom_prompts = self.prompt_config.get("custom_prompts", [])
        selected_prompt = default_prompt
        for entry in custom_prompts:
            pattern = entry.get("pattern")
            prompt_file = entry.get("prompt")
            if pattern and prompt_file:
                # Last matching prompt wins
                if fnmatch.fnmatch(file_path, f"{self.clone_dir}/{pattern}"):
                    selected_prompt = prompt_file
        full_prompt_path = os.path.join(prompt_folder, selected_prompt)
        if os.path.exists(full_prompt_path):
            with open(full_prompt_path, "r", encoding="utf-8") as f:
                return f.read()
        else:
            print(
                f"Warning: Prompt file {full_prompt_path} not found. Using empty prompt."
            )
            return ""
    def get_ipynb_files_from_globs(self):
        """
        Expand include glob patterns into a list of .ipynb file paths
        and filter out files matching any exclude pattern.
        """
        files = []
        for pattern in self.include_patterns:
            matched = glob.glob(f"{self.clone_dir}/{pattern}", recursive=True)
            files.extend(matched)
        if self.exclude_patterns:
            filtered_files = []
            for f in files:
                if any(
                    fnmatch.fnmatch(f, f"{self.clone_dir}/{pat}")
                    for pat in self.exclude_patterns
                ):
                    continue
                filtered_files.append(f)
            files = filtered_files
        return files
    def process(self):
        """
        Process each .ipynb file:
          - Convert to Markdown.
          - Simplify content via the LLM.
          - Append the result to a combined Markdown string.
        Returns:
            str: The combined Markdown content.
        """
        ipynb_files = self.get_ipynb_files_from_globs()
        output = ""
        total_tokens_used = 0
        for ipynb_file in ipynb_files:
            if not os.path.exists(ipynb_file):
                print(f"⚠ File not found: {ipynb_file}")
                continue
            print(f"Processing {ipynb_file}...")
            # Convert notebook to Markdown
            md_content = self.convert_ipynb_to_md(ipynb_file)
            prompt_text = self.get_prompt_for_ipynb(ipynb_file)
            simplified_content, tokens_used = simplify_content_with_llm(
                prompt_text, md_content, self.llm
            )
            file_title = os.path.basename(ipynb_file).replace(".ipynb", "")
            print(f"💰 Tokens Used {tokens_used}")
            total_tokens_used += tokens_used
            source_link = ipynb_file.replace(
                self.clone_dir, f"{self.clone_url}/blob/main"
            )
            ipynb_output = (
                f"# IPYNB Notebook: {file_title} [Source Link]({format_url(source_link)})\n\n"
                + simplified_content
                + "\n\n---\n\n"
            )
            if self.output_fragments:
                os.makedirs(self.output_fragments, exist_ok=True)
                file_name = f"{(file_title.replace('-', '_').replace(' ', '_').strip('/')) or 'index'}.txt"
                print("this is file_name", file_name)
                ipynb_output_file_path = os.path.join(
                    self.output_fragments,
                    file_name,
                )
                with open(ipynb_output_file_path, "w") as f:
                    f.write(ipynb_output)
            output += ipynb_output
        print(f" 💰 💰 Tokens Used : {total_tokens_used}")
        return output
if __name__ == "__main__":
    # Load configuration from YAML
    config = load_config_yaml().get("examples_context", {})
    clone_dir = config.get("clone_dir")
    clone_url = config.get("clone_url")
    # Retrieve the LLM parameter from the config; default to "gemini" if not provided.
    llm = config.get("llm", "gemini")
    # IPYNB configuration
    ipynb_include = config.get("include", [])
    ipynb_exclude = config.get("exclude", [])
    ipynb_prompts = config.get("prompts", {})
    ipynb_output_file = config.get("output_file", "")
    ipynb_output_fragments = config.get("output_fragments", "")
    # Process IPYNB files
    ipynb_handler = IPYNBHandler(
        ipynb_include,
        ipynb_exclude,
        ipynb_prompts,
        ipynb_output_fragments,
        clone_dir,
        clone_url,
        llm,
    )
    ipynb_content = ipynb_handler.process()
    # Save IPYNB content to the desired output file
    if ipynb_output_file:
        os.makedirs(os.path.dirname(ipynb_output_file), exist_ok=True)
        with open(ipynb_output_file, "w", encoding="utf-8") as f:
            f.write(ipynb_content)
        print(f"✔ IPYNB content saved in {ipynb_output_file}")