# process_docs.py • 7.45 kB
import os
import fnmatch
import json
import yaml
from firecrawl import FirecrawlApp
import context.utils as utils  # Your custom LLM utilities
def simplify_content_with_llm(prompt_text, text, llm):
    """
    Ask an LLM to produce a simplified version of Markdown content.

    When no prompt is supplied (empty or None), a built-in generic
    "simplify this Markdown" prompt is used instead.

    Parameters:
        prompt_text (str): Instructions that guide the simplification.
        text (str): The original Markdown content.
        llm (str): The language model identifier, forwarded to
            ``utils.get_llm_output`` as the ``llm`` keyword.
    Returns:
        tuple: ``(response, usage)`` - the ``"response"`` and ``"usage"``
        entries of the mapping returned by ``utils.get_llm_output``.
    """
    fallback_prompt = """
        Simplify the following Markdown content.
        Remove fluff and keep only key technical details.
        Remove any extraneous buttons or sections.
        """
    effective_prompt = prompt_text if prompt_text else fallback_prompt
    result = utils.get_llm_output(effective_prompt, text, llm=llm)
    response = result["response"]
    usage = result["usage"]
    return response, usage
def load_config_yaml():
    """
    Load ``config.yaml`` from the current working directory.

    Returns:
        The parsed YAML document (normally a mapping). An empty dict is
        returned when the document is empty, so callers can safely chain
        ``.get(...)`` on the result.
    Raises:
        OSError: If ``config.yaml`` cannot be opened.
    """
    with open("config.yaml", "r", encoding="utf-8") as f:
        # safe_load yields None for an empty document; normalize that to an
        # empty mapping instead of letting callers crash on None.get(...).
        return yaml.safe_load(f) or {}
class DocsHandler:
    """
    Handles the processing of documentation files.

    Reads a JSON documentation tree, filters its entries with include/exclude
    glob patterns, scrapes every selected page to Markdown, simplifies it via
    an LLM and concatenates the results into one Markdown string.
    """

    def __init__(
        self,
        include_patterns,
        exclude_patterns,
        prompt_config,
        docs_output_fragments,
        base_url,
        doc_tree_file,
        llm,
    ):
        """
        Parameters:
            include_patterns: fnmatch patterns; when non-empty, only entries
                whose tree path matches at least one pattern are processed.
            exclude_patterns: fnmatch patterns; entries whose tree path
                matches any of them are skipped.
            prompt_config: Mapping with the optional keys ``prompt_folder``,
                ``default_prompt`` and ``custom_prompts`` (a list of mappings
                with ``pattern`` and ``prompt`` keys).
            docs_output_fragments: Folder that receives one file per
                processed document; falsy to disable fragment files.
            base_url (str): Prefix for hrefs that start with "/".
            doc_tree_file (str): Path of the JSON documentation tree.
            llm (str): The language model identifier to use.
        """
        self.include_patterns = include_patterns
        self.exclude_patterns = exclude_patterns
        self.prompt_config = prompt_config
        self.base_url = base_url
        self.doc_tree_file = doc_tree_file
        self.docs_output_fragments = docs_output_fragments
        self.llm = llm

    def traverse_doc_tree(self, doc_tree, parent_path=""):
        """
        Traverse a documentation tree structure and return a list of tuples (full_path, href, element).

        Each node is a mapping with an "element" key and optional "href" and
        "children" keys. ``full_path`` is the node's "element" joined onto the
        path of its ancestors; a missing "href" is reported as "".
        Entries are returned in depth-first order, parents before children.
        """
        results = []
        for node in doc_tree:
            current = (
                os.path.join(parent_path, node["element"])
                if parent_path
                else node["element"]
            )
            results.append((current, node.get("href", ""), node["element"]))
            if node.get("children"):
                results.extend(self.traverse_doc_tree(node["children"], current))
        return results

    def scrape_document_to_md(self, url):
        """
        Scrape a document URL and return its Markdown content.

        The Firecrawl API key is read from the FIRECRAWL_API_KEY environment
        variable.

        Raises:
            RuntimeError: If FIRECRAWL_API_KEY is not set.
        """
        api_key = os.getenv("FIRECRAWL_API_KEY")
        if not api_key:
            raise RuntimeError("Firecrawl API Key missing")
        app = FirecrawlApp(api_key=api_key)
        return app.scrape_url(url=url, formats=["markdown"]).markdown

    def get_prompt_for_identifier(self, identifier):
        """
        Determine which prompt to use based on the document identifier and prompt configuration.

        The default prompt file is used unless a custom prompt pattern matches
        the identifier; when several patterns match, the last one wins.

        Returns:
            str: The content of the selected prompt file, or "" (after
            printing a warning) when that file does not exist.
        """
        prompt_config = self.prompt_config or {}
        # "or" fallbacks also cover keys that are present but null in the config.
        prompt_folder = prompt_config.get("prompt_folder") or ""
        selected_prompt = prompt_config.get("default_prompt") or ""
        for entry in prompt_config.get("custom_prompts") or []:
            pattern = entry.get("pattern")
            prompt_file = entry.get("prompt")
            if pattern and prompt_file and fnmatch.fnmatch(identifier, pattern):
                selected_prompt = prompt_file
        full_prompt_path = os.path.join(prompt_folder, selected_prompt)
        # isfile (not exists): with no prompt name the joined path is the
        # prompt folder itself, which exists but cannot be opened as a file.
        if os.path.isfile(full_prompt_path):
            with open(full_prompt_path, "r", encoding="utf-8") as f:
                return f.read()
        print(
            f"Warning: Prompt file {full_prompt_path} not found. Using empty prompt."
        )
        return ""

    def _is_selected(self, full_path):
        """Return True when full_path passes the include and exclude patterns."""
        if self.include_patterns and not any(
            fnmatch.fnmatch(full_path, pat) for pat in self.include_patterns
        ):
            return False
        if self.exclude_patterns and any(
            fnmatch.fnmatch(full_path, pat) for pat in self.exclude_patterns
        ):
            return False
        return True

    def _build_url(self, href):
        """Resolve an href: "/"-prefixed hrefs are appended to base_url, others are used as-is."""
        if href.startswith("/"):
            return self.base_url.rstrip("/") + href
        return href

    def _write_fragment(self, href, doc_output):
        """Write one document's output to its own file in the fragments folder."""
        file_name = f"{(href.replace('-', '_').strip('/')) or 'index'}.txt"
        print("this is file_name", file_name)
        doc_output_file_path = os.path.join(self.docs_output_fragments, file_name)
        # A nested href such as "/guide/intro" yields a nested file name, so
        # create the file's parent folders, not just the fragments root.
        os.makedirs(os.path.dirname(doc_output_file_path), exist_ok=True)
        with open(doc_output_file_path, "w", encoding="utf-8") as f:
            f.write(doc_output)

    def process(self):
        """
        Process the documentation tree:
          - Traverse the doc tree to identify docs to include.
          - Scrape each doc URL.
          - Simplify its Markdown content via the LLM.
          - Append the result to a combined Markdown string.

        A document that cannot be scraped is kept in the output with an
        "Error fetching ..." message as its body; that message is not sent to
        the LLM. When a fragments folder is configured, each document is also
        written to its own file there.

        Returns:
            str: The combined Markdown content.
        """
        with open(self.doc_tree_file, "r", encoding="utf-8") as f:
            doc_tree = json.load(f)
        selected = [
            entry
            for entry in self.traverse_doc_tree(doc_tree)
            if self._is_selected(entry[0])
        ]
        sections = []
        total_tokens_used = 0
        for full_path, href, element in selected:
            # Build full URL based on the href value.
            url = self._build_url(href)
            try:
                content = self.scrape_document_to_md(url)
            except Exception as e:
                # Best effort: record the failure in the output and move on,
                # without spending LLM tokens on an error message.
                simplified = f"Error fetching {url}: {str(e)}"
                print(f"Warning: {simplified}")
            else:
                prompt_text = self.get_prompt_for_identifier(full_path)
                simplified, tokens_used = simplify_content_with_llm(
                    prompt_text, content, self.llm
                )
                print(f"💰 Tokens Used {tokens_used}")
                total_tokens_used += tokens_used
            # Link to the URL that was actually scraped.
            doc_output = (
                f"# {element} [Source Link]({url})\n\n"
                + simplified
                + "\n\n---\n\n"
            )
            if self.docs_output_fragments:
                self._write_fragment(href, doc_output)
            sections.append(doc_output)
        print(f"💰 💰 Total Tokens Used : {total_tokens_used}")
        return "".join(sections)
if __name__ == "__main__":
    # Load configuration from YAML. Both the whole document and the
    # "docs_context" section may be empty (None), so fall back to {}.
    config = (load_config_yaml() or {}).get("docs_context") or {}
    # Retrieve the LLM parameter from the config; default to "gemini" if not provided.
    llm = config.get("llm", "gemini")
    # Docs configuration
    docs_include = config.get("include", [])
    docs_exclude = config.get("exclude", [])
    docs_prompts = config.get("prompts", {})
    docs_output_fragments = config.get("output_fragments")
    docs_output_file = config.get("output_file", "")
    docs_base_url = config.get("base_url", "")
    docs_tree_file = config.get("tree_file", "")
    # Process Docs
    docs_handler = DocsHandler(
        docs_include,
        docs_exclude,
        docs_prompts,
        docs_output_fragments,
        docs_base_url,
        docs_tree_file,
        llm,
    )
    docs_content = docs_handler.process()
    # Save Docs content to the desired output file
    if docs_output_file:
        # A bare file name has an empty dirname, and os.makedirs("") raises,
        # so only create the folder when the path actually contains one.
        output_dir = os.path.dirname(docs_output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(docs_output_file, "w", encoding="utf-8") as f:
            f.write(docs_content)
        print(f"✔ Docs content saved in {docs_output_file}")