"""
Upload extractor tool that uses Gemini LLM analyzers and Perplexity to build startup_text.
Workflow:
- Accepts a list of local files with metadata (name/type/extension/path).
- Uses the appropriate Gemini analyzer (text, image, audio, video, document) to extract
structured textual context per file.
- Calls Perplexity multiple times to synthesize a comprehensive startup_text string from
the collected contexts.
"""
from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Tuple
from ..core.gemini_client import GeminiLLM
from .perplexity_search import PerplexityMCPTool
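# Collaborator interfaces this module relies on (defined in ..core.gemini_client
# and .perplexity_search): GeminiLLM exposes predict() plus document_analyzer,
# image_analyzer, audio_analyzer, and video_analyzer whose predict() returns a
# dict with a "text" key; PerplexityMCPTool.search_perplexity() returns a dict
# with "answer", "sources", and an optional "error" key.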

_EXTENSION_TO_MIME: Dict[str, str] = {
    "pdf": "application/pdf",
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    # Markdown, CSV, and JSON are deliberately mapped to text/plain so they take
    # the lightweight plain-text path in extract_documents().
    "txt": "text/plain",
    "md": "text/plain",
    "csv": "text/plain",
    "json": "text/plain",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
    "mp3": "audio/mp3",
    "wav": "audio/wav",
    "mp4": "video/mp4",
    "mov": "video/quicktime",
}


def _guess_mime_from_extension(ext: str) -> str:
    """Map a file extension (with or without a leading dot) to a MIME type.

    Unknown extensions fall back to application/octet-stream, which routes the
    file through the document analyzer in extract_documents().
    """
    return _EXTENSION_TO_MIME.get((ext or "").lower().lstrip("."), "application/octet-stream")

class UploadExtractor:
"""Extracts textual context from uploaded files and synthesizes startup_text."""
def __init__(self, llm_client: GeminiLLM):
self.llm = llm_client
def _extract_text_from_plainfile(self, file_path: str) -> str:
try:
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
raw = f.read()
except Exception:
raw = ""
if not raw:
return ""
prompt = (
"Extract key startup-relevant details from the following text. "
"Return a concise summary covering company, product, customers, metrics, risks, and plans.\n\n" + raw
)
        try:
            resp = self.llm.predict(
                system_message="You are a precise analyst extracting startup-relevant context. Maintain professional language and avoid inappropriate content. Focus strictly on business information.",
                user_message=prompt,
            )
            return resp.get("response", "")
        except Exception as e:
            return f"Text analysis failed: {str(e)}"
def _extract_from_document(self, file_path: str, mime_type: str) -> str:
try:
result = self.llm.document_analyzer.predict(
document_path=file_path,
prompt=(
"Extract ALL text content from this document completely. Perform OCR on any images, "
"charts, tables, or visual elements to extract text. Present tables in a structured "
"text format with clear column headers and data. For charts and graphs, describe "
"the data and extract any visible text or numbers. Include all headers, footers, "
"captions, and annotations. After extracting all content, provide a comprehensive "
"summary focusing on startup-relevant details: company information, market analysis, "
"product details, traction metrics, financials, team information, risks, and roadmap. "
"Maintain professional language and avoid inappropriate content. Focus strictly on business information."
),
mime_type=mime_type or "application/pdf",
)
return result.get("text", "")
except Exception as e:
return f"Document analysis failed: {str(e)}"
def _extract_from_image(self, file_path: str, mime_type: str) -> str:
try:
result = self.llm.image_analyzer.predict(
image_input=file_path,
prompt=(
"Perform OCR to extract ALL text visible in this image, including text overlays, "
"labels, captions, and any written content. If there are tables, present them in "
"a structured text format with clear column headers and data. If there are charts, "
"graphs, or diagrams, describe the visual data and extract any numbers, percentages, "
"or metrics shown. Identify and describe all visual elements including: product screenshots, "
"logos, team photos, charts, graphs, tables, infographics, and any other visual content. "
"Provide a comprehensive analysis focusing on startup-relevant information: company details, "
"product features, metrics, team information, and business data. "
"Maintain professional language and avoid inappropriate content. Focus strictly on business information."
),
mime_type=mime_type or "image/jpeg",
)
return result.get("text", "")
except Exception as e:
return f"Image analysis failed: {str(e)}"
def _extract_from_audio(self, file_path: str, mime_type: str) -> str:
try:
result = self.llm.audio_analyzer.predict(
audio_input=file_path,
prompt=(
"Transcribe this audio content completely. Provide a word-for-word transcription "
"of all spoken content, including any pauses, speaker changes, or background audio. "
"If multiple speakers are present, identify them clearly. After the transcription, "
"provide a summary focusing on startup-relevant details: company information, "
"product details, traction metrics, financials, risks, and future roadmap."
),
mime_type=mime_type or "audio/mp3",
)
return result.get("text", "")
except Exception as e:
return f"Audio analysis failed: {str(e)}"
def _extract_from_video(self, file_path: str, mime_type: str) -> str:
try:
result = self.llm.video_analyzer.predict(
video_input=file_path,
prompt=(
"First, transcribe all spoken content in this video completely. Provide a word-for-word "
"transcription of all dialogue, including speaker identification if multiple speakers are present. "
"Then, analyze and describe all visual information including: text overlays, charts, graphs, "
"tables, diagrams, slides, product screenshots, logos, and any other visual elements. "
"Present tables and charts in a structured text format. Extract any text visible in the video. "
"Finally, provide a comprehensive summary focusing on startup-relevant details: company information, "
"product details, traction metrics, financials, risks, and future roadmap."
),
mime_type=mime_type or "video/mp4",
)
return result.get("text", "")
except Exception as e:
return f"Video analysis failed: {str(e)}"
def extract_documents(self, uploads: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract content from a list of uploaded local files.
Each upload dict should have keys: filename, file_extension, local_path, filetype (optional).
"""
documents: List[Dict[str, Any]] = []
for u in uploads:
            local_path = u.get("local_path")
            if not local_path:
                # Nothing on disk to read; skip rather than hand None to an analyzer.
                continue
            filename = u.get("filename") or os.path.basename(local_path)
            ext = (u.get("file_extension") or os.path.splitext(filename)[1].lstrip(".")).lower()
            filetype = u.get("filetype") or ""
            mime = _guess_mime_from_extension(ext)
            extracted = ""
if mime.startswith("image/"):
extracted = self._extract_from_image(local_path, mime)
elif mime.startswith("video/"):
extracted = self._extract_from_video(local_path, mime)
elif mime.startswith("audio/"):
extracted = self._extract_from_audio(local_path, mime)
elif mime == "text/plain":
extracted = self._extract_text_from_plainfile(local_path)
else:
# Treat as document by default
extracted = self._extract_from_document(local_path, mime)
documents.append({
"name": filename,
"type": filetype or mime,
"extension": ext,
"mime_type": mime,
"content": extracted,
})
return documents
def synthesize_startup_text(self, documents: List[Dict[str, Any]]) -> str:
"""Run 10 Perplexity calls in parallel (one per section) and concatenate results."""
result = self.synthesize_startup_text_with_sources(documents)
return result["text"]
def synthesize_startup_text_with_sources(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Run 10 Perplexity calls in parallel (one per section) and return both text and sources."""
if not documents:
return {"text": "", "sources": []}
# Build a compact context string
pieces: List[str] = []
for doc in documents:
name = doc.get("name")
dtype = doc.get("type")
content = (doc.get("content") or "").strip()
if not content:
continue
pieces.append(f"Document: {name} (type: {dtype})\nContent:\n{content}\n---\n")
base_context = "\n".join(pieces)
try:
print(f"[Extractor] Documents with content: {len(pieces)}; base_context_len={len(base_context)}")
except Exception:
pass
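        # Each tuple below is (section_number, section_title, writing_instruction);
        # the number fixes the output order even though calls complete out of order.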
sections: List[Tuple[int, str, str]] = [
(
1,
                "Startup name, industry, founder names, founding year, location, and stage",
(
"Provide a precise, structured paragraph covering these fields. Include known brand names, "
"legal entity (if any), website/domain, and short one-line positioning."
),
),
(
2,
"Business model",
(
"Explain clearly how the startup makes money (pricing, plans, ARPU, sales motion, channels). "
"Include ICP (ideal customer profile), buyer persona, and key activation/retention levers."
),
),
(
3,
"Product or what that startup is about",
(
"Describe the product and problem-solution. List core features, notable differentiators, "
"tech stack (if known), key integrations/APIs, and deployment (cloud/on-prem/mobile)."
),
),
(
4,
"Financial Information",
(
"Summarize revenue (MRR/ARR), growth, gross margin (if known), burn, runway, funding history, "
"valuation (if public), unit economics (CAC, LTV, payback), and notable KPIs."
),
),
(
5,
"Traction & Customers",
(
"Cover users/customers, cohorts, growth rates, churn, NRR, expansion/upsell, notable logos, "
"use cases, and adoption by segment/geo where known."
),
),
(
6,
"Team",
(
"Summarize founders and key team (names, prior roles/companies), headcount by function, "
"hiring plan, board/advisors (if any), and organizational strengths."
),
),
(
7,
"Market & Competition",
(
"Summarize TAM/SAM/SOM (with sources if available), market growth, demand drivers, "
"competitors (direct/indirect), a brief competition matrix, and differentiation/moat."
),
),
(
8,
"Recent News & Updates",
(
"List notable news, launches, partnerships, regulatory items, and product updates with dates "
"and links where possible."
),
),
(
9,
"Challenges & Risks",
(
"Identify top risks across product, market, competitive, legal, operational, financial, "
"and team categories, with 1-2 actionable recommendations each."
),
),
(
10,
"Future Plans",
(
"Summarize roadmap (near/medium term), GTM plans, hiring, geographic expansion, "
"and key milestones/OKRs."
),
),
]
# Build prompts per section
def build_prompt(title: str, instruction: str) -> str:
return (
"You are a research assistant with web access. Using the context below and web search where helpful, "
f"write the section: {title}. {instruction} If unknown, state unknown. Prefer precise data. "
"Keep to 200-300 words. Provide inline URLs if citing external info.\n\n"
"Context:\n" + base_context
)
        # Require Perplexity; do not fall back to Gemini
if not os.getenv("PERPLEXITY_API_KEY"):
raise RuntimeError("PERPLEXITY_API_KEY not set; cannot synthesize with Perplexity")
results_map: Dict[int, str] = {}
all_sources: List[Dict[str, str]] = []
# Run Perplexity calls in parallel
with ThreadPoolExecutor(max_workers=len(sections)) as executor:
future_to_idx = {}
for idx, title, instruction in sections:
prompt = build_prompt(title, instruction)
                # Create a fresh tool per call so no client state is shared across threads
                ppx = PerplexityMCPTool()
                future = executor.submit(ppx.search_perplexity, prompt)
future_to_idx[future] = idx
for future in as_completed(future_to_idx):
idx = future_to_idx[future]
try:
resp = future.result()
err = resp.get("error") if isinstance(resp, dict) else None
ans = (resp.get("answer") or "").strip() if isinstance(resp, dict) else ""
sources = resp.get("sources", []) if isinstance(resp, dict) else []
results_map[idx] = ans
# Collect sources from this section
if sources:
all_sources.extend(sources)
try:
print(f"[Extractor] Section {idx}: answer_len={len(ans)} error={bool(err)} sources={len(sources)}")
except Exception:
pass
            except Exception as exc:
                # Record an empty answer but keep going; other sections may still succeed.
                print(f"[Extractor] Section {idx} failed: {exc}")
                results_map[idx] = ""
# If Perplexity produced no content at all, raise an error
non_empty_sections = sum(1 for v in results_map.values() if (v or "").strip())
if non_empty_sections == 0:
raise RuntimeError("Perplexity returned no answers for any section")
# Concatenate in order with headings
ordered: List[str] = []
for idx, title, _ in sections:
section_text = results_map.get(idx, "")
header = f"{idx}. {title}"
ordered.append(header + "\n" + section_text)
synthesized_text = "\n\n".join(ordered).strip()
# Combine extracted content + synthesized sections to build final startup_text
combined = (
"Extracted Document Content\n" + base_context.strip() +
"\n\n---\n\n" +
"Synthesis\n" + synthesized_text
).strip()
# Deduplicate sources by URL
seen_urls = set()
unique_sources = []
for source in all_sources:
url = source.get("url")
if url and url not in seen_urls:
seen_urls.add(url)
unique_sources.append(source)
return {
"text": combined,
"sources": unique_sources
}
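

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the library API. It assumes GeminiLLM()
    # is constructible with no arguments and that PERPLEXITY_API_KEY is set; the
    # sample file path below is hypothetical.
    extractor = UploadExtractor(GeminiLLM())
    docs = extractor.extract_documents([
        {"filename": "pitch_deck.pdf", "file_extension": "pdf",
         "local_path": "/tmp/uploads/pitch_deck.pdf"},
    ])
    result = extractor.synthesize_startup_text_with_sources(docs)
    print(result["text"][:500])
    print(f"collected {len(result['sources'])} unique sources")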