content-core

Overview Schema Related Servers Score Discussions

content-core
tests
integration

test_extraction.py•13.1 KiB

"""Integration tests for ``content_core.content.extraction.extract_content``.

Each test builds an input mapping (raw text, a URL, or a path to a fixture
file), awaits ``extract_content`` and checks the attributes of the result
(``source_type``, ``identified_type``, ``title``, ``content``).

File-based tests skip themselves when the fixture file is absent. URL-based
tests pass live URLs to ``extract_content`` and so presumably need network
access -- NOTE(review): confirm before running in an offline CI job.
"""

import os
from pathlib import Path
from unittest.mock import patch

import pytest

from content_core.content.extraction import extract_content  # type: ignore

# Known URL from the notebook example; shared by every URL-engine test.
EXAMPLE_URL = "https://www.supernovalabs.com"


@pytest.fixture
def fixture_path():
    """Provides the path to the directory containing test input files."""
    return Path(__file__).parent.parent / "input_content"


@pytest.mark.asyncio
async def test_extract_content_from_text():
    """Tests content extraction from a raw text string."""
    input_data = {"content": "My sample content for testing."}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "text"
    assert "My sample content for testing." in result.content
    assert result.title == ""  # Or based on actual behavior


@pytest.mark.asyncio
async def test_extract_content_from_url(fixture_path):
    """Tests content extraction from a URL using the ``simple`` engine."""
    input_data = {"url": EXAMPLE_URL, "url_engine": "simple"}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "url"
    # Check for expected title and content snippets based on notebook output
    assert "Supernova Labs" in result.title
    assert "AI Consulting" in result.title
    # assert "Supernova Labs" in result.content
    # assert "AI Opportunity Map" in result.content  # Example snippet


@pytest.mark.asyncio
async def test_extract_content_from_url_firecrawl(fixture_path):
    """Tests content extraction from a URL using the ``firecrawl`` engine."""
    # Skip (rather than fail) when the optional dependency is missing; this
    # mirrors how the Crawl4AI tests in this module guard their dependency.
    pytest.importorskip("firecrawl", reason="Firecrawl not installed")

    input_data = {"url": EXAMPLE_URL, "url_engine": "firecrawl"}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "url"
    # Check for expected title and content snippets based on notebook output
    assert "Supernova Labs" in result.title
    assert "AI Consulting" in result.title
    # Check that content was extracted and contains relevant keywords
    assert len(result.content) > 100
    assert "AI" in result.content


@pytest.mark.asyncio
async def test_extract_content_from_url_jina(fixture_path):
    """Tests content extraction from a URL using the ``jina`` engine."""
    input_data = {"url": EXAMPLE_URL, "url_engine": "jina"}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "url"
    # Check for expected title and content snippets based on notebook output
    assert "Supernova Labs" in result.title
    # Check that content was extracted and contains relevant keywords
    assert len(result.content) > 100
    assert "AI" in result.content


@pytest.mark.asyncio
async def test_extract_content_from_url_crawl4ai(fixture_path):
    """Tests content extraction from a URL using Crawl4AI."""
    pytest.importorskip("crawl4ai", reason="Crawl4AI not installed")

    input_data = {"url": EXAMPLE_URL, "url_engine": "crawl4ai"}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "url"
    # Check for expected title and content snippets based on notebook output
    assert "Supernova Labs" in result.title
    assert "AI Consulting" in result.title
    # Check that content was extracted and contains relevant keywords
    assert len(result.content) > 100
    assert "AI" in result.content


@pytest.mark.asyncio
async def test_extract_content_from_mp4(fixture_path):
    """Tests content extraction (transcript) from an MP4 file."""
    mp4_file = fixture_path / "file.mp4"  # Ensure the user adds this file
    if not mp4_file.exists():
        pytest.skip(f"Fixture file not found: {mp4_file}")

    input_data = {"file_path": str(mp4_file)}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "file"
    assert result.title == "file.mp4"
    assert result.identified_type == "audio/mp3"  # Expect audio/mp3 after extraction
    assert "welcome" in result.content.lower()  # Check for expected word


@pytest.mark.asyncio
@pytest.mark.xfail(
    reason="Event loop cleanup issue with httpx when running after other audio tests. "
    "This is a known pytest-asyncio + httpx interaction issue that doesn't affect functionality.",
    strict=False
)
async def test_extract_content_from_mp3(fixture_path):
    """Tests content extraction (transcript) from an MP3 file."""
    mp3_file = fixture_path / "file.mp3"  # Ensure the user adds this file
    if not mp3_file.exists():
        pytest.skip(f"Fixture file not found: {mp3_file}")

    input_data = {"file_path": str(mp3_file)}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "file"
    assert result.title == "file.mp3"
    assert result.identified_type == "audio/mpeg"  # Expect audio/mpeg after extraction
    assert "welcome" in result.content.lower()  # Check for expected word


@pytest.mark.asyncio
async def test_extract_content_from_markdown(fixture_path):
    """Tests content extraction from a Markdown file."""
    md_file = fixture_path / "file.md"  # Ensure the user adds this file
    if not md_file.exists():
        pytest.skip(f"Fixture file not found: {md_file}")

    input_data = {"file_path": str(md_file)}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "file"
    assert result.title == "file.md"
    assert result.identified_type == "text/plain"  # Expect text/plain for MD files
    assert "Buenos Aires" in result.content  # Check for expected text


@pytest.mark.asyncio
async def test_extract_content_from_epub(fixture_path):
    """Tests content extraction from an EPUB file."""
    epub_file = fixture_path / "file.epub"  # Ensure the user adds this file
    if not epub_file.exists():
        pytest.skip(f"Fixture file not found: {epub_file}")

    input_data = {"file_path": str(epub_file)}
    result = await extract_content(input_data)

    assert hasattr(result, "source_type")
    assert result.source_type == "file"
    assert result.title == "file.epub"
    assert (
        result.identified_type == "application/epub+zip"
    )  # Expect application/epub+zip for EPUB files
    assert "Wonderland" in result.content  # Check for expected text


@pytest.mark.asyncio
async def test_extract_content_from_youtube_url(fixture_path):
    """Tests extracting content from a YouTube URL."""
    # Use a different, more stable video URL
    youtube_url = "https://www.youtube.com/watch?v=pBy1zgt0XPc"

    result = await extract_content(dict(url=youtube_url))

    assert result.source_type == "url"
    assert result.identified_type == "youtube"  # Expect 'youtube' type
    assert "What is GitHub?" in result.title  # Check for expected title segment
    # Update keyword checks for the new video
    assert "github" in result.content.lower()
    assert "code" in result.content.lower()
    assert "git" in result.content.lower()  # Check for 'git'
    assert len(result.content) > 50  # Expecting a shorter transcript for this video


@pytest.mark.asyncio
async def test_extract_content_from_pdf(fixture_path):
    """Tests extracting content from a PDF file."""
    pdf_file = fixture_path / "file.pdf"
    if not pdf_file.exists():
        pytest.skip(f"Fixture file not found: {pdf_file}")

    result = await extract_content(dict(file_path=str(pdf_file)))

    assert result.source_type == "file"
    assert result.identified_type == "application/pdf"
    assert "Buenos Aires" in result.content  # Check for expected text
    assert result.title is not None  # Attempt to extract title/metadata
    assert len(result.content) > 0  # Check that some content was extracted


@pytest.mark.asyncio
async def test_extract_content_from_pptx(fixture_path):
    """Tests extracting content from a PPTX file."""
    pptx_file = fixture_path / "file.pptx"
    if not pptx_file.exists():
        pytest.skip(f"Fixture file not found: {pptx_file}")

    result = await extract_content(dict(file_path=str(pptx_file)))

    assert result.source_type == "file"
    assert (
        result.identified_type
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    )
    assert "MASTERNODE" in result.content  # Check for expected text
    assert result.title is not None  # Attempt to extract title/metadata
    assert len(result.content) > 0  # Check that some content was extracted


@pytest.mark.asyncio
async def test_extract_content_from_docx(fixture_path):
    """Tests extracting content from a DOCX file."""
    docx_file = fixture_path / "file.docx"
    if not docx_file.exists():
        pytest.skip(f"Fixture file not found: {docx_file}")

    result = await extract_content(dict(file_path=str(docx_file)))

    assert result.source_type == "file"
    assert (
        result.identified_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    assert "Buenos Aires" in result.content  # Check for expected text
    assert result.title is not None  # Attempt to extract title/metadata
    assert len(result.content) > 0  # Check that some content was extracted


@pytest.mark.asyncio
async def test_extract_content_from_xlsx(fixture_path):
    """Tests extracting content from a XLSX file."""
    xlsx_file = fixture_path / "file.xlsx"
    if not xlsx_file.exists():
        pytest.skip(f"Fixture file not found: {xlsx_file}")

    result = await extract_content(dict(file_path=str(xlsx_file), document_engine="simple"))

    assert result.source_type == "file"
    assert (
        result.identified_type
        == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
    assert result.title is not None  # Attempt to extract title/metadata
    assert len(result.content) > 0  # Check that some content was extracted


# NOTE(review): the docling-engine XLSX test below was already commented out;
# the reason is not recorded here. Kept verbatim so it can be re-enabled.
# @pytest.mark.asyncio
# async def test_extract_content_from_xlsx_docling(fixture_path):
#     """Tests extracting content from a XLSX file using docling engine."""
#     xlsx_file = fixture_path / "file.xlsx"
#     if not xlsx_file.exists():
#         pytest.skip(f"Fixture file not found: {xlsx_file}")
#     result = await extract_content(dict(file_path=str(xlsx_file), document_engine="docling"))
#     assert result.source_type == "file"
#     assert (
#         result.identified_type
#         == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
#     )
#     assert result.title is not None  # Attempt to extract title/metadata
#     assert len(result.content) > 0  # Check that some content was extracted


@pytest.mark.asyncio
async def test_extract_content_from_pdf_url():
    """Tests extracting content from a remote PDF URL."""
    url = "https://arxiv.org/pdf/2408.09869"

    result = await extract_content({"url": url})

    assert result.source_type == "url"
    assert result.identified_type == "application/pdf"
    assert len(result.content) > 100  # Expect substantial extracted text


@pytest.mark.asyncio
async def test_auto_mode_fallback_to_crawl4ai():
    """
    Tests that auto mode correctly falls back to Crawl4AI when Jina fails.

    This test verifies the fallback chain:
    1. Auto mode tries Jina first (when no FIRECRAWL_API_KEY)
    2. When Jina raises an exception, it should try Crawl4AI
    3. When Crawl4AI succeeds, content should be returned
    """
    pytest.importorskip(
        "crawl4ai",
        reason="Crawl4AI not installed - auto mode fallback test requires Crawl4AI",
    )

    test_url = EXAMPLE_URL
    input_data = {"url": test_url, "url_engine": "auto"}

    # patch.dict snapshots os.environ and restores it on exit, so the removal
    # of FIRECRAWL_API_KEY below cannot leak into other tests even if an
    # assertion fails. The Jina extractor is mocked to simulate an API failure.
    with patch.dict(os.environ), patch(
        "content_core.processors.url.extract_url_jina"
    ) as mock_jina:
        # Ensure FIRECRAWL_API_KEY is not set (so auto mode tries Jina first)
        os.environ.pop("FIRECRAWL_API_KEY", None)
        mock_jina.side_effect = Exception("Jina API error (mocked)")

        result = await extract_content(input_data)

        # Verify that the extraction succeeded (via Crawl4AI fallback)
        assert result is not None
        assert hasattr(result, "source_type")
        assert result.source_type == "url"

        # Verify content was successfully extracted
        assert len(result.content) > 100
        assert "AI" in result.content or "Supernova" in result.title

        # Verify that Jina was attempted (and failed)
        mock_jina.assert_called_once_with(test_url)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/lfnovo/content-core'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_extraction.py•13.1 KiB