RAGStack-Lambda

Overview Schema Related Servers Score Discussions

test_scraper_extractor.py•9.81 KiB

"""Unit tests for content extraction."""

import pytest

from ragstack_common.scraper.extractor import (
    ExtractedContent,
    add_frontmatter,
    extract_content,
    extract_title,
    find_main_content,
    html_to_markdown,
    sanitize_html,
)


class TestExtractTitle:
    """Tests for extract_title function."""

    def test_extracts_og_title(self):
        """An og:title meta tag is returned even when <title> and <h1> exist."""
        page = (
            " <html> <head> "
            '<meta property="og:title" content="OG Title Here"> '
            "<title>Regular Title</title> </head> "
            "<body><h1>H1 Title</h1></body> </html> "
        )
        assert extract_title(page) == "OG Title Here"

    def test_extracts_title_tag(self):
        """Without og:title, the <title> tag is returned rather than the <h1>."""
        page = (
            " <html> "
            "<head><title>Page Title</title></head> "
            "<body><h1>H1 Title</h1></body> "
            "</html> "
        )
        assert extract_title(page) == "Page Title"

    def test_extracts_h1_as_fallback(self):
        """With an empty <head>, the first <h1> text is returned."""
        page = (
            " <html> "
            "<head></head> "
            "<body><h1>Main Heading</h1></body> "
            "</html> "
        )
        assert extract_title(page) == "Main Heading"

    def test_returns_none_if_missing(self):
        """A page with no og:title, <title> or <h1> yields None."""
        page = "<html><head></head><body><p>No title here</p></body></html>"
        assert extract_title(page) is None

    def test_strips_whitespace(self):
        """Surrounding whitespace inside <title> is trimmed."""
        page = "<html><head><title> Spaced Title </title></head></html>"
        assert extract_title(page) == "Spaced Title"


class TestSanitizeHtml:
    """Tests for sanitize_html function."""

    def test_removes_script_tags(self):
        """<script> elements are dropped while sibling content survives."""
        cleaned = sanitize_html(
            "<html><body><script>alert('hi')</script><p>Content</p></body></html>"
        )
        assert cleaned.find("script") is None
        assert "Content" in cleaned.get_text()

    def test_removes_style_tags(self):
        """<style> elements are dropped."""
        cleaned = sanitize_html(
            "<html><body><style>.red{color:red}</style><p>Content</p></body></html>"
        )
        assert cleaned.find("style") is None

    def test_removes_nav(self):
        """<nav> elements are dropped while sibling content survives."""
        cleaned = sanitize_html(
            "<html><body><nav><a>Link</a></nav><p>Content</p></body></html>"
        )
        assert cleaned.find("nav") is None
        assert "Content" in cleaned.get_text()

    def test_removes_footer(self):
        """<footer> elements are dropped."""
        cleaned = sanitize_html(
            "<html><body><p>Content</p><footer>Footer stuff</footer></body></html>"
        )
        assert cleaned.find("footer") is None

    def test_removes_header(self):
        """<header> elements are dropped."""
        cleaned = sanitize_html(
            "<html><body><header>Header stuff</header><p>Content</p></body></html>"
        )
        assert cleaned.find("header") is None

    def test_preserves_main_content(self):
        """<main> and its text are kept."""
        cleaned = sanitize_html(
            "<html><body><main><p>Main content</p></main></body></html>"
        )
        assert cleaned.find("main") is not None
        assert "Main content" in cleaned.get_text()

    def test_preserves_article(self):
        """<article> elements are kept."""
        cleaned = sanitize_html(
            "<html><body><article><p>Article content</p></article></body></html>"
        )
        assert cleaned.find("article") is not None

    def test_removes_navigation_role(self):
        """Elements carrying role="navigation" are dropped."""
        cleaned = sanitize_html(
            '<html><body><div role="navigation">Nav</div><p>Content</p></body></html>'
        )
        assert cleaned.find(attrs={"role": "navigation"}) is None

    def test_removes_sidebar_class(self):
        """Elements carrying class="sidebar" are dropped."""
        cleaned = sanitize_html(
            '<html><body><div class="sidebar">Sidebar</div><p>Content</p></body></html>'
        )
        assert cleaned.find(class_="sidebar") is None


class TestFindMainContent:
    """Tests for find_main_content function."""

    def test_finds_main_element(self):
        """The selected region contains the text held by <main>."""
        page = (
            " <html><body> "
            "<main><p>This is the main content that should be extracted.</p></main> "
            "<aside>Sidebar</aside> "
            "</body></html> "
        )
        region = find_main_content(sanitize_html(page))
        assert "main content" in region.get_text().lower()

    def test_finds_article_element(self):
        """The selected region contains the text held by <article>."""
        page = (
            " <html><body> "
            "<article><p>This is an article with enough content to be found.</p></article> "
            "</body></html> "
        )
        region = find_main_content(sanitize_html(page))
        assert "article" in region.get_text().lower()

    def test_finds_role_main(self):
        """The selected region contains the text of a role="main" element."""
        page = (
            " <html><body> "
            '<div role="main"><p>Main content area with enough text to be found.</p></div> '
            "</body></html> "
        )
        region = find_main_content(sanitize_html(page))
        assert "Main content" in region.get_text()

    def test_falls_back_to_body(self):
        """With no semantic container, the <body> element itself is returned."""
        page = (
            " <html><body> "
            "<p>Just some content without semantic elements.</p> "
            "</body></html> "
        )
        region = find_main_content(sanitize_html(page))
        assert region.name == "body"


class TestHtmlToMarkdown:
    """Tests for html_to_markdown function."""

    def test_converts_headings(self):
        """<h1>/<h2> become '#'/'##' headings."""
        rendered = html_to_markdown("<h1>Title</h1><h2>Subtitle</h2><p>Paragraph</p>")
        assert "# Title" in rendered
        assert "## Subtitle" in rendered

    def test_converts_lists(self):
        """Unordered list items become '- ' bullets."""
        rendered = html_to_markdown("<ul><li>Item 1</li><li>Item 2</li></ul>")
        assert "- Item 1" in rendered
        assert "- Item 2" in rendered

    def test_converts_ordered_lists(self):
        """Ordered list items are numbered."""
        rendered = html_to_markdown("<ol><li>First</li><li>Second</li></ol>")
        # markdownify uses numbers
        assert any(marker in rendered for marker in ("1. First", "1."))

    def test_converts_paragraphs(self):
        """Text of every paragraph is present in the output."""
        rendered = html_to_markdown("<p>First paragraph</p><p>Second paragraph</p>")
        assert "First paragraph" in rendered
        assert "Second paragraph" in rendered

    def test_preserves_code_blocks(self):
        """Code inside <pre><code> is still present after conversion."""
        rendered = html_to_markdown(
            '<pre><code class="language-python">print("hello")</code></pre>'
        )
        assert "print" in rendered
        assert "hello" in rendered

    def test_converts_inline_code(self):
        """Inline <code> is wrapped in backticks."""
        rendered = html_to_markdown("<p>Use the <code>print()</code> function</p>")
        assert "`print()`" in rendered

    def test_removes_excessive_whitespace(self):
        """Runs of blank lines in the input are collapsed."""
        rendered = html_to_markdown("<p>First</p>\n\n\n\n<p>Second</p>")
        # Should not have more than 2 consecutive newlines
        assert "\n\n\n" not in rendered

    def test_empty_html(self):
        """Empty input converts to an empty string."""
        assert html_to_markdown("") == ""


class TestAddFrontmatter:
    """Tests for add_frontmatter function."""

    def test_adds_source_url(self):
        """The result opens with a frontmatter fence holding source_url."""
        stamped = add_frontmatter("# Hello\n\nContent here", "https://example.com/page")
        assert "source_url: https://example.com/page" in stamped
        assert stamped.startswith("---")
        # Frontmatter ends with ---\n then content
        assert "---\n# Hello" in stamped

    def test_adds_title_if_provided(self):
        """A supplied title is emitted as a double-quoted title field."""
        stamped = add_frontmatter("Content", "https://example.com", title="My Title")
        assert 'title: "My Title"' in stamped

    def test_escapes_quotes_in_title(self):
        """Double quotes inside the title are backslash-escaped."""
        stamped = add_frontmatter(
            "Content", "https://example.com", title='Title with "quotes"'
        )
        assert 'title: "Title with \\"quotes\\""' in stamped

    def test_adds_scraped_at(self):
        """A scraped_at field is always present."""
        stamped = add_frontmatter("Content", "https://example.com")
        assert "scraped_at:" in stamped


class TestExtractContent:
    """Tests for extract_content function."""

    def test_full_extraction_pipeline(self):
        """Title, URL and main text are captured; nav and footer text are not."""
        page = (
            " <html> "
            "<head><title>Test Page</title></head> "
            "<body> "
            "<nav><a>Menu</a></nav> "
            "<main> "
            "<h1>Main Heading</h1> "
            "<p>This is the main content of the page that should be extracted.</p> "
            "</main> "
            "<footer>Copyright 2024</footer> "
            "</body> "
            "</html> "
        )
        extracted = extract_content(page, "https://example.com/test")

        assert isinstance(extracted, ExtractedContent)
        assert extracted.title == "Test Page"
        assert extracted.source_url == "https://example.com/test"

        document = extracted.markdown
        assert "source_url: https://example.com/test" in document
        assert "Main Heading" in document
        assert "main content" in document.lower()
        assert "Menu" not in document  # Nav should be removed
        assert "Copyright" not in document  # Footer should be removed

    def test_preserves_code_blocks(self):
        """Code inside <pre><code> survives the full pipeline."""
        page = (
            " <html> "
            "<head><title>Code Page</title></head> "
            "<body> "
            "<main> "
            "<p>Here is some code:</p> "
            '<pre><code class="language-python">def hello(): print("world")</code></pre> '
            "</main> "
            "</body> "
            "</html> "
        )
        document = extract_content(page, "https://example.com/code").markdown
        assert "print" in document
        assert "hello" in document

    def test_counts_words(self):
        """word_count is positive for a page that has text."""
        page = (
            " <html> "
            "<head><title>Test</title></head> "
            "<body><main><p>One two three four five six seven.</p></main></body> "
            "</html> "
        )
        extracted = extract_content(page, "https://example.com")
        assert extracted.word_count > 0

    def test_handles_empty_html(self):
        """Empty input gets the title "Untitled" and keeps the source URL."""
        extracted = extract_content("", "https://example.com")
        assert extracted.title == "Untitled"
        assert extracted.source_url == "https://example.com"

    def test_handles_malformed_html(self):
        """Badly nested tags do not raise and still yield markdown."""
        broken = "<p>Unclosed paragraph<div>Nested wrong</p></div>"
        extracted = extract_content(broken, "https://example.com")
        # Should not raise, should produce some output
        assert extracted.markdown is not None


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/HatmanStack/RAGStack-Lambda'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_scraper_extractor.py•9.81 KiB