Scrapy MCP Server

scrapy-mcp
tests
integration

test_langchain_blog_conversion.py•3.52 KiB

#!/usr/bin/env python3 """Integration test for LangChain blog conversion to verify paragraph formatting.""" import pytest import tempfile from extractor.scraper import WebScraper from extractor.markdown_converter import MarkdownConverter @pytest.mark.asyncio async def test_langchain_blog_conversion(): """Test conversion of the LangChain blog with different methods.""" url = "https://blog.langchain.com/context-engineering-for-agents/" scraper = WebScraper() converter = MarkdownConverter() print(f"Testing URL: {url}") print("=" * 80) # Test with different scraping methods methods = ["simple", "scrapy", "selenium"] for method in methods: print(f"\n🔍 Testing with method: {method}") print("-" * 40) try: scrape_result = await scraper.scrape_url(url=url, method=method) if "error" in scrape_result: print(f"❌ Scraping failed: {scrape_result['error']}") continue # Show what we got from scraping content = scrape_result.get("content", {}) print(f"Content keys: {list(content.keys())}") html_content = content.get("html", "") text_content = content.get("text", "") print(f"HTML length: {len(html_content) if html_content else 0}") print(f"Text length: {len(text_content) if text_content else 0}") # Convert to Markdown result = converter.convert_webpage_to_markdown( scrape_result=scrape_result, extract_main_content=True, include_metadata=True, ) if result.success: markdown = result.markdown_content lines = markdown.split("\n") empty_lines = [i for i, line in enumerate(lines) if line.strip() == ""] print(f"✅ Conversion successful!") print(f" Total lines: {len(lines)}") print(f" Empty lines: {len(empty_lines)}") print(f" Total chars: {len(markdown)}") # Show first few lines to check structure print(f"\nFirst 10 lines:") for i, line in enumerate(lines[:10]): print(f" {i:2d}: {repr(line[:80])}") # Save to temp file for inspection with tempfile.NamedTemporaryFile( mode="w", suffix=f"_langchain_blog_{method}.md", delete=False, encoding="utf-8", ) as f: f.write(markdown) filename = f.name print(f"💾 Saved to {filename}") # Check if it looks properly formatted if len(empty_lines) > 10 and len(lines) > 50: print("✅ Appears to have proper paragraph structure!") else: print("❌ Still appears to be poorly formatted") # Show a sample middle section if len(lines) > 50: print(f"\nSample from middle (lines 25-35):") for i in range(25, min(35, len(lines))): print(f" {i:2d}: {repr(lines[i][:80])}") else: print(f"❌ Conversion failed: {result.get('error')}") except Exception as e: print(f"❌ Error with {method}: {str(e)}") print(f"\n🏁 Testing complete!") if __name__ == "__main__": import asyncio asyncio.run(test_langchain_blog_conversion())

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_langchain_blog_conversion.py•3.52 KiB