MarkItDown MCP Server

Overview Schema Related Servers Score Discussions

markitdown-mcp
tests
integration

test_file_conversion.py•22.3 KiB

""" End-to-end file conversion integration tests. Tests actual file conversion with real MarkItDown library. """ from pathlib import Path import pytest from markitdown_mcp.server import MarkItDownMCPServer, MCPRequest from tests.helpers.assertions import ( assert_convert_file_response, assert_mcp_success_response, ) class TestFileConversionIntegration: """Test end-to-end file conversion with real files.""" @pytest.mark.integration @pytest.mark.asyncio async def test_text_file_conversion_end_to_end(self, temp_dir): """Test complete text file conversion flow.""" server = MarkItDownMCPServer() # Create test file with various text features test_content = """# Sample Document This is a **test document** with various formatting: ## Features - Lists - *Italic text* - **Bold text** - `Code snippets` ### Code Block ``` function hello() { console.log("Hello, World!"); } ``` > This is a blockquote for testing. Link: [GitHub](https://github.com) --- End of document. """ test_file = Path(temp_dir) / "sample.md" test_file.write_text(test_content) # Convert the file request = MCPRequest( id="text-conversion-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(test_file)}}, ) response = await server.handle_request(request) assert_convert_file_response(response, "Sample Document", "sample.md") # Verify content preservation result_text = response.result["content"][0]["text"] assert "test document" in result_text assert "Features" in result_text assert "blockquote" in result_text assert "GitHub" in result_text @pytest.mark.integration @pytest.mark.asyncio async def test_json_file_conversion_end_to_end(self, temp_dir): """Test complete JSON file conversion flow.""" server = MarkItDownMCPServer() # Create complex JSON file json_content = """{ "api": { "name": "MarkItDown MCP Server", "version": "1.0.0", "description": "Document conversion API" }, "features": [ { "name": "File Conversion", "supported_formats": ["pdf", "docx", "txt", "json", "csv"], 
"description": "Convert documents to Markdown" }, { "name": "Directory Processing", "batch_processing": true, "description": "Process multiple files at once" } ], "configuration": { "max_file_size": "100MB", "concurrent_requests": 10, "supported_languages": ["en", "es", "fr", "de"] }, "metadata": { "created": "2024-01-01T00:00:00Z", "last_updated": null, "author": "Test Suite" } }""" json_file = Path(temp_dir) / "config.json" json_file.write_text(json_content) # Convert the JSON file request = MCPRequest( id="json-conversion-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(json_file)}}, ) response = await server.handle_request(request) assert_convert_file_response(response, "MarkItDown MCP Server", "config.json") # Verify JSON structure is preserved in converted text result_text = response.result["content"][0]["text"] assert "MarkItDown MCP Server" in result_text assert "File Conversion" in result_text assert "supported_formats" in result_text assert "100MB" in result_text @pytest.mark.integration @pytest.mark.asyncio async def test_csv_file_conversion_end_to_end(self, temp_dir): """Test complete CSV file conversion flow.""" server = MarkItDownMCPServer() # Create CSV with various data types csv_content = """Name,Age,Department,Salary,Start Date,Active John Doe,30,Engineering,85000.50,2020-01-15,true Jane Smith,28,"Marketing & Sales",72000.00,2021-03-22,true Bob Johnson,35,Engineering,95000.00,2019-07-10,true Alice Brown,26,HR,55000.25,2022-01-03,true Charlie Davis,42,"Engineering, Senior",125000.00,2018-05-14,false Emma Wilson,29,Marketing,68000.75,2021-09-08,true """ csv_file = Path(temp_dir) / "employees.csv" csv_file.write_text(csv_content) # Convert the CSV file request = MCPRequest( id="csv-conversion-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(csv_file)}}, ) response = await server.handle_request(request) assert_convert_file_response(response, "John Doe", 
"employees.csv") # Verify CSV data is preserved result_text = response.result["content"][0]["text"] assert "Jane Smith" in result_text assert "Engineering" in result_text assert "85000" in result_text assert "Marketing" in result_text @pytest.mark.integration @pytest.mark.asyncio async def test_html_file_conversion_end_to_end(self, temp_dir): """Test complete HTML file conversion flow.""" server = MarkItDownMCPServer() # Create HTML file with various elements html_content = """<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <title>Sample HTML Page</title> <style> body { font-family: Arial, sans-serif; } .highlight { background-color: yellow; } </style> </head> <body> <header> <h1>Welcome to Our Website</h1> <nav> <ul> <li><a href="#about">About</a></li> <li><a href="#services">Services</a></li> <li><a href="#contact">Contact</a></li> </ul> </nav> </header> <main> <section id="about"> <h2>About Us</h2> <p>We are a <em>leading provider</em> of <strong>document conversion services</strong>.</p> <p class="highlight">Our mission is to make document processing simple and efficient.</p> </section> <section id="services"> <h2>Our Services</h2> <table> <thead> <tr> <th>Service</th> <th>Description</th> <th>Price</th> </tr> </thead> <tbody> <tr> <td>PDF Conversion</td> <td>Convert PDF files to various formats</td> <td>$10/file</td> </tr> <tr> <td>Batch Processing</td> <td>Process multiple files simultaneously</td> <td>$50/batch</td> </tr> </tbody> </table> </section> <section id="contact"> <h2>Contact Information</h2> <form> <label for="name">Name:</label> <input type="text" id="name" name="name" required> <label for="email">Email:</label> <input type="email" id="email" name="email" required> <label for="message">Message:</label> <textarea id="message" name="message" rows="4" required></textarea> <button type="submit">Send Message</button> </form> </section> </main> <footer> <p>© 2024 Document Conversion Services. 
All rights reserved.</p> </footer> <script> console.log("Page loaded successfully"); document.querySelector('form').addEventListener('submit', function(e) { e.preventDefault(); alert('Thank you for your message!'); }); </script> </body> </html>""" html_file = Path(temp_dir) / "webpage.html" html_file.write_text(html_content) # Convert the HTML file request = MCPRequest( id="html-conversion-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(html_file)}}, ) response = await server.handle_request(request) assert_convert_file_response(response, "Welcome to Our Website", "webpage.html") # Verify HTML content is extracted result_text = response.result["content"][0]["text"] assert "About Us" in result_text assert "leading provider" in result_text assert "PDF Conversion" in result_text assert "Contact Information" in result_text @pytest.mark.integration @pytest.mark.asyncio async def test_xml_file_conversion_end_to_end(self, temp_dir): """Test complete XML file conversion flow.""" server = MarkItDownMCPServer() # Create XML file with various elements xml_content = """<?xml version="1.0" encoding="UTF-8"?> <bookstore> <book id="1" category="fiction"> <title lang="en">The Great Gatsby</title> <author>F. Scott Fitzgerald</author> <year>1925</year> <price currency="USD">12.99</price> <description> A classic American novel set in the summer of 1922. The story is narrated by Nick Carraway. </description> <tags> <tag>classic</tag> <tag>american literature</tag> <tag>1920s</tag> </tags> </book> <book id="2" category="science"> <title lang="en">A Brief History of Time</title> <author>Stephen Hawking</author> <year>1988</year> <price currency="USD">15.99</price> <description> A popular science book on cosmology by physicist Stephen Hawking. 
</description> <tags> <tag>science</tag> <tag>cosmology</tag> <tag>physics</tag> </tags> </book> <metadata> <catalog_version>2.1</catalog_version> <last_updated>2024-01-01</last_updated> <total_books>2</total_books> </metadata> </bookstore>""" xml_file = Path(temp_dir) / "bookstore.xml" xml_file.write_text(xml_content) # Convert the XML file request = MCPRequest( id="xml-conversion-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(xml_file)}}, ) response = await server.handle_request(request) assert_convert_file_response(response, "Great Gatsby", "bookstore.xml") # Verify XML content is preserved result_text = response.result["content"][0]["text"] assert "F. Scott Fitzgerald" in result_text assert "Stephen Hawking" in result_text assert "1925" in result_text assert "cosmology" in result_text class TestDirectoryConversionIntegration: """Test end-to-end directory conversion with real files.""" @pytest.mark.integration @pytest.mark.asyncio async def test_directory_conversion_mixed_files(self, temp_dir): """Test directory conversion with mixed file types.""" server = MarkItDownMCPServer() # Create directory structure with various files source_dir = Path(temp_dir) / "mixed_docs" source_dir.mkdir() output_dir = Path(temp_dir) / "converted_output" output_dir.mkdir() # Create various file types files_created = { "readme.txt": "# Project README\n\nThis is a sample project.", "config.json": '{"name": "project", "version": "1.0"}', "data.csv": "Name,Value\nSetting1,100\nSetting2,200", "info.html": "<html><body><h1>Information</h1><p>Details here.</p></body></html>", "notes.md": "## Notes\n\n- Important point 1\n- Important point 2", } for filename, content in files_created.items(): file_path = source_dir / filename file_path.write_text(content) # Convert directory request = MCPRequest( id="dir-mixed-test", method="tools/call", params={ "name": "convert_directory", "arguments": { "input_directory": str(source_dir), "output_directory": 
str(output_dir), }, }, ) response = await server.handle_request(request) assert_mcp_success_response(response, "dir-mixed-test") # Verify conversion results result_text = response.result["content"][0]["text"] assert "Successfully converted: 5" in result_text assert "Failed conversions: 0" in result_text # Verify output files exist output_files = list(output_dir.glob("*.md")) assert len(output_files) == 5 # Check specific conversions readme_output = output_dir / "readme.md" assert readme_output.exists() readme_content = readme_output.read_text() assert "Project README" in readme_content @pytest.mark.integration @pytest.mark.asyncio async def test_nested_directory_conversion(self, temp_dir): """Test directory conversion with nested structure.""" server = MarkItDownMCPServer() # Create nested directory structure base_dir = Path(temp_dir) / "nested_project" base_dir.mkdir() # Create subdirectories (base_dir / "docs").mkdir() (base_dir / "data" / "config").mkdir(parents=True) (base_dir / "scripts").mkdir() # Create files in different locations files = { "README.txt": "Main project README", "docs/guide.md": "# User Guide\n\nInstructions here.", "docs/api.txt": "API Documentation\n\nEndpoints and usage.", "data/settings.json": '{"debug": true, "port": 8080}', "data/config/app.json": '{"name": "MyApp", "env": "production"}', "scripts/deploy.txt": "Deployment script documentation", } for rel_path, content in files.items(): file_path = base_dir / rel_path file_path.write_text(content) output_dir = Path(temp_dir) / "nested_output" # Convert nested directory request = MCPRequest( id="nested-dir-test", method="tools/call", params={ "name": "convert_directory", "arguments": { "input_directory": str(base_dir), "output_directory": str(output_dir), }, }, ) response = await server.handle_request(request) assert_mcp_success_response(response, "nested-dir-test") # Verify all files were processed result_text = response.result["content"][0]["text"] assert "Successfully converted: 6" in 
result_text # Verify nested structure is preserved assert (output_dir / "README.md").exists() assert (output_dir / "docs" / "guide.md").exists() assert (output_dir / "docs" / "api.md").exists() assert (output_dir / "data" / "settings.md").exists() assert (output_dir / "data" / "config" / "app.md").exists() assert (output_dir / "scripts" / "deploy.md").exists() class TestBase64ConversionIntegration: """Test end-to-end base64 content conversion.""" @pytest.mark.integration @pytest.mark.asyncio async def test_base64_text_conversion_end_to_end(self): """Test complete base64 text conversion flow.""" server = MarkItDownMCPServer() # Create content to encode original_content = """# Base64 Test Document This document tests base64 encoding and decoding. ## Features - Unicode support: 你好 🌍 - Special characters: ®©™ - Code blocks: ```python def hello(): print("Hello from base64!") ``` End of test document. """ # Encode content import base64 encoded_content = base64.b64encode(original_content.encode("utf-8")).decode("ascii") # Convert via base64 request = MCPRequest( id="base64-text-test", method="tools/call", params={ "name": "convert_file", "arguments": {"file_content": encoded_content, "filename": "base64_test.md"}, }, ) response = await server.handle_request(request) assert_convert_file_response(response, "Base64 Test Document", "base64_test.md") # Verify content preservation result_text = response.result["content"][0]["text"] assert "Unicode support" in result_text assert "你好 🌍" in result_text assert "Special characters" in result_text assert "Hello from base64!" 
in result_text @pytest.mark.integration @pytest.mark.asyncio async def test_base64_json_conversion_end_to_end(self): """Test complete base64 JSON conversion flow.""" server = MarkItDownMCPServer() # Create JSON content json_content = """{ "test": "base64 encoding", "unicode": "测试 🚀", "nested": { "array": [1, 2, 3], "object": { "key": "value" } }, "special_chars": "quotes \\"test\\" and slashes \\\\" }""" # Encode content import base64 encoded_content = base64.b64encode(json_content.encode("utf-8")).decode("ascii") # Convert via base64 request = MCPRequest( id="base64-json-test", method="tools/call", params={ "name": "convert_file", "arguments": {"file_content": encoded_content, "filename": "test_data.json"}, }, ) response = await server.handle_request(request) assert_convert_file_response(response, "base64 encoding", "test_data.json") # Verify JSON structure preservation result_text = response.result["content"][0]["text"] assert "测试 🚀" in result_text assert "nested" in result_text assert "array" in result_text assert "quotes" in result_text class TestErrorScenariosIntegration: """Test error scenarios in real conversion contexts.""" @pytest.mark.integration @pytest.mark.asyncio async def test_corrupted_file_handling_integration(self, temp_dir): """Test handling of corrupted files in integration context.""" server = MarkItDownMCPServer() # Create files with various corruption types corrupted_files = { "invalid.json": '{"incomplete": json without closing', "broken.csv": 'Name,Age\n"Unclosed quote,25\nValid,30', "fake.pdf": "This is not a PDF file at all, just text", "incomplete.html": "<html><body><h1>Missing closing tags", } results = [] for filename, content in corrupted_files.items(): file_path = Path(temp_dir) / filename file_path.write_text(content) request = MCPRequest( id=f"corrupt-{filename}", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(file_path)}}, ) response = await server.handle_request(request) 
results.append((filename, response)) # Analyze results - some may succeed with partial conversion, # others may fail gracefully for filename, response in results: # Response should either succeed or fail gracefully assert response.result is not None or response.error is not None # If there's an error, it should be informative if response.error: error_msg = response.error["message"] # Error should not expose system internals assert "traceback" not in error_msg.lower() assert "exception" not in error_msg.lower() @pytest.mark.integration @pytest.mark.asyncio async def test_permission_errors_integration(self, temp_dir): """Test handling of permission errors in integration context.""" server = MarkItDownMCPServer() # Create a file and make it unreadable (if supported by OS) test_file = Path(temp_dir) / "restricted.txt" test_file.write_text("This file will be made unreadable") try: # Try to make file unreadable test_file.chmod(0o000) request = MCPRequest( id="permission-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": str(test_file)}}, ) response = await server.handle_request(request) # Should handle permission error gracefully if response.error: error_msg = response.error["message"].lower() assert any( term in error_msg for term in ["permission", "access", "denied", "readable"] ) finally: # Restore permissions for cleanup try: test_file.chmod(0o644) except (OSError, PermissionError): pass # Ignore permission restore errors @pytest.mark.integration @pytest.mark.asyncio async def test_large_file_timeout_integration(self, temp_dir): """Test handling of very large files that might timeout.""" server = MarkItDownMCPServer() # Create a very large text file (5MB) large_content = "This is a large file line.\n" * 250000 # ~5MB large_file = Path(temp_dir) / "very_large.txt" large_file.write_text(large_content) request = MCPRequest( id="large-file-test", method="tools/call", params={"name": "convert_file", "arguments": {"file_path": 
str(large_file)}}, ) # This might take a while or timeout response = await server.handle_request(request) # Should either succeed or fail gracefully assert response.result is not None or response.error is not None if response.result: # If successful, should contain expected content result_text = response.result["content"][0]["text"] assert "large file line" in result_text if response.error: # If failed, error should be reasonable error_msg = response.error["message"].lower() # Could be timeout, memory, or processing error acceptable_errors = ["timeout", "memory", "size", "large", "processing"] assert any(term in error_msg for term in acceptable_errors)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/trsdn/markitdown-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_file_conversion.py•22.3 KiB