#!/usr/bin/env python3
"""Validate OpenZIM MCP website structure and content.
This script checks HTML structure, links, and required files for the
OpenZIM MCP documentation website.
"""
import re
from pathlib import Path
def check_file_exists(file_path):
"""Check if a file exists."""
return Path(file_path).exists()
def validate_html_structure(html_content):
"""Validate basic HTML structure."""
issues = []
# Check for required meta tags
required_meta = [
"charset",
"viewport",
"description",
"canonical",
"robots",
"theme-color",
"og:title",
"og:description",
"og:site_name",
"og:image",
"og:image:width",
"og:image:height",
"og:image:alt",
"twitter:card",
"twitter:site",
"twitter:creator",
"twitter:image:alt",
]
for meta in required_meta:
if meta not in html_content:
issues.append(f"Missing meta tag: {meta}")
# Check for structured data (JSON-LD)
required_schemas = [
'"@type": "SoftwareApplication"',
'"@type": "Organization"',
'"@type": "WebSite"',
'"@type": "BreadcrumbList"',
'"@type": "TechArticle"',
]
for schema in required_schemas:
if schema not in html_content:
issues.append(f"Missing structured data: {schema}")
# Check for required sections
required_sections = [
'id="home"',
'id="features"',
'id="installation"',
'id="usage"',
]
for section in required_sections:
if section not in html_content:
issues.append(f"Missing section: {section}")
# Check for navigation
if 'class="navbar"' not in html_content:
issues.append("Missing navigation bar")
# Check for footer
if 'class="footer"' not in html_content:
issues.append("Missing footer")
return issues
def validate_css_references(html_content, base_dir):
"""Check if CSS files referenced in HTML exist."""
issues = []
# Find CSS links
css_pattern = r'<link[^>]*href=["\']([^"\']*\.css)["\']'
css_files = re.findall(css_pattern, html_content)
for css_file in css_files:
if not css_file.startswith("http"):
full_path = Path(base_dir) / css_file
if not full_path.exists():
issues.append(f"Missing CSS file: {css_file}")
return issues
def validate_js_references(html_content, base_dir):
"""Check if JavaScript files referenced in HTML exist."""
issues = []
# Find JS scripts
js_pattern = r'<script[^>]*src=["\']([^"\']*\.js)["\']'
js_files = re.findall(js_pattern, html_content)
for js_file in js_files:
if not js_file.startswith("http"):
full_path = Path(base_dir) / js_file
if not full_path.exists():
issues.append(f"Missing JavaScript file: {js_file}")
return issues
def validate_image_references(html_content, base_dir):
"""Check if image files referenced in HTML exist."""
issues = []
# Find image sources
img_pattern = r'(?:src|href)=["\']([^"\']*\.(png|jpg|jpeg|gif|svg|ico))["\']'
img_files = re.findall(img_pattern, html_content)
for img_file, _ext in img_files:
if not img_file.startswith("http"):
full_path = Path(base_dir) / img_file
if not full_path.exists():
issues.append(f"Missing image file: {img_file}")
return issues
def validate_internal_links(html_content):
"""Check for broken internal anchor links."""
issues = []
# Find internal links
link_pattern = r'href=["\']#([^"\']*)["\']'
anchor_links = re.findall(link_pattern, html_content)
# Find IDs in the document
id_pattern = r'id=["\']([^"\']*)["\']'
ids = re.findall(id_pattern, html_content)
for link in anchor_links:
if link and link not in ids:
issues.append(f"Broken internal link: #{link}")
return issues
def main():
"""Run validation checks on the website."""
website_dir = Path(__file__).parent
html_file = website_dir / "index.html"
if not html_file.exists():
print("❌ index.html not found!")
return False
print("🔍 Validating OpenZIM MCP website...")
print(f"📁 Website directory: {website_dir}")
# Read HTML content
with open(html_file, "r", encoding="utf-8") as f:
html_content = f.read()
all_issues = []
# Run validations
print("\n📋 Running validations...")
# HTML structure
html_issues = validate_html_structure(html_content)
all_issues.extend([f"HTML: {issue}" for issue in html_issues])
# CSS references
css_issues = validate_css_references(html_content, website_dir)
all_issues.extend([f"CSS: {issue}" for issue in css_issues])
# JavaScript references
js_issues = validate_js_references(html_content, website_dir)
all_issues.extend([f"JS: {issue}" for issue in js_issues])
# Image references
img_issues = validate_image_references(html_content, website_dir)
all_issues.extend([f"IMG: {issue}" for issue in img_issues])
# Internal links
link_issues = validate_internal_links(html_content)
all_issues.extend([f"LINK: {issue}" for issue in link_issues])
# Report results
print("\n📊 Validation Results:")
print(f" HTML structure checks: {len(html_issues)} issues")
print(f" CSS reference checks: {len(css_issues)} issues")
print(f" JavaScript reference checks: {len(js_issues)} issues")
print(f" Image reference checks: {len(img_issues)} issues")
print(f" Internal link checks: {len(link_issues)} issues")
if all_issues:
print(f"\n❌ Found {len(all_issues)} issues:")
for issue in all_issues:
print(f" • {issue}")
return False
else:
print("\n✅ All validations passed!")
# Additional file checks
print("\n📁 Checking required files:")
required_files = [
"assets/styles.css",
"assets/script.js",
"assets/favicon.svg",
"assets/og-image.svg",
"robots.txt",
"sitemap.xml",
"humans.txt",
".well-known/security.txt",
".well-known/dnt-policy.txt",
]
missing_files = []
for file_path in required_files:
full_path = website_dir / file_path
if full_path.exists():
print(f" ✅ {file_path}")
else:
print(f" ❌ {file_path}")
missing_files.append(file_path)
if missing_files:
print(f"\n⚠️ Missing {len(missing_files)} optional files")
return True # Still consider it a pass for core functionality
else:
print("\n🎉 All files present and website is ready!")
return True
if __name__ == "__main__":
success = main()
exit(0 if success else 1)