Skip to main content
Glama
pre_publish_check.py (25.4 kB)
#!/usr/bin/env python3
"""
Pre-Publication Security Checklist
Run this script before pushing to GitHub to verify no sensitive data is included
"""

import fnmatch
import os
import re
import sys
from pathlib import Path
from typing import Optional, Dict, List, Set

# Define script_dir once at module level
script_dir = Path(__file__).parent.parent

# Colors for output
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
RESET = '\033[0m'
BOLD = '\033[1m'


def print_header(text):
    """Print ``text`` as a bold section header framed by ``=`` rules."""
    print(f"\n{BOLD}{'='*70}{RESET}")
    print(f"{BOLD}{text}{RESET}")
    print(f"{BOLD}{'='*70}{RESET}\n")


def print_success(text):
    """Print ``text`` prefixed with a green check mark."""
    print(f"{GREEN}✓{RESET} {text}")


def print_error(text):
    """Print ``text`` prefixed with a red cross."""
    print(f"{RED}✗{RESET} {text}")


def print_warning(text):
    """Print ``text`` prefixed with a yellow warning sign."""
    print(f"{YELLOW}⚠{RESET} {text}")


def load_ansible_inventory() -> Optional[Dict[str, Set[str]]]:
    """
    Load Ansible inventory to extract real hostnames and IP addresses.

    The inventory path is taken from ``ANSIBLE_INVENTORY_PATH`` in
    ``script_dir/.env`` when present, otherwise ``script_dir/ansible_hosts.yml``
    is used if it exists.

    Returns a dict with 'ips', 'hostnames' and 'domains' sets (all strings),
    or None if the inventory is not available, PyYAML is missing, or the file
    cannot be parsed.
    """
    try:
        # Try to load from .env first
        env_path = script_dir / '.env'
        ansible_inventory_path = None

        if env_path.exists():
            with open(env_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line.startswith('ANSIBLE_INVENTORY_PATH='):
                        ansible_inventory_path = line.split('=', 1)[1].strip().strip('"\'')
                        break

        # If not in .env, check for default location
        if not ansible_inventory_path:
            default_path = script_dir / 'ansible_hosts.yml'
            if default_path.exists():
                ansible_inventory_path = str(default_path)

        if not ansible_inventory_path or not Path(ansible_inventory_path).exists():
            return None

        # Parse the YAML inventory (imported lazily so the rest of the script
        # still works when PyYAML is not installed)
        import yaml
        with open(ansible_inventory_path, 'r', encoding='utf-8') as f:
            inventory = yaml.safe_load(f)

        ips = set()
        hostnames = set()
        domains = set()

        def extract_host_info(data):
            """Recursively extract IPs and hostnames from inventory structure"""
            if not isinstance(data, dict):
                return

            # An empty "hosts:" key parses as None; only walk real mappings so
            # one empty group does not abort the whole inventory load.
            hosts = data.get('hosts')
            if isinstance(hosts, dict):
                for hostname, host_data in hosts.items():
                    # YAML may parse a bare host key as a non-string scalar
                    hostname = str(hostname)
                    hostnames.add(hostname)

                    # Extract domain if hostname has one
                    if '.' in hostname:
                        domain = '.'.join(hostname.split('.')[1:])
                        if domain:
                            domains.add(domain)

                    # Extract IP addresses; skip empty/None values, which
                    # would otherwise match everything (or crash) later.
                    if isinstance(host_data, dict):
                        for key in ('ansible_host', 'ip', 'address'):
                            value = host_data.get(key)
                            if value:
                                ips.add(str(value))

            # Recurse into every nested dict; this covers 'children' as well,
            # so no separate pass over the child groups is needed.
            for value in data.values():
                if isinstance(value, dict):
                    extract_host_info(value)

        # Start extraction from root
        if inventory:
            extract_host_info(inventory)

        return {
            'ips': ips,
            'hostnames': hostnames,
            'domains': domains
        }

    except ImportError:
        print_warning("PyYAML not installed - skipping Ansible inventory check")
        print_warning("Install with: pip install pyyaml")
        return None
    except Exception as e:
        # Best-effort: a broken inventory must not stop the other checks
        print_warning(f"Could not load Ansible inventory: {e}")
        return None


def check_file_exists(filepath):
    """Check if a file exists"""
    return Path(filepath).exists()


def check_file_in_gitignore(filename, gitignore_content):
    """
    Check whether ``filename`` is ignored by the given .gitignore text.

    The content is evaluated line by line instead of by substring, so
    ``.env.example``, ``!.env`` or a comment mentioning ``.env`` no longer
    count as ".env is ignored". Supported: comments, blank lines, ``!``
    negation (last matching pattern wins), a leading ``/`` or ``**/`` and
    shell-style globs such as ``*.md``. Directory-only patterns (trailing
    ``/``) are skipped because the names checked here are files.

    Returns True if the last matching pattern ignores the file.
    """
    filename = str(filename)
    ignored = False

    for raw_line in gitignore_content.splitlines():
        pattern = raw_line.strip()
        if not pattern or pattern.startswith('#'):
            continue

        negated = pattern.startswith('!')
        if negated:
            pattern = pattern[1:]

        if pattern.endswith('/'):
            continue

        pattern = pattern.lstrip('/')
        if pattern.startswith('**/'):
            pattern = pattern[3:]

        if fnmatch.fnmatchcase(filename, pattern):
            ignored = not negated

    return ignored


def scan_for_ips(content, filename):
    """
    Scan content for private IP addresses.

    Returns the RFC 1918 addresses found in ``content`` (in order, duplicates
    kept), excluding 192.168.1.x and 10.0.1.x which are treated as example
    addresses. ``filename`` is accepted for interface symmetry and is unused.
    """
    # Pattern for private IP addresses
    ip_pattern = r'\b(?:10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3})\b'
    matches = re.findall(ip_pattern, content)

    # Filter out example IPs
    example_prefixes = ('192.168.1.', '10.0.1.')
    return [ip for ip in matches if not ip.startswith(example_prefixes)]


def scan_for_api_keys(content, filename):
    """
    Scan content for potential API keys and sensitive data.

    Returns the matched values (credential strings, GitHub ``user/repo``
    paths, Notion page IDs, email addresses), skipping anything that contains
    a known example marker such as ``example.com`` or ``placeholder``.
    ``filename`` is accepted for interface symmetry and is unused.
    """
    # Common patterns for API keys and sensitive data
    patterns = [
        r'api[_-]?key["\']?\s*[:=]\s*["\']([a-zA-Z0-9]{20,})["\']',
        r'token["\']?\s*[:=]\s*["\']([a-zA-Z0-9]{20,})["\']',
        r'password["\']?\s*[:=]\s*["\']([^"\']{8,})["\']',
        r'secret["\']?\s*[:=]\s*["\']([a-zA-Z0-9]{16,})["\']',
        r'auth["\']?\s*[:=]\s*["\']([a-zA-Z0-9]{16,})["\']',
        # GitHub URLs with actual usernames
        r'https://github\.com/([a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+)',
        # Notion URLs with actual IDs
        r'https://www\.notion\.so/([a-zA-Z0-9]{32})',
        # Email addresses (TLD class is letters only; a "|" inside a
        # character class is a literal pipe, not alternation)
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    ]

    example_markers = [
        'example.com',
        'your-username',
        'your-api-key',
        'placeholder',
        'template',
        'sample',
        'demo'
    ]

    found_items = []
    for pattern in patterns:
        for match in re.findall(pattern, content, re.IGNORECASE):
            # Skip common example patterns
            if any(example in match.lower() for example in example_markers):
                continue
            found_items.append(match)

    return found_items


def check_sensitive_files():
    """
    Check for sensitive files that shouldn't be committed.

    Verifies that the private files are listed in .gitignore and that the
    matching ``.example`` templates exist. Returns True when no issue was
    found, False otherwise (including when .gitignore is missing).
    """
    print_header("Checking for Sensitive Files")

    issues = []

    # Read .gitignore
    gitignore_path = script_dir / '.gitignore'
    if not gitignore_path.exists():
        print_error(".gitignore not found!")
        return False
    with open(gitignore_path, 'r', encoding='utf-8') as f:
        gitignore_content = f.read()

    # Files that MUST be in .gitignore
    required_ignored = [
        '.env',
        'PROJECT_INSTRUCTIONS.md',
        'ansible_hosts.yml',
        'CLAUDE_CUSTOM.md'
    ]

    for filename in required_ignored:
        if check_file_in_gitignore(filename, gitignore_content):
            print_success(f"{filename} is in .gitignore")
        else:
            print_error(f"{filename} is NOT in .gitignore!")
            issues.append(f"{filename} must be added to .gitignore")

    # Files that should exist but not be committed
    sensitive_files = [
        ('.env', '.env should exist but not be committed'),
        ('PROJECT_INSTRUCTIONS.md', 'Your customized instructions (not the .example)'),
        ('ansible_hosts.yml', 'Your real inventory (not the .example)'),
        ('CLAUDE_CUSTOM.md', 'Your customized Claude guide with homelab-specific details')
    ]

    print()
    for filename, description in sensitive_files:
        filepath = script_dir / filename
        if filepath.exists():
            print_warning(f"{filename} exists - {description}")
            if not check_file_in_gitignore(filename, gitignore_content):
                issues.append(f"{filename} exists but is not in .gitignore!")
        else:
            print_success(f"{filename} does not exist (good for public repo)")

    # Example files that SHOULD exist
    print()
    example_files = [
        '.env.example',
        'PROJECT_INSTRUCTIONS.example.md',
        'ansible_hosts.example.yml',
        'CLAUDE_CUSTOM.example.md'
    ]

    for filename in example_files:
        if check_file_exists(script_dir / filename):
            print_success(f"{filename} exists")
        else:
            print_error(f"{filename} is missing!")
            issues.append(f"{filename} should be included as template")

    return len(issues) == 0


def check_documentation_files():
    """Check that documentation files exist; return True if none is missing."""
    print_header("Checking Documentation Files")

    issues = []

    required_docs = {
        'README.md': 'Main documentation',
        'SECURITY.md': 'Security guidelines',
        '.env.example': 'Environment variables template',
        'requirements.txt': 'Python dependencies',
        'PROJECT_INSTRUCTIONS.example.md': 'Claude instructions template',
        'ansible_hosts.example.yml': 'Ansible inventory template',
        'CLAUDE.md': 'Claude AI development guide (public)',
        'CLAUDE_CUSTOM.example.md': 'Claude customization template'
    }

    for filename, description in required_docs.items():
        filepath = script_dir / filename
        if filepath.exists():
            print_success(f"{filename} - {description}")
        else:
            print_error(f"{filename} is MISSING - {description}")
            issues.append(f"Missing {filename}")

    return len(issues) == 0


def scan_python_files():
    """
    Scan Python files for potential issues.

    Looks at the ``*.py`` files directly inside ``script_dir`` (this script
    itself is skipped). IP addresses only produce warnings; potential
    credentials are errors. Returns True when no error was recorded.
    """
    print_header("Scanning Python Files for Sensitive Data")

    issues = []
    python_files = list(script_dir.glob('*.py'))

    for py_file in python_files:
        if py_file.name == 'pre_publish_check.py':
            continue

        try:
            with open(py_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Check for hardcoded IPs (in non-example contexts)
            ips = scan_for_ips(content, py_file.name)
            if ips:
                print_warning(f"{py_file.name}: Found IP addresses - {', '.join(ips)}")
                print_warning(" → Verify these are example IPs only")

            # Check for potential API keys (basic check)
            keys = scan_for_api_keys(content, py_file.name)
            if keys:
                print_error(f"{py_file.name}: Potential API keys found!")
                issues.append(f"{py_file.name} may contain hardcoded credentials")

            # Check for common mistakes: mentions an API but never reads the
            # environment
            lowered = content.lower()
            if 'your-api-key' not in lowered and 'api' in lowered:
                if 'getenv' not in content and 'os.environ' not in content:
                    print_warning(f"{py_file.name}: Check that API keys use environment variables")

        except Exception as e:
            # Keep scanning the remaining files
            print_error(f"Error scanning {py_file.name}: {e}")

    if not issues:
        print_success("No obvious sensitive data found in Python files")

    return len(issues) == 0


def check_claude_custom_md():
    """
    Special check for CLAUDE_CUSTOM.md if it exists (should be gitignored).

    Fails only when the file exists and is not ignored; private URLs, IPs and
    other sensitive content inside it are reported as warnings. Returns True
    when no issue was recorded.
    """
    print_header("Checking CLAUDE_CUSTOM.md for Private Information")

    issues = []
    claude_custom_md = script_dir / 'CLAUDE_CUSTOM.md'

    if not claude_custom_md.exists():
        print_success("CLAUDE_CUSTOM.md does not exist (good for public repo)")
        return True

    # If it exists, check if it's gitignored
    gitignore_path = script_dir / '.gitignore'
    if gitignore_path.exists():
        with open(gitignore_path, 'r', encoding='utf-8') as f:
            gitignore_content = f.read()

        # Same matching rules as check_sensitive_files
        if check_file_in_gitignore('CLAUDE_CUSTOM.md', gitignore_content):
            print_success("CLAUDE_CUSTOM.md is properly gitignored")
        else:
            print_error("CLAUDE_CUSTOM.md exists but is NOT in .gitignore!")
            issues.append("CLAUDE_CUSTOM.md must be added to .gitignore")

    # If CLAUDE_CUSTOM.md exists, scan it for private information
    try:
        with open(claude_custom_md, 'r', encoding='utf-8') as f:
            content = f.read()

        # Check for real URLs
        github_urls = re.findall(r'https://github\.com/([a-zA-Z0-9_-]+)', content)
        notion_urls = re.findall(r'https://www\.notion\.so/([a-zA-Z0-9]{32})', content)

        if github_urls:
            real_github = [url for url in github_urls if url not in ['your-username', 'example-user']]
            if real_github:
                print_warning(f"CLAUDE_CUSTOM.md contains real GitHub URLs: {', '.join(real_github)}")

        if notion_urls:
            print_warning("CLAUDE_CUSTOM.md contains Notion URLs (likely private)")

        # Check for private IPs
        ips = scan_for_ips(content, 'CLAUDE_CUSTOM.md')
        if ips:
            print_warning(f"CLAUDE_CUSTOM.md contains IP addresses: {', '.join(ips)}")

        # Check for other sensitive data
        sensitive = scan_for_api_keys(content, 'CLAUDE_CUSTOM.md')
        if sensitive:
            print_warning("CLAUDE_CUSTOM.md may contain sensitive information")

    except Exception as e:
        print_error(f"Error scanning CLAUDE_CUSTOM.md: {e}")

    return len(issues) == 0


def check_claude_md_public():
    """
    Check that CLAUDE.md is public-ready (no homelab-specific details).

    Returns False when the file is missing, cannot be read, mentions one of
    the known homelab server names, or contains non-example private IPs.
    """
    print_header("Checking CLAUDE.md is Public-Ready")

    issues = []
    claude_md = script_dir / 'CLAUDE.md'

    if not claude_md.exists():
        print_error("CLAUDE.md does not exist (should be committed to public repo)")
        return False

    print_success("CLAUDE.md exists")

    # Scan for common homelab-specific patterns
    try:
        with open(claude_md, 'r', encoding='utf-8') as f:
            content = f.read()

        # Check for specific server names that shouldn't be in public version
        homelab_patterns = [
            (r'\bDell-Server\b', 'Dell-Server'),
            (r'\bHL16\b', 'HL16'),
            (r'\bServer-01\b', 'Server-01'),
        ]

        for pattern, name in homelab_patterns:
            if re.search(pattern, content, re.IGNORECASE):
                print_error(f"CLAUDE.md contains homelab-specific reference: {name}")
                issues.append(f"Found {name} in CLAUDE.md")

        # Check for private IPs (should only have example IPs)
        ips = scan_for_ips(content, 'CLAUDE.md')
        allowed_prefixes = (
            '192.168.1.',
            '10.0.1.',
            '192.0.2.',  # RFC 5737 documentation range
        )
        suspicious_ips = [ip for ip in ips if not ip.startswith(allowed_prefixes)]

        if suspicious_ips:
            print_error(f"CLAUDE.md contains non-example IPs: {', '.join(suspicious_ips)}")
            issues.append("CLAUDE.md contains real IP addresses")

        if not issues:
            print_success("CLAUDE.md appears public-ready (no homelab-specific details found)")

    except Exception as e:
        print_error(f"Error scanning CLAUDE.md: {e}")
        return False

    return len(issues) == 0


def scan_markdown_files():
    """
    Scan markdown files for real infrastructure details.

    Checks a fixed list of public documentation/template files for private
    IPs outside the accepted example ranges. Missing files are skipped.
    Returns True when no file contains a suspicious IP.
    """
    print_header("Scanning Markdown Files for Infrastructure Details")

    issues = []

    # Files to check (excluding the customized versions)
    md_files = [
        'README.md',
        'SECURITY.md',
        'PROJECT_INSTRUCTIONS.example.md',
        'ansible_hosts.example.yml'
    ]

    example_prefixes = ('192.168.1.', '10.0.1.', '172.16.')

    for filename in md_files:
        filepath = script_dir / filename
        if not filepath.exists():
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            # Check for private IPs that aren't examples
            ips = scan_for_ips(content, filename)
            suspicious_ips = [ip for ip in ips if not ip.startswith(example_prefixes)]

            if suspicious_ips:
                print_error(f"{filename}: Found non-example IPs: {', '.join(suspicious_ips)}")
                issues.append(f"{filename} contains real IP addresses")
            else:
                print_success(f"{filename}: No real IP addresses found")

            # Check for common domain patterns (only reported for README.md)
            local_count = content.count('.local')
            home_count = content.count('home.')
            if filename == 'README.md' and (local_count > 3 or home_count > 0):
                print_warning(f"{filename}: Contains .local or home. domains ({local_count + home_count} times)")
                print_warning(" → Verify these are examples, not your real domains")

        except Exception as e:
            print_error(f"Error scanning {filename}: {e}")

    return len(issues) == 0


def _real_occurrences(content, regex, example_keywords):
    """Return lines matching ``regex`` that carry none of ``example_keywords``."""
    return [
        line for line in content.split('\n')
        if regex.search(line)
        and not any(keyword in line.lower() for keyword in example_keywords)
    ]


def scan_for_real_infrastructure(inventory_data: Optional[Dict[str, Set[str]]]):
    """
    Scan all code files for references to real infrastructure from Ansible inventory.
    This is a context-aware check that knows YOUR specific hostnames and IPs.

    ``inventory_data`` is the dict returned by ``load_ansible_inventory``
    ('ips', 'hostnames', 'domains'); when it is None or empty the scan is
    skipped and True is returned. Lines that contain an example/placeholder
    keyword are not counted. Returns True when no public file references
    real infrastructure.
    """
    if not inventory_data:
        print_header("Scanning for Real Infrastructure Details")
        print_warning("Ansible inventory not found - skipping context-aware infrastructure scan")
        print_warning("To enable: Set ANSIBLE_INVENTORY_PATH in .env or create ansible_hosts.yml")
        return True

    print_header("Scanning for Real Infrastructure Details (Context-Aware)")
    print(f"Loaded {len(inventory_data['ips'])} IP addresses, " +
          f"{len(inventory_data['hostnames'])} hostnames, " +
          f"{len(inventory_data['domains'])} domains from inventory")

    issues = []

    # Files to check (exclude sensitive files that should already be gitignored)
    files_to_check = []

    # Python files
    files_to_check.extend([f for f in script_dir.glob('*.py') if f.name != 'pre_publish_check.py'])

    # Markdown files (all public ones - exclude gitignored CLAUDE_CUSTOM.md and PROJECT_INSTRUCTIONS.md)
    gitignored_md = {'CLAUDE_CUSTOM.md', 'PROJECT_INSTRUCTIONS.md'}
    files_to_check.extend(
        md_file for md_file in script_dir.glob('*.md')
        if md_file.name not in gitignored_md
    )

    # YAML example files
    files_to_check.extend(script_dir.glob('*.example.yml'))
    files_to_check.extend(script_dir.glob('*.example.yaml'))

    ip_keywords = ['example', 'replace', 'your-ip', 'placeholder', 'template']
    hostname_keywords = ['example', 'replace', 'your-host', 'placeholder', 'template', 'e.g.', 'i.e.']
    domain_keywords = ['example', 'replace', 'your-domain', 'placeholder', 'template']
    # Very common domains that might be coincidental
    common_domains = {'local', 'com', 'net', 'org', 'home'}

    for file_path in files_to_check:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            found_issues = []

            # Check for real IP addresses. Digit lookarounds keep 10.0.0.1
            # from being reported inside 10.0.0.10 or 110.0.0.1.
            for ip in inventory_data['ips']:
                regex = re.compile(r'(?<!\d)' + re.escape(str(ip)) + r'(?!\d)')
                if _real_occurrences(content, regex, ip_keywords):
                    found_issues.append(f"Real IP address: {ip}")

            # Check for real hostnames
            for hostname in inventory_data['hostnames']:
                # Use word boundaries to avoid false positives
                regex = re.compile(r'\b' + re.escape(str(hostname)) + r'\b', re.IGNORECASE)
                if _real_occurrences(content, regex, hostname_keywords):
                    found_issues.append(f"Real hostname: {hostname}")

            # Check for real domain names (be careful with common domains)
            for domain in inventory_data['domains']:
                if domain in common_domains:
                    continue
                regex = re.compile(r'\b' + re.escape(str(domain)) + r'\b', re.IGNORECASE)
                if _real_occurrences(content, regex, domain_keywords):
                    found_issues.append(f"Real domain: {domain}")

            if found_issues:
                print_error(f"{file_path.name}: Found real infrastructure details!")
                # De-duplicate and sort so the report is stable between runs
                for issue in sorted(set(found_issues)):
                    print(f" → {issue}")
                issues.append(f"{file_path.name} contains real infrastructure details")
            else:
                print_success(f"{file_path.name}: No real infrastructure details found")

        except Exception as e:
            # An unreadable file is reported but does not fail the check
            print_warning(f"Error scanning {file_path.name}: {e}")

    print()
    if issues:
        print_error("❌ Found references to real infrastructure in files that will be committed!")
        print_error("These files should only contain example/placeholder data.")
    else:
        print_success("✅ No real infrastructure details found in public files")

    return len(issues) == 0


def final_reminders():
    """Print final reminders"""
    print_header("Final Pre-Publication Checklist")

    reminders = [
        "Review git history for accidentally committed secrets",
        "Ensure .env is NOT in git history (git rm --cached .env if needed)",
        "Ensure ansible_hosts.yml is NOT in git history (if you have one)",
        "Test with a fresh clone in a new directory",
        "Verify .gitignore is working (git status should not show sensitive files)",
        "Double-check that PROJECT_INSTRUCTIONS.md and CLAUDE_CUSTOM.md are gitignored",
        "Verify CLAUDE.md is public-ready (no homelab-specific details)",
        "Verify CLAUDE_CUSTOM.example.md exists and contains placeholder data only",
        "Context-aware check scanned for YOUR real IPs/hostnames (if inventory found)",
        "Review all commits for sensitive data before pushing",
        "Consider using 'git secrets' or similar tools",
        "Update GitHub repository description and tags",
        "Add topics/tags to GitHub repo for discoverability",
        "Consider adding LICENSE file if not present"
    ]

    for i, reminder in enumerate(reminders, 1):
        print(f" {i}. {reminder}")

    print()


def main():
    """Run all checks and exit with status 0 if they all passed, 1 otherwise."""
    print(f"{BOLD}Homelab MCP - Pre-Publication Security Check{RESET}")
    print(f"{BOLD}{'='*70}{RESET}")
    print(f"Checking directory: {script_dir}\n")

    # Load Ansible inventory for context-aware checking
    inventory_data = load_ansible_inventory()
    if inventory_data:
        print(f"✓ Loaded Ansible inventory with {len(inventory_data['ips'])} IPs, " +
              f"{len(inventory_data['hostnames'])} hosts, {len(inventory_data['domains'])} domains\n")

    # Run every check (no short-circuiting, so each one prints its report)
    checks = [
        check_sensitive_files,
        check_documentation_files,
        check_claude_custom_md,
        check_claude_md_public,
        scan_python_files,
        scan_markdown_files,
        # Context-aware infrastructure scanning
        lambda: scan_for_real_infrastructure(inventory_data),
    ]
    results = [check() for check in checks]
    all_passed = all(results)

    # Final summary
    print_header("Summary")

    if all_passed:
        print_success("All checks passed! ✨")
        print_success("Review the warnings above and the final checklist before publishing.")
    else:
        print_error("Some checks FAILED! ⚠️")
        print_error("Fix the issues above before publishing to GitHub.")

    final_reminders()

    # Exit code
    sys.exit(0 if all_passed else 1)


if __name__ == '__main__':
    main()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/bjeans/homelab-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.