repo_scraper.py•3.2 kB
#!/usr/bin/env python3
import os
import sys
from pathlib import Path
def convert_to_markdown(file_path, content):
    """Convert file content to Markdown.

    Files with a recognised source-code extension are wrapped in a fenced
    code block tagged with the matching language. Everything else,
    including ``.md``/``.markdown`` files and unknown text files, is
    returned unchanged.

    Args:
        file_path: Path of the file (``pathlib.Path`` or ``str``). Only its
            final extension is inspected, case-insensitively.
        content: Text content of the file.

    Returns:
        The Markdown representation of ``content``.
    """
    # Single source of truth: extension -> fence info string.
    code_languages = {
        '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
        '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.h': 'c',
        '.css': 'css', '.html': 'html', '.json': 'json',
        '.xml': 'xml', '.yaml': 'yaml', '.yml': 'yaml',
    }

    file_ext = Path(file_path).suffix.lower()
    lang = code_languages.get(file_ext)
    if lang is None:
        # Markdown and other text files are passed through as-is.
        return content

    # A fence is closed by any backtick run at least as long as the opening
    # one, so the fence must be longer than the longest run in the content.
    # Otherwise a source file that itself contains ``` would end the block
    # early.
    longest_run = current_run = 0
    for char in content:
        current_run = current_run + 1 if char == '`' else 0
        if current_run > longest_run:
            longest_run = current_run
    fence = '`' * max(3, longest_run + 1)

    return f"{fence}{lang}\n{content}\n{fence}"
def process_repository(repo_path, suffixes, ignore_folders, output_dir='output',
                       exclude_name_suffixes=('.test.ts',)):
    """Walk a repository and write one Markdown file per matching source file.

    Every file under ``repo_path`` whose final extension is in ``suffixes``
    is read, converted with ``convert_to_markdown`` and written to
    ``output_dir``. The output name is the file's path relative to
    ``repo_path`` with path separators replaced by ``_`` and ``.md``
    appended. Progress is reported on stdout.

    Args:
        repo_path: Root directory to scan.
        suffixes: Iterable of file extensions to include (e.g. ``[".ts"]``),
            matched case-insensitively against the file's final extension.
        ignore_folders: Iterable of directory names to skip entirely at any
            depth; ``None`` means skip nothing.
        output_dir: Directory that receives the generated ``.md`` files; it
            is created, including missing parents, if needed.
        exclude_name_suffixes: File-name endings to skip even when the
            extension matches (compound extensions such as ``.test.ts``),
            matched case-insensitively.

    Returns:
        None.
    """
    repo_path = Path(repo_path)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Normalise the filters once, outside the walk.
    wanted_suffixes = {suffix.lower() for suffix in suffixes}
    ignored_dirs = set(ignore_folders or ())
    excluded_endings = tuple(ending.lower() for ending in exclude_name_suffixes)

    processed_count = 0
    for root, dirs, files in os.walk(repo_path):
        # Prune in place so os.walk never descends into ignored directories.
        dirs[:] = [d for d in dirs if d not in ignored_dirs]

        for file in files:
            file_path = Path(root) / file
            if file_path.suffix.lower() not in wanted_suffixes:
                continue
            # Path.suffix is only the last extension (".ts"), so compound
            # endings like ".test.ts" have to be checked on the whole name.
            if file.lower().endswith(excluded_endings):
                continue

            print(f"Processing file: {file_path}")
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                markdown_content = convert_to_markdown(file_path, content)

                # Flatten the relative path into a single file name.
                relative_path = file_path.relative_to(repo_path)
                output_filename = str(relative_path).replace('/', '_').replace('\\', '_') + '.md'
                output_file = output_path / output_filename

                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(f"# {file_path}\n\n")
                    f.write(markdown_content)
            except (OSError, UnicodeError) as e:
                # Best effort: report the failure and move on to the next file.
                print(f"Error processing {file_path}: {e}")
                continue

            print(f"Processed: {relative_path}")
            processed_count += 1

    print(f"\nCompleted! Processed {processed_count} files.")
def main():
    """Run the scraper over a TypeScript source tree.

    The repository path is taken from the first command-line argument when
    one is given; otherwise the built-in default path is used. Only ``.ts``
    files are converted and ``__mocks__`` directories are skipped. The
    script exits with an error message if the repository path is not an
    existing directory.
    """
    default_repo_path = "/home/esel/Documents/doc-server/typescript-sdk/src"
    repo_path = sys.argv[1] if len(sys.argv) > 1 else default_repo_path
    suffixes = [".ts"]
    ignore_folders = ["__mocks__"]

    # os.walk silently yields nothing for a missing directory, so fail loudly
    # instead of reporting a misleading "Processed 0 files".
    if not Path(repo_path).is_dir():
        sys.exit(f"Repository path is not a directory: {repo_path}")

    print(f"Repository: {repo_path}")
    print(f"File suffixes: {suffixes}")
    print(f"Ignore folders: {ignore_folders}")
    print()
    process_repository(repo_path, suffixes, ignore_folders)
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()