"""Python code analyzer for extracting imports, entry points, and structure."""
import re
from pathlib import Path
try:
import tree_sitter_python
from tree_sitter import Language, Parser
HAS_TREE_SITTER = True
except ImportError:
HAS_TREE_SITTER = False
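# Without tree-sitter, the analyzer degrades to the regex-based fallbacks
# implemented below (_extract_definitions_regex, _extract_calls_regex).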
from .base import BaseAnalyzer
from .models import ImportInfo, EntryPointInfo, DefinitionInfo, CallInfo
class PythonAnalyzer(BaseAnalyzer):
"""Analyzer for Python source files (.py, .pyw)."""
def __init__(self):
"""Initialize with tree-sitter parser if available."""
super().__init__()
self.parser = None
if HAS_TREE_SITTER:
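            # py-tree-sitter >= 0.22 style: wrap the compiled grammar in a
            # Language and assign it to the parser (the installed binding
            # version is an assumption; older releases used Parser.set_language)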
self.parser = Parser()
self.parser.language = Language(tree_sitter_python.language())
@classmethod
def get_extensions(cls) -> list[str]:
"""Python file extensions."""
return [".py", ".pyw"]
@classmethod
def get_language_name(cls) -> str:
"""Return language name."""
return "Python"
@classmethod
def get_priority(cls) -> int:
"""Standard priority."""
return 10
def should_analyze(self, file_path: str) -> bool:
"""
Skip Python files that should not be analyzed.
Matches PythonScanner.should_skip() logic:
- Skip compiled Python files (.pyc, .pyo, .pyd)
- __pycache__ directories are already filtered by COMMON_SKIP_DIRS
"""
filename = Path(file_path).name
# Skip compiled Python files
if filename.endswith(('.pyc', '.pyo', '.pyd')):
return False
return True
def is_low_value_for_inventory(self, file_path: str, size: int = 0) -> bool:
"""
Identify low-value Python files for inventory listing.
Low-value files (unless central):
- Empty or near-empty __init__.py files
- conftest.py (pytest fixtures, usually boilerplate)
- setup.py/setup.cfg (unless large)
"""
filename = Path(file_path).name
# Empty __init__.py files are low-value
if filename == "__init__.py" and size < 100:
return True
# Empty conftest.py is usually just pytest boilerplate
if filename == "conftest.py" and size < 200:
return True
# Very small setup files
if filename in ("setup.py", "setup.cfg") and size < 100:
return True
# Fall back to base class (very small files)
return super().is_low_value_for_inventory(file_path, size)
def extract_imports(self, file_path: str, content: str) -> list[ImportInfo]:
"""
Extract import statements from Python file.
Patterns supported:
- from x.y import z
- from x.y import z as w
- from x.y import (a, b, c)
- import x.y.z
- import x.y as z
- from . import x (relative import)
- from ..utils import y (relative import)
"""
imports = []
# Pattern 1: from X import Y
from_import_pattern = r'^\s*from\s+([\w.]+)\s+import\s+(.+?)(?:\s+#.*)?$'
for match in re.finditer(from_import_pattern, content, re.MULTILINE):
module = match.group(1)
imported_items_str = match.group(2)
line_num = content[:match.start()].count('\n') + 1
# Parse imported items (handle multi-line imports, aliases)
imported_names = []
            # Remove surrounding parentheses if present. Names spread across
            # multiple lines of a parenthesized import are not captured, since
            # the pattern matches a single line at a time.
            imported_items_str = imported_items_str.strip('()')
            for item in imported_items_str.split(','):
                item = item.strip()
                if ' as ' in item:
                    # Record the original name; the alias itself is not tracked
                    imported_names.append(item.split(' as ', 1)[0].strip())
                else:
                    imported_names.append(item)
# Determine if relative import
is_relative = module.startswith('.')
import_type = "relative" if is_relative else "from_import"
# Resolve relative imports to absolute paths
target_module = module
if is_relative:
resolved = self._resolve_relative_import(file_path, module)
if resolved:
target_module = resolved
imports.append(
ImportInfo(
source_file=file_path,
target_module=target_module,
line=line_num,
import_type=import_type,
imported_names=imported_names,
)
)
        # Pattern 2: import X (comma-separated "import x, y" is not matched)
        import_pattern = r'^\s*import\s+([\w.]+)(?:\s+as\s+\w+)?(?:\s+#.*)?$'
for match in re.finditer(import_pattern, content, re.MULTILINE):
module = match.group(1)
line_num = content[:match.start()].count('\n') + 1
imports.append(
ImportInfo(
source_file=file_path,
target_module=module,
line=line_num,
import_type="import",
imported_names=[],
)
)
return imports
def find_entry_points(self, file_path: str, content: str) -> list[EntryPointInfo]:
"""
Find entry points in Python file.
Entry points:
- def main() functions
- if __name__ == "__main__" blocks
- Flask/FastAPI/FastMCP app instances
- Exports in __init__.py files
"""
entry_points = []
# Pattern 1: def main()
main_func_pattern = r'^def\s+main\s*\('
for match in re.finditer(main_func_pattern, content, re.MULTILINE):
line_num = content[:match.start()].count('\n') + 1
entry_points.append(
EntryPointInfo(
file=file_path,
type="main_function",
name="main",
line=line_num,
)
)
# Pattern 2: if __name__ == "__main__"
if_main_pattern = r'if\s+__name__\s*==\s*["\']__main__["\']'
for match in re.finditer(if_main_pattern, content):
line_num = content[:match.start()].count('\n') + 1
entry_points.append(
EntryPointInfo(
file=file_path, type="if_main", name="__main__", line=line_num
)
)
# Pattern 3: Flask/FastAPI/FastMCP app instances
app_pattern = r'(app|server|mcp)\s*=\s*(Flask|FastAPI|FastMCP|Starlette)\('
for match in re.finditer(app_pattern, content):
line_num = content[:match.start()].count('\n') + 1
var_name = match.group(1)
framework = match.group(2)
entry_points.append(
EntryPointInfo(
file=file_path,
type="app_instance",
name=var_name,
line=line_num,
framework=framework,
)
)
# Pattern 4: __init__.py exports
if file_path.endswith("__init__.py"):
# Look for __all__ = [...]
all_pattern = r'__all__\s*=\s*\[(.*?)\]'
for match in re.finditer(all_pattern, content, re.MULTILINE | re.DOTALL):
line_num = content[:match.start()].count('\n') + 1
exports_str = match.group(1)
# Parse list of exported names
exports = [
name.strip().strip('"').strip("'")
for name in exports_str.split(',')
if name.strip()
]
if exports:
entry_points.append(
EntryPointInfo(
file=file_path,
type="export",
name=f"__all__ ({len(exports)} items)",
line=line_num,
)
)
            # Look for "from .X import Y" or "from . import Y" (re-exports)
            reexport_pattern = r'^from\s+\.\S*\s+import\s+(\w+)'
reexports = re.findall(reexport_pattern, content, re.MULTILINE)
if reexports:
entry_points.append(
EntryPointInfo(
file=file_path,
type="export",
name=f"re-exports ({len(reexports)} items)",
line=1, # General indicator
)
)
return entry_points
def classify_file(self, file_path: str, content: str) -> str:
"""
Classify Python file into architectural cluster.
Enhanced classification with Python-specific patterns.
"""
# Use base implementation first
cluster = super().classify_file(file_path, content)
# If not already classified, check for Python-specific patterns
if cluster == "other":
# Check for common Python patterns in content
if "if __name__ ==" in content or "def main(" in content:
return "entry_points"
# Check for test files by content (pytest, unittest)
if any(
pattern in content
for pattern in ["import pytest", "import unittest", "from unittest"]
):
return "tests"
# Check for common utility patterns
if any(
pattern in content
for pattern in ["def helper_", "def util_", "class Helper", "class Util"]
):
return "utilities"
return cluster
# ===================================================================
# LAYER 2: Structure-level analysis
# ===================================================================
def extract_definitions(self, file_path: str, content: str) -> list[DefinitionInfo]:
"""
Extract function/class definitions using tree-sitter.
Falls back to regex if tree-sitter unavailable.
"""
if not self.parser:
# Fallback to regex-based extraction
return self._extract_definitions_regex(file_path, content)
try:
source_bytes = content.encode("utf-8")
tree = self.parser.parse(source_bytes)
return self._extract_definitions_tree_sitter(
file_path, tree.root_node, source_bytes
)
except Exception:
# Fallback to regex on parse error
return self._extract_definitions_regex(file_path, content)
def _extract_definitions_tree_sitter(
self, file_path: str, root, source_bytes: bytes
) -> list[DefinitionInfo]:
"""Extract definitions using tree-sitter AST."""
definitions = []
def traverse(node, parent_class=None):
# Class definitions
if node.type == "class_definition":
name_node = node.child_by_field_name("name")
if name_node:
name = source_bytes[name_node.start_byte : name_node.end_byte].decode(
"utf-8"
)
line = node.start_point[0] + 1
# Get signature (base classes)
superclasses = []
for child in node.children:
if child.type == "argument_list":
for arg in child.children:
if arg.type == "identifier":
superclasses.append(
source_bytes[arg.start_byte : arg.end_byte].decode(
"utf-8"
)
)
signature = (
f"({', '.join(superclasses)})" if superclasses else None
)
definitions.append(
DefinitionInfo(
file=file_path,
type="class",
name=name,
line=line,
signature=signature,
parent=None,
)
)
# Traverse children for methods
for child in node.children:
traverse(child, parent_class=name)
# Function/method definitions
elif node.type == "function_definition":
name_node = node.child_by_field_name("name")
if name_node:
name = source_bytes[name_node.start_byte : name_node.end_byte].decode(
"utf-8"
)
line = node.start_point[0] + 1
# Get signature (parameters)
params_node = node.child_by_field_name("parameters")
signature = None
if params_node:
signature = source_bytes[
params_node.start_byte : params_node.end_byte
].decode("utf-8")
func_type = "method" if parent_class else "function"
definitions.append(
DefinitionInfo(
file=file_path,
type=func_type,
name=name,
line=line,
signature=signature,
parent=parent_class,
)
)
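                    # Note: the function body is not traversed further, so
                    # defs nested inside this one are not recorded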
# Continue traversing
else:
for child in node.children:
traverse(child, parent_class)
traverse(root)
return definitions
def _extract_definitions_regex(
self, file_path: str, content: str
) -> list[DefinitionInfo]:
"""Fallback: Extract definitions using regex."""
definitions = []
# Classes
for match in re.finditer(r"^class\s+(\w+)", content, re.MULTILINE):
line = content[: match.start()].count("\n") + 1
definitions.append(
DefinitionInfo(
file=file_path,
type="class",
name=match.group(1),
line=line,
signature=None,
parent=None,
)
)
# Functions
for match in re.finditer(r"^def\s+(\w+)\s*\(", content, re.MULTILINE):
line = content[: match.start()].count("\n") + 1
definitions.append(
DefinitionInfo(
file=file_path,
type="function",
name=match.group(1),
line=line,
signature=None,
parent=None,
)
)
return definitions
def extract_calls(
self, file_path: str, content: str, definitions: list[DefinitionInfo]
) -> list[CallInfo]:
"""
Extract function/method calls using tree-sitter.
Falls back to regex if tree-sitter unavailable.
"""
if not self.parser:
# Fallback to regex-based extraction
return self._extract_calls_regex(file_path, content, definitions)
try:
source_bytes = content.encode("utf-8")
tree = self.parser.parse(source_bytes)
return self._extract_calls_tree_sitter(
file_path, tree.root_node, source_bytes, definitions
)
except Exception:
# Fallback to regex on parse error
return self._extract_calls_regex(file_path, content, definitions)
def _extract_calls_tree_sitter(
self, file_path: str, root, source_bytes: bytes, definitions: list[DefinitionInfo]
) -> list[CallInfo]:
"""Extract calls using tree-sitter AST."""
        calls = []
        def traverse(node, context_func=None):
            # Entering a function definition: walk its body with the function
            # name as the caller context; the previous context is restored
            # implicitly when the recursion unwinds
            if node.type == "function_definition":
                name_node = node.child_by_field_name("name")
                if name_node:
                    func_name = source_bytes[
                        name_node.start_byte : name_node.end_byte
                    ].decode("utf-8")
                    for child in node.children:
                        traverse(child, func_name)
# Call expressions
elif node.type == "call":
# Get function being called
func_node = node.child_by_field_name("function")
if func_node:
# Handle simple calls: foo()
if func_node.type == "identifier":
callee_name = source_bytes[
func_node.start_byte : func_node.end_byte
].decode("utf-8")
line = node.start_point[0] + 1
calls.append(
CallInfo(
caller_file=file_path,
caller_name=context_func,
callee_name=callee_name,
line=line,
is_cross_file=False, # Will determine later
)
)
# Handle attribute calls: obj.method()
elif func_node.type == "attribute":
attr_node = func_node.child_by_field_name("attribute")
if attr_node:
callee_name = source_bytes[
attr_node.start_byte : attr_node.end_byte
].decode("utf-8")
line = node.start_point[0] + 1
calls.append(
CallInfo(
caller_file=file_path,
caller_name=context_func,
callee_name=callee_name,
line=line,
is_cross_file=False,
)
)
# Continue traversing
for child in node.children:
traverse(child, context_func)
# Continue traversing
else:
for child in node.children:
traverse(child, context_func)
traverse(root)
# Mark cross-file calls
local_defs = {d.name for d in definitions}
for call in calls:
if call.callee_name not in local_defs:
call.is_cross_file = True
return calls
def _extract_calls_regex(
self, file_path: str, content: str, definitions: list[DefinitionInfo]
) -> list[CallInfo]:
"""Fallback: Extract calls using regex."""
calls = []
# Find all function calls: name(
for match in re.finditer(r"\b(\w+)\s*\(", content):
callee_name = match.group(1)
line = content[: match.start()].count("\n") + 1
            # Skip keywords that can directly precede "(" and the print
            # builtin, which would otherwise flood the results
            if callee_name in {
                "if",
                "elif",
                "for",
                "while",
                "def",
                "class",
                "return",
                "with",
                "assert",
                "yield",
                "raise",
                "not",
                "and",
                "or",
                "in",
                "is",
                "await",
                "print",
            }:
                continue
calls.append(
CallInfo(
caller_file=file_path,
caller_name=None, # Cannot determine with regex
callee_name=callee_name,
line=line,
is_cross_file=False,
)
)
# Mark cross-file calls
local_defs = {d.name for d in definitions}
for call in calls:
if call.callee_name not in local_defs:
call.is_cross_file = True
return calls
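# Minimal smoke-test sketch (hedged): analyzes a file passed on the command
# line, defaulting to this module. Because of the relative imports above it
# must be run as a module (e.g. "python -m <package>.python"); the exact
# module path depends on the package layout and is an assumption here.
if __name__ == "__main__":
    import sys
    analyzer = PythonAnalyzer()
    target = sys.argv[1] if len(sys.argv) > 1 else __file__
    text = Path(target).read_text(encoding="utf-8")
    definitions = analyzer.extract_definitions(target, text)
    print(f"imports:      {len(analyzer.extract_imports(target, text))}")
    print(f"entry points: {len(analyzer.find_entry_points(target, text))}")
    print(f"definitions:  {len(definitions)}")
    print(f"calls:        {len(analyzer.extract_calls(target, text, definitions))}")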