Skip to main content
Glama

RAG Document Server

by jaimeferj
reference_extractor.py (5.83 kB)
"""Extract Python object references from documentation text.""" import re from typing import List, Set, Dict class PythonReferenceExtractor: """Extract Python class, function, and module references from text.""" # Patterns for Python references PATTERNS = { # Class.method() or Class.attribute 'class_method': re.compile(r'\b([A-Z][a-zA-Z0-9_]*\.[a-z_][a-zA-Z0-9_]*)\(\)?'), # module.Class or module.function 'module_object': re.compile(r'\b([a-z_][a-z0-9_]*\.[A-Z][a-zA-Z0-9_]*(?:\.[a-z_][a-zA-Z0-9_]*)?)\b'), # @decorator 'decorator': re.compile(r'@([a-z_][a-z0-9_]*(?:\.[a-z_][a-z0-9_]*)*)'), # function_name() with backticks 'function': re.compile(r'`([a-z_][a-z0-9_]*)\(\)`'), # Class name with backticks 'class': re.compile(r'`([A-Z][a-zA-Z0-9_]*)`'), # Full qualified names like dagster.AutomationCondition 'qualified_name': re.compile(r'\b(dagster(?:\.[a-z_][a-z0-9_]*)*\.[A-Z][a-zA-Z0-9_]*(?:\.[a-z_][a-zA-Z0-9_]*)?)\b'), } # GitHub URL pattern GITHUB_URL_PATTERN = re.compile( r'https://github\.com/dagster-io/dagster/blob/master/python_modules/[^\s\)\]]+(?:#L\d+)?' ) def extract_references(self, text: str) -> Dict[str, Set[str]]: """ Extract Python object references from text. 
Args: text: Text to extract references from Returns: Dictionary with categorized references: { 'class_method': {'AutomationCondition.eager', ...}, 'module_object': {...}, 'decorator': {'asset', 'op', ...}, 'function': {...}, 'class': {'AutomationCondition', ...}, 'qualified_name': {'dagster.AutomationCondition.eager', ...}, 'all': set of all unique references } """ references = { 'class_method': set(), 'module_object': set(), 'decorator': set(), 'function': set(), 'class': set(), 'qualified_name': set(), } # Extract each pattern type for pattern_name, pattern in self.PATTERNS.items(): matches = pattern.findall(text) references[pattern_name].update(matches) # Create combined set of all references all_refs = set() for refs in references.values(): all_refs.update(refs) references['all'] = all_refs return references def extract_github_urls(self, text: str) -> List[str]: """ Extract GitHub URLs from text. Args: text: Text to extract URLs from Returns: List of GitHub URLs """ return self.GITHUB_URL_PATTERN.findall(text) def prioritize_references(self, references: Dict[str, Set[str]], max_refs: int = 3) -> List[str]: """ Prioritize which references to follow up on. Args: references: Dictionary of categorized references max_refs: Maximum number of references to return Returns: List of prioritized reference strings to query """ prioritized = [] # Priority order: qualified_name > class_method > class > module_object # These are most likely to have detailed documentation # 1. Qualified names (most specific) for ref in references['qualified_name']: if len(prioritized) >= max_refs: break prioritized.append(ref) # 2. Class methods (e.g., AutomationCondition.eager) if len(prioritized) < max_refs: for ref in references['class_method']: if len(prioritized) >= max_refs: break # Skip if we already have the qualified version if not any(ref in p for p in prioritized): prioritized.append(ref) # 3. 
Classes (e.g., AutomationCondition) if len(prioritized) < max_refs: for ref in references['class']: if len(prioritized) >= max_refs: break # Skip if we already have this as part of a method/qualified name if not any(ref in p for p in prioritized): prioritized.append(ref) # 4. Module objects if len(prioritized) < max_refs: for ref in references['module_object']: if len(prioritized) >= max_refs: break if not any(ref in p for p in prioritized): prioritized.append(ref) # 5. Decorators (common ones like @asset, @op) if len(prioritized) < max_refs: for ref in references['decorator']: if len(prioritized) >= max_refs: break if not any(ref in p for p in prioritized): prioritized.append(f"@{ref}") return prioritized def format_reference_for_query(self, reference: str) -> str: """ Format a reference into a query string. Args: reference: Python reference (e.g., 'AutomationCondition.eager') Returns: Query string for RAG system """ # Remove common prefixes if present ref = reference.replace('dagster.', '') # Format as a question if '.' in ref: parts = ref.split('.') if ref.startswith('@'): return f"what is the {ref} decorator" else: return f"what is {parts[0]} {parts[1]}" else: if ref.startswith('@'): return f"what is the {ref} decorator" else: return f"what is {ref}"

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jaimeferj/mcp-rag-docs'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.