Skip to main content
Glama
contract_processor.py (7.93 kB)
from typing import Dict, Any, List
import re
import logging

from .base_processor import BaseProcessor
from mcp.context import MCPContext
from models.entity_extractor import BERTEntityExtractor
from models.summarizer import T5Summarizer

logger = logging.getLogger(__name__)


class ContractProcessor(BaseProcessor):
    """Processor for handling contract documents.

    Detects contracts by keyword heuristics and then enriches the context with
    a summary, regex-extracted sections and dates, the organizations found by
    the entity extractor (with a guessed role), and a few key terms.

    All regular expressions are compiled once as class attributes because they
    do not depend on instance state.
    """

    # Phrases whose presence (as plain substrings of the lower-cased text)
    # suggests that a document is a contract.
    _CONTRACT_INDICATORS = (
        "agreement", "contract", "terms and conditions", "party", "parties",
        "hereby agrees", "obligations", "termination", "governing law",
    )
    # Minimum number of distinct indicators required by can_handle().
    _MIN_INDICATOR_HITS = 3

    # Heading line (optional numbering, then letters/whitespace only) followed
    # by everything up to the next heading line or the end of the text.
    # NOTE(review): the inline (?i) makes [A-Z] match lowercase too, so any
    # line made only of letters and spaces is treated as a heading, and \s in
    # the title class can match newlines. Kept as-is to preserve which
    # sections callers currently see - confirm whether this is intended.
    _SECTION_PATTERN = re.compile(
        r"(?i)^[\d\.\s]*([A-Z][A-Z\s]+)[\.\s]*$"
        r"(.*?)"
        r"(?=^[\d\.\s]*[A-Z][A-Z\s]+[\.\s]*$|\Z)",
        re.MULTILINE | re.DOTALL,
    )

    # A date written as "January 1st, 2024" or as "1/1/2024" / "01-01-24".
    _DATE_VALUE = (
        r"([A-Za-z]+\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}"
        r"|\d{1,2}[-/]\d{1,2}[-/]\d{2,4})"
    )
    _EFFECTIVE_DATE_PATTERN = re.compile(
        r"(?i)(?:effective\s+date|commencement\s+date|start\s+date)[:\s]+"
        + _DATE_VALUE
    )
    _TERMINATION_DATE_PATTERN = re.compile(
        r"(?i)(?:termination\s+date|end\s+date|expiry\s+date)[:\s]+"
        + _DATE_VALUE
    )

    # Number of characters inspected on each side of an organization mention
    # when guessing its role.
    _PARTY_CONTEXT_WINDOW = 200
    # Checked in order; the first role with a matching phrase wins.
    _ROLE_INDICATORS = (
        ("first_party", ("first party", "party of the first part")),
        ("second_party", ("second party", "party of the second part")),
        ("seller", ("seller",)),
        ("buyer", ("buyer",)),
        ("lessor", ("lessor",)),
        ("lessee", ("lessee",)),
    )

    # Section titles containing any of these words are treated as the
    # payment section.
    _PAYMENT_TITLE_TERMS = ("payment", "consideration", "compensation", "fees")
    # The amount is either comma-grouped ("1,250,000") or a plain digit run
    # ("50000"), optionally followed by two decimals. The plain-run
    # alternative is required: with only \d{1,3}(?:,\d{3})* an unseparated
    # amount such as "50000" would be truncated to "500".
    _AMOUNT_PATTERN = re.compile(
        r"(?i)(?:amount|fee|payment|price)[:\s]+[$€£]?"
        r"((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?)"
    )
    # NOTE(review): the captured group allows any whitespace, so the value can
    # run across line breaks until the first non-letter character.
    _GOVERNING_LAW_PATTERN = re.compile(
        r"(?i)(?:governed\s+by|governing\s+law|jurisdiction)[:\s]+"
        r"(?:the\s+laws\s+of\s+)?([A-Za-z\s]+)"
    )

    def __init__(self, config: Dict[str, Any]):
        """Build the entity extractor and summarizer from ``config``.

        Args:
            config: Must contain ``config["models"]["entity_extractor"]`` with
                ``model_name`` and ``confidence_threshold``, and
                ``config["models"]["summarizer"]`` with ``model_name`` and
                ``max_length``.

        Raises:
            KeyError: If any of the keys above is missing.
        """
        super().__init__(config)
        models_config = config["models"]

        extractor_config = models_config["entity_extractor"]
        self.entity_extractor = BERTEntityExtractor(
            model_name=extractor_config["model_name"],
            confidence_threshold=extractor_config["confidence_threshold"],
        )

        summarizer_config = models_config["summarizer"]
        self.summarizer = T5Summarizer(
            model_name=summarizer_config["model_name"],
            max_length=summarizer_config["max_length"],
        )

    def can_handle(self, context: MCPContext) -> bool:
        """Check if the document is a contract.

        Note that this may decompress ``context`` as a side effect when no
        raw text is available yet.

        Returns:
            ``False`` when there is no text to inspect; ``True`` when the
            metadata already classifies the document as a contract or when at
            least three contract indicator phrases occur in the text.
        """
        if not context.raw_text and context.compressed:
            context.decompress()
        if not context.raw_text:
            return False

        # Trust an earlier classification over the keyword heuristic.
        if context.metadata.get("document_type") == "contract":
            return True

        text_lower = context.raw_text.lower()
        indicator_count = sum(
            1 for indicator in self._CONTRACT_INDICATORS
            if indicator in text_lower
        )
        return indicator_count >= self._MIN_INDICATOR_HITS

    def process(self, context: MCPContext) -> MCPContext:
        """Process a contract document.

        When ``validate_context`` (presumably inherited from ``BaseProcessor``)
        rejects the context, a "skipped" history entry is recorded and the
        context is returned untouched. Otherwise the context is decompressed
        if needed, tagged as a contract, and populated with the extracted
        ``summary``, ``sections``, ``date_*``, ``parties`` and ``key_terms``
        entries.

        Returns:
            The same ``context`` object that was passed in.
        """
        if not self.validate_context(context):
            context.add_to_history(
                processor_name=self.__class__.__name__,
                status="skipped",
                details={"reason": "Invalid context"},
            )
            return context

        if context.compressed:
            context.decompress()

        # Mark document as contract
        context.update_metadata({"document_type": "contract"})

        # The confidence values below are fixed constants, not model outputs,
        # except for "parties", which uses what the entity extractor reports.
        summary = self.summarizer.summarize(context.raw_text)
        context.add_extracted_data("summary", summary, confidence=0.8)

        sections = self._extract_sections(context.raw_text)
        context.add_extracted_data("sections", sections, confidence=0.85)

        dates = self._extract_dates(context.raw_text)
        for date_type, date_value in dates.items():
            context.add_extracted_data(
                f"date_{date_type}", date_value, confidence=0.9
            )

        entities = self.entity_extractor.extract_entities(context.raw_text)
        parties = self._extract_parties(context.raw_text, entities)
        context.add_extracted_data(
            "parties",
            parties,
            confidence=self.entity_extractor.get_confidence(),
        )

        key_terms = self._extract_key_terms(context.raw_text, sections)
        context.add_extracted_data("key_terms", key_terms, confidence=0.75)

        return context

    def _extract_sections(self, text: str) -> Dict[str, str]:
        """Extract main sections from the contract.

        This is a simplified, regex-based implementation.

        Returns:
            Mapping of stripped section title to stripped section body. When a
            title occurs more than once, the last occurrence wins.
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in self._SECTION_PATTERN.finditer(text)
        }

    def _extract_dates(self, text: str) -> Dict[str, str]:
        """Extract key dates from the contract.

        Returns:
            Dict with an ``"effective"`` and/or ``"termination"`` key holding
            the first matching date string as written in the text. A key is
            absent when no labelled date of that kind is found.
        """
        labelled_patterns = (
            ("effective", self._EFFECTIVE_DATE_PATTERN),
            ("termination", self._TERMINATION_DATE_PATTERN),
        )
        dates = {}
        for date_type, pattern in labelled_patterns:
            match = pattern.search(text)
            if match:
                dates[date_type] = match.group(1).strip()
        return dates

    def _extract_parties(self, text: str, entities: Dict[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Extract parties to the contract.

        Every entity under ``entities["organization"]`` becomes one party. Its
        role is guessed from role keywords found within a fixed character
        window around the mention.

        Args:
            text: Full contract text that the entity offsets refer to.
            entities: Entity lists keyed by type. Each organization dict must
                provide ``text``, ``start``, ``end`` and ``confidence``.

        Returns:
            One dict per organization with ``name``, ``role`` (``None`` when
            no role keyword is nearby) and ``confidence``.
        """
        window = self._PARTY_CONTEXT_WINDOW
        parties = []
        for org in entities.get("organization", []):
            start_idx = max(0, org["start"] - window)
            end_idx = min(len(text), org["end"] + window)
            context_text = text[start_idx:end_idx].lower()

            # First role (in declaration order) with a phrase in the window.
            role = next(
                (
                    role_name
                    for role_name, phrases in self._ROLE_INDICATORS
                    if any(phrase in context_text for phrase in phrases)
                ),
                None,
            )

            parties.append({
                "name": org["text"],
                "role": role,
                "confidence": org["confidence"],
            })
        return parties

    def _extract_key_terms(self, text: str, sections: Dict[str, str]) -> Dict[str, Any]:
        """Extract key contract terms.

        Args:
            text: Full contract text, searched for the governing law.
            sections: Output of ``_extract_sections``. The first section whose
                title mentions payment, consideration, compensation or fees is
                searched for a payment amount.

        Returns:
            Dict that may contain ``"payment_amount"`` (digits as written,
            without the currency symbol) and ``"governing_law"``.
        """
        key_terms = {}

        payment_section = next(
            (
                content
                for title, content in sections.items()
                if any(
                    term in title.lower()
                    for term in self._PAYMENT_TITLE_TERMS
                )
            ),
            None,
        )
        if payment_section:
            amount_match = self._AMOUNT_PATTERN.search(payment_section)
            if amount_match:
                key_terms["payment_amount"] = amount_match.group(1).strip()

        law_match = self._GOVERNING_LAW_PATTERN.search(text)
        if law_match:
            key_terms["governing_law"] = law_match.group(1).strip()

        return key_terms

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arifazim/MCP_Document_Classifer'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.