File Search Server

utils.py•6.09 kB

import json
import os
import re
from typing import Dict, Iterator, List, Tuple

from models import SearchParams

# Load environment variables from .env file
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    # dotenv not installed, skip loading .env file
    pass


# Substrings that mark a directory path as excluded. They are matched against
# the lower-cased full path of each walked directory, so a match anywhere in
# the path (including the search root itself) excludes that directory.
_SKIP_DIR_PATTERNS = (
    '.cache', '.local', '.vscode', '.ds_store', '.venv', '.git', 'venv',
    '__pycache__', 'site-packages', 'cloudstorage', 'clouddocs',
)

# Substrings whose presence in the lower-cased prompt switches the rule-based
# parser from "AND" to "OR" logic.
_OR_MARKERS = (' or ', ' either ', ' any ')

# Standard prompt template for file search parsing
_SYSTEM_PROMPT = """You are an expert file search parameter extraction assistant. Your task is to analyze user requests for file searches and extract structured parameters.

## Your Role:
- Parse natural language file search requests
- Extract relevant search criteria
- Return structured JSON data
- Be precise and consistent

## Output Format:
Always return ONLY valid JSON in this exact structure:
{
    "file_types": [],
    "filename_keywords": [],
    "content_keywords": [],
    "search_sequence": ["file_type", "filename", "content"],
    "search_logic": "AND"
}

## Field Definitions:
- file_types: File extensions (.pdf, .txt, .docx, .doc, .ipynb, .py, .js, etc.)
- filename_keywords: Keywords to appear in file names.
  If the keywrods are used to define file types, just ignore them here.
- content_keywords: Keywords likely to appear inside file contents
- search_sequence: Always use ["file_type", "filename", "content"]
- search_logic: Always use "AND" (files must match all criteria)

## Examples:
User: "find pdf files about machine learning"
Output: {"file_types": [".pdf"], "filename_keywords": ["machine", "learning"], "content_keywords": ["machine", "learning"], "search_sequence": ["file_type", "filename", "content"], "search_logic": "AND"}

User: "python scripts with neural network code"
Output: {"file_types": [".py"], "filename_keywords": ["neural", "network"], "content_keywords": ["neural", "network", "python"], "search_sequence": ["file_type", "filename", "content"], "search_logic": "AND"}"""


def get_valid_files(folder_path: str) -> Iterator[Tuple[str, str, str]]:
    """
    Generator that yields valid files from the folder, filtering out
    unwanted directories and files.

    A directory is skipped when its lower-cased path contains any entry of
    ``_SKIP_DIR_PATTERNS``. A file is skipped when its name starts with
    ``.`` or ``~`` or ends with ``.h``.

    Args:
        folder_path: Root directory to walk.

    Yields:
        Tuple[str, str, str]: (filepath, relative_path, filename), where
        relative_path is relative to ``folder_path``.
    """
    for root, dirs, files in os.walk(folder_path):
        root_lower = root.lower()
        # Skip unwanted directories
        if any(pattern in root_lower for pattern in _SKIP_DIR_PATTERNS):
            # Every descendant path contains this path as a prefix and would
            # match the same pattern, so prune the walk instead of visiting
            # (and rejecting) each subdirectory individually.
            dirs[:] = []
            continue

        for file in files:
            # Skip hidden files and system files
            if file.startswith(('.', '~')) or file.endswith('.h'):
                continue

            filepath = os.path.join(root, file)
            rel_path = os.path.relpath(filepath, folder_path)
            yield filepath, rel_path, file


def _strip_code_fences(text: str) -> str:
    """Remove a surrounding markdown code fence (``` or ```json) from text."""
    text = text.strip()
    if text.startswith("```json"):
        text = text[7:]
    if text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def parse_search_prompt_with_llm(prompt: str) -> SearchParams:
    """
    Use an LLM to parse the user prompt and extract search parameters.

    Sends the prompt to the OpenAI chat completions API (model
    ``gpt-4o-mini``, temperature 0), strips any markdown code fence from the
    reply, decodes it as JSON and passes the resulting mapping to
    ``SearchParams``.

    Any failure (``openai`` not importable, API error, empty reply, invalid
    JSON, fields rejected by ``SearchParams``) is reported on stdout and
    answered with ``fallback_parse_prompt(prompt)``; no exception from the
    LLM path is propagated to the caller.

    Args:
        prompt: Natural-language file search request.

    Returns:
        SearchParams built from the LLM reply, or from the rule-based
        fallback when the LLM path fails.
    """
    user_prompt = f"""Parse this file search request:

Request: "{prompt}"

Return the JSON structure with extracted parameters."""

    # NOTE(review): the print() calls below write to stdout; if this module is
    # used by a server that speaks a protocol over stdout, confirm that this
    # debug output cannot interfere with it.
    llm_response = None
    try:
        # Imported lazily so a missing openai package falls back to the
        # rule-based parser instead of breaking module import.
        from openai import OpenAI

        # NOTE(review): presumably the client picks up its credentials from
        # the environment (hence the .env loading above) - confirm.
        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )

        llm_response = response.choices[0].message.content
        print(f"LLM Response: {llm_response}")  # Debug output

        if not llm_response or not llm_response.strip():
            raise ValueError("Empty response from OpenAI API")

        # Clean the response - remove any markdown code blocks or extra text
        llm_response = _strip_code_fences(llm_response)

        parsed = json.loads(llm_response)
        return SearchParams(**parsed)

    # `json` is imported at module level on purpose: with a function-local
    # import inside the try block, any failure that happened before the
    # import ran made this except clause itself raise UnboundLocalError,
    # so the fallback below was never reached.
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed: {e}")
        print(f"Raw LLM response: '{llm_response}'")
        return fallback_parse_prompt(prompt)
    except Exception as e:
        # Deliberately broad: this is the boundary where every LLM-side
        # problem is converted into the rule-based fallback.
        print(f"LLM parsing failed: {e}, falling back to rule-based parsing")
        return fallback_parse_prompt(prompt)


def fallback_parse_prompt(prompt: str) -> SearchParams:
    """
    Fallback rule-based parsing if LLM fails.

    File types are every ``.<word>`` occurrence in the lower-cased prompt.
    Keywords are the remaining whitespace-separated tokens longer than two
    characters; the same list is used for both filename and content
    keywords. Search logic is "OR" when the prompt contains one of
    ``_OR_MARKERS``, otherwise "AND".

    Args:
        prompt: Natural-language file search request.

    Returns:
        SearchParams populated from the rules above.
    """
    prompt_lower = prompt.lower()

    # Extract file types
    file_types = [f'.{ext}' for ext in re.findall(r'\.(\w+)', prompt_lower)]

    # Remove file type mentions and extract keywords
    keyword_text = re.sub(r'\.(\w+)', '', prompt_lower)
    keywords = [word for word in keyword_text.split() if len(word) > 2]

    # Determine search logic from prompt
    if any(marker in prompt_lower for marker in _OR_MARKERS):
        search_logic = "OR"
    else:
        search_logic = "AND"

    return SearchParams(
        file_types=file_types,
        filename_keywords=keywords,
        content_keywords=keywords,
        search_sequence=["file_type", "filename", "content"],
        search_logic=search_logic,
    )


def parse_search_prompt(prompt: str) -> SearchParams:
    """Main parsing function that uses LLM with fallback"""
    return parse_search_prompt_with_llm(prompt)


def validate_folder_path(folder_path: str) -> List[Dict[str, str]]:
    """
    Validate if folder path exists and is a directory.

    Args:
        folder_path: Path to check.

    Returns:
        An empty list when the path is an existing directory; otherwise a
        one-element list holding a dict with an "error" message.
    """
    if not os.path.exists(folder_path):
        return [{"error": f"Folder path {folder_path} does not exist"}]

    if not os.path.isdir(folder_path):
        return [{"error": f"{folder_path} is not a directory"}]

    return []

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/liu10250510/mcp-file-search-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.