SuiAgentic

upload_folder.py•4.78 kB

"""Upload every PDF found in a folder (recursively) to Qdrant.

Usage:
    python upload_folder.py FOLDER [--prefix PREFIX]
"""

import os
import argparse
import asyncio
from pathlib import Path
import uuid
from datetime import datetime

import PyPDF2
from sentence_transformers import SentenceTransformer

from app.services.document_service import DocumentService
from app.services.text_service import TextService
from app.services.qdrant_service import QdrantService
from app.config.settings import MODEL_NAME

# Initialize services
document_service = DocumentService()
text_service = TextService()
qdrant_service = QdrantService()


async def process_pdf_file(file_path, call_name=None):
    """Process a single PDF file and upload its chunks to Qdrant.

    The new points are fully prepared (text extracted, chunked, embedded)
    *before* any previously stored chunks are deleted, so a failed or empty
    extraction never destroys the copy that is already stored.

    Args:
        file_path: Path to the PDF file on disk.
        call_name: Display name stored in each chunk's payload. Defaults to
            the file name without its extension.

    Returns:
        A dict with ``file_name``, ``call_name``, ``doc_id`` and ``chunks``
        (the number of chunks stored) on success; ``None`` when no text
        could be extracted or when any error occurred (the error is printed).
    """
    try:
        file_name = os.path.basename(file_path)
        print(f"Processing {file_name}...")

        # Use file name as call_name if not provided
        if not call_name:
            call_name = os.path.splitext(file_name)[0]

        # Create a virtual document URL based on filename.
        # NOTE(review): only the basename is used, so two PDFs with the same
        # name in different sub-folders map to the same document and the
        # later one replaces the earlier one. Changing the URL scheme would
        # orphan documents that are already stored, so it is left as is.
        virtual_url = f"file://{file_name}"

        # Check if a document with this name already exists
        existing_doc_id = qdrant_service.get_existing_doc_id(virtual_url)

        # Generate a unique document ID (reuse if updating)
        doc_id = existing_doc_id if existing_doc_id else str(uuid.uuid4())
        current_date = datetime.now().isoformat()

        if existing_doc_id:
            print(f"Updating existing document: {file_name}")
        else:
            print(f"Creating new document: {file_name}")

        # Extract text from PDF
        with open(file_path, 'rb') as f:
            text = document_service.extract_text_from_pdf(f)

        # Split text into chunks and generate embeddings
        chunks = text_service.chunk_text(text)
        if not chunks:
            # Nothing to store: keep whatever is already stored untouched.
            print(f"Warning: No text content extracted from {file_name}")
            return None

        print(f"Extracted {len(chunks)} chunks from {file_name}")
        embeddings = text_service.generate_embeddings(chunks)

        # Prepare points for Qdrant
        total_chunks = len(chunks)
        points = [
            {
                "id": str(uuid.uuid4()),
                "vector": embedding,
                "payload": {
                    "text": chunk,
                    "url": virtual_url,
                    "call_name": call_name,
                    "doc_id": doc_id,
                    "chunk_id": i,
                    "date": current_date,
                    "total_chunks": total_chunks,
                    "file_name": file_name,
                },
            }
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
        ]

        # Only now that the replacement points exist, drop the old chunks.
        if existing_doc_id:
            qdrant_service.delete_document_chunks(doc_id)

        # Store in Qdrant
        qdrant_service.upsert_points(points)

        print(f"Successfully {'updated' if existing_doc_id else 'embedded'} {file_name} with {len(chunks)} chunks")
        return {
            "file_name": file_name,
            "call_name": call_name,
            "doc_id": doc_id,
            "chunks": len(chunks),
        }
    except Exception as e:
        # Best-effort batch tool: report the failure and let the caller
        # continue with the remaining files.
        print(f"Error processing {file_path}: {str(e)}")
        return None


async def process_folder(folder_path, prefix=None):
    """Process all PDF files in a folder, including sub-folders.

    Args:
        folder_path: Directory to scan for ``*.pdf`` files.
        prefix: Optional prefix; when given, each document's call name
            becomes ``"<prefix> - <file stem>"``.

    Returns:
        A list with one result dict per successfully processed file, or
        ``None`` when ``folder_path`` is not a directory or contains no PDFs.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: {folder_path} is not a valid directory")
        return None

    folder = Path(folder_path)
    # Sorted so that processing order (and therefore the log output and the
    # winner among same-named files) is deterministic across runs.
    pdf_files = sorted(folder.glob("**/*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {folder_path}")
        return None

    print(f"Found {len(pdf_files)} PDF files in {folder_path}")

    results = []
    for pdf_file in pdf_files:
        # Use prefix if provided
        if prefix:
            call_name = f"{prefix} - {pdf_file.stem}"
        else:
            call_name = pdf_file.stem

        result = await process_pdf_file(str(pdf_file), call_name)
        if result:
            results.append(result)

    print(f"Successfully processed {len(results)} out of {len(pdf_files)} files")
    return results


def parse_arguments():
    """Parse the command-line arguments (``folder`` and optional ``--prefix``)."""
    parser = argparse.ArgumentParser(description="Upload folder of PDFs to Qdrant")
    parser.add_argument("folder", type=str, help="Path to folder containing PDF files")
    parser.add_argument("--prefix", type=str, help="Prefix to add to document names (optional)")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_arguments()
    asyncio.run(process_folder(args.folder, args.prefix))

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AnhQuan2004/mcp_agent'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.