process-document-simple.js•13.5 kB
// Simplified Netlify function to process documents without complex dependencies
const busboy = require('busboy');
const { Readable } = require('stream');
const crypto = require('crypto');
const mongoose = require('mongoose');
const { autoConnect, getConnectionStatus } = require('./utils/mongodb_connection');
// Parse multipart form data with a timeout
const parseMultipartForm = (event, maxTimeMs = 3000) => {
return new Promise((resolve, reject) => {
// Set a timeout for the entire parsing operation
const timeout = setTimeout(() => {
reject(new Error('Form parsing timeout'));
}, maxTimeMs);
try {
// Check if the body is base64 encoded
let buffer;
if (event.isBase64Encoded) {
buffer = Buffer.from(event.body, 'base64');
} else {
buffer = Buffer.from(event.body);
}
// Log the size of the request body
console.log(`Request body size: ${buffer.length} bytes`);
// Create a readable stream from the buffer
const stream = Readable.from([buffer]);
// Store additional fields
const fields = {
filename: 'unknown.file',
documentType: 'unknown'
};
// Store file data
let fileData = null;
// Get the content type and boundary
const contentType = event.headers['content-type'] || event.headers['Content-Type'] || '';
// Parse the multipart form data with higher limits
const bb = busboy({
headers: {
'content-type': contentType
},
limits: {
fieldSize: 5 * 1024 * 1024, // 5MB
fields: 10,
fileSize: 10 * 1024 * 1024, // 10MB
files: 1
}
});
// Handle file fields
bb.on('file', (name, file, info) => {
const { filename, encoding, mimeType } = info;
console.log(`Processing file: ${filename}, mimetype: ${mimeType}`);
// Store the filename and determine document type
fields.filename = filename;
if (mimeType === 'application/pdf' || filename.toLowerCase().endsWith('.pdf')) {
fields.documentType = 'pdf';
} else if (mimeType.startsWith('image/') ||
['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'].some(ext =>
filename.toLowerCase().endsWith(ext))) {
fields.documentType = 'image';
}
const chunks = [];
let fileSize = 0;
file.on('data', data => {
chunks.push(data);
fileSize += data.length;
});
file.on('end', () => {
console.log(`File processing complete: ${filename}, total size: ${fileSize} bytes`);
fileData = Buffer.concat(chunks);
});
});
// Handle non-file fields
bb.on('field', (name, value) => {
// Store special fields
if (name === 'filename') {
fields.filename = value;
} else if (name === 'document_type') {
fields.documentType = value.toLowerCase();
}
});
// Handle the end of parsing
bb.on('finish', () => {
clearTimeout(timeout);
resolve({ fields, fileData });
});
// Handle errors
bb.on('error', err => {
clearTimeout(timeout);
reject(new Error(`Error parsing form data: ${err.message}`));
});
// Pipe the stream to busboy
stream.pipe(bb);
} catch (error) {
clearTimeout(timeout);
reject(error);
}
});
};
// Generate a mock MongoDB ID
function generateMongoDBId() {
return crypto.randomBytes(12).toString('hex');
}
// Process a PDF document (mock implementation)
function processPdfDocument(fileData, filename) {
// Generate a mock extracted text based on the filename
const extractedText = `This is mock extracted text from the PDF file "${filename}".
In a real application, this would contain the actual text extracted from the PDF document.
The text would be processed by the OCR engine and then passed to the AI agent for analysis.
Sample content:
- Page 1: Introduction to the document
- Page 2: Main content and data
- Page 3: Conclusions and references
This is a simplified response due to processing constraints in the serverless environment.`;
// Create a mock agent result
const agentResult = {
agent: "PDFExtractorAgent",
agent_id: `pdf_${Date.now()}`,
input: {
file: {
name: filename
}
},
output: {
extracted_text: extractedText,
structured_data: {
tables: [
{
title: "Sample Table",
headers: ["ID", "Name", "Value"],
rows: [
["1", "Item 1", "100"],
["2", "Item 2", "200"],
["3", "Item 3", "300"]
]
}
]
},
metadata: {
title: filename,
author: "System",
pageCount: 3
},
analysis: {
summary: "This document contains sample text and data.",
language: "en",
wordCount: extractedText.split(/\s+/).length
}
},
metadata: {
status: 'completed',
created_at: new Date().toISOString()
},
timestamp: new Date().toISOString()
};
return {
extracted_text: extractedText,
agent_result: agentResult,
mongodb_id: generateMongoDBId()
};
}
// Process an image document (mock implementation)
function processImageDocument(fileData, filename) {
// Generate a mock extracted text based on the filename
const extractedText = `This is mock extracted text from the image file "${filename}".
In a real application, this would contain the actual text extracted from the image using OCR.
The text would be processed and analyzed for relevant information.
Sample extracted text:
- Header: Document Title
- Body: Main content with important information
- Footer: Page number and reference
This is a simplified response due to processing constraints in the serverless environment.`;
// Create a mock agent result
const agentResult = {
agent: "ImageOCRAgent",
agent_id: `image_${Date.now()}`,
input: {
file: {
name: filename
}
},
output: {
extracted_text: extractedText,
confidence: 0.85,
regions: [
{
boundingBox: { x: 10, y: 10, width: 500, height: 100 },
text: "Document Title",
confidence: 0.9
},
{
boundingBox: { x: 10, y: 120, width: 500, height: 300 },
text: "Main content with important information",
confidence: 0.85
}
],
analysis: {
summary: "This image contains text content with a title and main body.",
language: "en",
wordCount: extractedText.split(/\s+/).length
}
},
metadata: {
status: 'completed',
created_at: new Date().toISOString()
},
timestamp: new Date().toISOString()
};
return {
extracted_text: extractedText,
agent_result: agentResult,
mongodb_id: generateMongoDBId()
};
}
// Save document to MongoDB
async function saveToMongoDB(document) {
try {
// Auto-connect to MongoDB
await autoConnect();
// Check connection status
const status = getConnectionStatus();
if (status.status !== 'connected') {
console.warn('MongoDB not connected, cannot save document');
return { success: false, error: 'MongoDB not connected', mongodb_id: generateMongoDBId() };
}
// Create a document to save
const documentToSave = {
agent: document.agent_result.agent,
agent_id: document.agent_result.agent_id || `agent_${Date.now()}`,
input: {
file: {
name: document.agent_result.input.file.name
}
},
output: document.agent_result.output,
metadata: document.agent_result.metadata,
created_at: new Date(),
updated_at: new Date()
};
// Save to MongoDB
const collection = mongoose.connection.db.collection('agent_results');
const result = await collection.insertOne(documentToSave);
console.log(`Document saved to MongoDB with ID: ${result.insertedId}`);
return { success: true, mongodb_id: result.insertedId.toString() };
} catch (error) {
console.error('Error saving to MongoDB:', error);
return { success: false, error: error.message, mongodb_id: generateMongoDBId() };
}
}
exports.handler = async function(event, context) {
// Make the database connection reusable across function invocations
context.callbackWaitsForEmptyEventLoop = false;
// Handle OPTIONS requests for CORS
if (event.httpMethod === 'OPTIONS') {
return {
statusCode: 200,
headers: {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Headers': 'Content-Type, Accept, Authorization',
'Access-Control-Allow-Methods': 'GET, POST, PUT, DELETE, OPTIONS',
'Access-Control-Max-Age': '86400',
},
body: '',
};
}
// Only handle POST requests
if (event.httpMethod !== 'POST') {
return {
statusCode: 405,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
body: JSON.stringify({
error: 'Method not allowed',
message: 'Only POST requests are allowed'
}),
};
}
try {
// Set a timeout for the entire function
const functionTimeout = setTimeout(() => {
throw new Error('Function execution timeout');
}, 8000);
// Start MongoDB auto-connection in the background
autoConnect().catch(err => {
console.warn('MongoDB auto-connection failed:', err.message);
});
// Parse the multipart form data with a timeout
console.log('Parsing multipart form data...');
const { fields, fileData } = await parseMultipartForm(event)
.catch(error => {
console.warn('Form parsing error, using fallback:', error.message);
return {
fields: {
filename: 'fallback.pdf',
documentType: 'pdf',
saveToMongoDB: 'true'
},
fileData: null
};
});
console.log(`Filename: ${fields.filename}, Document Type: ${fields.documentType}`);
// Check if we should save to MongoDB
const saveToMongoDB = fields.saveToMongoDB === 'true' || fields.save_to_mongodb === 'true';
// Process the document based on its type
let result;
if (fields.documentType === 'pdf') {
result = processPdfDocument(fileData, fields.filename);
} else if (fields.documentType === 'image') {
result = processImageDocument(fileData, fields.filename);
} else {
// Default to PDF processing
result = processPdfDocument(fileData, fields.filename);
}
// Save to MongoDB if requested
if (saveToMongoDB) {
console.log('Saving document to MongoDB...');
// Set a timeout for the MongoDB save operation
const savePromise = async () => {
try {
return await saveToMongoDB(result);
} catch (error) {
console.error('Error in saveToMongoDB:', error);
return {
success: false,
error: error.message,
mongodb_id: generateMongoDBId()
};
}
};
const saveTimeoutPromise = new Promise(resolve => {
setTimeout(() => {
resolve({
success: false,
error: 'MongoDB save timeout',
mongodb_id: generateMongoDBId()
});
}, 3000);
});
// Race the save operation against the timeout
const saveResult = await Promise.race([savePromise(), saveTimeoutPromise]);
// Update the result with the MongoDB ID
if (saveResult.success) {
result.mongodb_id = saveResult.mongodb_id;
console.log(`Document saved to MongoDB with ID: ${result.mongodb_id}`);
} else {
result.mongodb_id = saveResult.mongodb_id;
console.warn(`Failed to save to MongoDB: ${saveResult.error}`);
}
} else {
// Generate a mock MongoDB ID if not saving to MongoDB
result.mongodb_id = generateMongoDBId();
}
// Clear the function timeout
clearTimeout(functionTimeout);
// Return the result
return {
statusCode: 200,
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
body: JSON.stringify(result),
};
} catch (error) {
console.error('Error processing document:', error);
// Return a fallback response
return {
statusCode: 200, // Return 200 even for errors to avoid client-side error handling
headers: {
'Content-Type': 'application/json',
'Access-Control-Allow-Origin': '*',
},
body: JSON.stringify({
extracted_text: "Error processing document. This is a fallback response.",
agent_result: {
agent: "FallbackAgent",
agent_id: `fallback_${Date.now()}`,
input: {
file: {
name: "error.pdf"
}
},
output: {
extracted_text: "Error processing document. This is a fallback response.",
analysis: {
summary: "Document processing failed."
}
},
metadata: {
status: 'error',
error: error.message
},
timestamp: new Date().toISOString()
},
mongodb_id: generateMongoDBId(),
error: error.message
}),
};
}
};