#!/usr/bin/env python3
"""
Local HTTP Server for Google Scholar
This creates a simple web interface to test the Google Scholar scraper locally
without needing an MCP client.
"""
import json
import time
from flask import Flask, request, jsonify, render_template_string
from main import GoogleScholarScraper
import logging
# Configure logging at INFO level before anything else is constructed, and
# create the named logger used by every handler in this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("local-server")
# Create the Flask application and a single module-level scraper instance;
# all API endpoints below call scraper.search_papers() on this object.
app = Flask(__name__)
scraper = GoogleScholarScraper()
# QueryProcessor is optional. Start from the "unavailable" state and only
# switch smart search on when both the import and the construction succeed.
query_processor = None
HAS_QUERY_PROCESSOR = False
try:
    from query_processor import QueryProcessor
    query_processor = QueryProcessor()
    HAS_QUERY_PROCESSOR = True
except ImportError:
    logger.warning("QueryProcessor not available. Smart search will be disabled.")
# HTML template for the web interface.
#
# NOTE: this is a regular (non-raw) Python string, so Python processes
# backslash escapes before the browser ever sees the embedded JavaScript.
# Keep the script free of backslashes (use the other quote style instead of
# escaping quotes). The template is also passed through
# render_template_string(), so avoid Jinja delimiters in the markup.
#
# All API/user-supplied values are passed through escapeHtml()/safeUrl()
# before being written to innerHTML.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>Google Scholar Local Server</title>
    <style>
        body { font-family: Arial, sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px; }
        .container { background: #f5f5f5; padding: 20px; border-radius: 8px; margin-bottom: 20px; }
        .form-group { margin-bottom: 15px; }
        label { display: block; font-weight: bold; margin-bottom: 5px; }
        input, select, button { padding: 8px; border: 1px solid #ddd; border-radius: 4px; }
        input[type="text"], select { width: 300px; }
        button { background: #007bff; color: white; cursor: pointer; }
        button:hover { background: #0056b3; }
        .results { background: white; padding: 20px; border-radius: 8px; margin-top: 20px; }
        .paper { border-bottom: 1px solid #eee; padding: 15px 0; }
        .paper:last-child { border-bottom: none; }
        .title { font-weight: bold; color: #007bff; margin-bottom: 5px; }
        .authors { color: #666; margin-bottom: 5px; }
        .snippet { margin-bottom: 5px; }
        .meta { font-size: 0.9em; color: #888; }
        .loading { text-align: center; padding: 20px; color: #666; }
        .error { color: #d33; background: #ffe6e6; padding: 10px; border-radius: 4px; }
    </style>
</head>
<body>
    <h1>🔬 Google Scholar Local Server</h1>
    <div class="container">
        <h2>Search Papers</h2>
        <form id="searchForm">
            <div class="form-group">
                <label>Search Query:</label>
                <input type="text" id="query" placeholder="e.g., machine learning neural networks" required>
            </div>
            <div class="form-group">
                <label>Number of Results:</label>
                <select id="numResults">
                    <option value="5">5</option>
                    <option value="10" selected>10</option>
                    <option value="15">15</option>
                    <option value="20">20</option>
                </select>
            </div>
            <div class="form-group">
                <label>Start Year (optional):</label>
                <input type="number" id="startYear" placeholder="e.g., 2020">
            </div>
            <div class="form-group">
                <label>End Year (optional):</label>
                <input type="number" id="endYear" placeholder="e.g., 2024">
            </div>
            <button type="submit">Search Papers</button>
        </form>
    </div>
    <div class="container">
        <h2>Quick Actions</h2>
        <button onclick="searchAuthor()">Search Author Papers</button>
        <button onclick="searchRecent()">Recent Papers</button>
        <button onclick="searchHighlyCited()">Highly Cited Papers</button>
        <button onclick="smartSearch()">🧠 Smart Search (AI Keywords)</button>
    </div>
    <div id="results"></div>
    <script>
        document.getElementById('searchForm').addEventListener('submit', function(e) {
            e.preventDefault();
            searchPapers();
        });

        // Escape a value so it can be interpolated into innerHTML markup.
        function escapeHtml(value) {
            const entities = {
                '&': '&amp;',
                '<': '&lt;',
                '>': '&gt;',
                '"': '&quot;',
                "'": '&#39;'
            };
            return String(value).replace(/[&<>"']/g, ch => entities[ch]);
        }

        // Return an escaped http(s) URL, or '' when the value is not a safe link.
        function safeUrl(url) {
            try {
                const parsed = new URL(url, window.location.href);
                if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
                    return escapeHtml(parsed.href);
                }
            } catch (err) {
                // Unparseable URL: treat it as absent.
            }
            return '';
        }

        // Fetch a JSON endpoint, then render the results or the reported error.
        // buildHeader (optional) returns extra HTML shown above the results.
        async function runSearch(url, errorPrefix, buildHeader) {
            showLoading();
            try {
                const response = await fetch(url);
                const data = await response.json();
                if (data.error && !data.papers) {
                    showError(data.error);
                    return;
                }
                showResults(data, buildHeader ? buildHeader(data) : '');
            } catch (error) {
                showError(errorPrefix + error.message);
            }
        }

        function searchPapers() {
            const params = new URLSearchParams({
                query: document.getElementById('query').value,
                num_results: document.getElementById('numResults').value
            });
            const startYear = document.getElementById('startYear').value;
            const endYear = document.getElementById('endYear').value;
            if (startYear) params.append('start_year', startYear);
            if (endYear) params.append('end_year', endYear);
            return runSearch(`/api/search?${params}`, 'Network error: ');
        }

        function searchAuthor() {
            const author = prompt('Enter author name:', 'Geoffrey Hinton');
            if (!author) return;
            return runSearch(`/api/author?author_name=${encodeURIComponent(author)}&num_results=10`, 'Error: ');
        }

        function searchRecent() {
            const field = prompt('Enter research field:', 'quantum computing');
            if (!field) return;
            return runSearch(`/api/recent?field=${encodeURIComponent(field)}&years_back=2&num_results=10`, 'Error: ');
        }

        function searchHighlyCited() {
            const topic = prompt('Enter topic:', 'transformer neural networks');
            if (!topic) return;
            return runSearch(`/api/highly_cited?topic=${encodeURIComponent(topic)}&min_citations=100&num_results=10`, 'Error: ');
        }

        function smartSearch() {
            const query = prompt('Enter natural language query:', "I'm interested in computer vision papers from CVPR 2023 that have been highly cited");
            if (!query) return;
            return runSearch(
                `/api/smart_search?query=${encodeURIComponent(query)}&num_results=15`,
                'Error: ',
                data => data.keyword_extraction ? renderKeywordExtraction(query, data.keyword_extraction) : ''
            );
        }

        // Build the keyword-extraction panel shown above smart-search results.
        function renderKeywordExtraction(query, extraction) {
            const keywords = extraction.primary_keywords ? extraction.primary_keywords.join(', ') : 'None';
            const venues = extraction.venue_filters || [];
            const years = extraction.year_range;
            let html = `<div class="results">
                <h2>🧠 AI Keyword Extraction</h2>
                <div style="background: #f0f8ff; padding: 15px; border-radius: 8px; margin-bottom: 20px;">
                    <p><strong>Original Query:</strong> "${escapeHtml(query)}"</p>
                    <p><strong>Extracted Keywords:</strong> ${escapeHtml(keywords)}</p>
                    <p><strong>Optimized Query:</strong> "${escapeHtml(extraction.optimized_query || 'None')}"</p>
                    <p><strong>Search Type:</strong> ${escapeHtml(extraction.search_type || 'general')}</p>
            `;
            if (venues.length > 0) {
                html += `<p><strong>Venues:</strong> ${escapeHtml(venues.join(', '))}</p>`;
            }
            if (years && (years.start_year || years.end_year)) {
                html += `<p><strong>Year Range:</strong> ${escapeHtml(years.start_year || '?')} - ${escapeHtml(years.end_year || '?')}</p>`;
            }
            html += '</div></div>';
            return html;
        }

        function showLoading() {
            document.getElementById('results').innerHTML = '<div class="loading">🔍 Searching Google Scholar...</div>';
        }

        function showError(error) {
            document.getElementById('results').innerHTML = `<div class="error">❌ ${escapeHtml(error)}</div>`;
        }

        // Render the paper list; headerHtml (already-safe markup) is placed above it.
        function showResults(data, headerHtml = '') {
            const papers = data.papers || [];
            const label = data.search_query || data.optimized_query || data.original_query
                || data.author || data.field || data.topic || '';
            let html = headerHtml + `<div class="results">
                <h2>Search Results</h2>
                <p><strong>Query:</strong> ${escapeHtml(label)}</p>
                <p><strong>Total Results:</strong> ${papers.length}</p>
            `;
            if (data.error) {
                // The server can report a problem and still return fallback results.
                html += `<div class="error">❌ ${escapeHtml(data.error)}</div>`;
            }
            if (papers.length === 0) {
                html += '<p>No papers found.</p>';
            } else {
                papers.forEach(paper => {
                    const meta = [];
                    if (paper.year) meta.push(`Year: ${escapeHtml(paper.year)}`);
                    if (paper.cited_by) meta.push(`Citations: ${escapeHtml(paper.cited_by)}`);
                    const paperLink = paper.url ? safeUrl(paper.url) : '';
                    if (paperLink) {
                        meta.push(`<a href="${paperLink}" target="_blank" rel="noopener noreferrer">View Paper</a>`);
                    }
                    const pdfLink = paper.pdf_url ? safeUrl(paper.pdf_url) : '';
                    if (pdfLink) {
                        meta.push(`<a href="${pdfLink}" target="_blank" rel="noopener noreferrer">PDF</a>`);
                    }
                    html += `<div class="paper">
                        <div class="title">${escapeHtml(paper.title || 'No title')}</div>
                        <div class="authors">${escapeHtml(paper.authors || 'Unknown authors')}</div>
                        <div class="snippet">${escapeHtml(paper.snippet || 'No description available')}</div>
                        <div class="meta">${meta.join(' | ')}</div>
                    </div>`;
                });
            }
            html += '</div>';
            document.getElementById('results').innerHTML = html;
        }
    </script>
</body>
</html>
"""
@app.route('/')
def index():
    """Render the single-page search UI from the inline HTML template."""
    page = render_template_string(HTML_TEMPLATE)
    return page
@app.route('/api/search')
def api_search():
    """Search papers by free-text query with optional year bounds.

    Query parameters:
        query: Search text handed to the scraper (defaults to "").
        num_results: Number of papers to request (defaults to 10).
        start_year, end_year: Optional integer year bounds. Missing or
            empty values are normalised to ``None``.

    Returns:
        A JSON response containing the query, the applied filters, the
        result count and the papers. Non-integer numeric parameters give
        HTTP 400; any failure while searching gives HTTP 500. Both error
        responses carry an ``error`` message.
    """
    args = request.args
    query = args.get('query', '')

    # Parse user input separately so malformed values are reported as a
    # client error (400) instead of an internal server error (500).
    try:
        num_results = int(args.get('num_results', 10))
        raw_start = args.get('start_year')
        raw_end = args.get('end_year')
        start_year = int(raw_start) if raw_start else None
        end_year = int(raw_end) if raw_end else None
    except ValueError as e:
        logger.warning("Invalid search parameters: %s", e)
        return jsonify({"error": f"Invalid parameter: {e}"}), 400

    try:
        papers = scraper.search_papers(query, num_results, start_year, end_year)
        return jsonify({
            "search_query": query,
            "filters": {
                "start_year": start_year,
                "end_year": end_year,
            },
            "total_results": len(papers),
            "papers": papers,
        })
    except Exception as e:
        # Top-level boundary for the endpoint: keep the traceback in the log.
        logger.exception("Search error: %s", e)
        return jsonify({"error": str(e)}), 500
@app.route('/api/author')
def api_author():
    """Search papers by author name.

    Query parameters:
        author_name: Name wrapped in the ``author:"..."`` search operator
            (defaults to "").
        num_results: Number of papers to request (defaults to 10).

    Returns:
        A JSON response with the author, the number of papers found and
        the papers. A non-integer ``num_results`` gives HTTP 400; any
        failure while searching gives HTTP 500. Both error responses carry
        an ``error`` message.
    """
    author_name = request.args.get('author_name', '')

    # Malformed input is a client error (400), not a server error (500).
    try:
        num_results = int(request.args.get('num_results', 10))
    except ValueError as e:
        logger.warning("Invalid author search parameters: %s", e)
        return jsonify({"error": f"Invalid parameter: {e}"}), 400

    try:
        query = f'author:"{author_name}"'
        papers = scraper.search_papers(query, num_results)
        return jsonify({
            "author": author_name,
            "total_papers_found": len(papers),
            "papers": papers,
        })
    except Exception as e:
        # Top-level boundary for the endpoint: keep the traceback in the log.
        logger.exception("Author search error: %s", e)
        return jsonify({"error": str(e)}), 500
@app.route('/api/recent')
def api_recent():
    """Search papers in a field published within the last few years.

    Query parameters:
        field: Search text handed to the scraper (defaults to "").
        years_back: How many years before the current local year the range
            starts (defaults to 2). Must be a non-negative integer.
        num_results: Number of papers to request (defaults to 10).

    Returns:
        A JSON response with the field, the ``"start-end"`` year range, the
        result count and the papers. Invalid numeric parameters give
        HTTP 400; any failure while searching gives HTTP 500. Both error
        responses carry an ``error`` message.
    """
    field = request.args.get('field', '')

    # Malformed input is a client error (400), not a server error (500).
    try:
        years_back = int(request.args.get('years_back', 2))
        num_results = int(request.args.get('num_results', 10))
        if years_back < 0:
            # A negative value would produce a start year after the end year.
            raise ValueError("years_back must be non-negative")
    except ValueError as e:
        logger.warning("Invalid recent search parameters: %s", e)
        return jsonify({"error": f"Invalid parameter: {e}"}), 400

    try:
        current_year = time.localtime().tm_year
        start_year = current_year - years_back
        papers = scraper.search_papers(field, num_results, start_year, current_year)
        return jsonify({
            "field": field,
            "time_range": f"{start_year}-{current_year}",
            "total_results": len(papers),
            "papers": papers,
        })
    except Exception as e:
        # Top-level boundary for the endpoint: keep the traceback in the log.
        logger.exception("Recent search error: %s", e)
        return jsonify({"error": str(e)}), 500
@app.route('/api/highly_cited')
def api_highly_cited():
    """Search a topic and keep only papers above a citation threshold.

    Query parameters:
        topic: Search text handed to the scraper (defaults to "").
        min_citations: Minimum ``cited_by`` value a paper must have
            (defaults to 100).
        num_results: Maximum number of papers to return (defaults to 10).

    Returns:
        A JSON response with the topic, the threshold, the number of papers
        kept and the papers themselves. Non-integer numeric parameters give
        HTTP 400; any failure while searching gives HTTP 500. Both error
        responses carry an ``error`` message.
    """
    topic = request.args.get('topic', '')

    # Malformed input is a client error (400), not a server error (500).
    try:
        min_citations = int(request.args.get('min_citations', 100))
        num_results = int(request.args.get('num_results', 10))
    except ValueError as e:
        logger.warning("Invalid highly cited search parameters: %s", e)
        return jsonify({"error": f"Invalid parameter: {e}"}), 400

    try:
        # Over-fetch (2x) because the citation filter discards some results.
        papers = scraper.search_papers(topic, num_results * 2)
        # "or 0" treats a missing *or* None citation count as zero so the
        # comparison cannot raise TypeError.
        highly_cited = [
            p for p in papers
            if (p.get("cited_by") or 0) >= min_citations
        ][:num_results]
        return jsonify({
            "topic": topic,
            "minimum_citations": min_citations,
            "total_highly_cited": len(highly_cited),
            "papers": highly_cited,
        })
    except Exception as e:
        # Top-level boundary for the endpoint: keep the traceback in the log.
        logger.exception("Highly cited search error: %s", e)
        return jsonify({"error": str(e)}), 500
@app.route('/api/smart_search')
def api_smart_search():
    """Run a natural-language search using QueryProcessor keyword extraction.

    Query parameters:
        query: Natural-language request (defaults to "").
        num_results: Number of papers to request (defaults to 15).

    Behaviour:
        When QueryProcessor could not be imported, a plain search on the raw
        query is performed and the response carries ``fallback_used: True``
        together with an explanatory ``error`` message. Otherwise the query
        is processed, the resulting strategy's year filters are applied to
        the search, and papers below the strategy's ``min_citations`` (if
        set) are dropped.

    Returns:
        A JSON response with the original query, result count and papers,
        plus the extraction details when smart search is available. A
        non-integer ``num_results`` gives HTTP 400; any failure while
        processing or searching gives HTTP 500.
    """
    query = request.args.get('query', '')

    # Malformed input is a client error (400), not a server error (500).
    try:
        num_results = int(request.args.get('num_results', 15))
    except ValueError as e:
        logger.warning("Invalid smart search parameters: %s", e)
        return jsonify({"error": f"Invalid parameter: {e}"}), 400

    try:
        if not HAS_QUERY_PROCESSOR:
            # Fall back to a regular search when QueryProcessor is unavailable.
            papers = scraper.search_papers(query, num_results)
            return jsonify({
                "original_query": query,
                "error": "Smart search not available - QueryProcessor not loaded",
                "fallback_used": True,
                "total_results": len(papers),
                "papers": papers,
            })

        # Extract keywords and build the optimised query / search strategy.
        optimized_query, search_params = query_processor.process_query(query)
        strategy = query_processor.create_search_strategy(search_params)

        # "or {}" tolerates a strategy whose filters entry is present but None.
        filters = strategy.get('filters') or {}
        papers = scraper.search_papers(
            optimized_query,
            num_results,
            filters.get('start_year'),
            filters.get('end_year'),
        )

        # Apply the citation threshold, treating a missing or None citation
        # count as zero so the comparison cannot raise TypeError.
        min_citations = strategy.get('min_citations')
        if min_citations:
            papers = [
                p for p in papers
                if (p.get('cited_by') or 0) >= min_citations
            ]

        return jsonify({
            "original_query": query,
            "keyword_extraction": search_params,
            "search_strategy": strategy,
            "optimized_query": optimized_query,
            "total_results": len(papers),
            "papers": papers,
        })
    except Exception as e:
        # Top-level boundary for the endpoint: keep the traceback in the log.
        logger.exception("Smart search error: %s", e)
        return jsonify({"error": str(e)}), 500
if __name__ == '__main__':
    # Print the startup banner, then serve on the loopback interface only
    # with Flask debug mode switched on.
    startup_banner = (
        "🚀 Starting Google Scholar Local Server...",
        "📡 Open your browser to: http://localhost:5000",
        "Press Ctrl+C to stop\n",
    )
    for banner_line in startup_banner:
        print(banner_line)
    app.run(host='127.0.0.1', port=5000, debug=True)