#!/usr/bin/env python3
"""
Query Processor for Google Scholar
This module processes natural language queries and converts them into
optimized Google Scholar search queries with proper filters and parameters.
"""
import json
import re
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
class QueryProcessor:
    """Processes natural language queries for Google Scholar searches.

    Turns a free-form request such as "recent highly cited CVPR papers on
    computer vision" into:
      * a structured parameter dict (keywords, venues, year range,
        citation criteria, search type), and
      * an optimized Google Scholar query string.
    """

    # Tokens that carry no search signal: articles, pronouns, auxiliaries,
    # and the request verbs/nouns users wrap around the actual topic
    # ("find", "papers", "research", ...).
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'i', 'me', 'my', 'you', 'your', 'can',
        'could', 'would', 'should', 'will', 'shall', 'may', 'might',
        'must', 'have', 'has', 'had', 'do', 'does', 'did', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'that', 'this', 'these',
        'those', 'them', 'they', 'their', 'there', 'here', 'where',
        'when', 'what', 'how', 'why', 'who', 'which', 'about', 'from',
        'up', 'down', 'out', 'off', 'over', 'under', 'again', 'further',
        'then', 'once', 'find', 'search', 'looking', 'interested',
        'want', 'need', 'get', 'help', 'show', 'give', 'tell', 'papers',
        'research', 'study', 'work', 'article',
    })

    # Two-word technical phrases that should stay together as a single
    # (quoted) keyword instead of being split into individual terms.
    _KNOWN_PHRASES = (
        'machine learning', 'deep learning', 'neural network',
        'computer vision', 'natural language', 'artificial intelligence',
        'data science', 'quantum computing',
    )

    # Whole-token 4-digit year (1900-2099).  The alternation is
    # NON-capturing on purpose: with a capturing group, re.findall would
    # return only the group text '19'/'20' instead of the full year.
    _YEAR_RE = re.compile(r'\b(?:19|20)\d{2}\b')

    # Whole-word recency cues.  Word boundaries prevent false hits such as
    # 'current' inside "recurrent" or 'new' inside "newton".
    _RECENT_RE = re.compile(r'\b(?:recent|latest|new|current)\b')

    # Whole-word author cue ('by') -- a bare substring test would match
    # words like "nearby".
    _BY_RE = re.compile(r'\bby\b')

    def __init__(self):
        # Upper bound used when a query asks for "recent" work.
        self.current_year = datetime.now().year
        # Common academic venues recognized in queries (lowercase acronyms).
        self.venues = {
            'cvpr', 'iccv', 'eccv', 'nips', 'neurips', 'icml', 'iclr', 'aaai',
            'ijcai', 'acl', 'emnlp', 'naacl', 'coling', 'sigir', 'www', 'kdd',
            'icse', 'fse', 'ase', 'chi', 'uist', 'cscw', 'ubicomp', 'iswc',
            'vldb', 'sigmod', 'icde', 'pods', 'osdi', 'sosp', 'nsdi', 'atc'
        }
        # Phrases that signal the user wants heavily-cited work; all map to
        # the same criteria bucket.
        self.citation_keywords = {
            'highly cited': 'highly_cited',
            'influential': 'highly_cited',
            'important': 'highly_cited',
            'popular': 'highly_cited',
            'seminal': 'highly_cited',
            'breakthrough': 'highly_cited'
        }
        # Function-calling schema for LLM integration (tool definition the
        # model would fill in instead of our heuristic extraction).
        self.extraction_function = {
            "name": "extract_search_keywords",
            "description": "Extract optimal keywords and filters for academic paper search",
            "input_schema": {
                "type": "object",
                "properties": {
                    "primary_keywords": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Main search terms (2-4 words max each)"
                    },
                    "venue_filters": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Conference/journal names"
                    },
                    "year_range": {
                        "type": "object",
                        "properties": {
                            "start_year": {"type": "integer"},
                            "end_year": {"type": "integer"}
                        },
                        "description": "Publication year range"
                    },
                    "citation_criteria": {
                        "type": "string",
                        "enum": ["any", "highly_cited", "recent"],
                        "description": "Citation requirements"
                    },
                    "search_type": {
                        "type": "string",
                        "enum": ["general", "author", "venue", "recent", "highly_cited"],
                        "description": "Type of search to perform"
                    },
                    "optimized_query": {
                        "type": "string",
                        "description": "Final optimized search query for Google Scholar"
                    }
                },
                "required": ["primary_keywords", "optimized_query"]
            }
        }

    def process_query(self, query: str) -> Tuple[str, Dict[str, Any]]:
        """Process a natural language query into search parameters.

        Args:
            query: Free-form user request, e.g. "computer vision papers
                from CVPR 2023".

        Returns:
            Tuple ``(optimized_query, search_params)``: the final Google
            Scholar query string, and a dict with 'primary_keywords',
            'venue_filters', 'citation_criteria', 'search_type' and,
            when detected, 'year_range'.
        """
        query_lower = query.lower()
        search_params: Dict[str, Any] = {}

        # Tokenize; drop stop words and very short tokens (<= 2 chars).
        words = re.findall(r'\b\w+\b', query_lower)
        keywords = [w for w in words if w not in self._STOP_WORDS and len(w) > 2]

        # Merge adjacent tokens that form a known technical phrase so the
        # final query can quote them as a unit.
        primary_keywords: List[str] = []
        i = 0
        while i < len(keywords):
            if i < len(keywords) - 1:
                bigram = f"{keywords[i]} {keywords[i + 1]}"
                if any(term in bigram for term in self._KNOWN_PHRASES):
                    primary_keywords.append(bigram)
                    i += 2
                    continue
            primary_keywords.append(keywords[i])
            i += 1
        search_params['primary_keywords'] = primary_keywords[:4]  # cap at 4

        # Venue detection with whole-word matching; a plain substring test
        # false-positives on e.g. 'chi' in "machine" or 'ase' in "database".
        # Iterating a sorted copy keeps the output order deterministic.
        venue_filters = [
            venue.upper()
            for venue in sorted(self.venues)
            if re.search(rf'\b{re.escape(venue)}\b', query_lower)
        ]

        # Explicit years mentioned in the query (e.g. "from 2020 to 2023").
        years = [int(y) for y in self._YEAR_RE.findall(query)]
        if years:
            search_params['year_range'] = {
                'start_year': min(years),
                'end_year': max(years)
            }
        # "Recent"-style requests default to the last two years, but an
        # explicitly stated year range takes precedence over the heuristic.
        elif self._RECENT_RE.search(query_lower):
            search_params['year_range'] = {
                'start_year': self.current_year - 2,
                'end_year': self.current_year
            }

        # First matching citation phrase wins; default is no requirement.
        citation_criteria = 'any'
        for keyword, criteria in self.citation_keywords.items():
            if keyword in query_lower:
                citation_criteria = criteria
                break
        search_params['citation_criteria'] = citation_criteria
        search_params['venue_filters'] = venue_filters

        # Classify the search; checks are ordered most- to least-specific.
        if 'author' in query_lower or self._BY_RE.search(query_lower):
            search_params['search_type'] = 'author'
        elif venue_filters:
            search_params['search_type'] = 'venue'
        elif citation_criteria == 'highly_cited':
            search_params['search_type'] = 'highly_cited'
        elif re.search(r'\brecent\b', query_lower):
            search_params['search_type'] = 'recent'
        else:
            search_params['search_type'] = 'general'

        optimized_query = self._build_optimized_query(search_params)
        return optimized_query, search_params

    def _build_optimized_query(self, search_params: Dict[str, Any]) -> str:
        """Build a Google Scholar query string from extracted parameters.

        Multi-word keywords are wrapped in quotes so Scholar treats them
        as exact phrases; venue names are appended as bare terms.
        """
        keywords = search_params.get('primary_keywords', [])
        venues = search_params.get('venue_filters', [])
        query_parts = []
        for keyword in keywords[:3]:  # keep the query focused: at most 3 terms
            query_parts.append(f'"{keyword}"' if ' ' in keyword else keyword)
        query_parts.extend(venues)
        return ' '.join(query_parts)

    def create_search_strategy(self, search_params: Dict[str, Any]) -> Dict[str, Any]:
        """Create a complete search strategy from extracted parameters.

        Returns:
            Dict with the query string ('primary_search'), the number of
            results to request ('num_results'), optional year 'filters',
            and a 'min_citations' threshold for highly-cited requests.
        """
        strategy: Dict[str, Any] = {
            'primary_search': self._build_optimized_query(search_params),
            'num_results': 15,  # default page size
            'filters': {}
        }
        # Copy whichever year bounds were extracted.
        year_range = search_params.get('year_range', {})
        if 'start_year' in year_range:
            strategy['filters']['start_year'] = year_range['start_year']
        if 'end_year' in year_range:
            strategy['filters']['end_year'] = year_range['end_year']
        # 100 citations is the heuristic threshold for "highly cited".
        if search_params.get('citation_criteria') == 'highly_cited':
            strategy['min_citations'] = 100
        # Adjust result count by search type.
        search_type = search_params.get('search_type', 'general')
        if search_type == 'highly_cited':
            strategy['num_results'] = 20  # over-fetch, then filter by citations
        elif search_type == 'recent':
            strategy['num_results'] = 10
        return strategy
def demo_claude_function_calling():
    """Demonstrate how QueryProcessor would pair with LLM function calling.

    Runs a sample query through QueryProcessor, then prints the extracted
    parameters, the optimized query, and the derived search strategy.
    """
    # NOTE: the original banner strings contained mojibake ('š¤', 'šÆ', ...)
    # from a bad encoding round-trip; restored to readable emoji here.
    print("🤖 Claude Function Calling Demo")
    print("=" * 50)

    # Example user query
    user_query = "I'm interested in computer vision papers from CVPR 2023 that have been highly cited"
    print(f"📝 User Query: '{user_query}'")
    print()

    # Process with our QueryProcessor
    processor = QueryProcessor()
    optimized_query, search_params = processor.process_query(user_query)

    print("🔍 Extracted Parameters:")
    print(f" Keywords: {search_params.get('primary_keywords', [])}")
    print(f" Venues: {search_params.get('venue_filters', [])}")
    print(f" Years: {search_params.get('year_range', {})}")
    print(f" Citation Criteria: {search_params.get('citation_criteria', 'any')}")
    print(f" Search Type: {search_params.get('search_type', 'general')}")
    print()
    print(f"🎯 Optimized Query: '{optimized_query}'")
    print()

    # Create and display the full search strategy
    strategy = processor.create_search_strategy(search_params)
    print("📋 Search Strategy:")
    print(json.dumps(strategy, indent=2))

    print("\n💡 In a real implementation, this would:")
    print("1. Send the query to Claude/OpenAI with the function schema")
    print("2. Get structured extraction results")
    print("3. Use those results to search Google Scholar")
    print("4. Return formatted results to the user")


if __name__ == "__main__":
    demo_claude_function_calling()