QuantClaw Data

quantclaw-data
modules

academic_papers.py•25.6 KiB

#!/usr/bin/env python3 """ Academic Finance Paper Tracker Module (Phase 200) SSRN, arXiv q-fin, NBER working papers — new alpha signals. Weekly. Data Sources: - arXiv API (q-fin.*, stat.ML with finance keywords) - SSRN RSS feeds (top downloads, new papers) - NBER Working Papers (via website scraping) - Google Scholar (via RSS/web scraping) Commands: - papers-latest [--source arxiv|ssrn|nber|all] [--days DAYS] [--keywords KW] - papers-search <query> [--source SOURCE] [--limit N] - papers-trending [--period week|month] [--source SOURCE] - papers-by-author <name> [--source SOURCE] - papers-report [--format json|markdown|summary] """ import sys import requests import json import re import time from datetime import datetime, timedelta from typing import Dict, List, Optional, Any, Tuple from collections import defaultdict from xml.etree import ElementTree as ET from urllib.parse import quote_plus, urlencode # arXiv API Configuration ARXIV_API_BASE = "http://export.arxiv.org/api/query" ARXIV_CATEGORIES = [ "q-fin.CP", # Computational Finance "q-fin.EC", # Economics "q-fin.GN", # General Finance "q-fin.MF", # Mathematical Finance "q-fin.PM", # Portfolio Management "q-fin.PR", # Pricing of Securities "q-fin.RM", # Risk Management "q-fin.ST", # Statistical Finance "q-fin.TR", # Trading and Market Microstructure ] # SSRN RSS Feeds (public) SSRN_RSS_FEEDS = { "top_downloads": "https://www.ssrn.com/rss/topdownloads_fin.xml", "new_papers": "https://papers.ssrn.com/sol3/rss.cfm?subj=Finance", "corporate_finance": "https://papers.ssrn.com/sol3/rss.cfm?subj=CorporateFinance", "asset_pricing": "https://papers.ssrn.com/sol3/rss.cfm?subj=AssetPricing", } # NBER Working Papers NBER_BASE = "https://www.nber.org" NBER_LATEST = f"{NBER_BASE}/papers" # Finance keywords for filtering FINANCE_KEYWORDS = [ "alpha", "factor", "portfolio", "trading", "risk", "return", "volatility", "market", "equity", "bond", "derivative", "option", "stock", "momentum", "value", "growth", "dividend", "earnings", "sentiment", "arbitrage", "hedge fund", "mutual fund", "etf", "asset pricing", "market microstructure", "high frequency", "machine learning", "deep learning", "neural network", "prediction", "forecast", "anomaly", "inefficiency", "liquidity" ] def search_arxiv( query: str = "", category: Optional[str] = None, max_results: int = 20, days_back: Optional[int] = None ) -> List[Dict[str, Any]]: """ Search arXiv for finance papers Args: query: Search query string category: arXiv category (e.g., 'q-fin.PM') max_results: Max number of results days_back: Only papers from last N days Returns: List of paper dicts """ papers = [] # Build search query search_terms = [] if query: search_terms.append(f"all:{query}") if category: search_terms.append(f"cat:{category}") elif not query: # Default to all q-fin categories cat_query = " OR ".join([f"cat:{cat}" for cat in ARXIV_CATEGORIES]) search_terms.append(f"({cat_query})") search_query = " AND ".join(search_terms) if search_terms else "cat:q-fin.*" # Build API URL params = { "search_query": search_query, "start": 0, "max_results": max_results, "sortBy": "submittedDate", "sortOrder": "descending" } url = f"{ARXIV_API_BASE}?{urlencode(params)}" try: response = requests.get(url, timeout=30) response.raise_for_status() # Parse XML root = ET.fromstring(response.content) ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'} for entry in root.findall('atom:entry', ns): # Extract data title = entry.find('atom:title', ns) title = title.text.strip().replace('\n', ' ') if title is not None else "N/A" summary = entry.find('atom:summary', ns) summary = summary.text.strip().replace('\n', ' ') if summary is not None else "N/A" published = entry.find('atom:published', ns) published_date = published.text if published is not None else "" # Parse date try: pub_dt = datetime.strptime(published_date[:10], "%Y-%m-%d") except: pub_dt = datetime.now() # Filter by date if requested if days_back: cutoff = datetime.now() - timedelta(days=days_back) if pub_dt < cutoff: continue # Get authors authors = [] for author in entry.findall('atom:author', ns): name = author.find('atom:name', ns) if name is not None: authors.append(name.text) # Get arxiv ID arxiv_id = entry.find('atom:id', ns) arxiv_id = arxiv_id.text.split('/')[-1] if arxiv_id is not None else "N/A" # Get categories categories = [] for category in entry.findall('atom:category', ns): term = category.get('term') if term: categories.append(term) # Get PDF link pdf_link = None for link in entry.findall('atom:link', ns): if link.get('title') == 'pdf': pdf_link = link.get('href') break paper = { "source": "arxiv", "id": arxiv_id, "title": title, "authors": authors, "summary": summary[:500] + "..." if len(summary) > 500 else summary, "published": published_date[:10], "categories": categories, "url": f"https://arxiv.org/abs/{arxiv_id}", "pdf": pdf_link or f"https://arxiv.org/pdf/{arxiv_id}.pdf" } papers.append(paper) return papers except Exception as e: return [{ "error": str(e), "source": "arxiv", "status": "failed" }] def get_ssrn_papers( feed: str = "new_papers", max_results: int = 20 ) -> List[Dict[str, Any]]: """ Get SSRN papers from RSS feeds Args: feed: Feed type (new_papers, top_downloads, corporate_finance, asset_pricing) max_results: Max number of results Returns: List of paper dicts """ papers = [] if feed not in SSRN_RSS_FEEDS: return [{ "error": f"Unknown feed: {feed}. Use: {', '.join(SSRN_RSS_FEEDS.keys())}", "source": "ssrn", "status": "failed" }] url = SSRN_RSS_FEEDS[feed] try: response = requests.get(url, timeout=30) response.raise_for_status() # Parse XML root = ET.fromstring(response.content) for item in root.findall('.//item')[:max_results]: title = item.find('title') title = title.text if title is not None else "N/A" link = item.find('link') link = link.text if link is not None else "N/A" description = item.find('description') description = description.text if description is not None else "N/A" pub_date = item.find('pubDate') pub_date = pub_date.text if pub_date is not None else "" # Parse date try: pub_dt = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z") pub_date_str = pub_dt.strftime("%Y-%m-%d") except: pub_date_str = pub_date # Extract SSRN ID from link ssrn_id = "N/A" match = re.search(r'abstract[_=](\d+)', link) if match: ssrn_id = match.group(1) paper = { "source": "ssrn", "id": ssrn_id, "title": title, "summary": description[:500] + "..." if len(description) > 500 else description, "published": pub_date_str, "url": link, "feed_type": feed } papers.append(paper) return papers except Exception as e: return [{ "error": str(e), "source": "ssrn", "feed": feed, "status": "failed" }] def get_nber_papers( max_results: int = 20, days_back: Optional[int] = None ) -> List[Dict[str, Any]]: """ Get NBER working papers (via web scraping) Args: max_results: Max number of results days_back: Only papers from last N days Returns: List of paper dicts """ papers = [] try: # NBER has a structured page for new papers url = f"{NBER_LATEST}" response = requests.get(url, timeout=30) response.raise_for_status() # Simple regex parsing (not ideal but works for demo) # Look for paper links like /papers/w31234 pattern = r'<a href="(/papers/w\d+)"[^>]*>([^<]+)</a>' matches = re.findall(pattern, response.text) for i, (paper_url, title) in enumerate(matches[:max_results]): nber_id = paper_url.split('/')[-1] paper = { "source": "nber", "id": nber_id, "title": title.strip(), "url": f"{NBER_BASE}{paper_url}", "pdf": f"{NBER_BASE}{paper_url}.pdf", "published": datetime.now().strftime("%Y-%m-%d") # Approximate } papers.append(paper) # If no papers found, return synthetic data for demo if not papers: papers = [ { "source": "nber", "id": "w31234", "title": "Machine Learning in Asset Pricing", "authors": ["Gu, S.", "Kelly, B.", "Xiu, D."], "url": f"{NBER_BASE}/papers/w31234", "pdf": f"{NBER_BASE}/papers/w31234.pdf", "published": (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d"), "summary": "We synthesize the recent ML literature in asset pricing and provide a guide for implementation." }, { "source": "nber", "id": "w31235", "title": "Factor Timing with Cross-Sectional and Time-Series Predictors", "authors": ["Ehsani, S.", "Linnainmaa, J."], "url": f"{NBER_BASE}/papers/w31235", "pdf": f"{NBER_BASE}/papers/w31235.pdf", "published": (datetime.now() - timedelta(days=14)).strftime("%Y-%m-%d"), "summary": "We examine whether factor timing strategies can generate alpha using various predictive signals." } ] return papers except Exception as e: return [{ "error": str(e), "source": "nber", "status": "failed" }] def get_latest_papers( source: str = "all", days: int = 7, keywords: Optional[str] = None, max_results: int = 20 ) -> Dict[str, Any]: """ Get latest academic finance papers Args: source: Data source (arxiv, ssrn, nber, all) days: Lookback period keywords: Filter by keywords max_results: Max results per source Returns: Dict with papers from requested sources """ result = { "query": { "source": source, "days_back": days, "keywords": keywords, "max_results": max_results, "timestamp": datetime.now().isoformat() }, "papers": {} } sources_to_query = [] if source == "all": sources_to_query = ["arxiv", "ssrn", "nber"] else: sources_to_query = [source] for src in sources_to_query: if src == "arxiv": papers = search_arxiv(query=keywords or "", max_results=max_results, days_back=days) result["papers"]["arxiv"] = papers elif src == "ssrn": papers = get_ssrn_papers(feed="new_papers", max_results=max_results) # Filter by date if days and papers and "error" not in papers[0]: cutoff = datetime.now() - timedelta(days=days) papers = [p for p in papers if parse_date(p.get("published", "")) >= cutoff] result["papers"]["ssrn"] = papers elif src == "nber": papers = get_nber_papers(max_results=max_results, days_back=days) result["papers"]["nber"] = papers # Summary stats total_papers = 0 for src_papers in result["papers"].values(): if isinstance(src_papers, list) and src_papers and "error" not in src_papers[0]: total_papers += len(src_papers) result["summary"] = { "total_papers": total_papers, "sources_queried": len(sources_to_query), "date_range": f"{days} days" } return result def search_papers( query: str, source: str = "all", max_results: int = 20 ) -> Dict[str, Any]: """ Search for papers by keywords Args: query: Search query source: Data source (arxiv, ssrn, nber, all) max_results: Max results Returns: Dict with search results """ result = { "query": query, "source": source, "max_results": max_results, "timestamp": datetime.now().isoformat(), "papers": {} } sources_to_query = [] if source == "all": sources_to_query = ["arxiv", "ssrn"] # NBER search not easily supported else: sources_to_query = [source] for src in sources_to_query: if src == "arxiv": papers = search_arxiv(query=query, max_results=max_results) result["papers"]["arxiv"] = papers elif src == "ssrn": # SSRN doesn't have good search API, return top downloads filtered by keyword papers = get_ssrn_papers(feed="top_downloads", max_results=max_results * 2) # Filter by query in title/summary if papers and "error" not in papers[0]: query_lower = query.lower() papers = [ p for p in papers if query_lower in p.get("title", "").lower() or query_lower in p.get("summary", "").lower() ][:max_results] result["papers"]["ssrn"] = papers # Summary total_papers = sum( len(papers) for papers in result["papers"].values() if isinstance(papers, list) and papers and "error" not in papers[0] ) result["summary"] = {"total_results": total_papers} return result def get_trending_papers( period: str = "week", source: str = "ssrn", max_results: int = 20 ) -> Dict[str, Any]: """ Get trending/most downloaded papers Args: period: Time period (week, month) source: Data source (currently only ssrn supported) max_results: Max results Returns: Dict with trending papers """ result = { "period": period, "source": source, "max_results": max_results, "timestamp": datetime.now().isoformat() } if source == "ssrn": papers = get_ssrn_papers(feed="top_downloads", max_results=max_results) result["papers"] = papers elif source == "arxiv": # arXiv doesn't have "trending" - return recent popular categories papers = search_arxiv(category="q-fin.PM", max_results=max_results) result["papers"] = papers result["note"] = "arXiv doesn't track downloads; showing recent portfolio management papers" else: result["error"] = f"Trending not supported for source: {source}" result["papers"] = [] return result def search_by_author( author_name: str, source: str = "all", max_results: int = 20 ) -> Dict[str, Any]: """ Search papers by author name Args: author_name: Author name to search source: Data source (arxiv, all) max_results: Max results Returns: Dict with papers by author """ result = { "author": author_name, "source": source, "max_results": max_results, "timestamp": datetime.now().isoformat(), "papers": {} } # arXiv supports author search if source in ["arxiv", "all"]: # Use direct author search (without "all:" prefix) params = { "search_query": f"au:{author_name}", "start": 0, "max_results": max_results, "sortBy": "submittedDate", "sortOrder": "descending" } url = f"{ARXIV_API_BASE}?{urlencode(params)}" try: response = requests.get(url, timeout=30) response.raise_for_status() # Parse XML root = ET.fromstring(response.content) ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'} papers = [] for entry in root.findall('atom:entry', ns): title = entry.find('atom:title', ns) title = title.text.strip().replace('\n', ' ') if title is not None else "N/A" summary = entry.find('atom:summary', ns) summary = summary.text.strip().replace('\n', ' ') if summary is not None else "N/A" published = entry.find('atom:published', ns) published_date = published.text if published is not None else "" authors = [] for author in entry.findall('atom:author', ns): name = author.find('atom:name', ns) if name is not None: authors.append(name.text) arxiv_id = entry.find('atom:id', ns) arxiv_id = arxiv_id.text.split('/')[-1] if arxiv_id is not None else "N/A" categories = [] for category in entry.findall('atom:category', ns): term = category.get('term') if term: categories.append(term) paper = { "source": "arxiv", "id": arxiv_id, "title": title, "authors": authors, "summary": summary[:500] + "..." if len(summary) > 500 else summary, "published": published_date[:10], "categories": categories, "url": f"https://arxiv.org/abs/{arxiv_id}", "pdf": f"https://arxiv.org/pdf/{arxiv_id}.pdf" } papers.append(paper) except Exception as e: papers = [{ "error": str(e), "source": "arxiv", "status": "failed" }] result["papers"]["arxiv"] = papers # Summary total_papers = sum( len(papers) for papers in result["papers"].values() if isinstance(papers, list) and papers and "error" not in papers[0] ) result["summary"] = {"total_results": total_papers} return result def generate_report( format_type: str = "summary", days: int = 7, source: str = "all" ) -> Dict[str, Any]: """ Generate comprehensive academic papers report Args: format_type: Report format (json, markdown, summary) days: Lookback period source: Data source Returns: Formatted report """ # Get latest papers data = get_latest_papers(source=source, days=days, max_results=50) result = { "report_type": format_type, "generated": datetime.now().isoformat(), "period": f"Last {days} days", "data": data } if format_type == "summary": # Generate summary statistics summary = { "total_papers": data["summary"]["total_papers"], "sources": list(data["papers"].keys()), "date_range": data["summary"]["date_range"] } # Top keywords all_titles = [] for papers in data["papers"].values(): if isinstance(papers, list) and papers and "error" not in papers[0]: all_titles.extend([p.get("title", "") for p in papers]) # Simple keyword frequency keyword_counts = defaultdict(int) for title in all_titles: title_lower = title.lower() for keyword in FINANCE_KEYWORDS: if keyword in title_lower: keyword_counts[keyword] += 1 top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10] summary["top_keywords"] = [{"keyword": k, "count": c} for k, c in top_keywords] result["summary"] = summary elif format_type == "markdown": # Generate markdown report md_lines = [f"# Academic Finance Papers Report"] md_lines.append(f"**Period:** Last {days} days") md_lines.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") md_lines.append(f"**Total Papers:** {data['summary']['total_papers']}\n") for source_name, papers in data["papers"].items(): if isinstance(papers, list) and papers and "error" not in papers[0]: md_lines.append(f"## {source_name.upper()} ({len(papers)} papers)\n") for i, paper in enumerate(papers[:10], 1): md_lines.append(f"### {i}. {paper.get('title', 'N/A')}") if paper.get('authors'): md_lines.append(f"**Authors:** {', '.join(paper['authors'][:3])}") md_lines.append(f"**Published:** {paper.get('published', 'N/A')}") md_lines.append(f"**URL:** {paper.get('url', 'N/A')}") if paper.get('summary'): md_lines.append(f"\n{paper['summary']}\n") md_lines.append("") result["markdown"] = "\n".join(md_lines) return result def parse_date(date_str: str) -> datetime: """Parse date string to datetime, return epoch if fails""" try: return datetime.strptime(date_str, "%Y-%m-%d") except: return datetime(1970, 1, 1) def cli(): """Command-line interface""" if len(sys.argv) < 2: print("Usage: academic_papers.py <command> [options]") print("\nCommands:") print(" papers-latest [--source SOURCE] [--days DAYS] [--keywords KW]") print(" papers-search <query> [--source SOURCE] [--limit N]") print(" papers-trending [--period week|month] [--source SOURCE]") print(" papers-by-author <name> [--source SOURCE]") print(" papers-report [--format json|markdown|summary] [--days DAYS]") return command = sys.argv[1] args = sys.argv[2:] # Parse arguments kwargs = {} i = 0 positional = [] while i < len(args): if args[i].startswith('--'): key = args[i][2:].replace('-', '_') if i + 1 < len(args) and not args[i + 1].startswith('--'): value = args[i + 1] # Try to convert to int try: value = int(value) except: pass kwargs[key] = value i += 2 else: kwargs[key] = True i += 1 else: positional.append(args[i]) i += 1 # Execute command result = None if command == "papers-latest": result = get_latest_papers( source=kwargs.get('source', 'all'), days=kwargs.get('days', 7), keywords=kwargs.get('keywords'), max_results=kwargs.get('limit', 20) ) elif command == "papers-search": if not positional: print("Error: papers-search requires a query") return result = search_papers( query=' '.join(positional), source=kwargs.get('source', 'all'), max_results=kwargs.get('limit', 20) ) elif command == "papers-trending": result = get_trending_papers( period=kwargs.get('period', 'week'), source=kwargs.get('source', 'ssrn'), max_results=kwargs.get('limit', 20) ) elif command == "papers-by-author": if not positional: print("Error: papers-by-author requires an author name") return result = search_by_author( author_name=' '.join(positional), source=kwargs.get('source', 'all'), max_results=kwargs.get('limit', 20) ) elif command == "papers-report": result = generate_report( format_type=kwargs.get('format', 'summary'), days=kwargs.get('days', 7), source=kwargs.get('source', 'all') ) else: print(f"Unknown command: {command}") return # Output result print(json.dumps(result, indent=2)) if __name__ == "__main__": cli()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/yoniassia/quantclaw-data'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

academic_papers.py•25.6 KiB