"""GitHub client for searching AI/ML repositories."""
import os
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Optional
from github import Github, GithubException
class GithubClient:
"""Client for searching GitHub repositories."""
# Keywords by AI research area
AI_KEYWORDS = {
"core_ai": ["LLM", "transformer", "diffusion", "GPT", "neural network", "deep learning"],
"multimodal": ["CLIP", "stable diffusion", "text-to-image", "vision-language"],
"applications": ["RAG", "AI agent", "langchain", "prompt engineering", "RLHF"],
"infrastructure": ["pytorch", "tensorflow", "vLLM", "model optimization"],
"robotics": ["robotics", "robot learning", "embodied AI", "manipulation", "navigation"],
"bioinfo": ["bioinformatics", "protein folding", "alphafold", "drug discovery", "genomics"],
"science": ["AI4Science", "scientific machine learning", "physics-informed neural networks"],
"rl": ["reinforcement learning", "multi-agent", "game AI", "AlphaGo"],
"graph": ["graph neural network", "GNN", "molecular modeling"],
"recsys": ["recommender systems", "personalization"],
"timeseries": ["time series forecasting", "anomaly detection"],
"emerging": ["federated learning", "neuromorphic computing", "quantum machine learning"],
}
# Common topics for AI repositories
AI_TOPICS = [
"llm", "gpt", "transformer", "deep-learning", "machine-learning",
"computer-vision", "nlp", "reinforcement-learning", "generative-ai",
"diffusion", "pytorch", "tensorflow", "huggingface", "ai-agent",
"robotics", "bioinformatics", "graph-neural-network",
]
def __init__(self, token: Optional[str] = None):
"""Initialize GitHub client.
Args:
token: GitHub personal access token (optional but recommended)
"""
token = token or os.getenv("GITHUB_TOKEN")
self.github = Github(token) if token else Github()
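        # Note: unauthenticated clients are subject to much stricter GitHub rate
        # limits (especially for the search API), so passing a token or setting
        # GITHUB_TOKEN is strongly recommended for real use.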
def search_repositories(
self,
keywords: Optional[List[str]] = None,
topics: Optional[List[str]] = None,
min_stars: int = 50,
language: Optional[str] = None,
created_since: Optional[str] = None,
pushed_since: Optional[str] = None,
sort_by: str = "stars",
max_results: int = 50,
) -> List[Dict]:
"""Search for repositories.
Args:
keywords: Keywords to search in name/description/README
topics: Repository topics to filter by
min_stars: Minimum number of stars
language: Programming language filter
created_since: Created since (e.g., '7d', '1w', '1m')
pushed_since: Last pushed since (e.g., '7d', '1w', '1m')
sort_by: Sort by 'stars', 'forks', 'updated', or 'best-match'
max_results: Maximum number of results
Returns:
List of repository dictionaries
"""
# Build query
query_parts = []
        # Add keywords; quote multi-word phrases so GitHub treats each one as a
        # single search term rather than independent words
        if keywords:
            keyword_query = " ".join(f'"{kw}"' if " " in kw else kw for kw in keywords)
            query_parts.append(keyword_query)
# Add topics
if topics:
for topic in topics:
query_parts.append(f"topic:{topic}")
# Add stars filter
query_parts.append(f"stars:>={min_stars}")
# Add language filter
if language:
query_parts.append(f"language:{language}")
# Add date filters
if created_since:
date = self._parse_relative_date(created_since)
query_parts.append(f"created:>={date}")
if pushed_since:
date = self._parse_relative_date(pushed_since)
query_parts.append(f"pushed:>={date}")
query = " ".join(query_parts)
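        # At this point the query is one search-qualifier string, e.g. (illustrative
        # values only):
        #   'LLM "AI agent" topic:llm stars:>=50 language:Python pushed:>=2024-01-01'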
        # Execute the search. GitHub only accepts 'stars', 'forks', or 'updated'
        # as explicit sort values; 'best-match' is the API default ordering, so
        # we simply omit the sort parameter in that case.
        try:
            if sort_by == "best-match":
                repositories = self.github.search_repositories(query=query)
            else:
                repositories = self.github.search_repositories(
                    query=query,
                    sort=sort_by,
                    order="desc",
                )
results = []
count = 0
for repo in repositories:
if count >= max_results:
break
results.append({
"name": repo.name,
"full_name": repo.full_name,
"description": repo.description,
"url": repo.html_url,
"stars": repo.stargazers_count,
"forks": repo.forks_count,
"language": repo.language,
"topics": repo.topics if hasattr(repo, "topics") else [],
"created_at": repo.created_at.isoformat(),
"updated_at": repo.updated_at.isoformat(),
"pushed_at": repo.pushed_at.isoformat() if repo.pushed_at else None,
"homepage": repo.homepage,
"license": repo.license.name if repo.license else None,
"source": "github",
})
count += 1
return results
except GithubException as e:
print(f"GitHub API error: {e}")
return []
def get_trending_repositories(
self,
period: str = "daily",
language: Optional[str] = None,
max_results: int = 25,
) -> List[Dict]:
"""Get trending repositories.
        Note: the GitHub API has no official trending endpoint, so we approximate
        trending by searching for recently active repositories with high star counts.
Args:
period: 'daily', 'weekly', or 'monthly'
language: Programming language filter
max_results: Maximum number of results
Returns:
List of repository dictionaries
"""
# Map period to days
period_days = {
"daily": 1,
"weekly": 7,
"monthly": 30,
}
days = period_days.get(period, 7)
        # Approximate trending: AI-topic repos pushed within the period, sorted by stars.
        # GitHub ANDs multiple topic: qualifiers, so keep the topic filter narrow to
        # avoid over-constraining the search.
        return self.search_repositories(
            topics=self.AI_TOPICS[:2],  # repos must carry both the 'llm' and 'gpt' topics
min_stars=100,
language=language,
pushed_since=f"{days}d",
sort_by="stars",
max_results=max_results,
)
def search_by_area(
self,
area: str,
min_stars: int = 50,
days: int = 30,
max_results: int = 25,
) -> List[Dict]:
"""Search repositories by research area.
Args:
area: Research area (e.g., 'llm', 'robotics', 'bioinfo')
min_stars: Minimum number of stars
days: Look back this many days
max_results: Maximum number of results
Returns:
List of repository dictionaries
"""
keywords = self.AI_KEYWORDS.get(area.lower())
if not keywords:
raise ValueError(f"Unknown area: {area}. Valid areas: {list(self.AI_KEYWORDS.keys())}")
return self.search_repositories(
            keywords=keywords[:3],  # all terms must match, so limit to the first 3 to avoid over-constraining the search
min_stars=min_stars,
pushed_since=f"{days}d",
sort_by="stars",
max_results=max_results,
)
def _parse_relative_date(self, date_str: str) -> str:
"""Parse relative date string to ISO date.
Args:
date_str: Relative date (e.g., '7d', '1w', '1m')
Returns:
ISO format date string
"""
unit = date_str[-1]
value = int(date_str[:-1])
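        # e.g. '2w' -> today (UTC) minus 14 days, formatted as 'YYYY-MM-DD'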
if unit == "d":
delta = timedelta(days=value)
elif unit == "w":
delta = timedelta(weeks=value)
elif unit == "m":
delta = timedelta(days=value * 30)
else:
raise ValueError(f"Invalid date format: {date_str}. Use format like '7d', '1w', '1m'")
date = datetime.now(timezone.utc) - delta
return date.strftime("%Y-%m-%d")
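

# ---------------------------------------------------------------------------
# Illustrative usage sketch (a minimal example, not part of the client API).
# Assumes a GITHUB_TOKEN environment variable is available; field names match
# the dictionaries built in search_repositories() above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    client = GithubClient()  # picks up GITHUB_TOKEN from the environment if set

    # Repositories in the 'rl' research area, active within the last 30 days.
    for repo in client.search_by_area("rl", min_stars=100, days=30, max_results=5):
        print(f"{repo['stars']:>6}  {repo['full_name']}: {repo['description']}")

    # Weekly "trending" approximation, restricted to Python projects.
    for repo in client.get_trending_repositories(period="weekly", language="Python", max_results=3):
        print(f"{repo['stars']:>6}  {repo['full_name']}")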