# github_extractor.py
import os
import re
import time
import logging
import requests
from typing import List, Dict, Any, Optional, Tuple
from bs4 import BeautifulSoup
from github import Github, RateLimitExceededException
from github.ContentFile import ContentFile
from dotenv import load_dotenv
from tqdm import tqdm
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Load environment variables
load_dotenv()
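# Note: load_dotenv() reads GITHUB_TOKEN from a local .env file (or the process
# environment). A minimal .env is sketched below for illustration; the value is a
# placeholder, not a real credential:
#
#   GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx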
class GitHubExtractor:
def __init__(self, token: Optional[str] = None, output_dir: str = "docs/move_files"):
"""
Initialize GitHub extractor.
Args:
token: GitHub personal access token (from env var GITHUB_TOKEN if not provided)
output_dir: Directory to save extracted files
"""
# Use provided token or get from environment
self.token = token or os.getenv("GITHUB_TOKEN")
if not self.token:
logger.warning("No GitHub token provided. Rate limits will be restricted.")
# Initialize GitHub API
self.github = Github(self.token) if self.token else Github()
# Set output directory
self.output_dir = output_dir
os.makedirs(self.output_dir, exist_ok=True)
# Base GitHub search URL
self.base_url = "https://github.com/search"
    def search_code_with_api(self, query: str, language: Optional[str] = None,
                             extension: Optional[str] = None, max_results: int = 100) -> List[Dict[str, Any]]:
"""
Search GitHub for code using the GitHub API.
Args:
query: Search query string
language: Filter by programming language
extension: Filter by file extension
max_results: Maximum number of results to return
Returns:
List of code search results with metadata
"""
try:
# Build query string
search_query = query
if language:
search_query += f" language:{language}"
if extension:
search_query += f" extension:{extension}"
logger.info(f"Searching GitHub API with query: {search_query}")
# Use the search repositories approach from search_github.py
            full_query = search_query + ' in:readme in:description'
search_results = self.github.search_repositories(full_query, 'stars', 'desc')
logger.info(f"Found {search_results.totalCount} repositories")
# Process results
results = []
count = 0
            for repo_index in tqdm(range(min(search_results.totalCount, max_results))):
                if count >= max_results:
                    break
                try:
                    repo_data = search_results[repo_index]
                    repo_name = repo_data.full_name
                    logger.info(f"Scanning repository: {repo_name}")
                    # Look for Move files in the repository
                    move_files = self._find_move_files_in_repo(repo_data, extension=extension or "move")
                    for file_data in move_files:
                        if count >= max_results:
                            break
                        results.append(file_data)
                        count += 1
                    # Respect rate limits between repositories
                    self._check_rate_limit()
                    time.sleep(2)
                except RateLimitExceededException:
                    logger.warning("Rate limit exceeded. Sleeping for 60 seconds.")
                    time.sleep(60)
                    continue
                except Exception as e:
                    logger.warning(f"Error processing repository {repo_index}: {str(e)}")
                    continue
logger.info(f"Found {len(results)} code results")
return results
except RateLimitExceededException:
logger.error("GitHub API rate limit exceeded. Try again later or use a token with higher limits.")
return []
except Exception as e:
logger.error(f"Error searching GitHub API: {str(e)}")
return []
def _find_move_files_in_repo(self, repo, extension: str = "move") -> List[Dict[str, Any]]:
"""
Find all Move files in a repository.
Args:
repo: GitHub repository object
extension: File extension to search for
Returns:
List of file data dictionaries
"""
results = []
        # Walk the repository tree iteratively using a stack of content entries
try:
contents = repo.get_contents("")
while contents:
file_content = contents.pop(0)
if file_content.type == "dir":
# Add directory contents to the stack
contents.extend(repo.get_contents(file_content.path))
else:
# Check if it's a Move file
if file_content.name.endswith(f".{extension}"):
try:
content = file_content.decoded_content.decode('utf-8')
# Check if file contains "use sui" for Move files
if extension == "move" and "use sui" not in content.lower():
continue
# Create result object
result = {
'name': file_content.name,
'path': file_content.path,
'repo': repo.full_name,
'url': file_content.html_url,
'content': content
}
results.append(result)
logger.debug(f"Found {extension} file: {file_content.path}")
except Exception as e:
logger.warning(f"Error getting content for {file_content.html_url}: {str(e)}")
except Exception as e:
logger.warning(f"Error traversing repository {repo.full_name}: {str(e)}")
return results
    def search_code_with_scraping(self, query: str, path_pattern: Optional[str] = None,
                                  max_pages: int = 5) -> List[Dict[str, Any]]:
"""
Search GitHub for code by scraping the GitHub search results page.
        This is a fallback for when the API is unavailable or its rate limits are exceeded.
Args:
query: Search query string
path_pattern: Path pattern to filter results (e.g., "*.move")
max_pages: Maximum number of pages to scrape
Returns:
List of code search results with metadata
"""
results = []
# Build query parameters
params = {
'q': query,
'type': 'code',
}
if path_pattern:
params['q'] += f" path:{path_pattern}"
logger.info(f"Scraping GitHub search with query: {params['q']}")
# Start with page 1
page = 1
while page <= max_pages:
try:
# Add page parameter
params['p'] = page
# Send request with proper headers to mimic a browser
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml',
'Accept-Language': 'en-US,en;q=0.9',
}
                response = requests.get(self.base_url, params=params, headers=headers, timeout=30)
if response.status_code != 200:
logger.error(f"Failed to retrieve search results. Status code: {response.status_code}")
break
# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')
                # Find code results
                # NOTE: this selector targets GitHub's older server-rendered results markup
                # and may not match the current, script-rendered code search page.
                code_blocks = soup.select('div.code-list-item')
if not code_blocks:
logger.info(f"No more results found on page {page}")
break
# Process each code block
for block in code_blocks:
try:
# Extract repository info
repo_element = block.select_one('a.Link--secondary')
if not repo_element:
continue
repo_name = repo_element.text.strip()
# Extract file path
file_element = block.select_one('a.Link--primary')
if not file_element:
continue
file_path = file_element.text.strip()
file_url = f"https://github.com/{repo_name}/blob/master/{file_path}"
# Extract the raw URL for content
raw_url = f"https://raw.githubusercontent.com/{repo_name}/master/{file_path}"
results.append({
'name': os.path.basename(file_path),
'path': file_path,
'repo': repo_name,
'url': file_url,
'raw_url': raw_url,
'content': None # Will be fetched later
})
except Exception as e:
logger.warning(f"Error parsing code block: {str(e)}")
# Go to next page
page += 1
# Respect rate limits with a delay
time.sleep(2)
except Exception as e:
logger.error(f"Error scraping GitHub search page {page}: {str(e)}")
break
logger.info(f"Found {len(results)} code results through scraping")
# Fetch content for each result
self._fetch_contents_for_scraped_results(results)
return results
def _fetch_contents_for_scraped_results(self, results: List[Dict[str, Any]]):
"""
Fetch file contents for results obtained through scraping.
Args:
results: List of search results from scraping
"""
logger.info(f"Fetching content for {len(results)} files...")
for result in tqdm(results):
try:
if 'raw_url' not in result or not result['raw_url']:
continue
                response = requests.get(result['raw_url'], timeout=30)
if response.status_code == 200:
result['content'] = response.text
else:
logger.warning(f"Failed to fetch content for {result['raw_url']}. Status: {response.status_code}")
# Respect rate limits
time.sleep(0.5)
except Exception as e:
logger.warning(f"Error fetching content for {result.get('raw_url', 'unknown URL')}: {str(e)}")
def extract_move_files(self, query: str = "use sui", use_scraping: bool = True,
max_results: int = 100) -> List[Dict[str, Any]]:
"""
Extract .move files from GitHub based on search query.
Args:
query: Search query
use_scraping: Whether to use web scraping as fallback
max_results: Maximum number of results to extract
Returns:
List of extracted file data
"""
results = []
# Try API first if token is available
        if self.token:
logger.info("Attempting to use GitHub API...")
results = self.search_code_with_api(query, extension="move", max_results=max_results)
# Fall back to scraping if API didn't work or explicitly requested
if not results and use_scraping:
logger.info("Using web scraping method...")
results = self.search_code_with_scraping(query, path_pattern="*.move", max_pages=5)
# Filter out results without content
valid_results = [r for r in results if r.get('content')]
if len(valid_results) < len(results):
logger.warning(f"Filtered out {len(results) - len(valid_results)} results without content")
return valid_results
def download_move_files(self, files: List[Dict[str, Any]]) -> List[str]:
"""
Download .move files to the output directory.
Args:
files: List of file data from extract_move_files
Returns:
List of paths to downloaded files
"""
downloaded_paths = []
logger.info(f"Downloading {len(files)} .move files to {self.output_dir}")
for file_data in tqdm(files):
try:
# Create subdirectory based on repo name to avoid name conflicts
repo_dir = file_data['repo'].replace('/', '_')
file_dir = os.path.join(self.output_dir, repo_dir)
os.makedirs(file_dir, exist_ok=True)
                # Build the output path; flatten the in-repo path into the file name so that
                # files with the same basename in different directories do not overwrite each other
                file_name = file_data['path'].replace('/', '_') if file_data.get('path') else file_data['name']
                file_path = os.path.join(file_dir, file_name)
# Write content to file
with open(file_path, 'w', encoding='utf-8') as f:
f.write(file_data['content'])
downloaded_paths.append(file_path)
except Exception as e:
logger.error(f"Error saving file {file_data.get('name', 'unknown')}: {str(e)}")
logger.info(f"Successfully downloaded {len(downloaded_paths)} files")
return downloaded_paths
def _check_rate_limit(self):
"""Check remaining rate limit and pause if necessary"""
if not self.token:
# Be conservative with unauthenticated requests
time.sleep(2)
return
rate_limit = self.github.get_rate_limit()
remaining = rate_limit.search.remaining
if remaining < 10:
reset_time = rate_limit.search.reset.timestamp()
current_time = time.time()
sleep_time = max(1, reset_time - current_time)
logger.warning(f"Rate limit low ({remaining} remaining). Sleeping for {sleep_time:.1f} seconds")
time.sleep(sleep_time)
def extract_and_index_move_files(query: str = "use sui",
output_dir: str = "docs/move_files",
github_token: Optional[str] = None,
use_scraping: bool = True,
max_results: int = 100) -> Tuple[int, List[str]]:
"""
Extract Move files from GitHub and return paths for indexing.
Args:
query: GitHub search query
output_dir: Directory to save files
github_token: GitHub API token
use_scraping: Whether to use web scraping fallback
max_results: Maximum files to extract
Returns:
Tuple of (number of files, list of file paths)
"""
# Initialize extractor
extractor = GitHubExtractor(token=github_token, output_dir=output_dir)
# Extract files
files = extractor.extract_move_files(query, use_scraping, max_results)
# Download files
file_paths = extractor.download_move_files(files)
return len(file_paths), file_paths
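
# Illustrative entry point: a minimal sketch of how this module might be run directly.
# The query, output directory, and result limit below are assumptions for a quick local
# run, not values required elsewhere in the project.
if __name__ == "__main__":
    num_files, paths = extract_and_index_move_files(
        query="use sui",               # Sui-specific Move sources, matching the default above
        output_dir="docs/move_files",  # default output directory used by GitHubExtractor
        github_token=os.getenv("GITHUB_TOKEN"),
        use_scraping=True,
        max_results=50,                # assumed small cap for a quick run
    )
    logger.info(f"Extracted {num_files} Move files")
    for path in paths[:10]:
        logger.info(f"  {path}")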