Skip to main content
Glama

get_github_repo

Extract and process code from a GitHub repository branch as structured text for AI analysis and processing.

Instructions

Process and return the code from a GitHub repository branch as text

Input Schema

Table | JSON Schema

| Name     | Required | Description | Default |
|----------|----------|-------------|---------|
| repo_url | Yes      |             |         |
| branch   | No       |             | master  |

Implementation Reference

  • The main handler function for the 'get_github_repo' tool, registered via @mcp.tool(). It instantiates GithubRepo2Txt, calls its process_repo method asynchronously with a timeout, and returns the processed repository content as a string.
    @mcp.tool()
    async def get_github_repo(repo_url: str, branch: str = "master") -> str:
        """
        Process and return the code from a GitHub repository branch as text
        """
        # NOTE: the docstring above is intentionally left unchanged; the
        # @mcp.tool() decorator presumably exposes it as the tool description.
        #
        # Parameters:
        #   repo_url: URL of the GitHub repository to process.
        #   branch:   branch to read; defaults to "master".
        # Returns the combined repository text on success, or a human-readable
        # error string on timeout/failure (errors are reported, never raised).
        try:
            repo_processor = GithubRepo2Txt()
            # We are inside a coroutine, so a loop is guaranteed to be running;
            # get_running_loop() is the supported way to obtain it here.
            loop = asyncio.get_running_loop()
            # process_repo is synchronous (blocking network I/O), so run it in
            # the default executor to keep the event loop responsive.
            # The limit is 3000 seconds (50 minutes).
            # NOTE(review): the original comment claimed 300 seconds (5 minutes)
            # while the code used 3000 - confirm which limit is intended.
            _repo_name, content = await asyncio.wait_for(
                loop.run_in_executor(None, repo_processor.process_repo, repo_url, branch),
                timeout=3000
            )
            # logger.info(f"Processed GitHub repository: {_repo_name}")

            return content
        except asyncio.TimeoutError:
            return "Processing timeout, please check repository size or network connection"
        except Exception as e:
            # Top-level tool boundary: convert any failure into a message for
            # the caller instead of propagating the exception.
            # logger.error(f"Error processing GitHub repository: {e}")
            return f"Processing failed: {str(e)}"
  • The supporting method in GithubRepo2Txt class that implements the core logic: retrieves README, repository structure, and file contents (skipping binary files), then combines them into a formatted text string for analysis.
    def process_repo(self, repo_url, branch='master'):
        """
        Process a GitHub repository and return the processed content.

        Args:
            repo_url (str): GitHub repository URL (for example
                "https://github.com/owner/repo") or a bare "owner/repo" name.
                A trailing slash, a ".git" suffix, "http://" and "www." are
                tolerated.
            branch (str, optional): Branch name. Defaults to 'master'.

        Returns:
            tuple: (repo_name, content_string) - the repository name and the
            processed content string.
        """
        # Reduce the URL to the "owner/repo" form expected by get_repo().
        # A plain replace()/split() breaks on trailing slashes (empty repo
        # name) and on clone URLs ending in ".git".
        full_name = repo_url.strip()
        for scheme in ('https://', 'http://'):
            if full_name.startswith(scheme):
                full_name = full_name[len(scheme):]
                break
        for host in ('www.github.com/', 'github.com/'):
            if full_name.startswith(host):
                full_name = full_name[len(host):]
                break
        full_name = full_name.strip('/')
        if full_name.endswith('.git'):
            full_name = full_name[:-len('.git')]

        repo_name = full_name.split('/')[-1]
        repo = self.github.get_repo(full_name)

        # print(f"Getting {repo_name}'s README")
        readme_content = self._get_readme_content(repo, branch)

        # print(f"\nGetting {repo_name}'s repo structure")
        repo_structure = f"repo structure: {repo_name}\n"
        repo_structure += self._traverse_repo_iteratively(repo, branch)

        # print(f"\nGetting {repo_name}'s file")
        file_contents = self._get_file_contents_iteratively(repo, branch)

        instructions = "Please analyze using the following provided files and contents:\n\n"

        # Combine all sections: instructions, README, structure, file contents.
        content = (
            instructions +
            f"README:\n{readme_content}\n\n" +
            repo_structure +
            '\n\n' +
            file_contents
        )

        return repo_name, content
  • The GithubRepo2Txt class providing utility methods for fetching GitHub repository contents iteratively, handling README, structure traversal, and file content retrieval while skipping binaries.
    class GithubRepo2Txt:
        """
        Flatten a GitHub repository branch into a single text document.

        The document consists of the README, a listing of the repository tree
        and the contents of every file whose name does not end with one of the
        suffixes in ``_SKIPPED_SUFFIXES``. Repository access goes through the
        ``Github`` client created in ``__init__``.
        """

        # File-name suffixes whose contents are not downloaded; such files are
        # reported as "Skipped binary file". Kept as a tuple so it can be passed
        # directly to str.endswith().
        _SKIPPED_SUFFIXES = (
            # Compiled executables and libraries
            '.exe', '.dll', '.so', '.a', '.lib', '.dylib', '.o', '.obj',
            # Compressed archives
            '.zip', '.tar', '.tar.gz', '.tgz', '.rar', '.7z', '.bz2', '.gz', '.xz', '.z', '.lz', '.lzma', '.lzo', '.rz', '.sz', '.dz',
            # Application-specific files
            '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
            # Media files (less common)
            '.png', '.jpg', '.jpeg', '.gif', '.mp3', '.mp4', '.wav', '.flac', '.ogg', '.avi', '.mkv', '.mov', '.webm', '.wmv', '.m4a', '.aac',
            # Virtual machine and container images
            '.iso', '.vmdk', '.qcow2', '.vdi', '.vhd', '.vhdx', '.ova', '.ovf',
            # Database files
            '.db', '.sqlite', '.mdb', '.accdb', '.frm', '.ibd', '.dbf',
            # Java-related files
            '.jar', '.class', '.war', '.ear', '.jpi',
            # Python bytecode and packages
            '.pyc', '.pyo', '.pyd', '.egg', '.whl',
            # Other potentially important extensions
            '.deb', '.rpm', '.apk', '.msi', '.dmg', '.pkg', '.bin', '.dat', '.data',
            '.dump', '.img', '.toast', '.vcd', '.crx', '.xpi', '.lockb', 'package-lock.json', '.svg',
            '.eot', '.otf', '.ttf', '.woff', '.woff2',
            '.ico', '.icns', '.cur',
            '.cab', '.dmp', '.msp', '.msm',
            '.keystore', '.jks', '.truststore', '.cer', '.crt', '.der', '.p7b', '.p7c', '.p12', '.pfx', '.pem', '.csr',
            '.key', '.pub', '.sig', '.pgp', '.gpg',
            '.nupkg', '.snupkg', '.appx', '.msix', '.msu',
            '.snap', '.flatpak', '.appimage',
            '.ko', '.sys', '.elf',
            '.swf', '.fla', '.swc',
            '.rlib', '.pdb', '.idb', '.dbg',
            '.sdf', '.bak', '.tmp', '.temp', '.log', '.tlog', '.ilk',
            '.bpl', '.dcu', '.dcp', '.dcpil', '.drc',
            '.aps', '.res', '.rsrc', '.rc', '.resx',
            '.prefs', '.properties', '.ini', '.cfg', '.config', '.conf',
            '.DS_Store', '.localized', '.svn', '.git', '.gitignore', '.gitkeep',
        )

        def __init__(self):
            """
            Load environment variables and create the GitHub client.

            Raises:
                ValueError: If the 'GITHUB_TOKEN' environment variable is unset
                    or empty.
            """
            # _=load_dotenv(find_dotenv())
            load_dotenv()
            self.GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
            if not self.GITHUB_TOKEN:
                raise ValueError("Please set 'GITHUB_TOKEN' env param")
            self.github = Github(self.GITHUB_TOKEN)

        def _get_readme_content(self, repo, branch='master'):
            """
            Retrieve the content of the README file.

            Tries several common spellings of the file name and returns the
            UTF-8 decoded text of the first one that can be fetched, or the
            string "README not found." when none can.
            """
            readme_variants = ['README.md', 'readme.md', 'ReadMe.md']
            for variant in readme_variants:
                try:
                    readme_file = repo.get_contents(variant, ref=branch)
                    return readme_file.decoded_content.decode('utf-8')
                except Exception:
                    # Best effort: a missing or undecodable variant just means
                    # we try the next spelling. `Exception` (rather than a bare
                    # except) lets KeyboardInterrupt/SystemExit propagate.
                    continue
            return "README not found."

        def _traverse_repo_iteratively(self, repo, branch='master'):
            """
            Traverse the repository iteratively to avoid recursion limits for large repositories.

            Returns:
                str: One line per entry, "<parent>/<name>" for files and
                "<parent>/<name>/" for directories, in visiting order.
            """
            lines = []
            dirs_to_visit = [("", repo.get_contents("", ref=branch))]
            # Repository paths already queued, so a directory is never listed
            # or fetched twice.
            queued_paths = set()

            while dirs_to_visit:
                path, contents = dirs_to_visit.pop()
                for content in tqdm(contents, desc=f"Processing {path}", leave=False):
                    if content.type == "dir":
                        if content.path in queued_paths:
                            continue
                        queued_paths.add(content.path)
                        lines.append(f"{path}/{content.name}/\n")
                        dirs_to_visit.append((f"{path}/{content.name}", repo.get_contents(content.path, ref=branch)))
                    else:
                        lines.append(f"{path}/{content.name}\n")
            # Join once at the end instead of growing a string in the loop.
            return "".join(lines)

        def _format_file_entry(self, path, content):
            """
            Build the "File: ..." text section for a single non-directory entry.

            Args:
                path (str): Display path of the parent directory ("" for root).
                content: Repository entry exposing ``name``, ``encoding`` and
                    ``decoded_content``.

            Returns:
                str: The section text, always terminated by a blank line.
            """
            header = f"File: {path}/{content.name}\n"
            # Decide from the file name alone so skipped files are never decoded.
            if content.name.endswith(self._SKIPPED_SUFFIXES):
                return header + "Content: Skipped binary file\n\n"
            try:
                if content.encoding is None or content.encoding == 'none':
                    return header + "Content: Skipped due to missing encoding\n\n"
                raw = content.decoded_content
                try:
                    text = raw.decode('utf-8')
                    label = "Content"
                except UnicodeDecodeError:
                    # latin-1 maps every byte to a character, so this fallback
                    # cannot itself raise UnicodeDecodeError.
                    text = raw.decode('latin-1')
                    label = "Content (Latin-1 Decoded)"
            except (AttributeError, UnicodeDecodeError):
                return header + "Content: Skipped due to decoding error or missing decoded_content\n\n"
            return header + f"{label}:\n{text}\n\n"

        def _get_file_contents_iteratively(self, repo, branch='master'):
            """
            Collect the text of every file in the repository.

            Walks the tree with an explicit stack (no recursion) and emits one
            "File: ..." section per file; files matching ``_SKIPPED_SUFFIXES``
            are listed but their contents are skipped.

            Returns:
                str: The concatenated file sections.
            """
            sections = []
            dirs_to_visit = [("", repo.get_contents("", ref=branch))]
            # Repository paths already queued, so a directory is fetched once.
            queued_paths = set()

            while dirs_to_visit:
                path, contents = dirs_to_visit.pop()
                for content in tqdm(contents, desc=f"Downloading {path}", leave=False):
                    if content.type == "dir":
                        if content.path in queued_paths:
                            continue
                        queued_paths.add(content.path)
                        dirs_to_visit.append((f"{path}/{content.name}", repo.get_contents(content.path, ref=branch)))
                    else:
                        sections.append(self._format_file_entry(path, content))
            return "".join(sections)

        def process_repo(self, repo_url, branch='master'):
            """
            Process a GitHub repository and return the processed content.

            Args:
                repo_url (str): GitHub repository URL (for example
                    "https://github.com/owner/repo") or a bare "owner/repo"
                    name. A trailing slash, a ".git" suffix, "http://" and
                    "www." are tolerated.
                branch (str, optional): Branch name. Defaults to 'master'.

            Returns:
                tuple: (repo_name, content_string) - the repository name and
                the processed content string.
            """
            # Reduce the URL to the "owner/repo" form expected by get_repo().
            # A plain replace()/split() breaks on trailing slashes (empty repo
            # name) and on clone URLs ending in ".git".
            full_name = repo_url.strip()
            for scheme in ('https://', 'http://'):
                if full_name.startswith(scheme):
                    full_name = full_name[len(scheme):]
                    break
            for host in ('www.github.com/', 'github.com/'):
                if full_name.startswith(host):
                    full_name = full_name[len(host):]
                    break
            full_name = full_name.strip('/')
            if full_name.endswith('.git'):
                full_name = full_name[:-len('.git')]

            repo_name = full_name.split('/')[-1]
            repo = self.github.get_repo(full_name)

            # print(f"Getting {repo_name}'s README")
            readme_content = self._get_readme_content(repo, branch)

            # print(f"\nGetting {repo_name}'s repo structure")
            repo_structure = f"repo structure: {repo_name}\n"
            repo_structure += self._traverse_repo_iteratively(repo, branch)

            # print(f"\nGetting {repo_name}'s file")
            file_contents = self._get_file_contents_iteratively(repo, branch)

            instructions = "Please analyze using the following provided files and contents:\n\n"

            # Combine all sections: instructions, README, structure, file contents.
            content = (
                instructions +
                f"README:\n{readme_content}\n\n" +
                repo_structure +
                '\n\n' +
                file_contents
            )

            return repo_name, content

        def save_repo_contents(self, repo_url, branch='master'):
            """
            Process a GitHub repository and save the result to a file.

            The file is written to the current working directory as
            "<repo_name>_contents.txt" using UTF-8.

            Args:
                repo_url (str): GitHub repository URL.
                branch (str, optional): Branch name. Defaults to 'master'.

            Returns:
                str: Path of the output file.

            Raises:
                Exception: If processing or writing fails; the original error
                    is attached as the cause.
            """
            try:
                repo_name, content = self.process_repo(repo_url, branch)
                output_filename = f'{repo_name}_contents.txt'

                with open(output_filename, 'w', encoding='utf-8') as f:
                    f.write(content)

                # print(f"Repository contents saved to '{output_filename}'.")
                return output_filename

            except Exception as e:
                # Chain the cause so the original traceback is not lost.
                raise Exception(f"Error processing repository: {str(e)}") from e
    # if __name__ == '__main__':
    
    #     repo_url = input("please input GitHub repo URL: ")
    #     branch = input("please input the branch(default: master): ") or "master"
        
    #     try:
    #         repo_processor = GithubRepo2Txt()
    #         output_file = repo_processor.save_repo_contents(repo_url, branch)
    #     except ValueError as ve:
    #         print(f"Error: {ve}")
    #     except Exception as e:
    #         print(f"An error occurred: {e}")
    #         print("Please check the repository URL and try again.")
        """
        # 作为模块导入使用
        from repo2llm.githubrepo2txt import GithubRepo2Txt
    
        # 创建实例
        repo_processor = GithubRepo2Txt()
    
        # 方式1:直接保存到文件
        output_file = repo_processor.save_repo_contents(
            repo_url="https://github.com/username/repo",
            branch="master"  # 可选参数
        )
    
        # 方式2:获取处理后的内容
        repo_name, content = repo_processor.process_repo(
            repo_url="https://github.com/username/repo",
            branch="master"  # 可选参数
        )
    """
  • The @mcp.tool() decorator registers the get_github_repo function as an MCP tool.
    @mcp.tool()
Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/crisschan/mcp-repo2llm'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.