diff --git a/README.md b/README.md
index cdb4711..87db5f2 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,57 @@
-# repo2file4gpt
+# 📁 repo2file4gpt
 
-This project aims to convert the content of GitHub repositories into a structured, machine-readable format, enabling AI models like ChatGPT to utilize them as a knowledge base.
+repo2file4gpt is a Python package that scrapes GitHub repositories and exports their content into structured markdown files. The goal is to convert repository content into a machine-readable format for use as AI training data.
+
+It extracts key files such as code, markdown, and notebooks from public GitHub repositories and writes one aggregated markdown file per repository, with the full directory hierarchy preserved.
+
+## Installation
+
+You can install repo2file4gpt directly from PyPI:
+
+```bash
+pip install repo2file4gpt
+```
+
+## Quick Start
+
+### Command Line Interface
+
+After installing repo2file4gpt, you can run it as a module from the command line:
+
+```bash
+python -m repo2file4gpt --token YOUR_GITHUB_TOKEN --repos user/repo1 user/repo2 --filetypes .py .js --output_dir ./outputs/
+```
+
+Replace `YOUR_GITHUB_TOKEN` with your GitHub token, and `user/repo1` and `user/repo2` with the repositories you want to process. The `--token` flag is optional; see *Token via Environment Variable* at the end of this README.
+
+### Python Code
+
+You can also use repo2file4gpt from your own Python code:
+
+```python
+from repo2file4gpt.config import LINE_LIMITS
+from repo2file4gpt.repo_scrapper import RepositoryScraper
+
+# Specify the GitHub token, list of repositories, file types, and output directory
+token = "YOUR_GITHUB_TOKEN"
+repos = ["user/repo1", "user/repo2"]
+filetypes = [".py", ".js"]
+output_dir = "./outputs/"
+
+# Create a RepositoryScraper instance and process the repositories
+processor = RepositoryScraper(token, filetypes, LINE_LIMITS, output_dir)
+processor.process_repositories(repos)
+```
+
+As before, replace `YOUR_GITHUB_TOKEN` and the repository names with your own values.
+
+## TODO
+
+- Add support for more file types.
+- Improve error handling for robustness.
+- Optimize performance for large repositories.
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
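+
+## Token via Environment Variable
+
+The `--token` flag is optional: when it is omitted, the CLI falls back to the `GITHUB_ACCESS_TOKEN` environment variable (see `repo2file4gpt/__main__.py`). Likewise, omitted `--repos`, `--filetypes`, and `--output_dir` flags fall back to the defaults in `repo2file4gpt/config.py`. A minimal sketch, assuming a POSIX-compatible shell:
+
+```bash
+export GITHUB_ACCESS_TOKEN=YOUR_GITHUB_TOKEN
+python -m repo2file4gpt --repos user/repo1 --filetypes .py .md
+```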
diff --git a/repo2file4gpt/__main__.py b/repo2file4gpt/__main__.py
index 21176e9..fcad13c 100644
--- a/repo2file4gpt/__main__.py
+++ b/repo2file4gpt/__main__.py
@@ -1,15 +1,24 @@
+import argparse
+from .config import REPO_URLS, FILETYPES, LINE_LIMITS, OUTPUT_DIR
+from repo2file4gpt.repo_scrapper import RepositoryScraper
 import os
-from .config import REPO_URLS, FILETYPES, LINE_LIMITS
-from repo2file4gpt.repo_scrapper import RepositoryScraper
 
 
+def main():
+    parser = argparse.ArgumentParser(description="Scrape GitHub repositories and export their content to markdown files.")
+    parser.add_argument("--token", type=str, help="GitHub access token")
+    parser.add_argument("--repos", type=str, nargs="+", help="List of repository URLs to process")
+    parser.add_argument("--filetypes", type=str, nargs="+", help="List of file types to process")
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    args = parser.parse_args()
 
-def main():
-    processor = RepositoryScraper(
-        os.getenv("GITHUB_ACCESS_TOKEN"), FILETYPES, LINE_LIMITS
-    )
-    processor.process_repositories(REPO_URLS)
+    token = args.token if args.token else os.getenv("GITHUB_ACCESS_TOKEN")
+    repos = [repo.strip() for repo in args.repos] if args.repos else REPO_URLS
+    filetypes = [ftype.strip() for ftype in args.filetypes] if args.filetypes else FILETYPES
+    output_dir = args.output_dir.strip() if args.output_dir else OUTPUT_DIR
+    processor = RepositoryScraper(token, filetypes, LINE_LIMITS, output_dir)
+    processor.process_repositories(repos)
 
 
 if __name__ == "__main__":
     main()
diff --git a/repo2file4gpt/config.py b/repo2file4gpt/config.py
index 6432ed0..152199e 100644
--- a/repo2file4gpt/config.py
+++ b/repo2file4gpt/config.py
@@ -1,8 +1,18 @@
 # List of repository URLs to process
-REPO_URLS = ["langroid/langroid", "microsoft/autogen"]
+REPO_URLS = [
+    "graykode/nlp-tutorial",
+    "google/dopamine",
+    "sebastianruder/NLP-progress",
+    "hezarai/hezar",
+    "langroid/langroid",
+    "microsoft/autogen",
+]
 
 # List of filetypes to consider
 FILETYPES = [".ts", ".js", ".json", ".csv", ".py", ".ipynb", ".sol", ".md"]
 
 # Line limits for each filetype
 LINE_LIMITS = {"json": 500}
+
+# Default output directory for the exported markdown files
+OUTPUT_DIR = "./outputs/"
\ No newline at end of file
diff --git a/repo2file4gpt/gh_api_wrapper.py b/repo2file4gpt/gh_api_wrapper.py
index 80045eb..9a9fd31 100644
--- a/repo2file4gpt/gh_api_wrapper.py
+++ b/repo2file4gpt/gh_api_wrapper.py
@@ -1,3 +1,4 @@
+import logging
 from github import Github
 
 
@@ -7,7 +8,23 @@ def __init__(self, token):
 
     def get_repo(self, url):
         repo_name = url.split("/")[-1]
-        return self.github.get_repo(url.split("/")[-2] + "/" + repo_name)
+        return self.github.get_repo(url.split("/")[-2] + "/" + repo_name), repo_name
 
     def get_contents(self, repo, path=""):
-        return repo.get_contents(path)
+        try:
+            return repo.get_contents(path)
+        except Exception as e:
+            logging.error(
+                f"Error getting contents of {path} in {repo.full_name}: {str(e)}"
+            )
+            return []
+
+    def estimate_total_files(self, repo, path=""):
+        contents = self.get_contents(repo, path)
+        total_files = 0
+        for content in contents:
+            if content.type == "dir":
+                total_files += self.estimate_total_files(repo, content.path)
+            else:
+                total_files += 1
+        return total_files
diff --git a/repo2file4gpt/markdown_exporter.py b/repo2file4gpt/markdown_exporter.py
index 0dd1e50..131f110 100644
--- a/repo2file4gpt/markdown_exporter.py
+++ b/repo2file4gpt/markdown_exporter.py
@@ -1,15 +1,25 @@
 import os
+import logging
 
 
 class MarkdownExporter:
     def __init__(self, output_dir="./outputs"):
         self.output_dir = output_dir
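+        # Create the output and log directories up front so that opening
+        # the export files and the log file below cannot fail on a missing path.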
         os.makedirs(output_dir, exist_ok=True)
+        os.makedirs("./logs", exist_ok=True)
+        logging.basicConfig(
+            filename="./logs/app.log",
+            filemode="w",
+            format="%(name)s - %(levelname)s - %(message)s",
+            level=logging.INFO,
+        )
 
     def open_file(self, filename):
         self.file = open(os.path.join(self.output_dir, filename), "w", encoding="utf-8")
+        logging.info(f"Processing {filename}")
 
     def close_file(self):
+        logging.info(f"Finished processing {self.file.name}")
         self.file.close()
 
     def write_header(self, repo_name):
@@ -33,5 +43,13 @@ def write_metadata(self, repo):
     def write_file_content(self, content, filetype):
         print(f"\n### {content.path}\n", file=self.file)
         print(f"```{filetype}", file=self.file)
-        print(content.decoded_content.decode(), file=self.file)
+        message = None
+        try:
+            if content.encoding == "base64":
+                print(content.decoded_content.decode(), file=self.file)
+            else:
+                message = f"Content of {content.path} could not be decoded. Encoding: {content.encoding}"
+        except Exception as e:
+            message = f"Error processing content of {content.path}: {str(e)}"
         print("```", file=self.file)
+        return message
diff --git a/repo2file4gpt/repo_scrapper.py b/repo2file4gpt/repo_scrapper.py
index 22efc19..e8d8d10 100644
--- a/repo2file4gpt/repo_scrapper.py
+++ b/repo2file4gpt/repo_scrapper.py
@@ -1,34 +1,53 @@
+import logging
+from tqdm import tqdm
 from .gh_api_wrapper import GithubAPIWrapper
 from .markdown_exporter import MarkdownExporter
 
 
 class RepositoryScraper:
-    def __init__(self, token, filetypes, line_limits):
+    def __init__(self, token, filetypes, line_limits, output_dir):
         self.github = GithubAPIWrapper(token)
-        self.exporter = MarkdownExporter()
+        self.exporter = MarkdownExporter(output_dir)
         self.filetypes = filetypes
         self.line_limits = line_limits
 
-    def get_repo_content(self, repo, path=""):
+    def get_repo_content(self, repo, path="", pbar=None):
         contents = self.github.get_contents(repo, path)
         for content in contents:
             if content.type == "dir":
-                self.get_repo_content(repo, content.path)
+                self.get_repo_content(repo, content.path, pbar)
             else:
                 if content.path.endswith(tuple(self.filetypes)):
-                    self.exporter.write_file_content(
+                    message = self.exporter.write_file_content(
                         content, content.path.split(".")[-1]
                     )
+                    if message is not None:
+                        logging.warning(message)
+                if pbar is not None:
+                    pbar.update()
 
-    def process_repository(self, url):
-        repo = self.github.get_repo(url)
-        repo_name = url.split("/")[-1]
-        self.exporter.open_file(f"{repo_name}.md")
+    def process_repository(self, url, pbar):
+        repo, repo_name = self.github.get_repo(url)
+        owner_handle = repo.owner.login
+        total_files = self.github.estimate_total_files(repo)
+        self.exporter.open_file(f"{owner_handle}_{repo_name}.md")
         self.exporter.write_header(repo_name)
         self.exporter.write_metadata(repo)
-        self.get_repo_content(repo)
+        with tqdm(
+            total=total_files,
+            desc=f"Processing {repo.full_name}",
+            ncols=70,
+            position=1,
+            leave=False,
+        ) as pbar2:
+            self.get_repo_content(repo, pbar=pbar2)
         self.exporter.close_file()
+        pbar.update()
 
     def process_repositories(self, repo_urls):
-        for url in repo_urls:
-            self.process_repository(url)
+        print(f"Processing {len(repo_urls)} repositories\n")
+        print("Please check out the log file for additional logs and error reports\n")
+        with tqdm(
+            total=len(repo_urls), desc="Overall Progress", ncols=70, position=0
+        ) as pbar:
+            for url in repo_urls:
+                self.process_repository(url, pbar)
diff --git a/requirements.txt b/requirements.txt
index fbec796..9b71624 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-PyGithub==2.1.1
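+# PyGithub provides the GitHub API client; tqdm renders the progress bars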
+PyGithub>=2.1.1
+tqdm>=4.62.3
diff --git a/setup.py b/setup.py
index 1e0768d..0aba59e 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,5 @@
     name="repo2file4gpt",
     version="0.1.0",
     packages=find_packages(),
-    install_requires=[
-        "github",
-    ],
+    install_requires=["PyGithub", "tqdm"],
 )