-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
155 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,57 @@ | ||
# repo2file4gpt | ||
# 📁 repo2file4gpt | ||
|
||
This project aims to convert the content of GitHub repositories into a structured, machine-readable format, enabling AI models like ChatGPT to utilize them as a knowledge base. | ||
repo2file4gpt is a Python package that scrapes GitHub repositories and exports their content into structured markdown files. The goal is to convert repository content into a machine-readable format suitable for use as AI training data. | ||
|
||
It extracts key files like code, markdown, and notebooks from public GitHub repositories. The content is exported into an aggregated markdown file per repository with the full hierarchy preserved. | ||
|
||
## Installation | ||
|
||
You can install repo2file4gpt directly from PyPI: | ||
|
||
```bash | ||
pip install repo2file4gpt | ||
``` | ||
|
||
## Quick Start | ||
|
||
### Command Line Interface | ||
|
||
After installing repo2file4gpt, you can use it from the command line as follows: | ||
|
||
```bash | ||
repo2file4gpt --token YOUR_GITHUB_TOKEN --repos user/repo1 user/repo2 --filetypes py js --output_dir ./outputs/ | ||
``` | ||
|
||
Replace `YOUR_GITHUB_TOKEN` with your actual GitHub token, and `user/repo1` and `user/repo2` with the actual repositories you want to process. If `--token` is omitted, the token is read from the `GITHUB_ACCESS_TOKEN` environment variable. | ||
|
||
### Python Code | ||
|
||
You can also use repo2file4gpt in your Python code: | ||
|
||
```python | ||
import repo2file4gpt | ||
|
||
# Specify the GitHub token, list of repositories, file types, and output directory | ||
token = "YOUR_GITHUB_TOKEN" | ||
repos = ["user/repo1", "user/repo2"] | ||
filetypes = ["py", "js"] | ||
output_dir = "./outputs/" | ||
|
||
# Create a RepositoryScraper instance | ||
processor = repo2file4gpt.RepositoryScraper(token, filetypes, repo2file4gpt.LINE_LIMITS, output_dir) | ||
|
||
# Process the repositories | ||
processor.process_repositories(repos) | ||
``` | ||
|
||
Again, replace `YOUR_GITHUB_TOKEN` with your actual GitHub token, and `user/repo1` and `user/repo2` with the actual repositories you want to process. | ||
|
||
## TODO | ||
|
||
- Add support for more file types. | ||
- Improve error handling for robustness. | ||
- Optimize performance for large repositories. | ||
|
||
## Contributing | ||
|
||
Contributions are welcome! Please feel free to submit a Pull Request. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,24 @@ | ||
import argparse | ||
from .config import REPO_URLS, FILETYPES, LINE_LIMITS, OUTPUT_DIR | ||
from repo2file4gpt.repo_scrapper import RepositoryScraper | ||
import os | ||
|
||
from .config import REPO_URLS, FILETYPES, LINE_LIMITS | ||
from repo2file4gpt.repo_scrapper import RepositoryScraper | ||
def main(): | ||
parser = argparse.ArgumentParser(description="Process some repositories.") | ||
parser.add_argument("--token", type=str, help="GitHub access token") | ||
parser.add_argument("--repos", type=str, nargs="+", help="List of repository URLs to process") | ||
parser.add_argument("--filetypes", type=str, nargs="+", help="List of file types to process") | ||
parser.add_argument("--output_dir", type=str, help="Output directory") | ||
|
||
args = parser.parse_args() | ||
|
||
def main(): | ||
processor = RepositoryScraper( | ||
os.getenv("GITHUB_ACCESS_TOKEN"), FILETYPES, LINE_LIMITS | ||
) | ||
processor.process_repositories(REPO_URLS) | ||
token = args.token if args.token else os.getenv("GITHUB_ACCESS_TOKEN") | ||
repos = [repo.strip() for repo in args.repos] if args.repos else REPO_URLS | ||
filetypes = [ftype.strip() for ftype in args.filetypes] if args.filetypes else FILETYPES | ||
output_dir = args.output_dir.strip() if args.output_dir else OUTPUT_DIR | ||
|
||
processor = RepositoryScraper(token, filetypes, LINE_LIMITS, output_dir) | ||
processor.process_repositories(repos) | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,18 @@ | ||
# List of repository URLs to process | ||
REPO_URLS = ["langroid/langroid", "microsoft/autogen"] | ||
REPO_URLS = [ | ||
"graykode/nlp-tutorial", | ||
"google/dopamine", | ||
"sebastianruder/NLP-progress", | ||
"hezarai/hezar", | ||
"langroid/langroid", | ||
"microsoft/autogen", | ||
] | ||
|
||
# List of filetypes to consider | ||
FILETYPES = [".ts", ".js", ".json", ".csv", ".py", ".ipynb", ".sol", ".md"] | ||
|
||
# Line limits for each filetype | ||
LINE_LIMITS = {"json": 500} | ||
|
||
# Specify the output directory | ||
OUTPUT_DIR = "./outputs/" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,34 +1,53 @@ | ||
import logging | ||
from tqdm import tqdm | ||
from .gh_api_wrapper import GithubAPIWrapper | ||
from .markdown_exporter import MarkdownExporter | ||
|
||
|
||
class RepositoryScraper: | ||
def __init__(self, token, filetypes, line_limits): | ||
def __init__(self, token, filetypes, line_limits, output_dir): | ||
self.github = GithubAPIWrapper(token) | ||
self.exporter = MarkdownExporter() | ||
self.exporter = MarkdownExporter(output_dir) | ||
self.filetypes = filetypes | ||
self.line_limits = line_limits | ||
|
||
def get_repo_content(self, repo, path=""): | ||
def get_repo_content(self, repo, path="", pbar=None): | ||
contents = self.github.get_contents(repo, path) | ||
for content in contents: | ||
if content.type == "dir": | ||
self.get_repo_content(repo, content.path) | ||
self.get_repo_content(repo, content.path, pbar) | ||
else: | ||
if content.path.endswith(tuple(self.filetypes)): | ||
self.exporter.write_file_content( | ||
message = self.exporter.write_file_content( | ||
content, content.path.split(".")[-1] | ||
) | ||
if message is not None: | ||
logging.warning(message) | ||
pbar.update() | ||
|
||
def process_repository(self, url): | ||
repo = self.github.get_repo(url) | ||
repo_name = url.split("/")[-1] | ||
self.exporter.open_file(f"{repo_name}.md") | ||
def process_repository(self, url, pbar): | ||
repo, repo_name = self.github.get_repo(url) | ||
owner_handle = repo.owner.login | ||
total_files = self.github.estimate_total_files(repo) | ||
self.exporter.open_file(f"{owner_handle}_{repo_name}.md") | ||
self.exporter.write_header(repo_name) | ||
self.exporter.write_metadata(repo) | ||
self.get_repo_content(repo) | ||
with tqdm( | ||
total=total_files, | ||
desc=f"Processing {repo.full_name}", | ||
ncols=70, | ||
position=1, | ||
leave=False, | ||
) as pbar2: | ||
self.get_repo_content(repo, pbar=pbar2) | ||
self.exporter.close_file() | ||
pbar.update() | ||
|
||
def process_repositories(self, repo_urls): | ||
for url in repo_urls: | ||
self.process_repository(url) | ||
print(f"Processing {len(repo_urls)} repositories\n") | ||
print("Please checkout the log file for additional logs and error reports\n") | ||
with tqdm( | ||
total=len(repo_urls), desc="Overall Progress", ncols=70, position=0 | ||
) as pbar: | ||
for url in repo_urls: | ||
self.process_repository(url, pbar) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
PyGithub==2.1.1 | ||
PyGithub>=2.1.1 | ||
tqdm>=4.62.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters