Commit: add a concise readme

jrazi committed Dec 17, 2023
1 parent e3b2008 commit 39c5adc
Showing 8 changed files with 155 additions and 29 deletions.
58 changes: 56 additions & 2 deletions README.md
@@ -1,3 +1,57 @@
-# repo2file4gpt
+# 📁 repo2file4gpt
 
-This project aims to convert the content of GitHub repositories into a structured, machine-readable format, enabling AI models like ChatGPT to utilize them as a knowledge base.
+repo2file4gpt is a Python package that scrapes GitHub repositories and exports their content into structured markdown files. The goal is to convert repository content into a machine-readable format for use as AI training data.
+
+It extracts key files such as code, markdown, and notebooks from public GitHub repositories. The content is exported into one aggregated markdown file per repository, with the full hierarchy preserved.
+
+## Installation
+
+You can install repo2file4gpt directly from PyPI:
+
+```bash
+pip install repo2file4gpt
+```
+
+## Quick Start
+
+### Command Line Interface
+
+After installing repo2file4gpt, you can use it from the command line as follows:
+
+```bash
+repo2file4gpt --token YOUR_GITHUB_TOKEN --repos user/repo1 user/repo2 --filetypes py js --output_dir ./outputs/
+```
+
+Replace `YOUR_GITHUB_TOKEN` with your GitHub token, and `user/repo1` and `user/repo2` with the repositories you want to process. If `--token` is omitted, the CLI falls back to the `GITHUB_ACCESS_TOKEN` environment variable.
+
+### Python Code
+
+You can also use repo2file4gpt in your Python code:
+
+```python
+import repo2file4gpt
+
+# Specify the GitHub token, list of repositories, file types, and output directory
+token = "YOUR_GITHUB_TOKEN"
+repos = ["user/repo1", "user/repo2"]
+filetypes = ["py", "js"]
+output_dir = "./outputs/"
+
+# Create a RepositoryScraper instance
+processor = repo2file4gpt.RepositoryScraper(token, filetypes, repo2file4gpt.LINE_LIMITS, output_dir)
+
+# Process the repositories
+processor.process_repositories(repos)
+```
+
+Again, replace `YOUR_GITHUB_TOKEN` and the repository names with your own values.
+
+## TODO
+
+- Add support for more file types.
+- Improve error handling for robustness.
+- Optimize performance for large repositories.
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
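A note on the `LINE_LIMITS` argument used in the Python Quick Start: judging from `repo2file4gpt/config.py` below, it is a plain dict mapping a filetype to a maximum line count, so a custom dict can presumably be passed in its place. A minimal sketch under that assumption (the `csv` cap is illustrative; the diff stores the limits on the scraper but does not show where they are applied):

```python
import repo2file4gpt

# Hypothetical per-filetype caps; the package default in config.py is {"json": 500}.
line_limits = {"json": 500, "csv": 200}

processor = repo2file4gpt.RepositoryScraper(
    "YOUR_GITHUB_TOKEN",  # or rely on the GITHUB_ACCESS_TOKEN env var via the CLI
    ["py", "md"],         # filetypes to include
    line_limits,
    "./outputs/",
)
processor.process_repositories(["user/repo1"])
```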
23 changes: 16 additions & 7 deletions repo2file4gpt/__main__.py
@@ -1,15 +1,24 @@
+import argparse
 import os
 
-from .config import REPO_URLS, FILETYPES, LINE_LIMITS
+from .config import REPO_URLS, FILETYPES, LINE_LIMITS, OUTPUT_DIR
 from repo2file4gpt.repo_scrapper import RepositoryScraper
 
 
 def main():
-    processor = RepositoryScraper(
-        os.getenv("GITHUB_ACCESS_TOKEN"), FILETYPES, LINE_LIMITS
-    )
-    processor.process_repositories(REPO_URLS)
+    parser = argparse.ArgumentParser(description="Process some repositories.")
+    parser.add_argument("--token", type=str, help="GitHub access token")
+    parser.add_argument("--repos", type=str, nargs="+", help="List of repository URLs to process")
+    parser.add_argument("--filetypes", type=str, nargs="+", help="List of file types to process")
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+
+    args = parser.parse_args()
+
+    token = args.token if args.token else os.getenv("GITHUB_ACCESS_TOKEN")
+    repos = [repo.strip() for repo in args.repos] if args.repos else REPO_URLS
+    filetypes = [ftype.strip() for ftype in args.filetypes] if args.filetypes else FILETYPES
+    output_dir = args.output_dir.strip() if args.output_dir else OUTPUT_DIR
+
+    processor = RepositoryScraper(token, filetypes, LINE_LIMITS, output_dir)
+    processor.process_repositories(repos)
 
 
 if __name__ == "__main__":
     main()
12 changes: 11 additions & 1 deletion repo2file4gpt/config.py
@@ -1,8 +1,18 @@
 # List of repository URLs to process
-REPO_URLS = ["langroid/langroid", "microsoft/autogen"]
+REPO_URLS = [
+    "graykode/nlp-tutorial",
+    "google/dopamine",
+    "sebastianruder/NLP-progress",
+    "hezarai/hezar",
+    "langroid/langroid",
+    "microsoft/autogen",
+]
 
 # List of filetypes to consider
 FILETYPES = [".ts", ".js", ".json", ".csv", ".py", ".ipynb", ".sol", ".md"]
 
 # Line limits for each filetype
 LINE_LIMITS = {"json": 500}
+
+# Specify the output directory
+OUTPUT_DIR = "./outputs/"
21 changes: 19 additions & 2 deletions repo2file4gpt/gh_api_wrapper.py
@@ -1,3 +1,4 @@
+import logging
 from github import Github
 
 
@@ -7,7 +8,23 @@ def __init__(self, token):
 
     def get_repo(self, url):
         repo_name = url.split("/")[-1]
-        return self.github.get_repo(url.split("/")[-2] + "/" + repo_name)
+        return self.github.get_repo(url.split("/")[-2] + "/" + repo_name), repo_name
 
     def get_contents(self, repo, path=""):
-        return repo.get_contents(path)
+        try:
+            return repo.get_contents(path)
+        except Exception as e:
+            logging.error(
+                f"Error getting contents of {path} in {repo.full_name}: {str(e)}"
+            )
+            return []
+
+    def estimate_total_files(self, repo, path=""):
+        contents = self.get_contents(repo, path)
+        total_files = 0
+        for content in contents:
+            if content.type == "dir":
+                total_files += self.estimate_total_files(repo, content.path)
+            else:
+                total_files += 1
+        return total_files
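For reference, a minimal sketch of driving the changed wrapper directly, assuming a valid token: `get_repo` now returns a `(repo, name)` tuple, `get_contents` degrades to an empty list on API errors, and `estimate_total_files` recursively counts every file in the tree (used below to size the per-repository progress bar). The repository path is illustrative:

```python
from repo2file4gpt.gh_api_wrapper import GithubAPIWrapper

gh = GithubAPIWrapper("YOUR_GITHUB_TOKEN")

# get_repo splits the "owner/name" path itself and returns the repo plus its name
repo, name = gh.get_repo("user/repo1")

# Directories recurse; everything else counts as one file
print(name, gh.estimate_total_files(repo))
```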
20 changes: 19 additions & 1 deletion repo2file4gpt/markdown_exporter.py
@@ -1,15 +1,25 @@
 import os
+import logging
 
 
 class MarkdownExporter:
     def __init__(self, output_dir="./outputs"):
         self.output_dir = output_dir
         os.makedirs(output_dir, exist_ok=True)
+        os.makedirs("./logs", exist_ok=True)
+        logging.basicConfig(
+            filename="./logs/app.log",
+            filemode="w",
+            format="%(name)s - %(levelname)s - %(message)s",
+            level=logging.INFO,
+        )
 
     def open_file(self, filename):
         self.file = open(os.path.join(self.output_dir, filename), "w", encoding="utf-8")
+        logging.info(f"Processing {filename}")
 
     def close_file(self):
+        logging.info(f"Finished processing {self.file.name}")
         self.file.close()
 
     def write_header(self, repo_name):
@@ -33,5 +43,13 @@ def write_metadata(self, repo):
     def write_file_content(self, content, filetype):
         print(f"\n### {content.path}\n", file=self.file)
         print(f"```{filetype}", file=self.file)
-        print(content.decoded_content.decode(), file=self.file)
+        message = None
+        try:
+            if content.encoding == "base64":
+                print(content.decoded_content.decode(), file=self.file)
+            else:
+                message = f"Content of {content.path} could not be decoded. Encoding: {content.encoding}"
+        except Exception as e:
+            message = f"Error processing content of {content.path}: {str(e)}"
         print("```", file=self.file)
+        return message
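Pieced together from the exporter code, each matched file lands in the aggregated markdown as a `### <path>` heading followed by a fenced block tagged with the file's extension. A rough sketch of the per-file shape of an exported repository file (the header and metadata sections come from `write_header` and `write_metadata`, whose bodies are collapsed in this diff, so they are elided here; the path and contents are made up):

````markdown
### src/example.py

```py
# decoded file contents appear here
```
````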
43 changes: 31 additions & 12 deletions repo2file4gpt/repo_scrapper.py
@@ -1,34 +1,53 @@
+import logging
+from tqdm import tqdm
 from .gh_api_wrapper import GithubAPIWrapper
 from .markdown_exporter import MarkdownExporter
 
 
 class RepositoryScraper:
-    def __init__(self, token, filetypes, line_limits):
+    def __init__(self, token, filetypes, line_limits, output_dir):
         self.github = GithubAPIWrapper(token)
-        self.exporter = MarkdownExporter()
+        self.exporter = MarkdownExporter(output_dir)
         self.filetypes = filetypes
         self.line_limits = line_limits
 
-    def get_repo_content(self, repo, path=""):
+    def get_repo_content(self, repo, path="", pbar=None):
         contents = self.github.get_contents(repo, path)
         for content in contents:
             if content.type == "dir":
-                self.get_repo_content(repo, content.path)
+                self.get_repo_content(repo, content.path, pbar)
             else:
                 if content.path.endswith(tuple(self.filetypes)):
-                    self.exporter.write_file_content(
+                    message = self.exporter.write_file_content(
                         content, content.path.split(".")[-1]
                     )
+                    if message is not None:
+                        logging.warning(message)
+                pbar.update()
 
-    def process_repository(self, url):
-        repo = self.github.get_repo(url)
-        repo_name = url.split("/")[-1]
-        self.exporter.open_file(f"{repo_name}.md")
+    def process_repository(self, url, pbar):
+        repo, repo_name = self.github.get_repo(url)
+        owner_handle = repo.owner.login
+        total_files = self.github.estimate_total_files(repo)
+        self.exporter.open_file(f"{owner_handle}_{repo_name}.md")
         self.exporter.write_header(repo_name)
         self.exporter.write_metadata(repo)
-        self.get_repo_content(repo)
+        with tqdm(
+            total=total_files,
+            desc=f"Processing {repo.full_name}",
+            ncols=70,
+            position=1,
+            leave=False,
+        ) as pbar2:
+            self.get_repo_content(repo, pbar=pbar2)
         self.exporter.close_file()
+        pbar.update()
 
     def process_repositories(self, repo_urls):
-        for url in repo_urls:
-            self.process_repository(url)
+        print(f"Processing {len(repo_urls)} repositories\n")
+        print("Please check out the log file for additional logs and error reports\n")
+        with tqdm(
+            total=len(repo_urls), desc="Overall Progress", ncols=70, position=0
+        ) as pbar:
+            for url in repo_urls:
+                self.process_repository(url, pbar)
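The progress-reporting change follows tqdm's nested-bar convention: an outer per-repository bar at `position=0` and a transient inner per-file bar at `position=1` with `leave=False`, so the inner bar clears once its repository finishes. A stripped-down, self-contained sketch of the same pattern (repository names and file counts are made up):

```python
from tqdm import tqdm

repos = {"user/repo1": 3, "user/repo2": 5}  # hypothetical file counts

with tqdm(total=len(repos), desc="Overall Progress", ncols=70, position=0) as outer:
    for name, n_files in repos.items():
        with tqdm(
            total=n_files, desc=f"Processing {name}", ncols=70, position=1, leave=False
        ) as inner:
            for _ in range(n_files):
                inner.update()  # one tick per exported file
        outer.update()  # one tick per finished repository
```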
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-PyGithub==2.1.1
+PyGithub>=2.1.1
+tqdm>=4.62.3
4 changes: 1 addition & 3 deletions setup.py
@@ -4,7 +4,5 @@
     name="repo2file4gpt",
     version="0.1.0",
     packages=find_packages(),
-    install_requires=[
-        "github",
-    ],
+    install_requires=["PyGithub", "tqdm"],
 )
