Commit: add a concise readme

jrazi committed Dec 17, 2023
1 parent e3b2008 commit 39c5adc
Showing 8 changed files with 155 additions and 29 deletions.
58 changes: 56 additions & 2 deletions README.md
@@ -1,3 +1,57 @@
-# repo2file4gpt
+# 📁 repo2file4gpt
 
-This project aims to convert the content of GitHub repositories into a structured, machine-readable format, enabling AI models like ChatGPT to utilize them as a knowledge base.
+repo2file4gpt is a Python package that scrapes GitHub repositories and exports their content into structured markdown files. The goal is to convert repository content into a machine-readable format for use as AI training data.
+
+It extracts key files such as code, markdown, and notebooks from public GitHub repositories. The content is exported into one aggregated markdown file per repository, with the full hierarchy preserved.
+
+## Installation
+
+You can install repo2file4gpt directly from PyPI:
+
+```bash
+pip install repo2file4gpt
+```
+
+## Quick Start
+
+### Command Line Interface
+
+After installing repo2file4gpt, you can use it from the command line as follows:
+
+```bash
+repo2file4gpt --token YOUR_GITHUB_TOKEN --repos user/repo1 user/repo2 --filetypes py js --output_dir ./outputs/
+```
+
+Replace `YOUR_GITHUB_TOKEN` with your GitHub token, and `user/repo1` and `user/repo2` with the repositories you want to process. If `--token` is omitted, the CLI falls back to the `GITHUB_ACCESS_TOKEN` environment variable.
+
+### Python Code
+
+You can also use repo2file4gpt in your Python code:
+
+```python
+import repo2file4gpt
+
+# Specify the GitHub token, list of repositories, file types, and output directory
+token = "YOUR_GITHUB_TOKEN"
+repos = ["user/repo1", "user/repo2"]
+filetypes = ["py", "js"]
+output_dir = "./outputs/"
+
+# Create a RepositoryScraper instance
+processor = repo2file4gpt.RepositoryScraper(token, filetypes, repo2file4gpt.LINE_LIMITS, output_dir)
+
+# Process the repositories
+processor.process_repositories(repos)
+```
+
+Again, replace `YOUR_GITHUB_TOKEN` and the repository names with your own values.
+
+## TODO
+
+- Add support for more file types.
+- Improve error handling for robustness.
+- Optimize performance for large repositories.
+
+## Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
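A note on the `LINE_LIMITS` argument used in the Python Quick Start: judging from `repo2file4gpt/config.py` below, it is a plain dict mapping a filetype to a maximum line count, so a custom dict can presumably be passed in its place. A minimal sketch under that assumption (the `csv` cap is illustrative; the diff stores the limits on the scraper but does not show where they are applied):

```python
import repo2file4gpt

# Hypothetical per-filetype caps; the package default in config.py is {"json": 500}.
line_limits = {"json": 500, "csv": 200}

processor = repo2file4gpt.RepositoryScraper(
    "YOUR_GITHUB_TOKEN",  # or rely on the GITHUB_ACCESS_TOKEN env var via the CLI
    ["py", "md"],         # filetypes to include
    line_limits,
    "./outputs/",
)
processor.process_repositories(["user/repo1"])
```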
23 changes: 16 additions & 7 deletions repo2file4gpt/__main__.py
@@ -1,15 +1,24 @@
+import argparse
 import os
 
-from .config import REPO_URLS, FILETYPES, LINE_LIMITS
+from .config import REPO_URLS, FILETYPES, LINE_LIMITS, OUTPUT_DIR
 from repo2file4gpt.repo_scrapper import RepositoryScraper
 
 
 def main():
-    processor = RepositoryScraper(
-        os.getenv("GITHUB_ACCESS_TOKEN"), FILETYPES, LINE_LIMITS
-    )
-    processor.process_repositories(REPO_URLS)
+    parser = argparse.ArgumentParser(description="Process some repositories.")
+    parser.add_argument("--token", type=str, help="GitHub access token")
+    parser.add_argument("--repos", type=str, nargs="+", help="List of repository URLs to process")
+    parser.add_argument("--filetypes", type=str, nargs="+", help="List of file types to process")
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+
+    args = parser.parse_args()
+
+    token = args.token if args.token else os.getenv("GITHUB_ACCESS_TOKEN")
+    repos = [repo.strip() for repo in args.repos] if args.repos else REPO_URLS
+    filetypes = [ftype.strip() for ftype in args.filetypes] if args.filetypes else FILETYPES
+    output_dir = args.output_dir.strip() if args.output_dir else OUTPUT_DIR
+
+    processor = RepositoryScraper(token, filetypes, LINE_LIMITS, output_dir)
+    processor.process_repositories(repos)
 
 
 if __name__ == "__main__":
     main()
12 changes: 11 additions & 1 deletion repo2file4gpt/config.py
@@ -1,8 +1,18 @@
 # List of repository URLs to process
-REPO_URLS = ["langroid/langroid", "microsoft/autogen"]
+REPO_URLS = [
+    "graykode/nlp-tutorial",
+    "google/dopamine",
+    "sebastianruder/NLP-progress",
+    "hezarai/hezar",
+    "langroid/langroid",
+    "microsoft/autogen",
+]
 
 # List of filetypes to consider
 FILETYPES = [".ts", ".js", ".json", ".csv", ".py", ".ipynb", ".sol", ".md"]
 
 # Line limits for each filetype
 LINE_LIMITS = {"json": 500}
+
+# Specify the output directory
+OUTPUT_DIR = "./outputs/"
21 changes: 19 additions & 2 deletions repo2file4gpt/gh_api_wrapper.py
@@ -1,3 +1,4 @@
+import logging
 from github import Github
 
 
@@ -7,7 +8,23 @@ def __init__(self, token):
 
     def get_repo(self, url):
         repo_name = url.split("/")[-1]
-        return self.github.get_repo(url.split("/")[-2] + "/" + repo_name)
+        return self.github.get_repo(url.split("/")[-2] + "/" + repo_name), repo_name
 
     def get_contents(self, repo, path=""):
-        return repo.get_contents(path)
+        try:
+            return repo.get_contents(path)
+        except Exception as e:
+            logging.error(
+                f"Error getting contents of {path} in {repo.full_name}: {str(e)}"
+            )
+            return []
+
+    def estimate_total_files(self, repo, path=""):
+        contents = self.get_contents(repo, path)
+        total_files = 0
+        for content in contents:
+            if content.type == "dir":
+                total_files += self.estimate_total_files(repo, content.path)
+            else:
+                total_files += 1
+        return total_files
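For reference, a minimal sketch of driving the changed wrapper directly, assuming a valid token: `get_repo` now returns a `(repo, name)` tuple, `get_contents` degrades to an empty list on API errors, and `estimate_total_files` recursively counts every file in the tree (used below to size the per-repository progress bar). The repository path is illustrative:

```python
from repo2file4gpt.gh_api_wrapper import GithubAPIWrapper

gh = GithubAPIWrapper("YOUR_GITHUB_TOKEN")

# get_repo splits the "owner/name" path itself and returns the repo plus its name
repo, name = gh.get_repo("user/repo1")

# Directories recurse; everything else counts as one file
print(name, gh.estimate_total_files(repo))
```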
20 changes: 19 additions & 1 deletion repo2file4gpt/markdown_exporter.py
@@ -1,15 +1,25 @@
 import os
+import logging
 
 
 class MarkdownExporter:
     def __init__(self, output_dir="./outputs"):
         self.output_dir = output_dir
         os.makedirs(output_dir, exist_ok=True)
+        os.makedirs("./logs", exist_ok=True)
+        logging.basicConfig(
+            filename="./logs/app.log",
+            filemode="w",
+            format="%(name)s - %(levelname)s - %(message)s",
+            level=logging.INFO,
+        )
 
     def open_file(self, filename):
         self.file = open(os.path.join(self.output_dir, filename), "w", encoding="utf-8")
+        logging.info(f"Processing {filename}")
 
     def close_file(self):
+        logging.info(f"Finished processing {self.file.name}")
         self.file.close()
 
     def write_header(self, repo_name):
@@ -33,5 +43,13 @@ def write_metadata(self, repo):
     def write_file_content(self, content, filetype):
         print(f"\n### {content.path}\n", file=self.file)
         print(f"```{filetype}", file=self.file)
-        print(content.decoded_content.decode(), file=self.file)
+        message = None
+        try:
+            if content.encoding == "base64":
+                print(content.decoded_content.decode(), file=self.file)
+            else:
+                message = f"Content of {content.path} could not be decoded. Encoding: {content.encoding}"
+        except Exception as e:
+            message = f"Error processing content of {content.path}: {str(e)}"
         print("```", file=self.file)
+        return message
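Pieced together from the exporter code, each matched file lands in the aggregated markdown as a `### <path>` heading followed by a fenced block tagged with the file's extension. A rough sketch of the per-file shape of an exported repository file (the header and metadata sections come from `write_header` and `write_metadata`, whose bodies are collapsed in this diff, so they are elided here; the path and contents are made up):

````markdown
### src/example.py

```py
# decoded file contents appear here
```
````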
43 changes: 31 additions & 12 deletions repo2file4gpt/repo_scrapper.py
@@ -1,34 +1,53 @@
+import logging
+from tqdm import tqdm
 from .gh_api_wrapper import GithubAPIWrapper
 from .markdown_exporter import MarkdownExporter
 
 
 class RepositoryScraper:
-    def __init__(self, token, filetypes, line_limits):
+    def __init__(self, token, filetypes, line_limits, output_dir):
         self.github = GithubAPIWrapper(token)
-        self.exporter = MarkdownExporter()
+        self.exporter = MarkdownExporter(output_dir)
         self.filetypes = filetypes
         self.line_limits = line_limits
 
-    def get_repo_content(self, repo, path=""):
+    def get_repo_content(self, repo, path="", pbar=None):
         contents = self.github.get_contents(repo, path)
         for content in contents:
             if content.type == "dir":
-                self.get_repo_content(repo, content.path)
+                self.get_repo_content(repo, content.path, pbar)
             else:
                 if content.path.endswith(tuple(self.filetypes)):
-                    self.exporter.write_file_content(
+                    message = self.exporter.write_file_content(
                         content, content.path.split(".")[-1]
                     )
+                    if message is not None:
+                        logging.warning(message)
+                pbar.update()
 
-    def process_repository(self, url):
-        repo = self.github.get_repo(url)
-        repo_name = url.split("/")[-1]
-        self.exporter.open_file(f"{repo_name}.md")
+    def process_repository(self, url, pbar):
+        repo, repo_name = self.github.get_repo(url)
+        owner_handle = repo.owner.login
+        total_files = self.github.estimate_total_files(repo)
+        self.exporter.open_file(f"{owner_handle}_{repo_name}.md")
         self.exporter.write_header(repo_name)
         self.exporter.write_metadata(repo)
-        self.get_repo_content(repo)
+        with tqdm(
+            total=total_files,
+            desc=f"Processing {repo.full_name}",
+            ncols=70,
+            position=1,
+            leave=False,
+        ) as pbar2:
+            self.get_repo_content(repo, pbar=pbar2)
         self.exporter.close_file()
+        pbar.update()
 
     def process_repositories(self, repo_urls):
-        for url in repo_urls:
-            self.process_repository(url)
+        print(f"Processing {len(repo_urls)} repositories\n")
+        print("Please check out the log file for additional logs and error reports\n")
+        with tqdm(
+            total=len(repo_urls), desc="Overall Progress", ncols=70, position=0
+        ) as pbar:
+            for url in repo_urls:
+                self.process_repository(url, pbar)
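The progress-reporting change follows tqdm's nested-bar convention: an outer per-repository bar at `position=0` and a transient inner per-file bar at `position=1` with `leave=False`, so the inner bar clears once its repository finishes. A stripped-down, self-contained sketch of the same pattern (repository names and file counts are made up):

```python
from tqdm import tqdm

repos = {"user/repo1": 3, "user/repo2": 5}  # hypothetical file counts

with tqdm(total=len(repos), desc="Overall Progress", ncols=70, position=0) as outer:
    for name, n_files in repos.items():
        with tqdm(
            total=n_files, desc=f"Processing {name}", ncols=70, position=1, leave=False
        ) as inner:
            for _ in range(n_files):
                inner.update()  # one tick per exported file
        outer.update()  # one tick per finished repository
```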
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-PyGithub==2.1.1
+PyGithub>=2.1.1
+tqdm>=4.62.3
4 changes: 1 addition & 3 deletions setup.py
@@ -4,7 +4,5 @@
     name="repo2file4gpt",
     version="0.1.0",
     packages=find_packages(),
-    install_requires=[
-        "github",
-    ],
+    install_requires=["PyGithub", "tqdm"],
 )
