refactor: add scripts

CCBR · Jul 19, 2024 · 342e04d · 342e04d
1 parent 9209f8b
commit 342e04d
Show file tree

Hide file tree

Showing 18 changed files with 546 additions and 0 deletions.
diff --git a/assets/make_readme/Dockerfile b/assets/make_readme/Dockerfile
@@ -0,0 +1,13 @@
+# Use an official Python runtime as a parent image
+FROM python:3.11-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Update the package list and install bash; skip recommended extras and
+# remove the downloaded package lists in the same layer to keep the image small
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends bash && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install the required Python packages
+# (tabulate is needed by pandas' DataFrame.to_markdown, used by the table scripts)
+RUN pip install --no-cache-dir pandas requests python-dateutil tabulate
diff --git a/assets/make_readme/about_us.md b/assets/make_readme/about_us.md
@@ -0,0 +1,9 @@
+
+## About Us
+
+- 👋 Hi, we're the [**@CCBR**](https://bioinformatics.ccr.cancer.gov/ccbr/), a group of bioinformatics analysts and engineers
+- 📖 We build flexible, reproducible workflows for next-generation sequencing data
+- :bulb: We [collaborate](https://abcs-amp.nih.gov/project/request/CCBR/) with [CCR](https://ccr.cancer.gov/) PIs
+- 📫 You can reach us at [[email protected]](mailto:[email protected])
+- 🏁 Check out our [release history](#release-history)
+- :link: Our [Zenodo](https://zenodo.org/communities/ccbr) community
diff --git a/assets/make_readme/add_toc.py b/assets/make_readme/add_toc.py
@@ -0,0 +1,57 @@
+import re
+import argparse
+
+def extract_headers(markdown_content):
+    """
+    Extract ATX-style headers from the markdown content.
+
+    A header is a line that starts with one to six '#' characters followed by
+    at least one space or tab. Lines inside fenced code blocks (opened by
+    three or more backticks or tildes) are skipped, so '#' comments in code
+    samples are not mistaken for headers.
+
+    Returns a list of (hashes, title) tuples in document order, where
+    `hashes` is the run of '#' characters and `title` is the rest of the line.
+    """
+    fence_pattern = re.compile(r'^ {0,3}(`{3,}|~{3,})')
+    header_pattern = re.compile(r'^(#{1,6})[ \t]+(.*)')
+    headers = []
+    open_fence = None  # marker that opened the current code fence, if any
+    for line in markdown_content.splitlines():
+        fence = fence_pattern.match(line)
+        if fence:
+            marker = fence.group(1)
+            if open_fence is None:
+                open_fence = marker
+            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
+                # Only the same fence character, at least as long, closes a fence
+                open_fence = None
+            continue
+        if open_fence is None:
+            header = header_pattern.match(line)
+            if header:
+                headers.append(header.groups())
+    return headers
+
+def generate_toc(headers):
+    """
+    Generate the Table of Contents (TOC) from the headers.
+
+    `headers` is an iterable of (hashes, title) pairs. Each pair becomes one
+    bullet, indented two spaces per header level below the first, that links
+    to the section's anchor.
+
+    Anchors follow the way GitHub builds section links: the title is
+    lowercased, punctuation other than hyphens and underscores is dropped,
+    spaces become hyphens, and repeated anchors get -1, -2, ... appended.
+
+    Returns the TOC as a single newline-joined string.
+    """
+    toc_lines = ["## Table of Contents"]
+    seen_anchors = {}  # anchor -> number of times it has been emitted so far
+    for hashes, raw_title in headers:
+        level = len(hashes)
+        title = raw_title.strip()
+        anchor = re.sub(r'[^\w\- ]', '', title.lower()).replace(' ', '-')
+        repeats = seen_anchors.get(anchor, 0)
+        seen_anchors[anchor] = repeats + 1
+        if repeats:
+            anchor = f"{anchor}-{repeats}"
+        toc_lines.append(f"{'  ' * (level - 1)}- [{title}](#{anchor})")
+    return '\n'.join(toc_lines)
+
+def insert_toc(markdown_content, toc):
+    """
+    Insert the TOC into the markdown content.
+
+    If the content contains the placeholder "<!-- TOC -->", every occurrence
+    is replaced by the TOC. Otherwise the TOC is placed on its own lines
+    directly before the first header line that is not the very first line of
+    the content, separated from the surrounding text by blank lines. When no
+    such header exists, the TOC is put at the top of the content.
+
+    Returns the updated content as a string.
+    """
+    toc_placeholder = "<!-- TOC -->"
+    if toc_placeholder in markdown_content:
+        return markdown_content.replace(toc_placeholder, toc)
+
+    newline_pos = markdown_content.find('\n#')
+    if newline_pos == -1:
+        return toc + '\n\n' + markdown_content
+
+    # Split after the newline so the TOC starts on a fresh line instead of
+    # being glued onto the end of the preceding line
+    before = markdown_content[:newline_pos + 1]
+    after = markdown_content[newline_pos + 1:]
+    # Add a blank line before the TOC unless one is already there
+    separator = '' if before == '\n' or before.endswith('\n\n') else '\n'
+    return before + separator + toc + '\n\n' + after
+
+def main():
+    """
+    Command-line entry point: read a Markdown file, add a TOC, write the result.
+
+    Required arguments: --input/-i (Markdown file to read) and --output/-o
+    (Markdown file to write). Prints a confirmation line when done.
+    """
+    parser = argparse.ArgumentParser(description='Add a Table of Contents (TOC) to a Markdown file.')
+    parser.add_argument('--input', '-i', required=True, help='Input Markdown file')
+    parser.add_argument('--output', '-o', required=True, help='Output Markdown file')
+
+    args = parser.parse_args()
+
+    # Read and write as UTF-8 explicitly so non-ASCII text (e.g. emoji) does
+    # not depend on the platform's default encoding
+    with open(args.input, 'r', encoding='utf-8') as f:
+        markdown_content = f.read()
+
+    headers = extract_headers(markdown_content)
+    toc = generate_toc(headers)
+    updated_content = insert_toc(markdown_content, toc)
+
+    with open(args.output, 'w', encoding='utf-8') as f:
+        f.write(updated_content)
+
+    print(f"TOC added to {args.output}")
+
+if __name__ == "__main__":
+    main()
diff --git a/assets/make_readme/back_to_top.md b/assets/make_readme/back_to_top.md
@@ -0,0 +1,5 @@
+
+<hr>
+<p align="center">
+	<a href="#table-of-contents">Back to Top</a>
+</p>
diff --git a/assets/make_readme/banner.md b/assets/make_readme/banner.md
@@ -0,0 +1,2 @@
+
+[<img src="https://raw.githubusercontent.com/CCBR/.github/main/img/ccbrbanner.png">](https://bioinformatics.ccr.cancer.gov/ccbr/)
diff --git a/assets/make_readme/ccbrpipeliner_release_history.md b/assets/make_readme/ccbrpipeliner_release_history.md
@@ -0,0 +1,20 @@
+
+## Release History
+
+`module load ccbrpipeliner` loads the default release of ccbrpipeliner. Each release comprises a unique combination of the version numbers of the different pipelines offered as part of the ccbrpipeliner suite.
+
+| Release | Tool versions | Released on | Decommissioned on |
+| --- | --- | --- | --- |
+| 1 | RENEE v2.1 <sup>@#</sup> | July, 10th 2023 | July, 14th 2023 |
+| 2 | RENEE v2.2 <sup>@#</sup> | July, 14th 2023 | September, 5th 2023 |
+| 3 | RENEE v2.2 <sup>@#</sup>, XAVIER v2.0 <sup>@</sup>| July, 21st 2023 | - |
+| 4 | RENEE v2.5 <sup>@#</sup>, XAVIER v3.0 <sup>@#</sup>| September, 5th 2023 | - |
+| 5 | RENEE v2.5 <sup>@#</sup>, XAVIER v3.0 <sup>@#</sup>, CARLISLE v2.4 <sup>@</sup>, CHAMPAGNE v0.2 <sup>@</sup>, CRUISE v0.1 <sup>@</sup>, spacesavers2 v0.10 <sup>@</sup>, permfix v0.6 <sup>@</sup> | October, 27th 2023 | - |
+| 6<sup>*</sup> | RENEE v2.5 <sup>@#</sup>, XAVIER v3.0 <sup>@#</sup>, CARLISLE v2.4 <sup>@</sup>, CHAMPAGNE v0.3 <sup>@</sup>, CRUISE v0.1 <sup>@</sup>, ASPEN v1.0 <sup>@</sup>, spacesavers2 v0.12 <sup>@</sup>, permfix v0.6 <sup>@</sup> | February, 29th 2024 | - |
+
+>
+> <sup>*</sup> = Current DEFAULT version on BIOWULF
+>
+> <sup>@</sup> = CLI available
+>  
+> <sup>#</sup> = GUI available
diff --git a/assets/make_readme/citation.md b/assets/make_readme/citation.md
@@ -0,0 +1,4 @@
+
+## Citation
+
+Most of our end-to-end pipelines which have been used in published research work have been made available to the entire bioinformatics community via a Zenodo DOI. Please feel free to visit our [Zenodo community page](https://zenodo.org/communities/ccbr). And if you use our pipelines, don't forget to cite us!
diff --git a/assets/make_readme/get_per_user_commits.py b/assets/make_readme/get_per_user_commits.py
@@ -0,0 +1,136 @@
+import requests
+import os
+import pandas as pd
+from collections import defaultdict
+from datetime import datetime, timedelta
+
+# The GitHub token is read from the GITHUB_TOKEN environment variable;
+# change ORG_NAME to query a different organization
+GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
+ORG_NAME = 'CCBR'
+# ORG_NAME = 'CCRGeneticsBranch'
+# ORG_NAME = 'NIDAP-Community'
+# ORG_NAME = 'NCI-VB'
+
+# Headers sent with every GitHub API request made by this script
+headers = {
+    'Accept': 'application/vnd.github.v3+json',
+}
+# Only send credentials when a token is set; otherwise the header would
+# carry the literal text "token None"
+if GITHUB_TOKEN:
+    headers['Authorization'] = f'token {GITHUB_TOKEN}'
+
+def get_repos(org_name):
+    """
+    Collect the repositories of an organization from the GitHub REST API.
+
+    Pages of up to 100 repositories are requested until a page comes back
+    with fewer than 100 entries or a request does not return HTTP 200.
+    Whatever was gathered up to that point is returned as a list of the
+    decoded JSON objects.
+    """
+    collected = []
+    page_number = 1
+    while True:
+        url = f'https://api.github.com/orgs/{org_name}/repos?per_page=100&page={page_number}'
+        reply = requests.get(url, headers=headers)
+        if reply.status_code != 200:
+            return collected
+        batch = reply.json()
+        collected.extend(batch)
+        if len(batch) < 100:
+            return collected
+        page_number += 1
+
+def get_members(org_name):
+    """
+    Return the set of login names listed as members of an organization.
+
+    The members endpoint is read 100 entries per page until an empty page is
+    returned or a request does not return HTTP 200; logins gathered before
+    that point are kept.
+    """
+    logins = set()
+    page_number = 1
+    while True:
+        url = f'https://api.github.com/orgs/{org_name}/members?per_page=100&page={page_number}'
+        reply = requests.get(url, headers=headers)
+        if reply.status_code != 200:
+            break
+        batch = reply.json()
+        if not batch:
+            break
+        logins.update(entry['login'] for entry in batch)
+        page_number += 1
+    return logins
+
+def get_outside_collaborators(repo_full_name):
+    """
+    Return the set of login names of a repository's outside collaborators.
+
+    `repo_full_name` is interpolated into the collaborators endpoint, which is
+    queried with affiliation=outside, 100 entries per page, until an empty
+    page is returned or a request does not return HTTP 200.
+    """
+    logins = set()
+    page_number = 1
+    while True:
+        url = (
+            f'https://api.github.com/repos/{repo_full_name}/collaborators'
+            f'?affiliation=outside&per_page=100&page={page_number}'
+        )
+        reply = requests.get(url, headers=headers)
+        if reply.status_code != 200:
+            break
+        batch = reply.json()
+        if not batch:
+            break
+        logins.update(entry['login'] for entry in batch)
+        page_number += 1
+    return logins
+
+def get_commits_count(repo_full_name, members_and_collaborators):
+    """
+    Tally the commits of one repository per author login.
+
+    Only commits whose author has a GitHub login contained in
+    `members_and_collaborators` are counted. For each such login the result
+    holds 'total', 'last_month' (author date within the last 30 days) and
+    'last_6_months' (within the last 180 days), measured against the current
+    UTC time.
+
+    Commits are read 100 per page until an empty page is returned or a
+    request does not return HTTP 200. Returns a defaultdict keyed by login.
+    """
+    def _empty_tally():
+        return {'total': 0, 'last_month': 0, 'last_6_months': 0}
+
+    tallies = defaultdict(_empty_tally)
+    now = datetime.utcnow()
+    month_cutoff = now - timedelta(days=30)
+    half_year_cutoff = now - timedelta(days=180)
+
+    page_number = 1
+    while True:
+        url = f'https://api.github.com/repos/{repo_full_name}/commits?per_page=100&page={page_number}'
+        reply = requests.get(url, headers=headers)
+        if reply.status_code != 200:
+            break
+        batch = reply.json()
+        if not batch:
+            break
+
+        for entry in batch:
+            # A commit with no associated GitHub account has a null 'author'
+            login = entry['author']['login'] if entry['author'] else 'unknown'
+            committed_at = datetime.strptime(entry['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ')
+
+            if login == 'unknown' or login not in members_and_collaborators:
+                continue
+
+            tally = tallies[login]
+            tally['total'] += 1
+            if committed_at >= month_cutoff:
+                tally['last_month'] += 1
+            if committed_at >= half_year_cutoff:
+                tally['last_6_months'] += 1
+
+        page_number += 1
+
+    return tallies
+
+def main():
+    """
+    Print a Markdown table of the ten users with the most commits in ORG_NAME.
+
+    Members of the organization and outside collaborators of any of its
+    repositories are considered. Commit counts are summed over all
+    repositories and the table lists, per user, the total plus the counts
+    for the last month and the last six months.
+    """
+    members = get_members(ORG_NAME)
+    repos = get_repos(ORG_NAME)
+
+    # Anyone listed as an outside collaborator on at least one repository
+    outside_collaborators = set()
+    for repo in repos:
+        outside_collaborators |= get_outside_collaborators(repo['full_name'])
+
+    eligible_users = members | outside_collaborators
+
+    # Sum the per-repository tallies into one tally per user
+    count_keys = ('total', 'last_month', 'last_6_months')
+    totals = defaultdict(lambda: {'total': 0, 'last_month': 0, 'last_6_months': 0})
+    for repo in repos:
+        per_repo = get_commits_count(repo['full_name'], eligible_users)
+        for user, counts in per_repo.items():
+            for key in count_keys:
+                totals[user][key] += counts[key]
+
+    column_names = ['User', 'Total Commits', 'Commits in Last Month', 'Commits in Last 6 Months']
+    rows = [
+        [user, counts['total'], counts['last_month'], counts['last_6_months']]
+        for user, counts in totals.items()
+    ]
+
+    df = pd.DataFrame(rows, columns=column_names)
+    df = df[df['User'] != 'unknown']  # Remove 'unknown' users
+    df = df.sort_values(by='Total Commits', ascending=False).head(10)  # Top 10 users
+
+    # Render and print the Markdown table
+    print(df.to_markdown(index=False, headers=column_names))
+
+if __name__ == "__main__":
+    main()
diff --git a/assets/make_readme/get_recent_releases_table.py b/assets/make_readme/get_recent_releases_table.py
@@ -0,0 +1,91 @@
+import argparse
+import os
+from datetime import datetime
+
+import pandas as pd
+import requests
+from dateutil.relativedelta import relativedelta
+
+# The GitHub token is read from the GITHUB_TOKEN environment variable;
+# change ORG_NAME to query a different organization
+GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
+ORG_NAME = 'CCBR'
+
+# Headers sent with every GitHub API request made by this script
+headers = {
+    'Accept': 'application/vnd.github.v3+json',
+}
+# Only send credentials when a token is set; otherwise the header would
+# carry the literal text "token None"
+if GITHUB_TOKEN:
+    headers['Authorization'] = f'token {GITHUB_TOKEN}'
+
+def get_date_n_months_ago(n_months):
+    """Return the date `n_months` calendar months before now, formatted as 'YYYY-MM-DD'."""
+    cutoff = datetime.now() - relativedelta(months=n_months)
+    return cutoff.strftime('%Y-%m-%d')
+
+def get_repos(org_name):
+    """
+    Fetch the repositories of an organization from the GitHub REST API.
+
+    Requests 100 repositories per page and stops after the first page with
+    fewer than 100 entries, or as soon as a request does not return HTTP 200.
+    Returns the decoded JSON objects gathered so far as a list.
+    """
+    found = []
+    page_number = 1
+    keep_going = True
+    while keep_going:
+        reply = requests.get(
+            f'https://api.github.com/orgs/{org_name}/repos?per_page=100&page={page_number}',
+            headers=headers,
+        )
+        if reply.status_code != 200:
+            break
+        batch = reply.json()
+        found.extend(batch)
+        # A short page means there is nothing left to fetch
+        keep_going = len(batch) >= 100
+        page_number += 1
+    return found
+
+def format_date(date_str):
+    """
+    Convert a timestamp such as '2024-07-19T12:34:56Z' to 'YYYY-MM-DD'.
+
+    Returns 'Unknown date' when `date_str` is not a string in exactly that
+    format, including when it is None.
+    """
+    try:
+        date_obj = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
+    except (TypeError, ValueError):
+        # TypeError: strptime was handed a non-string (e.g. None)
+        # ValueError: the string does not match the expected format
+        return 'Unknown date'
+    return date_obj.strftime('%Y-%m-%d')
+
+def get_latest_release(repo_full_name):
+    """
+    Fetch the latest release of a repository.
+
+    Returns the decoded JSON of the releases/latest endpoint when the request
+    returns HTTP 200, otherwise None.
+    """
+    url = f'https://api.github.com/repos/{repo_full_name}/releases/latest'
+    reply = requests.get(url, headers=headers)
+    return reply.json() if reply.status_code == 200 else None
+
+def get_open_issues_count(repo_full_name):
+    """
+    Count the open issues of a repository, excluding pull requests.
+
+    The issues endpoint is paginated and also lists pull requests (entries
+    carrying a 'pull_request' key), so every page is read, 100 entries at a
+    time, and pull requests are left out of the count.
+
+    Returns the number of issues counted; if a request does not return
+    HTTP 200, counting stops and the count so far (0 if the first request
+    fails) is returned.
+    """
+    open_issues = 0
+    page = 1
+    while True:
+        response = requests.get(
+            f'https://api.github.com/repos/{repo_full_name}/issues?state=open&per_page=100&page={page}',
+            headers=headers,
+        )
+        if response.status_code != 200:
+            break
+        items = response.json()
+        open_issues += sum(1 for item in items if 'pull_request' not in item)
+        # A short page means there is nothing left to fetch
+        if len(items) < 100:
+            break
+        page += 1
+    return open_issues
+
+def main():
+    """
+    Print a Markdown table of the latest release of each repository in ORG_NAME.
+
+    With --nmonths N (N > 0) only releases published within the last N months
+    are listed; with the default of 0 all latest releases are listed. Rows
+    are sorted by release date, newest first, and show the repository, the
+    release (linked), the release date and the number of open issues.
+    """
+    parser = argparse.ArgumentParser(description='Fetch GitHub repository releases.')
+    parser.add_argument('--nmonths', type=int, default=0, help='Number of months to filter releases. If not provided, shows all releases.')
+    args = parser.parse_args()
+
+    columns = ['Repo Name', 'Release Name', 'Release Date', 'Open Issues']
+    repos = get_repos(ORG_NAME)
+    releases = []
+    cutoff_date = get_date_n_months_ago(args.nmonths)
+
+    for repo in repos:
+        latest_release = get_latest_release(repo['full_name'])
+        if not latest_release:
+            continue
+
+        formatted_date = format_date(latest_release['published_at'])
+        if formatted_date == 'Unknown date':
+            continue
+        # Both values are 'YYYY-MM-DD' strings, so comparing them as strings
+        # orders them chronologically
+        if args.nmonths != 0 and formatted_date < cutoff_date:
+            continue
+
+        repo_name = repo['name']
+        # A release may have no title; fall back to its tag so the link text
+        # is never "None"
+        release_name = latest_release['name'] or latest_release.get('tag_name') or repo_name
+        release_url = latest_release['html_url']
+        releases.append({
+            'Repo Name': f"[{repo_name}](https://github.com/{ORG_NAME}/{repo_name})",
+            'Release Name': f"[{release_name}]({release_url})",
+            'Release Date': formatted_date,
+            # Queried only for repositories that make it into the table
+            'Open Issues': get_open_issues_count(repo['full_name']),
+        })
+
+    # Sort releases by date in descending order
+    sorted_releases = sorted(releases, key=lambda x: x['Release Date'], reverse=True)
+
+    # Name the columns explicitly so an empty result still has all four
+    df = pd.DataFrame(sorted_releases, columns=columns)
+    markdown_table = df.to_markdown(index=False, headers=columns)
+
+    # Print Markdown table
+    print(markdown_table)
+
+if __name__ == "__main__":
+    main()