feat: Collect output from all attempts of workflows; full JSON output

This is a reworking of `get_action_errors.py` with the following goals:

- Retrieve all attempts of a workflow, not just the last one. This is important for collection of transient errors, since people will usually re-run their workflows when there is a failure unrelated to their code.
- Tolerate interruptions of the script due to timeouts, rate-limiting, and networking issues. On re-run, workflow runs that have been fully fetched are skipped.
- Record the full JSON for workflow attempts, checks, and annotations so that we can do more in-depth queries if needed.

The script no longer produces a CSV, but it's still straightforward to query how often we're seeing a particular error.
edx · Jan 31, 2024 · e187a7c · e187a7c
1 parent f76ad37
commit e187a7c
Show file tree

Hide file tree

Showing 3 changed files with 248 additions and 118 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -14,6 +14,12 @@ Change Log
 Unreleased
 ~~~~~~~~~~
 
+[3.4.0] - 2024-02-01
+~~~~~~~~~~~~~~~~~~~~
+Changed
+_______
+* GitHub Actions downloader script now pulls all attempts of each workflow run and records the results as JSON. The CLI has changed.
+
 [3.3.0] - 2024-01-23
 ~~~~~~~~~~~~~~~~~~~~
 Changed

diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py
@@ -2,4 +2,4 @@
 A plugin to include applications under development by the architecture team at 2U.
 """
 
-__version__ = '3.3.0'
+__version__ = '3.4.0'
diff --git a/edx_arch_experiments/scripts/get_action_errors.py b/edx_arch_experiments/scripts/get_action_errors.py
@@ -1,138 +1,262 @@
 """
-Script to get the annotations from all failed checks in edx-platform after a given date
+Script to get the annotations from all checks in a repo in a given date range.
 
-Gets all the commits to master after the date, then for each commit gets each check suite, then for each failed check
-suite gets each run. Collects the annotations for all the failed runs. The annotations will sometimes contain useful
-error messages, sometimes just the exit code. Getting the full logs requires admin permissions to edx-platform so it's
-not included in this script.
-Example output row:
-commit_date,run_started_at,run_completed_at,commit_hash,name,message
-2023-07-26T20:59:01Z,2023-07-27T06:56:23Z,2023-07-27T07:01:58Z,06e738e64a3485ecec037a9b8a36cf4ae145ea8a,
-upgrade-one-python-dependency-workflow,Process completed with exit code 2.
-
-
-This script takes a pretty long time to run (15m for 2 months) and there is a risk if you look too far back you will hit
-your API limit.
+Run with --help (or see docstring of `run`) for more details.
 """
 
-from csv import DictWriter
+import json
+import os
+import time
 from datetime import datetime
+from os import path
 
 import click
 import requests
 
 
-@click.command()
-@click.option('--token', envvar='GITHUB_TOKEN')
-@click.option('--start_date', type=click.DateTime(formats=["%Y-%m-%d"]), help="Date of earliest commit")
-@click.option('--filename', help="Where to write the data")
-def get_errors_from_date(token, start_date, filename):
+def _ensure_dir(base, *more):
     """
-    Creates a csv documenting the annotations from all failed runs for commits to edx-platform after the given date
+    Join these path segments, create as dir if not already, and return the path.
+    """
+    subdir = path.join(base, *more)
+    os.makedirs(subdir, exist_ok=True)
+    return subdir
 
-    Parameters:
-        token (string): The GitHub API token. Retrieved from the env GITHUB_TOKEN variable
-        start_date (date): The earliest date to look for
-        filename (string): Where to write the csv
 
-    """
-    headers = {'Authorization': f"Bearer {token}"}
-    all_commits_after_date = get_commits_after_date(start_date, headers=headers)
-    all_check_suites = []
-    all_rows = []
-    for commit in all_commits_after_date:
-        # gather all the check suite data from each commit into a single list
-        add_commit_check_suites(commit, all_check_suites, headers)
-    for check_suite in all_check_suites:
-        # only record annotations for failed runs
-        if check_suite['conclusion'] == 'failure':
-            check_runs = requests.get(check_suite['check_runs_url'], headers=headers).json()
-            for run in check_runs['check_runs']:
-                if run['conclusion'] == 'failure' and run['output']['annotations_count'] > 0:
-                    annotations = requests.get(run['output']['annotations_url'], headers=headers).json()
-                    for annotation in annotations:
-                        all_rows.append({
-                            'commit_hash': run['head_sha'],
-                            'name': run['name'],
-                            'message': annotation['message'],
-                            'run_started_at': run['started_at'],
-                            'run_completed_at': run['completed_at'],
-                            'commit_date': check_suite['commit_date']
-                        })
-
-    with open(filename, 'w') as f:
-        writes = DictWriter(f, fieldnames=['commit_date', 'run_started_at', 'run_completed_at', 'commit_hash', 'name',
-                                           'message'])
-        writes.writeheader()
-        writes.writerows(all_rows)
-
-
-def get_commits_after_date(cut_off_date, headers):
-    """
-    Get API data for all commits to edx-platform/master after the given date
+class ActionsDownloader:
 
-    Parameters:
-        cut_off_date (date): Earliest date to look
-        headers (dict): Authentication headers for GH requests
+    def __init__(self, *, output_dir, token):
+        self.output_dir = output_dir
+        self.api_headers = {
+            'Accept': 'application/vnd.github+json',
+            'Authorization': f"Bearer {token}",
+            'X-GitHub-Api-Version': '2022-11-28',
+        }
 
-    Returns:
-        A list of all the API responses for each commit after the date
-    """
-    base_url = "https://api.github.com/repos/openedx/edx-platform/commits?sha=master&per_page=100"
-    # will keep track of whether we've hit our start_date. the API automatically returns commits ordered
-    # by date, descending
-    found_last = False
-    all_commits_after_date = []
-    page = 1
-    while not found_last:
-        page_url = f"{base_url}&page={page}"
-        print(f"Fetching page {page_url}")
-        response = requests.get(page_url, headers=headers)
-        if response.status_code >= 400:
-            print(response)
-            break
-        response_json = response.json()
-        if len(response_json) == 0:
-            break
-        for single_commit in response_json:
-            # if present, take off the "Z" at the end of the date to make it proper ISO format
-            commit_date = datetime.fromisoformat(single_commit['commit']['committer']['date'].replace("Z", ""))
-            if commit_date < cut_off_date:
-                found_last = True
-                break
-            all_commits_after_date.append(single_commit)
-        page += 1
-    return all_commits_after_date
-
-
-def add_commit_check_suites(current_commit, current_suites, headers):
+        # Unix epoch timestamp in seconds when GitHub will allow us to
+        # resume making requests (or None if not rate-limited)
+        self.github_sleep_until_s = None
+
+        # The actual contents don't really matter, but empty files might
+        # be confusing and maybe the fetch date will be useful.
+        self.download_marker_data = {
+            'fetch_run_timestamp': datetime.utcnow().isoformat(),
+        }
+        self.workflow_fetch_params = {
+            # We don't want the response to include all the PRs that include this commit.
+            # It can be large for active repos and it doesn't help us.
+            'exclude_pull_requests': 'true',
+        }
+
+    def _github_get(self, url, *, params=None):
+        """
+        GET the url with GitHub auth token and return a `requests` response.
+
+        Performs both proactive backoff based on response header hints and
+        reactive backoff based on error responses.
+
+        Docs:
+        - https://docs.github.com/en/rest/using-the-rest-api/best-practices-for-using-the-rest-api?apiVersion=2022-11-28
+        - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28
+        """
+        backoff_s = None  # exponential backoff in seconds, or None if not in effect
+        while True:
+            # If GitHub has told us how long to wait, use that instead of the
+            # current exponential backoff value.
+            if self.github_sleep_until_s is not None:
+                # Add slop to prevent tight loop on expiry
+                sleep_s = self.github_sleep_until_s + 5.0 - time.time()
+            else:
+                sleep_s = backoff_s
+
+            if sleep_s and sleep_s > 0:
+                time.sleep(sleep_s)
+
+            response = requests.get(url, params=params, headers=self.api_headers, timeout=20.0)
+
+            # Update rate-limiting data for next call
+            if out_of_requests := (response.headers.get('x-ratelimit-remaining') == '0'):
+                self.github_sleep_until_s = int(response.headers.get('x-ratelimit-reset'))
+                print(
+                    "Reached rate limit. "
+                    f"Will wait {int(self.github_sleep_until_s - time.time())} seconds "
+                    "before next request."
+                )
+            else:
+                self.github_sleep_until_s = None
+                backoff_s = None
+
+            if response.status_code == 200:
+                # We're good to go!
+                return response
+            elif out_of_requests:
+                # We got an error and have been informed we're out of requests.
+                # (Should be a 429 or 403 according to GitHub's docs.) We'll try
+                # again.
+                continue
+            elif response.status_code == 429:
+                # It's possible that GitHub might give us a 429 without the
+                # expected rate-limiting headers.
+                print("Rate-limited without timing hint; performing exponential backoff.")
+                backoff_s = 2 * backoff_s if backoff_s else 4
+                continue
+            else:
+                # Generic error case
+                response.raise_for_status()
+
+    def _write_json(self, data, *path_parts):
+        """
+        Write data as pretty-printed JSON to the given path (joining as needed).
+        """
+        with open(path.join(*path_parts), 'w') as f:
+            json.dump(data, f, sort_keys=True, indent=2)
+
+    def _log_attempt(self, attempt, workflow_dir):
+        """
+        Log this attempt and all of its checks and annotations.
+        """
+        attempt_dir = _ensure_dir(workflow_dir, f"attempt_{attempt['run_attempt']}")
+
+        attempt_file = path.join(attempt_dir, f"attempt.json")
+        if path.isfile(attempt_file):
+            print("Attempt already fully downloaded; skipping.")
+            return
+
+        # Get the checks associated with this workflow run -- this
+        # includes output title, summary, and text.
+        for check_run in self._github_get(attempt['check_suite_url'] + '/check-runs').json()['check_runs']:
+            self._write_json(check_run, attempt_dir, f"check_run_{check_run['id']}.json")
+
+            annotations = self._github_get(check_run['output']['annotations_url']).json()
+            self._write_json(annotations, attempt_dir, f"annotations_{check_run['id']}.json")
+
+        # Do this last, indicating that the attempt was completely
+        # downloaded. This allows us to skip it next time.
+        self._write_json(attempt, attempt_file)
+
+    def _list_all_attempts(self, run):
+        """
+        Yield all attempts of the given workflow run, including that one.
+        """
+        yield run
+        while next_url := run.get('previous_attempt_url'):
+            resp = self._github_get(next_url, params=self.workflow_fetch_params)
+            run = resp.json()
+            yield run
+
+    def _download_workflow_run(self, run, repo_dir):
+        print(f"Downloading workflow run id={run['id']}")
+
+        workflow_dir = _ensure_dir(repo_dir, f"run_{run['id']}",)
+
+        download_marker = path.join(workflow_dir, f"download-marker.json")
+        if path.isfile(download_marker):
+            print("Workflow already fully downloaded; skipping.")
+            return
+
+        # We're getting the *most recent attempt* of a run. Spool
+        # out the whole list of attempts and write them out.
+        for attempt in self._list_all_attempts(run):
+            self._log_attempt(attempt, workflow_dir)
+
+        # Once all attempts have been logged, write out a marker file
+        # that indicates this workflow has been completely downloaded
+        # and can be skipped in the future.
+        self._write_json(self.download_marker_data, download_marker)
+
+    def _list_completed_runs(self, owner, repo, start_date, end_date):
+        """
+        Yield all completed workflow runs.
+        """
+        # https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28#list-workflow-runs-for-a-repository
+        runs_url = f"https://api.github.com/repos/{owner}/{repo}/actions/runs"
+        while runs_url is not None:
+            params = {
+                **self.workflow_fetch_params,
+                # Filter on status=completed to get all workflows that have finished
+                # running. The API docs allow you to use a status *or* a conclusion
+                # here, but don't explain their relationship. The check run API docs
+                # seem to cover basically the same values, though:
+                # https://docs.github.com/en/rest/guides/using-the-rest-api-to-interact-with-checks?apiVersion=2022-11-28#about-check-runs
+                'status': 'completed',
+                # https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates
+                'created': f'{start_date}..{end_date}',
+                'per_page': 100,
+            }
+            print(f"Requesting {runs_url} with {params=!r}")
+            resp = self._github_get(runs_url, params=params)
+            yield from resp.json()['workflow_runs']
+            runs_url = resp.links.get('next', {}).get('url')
+
+    def download(self, owner, repo, start_date, end_date):
+        repo_dir = _ensure_dir(self.output_dir, owner, repo)
+
+        for workflow_run in self._list_completed_runs(owner, repo, start_date, end_date):
+            self._download_workflow_run(workflow_run, repo_dir)
+
+
+@click.command()
+@click.option(
+    '--token', envvar='GITHUB_TOKEN',
+    required=True,
+    help="A GitHub access token that has access to the repository.",
+)
+@click.option(
+    '--output-dir', type=click.Path(file_okay=False, dir_okay=True, writable=True),
+    required=True,
+    help="A directory (or path where one can be created) where the output will be written.",
+)
+@click.option(
+    '--owner', type=str, required=True,
+    help="Owning user or organization of the repo, e.g. openedx.",
+)
+@click.option(
+    '--repo', type=str, required=True,
+    help="Repo shortname, e.g. edx-platform.",
+)
+@click.option(
+    '--start-date', type=click.DateTime(formats=["%Y-%m-%d"]), required=True,
+    help="Only fetch workflow runs starting from this date.",
+)
+@click.option(
+    '--end-date', type=click.DateTime(formats=["%Y-%m-%d"]), required=True,
+    help="Only fetch workflow runs up through this date.",
+)
+def run(*, token, output_dir, owner, repo, start_date, end_date):
     """
-    Add API information from all check suites performed for a given commit to the given list
+    Fetch information about workflows and check outcomes for a repository
+    within some date range, writing the output to a directory.
+
+    This script will fetch:
+
+    \b
+    - Workflow runs, including prior attempts
+    - Check runs associated with the attempts
+    - Annotations produced by the checks
+
+    The output directory will contain subdirectories of the form
+    `{OWNER}/{REPO}/run_#/attempt_#/` for each attempt of a workflow run.
+    (run_# is numbered by workflow run ID, and attempt_# by attempt index.)
+    The run_# directory will also contain a `download-marker.json` which
+    indicates that all information about the workflow run was successfully
+    downloaded. (If missing, this indicates a partial download.)
+
+    Each attempt directory will contain:
 
-    Parameters:
-        current_commit (str): the SHA of the commit to check
-        current_suites (list): list to be extended
-        headers (dict): Authentication headers for connecting to GitHub
+    \b
+    - attempt.json: Information about the workflow run. The documents for each
+      attempt of a workflow run will be largely the same.
+    - check_run_#.json: One of the checks associated with the attempt, numbered
+      by check-run ID.
+    - annotations_#.json: Annotations associated with the check run of that ID.
     """
-    sha = current_commit['sha']
-    check_url = f"https://api.github.com/repos/openedx/edx-platform/commits/{sha}/check-suites?per_page=100"
-    page = 1
-    while True:
-        # Keep going until we get an empty check_suites list or an error. An empty list means we've hit the last page.
-        paginated_url = f"{check_url}&page={page}"
-        print(f"Fetching page {paginated_url}")
-        response = requests.get(paginated_url, headers=headers).json()
-        if 'check_suites' not in response.keys():
-            print(response)
-            break
-        check_suites = response['check_suites']
-        if len(check_suites) == 0:
-            break
-        # silly line to pass the date of the commit along to eventually write in the spreadsheet
-        current_suites.extend([{**s, 'commit_date': current_commit['commit']['committer']['date']}
-                               for s in check_suites])
-        page += 1
+    dl = ActionsDownloader(output_dir=output_dir, token=token)
+    dl.download(
+        owner, repo,
+        # We just want the Y-m-d part
+        start_date.date().isoformat(), end_date.date().isoformat(),
+    )
 
 
 if __name__ == '__main__':
-    get_errors_from_date()
+    run()