# Add CI for checking for broken links manually, weekly and in PRs (#23)
# Link-check workflow: display name, triggers, and shared ignore lists.
name: Check URLs

# Three triggers: manual dispatch, a weekly cron, and pull requests to main.
on:
  workflow_dispatch:
  schedule:
    - cron: '17 5 * * 0'  # 5:17 AM every Sunday
  pull_request:
    branches:
      - main

# Newline-separated ignore lists. The "Reformat environment variables" step
# flattens each of them into a single-line, space-separated string.
env:
  ignore_url_patterns: |
    http://localhost:4000
    https://preview.bssw.io
    https://github.com/<your-github-handle>
  ignore_file_patterns: |
    docs/
    images/
    utils/
    Events/
jobs:
  # check-urls: build the list of Markdown files to examine, run linkchecker
  # on each one, and upload every result row that is not a 200 as one CSV
  # artifact.
  check-urls:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          python -m pip --no-cache-dir --disable-pip-version-check install --upgrade pip
          python -m pip --no-cache-dir --disable-pip-version-check install linkchecker

      # Flatten the newline-separated workflow-level env lists into
      # single-line, space-separated step outputs.
      - name: Reformat environment variables
        id: setup_vars
        run: |
          # Workflow-level env entries are plain shell variables here, so no
          # ${{ }} interpolation into the script text is needed.
          tmp=$(printf '%s' "$ignore_url_patterns" | tr '\n' ' ')
          echo "ignore_url_patterns=$tmp" >> "$GITHUB_OUTPUT"
          tmp=$(printf '%s' "$ignore_file_patterns" | tr '\n' ' ')
          echo "ignore_file_patterns=$tmp" >> "$GITHUB_OUTPUT"

      - name: Checkout Repository
        uses: actions/checkout@v4

      # NOTE(review): third-party action referenced by a mutable tag;
      # consider pinning it to a full commit SHA.
      - name: Get Changed Files (for PRs)
        if: ${{ github.event_name == 'pull_request' }}
        id: changed-files
        uses: tj-actions/changed-files@v42
        with:
          separator: ' '

      # Writes the candidate files, one per line, to a list file and exposes
      # its path as the `list` output.
      #   - pull_request: only the changed files, with no ignore patterns.
      #   - otherwise:    every tracked *.md file, with the ignore patterns.
      - name: Generate lists of files to check and ignore
        id: file_list
        env:
          # Expressions are handed over as environment variables so that
          # PR-controlled file names are never spliced into the script text.
          EVENT_NAME: ${{ github.event_name }}
          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
          IGNORE_FILE_PATTERNS: ${{ steps.setup_vars.outputs.ignore_file_patterns }}
        run: |
          list="$RUNNER_TEMP/files-to-check.txt"
          if [ "$EVENT_NAME" = "pull_request" ]; then
            # The changed-files output is space separated (see `separator`
            # above), so paths containing spaces are not supported here.
            printf '%s' "$CHANGED_FILES" | tr ' ' '\n' > "$list"
            echo "ignore_file_patterns=" >> "$GITHUB_OUTPUT"
          else
            # An empty list used to mean "nothing gets checked"; enumerate
            # the tracked Markdown files instead.
            git -c core.quotepath=off ls-files -- '*.md' > "$list"
            echo "ignore_file_patterns=$IGNORE_FILE_PATTERNS" >> "$GITHUB_OUTPUT"
          fi
          echo "list=$list" >> "$GITHUB_OUTPUT"

      # NOTE(review): steps.setup_vars.outputs.ignore_url_patterns is computed
      # but not consumed by any step yet (formerly a commented-out
      # `exclude_patterns` input).
      - name: Check URLs in selected files
        env:
          FILE_LIST: ${{ steps.file_list.outputs.list }}
          IGNORE_FILE_PATTERNS: ${{ steps.file_list.outputs.ignore_file_patterns }}
        run: |
          # Create an rc file controlling behavior of linkchecker
          cat > .linkcheckerrc <<'EOF'
          [checking]
          sslverify=0
          maxfilesizedownload=100000
          maxfilesizeparse=100000
          [csv]
          separator=,
          parts=all
          [MarkdownCheck]
          filename_re=.*\.md$
          EOF

          # Always create the aggregate file so the upload step has something
          # to upload even when nothing was checked or nothing failed.
          : > linkchecker-out-all.csv

          while IFS= read -r f; do
            # Skip blank lines and anything that is not Markdown.
            [ -n "$f" ] || continue
            if [ "${f##*.}" != "md" ]; then
              continue
            fi

            # Ignore patterns are path prefixes (e.g. "docs/").
            for ef in $IGNORE_FILE_PATTERNS; do
              case "$f" in
                "$ef"*) continue 2 ;;  # ignore this file
              esac
            done

            # stdin is redirected so linkchecker cannot consume the file list.
            linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 "$f" < /dev/null || true

            if [ -f linkchecker-out.csv ]; then
              # grep exits 1 when every row is a 200; that is success here and
              # must not abort the step under `bash -e -o pipefail`.
              tail -n +2 linkchecker-out.csv | grep -v ',200.*$' >> linkchecker-out-all.csv || true
              # Drop the per-file CSV so a later failed run cannot re-report it.
              rm -f linkchecker-out.csv
            fi
          done < "$FILE_LIST"

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: my-artifact
          path: linkchecker-out-all.csv
# | |
# Keep the recurring failures and definitely bad lists in repo on | |
# branch manage-broken-links | |
# | |
# Download those files before starting
# | |
# If a link "works" (200), remove it from the "recurring failures" list
# If a link "does not work" (!= 200)
# - if it is already on the recurring failures list
# - if it is too old, flag it as "definitely bad"; otherwise do nothing
#
# - if it is not already on the recurring failures list, add it to the "new" and "recurring failures" lists and date it
# | |
# Upload the recurring failures and definitely bad lists to somewhere
# Report success if definitely bad list is empty, otherwise failure | |
# generate email with links or actual data | |
# | |
# | |
# Description: | |
# | |
# Triggers in one of three ways: 1) manually, 2) scheduled weekly on Sundays at 5:17 AM,
# or 3) on a pull request.
# | |
# Stores the ignore patterns in env. variables and then reformats those (because
# they have newlines) into a space-separated, single-line string that can be digested
# as input by later steps.
# | |
# For PRs, uses the changed-files action to get the list of changed files and checks
# only those files. Also, the ignore-file patterns are set to the
# empty string for PRs because we think URLs anywhere in PRs should be checked.
# | |
# For scheduled or manual triggers, the intent is to process *all* Markdown files that
# do not match any ignore-file pattern. (This paragraph was written for urlchecker's
# `include_files`, `file_type` and `exclude_files` params; confirm it against the
# current linkchecker-based steps.) The ignore-file patterns work more or less like
# path prefixes. So, specifying the initial part of the string
# for a file (path) name is sufficient to ignore the file.
# | |
# We include Events in the file patterns to ignore because, of all the content we host,
# we suspect Event URLs are the most likely to go stale rather quickly **and** because
# URL validity is important only during the short window prior to the event.
# That said, we don't want to ignore Events in PRs, and we do not, as described above.
# |