Commit: use rc file
markcmiller86 committed Jun 28, 2024
1 parent 02b5458 commit 2310eff
Showing 2 changed files with 307 additions and 17 deletions.
26 changes: 9 additions & 17 deletions .github/workflows/check-urls.yml
@@ -65,18 +65,6 @@ jobs:
#exclude_patterns: ${{ steps.setup_vars.outputs.ignore_url_patterns }}
- name: Check URLs in selected files
run: |
# Create an rc file controlling behavior of linkchecker
echo "
[checking]
sslverify=0
maxfilesizedownload=100000
maxfilesizeparse=100000
[csv]
separator=;
parts=all
[MarkdownCheck]
filename_re=.*\.md$
" > .linkcheckerrc
for f in ${{ steps.file_list.outputs.files }}; do
if [ "${f##*.}" != "md" ]; then
continue
@@ -86,17 +74,15 @@ jobs:
continue 2 # ignore this file
fi
done
linkchecker -f .linkcheckerrc -F csv/linkchecker-out.csv -t 20 --check-extern --no-follow-url ".*" --timeout=30 $f || true
#tail -n +2 linkchecker-out.csv | grep -v ',200.*$' >> linkchecker-out-all.csv
tail -n +4 linkchecker-out.csv >> linkchecker-out-all.csv
linkchecker -f utils/LinkChecker/.linkcheckerrc file://$(pwd)/$f >> linkchecker.out || true
cat linkchecker.out >> linkchecker-all.out
done
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: my-artifact
path: linkchecker-out-all.csv

path: linkchecker-all.out

#
# Keep the recurring failures and definitely bad lists in repo on
@@ -117,6 +103,12 @@ jobs:
# generate email with links or actual data
#

# Bare URLs in markdown are not actually links and will not be checked. Many
# markdown renderers and browsers recognize them and render them as links,
# but that is by convention only; no markdown standard defines how bare URLs
# are handled. The only standard way to mark up a URL by itself is to enclose
# it in `<` and `>` characters.
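# For example, of the following three forms (example.com is a placeholder),
# a conforming markdown parser is only required to treat the last two as
# links, and only those two will be seen by the checker:
#   https://example.com                 (bare URL)
#   <https://example.com>               (autolink)
#   [text](https://example.com)         (inline link)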

#
# Description:
#
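To reproduce the check locally, a minimal sketch (assuming linkchecker is
installed and the command is run from the repository root; README.md stands
in for any markdown file in the repository):

    linkchecker -f utils/LinkChecker/.linkcheckerrc file://$(pwd)/README.md

This mirrors the per-file invocation in the workflow above; the rc file now
supplies the thread count, timeout, and MarkdownCheck settings that were
previously passed inline.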
298 changes: 298 additions & 0 deletions utils/LinkChecker/.linkcheckerrc
@@ -0,0 +1,298 @@
# Sample configuration file; see the linkcheckerrc(5) man page or
# execute linkchecker -h for help on these options.
# Commandline options override these settings.

##################### output configuration ##########################
[output]
# enable debug messages; see 'linkchecker -h' for valid debug names, example:
#debug=all
# print status output
status=0
# change the logging type
#log=text
# turn on/off --verbose
verbose=1
# turn on/off --warnings
#warnings=1
# turn on/off --quiet
#quiet=0
# additional file output, example:
#fileoutput = text, html, gml, sql
# errors to ignore (URL regular expression, message regular expression)
#ignoreerrors=
# ignore all errors for broken.example.com:
# ^https?://broken.example.com/
# ignore SSL errors for dev.example.com:
# ^https://dev.example.com/ ^SSLError


##################### logger configuration ##########################
# logger output part names:
# all For all parts
# realurl The full url link
# result Valid or invalid, with messages
# extern 1 or 0, only in some logger types reported
# base <base href=...>
# name <a href=...>name</a> and <img alt="name">
# parenturl The referrer URL if there is any
# info Some additional info, e.g. FTP welcome messages
# warning Warnings
# dltime Download time
# checktime Check time
# url The original url name, can be relative
# intro The blurb at the beginning, "starting at ..."
# outro The blurb at the end, "found x errors ..."
# stats Statistics including URL lengths and contents.

# each Logger can have separate configuration parameters

# standard text logger
[text]
#filename=linkchecker-out.txt
#parts=all
#wraplength=65
# colors for the various parts, syntax is <color> or <type>;<color>
# type can be bold, light, blink, invert
# color can be default, black, red, green, yellow, blue, purple, cyan, white,
# Black, Red, Green, Yellow, Blue, Purple, Cyan, White
#colorparent=default
#colorurl=default
#colorname=default
#colorreal=cyan
#colorbase=purple
#colorvalid=bold;green
#colorinvalid=bold;red
#colorinfo=default
#colorwarning=bold;yellow
#colordltime=default
#colorreset=default

# GML logger
[gml]
#filename=linkchecker-out.gml
#parts=all
# valid encodings are listed in http://docs.python.org/library/codecs.html#standard-encodings
# example:
#encoding=utf_16

# DOT logger
[dot]
#filename=linkchecker-out.dot
#parts=all
# default encoding is ascii since the original DOT format does not
# support other charsets, example:
#encoding=iso-8859-15

# CSV logger
[csv]
#filename=linkchecker-out.csv
#separator=;
#quotechar="
#dialect=excel
#parts=all

# SQL logger
[sql]
#filename=linkchecker-out.sql
#dbname=linksdb
#separator=;
#parts=all

# HTML logger
[html]
#filename=linkchecker-out.html
# colors for the various parts
#colorbackground=#fff7e5
#colorurl=#dcd5cf
#colorborder=#000000
#colorlink=#191c83
#colorwarning=#e0954e
#colorerror=#db4930
#colorok=#3ba557
#parts=all

# failures logger
[failures]
#filename=$XDG_DATA_HOME/linkchecker/failures

# custom xml logger
[xml]
#filename=linkchecker-out.xml
# system encoding is used by default. Example:
#encoding=iso-8859-1

# GraphXML logger
[gxml]
#filename=linkchecker-out.gxml
# system encoding is used by default. Example:
#encoding=iso-8859-1

# Sitemap logger
[sitemap]
#filename=linkchecker-out.sitemap.xml
#encoding=utf-8
#priority=0.5
#frequency=daily


##################### checking options ##########################
[checking]
# number of threads
threads=20
# connection timeout in seconds
timeout=30
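# (threads=20 and timeout=30 replace the -t 20 and --timeout=30 flags the
# workflow previously passed on the linkchecker command line)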
# Time to wait for checks to finish after the user aborts the first time
# (with Ctrl-C or the abort button).
#aborttimeout=300
# The recursion level determines how many times links inside pages are followed.
#recursionlevel=-1
# parse a cookiefile for initial cookie data, example:
#cookiefile=/path/to/cookies.txt
# User-Agent header string to send to HTTP web servers
# Note that robots.txt is always checked with the original User-Agent. Example:
#useragent=Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
# When checking finishes, write a memory dump to a temporary file.
# The memory dump is written both when checking finishes normally
# and when checking gets canceled.
# The memory dump only works if the python-meliae package is installed.
# Otherwise a warning is printed to install it.
#debugmemory=0
# When checking absolute URLs inside local files, the given root directory
# is used as base URL.
# Note that the given directory must have URL syntax, so it must use a slash
# to join directories instead of a backslash.
# And the given directory must end with a slash.
# Unix example:
#localwebroot=/var/www/
# Windows example:
#localwebroot=/C|/public_html/
# Check SSL certificates. Set to an absolute pathname for a custom
# CA cert bundle to use. Set to zero to disable SSL certificate verification.
sslverify=0
# Stop checking new URLs after the given number of seconds. Same as if the
# user hits Ctrl-C after X seconds. Example:
#maxrunseconds=600
# Don't download files larger than the given number of bytes
#maxfilesizedownload=5242880
# Don't parse files larger than the given number of bytes
#maxfilesizeparse=1048576
# Maximum number of URLs to check. New URLs will not be queued after the
# given number of URLs is checked. Example:
#maxnumurls=153
# Maximum number of requests per second to one host.
#maxrequestspersecond=10
# Respect the instructions in any robots.txt files
#robotstxt=1
# Allowed URL schemes as a comma-separated list. Example:
allowedschemes=file,http,https
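# ('file' must be allowed because the workflow invokes linkchecker on
# file:// URLs of the markdown files being checked)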
# Size of the result cache. Checking more URLs might increase memory usage at runtime.
#resultcachesize=100000

##################### filtering options ##########################
[filtering]
#ignore=
# ignore everything with 'lconline' in the URL name
# lconline
# and ignore everything with 'bookmark' in the URL name
# bookmark
# and ignore all mailto: URLs
# ^mailto:
# do not recurse into the following URLs

#nofollow=
# just an example
# http://www\.example\.com/bla

# Ignore specified warnings (see linkchecker -h for the list of
# recognized warnings). Add a comma-separated list of warnings here
# that prevent a valid URL from being logged. Note that the warning
# will be logged for invalid URLs. Example:
#ignorewarnings=url-unicode-domain
# Regular expression to add more URLs recognized as internal links.
# Default is that URLs given on the command line are internal.
#internlinks=^http://www\.example\.net/
# Check external links
checkextern=1
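# (checkextern=1 takes the place of the --check-extern flag the workflow
# previously passed on the command line)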


##################### password authentication ##########################
[authentication]
# WARNING: if you store passwords in this configuration entry, make sure the
# configuration file is not readable by other users.
# Different user/password pairs for different URLs can be provided.
# Entries are a triple (URL regular expression, username, password),
# separated by whitespace.
# If the regular expression matches, the given user/password pair is used
# for authentication. The commandline options -u,-p match every link
# and therefore override the entries given here. The first match wins.
# At the moment, authentication is used for http[s] and ftp links.
#entry=
# Note that passwords are optional. If any passwords are stored here,
# this file should not be readable by other users.
# ^https?://www\.example\.com/~calvin/ calvin mypass
# ^ftp://www\.example\.com/secret/ calvin

# If the website requires a login via a page with an HTML form, the URL of the
# page and, optionally, the name attributes of the username and password input
# elements can be provided.
#loginurl=http://www.example.com/

# The name attributes of the username and password HTML input elements
#loginuserfield=login
#loginpasswordfield=password
# Optionally the name attributes of any additional input elements and the values
# to populate them with. Note that these are submitted without checking
# whether matching input elements exist in the HTML form. Example:
#loginextrafields=
# name1:value1
# name 2:value 2

############################ Plugins ###################################
#
# uncomment sections to enable plugins

# Check HTML anchors
#[AnchorCheck]

# Print HTTP header info
#[HttpHeaderInfo]
# Comma separated list of header prefixes to print.
# The names are case insensitive.
# The default list is empty, so it should be non-empty when activating
# this plugin. Example:
#prefixes=Server,X-

# Add country info to URLs
#[LocationInfo]

# Run W3C syntax checks
#[CssSyntaxCheck]
#[HtmlSyntaxCheck]

# Search for regular expression in page contents
#[RegexCheck]
# Example:
#warningregex=Oracle Error

# Search for viruses in page contents
#[VirusCheck]
#clamavconf=/etc/clamav/clamd.conf

# Check that SSL certificates have at least the given number of days validity.
#[SslCertificateCheck]
#sslcertwarndays=30

# Parse and check links in PDF files
#[PdfParser]

# Parse and check links in Word files
#[WordParser]

# Parse and check links in Markdown files.
# Supported links are:
# <http://autolink.com>
# [name](http://link.com "Optional title")
# [id]: http://link.com "Optional title"
[MarkdownCheck]
# Regexp of filename
filename_re=.*\.(markdown|md(own)?|mkdn?)$
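# For reference, this pattern matches the extensions .markdown, .md,
# .mdown, .mkd, and .mkdn.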
