Refactor check website internal links to use threading (#3320)
## Description

Resolves WATonomous/infra-config#3319

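A minimal sketch of the threading pattern this change applies (illustrative only, not part of the diff; `check_link` and `validate_links` here are simplified stand-ins for the script's per-link validation):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def check_link(link):
    """Illustrative stand-in for the script's per-link check: flag non-200 responses."""
    try:
        status_code = requests.get(link, timeout=10).status_code
    except requests.RequestException:
        status_code = None
    return (link, status_code) if status_code != 200 else None


def validate_links(links, max_workers=10):
    """Fan the per-link checks out to a thread pool and collect the failures."""
    invalid = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(check_link, link) for link in links]
        for future in as_completed(futures):
            result = future.result()
            if result:
                invalid.append(result)
    return invalid
```

Each check is an I/O-bound HTTP request, so a thread pool overlaps the network waits and speeds up validation despite the GIL.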


## Checklist
- [x] I have read and understood the [WATcloud
Guidelines](https://cloud.watonomous.ca/docs/community-docs/watcloud/guidelines)
- [x] I have performed a self-review of my code
alexboden authored Oct 14, 2024
1 parent b271864 commit f0a77d8
Showing 1 changed file with 69 additions and 41 deletions.
110 changes: 69 additions & 41 deletions scripts/validate-internal-links.py
@@ -8,6 +8,7 @@
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

# CONFIG
if len(sys.argv) < 2:
@@ -47,38 +48,37 @@ def get_xpath(element):
    parts.reverse()
    return '/' + '/'.join(parts)

def crawl_and_fetch_links(url, visited, internal_links, external_links):
    """Recursively fetch links from the given URL, separating internal and external links."""
    try:
        response = requests.get(url)
    except requests.RequestException as e:
        print(f"Request for {url} failed: {e}")
        global fail_build
        fail_build = True
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for a in soup.find_all('a', href=True):
        link = urljoin(url, a.get('href'))
        if link not in visited:
            visited.add(link)
            xpath = get_xpath(a)
            if is_internal_url(link):
                link = convert_deployed_domain_to_base(link)
                internal_links.add((url, link, xpath))
                # print(f"Found internal link: {link} from url: {url} at xpath path: {xpath}")
                link_without_fragment = urlparse(link)._replace(fragment='').geturl()
                crawl_and_fetch_links(link_without_fragment, visited, internal_links, external_links)
            else:
                external_links.add(link)

def crawl_and_fetch_links_wrapper(url):
def crawl_and_fetch_links(url):
    """Wrapper function for crawl_and_fetch_links."""
    visited_links = set()
    visited = set()
    internal_links_tuples = set() # (source, destination, xpath)
    external_links = set()
    crawl_and_fetch_links(url, visited_links, internal_links_tuples, external_links)
    return internal_links_tuples, external_links
    def crawl(url):
        """Recursively fetch links from the given URL, separating internal and external links."""
        try:
            response = requests.get(url)
        except requests.RequestException as e:
            print(f"Request for {url} failed: {e}")
            global fail_build
            fail_build = True
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            link = urljoin(url, a.get('href'))
            if link not in visited:
                visited.add(link)
                xpath = get_xpath(a)
                if is_internal_url(link):
                    link = convert_deployed_domain_to_base(link)
                    internal_links_tuples.add((url, link, xpath))
                    # print(f"Found internal link: {link} from url: {url} at xpath path: {xpath}")
                    link_without_fragment = urlparse(link)._replace(fragment='').geturl()
                    crawl(link_without_fragment)
                else:
                    external_links.add(link)
        return internal_links_tuples, external_links
    return crawl(url)

def get_response_code(full_url) -> int:
"""Check if a URL, including its fragment, is valid.
Expand Down Expand Up @@ -113,34 +113,62 @@ def link_has_fragment(full_url) -> bool:
return urlparse(full_url).fragment != ''

def validate_internal_links(internal_links_tuples):
"""Check if internal links are valid."""
"""Check if internal links are valid in parallel."""
invalid_links = []
for link in internal_links_tuples:

def check_link(link):
"""Check if the link is valid."""
_, destination, _ = link
status_code = get_response_code(destination)
if status_code != 200:
invalid_links.append([link, status_code])
return (link, status_code) if status_code != 200 else None

with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(check_link, link) for link in internal_links_tuples]
for future in as_completed(futures):
result = future.result()
if result:
invalid_links.append(result)

return invalid_links

def validate_internal_link_fragments(internal_links_tuples):
"""Check if internal link fragments are valid."""
"""Check if internal link fragments are valid in parallel."""
invalid_fragment_links = []
for link in internal_links_tuples:

def check_fragment(link):
"""Check if the fragment of the link is valid."""
_, destination, _ = link
if link_has_fragment(destination):
if not check_fragment_validity(destination):
invalid_fragment_links.append(link)
return link
return None

with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(check_fragment, link) for link in internal_links_tuples]
for future in as_completed(futures):
result = future.result()
if result:
invalid_fragment_links.append(result)

return invalid_fragment_links

if __name__ == '__main__':
print("Collecting links...")
internal_links_tuples, external_links = crawl_and_fetch_links_wrapper(BASE_URL)
internal_links_tuples, external_links = crawl_and_fetch_links(BASE_URL)
print(f"Found {len(internal_links_tuples)} internal links")
print(f"Found {len(external_links)} external links")
invalid_internal_links = validate_internal_links(internal_links_tuples)
invalid_fragment_links = validate_internal_link_fragments(internal_links_tuples)

if len(invalid_internal_links) == 0 and len(invalid_fragment_links) == 0:
# Run the validation in parallel
with ThreadPoolExecutor(max_workers=4) as executor:
validate_internal_links_future = executor.submit(validate_internal_links, internal_links_tuples)
validate_internal_fragments_future = executor.submit(validate_internal_link_fragments, internal_links_tuples)

# Wait for both validations to complete
invalid_internal_links = validate_internal_links_future.result()
invalid_fragment_links = validate_internal_fragments_future.result()

# Print the results
if len(invalid_internal_links) == 0 and len(invalid_fragment_links) == 0 and not fail_build:
print(f"All {len(internal_links_tuples)} internal links are valid.")
sys.exit(0) # Exit with success

@@ -157,4 +185,4 @@ def validate_internal_link_fragments(internal_links_tuples):

    print('Hint: Use $x("XPath") in the browser console to find the element.')

    sys.exit(1) # Fail the build if there are invalid links
