Skip to content

Commit

Permalink
Fix crash when a documentation page has no canonical URL
Browse files (browse the repository at this point in the history)
  • Loading branch information
gereonvey committed Feb 26, 2024
1 parent 555ae86 commit 73ee148
Showing 1 changed file with 23 additions and 21 deletions.
44 changes: 23 additions & 21 deletions checks-checker
Original file line number Diff line number Diff line change
Expand Up @@ -91,28 +91,30 @@ def checkUrl(url: str) -> tuple:

# get the canonical link of the document
canonical_links=soup.select('head link[rel*=canonical]')
# # Sometimes documents have multiple instances of canonical links
# # I've never experienced them differ, so we don't care
# if len(canonical_links)>1:
# print("More than 1 canonical link!")
# for canonical_link in canonical_links:
# print(f"Canonical link: {canonical_link['href']}")
canonical_url=canonical_links[0]['href']

# for suse documentation, remove the index.html suffix (probably works for everyone else too)
if parts.hostname == "documentation.suse.com":
# Python 3.9 and newer: canonical.removesuffix('index.html')
if canonical_url.endswith('index.html'):
canonical_url = canonical_url[:-10]
# add the fragment to the canonical URL, if any
if parts.fragment:
canonical_url += "#" + parts.fragment
# Warn if the URL used is not the canonical one
if canonical_url != url:
# canonical_links might be empty
if len(canonical_links)>0 and 'href' in canonical_links[0]:
# # Sometimes documents have multiple instances of canonical links
# # I've never experienced them differ, so we don't care
# if len(canonical_links)>1:
# print("More than 1 canonical link!")
# for canonical_link in canonical_links:
# print(f"Canonical link: {canonical_link['href']}")
canonical_url=canonical_links[0]['href']

# for suse documentation, remove the index.html suffix (probably works for everyone else too)
if parts.hostname == "documentation.suse.com":
errors.append(f"Not the canonical URL! Use {canonical_url} instead of {url}")
else:
warnings.append(f"Not the canonical URL! Use {canonical_url} instead of {url}")
# Python 3.9 and newer: canonical.removesuffix('index.html')
if canonical_url.endswith('index.html'):
canonical_url = canonical_url[:-10]
# add the fragment to the canonical URL, if any
if parts.fragment:
canonical_url += "#" + parts.fragment
# Warn if the URL used is not the canonical one
if canonical_url != url:
if parts.hostname == "documentation.suse.com":
errors.append(f"Not the canonical URL! Use {canonical_url} instead of {url}")
else:
warnings.append(f"Not the canonical URL! Use {canonical_url} instead of {url}")

return (errors, warnings)

Expand Down

0 comments on commit 73ee148

Please sign in to comment.