Add heuristic checking for HTML anchors

Previously, only anchors specified or generated in Markdown could be linked to without complaint from the link checker. We now also use a simple heuristic check for HTML `name` or `id` attributes. Duplicate code has been refactored, and all XML anchor checks have been updated to use a regex rather than a substring match.
getzola · Jan 8, 2022 · 15c2b00 · 15c2b00
1 parent 359de43
commit 15c2b00
Show file tree

Hide file tree

Showing 11 changed files with 71 additions and 24 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/components/config/src/config/link_checker.rs b/components/config/src/config/link_checker.rs
@@ -1,6 +1,6 @@
 use serde_derive::{Deserialize, Serialize};
 
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(default)]
 pub struct LinkChecker {
     /// Skip link checking for these URL prefixes
@@ -9,8 +9,3 @@ pub struct LinkChecker {
     pub skip_anchor_prefixes: Vec<String>,
 }
 
-impl Default for LinkChecker {
-    fn default() -> LinkChecker {
-        LinkChecker { skip_prefixes: Vec::new(), skip_anchor_prefixes: Vec::new() }
-    }
-}
diff --git a/components/library/src/content/mod.rs b/components/library/src/content/mod.rs
@@ -14,6 +14,7 @@ pub use self::ser::{SerializingPage, SerializingSection};
 
 use config::Config;
 use rendering::Heading;
+use utils::links::anchor_id_checks;
 
 pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool {
     for heading in headings {
@@ -28,6 +29,12 @@ pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool {
     false
 }
 
+
+/// Heuristically checks whether `content` defines an HTML anchor for `anchor`.
+///
+/// Delegates to `anchor_id_checks` to build a regex for the anchor and reports
+/// whether that regex matches anywhere in `content`. This complements
+/// `has_anchor`, which only inspects a list of headings.
+pub fn has_anchor_id(content: &str, anchor: &str) -> bool {
+    anchor_id_checks(anchor).is_match(content)
+}
+
 /// Looks into the current folder for the path and see if there's anything that is not a .md
 /// file. Those will be copied next to the rendered .html file
 /// If `recursive` is set to `true`, it will add all subdirectories assets as well. This should

diff --git a/components/library/src/content/page.rs b/components/library/src/content/page.rs
@@ -21,6 +21,8 @@ use crate::content::ser::SerializingPage;
 use crate::content::{find_related_assets, has_anchor};
 use utils::fs::read_file;
 
+use super::has_anchor_id;
+
 lazy_static! {
     // Based on https://regex101.com/r/H2n38Z/1/tests
     // A regex parsing RFC3339 date followed by {_,-}, some characters and ended by .md
@@ -300,6 +302,10 @@ impl Page {
         has_anchor(&self.toc, anchor)
     }
 
+    /// Heuristically checks whether this page's `content` defines an HTML
+    /// anchor (an `id` or `name` attribute) matching `id`.
+    ///
+    /// NOTE(review): `content` is presumably the rendered HTML of the page — confirm.
+    pub fn has_anchor_id(&self, id: &str) -> bool {
+        let html: &str = &self.content;
+        has_anchor_id(html, id)
+    }
     pub fn to_serialized<'a>(&'a self, library: &'a Library) -> SerializingPage<'a> {
         SerializingPage::from_page(self, library)
     }

diff --git a/components/link_checker/Cargo.toml b/components/link_checker/Cargo.toml
@@ -9,6 +9,7 @@ lazy_static = "1"
 
 config = { path = "../config" }
 errors = { path = "../errors" }
+utils = { path = "../utils" }
 
 [dependencies.reqwest]
 version = "0.11"

diff --git a/components/link_checker/src/lib.rs b/components/link_checker/src/lib.rs
@@ -2,6 +2,7 @@ use lazy_static::lazy_static;
 use reqwest::header::{HeaderMap, ACCEPT};
 use reqwest::{blocking::Client, StatusCode};
 
+use utils::links::anchor_id_checks;
 use config::LinkChecker;
 
 use std::collections::HashMap;
@@ -104,22 +105,9 @@ fn has_anchor(url: &str) -> bool {
 fn check_page_for_anchor(url: &str, body: String) -> errors::Result<()> {
     let index = url.find('#').unwrap();
     let anchor = url.get(index + 1..).unwrap();
-    let checks = [
-        format!(" id={}", anchor),
-        format!(" ID={}", anchor),
-        format!(" id='{}'", anchor),
-        format!(" ID='{}'", anchor),
-        format!(r#" id="{}""#, anchor),
-        format!(r#" ID="{}""#, anchor),
-        format!(" name={}", anchor),
-        format!(" NAME={}", anchor),
-        format!(" name='{}'", anchor),
-        format!(" NAME='{}'", anchor),
-        format!(r#" name="{}""#, anchor),
-        format!(r#" NAME="{}""#, anchor),
-    ];
-
-    if checks.iter().any(|check| body[..].contains(&check[..])) {
+    let checks = anchor_id_checks(anchor);
+
+    if checks.is_match(&body){
         Ok(())
     } else {
         Err(errors::Error::from(format!("Anchor `#{}` not found on page", anchor)))
@@ -338,7 +326,7 @@ mod tests {
     #[test]
     fn skip_anchor_prefixes() {
         let ignore_url = format!("{}{}", mockito::server_url(), "/ignore/");
-        let config = LinkChecker { skip_prefixes: vec![], skip_anchor_prefixes: vec![ignore_url] };
+        let config = LinkChecker { skip_anchor_prefixes: vec![ignore_url], ..Default::default() };
 
         let _m1 = mock("GET", "/ignore/i30hobj1cy")
             .with_header("Content-Type", "text/html")

diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs
@@ -63,7 +63,8 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
             let page = library
                 .get_page(&full_path)
                 .expect("Couldn't find section in check_internal_links_with_anchors");
-            !page.has_anchor(anchor)
+
+            !(page.has_anchor(anchor)||page.has_anchor_id(anchor))
         }
     });
 

diff --git a/components/utils/Cargo.toml b/components/utils/Cargo.toml
@@ -9,6 +9,7 @@ include = ["src/**/*"]
 tera = "1"
 unicode-segmentation = "1.2"
 walkdir = "2"
+regex="1"
 toml = "0.5"
 serde = { version = "1.0", features = ["derive"] }
 slug = "0.1"

diff --git a/components/utils/src/lib.rs b/components/utils/src/lib.rs
@@ -1,5 +1,6 @@
 pub mod de;
 pub mod fs;
+pub mod links;
 pub mod minify;
 pub mod net;
 pub mod site;

diff --git a/components/utils/src/links.rs b/components/utils/src/links.rs
@@ -0,0 +1,42 @@
+use regex::Regex;
+
+
+/// Builds the regex used to heuristically detect an HTML anchor target.
+///
+/// The pattern looks for an `id` or `name` attribute whose value is `anchor`:
+/// a whitespace character, the attribute name, optional spaces around `=`,
+/// optional single or double quotes, the anchor text, and finally a quote, a
+/// space or `>`. The `(?i)` flag applies to the rest of the pattern, so both
+/// the attribute name and the anchor text are compared case-insensitively.
+///
+/// `anchor` is passed through `regex::escape`, so characters such as `.`, `(`
+/// or `+` are matched literally rather than being interpreted as regex syntax.
+///
+/// # Panics
+///
+/// Panics if the resulting pattern fails to compile; with an escaped anchor
+/// this is not expected to happen for ordinary input.
+pub fn anchor_id_checks(anchor: &str) -> Regex {
+    // `\s` rather than a literal space so that attributes preceded by a tab or
+    // a newline are found as well.
+    let pattern = format!(r#"\s(?i)(id|name) *= *("|')*{}("|'| |>)+"#, regex::escape(anchor));
+    Regex::new(&pattern).expect("an escaped anchor should always yield a valid pattern")
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::anchor_id_checks;
+
+    /// Returns whether `content` contains an anchor definition for `fred`,
+    /// according to the regex built by `anchor_id_checks`.
+    fn defines_fred(content: &str) -> bool {
+        let re = anchor_id_checks("fred");
+        re.is_match(content)
+    }
+
+    #[test]
+    fn matchers() {
+        // Canonical match/non match
+        assert!(defines_fred(r#"<a name="fred">"#));
+        assert!(defines_fred(r#"<a id="fred">"#));
+        assert!(!defines_fred(r#"<a name="george">"#));
+
+        // Whitespace around `=`, before the attribute and before `>`
+        assert!(defines_fred(r#"<a id ="fred">"#));
+        assert!(defines_fred(r#"<a id = "fred">"#));
+        assert!(defines_fred(r#"<a id="fred" >"#));
+        assert!(defines_fred(r#"<a  id="fred" >"#));
+
+        // Single-quoted and unquoted attribute values
+        assert!(defines_fred(r#"<a id='fred'>"#));
+        assert!(defines_fred(r#"<a id=fred>"#));
+
+        // Attribute name in upper or mixed case
+        assert!(defines_fred(r#"<a ID="fred">"#));
+        assert!(defines_fred(r#"<a iD="fred">"#));
+    }
+}
diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md
@@ -130,6 +130,9 @@ skip_anchor_prefixes = [
     "https://caniuse.com/",
 ]
 
+# Check for links to anchors defined in HTML rather than markdown
+heuristic_link_check = true
+
 # Various slugification strategies, see below for details
 # Defaults to everything being a slug
 [slugify]