Skip to content

Commit

Permalink
Add heuristic checking for HTML anchors
Browse files Browse the repository at this point in the history
Previously only anchors specified or generated in markdown could be
linked to, without complaint from the link checker. We now use a
simple heuristic check for `name` or `id` attributes.

Duplicate code has been refactored and all XML anchor checks updated
to use regex rather than substring match.
  • Loading branch information
phillord committed Jan 8, 2022
1 parent 359de43 commit 15c2b00
Show file tree
Hide file tree
Showing 11 changed files with 71 additions and 24 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 1 addition & 6 deletions components/config/src/config/link_checker.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use serde_derive::{Deserialize, Serialize};

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default)]
pub struct LinkChecker {
/// Skip link checking for these URL prefixes
Expand All @@ -9,8 +9,3 @@ pub struct LinkChecker {
pub skip_anchor_prefixes: Vec<String>,
}

impl Default for LinkChecker {
fn default() -> LinkChecker {
LinkChecker { skip_prefixes: Vec::new(), skip_anchor_prefixes: Vec::new() }
}
}
7 changes: 7 additions & 0 deletions components/library/src/content/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub use self::ser::{SerializingPage, SerializingSection};

use config::Config;
use rendering::Heading;
use utils::links::anchor_id_checks;

pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool {
for heading in headings {
Expand All @@ -28,6 +29,12 @@ pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool {
false
}


pub fn has_anchor_id(content: &str, anchor: &str) -> bool {
let checks = anchor_id_checks(anchor);
checks.is_match(content)
}

/// Looks into the current folder for the path and see if there's anything that is not a .md
/// file. Those will be copied next to the rendered .html file
/// If `recursive` is set to `true`, it will add all subdirectories assets as well. This should
Expand Down
6 changes: 6 additions & 0 deletions components/library/src/content/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ use crate::content::ser::SerializingPage;
use crate::content::{find_related_assets, has_anchor};
use utils::fs::read_file;

use super::has_anchor_id;

lazy_static! {
// Based on https://regex101.com/r/H2n38Z/1/tests
// A regex parsing RFC3339 date followed by {_,-}, some characters and ended by .md
Expand Down Expand Up @@ -300,6 +302,10 @@ impl Page {
has_anchor(&self.toc, anchor)
}

pub fn has_anchor_id(&self, id: &str) -> bool {
has_anchor_id(&self.content, id)
}

pub fn to_serialized<'a>(&'a self, library: &'a Library) -> SerializingPage<'a> {
SerializingPage::from_page(self, library)
}
Expand Down
1 change: 1 addition & 0 deletions components/link_checker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ lazy_static = "1"

config = { path = "../config" }
errors = { path = "../errors" }
utils = { path = "../utils" }

[dependencies.reqwest]
version = "0.11"
Expand Down
22 changes: 5 additions & 17 deletions components/link_checker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use lazy_static::lazy_static;
use reqwest::header::{HeaderMap, ACCEPT};
use reqwest::{blocking::Client, StatusCode};

use utils::links::anchor_id_checks;
use config::LinkChecker;

use std::collections::HashMap;
Expand Down Expand Up @@ -104,22 +105,9 @@ fn has_anchor(url: &str) -> bool {
fn check_page_for_anchor(url: &str, body: String) -> errors::Result<()> {
let index = url.find('#').unwrap();
let anchor = url.get(index + 1..).unwrap();
let checks = [
format!(" id={}", anchor),
format!(" ID={}", anchor),
format!(" id='{}'", anchor),
format!(" ID='{}'", anchor),
format!(r#" id="{}""#, anchor),
format!(r#" ID="{}""#, anchor),
format!(" name={}", anchor),
format!(" NAME={}", anchor),
format!(" name='{}'", anchor),
format!(" NAME='{}'", anchor),
format!(r#" name="{}""#, anchor),
format!(r#" NAME="{}""#, anchor),
];

if checks.iter().any(|check| body[..].contains(&check[..])) {
let checks = anchor_id_checks(anchor);

if checks.is_match(&body){
Ok(())
} else {
Err(errors::Error::from(format!("Anchor `#{}` not found on page", anchor)))
Expand Down Expand Up @@ -338,7 +326,7 @@ mod tests {
#[test]
fn skip_anchor_prefixes() {
let ignore_url = format!("{}{}", mockito::server_url(), "/ignore/");
let config = LinkChecker { skip_prefixes: vec![], skip_anchor_prefixes: vec![ignore_url] };
let config = LinkChecker { skip_anchor_prefixes: vec![ignore_url], ..Default::default() };

let _m1 = mock("GET", "/ignore/i30hobj1cy")
.with_header("Content-Type", "text/html")
Expand Down
3 changes: 2 additions & 1 deletion components/site/src/link_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
let page = library
.get_page(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!page.has_anchor(anchor)

!(page.has_anchor(anchor)||page.has_anchor_id(anchor))
}
});

Expand Down
1 change: 1 addition & 0 deletions components/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ include = ["src/**/*"]
tera = "1"
unicode-segmentation = "1.2"
walkdir = "2"
regex="1"
toml = "0.5"
serde = { version = "1.0", features = ["derive"] }
slug = "0.1"
Expand Down
1 change: 1 addition & 0 deletions components/utils/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod de;
pub mod fs;
pub mod links;
pub mod minify;
pub mod net;
pub mod site;
Expand Down
42 changes: 42 additions & 0 deletions components/utils/src/links.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use regex::Regex;


pub fn anchor_id_checks(anchor:&str) -> Regex {
Regex::new(
&format!(r#" (?i)(id|ID|name|NAME) *= *("|')*{}("|'| |>)+"#, anchor)
).unwrap()
}


#[cfg(test)]
mod tests{
use super::anchor_id_checks;

fn check(anchor:&str, content:&str) -> bool {
anchor_id_checks(anchor).is_match(content)
}

#[test]
fn matchers () {
let m = |content| {check("fred", content)};

// Canonical match/non match
assert!(m(r#"<a name="fred">"#));
assert!(m(r#"<a id="fred">"#));
assert!(!m(r#"<a name="george">"#));

// Whitespace variants
assert!(m(r#"<a id ="fred">"#));
assert!(m(r#"<a id = "fred">"#));
assert!(m(r#"<a id="fred" >"#));
assert!(m(r#"<a id="fred" >"#));

// Quote variants
assert!(m(r#"<a id='fred'>"#));
assert!(m(r#"<a id=fred>"#));

// Case variants
assert!(m(r#"<a ID="fred">"#));
assert!(m(r#"<a iD="fred">"#));
}
}
3 changes: 3 additions & 0 deletions docs/content/documentation/getting-started/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ skip_anchor_prefixes = [
"https://caniuse.com/",
]

# Check for links to anchors defined in HTML rather than markdown
heuristic_link_check = true

# Various slugification strategies, see below for details
# Defaults to everything being a slug
[slugify]
Expand Down

0 comments on commit 15c2b00

Please sign in to comment.