Skip to content

Commit

Permalink
Adding clearurls crate to clean tracking params from links and markdown. (LemmyNet#5018)
Browse files Browse the repository at this point in the history

* Adding clearurls crate to clean tracking params from links and markdown.

- Thanks to @jenrdikw for creating this
- Fixes LemmyNet#4905

* Upgrading to new version of clearurls

* Fix clippy
  • Loading branch information
dessalines authored and Nutomic committed Sep 20, 2024
1 parent dddf687 commit 140762b
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 23 deletions.
15 changes: 15 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/api_common/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use lemmy_utils::{
utils::{
markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
slurs::{build_slur_regex, remove_slurs},
validation::clean_urls_in_text,
},
CACHE_DURATION_FEDERATION,
};
Expand Down Expand Up @@ -947,6 +948,7 @@ pub async fn process_markdown(
context: &LemmyContext,
) -> LemmyResult<String> {
let text = remove_slurs(text, slur_regex);
let text = clean_urls_in_text(&text);

markdown_check_for_blocked_urls(&text, url_blocklist)?;

Expand Down
6 changes: 3 additions & 3 deletions crates/db_schema/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use i_love_jesus::CursorKey;
use lemmy_utils::{
error::{LemmyErrorExt, LemmyErrorType, LemmyResult},
settings::SETTINGS,
utils::validation::clean_url_params,
utils::validation::clean_url,
};
use regex::Regex;
use rustls::{
Expand Down Expand Up @@ -305,7 +305,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
// An empty string is an erase
Some("") => Ok(Some(None)),
Some(str_url) => Url::parse(str_url)
.map(|u| Some(Some(clean_url_params(&u).into())))
.map(|u| Some(Some(clean_url(&u).into())))
.with_lemmy_type(LemmyErrorType::InvalidUrl),
None => Ok(None),
}
Expand All @@ -316,7 +316,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
pub fn diesel_url_create(opt: Option<&str>) -> LemmyResult<Option<DbUrl>> {
match opt {
Some(str_url) => Url::parse(str_url)
.map(|u| Some(clean_url_params(&u).into()))
.map(|u| Some(clean_url(&u).into()))
.with_lemmy_type(LemmyErrorType::InvalidUrl),
None => Ok(None),
}
Expand Down
1 change: 1 addition & 0 deletions crates/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ markdown-it = { version = "0.6.1", optional = true }
ts-rs = { workspace = true, optional = true }
enum-map = { workspace = true, optional = true }
cfg-if = "1"
clearurls = { version = "0.0.4", features = ["linkify"] }

[dev-dependencies]
reqwest = { workspace = true }
Expand Down
58 changes: 38 additions & 20 deletions crates/utils/src/utils/validation.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult};
use clearurls::UrlCleaner;
use itertools::Itertools;
use regex::{Regex, RegexBuilder, RegexSet};
use std::sync::LazyLock;
Expand All @@ -10,12 +11,8 @@ static VALID_MATRIX_ID_REGEX: LazyLock<Regex> = LazyLock::new(|| {
.expect("compile regex")
});
// taken from https://en.wikipedia.org/wiki/UTM_parameters
static CLEAN_URL_PARAMS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"^(utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid)=",
)
.expect("compile regex")
});
// Shared cleaner built from the clearurls crate's embedded rule set, used to
// strip tracking parameters (utm_*, fbclid, gclid, ...) from urls and text.
// The rules ship inside the crate, so a parse failure here is a build-level
// bug rather than a runtime condition — hence the expect.
static URL_CLEANER: LazyLock<UrlCleaner> =
LazyLock::new(|| UrlCleaner::from_embedded_rules().expect("compile clearurls"));
const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"];

const BODY_MAX_LENGTH: usize = 10000;
Expand Down Expand Up @@ -257,16 +254,22 @@ pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult<Option
)
}

pub fn clean_url_params(url: &Url) -> Url {
let mut url_out = url.clone();
if let Some(query) = url.query() {
let new_query = query
.split_inclusive('&')
.filter(|q| !CLEAN_URL_PARAMS_REGEX.is_match(q))
.collect::<String>();
url_out.set_query(Some(&new_query));
/// Strips known tracking parameters from a single url.
pub fn clean_url(url: &Url) -> Url {
  URL_CLEANER
    .clear_single_url(url)
    .map(|cleaned| cleaned.into_owned())
    // On any cleaning error, fall back to the untouched original url.
    .unwrap_or_else(|_| url.clone())
}

/// Cleans all the links in a string of tracking parameters.
pub fn clean_urls_in_text(text: &str) -> String {
match URL_CLEANER.clear_text(text) {
Ok(res) => res.into_owned(),
// If there are any errors, just return the original text
Err(_) => text.to_owned(),
}
url_out
}

pub fn check_site_visibility_valid(
Expand Down Expand Up @@ -357,7 +360,8 @@ mod tests {
build_and_check_regex,
check_site_visibility_valid,
check_urls_are_valid,
clean_url_params,
clean_url,
clean_urls_in_text,
is_url_blocked,
is_valid_actor_name,
is_valid_bio_field,
Expand All @@ -378,18 +382,32 @@ mod tests {

#[test]
fn test_clean_url_params() -> LemmyResult<()> {
  // Tracking params (utm_*) are removed while legitimate query params survive.
  let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123")?;
  let cleaned = clean_url(&url);
  let expected = Url::parse("https://example.com/path/123?user+name=random+user&id=123")?;
  assert_eq!(expected.to_string(), cleaned.to_string());

  // A url without any query string passes through unchanged.
  let url = Url::parse("https://example.com/path/123")?;
  let cleaned = clean_url(&url);
  assert_eq!(url.to_string(), cleaned.to_string());

  Ok(())
}

#[test]
fn test_clean_body() -> LemmyResult<()> {
  // Tracking params are stripped from links inside markdown text, while
  // other query params are kept.
  let text = "[a link](https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123)";
  let cleaned = clean_urls_in_text(text);
  let expected = "[a link](https://example.com/path/123?user+name=random+user&id=123)";
  // &str compares directly against String; no extra allocation needed.
  assert_eq!(expected, cleaned);

  // Text whose links carry no tracking params is returned unchanged.
  let text = "[a link](https://example.com/path/123)";
  let cleaned = clean_urls_in_text(text);
  assert_eq!(text, cleaned);

  Ok(())
}

#[test]
fn regex_checks() {
assert!(is_valid_post_title("hi").is_err());
Expand Down

0 comments on commit 140762b

Please sign in to comment.