Skip to content

Commit

Permalink
Adding clearurls crate to clean tracking params from links and markdown. (LemmyNet#5018)
Browse files Browse the repository at this point in the history

* Adding clearurls crate to clean tracking params from links and markdown.

- Thanks to @jenrdikw for creating this
- Fixes LemmyNet#4905

* Upgrading to new version of clearurls

* Fix clippy
  • Loading branch information
dessalines authored and Nutomic committed Sep 20, 2024
1 parent dddf687 commit 140762b
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 23 deletions.
15 changes: 15 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions crates/api_common/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use lemmy_utils::{
utils::{
markdown::{markdown_check_for_blocked_urls, markdown_rewrite_image_links},
slurs::{build_slur_regex, remove_slurs},
validation::clean_urls_in_text,
},
CACHE_DURATION_FEDERATION,
};
Expand Down Expand Up @@ -947,6 +948,7 @@ pub async fn process_markdown(
context: &LemmyContext,
) -> LemmyResult<String> {
let text = remove_slurs(text, slur_regex);
let text = clean_urls_in_text(&text);

markdown_check_for_blocked_urls(&text, url_blocklist)?;

Expand Down
6 changes: 3 additions & 3 deletions crates/db_schema/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use i_love_jesus::CursorKey;
use lemmy_utils::{
error::{LemmyErrorExt, LemmyErrorType, LemmyResult},
settings::SETTINGS,
utils::validation::clean_url_params,
utils::validation::clean_url,
};
use regex::Regex;
use rustls::{
Expand Down Expand Up @@ -305,7 +305,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
// An empty string is an erase
Some("") => Ok(Some(None)),
Some(str_url) => Url::parse(str_url)
.map(|u| Some(Some(clean_url_params(&u).into())))
.map(|u| Some(Some(clean_url(&u).into())))
.with_lemmy_type(LemmyErrorType::InvalidUrl),
None => Ok(None),
}
Expand All @@ -316,7 +316,7 @@ pub fn diesel_url_update(opt: Option<&str>) -> LemmyResult<Option<Option<DbUrl>>
pub fn diesel_url_create(opt: Option<&str>) -> LemmyResult<Option<DbUrl>> {
match opt {
Some(str_url) => Url::parse(str_url)
.map(|u| Some(clean_url_params(&u).into()))
.map(|u| Some(clean_url(&u).into()))
.with_lemmy_type(LemmyErrorType::InvalidUrl),
None => Ok(None),
}
Expand Down
1 change: 1 addition & 0 deletions crates/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ markdown-it = { version = "0.6.1", optional = true }
ts-rs = { workspace = true, optional = true }
enum-map = { workspace = true, optional = true }
cfg-if = "1"
clearurls = { version = "0.0.4", features = ["linkify"] }

[dev-dependencies]
reqwest = { workspace = true }
Expand Down
58 changes: 38 additions & 20 deletions crates/utils/src/utils/validation.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::error::{LemmyErrorExt, LemmyErrorType, LemmyResult};
use clearurls::UrlCleaner;
use itertools::Itertools;
use regex::{Regex, RegexBuilder, RegexSet};
use std::sync::LazyLock;
Expand All @@ -10,12 +11,8 @@ static VALID_MATRIX_ID_REGEX: LazyLock<Regex> = LazyLock::new(|| {
.expect("compile regex")
});
// taken from https://en.wikipedia.org/wiki/UTM_parameters
static CLEAN_URL_PARAMS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"^(utm_source|utm_medium|utm_campaign|utm_term|utm_content|gclid|gclsrc|dclid|fbclid)=",
)
.expect("compile regex")
});
// Shared cleaner built from the clearurls crate's embedded rule set, used to
// strip tracking parameters (utm_*, fbclid, gclid, ...) from urls and text.
// The rules ship inside the crate, so a parse failure here is a build-level
// bug rather than a runtime condition — hence the expect.
static URL_CLEANER: LazyLock<UrlCleaner> =
LazyLock::new(|| UrlCleaner::from_embedded_rules().expect("compile clearurls"));
const ALLOWED_POST_URL_SCHEMES: [&str; 3] = ["http", "https", "magnet"];

const BODY_MAX_LENGTH: usize = 10000;
Expand Down Expand Up @@ -257,16 +254,22 @@ pub fn build_and_check_regex(regex_str_opt: &Option<&str>) -> LemmyResult<Option
)
}

pub fn clean_url_params(url: &Url) -> Url {
let mut url_out = url.clone();
if let Some(query) = url.query() {
let new_query = query
.split_inclusive('&')
.filter(|q| !CLEAN_URL_PARAMS_REGEX.is_match(q))
.collect::<String>();
url_out.set_query(Some(&new_query));
/// Strips known tracking parameters from a single url.
pub fn clean_url(url: &Url) -> Url {
  URL_CLEANER
    .clear_single_url(url)
    .map(|cleaned| cleaned.into_owned())
    // On any cleaning error, fall back to the untouched original url.
    .unwrap_or_else(|_| url.clone())
}

/// Cleans all the links in a string of tracking parameters.
pub fn clean_urls_in_text(text: &str) -> String {
match URL_CLEANER.clear_text(text) {
Ok(res) => res.into_owned(),
// If there are any errors, just return the original text
Err(_) => text.to_owned(),
}
url_out
}

pub fn check_site_visibility_valid(
Expand Down Expand Up @@ -357,7 +360,8 @@ mod tests {
build_and_check_regex,
check_site_visibility_valid,
check_urls_are_valid,
clean_url_params,
clean_url,
clean_urls_in_text,
is_url_blocked,
is_valid_actor_name,
is_valid_bio_field,
Expand All @@ -378,18 +382,32 @@ mod tests {

#[test]
fn test_clean_url_params() -> LemmyResult<()> {
  // Tracking params (utm_*) are removed while legitimate query params survive.
  let url = Url::parse("https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123")?;
  let cleaned = clean_url(&url);
  let expected = Url::parse("https://example.com/path/123?user+name=random+user&id=123")?;
  assert_eq!(expected.to_string(), cleaned.to_string());

  // A url without any query string passes through unchanged.
  let url = Url::parse("https://example.com/path/123")?;
  let cleaned = clean_url(&url);
  assert_eq!(url.to_string(), cleaned.to_string());

  Ok(())
}

#[test]
fn test_clean_body() -> LemmyResult<()> {
  // Tracking params are stripped from links inside markdown text, while
  // other query params are kept.
  let text = "[a link](https://example.com/path/123?utm_content=buffercf3b2&utm_medium=social&user+name=random+user&id=123)";
  let cleaned = clean_urls_in_text(text);
  let expected = "[a link](https://example.com/path/123?user+name=random+user&id=123)";
  // &str compares directly against String; no extra allocation needed.
  assert_eq!(expected, cleaned);

  // Text whose links carry no tracking params is returned unchanged.
  let text = "[a link](https://example.com/path/123)";
  let cleaned = clean_urls_in_text(text);
  assert_eq!(text, cleaned);

  Ok(())
}

#[test]
fn regex_checks() {
assert!(is_valid_post_title("hi").is_err());
Expand Down

0 comments on commit 140762b

Please sign in to comment.