diff --git a/Cargo.lock b/Cargo.lock index fd0f1af..0b731cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -215,6 +215,7 @@ version = "1.0.6" dependencies = [ "ahash", "assert_cmd", + "bitflags 2.4.0", "cached", "chardet", "chardetng", diff --git a/Cargo.toml b/Cargo.toml index ab7f4f8..8835b75 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ exclude = [ [dependencies] ahash = "0.8.3" +bitflags = "2.4.0" cached = "0.46.0" chardet = { version = "0.2.4", optional = true } chardetng = { version = "0.1.17", optional = true } diff --git a/README.md b/README.md index 3324a79..ed8e0e9 100644 --- a/README.md +++ b/README.md @@ -18,22 +18,22 @@ Python version supports more encodings, but a lot of them are old almost unused ## ⚡ Performance -This package offer better performance than Python version (3 times faster, than MYPYC version of charset-normalizer, 6 times faster than usual Python version). -However, in comparison with `chardet` and `chardetng` packages it is slower but more accurate (I guess because it process whole file chunk by chunk). +This package offer better performance than Python version (4 times faster, than MYPYC version of charset-normalizer, 8 times faster than usual Python version). +In comparison with `chardet` and `chardetng` packages it has approximately the same speed but more accurate. Here are some numbers. | Package | Accuracy | Mean per file (ms) | File per sec (est) | |---------------------------------------------------------------------------------------------|:----------:|:------------------:|:------------------:| -| [chardet](https://crates.io/crates/chardet) | 82.6 % | 2.2 ms | 450 file/sec | +| [chardet](https://crates.io/crates/chardet) | 82.6 % | 3 ms | 333 file/sec | | [chardetng](https://crates.io/crates/chardetng) | 90.7 % | 1.6 ms | 625 file/sec | -| charset-normalizer-rs | **97.1 %** | **2.7 ms** | 370 file/sec | +| charset-normalizer-rs | **97.1 %** | **1.5 ms** | 666 file/sec | | [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | **98 %** | **8 ms** | 125 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | |---------------------------------------------------------------------------------------------|:---------------:|:---------------:|:---------------:| | [chardet](https://crates.io/crates/chardet) | 8 ms | 2 ms | 0.2 ms | | [chardetng](https://crates.io/crates/chardetng) | 14 ms | 5 ms | 0.5 ms | -| charset-normalizer-rs | 19 ms | 7 ms | 1.2 ms | +| charset-normalizer-rs | 12 ms | 5 ms | 0.7 ms | | [charset-normalizer](https://github.com/Ousret/charset_normalizer) (Python + MYPYC version) | 94 ms | 37 ms | 3 ms | Stats are generated using 400+ files using default parameters. These results might change at any time. diff --git a/src/consts.rs b/src/consts.rs index c25f282..beeb7cc 100644 --- a/src/consts.rs +++ b/src/consts.rs @@ -18,7 +18,7 @@ lazy_static! { pub static ref TOO_SMALL_SEQUENCE: usize = 32; pub static ref TOO_BIG_SEQUENCE: usize = 1_000_000; // 10E6 - pub(crate) static ref UTF8_MAXIMAL_ALLOCATION: usize = 128; + pub(crate) static ref UTF8_MAXIMAL_ALLOCATION: usize = 1_112_064; pub(crate) static ref UNICODE_RANGES_COMBINED: [(&'static str, RangeInclusive);279] = [ ("Control character", 0..=31), ("Basic Latin", 32..=127), diff --git a/src/md.rs b/src/md.rs index 3b2df2d..d57db4a 100644 --- a/src/md.rs +++ b/src/md.rs @@ -1,19 +1,246 @@ #![allow(unused_variables)] +#![allow(unused_imports)] -use crate::consts::COMMON_SAFE_ASCII_CHARACTERS; -use crate::utils::{ - is_accentuated, is_case_variable, is_cjk, is_emoticon, is_hangul, is_hiragana, is_katakana, - is_latin, is_punctuation, is_separator, is_suspiciously_successive_range, is_symbol, is_thai, - is_unprintable, remove_accent, unicode_range, -}; +use crate::consts::{COMMON_SAFE_ASCII_CHARACTERS, UTF8_MAXIMAL_ALLOCATION}; +use crate::utils::{is_suspiciously_successive_range, remove_accent, unicode_range}; +use bitflags::{bitflags, Flags}; use cached::proc_macro::cached; +use cached::UnboundCache; use log::trace; use ordered_float::OrderedFloat; +use unic::char::property::EnumeratedCharProperty; +use unic::ucd::{is_white_space, GeneralCategory, Name}; // // Mess detection module // +// Mess Plugin Char representation +// used to collect additional information about char +// and eliminate repeated calculations + +#[derive(Copy, Clone, PartialEq)] +pub struct MessDetectorCharFlags(u32); + +bitflags! { + impl MessDetectorCharFlags: u32 { + const WHITESPACE = 0b0000_0000_0000_0000_0000_0000_0000_0001; + const UNPRINTABLE = 0b0000_0000_0000_0000_0000_0000_0000_0010; + const SYMBOL = 0b0000_0000_0000_0000_0000_0000_0000_0100; + const EMOTICON = 0b0000_0000_0000_0000_0000_0000_0000_1000; + const COMMON_SAFE = 0b0000_0000_0000_0000_0000_0000_0001_0000; + const WEIRD_SAFE = 0b0000_0000_0000_0000_0000_0000_0010_0000; + const PUNCTUATION = 0b0000_0000_0000_0000_0000_0000_0100_0000; + const SEPARATOR = 0b0000_0000_0000_0000_0000_0000_1000_0000; + const ASCII = 0b0000_0000_0000_0000_0000_0001_0000_0000; + const ASCII_ALPHABETIC = 0b0000_0000_0000_0000_0000_0010_0000_0000; + const ASCII_GRAPHIC = 0b0000_0000_0000_0000_0000_0100_0000_0000; + const ASCII_DIGIT = 0b0000_0000_0000_0000_0000_1000_0000_0000; + const LATIN = 0b0000_0000_0000_0000_0001_0000_0000_0000; + const ALPHABETIC = 0b0000_0000_0000_0000_0010_0000_0000_0000; + const ACCENTUATED = 0b0000_0000_0000_0000_0100_0000_0000_0000; + const CJK = 0b0000_0000_0000_0000_1000_0000_0000_0000; + const HANGUL = 0b0000_0000_0000_0001_0000_0000_0000_0000; + const KATAKANA = 0b0000_0000_0000_0010_0000_0000_0000_0000; + const HIRAGANA = 0b0000_0000_0000_0100_0000_0000_0000_0000; + const THAI = 0b0000_0000_0000_1000_0000_0000_0000_0000; + const CASE_VARIABLE = 0b0000_0000_0001_0000_0000_0000_0000_0000; + const LOWERCASE = 0b0000_0000_0010_0000_0000_0000_0000_0000; + const UPPERCASE = 0b0000_0000_0100_0000_0000_0000_0000_0000; + const NUMERIC = 0b0000_0000_1000_0000_0000_0000_0000_0000; + } +} + +#[derive(Copy, Clone)] +pub struct MessDetectorChar { + pub character: char, + pub flags: MessDetectorCharFlags, + pub unicode_range: Option<&'static str>, +} + +impl PartialEq for MessDetectorChar { + fn eq(&self, other: &Self) -> bool { + self.character == other.character + } +} + +impl MessDetectorChar { + pub fn new(character: char) -> Self { + new_mess_detector_character(character) + } + fn in_category( + category: &str, + range: Option<&str>, + categories_exact: &[&str], + categories_partial: &[&str], + ranges_partial: &[&str], + ) -> bool { + // unicode category part + if categories_exact.contains(&category) + || categories_partial.iter().any(|&cp| category.contains(cp)) + { + return true; + } + // unicode range part + if !ranges_partial.is_empty() { + if let Some(range) = range { + return ranges_partial.iter().any(|&r| range.contains(r)); + } + } + false + } + + fn in_description(name: Option, patterns: &[&str]) -> bool { + name.is_some_and(|description| { + patterns + .iter() + .any(|&s| description.to_string().contains(s)) + }) + } + + pub fn is(&self, flag: MessDetectorCharFlags) -> bool { + self.flags.contains(flag) + } +} + +#[cached( + type = "UnboundCache", + create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", + convert = r#"{ character }"# +)] +pub fn new_mess_detector_character(character: char) -> MessDetectorChar { + let mut flags = MessDetectorCharFlags::empty(); + + // PLEASE NOTE! In case of idiomatic refactoring + // take in account performance. Sometimes match could be used but it + // will require calculate all conditions and can decrease performance + // in comparison to usual if then else + + // ascii probing + if character.is_ascii() { + flags.insert(MessDetectorCharFlags::ASCII); + if character.is_ascii_graphic() { + flags.insert(MessDetectorCharFlags::ASCII_GRAPHIC); + if character.is_ascii_alphabetic() { + flags.insert(MessDetectorCharFlags::ASCII_ALPHABETIC); + } else if character.is_ascii_digit() { + flags.insert(MessDetectorCharFlags::ASCII_DIGIT); + } + } + } + + // unicode information + let name = Name::of(character); + let category = GeneralCategory::of(character).abbr_name(); + let range = unicode_range(character); + + // whitespace + if character.is_whitespace() { + flags.insert(MessDetectorCharFlags::WHITESPACE); + flags.insert(MessDetectorCharFlags::SEPARATOR); + } else { + // safe symbols (non-whitespace) + if COMMON_SAFE_ASCII_CHARACTERS.contains(character) { + flags.insert(MessDetectorCharFlags::COMMON_SAFE); + } + if "<>-=~|_".contains(character) { + flags.insert(MessDetectorCharFlags::WEIRD_SAFE); + } + + // numeric + if flags.contains(MessDetectorCharFlags::ASCII_DIGIT) || character.is_numeric() { + flags.insert(MessDetectorCharFlags::NUMERIC); + } else if flags.contains(MessDetectorCharFlags::ASCII_ALPHABETIC) + || character.is_alphabetic() + { + // alphabetic + flags.insert(MessDetectorCharFlags::ALPHABETIC); + if character.is_lowercase() { + flags.insert(MessDetectorCharFlags::LOWERCASE); + flags.insert(MessDetectorCharFlags::CASE_VARIABLE); + } else if character.is_uppercase() { + flags.insert(MessDetectorCharFlags::UPPERCASE); + flags.insert(MessDetectorCharFlags::CASE_VARIABLE); + } + } else if !flags.contains(MessDetectorCharFlags::ASCII_GRAPHIC) + && !['\x1A', '\u{FEFF}'].contains(&character) + && MessDetectorChar::in_category(category, range, &["Cc"], &[], &["Control character"]) + { + flags.insert(MessDetectorCharFlags::UNPRINTABLE); + } + + // emoticon + if MessDetectorChar::in_category(category, range, &[], &[], &["Emoticons"]) { + flags.insert(MessDetectorCharFlags::EMOTICON); + } + + // separator + if ['|', '+', '<', '>'].contains(&character) + || MessDetectorChar::in_category(category, range, &["Po", "Pd", "Pc"], &["Z"], &[]) + { + flags.insert(MessDetectorCharFlags::SEPARATOR); + } + } + + // punctuation + if MessDetectorChar::in_category(category, range, &[], &["P"], &["Punctuation"]) { + flags.insert(MessDetectorCharFlags::PUNCTUATION); + } + + // symbol + if MessDetectorChar::in_category(category, range, &[], &["N", "S"], &["Forms"]) { + flags.insert(MessDetectorCharFlags::SYMBOL); + } + + // latin + if MessDetectorChar::in_description(name, &["LATIN"]) { + flags.insert(MessDetectorCharFlags::LATIN); + } else { + // cjk + if MessDetectorChar::in_description(name, &["CJK"]) { + flags.insert(MessDetectorCharFlags::CJK); + } + // hangul + if MessDetectorChar::in_description(name, &["HANGUL"]) { + flags.insert(MessDetectorCharFlags::HANGUL); + } + // katakana + if MessDetectorChar::in_description(name, &["KATAKANA"]) { + flags.insert(MessDetectorCharFlags::KATAKANA); + } + // hiragana + if MessDetectorChar::in_description(name, &["HIRAGANA"]) { + flags.insert(MessDetectorCharFlags::HIRAGANA); + } + // thai + if MessDetectorChar::in_description(name, &["THAI"]) { + flags.insert(MessDetectorCharFlags::THAI); + } + } + + // accentuated + if MessDetectorChar::in_description( + name, + &[ + "WITH GRAVE", + "WITH ACUTE", + "WITH CEDILLA", + "WITH DIAERESIS", + "WITH CIRCUMFLEX", + "WITH TILDE", + ], + ) { + flags.insert(MessDetectorCharFlags::ACCENTUATED); + } + + // create new object + MessDetectorChar { + character, + flags, + unicode_range: range, + } +} + // Base abstract trait used for mess detection plugins. // All detectors MUST extend and implement given methods. trait MessDetectorPlugin { @@ -23,11 +250,11 @@ trait MessDetectorPlugin { } // Determine if given character should be fed in - fn eligible(&self, character: char) -> bool; + fn eligible(&self, character: &MessDetectorChar) -> bool; // The main routine to be executed upon character. // Insert the logic in witch the text would be considered chaotic. - fn feed(&mut self, character: char); + fn feed(&mut self, character: &MessDetectorChar); // Compute the chaos ratio based on what your feed() has seen. // Must NOT be lower than 0.; No restriction gt 0. @@ -42,26 +269,28 @@ struct TooManySymbolOrPunctuationPlugin { punctuation_count: u64, symbol_count: u64, character_count: u64, - last_printable_char: Option, + last_printable_char: Option, } impl MessDetectorPlugin for TooManySymbolOrPunctuationPlugin { - fn eligible(&self, character: char) -> bool { - !is_unprintable(character) + fn eligible(&self, character: &MessDetectorChar) -> bool { + !character.is(MessDetectorCharFlags::UNPRINTABLE) } - fn feed(&mut self, character: char) { + fn feed(&mut self, character: &MessDetectorChar) { self.character_count += 1; - let is_different_char = self - .last_printable_char - .map_or(true, |last_char| character != last_char); - if is_different_char && !COMMON_SAFE_ASCII_CHARACTERS.contains(character) { - if is_punctuation(character) { + if (self.last_printable_char.is_none() || *character != self.last_printable_char.unwrap()) + && !character.is(MessDetectorCharFlags::COMMON_SAFE) + { + if character.is(MessDetectorCharFlags::PUNCTUATION) { self.punctuation_count += 1; - } else if !character.is_numeric() && is_symbol(character) && !is_emoticon(character) { + } else if !character.is(MessDetectorCharFlags::NUMERIC) + && character.is(MessDetectorCharFlags::SYMBOL) + && !character.is(MessDetectorCharFlags::EMOTICON) + { self.symbol_count += 2; } } - self.last_printable_char = Some(character); + self.last_printable_char = Some(*character); } fn ratio(&self) -> f32 { if self.character_count == 0 { @@ -88,12 +317,12 @@ struct TooManyAccentuatedPlugin { } impl MessDetectorPlugin for TooManyAccentuatedPlugin { - fn eligible(&self, character: char) -> bool { - character.is_alphabetic() + fn eligible(&self, character: &MessDetectorChar) -> bool { + character.is(MessDetectorCharFlags::ALPHABETIC) } - fn feed(&mut self, character: char) { + fn feed(&mut self, character: &MessDetectorChar) { self.character_count += 1; - if is_accentuated(character) { + if character.is(MessDetectorCharFlags::ACCENTUATED) { self.accentuated_count += 1; } } @@ -116,11 +345,11 @@ struct UnprintablePlugin { } impl MessDetectorPlugin for UnprintablePlugin { - fn eligible(&self, character: char) -> bool { + fn eligible(&self, character: &MessDetectorChar) -> bool { true } - fn feed(&mut self, character: char) { - if is_unprintable(character) { + fn feed(&mut self, character: &MessDetectorChar) { + if character.is(MessDetectorCharFlags::UNPRINTABLE) { self.unprintable_count += 1; } self.character_count += 1; @@ -140,27 +369,40 @@ impl MessDetectorPlugin for UnprintablePlugin { struct SuspiciousDuplicateAccentPlugin { character_count: u64, successive_count: u64, - last_latin_character: Option, + last_latin_character: Option, } impl MessDetectorPlugin for SuspiciousDuplicateAccentPlugin { - fn eligible(&self, character: char) -> bool { - character.is_alphabetic() && is_latin(character) + fn eligible(&self, character: &MessDetectorChar) -> bool { + character.is(MessDetectorCharFlags::ALPHABETIC) + && character.is(MessDetectorCharFlags::LATIN) } - fn feed(&mut self, character: char) { + fn feed(&mut self, character: &MessDetectorChar) { self.character_count += 1; - if let Some(last_latin_char) = self.last_latin_character { - if is_accentuated(character) && is_accentuated(last_latin_char) { - if character.is_uppercase() && last_latin_char.is_uppercase() { - self.successive_count += 1; - } - // Worse if its the same char duplicated with different accent. - if remove_accent(character) == remove_accent(last_latin_char) { - self.successive_count += 1; - } + if self.last_latin_character.is_some() + && character.is(MessDetectorCharFlags::ACCENTUATED) + && self + .last_latin_character + .unwrap() + .is(MessDetectorCharFlags::ACCENTUATED) + { + if character.is(MessDetectorCharFlags::UPPERCASE) + && self + .last_latin_character + .unwrap() + .is(MessDetectorCharFlags::UPPERCASE) + { + self.successive_count += 1; + } + + // Worse if its the same char duplicated with different accent. + if remove_accent(character.character) + == remove_accent(self.last_latin_character.unwrap().character) + { + self.successive_count += 1; } } - self.last_latin_character = Some(character); + self.last_latin_character = Some(*character); } fn ratio(&self) -> f32 { if self.character_count == 0 { @@ -177,38 +419,37 @@ impl MessDetectorPlugin for SuspiciousDuplicateAccentPlugin { struct SuspiciousRangePlugin { character_count: u64, suspicious_successive_range_count: u64, - last_printable_char: Option, + last_printable_char: Option, } impl MessDetectorPlugin for SuspiciousRangePlugin { - fn eligible(&self, character: char) -> bool { - !is_unprintable(character) + fn eligible(&self, character: &MessDetectorChar) -> bool { + !character.is(MessDetectorCharFlags::UNPRINTABLE) } - fn feed(&mut self, character: char) { + fn feed(&mut self, character: &MessDetectorChar) { self.character_count += 1; - if character.is_whitespace() - || is_punctuation(character) - || COMMON_SAFE_ASCII_CHARACTERS.contains(character) + if character.is(MessDetectorCharFlags::WHITESPACE) + || character.is(MessDetectorCharFlags::PUNCTUATION) + || character.is(MessDetectorCharFlags::COMMON_SAFE) { self.last_printable_char = None; return; } if self.last_printable_char.is_none() { - self.last_printable_char = Some(character); + self.last_printable_char = Some(*character); return; } - let tmp_a = self.last_printable_char.unwrap(); - let unicode_range_a = unicode_range(tmp_a); - let unicode_range_b = unicode_range(character); - - if is_suspiciously_successive_range(unicode_range_a, unicode_range_b) { + if is_suspiciously_successive_range( + self.last_printable_char.unwrap().unicode_range, + character.unicode_range, + ) { self.suspicious_successive_range_count += 1; } - self.last_printable_char = Some(character); + self.last_printable_char = Some(*character); } fn ratio(&self) -> f32 { (self.character_count > 0) @@ -235,33 +476,36 @@ struct SuperWeirdWordPlugin { foreign_long_watch: bool, bad_character_count: u64, buffer_accent_count: u64, - buffer: String, + buffer: Vec, } impl MessDetectorPlugin for SuperWeirdWordPlugin { - fn eligible(&self, character: char) -> bool { + fn eligible(&self, character: &MessDetectorChar) -> bool { true } - fn feed(&mut self, character: char) { - if character.is_ascii_alphabetic() { - self.buffer.push(character); - if is_accentuated(character) { + fn feed(&mut self, character: &MessDetectorChar) { + if character.is(MessDetectorCharFlags::ASCII_ALPHABETIC) { + self.buffer.push(*character); + if character.is(MessDetectorCharFlags::ACCENTUATED) { self.buffer_accent_count += 1; } - self.foreign_long_watch |= (!is_latin(character) || is_accentuated(character)) - && !is_cjk(character) - && !is_hangul(character) - && !is_katakana(character) - && !is_hiragana(character) - && !is_thai(character); - + self.foreign_long_watch |= (!character.is(MessDetectorCharFlags::LATIN) + || character.is(MessDetectorCharFlags::ACCENTUATED)) + && !character.is(MessDetectorCharFlags::CJK) + && !character.is(MessDetectorCharFlags::HANGUL) + && !character.is(MessDetectorCharFlags::KATAKANA) + && !character.is(MessDetectorCharFlags::HIRAGANA) + && !character.is(MessDetectorCharFlags::THAI); return; } if self.buffer.is_empty() { return; } - if character.is_whitespace() || is_punctuation(character) || is_separator(character) { + if character.is(MessDetectorCharFlags::WHITESPACE) + || character.is(MessDetectorCharFlags::PUNCTUATION) + || character.is(MessDetectorCharFlags::SEPARATOR) + { self.word_count += 1; let buffer_length = self.buffer.len(); self.character_count += buffer_length as u64; @@ -273,14 +517,20 @@ impl MessDetectorPlugin for SuperWeirdWordPlugin { // Word/Buffer ending with an upper case accentuated letter are so rare, // that we will consider them all as suspicious. Same weight as foreign_long suspicious. - let last_char = self.buffer.chars().last().unwrap(); - if is_accentuated(last_char) && last_char.is_uppercase() { + let last_char = self.buffer.last().unwrap(); + if last_char.is(MessDetectorCharFlags::ACCENTUATED) + && last_char.is(MessDetectorCharFlags::UPPERCASE) + { self.foreign_long_count += 1; self.is_current_word_bad = true; } } if buffer_length >= 24 && self.foreign_long_watch { - let uppercase_count = self.buffer.chars().filter(|c| c.is_uppercase()).count(); + let uppercase_count = self + .buffer + .iter() + .filter(|&c| c.is(MessDetectorCharFlags::UPPERCASE)) + .count(); let mut probable_camel_cased: bool = false; if uppercase_count > 0 && (uppercase_count as f32 / buffer_length as f32) <= 0.3 { @@ -302,12 +552,12 @@ impl MessDetectorPlugin for SuperWeirdWordPlugin { self.foreign_long_watch = false; self.buffer.clear(); self.buffer_accent_count = 0; - } else if !"<>-=~|_".contains(character) - && !character.is_ascii_digit() - && is_symbol(character) + } else if !character.is(MessDetectorCharFlags::WEIRD_SAFE) + && !character.is(MessDetectorCharFlags::ASCII_DIGIT) + && character.is(MessDetectorCharFlags::SYMBOL) { self.is_current_word_bad = true; - self.buffer.push(character); + self.buffer.push(*character); } } fn ratio(&self) -> f32 { @@ -330,15 +580,15 @@ struct CjkInvalidStopPlugin { } impl MessDetectorPlugin for CjkInvalidStopPlugin { - fn eligible(&self, _: char) -> bool { + fn eligible(&self, _: &MessDetectorChar) -> bool { true } - fn feed(&mut self, character: char) { - if "丅丄".contains(character) { + fn feed(&mut self, character: &MessDetectorChar) { + if "丅丄".contains(character.character) { self.wrong_stop_count += 1; return; } - if is_cjk(character) { + if character.is(MessDetectorCharFlags::CJK) { self.cjk_character_count += 1; } } @@ -361,7 +611,7 @@ struct ArchaicUpperLowerPlugin { successive_upper_lower_count: u64, successive_upper_lower_count_final: u64, character_count: u64, - last_alpha_seen: Option, + last_alpha_seen: Option, } impl Default for ArchaicUpperLowerPlugin { @@ -379,15 +629,16 @@ impl Default for ArchaicUpperLowerPlugin { } impl MessDetectorPlugin for ArchaicUpperLowerPlugin { - fn eligible(&self, _: char) -> bool { + fn eligible(&self, _: &MessDetectorChar) -> bool { true } - fn feed(&mut self, character: char) { - if !(character.is_alphabetic() && is_case_variable(character)) + fn feed(&mut self, character: &MessDetectorChar) { + if !(character.is(MessDetectorCharFlags::ALPHABETIC) + && character.is(MessDetectorCharFlags::CASE_VARIABLE)) && self.character_count_since_last_sep > 0 { if self.character_count_since_last_sep <= 64 - && !character.is_ascii_digit() + && !character.is(MessDetectorCharFlags::ASCII_DIGIT) && !self.current_ascii_only { self.successive_upper_lower_count_final += self.successive_upper_lower_count; @@ -403,11 +654,13 @@ impl MessDetectorPlugin for ArchaicUpperLowerPlugin { return; } - self.current_ascii_only &= character.is_ascii(); + self.current_ascii_only &= character.is(MessDetectorCharFlags::ASCII); if let Some(tmp_last_alpha) = self.last_alpha_seen { - if (character.is_uppercase() && tmp_last_alpha.is_lowercase()) - || (character.is_lowercase() && tmp_last_alpha.is_uppercase()) + if (character.is(MessDetectorCharFlags::UPPERCASE) + && tmp_last_alpha.is(MessDetectorCharFlags::LOWERCASE)) + || (character.is(MessDetectorCharFlags::LOWERCASE) + && tmp_last_alpha.is(MessDetectorCharFlags::UPPERCASE)) { if self.buf { self.successive_upper_lower_count += 2; @@ -422,7 +675,7 @@ impl MessDetectorPlugin for ArchaicUpperLowerPlugin { self.character_count += 1; self.character_count_since_last_sep += 1; - self.last_alpha_seen = Some(character); + self.last_alpha_seen = Some(*character); } fn ratio(&self) -> f32 { if self.character_count == 0 { @@ -463,10 +716,11 @@ pub(crate) fn mess_ratio( .chain(std::iter::once('\n')) .enumerate() { + let mess_char = MessDetectorChar::new(ch); detectors .iter_mut() - .filter(|detector| detector.eligible(ch)) - .for_each(|detector| detector.feed(ch)); + .filter(|detector| detector.eligible(&mess_char)) + .for_each(|detector| detector.feed(&mess_char)); if (index > 0 && index.rem_euclid(intermediary_mean_mess_ratio_calc) == 0) || index == length diff --git a/src/tests/md.rs b/src/tests/md.rs index 24e0464..2911e7e 100644 --- a/src/tests/md.rs +++ b/src/tests/md.rs @@ -50,7 +50,168 @@ fn test_datasets_mess_ratio() { false, ) { let mr = mess_ratio(decoded_sequence, Some(OrderedFloat(1.0))); - assert!(mr < 0.2); + assert!(mr < 0.2, "Mess ratio is very high = {} for {}", mr, path); } } } + +#[test] +fn test_is_accentuated() { + let tests = [ + ('é', true), + ('è', true), + ('à', true), + ('À', true), + ('Ù', true), + ('ç', true), + ('a', false), + ('€', false), + ('&', false), + ('Ö', true), + ('ü', true), + ('ê', true), + ('Ñ', true), + ('Ý', true), + ('Ω', false), + ('ø', false), + ('Ё', false), + ]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::ACCENTUATED), + test.1, + ); + } +} + +#[test] +fn test_is_latin() { + let tests = [('я', false), ('a', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::LATIN), + test.1, + ); + } +} + +#[test] +fn test_is_cjk() { + let tests = [('я', false), ('是', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::CJK), + test.1, + ); + } +} + +#[test] +fn test_is_hiragana() { + let tests = [('是', false), ('お', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::HIRAGANA), + test.1, + ); + } +} + +#[test] +fn test_is_katakana() { + let tests = [('お', false), ('キ', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::KATAKANA), + test.1, + ); + } +} + +#[test] +fn test_is_hangul() { + let tests = [('キ', false), ('ㅂ', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::HANGUL), + test.1, + ); + } +} + +#[test] +fn test_is_thai() { + let tests = [('キ', false), ('ย', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::THAI), + test.1, + ); + } +} + +#[test] +fn test_is_case_variable() { + let tests = [('#', false), ('я', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::CASE_VARIABLE), + test.1, + ); + } +} + +#[test] +fn test_is_punctuation() { + let tests = [('!', true), ('?', true), ('a', false), (':', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::PUNCTUATION), + test.1, + ); + } +} + +#[test] +fn test_is_symbol() { + let tests = [('+', true), ('∑', true), ('a', false), ('я', false)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::SYMBOL), + test.1, + ); + } +} + +#[test] +fn test_is_emoticon() { + let tests = [('🙂', true), ('∑', false), ('😂', true), ('я', false)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::EMOTICON), + test.1, + ); + } +} + +#[test] +fn test_is_separator() { + let tests = [(' ', true), ('a', false), ('!', true), ('я', false)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::SEPARATOR), + test.1, + ); + } +} + +#[test] +fn test_is_unprintable() { + let tests = [(' ', false), ('a', false), ('!', false), ('\u{0000}', true)]; + for test in &tests { + assert_eq!( + MessDetectorChar::new(test.0).is(MessDetectorCharFlags::UNPRINTABLE), + test.1, + ); + } +} diff --git a/src/tests/utils.rs b/src/tests/utils.rs index 76c95c6..47566cc 100644 --- a/src/tests/utils.rs +++ b/src/tests/utils.rs @@ -6,74 +6,6 @@ use std::fs::File; use std::io::Read; use std::path::PathBuf; -#[test] -fn test_is_accentuated() { - let tests = [ - ('é', true), - ('è', true), - ('à', true), - ('À', true), - ('Ù', true), - ('ç', true), - ('a', false), - ('€', false), - ('&', false), - ('Ö', true), - ('ü', true), - ('ê', true), - ('Ñ', true), - ('Ý', true), - ('Ω', false), - ('ø', false), - ('Ё', false), - ]; - for test in &tests { - assert_eq!(is_accentuated(test.0), test.1); - } -} - -#[test] -fn test_is_latin() { - assert!(!is_latin('я')); - assert!(is_latin('a')); -} - -#[test] -fn test_is_cjk() { - assert!(!is_cjk('я')); - assert!(is_cjk('是')); -} - -#[test] -fn test_is_hiragana() { - assert!(!is_hiragana('是')); - assert!(is_hiragana('お')); -} - -#[test] -fn test_is_katakana() { - assert!(!is_katakana('お')); - assert!(is_katakana('キ')); -} - -#[test] -fn test_is_hangul() { - assert!(!is_hangul('キ')); - assert!(is_hangul('ㅂ')); -} - -#[test] -fn test_is_thai() { - assert!(!is_thai('キ')); - assert!(is_thai('ย')); -} - -#[test] -fn test_is_case_variable() { - assert!(!is_case_variable('#')); - assert!(is_case_variable('я')); -} - #[test] fn test_is_unicode_range_secondary() { assert!(!is_unicode_range_secondary("Something")); @@ -129,46 +61,6 @@ fn test_range_scan() { assert!(res.contains("Cyrillic")); } -#[test] -fn test_is_punctuation() { - let tests = [('!', true), ('?', true), ('a', false), (':', true)]; - for test in &tests { - assert_eq!(is_punctuation(test.0), test.1); - } -} - -#[test] -fn test_is_symbol() { - let tests = [('+', true), ('∑', true), ('a', false), ('я', false)]; - for test in &tests { - assert_eq!(is_symbol(test.0), test.1); - } -} - -#[test] -fn test_is_emoticon() { - let tests = [('🙂', true), ('∑', false), ('😂', true), ('я', false)]; - for test in &tests { - assert_eq!(is_emoticon(test.0), test.1); - } -} - -#[test] -fn test_is_separator() { - let tests = [(' ', true), ('a', false), ('!', true), ('я', false)]; - for test in &tests { - assert_eq!(is_separator(test.0), test.1); - } -} - -#[test] -fn test_is_unprintable() { - let tests = [(' ', false), ('a', false), ('!', false), ('\u{0000}', true)]; - for test in &tests { - assert_eq!(is_unprintable(test.0), test.1); - } -} - #[test] fn test_is_multi_byte_encoding() { let tests = [("utf-8", true), ("ascii", false), ("euc-jp", true)]; diff --git a/src/utils.rs b/src/utils.rs index 2d9875d..35abc76 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -3,113 +3,20 @@ use crate::assets::LANGUAGES; use crate::consts::{ ENCODING_MARKS, IANA_SUPPORTED, IANA_SUPPORTED_SIMILAR, RE_POSSIBLE_ENCODING_INDICATION, - UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, UTF8_MAXIMAL_ALLOCATION, + UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, }; use crate::entity::Language; use ahash::{HashSet, HashSetExt}; -use cached::proc_macro::cached; -use cached::UnboundCache; use encoding::label::encoding_from_whatwg_label; use encoding::{CodecError, DecoderTrap, EncoderTrap, Encoding, EncodingRef, StringWriter}; use std::borrow::Cow; use std::fs; use std::path::{Path, PathBuf}; -use unic::char::property::EnumeratedCharProperty; use unic::ucd::normal::decompose_canonical; -use unic::ucd::{GeneralCategory, Name}; +use unic::ucd::Name; // Utils module -// check if character category contains one of categories_partial or -// if character category is exactly one of categories_exact or -// character is from range which has name, contains one of ranges_partial -fn in_category( - character: char, - categories_exact: &[&str], - categories_partial: &[&str], - ranges_partial: &[&str], -) -> bool { - // unicode category part - let category = GeneralCategory::of(character).abbr_name(); - if categories_exact.contains(&category) - || categories_partial.iter().any(|&cp| category.contains(cp)) - { - return true; - } - // unicode range part - if !ranges_partial.is_empty() { - if let Some(range) = unicode_range(character) { - return ranges_partial.iter().any(|&r| range.contains(r)); - } - } - false -} - -// check if character description contains at least one of patterns -fn in_description(character: char, patterns: &[&str]) -> bool { - Name::of(character).is_some_and(|description| { - patterns - .iter() - .any(|&s| description.to_string().contains(s)) - }) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_punctuation(character: char) -> bool { - in_category(character, &[], &["P"], &["Punctuation"]) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_symbol(character: char) -> bool { - in_category(character, &[], &["N", "S"], &["Forms"]) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_emoticon(character: char) -> bool { - in_category(character, &[], &[], &["Emoticons"]) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_separator(character: char) -> bool { - if character.is_whitespace() || ['|', '+', '<', '>'].contains(&character) { - return true; - } - in_category(character, &["Po", "Pd", "Pc"], &["Z"], &[]) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_unprintable(character: char) -> bool { - !character.is_whitespace() - && !character.is_ascii_graphic() - && !['\x1A', '\u{FEFF}'].contains(&character) - && in_category(character, &["Cc"], &[], &["Control character"]) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] pub(crate) fn is_accentuated(character: char) -> bool { let patterns = [ "WITH GRAVE", @@ -119,71 +26,11 @@ pub(crate) fn is_accentuated(character: char) -> bool { "WITH CIRCUMFLEX", "WITH TILDE", ]; - in_description(character, &patterns) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_latin(character: char) -> bool { - let patterns = ["LATIN"]; - in_description(character, &patterns) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_cjk(character: char) -> bool { - let patterns = ["CJK"]; - in_description(character, &patterns) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_hiragana(character: char) -> bool { - let patterns = ["HIRAGANA"]; - in_description(character, &patterns) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_katakana(character: char) -> bool { - let patterns = ["KATAKANA"]; - in_description(character, &patterns) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_hangul(character: char) -> bool { - let patterns = ["HANGUL"]; - in_description(character, &patterns) -} - -#[cached( - type = "UnboundCache", - create = "{ UnboundCache::with_capacity(*UTF8_MAXIMAL_ALLOCATION) }", - convert = r#"{ character }"# -)] -pub(crate) fn is_thai(character: char) -> bool { - let patterns = ["THAI"]; - in_description(character, &patterns) -} - -pub(crate) fn is_case_variable(character: char) -> bool { - character.is_lowercase() != character.is_uppercase() + Name::of(character).is_some_and(|description| { + patterns + .iter() + .any(|&s| description.to_string().contains(s)) + }) } pub(crate) fn is_unicode_range_secondary(range_name: &str) -> bool {