From a74f2f1cc154f096df1f0bba45a0f9cf92133211 Mon Sep 17 00:00:00 2001
From: Deyan Ginev
Date: Mon, 25 Mar 2024 06:37:09 -0400
Subject: [PATCH] math reports and clippy lints

---
 examples/citation_ngrams.rs                   |   2 +-
 examples/corpus_math_count.rs                 | 137 ++++++++++++++++++
 examples/corpus_mathml_stats.rs               |   8 +-
 examples/corpus_node_model.rs                 |   2 +-
 examples/corpus_statement_paragraphs_model.rs |   4 +-
 examples/pattern_example.rs                   |  10 +-
 examples/word_tokenization.rs                 |   2 +-
 src/ngrams.rs                                 |   4 +-
 src/parallel_data/corpus.rs                   |  54 ++++++-
 src/patterns/rules.rs                         |   4 +-
 tests/dnm_test.rs                             |  10 +-
 11 files changed, 212 insertions(+), 25 deletions(-)
 create mode 100644 examples/corpus_math_count.rs

diff --git a/examples/citation_ngrams.rs b/examples/citation_ngrams.rs
index 0b958f6e14..892d9e3a00 100644
--- a/examples/citation_ngrams.rs
+++ b/examples/citation_ngrams.rs
@@ -34,7 +34,7 @@ fn main() -> Result<(), Box<dyn Error>> {
   let mut input_args = env::args();
   let _ = input_args.next(); // skip process name
-  while let Some(file_path) = input_args.next() {
+  for file_path in input_args {
     eprintln!("-- opening {:?}", file_path);
     let file = File::open(file_path)?;
     let reader = BufReader::new(file);
diff --git a/examples/corpus_math_count.rs b/examples/corpus_math_count.rs
new file mode 100644
index 0000000000..d641393112
--- /dev/null
+++ b/examples/corpus_math_count.rs
@@ -0,0 +1,137 @@
+//! Count the total number of <math> elements,
+//! and their Content MathML annotations,
+//! in a directory of HTML documents
+//!
+//! example use for arXMLiv:
+//! `cargo run --release --example corpus_math_count /data/datasets/dataset-arXMLiv-2022`
+//!
+//! This script extracts the raw data from a "blind" descent over each <math> element, and may
+//! require additional cutoffs and post-processing over uncurated corpora.
+//! You can find an example of post-processing done for the data of arXMLiv here:
+//! https://gist.github.com/dginev/e50a632d31be05bb87d64cc1800f6fd4#file-apply_cutoffs-pl
+#![allow(clippy::unused_io_amount)]
+
+use std::collections::HashMap;
+use std::env;
+use std::fs::File;
+use std::io::{BufWriter, Error};
+use std::time::Instant;
+
+use libxml::xpath::Context;
+use llamapun::parallel_data::Corpus;
+
+static BUFFER_CAPACITY: usize = 10_485_760;
+
+pub fn main() -> Result<(), Error> {
+  let start = Instant::now();
+  // Read input arguments
+  let mut input_args = env::args();
+  let _ = input_args.next(); // skip process name
+  let corpus_path = match input_args.next() {
+    Some(path) => path,
+    None => "tests/resources/".to_string(),
+  };
+  let node_statistics_filepath = match input_args.next() {
+    Some(path) => path,
+    None => "corpus_math_count.csv".to_string(),
+  };
+  let content_statistics_filepath = match input_args.next() {
+    Some(path) => path,
+    None => "corpus_content_count.csv".to_string(),
+  };
+
+  let extension_filter = input_args.next();
+
+  let node_statistics_file = File::create(node_statistics_filepath)?;
+  let content_statistics_file = File::create(content_statistics_filepath)?;
+
+  let mut corpus = Corpus::new(corpus_path);
+  corpus.extension = extension_filter;
+
+  let mut total = 0;
+  let (math_catalog, content_math_catalog) = corpus.catalogs_with_parallel_walk(|document| {
+    let mut math_count_hash = HashMap::new();
+    let mut content_count_hash = HashMap::new();
+    // record the <math> element count of this document, keyed by the count value
+    let mut xpath_context = Context::new(&document.dom).unwrap();
+    let math_count = xpath_context
+      .findvalue("count(//*[local-name()='math'])", None)
+      .unwrap();
+    math_count_hash.insert(math_count, 1);
+
+    let content_count = xpath_context
+      .findvalue(
+        "count(//*[local-name()='annotation-xml' and @encoding='MathML-Content'])",
+        None,
+      ).unwrap();
+    content_count_hash.insert(content_count, 1);
+
+    (math_count_hash, content_count_hash)
+  });
+
+  let duration_ms = start.elapsed().as_millis();
+  eprintln!("---");
+  eprintln!("Math counting finished in {:?}ms", duration_ms);
+
+  // Report on Math.
+  let mut catalog_vec: Vec<(&String, &u64)> = math_catalog.iter().collect();
+  catalog_vec.sort_by(|a, b| b.1.cmp(a.1));
+
+  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);
+  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
+  csv_writer.write_record(["math elements", "documents in corpus"])?;
+
+  for (key, val) in catalog_vec {
+    total += key.parse::<u64>().unwrap() * val;
+    csv_writer.write_record([key, &val.to_string()])?;
+  }
+  eprintln!(" Grand total of <math> in dataset: ");
+  eprintln!(" --- ");
+  eprintln!(" {} ", total);
+  eprintln!(" --- ");
+  // Close the writer
+  csv_writer.flush()?;
+
+  // Report on Content Math.
+  total = 0;
+  let mut catalog_vec: Vec<(&String, &u64)> = content_math_catalog.iter().collect();
+  catalog_vec.sort_by(|a, b| b.1.cmp(a.1));
+
+  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, content_statistics_file);
+  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
+  csv_writer.write_record(["annotation-xml elements", "documents in corpus"])?;
+
+  for (key, val) in catalog_vec {
+    total += key.parse::<u64>().unwrap() * val;
+    csv_writer.write_record([key, &val.to_string()])?;
+  }
+  eprintln!(" Grand total of Content MathML in dataset: ");
+  eprintln!(" --- ");
+  eprintln!(" {} ", total);
+  eprintln!(" --- ");
+  // Close the writer
+  csv_writer.flush()
+}
+
+// Example output from arXMLiv 2022:
+// Math counting finished in 14030571ms
+// Grand total of <math> in dataset:
+// ---
+// 970414519
+// ---
+// Grand total of Content MathML in dataset:
+// ---
+// 953308908
+// ---
+
+// Example output from ar5iv 2024:
+// Math counting finished in 22121404ms
+// Grand total of <math> in dataset:
+// ---
+// 1059794660
+// ---
+// Grand total of Content MathML in dataset:
+// ---
+// 1038882200
+// ---
diff --git a/examples/corpus_mathml_stats.rs b/examples/corpus_mathml_stats.rs
index 8f8c1e9310..e54e30869a 100644
--- a/examples/corpus_mathml_stats.rs
+++ b/examples/corpus_mathml_stats.rs
@@ -22,7 +22,6 @@ use std::collections::{HashMap, HashSet};
 use std::env;
 use std::fs::File;
 use std::io::{BufWriter, Error};
-use std::thread;
 use std::time::Instant;
 
 use libxml::readonly::RoNode;
@@ -71,8 +70,7 @@ pub fn main() -> Result<(), Error> {
 
   let catalog = corpus.catalog_with_parallel_walk(|document| {
     println!(
-      "Thread: {:?}, doc: {:?}",
-      thread::current().name(),
+      "doc: {:?}",
       document.path
     );
@@ -103,10 +101,10 @@ pub fn main() -> Result<(), Error> {
 
   let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);
   let mut csv_writer = csv::Writer::from_writer(buffered_writer);
-  csv_writer.write_record(&["name@attr[value]", "frequency"])?;
+  csv_writer.write_record(["name@attr[value]", "frequency"])?;
 
   for (key, val) in catalog_vec {
-    csv_writer.write_record(&[key, &val.to_string()])?;
+    csv_writer.write_record([key, &val.to_string()])?;
   }
   // Close the writer
   csv_writer.flush()
diff --git a/examples/corpus_node_model.rs b/examples/corpus_node_model.rs
index 0c08d24d4d..1902b3077f 100644
--- a/examples/corpus_node_model.rs
+++ b/examples/corpus_node_model.rs
@@ -14,7 +14,7 @@ use std::time::Instant;
 use libxml::readonly::RoNode;
 use llamapun::parallel_data::Corpus;
 
-static NEWLINE: &'static [u8] = b"\n";
+static NEWLINE: &[u8] = b"\n";
 static BUFFER_CAPACITY: usize = 10_485_760;
 
 pub fn main() -> Result<(), Error> {
diff --git a/examples/corpus_statement_paragraphs_model.rs b/examples/corpus_statement_paragraphs_model.rs
index 69095082d0..1e083fe6ab 100644
--- a/examples/corpus_statement_paragraphs_model.rs
+++ b/examples/corpus_statement_paragraphs_model.rs
@@ -258,7 +258,7 @@ fn extract_document_statements(
       }
     }
     // Discard paragraphs outside of a reasonable [4,1024] word count range
-    if word_count < 4 || word_count > 1024 {
+    if !(4..=1024).contains(&word_count) {
       overflow_count += 1;
       invalid_paragraph = true;
     }
@@ -294,7 +294,7 @@
 /// give a sha256 hash, assemble a filename based on it
 fn hash_file_path(directory: &str, content: &str) -> String {
   let mut hasher = Sha256::new();
-  hasher.input_str(&content);
+  hasher.input_str(content);
   let hash = hasher.result_str();
   directory.to_string() + "/" + &hash + ".txt"
 }
diff --git a/examples/pattern_example.rs b/examples/pattern_example.rs
index 2b1536b0ed..a7e63722f9 100644
--- a/examples/pattern_example.rs
+++ b/examples/pattern_example.rs
@@ -38,9 +38,9 @@ fn math_node_to_string(node: RoNode) -> String {
 }
 
 /// helper function
-fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
+fn math_node_to_string_actual(node: RoNode, string: &mut String) {
   match node.get_name().as_ref() {
-    "semantics" => math_node_to_string_children(node, &mut string),
+    "semantics" => math_node_to_string_children(node, string),
     "annotation" | "annotation-xml" => {},
     "text" => {
       if node.is_text_node() {
@@ -51,7 +51,7 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
       string.push('<');
       string.push_str(default);
       string.push('>');
-      math_node_to_string_children(node, &mut string);
+      math_node_to_string_children(node, string);
       string.push('<');
       string.push('/');
       string.push_str(default);
@@ -61,13 +61,13 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
 }
 
 /// helper function
-fn math_node_to_string_children(node: RoNode, mut string: &mut String) {
+fn math_node_to_string_children(node: RoNode, string: &mut String) {
   let mut cur = node.get_first_child();
   loop {
     if cur.is_none() {
       break;
     }
-    math_node_to_string_actual(cur.unwrap(), &mut string);
+    math_node_to_string_actual(cur.unwrap(), string);
     cur = cur.unwrap().get_next_sibling();
   }
 }
diff --git a/examples/word_tokenization.rs b/examples/word_tokenization.rs
index badf0d5a7b..232920f894 100644
--- a/examples/word_tokenization.rs
+++ b/examples/word_tokenization.rs
@@ -50,7 +50,7 @@ fn main() {
   let inorder_dictionary = dictionary.sorted();
   let mut inorder_frequency: Vec<(usize, usize)> = Vec::new();
   for entry in &inorder_dictionary {
-    let frequency = unigrams.get(&entry.0);
+    let frequency = unigrams.get(entry.0);
     inorder_frequency.push((entry.1, frequency));
   }
   plot_simple(
diff --git a/src/ngrams.rs b/src/ngrams.rs
index 8743a311c4..2f35338b0c 100644
--- a/src/ngrams.rs
+++ b/src/ngrams.rs
@@ -124,7 +124,7 @@ impl Ngrams {
       if words_since_anchor_seen == self.window_size && side == AnchorSide::Right {
         // it has been too long since we saw an anchor, add to the current buffer, record and
         // reset
-        self.record_words(continuous_buffer.drain(..).collect());
+        self.record_words(std::mem::take(&mut continuous_buffer));
         context_window.clear();
         side = AnchorSide::Left;
       }
@@ -132,7 +132,7 @@ impl Ngrams {
     }
     // Any remaining content should be added
    continuous_buffer.extend(context_window.asc_iter().copied());
-    self.record_words(continuous_buffer.drain(..).collect());
+    self.record_words(std::mem::take(&mut continuous_buffer));
   }
 
   /// Take an arbitrarily long vector of words, and record all (overlapping) ngrams obtainable from
diff --git a/src/parallel_data/corpus.rs b/src/parallel_data/corpus.rs
index c08320421c..17559af6e7 100644
--- a/src/parallel_data/corpus.rs
+++ b/src/parallel_data/corpus.rs
@@ -48,7 +48,7 @@ impl Corpus {
     }
   }
 
-  /// Get a parallel iterator over the documents
+  /// Get a parallel iterator over the documents, returning a single report catalog
   pub fn catalog_with_parallel_walk<F>(&self, closure: F) -> HashMap<String, u64>
   where F: Fn(Document) -> HashMap<String, u64> + Send + Sync {
     ParWalkDir::new(self.path.clone())
@@ -95,4 +95,56 @@ impl Corpus {
       map1
     })
   }
+
+  /// Get a parallel iterator over the documents, returning a pair of report catalogs
+  pub fn catalogs_with_parallel_walk<F>(&self, closure: F) -> (HashMap<String, u64>, HashMap<String, u64>)
+  where F: Fn(Document) -> (HashMap<String, u64>, HashMap<String, u64>) + Send + Sync {
+    ParWalkDir::new(self.path.clone())
+      .num_threads(rayon::current_num_threads())
+      .skip_hidden(true)
+      .sort(false)
+      .into_iter()
+      .filter_map(|each| {
+        if let Ok(entry) = each {
+          let file_name = entry.file_name.to_str().unwrap_or("");
+          let selected = if let Some(ref extension) = self.extension {
+            file_name.ends_with(extension)
+          } else {
+            file_name.ends_with(".html") || file_name.ends_with(".xhtml")
+          };
+          if selected {
+            let path = entry.path().to_str().unwrap_or("").to_owned();
+            if !path.is_empty() {
+              return Some(path);
+            }
+          }
+        }
+        // all other cases
+        None
+      })
+      .enumerate()
+      .par_bridge()
+      .map(|each| {
+        let (index, path) = each;
+        let document = Document::new(path, self).unwrap();
+        if index % 1000 == 0 && index > 0 {
+          println!(
+            "-- catalogs_with_parallel_walk now processing document {:?}",
+            1 + index
+          );
+        }
+        closure(document)
+      })
+      .reduce(|| (HashMap::new(), HashMap::new()), |(mut map11, mut map12), (map21, map22)| {
+        for (k, v) in map21 {
+          let entry = map11.entry(k).or_insert(0);
+          *entry += v;
+        }
+        for (k, v) in map22 {
+          let entry = map12.entry(k).or_insert(0);
+          *entry += v;
+        }
+        (map11, map12)
+      })
+  }
 }
diff --git a/src/patterns/rules.rs b/src/patterns/rules.rs
index 91876078fd..78c3875a3d 100644
--- a/src/patterns/rules.rs
+++ b/src/patterns/rules.rs
@@ -1064,7 +1064,7 @@ impl PatternFile {
       match cur.get_name().as_ref() {
         "meta" => {
           if meta_opt.is_some() {
-            return Err("pattern_file has multiple meta nodes".to_string()).map_err(err_map);
+            return Err("pattern_file has multiple meta nodes".to_string());
           }
           meta_opt = Some(MetaDescription::load_from_node(cur, file_name.to_string()).map_err(err_map)?);
@@ -1085,7 +1085,7 @@
           pctx.add_sequence_rule(cur).map_err(err_map)?;
         },
         x => {
-          return Err(format!("Unexpected node \"{x}\" in pattern_file")).map_err(err_map);
+          return Err(format!("Unexpected node \"{x}\" in pattern_file"));
         },
       }
     }
diff --git a/tests/dnm_test.rs b/tests/dnm_test.rs
index 18136923f5..8f885b3324 100644
--- a/tests/dnm_test.rs
+++ b/tests/dnm_test.rs
@@ -83,19 +83,19 @@ fn test_xml_node_to_plaintext() {
   let mut node = doc.get_root_readonly().unwrap();
   match node.get_first_child() {
     Some(n) => node = n,
-    None => assert!(false), //DOM generation failed
+    None => unreachable!(), //DOM generation failed
   }
   while node.get_name() != "body" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   node = node.get_first_child().unwrap();
   while node.get_name() != "h1" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   //Node content should have been processed
@@ -106,7 +106,7 @@
   while node.get_name() != "h2" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   //node was skipped in dnm generation
@@ -114,7 +114,7 @@
   while node.get_name() != "a" {
     match node.get_next_sibling() {
      Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   //node content should have been replaced by "[link]"
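
Reviewer note, not part of the patch: below is a minimal sketch of driving the new
Corpus::catalogs_with_parallel_walk API, condensed from the corpus_math_count.rs example
added above. It assumes the tests/resources/ fixture directory that the example also
defaults to; any other corpus path works the same way. The closure runs once per document
and returns a pair of per-document catalogs; the reduce step then sums the values per key
across all worker results, which is why each closure invocation inserts a count of 1.

use std::collections::HashMap;

use libxml::xpath::Context;
use llamapun::parallel_data::Corpus;

fn main() {
  // assumes the llamapun tests/resources/ fixture directory
  let corpus = Corpus::new("tests/resources/".to_string());
  let (math, content) = corpus.catalogs_with_parallel_walk(|document| {
    let mut math_hash = HashMap::new();
    let mut content_hash = HashMap::new();
    // count <math> and Content MathML annotation elements in this document,
    // keyed by the count value, as in examples/corpus_math_count.rs
    let mut xpath_context = Context::new(&document.dom).unwrap();
    let math_count = xpath_context
      .findvalue("count(//*[local-name()='math'])", None)
      .unwrap();
    math_hash.insert(math_count, 1);
    let content_count = xpath_context
      .findvalue(
        "count(//*[local-name()='annotation-xml' and @encoding='MathML-Content'])",
        None,
      )
      .unwrap();
    content_hash.insert(content_count, 1);
    (math_hash, content_hash)
  });
  // each catalog maps a per-document count (as a string) to the number of
  // documents that reported that count
  println!("distinct <math> counts: {}", math.len());
  println!("distinct Content MathML counts: {}", content.len());
}

Returning the pair from a single closure keeps the corpus walk single-pass, which matters
for multi-GB datasets; the alternative of two catalog_with_parallel_walk calls would parse
every document twice.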