This repository has been archived by the owner on Mar 25, 2024. It is now read-only.

Commit

math reports and clippy lints

dginev committed Mar 25, 2024
1 parent 283be4a commit a74f2f1
Showing 11 changed files with 212 additions and 25 deletions.
2 changes: 1 addition & 1 deletion examples/citation_ngrams.rs
@@ -34,7 +34,7 @@ fn main() -> Result<(), Box<dyn Error>> {

  let mut input_args = env::args();
  let _ = input_args.next(); // skip process name
  while let Some(file_path) = input_args.next() {
  for file_path in input_args {
    eprintln!("-- opening {:?}", file_path);
    let file = File::open(file_path)?;
    let reader = BufReader::new(file);
137 changes: 137 additions & 0 deletions examples/corpus_math_count.rs
@@ -0,0 +1,137 @@
//! Count the total number of <math> elements,
//! and their Content MathML annotations
//! in a directory of HTML documents
//!
//! example use for arXMLiv:
//! `cargo run --release --example corpus_math_count /data/datasets/dataset-arXMLiv-2022`
//!
//! This script extracts the raw data from a "blind" descent over each `<math>` element, and may
//! require additional cutoffs and post-processing over uncurated corpora.
//! You can find an example of post-processing done for the data of arXMLiv here:
//! https://gist.github.com/dginev/e50a632d31be05bb87d64cc1800f6fd4#file-apply_cutoffs-pl
#![allow(clippy::unused_io_amount)]

use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::{BufWriter, Error};
use std::time::Instant;

use libxml::xpath::Context;
use llamapun::parallel_data::Corpus;

static BUFFER_CAPACITY: usize = 10_485_760;

pub fn main() -> Result<(), Error> {
  let start = Instant::now();
  // Read input arguments
  let mut input_args = env::args();
  let _ = input_args.next(); // skip process name
  let corpus_path = match input_args.next() {
    Some(path) => path,
    None => "tests/resources/".to_string(),
  };
  let node_statistics_filepath = match input_args.next() {
    Some(path) => path,
    None => "corpus_math_count.csv".to_string(),
  };
  let content_statistics_filepath = match input_args.next() {
    Some(path) => path,
    None => "corpus_content_count.csv".to_string(),
  };

  let extension_filter = input_args.next();

  let node_statistics_file = File::create(node_statistics_filepath)?;
  let content_statistics_file = File::create(content_statistics_filepath)?;

  let mut corpus = Corpus::new(corpus_path);
  corpus.extension = extension_filter;

  let mut total = 0;
  let (math_catalog, content_math_catalog) = corpus.catalogs_with_parallel_walk(|document| {
    let mut math_count_hash = HashMap::new();
    let mut content_count_hash = HashMap::new();
    // just return the number of math elements
    let mut xpath_context = Context::new(&document.dom).unwrap();
    let math_count = xpath_context
      .findvalue("count(//*[local-name()='math'])", None)
      .unwrap();
    math_count_hash.insert(math_count, 1);

    let content_count = xpath_context
      .findvalue(
        "count(//*[local-name()='annotation-xml' and @encoding='MathML-Content'])",
        None,
      ).unwrap();
    content_count_hash.insert(content_count, 1);

    (math_count_hash, content_count_hash)
  });

  let duration_sec = start.elapsed().as_millis();
  eprintln!("---");
  eprintln!("Math counting finished in {:?}ms", duration_sec);

  // Report on Math.
  let mut catalog_vec: Vec<(&String, &u64)> = math_catalog.iter().collect();
  catalog_vec.sort_by(|a, b| b.1.cmp(a.1));

  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);
  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
  csv_writer.write_record(["math elements", "documents in corpus"])?;

  for (key, val) in catalog_vec {
    total += key.parse::<u64>().unwrap() * val;
    csv_writer.write_record([key, &val.to_string()])?;
  }
  eprintln!(" Grand total of <math> in dataset: ");
  eprintln!(" --- ");
  eprintln!(" {} ", total);
  eprintln!(" --- ");
  // Close the writer
  csv_writer.flush()?;

  // Report on Content Math.
  total = 0;
  let mut catalog_vec: Vec<(&String, &u64)> = content_math_catalog.iter().collect();
  catalog_vec.sort_by(|a, b| b.1.cmp(a.1));

  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, content_statistics_file);
  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
  csv_writer.write_record(["annotation-xml elements", "documents in corpus"])?;

  for (key, val) in catalog_vec {
    total += key.parse::<u64>().unwrap() * val;
    csv_writer.write_record([key, &val.to_string()])?;
  }
  eprintln!(" Grand total of Content MathML <annotation-xml> in dataset: ");
  eprintln!(" --- ");
  eprintln!(" {} ", total);
  eprintln!(" --- ");
  // Close the writer
  csv_writer.flush()
}

// Example output from arXMLiv 2022:
// Math counting finished in 14030571ms
// Grand total of <math> in dataset:
// ---
// 970414519
// ---
// Grand total of Content MathML <annotation-xml> in dataset:
// ---
// 953308908
// ---

// Example output from ar5iv 2024:
// Math counting finished in 22121404ms
// Grand total of <math> in dataset:
// ---
// 1059794660
// ---
// Grand total of Content MathML <annotation-xml> in dataset:
// ---
// 1038882200
// ---
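
The module docs above note that raw counts over an uncurated corpus may need cutoffs and post-processing; the linked gist does this in Perl. As a rough illustration only (not part of the commit; the cutoff value is an assumption), the same idea over the emitted CSV looks like:

use std::error::Error;

fn main() -> Result<(), Box<dyn Error>> {
  // Assumed outlier threshold; tune per corpus (see the linked gist).
  const CUTOFF: u64 = 10_000;
  let mut total = 0u64;
  let mut reader = csv::Reader::from_path("corpus_math_count.csv")?;
  for row in reader.records() {
    let row = row?;
    // Columns as written by the example: "math elements", "documents in corpus"
    let math_per_doc: u64 = row[0].parse()?;
    let documents: u64 = row[1].parse()?;
    if math_per_doc <= CUTOFF {
      total += math_per_doc * documents;
    }
  }
  println!("grand total under cutoff: {total}");
  Ok(())
}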
8 changes: 3 additions & 5 deletions examples/corpus_mathml_stats.rs
@@ -22,7 +22,6 @@ use std::collections::{HashMap, HashSet};
use std::env;
use std::fs::File;
use std::io::{BufWriter, Error};
use std::thread;
use std::time::Instant;

use libxml::readonly::RoNode;
@@ -71,8 +70,7 @@ pub fn main() -> Result<(), Error> {

  let catalog = corpus.catalog_with_parallel_walk(|document| {
    println!(
      "Thread: {:?}, doc: {:?}",
      thread::current().name(),
      "doc: {:?}",
      document.path
    );

@@ -103,10 +101,10 @@ pub fn main() -> Result<(), Error> {

  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);
  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
  csv_writer.write_record(&["name@attr[value]", "frequency"])?;
  csv_writer.write_record(["name@attr[value]", "frequency"])?;

  for (key, val) in catalog_vec {
    csv_writer.write_record(&[key, &val.to_string()])?;
    csv_writer.write_record([key, &val.to_string()])?;
  }
  // Close the writer
  csv_writer.flush()
2 changes: 1 addition & 1 deletion examples/corpus_node_model.rs
@@ -14,7 +14,7 @@ use std::time::Instant;
use libxml::readonly::RoNode;
use llamapun::parallel_data::Corpus;

static NEWLINE: &'static [u8] = b"\n";
static NEWLINE: &[u8] = b"\n";
static BUFFER_CAPACITY: usize = 10_485_760;

pub fn main() -> Result<(), Error> {
4 changes: 2 additions & 2 deletions examples/corpus_statement_paragraphs_model.rs
@@ -258,7 +258,7 @@ fn extract_document_statements(
    }
  }
  // Discard paragraphs outside of a reasonable [4,1024] word count range
  if word_count < 4 || word_count > 1024 {
  if !(4..=1024).contains(&word_count) {
    overflow_count += 1;
    invalid_paragraph = true;
  }
@@ -294,7 +294,7 @@ fn extract_document_statements(
/// compute a sha256 hash of the content, then assemble a filename based on it
fn hash_file_path(directory: &str, content: &str) -> String {
  let mut hasher = Sha256::new();
  hasher.input_str(&content);
  hasher.input_str(content);
  let hash = hasher.result_str();
  directory.to_string() + "/" + &hash + ".txt"
}
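
The [4,1024] rewrite above is Clippy's manual_range_contains lint. A standalone check (not from the commit) that the two forms agree, including at the boundaries:

fn main() {
  for word_count in [3u32, 4, 512, 1024, 1025] {
    let manual = word_count < 4 || word_count > 1024;
    let range = !(4..=1024).contains(&word_count);
    assert_eq!(manual, range); // both keep 4..=1024 and reject 3 and 1025
  }
}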
10 changes: 5 additions & 5 deletions examples/pattern_example.rs
@@ -38,9 +38,9 @@ fn math_node_to_string(node: RoNode) -> String {
}

/// helper function
fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
fn math_node_to_string_actual(node: RoNode, string: &mut String) {
  match node.get_name().as_ref() {
    "semantics" => math_node_to_string_children(node, &mut string),
    "semantics" => math_node_to_string_children(node, string),
    "annotation" | "annotation-xml" => {},
    "text" => {
      if node.is_text_node() {
@@ -51,7 +51,7 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
      string.push('<');
      string.push_str(default);
      string.push('>');
      math_node_to_string_children(node, &mut string);
      math_node_to_string_children(node, string);
      string.push('<');
      string.push('/');
      string.push_str(default);
@@ -61,13 +61,13 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
}

/// helper function
fn math_node_to_string_children(node: RoNode, mut string: &mut String) {
fn math_node_to_string_children(node: RoNode, string: &mut String) {
  let mut cur = node.get_first_child();
  loop {
    if cur.is_none() {
      break;
    }
    math_node_to_string_actual(cur.unwrap(), &mut string);
    math_node_to_string_actual(cur.unwrap(), string);
    cur = cur.unwrap().get_next_sibling();
  }
}
2 changes: 1 addition & 1 deletion examples/word_tokenization.rs
@@ -50,7 +50,7 @@ fn main() {
  let inorder_dictionary = dictionary.sorted();
  let mut inorder_frequency: Vec<(usize, usize)> = Vec::new();
  for entry in &inorder_dictionary {
    let frequency = unigrams.get(&entry.0);
    let frequency = unigrams.get(entry.0);
    inorder_frequency.push((entry.1, frequency));
  }
  plot_simple(
4 changes: 2 additions & 2 deletions src/ngrams.rs
@@ -124,15 +124,15 @@ impl Ngrams {
        if words_since_anchor_seen == self.window_size && side == AnchorSide::Right {
          // it has been too long since we saw an anchor, add to the current buffer, record and
          // reset
          self.record_words(continuous_buffer.drain(..).collect());
          self.record_words(std::mem::take(&mut continuous_buffer));
          context_window.clear();
          side = AnchorSide::Left;
        }
      }
    }
    // Any remaining content should be added
    continuous_buffer.extend(context_window.asc_iter().copied());
    self.record_words(continuous_buffer.drain(..).collect());
    self.record_words(std::mem::take(&mut continuous_buffer));
  }

/// Take an arbitrarily long vector of words, and record all (overlapping) ngrams obtainable from
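
The switch from drain(..).collect() to std::mem::take follows Clippy's advice for emptying a buffer you intend to reuse: take moves the whole Vec out and leaves a fresh empty one behind, instead of copying the elements into a new allocation. A standalone illustration (not from the commit):

fn main() {
  let mut continuous_buffer = vec!["one".to_string(), "two".to_string()];
  // Equivalent to: let words: Vec<String> = continuous_buffer.drain(..).collect();
  let words = std::mem::take(&mut continuous_buffer);
  assert_eq!(words.len(), 2);
  assert!(continuous_buffer.is_empty()); // buffer stays usable for the next window
}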
54 changes: 53 additions & 1 deletion src/parallel_data/corpus.rs
@@ -48,7 +48,7 @@ impl Corpus {
    }
  }

  /// Get a parallel iterator over the documents
  /// Get a parallel iterator over the documents, returning a single report catalog
  pub fn catalog_with_parallel_walk<F>(&self, closure: F) -> HashMap<String, u64>
  where F: Fn(Document) -> HashMap<String, u64> + Send + Sync {
    ParWalkDir::new(self.path.clone())
@@ -95,4 +95,56 @@ impl Corpus {
      map1
    })
  }

  /// Get a parallel iterator over the documents, returning a pair of report catalogs
  pub fn catalogs_with_parallel_walk<F>(&self, closure: F) -> (HashMap<String, u64>, HashMap<String, u64>)
  where F: Fn(Document) -> (HashMap<String, u64>, HashMap<String, u64>) + Send + Sync {
    ParWalkDir::new(self.path.clone())
      .num_threads(rayon::current_num_threads())
      .skip_hidden(true)
      .sort(false)
      .into_iter()
      .filter_map(|each| {
        if let Ok(entry) = each {
          let file_name = entry.file_name.to_str().unwrap_or("");
          let selected = if let Some(ref extension) = self.extension {
            file_name.ends_with(extension)
          } else {
            file_name.ends_with(".html") || file_name.ends_with(".xhtml")
          };
          if selected {
            let path = entry.path().to_str().unwrap_or("").to_owned();
            if !path.is_empty() {
              return Some(path);
            }
          }
        }
        // all other cases
        None
      })
      .enumerate()
      .par_bridge()
      .map(|each| {
        let (index, path) = each;
        let document = Document::new(path, self).unwrap();
        if index % 1000 == 0 && index > 0 {
          println!(
            "-- catalogs_with_parallel_walk now processing document {:?}",
            1 + index
          );
        }
        closure(document)
      })
      .reduce(|| (HashMap::new(), HashMap::new()), |(mut map11, mut map12), (map21, map22)| {
        for (k, v) in map21 {
          let entry = map11.entry(k).or_insert(0);
          *entry += v;
        }
        for (k, v) in map22 {
          let entry = map12.entry(k).or_insert(0);
          *entry += v;
        }
        (map11, map12)
      })
  }
}
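
examples/corpus_math_count.rs above is the real consumer of the new catalogs_with_parallel_walk; as a condensed sketch of the calling convention (the corpus path and catalog key below are placeholders):

use std::collections::HashMap;
use llamapun::parallel_data::Corpus;

fn main() {
  let corpus = Corpus::new("tests/resources/".to_string());
  let (math, content) = corpus.catalogs_with_parallel_walk(|_document| {
    // Each document contributes one entry per catalog;
    // the reduce step then sums the u64 values key by key.
    let mut math_counts = HashMap::new();
    let mut content_counts = HashMap::new();
    math_counts.insert("documents_seen".to_string(), 1);
    content_counts.insert("documents_seen".to_string(), 1);
    (math_counts, content_counts)
  });
  println!("math: {math:?}, content: {content:?}");
}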
4 changes: 2 additions & 2 deletions src/patterns/rules.rs
@@ -1064,7 +1064,7 @@ impl PatternFile {
    match cur.get_name().as_ref() {
      "meta" => {
        if meta_opt.is_some() {
          return Err("pattern_file has multiple meta nodes".to_string()).map_err(err_map);
          return Err("pattern_file has multiple meta nodes".to_string());
        }
        meta_opt =
          Some(MetaDescription::load_from_node(cur, file_name.to_string()).map_err(err_map)?);
@@ -1085,7 +1085,7 @@ impl PatternFile {
        pctx.add_sequence_rule(cur).map_err(err_map)?;
      },
      x => {
        return Err(format!("Unexpected node \"{x}\" in pattern_file")).map_err(err_map);
        return Err(format!("Unexpected node \"{x}\" in pattern_file"));
      },
    }
  }
10 changes: 5 additions & 5 deletions tests/dnm_test.rs
@@ -83,19 +83,19 @@ fn test_xml_node_to_plaintext() {
  let mut node = doc.get_root_readonly().unwrap();
  match node.get_first_child() {
    Some(n) => node = n,
    None => assert!(false), //DOM generation failed
    None => unreachable!(), //DOM generation failed
  }
  while node.get_name() != "body" {
    match node.get_next_sibling() {
      Some(n) => node = n,
      None => assert!(false),
      None => unreachable!(),
    }
  }
  node = node.get_first_child().unwrap();
  while node.get_name() != "h1" {
    match node.get_next_sibling() {
      Some(n) => node = n,
      None => assert!(false),
      None => unreachable!(),
    }
  }
  // Node content should have been processed
@@ -106,15 +106,15 @@ fn test_xml_node_to_plaintext() {
  while node.get_name() != "h2" {
    match node.get_next_sibling() {
      Some(n) => node = n,
      None => assert!(false),
      None => unreachable!(),
    }
  }
  // node was skipped in dnm generation
  assert_eq!(dnm.get_range_of_node(node).unwrap().get_plaintext(), "");
  while node.get_name() != "a" {
    match node.get_next_sibling() {
      Some(n) => node = n,
      None => assert!(false),
      None => unreachable!(),
    }
  }
  // node content should have been replaced by "[link]"
