Skip to content
This repository has been archived by the owner on Mar 25, 2024. It is now read-only.

Commit

Permalink
format all
Browse files Browse the repository at this point in the history
  • Loading branch information
dginev committed Jan 11, 2023
1 parent 18203f7 commit 283be4a
Show file tree
Hide file tree
Showing 25 changed files with 144 additions and 178 deletions.
18 changes: 10 additions & 8 deletions examples/citation_ngrams.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
// /data/datasets/embeddings-arXMLiv-08-2019/token_model_error.txt
extern crate llamapun;

use llamapun::ngrams::{Ngrams};
use llamapun::ngrams::Ngrams;
use serde::Serialize;
use std::collections::HashMap;
use std::error::Error;
use std::env;
use std::error::Error;
use std::fs::File;
use std::io::{prelude::*, BufWriter, BufReader};
use std::io::{prelude::*, BufReader, BufWriter};
use std::time::Instant;
use serde::Serialize;

static BUFFER_CAPACITY: usize = 10_485_760;
#[derive(Debug, Serialize)]
Expand All @@ -23,14 +23,13 @@ struct HeadingRecord<'a> {
frequency: usize,
}


fn main() -> Result<(), Box<dyn Error>> {
let start_example = Instant::now();
let mut ngrams = Ngrams {
n: 4,
window_size: 15,
anchor: Some("citationelement".to_string()),
counts: HashMap::new()
counts: HashMap::new(),
};

let mut input_args = env::args();
Expand All @@ -39,7 +38,7 @@ fn main() -> Result<(), Box<dyn Error>> {
eprintln!("-- opening {:?}", file_path);
let file = File::open(file_path)?;
let reader = BufReader::new(file);
let mut accum : usize = 0;
let mut accum: usize = 0;
for line in reader.lines() {
let content = line?;
if content.contains("citationelement") {
Expand All @@ -51,7 +50,10 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}
}
let ngrams_file = File::create(format!("{}_grams_{}_window.csv", ngrams.n, ngrams.window_size))?;
let ngrams_file = File::create(format!(
"{}_grams_{}_window.csv",
ngrams.n, ngrams.window_size
))?;
let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, ngrams_file);
let mut csv_writer = csv::Writer::from_writer(buffered_writer);
for (ngram, frequency) in ngrams.sorted() {
Expand Down
8 changes: 5 additions & 3 deletions examples/corpus_heading_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
//
/// Extracts a corpus heading model from an unpacked corpus of HTML files
/// With math lexemes (default):
/// $ cargo run --release --example corpus_heading_stats /path/to/corpus/ heading_report_filename.csv
/// $ cargo run --release --example corpus_heading_stats /path/to/corpus/
/// heading_report_filename.csv
use std::collections::HashMap;
use std::env;
use std::fs::File;
Expand Down Expand Up @@ -40,7 +41,8 @@ pub fn main() -> Result<(), Error> {
};

let mut corpus = Corpus::new(corpus_path);
// we are interested in canonical heading statistics, so discard a lot of the counting machinery and special content
// we are interested in canonical heading statistics, so discard a lot of the counting machinery
// and special content
corpus
.dnm_parameters
.special_tag_name_options
Expand Down Expand Up @@ -92,7 +94,7 @@ pub fn main() -> Result<(), Error> {
overflow_count += 1;
invalid_heading = true;
break;
}
},
};
if !word_string.is_empty() && word_string != "NUM" {
heading_buffer.push_str(&word_string);
Expand Down
15 changes: 9 additions & 6 deletions examples/corpus_statement_paragraphs_model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
/// paragraph_data.tar
///
/// With math discarded:
/// $ cargo run --release --example corpus_statement_paragraphs_model /path/to/corpus statement_paragraphs.tar discard_math
/// $ cargo run --release --example corpus_statement_paragraphs_model /path/to/corpus
/// statement_paragraphs.tar discard_math
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs::File;
Expand Down Expand Up @@ -132,8 +133,8 @@ fn extract_document_statements(
let mut context = Context::new(&document.dom).unwrap();

'paragraphs: for mut paragraph in document.extended_paragraph_iter() {
// I. Determine the class for this paragraph entry, so that we can iterate over its content after
// if no markup at all, ignore the paragraph and skip to next
// I. Determine the class for this paragraph entry, so that we can iterate over its content
// after if no markup at all, ignore the paragraph and skip to next
let para = paragraph.dnm.root_node;
let para_parent = para.get_parent().unwrap();
let mut prev_heading_opt = paragraph.dnm.root_node.get_prev_sibling();
Expand Down Expand Up @@ -225,7 +226,8 @@ fn extract_document_statements(
continue 'paragraphs;
}
};
// II. We have a labeled statement. Extract content of current paragraph, validating basic data quality
// II. We have a labeled statement. Extract content of current paragraph, validating basic data
// quality
let mut word_count = 0;
let mut invalid_paragraph = false;
let mut paragraph_buffer = String::new();
Expand All @@ -247,7 +249,7 @@ fn extract_document_statements(
overflow_count += 1;
invalid_paragraph = true;
break 'words;
}
},
};
if !word_string.is_empty() {
word_count += 1;
Expand All @@ -271,7 +273,8 @@ fn extract_document_statements(
thread_data.push((paragraph_buffer, paragraph_filename));
}
}
// III. Record valid entries into archive target, having collected all labeled samples for this document
// III. Record valid entries into archive target, having collected all labeled samples for this
// document
let mut builder_lock = tar_builder.lock().unwrap();
for (paragraph_buffer, paragraph_filename) in thread_data.into_iter() {
builder_lock
Expand Down
10 changes: 5 additions & 5 deletions examples/pattern_example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ fn math_node_to_string(node: RoNode) -> String {
fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
match node.get_name().as_ref() {
"semantics" => math_node_to_string_children(node, &mut string),
"annotation" | "annotation-xml" => {}
"annotation" | "annotation-xml" => {},
"text" => {
if node.is_text_node() {
string.push_str(&node.get_content());
}
}
},
default => {
string.push('<');
string.push_str(default);
Expand All @@ -56,7 +56,7 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
string.push('/');
string.push_str(default);
string.push('>');
}
},
}
}

Expand All @@ -82,14 +82,14 @@ fn print_marker(marker: &MarkerEnum, alt_dnm: &DNM, xpath_context: &Context) {
DNMRange::deserialize(&text_marker.range.serialize(), alt_dnm, xpath_context)
.get_plaintext()
);
}
},
MarkerEnum::Math(ref math_marker) => {
println!(
"<h5>MathMarker</h5> \"{}\"\n <br /><br /> <p>{}</p>",
&get_pattern_marker_string(&math_marker.marker),
&math_node_to_string(math_marker.node)
);
}
},
}
}

Expand Down
12 changes: 3 additions & 9 deletions examples/word_tokenization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,20 +98,14 @@ fn main() {
// As well as some basic Benchmarking info:
let end_reports = start_example.elapsed().as_millis();
println!("--- Benchmark report:");
println!(
" LibXML parse took {:?}ms",
end_parse
);
println!(" LibXML parse took {:?}ms", end_parse);
println!(
" LLaMaPun word tokenization took {:?}ms",
end_example-end_parse
end_example - end_parse
);
println!(
" Finished report generation in {:?}ms",
end_reports - end_example
);
println!(
" Total time: {:?}ms",
end_reports
);
println!(" Total time: {:?}ms", end_reports);
}
9 changes: 4 additions & 5 deletions src/ams.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ use regex::Regex;
use std::fmt;

/// Checks a llamapun `Document` for 'ltx_theorem' AMS markup
pub fn has_markup(doc: &Document) -> bool {
has_markup_xmldoc(&doc.dom)
}
pub fn has_markup(doc: &Document) -> bool {
  // Delegate to the DOM-level check on the document's parsed XML tree.
  has_markup_xmldoc(&doc.dom)
}

/// Checks a libxml document for `ltx_theorem` AMS markup
pub fn has_markup_xmldoc(dom: &XmlDoc) -> bool {
Expand All @@ -25,8 +23,9 @@ pub fn has_markup_xmldoc(dom: &XmlDoc) -> bool {
/// Semantically fixed structural environments in scientific documents, to collect as
/// add-on to the AMS markup
///
/// Note we are explicitly ignoring some of the very high-frequency environments, as they are not rich on textual content.
/// Namely: references, appendix, pacs, subject; Which are rich in metadata and semi-structured content (figures, tables).
/// Note we are explicitly ignoring some of the very high-frequency environments, as they are not
/// rich on textual content. Namely: references, appendix, pacs, subject; Which are rich in metadata
/// and semi-structured content (figures, tables).
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum StructuralEnv {
Expand Down
14 changes: 5 additions & 9 deletions src/dnm/c14n.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@ impl DNM {
/// Our linguistic canonical form will only include 1) node name, 2) class attribute and 3)
/// textual content - excludes certain experimental markup, such as all math annotation
/// elements - excludes whitespace nodes and comment nodes
pub fn to_c14n_basic(&self) -> String {
self.node_c14n_basic(self.root_node)
}
pub fn to_c14n_basic(&self) -> String {
  // Canonicalize the entire DOM by starting from the DNM's root node.
  self.node_c14n_basic(self.root_node)
}

/// Canonicalize a single node of choice
pub fn node_c14n_basic(&self, node: RoNode) -> String {
Expand All @@ -32,9 +30,7 @@ impl DNM {
}

/// Obtain an MD5 hash from the canonical string of the entire DOM
pub fn to_hash_basic(&self) -> String {
self.node_hash_basic(self.root_node)
}
pub fn to_hash_basic(&self) -> String {
  // Hash the canonical form of the full DOM, rooted at root_node.
  self.node_hash_basic(self.root_node)
}

/// Obtain an MD5 hash from the canonical string of a Node
pub fn node_hash_basic(&self, node: RoNode) -> String {
Expand Down Expand Up @@ -63,7 +59,7 @@ impl DNM {
// ignore empty nodes
}
}
}
},
Some(ElementNode) => {
// Skip artefact nodes
let name: String = node.get_name();
Expand Down Expand Up @@ -112,10 +108,10 @@ impl DNM {
canonical_node.push_str(&name);
canonical_node.push('>');
}
}
},
_ => {
println!("-- Skipping node {:?}", node.get_name());
} // skip all other node types for now
}, // skip all other node types for now
}
}
}
Expand Down
12 changes: 5 additions & 7 deletions src/dnm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ impl DNM {
end,
dnm: self,
}),
None => Err("not found in node map".into())
None => Err("not found in node map".into()),
}
}

Expand All @@ -199,9 +199,7 @@ impl DNM {
}

/// Get the underlying text for this DNM
pub fn get_plaintext(&self) -> &str {
&self.plaintext
}
pub fn get_plaintext(&self) -> &str {
  // Borrow the precomputed plaintext backing this DNM.
  self.plaintext.as_str()
}

/// The heart of the dnm generation...
fn recurse_node_create(&mut self, node: RoNode) {
Expand Down Expand Up @@ -333,16 +331,16 @@ impl DNM {
push_token!(self, token, node);
record_node_map!(self, node, offset_start);
return;
}
},
Some(SpecialTagsOption::FunctionNormalize(f)) => {
push_token!(self, &f(node), node);
record_node_map!(self, node, offset_start);
return;
}
},
Some(&SpecialTagsOption::Skip) => {
record_node_map!(self, node, offset_start);
return;
}
},
None => continue,
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/dnm/parameters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ impl DNMParameters {
class_options.insert("ltx_note_outer".to_string(), SpecialTagsOption::Skip);
class_options.insert("ltx_bibliography".to_string(), SpecialTagsOption::Skip);
// Ignores all caption metadata tags, to avoid leaking artefacts into a pure language target
// TODO: Is there merit to extending this to ignoring all ltx_tag elements? leaving things as-is allows for some
// curious artefacts to sneak into the plain-text files, such as bullets/numbers from \item commands
// TODO: Is there merit to extending this to ignoring all ltx_tag elements? leaving things as-is
// allows for some curious artefacts to sneak into the plain-text files, such as
// bullets/numbers from \item commands
class_options.insert("ltx_tag_figure".to_string(), SpecialTagsOption::Skip);
class_options.insert("ltx_tag_table".to_string(), SpecialTagsOption::Skip);

Expand Down
26 changes: 9 additions & 17 deletions src/dnm/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,10 @@ impl<'dnmrange> DNMRange<'dnmrange> {
&(self.dnm.plaintext)[self.dnm.byte_offsets[self.start]..self.dnm.byte_offsets[self.end]]
}
/// Get the plaintext without trailing white spaces
pub fn get_plaintext_truncated(&self) -> &'dnmrange str {
self.get_plaintext().trim_end()
}
pub fn get_plaintext_truncated(&self) -> &'dnmrange str {
  // Same slice as get_plaintext, with trailing whitespace trimmed off.
  let text = self.get_plaintext();
  text.trim_end()
}

/// Get the first corresponding DOM node for this range
pub fn get_node(&self) -> RoNode {
self.dnm.back_map[self.start].0
}
pub fn get_node(&self) -> RoNode {
  // The back-map entry at the range start records the owning DOM node.
  let entry = &self.dnm.back_map[self.start];
  entry.0
}

/// Returns a `DNMRange` with the leading and trailing whitespaces removed
pub fn trim(&self) -> DNMRange<'dnmrange> {
Expand Down Expand Up @@ -133,9 +129,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
}

/// checks whether the range is empty
pub fn is_empty(&self) -> bool {
self.start == self.end
}
pub fn is_empty(&self) -> bool {
  // A range whose start and end offsets coincide covers no text.
  self.end == self.start
}

/*
* SERIALIZATION CODE
Expand All @@ -155,9 +149,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
}

/// creates an arange from to xpointers
pub fn create_arange(from: &str, to: &str) -> String {
format!("arange({from},{to})")
}
/// Builds the `arange(from,to)` serialization joining two xpointer strings.
pub fn create_arange(from: &str, to: &str) -> String {
  // Assemble manually with one exact-size allocation instead of format!.
  let mut arange = String::with_capacity("arange(,)".len() + from.len() + to.len());
  arange.push_str("arange(");
  arange.push_str(from);
  arange.push(',');
  arange.push_str(to);
  arange.push(')');
  arange
}

/// Serializes a node and an offset into an xpointer
/// is_end indicates whether the node indicates the end of the interval
Expand Down Expand Up @@ -215,7 +207,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
get_node_number(parent, act, &|n: RoNode| n.get_name() == act.get_name()).unwrap()
)
}
}
},
Some(x) => format!("//*[@id=\"{x}\"]"),
}
}
Expand Down Expand Up @@ -270,7 +262,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
pos += 1;
}
pos
}
},
Err(_) => get_position_of_lowest_parent(node, dnm),
}
} else {
Expand Down Expand Up @@ -309,7 +301,7 @@ fn get_next_sibling(root_node: RoNode, node: RoNode) -> Option<RoNode> {
} else {
get_next_sibling(root_node, node.get_parent().unwrap())
}
}
},
Some(n) => Some(n),
}
}
Expand All @@ -332,10 +324,10 @@ fn get_node_number(
match cur.get_next_sibling() {
None => {
return Err(());
}
},
Some(n) => {
cur = n;
}
},
}
}
Ok(count)
Expand Down
Loading

0 comments on commit 283be4a

Please sign in to comment.