Skip to content
This repository has been archived by the owner on Mar 25, 2024. It is now read-only.

Commit

Permalink
format all
Browse files Browse the repository at this point in the history
  • Loading branch information
dginev committed Jan 11, 2023
1 parent 18203f7 commit 283be4a
Show file tree
Hide file tree
Showing 25 changed files with 144 additions and 178 deletions.
18 changes: 10 additions & 8 deletions examples/citation_ngrams.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
// /data/datasets/embeddings-arXMLiv-08-2019/token_model_error.txt
extern crate llamapun;

use llamapun::ngrams::{Ngrams};
use llamapun::ngrams::Ngrams;
use serde::Serialize;
use std::collections::HashMap;
use std::error::Error;
use std::env;
use std::error::Error;
use std::fs::File;
use std::io::{prelude::*, BufWriter, BufReader};
use std::io::{prelude::*, BufReader, BufWriter};
use std::time::Instant;
use serde::Serialize;

static BUFFER_CAPACITY: usize = 10_485_760;
#[derive(Debug, Serialize)]
Expand All @@ -23,14 +23,13 @@ struct HeadingRecord<'a> {
frequency: usize,
}


fn main() -> Result<(), Box<dyn Error>> {
let start_example = Instant::now();
let mut ngrams = Ngrams {
n: 4,
window_size: 15,
anchor: Some("citationelement".to_string()),
counts: HashMap::new()
counts: HashMap::new(),
};

let mut input_args = env::args();
Expand All @@ -39,7 +38,7 @@ fn main() -> Result<(), Box<dyn Error>> {
eprintln!("-- opening {:?}", file_path);
let file = File::open(file_path)?;
let reader = BufReader::new(file);
let mut accum : usize = 0;
let mut accum: usize = 0;
for line in reader.lines() {
let content = line?;
if content.contains("citationelement") {
Expand All @@ -51,7 +50,10 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}
}
let ngrams_file = File::create(format!("{}_grams_{}_window.csv", ngrams.n, ngrams.window_size))?;
let ngrams_file = File::create(format!(
"{}_grams_{}_window.csv",
ngrams.n, ngrams.window_size
))?;
let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, ngrams_file);
let mut csv_writer = csv::Writer::from_writer(buffered_writer);
for (ngram, frequency) in ngrams.sorted() {
Expand Down
8 changes: 5 additions & 3 deletions examples/corpus_heading_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
//
/// Extracts a corpus heading model from an unpacked corpus of HTML files
/// With math lexemes (default):
/// $ cargo run --release --example corpus_heading_stats /path/to/corpus/ heading_report_filename.csv
/// $ cargo run --release --example corpus_heading_stats /path/to/corpus/
/// heading_report_filename.csv
use std::collections::HashMap;
use std::env;
use std::fs::File;
Expand Down Expand Up @@ -40,7 +41,8 @@ pub fn main() -> Result<(), Error> {
};

let mut corpus = Corpus::new(corpus_path);
// we are interested in canonical heading statistics, so discard a lot of the counting machinery and special content
// we are interested in canonical heading statistics, so discard a lot of the counting machinery
// and special content
corpus
.dnm_parameters
.special_tag_name_options
Expand Down Expand Up @@ -92,7 +94,7 @@ pub fn main() -> Result<(), Error> {
overflow_count += 1;
invalid_heading = true;
break;
}
},
};
if !word_string.is_empty() && word_string != "NUM" {
heading_buffer.push_str(&word_string);
Expand Down
15 changes: 9 additions & 6 deletions examples/corpus_statement_paragraphs_model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
/// paragraph_data.tar
///
/// With math discarded:
/// $ cargo run --release --example corpus_statement_paragraphs_model /path/to/corpus statement_paragraphs.tar discard_math
/// $ cargo run --release --example corpus_statement_paragraphs_model /path/to/corpus
/// statement_paragraphs.tar discard_math
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs::File;
Expand Down Expand Up @@ -132,8 +133,8 @@ fn extract_document_statements(
let mut context = Context::new(&document.dom).unwrap();

'paragraphs: for mut paragraph in document.extended_paragraph_iter() {
// I. Determine the class for this paragraph entry, so that we can iterate over its content after
// if no markup at all, ignore the paragraph and skip to next
// I. Determine the class for this paragraph entry, so that we can iterate over its content
// after if no markup at all, ignore the paragraph and skip to next
let para = paragraph.dnm.root_node;
let para_parent = para.get_parent().unwrap();
let mut prev_heading_opt = paragraph.dnm.root_node.get_prev_sibling();
Expand Down Expand Up @@ -225,7 +226,8 @@ fn extract_document_statements(
continue 'paragraphs;
}
};
// II. We have a labeled statement. Extract content of current paragraph, validating basic data quality
// II. We have a labeled statement. Extract content of current paragraph, validating basic data
// quality
let mut word_count = 0;
let mut invalid_paragraph = false;
let mut paragraph_buffer = String::new();
Expand All @@ -247,7 +249,7 @@ fn extract_document_statements(
overflow_count += 1;
invalid_paragraph = true;
break 'words;
}
},
};
if !word_string.is_empty() {
word_count += 1;
Expand All @@ -271,7 +273,8 @@ fn extract_document_statements(
thread_data.push((paragraph_buffer, paragraph_filename));
}
}
// III. Record valid entries into archive target, having collected all labeled samples for this document
// III. Record valid entries into archive target, having collected all labeled samples for this
// document
let mut builder_lock = tar_builder.lock().unwrap();
for (paragraph_buffer, paragraph_filename) in thread_data.into_iter() {
builder_lock
Expand Down
10 changes: 5 additions & 5 deletions examples/pattern_example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ fn math_node_to_string(node: RoNode) -> String {
fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
match node.get_name().as_ref() {
"semantics" => math_node_to_string_children(node, &mut string),
"annotation" | "annotation-xml" => {}
"annotation" | "annotation-xml" => {},
"text" => {
if node.is_text_node() {
string.push_str(&node.get_content());
}
}
},
default => {
string.push('<');
string.push_str(default);
Expand All @@ -56,7 +56,7 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
string.push('/');
string.push_str(default);
string.push('>');
}
},
}
}

Expand All @@ -82,14 +82,14 @@ fn print_marker(marker: &MarkerEnum, alt_dnm: &DNM, xpath_context: &Context) {
DNMRange::deserialize(&text_marker.range.serialize(), alt_dnm, xpath_context)
.get_plaintext()
);
}
},
MarkerEnum::Math(ref math_marker) => {
println!(
"<h5>MathMarker</h5> \"{}\"\n <br /><br /> <p>{}</p>",
&get_pattern_marker_string(&math_marker.marker),
&math_node_to_string(math_marker.node)
);
}
},
}
}

Expand Down
12 changes: 3 additions & 9 deletions examples/word_tokenization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,20 +98,14 @@ fn main() {
// As well as some basic Benchmarking info:
let end_reports = start_example.elapsed().as_millis();
println!("--- Benchmark report:");
println!(
" LibXML parse took {:?}ms",
end_parse
);
println!(" LibXML parse took {:?}ms", end_parse);
println!(
" LLaMaPun word tokenization took {:?}ms",
end_example-end_parse
end_example - end_parse
);
println!(
" Finished report generation in {:?}ms",
end_reports - end_example
);
println!(
" Total time: {:?}ms",
end_reports
);
println!(" Total time: {:?}ms", end_reports);
}
9 changes: 4 additions & 5 deletions src/ams.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ use regex::Regex;
use std::fmt;

/// Checks a llamapun `Document` for 'ltx_theorem' AMS markup
pub fn has_markup(doc: &Document) -> bool {
has_markup_xmldoc(&doc.dom)
}
pub fn has_markup(doc: &Document) -> bool {
  // Delegate to the DOM-level check on the document's parsed XML tree.
  has_markup_xmldoc(&doc.dom)
}

/// Checks a libxml document for `ltx_theorem` AMS markup
pub fn has_markup_xmldoc(dom: &XmlDoc) -> bool {
Expand All @@ -25,8 +23,9 @@ pub fn has_markup_xmldoc(dom: &XmlDoc) -> bool {
/// Semantically fixed structural environments in scientific documents, to collect as
/// add-on to the AMS markup
///
/// Note we are explicitly ignoring some of the very high-frequency environments, as they are not rich on textual content.
/// Namely: references, appendix, pacs, subject; Which are rich in metadata and semi-structured content (figures, tables).
/// Note we are explicitly ignoring some of the very high-frequency environments, as they are not
/// rich on textual content. Namely: references, appendix, pacs, subject; Which are rich in metadata
/// and semi-structured content (figures, tables).
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum StructuralEnv {
Expand Down
14 changes: 5 additions & 9 deletions src/dnm/c14n.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@ impl DNM {
/// Our linguistic canonical form will only include 1) node name, 2) class attribute and 3)
/// textual content - excludes certain experimental markup, such as all math annotation
/// elements - excludes whitespace nodes and comment nodes
pub fn to_c14n_basic(&self) -> String {
self.node_c14n_basic(self.root_node)
}
pub fn to_c14n_basic(&self) -> String {
  // Canonicalize the entire DOM by starting from the DNM's root node.
  self.node_c14n_basic(self.root_node)
}

/// Canonicalize a single node of choice
pub fn node_c14n_basic(&self, node: RoNode) -> String {
Expand All @@ -32,9 +30,7 @@ impl DNM {
}

/// Obtain an MD5 hash from the canonical string of the entire DOM
pub fn to_hash_basic(&self) -> String {
self.node_hash_basic(self.root_node)
}
pub fn to_hash_basic(&self) -> String {
  // Hash the canonical form of the full DOM, rooted at root_node.
  self.node_hash_basic(self.root_node)
}

/// Obtain an MD5 hash from the canonical string of a Node
pub fn node_hash_basic(&self, node: RoNode) -> String {
Expand Down Expand Up @@ -63,7 +59,7 @@ impl DNM {
// ignore empty nodes
}
}
}
},
Some(ElementNode) => {
// Skip artefact nodes
let name: String = node.get_name();
Expand Down Expand Up @@ -112,10 +108,10 @@ impl DNM {
canonical_node.push_str(&name);
canonical_node.push('>');
}
}
},
_ => {
println!("-- Skipping node {:?}", node.get_name());
} // skip all other node types for now
}, // skip all other node types for now
}
}
}
Expand Down
12 changes: 5 additions & 7 deletions src/dnm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ impl DNM {
end,
dnm: self,
}),
None => Err("not found in node map".into())
None => Err("not found in node map".into()),
}
}

Expand All @@ -199,9 +199,7 @@ impl DNM {
}

/// Get the underlying text for this DNM
pub fn get_plaintext(&self) -> &str {
&self.plaintext
}
pub fn get_plaintext(&self) -> &str {
  // Borrow the precomputed plaintext backing this DNM.
  self.plaintext.as_str()
}

/// The heart of the dnm generation...
fn recurse_node_create(&mut self, node: RoNode) {
Expand Down Expand Up @@ -333,16 +331,16 @@ impl DNM {
push_token!(self, token, node);
record_node_map!(self, node, offset_start);
return;
}
},
Some(SpecialTagsOption::FunctionNormalize(f)) => {
push_token!(self, &f(node), node);
record_node_map!(self, node, offset_start);
return;
}
},
Some(&SpecialTagsOption::Skip) => {
record_node_map!(self, node, offset_start);
return;
}
},
None => continue,
}
}
Expand Down
5 changes: 3 additions & 2 deletions src/dnm/parameters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ impl DNMParameters {
class_options.insert("ltx_note_outer".to_string(), SpecialTagsOption::Skip);
class_options.insert("ltx_bibliography".to_string(), SpecialTagsOption::Skip);
// Ignores all caption metadata tags, to avoid leaking artefacts into a pure language target
// TODO: Is there merit to extending this to ignoring all ltx_tag elements? leaving things as-is allows for some
// curious artefacts to sneak into the plain-text files, such as bullets/numbers from \item commands
// TODO: Is there merit to extending this to ignoring all ltx_tag elements? leaving things as-is
// allows for some curious artefacts to sneak into the plain-text files, such as
// bullets/numbers from \item commands
class_options.insert("ltx_tag_figure".to_string(), SpecialTagsOption::Skip);
class_options.insert("ltx_tag_table".to_string(), SpecialTagsOption::Skip);

Expand Down
26 changes: 9 additions & 17 deletions src/dnm/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,10 @@ impl<'dnmrange> DNMRange<'dnmrange> {
&(self.dnm.plaintext)[self.dnm.byte_offsets[self.start]..self.dnm.byte_offsets[self.end]]
}
/// Get the plaintext without trailing white spaces
pub fn get_plaintext_truncated(&self) -> &'dnmrange str {
self.get_plaintext().trim_end()
}
pub fn get_plaintext_truncated(&self) -> &'dnmrange str {
  // Same slice as get_plaintext, with trailing whitespace trimmed off.
  let text = self.get_plaintext();
  text.trim_end()
}

/// Get the first corresponding DOM node for this range
pub fn get_node(&self) -> RoNode {
self.dnm.back_map[self.start].0
}
pub fn get_node(&self) -> RoNode {
  // The back-map entry at the range start records the owning DOM node.
  let entry = &self.dnm.back_map[self.start];
  entry.0
}

/// Returns a `DNMRange` with the leading and trailing whitespaces removed
pub fn trim(&self) -> DNMRange<'dnmrange> {
Expand Down Expand Up @@ -133,9 +129,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
}

/// checks whether the range is empty
pub fn is_empty(&self) -> bool {
self.start == self.end
}
pub fn is_empty(&self) -> bool {
  // A range whose start and end offsets coincide covers no text.
  self.end == self.start
}

/*
* SERIALIZATION CODE
Expand All @@ -155,9 +149,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
}

/// creates an arange from to xpointers
pub fn create_arange(from: &str, to: &str) -> String {
format!("arange({from},{to})")
}
/// Builds the `arange(from,to)` serialization joining two xpointer strings.
pub fn create_arange(from: &str, to: &str) -> String {
  // Assemble manually with one exact-size allocation instead of format!.
  let mut arange = String::with_capacity("arange(,)".len() + from.len() + to.len());
  arange.push_str("arange(");
  arange.push_str(from);
  arange.push(',');
  arange.push_str(to);
  arange.push(')');
  arange
}

/// Serializes a node and an offset into an xpointer
/// is_end indicates whether the node indicates the end of the interval
Expand Down Expand Up @@ -215,7 +207,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
get_node_number(parent, act, &|n: RoNode| n.get_name() == act.get_name()).unwrap()
)
}
}
},
Some(x) => format!("//*[@id=\"{x}\"]"),
}
}
Expand Down Expand Up @@ -270,7 +262,7 @@ impl<'dnmrange> DNMRange<'dnmrange> {
pos += 1;
}
pos
}
},
Err(_) => get_position_of_lowest_parent(node, dnm),
}
} else {
Expand Down Expand Up @@ -309,7 +301,7 @@ fn get_next_sibling(root_node: RoNode, node: RoNode) -> Option<RoNode> {
} else {
get_next_sibling(root_node, node.get_parent().unwrap())
}
}
},
Some(n) => Some(n),
}
}
Expand All @@ -332,10 +324,10 @@ fn get_node_number(
match cur.get_next_sibling() {
None => {
return Err(());
}
},
Some(n) => {
cur = n;
}
},
}
}
Ok(count)
Expand Down
Loading

0 comments on commit 283be4a

Please sign in to comment.