Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use zstd over gzip when not compiling for WASM #44

Merged
merged 15 commits into from
Jul 1, 2019
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.env
*.bin
*.bin.zstd
*.bin.gz
/Cargo.lock
gh-pages/
Expand Down
12 changes: 7 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,22 @@ exclude = [

[dependencies]
failure = "0.1.5"
flate2 = { version = "1.0.7", features = ["rust_backend"], default_features = false }
lazy_static = "1.3.0"
log = "0.4.6"
regex = "1.1.2"
regex = "1.1.7"
rmp-serde = "0.13.7"
serde = "1.0.89"
serde_derive = "1.0.89"
serde = { version = "1.0.92", features = ["derive"] }
unicode-normalization = "0.1.8"

# spdx deps
serde_json = { version = "1.0.39", optional = true }

[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rayon = "1.0.3"
rayon = "1.1.0"
zstd = "0.4.24+zstd.1.4.0"
Jake-Shadle marked this conversation as resolved.
Show resolved Hide resolved

[target.'cfg(target_arch = "wasm32")'.dependencies]
zstd = { version = "0.4.24+zstd.1.4.0", default-features = false, features = ["wasm"] }

[dev-dependencies]
env_logger = "0.6.1"
Expand Down
127 changes: 78 additions & 49 deletions cli/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ include = [
"/src/**/*",
"/build.rs",
"/Cargo.*",
"/embedded-cache.bin.gz",
"/embedded-cache.bin.zstd",
]

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion cli/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::path::Path;

use askalono::Store;

const EMBEDDED_CACHE: &str = "embedded-cache.bin.gz";
const EMBEDDED_CACHE: &str = "embedded-cache.bin.zstd";

fn main() {
if env::var("CARGO_FEATURE_EMBEDDED_CACHE").is_err() {
Expand Down
1 change: 0 additions & 1 deletion cli/src/formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ impl<'a> FileResult<'a> {
}

fn as_json(&self) -> String {
use serde_json;
serde_json::to_string(self).expect("must produce valid json output")
}
}
Expand Down
6 changes: 3 additions & 3 deletions cli/src/identify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ pub fn identify_data(
score: cr.score,
license: CLIIdentifiedLicense {
aliases: store.aliases(&cr.license.name).unwrap().clone(),
name: cr.license.name.clone(),
name: cr.license.name.to_owned(),
kind: cr.license.kind,
},
line_range: cr.line_range,
Expand All @@ -126,7 +126,7 @@ pub fn identify_data(
if let Some(license) = result.license {
output.license = Some(CLIIdentifiedLicense {
aliases: store.aliases(&license.name).unwrap().clone(),
name: license.name,
name: license.name.to_owned(),
kind: license.kind,
});

Expand All @@ -138,7 +138,7 @@ pub fn identify_data(
}

// not a good enough match overall, but maybe inside
if output.containing.len() > 0 {
if !output.containing.is_empty() {
if want_diff {
diff_result(&text_data, &result.containing[0].license.data);
}
Expand Down
2 changes: 1 addition & 1 deletion cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn main() {

let cache_file: PathBuf = options
.cache
.unwrap_or_else(|| "./askalono-cache.bin.gz".into());
.unwrap_or_else(|| "./askalono-cache.bin.zstd".into());

let output_format = options.format.unwrap_or(OutputFormat::text);

Expand Down
4 changes: 2 additions & 2 deletions examples/annotate-text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ enum Annotation {
fn main() {
let args: Vec<_> = std::env::args().collect();
if args.len() != 2 {
eprintln!("usage: annotate-text cache.bin.gz < input.txt > output.html");
eprintln!("usage: annotate-text cache.bin.zstd < input.txt > output.html");
std::process::exit(1);
}

Expand All @@ -37,7 +37,7 @@ fn main() {
for result in &results.containing {
annotations.insert(
result.line_range.0,
Annotation::Begin(result.license.name.clone()),
Annotation::Begin(result.license.name.to_owned()),
);
annotations.insert(result.line_range.1, Annotation::End);
}
Expand Down
2 changes: 1 addition & 1 deletion src/license.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

use std::{collections::HashMap, fmt};

use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

use crate::{
ngram::NgramSet,
Expand Down
2 changes: 1 addition & 1 deletion src/ngram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{
collections::{hash_map::Iter, HashMap, VecDeque},
};

use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct NgramSet {
Expand Down
2 changes: 2 additions & 0 deletions src/preproc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ fn trim_line(input: &str) -> String {

// Aggressive preprocessors

#[allow(dead_code)]
fn lcs_substr(fstr: &str, sstr: &str) -> Option<String> {
let mut f_chars = fstr.chars();
let mut s_chars = sstr.chars();
Expand Down Expand Up @@ -161,6 +162,7 @@ fn lcs_substr(fstr: &str, sstr: &str) -> Option<String> {
}
}

#[allow(dead_code)]
fn remove_common_tokens(text: &str) -> String {
let lines: Vec<&str> = text.split('\n').collect();
let mut largest_substr = String::new();
Expand Down
83 changes: 40 additions & 43 deletions src/store/analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

use std::{cmp::Ordering, fmt};

use crate::{license::LicenseType, license::TextData, store::base::Store};
use crate::{
license::LicenseType,
license::TextData,
store::base::{LicenseEntry, Store},
};

/// Information about text that was compared against licenses in the store.
///
Expand All @@ -17,7 +21,7 @@ pub struct Match<'a> {
pub score: f32,
/// The name of the closest matching license in the `Store`. This will
/// always be something that exists in the store, regardless of the score.
pub name: String,
Jake-Shadle marked this conversation as resolved.
Show resolved Hide resolved
pub name: &'a str,
/// The type of the license that matched. Useful to know if the match was
/// the complete text, a header, or something else.
pub license_type: LicenseType,
Expand Down Expand Up @@ -59,46 +63,41 @@ impl<'a> fmt::Debug for Match<'a> {
}
}

// this could probably be a stand-alone closure, but I was hitting lifetime
// hell, so a macro it is. feel free to attempt it yourself.
macro_rules! analyze_fold_closure {
($text:ident) => {
|mut acc: Vec<PartialMatch<'_>>, (name, data)| {
acc.push(PartialMatch {
score: data.original.match_score($text),
name,
license_type: LicenseType::Original,
data: &data.original,
});
data.alternates.iter().for_each(|alt| {
acc.push(PartialMatch {
score: alt.match_score($text),
name,
license_type: LicenseType::Alternate,
data: alt,
})
});
data.headers.iter().for_each(|head| {
acc.push(PartialMatch {
score: head.match_score($text),
name,
license_type: LicenseType::Header,
data: head,
})
});
acc
}
};
}

impl Store {
/// Compare the given `TextData` against all licenses in the `Store`.
///
/// This parallelizes the search as much as it can to find the best match.
/// Once a match is obtained, it can be optimized further; see methods on
/// `TextData` for more information.
pub fn analyze(&self, text: &TextData) -> Match<'_> {
let mut res: Vec<PartialMatch<'_>>;
pub fn analyze<'a>(&'a self, text: &TextData) -> Match<'a> {
let mut res: Vec<PartialMatch<'a>>;

let analyze_fold =
Jake-Shadle marked this conversation as resolved.
Show resolved Hide resolved
|mut acc: Vec<PartialMatch<'a>>, (name, data): (&'a String, &'a LicenseEntry)| {
acc.push(PartialMatch {
score: data.original.match_score(text),
name,
license_type: LicenseType::Original,
data: &data.original,
});
data.alternates.iter().for_each(|alt| {
acc.push(PartialMatch {
score: alt.match_score(text),
name,
license_type: LicenseType::Alternate,
data: alt,
})
});
data.headers.iter().for_each(|head| {
acc.push(PartialMatch {
score: head.match_score(text),
name,
license_type: LicenseType::Header,
data: head,
})
});
acc
};

// parallel analysis
#[cfg(not(target_arch = "wasm32"))]
Expand All @@ -107,10 +106,10 @@ impl Store {
res = self
.licenses
.par_iter()
.fold(Vec::new, analyze_fold_closure!(text))
.fold(Vec::new, analyze_fold)
.reduce(
Vec::new,
|mut a: Vec<PartialMatch<'_>>, b: Vec<PartialMatch<'_>>| {
|mut a: Vec<PartialMatch<'a>>, b: Vec<PartialMatch<'a>>| {
a.extend(b);
a
},
Expand All @@ -125,17 +124,15 @@ impl Store {
.licenses
.iter()
// len of licenses isn't strictly correct, but it'll do
.fold(
Vec::with_capacity(self.licenses.len()),
analyze_fold_closure!(text),
);
.fold(Vec::with_capacity(self.licenses.len()), analyze_fold);
res.sort_unstable_by(|a, b| b.partial_cmp(a).unwrap());
}

let m = &res[0];

Match {
score: m.score,
name: m.name.to_string(),
name: m.name,
license_type: m.license_type,
data: m.data,
}
Expand Down
4 changes: 2 additions & 2 deletions src/store/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use std::collections::HashMap;

use failure::{format_err, Error};
use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

use crate::{license::LicenseType, license::TextData};

Expand All @@ -31,7 +31,7 @@ pub(crate) struct LicenseEntry {
/// use askalono::{Store, TextData};
///
/// # fn main() -> Result<(), Box<Error>> {
/// let store = Store::from_cache(File::open("askalono-cache.bin.gz")?)?;
/// let store = Store::from_cache(File::open("askalono-cache.bin.zstd")?)?;
/// let result = store.analyze(&TextData::from("what's this"));
/// # Ok(())
/// # }
Expand Down
44 changes: 20 additions & 24 deletions src/store/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

use std::{io::copy, io::prelude::*};

use failure::{bail, format_err, Error};
use flate2::{read::GzDecoder, Compression, GzBuilder};
use failure::Error;
use log::info;
use rmp_serde::Serializer;
use serde::Serialize;

use crate::store::base::Store;

const CACHE_VERSION: &[u8] = b"askalono-03";
const CACHE_VERSION: &[u8] = b"askalono-04";

impl Store {
/// Create a store from a cache file.
Expand All @@ -21,48 +20,45 @@ impl Store {
/// the full SPDX set from disk in 200-300 ms. The cache will be
/// sanity-checked to ensure it was generated with a similar version of
/// askalono.
pub fn from_cache<R>(readable: R) -> Result<Store, Error>
pub fn from_cache<R>(mut readable: R) -> Result<Store, Error>
where
R: Read + Sized,
{
use rmp_serde::decode::from_read;
let mut header = [0u8; 11];
readable.read_exact(&mut header)?;

let dec = GzDecoder::new(readable);
{
let extra = dec
.header()
.ok_or_else(|| format_err!("cache gzip header invalid"))?
.extra()
.ok_or_else(|| format_err!("cache gzip extra header missing"))?;
if extra != CACHE_VERSION {
bail!("cache version mismatch");
}
if header != CACHE_VERSION {
failure::bail!("cache version mismatch");
}

let store = from_read(dec)?;
let dec = zstd::Decoder::new(readable)?;
let store = rmp_serde::decode::from_read(dec)?;
Ok(store)
}

/// Serialize the current store.
///
/// The output will be a MessagePack'd gzip'd binary stream that should be
/// The output will be a MessagePack'd zstd'd binary stream that should be
/// written to disk.
pub fn to_cache<W>(&self, mut writable: W) -> Result<(), Error>
where
W: Write + Sized,
{
let mut buf = Vec::new();
{
let buf = {
// This currently sits around 3.7MiB, so go up to 4 to fit comfortably
let mut buf = Vec::with_capacity(4 * 1024 * 1024);
let mut serializer = Serializer::new(&mut buf);
self.serialize(&mut serializer)?;
}
buf
};

info!("Pre-compressed output is {} bytes", buf.len());

let mut gz = GzBuilder::new()
.extra(CACHE_VERSION)
.write(&mut writable, Compression::best());
copy(&mut buf.as_slice(), &mut gz)?;
writable.write_all(CACHE_VERSION)?;
let mut zenc = zstd::Encoder::new(writable, 21)?;

copy(&mut buf.as_slice(), &mut zenc)?;
zenc.finish()?;

Ok(())
}
Expand Down
Loading