diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ee60374a..4708dbc3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## Unreleased ### Changes +- Inputs are now enumerated incrementally as scanning proceeds rather than done in an initial batch step ([#216](https://github.com/praetorian-inc/noseyparker/pull/216)). + This reduces peak memory use and CPU time by 10-20%, particularly in environments with slow I/O. + A consequence of this change is that the total amount of data to scan is not known until it has actually been scanned, and so the scanning progress bar no longer shows a completion percentage. -- When scanning, the progress bar for cloning Git repositories now includes the current repository URL ([#212](https://github.com/praetorian-inc/noseyparker/pull/212)). +- When cloning Git repositories while scanning, the progress bar now includes the current repository URL ([#212](https://github.com/praetorian-inc/noseyparker/pull/212)). - When scanning, automatically cloned Git repositories are now recorded with the path given on the command line instead of the canonicalized path ([#212](https://github.com/praetorian-inc/noseyparker/pull/212)). This makes datastores slightly more portable across different environments, such as within a Docker container and on the host machine, as relative paths can now be recorded. @@ -20,11 +23,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - The built-in support for enumerating and interacting with GitHub is now a compile time-selectable feature that is enabled by default ([#213](https://github.com/praetorian-inc/noseyparker/pull/213)). This makes it possible to build a slimmer release for environments where GitHub functionality is unused. +### Fixes + +- The `Google OAuth Credentials` rule has been revised to avoid runtime errors about an empty capture group. + ## [v0.19.0](https://github.com/praetorian-inc/noseyparker/releases/v0.19.0) (2024-07-30) ### Additions - - The `scan` and `github repos list` commands offer a new `--github-repo-type={all,source,fork}` option to select a subset of repositories ([#204](https://github.com/praetorian-inc/noseyparker/pull/204)). - A category mechanism is now provided for rules ([#208](https://github.com/praetorian-inc/noseyparker/pull/208)). @@ -42,7 +48,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The category information is included in output in the `rules list` command. ### Changes - - The `scan` and `github repos list` commands now only consider non-forked repositories by default ([#204](https://github.com/praetorian-inc/noseyparker/pull/204)). This behavior can be reverted to the previous behavior using the `--github-repo-type=all` option. @@ -84,7 +89,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [v0.18.1](https://github.com/praetorian-inc/noseyparker/releases/v0.18.1) (2024-07-12) ### Fixes - - Nosey Parker no longer crashes upon startup when running in environments with less than 4 GiB of RAM ([#202](https://github.com/praetorian-inc/noseyparker/pull/202)). - The `Base64-PEM-Encoded Private Key` rule has been refined to reduce false positives and avoid a rare performance pitfall.
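The changelog entry above describes the central architectural change in this diff: enumeration results are streamed over a bounded `crossbeam-channel` into the rayon-based scanning workers (see `FilesystemEnumerator::run` taking a `Sender` and the `enum_recv.into_iter().par_bridge()` call in `cmd_scan.rs` below) instead of being collected into one in-memory batch first. The following is a minimal sketch of that producer/consumer shape using the same `crossbeam-channel` and `rayon` crates; the item type (`String`), the channel capacity, and the per-item work are placeholders, not the real `EnumeratorResult` type or the `max(args.num_jobs * 16, 128)` sizing used in the diff.

```rust
use crossbeam_channel::bounded;
use rayon::iter::{ParallelBridge, ParallelIterator};

fn main() {
    // A bounded channel applies backpressure: the enumerator can only run a fixed
    // number of items ahead of the scanners, which caps peak memory use.
    let (send, recv) = bounded::<String>(128);

    // Producer: enumerate inputs on a dedicated thread and stream them out.
    let enumerator = std::thread::spawn(move || {
        for i in 0..1_000 {
            send.send(format!("input-{i}")).expect("receiver still alive");
        }
        // `send` is dropped here, closing the channel so the consumers terminate.
    });

    // Consumers: scan items in parallel as they arrive. Because the total amount of
    // work is unknown up front, progress can only be reported as a spinner/counter,
    // not as a completion percentage.
    recv.into_iter().par_bridge().for_each(|item| {
        let _ = item.len(); // stand-in for scanning the input
    });

    enumerator.join().unwrap();
}
```

This is also why the scanning progress bar in the diff switches from `Progress::new_bytes_bar` to `Progress::new_bytes_spinner`: there is no longer an up-front byte total to measure completion against.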
@@ -93,7 +97,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [v0.18.0](https://github.com/praetorian-inc/noseyparker/releases/v0.18.0) (2024-06-27) ### Additions - - The README now includes several animated GIFs that demonstrate simple example use cases ([#154](https://github.com/praetorian-inc/noseyparker/pull/154)). - The `report` command now offers a new `--finding-status=STATUS` filtering option ([#162](https://github.com/praetorian-inc/noseyparker/pull/162)). @@ -151,7 +154,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), This helps avoid complete loss of scan results in the rare event of a crash. ### Fixes - - A rare crash when parsing malformed Git commit timestamps has been fixed by updating the `gix-date` dependency ([#185](https://github.com/praetorian-inc/noseyparker/pull/185)). - Upon `noseyparker` startup, if resource limits cannot be adjusted, instead of crashing, a warning is printed and the process attempts to continue ([#170](https://github.com/praetorian-inc/noseyparker/issues/170)). diff --git a/Cargo.lock b/Cargo.lock index e58aef8fd..7b0dc13db 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2164,12 +2164,12 @@ dependencies = [ "anyhow", "bstr", "bstring-serde", + "crossbeam-channel", "fixedbitset 0.5.7", "gix", "ignore", "petgraph", "pretty_assertions", - "progress", "roaring", "schemars", "serde", @@ -4144,21 +4144,22 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vectorscan-rs" -version = "0.0.2" +version = "0.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "077f7330215aebade732a3bb3368a216b2dd51026dd4d0e6fb2164a7e6124e61" +checksum = "0597e8f38ea3b288d842860aa1d0af3e73be1b2a284c117f39e8f653361d0522" dependencies = [ "bitflags 2.6.0", "foreign-types 0.5.0", + "libc", "thiserror", "vectorscan-rs-sys", ] [[package]] name = "vectorscan-rs-sys" -version = "0.0.2" +version = "0.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1373a5cd584a1ac79045d9399b550cdb539769e0f328f19723260628905b7de2" +checksum = "53a944ddd15a3b14af5e881a11cf0b0845ec8d1b6367a35f5f0f17da976e2241" dependencies = [ "cmake", "flate2", diff --git a/crates/input-enumerator/Cargo.toml b/crates/input-enumerator/Cargo.toml index 1751830a4..cd3912e67 100644 --- a/crates/input-enumerator/Cargo.toml +++ b/crates/input-enumerator/Cargo.toml @@ -13,11 +13,11 @@ publish.workspace = true anyhow = { version = "1.0" } bstr = { version = "1.0", features = ["serde"] } bstring-serde = { path = "../bstring-serde" } +crossbeam-channel = "0.5" fixedbitset = "0.5" gix = { version = "0.64", features = ["max-performance", "serde"] } ignore = "0.4" petgraph = "0.6" -progress = { path = "../progress" } roaring = "0.10" schemars = { version = "0.8" } serde = { version = "1.0", features = ["derive"] } diff --git a/crates/input-enumerator/src/git_metadata_graph.rs b/crates/input-enumerator/src/git_metadata_graph.rs index 5b40c8cce..a3ef72c4f 100644 --- a/crates/input-enumerator/src/git_metadata_graph.rs +++ b/crates/input-enumerator/src/git_metadata_graph.rs @@ -13,7 +13,6 @@ use std::time::Instant; use tracing::{debug, error, warn}; use crate::bstring_table::BStringTable; -use progress::Progress; type Symbol = crate::bstring_table::Symbol; @@ -310,7 +309,7 @@ pub struct RepoMetadata { } impl GitMetadataGraph { - pub fn repo_metadata(&self, progress: &Progress) -> Result> { + pub fn get_repo_metadata(&self) -> Result> { let t1 = 
Instant::now(); let symbols = &self.symbols; let cg = &self.commits; @@ -511,17 +510,15 @@ impl GitMetadataGraph { assert_eq!(num_commits_visited, num_commits); assert_eq!(visited_commits.len(), num_commits); - progress.suspend(|| { - debug!( - "{num_commits_visited} commits visited; \ - {max_frontier_size} max entries in frontier; \ - {max_live_seen_sets} max live seen sets; \ - {num_trees_introduced} trees introduced; \ - {num_blobs_introduced} blobs introduced; \ - {:.6}s", - t1.elapsed().as_secs_f64() - ); - }); + debug!( + "{num_commits_visited} commits visited; \ + {max_frontier_size} max entries in frontier; \ + {max_live_seen_sets} max live seen sets; \ + {num_trees_introduced} trees introduced; \ + {num_blobs_introduced} blobs introduced; \ + {:.6}s", + t1.elapsed().as_secs_f64() + ); // Massage intermediate accumulated results into output format let commit_metadata = cg diff --git a/crates/input-enumerator/src/git_repo_enumerator.rs b/crates/input-enumerator/src/git_repo_enumerator.rs index 774d99df0..62b722be9 100644 --- a/crates/input-enumerator/src/git_repo_enumerator.rs +++ b/crates/input-enumerator/src/git_repo_enumerator.rs @@ -3,15 +3,14 @@ use gix::{hashtable::HashMap, ObjectId, OdbHandle, Repository}; use ignore::gitignore::Gitignore; use smallvec::SmallVec; use std::path::{Path, PathBuf}; +use std::time::Instant; // use std::time::Instant; -use tracing::{error_span, warn}; +use tracing::{debug, debug_span, warn}; use crate::blob_appearance::{BlobAppearance, BlobAppearanceSet}; use crate::git_commit_metadata::CommitMetadata; use crate::git_metadata_graph::GitMetadataGraph; -use progress::Progress; - // ------------------------------------------------------------------------------------------------- // implementation helpers // ------------------------------------------------------------------------------------------------- @@ -43,8 +42,9 @@ pub struct ObjectCounts { } // TODO: measure how helpful or pointless it is to count the objects in advance -// FIXME: if keeping the pre-counting step, add some new kind of progress indicator -fn count_git_objects(odb: &OdbHandle, progress: &Progress) -> Result { +fn count_git_objects(odb: &OdbHandle) -> Result { + let t1 = Instant::now(); + use gix::object::Kind; use gix::odb::store::iter::Ordering; use gix::prelude::*; @@ -52,17 +52,17 @@ fn count_git_objects(odb: &OdbHandle, progress: &Progress) -> Result num_commits += 1, @@ -71,6 +71,9 @@ fn count_git_objects(odb: &OdbHandle, progress: &Progress) -> Result {} } } + + debug!("Counted {num_objects} objects in {:.6}s", t1.elapsed().as_secs_f64()); + Ok(ObjectCounts { num_commits, num_trees, @@ -129,25 +132,18 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { } } - pub fn run(&self, progress: &mut Progress) -> Result { + pub fn run(&self) -> Result { + let t1 = Instant::now(); + use gix::object::Kind; use gix::odb::store::iter::Ordering; use gix::prelude::*; - let _span = error_span!("enumerate_git_with_metadata", "{}", self.path.display()).entered(); - - macro_rules! warn { - ($($arg:expr),*) => { - progress.suspend(|| { - tracing::warn!($($arg),*); - }) - } - } + let _span = debug_span!("enumerate_git_with_metadata", "{}", self.path.display()).entered(); let odb = &self.repo.objects; // TODO: measure how helpful or pointless it is to count the objects in advance - // FIXME: if keeping the pre-counting step, add some new kind of progress indicator // First count the objects to figure out how big to allocate data structures. 
// We're assuming that the repository doesn't change in the meantime. @@ -156,7 +152,7 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { num_commits, num_trees, num_blobs, - } = count_git_objects(odb, progress)?; + } = count_git_objects(odb)?; let mut blobs: Vec<(ObjectId, u64)> = Vec::with_capacity(num_blobs); let mut metadata_graph = GitMetadataGraph::with_capacity(num_commits, num_trees, num_blobs); @@ -183,7 +179,6 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { let obj_size = hdr.size(); metadata_graph.get_blob_idx(oid); blobs.push((oid, obj_size)); - progress.inc(obj_size); } Kind::Commit => { @@ -244,8 +239,10 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { } } + debug!("Built metadata graph in {:.6}s", t1.elapsed().as_secs_f64()); + let path = self.path.to_owned(); - match metadata_graph.repo_metadata(progress) { + match metadata_graph.get_repo_metadata() { Err(e) => { warn!("Failed to compute reachable blobs; ignoring metadata: {e}"); let blobs = blobs @@ -281,10 +278,8 @@ impl<'a> GitRepoWithMetadataEnumerator<'a> { // Build blobs result set. // // Apply any path-based ignore rules to blobs here, like the filesystem enumerator, - // filtering out blobs that have paths to ignore. - // - // Note that the behavior of ignoring blobs from Git repositories may be - // surprising. + // filtering out blobs that have paths to ignore. Note that the behavior of + // ignoring blobs from Git repositories may be surprising: // // A blob may appear within a Git repository under many different paths. // Nosey Parker doesn't compute the *entire* set of paths that each blob @@ -375,20 +370,12 @@ impl<'a> GitRepoEnumerator<'a> { Self { path, repo } } - pub fn run(&self, progress: &mut Progress) -> Result { + pub fn run(&self) -> Result { use gix::object::Kind; use gix::odb::store::iter::Ordering; use gix::prelude::*; - let _span = error_span!("enumerate_git", "{}", self.path.display()).entered(); - - macro_rules! 
warn { - ($($arg:expr),*) => { - progress.suspend(|| { - tracing::warn!($($arg),*); - }) - } - } + let _span = debug_span!("enumerate_git", "{}", self.path.display()).entered(); let odb = &self.repo.objects; @@ -406,7 +393,6 @@ impl<'a> GitRepoEnumerator<'a> { if hdr.kind() == Kind::Blob { let obj_size = hdr.size(); blobs.push((oid, obj_size)); - progress.inc(obj_size); } } diff --git a/crates/input-enumerator/src/lib.rs b/crates/input-enumerator/src/lib.rs index 2074965eb..2a0817980 100644 --- a/crates/input-enumerator/src/lib.rs +++ b/crates/input-enumerator/src/lib.rs @@ -4,30 +4,21 @@ pub mod git_commit_metadata; pub mod git_metadata_graph; use anyhow::Result; +use crossbeam_channel::Sender; use ignore::{ gitignore::{Gitignore, GitignoreBuilder}, DirEntry, WalkBuilder, WalkState, }; use std::path::{Path, PathBuf}; -use std::sync::Mutex; +use std::time::Instant; use tracing::{debug, error, warn}; -use progress::Progress; - mod git_repo_enumerator; pub use git_repo_enumerator::{GitRepoEnumerator, GitRepoResult, GitRepoWithMetadataEnumerator}; -pub struct FilesystemEnumeratorResult { - pub files: Vec, - pub git_repos: Vec, -} - -impl FilesystemEnumeratorResult { - pub fn total_blob_bytes(&self) -> u64 { - let git_blob_bytes: u64 = self.git_repos.iter().map(|e| e.total_blob_bytes()).sum(); - let file_bytes: u64 = self.files.iter().map(|e| e.num_bytes).sum(); - git_blob_bytes + file_bytes - } +pub enum EnumeratorResult { + File(FileResult), + GitRepo(GitRepoResult), } pub struct FileResult { @@ -35,6 +26,8 @@ pub struct FileResult { pub num_bytes: u64, } +pub type Output = Sender; + // ------------------------------------------------------------------------------------------------- // VisitorBuilder // ------------------------------------------------------------------------------------------------- @@ -42,13 +35,8 @@ struct VisitorBuilder<'t> { max_file_size: Option, collect_git_metadata: bool, enumerate_git_history: bool, - - global_files: &'t Mutex>, - global_git_repos: &'t Mutex>, - gitignore: &'t Gitignore, - - progress: &'t Progress, + output: &'t Output, } impl<'s, 't> ignore::ParallelVisitorBuilder<'s> for VisitorBuilder<'t> @@ -60,12 +48,8 @@ where max_file_size: self.max_file_size, collect_git_metadata: self.collect_git_metadata, enumerate_git_history: self.enumerate_git_history, - local_files: Vec::new(), - local_git_repos: Vec::new(), - global_files: self.global_files, - global_git_repos: self.global_git_repos, gitignore: self.gitignore, - progress: self.progress.clone(), + output: self.output, }) } } @@ -77,16 +61,8 @@ struct Visitor<'t> { collect_git_metadata: bool, enumerate_git_history: bool, max_file_size: Option, - - local_files: Vec, - local_git_repos: Vec, - - global_files: &'t Mutex>, - global_git_repos: &'t Mutex>, - gitignore: &'t Gitignore, - - progress: Progress, + output: &'t Output, } impl<'t> Visitor<'t> { @@ -94,18 +70,13 @@ impl<'t> Visitor<'t> { fn file_too_big(&self, size: u64) -> bool { self.max_file_size.map_or(false, |max_size| size > max_size) } -} -impl<'t> Drop for Visitor<'t> { - fn drop(&mut self) { - self.global_files - .lock() - .unwrap() - .extend(std::mem::take(&mut self.local_files)); - self.global_git_repos - .lock() - .unwrap() - .extend(std::mem::take(&mut self.local_git_repos)); + fn found_file(&mut self, r: FileResult) { + self.output.send(EnumeratorResult::File(r)).unwrap(); + } + + fn found_git_repo(&mut self, r: GitRepoResult) { + self.output.send(EnumeratorResult::GitRepo(r)).unwrap(); } } @@ -115,7 +86,7 @@ impl<'t> 
ignore::ParallelVisitor for Visitor<'t> { let entry = match result { Err(e) => { - error!("Failed to get entry: {}; skipping", e); + error!("Failed to get entry: {e}; skipping"); return WalkState::Skip; } Ok(v) => v, @@ -137,8 +108,7 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> { if self.file_too_big(bytes) { debug!("Skipping {}: size {bytes} exceeds max size", path.display()); } else { - self.progress.inc(bytes); - self.local_files.push(FileResult { + self.found_file(FileResult { path, num_bytes: bytes, }); @@ -154,7 +124,8 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> { return WalkState::Skip; } Ok(Some(repository)) => { - debug!("Found Git repo at {}", path.display()); + let t1 = Instant::now(); + debug!("Found Git repository at {}", path.display()); if self.collect_git_metadata { let enumerator = GitRepoWithMetadataEnumerator::new( @@ -162,29 +133,34 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> { &repository, &self.gitignore, ); - match enumerator.run(&mut self.progress) { + match enumerator.run() { Err(e) => { error!( "Failed to enumerate Git repository at {}: {e}; skipping", - path.display() + path.display(), ); return WalkState::Skip; } - Ok(r) => self.local_git_repos.push(r), + Ok(r) => self.found_git_repo(r), } } else { let enumerator = GitRepoEnumerator::new(path, &repository); - match enumerator.run(&mut self.progress) { + match enumerator.run() { Err(e) => { error!( "Failed to enumerate Git repository at {}: {e}; skipping", - path.display() + path.display(), ); return WalkState::Skip; } - Ok(r) => self.local_git_repos.push(r), + Ok(r) => self.found_git_repo(r), } } + debug!( + "Enumerated Git repository at {} in {:.6}s", + path.display(), + t1.elapsed().as_secs_f64() + ); } Ok(None) => {} } @@ -322,30 +298,22 @@ impl FilesystemEnumerator { self } - pub fn run(&self, progress: &Progress) -> Result { - let files = Mutex::new(Vec::new()); - let git_repos = Mutex::new(Vec::new()); - + pub fn run(&self, output: Output) -> Result<()> { let gitignore = self.gitignore_builder.build()?; let mut visitor_builder = VisitorBuilder { collect_git_metadata: self.collect_git_metadata, enumerate_git_history: self.enumerate_git_history, max_file_size: self.max_file_size, - global_files: &files, - global_git_repos: &git_repos, gitignore: &gitignore, - progress, + output: &output, }; self.walk_builder .build_parallel() .visit(&mut visitor_builder); - let files = files.into_inner()?; - let git_repos = git_repos.into_inner().unwrap(); - - Ok(FilesystemEnumeratorResult { files, git_repos }) + Ok(()) } } diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index 90b2b6dc0..cf0610f3c 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -103,7 +103,7 @@ tracing = "0.1" tracing-log = "0.2" tracing-subscriber = { version = "0.3", features = ["tracing-log", "ansi", "env-filter", "smallvec", "fmt"], default-features = false } url = "2.3" -vectorscan-rs = { version = "0.0.2" } +vectorscan-rs = { version = "0.0.3" } [dev-dependencies] assert_cmd = { version = "2.0", features = ["color-auto"] } diff --git a/crates/noseyparker-cli/src/cmd_rules/cmd_rules_check.rs b/crates/noseyparker-cli/src/cmd_rules/cmd_rules_check.rs index bae1b0be5..84ff37f6a 100644 --- a/crates/noseyparker-cli/src/cmd_rules/cmd_rules_check.rs +++ b/crates/noseyparker-cli/src/cmd_rules/cmd_rules_check.rs @@ -1,7 +1,7 @@ use anyhow::{bail, Context, Result}; use regex::Regex; use std::collections::HashSet; -use tracing::{debug_span, error, error_span, info, 
warn}; +use tracing::{debug_span, error, info, warn}; use vectorscan_rs::{BlockDatabase, Flag, Pattern, Scan}; use noseyparker::rules_database::RulesDatabase; @@ -108,7 +108,7 @@ pub fn run(_global_args: &GlobalArgs, args: &RulesCheckArgs) -> Result<()> { // - all referenced rules are unique { for (ruleset_num, ruleset) in rulesets.iter().enumerate() { - let _span = error_span!("ruleset", "{}:{}", ruleset_num + 1, ruleset.name).entered(); + let _span = debug_span!("ruleset", "{}:{}", ruleset_num + 1, ruleset.name).entered(); if let Err(e) = loaded.resolve_ruleset_rules(ruleset) { error!("Failed to resolve rules: {e}"); num_errors += 1; @@ -201,7 +201,7 @@ struct CheckStats { fn check_rule(rule_num: usize, rule: &Rule) -> Result { let syntax = rule.syntax(); - let _span = error_span!("rule", "{}:{}", rule_num + 1, syntax.name).entered(); + let _span = debug_span!("rule", "{}:{}", rule_num + 1, syntax.name).entered(); let mut num_warnings = 0; let mut num_errors = 0; diff --git a/crates/noseyparker-cli/src/cmd_scan.rs b/crates/noseyparker-cli/src/cmd_scan.rs index 25da1a308..6d5c7b624 100644 --- a/crates/noseyparker-cli/src/cmd_scan.rs +++ b/crates/noseyparker-cli/src/cmd_scan.rs @@ -4,12 +4,12 @@ use rayon::prelude::*; use std::path::Path; use std::sync::Mutex; use std::time::{Duration, Instant}; -use tracing::{debug, error, error_span, info, trace, warn}; +use tracing::{debug, debug_span, error, info, trace, warn}; use crate::{args, rule_loader::RuleLoader}; use content_guesser::Guesser; -use input_enumerator::{open_git_repo, FileResult, FilesystemEnumerator}; +use input_enumerator::{open_git_repo, EnumeratorResult, FilesystemEnumerator}; use progress::Progress; use noseyparker::blob::{Blob, BlobId}; @@ -239,129 +239,118 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> } // --------------------------------------------------------------------------------------------- - // Enumerate initial filesystem inputs + // Enumerate inputs and scan // --------------------------------------------------------------------------------------------- - let inputs = { - let mut progress = Progress::new_bytes_spinner("Enumerating inputs...", progress_enabled); - - let input_enumerator = || -> Result { - let mut ie = FilesystemEnumerator::new(&input_roots)?; - ie.threads(args.num_jobs); - ie.max_filesize(args.content_filtering_args.max_file_size_bytes()); - if args.input_specifier_args.git_history == args::GitHistoryMode::None { - ie.enumerate_git_history(false); - } + let scan_start = Instant::now(); - // Load default ignore file. Note that we have to write it to a file first, - // because the API for the `ignore` crate doesn't expose something that takes a - // string. 
- let ignore_path = datastore.scratch_dir().join("default_ignore_rules.conf"); - std::fs::write(&ignore_path, DEFAULT_IGNORE_RULES).with_context(|| { - format!("Failed to write default ignore rules to {}", ignore_path.display()) - })?; + let (enum_send, enum_recv) = { + let channel_size = std::cmp::max(args.num_jobs * 16, 128); + crossbeam_channel::bounded(channel_size) + }; + let input_enumerator = || -> Result { + let mut ie = FilesystemEnumerator::new(&input_roots)?; + ie.threads(args.num_jobs); + ie.max_filesize(args.content_filtering_args.max_file_size_bytes()); + if args.input_specifier_args.git_history == args::GitHistoryMode::None { + ie.enumerate_git_history(false); + } - ie.add_ignore(&ignore_path).with_context(|| { - format!("Failed to load ignore rules from {}", ignore_path.display()) - })?; + // Load default ignore file. Note that we have to write it to a file first, + // because the API for the `ignore` crate doesn't expose something that takes a + // string. + let ignore_path = datastore.scratch_dir().join("default_ignore_rules.conf"); + std::fs::write(&ignore_path, DEFAULT_IGNORE_RULES).with_context(|| { + format!("Failed to write default ignore rules to {}", ignore_path.display()) + })?; - // Load any specified ignore files - for ignore_path in args.content_filtering_args.ignore.iter() { - debug!("Using ignore rules from {}", ignore_path.display()); - ie.add_ignore(ignore_path).with_context(|| { - format!("Failed to load ignore rules from {}", ignore_path.display()) - })?; - } + ie.add_ignore(&ignore_path).with_context(|| { + format!("Failed to load ignore rules from {}", ignore_path.display()) + })?; - // Make sure the datastore itself is not scanned - let datastore_path = std::fs::canonicalize(datastore.root_dir())?; - ie.filter_entry(move |entry| { - let path = match std::fs::canonicalize(entry.path()) { - Err(e) => { - warn!("Failed to canonicalize path {}: {}", entry.path().display(), e); - return true; - } - Ok(p) => p, - }; - path != datastore_path - }); + // Load any specified ignore files + for ignore_path in args.content_filtering_args.ignore.iter() { + debug!("Using ignore rules from {}", ignore_path.display()); + ie.add_ignore(ignore_path).with_context(|| { + format!("Failed to load ignore rules from {}", ignore_path.display()) + })?; + } - // Determine whether to collect git metadata or not - let collect_git_metadata = match args.metadata_args.git_blob_provenance { - args::GitBlobProvenanceMode::FirstSeen => true, - args::GitBlobProvenanceMode::Minimal => false, + // Make sure the datastore itself is not scanned + let datastore_path = std::fs::canonicalize(datastore.root_dir())?; + ie.filter_entry(move |entry| { + let path = match std::fs::canonicalize(entry.path()) { + Err(e) => { + warn!("Failed to canonicalize path {}: {}", entry.path().display(), e); + return true; + } + Ok(p) => p, }; - ie.collect_git_metadata(collect_git_metadata); - - Ok(ie) - }() - .context("Failed to initialize filesystem enumerator")?; - - let inputs = input_enumerator - .run(&progress) - .context("Failed to enumerate filesystem inputs")?; - let total_bytes_found: u64 = { - let blob_bytes: u64 = inputs.git_repos.iter().map(|r| r.total_blob_bytes()).sum(); - let file_bytes: u64 = inputs.files.iter().map(|e| e.num_bytes).sum(); - blob_bytes + file_bytes + path != datastore_path + }); + + // Determine whether to collect git metadata or not + let collect_git_metadata = match args.metadata_args.git_blob_provenance { + args::GitBlobProvenanceMode::FirstSeen => true, + 
args::GitBlobProvenanceMode::Minimal => false, }; - progress.finish_with_message(format!( - "Found {} from {} plain {} and {} blobs from {} Git {}", - HumanBytes(total_bytes_found), - HumanCount(inputs.files.len() as u64), - if inputs.files.len() == 1 { - "file" - } else { - "files" - }, - HumanCount(inputs.git_repos.iter().map(|r| r.num_blobs()).sum()), - HumanCount(inputs.git_repos.len() as u64), - if inputs.git_repos.len() == 1 { - "repo" - } else { - "repos" - }, - )); - inputs - }; + ie.collect_git_metadata(collect_git_metadata); + + Ok(ie) + }() + .context("Failed to initialize filesystem enumerator")?; + + // Kick off enumerator in a separate thread so that scanning can proceed simultaneously + let input_enumerator_thread = std::thread::Builder::new() + .name("input_enumerator".to_string()) + .spawn(move || -> Result<_> { input_enumerator.run(enum_send) }) + .context("Failed to enumerate filesystem inputs")?; // --------------------------------------------------------------------------------------------- // Define some matcher helper code and shared state // --------------------------------------------------------------------------------------------- - let scan_start = Instant::now(); - let total_blob_bytes = inputs.total_blob_bytes(); - - let num_matchers_counter = Mutex::new(0u64); // how many matchers have been initialized? + let num_blob_processors = Mutex::new(0u64); // how many blob processors have been initialized? let matcher_stats = Mutex::new(MatcherStats::default()); let seen_blobs = BlobIdMap::new(); - let make_matcher = || -> Result<(Matcher, Guesser)> { - *num_matchers_counter.lock().unwrap() += 1; - let matcher = Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats))?; + let t1 = Instant::now(); + let matcher = Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats))?; + let blob_processor_init_time = Mutex::new(t1.elapsed()); + let make_blob_processor = || -> Result { + let t1 = Instant::now(); + let matcher = matcher.clone(); + *num_blob_processors.lock().unwrap() += 1; let guesser = Guesser::new()?; - Ok((matcher, guesser)) + { + let mut init_time = blob_processor_init_time.lock().unwrap(); + *init_time = *init_time + t1.elapsed(); + } + Ok(BlobProcessor { matcher, guesser }) }; // FIXME: have this print out aggregate rate at finish - let mut progress = - Progress::new_bytes_bar(total_blob_bytes, "Scanning content", progress_enabled); + let mut progress = Progress::new_bytes_spinner("Scanning content", progress_enabled); // FIXME: expose the following as a CLI parameter - const BATCH_SIZE: usize = 16 * 1024; - const COMMIT_INTERVAL: Duration = Duration::from_secs(1); - - // Create a channel pair for matcher threads to get their results to the datastore recorder. - let channel_size = std::cmp::max(args.num_jobs * BATCH_SIZE, 64 * BATCH_SIZE); - let (send_ds, recv_ds) = crossbeam_channel::bounded::(channel_size); + const DATASTORE_BATCH_SIZE: usize = 16 * 1024; + const DATASTORE_COMMIT_INTERVAL: Duration = Duration::from_secs(1); + + // Create a channel pair for processor threads to get their results to the datastore recorder. + let (send_ds, recv_ds) = { + let channel_size = + std::cmp::max(args.num_jobs * DATASTORE_BATCH_SIZE, 64 * DATASTORE_BATCH_SIZE); + crossbeam_channel::bounded::(channel_size) + }; let blobs_dir = datastore.blobs_dir(); - // We create a separate thread for writing matches to the datastore. + // --------------------------------------------------------------------------------------------- + // Create datastore writer thread. 
// The datastore uses SQLite, which does best with a single writer. + // --------------------------------------------------------------------------------------------- let datastore_writer_thread = std::thread::Builder::new() .name("datastore".to_string()) .spawn(move || -> Result<_> { - let _span = error_span!("datastore", dir = datastore.root_dir().display().to_string()) - .entered(); + let _span = debug_span!("datastore", "{}", datastore.root_dir().display()).entered(); let mut total_recording_time: std::time::Duration = Default::default(); let mut num_matches_added: u64 = 0; @@ -372,7 +361,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // Record all messages chunked transactions, trying to commit at least every // COMMIT_ITERVAL. - let mut batch: Vec = Vec::with_capacity(BATCH_SIZE); + let mut batch: Vec = Vec::with_capacity(DATASTORE_BATCH_SIZE); let mut matches_in_batch: usize = 0; let mut last_commit_time = Instant::now(); @@ -381,9 +370,9 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> matches_in_batch += message.2.len(); batch.push(message); - if batch.len() >= BATCH_SIZE - || matches_in_batch >= BATCH_SIZE - || last_commit_time.elapsed() >= COMMIT_INTERVAL + if batch.len() >= DATASTORE_BATCH_SIZE + || matches_in_batch >= DATASTORE_BATCH_SIZE + || last_commit_time.elapsed() >= DATASTORE_COMMIT_INTERVAL { let t1 = std::time::Instant::now(); let batch_len = batch.len(); @@ -441,153 +430,43 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> Ok((datastore, num_matches, num_matches_added)) })?; - // A function to be immediately called, to allow syntactic simplification of error propagation - let scan_inner = || -> Result<()> { - // --------------------------------------------------------------------------------------------- - // Scan plain files - // --------------------------------------------------------------------------------------------- - inputs.files.par_iter().try_for_each_init( - || -> Result<_> { - let matcher = make_matcher()?; - - Ok((matcher, progress.clone())) + // --------------------------------------------------------------------------------------------- + // Scan enumerated inputs + // + // Kick off scanner threads, but for better error messages, don't check its result until after + // checking the datastore writer thread. 
+ // --------------------------------------------------------------------------------------------- + let scan_res: Result<()> = enum_recv + .into_iter() + .par_bridge() + .try_for_each_init( + || -> Result<(BlobProcessor<'_>, Progress)> { + let processor = make_blob_processor()?; + Ok((processor, progress.clone())) }, - |state: &mut Result<_>, file_result: &FileResult| -> Result<()> { - let _span = error_span!("file-scan", path = file_result.path.display().to_string()) - .entered(); - - let (matcher, progress) = match state { + move |state: &mut Result<_>, enum_result: EnumeratorResult| -> Result<()> { + let (processor, progress): &mut (BlobProcessor<'_>, Progress) = match state { Ok(state) => state, Err(e) => bail!("Failed to initialize worker: {e}"), }; - let fname = &file_result.path; - let blob = match Blob::from_file(fname) { - Err(e) => { - error!("Failed to load blob from {}: {}", fname.display(), e); - return Ok(()); - } - Ok(v) => v, - }; - progress.inc(file_result.num_bytes); - - run_matcher( - matcher, - ProvenanceSet::new(Provenance::from_file(fname.clone()), Vec::new()), - blob, - &send_ds, - args.snippet_length, - args.metadata_args.blob_metadata, - args.copy_blobs, - &blobs_dir, - progress, - )?; - - Ok(()) - }, - )?; - - // --------------------------------------------------------------------------------------------- - // Scan Git repo inputs - // --------------------------------------------------------------------------------------------- - inputs - .git_repos - .into_par_iter() - .try_for_each(|git_repo_result| -> Result<()> { - let span = - error_span!("git-scan", repo_path = git_repo_result.path.display().to_string()); - let _span = span.enter(); - - let repository = match open_git_repo(&git_repo_result.path) { - Ok(Some(repository)) => repository.into_sync(), - Ok(None) => { - error!( - "Failed to re-open previously-found repository at {}", - git_repo_result.path.display() - ); - return Ok(()); - } - Err(err) => { - error!( - "Failed to re-open previously-found repository at {}: {err}", - git_repo_result.path.display() - ); - return Ok(()); - } - }; - - git_repo_result.blobs.into_par_iter().try_for_each_init( - || -> Result<_> { - let _span = span.enter(); - let repo = repository.to_thread_local(); - let matcher = make_matcher()?; - Ok((repo, matcher, progress.clone())) - }, - |state: &mut Result<_>, md| -> Result<()> { - let _span = span.enter(); - let (repo, matcher, progress) = match state { - Ok(state) => state, - Err(e) => bail!("Failed to initialize worker: {e}"), - }; + match enum_result { + EnumeratorResult::File(file_result) => { + let _span = debug_span!("file-scan", "{}", file_result.path.display()) + .entered(); - let size = md.num_bytes; - let blob_id = md.blob_oid; - progress.inc(size); - let repo_path = &git_repo_result.path; - - let blob = match repo.find_object(blob_id) { + let fname = &file_result.path; + let blob = match Blob::from_file(fname) { Err(e) => { - error!( - "Failed to read blob {blob_id} from Git repository at {}: {e}", - repo_path.display(), - ); + error!("Failed to load blob from {}: {}", fname.display(), e); return Ok(()); } - Ok(mut blob) => { - let data = std::mem::take(&mut blob.data); // avoid a copy - Blob::new(BlobId::from(&blob_id), data) - } + Ok(v) => v, }; + progress.inc(file_result.num_bytes); - let provenance = { - let mut it = md.first_seen.iter(); - if let Some(e) = it.next() { - let commit_metadata = git_repo_result - .commit_metadata - .get(&e.commit_oid) - .expect("should have commit metadata"); - let p = 
Provenance::from_git_repo_with_first_commit( - repo_path.clone(), - commit_metadata.clone(), - e.path.clone(), - ); - - let ps = it - .map(|e| { - let commit_metadata = git_repo_result - .commit_metadata - .get(&e.commit_oid) - .expect("should have commit metadata"); - Provenance::from_git_repo_with_first_commit( - repo_path.clone(), - commit_metadata.clone(), - e.path.clone(), - ) - }) - .collect(); - - ProvenanceSet::new(p, ps) - } else { - ProvenanceSet::new( - Provenance::from_git_repo(repo_path.clone()), - Vec::new(), - ) - } - }; - - run_matcher( - matcher, - provenance, + processor.run( + ProvenanceSet::new(Provenance::from_file(fname.clone()), Vec::new()), blob, &send_ds, args.snippet_length, @@ -598,23 +477,144 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> )?; Ok(()) - }, - ) - })?; + } - Ok(()) - }; + EnumeratorResult::GitRepo(git_repo_result) => { + let span = + debug_span!("git-scan", "{}", git_repo_result.path.display()); + let _span = span.enter(); - // kick off scanner threads, but for better error messages, don't return its error until after - // checking the datastore writer thread - let scan_res = scan_inner(); + let repository = match open_git_repo(&git_repo_result.path) { + Ok(Some(repository)) => repository.into_sync(), + Ok(None) => { + error!( + "Failed to re-open previously-found repository at {}", + git_repo_result.path.display() + ); + return Ok(()); + } + Err(err) => { + error!( + "Failed to re-open previously-found repository at {}: {err}", + git_repo_result.path.display() + ); + return Ok(()); + } + }; + + git_repo_result + .blobs + .into_par_iter() + // XXX try to be more conservative with parallelism here; use + // explicitly larger granularity. + // + // Git repos are typically represented with packfiles on disk, and + // oftentimes with just a single packfile. + // + // gix _does_ allow packfiles to be read by multiple threads with + // decent parallel speedup up to a few threads, but it doesn't scale + // linearly. + // + // The optimal efficiency for reading all blobs from a Git repo would + // probably involve one thread per packfile. Doing that would require + // restructuring this code. 
+ .with_min_len(512) + .try_for_each_init( + || -> Result<_> { + let _span = span.enter(); + let repo = repository.to_thread_local(); + let processor = make_blob_processor()?; + Ok((repo, processor, progress.clone())) + }, + |state: &mut Result<_>, md| -> Result<()> { + let _span = span.enter(); + let (repo, processor, progress) = match state { + Ok(state) => state, + Err(e) => bail!("Failed to initialize worker: {e}"), + }; + + let size = md.num_bytes; + let blob_id = md.blob_oid; + progress.inc(size); + let repo_path = &git_repo_result.path; + + let blob = match repo.find_object(blob_id) { + Err(e) => { + error!( + "Failed to read blob {blob_id} from Git repository at {}: {e}", + repo_path.display(), + ); + return Ok(()); + } + Ok(mut blob) => { + let data = std::mem::take(&mut blob.data); // avoid a copy + Blob::new(BlobId::from(&blob_id), data) + } + }; + + let provenance = { + let mut it = md.first_seen.iter(); + if let Some(e) = it.next() { + let commit_metadata = git_repo_result + .commit_metadata + .get(&e.commit_oid) + .expect("should have commit metadata"); + let p = Provenance::from_git_repo_with_first_commit( + repo_path.clone(), + commit_metadata.clone(), + e.path.clone(), + ); + + let ps = it + .map(|e| { + let commit_metadata = git_repo_result + .commit_metadata + .get(&e.commit_oid) + .expect("should have commit metadata"); + Provenance::from_git_repo_with_first_commit( + repo_path.clone(), + commit_metadata.clone(), + e.path.clone(), + ) + }) + .collect(); + + ProvenanceSet::new(p, ps) + } else { + ProvenanceSet::new( + Provenance::from_git_repo(repo_path.clone()), + Vec::new(), + ) + } + }; + + processor.run( + provenance, + blob, + &send_ds, + args.snippet_length, + args.metadata_args.blob_metadata, + args.copy_blobs, + &blobs_dir, + progress, + )?; + Ok(()) + } + )?; + + Ok(()) + } + } + }); // --------------------------------------------------------------------------------------------- - // Wait for all inputs to be scanned and the database thread to finish + // Wait for all inputs to be enumerated and scanned and the database thread to finish // --------------------------------------------------------------------------------------------- - // Get rid of the reference to the sending channel after starting the scanners, - // to ensure things terminate as expected. 
- drop(send_ds); + input_enumerator_thread + .join() + .unwrap() + .context("Failed to enumerate inputs")?; + let (datastore, num_matches, num_new_matches) = datastore_writer_thread .join() .unwrap() @@ -628,9 +628,14 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> // Finalize and report // --------------------------------------------------------------------------------------------- { - debug!("{} matchers created during scan", num_matchers_counter.into_inner()?); + debug!( + "{} blob processors created in {:.1}s during scan", + num_blob_processors.into_inner()?, + blob_processor_init_time.into_inner()?.as_secs_f64() + ); debug!("{} items in the blob ID set", seen_blobs.len()); + drop(matcher); let matcher_stats = matcher_stats.into_inner()?; let scan_duration = scan_start.elapsed(); let seen_bytes_per_sec = @@ -712,136 +717,151 @@ impl MetadataResult { } } -#[allow(clippy::too_many_arguments)] -fn run_matcher( - matcher_guesser: &mut (Matcher, Guesser), - provenance: ProvenanceSet, - blob: Blob, - send_ds: &crossbeam_channel::Sender, - snippet_length: usize, - blob_metadata_recording_mode: args::BlobMetadataMode, - copy_blobs: args::CopyBlobsMode, - blobs_dir: &Path, - progress: &Progress, -) -> Result<()> { - let blob_id = blob.id.hex(); - let _span = error_span!("matcher", blob_id).entered(); - - let (matcher, guesser) = matcher_guesser; - - let t1 = Instant::now(); - let res = matcher.scan_blob(&blob, &provenance); - let scan_time = t1.elapsed(); - let scan_us = scan_time.as_micros(); - - match res { - Err(e) => { - progress.suspend(|| { - error!("Failed to scan blob {} from {}: {e}", blob.id, provenance.first()) - }); - Ok(()) - } - - // blob already seen, but with no matches; nothing to do! - Ok(ScanResult::SeenSansMatches) => { - trace!("({scan_us}us) blob already scanned with no matches"); - Ok(()) - } - - // blob already seen; all we need to do is record its provenance - Ok(ScanResult::SeenWithMatches) => { - trace!("({scan_us}us) blob already scanned with matches"); - let metadata = BlobMetadata { - id: blob.id, - num_bytes: blob.len(), - mime_essence: None, - charset: None, - }; - send_ds - .send((provenance, metadata, Vec::new())) - .context("Failed to save blob scan results")?; - Ok(()) - } - - // blob has not been seen; need to record blob metadata, provenance, and matches - Ok(ScanResult::New(matches)) => { - trace!("({scan_us}us) blob newly scanned; {} matches", matches.len()); +/// A combined matcher and content type guesser +struct BlobProcessor<'a> { + matcher: Matcher<'a>, + guesser: Guesser, +} - let do_copy_blob = match copy_blobs { - args::CopyBlobsMode::All => true, - args::CopyBlobsMode::Matching => !matches.is_empty(), - args::CopyBlobsMode::None => false, - }; - if do_copy_blob { - let output_dir = blobs_dir.join(&blob_id[..2]); - let output_path = output_dir.join(&blob_id[2..]); - trace!("saving blob to {}", output_path.display()); - match std::fs::create_dir(&output_dir) { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} - Err(e) => { - bail!("Failed to create blob directory at {}: {}", output_dir.display(), e); - } - } - std::fs::write(&output_path, &blob.bytes).with_context(|| { - format!("Failed to write blob contents to {}", output_path.display()) - })?; +impl<'a> BlobProcessor<'a> { + #[allow(clippy::too_many_arguments)] + fn run( + &mut self, + provenance: ProvenanceSet, + blob: Blob, + send_ds: &crossbeam_channel::Sender, + snippet_length: usize, + blob_metadata_recording_mode: 
args::BlobMetadataMode, + copy_blobs: args::CopyBlobsMode, + blobs_dir: &Path, + progress: &Progress, + ) -> Result<()> { + let blob_id = blob.id.hex(); + let _span = debug_span!("matcher", blob_id).entered(); + + let t1 = Instant::now(); + let res = self.matcher.scan_blob(&blob, &provenance); + let scan_time = t1.elapsed(); + let scan_us = scan_time.as_micros(); + + match res { + Err(e) => { + progress.suspend(|| { + error!("Failed to scan blob {} from {}: {e}", blob.id, provenance.first()) + }); + Ok(()) } - // If there are no matches, we can bail out here and avoid recording anything. - // UNLESS the `--blob-metadata=all` mode was specified; then we need to record the - // provenance for _all_ seen blobs. - if blob_metadata_recording_mode != args::BlobMetadataMode::All && matches.is_empty() { - return Ok(()); + // blob already seen, but with no matches; nothing to do! + Ok(ScanResult::SeenSansMatches) => { + trace!("({scan_us}us) blob already scanned with no matches"); + Ok(()) } - let metadata = match blob_metadata_recording_mode { - args::BlobMetadataMode::None => BlobMetadata { + // blob already seen; all we need to do is record its provenance + Ok(ScanResult::SeenWithMatches) => { + trace!("({scan_us}us) blob already scanned with matches"); + let metadata = BlobMetadata { id: blob.id, num_bytes: blob.len(), mime_essence: None, charset: None, - }, - _ => { - let md = MetadataResult::from_blob_and_provenance(guesser, &blob, &provenance); - BlobMetadata { - id: blob.id, - num_bytes: blob.len(), - mime_essence: md.mime_essence, - charset: md.charset, + }; + send_ds + .send((provenance, metadata, Vec::new())) + .context("Failed to save blob scan results")?; + Ok(()) + } + + // blob has not been seen; need to record blob metadata, provenance, and matches + Ok(ScanResult::New(matches)) => { + trace!("({scan_us}us) blob newly scanned; {} matches", matches.len()); + + let do_copy_blob = match copy_blobs { + args::CopyBlobsMode::All => true, + args::CopyBlobsMode::Matching => !matches.is_empty(), + args::CopyBlobsMode::None => false, + }; + if do_copy_blob { + let output_dir = blobs_dir.join(&blob_id[..2]); + let output_path = output_dir.join(&blob_id[2..]); + trace!("saving blob to {}", output_path.display()); + match std::fs::create_dir(&output_dir) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} + Err(e) => { + bail!( + "Failed to create blob directory at {}: {}", + output_dir.display(), + e + ); + } } + std::fs::write(&output_path, &blob.bytes).with_context(|| { + format!("Failed to write blob contents to {}", output_path.display()) + })?; } - }; - // Convert each BlobMatch into a regular Match - let matches = match matches - .iter() - .map(|m| m.matching_input_offset_span.end) - .max() - { - Some(max_end) => { - // compute the location mapping only on the input that's necessary to look at - let loc_mapping = location::LocationMapping::new(&blob.bytes[0..max_end]); - - let capacity: usize = matches.iter().map(|m| m.captures.len() - 1).sum(); - let mut new_matches = Vec::with_capacity(capacity); - new_matches.extend( - matches - .iter() - .map(|m| (None, Match::convert(&loc_mapping, m, snippet_length))), - ); - new_matches - } - None => { - debug_assert!(matches.is_empty()); - Vec::new() + // If there are no matches, we can bail out here and avoid recording anything. + // UNLESS the `--blob-metadata=all` mode was specified; then we need to record the + // provenance for _all_ seen blobs. 
+ if blob_metadata_recording_mode != args::BlobMetadataMode::All && matches.is_empty() + { + return Ok(()); } - }; - send_ds - .send((provenance, metadata, matches)) - .context("Failed to save results")?; - Ok(()) + let metadata = match blob_metadata_recording_mode { + args::BlobMetadataMode::None => BlobMetadata { + id: blob.id, + num_bytes: blob.len(), + mime_essence: None, + charset: None, + }, + _ => { + let md = MetadataResult::from_blob_and_provenance( + &self.guesser, + &blob, + &provenance, + ); + BlobMetadata { + id: blob.id, + num_bytes: blob.len(), + mime_essence: md.mime_essence, + charset: md.charset, + } + } + }; + + // Convert each BlobMatch into a regular Match + let matches = match matches + .iter() + .map(|m| m.matching_input_offset_span.end) + .max() + { + Some(max_end) => { + // compute the location mapping only on the input that's necessary to look at + let loc_mapping = location::LocationMapping::new(&blob.bytes[0..max_end]); + + let capacity: usize = matches.iter().map(|m| m.captures.len() - 1).sum(); + let mut new_matches = Vec::with_capacity(capacity); + new_matches.extend( + matches + .iter() + .map(|m| (None, Match::convert(&loc_mapping, m, snippet_length))), + ); + new_matches + } + None => { + debug_assert!(matches.is_empty()); + Vec::new() + } + }; + + send_ds + .send((provenance, metadata, matches)) + .context("Failed to save results")?; + Ok(()) + } } } } diff --git a/crates/noseyparker-cli/tests/rules/snapshots/test_noseyparker__rules__rules_list_json-2.snap b/crates/noseyparker-cli/tests/rules/snapshots/test_noseyparker__rules__rules_list_json-2.snap index 6dcd799ac..113c73de2 100644 --- a/crates/noseyparker-cli/tests/rules/snapshots/test_noseyparker__rules__rules_list_json-2.snap +++ b/crates/noseyparker-cli/tests/rules/snapshots/test_noseyparker__rules__rules_list_json-2.snap @@ -1522,12 +1522,12 @@ expression: stdout }, { "id": "np.google.6", - "structural_id": "4e3b74e85c70254144b1d2d08289f4d04af98f35", + "structural_id": "a665479683c770faad11645f8f0aa791c651e55f", "name": "Google OAuth Credentials", "syntax": { "name": "Google OAuth Credentials", "id": "np.google.6", - "pattern": "(?x)\n\\b\n([0-9]+-[a-z0-9_]{32}\\.apps\\.googleusercontent\\.com) (?# client ID )\n(?: (?s) .{0,40} ) (?# Arbitrary intermediate stuff; ?s causes . to match newlines )\n\\b\n\n(?: (GOCSPX-[a-zA-Z0-9_-]{28}) (?# prefixed client secret )\n | (?: (?i) client.?secret .{0,10} \\b ([a-zA-Z0-9_-]{24}) ) (?# non-prefixed client secret )\n)\n\n(?:[^a-zA-Z0-9_-] | $)\n", + "pattern": "(?x)\n\\b\n([0-9]+-[a-z0-9_]{32}\\.apps\\.googleusercontent\\.com) (?# client ID )\n(?: (?s) .{0,40} ) (?# Arbitrary intermediate stuff; ?s causes . to match newlines )\n\\b\n\n(?: (?i) client.?secret .{0,10} )?\n\n( (?:GOCSPX-[a-zA-Z0-9_-]{28}) (?# prefixed client secret )\n| (?:\\b [a-zA-Z0-9_-]{24}) (?# non-prefixed client secret )\n)\n\n(?:[^a-zA-Z0-9_-] | $)\n", "examples": [ "const CLIENT_ID = '304167046824-45h8no7j0s38akv998nivvb7i17ckqeh.apps.googleusercontent.com';\nconst CLIENT_SECRET = '1QcFpNjHoAf4_XczYwhYicTl';\n", "public static GAPIS_CREDENTIALS = {\n // 1. Generate credentials: https://console.cloud.google.com/apis/\n // 2. 
Create OAuth page and set spreadsheets and drive.metadata.readonly scopes\n client_id: '132261435625-69ubohrvppjr9hcc5t9uighsb7j2cqhv.apps.googleusercontent.com',\n client_secret: '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',\n redirect_uri: `http://localhost:${Config.OAUTH_HTTP_PORT}/oauth2callback`\n};\n" diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index a898c68ee..29a0aa8e1 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -55,7 +55,7 @@ thiserror = "1" tokio = { version = "1.23", optional = true } tracing = "0.1" url = "2.3" -vectorscan-rs = { version = "0.0.2" } +vectorscan-rs = { version = "0.0.3" } [dev-dependencies] pretty_assertions = "1.3" diff --git a/crates/noseyparker/data/default/builtin/rules/google.yml b/crates/noseyparker/data/default/builtin/rules/google.yml index 18e8f490a..7815d3ca0 100644 --- a/crates/noseyparker/data/default/builtin/rules/google.yml +++ b/crates/noseyparker/data/default/builtin/rules/google.yml @@ -156,8 +156,10 @@ rules: (?: (?s) .{0,40} ) (?# Arbitrary intermediate stuff; ?s causes . to match newlines ) \b - (?: (GOCSPX-[a-zA-Z0-9_-]{28}) (?# prefixed client secret ) - | (?: (?i) client.?secret .{0,10} \b ([a-zA-Z0-9_-]{24}) ) (?# non-prefixed client secret ) + (?: (?i) client.?secret .{0,10} )? + + ( (?:GOCSPX-[a-zA-Z0-9_-]{28}) (?# prefixed client secret ) + | (?:\b [a-zA-Z0-9_-]{24}) (?# non-prefixed client secret ) ) (?:[^a-zA-Z0-9_-] | $) diff --git a/crates/noseyparker/src/matcher.rs b/crates/noseyparker/src/matcher.rs index d73ef0313..45789c214 100644 --- a/crates/noseyparker/src/matcher.rs +++ b/crates/noseyparker/src/matcher.rs @@ -18,7 +18,7 @@ use crate::rules_database::RulesDatabase; /// /// When matching with Vectorscan, we simply collect all matches into a preallocated `Vec`, /// and then go through them all after scanning is complete. -#[derive(PartialEq, Eq, Debug)] +#[derive(PartialEq, Eq, Debug, Copy, Clone)] struct RawMatch { rule_id: u32, start_idx: u64, @@ -49,6 +49,7 @@ pub struct BlobMatch<'a> { pub captures: regex::bytes::Captures<'a>, } +#[derive(Clone)] struct UserData { /// A scratch vector for raw matches from Vectorscan, to minimize allocation raw_matches_scratch: Vec, @@ -63,6 +64,7 @@ struct UserData { /// A `Matcher` is able to scan inputs for matches from rules in a `RulesDatabase`. /// /// If doing multi-threaded scanning, use a separate `Matcher` for each thread. 
+#[derive(Clone)] pub struct Matcher<'a> { /// A scratch buffer for Vectorscan vs_scanner: vectorscan_rs::BlockScanner<'a>, diff --git a/crates/noseyparker/src/matcher_stats.rs b/crates/noseyparker/src/matcher_stats.rs index 3a9eadaa9..4d9ee754c 100644 --- a/crates/noseyparker/src/matcher_stats.rs +++ b/crates/noseyparker/src/matcher_stats.rs @@ -1,7 +1,7 @@ // ------------------------------------------------------------------------------------------------- // MatchStats // ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct MatcherStats { pub blobs_seen: u64, pub blobs_scanned: u64, diff --git a/crates/noseyparker/src/rule_profiling.rs b/crates/noseyparker/src/rule_profiling.rs index bd41e7a1a..181cf8740 100644 --- a/crates/noseyparker/src/rule_profiling.rs +++ b/crates/noseyparker/src/rule_profiling.rs @@ -3,7 +3,7 @@ use std::time::{Duration, Instant}; // ------------------------------------------------------------------------------------------------- // RuleProfile // ------------------------------------------------------------------------------------------------- -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct RuleProfile { raw_match_counts: Vec, stage2_durations: Vec,
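The `Google OAuth Credentials` revision in `google.yml` above (also noted under "Fixes" in the changelog) replaces two alternative capture groups, one of which is necessarily empty on every match, with an optional non-capturing label prefix and a single capture group wrapped around the alternation. The following is a toy illustration of that difference using the `regex` crate; the patterns and input are simplified stand-ins, not the actual rule.

```rust
use regex::Regex;

fn main() {
    // Old shape: the secret lands in group 1 *or* group 2, so whichever branch does
    // not match is a non-participating (empty) capture group.
    let old = Regex::new(r"(GOCSPX-[a-z0-9]{4})|secret ([a-z0-9]{4})").unwrap();
    let caps = old.captures("secret abcd").unwrap();
    assert!(caps.get(1).is_none()); // the empty capture group a consumer may trip over
    assert_eq!(caps.get(2).unwrap().as_str(), "abcd");

    // New shape: the label is an optional non-capturing prefix and one group wraps the
    // whole alternation, so group 1 participates in every match.
    let new = Regex::new(r"(?:secret )?(GOCSPX-[a-z0-9]{4}|[a-z0-9]{4})").unwrap();
    let caps = new.captures("secret abcd").unwrap();
    assert_eq!(caps.get(1).unwrap().as_str(), "abcd");
}
```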