diff --git a/CHANGELOG.md b/CHANGELOG.md index 32667aad8..c975ddb0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,15 +5,18 @@ - Breaking: `.git/` is now ignored by default when using `--hidden` / `-H`, use `--no-ignore` / `-I` or `--no-ignore-vcs` to override, see #1387 and #1396 (@skoriop) - ## Bugfixes - Fix `NO_COLOR` support, see #1421 (@acuteenvy) ## Changes -- The default number of threads is now constrained to be at most 16. This should improve startup time on - systems with many CPU cores. (#1203) +- Performance has been significantly improved, both due to optimizations in the underlying `ignore` + crate (#1429), and in `fd` itself (#1422) + +- The default number of threads is now more constrained. We use all available cores and hyperthreads up + to a maximum of 8. Past that, we use only one thread per physical core, up to a limit of 64. This + should improve performance and startup time on systems with many CPU cores. (#1203, #1412, #1430) ## Other diff --git a/Cargo.lock b/Cargo.lock index 28843dc50..b23a375fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -336,6 +336,7 @@ dependencies = [ "nix 0.27.1", "normpath", "nu-ansi-term", + "num_cpus", "regex", "regex-syntax", "tempfile", @@ -374,6 +375,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + [[package]] name = "home" version = "0.5.5" @@ -554,6 +561,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "once_cell" version = "1.18.0" diff --git 
a/Cargo.toml b/Cargo.toml index 4f2f464f3..6ce5c4d06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ normpath = "1.1.1" crossbeam-channel = "0.5.8" clap_complete = {version = "4.4.4", optional = true} faccess = "0.2.4" +num_cpus = "1.16.0" [dependencies.clap] version = "4.4.7" diff --git a/src/cli.rs b/src/cli.rs index 1b0228835..ab2dfc0b4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -715,24 +715,25 @@ impl Opts { fn default_num_threads() -> NonZeroUsize { // If we can't get the amount of parallelism for some reason, then // default to a single thread, because that is safe. - // Note that the minimum value for a NonZeroUsize is 1. - // Unfortunately, we can't do `NonZeroUsize::new(1).unwrap()` - // in a const context. - const FALLBACK_PARALLELISM: NonZeroUsize = NonZeroUsize::MIN; - // As the number of threads increases, the startup time suffers from - // initializing the threads, and we get diminishing returns from additional - // parallelism. So set a maximum number of threads to use by default. - // - // This value is based on some empirical observations, but the ideal value - // probably depends on the exact hardware in use. - // - // Safety: The literal "20" is known not to be zero. - const MAX_DEFAULT_THREADS: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(20) }; - - std::cmp::min( - std::thread::available_parallelism().unwrap_or(FALLBACK_PARALLELISM), - MAX_DEFAULT_THREADS, - ) + let fallback = NonZeroUsize::MIN; + // fd generally scales well up to 8 threads. Past that, it's better to limit + // ourselves to the number of physical cores, ignoring hyperthreads. + let threshold = NonZeroUsize::new(8).unwrap(); + // To limit startup overhead on massively parallel machines, don't use more + // than 64 threads. 
+ let limit = NonZeroUsize::new(64).unwrap(); + + // Get the total number of CPUs available, including hyperthreads + let threads = std::thread::available_parallelism().unwrap_or(fallback); + if threads <= threshold { + return threads; + } + + // Compute min(threads, max(cores, threshold), limit). Avoid calling + // num_cpus::get_physical() if we don't have to, since it costs a bit of + // startup time to parse /proc/cpuinfo. + let cores = NonZeroUsize::new(num_cpus::get_physical()).unwrap(); + threads.min(cores.clamp(threshold, limit)) } #[derive(Copy, Clone, PartialEq, Eq, ValueEnum)]