From 1353aa5050be61b19ac3e98d371f38df06a53af5 Mon Sep 17 00:00:00 2001
From: Toby Lawrence <toby@nuclearfurnace.com>
Date: Sat, 11 Apr 2020 17:58:25 -0400
Subject: [PATCH] General fixes: core::arch, libc vs mach, benchmarking, etc

---
 .rustfmt.toml              |  17 --
 Cargo.toml                 |  28 ++--
 README.md                  |  44 ++----
 azure-pipelines.yml        |   8 +-
 benches/timing.rs          | 313 ++++++++++++++++++-------------------
 ci/azure-bench-stable.yml  |  20 +++
 ci/azure-rustfmt.yml       |   2 +-
 ci/azure-test-minimum.yaml |  20 ++-
 ci/azure-test-nightly.yml  |  24 ++-
 ci/azure-test-stable.yml   |  16 +-
 src/counter.rs             |  41 +++--
 src/instant.rs             |  16 +-
 src/lib.rs                 |  34 ++--
 src/mock.rs                |  26 ++-
 src/monotonic.rs           |  72 +++++++--
 src/upkeep.rs              |   8 +-
 16 files changed, 372 insertions(+), 317 deletions(-)
 delete mode 100644 .rustfmt.toml
 create mode 100644 ci/azure-bench-stable.yml

diff --git a/.rustfmt.toml b/.rustfmt.toml
deleted file mode 100644
index 201f270..0000000
--- a/.rustfmt.toml
+++ /dev/null
@@ -1,17 +0,0 @@
-edition = "2018"
-max_width = 120
-wrap_comments = true
-comment_width = 120
-use_try_shorthand = true
-merge_imports = true
-reorder_imports = true
-reorder_modules = true
-fn_args_density = "Compressed"
-fn_single_line = true
-imports_indent = "Block"
-match_block_trailing_comma = true
-merge_derives = true
-force_multiline_blocks = true
-normalize_comments = true
-reorder_impl_items = true
-use_field_init_shorthand = true
diff --git a/Cargo.toml b/Cargo.toml
index d455554..79018c2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "quanta"
 version = "0.4.2-alpha.0"
-authors = ["Toby Lawrence <tlawrence@nuclearfurnace.com>"]
+authors = ["Toby Lawrence <toby@nuclearfurnace.com>"]
 edition = "2018"
 
 license = "MIT"
@@ -16,18 +16,28 @@ readme = "README.md"
 
 keywords = ["rdtsc", "timing", "nanosecond"]
 
+[[bench]]
+name = "timing"
+harness = false
+
 [features]
-asm = ["tsc"]
-tsc = []
 metrics = ["metrics-core"]
 
 [dependencies]
-libc = "^0.2"
 metrics-core = { version = "^0.5", optional = true }
 
-[target.'cfg(windows)'.dependencies]
-winapi = { version = "0.3.6", features = ["profileapi"] }
+[target.'cfg(not(any(target_os = "macos", target_os = "ios", target_os = "windows")))'.dependencies] # mirrors the cfg on the libc-backed Monotonic in src/monotonic.rs
+libc = "^0.2"
+
+[target.'cfg(target_os = "macos")'.dependencies]
+mach = "^0.3"
+
+[target.'cfg(target_os = "ios")'.dependencies]
+mach = "^0.3"
+
+[target.'cfg(target_os = "windows")'.dependencies]
+winapi = { version = "^0.3", features = ["profileapi"] }
 
-[dev-dependencies.clocksource]
-version = "^0.4"
-features = []
+[dev-dependencies]
+criterion = "^0.3"
+clocksource = "^0.5"
diff --git a/README.md b/README.md
index 4f62dd0..8c2f551 100644
--- a/README.md
+++ b/README.md
@@ -28,46 +28,24 @@ __quanta__ is a high-speed timing library, useful for getting the current time _
 The API documentation of this library can be found at [docs.rs/quanta](https://docs.rs/quanta/).
 
 ## general features
-- time in nanoseconds
-- super fast! (see the benchmarks)
-- high-precision mode!
+- monotonic time in nanoseconds or raw cycles
+- extremely low overhead where possible
+- optimized for instruction-level accuracy in measurements
 - mockable!
-- cross-platform! (we target Linux, Windows, macOS, Solaris, \*BSD)
+- cross-platform!
 - fun, science-y name!
 
-## performance
-
-quanta provides high-speed access to the native system timing facilities and in general, with optimized assembly turned off, is generally on par with the standard library and external crates:
+## platform / architecture support
 
-    test bench::time_clocksource_counter       ... bench:      30,060 ns/iter (+/- 2,051)
-    test bench::time_clocksource_counter_delta ... bench:      74,790 ns/iter (+/- 2,897)
-    test bench::time_clocksource_time          ... bench:      30,439 ns/iter (+/- 2,571)
-    test bench::time_clocksource_time_delta    ... bench:      60,429 ns/iter (+/- 5,393)
-    test bench::time_hotmic_now                ... bench:      30,202 ns/iter (+/- 1,643)
-    test bench::time_hotmic_now_delta          ... bench:      59,499 ns/iter (+/- 5,829)
-    test bench::time_hotmic_raw                ... bench:      29,371 ns/iter (+/- 2,110)
-    test bench::time_hotmic_raw_delta          ... bench:      66,385 ns/iter (+/- 2,904)
-    test bench::time_instant_delta             ... bench:      64,285 ns/iter (+/- 3,311)
-    test bench::time_instant_now               ... bench:      18,603 ns/iter (+/- 1,116)
+For platforms, we have tier 1 support for Linux, Windows, and macOS/iOS.  Platforms such as Solaris or the various BSDs have tier 2.5 support: `quanta` should work on them by virtue of depending on `libc`, but we don't test or build on these platforms at all.
 
-The non-delta tests represent the time it takes to take a single time measurement, while the delta tests represent the time to take two measurements and calculate the delta.  We can see that without using the optimized assembly features that both `quanta` and `clocksource` provide, taking single measurements is slower than [`Instant::now`] but generally consumes the same amount of time overall to take the measurements and calculate the delta, around 60-65ns.
+Architecture-wise, x86/x86-64 and SSE2 are required for the optimized TSC codepath.  This is handled transparently via compile-time target features, so you must build with the appropriate compiler flags to specify the CPU features where your binary will run, as runtime detection is not supported.
 
-Using optimized assembly, things can be much faster:
-
-    test bench::time_clocksource_counter       ... bench:      11,424 ns/iter (+/- 848)
-    test bench::time_clocksource_counter_delta ... bench:      36,813 ns/iter (+/- 2,047)
-    test bench::time_clocksource_time          ... bench:      25,499 ns/iter (+/- 2,101)
-    test bench::time_clocksource_time_delta    ... bench:      50,761 ns/iter (+/- 3,114)
-    test bench::time_hotmic_now                ... bench:      18,918 ns/iter (+/- 1,591)
-    test bench::time_hotmic_now_delta          ... bench:      38,367 ns/iter (+/- 2,134)
-    test bench::time_hotmic_raw                ... bench:      10,984 ns/iter (+/- 814)
-    test bench::time_hotmic_raw_delta          ... bench:      29,635 ns/iter (+/- 1,685)
-    test bench::time_instant_delta             ... bench:      63,968 ns/iter (+/- 3,805)
-    test bench::time_instant_now               ... bench:      18,096 ns/iter (+/- 1,381)
+## performance
 
-Both `quanta` and `clocksource` provide a way for the caller to get the "raw" measurement from the underlying time source, which is an unrefined value that needs to be scaled by a reference time source to end up as a meanginful value.  This is provided for taking measurments in tight loops where the deltas can be calculated after the fact.  For `clocksource`, the `counter` mode is the raw value, and `time` mode is the `Instant::now` equivalent.  For `quanta`, `raw` mode and `now` are as described above.
+Accessing the TSC on a modern x86 processor has an extremely low overhead of roughly 11ns, and `quanta` provides the thinnest possible layer over this.  Using the native time facilities, such as `clock_gettime(CLOCK_MONOTONIC)` on Linux, you may expect to see closer to 17-18ns of overhead.
 
-We can see that both `quanta` and `clocksource` are measurably faster than `Instant::now` both in taking the discrete measurements and computing the delta.  `quanta`, however, edges out `clocksource`.
+Measurements have not been taken for non-x86-based architectures/platforms.
 
 ## why use this over stdlib or clocksource?
 
@@ -76,7 +54,7 @@ The performance alone is enough to choose this over the stdlib timing facilities
 When compared to `clocksource`, though, we have a few extra features that can make the difference:
 
 - `Clock` can be mocked, allowing you to easily control the passage of time in your tests
-- `Clock` provides a `start` and `end` method which, in optimized `asm` mode, can replace calls to `raw` and provide more accuracy in the measurement of the code in between
+- `Clock` provides `start` and `end` as replacements for `raw`, which are optimized for instruction-level accuracy, avoiding instruction reordering that might taint measurements
 
 ## license
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 40bd9cd..bfd23f6 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -2,14 +2,8 @@ trigger: ["master"]
 pr: ["master"]
 
 jobs:
-# Check the crate formatting.
 - template: ci/azure-rustfmt.yml
-
-# Actaully test the crate.
 - template: ci/azure-test-stable.yml
-
-# Test it to make sure it still works on our minimum version.
 - template: ci/azure-test-minimum.yaml
-
-# Now test it against nightly w/ ASM support.
 - template: ci/azure-test-nightly.yml
+- template: ci/azure-bench-stable.yml
diff --git a/benches/timing.rs b/benches/timing.rs
index 88c0ecb..892259b 100644
--- a/benches/timing.rs
+++ b/benches/timing.rs
@@ -1,161 +1,154 @@
-#![cfg_attr(test, feature(test))]
-mod bench {
-    extern crate clocksource;
-    extern crate test;
-    use self::test::Bencher;
-    use clocksource::Clocksource;
-    use quanta::Clock;
-    use std::time::Instant;
-
-    #[bench]
-    fn time_instant_now(b: &mut Bencher) {
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(Instant::now());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_instant_delta(b: &mut Bencher) {
-        b.iter(|| {
-            for _ in 0..1000 {
-                let start = Instant::now();
-                let d = Instant::now() - start;
-                test::black_box((d.as_secs() * 1_000_000_000) + u64::from(d.subsec_nanos()));
-            }
-        });
-    }
-
-    #[bench]
-    fn time_clocksource_counter(b: &mut Bencher) {
-        let cs = Clocksource::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.counter());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_clocksource_counter_delta(b: &mut Bencher) {
-        let cs = Clocksource::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                let start = cs.counter();
-                let end = cs.counter();
-                let t0 = cs.convert(start);
-                let t1 = cs.convert(end);
-                test::black_box(t1 - t0);
-            }
-        });
-    }
-
-    #[bench]
-    fn time_clocksource_time(b: &mut Bencher) {
-        let cs = Clocksource::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.time());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_clocksource_time_delta(b: &mut Bencher) {
-        let cs = Clocksource::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                let t0 = cs.time();
-                let t1 = cs.time();
-                test::black_box(t1 - t0);
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_now(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.now());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_now_delta(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                let start = cs.now();
-                let end = cs.now();
-                test::black_box(end - start);
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_start(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.start());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_end(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.end());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_start_end_delta(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                let start = cs.start();
-                let end = cs.end();
-                test::black_box(cs.delta(start, end));
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_raw(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.raw());
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_raw_delta(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                let start = cs.raw();
-                let end = cs.raw();
-                test::black_box(cs.delta(start, end));
-            }
-        });
-    }
-
-    #[bench]
-    fn time_quanta_recent(b: &mut Bencher) {
-        let cs: Clock = Clock::new();
-        b.iter(|| {
-            for _ in 0..1000 {
-                test::black_box(cs.recent());
-            }
-        });
-    }
+use clocksource::Clocksource;
+use criterion::{criterion_group, criterion_main, Bencher, Criterion};
+use quanta::Clock;
+use std::time::Instant;
+
+fn time_instant_now(b: &mut Bencher) {
+    b.iter(|| Instant::now())
+}
+
+fn time_clocksource_time(b: &mut Bencher) {
+    let cs = Clocksource::new();
+    b.iter(|| cs.time())
+}
+
+fn time_clocksource_counter(b: &mut Bencher) {
+    let cs = Clocksource::new();
+    b.iter(|| cs.counter())
+}
+
+fn time_clocksource_counter_scaled(b: &mut Bencher) {
+    let cs = Clocksource::new();
+    b.iter(|| cs.convert(cs.counter()))
+}
+
+fn time_quanta_now(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.now())
+}
+
+fn time_quanta_raw(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.raw())
 }
+
+fn time_quanta_raw_scaled(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.scaled(clock.raw()))
+}
+
+fn time_quanta_start(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.start())
+}
+
+fn time_quanta_start_scaled(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.scaled(clock.start()))
+}
+
+fn time_quanta_end(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.end())
+}
+
+fn time_quanta_end_scaled(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.scaled(clock.end()))
+}
+
+fn time_instant_delta(b: &mut Bencher) {
+    b.iter(|| {
+        let start = Instant::now();
+        let d = Instant::now() - start;
+        (d.as_secs() * 1_000_000_000) + u64::from(d.subsec_nanos())
+    })
+}
+
+fn time_clocksource_counter_delta(b: &mut Bencher) {
+    let cs = Clocksource::new();
+    b.iter(|| {
+        let start = cs.counter();
+        let end = cs.counter();
+        cs.convert(end) - cs.convert(start)
+    })
+}
+
+fn time_clocksource_time_delta(b: &mut Bencher) {
+    let cs = Clocksource::new();
+    b.iter(|| {
+        let t0 = cs.time();
+        let t1 = cs.time();
+        t1 - t0
+    })
+}
+
+fn time_quanta_raw_delta(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| {
+        let start = clock.raw();
+        let end = clock.raw();
+        clock.delta(start, end)
+    })
+}
+
+fn time_quanta_now_delta(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| {
+        let start = clock.now();
+        let end = clock.now();
+        end - start
+    })
+}
+
+fn time_quanta_start_end_delta(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| {
+        let start = clock.start();
+        let end = clock.end();
+        clock.delta(start, end)
+    })
+}
+
+fn time_quanta_recent(b: &mut Bencher) {
+    let clock = Clock::new();
+    b.iter(|| clock.recent())
+}
+
+fn benchmark(c: &mut Criterion) {
+    let mut std_group = c.benchmark_group("stdlib");
+    std_group.bench_function("instant now", time_instant_now);
+    std_group.bench_function("instant delta", time_instant_delta);
+
+    std_group.finish();
+
+    let mut cs_group = c.benchmark_group("clocksource");
+    cs_group.bench_function("clocksource time", time_clocksource_time);
+    cs_group.bench_function("clocksource time delta", time_clocksource_time_delta);
+    cs_group.bench_function("clocksource counter", time_clocksource_counter);
+    cs_group.bench_function(
+        "clocksource counter scaled",
+        time_clocksource_counter_scaled,
+    );
+    cs_group.bench_function("clocksource counter delta", time_clocksource_counter_delta);
+
+    cs_group.finish();
+
+    let mut q_group = c.benchmark_group("quanta");
+    q_group.bench_function("quanta now", time_quanta_now);
+    q_group.bench_function("quanta now delta", time_quanta_now_delta);
+    q_group.bench_function("quanta raw", time_quanta_raw);
+    q_group.bench_function("quanta raw scaled", time_quanta_raw_scaled);
+    q_group.bench_function("quanta raw delta", time_quanta_raw_delta);
+    q_group.bench_function("quanta start", time_quanta_start);
+    q_group.bench_function("quanta start scaled", time_quanta_start_scaled);
+    q_group.bench_function("quanta end", time_quanta_end);
+    q_group.bench_function("quanta end scaled", time_quanta_end_scaled);
+    q_group.bench_function("quanta start/end delta", time_quanta_start_end_delta);
+    q_group.bench_function("quanta recent", time_quanta_recent);
+
+    q_group.finish();
+}
+
+criterion_group!(benches, benchmark);
+criterion_main!(benches);
diff --git a/ci/azure-bench-stable.yml b/ci/azure-bench-stable.yml
new file mode 100644
index 0000000..52320a4
--- /dev/null
+++ b/ci/azure-bench-stable.yml
@@ -0,0 +1,20 @@
+jobs:
+- job: bench_stable
+  displayName: Bench Stable
+  strategy:
+    matrix:
+      "(Ubuntu 18.04)":
+        vmImage: ubuntu-18.04
+      "(macOS X Catalina)":
+        vmImage: macOS-10.15
+      "(Windows Server 2019)":
+        vmImage: windows-2019
+  pool:
+    vmImage: $(vmImage)
+
+  steps:
+  - template: azure-install-rust.yml
+    parameters:
+      rust_version: stable
+  - script: cargo bench
+    displayName: cargo bench
diff --git a/ci/azure-rustfmt.yml b/ci/azure-rustfmt.yml
index cb52dbb..d7ce1a6 100644
--- a/ci/azure-rustfmt.yml
+++ b/ci/azure-rustfmt.yml
@@ -3,7 +3,7 @@ jobs:
 - job: rustfmt
   displayName: Check rustfmt
   pool:
-    vmImage: ubuntu-16.04
+    vmImage: ubuntu-18.04
   steps:
     - template: azure-install-rust.yml
       parameters:
diff --git a/ci/azure-test-minimum.yaml b/ci/azure-test-minimum.yaml
index eb37a0e..43954bd 100644
--- a/ci/azure-test-minimum.yaml
+++ b/ci/azure-test-minimum.yaml
@@ -1,20 +1,26 @@
 jobs:
-- job: test_quanta_minimum
-  displayName: Test Quanta Minimum
+- job: test_minimum
+  displayName: Test Minimum
   strategy:
     matrix:
-      Linux:
+      "(Ubuntu 16.04)":
         vmImage: ubuntu-16.04
-      MacOS:
-        vmImage: macOS-10.13
-      Windows:
+      "(Ubuntu 18.04)":
+        vmImage: ubuntu-18.04
+      "(macOS X Mojave)":
+        vmImage: macOS-10.14
+      "(macOS X Catalina)":
+        vmImage: macOS-10.15
+      "(Windows Server 2016)":
         vmImage: vs2017-win2016
+      "(Windows Server 2019)":
+        vmImage: windows-2019
   pool:
     vmImage: $(vmImage)
 
   steps:
   - template: azure-install-rust.yml
     parameters:
-      rust_version: 1.34.0
+      rust_version: 1.36.0
   - script: cargo test
     displayName: cargo test
diff --git a/ci/azure-test-nightly.yml b/ci/azure-test-nightly.yml
index bd8aa4a..4a6d011 100644
--- a/ci/azure-test-nightly.yml
+++ b/ci/azure-test-nightly.yml
@@ -1,14 +1,14 @@
 jobs:
-- job: test_quanta_nightly
-  displayName: Test Quanta Nightly
+- job: test_nightly
+  displayName: Test Nightly
   strategy:
     matrix:
-      Linux:
-        vmImage: ubuntu-16.04
-      MacOS:
-        vmImage: macOS-10.13
-      Windows:
-        vmImage: vs2017-win2016
+      "(Ubuntu 18.04)":
+        vmImage: ubuntu-18.04
+      "(macOS X Catalina)":
+        vmImage: macOS-10.15
+      "(Windows Server 2019)":
+        vmImage: windows-2019
   pool:
     vmImage: $(vmImage)
 
@@ -16,10 +16,6 @@ jobs:
   - template: azure-install-rust.yml
     parameters:
       rust_version: nightly
-  - script: cargo test --features asm
-    displayName: cargo test --features asm
-  - script: cargo bench
-    displayName: cargo bench
-  - script: cargo bench --features asm
-    displayName: cargo bench --features asm
+  - script: cargo test
+    displayName: cargo test
 
diff --git a/ci/azure-test-stable.yml b/ci/azure-test-stable.yml
index 2fb8d85..c3c8306 100644
--- a/ci/azure-test-stable.yml
+++ b/ci/azure-test-stable.yml
@@ -1,14 +1,14 @@
 jobs:
-- job: test_quanta_stable
-  displayName: Test Quanta Stable
+- job: test_stable
+  displayName: Test Stable
   strategy:
     matrix:
-      Linux:
-        vmImage: ubuntu-16.04
-      MacOS:
-        vmImage: macOS-10.13
-      Windows:
-        vmImage: vs2017-win2016
+      "(Ubuntu 18.04)":
+        vmImage: ubuntu-18.04
+      "(macOS X Catalina)":
+        vmImage: macOS-10.15
+      "(Windows Server 2019)":
+        vmImage: windows-2019
   pool:
     vmImage: $(vmImage)
 
diff --git a/src/counter.rs b/src/counter.rs
index f69c5a3..33fe315 100644
--- a/src/counter.rs
+++ b/src/counter.rs
@@ -1,44 +1,55 @@
 use crate::ClockSource;
 
+#[cfg(all(target_arch = "x86", target_feature = "sse2"))]
+use std::arch::x86::{__rdtscp, _mm_lfence, _rdtsc};
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+use std::arch::x86_64::{__rdtscp, _mm_lfence, _rdtsc};
+
 #[derive(Debug, Clone, Default)]
 pub struct Counter;
 
 impl Counter {
     #[allow(dead_code)]
-    pub fn new() -> Self { Counter {} }
+    pub fn new() -> Self {
+        Counter {}
+    }
 }
 
-#[cfg(feature = "tsc")]
+#[cfg(all(
+    any(target_arch = "x86", target_arch = "x86_64"),
+    target_feature = "sse2"
+))]
 impl ClockSource for Counter {
     fn now(&self) -> u64 {
-        let mut l: u32;
-        let mut h: u32;
         unsafe {
-            asm!("lfence; rdtsc" : "={eax}" (l), "={edx}" (h) ::: "volatile");
+            _mm_lfence();
+            _rdtsc()
         }
-        ((u64::from(h)) << 32) | u64::from(l)
     }
 
     fn start(&self) -> u64 {
-        let mut l: u32;
-        let mut h: u32;
         unsafe {
-            asm!("lfence; rdtsc; lfence" : "={eax}" (l), "={edx}" (h) ::: "volatile");
+            _mm_lfence();
+            let result = _rdtsc();
+            _mm_lfence();
+            result
         }
-        ((u64::from(h)) << 32) | u64::from(l)
     }
 
     fn end(&self) -> u64 {
-        let mut l: u32;
-        let mut h: u32;
+        let mut _aux: u32 = 0;
         unsafe {
-            asm!("rdtscp; lfence" : "={eax}" (l), "={edx}" (h) ::: "volatile");
+            let result = __rdtscp(&mut _aux as *mut _);
+            _mm_lfence();
+            result
         }
-        ((u64::from(h)) << 32) | u64::from(l)
     }
 }
 
-#[cfg(not(feature = "tsc"))]
+#[cfg(not(all(
+    any(target_arch = "x86", target_arch = "x86_64"),
+    target_feature = "sse2"
+)))]
 impl ClockSource for Counter {
     fn now(&self) -> u64 {
         panic!("can't use counter without TSC support");
diff --git a/src/instant.rs b/src/instant.rs
index bb53e0c..cbcbc81 100644
--- a/src/instant.rs
+++ b/src/instant.rs
@@ -1,7 +1,7 @@
 #[cfg(feature = "metrics")]
 use metrics_core::AsNanoseconds;
 
-use std::cmp::{Ord, PartialOrd, Ordering};
+use std::cmp::{Ord, Ordering, PartialOrd};
 use std::fmt;
 use std::ops::{Add, AddAssign, Sub, SubAssign};
 use std::time::Duration;
@@ -47,7 +47,8 @@ impl Instant {
     /// println!("{:?}", new_now.duration_since(now));
     /// ```
     pub fn duration_since(&self, earlier: Instant) -> Duration {
-        self.0.checked_sub(earlier.0)
+        self.0
+            .checked_sub(earlier.0)
             .map(Duration::from_nanos)
             .expect("supplied instant is later than self")
     }
@@ -70,8 +71,7 @@ impl Instant {
     /// println!("{:?}", now.checked_duration_since(new_now)); // None
     /// ```
     pub fn checked_duration_since(&self, earlier: Instant) -> Option<Duration> {
-        self.0.checked_sub(earlier.0)
-            .map(Duration::from_nanos)
+        self.0.checked_sub(earlier.0).map(Duration::from_nanos)
     }
 
     /// Returns the amount of time elapsed from another instant to this one,
@@ -93,23 +93,21 @@ impl Instant {
     /// ```
     pub fn saturating_duration_since(&self, earlier: Instant) -> Duration {
         self.checked_duration_since(earlier)
-            .unwrap_or(Duration::new(0, 0))
+            .unwrap_or_else(|| Duration::new(0, 0))
     }
 
     /// Returns `Some(t)` where `t` is the time `self + duration` if `t` can be represented as
     /// `Instant` (which means it's inside the bounds of the underlying data structure), `None`
     /// otherwise.
     pub fn checked_add(&self, duration: Duration) -> Option<Instant> {
-        self.0.checked_add(duration.as_nanos() as u64)
-            .map(Instant)
+        self.0.checked_add(duration.as_nanos() as u64).map(Instant)
     }
 
     /// Returns `Some(t)` where `t` is the time `self - duration` if `t` can be represented as
     /// `Instant` (which means it's inside the bounds of the underlying data structure), `None`
     /// otherwise.
     pub fn checked_sub(&self, duration: Duration) -> Option<Instant> {
-        self.0.checked_sub(duration.as_nanos() as u64)
-            .map(Instant)
+        self.0.checked_sub(duration.as_nanos() as u64).map(Instant)
     }
 
     /// Gets the inner value of this `Instant`.
diff --git a/src/lib.rs b/src/lib.rs
index 5770a89..d957f7c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,8 +45,6 @@
 //! [clock_gettime]: https://linux.die.net/man/3/clock_gettime
 //! [#29722]: https://github.com/rust-lang/rust/issues/29722
 //! [tsc_support]: http://oliveryang.net/2015/09/pitfalls-of-TSC-usage/
-#![cfg_attr(feature = "tsc", feature(asm))]
-
 use std::sync::{
     atomic::{AtomicU64, Ordering},
     Arc,
@@ -56,7 +54,6 @@ use std::time::Duration;
 mod monotonic;
 use self::monotonic::Monotonic;
 mod counter;
-#[allow(unused_imports)]
 use self::counter::Counter;
 mod mock;
 pub use self::mock::{IntoNanoseconds, Mock};
@@ -69,9 +66,15 @@ static GLOBAL_RECENT: AtomicU64 = AtomicU64::new(0);
 
 type Reference = Monotonic;
 
-#[cfg(feature = "tsc")]
+#[cfg(all(
+    any(target_arch = "x86", target_arch = "x86_64"),
+    target_feature = "sse2"
+))]
 type Source = Counter;
-#[cfg(not(feature = "tsc"))]
+#[cfg(not(all(
+    any(target_arch = "x86", target_arch = "x86_64"),
+    target_feature = "sse2"
+)))]
 type Source = Monotonic;
 
 #[derive(Debug, Clone)]
@@ -167,7 +170,9 @@ impl Calibration {
 }
 
 impl Default for Calibration {
-    fn default() -> Self { Self::new() }
+    fn default() -> Self {
+        Self::new()
+    }
 }
 
 /// Unified clock for taking measurements.
@@ -301,10 +306,11 @@ impl Clock {
                 let scaled = if calibration.identical {
                     value
                 } else {
-                    (((value as f64 - calibration.src_time) * calibration.hz_ratio) + calibration.ref_time) as u64
+                    (((value as f64 - calibration.src_time) * calibration.hz_ratio)
+                        + calibration.ref_time) as u64
                 };
                 Instant(scaled)
-            },
+            }
             ClockType::Mock(_) => Instant(value),
         }
     }
@@ -320,7 +326,9 @@ impl Clock {
     pub fn delta(&self, start: u64, end: u64) -> Duration {
         let raw_delta = end.wrapping_sub(start);
         let scaled = match &self.inner {
-            ClockType::Optimized(_, _, calibration) => (raw_delta as f64 * calibration.hz_ratio) as u64,
+            ClockType::Optimized(_, _, calibration) => {
+                (raw_delta as f64 * calibration.hz_ratio) as u64
+            }
             ClockType::Mock(_) => raw_delta,
         };
         Duration::from_nanos(scaled)
@@ -354,11 +362,15 @@ impl Clock {
     }
 
     /// Updates the recent current time.
-    pub(crate) fn upkeep(value: Instant) { GLOBAL_RECENT.store(value.0, Ordering::Release); }
+    pub(crate) fn upkeep(value: Instant) {
+        GLOBAL_RECENT.store(value.0, Ordering::Release);
+    }
 }
 
 impl Default for Clock {
-    fn default() -> Clock { Clock::new() }
+    fn default() -> Clock {
+        Clock::new()
+    }
 }
 
 #[cfg(test)]
diff --git a/src/mock.rs b/src/mock.rs
index 2177e4d..26bdc68 100644
--- a/src/mock.rs
+++ b/src/mock.rs
@@ -17,11 +17,15 @@ pub trait IntoNanoseconds {
 }
 
 impl IntoNanoseconds for u64 {
-    fn into_nanos(self) -> u64 { self }
+    fn into_nanos(self) -> u64 {
+        self
+    }
 }
 
 impl IntoNanoseconds for Duration {
-    fn into_nanos(self) -> u64 { self.as_nanos() as u64 }
+    fn into_nanos(self) -> u64 {
+        self.as_nanos() as u64
+    }
 }
 
 /// Controllable time source for use in tests.
@@ -39,19 +43,27 @@ impl Mock {
 
     /// Increments the time by the given amount.
     pub fn increment<N: IntoNanoseconds>(&self, amount: N) {
-        self.offset.fetch_add(amount.into_nanos(), Ordering::Release);
+        self.offset
+            .fetch_add(amount.into_nanos(), Ordering::Release);
     }
 
     /// Decrements the time by the given amount.
     pub fn decrement<N: IntoNanoseconds>(&self, amount: N) {
-        self.offset.fetch_sub(amount.into_nanos(), Ordering::Release);
+        self.offset
+            .fetch_sub(amount.into_nanos(), Ordering::Release);
     }
 }
 
 impl ClockSource for Mock {
-    fn now(&self) -> u64 { self.offset.load(Ordering::Acquire) }
+    fn now(&self) -> u64 {
+        self.offset.load(Ordering::Acquire)
+    }
 
-    fn start(&self) -> u64 { self.now() }
+    fn start(&self) -> u64 {
+        self.now()
+    }
 
-    fn end(&self) -> u64 { self.now() }
+    fn end(&self) -> u64 {
+        self.now()
+    }
 }
diff --git a/src/monotonic.rs b/src/monotonic.rs
index a99a1bc..64e990e 100644
--- a/src/monotonic.rs
+++ b/src/monotonic.rs
@@ -1,6 +1,13 @@
 use crate::ClockSource;
 
-#[cfg(all(not(target_os = "macos"), not(target_os = "ios"), not(target_os = "windows")))]
+#[cfg(any(target_os = "macos", target_os = "ios"))]
+use mach::mach_time::{mach_continuous_time, mach_timebase_info};
+
+#[cfg(all(
+    not(target_os = "macos"),
+    not(target_os = "ios"),
+    not(target_os = "windows")
+))]
 #[derive(Debug, Clone)]
 pub struct Monotonic;
 
@@ -10,24 +17,41 @@ pub struct Monotonic {
     factor: u64,
 }
 
-#[cfg(all(not(target_os = "macos"), not(target_os = "ios"), not(target_os = "windows")))]
+#[cfg(all(
+    not(target_os = "macos"),
+    not(target_os = "ios"),
+    not(target_os = "windows")
+))]
 impl Monotonic {
-    pub fn new() -> Monotonic { Monotonic {} }
+    pub fn new() -> Monotonic {
+        Monotonic {}
+    }
 }
 
-#[cfg(all(not(target_os = "macos"), not(target_os = "ios"), not(target_os = "windows")))]
+#[cfg(all(
+    not(target_os = "macos"),
+    not(target_os = "ios"),
+    not(target_os = "windows")
+))]
 impl ClockSource for Monotonic {
     fn now(&self) -> u64 {
-        let mut ts = libc::timespec { tv_sec: 0, tv_nsec: 0 };
+        let mut ts = libc::timespec {
+            tv_sec: 0,
+            tv_nsec: 0,
+        };
         unsafe {
             libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut ts);
         }
         (ts.tv_sec as u64) * 1_000_000_000 + (ts.tv_nsec as u64)
     }
 
-    fn start(&self) -> u64 { self.now() }
+    fn start(&self) -> u64 {
+        self.now()
+    }
 
-    fn end(&self) -> u64 { self.now() }
+    fn end(&self) -> u64 {
+        self.now()
+    }
 }
 
 #[cfg(target_os = "windows")]
@@ -39,7 +63,9 @@ impl Monotonic {
         let denom = unsafe {
             let mut freq = mem::zeroed();
             if profileapi::QueryPerformanceFrequency(&mut freq) == 0 {
-                unreachable!("QueryPerformanceFrequency on Windows XP or later should never return zero!");
+                unreachable!(
+                    "QueryPerformanceFrequency on Windows XP or later should never return zero!"
+                );
             }
             *freq.QuadPart() as u64
         };
@@ -59,24 +85,30 @@ impl ClockSource for Monotonic {
         let raw = unsafe {
             let mut count = mem::zeroed();
             if profileapi::QueryPerformanceCounter(&mut count) == 0 {
-                unreachable!("QueryPerformanceCounter on Windows XP or later should never return zero!");
+                unreachable!(
+                    "QueryPerformanceCounter on Windows XP or later should never return zero!"
+                );
             }
             *count.QuadPart() as u64
         };
         raw * self.factor
     }
 
-    fn start(&self) -> u64 { self.now() }
+    fn start(&self) -> u64 {
+        self.now()
+    }
 
-    fn end(&self) -> u64 { self.now() }
+    fn end(&self) -> u64 {
+        self.now()
+    }
 }
 
 #[cfg(any(target_os = "macos", target_os = "ios"))]
 impl Monotonic {
     pub fn new() -> Monotonic {
-        let mut info = libc::mach_timebase_info { numer: 0, denom: 0 };
+        let mut info = mach_timebase_info { numer: 0, denom: 0 };
         unsafe {
-            libc::mach_timebase_info(&mut info);
+            mach_timebase_info(&mut info);
         }
 
         let factor = u64::from(info.numer) / u64::from(info.denom);
@@ -87,15 +119,21 @@ impl Monotonic {
 #[cfg(any(target_os = "macos", target_os = "ios"))]
 impl ClockSource for Monotonic {
     fn now(&self) -> u64 {
-        let raw = unsafe { libc::mach_absolute_time() };
+        let raw = unsafe { mach_continuous_time() };
         raw * self.factor
     }
 
-    fn start(&self) -> u64 { self.now() }
+    fn start(&self) -> u64 {
+        self.now()
+    }
 
-    fn end(&self) -> u64 { self.now() }
+    fn end(&self) -> u64 {
+        self.now()
+    }
 }
 
 impl Default for Monotonic {
-    fn default() -> Self { Self::new() }
+    fn default() -> Self {
+        Self::new()
+    }
 }
diff --git a/src/upkeep.rs b/src/upkeep.rs
index fb5400b..9e207f4 100644
--- a/src/upkeep.rs
+++ b/src/upkeep.rs
@@ -32,10 +32,14 @@ impl Builder {
     /// This creates a new internal clock for acquiring the current time.  If you have an existing
     /// [`Clock`] that is already calibrated, it is slightly faster to clone it and construct the
     /// builder with [`new_with_clock`](Builder::new_with_clock) to avoid recalibrating.
-    pub fn new(interval: Duration) -> Builder { Self::new_with_clock(interval, Clock::new()) }
+    pub fn new(interval: Duration) -> Builder {
+        Self::new_with_clock(interval, Clock::new())
+    }
 
     /// Creates a new [`Builder`] with the specified [`Clock`] instance.
-    pub fn new_with_clock(interval: Duration, clock: Clock) -> Builder { Builder { interval, clock } }
+    pub fn new_with_clock(interval: Duration, clock: Clock) -> Builder {
+        Builder { interval, clock }
+    }
 
     /// Start the upkeep thread, periodically updating the global coarse time.
     ///