From d061fee177c70ae146db2b9720c85dc1f38215af Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Mon, 28 Sep 2020 17:34:27 -0700 Subject: [PATCH] Stable hashing: add comments and tests concerning platform-independence SipHasher128 implements short_write in an endian-independent way, yet its write_xxx Hasher trait methods undo this endian-independence by byte swapping the integer inputs on big-endian hardware. StableHasher then adds endian-independence back by also byte-swapping on big-endian hardware prior to invoking SipHasher128. This double swap may have the appearance of being a no-op, but is in fact by design. In particular, we really do want SipHasher128 to be platform-dependent, in order to be consistent with the libstd SipHasher. Try to clarify this intent. Also, add and update a couple of unit tests. --- compiler/rustc_data_structures/src/sip128.rs | 25 +++++-- .../rustc_data_structures/src/sip128/tests.rs | 56 +++++++++++--- .../src/stable_hasher.rs | 6 +- .../src/stable_hasher/tests.rs | 73 +++++++++++++++++++ 4 files changed, 142 insertions(+), 18 deletions(-) create mode 100644 compiler/rustc_data_structures/src/stable_hasher/tests.rs diff --git a/compiler/rustc_data_structures/src/sip128.rs b/compiler/rustc_data_structures/src/sip128.rs index beb28dd072058..2c4eff618c685 100644 --- a/compiler/rustc_data_structures/src/sip128.rs +++ b/compiler/rustc_data_structures/src/sip128.rs @@ -125,15 +125,28 @@ impl SipHasher128 { // A specialized write function for values with size <= 8. // - // The hashing of multi-byte integers depends on endianness. E.g.: + // The input must be zero-extended to 64-bits by the caller. This extension + // isn't hashed, but the implementation requires it for correctness. + // + // This function, given the same integer size and value, has the same effect + // on both little- and big-endian hardware. It operates on values without + // depending on their sequence in memory, so is independent of endianness. + // + // However, we want SipHasher128 to be platform-dependent, in order to be + // consistent with the platform-dependent SipHasher in libstd. In other + // words, we want: + // // - little-endian: `write_u32(0xDDCCBBAA)` == `write([0xAA, 0xBB, 0xCC, 0xDD])` // - big-endian: `write_u32(0xDDCCBBAA)` == `write([0xDD, 0xCC, 0xBB, 0xAA])` // - // This function does the right thing for little-endian hardware. On - // big-endian hardware `x` must be byte-swapped first to give the right - // behaviour. After any byte-swapping, the input must be zero-extended to - // 64-bits. The caller is responsible for the byte-swapping and - // zero-extension. + // Therefore, in order to produce endian-dependent results, SipHasher128's + // `write_xxx` Hasher trait methods byte-swap `x` prior to zero-extending. + // + // If clients of SipHasher128 itself want platform-independent results, they + // *also* must byte-swap integer inputs before invoking the `write_xxx` + // methods on big-endian hardware (that is, two byte-swaps must occur--one + // in the client, and one in SipHasher128). Additionally, they must extend + // `usize` and `isize` types to 64 bits on 32-bit systems. #[inline] fn short_write(&mut self, _x: T, x: u64) { let size = mem::size_of::(); diff --git a/compiler/rustc_data_structures/src/sip128/tests.rs b/compiler/rustc_data_structures/src/sip128/tests.rs index 80b7fc7475610..2e2274a7b775e 100644 --- a/compiler/rustc_data_structures/src/sip128/tests.rs +++ b/compiler/rustc_data_structures/src/sip128/tests.rs @@ -1,7 +1,6 @@ use super::*; use std::hash::{Hash, Hasher}; -use std::{mem, slice}; // Hash just the bytes of the slice, without length prefix struct Bytes<'a>(&'a [u8]); @@ -399,20 +398,55 @@ fn test_hash_no_concat_alias() { } #[test] -fn test_write_short_works() { - let test_usize = 0xd0c0b0a0usize; +fn test_short_write_works() { + let test_u8 = 0xFF_u8; + let test_u16 = 0x1122_u16; + let test_u32 = 0x22334455_u32; + let test_u64 = 0x33445566_778899AA_u64; + let test_u128 = 0x11223344_55667788_99AABBCC_DDEEFF77_u128; + let test_usize = 0xD0C0B0A0_usize; + + let test_i8 = -1_i8; + let test_i16 = -2_i16; + let test_i32 = -3_i32; + let test_i64 = -4_i64; + let test_i128 = -5_i128; + let test_isize = -6_isize; + let mut h1 = SipHasher128::new_with_keys(0, 0); - h1.write_usize(test_usize); h1.write(b"bytes"); h1.write(b"string"); - h1.write_u8(0xFFu8); - h1.write_u8(0x01u8); + h1.write_u8(test_u8); + h1.write_u16(test_u16); + h1.write_u32(test_u32); + h1.write_u64(test_u64); + h1.write_u128(test_u128); + h1.write_usize(test_usize); + h1.write_i8(test_i8); + h1.write_i16(test_i16); + h1.write_i32(test_i32); + h1.write_i64(test_i64); + h1.write_i128(test_i128); + h1.write_isize(test_isize); + let mut h2 = SipHasher128::new_with_keys(0, 0); - h2.write(unsafe { - slice::from_raw_parts(&test_usize as *const _ as *const u8, mem::size_of::()) - }); h2.write(b"bytes"); h2.write(b"string"); - h2.write(&[0xFFu8, 0x01u8]); - assert_eq!(h1.finish128(), h2.finish128()); + h2.write(&test_u8.to_ne_bytes()); + h2.write(&test_u16.to_ne_bytes()); + h2.write(&test_u32.to_ne_bytes()); + h2.write(&test_u64.to_ne_bytes()); + h2.write(&test_u128.to_ne_bytes()); + h2.write(&test_usize.to_ne_bytes()); + h2.write(&test_i8.to_ne_bytes()); + h2.write(&test_i16.to_ne_bytes()); + h2.write(&test_i32.to_ne_bytes()); + h2.write(&test_i64.to_ne_bytes()); + h2.write(&test_i128.to_ne_bytes()); + h2.write(&test_isize.to_ne_bytes()); + + let h1_hash = h1.finish128(); + let h2_hash = h2.finish128(); + + assert_eq!(h1_hash, h2_hash); } diff --git a/compiler/rustc_data_structures/src/stable_hasher.rs b/compiler/rustc_data_structures/src/stable_hasher.rs index c1c79b174f415..68875b3fbde9b 100644 --- a/compiler/rustc_data_structures/src/stable_hasher.rs +++ b/compiler/rustc_data_structures/src/stable_hasher.rs @@ -5,6 +5,9 @@ use smallvec::SmallVec; use std::hash::{BuildHasher, Hash, Hasher}; use std::mem; +#[cfg(test)] +mod tests; + /// When hashing something that ends up affecting properties like symbol names, /// we want these symbol names to be calculated independently of other factors /// like what architecture you're compiling *from*. @@ -129,7 +132,8 @@ impl Hasher for StableHasher { fn write_isize(&mut self, i: isize) { // Always treat isize as i64 so we get the same results on 32 and 64 bit // platforms. This is important for symbol hashes when cross compiling, - // for example. + // for example. Sign extending here is preferable as it means that the + // same negative number hashes the same on both 32 and 64 bit platforms. self.state.write_i64((i as i64).to_le()); } } diff --git a/compiler/rustc_data_structures/src/stable_hasher/tests.rs b/compiler/rustc_data_structures/src/stable_hasher/tests.rs new file mode 100644 index 0000000000000..cd6ff96a555f4 --- /dev/null +++ b/compiler/rustc_data_structures/src/stable_hasher/tests.rs @@ -0,0 +1,73 @@ +use super::*; + +// The tests below compare the computed hashes to particular expected values +// in order to test that we produce the same results on different platforms, +// regardless of endianness and `usize` and `isize` size differences (this +// of course assumes we run these tests on platforms that differ in those +// ways). The expected values depend on the hashing algorithm used, so they +// need to be updated whenever StableHasher changes its hashing algorithm. + +#[test] +fn test_hash_integers() { + // Test that integers are handled consistently across platforms. + let test_u8 = 0xAB_u8; + let test_u16 = 0xFFEE_u16; + let test_u32 = 0x445577AA_u32; + let test_u64 = 0x01234567_13243546_u64; + let test_u128 = 0x22114433_66557788_99AACCBB_EEDDFF77_u128; + let test_usize = 0xD0C0B0A0_usize; + + let test_i8 = -100_i8; + let test_i16 = -200_i16; + let test_i32 = -300_i32; + let test_i64 = -400_i64; + let test_i128 = -500_i128; + let test_isize = -600_isize; + + let mut h = StableHasher::new(); + test_u8.hash(&mut h); + test_u16.hash(&mut h); + test_u32.hash(&mut h); + test_u64.hash(&mut h); + test_u128.hash(&mut h); + test_usize.hash(&mut h); + test_i8.hash(&mut h); + test_i16.hash(&mut h); + test_i32.hash(&mut h); + test_i64.hash(&mut h); + test_i128.hash(&mut h); + test_isize.hash(&mut h); + + // This depends on the hashing algorithm. See note at top of file. + let expected = (2736651863462566372, 8121090595289675650); + + assert_eq!(h.finalize(), expected); +} + +#[test] +fn test_hash_usize() { + // Test that usize specifically is handled consistently across platforms. + let test_usize = 0xABCDEF01_usize; + + let mut h = StableHasher::new(); + test_usize.hash(&mut h); + + // This depends on the hashing algorithm. See note at top of file. + let expected = (5798740672699530587, 11186240177685111648); + + assert_eq!(h.finalize(), expected); +} + +#[test] +fn test_hash_isize() { + // Test that isize specifically is handled consistently across platforms. + let test_isize = -7_isize; + + let mut h = StableHasher::new(); + test_isize.hash(&mut h); + + // This depends on the hashing algorithm. See note at top of file. + let expected = (14721296605626097289, 11385941877786388409); + + assert_eq!(h.finalize(), expected); +}