Auto merge of #38357 - arielb1:deterministic-hash, r=michaelwoerister

Make `deterministic_hash` host-architecture-independent. `DefPath::deterministic_hash` used to call `std::hash::Hash`, which depends on the current architecture in several ways; this would prevent metadata written on one host architecture from being read successfully on another. Use a hasher we control instead. Fixes #38177. r? @michaelwoerister
rust-lang · Dec 16, 2016 · c6d8ab0 · c6d8ab0
2 parents 8ae9040 + e1d4b8f
commit c6d8ab0
Show file tree

Hide file tree

Showing 14 changed files with 269 additions and 229 deletions.
diff --git a/src/librustc/hir/map/definitions.rs b/src/librustc/hir/map/definitions.rs
@@ -10,9 +10,9 @@
 
 use hir::def_id::{CrateNum, DefId, DefIndex, LOCAL_CRATE};
 use rustc_data_structures::fx::FxHashMap;
+use rustc_data_structures::stable_hasher::StableHasher;
 use std::fmt::Write;
 use std::hash::{Hash, Hasher};
-use std::collections::hash_map::DefaultHasher;
 use syntax::ast;
 use syntax::symbol::{Symbol, InternedString};
 use ty::TyCtxt;
@@ -131,7 +131,8 @@ impl DefPath {
     }
 
     pub fn deterministic_hash(&self, tcx: TyCtxt) -> u64 {
-        let mut state = DefaultHasher::new();
+        debug!("deterministic_hash({:?})", self);
+        let mut state = StableHasher::new();
         self.deterministic_hash_to(tcx, &mut state);
         state.finish()
     }
@@ -377,4 +378,3 @@ impl DefPathData {
         self.as_interned_str().to_string()
     }
 }
-
diff --git a/src/librustc/ty/util.rs b/src/librustc/ty/util.rs
@@ -24,11 +24,11 @@ use util::nodemap::FxHashMap;
 use middle::lang_items;
 
 use rustc_const_math::{ConstInt, ConstIsize, ConstUsize};
+use rustc_data_structures::stable_hasher::{StableHasher, StableHasherResult};
 
 use std::cell::RefCell;
 use std::cmp;
-use std::hash::{Hash, Hasher};
-use std::collections::hash_map::DefaultHasher;
+use std::hash::Hash;
 use std::intrinsics;
 use syntax::ast::{self, Name};
 use syntax::attr::{self, SignedInt, UnsignedInt};
@@ -349,7 +349,7 @@ impl<'a, 'gcx, 'tcx> TyCtxt<'a, 'gcx, 'tcx> {
     /// Creates a hash of the type `Ty` which will be the same no matter what crate
     /// context it's calculated within. This is used by the `type_id` intrinsic.
     pub fn type_id_hash(self, ty: Ty<'tcx>) -> u64 {
-        let mut hasher = TypeIdHasher::new(self, DefaultHasher::default());
+        let mut hasher = TypeIdHasher::new(self);
         hasher.visit_ty(ty);
         hasher.finish()
     }
@@ -395,96 +395,26 @@ impl<'a, 'gcx, 'tcx> TyCtxt<'a, 'gcx, 'tcx> {
     }
 }
 
-/// When hashing a type this ends up affecting properties like symbol names. We
-/// want these symbol names to be calculated independent of other factors like
-/// what architecture you're compiling *from*.
-///
-/// The hashing just uses the standard `Hash` trait, but the implementations of
-/// `Hash` for the `usize` and `isize` types are *not* architecture independent
-/// (e.g. they has 4 or 8 bytes). As a result we want to avoid `usize` and
-/// `isize` completely when hashing. To ensure that these don't leak in we use a
-/// custom hasher implementation here which inflates the size of these to a `u64`
-/// and `i64`.
-///
-/// The same goes for endianess: We always convert multi-byte integers to little
-/// endian before hashing.
-#[derive(Debug)]
-pub struct ArchIndependentHasher<H> {
-    inner: H,
-}
-
-impl<H> ArchIndependentHasher<H> {
-    pub fn new(inner: H) -> ArchIndependentHasher<H> {
-        ArchIndependentHasher { inner: inner }
-    }
-
-    pub fn into_inner(self) -> H {
-        self.inner
-    }
+pub struct TypeIdHasher<'a, 'gcx: 'a+'tcx, 'tcx: 'a, W> {
+    tcx: TyCtxt<'a, 'gcx, 'tcx>,
+    state: StableHasher<W>,
 }
 
-impl<H: Hasher> Hasher for ArchIndependentHasher<H> {
-    fn write(&mut self, bytes: &[u8]) {
-        self.inner.write(bytes)
-    }
-
-    fn finish(&self) -> u64 {
-        self.inner.finish()
-    }
-
-    fn write_u8(&mut self, i: u8) {
-        self.inner.write_u8(i)
-    }
-    fn write_u16(&mut self, i: u16) {
-        self.inner.write_u16(i.to_le())
-    }
-    fn write_u32(&mut self, i: u32) {
-        self.inner.write_u32(i.to_le())
-    }
-    fn write_u64(&mut self, i: u64) {
-        self.inner.write_u64(i.to_le())
-    }
-    fn write_usize(&mut self, i: usize) {
-        self.inner.write_u64((i as u64).to_le())
+impl<'a, 'gcx, 'tcx, W> TypeIdHasher<'a, 'gcx, 'tcx, W>
+    where W: StableHasherResult
+{
+    pub fn new(tcx: TyCtxt<'a, 'gcx, 'tcx>) -> Self {
+        TypeIdHasher { tcx: tcx, state: StableHasher::new() }
     }
-    fn write_i8(&mut self, i: i8) {
-        self.inner.write_i8(i)
-    }
-    fn write_i16(&mut self, i: i16) {
-        self.inner.write_i16(i.to_le())
-    }
-    fn write_i32(&mut self, i: i32) {
-        self.inner.write_i32(i.to_le())
-    }
-    fn write_i64(&mut self, i: i64) {
-        self.inner.write_i64(i.to_le())
-    }
-    fn write_isize(&mut self, i: isize) {
-        self.inner.write_i64((i as i64).to_le())
-    }
-}
-
-pub struct TypeIdHasher<'a, 'gcx: 'a+'tcx, 'tcx: 'a, H> {
-    tcx: TyCtxt<'a, 'gcx, 'tcx>,
-    state: ArchIndependentHasher<H>,
-}
 
-impl<'a, 'gcx, 'tcx, H: Hasher> TypeIdHasher<'a, 'gcx, 'tcx, H> {
-    pub fn new(tcx: TyCtxt<'a, 'gcx, 'tcx>, state: H) -> Self {
-        TypeIdHasher {
-            tcx: tcx,
-            state: ArchIndependentHasher::new(state),
-        }
+    pub fn finish(self) -> W {
+        self.state.finish()
     }
 
     pub fn hash<T: Hash>(&mut self, x: T) {
         x.hash(&mut self.state);
     }
 
-    pub fn finish(self) -> u64 {
-        self.state.finish()
-    }
-
     fn hash_discriminant_u8<T>(&mut self, x: &T) {
         let v = unsafe {
             intrinsics::discriminant_value(x)
@@ -504,13 +434,11 @@ impl<'a, 'gcx, 'tcx, H: Hasher> TypeIdHasher<'a, 'gcx, 'tcx, H> {
     pub fn def_path(&mut self, def_path: &ast_map::DefPath) {
         def_path.deterministic_hash_to(self.tcx, &mut self.state);
     }
-
-    pub fn into_inner(self) -> H {
-        self.state.inner
-    }
 }
 
-impl<'a, 'gcx, 'tcx, H: Hasher> TypeVisitor<'tcx> for TypeIdHasher<'a, 'gcx, 'tcx, H> {
+impl<'a, 'gcx, 'tcx, W> TypeVisitor<'tcx> for TypeIdHasher<'a, 'gcx, 'tcx, W>
+    where W: StableHasherResult
+{
     fn visit_ty(&mut self, ty: Ty<'tcx>) -> bool {
         // Distinguish between the Ty variants uniformly.
         self.hash_discriminant_u8(&ty.sty);

diff --git a/src/librustc_data_structures/lib.rs b/src/librustc_data_structures/lib.rs
@@ -44,6 +44,8 @@ extern crate serialize as rustc_serialize; // used by deriving
 #[cfg(unix)]
 extern crate libc;
 
+pub use rustc_serialize::hex::ToHex;
+
 pub mod array_vec;
 pub mod accumulate_vec;
 pub mod small_vec;
@@ -59,6 +61,7 @@ pub mod indexed_vec;
 pub mod obligation_forest;
 pub mod snapshot_map;
 pub mod snapshot_vec;
+pub mod stable_hasher;
 pub mod transitive_relation;
 pub mod unify;
 pub mod fnv;

diff --git a/src/librustc_data_structures/stable_hasher.rs b/src/librustc_data_structures/stable_hasher.rs
@@ -0,0 +1,176 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::hash::Hasher;
+use std::marker::PhantomData;
+use std::mem;
+use blake2b::Blake2bHasher;
+use rustc_serialize::leb128;
+
+fn write_unsigned_leb128_to_buf(buf: &mut [u8; 16], value: u64) -> usize {
+    leb128::write_unsigned_leb128_to(value, |i, v| buf[i] = v) // callback stores byte `v` at index `i` of `buf`
+}
+
+fn write_signed_leb128_to_buf(buf: &mut [u8; 16], value: i64) -> usize {
+    leb128::write_signed_leb128_to(value, |i, v| buf[i] = v) // callback stores byte `v` at index `i` of `buf`
+}
+
+/// Hashing something can end up affecting properties like symbol names, and we
+/// want these symbol names to be calculated independently of other factors like
+/// what architecture you're compiling *from*.
+///
+/// The hashing just uses the standard `Hash` trait, but the implementations of
+/// `Hash` for the `usize` and `isize` types are *not* architecture independent
+/// (e.g. they have 4 or 8 bytes). As a result we want to avoid `usize` and
+/// `isize` completely when hashing.
+///
+/// To do that, we encode all integers to be hashed with some
+/// arch-independent encoding.
+///
+/// At the moment, we pass i8/u8 straight through and encode
+/// all other integers using leb128.
+///
+/// This hasher currently always uses the stable Blake2b algorithm
+/// and allows for variable output lengths through its type
+/// parameter.
+#[derive(Debug)]
+pub struct StableHasher<W> {
+    state: Blake2bHasher, // underlying Blake2b state that receives every hashed byte
+    bytes_hashed: u64, // running count of the bytes fed into `state`
+    width: PhantomData<W>, // marker only: `W` is the result type returned by `finish`
+}
+
+pub trait StableHasherResult: Sized {
+    fn finish(hasher: StableHasher<Self>) -> Self; // consumes the hasher and yields the final value
+}
+
+impl<W: StableHasherResult> StableHasher<W> {
+    pub fn new() -> Self {
+        StableHasher {
+            state: Blake2bHasher::new(mem::size_of::<W>(), &[]), // first argument is size_of::<W>(), so the output length follows `W`
+            bytes_hashed: 0,
+            width: PhantomData,
+        }
+    }
+
+    pub fn finish(self) -> W {
+        W::finish(self) // delegates to the `StableHasherResult` impl for `W`
+    }
+}
+
+impl StableHasherResult for [u8; 20] {
+    fn finish(mut hasher: StableHasher<Self>) -> Self {
+        let mut result: [u8; 20] = [0; 20];
+        result.copy_from_slice(hasher.state.finalize()); // copy_from_slice panics unless finalize() yields exactly 20 bytes
+        result
+    }
+}
+
+impl StableHasherResult for u64 {
+    fn finish(mut hasher: StableHasher<Self>) -> Self {
+        hasher.state.finalize(); // finalize first; the returned byte slice is not used here
+        hasher.state.finish() // the u64 result comes from `finish()` on the Blake2b state
+    }
+}
+
+impl<W> StableHasher<W> {
+    #[inline]
+    pub fn finalize(&mut self) -> &[u8] {
+        self.state.finalize() // forwards to the Blake2b state
+    }
+
+    #[inline]
+    pub fn bytes_hashed(&self) -> u64 {
+        self.bytes_hashed
+    }
+
+    #[inline]
+    fn write_uleb128(&mut self, value: u64) {
+        let mut buf = [0; 16]; // scratch space for the encoded bytes
+        let len = write_unsigned_leb128_to_buf(&mut buf, value);
+        self.state.write(&buf[..len]); // hash only the first `len` bytes of the scratch buffer
+        self.bytes_hashed += len as u64;
+    }
+
+    #[inline]
+    fn write_ileb128(&mut self, value: i64) {
+        let mut buf = [0; 16]; // scratch space for the encoded bytes
+        let len = write_signed_leb128_to_buf(&mut buf, value);
+        self.state.write(&buf[..len]); // hash only the first `len` bytes of the scratch buffer
+        self.bytes_hashed += len as u64;
+    }
+}
+
+// For integers other than u8/i8 we leb128 encode them first. Because small
+// integers dominate, this significantly and cheaply reduces the number of
+// bytes hashed, which is good because blake2b is expensive.
+impl<W> Hasher for StableHasher<W> {
+    fn finish(&self) -> u64 {
+        panic!("use StableHasher::finish instead"); // always panics; the inherent `finish` is the way to get a result
+    }
+
+    #[inline]
+    fn write(&mut self, bytes: &[u8]) {
+        self.state.write(bytes); // raw bytes are hashed as-is
+        self.bytes_hashed += bytes.len() as u64;
+    }
+
+    #[inline]
+    fn write_u8(&mut self, i: u8) {
+        self.state.write_u8(i); // single bytes bypass the leb128 encoding
+        self.bytes_hashed += 1;
+    }
+
+    #[inline]
+    fn write_u16(&mut self, i: u16) {
+        self.write_uleb128(i as u64); // widened to u64, then unsigned-leb128 encoded
+    }
+
+    #[inline]
+    fn write_u32(&mut self, i: u32) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u64(&mut self, i: u64) {
+        self.write_uleb128(i);
+    }
+
+    #[inline]
+    fn write_usize(&mut self, i: usize) {
+        self.write_uleb128(i as u64); // cast to u64 first, so the same encoder is used whatever the host's usize width
+    }
+
+    #[inline]
+    fn write_i8(&mut self, i: i8) {
+        self.state.write_i8(i); // single bytes bypass the leb128 encoding
+        self.bytes_hashed += 1;
+    }
+
+    #[inline]
+    fn write_i16(&mut self, i: i16) {
+        self.write_ileb128(i as i64); // sign-extended to i64, then signed-leb128 encoded
+    }
+
+    #[inline]
+    fn write_i32(&mut self, i: i32) {
+        self.write_ileb128(i as i64);
+    }
+
+    #[inline]
+    fn write_i64(&mut self, i: i64) {
+        self.write_ileb128(i);
+    }
+
+    #[inline]
+    fn write_isize(&mut self, i: isize) {
+        self.write_ileb128(i as i64); // cast to i64 first, so the same encoder is used whatever the host's isize width
+    }
+}