From a40ea03a169a9f28fefd4382d95369f6b7a464cd Mon Sep 17 00:00:00 2001 From: Michael Cuevas Date: Mon, 10 Jul 2023 09:41:17 -0700 Subject: [PATCH] add FilteredObjectId skeleton Summary: Initial design of a FilteredObjectId. This will be used by the FilteredBackingStore to wrap ObjectIds used by other BackingStores. ** this is just an initial implementation. It will probably evolve as I start implementing FilteredBackingStore and start uncovering issues with the design ** Reviewed By: kmancini Differential Revision: D46010041 fbshipit-source-id: a85ea448af0033278b0e9b4ac7208fec1f33d0d6 --- eden/fs/store/filter/CMakeLists.txt | 19 ++ eden/fs/store/filter/FilteredObjectId.cpp | 219 ++++++++++++++++++ eden/fs/store/filter/FilteredObjectId.h | 197 ++++++++++++++++ .../filter/test/FilteredObjectIdTest.cpp | 101 ++++++++ 4 files changed, 536 insertions(+) create mode 100644 eden/fs/store/filter/CMakeLists.txt create mode 100644 eden/fs/store/filter/FilteredObjectId.cpp create mode 100644 eden/fs/store/filter/FilteredObjectId.h create mode 100644 eden/fs/store/filter/test/FilteredObjectIdTest.cpp diff --git a/eden/fs/store/filter/CMakeLists.txt b/eden/fs/store/filter/CMakeLists.txt new file mode 100644 index 0000000000000..ccb7f504248f2 --- /dev/null +++ b/eden/fs/store/filter/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2. + +file(GLOB STORE_FILTER_SRCS "*.cpp") + +add_library( + eden_store_filter STATIC + ${STORE_FILTER_SRCS} +) + +target_link_libraries( + eden_store_filter + PUBLIC + eden_model + eden_utils + ${RE2_LIBRARY} +) diff --git a/eden/fs/store/filter/FilteredObjectId.cpp b/eden/fs/store/filter/FilteredObjectId.cpp new file mode 100644 index 0000000000000..bb0d037601f5d --- /dev/null +++ b/eden/fs/store/filter/FilteredObjectId.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ *
+ * This software may be used and distributed according to the terms of the
+ * GNU General Public License version 2.
+ */
+
+#include "eden/fs/store/filter/FilteredObjectId.h"
+
+#include <folly/Varint.h>
+#include <folly/logging/xlog.h>
+
+#include <stdexcept>
+#include <string_view>
+
+#include "eden/fs/utils/Throw.h"
+
+using folly::ByteRange;
+using folly::Endian;
+using folly::StringPiece;
+using std::string;
+
+namespace facebook::eden {
+
+namespace {
+
+/**
+ * Consume one `<varint length><bytes>` field from the front of `remaining`.
+ *
+ * On return `remaining` starts at the first byte after the field, and the
+ * returned piece views the field's bytes (it aliases the same buffer as
+ * `remaining`; nothing is copied).
+ *
+ * Throws std::invalid_argument if the decoded length is larger than the
+ * number of bytes left. folly::decodeVarint() reports a malformed or
+ * truncated varint by throwing on its own.
+ *
+ * `fieldName` is only used to build the error message.
+ */
+StringPiece consumeLengthPrefixedField(
+    StringPiece& remaining,
+    std::string_view fieldName) {
+  // decodeVarint() advances `remaining` past the varint for us.
+  const auto length = folly::decodeVarint(remaining);
+  if (length > remaining.size()) {
+    throwf<std::invalid_argument>(
+        "FilteredObjectId {} length {} exceeds the {} bytes remaining",
+        fieldName,
+        length,
+        remaining.size());
+  }
+  StringPiece field{remaining.begin(), static_cast<size_t>(length)};
+  remaining.advance(field.size());
+  return field;
+}
+
+} // namespace
+
+/**
+ * Serialize a blob id as: <type byte><wrapped ObjectId bytes>.
+ */
+std::string FilteredObjectId::serializeBlob(const ObjectId& object) {
+  const auto objectBytes = object.asString();
+
+  std::string buf;
+  // Reserve from the id's byte length. sizeof(object) would be the size of
+  // the ObjectId class itself, which says nothing about how many bytes it
+  // holds.
+  buf.reserve(sizeof(uint8_t) + objectBytes.size());
+  buf.push_back(static_cast<char>(FilteredObjectId::OBJECT_TYPE_BLOB));
+  buf.append(objectBytes);
+  return buf;
+}
+
+/**
+ * Serialize a tree id as:
+ *
+ *   <type byte><filter id length varint><filter id>
+ *   <path length varint><path><wrapped ObjectId bytes>
+ *
+ * The wrapped ObjectId carries no length prefix; it is whatever follows the
+ * path.
+ */
+std::string FilteredObjectId::serializeTree(
+    RelativePathPiece path,
+    std::string_view filterId,
+    const ObjectId& object) {
+  const auto& pathBytes = path.value();
+  const size_t pathLen = pathBytes.length();
+  uint8_t pathVarint[folly::kMaxVarintLength64] = {};
+  const size_t pathVarintLen = folly::encodeVarint(pathLen, pathVarint);
+  XLOGF(
+      DBG9,
+      "pathLen: {}, pathVarint: {}, pathVarintLen: {}",
+      pathLen,
+      pathVarint,
+      pathVarintLen);
+
+  const size_t filterLen = filterId.length();
+  uint8_t filterVarint[folly::kMaxVarintLength64] = {};
+  const size_t filterVarintLen = folly::encodeVarint(filterLen, filterVarint);
+  // Same verbosity as the path log above: this runs for every tree id that
+  // gets built, so it must not be emitted at INFO.
+  XLOGF(
+      DBG9,
+      "filterLen: {}, filterVarint: {}, filterVarintLen: {}",
+      filterLen,
+      filterVarint,
+      filterVarintLen);
+
+  const auto objectBytes = object.asString();
+
+  std::string buf;
+  buf.reserve(
+      sizeof(uint8_t) + filterVarintLen + filterLen + pathVarintLen + pathLen +
+      objectBytes.size());
+  buf.push_back(static_cast<char>(FilteredObjectId::OBJECT_TYPE_TREE));
+  buf.append(reinterpret_cast<const char*>(filterVarint), filterVarintLen);
+  buf.append(filterId);
+  buf.append(reinterpret_cast<const char*>(pathVarint), pathVarintLen);
+  buf.append(pathBytes.begin(), pathBytes.end());
+  buf.append(objectBytes);
+  return buf;
+}
+
+/**
+ * Return the path stored in a tree id. The result views the bytes owned by
+ * this FilteredObjectId.
+ *
+ * Throws std::invalid_argument if this is not a tree id, or if the stored
+ * lengths do not fit in the serialized data.
+ */
+RelativePathPiece FilteredObjectId::path() const {
+  if (value_.data()[0] != FilteredObjectId::OBJECT_TYPE_TREE) {
+    // We don't know the path of non-tree objects. Throw.
+    throwf<std::invalid_argument>(
+        "Cannot determine path of non-tree FilteredObjectId: {}", value_);
+  }
+
+  StringPiece remaining{value_};
+  // Skip the first byte of data that contains the type.
+  remaining.advance(sizeof(uint8_t));
+  // The filter id is stored before the path; step over it.
+  consumeLengthPrefixedField(remaining, "filter id");
+  const StringPiece pathBytes = consumeLengthPrefixedField(remaining, "path");
+
+  // value_ was built with a known good RelativePath, thus we don't need
+  // to recheck it when deserializing.
+  return RelativePathPiece{pathBytes, detail::SkipPathSanityCheck{}};
+}
+
+/**
+ * Return the filter id stored in a tree id. The result views the bytes owned
+ * by this FilteredObjectId.
+ *
+ * Throws std::invalid_argument if this is not a tree id, or if the stored
+ * length does not fit in the serialized data.
+ */
+StringPiece FilteredObjectId::filter() const {
+  if (value_.data()[0] != FilteredObjectId::OBJECT_TYPE_TREE) {
+    // We don't know the filter of non-tree objects. Throw.
+    throwf<std::invalid_argument>(
+        "Cannot determine filter for non-tree FilteredObjectId: {}", value_);
+  }
+
+  StringPiece remaining{value_};
+  // Skip the first byte of data that contains the type.
+  remaining.advance(sizeof(uint8_t));
+  return consumeLengthPrefixedField(remaining, "filter id");
+}
+
+/**
+ * Return the wrapped ObjectId. Works for both blob and tree ids; throws if
+ * the type byte is neither.
+ */
+ObjectId FilteredObjectId::object() const {
+  StringPiece remaining{value_};
+  switch (value_.data()[0]) {
+    case FilteredObjectId::OBJECT_TYPE_TREE: {
+      // Skip the type byte, then the filter id and the path. Everything
+      // that is left is the wrapped ObjectId.
+      remaining.advance(sizeof(uint8_t));
+      consumeLengthPrefixedField(remaining, "filter id");
+      consumeLengthPrefixedField(remaining, "path");
+      return ObjectId{remaining};
+    }
+
+    case FilteredObjectId::OBJECT_TYPE_BLOB: {
+      // Blob ids are just the type byte followed by the wrapped ObjectId.
+      remaining.advance(sizeof(uint8_t));
+      return ObjectId{remaining};
+    }
+  }
+  // Unknown FilteredObjectId type. Throw.
+  throwf<std::runtime_error>(
+      "Unknown FilteredObjectId type: {}", value_.data()[0]);
+}
+
+// Since some FilteredObjectIds are created without validation, we should
+// validate that we return a valid type.
+FilteredObjectId::FilteredObjectIdType FilteredObjectId::objectType() const {
+  switch (value_.data()[0]) {
+    case FilteredObjectId::OBJECT_TYPE_TREE:
+      return FilteredObjectIdType::OBJECT_TYPE_TREE;
+    case FilteredObjectId::OBJECT_TYPE_BLOB:
+      return FilteredObjectIdType::OBJECT_TYPE_BLOB;
+  }
+  // Unknown FilteredObjectId type. Throw.
+  throwf<std::runtime_error>("Unknown FilteredObjectId type: {}", value_[0]);
+}
+
+// It's possible that FilteredObjectIds with different filterIds evaluate to
+// the same underlying object. However, that's not for the FilteredObjectId
+// implementation to decide. This implementation strictly checks if the FOID
+// contents are byte-wise equal.
+bool FilteredObjectId::operator==(const FilteredObjectId& otherHash) const {
+  return value_ == otherHash.value_;
+}
+
+// Ordering, like equality, is a plain byte-wise comparison of the serialized
+// ids.
+bool FilteredObjectId::operator<(const FilteredObjectId& otherHash) const {
+  return value_ < otherHash.value_;
+}
+
+/**
+ * Check that value_ has the shape of a serialized FilteredObjectId.
+ *
+ * Blob ids only have their type byte checked. Tree ids additionally have
+ * their filter id and path length prefixes decoded and checked against the
+ * number of bytes actually present.
+ *
+ * Throws std::invalid_argument on any failure.
+ */
+void FilteredObjectId::validate() {
+  ByteRange infoBytes = folly::Range{value_.data(), value_.size()};
+  XLOGF(DBG9, "{}", value_);
+
+  // An id with no bytes has no type byte to inspect.
+  if (infoBytes.empty()) {
+    throw std::invalid_argument("Invalid FilteredObjectId: value is empty");
+  }
+
+  // Ensure the type byte is valid
+  uint8_t typeByte = infoBytes.data()[0];
+  if (typeByte != FilteredObjectId::OBJECT_TYPE_BLOB &&
+      typeByte != FilteredObjectId::OBJECT_TYPE_TREE) {
+    auto msg = fmt::format(
+        "Invalid FilteredObjectId type byte {}. Value_ = {}", typeByte, value_);
+    XLOGF(ERR, "{}", msg);
+    throw std::invalid_argument(msg);
+  }
+  infoBytes.advance(1);
+
+  // Validating the wrapped ObjectId is impossible since we don't know what
+  // it should contain. Therefore, we simply return if we're validating a
+  // filtered blob Id.
+  if (typeByte == FilteredObjectId::OBJECT_TYPE_BLOB) {
+    return;
+  }
+
+  // For trees, we can actually perform some validation. We can ensure the
+  // varints describing the filterid and path are valid, and that the lengths
+  // they announce fit in the bytes that follow them.
+  const auto skipLengthPrefixedField = [&](std::string_view fieldName) {
+    // tryDecodeVarint() advances infoBytes past the varint on success.
+    auto expectedSize = folly::tryDecodeVarint(infoBytes);
+    if (UNLIKELY(!expectedSize)) {
+      auto msg = fmt::format(
+          "failed to decode {} VarInt when validating FilteredObjectId {}: {}",
+          fieldName,
+          value_,
+          expectedSize.error());
+      throw std::invalid_argument(msg);
+    }
+    // Check the length ourselves: advance() reports an oversized length
+    // with std::out_of_range, which is not the exception type this function
+    // uses for every other failure.
+    if (*expectedSize > infoBytes.size()) {
+      auto msg = fmt::format(
+          "{} VarInt claims {} bytes but only {} remain when validating FilteredObjectId {}",
+          fieldName,
+          *expectedSize,
+          infoBytes.size(),
+          value_);
+      throw std::invalid_argument(msg);
+    }
+    infoBytes.advance(*expectedSize);
+  };
+
+  skipLengthPrefixedField("filter id");
+  skipLengthPrefixedField("path length");
+}
+
+} // namespace facebook::eden
diff --git a/eden/fs/store/filter/FilteredObjectId.h b/eden/fs/store/filter/FilteredObjectId.h
new file mode 100644
index 0000000000000..dc8069e26d1e8
--- /dev/null
+++ b/eden/fs/store/filter/FilteredObjectId.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This software may be used and distributed according to the terms of the
+ * GNU General Public License version 2.
+ */
+
+#pragma once
+
+#include <folly/Range.h>
+
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "eden/fs/model/ObjectId.h"
+#include "eden/fs/utils/PathFuncs.h"
+
+namespace facebook::eden {
+
+/**
+ * FilteredBackingStores need to keep track of a few extra pieces of state with
+ * each ObjectId in order to properly filter objects across their lifetime.
+ *
+ * The first crucial piece of information they need is whether the given object
+ * is a tree or a blob. This is defined in the first byte of the ObjectId. The
+ * rest of the FilteredObjectId (FOID for short) is different depending on the
+ * object's type (tree or blob).
+ *
+ * ============= Blob FOIDs =============
+ *
+ * By filtering trees directly, we get blob filtering for free! This is because
+ * we process (and filter) the direct children of a tree whenever we process a
+ * tree itself. Any filtered blobs are unreachable after their parent tree is
+ * processed.
+ *
+ * This means Blob FOIDs don't need any extra information associated with them
+ * besides the type byte mentioned above. Our Blob FOIDs are in the form:
+ *
+ * <blob_type_byte><ObjectId>
+ *
+ * The ObjectId mentioned above can be used in whatever BackingStore the
+ * FilteredBackingStore is wrapped around. In most cases, this will be an
+ * HgObjectID.
+ *
+ * ============= Tree FOIDs =============
+ *
+ * For trees, we need to keep track of what filter was active when the ObjectId
+ * was created when the corresponding tree was fetched. This information is
+ * variable length, so we use a VarInt to encode the length of the filter id.
+ *
+ * We also need to keep track of the path associated with the tree object so we
+ * can determine whether the object needs to be filtered prior to fetching any
+ * data associated with it. The path is variable length, so we use a VarInt to
+ * encode the length of the path.
+ *
+ * Finally, like blobs, we include an ObjectId we can use in the BackingStore
+ * the FilteredBackingStore wraps. ObjectIds are variable length, but we place
+ * them at the end of the ObjectID. Therefore we should always know where they
+ * end. This gives us the form:
+ *
+ * <tree_type_byte><filter_id_len><filter_id><path_len><path><ObjectId>
+ */
+class FilteredObjectId {
+ public:
+  /**
+   * It doesn't make sense for a FilteredObjectId to be default constructed. At
+   * a minimum, a wrapped ObjectId must be provided.
+   */
+  FilteredObjectId() = delete;
+
+  /**
+   * Construct a filtered *blob* object id.
+   */
+  explicit FilteredObjectId(const ObjectId& edenObjectId)
+      : value_{serializeBlob(edenObjectId)} {
+    validate();
+  }
+
+  /**
+   * Construct a filtered *tree* object id.
+   */
+  FilteredObjectId(
+      RelativePathPiece path,
+      std::string_view filterId,
+      const ObjectId& edenObjectId)
+      : value_{serializeTree(path, filterId, edenObjectId)} {
+    validate();
+  }
+
+  /**
+   * This function should only be used when the caller knows the underlying
+   * bytes from the passed in ObjectId is in the form of a FilteredObjectId.
+   */
+  static FilteredObjectId fromObjectId(const ObjectId& id) {
+    return FilteredObjectId{id.getBytes()};
+  }
+
+  /**
+   * Construct from already-serialized FilteredObjectId bytes.
+   *
+   * This constructor is deliberately not noexcept: validate() throws when
+   * the bytes are not a well-formed FilteredObjectId, and an exception
+   * leaving a noexcept function ends the process via std::terminate instead
+   * of reaching the caller.
+   */
+  explicit FilteredObjectId(std::string str) : value_{std::move(str)} {
+    validate();
+  }
+
+  /**
+   * Construct from already-serialized FilteredObjectId bytes. The bytes are
+   * copied. Throws if they are not a well-formed FilteredObjectId.
+   */
+  explicit FilteredObjectId(folly::ByteRange bytes)
+      : value_{constructFromByteRange(bytes)} {
+    validate();
+  }
+
+  ~FilteredObjectId() = default;
+
+  FilteredObjectId(const FilteredObjectId& other) = default;
+  FilteredObjectId& operator=(const FilteredObjectId& other) = default;
+
+  // Moving leaves the source holding an empty value_.
+  FilteredObjectId(FilteredObjectId&& other) noexcept
+      : value_{std::exchange(other.value_, std::string{})} {}
+
+  FilteredObjectId& operator=(FilteredObjectId&& other) noexcept {
+    value_ = std::exchange(other.value_, std::string{});
+    return *this;
+  }
+
+  /*
+   * Returns the path portion of the *tree* FilteredObjectId. NOTE: This
+   * function will throw an exception if it is called on a Blob FOID!
+   */
+  RelativePathPiece path() const;
+
+  /*
+   * Returns the filter portion of the *tree* FilteredObjectId. NOTE: This
+   * function will throw an exception if it is called on a Blob FOID!
+   */
+  folly::StringPiece filter() const;
+
+  /*
+   * Returns the object portion of the FilteredObjectId. NOTE: This function
+   * works for BOTH Blob and Tree FOIDs.
+   */
+  ObjectId object() const;
+
+  // We start FilteredObjectId types at 0x10 so that they can be distinguished
+  // from HgProxyHash types that start at 0x01 and extend until 0x02. In the
+  // future, this could help us migrate HgProxyHash-based ObjectIds to
+  // FilteredObjectIds.
+  enum FilteredObjectIdType : uint8_t {
+    // If the Object ID's type is 16, then it represents a blob object and is of
+    // the form <type_byte><ObjectId>
+    OBJECT_TYPE_BLOB = 0x10,
+
+    // If the Object ID's type is 17, then it represents a tree object and is of
+    // the form <type_byte><filter_id_len><filter_id><path_len><path><ObjectId>
+    OBJECT_TYPE_TREE = 0x11,
+  };
+
+  /*
+   * Returns the type of the FilteredObjectId. NOTE: This function works for
+   * BOTH Blob and Tree FOIDs.
+   */
+  FilteredObjectIdType objectType() const;
+
+  // Both comparisons are byte-wise over the serialized value.
+  bool operator==(const FilteredObjectId&) const;
+  bool operator<(const FilteredObjectId&) const;
+
+  /**
+   * Returns the serialized bytes of this FilteredObjectId.
+   */
+  const std::string& getValue() const {
+    return value_;
+  }
+
+ private:
+  /**
+   * Copy the given bytes into a std::string.
+   */
+  static std::string constructFromByteRange(folly::ByteRange bytes) {
+    return std::string{
+        reinterpret_cast<const char*>(bytes.data()), bytes.size()};
+  }
+
+  /**
+   * Serialize the tree path, filter, and object data into a buffer that will
+   * be stored in the LocalStore.
+   */
+  static std::string serializeTree(
+      RelativePathPiece path,
+      std::string_view filterId,
+      const ObjectId&);
+
+  /**
+   * Serialize the blob object data into a buffer that will be stored in the
+   * LocalStore.
+   */
+  static std::string serializeBlob(const ObjectId& object);
+
+  /**
+   * Validate data found in value_.
+   *
+   * The value_ member variable should already contain the serialized data,
+   * (as returned by serializeBlob() or serializeTree()).
+   *
+   * Note there will be an exception being thrown if `value_` is invalid.
+   */
+  void validate();
+
+  /**
+   * The serialized data as written in the LocalStore.
+   */
+  std::string value_;
+};
+
+} // namespace facebook::eden
diff --git a/eden/fs/store/filter/test/FilteredObjectIdTest.cpp b/eden/fs/store/filter/test/FilteredObjectIdTest.cpp
new file mode 100644
index 0000000000000..f25452d169d89
--- /dev/null
+++ b/eden/fs/store/filter/test/FilteredObjectIdTest.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This software may be used and distributed according to the terms of the
+ * GNU General Public License version 2.
+ */
+
+#include <folly/Range.h>
+#include <folly/portability/GTest.h>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+#include "eden/fs/model/ObjectId.h"
+#include "eden/fs/store/filter/FilteredObjectId.h"
+#include "eden/fs/utils/PathFuncs.h"
+
+using namespace facebook::eden;
+
+// A blob id reports the blob type and round-trips the wrapped ObjectId.
+TEST(FilteredObjectIdTest, test_blob) {
+  std::string objectIdString = "deadbeeffacebooc";
+  folly::ByteRange objectIdBytes{folly::StringPiece{objectIdString}};
+  ObjectId object{objectIdBytes};
+
+  FilteredObjectId filterId{object};
+
+  EXPECT_EQ(filterId.objectType(), FilteredObjectId::OBJECT_TYPE_BLOB);
+  EXPECT_EQ(filterId.object(), object);
+}
+
+TEST(FilteredObjectIdTest, test_blob_getters_throw) {
+  std::string objectIdString = "deadbeef facebooc";
+  folly::ByteRange objectIdBytes{folly::StringPiece{objectIdString}};
+  ObjectId object{objectIdBytes};
+
+  FilteredObjectId filterId{object};
+
+  // Blob objects don't have path/filters associated with them. Using the
+  // getters results in an exception.
+  EXPECT_EQ(filterId.objectType(), FilteredObjectId::OBJECT_TYPE_BLOB);
+  EXPECT_THROW(filterId.filter(), std::invalid_argument);
+  EXPECT_THROW(filterId.path(), std::invalid_argument);
+}
+
+// Filter id and path both short enough for a single-byte length varint.
+TEST(FilteredObjectIdTest, test_tree_short_filter_and_path) {
+  std::string objectIdString = "deadbeef facebooc";
+  folly::ByteRange objectIdBytes{folly::StringPiece{objectIdString}};
+  ObjectId object{objectIdBytes};
+  std::string filterSet = "filterset";
+  auto pathPiece =
+      RelativePath{"this is a long enough string to push past SSO"};
+
+  FilteredObjectId filterId{pathPiece, filterSet, object};
+
+  EXPECT_EQ(filterId.objectType(), FilteredObjectId::OBJECT_TYPE_TREE);
+  EXPECT_EQ(filterId.path(), pathPiece);
+  EXPECT_EQ(filterId.filter(), filterSet);
+  EXPECT_EQ(filterId.object(), object);
+}
+
+// Filter id and path both longer than 255 bytes, so their length varints
+// need more than one byte.
+TEST(FilteredObjectIdTest, test_tree_long_filter_and_path) {
+  std::string objectIdString = "deadbeef facebooc";
+  folly::ByteRange objectIdBytes{folly::StringPiece{objectIdString}};
+  ObjectId object{objectIdBytes};
+  std::string filterSet =
+      "This filterset is very long. Some would say it's longer than 255 characters. "
+      "This filterset is very long. Some would say it's longer than 255 characters. "
+      "This filterset is very long. Some would say it's longer than 255 characters. "
+      "This filterset is very long. Some would say it's longer than 255 characters. "
+      "This filterset is very long. Some would say it's longer than 255 characters. ";
+  auto pathPiece = RelativePath{
+      "This is a very long string that is greater than 255 chars"
+      "This is a very long string that is greater than 255 chars"
+      "This is a very long string that is greater than 255 chars"
+      "This is a very long string that is greater than 255 chars"
+      "This is a very long string that is greater than 255 chars"};
+
+  FilteredObjectId filterId{pathPiece, filterSet, object};
+
+  EXPECT_EQ(filterId.objectType(), FilteredObjectId::OBJECT_TYPE_TREE);
+  EXPECT_EQ(filterId.path(), pathPiece);
+  EXPECT_EQ(filterId.filter(), filterSet);
+  EXPECT_EQ(filterId.object(), object);
+}
+
+TEST(FilteredObjectIdTest, test_copy_and_move) {
+  std::string objectIdString = "objectid";
+  folly::ByteRange objectIdBytes{folly::StringPiece{objectIdString}};
+  ObjectId object{objectIdBytes};
+  std::string filterSet = "filterset";
+  auto pathPiece = RelativePath{"a path piece"};
+
+  FilteredObjectId filterId{pathPiece, filterSet, object};
+  FilteredObjectId filterIdCopy{filterId};
+  EXPECT_EQ(filterId.objectType(), FilteredObjectId::OBJECT_TYPE_TREE);
+  EXPECT_EQ(filterIdCopy.objectType(), FilteredObjectId::OBJECT_TYPE_TREE);
+  EXPECT_EQ(filterId, filterIdCopy);
+
+  FilteredObjectId movedFilterId{std::move(filterId)};
+  EXPECT_EQ(movedFilterId.objectType(), FilteredObjectId::OBJECT_TYPE_TREE);
+  // Compare against the copy taken before the move. Comparing the moved-to
+  // object with itself can never fail.
+  EXPECT_EQ(movedFilterId, filterIdCopy);
+  EXPECT_EQ(movedFilterId.object(), object);
+}
+
+TEST(FilteredObjectIdTest, test_invalid_type_byte_throws) {
+  // 0x01 is neither OBJECT_TYPE_BLOB (0x10) nor OBJECT_TYPE_TREE (0x11), so
+  // validation in the constructor must reject these bytes.
+  std::string invalidIdString = std::string{'\x01'} + "deadbeef";
+  folly::ByteRange invalidIdBytes{folly::StringPiece{invalidIdString}};
+
+  EXPECT_THROW(FilteredObjectId{invalidIdBytes}, std::invalid_argument);
+}