From 821a401ae9ecee7f6b18c5dea70021a88acbc88b Mon Sep 17 00:00:00 2001
From: Gergely Szilvasy
Date: Wed, 19 Jul 2023 10:05:46 -0700
Subject: [PATCH] CodeSet for deduping large datasets (#2949)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2949

A more scalable alternative to `np.unique` for deduping large datasets with a quantized code.

Reviewed By: mlomeli1

Differential Revision: D47443953

fbshipit-source-id: 4a1554d4d4200b5fa657e9d8b7395bba9856a8e3
---
NOTE(review): this patch was restored from a copy in which line breaks,
indentation and every <...> token had been stripped. Blank lines were placed
according to the hunk line counts and indentation was inferred from syntax.
The #include targets, the CombinerRangeKNN instantiation arguments and the
exact whitespace of context lines are reconstructed, not recovered -- TODO
confirm against the upstream commit before applying. The author e-mail
address on the From: line could not be recovered.

 faiss/python/__init__.py       |  1 +
 faiss/python/class_wrappers.py | 18 ++++++++++++++++++
 faiss/utils/utils.cpp          |  9 +++++++++
 faiss/utils/utils.h            | 10 ++++++++++
 tests/test_contrib.py          | 14 ++++++++++++++
 5 files changed, 52 insertions(+)

diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py
index 31bfb79233..d650033096 100644
--- a/faiss/python/__init__.py
+++ b/faiss/python/__init__.py
@@ -41,6 +41,7 @@
 class_wrappers.handle_IDSelectorSubset(IDSelectorBatch, class_owns=True)
 class_wrappers.handle_IDSelectorSubset(IDSelectorArray, class_owns=False)
 class_wrappers.handle_IDSelectorSubset(IDSelectorBitmap, class_owns=False, force_int64=False)
+class_wrappers.handle_CodeSet(CodeSet)
 
 this_module = sys.modules[__name__]
 
diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py
index efce359220..d9031904cf 100644
--- a/faiss/python/class_wrappers.py
+++ b/faiss/python/class_wrappers.py
@@ -1102,3 +1102,21 @@ def replacement_init(self, *args):
         self.original_init(*args)
 
     the_class.__init__ = replacement_init
+
+
+def handle_CodeSet(the_class):
+
+    def replacement_insert(self, codes, inserted=None):
+        n, d = codes.shape
+        assert d == self.d
+        codes = np.ascontiguousarray(codes, dtype=np.uint8)
+
+        if inserted is None:
+            inserted = np.empty(n, dtype=bool)
+        else:
+            assert inserted.shape == (n, )
+
+        self.insert_c(n, swig_ptr(codes), swig_ptr(inserted))
+        return inserted
+
+    replace_method(the_class, 'insert', replacement_insert)
diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp
index 2eee790c20..0a4f9a7719 100644
--- a/faiss/utils/utils.cpp
+++ b/faiss/utils/utils.cpp
@@ -28,6 +28,7 @@
 #include <omp.h>
 
 #include <algorithm>
+#include <set>
 #include <type_traits>
 #include <vector>
 
@@ -623,4 +624,12 @@ void CombinerRangeKNN<T>::write_result(T* D_res, int64_t* I_res) {
 template struct CombinerRangeKNN<float>;
 template struct CombinerRangeKNN<int16_t>;
 
+void CodeSet::insert(size_t n, const uint8_t* codes, bool* inserted) {
+    for (size_t i = 0; i < n; i++) {
+        auto res = s.insert(
+                std::vector<uint8_t>(codes + i * d, codes + i * d + d));
+        inserted[i] = res.second;
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/utils/utils.h b/faiss/utils/utils.h
index 8578be9447..372e8c80d0 100644
--- a/faiss/utils/utils.h
+++ b/faiss/utils/utils.h
@@ -17,7 +17,9 @@
 #define FAISS_utils_h
 
 #include <stdint.h>
+#include <set>
 #include <string>
+#include <vector>
 
 #include <faiss/impl/platform_macros.h>
 #include <faiss/utils/Heap.h>
@@ -209,6 +211,14 @@ struct CombinerRangeKNN {
     void write_result(T* D_res, int64_t* I_res);
 };
 
+struct CodeSet {
+    size_t d;
+    std::set<std::vector<uint8_t>> s;
+
+    explicit CodeSet(size_t d) : d(d) {}
+    void insert(size_t n, const uint8_t* codes, bool* inserted);
+};
+
 } // namespace faiss
 
 #endif /* FAISS_utils_h */
diff --git a/tests/test_contrib.py b/tests/test_contrib.py
index 898e7315fd..1982241142 100644
--- a/tests/test_contrib.py
+++ b/tests/test_contrib.py
@@ -630,3 +630,17 @@ def test_hnsw_permute(self):
         np.testing.assert_equal(Dnew, Dref)
         Inew_remap = perm[Inew]
         np.testing.assert_equal(Inew_remap, Iref)
+
+
+class TestCodeSet(unittest.TestCase):
+
+    def test_code_set(self):
+        """ CodeSet and np.unique should produce the same output """
+        d = 8
+        n = 1000  # > 256 and using only 0 or 1 so there must be duplicates
+        codes = np.random.randint(0, 2, (n, d), dtype=np.uint8)
+        s = faiss.CodeSet(d)
+        inserted = s.insert(codes)
+        np.testing.assert_equal(
+            np.sort(np.unique(codes, axis=0), axis=None),
+            np.sort(codes[inserted], axis=None))