From b99050550d41fddeeee589fa86c0d17dfb8c57fb Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Thu, 28 Mar 2024 15:18:50 -0700 Subject: [PATCH] Support of shift_ids in merge_from_multiple function of OnDiskInvertedLists Summary: **Context** 1. [Issue 2621](https://github.com/facebookresearch/faiss/issues/2621) discusses the inconsistency between OnDiskInvertedLists and InvertedLists. OnDiskInvertedLists is meant to handle multiple disk-based index shards, so its function for merging inverted lists from index shards should have a distinct name. 2. [Issue 2876](https://github.com/facebookresearch/faiss/issues/2876) provides a use case for shifting ids when merging inverted lists from different shards. **In this diff**, 1. To address #1 above, I renamed the OnDiskInvertedLists merge_from function to merge_from_multiple while keeping the base-class merge_from (it is only made virtual). Why? So that inverted lists from one index can still be merged into the OnDiskInvertedLists of another index. 2. To address #2 above, I have added a shift_ids option to merge_from_multiple that shifts the ids coming from different shards. This can be used when each shard has the same set of ids but different data. It is not recommended if ids are already unique across shards. Differential Revision: D55482518 --- faiss/invlists/InvertedLists.h | 2 +- faiss/invlists/OnDiskInvertedLists.cpp | 24 ++++++++++++++++++---- faiss/invlists/OnDiskInvertedLists.h | 3 ++- tests/test_merge.cpp | 28 ++++++++++++++++++++++++-- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 90a9d65411..4f6ad4ade3 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -129,7 +129,7 @@ struct InvertedLists { * high level functions */ /// move all entries from oivf (empty on output) - void merge_from(InvertedLists* oivf, size_t add_id); + virtual void merge_from(InvertedLists* oivf, size_t add_id); // how to copy a subset of elements from the inverted lists // This depends on two integers, a1 and a2. 
diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp index 3017d164c6..802a3b5694 100644 --- a/faiss/invlists/OnDiskInvertedLists.cpp +++ b/faiss/invlists/OnDiskInvertedLists.cpp @@ -565,22 +565,28 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { /***************************************** * Compact form *****************************************/ - -size_t OnDiskInvertedLists::merge_from( +size_t OnDiskInvertedLists::merge_from_multiple( const InvertedLists** ils, int n_il, + bool shift_ids, bool verbose) { FAISS_THROW_IF_NOT_MSG( totsize == 0, "works only on an empty InvertedLists"); std::vector<size_t> sizes(nlist); + std::vector<size_t> shift_id_offsets(n_il); for (int i = 0; i < n_il; i++) { const InvertedLists* il = ils[i]; FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size); + size_t il_totsize = 0; for (size_t j = 0; j < nlist; j++) { sizes[j] += il->list_size(j); + il_totsize += il->list_size(j); } + if (shift_ids && i + 1 < n_il) { + shift_id_offsets[i + 1] = 
shift_id_offsets[i] + il_totsize; + } } size_t cums = 0; @@ -605,11 +611,21 @@ size_t OnDiskInvertedLists::merge_from( const InvertedLists* il = ils[i]; size_t n_entry = il->list_size(j); l.size += n_entry; + ScopedIds scope_ids(il, j); + const idx_t* scope_ids_data = scope_ids.get(); + std::vector<idx_t> new_ids; + if (shift_ids) { + new_ids.resize(n_entry); + for (size_t k = 0; k < n_entry; k++) { + new_ids[k] = scope_ids[k] + shift_id_offsets[i]; + } + scope_ids_data = new_ids.data(); + } update_entries( j, l.size - n_entry, n_entry, - ScopedIds(il, j).get(), + scope_ids_data, ScopedCodes(il, j).get()); } assert(l.size == l.capacity); @@ -638,7 +654,7 @@ size_t OnDiskInvertedLists::merge_from( size_t OnDiskInvertedLists::merge_from_1( const InvertedLists* ils, bool verbose) { - return merge_from(&ils, 1, verbose); + return merge_from_multiple(&ils, 1, false, verbose); } void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) { diff --git a/faiss/invlists/OnDiskInvertedLists.h b/faiss/invlists/OnDiskInvertedLists.h index 98cb653a7a..01c7f3481e 100644 --- a/faiss/invlists/OnDiskInvertedLists.h +++ b/faiss/invlists/OnDiskInvertedLists.h @@ -101,9 +101,10 @@ struct OnDiskInvertedLists : InvertedLists { // copy all inverted lists into *this, in compact form (without // allocating slots) - size_t merge_from( + size_t merge_from_multiple( const InvertedLists** ils, int n_il, + bool shift_ids = false, bool verbose = false); /// same as merge_from for a single invlist diff --git a/tests/test_merge.cpp b/tests/test_merge.cpp index 5a1d08cfba..edbe2a03a6 100644 --- a/tests/test_merge.cpp +++ b/tests/test_merge.cpp @@ -32,6 +32,7 @@ size_t nq = 100; int nindex = 4; int k = 10; int nlist = 40; +int shard_size = nb / nindex; struct CommonData { std::vector<float> database; @@ -100,7 +101,7 @@ int compare_merged( auto il = new faiss::OnDiskInvertedLists( index0->nlist, index0->code_size, filename.c_str()); - il->merge_from(lists.data(), lists.size()); + 
il->merge_from_multiple(lists.data(), lists.size(), shift_ids); index0->replace_invlists(il, true); index0->ntotal = ntotal; @@ -110,11 +111,14 @@ int compare_merged( nq, cd.queries.data(), k, newD.data(), newI.data()); size_t ndiff = 0; + bool adjust_ids = shift_ids && !standard_merge; for (size_t i = 0; i < k * nq; i++) { - if (refI[i] != newI[i]) { + idx_t new_id = adjust_ids ? newI[i] % shard_size : newI[i]; + if (refI[i] != new_id) { ndiff++; } } + return ndiff; } @@ -220,3 +224,23 @@ TEST(MERGE, merge_flat_ondisk_2) { int ndiff = compare_merged(&index_shards, false, false); EXPECT_GE(0, ndiff); } + +// now use ondisk specific merge and use shift ids +TEST(MERGE, merge_flat_ondisk_3) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_indices = true; + + std::vector<idx_t> ids; + for (int i = 0; i < nb; ++i) { + int id = i % shard_size; + ids.push_back(id); + } + for (int i = 0; i < nindex; i++) { + index_shards.add_shard( + new faiss::IndexIVFFlat(&cd.quantizer, d, nlist)); + } + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), ids.data()); + int ndiff = compare_merged(&index_shards, true, false); + EXPECT_GE(0, ndiff); +}