Skip to content

Commit

Permalink
fix: fix group by on chunked segment (#37292)
Browse files Browse the repository at this point in the history
#37244

---------

Signed-off-by: sunby <[email protected]>
  • Loading branch information
sunby authored Nov 5, 2024
1 parent 9a0e1c8 commit bd04cac
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 16 deletions.
15 changes: 8 additions & 7 deletions internal/core/src/common/QueryResult.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ struct OffsetDisPairComparator {
};
struct VectorIterator {
public:
VectorIterator(int chunk_count, int64_t chunk_rows = -1)
: chunk_rows_(chunk_rows) {
VectorIterator(int chunk_count,
const std::vector<int64_t>& total_rows_until_chunk = {})
: total_rows_until_chunk_(total_rows_until_chunk) {
iterators_.reserve(chunk_count);
}

Expand Down Expand Up @@ -119,7 +120,7 @@ struct VectorIterator {
private:
int64_t
convert_to_segment_offset(int64_t chunk_offset, int chunk_idx) {
if (chunk_rows_ == -1) {
if (total_rows_until_chunk_.size() == 0) {
AssertInfo(
iterators_.size() == 1,
"Wrong state for vectorIterators, which having incorrect "
Expand All @@ -129,7 +130,7 @@ struct VectorIterator {
iterators_.size());
return chunk_offset;
}
return chunk_idx * chunk_rows_ + chunk_offset;
return total_rows_until_chunk_[chunk_idx] + chunk_offset;
}

private:
Expand All @@ -139,7 +140,7 @@ struct VectorIterator {
OffsetDisPairComparator>
heap_;
bool sealed = false;
int64_t chunk_rows_ = -1;
std::vector<int64_t> total_rows_until_chunk_;
//currently, VectorIterator is guaranteed to be used serially without concurrent problem, in the future
//we may need to add mutex to protect the variable sealed
};
Expand All @@ -163,7 +164,7 @@ struct SearchResult {
AssembleChunkVectorIterators(
int64_t nq,
int chunk_count,
int64_t rows_per_chunk,
const std::vector<int64_t>& total_rows_until_chunk,
const std::vector<knowhere::IndexNode::IteratorPtr>& kw_iterators) {
AssertInfo(kw_iterators.size() == nq * chunk_count,
"kw_iterators count:{} is not equal to nq*chunk_count:{}, "
Expand All @@ -176,7 +177,7 @@ struct SearchResult {
vec_iter_idx = vec_iter_idx % nq;
if (vector_iterators.size() < nq) {
auto vector_iterator = std::make_shared<VectorIterator>(
chunk_count, rows_per_chunk);
chunk_count, total_rows_until_chunk);
vector_iterators.emplace_back(vector_iterator);
}
auto kw_iterator = kw_iterators[i];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ PrepareVectorIteratorsFromIndex(const SearchInfo& search_info,
index.VectorIterators(dataset, search_conf, bitset);
if (iterators_val.has_value()) {
search_result.AssembleChunkVectorIterators(
nq, 1, -1, iterators_val.value());
nq, 1, {0}, iterators_val.value());
} else {
LOG_ERROR(
"Returned knowhere iterator has non-ready iterators "
Expand Down
5 changes: 5 additions & 0 deletions internal/core/src/mmap/ChunkedColumn.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,11 @@ class ChunkedColumnBase : public ColumnBase {
return num_rows_until_chunk_[chunk_id];
}

const std::vector<int64_t>&
GetNumRowsUntilChunk() const {
return num_rows_until_chunk_;
}

protected:
bool nullable_{false};
size_t num_rows_{0};
Expand Down
9 changes: 5 additions & 4 deletions internal/core/src/query/SearchOnGrowing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,12 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
}
}
if (info.group_by_field_id_.has_value()) {
std::vector<int64_t> chunk_rows(max_chunk, 0);
for (int i = 1; i < max_chunk; ++i) {
chunk_rows[i] = i * vec_size_per_chunk;
}
search_result.AssembleChunkVectorIterators(
num_queries,
max_chunk,
vec_size_per_chunk,
final_qr.chunk_iterators());
num_queries, max_chunk, chunk_rows, final_qr.chunk_iterators());
} else {
search_result.distances_ = std::move(final_qr.mutable_distances());
search_result.seg_offsets_ =
Expand Down
8 changes: 5 additions & 3 deletions internal/core/src/query/SearchOnSealed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,10 @@ SearchOnSealed(const Schema& schema,
offset += chunk_size;
}
if (search_info.group_by_field_id_.has_value()) {
result.AssembleChunkVectorIterators(
num_queries, 1, -1, final_qr.chunk_iterators());
result.AssembleChunkVectorIterators(num_queries,
num_chunk,
column->GetNumRowsUntilChunk(),
final_qr.chunk_iterators());
} else {
result.distances_ = std::move(final_qr.mutable_distances());
result.seg_offsets_ = std::move(final_qr.mutable_seg_offsets());
Expand Down Expand Up @@ -201,7 +203,7 @@ SearchOnSealed(const Schema& schema,
auto sub_qr = BruteForceSearchIterators(
dataset, vec_data, row_count, search_info, bitset, data_type);
result.AssembleChunkVectorIterators(
num_queries, 1, -1, sub_qr.chunk_iterators());
num_queries, 1, {0}, sub_qr.chunk_iterators());
} else {
auto sub_qr = BruteForceSearch(
dataset, vec_data, row_count, search_info, bitset, data_type);
Expand Down
29 changes: 28 additions & 1 deletion internal/core/unittest/test_chunked_segment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ TEST(test_chunk_segment, TestSearchOnSealed) {
search_info,
query_data,
1,
chunk_size * chunk_num,
total_row_count,
bv,
search_result);

Expand All @@ -107,4 +107,31 @@ TEST(test_chunk_segment, TestSearchOnSealed) {
for (int i = 0; i < total_row_count; i++) {
ASSERT_TRUE(offsets.find(i) != offsets.end());
}

// test with group by
search_info.group_by_field_id_ = fakevec_id;
std::fill(bitset_data, bitset_data + bitset_size, 0);
query::SearchOnSealed(*schema,
column,
search_info,
query_data,
1,
total_row_count,
bv,
search_result);

ASSERT_EQ(1, search_result.vector_iterators_->size());

auto iter = search_result.vector_iterators_->at(0);
// collect all offsets
offsets.clear();
while (iter->HasNext()) {
auto [offset, distance] = iter->Next().value();
offsets.insert(offset);
}

ASSERT_EQ(total_row_count, offsets.size());
for (int i = 0; i < total_row_count; i++) {
ASSERT_TRUE(offsets.find(i) != offsets.end());
}
}

0 comments on commit bd04cac

Please sign in to comment.