[BUG] fix multi collection log purge #2617
Review comment: "alternative: a new multi-collection state machine that creates records"
A new test file is added, exercising a purge across two collections:

```diff
@@ -0,0 +1,26 @@
+from chromadb.api.client import Client
+from chromadb.config import System
+from chromadb.test.property import invariants
+
+
+def test_log_purge(sqlite_persistent: System) -> None:
+    client = Client.from_system(sqlite_persistent)
+
+    first_collection = client.create_collection(
+        "first_collection", metadata={"hnsw:sync_threshold": 10, "hnsw:batch_size": 10}
+    )
+    second_collection = client.create_collection(
+        "second_collection", metadata={"hnsw:sync_threshold": 10, "hnsw:batch_size": 10}
+    )
+    collections = [first_collection, second_collection]
+
+    # (Does not trigger a purge)
+    for i in range(5):
+        first_collection.add(ids=str(i), embeddings=[i, i])
+
+    # (Should trigger a purge)
+    for i in range(100):
+        second_collection.add(ids=str(i), embeddings=[i, i])
+
+    # The purge of the second collection should not be blocked by the first
+    invariants.log_size_below_max(client._system, collections, True)
```
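For context on why one collection could block another's purge: if the shared embedding queue is only ever trimmed up to the minimum acknowledged sequence ID across all collections, a collection that never reaches its `sync_threshold` pins the cutoff at zero and nothing is ever purged. The sketch below is illustrative only; the names (`max_acked_seq_id`, `purge_to_global_min`) are hypothetical and this is not Chroma's actual implementation, but it reproduces the failure mode the test guards against.

```python
from typing import Dict, List, Tuple

# Shared write-ahead log: (seq_id, collection) entries.
log: List[Tuple[int, str]] = []
# Highest sequence ID each collection has flushed ("ack'ed") to its segment.
max_acked_seq_id: Dict[str, int] = {"first": 0, "second": 0}
next_seq_id = 1


def purge_to_global_min() -> None:
    # Buggy strategy: purge only up to the minimum ack across ALL collections,
    # so a collection that never flushes blocks everyone else's purge.
    cutoff = min(max_acked_seq_id.values())
    log[:] = [(s, c) for (s, c) in log if s > cutoff]


# 5 writes to "first": below its sync_threshold, so it never acks anything.
for _ in range(5):
    log.append((next_seq_id, "first"))
    next_seq_id += 1

# 100 writes to "second": it flushes and acks everything it wrote.
for _ in range(100):
    log.append((next_seq_id, "second"))
    max_acked_seq_id["second"] = next_seq_id
    next_seq_id += 1

purge_to_global_min()
print(len(log))  # 105: nothing was purged, because "first" is stuck at ack 0
```

Purging per collection (or tracking each collection's cutoff separately) avoids the stall, which is what the updated invariant below checks via a summed bound.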
The `log_size_below_max` invariant in `chromadb.test.property.invariants` is updated to take a list of collections and to bound the shared queue by the sum of each collection's thresholds:

```diff
@@ -1,6 +1,5 @@
 import gc
 import math
-from chromadb.api.configuration import HNSWConfigurationInternal
 from chromadb.config import System
 from chromadb.db.base import get_sql
 from chromadb.db.impl.sqlite import SqliteDB
@@ -334,29 +333,33 @@ def _total_embedding_queue_log_size(sqlite: SqliteDB) -> int:
 
 
 def log_size_below_max(
-    system: System, collection: Collection, has_collection_mutated: bool
+    system: System, collections: List[Collection], has_collection_mutated: bool
 ) -> None:
     sqlite = system.instance(SqliteDB)
 
     if has_collection_mutated:
         # Must always keep one entry to avoid reusing seq_ids
         assert _total_embedding_queue_log_size(sqlite) >= 1
 
-        hnsw_config = cast(
-            HNSWConfigurationInternal,
-            collection.get_model()
-            .get_configuration()
-            .get_parameter("hnsw_configuration")
-            .value,
-        )
-        sync_threshold = cast(int, hnsw_config.get_parameter("sync_threshold").value)
-        batch_size = cast(int, hnsw_config.get_parameter("batch_size").value)
+        # We purge per-collection as the sync_threshold is a per-collection setting
+        sync_threshold_sum = sum(
+            collection.metadata.get("hnsw:sync_threshold", 1000)
+            if collection.metadata is not None
+            else 1000
+            for collection in collections
+        )
+        batch_size_sum = sum(
+            collection.metadata.get("hnsw:batch_size", 100)
+            if collection.metadata is not None
+            else 100
+            for collection in collections
+        )
 
         # -1 is used because the queue is always at least 1 entry long, so deletion stops before the max ack'ed sequence ID.
         # And if the batch_size != sync_threshold, the queue can have up to batch_size - 1 more entries.
         assert (
             _total_embedding_queue_log_size(sqlite) - 1
-            <= sync_threshold + batch_size - 1
+            <= sync_threshold_sum + batch_size_sum - 1
         )
     else:
         assert _total_embedding_queue_log_size(sqlite) == 0
```
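To make the new bound concrete, here is the arithmetic for the test above (my own sanity check, not part of the diff): both collections set `hnsw:sync_threshold` and `hnsw:batch_size` to 10, so each sum is 20 and the shared queue may retain at most 40 entries after the test's 105 adds.

```python
# Sanity check of the invariant bound for the two test collections.
collections_metadata = [
    {"hnsw:sync_threshold": 10, "hnsw:batch_size": 10},
    {"hnsw:sync_threshold": 10, "hnsw:batch_size": 10},
]

sync_threshold_sum = sum(m.get("hnsw:sync_threshold", 1000) for m in collections_metadata)
batch_size_sum = sum(m.get("hnsw:batch_size", 100) for m in collections_metadata)

# Invariant: log_size - 1 <= sync_threshold_sum + batch_size_sum - 1,
# i.e. log_size <= 40 here, even though the test performs 105 adds in total.
print(sync_threshold_sum + batch_size_sum)  # 40
```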
Review comment:

Fixes a bug where, if `trigger_vector_segments_max_seq_id_migration()` below attempted to load a segment, it would throw because the SQLite component wasn't technically started. This scenario happens when upgrading from an old version, or when any collection had not yet hit its first `sync_threshold` persist trigger.

This was not caught because the CLI test didn't add any collections; I updated the test to trigger this path.
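The bug described here is a lifecycle ordering problem: a code path used a component before its start hook had run. A minimal, generic sketch of that failure mode follows; all class and method names are hypothetical and this is not Chroma's actual `System`/`Component` API.

```python
# Minimal sketch of the "component used before started" failure mode.
# Hypothetical names; not Chroma's actual System/Component implementation.
class Component:
    def __init__(self) -> None:
        self._started = False

    def start(self) -> None:
        self._started = True


class SqliteComponent(Component):
    def load_segment(self, segment_id: str) -> str:
        if not self._started:
            # The class of error described above: a migration path touched
            # the component before the system had started it.
            raise RuntimeError("component not started")
        return f"loaded segment {segment_id}"


db = SqliteComponent()
try:
    db.load_segment("vector-0")  # throws: start() was never called
except RuntimeError as exc:
    print(exc)

db.start()
print(db.load_segment("vector-0"))  # succeeds once the component is started
```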