Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce Store protocol (v2) #5259

Merged
merged 9 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions haystack/preview/document_stores/memory/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
from tqdm.auto import tqdm

from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.protocols import Store, DuplicatePolicy
from haystack.preview.document_stores.memory._filters import match
from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError
from haystack.utils.scipy_utils import expit

logger = logging.getLogger(__name__)
DuplicatePolicy = Literal["skip", "overwrite", "fail"]

# document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to
# True (default). Scaling uses the expit function (inverse of the logit function) after applying a SCALING_FACTOR. A
Expand All @@ -23,7 +23,7 @@
SCALING_FACTOR = 8


class MemoryDocumentStore:
class MemoryDocumentStore(Store):
ZanSara marked this conversation as resolved.
Show resolved Hide resolved
"""
Stores data in-memory. It's ephemeral and cannot be saved to disk.
"""
Expand Down
114 changes: 114 additions & 0 deletions haystack/preview/document_stores/protocols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from typing import Protocol, Optional, Dict, Any, List, Literal

import logging

from haystack.preview.dataclasses import Document


logger = logging.getLogger(__name__)
DuplicatePolicy = Literal["skip", "overwrite", "fail"]
ZanSara marked this conversation as resolved.
Show resolved Hide resolved


class Store(Protocol):
"""
Stores data, like Documents, to be used by the components of a Pipeline.
"""

def count_documents(self) -> int:
"""
Returns the number of documents present in the document store.
ZanSara marked this conversation as resolved.
Show resolved Hide resolved
"""

def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
"""
Returns the documents that match the filters provided.

Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$ne"`, `"$in"`, `"$nin"`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.

Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.

Example:

```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```

To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.

Example:

```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```

:param filters: the filters to apply to the document list.
:return: a list of Documents that match the given filters.
"""

def write_documents(self, documents: List[Document], duplicates: DuplicatePolicy = "fail") -> None:
"""
Writes (or overwrites) documents into the store.

:param documents: a list of documents.
:param duplicates: documents with the same ID count as duplicates. When duplicates are met,
the store can:
- skip: keep the existing document and ignore the new one.
- overwrite: remove the old document and write the new one.
- fail: an error is raised
:raises DuplicateDocumentError: raised when a duplicate document is found and `duplicates="fail"`
:return: None
"""

def delete_documents(self, document_ids: List[str]) -> None:
"""
Deletes all documents with matching document_ids from the document store.
Fails with `MissingDocumentError` if no document with this id is present in the store.

:param document_ids: the IDs of the documents to delete
"""
8 changes: 5 additions & 3 deletions haystack/preview/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
)
from canals.pipeline.sockets import find_input_sockets

from haystack.preview.document_stores.protocols import Store


class NoSuchStoreError(PipelineError):
pass
Expand All @@ -23,9 +25,9 @@ class Pipeline(CanalsPipeline):

def __init__(self):
super().__init__()
self.stores = {}
self.stores: Dict[str, Store] = {}

def add_store(self, name: str, store: object) -> None:
def add_store(self, name: str, store: Store) -> None:
"""
Make a store available to all nodes of this pipeline.

Expand All @@ -43,7 +45,7 @@ def list_stores(self) -> List[str]:
"""
return list(self.stores.keys())

def get_store(self, name: str) -> object:
def get_store(self, name: str) -> Store:
"""
Returns the store associated with the given name.

Expand Down