
Commit 17f5b14

Merge branch 'main' into LLM-system-prompt
davidsbatista committed Oct 21, 2024
2 parents de76809 + 322f63d commit 17f5b14
Showing 38 changed files with 430 additions and 53 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/docstrings_linting.yml
@@ -10,6 +10,9 @@ on:
paths:
- "**.py"

env:
HATCH_VERSION: "1.13.0"

jobs:
docstrings-linting:
runs-on: ubuntu-latest
@@ -19,7 +22,7 @@ jobs:
uses: actions/checkout@v4

- name: Install Hatch
run: pip install hatch=="1.9.3"
run: pip install hatch==${{ env.HATCH_VERSION }}

- name: ruff docstrings linting
run: hatch run ruff check haystack
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yml
@@ -18,7 +18,7 @@ on:
env:
PYTHON_VERSION: "3.8"
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"

jobs:
run:
2 changes: 1 addition & 1 deletion .github/workflows/pypi_release.yml
@@ -8,7 +8,7 @@ on:
- "!v[0-9]+.[0-9]+.[0-9]-rc0"

env:
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"

jobs:
release-on-pypi:
4 changes: 2 additions & 2 deletions .github/workflows/readme_sync.yml
@@ -13,7 +13,7 @@ on:
- "!v1.[0-9]+.x"

env:
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"
PYTHON_VERSION: "3.10"

jobs:
@@ -38,7 +38,7 @@ jobs:
# in config files with their id.
README_API_KEY: ${{ secrets.README_API_KEY }}
# The command is a bit misleading, we're not actually syncing anything here,
# we're just generating the markdown files from the the yaml configs.
# we're just generating the markdown files from the yaml configs.
run: hatch run readme:sync

- name: Get version
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -30,7 +30,7 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
PYTHON_VERSION: "3.8"
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"

jobs:
format:
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -236,7 +236,7 @@ tested by the CI, but once again, running the checks locally will speed up the r

To check your code type checking, run:
```sh
hatch run test:type
hatch run test:types
```


@@ -2,9 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional, cast

import numpy as np
from typing import Any, Dict, List, Optional

from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret
@@ -29,6 +27,7 @@ def get_embedding_backend(
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
):
embedding_backend_id = f"{model}{device}{auth_token}{truncate_dim}"

@@ -42,6 +41,7 @@ def get_embedding_backend(
truncate_dim=truncate_dim,
model_kwargs=model_kwargs,
tokenizer_kwargs=tokenizer_kwargs,
config_kwargs=config_kwargs,
)
_SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend
@@ -61,6 +61,7 @@ def __init__(
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
):
sentence_transformers_import.check()
self.model = SentenceTransformer(
@@ -71,8 +72,9 @@ def __init__(
truncate_dim=truncate_dim,
model_kwargs=model_kwargs,
tokenizer_kwargs=tokenizer_kwargs,
config_kwargs=config_kwargs,
)

def embed(self, data: List[str], **kwargs) -> List[List[float]]:
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
embeddings = self.model.encode(data, **kwargs).tolist()
return embeddings
@@ -54,6 +54,7 @@ def __init__( # noqa: PLR0913
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
):
"""
@@ -96,10 +97,12 @@ def __init__( # noqa: PLR0913
:param tokenizer_kwargs:
Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
Refer to specific model documentation for available kwargs.
:param config_kwargs:
Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
:param precision:
The precision to use for the embeddings.
All non-float32 precisions are quantized embeddings.
Quantized embeddings are smaller in size and faster to compute, but may have a lower accuracy.
Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
"""

@@ -117,6 +120,7 @@ def __init__( # noqa: PLR0913
self.truncate_dim = truncate_dim
self.model_kwargs = model_kwargs
self.tokenizer_kwargs = tokenizer_kwargs
self.config_kwargs = config_kwargs
self.embedding_backend = None
self.precision = precision

@@ -149,6 +153,7 @@ def to_dict(self) -> Dict[str, Any]:
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
precision=self.precision,
)
if serialization_dict["init_parameters"].get("model_kwargs") is not None:
@@ -186,6 +191,7 @@ def warm_up(self):
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
)
if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]
@@ -34,7 +34,7 @@ class SentenceTransformersTextEmbedder:
```
"""

def __init__(
def __init__( # noqa: PLR0913
self,
model: str = "sentence-transformers/all-mpnet-base-v2",
device: Optional[ComponentDevice] = None,
@@ -48,6 +48,7 @@ def __init__(
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
):
"""
@@ -86,6 +87,8 @@ def __init__(
:param tokenizer_kwargs:
Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
Refer to specific model documentation for available kwargs.
:param config_kwargs:
Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
:param precision:
The precision to use for the embeddings.
All non-float32 precisions are quantized embeddings.
@@ -105,6 +108,7 @@ def __init__(
self.truncate_dim = truncate_dim
self.model_kwargs = model_kwargs
self.tokenizer_kwargs = tokenizer_kwargs
self.config_kwargs = config_kwargs
self.embedding_backend = None
self.precision = precision

@@ -135,6 +139,7 @@ def to_dict(self) -> Dict[str, Any]:
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
precision=self.precision,
)
if serialization_dict["init_parameters"].get("model_kwargs") is not None:
@@ -172,6 +177,7 @@ def warm_up(self):
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
)
if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]
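Both embedders now forward `config_kwargs` to `AutoConfig.from_pretrained`. A minimal sketch of the new parameter in use — `output_hidden_states` is only an illustrative config option, not one this commit prescribes; check your model's documentation for the kwargs it actually supports:

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    # Forwarded to AutoConfig.from_pretrained when the model loads;
    # "output_hidden_states" is an illustrative assumption, not required.
    config_kwargs={"output_hidden_states": True},
)
embedder.warm_up()  # loads the SentenceTransformer with the extra config
result = embedder.run(text="Hello, world!")
print(len(result["embedding"]))  # dimensionality of the embedding vector
```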
10 changes: 9 additions & 1 deletion haystack/components/routers/file_type_router.py
@@ -54,16 +54,24 @@ class FileTypeRouter:
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
"""

def __init__(self, mime_types: List[str]):
def __init__(self, mime_types: List[str], additional_mimetypes: Optional[Dict[str, str]] = None):
"""
Initialize the FileTypeRouter component.
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
(for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).
:param additional_mimetypes: A dictionary of MIME types and their file extensions to add to the mimetypes
package, so that unsupported or non-native file types are not left unclassified.
(for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`).
"""
if not mime_types:
raise ValueError("The list of mime types cannot be empty.")

if additional_mimetypes:
for mime, ext in additional_mimetypes.items():
mimetypes.add_type(mime, ext)

self.mime_type_patterns = []
for mime_type in mime_types:
if not self._is_valid_mime_type_format(mime_type):
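A sketch of the new `additional_mimetypes` parameter in use, reusing the `.docx` mapping from the docstring above; the file names are hypothetical:

```python
from haystack.components.routers import FileTypeRouter

DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# Register .docx with the mimetypes package so such files are routed
# by MIME type instead of ending up unclassified.
router = FileTypeRouter(
    mime_types=["text/plain", DOCX_MIME],
    additional_mimetypes={DOCX_MIME: ".docx"},
)

result = router.run(sources=["notes.txt", "report.docx"])  # hypothetical paths
```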
5 changes: 4 additions & 1 deletion haystack/logging.py
@@ -190,7 +190,10 @@ def patch_make_records_to_use_kwarg_string_interpolation(original_make_records:
@functools.wraps(original_make_records)
def _wrapper(name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None) -> Any:
safe_extra = extra or {}
interpolated_msg = msg.format(**safe_extra)
try:
interpolated_msg = msg.format(**safe_extra)
except (KeyError, ValueError):
interpolated_msg = msg
return original_make_records(name, level, fn, lno, interpolated_msg, (), exc_info, func, extra, sinfo)

return _wrapper
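The new try/except guards against log messages that contain literal braces, which `str.format` would otherwise treat as replacement fields. A quick stdlib-only illustration of the two failure modes the patch now swallows:

```python
# A JSON snippet in a message looks like a format field to str.format:
try:
    'request body was {"key": "value"}'.format()
except KeyError as err:
    print("KeyError:", err)  # the brace content is parsed as a field name

# An unbalanced brace raises ValueError instead:
try:
    "progress: 50% {".format()
except ValueError as err:
    print("ValueError:", err)  # Single '{' encountered in format string
```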
3 changes: 0 additions & 3 deletions haystack/testing/test_utils.py
@@ -5,8 +5,6 @@
import os
import random

import numpy as np

from haystack import logging

logger = logging.getLogger(__name__)
@@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
"""
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

try:
82 changes: 82 additions & 0 deletions haystack/tracing/logging_tracer.py
@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import contextlib
import dataclasses
from typing import Any, Dict, Iterator, Optional

from haystack import logging
from haystack.tracing import Span, Tracer

logger = logging.getLogger(__name__)

RESET_COLOR = "\033[0m"


@dataclasses.dataclass
class LoggingSpan(Span):
operation_name: str
tags: Dict[str, Any] = dataclasses.field(default_factory=dict)

def set_tag(self, key: str, value: Any) -> None:
"""
Set a single tag on the span.
:param key: the name of the tag.
:param value: the value of the tag.
"""
self.tags[key] = value


class LoggingTracer(Tracer):
"""
A simple tracer that logs the operation name and tags of a span.
"""

def __init__(self, tags_color_strings: Optional[Dict[str, str]] = None) -> None:
"""
Initialize the LoggingTracer.
:param tags_color_strings:
A dictionary that maps tag names to color strings that should be used when logging the tags.
The color strings should be in the format of
[ANSI escape codes](https://en.wikipedia.org/wiki/ANSI_escape_code#Colors).
For example, to color the tag "haystack.component.input" in red, you would pass
`tags_color_strings={"haystack.component.input": "\x1b[1;31m"}`.
"""

self.tags_color_strings = tags_color_strings or {}

@contextlib.contextmanager
def trace(self, operation_name: str, tags: Optional[Dict[str, Any]] = None) -> Iterator[Span]:
"""
Trace the execution of a block of code.
:param operation_name: the name of the operation being traced.
:param tags: tags to apply to the newly created span.
:returns: the newly created span.
"""

custom_span = LoggingSpan(operation_name, tags=tags or {})

try:
yield custom_span
except Exception as e:
raise e
# we make sure to log the operation name and tags of the span when the context manager exits
# both in case of success and error
finally:
operation_name = custom_span.operation_name
tags = custom_span.tags or {}
logger.debug("Operation: {operation_name}", operation_name=operation_name)
for tag_name, tag_value in tags.items():
color_string = self.tags_color_strings.get(tag_name, "")
logger.debug(
color_string + "{tag_name}={tag_value}" + RESET_COLOR, tag_name=tag_name, tag_value=tag_value
)

def current_span(self) -> Optional[Span]:
"""Return the current active span, if any."""
# we don't store spans in this simple tracer
return None
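A minimal sketch of the new tracer in use, assuming DEBUG-level logging is configured; the ANSI color string is the red example from the docstring, and the tag names are hypothetical:

```python
import logging

from haystack.tracing.logging_tracer import LoggingTracer

logging.basicConfig(level=logging.DEBUG)  # LoggingTracer logs at DEBUG level

tracer = LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m"})

with tracer.trace("pipeline.run", tags={"haystack.component.input": "What is Haystack?"}) as span:
    span.set_tag("haystack.component.name", "generator")  # hypothetical tag
# On exit, the span's operation name and every tag are logged, on success and on error alike.
```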
6 changes: 3 additions & 3 deletions haystack/utils/callable_serialization.py
@@ -40,7 +40,7 @@ def deserialize_callable(callable_handle: str) -> Optional[Callable]:
module = sys.modules.get(module_name, None)
if not module:
raise DeserializationError(f"Could not locate the module of the callable: {module_name}")
streaming_callback = getattr(module, function_name, None)
if not streaming_callback:
deserialized_callable = getattr(module, function_name, None)
if not deserialized_callable:
raise DeserializationError(f"Could not locate the callable: {function_name}")
return streaming_callback
return deserialized_callable
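The change here only renames `streaming_callback` to the more general `deserialized_callable`; behavior is unchanged. Assuming the module's companion `serialize_callable` helper, a round trip looks like:

```python
from haystack.utils.callable_serialization import deserialize_callable, serialize_callable

def my_streaming_callback(chunk):
    print(chunk)

handle = serialize_callable(my_streaming_callback)  # e.g. "__main__.my_streaming_callback"
restored = deserialize_callable(handle)
assert restored is my_streaming_callback
```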
12 changes: 8 additions & 4 deletions haystack/utils/expit.py
@@ -2,9 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0

import numpy as np
from numpy import exp


def expit(x: float) -> float:
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
return 1 / (1 + np.exp(-x))
def expit(x) -> float:
"""
Compute the logistic sigmoid function, mapping input values to the range (0, 1).
:param x: Input value. Can be a scalar or a NumPy array.
"""
return 1 / (1 + exp(-x))
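Dropping the `float` annotation on `x` makes explicit that `expit` also accepts NumPy arrays, since `numpy.exp` broadcasts elementwise. A quick check:

```python
import numpy as np

from haystack.utils.expit import expit

print(expit(0.0))                         # 0.5 for a scalar
print(expit(np.array([-1.0, 0.0, 1.0])))  # approx. [0.269, 0.5, 0.731]
```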
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ dependencies = [
]

[tool.hatch.envs.default]
installer = "uv"
dependencies = [
"pre-commit",
"ruff",
@@ -151,6 +152,7 @@ types = "mypy --install-types --non-interactive --cache-dir=.mypy_cache/ {args:h
lint = "pylint -ry -j 0 {args:haystack}"

[tool.hatch.envs.readme]
installer = "uv"
detached = true # To avoid installing the dependencies from the default environment
dependencies = ["haystack-pydoc-tools"]
