
Commit 17f5b14

Merge branch 'main' into LLM-system-prompt
davidsbatista committed Oct 21, 2024
2 parents de76809 + 322f63d commit 17f5b14
Showing 38 changed files with 430 additions and 53 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/docstrings_linting.yml
@@ -10,6 +10,9 @@ on:
paths:
- "**.py"

env:
HATCH_VERSION: "1.13.0"

jobs:
docstrings-linting:
runs-on: ubuntu-latest
@@ -19,7 +22,7 @@ jobs:
uses: actions/checkout@v4

- name: Install Hatch
run: pip install hatch=="1.9.3"
run: pip install hatch==${{ env.HATCH_VERSION }}

- name: ruff docstrings linting
run: hatch run ruff check haystack
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yml
@@ -18,7 +18,7 @@ on:
env:
PYTHON_VERSION: "3.8"
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"

jobs:
run:
2 changes: 1 addition & 1 deletion .github/workflows/pypi_release.yml
@@ -8,7 +8,7 @@ on:
- "!v[0-9]+.[0-9]+.[0-9]-rc0"

env:
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"

jobs:
release-on-pypi:
4 changes: 2 additions & 2 deletions .github/workflows/readme_sync.yml
@@ -13,7 +13,7 @@ on:
- "!v1.[0-9]+.x"

env:
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"
PYTHON_VERSION: "3.10"

jobs:
@@ -38,7 +38,7 @@ jobs:
# in config files with their id.
README_API_KEY: ${{ secrets.README_API_KEY }}
# The command is a bit misleading, we're not actually syncing anything here,
# we're just generating the markdown files from the the yaml configs.
# we're just generating the markdown files from the yaml configs.
run: hatch run readme:sync

- name: Get version
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -30,7 +30,7 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
PYTHON_VERSION: "3.8"
HATCH_VERSION: "1.9.3"
HATCH_VERSION: "1.13.0"

jobs:
format:
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -236,7 +236,7 @@ tested by the CI, but once again, running the checks locally will speed up the r

To check your code type checking, run:
```sh
hatch run test:type
hatch run test:types
```


@@ -2,9 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional, cast

import numpy as np
from typing import Any, Dict, List, Optional

from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret
@@ -29,6 +27,7 @@ def get_embedding_backend(
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
):
embedding_backend_id = f"{model}{device}{auth_token}{truncate_dim}"

@@ -42,6 +41,7 @@ def get_embedding_backend(
truncate_dim=truncate_dim,
model_kwargs=model_kwargs,
tokenizer_kwargs=tokenizer_kwargs,
config_kwargs=config_kwargs,
)
_SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend
return embedding_backend
@@ -61,6 +61,7 @@ def __init__(
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
):
sentence_transformers_import.check()
self.model = SentenceTransformer(
@@ -71,8 +72,9 @@ def __init__(
truncate_dim=truncate_dim,
model_kwargs=model_kwargs,
tokenizer_kwargs=tokenizer_kwargs,
config_kwargs=config_kwargs,
)

def embed(self, data: List[str], **kwargs) -> List[List[float]]:
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
embeddings = self.model.encode(data, **kwargs).tolist()
return embeddings
@@ -54,6 +54,7 @@ def __init__( # noqa: PLR0913
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
):
"""
@@ -96,10 +97,12 @@ def __init__( # noqa: PLR0913
:param tokenizer_kwargs:
Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
Refer to specific model documentation for available kwargs.
:param config_kwargs:
Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
:param precision:
The precision to use for the embeddings.
All non-float32 precisions are quantized embeddings.
Quantized embeddings are smaller in size and faster to compute, but may have a lower accuracy.
Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
"""

@@ -117,6 +120,7 @@ def __init__( # noqa: PLR0913
self.truncate_dim = truncate_dim
self.model_kwargs = model_kwargs
self.tokenizer_kwargs = tokenizer_kwargs
self.config_kwargs = config_kwargs
self.embedding_backend = None
self.precision = precision

@@ -149,6 +153,7 @@ def to_dict(self) -> Dict[str, Any]:
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
precision=self.precision,
)
if serialization_dict["init_parameters"].get("model_kwargs") is not None:
@@ -186,6 +191,7 @@ def warm_up(self):
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
)
if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]
@@ -34,7 +34,7 @@ class SentenceTransformersTextEmbedder:
```
"""

def __init__(
def __init__( # noqa: PLR0913
self,
model: str = "sentence-transformers/all-mpnet-base-v2",
device: Optional[ComponentDevice] = None,
@@ -48,6 +48,7 @@ def __init__(
truncate_dim: Optional[int] = None,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
):
"""
@@ -86,6 +87,8 @@ def __init__(
:param tokenizer_kwargs:
Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer.
Refer to specific model documentation for available kwargs.
:param config_kwargs:
Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration.
:param precision:
The precision to use for the embeddings.
All non-float32 precisions are quantized embeddings.
@@ -105,6 +108,7 @@ def __init__(
self.truncate_dim = truncate_dim
self.model_kwargs = model_kwargs
self.tokenizer_kwargs = tokenizer_kwargs
self.config_kwargs = config_kwargs
self.embedding_backend = None
self.precision = precision

@@ -135,6 +139,7 @@ def to_dict(self) -> Dict[str, Any]:
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
precision=self.precision,
)
if serialization_dict["init_parameters"].get("model_kwargs") is not None:
@@ -172,6 +177,7 @@ def warm_up(self):
truncate_dim=self.truncate_dim,
model_kwargs=self.model_kwargs,
tokenizer_kwargs=self.tokenizer_kwargs,
config_kwargs=self.config_kwargs,
)
if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"):
self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"]
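Both embedders now forward `config_kwargs` to `AutoConfig.from_pretrained`. A minimal sketch of the new parameter in use — `output_hidden_states` is only an illustrative config option, not one this commit prescribes; check your model's documentation for the kwargs it actually supports:

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-mpnet-base-v2",
    # Forwarded to AutoConfig.from_pretrained when the model loads;
    # "output_hidden_states" is an illustrative assumption, not required.
    config_kwargs={"output_hidden_states": True},
)
embedder.warm_up()  # loads the SentenceTransformer with the extra config
result = embedder.run(text="Hello, world!")
print(len(result["embedding"]))  # dimensionality of the embedding vector
```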
10 changes: 9 additions & 1 deletion haystack/components/routers/file_type_router.py
@@ -54,16 +54,24 @@ class FileTypeRouter:
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
"""

def __init__(self, mime_types: List[str]):
def __init__(self, mime_types: List[str], additional_mimetypes: Optional[Dict[str, str]] = None):
"""
Initialize the FileTypeRouter component.
:param mime_types: A list of MIME types or regex patterns to classify the input files or byte streams.
(for example: `["text/plain", "audio/x-wav", "image/jpeg"]`).
:param additional_mimetypes: A dictionary of MIME types and their file extensions to add to the mimetypes
package, so that unsupported or non-native file types are not left unclassified.
(for example: `{"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"}`).
"""
if not mime_types:
raise ValueError("The list of mime types cannot be empty.")

if additional_mimetypes:
for mime, ext in additional_mimetypes.items():
mimetypes.add_type(mime, ext)

self.mime_type_patterns = []
for mime_type in mime_types:
if not self._is_valid_mime_type_format(mime_type):
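A sketch of the new `additional_mimetypes` parameter in use, reusing the `.docx` mapping from the docstring above; the file names are hypothetical:

```python
from haystack.components.routers import FileTypeRouter

DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# Register .docx with the mimetypes package so such files are routed
# by MIME type instead of ending up unclassified.
router = FileTypeRouter(
    mime_types=["text/plain", DOCX_MIME],
    additional_mimetypes={DOCX_MIME: ".docx"},
)

result = router.run(sources=["notes.txt", "report.docx"])  # hypothetical paths
```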
5 changes: 4 additions & 1 deletion haystack/logging.py
@@ -190,7 +190,10 @@ def patch_make_records_to_use_kwarg_string_interpolation(original_make_records:
@functools.wraps(original_make_records)
def _wrapper(name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None) -> Any:
safe_extra = extra or {}
interpolated_msg = msg.format(**safe_extra)
try:
interpolated_msg = msg.format(**safe_extra)
except (KeyError, ValueError):
interpolated_msg = msg
return original_make_records(name, level, fn, lno, interpolated_msg, (), exc_info, func, extra, sinfo)

return _wrapper
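The new try/except guards against log messages that contain literal braces, which `str.format` would otherwise treat as replacement fields. A quick stdlib-only illustration of the two failure modes the patch now swallows:

```python
# A JSON snippet in a message looks like a format field to str.format:
try:
    'request body was {"key": "value"}'.format()
except KeyError as err:
    print("KeyError:", err)  # the brace content is parsed as a field name

# An unbalanced brace raises ValueError instead:
try:
    "progress: 50% {".format()
except ValueError as err:
    print("ValueError:", err)  # Single '{' encountered in format string
```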
3 changes: 0 additions & 3 deletions haystack/testing/test_utils.py
@@ -5,8 +5,6 @@
import os
import random

import numpy as np

from haystack import logging

logger = logging.getLogger(__name__)
@@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
"""
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

try:
82 changes: 82 additions & 0 deletions haystack/tracing/logging_tracer.py
@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import contextlib
import dataclasses
from typing import Any, Dict, Iterator, Optional

from haystack import logging
from haystack.tracing import Span, Tracer

logger = logging.getLogger(__name__)

RESET_COLOR = "\033[0m"


@dataclasses.dataclass
class LoggingSpan(Span):
operation_name: str
tags: Dict[str, Any] = dataclasses.field(default_factory=dict)

def set_tag(self, key: str, value: Any) -> None:
"""
Set a single tag on the span.
:param key: the name of the tag.
:param value: the value of the tag.
"""
self.tags[key] = value


class LoggingTracer(Tracer):
"""
A simple tracer that logs the operation name and tags of a span.
"""

def __init__(self, tags_color_strings: Optional[Dict[str, str]] = None) -> None:
"""
Initialize the LoggingTracer.
:param tags_color_strings:
A dictionary that maps tag names to color strings that should be used when logging the tags.
The color strings should be in the format of
[ANSI escape codes](https://en.wikipedia.org/wiki/ANSI_escape_code#Colors).
For example, to color the tag "haystack.component.input" in red, you would pass
`tags_color_strings={"haystack.component.input": "\x1b[1;31m"}`.
"""

self.tags_color_strings = tags_color_strings or {}

@contextlib.contextmanager
def trace(self, operation_name: str, tags: Optional[Dict[str, Any]] = None) -> Iterator[Span]:
"""
Trace the execution of a block of code.
:param operation_name: the name of the operation being traced.
:param tags: tags to apply to the newly created span.
:returns: the newly created span.
"""

custom_span = LoggingSpan(operation_name, tags=tags or {})

try:
yield custom_span
except Exception as e:
raise e
# we make sure to log the operation name and tags of the span when the context manager exits
# both in case of success and error
finally:
operation_name = custom_span.operation_name
tags = custom_span.tags or {}
logger.debug("Operation: {operation_name}", operation_name=operation_name)
for tag_name, tag_value in tags.items():
color_string = self.tags_color_strings.get(tag_name, "")
logger.debug(
color_string + "{tag_name}={tag_value}" + RESET_COLOR, tag_name=tag_name, tag_value=tag_value
)

def current_span(self) -> Optional[Span]:
"""Return the current active span, if any."""
# we don't store spans in this simple tracer
return None
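A minimal sketch of the new tracer in use, assuming DEBUG-level logging is configured; the ANSI color string is the red example from the docstring, and the tag names are hypothetical:

```python
import logging

from haystack.tracing.logging_tracer import LoggingTracer

logging.basicConfig(level=logging.DEBUG)  # LoggingTracer logs at DEBUG level

tracer = LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m"})

with tracer.trace("pipeline.run", tags={"haystack.component.input": "What is Haystack?"}) as span:
    span.set_tag("haystack.component.name", "generator")  # hypothetical tag
# On exit, the span's operation name and every tag are logged, on success and on error alike.
```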
6 changes: 3 additions & 3 deletions haystack/utils/callable_serialization.py
@@ -40,7 +40,7 @@ def deserialize_callable(callable_handle: str) -> Optional[Callable]:
module = sys.modules.get(module_name, None)
if not module:
raise DeserializationError(f"Could not locate the module of the callable: {module_name}")
streaming_callback = getattr(module, function_name, None)
if not streaming_callback:
deserialized_callable = getattr(module, function_name, None)
if not deserialized_callable:
raise DeserializationError(f"Could not locate the callable: {function_name}")
return streaming_callback
return deserialized_callable
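The change here only renames `streaming_callback` to the more general `deserialized_callable`; behavior is unchanged. Assuming the module's companion `serialize_callable` helper, a round trip looks like:

```python
from haystack.utils.callable_serialization import deserialize_callable, serialize_callable

def my_streaming_callback(chunk):
    print(chunk)

handle = serialize_callable(my_streaming_callback)  # e.g. "__main__.my_streaming_callback"
restored = deserialize_callable(handle)
assert restored is my_streaming_callback
```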
12 changes: 8 additions & 4 deletions haystack/utils/expit.py
@@ -2,9 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0

import numpy as np
from numpy import exp


def expit(x: float) -> float:
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
return 1 / (1 + np.exp(-x))
def expit(x) -> float:
"""
Compute the logistic sigmoid function, mapping input values to the range (0, 1).
:param x: Input value. Can be a scalar or a NumPy array.
"""
return 1 / (1 + exp(-x))
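Dropping the `float` annotation on `x` makes explicit that `expit` also accepts NumPy arrays, since `numpy.exp` broadcasts elementwise. A quick check:

```python
import numpy as np

from haystack.utils.expit import expit

print(expit(0.0))                         # 0.5 for a scalar
print(expit(np.array([-1.0, 0.0, 1.0])))  # approx. [0.269, 0.5, 0.731]
```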
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ dependencies = [
]

[tool.hatch.envs.default]
installer = "uv"
dependencies = [
"pre-commit",
"ruff",
@@ -151,6 +152,7 @@ types = "mypy --install-types --non-interactive --cache-dir=.mypy_cache/ {args:h
lint = "pylint -ry -j 0 {args:haystack}"

[tool.hatch.envs.readme]
installer = "uv"
detached = true # To avoid installing the dependencies from the default environment
dependencies = ["haystack-pydoc-tools"]
