fix!: make PyPDFToDocument JSON-serializable (#6396)

* add registry * release not * add checks * rm superflous check * fix typo * rm print :-)
deepset-ai · Nov 23, 2023 · b0b5147 · b0b5147
1 parent a492771
commit b0b5147
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 9 deletions.
diff --git a/haystack/preview/components/converters/pypdf.py b/haystack/preview/components/converters/pypdf.py
@@ -1,11 +1,11 @@
 import io
 import logging
-from typing import List, Union, Optional, Protocol
+from typing import List, Union, Protocol, Dict
 from pathlib import Path
 
 from haystack.preview.dataclasses import ByteStream
 from haystack.preview.lazy_imports import LazyImport
-from haystack.preview import Document, component
+from haystack.preview import Document, component, default_to_dict
 
 with LazyImport("Run 'pip install pypdf'") as pypdf_import:
     from pypdf import PdfReader
@@ -34,6 +34,11 @@ def convert(self, reader: "PdfReader") -> Document:
         return Document(content=text)
 
 
+# This registry is used to store converters names and instances.
+# It can be used to register custom converters.
+CONVERTERS_REGISTRY: Dict[str, PyPDFConverter] = {"default": DefaultConverter()}
+
+
 @component
 class PyPDFToDocument:
     """
@@ -42,14 +47,27 @@ class PyPDFToDocument:
     A default text extraction converter is used if no custom converter is provided.
     """
 
-    def __init__(self, converter: Optional[PyPDFConverter] = None):
+    def __init__(self, converter_name: str = "default"):
         """
         Initializes the PyPDFToDocument component with an optional custom converter.
-        :param converter: A converter instance that adheres to the PyPDFConverter protocol.
-                          If None, the DefaultConverter is used.
+        :param converter_name: A converter name that is registered in the CONVERTERS_REGISTRY.
+            Defaults to 'default'.
         """
         pypdf_import.check()
-        self.converter: PyPDFConverter = converter or DefaultConverter()
+
+        try:
+            converter = CONVERTERS_REGISTRY[converter_name]
+        except KeyError:
+            msg = (
+                f"Invalid converter_name: {converter_name}.\n Available converters: {list(CONVERTERS_REGISTRY.keys())}"
+            )
+            raise ValueError(msg) from KeyError
+        self.converter_name = converter_name
+        self._converter: PyPDFConverter = converter
+
+    def to_dict(self):
+        # do not serialize the _converter instance
+        return default_to_dict(self, converter_name=self.converter_name)
 
     @component.output_types(documents=List[Document])
     def run(self, sources: List[Union[str, Path, ByteStream]]):
@@ -63,7 +81,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
         for source in sources:
             try:
                 pdf_reader = self._get_pdf_reader(source)
-                document = self.converter.convert(pdf_reader)
+                document = self._converter.convert(pdf_reader)
             except Exception as e:
                 logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
                 continue

diff --git a/releasenotes/notes/fix-pypdf-serialization-93744fd01ef5b841.yaml b/releasenotes/notes/fix-pypdf-serialization-93744fd01ef5b841.yaml
@@ -0,0 +1,5 @@
+---
+preview:
+  - |
+    Make `PyPDFToDocument` accept a `converter_name` parameter instead of
+    a `converter` instance, to allow a smooth serialization of the component.
diff --git a/test/preview/components/converters/test_pypdf_to_document.py b/test/preview/components/converters/test_pypdf_to_document.py
@@ -3,11 +3,20 @@
 from pypdf import PdfReader
 
 from haystack.preview import Document
-from haystack.preview.components.converters.pypdf import PyPDFToDocument
+from haystack.preview.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
 from haystack.preview.dataclasses import ByteStream
 
 
 class TestPyPDFToDocument:
+    def test_init(self):
+        component = PyPDFToDocument()
+        assert component.converter_name == "default"
+        assert hasattr(component, "_converter")
+
+    def test_init_fail_nonexisting_converter(self):
+        with pytest.raises(ValueError):
+            PyPDFToDocument(converter_name="non_existing_converter")
+
     @pytest.mark.unit
     def test_run(self, preview_samples_path):
         """
@@ -58,7 +67,9 @@ class MyCustomConverter:
             def convert(self, reader: PdfReader) -> Document:
                 return Document(content="I don't care about converting given pdfs, I always return this")
 
-        converter = PyPDFToDocument(converter=MyCustomConverter())
+        CONVERTERS_REGISTRY["custom"] = MyCustomConverter()
+
+        converter = PyPDFToDocument(converter_name="custom")
         output = converter.run(sources=paths)
         docs = output["documents"]
         assert len(docs) == 1