data pond: expose readable datasets as dataframes and arrow tables #1507

Merged 119 commits from exp/1095-expose-readable-datasets into devel on Oct 8, 2024.
The diff below shows changes from 113 of the 119 commits.
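The diff adds `SupportsReadableDataset` and `SupportsReadableRelation` protocols to `dlt/common/destination/reference.py` (shown below). Here is a minimal usage sketch of the API they describe; the `_dataset()` accessor name is an assumption (an accessor rename happens mid-PR, see the "rename dataset accessor function" commit), so treat everything outside the protocol methods as illustrative.

```python
import dlt

pipeline = dlt.pipeline("fruit_pipeline", destination="duckdb", dataset_name="fruit_data")
pipeline.run([{"name": "apple", "qty": 1}, {"name": "pear", "qty": 2}], table_name="fruits")

# hypothetical accessor returning a SupportsReadableDataset
dataset = pipeline._dataset()

# relations are created from a table name (__getitem__ / __getattr__) or a query (__call__)
fruits = dataset["fruits"]
df = fruits.df()        # full result as a pandas DataFrame
table = fruits.arrow()  # full result as a pyarrow Table

# chunked access for results larger than memory
for chunk in fruits.iter_df(chunk_size=500):
    print(len(chunk))

# DBAPI-style access is part of the same protocol
rows = dataset("SELECT name, qty FROM fruits").fetchall()
```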
Commits
af6a40e
add simple ibis helper
sh-rp Jun 19, 2024
3a69ece
start working on dataframe reading interface
sh-rp Jun 20, 2024
4324650
a bit more work
sh-rp Jun 20, 2024
7c960df
first simple implementation
sh-rp Jun 21, 2024
86b89ac
small change
sh-rp Jun 21, 2024
5a8ea54
more work on dataset
sh-rp Jun 21, 2024
36e94af
some work on filesystem destination
sh-rp Jun 24, 2024
20bf9ce
add support for parquet files and compression on jsonl files in files…
sh-rp Jun 26, 2024
6dce626
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Jul 17, 2024
a0ff55f
fix test after devel merge
sh-rp Jul 17, 2024
c297e96
add nice composable pipeline example
sh-rp Jul 17, 2024
d020403
small updates to demo
sh-rp Jul 18, 2024
5c3db47
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Aug 6, 2024
79ef7dd
enable tests for all bucket providers
sh-rp Aug 6, 2024
ff40079
fix tests
sh-rp Aug 6, 2024
ac415b9
create views in duckdb filesystem accessor
sh-rp Aug 6, 2024
c92a527
move to relations based interface
sh-rp Aug 6, 2024
13ec73b
add generic duckdb interface to filesystem
sh-rp Aug 6, 2024
46e0226
move code for accessing frames and tables to the cursor and use duckd…
sh-rp Aug 6, 2024
7cf69a7
add native db api cursor fetching to exposed dataset
sh-rp Aug 7, 2024
6ffe302
some small changes
sh-rp Aug 7, 2024
c200262
switch dataaccess pandas to pyarrow
sh-rp Aug 7, 2024
226454f
add native bigquery support for df and arrow tables
sh-rp Aug 7, 2024
3296e63
change iter functions to always expect chunk size (None will default …
sh-rp Aug 7, 2024
6f6500f
add native implementation for databricks
sh-rp Aug 7, 2024
152b788
add dremio native implementation for full frames and tables
sh-rp Aug 7, 2024
6d73bc5
fix filesystem test
sh-rp Aug 7, 2024
bdb39ba
add test for evolving filesystem
sh-rp Aug 7, 2024
3ead92b
fix empty dataframe retrieval
sh-rp Aug 7, 2024
9fcbd00
remove old df test
sh-rp Aug 7, 2024
28ee1c6
clean up interfaces a bit (more to come?)
sh-rp Aug 8, 2024
28cb282
move dataset creation into destination client and clean up interfaces…
sh-rp Aug 8, 2024
77926fa
renames some interfaces and adds brief docstrings
sh-rp Aug 8, 2024
6ef04bc
add filesystem cached duckdb and remove the need to declare needed vi…
sh-rp Aug 8, 2024
ec13b49
fix tests for snowflake
sh-rp Aug 8, 2024
b222d1d
make data set a function
sh-rp Aug 8, 2024
9f0a6a5
fix db-types dependency for bigquery
sh-rp Aug 8, 2024
289b63c
create duckdb based sql client for filesystem
sh-rp Aug 13, 2024
779bca6
fix example pipeline
sh-rp Aug 13, 2024
584ab47
enable filesystem sql client to work on streamlit
sh-rp Aug 13, 2024
6594053
add comments
sh-rp Aug 13, 2024
9e0a61d
rename sql to query
sh-rp Aug 13, 2024
dd47326
fix tests that rely on sql client
sh-rp Aug 13, 2024
9f8f79b
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 18, 2024
fda1cb5
post merge cleanups
sh-rp Sep 18, 2024
c7a0e05
move imports around a bit
sh-rp Sep 18, 2024
8497036
exclude abfss buckets from test
sh-rp Sep 19, 2024
3dc2c90
add support for arrow schema creation from known dlt schema
sh-rp Aug 13, 2024
d6bec38
re-use sqldatabase code for cursors
sh-rp Sep 19, 2024
62ea3ba
fix bug
sh-rp Sep 19, 2024
3fd4d61
add default columns where needed
sh-rp Sep 19, 2024
eeca4ac
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 20, 2024
52f8523
add sqlglot to filesystem deps
sh-rp Sep 20, 2024
90c669a
store filesystem tables in correct dataset
sh-rp Sep 20, 2024
7657fb1
move cursor columns location
sh-rp Sep 20, 2024
352b238
fix snowflake and mssql
sh-rp Sep 20, 2024
5fadeeb
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 20, 2024
9a1752d
clean up compose files a bit
sh-rp Sep 20, 2024
a77192f
fix sqlalchemy
sh-rp Sep 20, 2024
420eaf1
add mysql docker compose file
sh-rp Sep 20, 2024
97e2757
fix linting
sh-rp Sep 20, 2024
df4f6d0
prepare hint checking
sh-rp Sep 20, 2024
6b27b98
disable part of state test
sh-rp Sep 22, 2024
ffba901
enable hint check
sh-rp Sep 23, 2024
fab5232
add column type support for filesystem json
sh-rp Sep 23, 2024
0de4a6c
rename dataset implementation to DBAPI
sh-rp Sep 23, 2024
077a25a
wrap functions in dbapi readable dataset
sh-rp Sep 23, 2024
13a759b
remove example pipeline
sh-rp Sep 23, 2024
10e04d6
rename test_decimal_name
sh-rp Sep 23, 2024
5077ce1
make column code a bit clearer and fix mssql again
sh-rp Sep 23, 2024
1025560
rename df methods to pandas
sh-rp Sep 23, 2024
f8927d3
fix bug in default columns
sh-rp Sep 23, 2024
7fd3c62
fix hints test and columns bug
sh-rp Sep 23, 2024
3a76178
catch mysql error if no rows returned
sh-rp Sep 23, 2024
27104e3
add exceptions for not implemented bucket and filetypes
sh-rp Sep 23, 2024
1c06d11
fix docs
sh-rp Sep 23, 2024
e5b3688
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 23, 2024
7d09bdb
add config section for getting pipeline clients
sh-rp Sep 26, 2024
dbe4baa
set default dataset in filesystem sqlclient
sh-rp Sep 26, 2024
f4e0099
add config section for sync_destination
sh-rp Sep 26, 2024
80fe898
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 26, 2024
d698cd5
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 27, 2024
857803c
rename readablerelation methods
sh-rp Sep 30, 2024
8055529
use more functions of the duckdb sql client in filesystem version
sh-rp Sep 30, 2024
24c7308
update dependencies
sh-rp Sep 30, 2024
76759cf
use active pipeline capabilities if available for arrow table
sh-rp Sep 30, 2024
d3d8381
update types
sh-rp Sep 30, 2024
f9a766d
rename dataset accessor function
sh-rp Sep 30, 2024
b6c7fbc
add test for accessing tables with unqualified table name
sh-rp Sep 30, 2024
86fc914
fix sql client
sh-rp Sep 30, 2024
58380ec
add duckdb native support for azure, s3 and gcs (via s3)
sh-rp Sep 30, 2024
0a24b3a
some typing
sh-rp Sep 30, 2024
bef50d7
add dataframes tests back in
sh-rp Sep 30, 2024
b13e492
add join table and update view tests for filesystem
sh-rp Sep 30, 2024
92ea515
start adding tests for creating views on remote duckdb
sh-rp Sep 30, 2024
e1fa308
fix snippets
sh-rp Sep 30, 2024
a7958d5
fix some dependencies and mssql/synapse tests
sh-rp Sep 30, 2024
ed197ea
fix bigquery dependencies and abfss tests
sh-rp Oct 1, 2024
0ec1656
add tests for adding view to external dbs and persistent secrets
sh-rp Oct 1, 2024
9cd4173
add support for delta tables
sh-rp Oct 1, 2024
7dba771
add duckdb to read interface tests
sh-rp Oct 1, 2024
3e96a6c
fix delta tests
sh-rp Oct 1, 2024
355f5b6
make default secret name derived from bucket url
sh-rp Oct 1, 2024
9002f02
try fix azure tests again
sh-rp Oct 1, 2024
c3050d4
fix df access tests
sh-rp Oct 2, 2024
bbc0525
PR fixes
sh-rp Oct 2, 2024
ef148c3
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Oct 2, 2024
a99e987
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Oct 2, 2024
eaf1cd8
correct internal table access
sh-rp Oct 4, 2024
6bb7117
allow datasets without schema
sh-rp Oct 4, 2024
6648b86
skips parametrized queries, skips tables from non-dataset schemas
rudolfix Oct 6, 2024
89a9861
move filesystem specific sql_client tests to correct location and tes…
sh-rp Oct 7, 2024
631d50b
fix sql client tests
sh-rp Oct 7, 2024
8e2e37c
make secret name when dropping optional
sh-rp Oct 7, 2024
dc383fc
fix gs test
sh-rp Oct 7, 2024
41926ae
remove moved filesystem tests from test_read_interfaces
sh-rp Oct 7, 2024
9b8437a
fix sql client tests again... :)
sh-rp Oct 7, 2024
5d14045
clear duckdb secrets
sh-rp Oct 8, 2024
fb9a445
disable secrets deleting for delta tests
sh-rp Oct 8, 2024
Files changed
2 changes: 1 addition & 1 deletion .github/workflows/test_destinations.yml
@@ -77,7 +77,7 @@ jobs:

- name: Install dependencies
# if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-        run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline -E deltalake
+        run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml
2 changes: 1 addition & 1 deletion .github/workflows/test_doc_snippets.yml
@@ -60,7 +60,7 @@ jobs:
uses: actions/checkout@master

- name: Start weaviate
run: docker compose -f ".github/weaviate-compose.yml" up -d
run: docker compose -f "tests/load/weaviate/docker-compose.yml" up -d

- name: Setup Python
uses: actions/setup-python@v4
4 changes: 2 additions & 2 deletions .github/workflows/test_local_destinations.yml
@@ -73,7 +73,7 @@ jobs:
uses: actions/checkout@master

- name: Start weaviate
-        run: docker compose -f ".github/weaviate-compose.yml" up -d
+        run: docker compose -f "tests/load/weaviate/docker-compose.yml" up -d

- name: Setup Python
uses: actions/setup-python@v4
@@ -122,7 +122,7 @@ jobs:

- name: Stop weaviate
if: always()
-        run: docker compose -f ".github/weaviate-compose.yml" down -v
+        run: docker compose -f "tests/load/weaviate/docker-compose.yml" down -v

- name: Stop SFTP server
if: always()
5 changes: 3 additions & 2 deletions .github/workflows/test_pyarrow17.yml
@@ -65,11 +65,12 @@ jobs:
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-pyarrow17

- name: Install dependencies
-        run: poetry install --no-interaction --with sentry-sdk --with pipeline -E deltalake -E gs -E s3 -E az
+        run: poetry install --no-interaction --with sentry-sdk --with pipeline -E deltalake -E duckdb -E filesystem -E gs -E s3 -E az


- name: Upgrade pyarrow
run: poetry run pip install pyarrow==17.0.0

- name: create secrets.toml
run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml

3 changes: 0 additions & 3 deletions .github/workflows/test_sqlalchemy_destinations.yml
@@ -94,6 +94,3 @@ jobs:
# always run full suite, also on branches
- run: poetry run pytest tests/load -x --ignore tests/load/sources
name: Run tests Linux
-        env:
-          DESTINATION__SQLALCHEMY_MYSQL__CREDENTIALS: mysql://root:[email protected]:3306/dlt_data # Use root cause we need to create databases
-          DESTINATION__SQLALCHEMY_SQLITE__CREDENTIALS: sqlite:///_storage/dl_data.sqlite
9 changes: 8 additions & 1 deletion Makefile
@@ -109,4 +109,11 @@ test-build-images: build-library

preprocess-docs:
# run docs preprocessing to run a few checks and ensure examples can be parsed
-	cd docs/website && npm i && npm run preprocess-docs
+	cd docs/website && npm i && npm run preprocess-docs
+
+start-test-containers:
+	docker compose -f "tests/load/dremio/docker-compose.yml" up -d
+	docker compose -f "tests/load/postgres/docker-compose.yml" up -d
+	docker compose -f "tests/load/weaviate/docker-compose.yml" up -d
+	docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d
+	docker compose -f "tests/load/sqlalchemy/docker-compose.yml" up -d
17 changes: 2 additions & 15 deletions dlt/common/data_writers/writers.py
@@ -320,23 +320,10 @@ def _create_writer(self, schema: "pa.Schema") -> "pa.parquet.ParquetWriter":
)

def write_header(self, columns_schema: TTableSchemaColumns) -> None:
-        from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype
+        from dlt.common.libs.pyarrow import columns_to_arrow

# build schema
-        self.schema = pyarrow.schema(
-            [
-                pyarrow.field(
-                    name,
-                    get_py_arrow_datatype(
-                        schema_item,
-                        self._caps,
-                        self.timestamp_timezone,
-                    ),
-                    nullable=is_nullable_column(schema_item),
-                )
-                for name, schema_item in columns_schema.items()
-            ]
-        )
+        self.schema = columns_to_arrow(columns_schema, self._caps, self.timestamp_timezone)
# find row items that are of the json type (could be abstracted out for use in other writers?)
self.nested_indices = [
i for i, field in columns_schema.items() if field["data_type"] == "json"
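The deleted block and the new one-liner pin down what `columns_to_arrow` must do. For reference, a sketch of such a helper reconstructed from the deleted lines; the import paths are assumptions, and this is not the actual source of `columns_to_arrow`:

```python
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype
from dlt.common.schema.typing import TTableSchemaColumns
from dlt.common.schema.utils import is_nullable_column  # assumed location of this helper


def columns_to_arrow_sketch(
    columns_schema: TTableSchemaColumns,
    caps: DestinationCapabilitiesContext,
    timestamp_timezone: str,
) -> "pyarrow.Schema":
    # one pyarrow field per dlt column, mapping data type and nullability
    # exactly as the deleted write_header code did
    return pyarrow.schema(
        [
            pyarrow.field(
                name,
                get_py_arrow_datatype(schema_item, caps, timestamp_timezone),
                nullable=is_nullable_column(schema_item),
            )
            for name, schema_item in columns_schema.items()
        ]
    )
```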
85 changes: 84 additions & 1 deletion dlt/common/destination/reference.py
@@ -1,6 +1,8 @@
from abc import ABC, abstractmethod
import dataclasses
from importlib import import_module
from contextlib import contextmanager

from types import TracebackType
from typing import (
Callable,
@@ -18,24 +20,33 @@
Any,
TypeVar,
Generic,
Generator,
TYPE_CHECKING,
Protocol,
Tuple,
AnyStr,
)
from typing_extensions import Annotated
import datetime # noqa: 251
import inspect

from dlt.common import logger, pendulum

from dlt.common.configuration.specs.base_configuration import extract_inner_hint
from dlt.common.destination.typing import PreparedTableSchema
from dlt.common.destination.utils import verify_schema_capabilities, verify_supported_data_types
from dlt.common.exceptions import TerminalException
from dlt.common.metrics import LoadJobMetrics
from dlt.common.normalizers.naming import NamingConvention
from dlt.common.schema import Schema, TSchemaTables
from dlt.common.schema.typing import TTableSchemaColumns

from dlt.common.schema import Schema, TSchemaTables, TTableSchema
from dlt.common.schema.typing import (
C_DLT_LOAD_ID,
TLoaderReplaceStrategy,
)
from dlt.common.schema.utils import fill_hints_from_parent_and_clone_table

from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved
from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
@@ -49,13 +60,26 @@
from dlt.common.storages import FileStorage
from dlt.common.storages.load_storage import ParsedLoadJobFileName
from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc
from dlt.common.exceptions import MissingDependencyException


TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration")
TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase")
TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration")

DEFAULT_FILE_LAYOUT = "{table_name}/{load_id}.{file_id}.{ext}"

if TYPE_CHECKING:
try:
from dlt.common.libs.pandas import DataFrame
from dlt.common.libs.pyarrow import Table as ArrowTable
except MissingDependencyException:
DataFrame = Any
ArrowTable = Any
else:
DataFrame = Any
ArrowTable = Any


class StorageSchemaInfo(NamedTuple):
version_hash: str
@@ -442,6 +466,65 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]:
return []


class SupportsReadableRelation(Protocol):
"""A readable relation retrieved from a destination that supports it"""

schema_columns: TTableSchemaColumns
"""Known dlt table columns for this relation"""

def df(self, chunk_size: int = None) -> Optional[DataFrame]:
"""Fetches the results as data frame. For large queries the results may be chunked

Fetches the results into a data frame. The default implementation uses helpers in `pandas.io.sql` to generate Pandas data frame.
This function will try to use native data frame generation for particular destination. For `BigQuery`: `QueryJob.to_dataframe` is used.
For `duckdb`: `DuckDBPyConnection.df'

Args:
chunk_size (int, optional): Will chunk the results into several data frames. Defaults to None
**kwargs (Any): Additional parameters which will be passed to native data frame generation function.

Returns:
Optional[DataFrame]: A data frame with query results. If chunk_size > 0, None will be returned if there is no more data in results
"""
...

def arrow(self, chunk_size: int = None) -> Optional[ArrowTable]: ...

def iter_df(self, chunk_size: int) -> Generator[DataFrame, None, None]: ...

def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]: ...

def fetchall(self) -> List[Tuple[Any, ...]]: ...

def fetchmany(self, chunk_size: int) -> List[Tuple[Any, ...]]: ...

def iter_fetch(self, chunk_size: int) -> Generator[List[Tuple[Any, ...]], Any, Any]: ...

def fetchone(self) -> Optional[Tuple[Any, ...]]: ...


class DBApiCursor(SupportsReadableRelation):
"""Protocol for DBAPI cursor"""

description: Tuple[Any, ...]

native_cursor: "DBApiCursor"
"""Cursor implementation native to current destination"""

def execute(self, query: AnyStr, *args: Any, **kwargs: Any) -> None: ...
def close(self) -> None: ...


class SupportsReadableDataset(Protocol):
"""A readable dataset retrieved from a destination, has support for creating readable relations for a query or table"""

def __call__(self, query: Any) -> SupportsReadableRelation: ...

def __getitem__(self, table: str) -> SupportsReadableRelation: ...

def __getattr__(self, table: str) -> SupportsReadableRelation: ...


class JobClientBase(ABC):
def __init__(
self,
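Since `iter_arrow` yields chunks, a consumer can stream an arbitrarily large relation in bounded memory. A sketch of that pattern; `relation` is anything satisfying `SupportsReadableRelation`, and the function name is made up for illustration:

```python
import pyarrow.parquet as pq


def relation_to_parquet(relation, path: str, chunk_size: int = 10_000) -> None:
    """Stream a readable relation to a parquet file chunk by chunk."""
    writer = None
    try:
        for chunk in relation.iter_arrow(chunk_size=chunk_size):
            if writer is None:
                # open the writer lazily, using the first chunk's schema
                writer = pq.ParquetWriter(path, chunk.schema)
            writer.write_table(chunk)
    finally:
        if writer is not None:
            writer.close()
```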
1 change: 1 addition & 0 deletions dlt/common/libs/pandas.py
@@ -3,6 +3,7 @@

try:
import pandas
+    from pandas import DataFrame
except ModuleNotFoundError:
raise MissingDependencyException("dlt Pandas Helpers", ["pandas"])
