From 8c0934c13d046c7b00ce9e43c183f4cb03cc3992 Mon Sep 17 00:00:00 2001 From: Josh Wills Date: Wed, 21 Jun 2023 09:06:00 -0400 Subject: [PATCH] Support alternate string formatting strategies for external sources --- README.md | 13 +++++++++++-- dbt/adapters/duckdb/relation.py | 21 +++++++++++++++------ tests/functional/adapter/test_sources.py | 12 +++++++++--- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9e812427..23c33a31 100644 --- a/README.md +++ b/README.md @@ -234,8 +234,7 @@ FROM read_parquet(['s3://my-bucket/my-sources/source2a.parquet', 's3://my-bucket ``` Note that the value of the `external_location` property does not need to be a path-like string; it can also be a function -call, which is helpful in the case that you have an external source that is a CSV file which requires special handling for DuckDB -to load it correctly: +call, which is helpful in the case that you have an external source that is a CSV file which requires special handling for DuckDB to load it correctly: ``` sources: @@ -244,8 +243,18 @@ sources: - name: flights meta: external_location: "read_csv('flights.csv', types={'FlightDate': 'DATE'}, names=['FlightDate', 'UniqueCarrier'])" + formatter: oldstyle ``` +Note that we need to override the default `str.format` string formatting strategy for this example +because the `types={'FlightDate': 'DATE'}` argument to the `read_csv` function will be interpreted by +`str.format` as a template to be matched on, which will cause a `KeyError: "'FlightDate'"` when we attempt +to parse the source in a dbt model. The `formatter` configuration option for the source indicates whether +we should use `newstyle` string formatting (the default), `oldstyle` string formatting, or `template` string +formatting. 
You can read about how each of these string formatting techniques works in this +[Stack Overflow answer](https://stackoverflow.com/questions/13451989/pythons-many-ways-of-string-formatting-are-the-older-ones-going-to-be-depre) and see examples of their use +in this [dbt-duckdb integration test](https://github.com/jwills/dbt-duckdb/blob/master/tests/functional/adapter/test_sources.py). + #### Writing to external files We support creating dbt models that are backed by external files via the `external` materialization strategy: diff --git a/dbt/adapters/duckdb/relation.py b/dbt/adapters/duckdb/relation.py index 6e2fe076..665b499f 100644 --- a/dbt/adapters/duckdb/relation.py +++ b/dbt/adapters/duckdb/relation.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from string import Template from typing import Any from typing import Optional from typing import Type @@ -26,12 +27,20 @@ def create_from_source(cls: Type[Self], source: SourceDefinition, **kwargs: Any) if DuckDBConnectionManager._ENV is not None: # No connection means we are probably in the dbt parsing phase, so don't load yet. DuckDBConnectionManager.env().load_source(plugin_name, source_config) - elif "external_location" in source_config.meta: - # Call str.format with the schema, name and identifier for the source so that they - # can be injected into the string; this helps reduce boilerplate when all - # of the tables in the source have a similar location based on their name - # and/or identifier. 
- ext_location = source_config["external_location"].format(**source_config.as_dict()) + elif "external_location" in source_config: + ext_location_template = source_config["external_location"] + formatter = source_config.get("formatter", "newstyle") + if formatter == "newstyle": + ext_location = ext_location_template.format_map(source_config.as_dict()) + elif formatter == "oldstyle": + ext_location = ext_location_template % source_config.as_dict() + elif formatter == "template": + ext_location = Template(ext_location_template).substitute(source_config.as_dict()) + else: + raise ValueError( + f"Formatter {formatter} not recognized. Must be one of 'newstyle', 'oldstyle', or 'template'." + ) + # If it's a function call or already has single quotes, don't add them if "(" not in ext_location and not ext_location.startswith("'"): ext_location = f"'{ext_location}'" diff --git a/tests/functional/adapter/test_sources.py b/tests/functional/adapter/test_sources.py index a0ad5541..85070000 100644 --- a/tests/functional/adapter/test_sources.py +++ b/tests/functional/adapter/test_sources.py @@ -22,14 +22,20 @@ - name: seeds_ost identifier: "seeds_other_source_table" config: - external_location: "read_csv_auto('/tmp/{identifier}.csv')" + external_location: "read_csv_auto('/tmp/%(identifier)s.csv')" + formatter: oldstyle + - name: seeds_other_source_table + config: + external_location: "read_csv_auto('/tmp/${name}.csv')" + formatter: template """ models_source_model_sql = """select * from {{ source('external_source', 'seeds_source') }} """ -models_multi_source_model_sql = """select * from {{ source('external_source', 'seeds_source') }} - inner join {{ source('external_source', 'seeds_ost') }} USING (id) +models_multi_source_model_sql = """select s.* from {{ source('external_source', 'seeds_source') }} s + inner join {{ source('external_source', 'seeds_ost') }} oldstyle USING (id) + inner join {{ source('external_source', 'seeds_other_source_table') }} tmpl USING (id) """