diff --git a/.circleci/config.yml b/.circleci/config.yml
index c78dd5b..e6ee9e2 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -2,7 +2,8 @@ version: 2.1
 
 jobs:
-  integration-tests:
+  integration-tests-core:
+
     docker:
       - image: cimg/python:3.9.9
       - image: cimg/postgres:14.0
 
@@ -13,19 +14,27 @@ jobs:
       DBT_PROFILES_DIR: ./integration_tests/ci
       DBT_PROJECT_DIR: ./integration_tests
      BIGQUERY_SERVICE_KEY_PATH: "/home/circleci/bigquery-service-key.json"
-      DBT_VERSION: 1.6.0
+      DBT_VERSION: 1.6.*
 
     steps:
       - checkout
-      - run:
-          name: Install Python packages
+      - run: &pip-install-core
+          name: Install core Python packages & dbt-core
           command: |
             python3 -m venv venv
             . venv/bin/activate
             pip install -U pip setuptools wheel
-            pip install dbt-core==$DBT_VERSION dbt-postgres==$DBT_VERSION dbt-bigquery==$DBT_VERSION dbt-snowflake==$DBT_VERSION dbt-duckdb==$DBT_VERSION
+            pip install "dbt-core==$DBT_VERSION"
 
       - run:
+          name: Install dbt adapter packages
+          command: |
+            python3 -m venv venv
+            . venv/bin/activate
+            pip install "dbt-postgres==$DBT_VERSION" "dbt-bigquery==$DBT_VERSION" "dbt-snowflake==$DBT_VERSION"
+            pip install "dbt-duckdb==$DBT_VERSION"
+
+      - run: &dbt-deps
           name: Install dbt dependencies
           command: |
             . venv/bin/activate
@@ -74,12 +83,65 @@ jobs:
       - store_artifacts:
           path: ./logs
 
+  integration-tests-spark-thrift:
+
+    docker:
+      - image: cimg/python:3.9.9
+      - image: godatadriven/spark:3.1.1
+        environment:
+          WAIT_FOR: localhost:5432
+        command: >
+          --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
+          --name Thrift JDBC/ODBC Server
+      - image: postgres:9.6.17-alpine
+        environment:
+          POSTGRES_USER: dbt
+          POSTGRES_PASSWORD: dbt
+          POSTGRES_DB: metastore
+
+    resource_class: small
+
+    environment:
+      DBT_PROFILES_DIR: ./integration_tests/ci
+      DBT_PROJECT_DIR: ./integration_tests
+      DBT_VERSION: 1.6.*
+
+    steps:
+      - checkout
+      - run:
+          name: Install Ubuntu packages
+          command: |
+            sudo apt-get update
+            sudo apt-get install libsasl2-dev libsasl2-2
+      - run: *pip-install-core
+      - run:
+          name: Install dbt adapter packages
+          command: |
+            python3 -m venv venv
+            . venv/bin/activate
+            pip install dbt-spark "dbt-spark[PyHive]"
+      - run: *dbt-deps
+      - run:
+          name: Wait for Spark-Thrift
+          command: dockerize -wait tcp://localhost:10000 -timeout 15m -wait-retry-interval 5s
+      - run:
+          name: "Run Tests - Spark"
+          command: |
+            . venv/bin/activate
+            dbt build -t spark --project-dir $DBT_PROJECT_DIR
+
+      - store_artifacts:
+          path: ./logs
+
 workflows:
   version: 2
   test-all:
     jobs:
       - hold:
           type: approval
-      - integration-tests:
+      - integration-tests-core:
+          requires:
+            - hold
+      - integration-tests-spark-thrift:
           requires:
             - hold
diff --git a/.gitignore b/.gitignore
index 83beff7..404fb93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ target/
 dbt_packages/
 logs/
 .python-version
+integration_tests/.spark-warehouse
+integration_tests/.hive-metastore
diff --git a/integration_tests/ci/profiles.yml b/integration_tests/ci/profiles.yml
index d8238b7..8762a84 100644
--- a/integration_tests/ci/profiles.yml
+++ b/integration_tests/ci/profiles.yml
@@ -37,4 +37,15 @@ integration_tests:
       type: duckdb
       path: ":memory:"
 
+    spark:
+      type: spark
+      method: thrift
+      host: 127.0.0.1
+      port: 10000
+      user: dbt
+      schema: analytics
+      connect_retries: 5
+      connect_timeout: 60
+      retry_all: true
+
   target: postgres
diff --git a/integration_tests/docker-compose.yml b/integration_tests/docker-compose.yml
new file mode 100644
index 0000000..9bc9e50
--- /dev/null
+++ b/integration_tests/docker-compose.yml
@@ -0,0 +1,28 @@
+version: "3.7"
+services:
+
+  dbt-spark3-thrift:
+    image: godatadriven/spark:3.1.1
+    ports:
+      - "10000:10000"
+      - "4040:4040"
+    depends_on:
+      - dbt-hive-metastore
+    command: >
+      --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
+      --name Thrift JDBC/ODBC Server
+    volumes:
+      - ./.spark-warehouse/:/spark-warehouse/
+      - ./docker/hive-site.xml:/usr/spark/conf/hive-site.xml
+      - ./docker/spark-defaults.conf:/usr/spark/conf/spark-defaults.conf
+    environment:
+      - WAIT_FOR=dbt-hive-metastore:5432
+
+  dbt-hive-metastore:
+    image: postgres:9.6.17-alpine
+    volumes:
+      - ./.hive-metastore/:/var/lib/postgresql/data
+    environment:
+      - POSTGRES_USER=dbt
+      - POSTGRES_PASSWORD=dbt
+      - POSTGRES_DB=metastore
diff --git a/integration_tests/docker-start.sh b/integration_tests/docker-start.sh
new file mode 100644
index 0000000..a8341a6
--- /dev/null
+++ b/integration_tests/docker-start.sh
@@ -0,0 +1 @@
+docker-compose up -d
diff --git a/integration_tests/docker-stop.sh b/integration_tests/docker-stop.sh
new file mode 100644
index 0000000..a20ef1a
--- /dev/null
+++ b/integration_tests/docker-stop.sh
@@ -0,0 +1 @@
+docker-compose down && rm -rf ./.hive-metastore/ && rm -rf ./.spark-warehouse/
diff --git a/integration_tests/docker/hive-site.xml b/integration_tests/docker/hive-site.xml
new file mode 100644
index 0000000..457d04f
--- /dev/null
+++ b/integration_tests/docker/hive-site.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+
+<configuration>
+
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:postgresql://dbt-hive-metastore/metastore</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>org.postgresql.Driver</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionUserName</name>
+    <value>dbt</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionPassword</name>
+    <value>dbt</value>
+  </property>
+
+  <property>
+    <name>hive.metastore.schema.verification</name>
+    <value>false</value>
+  </property>
+
+</configuration>
diff --git a/integration_tests/docker/spark-defaults.conf b/integration_tests/docker/spark-defaults.conf
new file mode 100644
index 0000000..30ec595
--- /dev/null
+++ b/integration_tests/docker/spark-defaults.conf
@@ -0,0 +1,9 @@
+spark.driver.memory 2g
+spark.executor.memory 2g
+spark.hadoop.datanucleus.autoCreateTables true
+spark.hadoop.datanucleus.schema.autoCreateTables true
+spark.hadoop.datanucleus.fixedDatastore false
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.jars.packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0
+spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+spark.driver.userClassPathFirst true
diff --git a/integration_tests/macros/get_test_dates.sql b/integration_tests/macros/get_test_dates.sql
index 17036bd..a73956c 100644
--- a/integration_tests/macros/get_test_dates.sql
+++ b/integration_tests/macros/get_test_dates.sql
@@ -9,8 +9,8 @@ select
     1 as day_of_week,
     7 as iso_day_of_week,
     334 as day_of_year,
-    cast('2020-11-29' as date) as week_start_date,
-    cast('2020-12-05' as date) as week_end_date,
+    cast('{{ get_test_week_start_date()[0] }}' as date) as week_start_date,
+    cast('{{ get_test_week_end_date()[0] }}' as date) as week_end_date,
     {{ get_test_week_of_year()[0] }} as week_of_year,
     -- in ISO terms, this is the end of the prior week
     cast('2020-11-23' as date) as iso_week_start_date,
@@ -44,8 +44,8 @@ select
     3 as day_of_week,
     2 as iso_day_of_week,
     336 as day_of_year,
-    cast('2020-11-29' as date) as week_start_date,
-    cast('2020-12-05' as date) as week_end_date,
+    cast('{{ get_test_week_start_date()[1] }}' as date) as week_start_date,
+    cast('{{ get_test_week_end_date()[1] }}' as date) as week_end_date,
     {{ get_test_week_of_year()[1] }} as week_of_year,
     cast('2020-11-30' as date) as iso_week_start_date,
     cast('2020-12-06' as date) as iso_week_end_date,
@@ -83,6 +83,39 @@ select
     {{ return([48,49]) }}
 {%- endmacro %}
 
+{% macro spark__get_test_week_of_year() -%}
+    {# weeks_of_year for '2020-11-29' and '2020-12-01', respectively #}
+    {# spark uses ISO year #}
+    {{ return([48,49]) }}
+{%- endmacro %}
+
+
+{% macro get_test_week_start_date() -%}
+    {{ return(adapter.dispatch('get_test_week_start_date', 'dbt_date_integration_tests') ()) }}
+{%- endmacro %}
+
+{% macro default__get_test_week_start_date() -%}
+    {{ return(['2020-11-29', '2020-11-29']) }}
+{%- endmacro %}
+
+{% macro spark__get_test_week_start_date() -%}
+    {# spark does not support non-iso weeks #}
+    {{ return(['2020-11-23', '2020-11-30']) }}
+{%- endmacro %}
+
+
+{% macro get_test_week_end_date() -%}
+    {{ return(adapter.dispatch('get_test_week_end_date', 'dbt_date_integration_tests') ()) }}
+{%- endmacro %}
+
+{% macro default__get_test_week_end_date() -%}
+    {{ return(['2020-12-05', '2020-12-05']) }}
+{%- endmacro %}
+
+{% macro spark__get_test_week_end_date() -%}
+    {# spark does not support non-iso weeks #}
+    {{ return(['2020-11-29', '2020-12-06']) }}
+{%- endmacro %}
 
 {% macro get_test_timestamps() -%}
 
@@ -108,3 +141,8 @@ select
     {{ return(['2021-06-07 07:35:20.000000',
                '2021-06-07 14:35:20.000000']) }}
 {%- endmacro %}
+
+{% macro spark__get_test_timestamps() -%}
+    {{ return(['2021-06-07 07:35:20.000000',
+               '2021-06-07 14:35:20.000000']) }}
+{%- endmacro %}
diff --git a/macros/calendar_date/convert_timezone.sql b/macros/calendar_date/convert_timezone.sql
index c7f967f..cee836b 100644
--- a/macros/calendar_date/convert_timezone.sql
+++ b/macros/calendar_date/convert_timezone.sql
@@ -14,13 +14,6 @@ convert_timezone('{{ source_tz }}', '{{ target_tz }}',
 timestamp(datetime({{ column }}, '{{ target_tz}}'))
 {%- endmacro -%}
 
-{%- macro spark__convert_timezone(column, target_tz, source_tz) -%}
-from_utc_timestamp(
-    to_utc_timestamp({{ column }}, '{{ source_tz }}'),
-    '{{ target_tz }}'
-    )
-{%- endmacro -%}
-
 {% macro postgres__convert_timezone(column, target_tz, source_tz) -%}
 cast(
     cast({{ column }} as {{ dbt.type_timestamp() }})
@@ -35,3 +28,10 @@ cast(
 {% macro duckdb__convert_timezone(column, target_tz, source_tz) -%}
 {{ return(dbt_date.postgres__convert_timezone(column, target_tz, source_tz)) }}
 {%- endmacro -%}
+
+{%- macro spark__convert_timezone(column, target_tz, source_tz) -%}
+from_utc_timestamp(
+    to_utc_timestamp({{ column }}, '{{ source_tz }}'),
+    '{{ target_tz }}'
+    )
+{%- endmacro -%}
diff --git a/macros/calendar_date/day_name.sql b/macros/calendar_date/day_name.sql
index 1849996..7c951a7 100644
--- a/macros/calendar_date/day_name.sql
+++ b/macros/calendar_date/day_name.sql
@@ -43,3 +43,8 @@
     dayname({{ date }})
     {%- endif -%}
 {%- endmacro %}
+
+{%- macro spark__day_name(date, short) -%}
+{%- set f = 'E' if short else 'EEEE' -%}
+    date_format({{ date }}, '{{ f }}')
+{%- endmacro %}
diff --git a/macros/calendar_date/day_of_week.sql b/macros/calendar_date/day_of_week.sql
index abd2197..d28392c 100644
--- a/macros/calendar_date/day_of_week.sql
+++ b/macros/calendar_date/day_of_week.sql
@@ -86,3 +86,12 @@
 {%- macro duckdb__day_of_week(date, isoweek) -%}
 {{ return(dbt_date.postgres__day_of_week(date, isoweek)) }}
 {%- endmacro %}
+
+
+{%- macro spark__day_of_week(date, isoweek) -%}
+
+    {%- set dow = "dayofweek_iso" if isoweek else "dayofweek" -%}
+
+    {{ dbt_date.date_part(dow, date) }}
+
+{%- endmacro %}
diff --git a/macros/calendar_date/day_of_year.sql b/macros/calendar_date/day_of_year.sql
index 1c2a931..704662b 100644
--- a/macros/calendar_date/day_of_year.sql
+++ b/macros/calendar_date/day_of_year.sql
@@ -13,3 +13,7 @@
 {%- macro redshift__day_of_year(date) -%}
 cast({{ dbt_date.date_part('dayofyear', date) }} as {{ dbt.type_bigint() }})
 {%- endmacro %}
+
+{%- macro spark__day_of_year(date) -%}
+    dayofyear({{ date }})
+{%- endmacro %}
diff --git a/macros/calendar_date/iso_week_end.sql b/macros/calendar_date/iso_week_end.sql
index 3fa4125..341774d 100644
--- a/macros/calendar_date/iso_week_end.sql
+++ b/macros/calendar_date/iso_week_end.sql
@@ -15,4 +15,3 @@
 {%- macro snowflake__iso_week_end(date) -%}
 {{ dbt_date._iso_week_end(date, 'weekiso') }}
 {%- endmacro %}
-
diff --git a/macros/calendar_date/iso_week_of_year.sql b/macros/calendar_date/iso_week_of_year.sql
index e1182e9..9aeeec9 100644
--- a/macros/calendar_date/iso_week_of_year.sql
+++ b/macros/calendar_date/iso_week_of_year.sql
@@ -23,3 +23,7 @@ cast({{ dbt_date.date_part(week_type, date) }} as {{ dbt.type_int() }})
 {%- macro duckdb__iso_week_of_year(date) -%}
 {{ return(dbt_date.postgres__iso_week_of_year(date)) }}
 {%- endmacro %}
+
+{%- macro spark__iso_week_of_year(date) -%}
+{{ dbt_date._iso_week_of_year(date, 'week') }}
+{%- endmacro %}
diff --git a/macros/calendar_date/iso_week_start.sql b/macros/calendar_date/iso_week_start.sql
index 07d01fd..f4c5ff5 100644
--- a/macros/calendar_date/iso_week_start.sql
+++ b/macros/calendar_date/iso_week_start.sql
@@ -22,3 +22,7 @@ cast({{ dbt.date_trunc(week_type, date) }} as date)
 {%- macro duckdb__iso_week_start(date) -%}
 {{ return(dbt_date.postgres__iso_week_start(date)) }}
 {%- endmacro %}
+
+{%- macro spark__iso_week_start(date) -%}
+{{ dbt_date._iso_week_start(date, 'week') }}
+{%- endmacro %}
diff --git a/macros/calendar_date/month_name.sql b/macros/calendar_date/month_name.sql
index d9a7494..c57709e 100644
--- a/macros/calendar_date/month_name.sql
+++ b/macros/calendar_date/month_name.sql
@@ -31,3 +31,8 @@
     monthname({{ date }})
     {%- endif -%}
 {%- endmacro %}
+
+{%- macro spark__month_name(date, short) -%}
+{%- set f = 'LLL' if short else 'LLLL' -%}
+    date_format({{ date }}, '{{ f }}')
+{%- endmacro %}
diff --git a/macros/calendar_date/to_unixtimestamp.sql b/macros/calendar_date/to_unixtimestamp.sql
index 6c9bbfb..99a6f1a 100644
--- a/macros/calendar_date/to_unixtimestamp.sql
+++ b/macros/calendar_date/to_unixtimestamp.sql
@@ -13,3 +13,8 @@
 {%- macro bigquery__to_unixtimestamp(timestamp) -%}
     unix_seconds({{ timestamp }})
 {%- endmacro %}
+
+{%- macro spark__to_unixtimestamp(timestamp) -%}
+    unix_timestamp({{ timestamp }})
+{%- endmacro %}
+
diff --git a/macros/calendar_date/week_of_year.sql b/macros/calendar_date/week_of_year.sql
index e9adf48..d234cfd 100644
--- a/macros/calendar_date/week_of_year.sql
+++ b/macros/calendar_date/week_of_year.sql
@@ -16,3 +16,7 @@ cast(to_char({{ date }}, 'WW') as {{ dbt.type_int() }})
 {%- macro duckdb__week_of_year(date) -%}
 cast(ceil(dayofyear({{ date }}) / 7) as int)
 {%- endmacro %}
+
+{# {%- macro spark__week_of_year(date) -%}
+weekofyear({{ date }})
+{%- endmacro %} #}
diff --git a/macros/fiscal_date/get_fiscal_year_dates.sql b/macros/fiscal_date/get_fiscal_year_dates.sql
index 990a3d9..3229321 100644
--- a/macros/fiscal_date/get_fiscal_year_dates.sql
+++ b/macros/fiscal_date/get_fiscal_year_dates.sql
@@ -6,7 +6,7 @@
 -- this gets all the dates within a fiscal year
 -- determined by the given year-end-month
 -- ending on the saturday closest to that month's end date
-with date_dimension as (
+with fsc_date_dimension as (
     select * from {{ dates }}
 ),
 year_month_end as (
@@ -15,7 +15,7 @@ year_month_end as (
         d.year_number - {{ shift_year }} as fiscal_year_number,
         d.month_end_date
     from
-        date_dimension d
+        fsc_date_dimension d
     where
         d.month_of_year = {{ year_end_month }}
     group by 1,2
@@ -29,7 +29,7 @@ weeks as (
         d.date_day as week_start_date,
         cast({{ dbt.dateadd('day', 6, 'd.date_day') }} as date) as week_end_date
     from
-        date_dimension d
+        fsc_date_dimension d
     where
         d.day_of_week = {{ week_start_day }}
 
@@ -97,7 +97,7 @@ fiscal_year_dates as (
             order by w.week_start_date
         ) as fiscal_week_of_year
     from
-        date_dimension d
+        fsc_date_dimension d
     join fiscal_year_range m on d.date_day between m.fiscal_year_start_date and m.fiscal_year_end_date
     join