
Commit 53a3356

[Feature] Add Spark support (#108)
* Add Spark docker image for development
* Initial spike to support Spark
* Move Spark docker setup
* Breakout Spark CI
1 parent 88703ab commit 53a3356

20 files changed: +259 -22 lines
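Most of the diff below adds spark__-prefixed implementations of macros that dbt-date already routes through adapter.dispatch, so Spark targets pick them up automatically. A minimal sketch of that pattern, using the day_name macro touched by this commit (the dispatching wrapper shown here is illustrative; it lives elsewhere in the package and its exact signature is assumed):

{# Illustrative dispatching wrapper (assumed signature, not part of this commit) #}
{% macro day_name(date, short=true) -%}
    {{ adapter.dispatch('day_name', 'dbt_date') (date, short) }}
{%- endmacro %}

{# Adapter-specific implementation; on a spark target, dispatch resolves to this #}
{%- macro spark__day_name(date, short) -%}
{%- set f = 'E' if short else 'EEEE' -%}
date_format({{ date }}, '{{ f }}')
{%- endmacro %}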

.circleci/config.yml

+68 -6

@@ -2,7 +2,8 @@ version: 2.1
 
 jobs:
 
-  integration-tests:
+  integration-tests-core:
+
     docker:
       - image: cimg/python:3.9.9
      - image: cimg/postgres:14.0
@@ -13,19 +14,27 @@ jobs:
       DBT_PROFILES_DIR: ./integration_tests/ci
       DBT_PROJECT_DIR: ./integration_tests
       BIGQUERY_SERVICE_KEY_PATH: "/home/circleci/bigquery-service-key.json"
-      DBT_VERSION: 1.6.0
+      DBT_VERSION: 1.6.*
 
     steps:
       - checkout
-      - run:
-          name: Install Python packages
+      - run: &pip-install-core
+          name: Install core Python packages & dbt-core
           command: |
             python3 -m venv venv
             . venv/bin/activate
             pip install -U pip setuptools wheel
-            pip install dbt-core==$DBT_VERSION dbt-postgres==$DBT_VERSION dbt-bigquery==$DBT_VERSION dbt-snowflake==$DBT_VERSION dbt-duckdb==$DBT_VERSION
+            pip install "dbt-core==$DBT_VERSION"
 
       - run:
+          name: Install dbt adapter packages
+          command: |
+            python3 -m venv venv
+            . venv/bin/activate
+            pip install "dbt-postgres==$DBT_VERSION" "dbt-bigquery==$DBT_VERSION" "dbt-snowflake==$DBT_VERSION"
+            pip install "dbt-duckdb==$DBT_VERSION"
+
+      - run: &dbt-deps
           name: Install dbt dependencies
           command: |
             . venv/bin/activate
@@ -74,12 +83,65 @@ jobs:
       - store_artifacts:
           path: ./logs
 
+  integration-tests-spark-thrift:
+
+    docker:
+      - image: cimg/python:3.9.9
+      - image: godatadriven/spark:3.1.1
+        environment:
+          WAIT_FOR: localhost:5432
+        command: >
+          --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
+          --name Thrift JDBC/ODBC Server
+      - image: postgres:9.6.17-alpine
+        environment:
+          POSTGRES_USER: dbt
+          POSTGRES_PASSWORD: dbt
+          POSTGRES_DB: metastore
+
+    resource_class: small
+
+    environment:
+      DBT_PROFILES_DIR: ./integration_tests/ci
+      DBT_PROJECT_DIR: ./integration_tests
+      DBT_VERSION: 1.6.*
+
+    steps:
+      - checkout
+      - run:
+          name: Install Ubuntu packages
+          command: |
+            sudo apt-get update
+            sudo apt-get install libsasl2-dev libsasl2-2
+      - run: *pip-install-core
+      - run:
+          name: Install dbt adapter packages
+          command: |
+            python3 -m venv venv
+            . venv/bin/activate
+            pip install dbt-spark "dbt-spark[PyHive]"
+      - run: *dbt-deps
+      - run:
+          name: Wait for Spark-Thrift
+          command: dockerize -wait tcp://localhost:10000 -timeout 15m -wait-retry-interval 5s
+      - run:
+          name: "Run Tests - Spark"
+          command: |
+            . venv/bin/activate
+            dbt build -t spark --project-dir $DBT_PROJECT_DIR
+
+      - store_artifacts:
+          path: ./logs
+
 workflows:
   version: 2
   test-all:
     jobs:
       - hold:
           type: approval
-      - integration-tests:
+      - integration-tests-core:
+          requires:
+            - hold
+      - integration-tests-spark-thrift:
           requires:
             - hold

.gitignore

+2
@@ -3,3 +3,5 @@ target/
 dbt_packages/
 logs/
 .python-version
+integration_tests/.spark-warehouse
+integration_tests/.hive-metastore

integration_tests/ci/profiles.yml

+11
@@ -37,4 +37,15 @@ integration_tests:
       type: duckdb
       path: ":memory:"
 
+    spark:
+      type: spark
+      method: thrift
+      host: 127.0.0.1
+      port: 10000
+      user: dbt
+      schema: analytics
+      connect_retries: 5
+      connect_timeout: 60
+      retry_all: true
+
   target: postgres

integration_tests/docker-compose.yml

+28
@@ -0,0 +1,28 @@
+version: "3.7"
+services:
+
+  dbt-spark3-thrift:
+    image: godatadriven/spark:3.1.1
+    ports:
+      - "10000:10000"
+      - "4040:4040"
+    depends_on:
+      - dbt-hive-metastore
+    command: >
+      --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
+      --name Thrift JDBC/ODBC Server
+    volumes:
+      - ./.spark-warehouse/:/spark-warehouse/
+      - ./docker/hive-site.xml:/usr/spark/conf/hive-site.xml
+      - ./docker/spark-defaults.conf:/usr/spark/conf/spark-defaults.conf
+    environment:
+      - WAIT_FOR=dbt-hive-metastore:5432
+
+  dbt-hive-metastore:
+    image: postgres:9.6.17-alpine
+    volumes:
+      - ./.hive-metastore/:/var/lib/postgresql/data
+    environment:
+      - POSTGRES_USER=dbt
+      - POSTGRES_PASSWORD=dbt
+      - POSTGRES_DB=metastore

integration_tests/docker-start.sh

+1
@@ -0,0 +1 @@
+docker-compose up -d

integration_tests/docker-stop.sh

+1
@@ -0,0 +1 @@
+docker-compose down && rm -rf ./.hive-metastore/ && rm -rf ./.spark-warehouse/
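Taken together with the new spark profile in integration_tests/ci/profiles.yml, these scripts suggest the local test loop: ./docker-start.sh brings up the Spark Thrift server (port 10000) and the Postgres-backed Hive metastore, dbt build -t spark --project-dir ./integration_tests mirrors the new CI step, and ./docker-stop.sh tears the containers down and deletes the local .spark-warehouse/ and .hive-metastore/ directories.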
integration_tests/docker/hive-site.xml

+46

@@ -0,0 +1,46 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<configuration>
+
+  <property>
+    <name>javax.jdo.option.ConnectionURL</name>
+    <value>jdbc:postgresql://dbt-hive-metastore/metastore</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionDriverName</name>
+    <value>org.postgresql.Driver</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionUserName</name>
+    <value>dbt</value>
+  </property>
+
+  <property>
+    <name>javax.jdo.option.ConnectionPassword</name>
+    <value>dbt</value>
+  </property>
+
+  <property>
+    <name>hive.metastore.schema.verification</name>
+    <value>false</value>
+  </property>
+</configuration>
integration_tests/docker/spark-defaults.conf

+9

@@ -0,0 +1,9 @@
+spark.driver.memory 2g
+spark.executor.memory 2g
+spark.hadoop.datanucleus.autoCreateTables true
+spark.hadoop.datanucleus.schema.autoCreateTables true
+spark.hadoop.datanucleus.fixedDatastore false
+spark.serializer org.apache.spark.serializer.KryoSerializer
+spark.jars.packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0
+spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
+spark.driver.userClassPathFirst true

integration_tests/macros/get_test_dates.sql

+42 -4

@@ -9,8 +9,8 @@ select
     1 as day_of_week,
     7 as iso_day_of_week,
     334 as day_of_year,
-    cast('2020-11-29' as date) as week_start_date,
-    cast('2020-12-05' as date) as week_end_date,
+    cast('{{ get_test_week_start_date()[0] }}' as date) as week_start_date,
+    cast('{{ get_test_week_end_date()[0] }}' as date) as week_end_date,
     {{ get_test_week_of_year()[0] }} as week_of_year,
     -- in ISO terms, this is the end of the prior week
     cast('2020-11-23' as date) as iso_week_start_date,
@@ -44,8 +44,8 @@ select
     3 as day_of_week,
     2 as iso_day_of_week,
     336 as day_of_year,
-    cast('2020-11-29' as date) as week_start_date,
-    cast('2020-12-05' as date) as week_end_date,
+    cast('{{ get_test_week_start_date()[1] }}' as date) as week_start_date,
+    cast('{{ get_test_week_end_date()[1] }}' as date) as week_end_date,
     {{ get_test_week_of_year()[1] }} as week_of_year,
     cast('2020-11-30' as date) as iso_week_start_date,
     cast('2020-12-06' as date) as iso_week_end_date,
@@ -83,6 +83,39 @@ select
 {{ return([48,49]) }}
 {%- endmacro %}
 
+{% macro spark__get_test_week_of_year() -%}
+{# weeks_of_year for '2020-11-29' and '2020-12-01', respectively #}
+{# spark uses ISO year #}
+{{ return([48,49]) }}
+{%- endmacro %}
+
+
+{% macro get_test_week_start_date() -%}
+{{ return(adapter.dispatch('get_test_week_start_date', 'dbt_date_integration_tests') ()) }}
+{%- endmacro %}
+
+{% macro default__get_test_week_start_date() -%}
+{{ return(['2020-11-29', '2020-11-29']) }}
+{%- endmacro %}
+
+{% macro spark__get_test_week_start_date() -%}
+{# spark does not support non-iso weeks #}
+{{ return(['2020-11-23', '2020-11-30']) }}
+{%- endmacro %}
+
+
+{% macro get_test_week_end_date() -%}
+{{ return(adapter.dispatch('get_test_week_end_date', 'dbt_date_integration_tests') ()) }}
+{%- endmacro %}
+
+{% macro default__get_test_week_end_date() -%}
+{{ return(['2020-12-05', '2020-12-05']) }}
+{%- endmacro %}
+
+{% macro spark__get_test_week_end_date() -%}
+{# spark does not support non-iso weeks #}
+{{ return(['2020-11-29', '2020-12-06']) }}
+{%- endmacro %}
 
 
 {% macro get_test_timestamps() -%}
@@ -108,3 +141,8 @@ select
 {{ return(['2021-06-07 07:35:20.000000',
            '2021-06-07 14:35:20.000000']) }}
 {%- endmacro %}
+
+{% macro spark__get_test_timestamps() -%}
+{{ return(['2021-06-07 07:35:20.000000',
+           '2021-06-07 14:35:20.000000']) }}
+{%- endmacro %}
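For illustration (not part of the commit), here is how the dispatched fixtures resolve for the second test row, whose base date is 2020-12-01. On most adapters the expected week runs Sunday through Saturday, while Spark only supports ISO (Monday-based) weeks:

-- default adapters render:
cast('2020-11-29' as date) as week_start_date,
cast('2020-12-05' as date) as week_end_date,

-- a spark target renders:
cast('2020-11-30' as date) as week_start_date,
cast('2020-12-06' as date) as week_end_date,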

macros/calendar_date/convert_timezone.sql

+7 -7

@@ -14,13 +14,6 @@ convert_timezone('{{ source_tz }}', '{{ target_tz }}',
 timestamp(datetime({{ column }}, '{{ target_tz}}'))
 {%- endmacro -%}
 
-{%- macro spark__convert_timezone(column, target_tz, source_tz) -%}
-from_utc_timestamp(
-    to_utc_timestamp({{ column }}, '{{ source_tz }}'),
-    '{{ target_tz }}'
-    )
-{%- endmacro -%}
-
 {% macro postgres__convert_timezone(column, target_tz, source_tz) -%}
 cast(
     cast({{ column }} as {{ dbt.type_timestamp() }})
@@ -35,3 +28,10 @@ cast(
 {% macro duckdb__convert_timezone(column, target_tz, source_tz) -%}
 {{ return(dbt_date.postgres__convert_timezone(column, target_tz, source_tz)) }}
 {%- endmacro -%}
+
+{%- macro spark__convert_timezone(column, target_tz, source_tz) -%}
+from_utc_timestamp(
+    to_utc_timestamp({{ column }}, '{{ source_tz }}'),
+    '{{ target_tz }}'
+    )
+{%- endmacro -%}
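For illustration, an assumed call such as {{ dbt_date.convert_timezone('created_at', 'America/New_York', 'UTC') }} would render on a Spark target roughly as:

from_utc_timestamp(
    to_utc_timestamp(created_at, 'UTC'),
    'America/New_York'
    )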

macros/calendar_date/day_name.sql

+5
@@ -43,3 +43,8 @@
 dayname({{ date }})
 {%- endif -%}
 {%- endmacro %}
+
+{%- macro spark__day_name(date, short) -%}
+{%- set f = 'E' if short else 'EEEE' -%}
+date_format({{ date }}, '{{ f }}')
+{%- endmacro %}
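As a quick illustration (assuming an English locale), 2020-11-29, the base date used in the integration tests, is a Sunday, so on Spark the new macro would render and evaluate roughly as:

select
    date_format(cast('2020-11-29' as date), 'EEEE') as day_name,      -- 'Sunday'
    date_format(cast('2020-11-29' as date), 'E') as day_name_short    -- 'Sun'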

macros/calendar_date/day_of_week.sql

+9
@@ -86,3 +86,12 @@
 {%- macro duckdb__day_of_week(date, isoweek) -%}
 {{ return(dbt_date.postgres__day_of_week(date, isoweek)) }}
 {%- endmacro %}
+
+
+{%- macro spark__day_of_week(date, isoweek) -%}
+
+{%- set dow = "dayofweek_iso" if isoweek else "dayofweek" -%}
+
+{{ dbt_date.date_part(dow, date) }}
+
+{%- endmacro %}

macros/calendar_date/day_of_year.sql

+4
@@ -13,3 +13,7 @@
 {%- macro redshift__day_of_year(date) -%}
 cast({{ dbt_date.date_part('dayofyear', date) }} as {{ dbt.type_bigint() }})
 {%- endmacro %}
+
+{%- macro spark__day_of_year(date) -%}
+dayofyear({{ date }})
+{%- endmacro %}

macros/calendar_date/iso_week_end.sql

-1
@@ -15,4 +15,3 @@
 {%- macro snowflake__iso_week_end(date) -%}
 {{ dbt_date._iso_week_end(date, 'weekiso') }}
 {%- endmacro %}
-

macros/calendar_date/iso_week_of_year.sql

+4
@@ -23,3 +23,7 @@ cast({{ dbt_date.date_part(week_type, date) }} as {{ dbt.type_int() }})
 {%- macro duckdb__iso_week_of_year(date) -%}
 {{ return(dbt_date.postgres__iso_week_of_year(date)) }}
 {%- endmacro %}
+
+{%- macro spark__iso_week_of_year(date) -%}
+{{ dbt_date._iso_week_of_year(date, 'week') }}
+{%- endmacro %}

macros/calendar_date/iso_week_start.sql

+4
@@ -22,3 +22,7 @@ cast({{ dbt.date_trunc(week_type, date) }} as date)
 {%- macro duckdb__iso_week_start(date) -%}
 {{ return(dbt_date.postgres__iso_week_start(date)) }}
 {%- endmacro %}
+
+{%- macro spark__iso_week_start(date) -%}
+{{ dbt_date._iso_week_start(date, 'week') }}
+{%- endmacro %}

macros/calendar_date/month_name.sql

+5
@@ -31,3 +31,8 @@
 monthname({{ date }})
 {%- endif -%}
 {%- endmacro %}
+
+{%- macro spark__month_name(date, short) -%}
+{%- set f = 'LLL' if short else 'LLLL' -%}
+date_format({{ date }}, '{{ f }}')
+{%- endmacro %}

macros/calendar_date/to_unixtimestamp.sql

+5
@@ -13,3 +13,8 @@
 {%- macro bigquery__to_unixtimestamp(timestamp) -%}
 unix_seconds({{ timestamp }})
 {%- endmacro %}
+
+{%- macro spark__to_unixtimestamp(timestamp) -%}
+unix_timestamp({{ timestamp }})
+{%- endmacro %}
+
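Spark's unix_timestamp returns epoch seconds as a bigint, which lines up with what to_unixtimestamp returns on the other adapters. An illustrative call (the exact value depends on the session time zone used to interpret the literal):

select unix_timestamp(cast('2021-06-07 07:35:20' as timestamp)) as unix_ts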

macros/calendar_date/week_of_year.sql

+4
@@ -16,3 +16,7 @@ cast(to_char({{ date }}, 'WW') as {{ dbt.type_int() }})
 {%- macro duckdb__week_of_year(date) -%}
 cast(ceil(dayofyear({{ date }}) / 7) as int)
 {%- endmacro %}
+
+{# {%- macro spark__week_of_year(date) -%}
+weekofyear({{ date }})
+{%- endmacro %} #}
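The spark__week_of_year override is left commented out, so Spark targets fall back to the package's default implementation via dispatch. The test fixtures above account for this: spark__get_test_week_of_year expects ISO-style week numbers, since Spark has no non-ISO week support.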
