Skip to content

Commit 1d935f7

Browse files
committed
feat(spark): Add new spark image (java/scala only) with okdp extensions (aws/minio, prometheus java agent, okdp-spark-auth-filter)
1 parent 59eb7d8 commit 1d935f7

File tree

4 files changed

+317
-0
lines changed

4 files changed

+317
-0
lines changed

spark/Dockerfile

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
ARG SPARK_VERSION=3.2.1
ARG HADOOP_VERSION=3.2
ARG SCALA_VERSION=2.12
ARG JAVA_VERSION=11

ARG REGISTRY=quay.io
ARG REPO=okdp
ARG BASE_IMAGE=${REGISTRY}/${REPO}/spark:base-spark-${SPARK_VERSION}-scala-${SCALA_VERSION}-java-${JAVA_VERSION}

# Build stage: resolve the OKDP add-on jars (okdp-spark-auth-filter,
# jmx_prometheus_javaagent, hadoop-aws/S3 support) with Maven so that
# transitive dependency versions stay aligned with the Spark release.
FROM eclipse-temurin:${JAVA_VERSION}-jre-jammy AS okdp_addons
ARG SPARK_VERSION=3.2.1
ARG SCALA_VERSION=2.12

# Use apt-get (stable scripting interface, unlike "apt") and drop the
# package lists afterwards to keep the layer small.
RUN set -ex; \
    apt-get update; \
    apt-get install -y --no-install-recommends maven; \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

COPY okdp-addons.pom deps/pom.xml

# The setup consumes less space compared to inheriting from the parent pom
# Handles the transitive dependencies versions through the pom
# Manage Java AWS SDK v1 (hadoop <3.4)/V2 (hadoop >=3.4)
# The effective Hadoop version is extracted from the spark-parent pom so the
# hadoop artifacts always match the Hadoop release bundled with this Spark.
RUN mvn dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \
    mvn dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \
    HADOOP_VERSION=$(grep "<hadoop.version>" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom | sed -e 's/^ *<hadoop.version>\(.*\)<\/hadoop.version> *$/\1/'|head -1); \
    mv ./deps/pom.xml .; \
    mvn clean dependency:copy-dependencies \
      -Dspark.version=${SPARK_VERSION} \
      -Dscala.version=${SCALA_VERSION} \
      -Dhadoop.version=${HADOOP_VERSION} \
      -Paws

FROM $BASE_IMAGE

# key=value is the non-deprecated ENV form (space-separated form is legacy).
ENV JMX_CONF_DIR=/etc/metrics/conf/

# OKDP addons
# When COPY has multiple sources (wildcard), the destination must be a
# directory and must end with "/", otherwise the build fails.
COPY --from=okdp_addons --chown=spark:spark /workspace/target/dependency/* $SPARK_HOME/jars/
RUN chown -R spark:spark ${SPARK_HOME}/jars/

# Jmx prometheus metrics
COPY --chown=spark:spark metrics.properties ${JMX_CONF_DIR}/metrics.properties
COPY --chown=spark:spark prometheus.yaml ${JMX_CONF_DIR}/prometheus.yaml

USER spark

spark/metrics.properties

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Spark metrics configuration.
# Expose metrics from every instance ("*") through a JMX sink so that the
# JMX prometheus java agent (see prometheus.yaml) can scrape them.
*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink
# Register the JVM metrics source on both the driver and the executors.
driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource
executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource

spark/okdp-addons.pom

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~
  ~ Copyright 2024 tosit.io
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~ http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  ~
-->
<!--
  OKDP spark docker images add-ons:
  * Manage compatibility (transitive) dependency versions between Hadoop, spark and extensions.
  * Prevents mixing extensions (e.g. hadoop-aws) version with other hadoop artifacts from different versions.
  * Manage AWS SDK V2 and SDK V1 for S3 (breaking changes between SDK V1 and V2) versions.
  * See also: https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-project-maven.html
  * See also: https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/aws_sdk_upgrade.html
  * See also: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/home.html
  * See also: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup-project-maven.html

  Consumed by spark/Dockerfile through "mvn dependency:copy-dependencies";
  spark.version, scala.version and hadoop.version are passed on the command line.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <name>OKDP Addons</name>
  <groupId>io.okdp</groupId>
  <artifactId>okdp-spark-docker-addons</artifactId>
  <version>${spark.version}</version>
  <packaging>pom</packaging>
  <description>
    OKDP extensions for spark docker images
  </description>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <!-- *** Default OKDP Extensions *** -->
    <!-- okdp spark auth filter -->
    <dependency>
      <groupId>io.okdp</groupId>
      <artifactId>okdp-spark-auth-filter</artifactId>
      <version>1.1.0</version>
      <!-- Wildcard exclusion: copy only this jar, no transitive dependencies -->
      <exclusions>
        <exclusion>
          <groupId>*</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <!-- jmx prometheus agent -->
    <dependency>
      <groupId>io.prometheus.jmx</groupId>
      <artifactId>jmx_prometheus_javaagent</artifactId>
      <version>0.20.0</version>
      <exclusions>
        <exclusion>
          <groupId>*</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <!-- minio/aws S3 requirement -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
      <exclusions>
        <exclusion>
          <groupId>*</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
  </dependencies>
  <profiles>
    <profile>
      <!-- minio/aws S3
           The AWS bundle can take up 500MB, to be optimized later! -->
      <id>aws</id>
      <dependencies>
        <!-- No exclusion here: hadoop-aws pulls in the AWS SDK it requires -->
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-aws</artifactId>
          <version>${hadoop.version}</version>
        </dependency>
        <!-- S3A Committers:
             https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md
        -->
        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-hadoop-cloud_${scala.version}</artifactId>
          <version>${spark.version}</version>
          <exclusions>
            <exclusion>
              <groupId>*</groupId>
              <artifactId>*</artifactId>
            </exclusion>
          </exclusions>
        </dependency>
      </dependencies>
    </profile>
  </profiles>
</project>

spark/prometheus.yaml

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

---
# jmx_prometheus_javaagent rewrite rules: map Spark's JMX metric names
# (prefixed with "<namespace>/<app>" by the spark metrics namespace) to
# stable prometheus metric names, moving namespace/app/executor into labels.
lowercaseOutputName: true
attrNameSnakeCase: true
rules:
  # These come from the application driver if it's a streaming application
  # Example: default/streaming.driver.com.example.ClassName.StreamingMetrics.streaming.lastCompletedBatch_schedulingDelay
  # NOTE(review): no explicit "type:" on the first two rules (exported as
  # untyped) — confirm whether GAUGE was intended for consistency.
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(\S+)\.StreamingMetrics\.streaming\.(\S+)><>Value
    name: spark_streaming_driver_$4
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # These come from the application driver if it's a structured streaming application
  # Example: default/streaming.driver.spark.streaming.QueryName.inputRate-total
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.spark\.streaming\.(\S+)\.(\S+)><>Value
    name: spark_structured_streaming_driver_$4
    labels:
      app_namespace: "$1"
      app_id: "$2"
      query_name: "$3"
  # These come from the application executors
  # Example: default/spark-pi.0.executor.threadpool.activeTasks
  - pattern: metrics<name=(\S+)\.(\S+)\.(\S+)\.executor\.(\S+)><>Value
    name: spark_executor_$4
    type: GAUGE
    labels:
      app_namespace: "$1"
      app_id: "$2"
      executor_id: "$3"
  # These come from the application driver
  # Example: default/spark-pi.driver.DAGScheduler.stage.failedStages
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.(BlockManager|DAGScheduler|jvm)\.(\S+)><>Value
    name: spark_driver_$3_$4
    type: GAUGE
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # These come from the application driver
  # Emulate timers for DAGScheduler like messageProcessingTime
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.DAGScheduler\.(.*)><>Count
    name: spark_driver_DAGScheduler_$3_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # HiveExternalCatalog is of type counter
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.HiveExternalCatalog\.(.*)><>Count
    name: spark_driver_HiveExternalCatalog_$3_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # These come from the application driver
  # Emulate histograms for CodeGenerator
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.CodeGenerator\.(.*)><>Count
    name: spark_driver_CodeGenerator_$3_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # These come from the application driver
  # Emulate timer (keep only count attribute) plus counters for LiveListenerBus
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Count
    name: spark_driver_LiveListenerBus_$3_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # Get Gauge type metrics for LiveListenerBus
  - pattern: metrics<name=(\S+)\.(\S+)\.driver\.LiveListenerBus\.(.*)><>Value
    name: spark_driver_LiveListenerBus_$3
    type: GAUGE
    labels:
      app_namespace: "$1"
      app_id: "$2"
  # Executors counters
  - pattern: metrics<name=(\S+)\.(\S+)\.(.*)\.executor\.(.*)><>Count
    name: spark_executor_$4_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
      executor_id: "$3"
  # These come from the application executors
  # Example: app-20160809000059-0000.0.jvm.threadpool.activeTasks
  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.(jvm|NettyBlockTransfer)\.(.*)><>Value
    name: spark_executor_$4_$5
    type: GAUGE
    labels:
      app_namespace: "$1"
      app_id: "$2"
      executor_id: "$3"
  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.HiveExternalCatalog\.(.*)><>Count
    name: spark_executor_HiveExternalCatalog_$4_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
      executor_id: "$3"
  # These come from the application executors
  # Emulate histograms for CodeGenerator
  - pattern: metrics<name=(\S+)\.(\S+)\.([0-9]+)\.CodeGenerator\.(.*)><>Count
    name: spark_executor_CodeGenerator_$4_count
    type: COUNTER
    labels:
      app_namespace: "$1"
      app_id: "$2"
      executor_id: "$3"

0 commit comments

Comments
 (0)