Commit 59eb7d8

feat(spark-base): Add new spark-base image (java/scala only) without okdp extensions
1 parent 8fe0375 commit 59eb7d8

File tree

2 files changed: +223, -0 lines
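Since every version knob in the new Dockerfile is a build arg, the image can be rebuilt for a different Java/Spark/Hadoop/Scala combination at build time. A minimal sketch, assuming the build context is the spark-base/ directory; the spark-base:3.2.1 tag is illustrative, not part of the commit:

# Build the base image, overriding the defaults baked into the Dockerfile.
docker build \
  --build-arg JAVA_VERSION=11 \
  --build-arg SPARK_VERSION=3.2.1 \
  --build-arg HADOOP_VERSION=3.2 \
  --build-arg SCALA_VERSION=2.12 \
  -t spark-base:3.2.1 \
  spark-base/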

spark-base/Dockerfile (+81 lines)
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
ARG JAVA_VERSION=11
ARG BASE_IMAGE=eclipse-temurin:${JAVA_VERSION}-jre-jammy
FROM $BASE_IMAGE

ARG spark_uid=185

ARG SPARK_VERSION=3.2.1
ARG HADOOP_VERSION=3.2
ARG SCALA_VERSION=2.12
ARG SPARK_DIST_DOWNLOAD_URL=https://archive.apache.org/dist/spark

ENV SPARK_HOME /opt/spark
ENV SPARK_CONF_DIR ${SPARK_HOME}/conf

ENV SPARK_VERSION ${SPARK_VERSION}
ENV HADOOP_VERSION ${HADOOP_VERSION}
ENV SCALA_VERSION ${SCALA_VERSION}

RUN groupadd --system --gid=${spark_uid} spark && \
    useradd --system --uid=${spark_uid} --gid=spark spark

RUN set -ex; \
    apt-get update; \
    ln -s /lib /lib64; \
    apt-get install -y --no-install-recommends gnupg2 bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu libnss-wrapper curl; \
    mkdir -p ${SPARK_HOME}; \
    mkdir -p ${SPARK_HOME}/work-dir; \
    chmod g+w ${SPARK_HOME}/work-dir; \
    chown -R spark:spark ${SPARK_HOME}; \
    rm /bin/sh; \
    ln -sv /bin/bash /bin/sh; \
    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su; \
    chgrp root /etc/passwd && chmod ug+rw /etc/passwd; \
    rm -rf /var/cache/apt/* && rm -rf /var/lib/apt/lists/*

RUN set -ex; \
    export WORK_DIR="$(mktemp -d)"; \
    DIST=spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}; \
    if [ "${SCALA_VERSION}" = "2.13" ]; then \
      DIST+=-scala${SCALA_VERSION}; \
    fi; \
    curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz -o ${WORK_DIR}/spark.tgz; \
    curl --retry 3 --retry-all-errors -k ${SPARK_DIST_DOWNLOAD_URL}/${DIST}.tgz.asc -o ${WORK_DIR}/spark.tgz.asc; \
    curl --retry 3 --retry-all-errors -k https://downloads.apache.org/spark/KEYS -o ${WORK_DIR}/KEYS; \
    export GNUPGHOME="$(mktemp -d)"; \
    gpg --batch --import ${WORK_DIR}/KEYS; \
    gpg --batch --verify ${WORK_DIR}/spark.tgz.asc ${WORK_DIR}/spark.tgz; \
    tar --strip-components=1 -zxvf ${WORK_DIR}/spark.tgz -C ${SPARK_HOME}/; \
    chown -R spark:spark ${SPARK_HOME}/; \
    mv ${SPARK_HOME}/kubernetes/dockerfiles/spark/decom.sh /opt/; \
    mv ${SPARK_HOME}/kubernetes/tests ${SPARK_HOME}/; \
    chmod a+x /opt/decom.sh; \
    gpgconf --kill all; \
    rm -rf ${GNUPGHOME} ${WORK_DIR}; \
    rm -fr ${SPARK_HOME}/conf ${SPARK_HOME}/yarn ${SPARK_HOME}/kubernetes

COPY entrypoint.sh /opt/entrypoint.sh
RUN chmod a+x /opt/entrypoint.sh

WORKDIR ${SPARK_HOME}/work-dir

USER spark

ENTRYPOINT [ "/opt/entrypoint.sh" ]
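A quick way to sanity-check a build is the entrypoint's pass-through branch (the *) case in entrypoint.sh below): any first argument other than driver or executor is exec'd directly. The image tag here is again illustrative:

# Prints the Spark/Scala/Java versions baked into the image.
docker run --rm spark-base:3.2.1 /opt/spark/bin/spark-submit --version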

spark-base/entrypoint.sh (+142 lines)
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prevent any errors from being silently ignored
set -eo pipefail

attempt_setup_fake_passwd_entry() {
  # Check whether there is a passwd entry for the container UID
  local myuid; myuid="$(id -u)"
  # If there is no passwd entry for the container UID, attempt to fake one
  # to handle the OpenShift random UID case.
  # See https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523
  # See also: https://github.com/docker-library/postgres/pull/448
  if ! getent passwd "$myuid" &> /dev/null; then
    local wrapper
    for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do
      if [ -s "$wrapper" ]; then
        NSS_WRAPPER_PASSWD="$(mktemp)"
        NSS_WRAPPER_GROUP="$(mktemp)"
        export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP
        local mygid; mygid="$(id -g)"
        printf 'spark:x:%s:%s:%s:%s:/bin/false\n' "$myuid" "$mygid" "${SPARK_USER_NAME:-anonymous uid}" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD"
        printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP"
        break
      fi
    done
  fi
}

if [ -z "$JAVA_HOME" ]; then
  JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
for v in "${!SPARK_JAVA_OPT_@}"; do
  SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
done

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
  SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
  export PYSPARK_PYTHON
fi
if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
  export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
  export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
  SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"
fi

if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
  SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"
elif ! [ -z "${SPARK_HOME+x}" ]; then
  SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"
fi

# SPARK-43540: add current working directory into executor classpath
SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD"

# Switch to spark if no USER specified (root by default), otherwise use USER directly
switch_spark_if_root() {
  if [ "$(id -u)" -eq 0 ]; then
    echo gosu spark
  fi
}

# Pick the executor backend class for the running Spark version:
# releases newer than 3.2.4 use the Kubernetes-specific backend.
spark_3_2_support() {
  if ! printf '%s\n%s' "$1" "$2" | sort -C -V
  then
    # Spark newer than 3.2.4 (3.3.0+)
    echo "org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend --podName $SPARK_EXECUTOR_POD_NAME"
  else
    # Spark 3.2.4 or earlier
    echo "org.apache.spark.executor.CoarseGrainedExecutorBackend"
  fi
}

KUBERNETES_EXECUTOR_BACKEND="$(spark_3_2_support "$SPARK_VERSION" '3.2.4')"

case "$1" in
  driver)
    shift 1
    CMD=(
      "$SPARK_HOME/bin/spark-submit"
      --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
      --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS"
      --deploy-mode client
      "$@"
    )
    attempt_setup_fake_passwd_entry
    # Execute the container CMD under tini for better hygiene
    exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
    ;;
  executor)
    shift 1
    CMD=(
      ${JAVA_HOME}/bin/java
      "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
      -Xms"$SPARK_EXECUTOR_MEMORY"
      -Xmx"$SPARK_EXECUTOR_MEMORY"
      -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
      # intentionally unquoted so the backend class and its optional --podName flag split into words
      $KUBERNETES_EXECUTOR_BACKEND
      --driver-url "$SPARK_DRIVER_URL"
      --executor-id "$SPARK_EXECUTOR_ID"
      --cores "$SPARK_EXECUTOR_CORES"
      --app-id "$SPARK_APPLICATION_ID"
      --hostname "$SPARK_EXECUTOR_POD_IP"
      --resourceProfileId "$SPARK_RESOURCE_PROFILE_ID"
    )
    attempt_setup_fake_passwd_entry
    # Execute the container CMD under tini for better hygiene
    exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
    ;;

  *)
    # Non-spark-on-k8s command provided, proceeding in pass-through mode...
    exec "$@"
    ;;
esac
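The spark_3_2_support helper above selects the executor backend with a sort -V trick: printf '%s\n%s' "$1" "$2" | sort -C -V exits 0 exactly when "$1" sorts at or before "$2" in version order, so the negated test fires only for Spark releases newer than 3.2.4. A standalone sketch of the same comparison (GNU coreutils sort assumed):

# version_le A B: succeeds when A <= B in version-sort order.
version_le() { printf '%s\n%s' "$1" "$2" | sort -C -V; }

version_le 3.2.1 3.2.4 && echo "3.2.1 <= 3.2.4 -> CoarseGrainedExecutorBackend"
version_le 3.3.0 3.2.4 || echo "3.3.0 >  3.2.4 -> KubernetesExecutorBackend"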

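In the driver branch, spark-submit runs in client deploy mode with the bind address taken from SPARK_DRIVER_BIND_ADDRESS; on Kubernetes that variable is injected into the pod spec by Spark itself. A hypothetical local exercise of the branch, with an illustrative image tag and the examples jar shipped with the 3.2.1/Scala 2.12 defaults:

# Runs SparkPi through the entrypoint's "driver" branch in local mode.
docker run --rm \
  -e SPARK_DRIVER_BIND_ADDRESS=127.0.0.1 \
  spark-base:3.2.1 \
  driver --master 'local[2]' --class org.apache.spark.examples.SparkPi \
  local:///opt/spark/examples/jars/spark-examples_2.12-3.2.1.jar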