Skip to content

Commit 5da5b8a

Browse files
committed
feat(spark): Minimize minio/aws sdk v1/v2 dependencies to reduce spark image size
1 parent 8f55245 commit 5da5b8a

File tree

3 files changed

+262
-6
lines changed

3 files changed

+262
-6
lines changed

spark/Dockerfile

+13-4
Original file line numberDiff line numberDiff line change
@@ -33,30 +33,39 @@ RUN set -ex; \
3333

3434
WORKDIR /workspace
3535

36-
COPY okdp-addons.pom deps/pom.xml
36+
COPY okdp-addons.pom .
37+
COPY minio.pom .
3738

3839
# This setup consumes less space compared to inheriting from the parent pom
3940
# Handles the transitive dependencies versions through the pom
4041
# Manage Java AWS SDK v1 (hadoop <3.4)/V2 (hadoop >=3.4)
4142
# Some pom.xml versions come with Control-M (carriage return) characters, hence the dos2unix call
43+
# Minio and AWS profiles are mutually exclusive: aws includes minio
4244
RUN mvn -ntp dependency:get -DgroupId=org.apache.spark -DartifactId=spark-parent_${SCALA_VERSION} -Dversion=${SPARK_VERSION} -Dpackaging=pom; \
4345
mvn -ntp dependency:copy -Dartifact=org.apache.spark:spark-parent_${SCALA_VERSION}:${SPARK_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \
4446
dos2unix spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom; \
4547
HADOOP_VERSION=$(grep "<hadoop.version>" spark-parent_${SCALA_VERSION}-${SPARK_VERSION}.pom | tr -d ' ' | sed -e 's/^ *<hadoop.version>\(.*\)<\/hadoop.version> *$/\1/' | sort -rn | head -n 1); \
46-
mv ./deps/pom.xml .; \
48+
mvn -ntp dependency:get -DgroupId=org.apache.hadoop -DartifactId=hadoop-aws -Dversion=${HADOOP_VERSION} -Dpackaging=pom; \
49+
mvn -ntp dependency:copy -Dartifact=org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}:pom -Dproject.basedir=./ -DoutputDirectory=./; \
50+
AWS_JAVA_SDK_VERSION=$(mvn -ntp dependency:tree -f hadoop-aws-${HADOOP_VERSION}.pom | grep -E "(com.amazonaws|software.amazon.awssdk):(aws-java-sdk-bundle|bundle):jar:.*:compile" | awk '{ print $NF }' | awk -F: '{ print $4 }'); \
51+
mvn -ntp clean install \
52+
-Daws-java-sdk.version=${AWS_JAVA_SDK_VERSION} \
53+
-Daws-sdk-profile.version=v$(echo ${AWS_JAVA_SDK_VERSION} | cut -d '.' -f 1) \
54+
-f minio.pom; \
4755
mvn -ntp clean dependency:copy-dependencies \
4856
-Dspark.version=${SPARK_VERSION} \
4957
-Dscala.version=${SCALA_VERSION} \
5058
-Dhadoop.version=${HADOOP_VERSION} \
51-
-Paws
59+
-Daws-java-sdk.version=${AWS_JAVA_SDK_VERSION} \
60+
-Pminio \
61+
-f okdp-addons.pom
5262

5363
FROM $BASE_IMAGE
5464

5565
# Directory that receives the JMX/Prometheus metrics configuration (key=value form; the space-separated ENV form is legacy)
ENV JMX_CONF_DIR=/etc/metrics/conf/
5666

5767
# OKDP addons
5868
# The wildcard may match several jars, so the destination must be a directory path ending with '/'
COPY --from=okdp_addons --chown=spark:spark /workspace/target/dependency/* $SPARK_HOME/jars/
59-
RUN chown -R spark:spark ${SPARK_HOME}/jars/
6069

6170
# Jmx prometheus metrics
6271
COPY --chown=spark:spark metrics.properties ${JMX_CONF_DIR}/metrics.properties

spark/minio.pom

+209
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
~
4+
~ Copyright 2024 tosit.io
5+
~
6+
~ Licensed under the Apache License, Version 2.0 (the "License");
7+
~ you may not use this file except in compliance with the License.
8+
~ You may obtain a copy of the License at
9+
~
10+
~ http://www.apache.org/licenses/LICENSE-2.0
11+
~
12+
~ Unless required by applicable law or agreed to in writing, software
13+
~ distributed under the License is distributed on an "AS IS" BASIS,
14+
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
~ See the License for the specific language governing permissions and
16+
~ limitations under the License.
17+
~
18+
-->
19+
<!--
20+
OKDP spark docker images add-ons:
21+
* Manage compatibility of (transitive) dependency versions between Hadoop, spark and extensions.
22+
* Prevents mixing the version of an extension (e.g. hadoop-aws) with other hadoop artifacts from different
23+
versions.
24+
* Manage AWS SDK V2 and SDK V1 for S3 (breaking changes between SDK V1 and V2) versions.
25+
* Using the hadoop-project as parent is complex
26+
* See also: https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/setup-project-maven.html
27+
* See also: https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/aws_sdk_upgrade.html
28+
* See also: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/home.html
29+
* See also: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup-project-maven.html
30+
31+
hadoop version < 3.4: https://github.com/aws/aws-sdk-java/blob/master/aws-java-sdk-bundle (aws sdk
32+
v1)
33+
hadoop version >= 3.4: https://github.com/aws/aws-sdk-java-v2/blob/master/bundle/pom.xml (aws sdk
34+
v2)
35+
-->
36+
<project xmlns="http://maven.apache.org/POM/4.0.0"
37+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
38+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
39+
<modelVersion>4.0.0</modelVersion>
40+
<name>OKDP AWS SDK for Java - Minio Bundle</name>
41+
<groupId>com.amazonaws</groupId>
42+
<artifactId>okdp-minio-aws-s3-bundle</artifactId>
43+
<version>${aws-java-sdk.version}</version>
44+
<packaging>jar</packaging>
45+
<description>
46+
OKDP AWS SDK for Java - Minio Bundle
47+
The bundle contains S3 service only with around 6.5MB instead of +350MB (+540MB in v2 bundle)
48+
</description>
49+
<properties>
50+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
51+
</properties>
52+
<profiles>
53+
<profile>
54+
<!-- aws java sdk v1 bundle can take up 350MB! -->
55+
<!-- https://github.com/aws/aws-sdk-java/blob/master/aws-java-sdk-bundle/pom.xml -->
56+
<id>minio-aws-java-sdk-s3-v1</id>
57+
<activation>
58+
<property>
59+
<name>aws-sdk-profile.version</name>
60+
<value>v1</value>
61+
</property>
62+
</activation>
63+
<dependencies>
64+
<dependency>
65+
<groupId>com.amazonaws</groupId>
66+
<artifactId>aws-java-sdk-s3</artifactId>
67+
<version>${aws-java-sdk.version}</version>
68+
</dependency>
69+
<!--
70+
S3Guard was removed since Hadoop 3.3.5 (https://issues.apache.org/jira/browse/HADOOP-17409)
71+
But spark still depends on the previous hadoop versions < 3.3.5 (ex.: spark 3.5.1 => hadoop 3.3.4)
72+
-->
73+
<dependency>
74+
<groupId>com.amazonaws</groupId>
75+
<artifactId>aws-java-sdk-dynamodb</artifactId>
76+
<version>${aws-java-sdk.version}</version>
77+
</dependency>
78+
</dependencies>
79+
<build>
80+
<plugins>
81+
<plugin>
82+
<groupId>org.apache.maven.plugins</groupId>
83+
<artifactId>maven-shade-plugin</artifactId>
84+
<executions>
85+
<execution>
86+
<phase>package</phase>
87+
<goals>
88+
<goal>shade</goal>
89+
</goals>
90+
<configuration>
91+
<shadedArtifactAttached>false</shadedArtifactAttached>
92+
<promoteTransitiveDependencies>true</promoteTransitiveDependencies>
93+
<artifactSet>
94+
<includes>
95+
<include>joda-time:joda-time</include>
96+
<include>com.fasterxml.jackson.core:*</include>
97+
<include>com.fasterxml.jackson.dataformat:jackson-dataformat-cbor</include>
98+
<include>org.apache.httpcomponents:*</include>
99+
<include>commons-codec:commons-codec</include>
100+
<include>commons-logging:commons-logging</include>
101+
<include>io.netty:*</include>
102+
<include>com.amazonaws:*</include>
103+
</includes>
104+
</artifactSet>
105+
<relocations>
106+
<relocation>
107+
<pattern>org.joda</pattern>
108+
<shadedPattern>com.amazonaws.thirdparty.joda</shadedPattern>
109+
</relocation>
110+
<relocation>
111+
<pattern>com.fasterxml.jackson</pattern>
112+
<shadedPattern>com.amazonaws.thirdparty.jackson</shadedPattern>
113+
</relocation>
114+
<relocation>
115+
<pattern>org.apache.http</pattern>
116+
<shadedPattern>com.amazonaws.thirdparty.apache.http</shadedPattern>
117+
</relocation>
118+
<relocation>
119+
<pattern>org.apache.commons.codec</pattern>
120+
<shadedPattern>com.amazonaws.thirdparty.apache.codec</shadedPattern>
121+
</relocation>
122+
<relocation>
123+
<pattern>org.apache.commons.logging</pattern>
124+
<shadedPattern>com.amazonaws.thirdparty.apache.logging</shadedPattern>
125+
</relocation>
126+
<relocation>
127+
<pattern>io.netty</pattern>
128+
<shadedPattern>com.amazonaws.thirdparty.io.netty</shadedPattern>
129+
</relocation>
130+
</relocations>
131+
</configuration>
132+
</execution>
133+
</executions>
134+
</plugin>
135+
</plugins>
136+
</build>
137+
</profile>
138+
<profile>
139+
<!-- aws java sdk v2 bundle can take up 500MB! -->
140+
<!-- https://github.com/aws/aws-sdk-java-v2/blob/master/bundle/pom.xml -->
141+
<id>minio-aws-java-sdk-s3-v2</id>
142+
<activation>
143+
<property>
144+
<name>aws-sdk-profile.version</name>
145+
<value>v2</value>
146+
</property>
147+
</activation>
148+
<dependencies>
149+
<dependency>
150+
<groupId>software.amazon.awssdk</groupId>
151+
<artifactId>s3</artifactId>
152+
<version>${aws-java-sdk.version}</version>
153+
</dependency>
154+
</dependencies>
155+
<build>
156+
<plugins>
157+
<plugin>
158+
<groupId>org.apache.maven.plugins</groupId>
159+
<artifactId>maven-shade-plugin</artifactId>
160+
<executions>
161+
<execution>
162+
<phase>package</phase>
163+
<goals>
164+
<goal>shade</goal>
165+
</goals>
166+
<configuration>
167+
<shadedArtifactAttached>false</shadedArtifactAttached>
168+
<promoteTransitiveDependencies>true</promoteTransitiveDependencies>
169+
<artifactSet>
170+
<includes>
171+
<include>com.fasterxml.jackson.jr:*</include>
172+
<include>io.netty:*</include>
173+
<include>org.apache.httpcomponents:*</include>
174+
<include>org.reactivestreams:*</include>
175+
<include>org.slf4j:*</include>
176+
<include>commons-codec:commons-codec</include>
177+
<include>software.amazon.awssdk:*</include>
178+
<include>software.amazon:*</include>
179+
<include>software.amazon.s3.accessgrants:*</include>
180+
<include>com.github.ben-manes.caffeine:*</include>
181+
<include>commons-logging:*</include>
182+
</includes>
183+
</artifactSet>
184+
<relocations>
185+
<relocation>
186+
<pattern>org.apache</pattern>
187+
<shadedPattern>software.amazon.awssdk.thirdparty.org.apache</shadedPattern>
188+
<excludes>
189+
<exclude>org.apache.log4j.*</exclude>
190+
</excludes>
191+
</relocation>
192+
<relocation>
193+
<pattern>io.netty</pattern>
194+
<shadedPattern>software.amazon.awssdk.thirdparty.io.netty</shadedPattern>
195+
</relocation>
196+
<relocation>
197+
<pattern>org.slf4j</pattern>
198+
<shadedPattern>software.amazon.awssdk.thirdparty.org.slf4j</shadedPattern>
199+
</relocation>
200+
</relocations>
201+
</configuration>
202+
</execution>
203+
</executions>
204+
</plugin>
205+
</plugins>
206+
</build>
207+
</profile>
208+
</profiles>
209+
</project>

spark/okdp-addons.pom

+40-2
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,46 @@
8282
</dependencies>
8383
<profiles>
8484
<profile>
85-
<!-- minio/aws S3
86-
The AWS bandle can take up 500MB, to be optimized later! -->
85+
<!-- aws S3: The AWS bundle can take up 500MB! Use the optimized okdp shaded jar instead -->
86+
<id>minio</id>
87+
<dependencies>
88+
<dependency>
89+
<groupId>org.apache.hadoop</groupId>
90+
<artifactId>hadoop-aws</artifactId>
91+
<version>${hadoop.version}</version>
92+
<exclusions>
93+
<exclusion>
94+
<groupId>*</groupId>
95+
<artifactId>*</artifactId>
96+
</exclusion>
97+
</exclusions>
98+
</dependency>
99+
<!-- Locally installed dependency from 'minio.pom' file
100+
A minimal artifact containing only S3 dependencies
101+
-->
102+
<dependency>
103+
<groupId>com.amazonaws</groupId>
104+
<artifactId>okdp-minio-aws-s3-bundle</artifactId>
105+
<version>${aws-java-sdk.version}</version>
106+
</dependency>
107+
<!-- S3A Committers:
108+
https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md
109+
-->
110+
<dependency>
111+
<groupId>org.apache.spark</groupId>
112+
<artifactId>spark-hadoop-cloud_${scala.version}</artifactId>
113+
<version>${spark.version}</version>
114+
<exclusions>
115+
<exclusion>
116+
<groupId>*</groupId>
117+
<artifactId>*</artifactId>
118+
</exclusion>
119+
</exclusions>
120+
</dependency>
121+
</dependencies>
122+
</profile>
123+
<profile>
124+
<!-- aws S3: The AWS bundle can take up 500MB! -->
87125
<id>aws</id>
88126
<dependencies>
89127
<dependency>

0 commit comments

Comments
 (0)