forked from qubole/sparklens
Commit
Squash of adding initial autotune for the number of executors. This also removes a lot of JSON code and drops handling for the classic SparkLens file format; instead, folks should directly use the history files. (part 2)
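With the classic SparkLens file format gone, reports are expected to come straight from Spark's event history files. A minimal sketch of that workflow, assuming the upstream Sparklens entry point (com.qubole.sparklens.app.ReporterApp) with source=history and a locally built jar; the jar path and application ID below are made up for illustration:

# Sketch: generate a Sparklens report from an event history file instead of
# a classic *.sparklens.json dump. Jar path and history file are hypothetical.
"${SPARK_HOME}/bin/spark-submit" \
  --class com.qubole.sparklens.app.ReporterApp \
  target/scala-2.12/sparklens_2.12-0.3.2.jar \
  /tmp/spark-events/local-1234567890 \
  source=history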
Showing 34 changed files with 2,385 additions and 1,420 deletions.
@@ -0,0 +1,14 @@
CREATE TABLE IF NOT EXISTS local.developers (
  username string,
  firstname string,
  lastname string)
USING iceberg
PARTITIONED BY (username);
CREATE TABLE IF NOT EXISTS local.projects (
  creator string,
  projectname string)
USING iceberg
PARTITIONED BY (creator);
INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova");
INSERT INTO local.projects VALUES("krisnova", "aurae");
SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username;
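This is the workload the e2e script below feeds to spark-sql: two Iceberg tables partitioned by the join key, joined back together. It can also be run by hand; a minimal sketch, mirroring the catalog settings from the e2e script and assuming env_setup.sh has already put the Iceberg runtime jar on Spark's classpath:

# Sketch: run the join workload directly; settings mirror the e2e script below.
"${SPARK_HOME}/bin/spark-sql" \
  --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
  --conf spark.sql.catalog.local.type=hadoop \
  --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \
  -f e2e/partioned_table_join.sql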
@@ -0,0 +1,66 @@
#!/bin/bash

set -ex

# Download Spark and Iceberg if not present
SPARK_MAJOR=${SPARK_MAJOR:-"3.3"}
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.4"}
SCALA_VERSION=${SCALA_VERSION:-"2.12"}
HADOOP_VERSION="3"
SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz"
ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"}
if [ ! -f "${SPARK_FILE}" ]; then
  SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}"
  SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}"
  if command -v axel &> /dev/null; then
    (axel "$SPARK_DIST_URL" || axel "$SPARK_ARCHIVE_DIST_URL") &
  else
    (wget "$SPARK_DIST_URL" || wget "$SPARK_ARCHIVE_DIST_URL") &
  fi
fi
# Download Iceberg if not present
ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar"
if [ ! -f "${ICEBERG_FILE}" ]; then
  wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" &
fi
# See https://www.mail-archive.com/[email protected]/msg27796.html
BC_FILE="bcpkix-jdk15on-1.68.jar"
BC_PROV_FILE="bcprov-jdk15on-1.68.jar"
if [ ! -f "${BC_FILE}" ]; then
  wget https://repo1.maven.org/maven2/org/bouncycastle/bcpkix-jdk15on/1.68/bcpkix-jdk15on-1.68.jar
fi
if [ ! -f "${BC_PROV_FILE}" ]; then
  wget https://repo1.maven.org/maven2/org/bouncycastle/bcprov-jdk15on/1.68/bcprov-jdk15on-1.68.jar
fi
wait
sleep 1
# Set up the env
if [ ! -d "${SPARK_PATH}" ]; then
  tar -xf "${SPARK_FILE}"
fi

SPARK_HOME="${SPARK_PATH}"
export SPARK_HOME

if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then
  # Delete the old JAR first. The glob must stay outside the quotes so it expands.
  rm "${SPARK_PATH}"/jars/iceberg-spark-runtime*.jar || echo "No old version to delete."
  cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}"
fi

# Copy Bouncy Castle for Kube
cp bc*jdk*.jar "${SPARK_PATH}/jars/"

# Set up for running pyspark and friends
export PATH="${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}"

# Make sure we have a history directory
mkdir -p /tmp/spark-events

mkdir -p ./data/fetched/
if [ ! -f ./data/fetched/2021 ]; then
  wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021
fi
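The script is meant to be sourced rather than executed, so the SPARK_HOME and PATH exports stick in the calling shell (the e2e script below does exactly that). A quick smoke check after sourcing:

# Sketch: confirm the environment landed.
source env_setup.sh
echo "SPARK_HOME=${SPARK_HOME}"
spark-submit --version   # should report the Spark version downloaded above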
@@ -0,0 +1,72 @@
#!/bin/bash

# Runs an end-to-end test and updates the recorded output.

set -ex

rm -rf /tmp/spark-events

source env_setup.sh


sql_file=e2e/partioned_table_join.sql
EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions

# Run a simple local e2e example

mkdir -p /tmp/spark-events
${SPARK_HOME}/bin/spark-sql --master local[5] \
  --conf spark.eventLog.enabled=true \
  --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
  --conf spark.sql.catalog.spark_catalog.type=hive \
  --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
  --conf spark.sql.catalog.local.type=hadoop \
  --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \
  --name "fresh" \
  -f "${sql_file}"


cp /tmp/spark-events/* ./src/test/event-history-test-files/local-fresh

# Same example, but with dynamic allocation turned on; only runs if we have a kube cluster.

source setup_micro_k8s.sh

if [ -n "${kube_host}" ]; then

  rm -rf /tmp/spark-events
  # This warehouse path sort of works, but ideally we'd point at s3/minio. It's enough
  # for now.
  WAREHOUSE_PATH=${WAREHOUSE_PATH:-/tmp/warehouse}
  rm -rf "${WAREHOUSE_PATH}"
  mkdir -p /tmp/spark-events
  ${SPARK_HOME}/bin/spark-sql --master "k8s://${kube_host}" \
    --deploy-mode client \
    --conf spark.eventLog.enabled=true \
    --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
    --conf spark.sql.catalog.spark_catalog.type=hive \
    --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
    --conf spark.sql.catalog.local.type=hadoop \
    --conf spark.dynamicAllocation.enabled=true \
    --conf "spark.sql.catalog.local.warehouse=${WAREHOUSE_PATH}" \
    --conf "spark.kubernetes.container.image=${image}" \
    --conf "spark.kubernetes.authenticate.caCertFile=${cert_path}" \
    --conf "spark.kubernetes.authenticate.submission.caCertFile=${cert_path}" \
    --conf spark.dynamicAllocation.shuffleTracking.enabled=true \
    --name "fresh-kube" \
    -f "${sql_file}"

  cp /tmp/spark-events/* ./src/test/event-history-test-files/local-fresh-dynamic

  # TODO: Register a trap for this (see the sketch after this script).
  cp ~/.kube/"${backup_config}" ~/.kube/config
else
  echo "Not updating dynamically scaled version"
fi

# Run only the tests that use the "fresh" output, i.e. the ones where we run Spark as
# part of the step above. This lets us test in CI that we support different history file
# versions, provided we're compiled for that version.
sbt -DsparkVersion="${SPARK_VERSION}" ";clean;compile;testOnly com.qubole.sparklens.app.EventHistoryFileReportingSuite -- -z fresh"
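The TODO above is about restoring the kubeconfig even if the Kubernetes run dies partway through. A sketch of one way to do it, assuming backup_config has been set by setup_micro_k8s.sh by the time the trap is registered:

# Sketch: restore the kubeconfig backup on any exit instead of relying on
# the happy path reaching the final cp.
restore_kubeconfig() {
  if [ -n "${backup_config}" ] && [ -f ~/.kube/"${backup_config}" ]; then
    cp ~/.kube/"${backup_config}" ~/.kube/config
  fi
}
trap restore_kubeconfig EXIT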
@@ -0,0 +1,117 @@
<scalastyle>
 <name>Scalastyle standard configuration</name>
 <check level="warning" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.file.FileLengthChecker" enabled="true">
  <parameters>
   <parameter name="maxFileLength"><![CDATA[800]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.file.HeaderMatchesChecker" enabled="false">
  <parameters>
   <parameter name="header"><![CDATA[// Copyright (C) 2011-2012 the original author or authors.
// See the LICENCE.txt file distributed with this work for additional
// information regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
  <parameters>
   <parameter name="maxLineLength"><![CDATA[200]]></parameter>
   <parameter name="tabSize"><![CDATA[4]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
  <parameters>
   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
  <parameters>
   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
  <parameters>
   <parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="true">
  <parameters>
   <parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
  <parameters>
   <parameter name="maxParameters"><![CDATA[8]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
  <parameters>
   <parameter name="ignore"><![CDATA[-1,0,1,2,3]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check>
 <check level="warning" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check>
 <check level="warning" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.file.RegexChecker" enabled="false">
  <parameters>
   <parameter name="regex"><![CDATA[println]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="true">
  <parameters>
   <parameter name="maxTypes"><![CDATA[30]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="true">
  <parameters>
   <parameter name="maximum"><![CDATA[10]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="true"></check>
 <check level="warning" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
  <parameters>
   <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
   <parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="true">
  <parameters>
   <parameter name="maxLength"><![CDATA[50]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="true">
  <parameters>
   <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true">
  <parameters>
   <parameter name="maxMethods"><![CDATA[30]]></parameter>
  </parameters>
 </check>
 <check level="warning" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="false"></check>
 <check level="warning" class="org.scalastyle.file.NewLineAtEofChecker" enabled="false"></check>
 <check level="warning" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
</scalastyle>
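This is essentially the stock Scalastyle configuration with everything at warning level. Assuming the sbt-scalastyle plugin is wired into the build and picks this file up, the checks can be run on their own:

# Sketch: run the style checks (assumes the sbt-scalastyle plugin is enabled).
sbt scalastyle
sbt test:scalastyle   # style-check the test sources too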
@@ -0,0 +1,37 @@
if command -v microk8s &> /dev/null; then
  echo "microk8s already installed yay!"
elif command -v snap &> /dev/null; then
  sudo snap install microk8s --classic
  MYUSER=$USER
  sudo usermod -a -G microk8s "$MYUSER"
  sudo chown -R "$MYUSER" ~/.kube
  newgrp microk8s
elif command -v brew &> /dev/null; then
  brew install ubuntu/microk8s/microk8s
  microk8s install
  echo "Find your cert (see https://www.waitingforcode.com/apache-spark/setting-up-apache-spark-kubernetes-microk8s/read)"
  exit 1
fi
if command -v microk8s &> /dev/null; then
  sudo microk8s status --wait-ready
  sudo microk8s enable dns
  sudo microk8s enable dashboard
  sudo microk8s enable storage
  sudo microk8s enable registry
fi
kube_host=$(sudo microk8s kubectl cluster-info | grep control | grep http | sed -ne 's/.*\(http[^"]*\).*/\1/p' | cut -d \' -f 2)
if [ -n "${kube_host}" ]; then
  cd "${SPARK_HOME}"
  kube_host="https://127.0.0.1:16443"
  repo=localhost:32000/local-spark
  tag=magic
  image="${repo}/spark:${tag}"
  ./bin/docker-image-tool.sh -r "${repo}" -t "${tag}" build
  cert_path=/var/snap/microk8s/current/certs/ca.crt
  backup_config="config-$(date +%s)"
  cp ~/.kube/config ~/.kube/"${backup_config}"
  # I don't love this, but kubeconfig & KUBECONFIG don't seem to be respected by Spark.
  sudo microk8s config > ~/.kube/config
  docker push "${image}"
  cd -
fi
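A couple of quick sanity checks after sourcing this script; the ports match the values hardcoded above (16443 for the API server, 32000 for the local registry):

# Sketch: confirm the cluster is ready and the Spark image reached the registry.
sudo microk8s kubectl get nodes
curl -s http://localhost:32000/v2/_catalog   # should include local-spark/spark after the push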