Skip to content

Commit

Permalink
Merge branch 'apache:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
himadripal authored Nov 14, 2024
2 parents 6534a9d + 9657b75 commit cd82f03
Show file tree
Hide file tree
Showing 571 changed files with 14,153 additions and 9,635 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-spark-builder/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ inputs:
comet-version:
description: 'The Comet version to use for Spark'
required: true
default: '0.3.0-SNAPSHOT'
default: '0.4.0-SNAPSHOT'
runs:
using: "composite"
steps:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
- name: Extract Comet version
id: extract_version
run: |
# use the tag that triggered this workflow as the Comet version e.g. 0.3.0-rc1
# use the tag that triggered this workflow as the Comet version e.g. 0.4.0-rc1
echo "COMET_VERSION=${GITHUB_REF##*/}" >> $GITHUB_ENV
- name: Echo Comet version
run: echo "The current Comet version is ${{ env.COMET_VERSION }}"
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/pr_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ jobs:
uses: ./.github/actions/java-test
with:
maven_opts: -Pspark-${{ matrix.spark-version }}
upload-test-reports: true
# https://github.com/codecov/codecov-action/issues/1549
# upload-test-reports: true

macos-aarch64-test-with-spark4_0:
strategy:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/spark_sql_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@ jobs:
with:
spark-version: ${{ matrix.spark-version.full }}
spark-short-version: ${{ matrix.spark-version.short }}
comet-version: '0.3.0-SNAPSHOT' # TODO: get this from pom.xml
comet-version: '0.4.0-SNAPSHOT' # TODO: get this from pom.xml
- name: Run Spark tests
run: |
cd apache-spark
rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
ENABLE_COMET=true ENABLE_COMET_SHUFFLE=${{ matrix.module.name == 'sql/core-1' && 'false' || 'true' }} build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
ENABLE_COMET=true ENABLE_COMET_SHUFFLE=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
env:
LC_ALL: "C.UTF-8"

4 changes: 2 additions & 2 deletions .github/workflows/spark_sql_test_ansi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,12 @@ jobs:
with:
spark-version: ${{ matrix.spark-version.full }}
spark-short-version: ${{ matrix.spark-version.short }}
comet-version: '0.3.0-SNAPSHOT' # TODO: get this from pom.xml
comet-version: '0.4.0-SNAPSHOT' # TODO: get this from pom.xml
- name: Run Spark tests
run: |
cd apache-spark
rm -rf /root/.m2/repository/org/apache/parquet # somehow parquet cache requires cleanups
RUST_BACKTRACE=1 ENABLE_COMET=true ENABLE_COMET_ANSI_MODE=true ENABLE_COMET_SHUFFLE=${{ matrix.module.name == 'sql/core-1' && 'false' || 'true' }} build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
RUST_BACKTRACE=1 ENABLE_COMET=true ENABLE_COMET_ANSI_MODE=true ENABLE_COMET_SHUFFLE=true build/sbt ${{ matrix.module.args1 }} "${{ matrix.module.args2 }}"
env:
LC_ALL: "C.UTF-8"

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ filtered_rat.txt
dev/dist
apache-rat-*.jar
venv
dev/release/comet-rm/workdir
29 changes: 1 addition & 28 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,31 +210,4 @@ This project includes code from Apache Aurora.

Copyright: 2016 The Apache Software Foundation.
Home page: https://aurora.apache.org/
License: http://www.apache.org/licenses/LICENSE-2.0

--------------------------------------------------------------------------------

This project includes software from the twox-hash project
https://github.com/shepmaster/twox-hash

The MIT License (MIT)

Copyright (c) 2015 Jake Goulding

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
License: http://www.apache.org/licenses/LICENSE-2.0
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,22 @@ format:
./mvnw compile test-compile scalafix:scalafix -Psemanticdb $(PROFILES)
./mvnw spotless:apply $(PROFILES)

# build native libs for amd64 architecture Linux/MacOS on a Linux/amd64 machine/container
core-amd64-libs:
cd native && cargo build -j 2 --release
ifdef HAS_OSXCROSS
rustup target add x86_64-apple-darwin
cd native && cargo build -j 2 --target x86_64-apple-darwin --release
endif

# build native libs for arm64 architecture Linux/MacOS on a Linux/arm64 machine/container
core-arm64-libs:
cd native && cargo build -j 2 --release
ifdef HAS_OSXCROSS
rustup target add aarch64-apple-darwin
cd native && cargo build -j 2 --target aarch64-apple-darwin --release
endif

core-amd64:
rustup target add x86_64-apple-darwin
cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release
Expand Down
6 changes: 4 additions & 2 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ Copyright 2024 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

This product includes software from the twox-hash project (MIT License)
https://github.com/shepmaster/twox-hash
This product includes software developed at
Apache Gluten (https://github.com/apache/incubator-gluten/)
Specifically:
- Optimizer rule to replace SortMergeJoin with ShuffleHashJoin
25 changes: 17 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ under the License.
<img src="docs/source/_static/images/DataFusionComet-Logo-Light.png" width="512" alt="logo"/>

Apache DataFusion Comet is a high-performance accelerator for Apache Spark, built on top of the powerful
[Apache DataFusion](https://datafusion.apache.org) query engine. Comet is designed to significantly enhance the
[Apache DataFusion] query engine. Comet is designed to significantly enhance the
performance of Apache Spark workloads while leveraging commodity hardware and seamlessly integrating with the
Spark ecosystem without requiring any code changes.

[Apache DataFusion]: https://datafusion.apache.org

# Benefits of Using Comet

## Run Spark Queries at DataFusion Speeds
Expand All @@ -44,25 +46,30 @@ The following chart shows the time it takes to run the 22 TPC-H queries against
using a single executor with 8 cores. See the [Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html)
for details of the environment used for these benchmarks.

When using Comet, the overall run time is reduced from 616 seconds to 379 seconds, a 1.62x speedup, with query 1
running more than 7x faster than Spark.
When using Comet, the overall run time is reduced from 616 seconds to 374 seconds, a 1.6x speedup, with query 1
running 9x faster than Spark.

Running the same queries with DataFusion standalone (without Spark) using the same number of cores results in a 3.6x
speedup compared to Spark.

Comet is not yet achieving full DataFusion speeds in all cases, but with future work we aim to provide a 2x-4x speedup
for a broader set of queries.

![](docs/source/_static/images/benchmark-results/2024-08-23/tpch_allqueries.png)
![](docs/source/_static/images/benchmark-results/0.3.0/tpch_allqueries.png)

Here is a breakdown showing relative performance of Spark, Comet, and DataFusion for each TPC-H query.

![](docs/source/_static/images/benchmark-results/2024-08-23/tpch_queries_compare.png)
![](docs/source/_static/images/benchmark-results/0.3.0/tpch_queries_compare.png)

The following charts show how much Comet currently accelerates each query from the benchmark.

### Relative speedup

The following chart shows how much Comet currently accelerates each query from the benchmark. Performance optimization
is an ongoing task, and we welcome contributions from the community to help achieve even greater speedups in the future.
![](docs/source/_static/images/benchmark-results/0.3.0/tpch_queries_speedup_rel.png)

![](docs/source/_static/images/benchmark-results/2024-08-23/tpch_queries_speedup_rel.png)
### Absolute speedup

![](docs/source/_static/images/benchmark-results/0.3.0/tpch_queries_speedup_abs.png)

These benchmarks can be reproduced in any environment using the documentation in the
[Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html). We encourage
Expand Down Expand Up @@ -100,6 +107,8 @@ To get started with Apache DataFusion Comet, follow the
[DataFusion Slack and Discord channels](https://datafusion.apache.org/contributor-guide/communication.html) to connect
with other users, ask questions, and share your experiences with Comet.

See the [Apache DataFusion Comet Overview](https://datafusion.apache.org/comet/user-guide/overview.html) for more detailed information.

## Contributing

We welcome contributions from the community to help improve and enhance Apache DataFusion Comet. Whether it's fixing
Expand Down
86 changes: 0 additions & 86 deletions bin/comet-spark-shell

This file was deleted.

4 changes: 2 additions & 2 deletions common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ under the License.
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.comet</groupId>
<groupId>org.apache.datafusion</groupId>
<artifactId>comet-parent-spark${spark.version.short}_${scala.binary.version}</artifactId>
<version>0.3.0-SNAPSHOT</version>
<version>0.4.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

Expand Down
15 changes: 12 additions & 3 deletions common/src/main/java/org/apache/comet/NativeBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,24 @@ public abstract class NativeBase {

private static final String libraryToLoad = System.mapLibraryName(NATIVE_LIB_NAME);
private static boolean loaded = false;
private static volatile Throwable loadErr = null;
private static final String searchPattern = "libcomet-";

static {
if (!isLoaded()) {
try {
load();
} catch (Throwable th) {
LOG.warn("Failed to load comet library", th);
// logging may not be initialized yet, so also write to stderr
System.err.println("Failed to load comet library: " + th.getMessage());
loadErr = th;
}
}

public static synchronized boolean isLoaded() {
public static synchronized boolean isLoaded() throws Throwable {
if (loadErr != null) {
throw loadErr;
}
return loaded;
}

Expand All @@ -81,7 +90,7 @@ static synchronized void load() {

// Try to load Comet library from the java.library.path.
try {
System.loadLibrary(libraryToLoad);
System.loadLibrary(NATIVE_LIB_NAME);
loaded = true;
} catch (UnsatisfiedLinkError ex) {
// Doesn't exist, so proceed to loading bundled library.
Expand Down
Loading

0 comments on commit cd82f03

Please sign in to comment.