Merge remote-tracking branch 'apache/main' into ngates/sum-statistic
alamb committed Jan 23, 2025
2 parents 579e046 + 49f95af commit bcec17c
Showing 199 changed files with 18,269 additions and 15,121 deletions.
61 changes: 57 additions & 4 deletions .github/workflows/extended.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Rust Hash Collisions
+name: Datafusion extended tests
 
 concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
@@ -24,15 +24,51 @@ concurrency:
 
 # https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#running-your-pull_request-workflow-when-a-pull-request-merges
 #
 # this job is intended to run only on the main branch as it is time consuming
-# and very rarely fails. However, it is important coverage to ensure correctness
-# in the (very rare) event of a hash failure.
+# and should not fail often. However, it is important coverage to ensure correctness
+# in the (very rare) event of a hash failure or sqlite query failure.
 on:
   # Run on all commits to main
   push:
     branches:
       - main
 
 jobs:
+  # Check crate compiles and base cargo check passes
+  linux-build-lib:
+    name: linux build test
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Prepare cargo build
+        run: cargo check --profile ci --all-targets
+
+  # Run extended tests (with feature 'extended_tests')
+  linux-test-extended:
+    name: cargo test 'extended_tests' (amd64)
+    needs: linux-build-lib
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+      - name: Setup Rust toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Run tests (excluding doctests)
+        run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests
+      - name: Verify Working Directory Clean
+        run: git diff --exit-code
+
+  # Check answers are correct when hash values collide
   hash-collisions:
     name: cargo test hash collisions (amd64)
@@ -51,4 +87,21 @@
       - name: Run tests
         run: |
           cd datafusion
-          cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro
+          cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro,extended_tests
+
+  sqllogictest-sqlite:
+    name: "Run sqllogictests with the sqlite test suite"
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+      - name: Setup Rust toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Run sqllogictest
+        run: cargo test --profile release-nonlto --test sqllogictests -- --include-sqlite
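
The commands these jobs run can also be tried locally. Below is a minimal sketch, assuming a repository checkout with submodules initialized (the `submodules: true` checkout step above suggests the sqlite test files live in a git submodule); the invocations are copied from the job definitions in this diff:

```sh
# Extended tests, as in the linux-test-extended job
cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks \
    --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests

# sqlite sqllogictests, as in the sqllogictest-sqlite job
cargo test --profile release-nonlto --test sqllogictests -- --include-sqlite
```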
16 changes: 8 additions & 8 deletions Cargo.toml
@@ -77,21 +77,21 @@ version = "44.0.0"
 ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
-arrow = { version = "53.3.0", features = [
+arrow = { version = "54.0.0", features = [
     "prettyprint",
 ] }
-arrow-array = { version = "53.3.0", default-features = false, features = [
+arrow-array = { version = "54.0.0", default-features = false, features = [
     "chrono-tz",
 ] }
-arrow-buffer = { version = "53.3.0", default-features = false }
-arrow-flight = { version = "53.3.0", features = [
+arrow-buffer = { version = "54.0.0", default-features = false }
+arrow-flight = { version = "54.0.0", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "53.3.0", default-features = false, features = [
+arrow-ipc = { version = "54.0.0", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "53.3.0", default-features = false }
-arrow-schema = { version = "53.3.0", default-features = false }
+arrow-ord = { version = "54.0.0", default-features = false }
+arrow-schema = { version = "54.0.0", default-features = false }
 async-trait = "0.1.73"
 bigdecimal = "0.4.7"
 bytes = "1.4"
@@ -133,7 +133,7 @@ itertools = "0.14"
 log = "^0.4"
 object_store = { version = "0.11.0", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "53.3.0", default-features = false, features = [
+parquet = { version = "54.0.0", default-features = false, features = [
     "arrow",
     "async",
     "object_store",
24 changes: 24 additions & 0 deletions README.md
@@ -146,3 +146,27 @@ stable API, we also improve the API over time. As a result, we typically
 deprecate methods before removing them, according to the [deprecation guidelines].
 
 [deprecation guidelines]: https://datafusion.apache.org/library-user-guide/api-health.html
+
+## Dependencies and a `Cargo.lock`
+
+`datafusion` is intended for use as a library and thus purposely does not have a
+`Cargo.lock` file checked in. You can read more about the distinction in the
+[Cargo book].
+
+CI tests always run against the latest compatible versions of all dependencies
+(the equivalent of doing `cargo update`), as suggested in the [Cargo CI guide],
+and we rely on Dependabot for other upgrades. This strategy has two problems
+that occasionally arise:
+
+1. CI failures when downstream libraries upgrade in some incompatible way
+2. Local development builds that fail when DataFusion inadvertently relies on
+   a feature in a newer version of a dependency than declared in `Cargo.toml`
+   (e.g. a new method is added to a trait that we use).
+
+However, we think the current strategy is the best tradeoff between maintenance
+overhead and user experience, and it ensures DataFusion always works with the
+latest compatible versions of all dependencies. If you encounter either of these
+problems, please open an issue or PR.
+
+[cargo book]: https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+[cargo ci guide]: https://doc.rust-lang.org/cargo/guide/continuous-integration.html#verifying-latest-dependencies
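
As a concrete illustration of the strategy this new README section describes, here is a minimal sketch, assuming a standard Cargo workspace checkout (the actual CI invocations live in `.github/workflows`, not here), of what testing against the latest compatible dependencies looks like:

```sh
# Refresh Cargo.lock to the newest semver-compatible versions of every
# dependency, which is effectively what CI does on each run.
cargo update

# Build and test against the freshly resolved versions. A failure here that
# an older lockfile avoided usually means one of the two problems above.
cargo check --all-targets
cargo test --workspace
```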
54 changes: 42 additions & 12 deletions benchmarks/bench.sh
@@ -83,6 +83,7 @@ external_aggr: External aggregation benchmark
 h2o_small: h2oai benchmark with small dataset (1e7 rows), default file format is csv
 h2o_medium: h2oai benchmark with medium dataset (1e8 rows), default file format is csv
 h2o_big: h2oai benchmark with large dataset (1e9 rows), default file format is csv
+imdb: Join Order Benchmark (JOB) using the IMDB dataset converted to parquet
 **********
 * Supported Configuration (Environment Variables)
@@ -536,23 +537,52 @@ data_imdb() {
     done
 
     if [ "$convert_needed" = true ]; then
-        if [ ! -f "${imdb_dir}/imdb.tgz" ]; then
-            echo "Downloading IMDB dataset..."
+        # Expected size of the dataset
+        expected_size="1263193115" # 1.18 GB
+
+        echo -n "Looking for imdb.tgz... "
+        if [ -f "${imdb_temp_gz}" ]; then
+            echo "found"
+            echo -n "Checking size... "
+            OUTPUT_SIZE=$(wc -c "${imdb_temp_gz}" 2>/dev/null | awk '{print $1}' || true)
+
+            # Checking the size of the existing file
+            if [ "${OUTPUT_SIZE}" = "${expected_size}" ]; then
+                # Existing file is of the expected size, no need for download
+                echo "OK ${OUTPUT_SIZE}"
+            else
+                # Existing file is partially downloaded, remove it and initiate a new download
+                echo "MISMATCH"
+                echo "Size less than expected: ${OUTPUT_SIZE} found, ${expected_size} required"
+                echo "Downloading IMDB dataset..."
+                rm -f "${imdb_temp_gz}"
+
+                # Download the dataset
+                curl -o "${imdb_temp_gz}" "${imdb_url}"
+
+                # Size check of the downloaded file
+                DOWNLOADED_SIZE=$(wc -c "${imdb_temp_gz}" | awk '{print $1}')
+                if [ "${DOWNLOADED_SIZE}" != "${expected_size}" ]; then
+                    echo "Error: Download size mismatch"
+                    echo "Expected: ${expected_size}"
+                    echo "Got: ${DOWNLOADED_SIZE}"
+                    echo "Please re-initiate the download"
+                    return 1
+                fi
+            fi
+        else
+            # No existing file found, initiate a new download
+            echo "not found"
+            echo "Downloading IMDB dataset (${expected_size} bytes expected)..."
 
             # Download the dataset
             curl -o "${imdb_temp_gz}" "${imdb_url}"
-
-            # Extract the dataset
-            tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
-            $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
-        else
-            echo "IMDB.tgz already exists."
-
-            # Extract the dataset
-            tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
-            $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
         fi
 
+        # Extract the dataset
+        tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
+        $CARGO_COMMAND --bin imdb -- convert --input ${imdb_dir} --output ${imdb_dir} --format parquet
+        echo "IMDB dataset downloaded and extracted."
 
     else
         echo "IMDB dataset already exists and contains required parquet files."
     fi
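
For reference, a hypothetical usage sketch of the updated download path; the `data` and `run` subcommand names are assumed from bench.sh's usage text and may differ:

```sh
# Fetch the IMDB dataset (now with size verification and re-download on
# mismatch) and convert it to parquet, then run the Join Order Benchmark.
./benchmarks/bench.sh data imdb
./benchmarks/bench.sh run imdb
```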
