apache · alamb · Oct 26, 2024 · Oct 24, 2024 · Oct 25, 2024 · alamb
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -342,6 +342,34 @@ This benchmarks is derived from the [TPC-H][1] version
 [2]: https://github.com/databricks/tpch-dbgen.git,
 [2.17.1]: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.1.pdf
 
+## External Aggregation
+
+Run the benchmark for aggregations with limited memory.
+
+When the memory limit is exceeded, the intermediate aggregation results are spilled to disk and finally read back using sort-merge.
+
+The external aggregation benchmarks run several aggregation queries with different memory limits on the TPC-H `lineitem` table. The queries can be found in [`external_aggr.rs`](src/bin/external_aggr.rs).
+
+This benchmark is inspired by [DuckDB's external aggregation paper](https://hannes.muehleisen.org/publications/icde2024-out-of-core-kuiper-boncz-muehleisen.pdf), specifically Section VI.
+
+### External Aggregation Example Runs
+1. Run all queries with predefined memory limits:
+```bash
+# Under 'benchmarks/' directory
+cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json'
+```
+
+2. Run a query with a specific memory limit:
+```bash
+cargo run --release --bin external_aggr -- benchmark -n 4 --iterations 3 -p '....../data/tpch_sf1' -o '/tmp/aggr.json' --query 1 --memory-limit 30M
+```
+
+3. Run all queries with the `bench.sh` script:
+```bash
+./bench.sh data external_aggr
+./bench.sh run external_aggr
+```
+
 
 # Older Benchmarks
 

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
@@ -171,6 +171,10 @@ main() {
                 imdb)
                     data_imdb
                     ;;
+                external_aggr)
+                    # same data as for tpch
+                    data_tpch "1"
+                    ;;
                 *)
                     echo "Error: unknown benchmark '$BENCHMARK' for data generation"
                     usage
@@ -537,10 +541,12 @@ run_external_aggr() {
     echo "RESULTS_FILE: ${RESULTS_FILE}"
     echo "Running external aggregation benchmark..."
 
-    # Only parquet is supported
-    # External aggregation is not stable yet, set partitions to 4 to make sure
-    # this benchmark can always run.
-    $CARGO_COMMAND --bin external_aggr -- benchmark -n 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
+    # Only parquet is supported.
+    # Since the per-operator memory limit is calculated as (total-memory-limit /
+    # number-of-partitions), and by default `--partitions` is set to the number
+    # of CPU cores, we set a constant number of partitions to prevent this
+    # benchmark from failing on some machines.
+    $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
 }