From 6ce0cca60e0d9de8bacac7eb5ce9874d15a5d233 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 5 May 2026 12:28:05 +0100 Subject: [PATCH] Updated datafusion-vortex submission with up-to-date versions and new dedicated tool --- datafusion-vortex-partitioned/README.md | 43 +++++++++++ datafusion-vortex-partitioned/benchmark.sh | 3 +- datafusion-vortex-partitioned/check | 5 +- datafusion-vortex-partitioned/create.sql | 12 +-- datafusion-vortex-partitioned/data-size | 3 +- datafusion-vortex-partitioned/install | 61 +++++++-------- datafusion-vortex-partitioned/load | 20 ++++- datafusion-vortex-partitioned/make-json.sh | 41 ++++++++++ datafusion-vortex-partitioned/queries.sql | 4 +- datafusion-vortex-partitioned/query | 14 ++-- .../results/20260505/c6a.2xlarge.json | 57 ++++++++++++++ .../results/20260505/c6a.4xlarge.json | 57 ++++++++++++++ .../results/20260505/c6a.xlarge.json | 57 ++++++++++++++ .../results/20260505/c8g.4xlarge.json | 57 ++++++++++++++ datafusion-vortex/README.md | 43 +++++++++++ datafusion-vortex/benchmark.sh | 3 +- datafusion-vortex/check | 7 +- datafusion-vortex/create.sql | 13 +--- datafusion-vortex/data-size | 3 +- datafusion-vortex/install | 76 +++++++------------ datafusion-vortex/load | 17 +++-- datafusion-vortex/make-json.sh | 41 ++++++++++ datafusion-vortex/queries.sql | 4 +- datafusion-vortex/query | 16 ++-- .../results/20260505/c6a.2xlarge.json | 57 ++++++++++++++ .../results/20260505/c6a.4xlarge.json | 57 ++++++++++++++ .../results/20260505/c6a.xlarge.json | 57 ++++++++++++++ .../results/20260505/c8g.4xlarge.json | 57 ++++++++++++++ 28 files changed, 737 insertions(+), 148 deletions(-) create mode 100644 datafusion-vortex-partitioned/README.md create mode 100755 datafusion-vortex-partitioned/make-json.sh create mode 100644 datafusion-vortex-partitioned/results/20260505/c6a.2xlarge.json create mode 100644 datafusion-vortex-partitioned/results/20260505/c6a.4xlarge.json create mode 100644 
datafusion-vortex-partitioned/results/20260505/c6a.xlarge.json create mode 100644 datafusion-vortex-partitioned/results/20260505/c8g.4xlarge.json create mode 100644 datafusion-vortex/README.md create mode 100755 datafusion-vortex/make-json.sh create mode 100644 datafusion-vortex/results/20260505/c6a.2xlarge.json create mode 100644 datafusion-vortex/results/20260505/c6a.4xlarge.json create mode 100644 datafusion-vortex/results/20260505/c6a.xlarge.json create mode 100644 datafusion-vortex/results/20260505/c8g.4xlarge.json diff --git a/datafusion-vortex-partitioned/README.md b/datafusion-vortex-partitioned/README.md new file mode 100644 index 0000000000..4842587a95 --- /dev/null +++ b/datafusion-vortex-partitioned/README.md @@ -0,0 +1,43 @@ +# DataFusion + Vortex + +Partitioned Vortex dataset, converted one-for-one from the 100 ClickBench Parquet files and queried with [`vortex-datafusion-cli`]. + +[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli + +## Cookbook: Generate benchmark results + +Follow the same EC2 setup used by [datafusion-partitioned](../datafusion-partitioned/README.md), then run: + +```bash +cd ClickBench/datafusion-vortex-partitioned +bash benchmark.sh +``` + +The shared benchmark harness builds `vortex-datafusion-cli`, downloads the partitioned Parquet files, converts each `partitioned/hits_N.parquet` file into exactly one `vortex/hits_N.vortex` file, and runs the query set. + +The `install` script checks out `vortex-datafusion-cli` tag `0.70.0-53.1.0`. CLI tags use `<vortex-datafusion version>-<datafusion version>`, where the first component is the `vortex-datafusion` crate version and the second is the DataFusion/DataFusion CLI version. + +You can update/preview the results by running: + +```bash +./make-json.sh <machine> # Example.
./make-json.sh c6a.xlarge +``` + +## Parquet To Vortex Conversion + +Each input file is converted independently through `vortex-datafusion-cli`: + +```sql +CREATE EXTERNAL TABLE hits_parquet +STORED AS PARQUET +LOCATION 'partitioned/hits_0.parquet' +OPTIONS ('binary_as_string' 'true'); + +COPY ( + SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" + FROM hits_parquet +) TO 'vortex/hits_0.vortex' STORED AS VORTEX; +``` + +`binary_as_string=true` handles the incorrect Parquet logical annotation before Vortex is written. The produced Vortex files store those fields as strings, so benchmark reads use only the Vortex table registration. diff --git a/datafusion-vortex-partitioned/benchmark.sh b/datafusion-vortex-partitioned/benchmark.sh index 51153ec49e..e236dbbd21 100755 --- a/datafusion-vortex-partitioned/benchmark.sh +++ b/datafusion-vortex-partitioned/benchmark.sh @@ -1,7 +1,6 @@ #!/bin/bash # Thin shim — actual flow is in lib/benchmark-common.sh. -# query_bench (the vortex driver) handles its own dataset download/conversion. -export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned" export BENCH_DURABLE=yes export BENCH_RESTARTABLE=no exec ../lib/benchmark-common.sh diff --git a/datafusion-vortex-partitioned/check b/datafusion-vortex-partitioned/check index 1a9d479a70..98d385b378 100755 --- a/datafusion-vortex-partitioned/check +++ b/datafusion-vortex-partitioned/check @@ -1,6 +1,5 @@ #!/bin/bash set -e -# Stateless system — confirm datafusion-cli (the playground's query -# driver) is on PATH. 
-command -v datafusion-cli >/dev/null +DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli +"$DF" -q -c "SELECT 1" >/dev/null diff --git a/datafusion-vortex-partitioned/create.sql b/datafusion-vortex-partitioned/create.sql index 858646b651..e54d401ac9 100644 --- a/datafusion-vortex-partitioned/create.sql +++ b/datafusion-vortex-partitioned/create.sql @@ -1,9 +1,3 @@ -CREATE EXTERNAL TABLE hits_raw -STORED AS PARQUET -LOCATION 'partitioned' -OPTIONS ('binary_as_string' 'true'); - -CREATE VIEW hits AS -SELECT * EXCEPT ("EventDate"), - CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" -FROM hits_raw; +CREATE EXTERNAL TABLE hits +STORED AS VORTEX +LOCATION 'vortex'; diff --git a/datafusion-vortex-partitioned/data-size b/datafusion-vortex-partitioned/data-size index ec6675b0ed..a8311fb4bf 100755 --- a/datafusion-vortex-partitioned/data-size +++ b/datafusion-vortex-partitioned/data-size @@ -1,5 +1,4 @@ #!/bin/bash set -e -# Sum the byte counts of all generated .vortex files. -find . -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}' +find vortex -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}' diff --git a/datafusion-vortex-partitioned/install b/datafusion-vortex-partitioned/install index 6a24ecfe63..5d88f819f4 100755 --- a/datafusion-vortex-partitioned/install +++ b/datafusion-vortex-partitioned/install @@ -1,41 +1,36 @@ #!/bin/bash set -e -VORTEX_VERSION=0.44.0 +if [ ! -x vortex-datafusion-cli/target/release/vortex-datafusion-cli ]; then + # <vortex-datafusion version>-<datafusion version> + CLI_TAG=0.70.0-53.1.0 -if !
command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y + fi + export HOME=${HOME:=~} + # shellcheck disable=SC1091 + source "$HOME/.cargo/env" -sudo apt-get update -y -# vortex-duckdb's build.rs runs bindgen, which needs libclang plus the -# clang freestanding headers (stdbool.h etc.); without libclang-dev the -# build fails with `'stdbool.h' file not found`. -sudo apt-get install -y gcc jq build-essential git clang libclang-dev + if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi + fi -if [ ! -d vortex ]; then - git clone https://github.com/spiraldb/vortex.git -fi -( - cd vortex - git fetch --tags - git checkout "$VORTEX_VERSION" - # See datafusion-vortex/install — submodule update isn't idempotent - # without sync + --force when a previous run left a partial clone. - git submodule sync --recursive - git submodule update --init --recursive --force - cargo build --release --bin query_bench --package bench-vortex -) + sudo apt-get update -y + sudo apt-get install -y build-essential clang cmake git libclang-dev pkg-config -# Build datafusion-cli so ./query can return actual rows instead of -# the bench driver's JSON timing blob. See datafusion-vortex/install. -if ! command -v datafusion-cli >/dev/null 2>&1; then - cargo install --locked --version 49.0.2 datafusion-cli - # Cargo installs into $HOME/.cargo/bin; the playground agent - # runs scripts with a stripped PATH, so symlink into /usr/local/bin. - sudo ln -sf "$HOME/.cargo/bin/datafusion-cli" /usr/local/bin/datafusion-cli + if [ ! 
-d vortex-datafusion-cli ]; then + git clone https://github.com/vortex-data/vortex-datafusion-cli.git + fi + cd vortex-datafusion-cli + git fetch --tags + git checkout "$CLI_TAG" + CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \ + cargo build --release --bin vortex-datafusion-cli fi diff --git a/datafusion-vortex-partitioned/load b/datafusion-vortex-partitioned/load index 68db5b446c..c485043cb0 100755 --- a/datafusion-vortex-partitioned/load +++ b/datafusion-vortex-partitioned/load @@ -1,9 +1,23 @@ #!/bin/bash set -e -# ./query uses datafusion-cli against the partitioned parquet files -# under partitioned/. See datafusion-vortex/load for the rationale. +DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli + mkdir -p partitioned -../lib/download-hits-parquet-partitioned partitioned +mv hits_*.parquet partitioned/ 2>/dev/null || true + +rm -rf vortex +mkdir -p vortex + +seq 0 99 | xargs -P"$(nproc)" -I{} "$DF" -q \ + -c "SET datafusion.execution.target_partitions = 1;" \ + -c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION 'partitioned/hits_{}.parquet' OPTIONS ('binary_as_string' 'true');" \ + -c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO 'vortex/hits_{}.vortex' STORED AS VORTEX;" + +files=$(find vortex -maxdepth 1 -name 'hits_*.vortex' | wc -l) +if [ "$files" -ne 100 ]; then + echo "Expected 100 Vortex files, found $files" >&2 + exit 1 +fi sync diff --git a/datafusion-vortex-partitioned/make-json.sh b/datafusion-vortex-partitioned/make-json.sh new file mode 100755 index 0000000000..9ebfc16429 --- /dev/null +++ b/datafusion-vortex-partitioned/make-json.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# This script converts the raw `result.csv` data from `benchmark.sh` into the +# final json format used by the benchmark dashboard. 
+# +# usage : ./make-json.sh <machine> +# +# example (save results/<YYYYMMDD>/c6a.4xlarge.json) +# ./make-json.sh c6a.4xlarge + +MACHINE=$1 +DATE=$(date -u +%Y-%m-%d) +YYYYMMDD=${DATE//-/} +mkdir -p "results/${YYYYMMDD}" +OUTPUT_FILE="results/${YYYYMMDD}/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Vortex, partitioned)" +LOAD_TIME=${LOAD_TIME:-null} +DATA_SIZE=${DATA_SIZE:-$(./data-size 2>/dev/null || echo null)} +DATA_SIZE=${DATA_SIZE:-null} + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i<length(arr)) printf ",\n"}}' result.csv) + +# Form the final JSON structure +cat <<EOF > $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": $LOAD_TIME, + "data_size": $DATA_SIZE, + "result": [ + $RESULT_ARRAY + ] +} +EOF diff --git a/datafusion-vortex-partitioned/queries.sql b/datafusion-vortex-partitioned/queries.sql index 0c30150ef6..9a183cd6e2 100644 --- a/datafusion-vortex-partitioned/queries.sql +++ b/datafusion-vortex-partitioned/queries.sql @@ -16,7 +16,7 @@ SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPh SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; -SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; SELECT COUNT(*)
FROM hits WHERE "URL" LIKE '%google%'; SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; @@ -40,4 +40,4 @@ SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventD SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY 
DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/datafusion-vortex-partitioned/query b/datafusion-vortex-partitioned/query index 090d16b735..2e4df9d6a9 100755 --- a/datafusion-vortex-partitioned/query +++ b/datafusion-vortex-partitioned/query @@ -1,17 +1,16 @@ #!/bin/bash -# Reads a SQL query from stdin, runs it via datafusion-cli against the -# partitioned parquet files. See ../datafusion-vortex/query for the -# rationale; the vortex bench binary is benchmark-only. -# Stdout: query result. -# Stderr: query runtime in fractional seconds on the last line. +# Reads a SQL query from stdin and runs it via vortex-datafusion-cli. +# Stdout: query result. Stderr: query runtime in fractional seconds. set -e +DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli + query=$(cat) -tmp=$(mktemp /tmp/datafusion.XXXXXX.sql) +tmp=$(mktemp /tmp/datafusion-vortex.XXXXXX.sql) trap 'rm -f "$tmp"' EXIT printf '%s\n' "$query" > "$tmp" -out=$(datafusion-cli -f create.sql "$tmp" 2>&1) && status=0 || status=$? +out=$("$DF" -f create.sql -f "$tmp" 2>&1) && status=0 || status=$? 
if [ "$status" -ne 0 ]; then printf '%s\n' "$out" >&2 @@ -19,5 +18,4 @@ if [ "$status" -ne 0 ]; then fi printf '%s\n' "$out" | grep -v 'Elapsed' || true - printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2 diff --git a/datafusion-vortex-partitioned/results/20260505/c6a.2xlarge.json b/datafusion-vortex-partitioned/results/20260505/c6a.2xlarge.json new file mode 100644 index 0000000000..77659d69c6 --- /dev/null +++ b/datafusion-vortex-partitioned/results/20260505/c6a.2xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c6a.2xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 108.58, + "data_size": 15328662856, + "result": [ + [0.078,0.002,0.002], + [0.170,0.027,0.028], + [0.214,0.072,0.069], + [0.650,0.071,0.070], + [1.367,0.840,0.837], + [1.355,0.785,0.786], + [0.075,0.002,0.002], + [0.179,0.031,0.033], + [1.269,1.045,1.044], + [1.670,1.251,1.237], + [0.770,0.162,0.156], + [1.099,0.192,0.191], + [1.603,0.694,0.682], + [3.280,1.199,1.198], + [1.395,0.667,0.670], + [1.108,0.948,0.942], + [3.158,1.772,1.779], + [3.122,1.778,1.785], + [5.120,3.531,3.507], + [0.313,0.043,0.048], + [15.675,0.906,0.901], + [17.831,0.929,0.926], + [22.767,1.105,1.103], + [22.582,1.521,1.574], + [0.319,0.079,0.075], + [1.603,0.146,0.147], + [0.609,0.081,0.082], + [16.343,1.328,1.365], + [15.637,15.200,15.211], + [0.814,0.656,0.668], + [2.796,0.592,0.595], + [5.885,0.641,0.629], + [3.929,3.007,3.019], + [16.025,3.545,3.512], + [15.999,3.567,3.503], + [1.455,1.293,1.298], + [0.254,0.074,0.073], + [0.203,0.034,0.034], + [0.243,0.024,0.022], + [0.386,0.130,0.129], + [0.247,0.019,0.016], + [0.249,0.015,0.015], + [0.242,0.014,0.015] + ] +} \ No newline at end of file diff --git a/datafusion-vortex-partitioned/results/20260505/c6a.4xlarge.json 
b/datafusion-vortex-partitioned/results/20260505/c6a.4xlarge.json new file mode 100644 index 0000000000..cb9a10682a --- /dev/null +++ b/datafusion-vortex-partitioned/results/20260505/c6a.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 99.94, + "data_size": 15328662856, + "result": [ + [0.082, 0.002, 0.002], + [0.152, 0.030, 0.027], + [0.175, 0.059, 0.060], + [0.633, 0.087, 0.089], + [1.306, 0.628, 0.626], + [1.316, 0.603, 0.592], + [0.090, 0.002, 0.002], + [0.166, 0.031, 0.030], + [1.223, 0.786, 0.772], + [1.664, 0.861, 0.871], + [0.729, 0.131, 0.133], + [1.116, 0.148, 0.147], + [1.605, 0.581, 0.578], + [3.174, 1.070, 1.068], + [1.527, 0.610, 0.597], + [0.887, 0.727, 0.715], + [3.174, 1.509, 1.532], + [3.153, 1.510, 1.506], + [4.788, 2.907, 2.827], + [0.313, 0.048, 0.049], + [15.848, 0.537, 0.528], + [17.859, 0.781, 0.772], + [22.900, 0.894, 0.878], + [21.132, 0.858, 0.785], + [0.287, 0.084, 0.091], + [1.607, 0.153, 0.146], + [0.757, 0.088, 0.088], + [16.230, 0.974, 1.011], + [13.690, 8.122, 8.151], + [0.512, 0.369, 0.364], + [2.788, 0.487, 0.487], + [5.882, 0.592, 0.589], + [3.929, 2.653, 2.674], + [15.934, 3.107, 2.890], + [15.951, 2.902, 2.918], + [1.060, 0.930, 0.917], + [0.261, 0.082, 0.082], + [0.208, 0.035, 0.036], + [0.197, 0.024, 0.023], + [0.385, 0.146, 0.146], + [0.251, 0.017, 0.016], + [0.248, 0.019, 0.014], + [0.244, 0.015, 0.015] + ] +} diff --git a/datafusion-vortex-partitioned/results/20260505/c6a.xlarge.json b/datafusion-vortex-partitioned/results/20260505/c6a.xlarge.json new file mode 100644 index 0000000000..162555df72 --- /dev/null +++ b/datafusion-vortex-partitioned/results/20260505/c6a.xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": 
"c6a.xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 203.96, + "data_size": 15328662856, + "result": [ + [0.094, 0.002, 0.002], + [0.255, 0.048, 0.045], + [0.372, 0.126, 0.126], + [0.646, 0.104, 0.102], + [1.758, 1.538, 1.542], + [1.531, 1.347, 1.361], + [0.076, 0.002, 0.002], + [0.269, 0.059, 0.055], + [2.173, 1.919, 1.895], + [2.624, 2.323, 2.232], + [0.781, 0.277, 0.277], + [0.881, 0.347, 0.351], + [1.571, 1.092, 1.096], + [3.396, 1.602, 1.609], + [1.666, 1.054, 1.063], + [1.921, 1.705, 1.688], + [3.797, 3.020, 3.037], + [3.719, 3.013, 3.018], + [12.789, 13.580, 8.736], + [0.363, 0.068, 0.067], + [15.387, 1.708, 1.698], + [17.891, 1.673, 1.663], + [22.657, 1.888, 1.923], + [18.103, 2.159, 2.178], + [0.339, 0.105, 0.109], + [1.313, 0.246, 0.249], + [0.351, 0.117, 0.119], + [16.247, 2.395, 2.436], + [29.179, 28.872, 28.767], + [1.477, 1.302, 1.294], + [2.846, 0.978, 0.975], + [5.908, 0.972, 0.956], + [29.987, 6.008, 8.928], + [18.597, 23.579, 21.585], + [24.180, 18.393, 21.401], + [2.550, 2.433, 2.394], + [0.309, 0.081, 0.079], + [0.255, 0.036, 0.036], + [0.255, 0.032, 0.029], + [0.412, 0.140, 0.138], + [0.243, 0.017, 0.017], + [0.237, 0.017, 0.018], + [0.235, 0.017, 0.016] + ] +} diff --git a/datafusion-vortex-partitioned/results/20260505/c8g.4xlarge.json b/datafusion-vortex-partitioned/results/20260505/c8g.4xlarge.json new file mode 100644 index 0000000000..2838bd3576 --- /dev/null +++ b/datafusion-vortex-partitioned/results/20260505/c8g.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c8g.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 107.21, + "data_size": 15329147920, + "result": [ + [0.047, 0.001, 0.001], + [0.092, 0.015, 0.015], + [0.139, 0.038, 0.036], 
+ [0.614, 0.032, 0.032], + [1.347, 0.233, 0.233], + [1.323, 0.236, 0.237], + [0.045, 0.001, 0.001], + [0.100, 0.018, 0.017], + [1.034, 0.311, 0.315], + [1.591, 0.577, 0.555], + [0.664, 0.060, 0.061], + [1.381, 0.074, 0.075], + [1.700, 0.214, 0.223], + [2.986, 0.342, 0.355], + [1.295, 0.222, 0.219], + [0.939, 0.257, 0.252], + [2.847, 0.512, 0.510], + [2.693, 0.508, 0.506], + [3.899, 0.980, 0.966], + [0.279, 0.020, 0.022], + [15.898, 0.469, 0.469], + [17.882, 0.303, 0.302], + [23.013, 0.611, 0.373], + [19.679, 0.519, 0.519], + [0.285, 0.037, 0.036], + [1.842, 0.055, 0.057], + [1.036, 0.033, 0.041], + [16.514, 0.451, 0.451], + [13.751, 6.440, 6.440], + [0.434, 0.345, 0.344], + [2.739, 0.191, 0.192], + [6.081, 0.198, 0.190], + [4.114, 0.772, 0.776], + [15.880, 1.048, 1.045], + [15.887, 1.044, 1.054], + [0.561, 0.449, 0.446], + [0.166, 0.054, 0.053], + [0.139, 0.031, 0.031], + [0.145, 0.016, 0.014], + [0.263, 0.106, 0.105], + [0.147, 0.014, 0.013], + [0.138, 0.012, 0.012], + [0.128, 0.014, 0.014] + ] +} diff --git a/datafusion-vortex/README.md b/datafusion-vortex/README.md new file mode 100644 index 0000000000..ed45008536 --- /dev/null +++ b/datafusion-vortex/README.md @@ -0,0 +1,43 @@ +# DataFusion + Vortex + +Single-file Vortex dataset, converted from the ClickBench Parquet file and queried with [`vortex-datafusion-cli`]. + +[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli + +## Cookbook: Generate benchmark results + +Follow the same EC2 setup used by [datafusion](../datafusion/README.md), then run: + +```bash +cd ClickBench/datafusion-vortex +bash benchmark.sh +``` + +The shared benchmark harness builds `vortex-datafusion-cli`, downloads `hits.parquet`, converts it to `vortex/hits.vortex`, and runs the query set. + +The `install` script checks out `vortex-datafusion-cli` tag `0.70.0-53.1.0`. CLI tags use `<vortex-datafusion version>-<datafusion version>`, where the first component is the `vortex-datafusion` crate version and the second is the DataFusion/DataFusion CLI version.
+ +You can update/preview the results by running: + +```bash +./make-json.sh <machine> # Example. ./make-json.sh c6a.xlarge +``` + +## Parquet To Vortex Conversion + +The conversion intentionally goes through the DataFusion CLI path: + +```sql +CREATE EXTERNAL TABLE hits_parquet +STORED AS PARQUET +LOCATION 'hits.parquet' +OPTIONS ('binary_as_string' 'true'); + +COPY ( + SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" + FROM hits_parquet +) TO 'vortex/hits.vortex' STORED AS VORTEX; +``` + +`binary_as_string=true` handles the ClickBench Parquet byte/string mismatch before Vortex is written. The resulting Vortex file stores those columns as strings, so Vortex reads do not need the Parquet-only option. diff --git a/datafusion-vortex/benchmark.sh b/datafusion-vortex/benchmark.sh index dce465f2fc..617422ddc2 100755 --- a/datafusion-vortex/benchmark.sh +++ b/datafusion-vortex/benchmark.sh @@ -1,7 +1,6 @@ #!/bin/bash # Thin shim — actual flow is in lib/benchmark-common.sh. -# clickbench (the vortex driver) handles its own dataset download/conversion. -export BENCH_DOWNLOAD_SCRIPT="" +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=yes export BENCH_RESTARTABLE=no exec ../lib/benchmark-common.sh diff --git a/datafusion-vortex/check b/datafusion-vortex/check index bc7a2938f9..98d385b378 100755 --- a/datafusion-vortex/check +++ b/datafusion-vortex/check @@ -1,8 +1,5 @@ #!/bin/bash set -e -# Stateless system — confirm datafusion-cli (the playground's query -# driver) is built. The vortex bench binary (used by benchmark.sh, -# not by ./query) was renamed from `clickbench` to `query_bench` -# upstream, so the old check would always fail post-install.
-command -v datafusion-cli >/dev/null +DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli +"$DF" -q -c "SELECT 1" >/dev/null diff --git a/datafusion-vortex/create.sql b/datafusion-vortex/create.sql index 8efc6dea74..a5cfc6c05d 100644 --- a/datafusion-vortex/create.sql +++ b/datafusion-vortex/create.sql @@ -1,10 +1,3 @@ -CREATE EXTERNAL TABLE hits_raw -STORED AS PARQUET -LOCATION 'hits.parquet' -OPTIONS ('binary_as_string' 'true'); - - -CREATE VIEW hits AS -SELECT * EXCEPT ("EventDate"), - CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" -FROM hits_raw; +CREATE EXTERNAL TABLE hits +STORED AS VORTEX +LOCATION 'vortex/hits.vortex'; diff --git a/datafusion-vortex/data-size b/datafusion-vortex/data-size index ec6675b0ed..a8311fb4bf 100755 --- a/datafusion-vortex/data-size +++ b/datafusion-vortex/data-size @@ -1,5 +1,4 @@ #!/bin/bash set -e -# Sum the byte counts of all generated .vortex files. -find . -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}' +find vortex -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}' diff --git a/datafusion-vortex/install b/datafusion-vortex/install index 057159ae6d..5d88f819f4 100755 --- a/datafusion-vortex/install +++ b/datafusion-vortex/install @@ -1,56 +1,36 @@ #!/bin/bash set -e -# 0.34.0 referenced two private spiraldb-owned submodules -# (spiraldb/duckdb and spiraldb/duckdb-rs) under duckdb-vortex/, which -# now 404 on GitHub. From 0.41.0 onward the duckdb dep moved to the -# upstream duckdb/duckdb repo, and 0.42.0+ ship without a .gitmodules -# file at all (vendored / Cargo registry deps). 0.44.0 matches what -# datafusion-vortex-partitioned uses. -VORTEX_VERSION=0.44.0 +if [ ! -x vortex-datafusion-cli/target/release/vortex-datafusion-cli ]; then + # <vortex-datafusion version>-<datafusion version> + CLI_TAG=0.70.0-53.1.0 -if !
command -v cargo >/dev/null 2>&1; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh - bash rust-init.sh -y -fi -export HOME=${HOME:=~} -# shellcheck disable=SC1091 -source ~/.cargo/env + if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y + fi + export HOME=${HOME:=~} + # shellcheck disable=SC1091 + source "$HOME/.cargo/env" -sudo apt-get update -y -# vortex-duckdb's build.rs runs bindgen, which needs libclang plus the -# clang freestanding headers (stdbool.h etc.); without libclang-dev the -# build fails with `'stdbool.h' file not found`. -sudo apt-get install -y gcc jq build-essential git clang libclang-dev + if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + fi + fi -if [ ! -d vortex ]; then - git clone https://github.com/spiraldb/vortex.git -fi -( - cd vortex - git fetch --tags - git checkout "$VORTEX_VERSION" - # `git submodule update --init` fails with - # "fatal: destination path 'duckdb-vortex/duckdb' exists and is not an - # empty directory" once the submodule has been cloned but isn't fully - # registered (a partial state previous runs leave behind). `sync` - # refreshes the configured URLs and `--force` re-checkouts cleanly, - # which is what we want for an idempotent setup. - git submodule sync --recursive - git submodule update --init --recursive --force - # Upstream renamed the `clickbench` bin to `query_bench` — match - # the partitioned variant's install. - cargo build --release --bin query_bench --package bench-vortex -) + sudo apt-get update -y + sudo apt-get install -y build-essential clang cmake git libclang-dev pkg-config -# Build datafusion-cli so ./query can return actual rows instead of -# the bench driver's JSON timing blob. 
The benchmark binary above -# emits gh-json stats only; for an interactive playground we want a -# proper SQL client. datafusion-cli reads parquet directly — close -# enough to "datafusion (parquet)" semantics for the playground. -if ! command -v datafusion-cli >/dev/null 2>&1; then - cargo install --locked --version 49.0.2 datafusion-cli - # Cargo installs into $HOME/.cargo/bin; the playground agent - # runs scripts with a stripped PATH, so symlink into /usr/local/bin. - sudo ln -sf "$HOME/.cargo/bin/datafusion-cli" /usr/local/bin/datafusion-cli + if [ ! -d vortex-datafusion-cli ]; then + git clone https://github.com/vortex-data/vortex-datafusion-cli.git + fi + cd vortex-datafusion-cli + git fetch --tags + git checkout "$CLI_TAG" + CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \ + cargo build --release --bin vortex-datafusion-cli fi diff --git a/datafusion-vortex/load b/datafusion-vortex/load index 4fed22b64e..de2a689823 100755 --- a/datafusion-vortex/load +++ b/datafusion-vortex/load @@ -1,12 +1,15 @@ #!/bin/bash set -e -# The ./query path uses datafusion-cli reading hits.parquet directly, -# so make sure that file is in CWD. (The shared bench_download already -# stages it; we don't need vortex's clickbench warmup for playground -# query output.) -if [ ! -e hits.parquet ]; then - ../lib/download-hits-parquet-single . 
-fi +DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli +rm -rf vortex +mkdir -p vortex + +"$DF" -q \ + -c "SET datafusion.execution.target_partitions = 1;" \ + -c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION 'hits.parquet' OPTIONS ('binary_as_string' 'true');" \ + -c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO 'vortex/hits.vortex' STORED AS VORTEX;" + +test -f vortex/hits.vortex sync diff --git a/datafusion-vortex/make-json.sh b/datafusion-vortex/make-json.sh new file mode 100755 index 0000000000..d19a3ddfab --- /dev/null +++ b/datafusion-vortex/make-json.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# This script converts the raw `result.csv` data from `benchmark.sh` into the +# final json format used by the benchmark dashboard. +# +# usage : ./make-json.sh <machine> +# +# example ./make-json.sh c6a.4xlarge # saves results/<YYYYMMDD>/c6a.4xlarge.json +# + +MACHINE=$1 +DATE=$(date -u +%Y-%m-%d) +YYYYMMDD=${DATE//-/} +mkdir -p "results/${YYYYMMDD}" +OUTPUT_FILE="results/${YYYYMMDD}/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Vortex, single)" +LOAD_TIME=${LOAD_TIME:-null} +DATA_SIZE=${DATA_SIZE:-$(./data-size 2>/dev/null || echo null)} +DATA_SIZE=${DATA_SIZE:-null} + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i<length(arr)) printf ",\n"}}' result.csv) + +# Form the final JSON structure +cat <<EOF > $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": $LOAD_TIME, + "data_size": $DATA_SIZE, + "result": [ + $RESULT_ARRAY + ] +} +EOF diff --git a/datafusion-vortex/queries.sql b/datafusion-vortex/queries.sql index 0c30150ef6..9a183cd6e2 100644 --- a/datafusion-vortex/queries.sql +++ b/datafusion-vortex/queries.sql @@ -16,7 +16,7 @@ SELECT
"SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPh SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; -SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; @@ -40,4 +40,4 @@ SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventD SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= 
'2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/datafusion-vortex/query b/datafusion-vortex/query index cbd082fae5..2e4df9d6a9 100755 --- a/datafusion-vortex/query +++ b/datafusion-vortex/query @@ -1,19 +1,16 @@ #!/bin/bash -# Reads a SQL query from stdin, runs it via datafusion-cli against the -# parquet file. The vortex bench binary (built in ./install for the -# benchmark.sh path) only emits gh-json timing blobs and has no -# result-printing mode, so for the interactive playground we use -# datafusion-cli directly — matching datafusion (Parquet). -# Stdout: query result. -# Stderr: query runtime in fractional seconds on the last line. +# Reads a SQL query from stdin and runs it via vortex-datafusion-cli. +# Stdout: query result. Stderr: query runtime in fractional seconds. set -e +DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli + query=$(cat) -tmp=$(mktemp /tmp/datafusion.XXXXXX.sql) +tmp=$(mktemp /tmp/datafusion-vortex.XXXXXX.sql) trap 'rm -f "$tmp"' EXIT printf '%s\n' "$query" > "$tmp" -out=$(datafusion-cli -f create.sql "$tmp" 2>&1) && status=0 || status=$? +out=$("$DF" -f create.sql -f "$tmp" 2>&1) && status=0 || status=$? 
if [ "$status" -ne 0 ]; then printf '%s\n' "$out" >&2 @@ -21,5 +18,4 @@ if [ "$status" -ne 0 ]; then fi printf '%s\n' "$out" | grep -v 'Elapsed' || true - printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2 diff --git a/datafusion-vortex/results/20260505/c6a.2xlarge.json b/datafusion-vortex/results/20260505/c6a.2xlarge.json new file mode 100644 index 0000000000..1b2f6ae659 --- /dev/null +++ b/datafusion-vortex/results/20260505/c6a.2xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c6a.2xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 146.68, + "data_size": 15269997296, + "result": [ + [0.099,0.001,0.001], + [0.298,0.114,0.116], + [0.331,0.161,0.160], + [0.589,0.187,0.190], + [1.180,0.942,0.942], + [1.442,0.953,0.911], + [0.075,0.001,0.001], + [0.275,0.125,0.119], + [1.401,1.153,1.151], + [1.698,1.331,1.357], + [0.707,0.257,0.248], + [0.774,0.272,0.285], + [1.316,0.786,0.809], + [3.030,1.094,1.300], + [1.613,0.761,0.777], + [1.313,1.055,1.061], + [3.449,1.902,1.863], + [3.430,1.894,1.892], + [5.018,3.543,3.527], + [0.382,0.168,0.170], + [15.322,1.071,1.049], + [18.207,1.296,1.316], + [21.967,7.898,6.495], + [0.111,0.118,0.117], + [3.054,0.329,0.334], + [1.231,0.297,0.274], + [2.308,0.318,0.319], + [16.213,1.628,1.613], + [17.861,17.254,17.424], + [0.997,0.832,0.772], + [2.710,0.720,0.738], + [5.799,0.783,0.792], + [4.209,3.275,3.257], + [16.457,3.560,3.559], + [16.425,3.573,3.531], + [1.514,1.348,1.355], + [0.352,0.183,0.186], + [0.291,0.136,0.130], + [0.299,0.139,0.138], + [0.444,0.267,0.271], + [0.284,0.115,0.116], + [0.331,0.116,0.116], + [0.331,0.115,0.111] + ] +} \ No newline at end of file diff --git a/datafusion-vortex/results/20260505/c6a.4xlarge.json b/datafusion-vortex/results/20260505/c6a.4xlarge.json new file mode 100644 index 0000000000..6aa75ba3d7 --- 
/dev/null +++ b/datafusion-vortex/results/20260505/c6a.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": null, + "data_size": 15269997296, + "result": [ + [0.100, 0.001, 0.001], + [0.318, 0.142, 0.142], + [0.305, 0.173, 0.170], + [0.622, 0.223, 0.222], + [0.966, 0.743, 0.740], + [1.415, 0.805, 0.806], + [0.078, 0.001, 0.001], + [0.300, 0.146, 0.147], + [1.390, 0.907, 0.903], + [1.811, 0.964, 1.022], + [0.740, 0.244, 0.240], + [0.860, 0.251, 0.254], + [1.506, 0.763, 0.755], + [3.031, 1.076, 1.073], + [1.519, 0.785, 0.791], + [1.056, 0.839, 0.841], + [3.343, 1.647, 1.646], + [3.329, 1.633, 1.635], + [4.732, 2.774, 2.966], + [0.380, 0.193, 0.195], + [15.839, 0.827, 0.833], + [18.245, 1.367, 1.365], + [22.015, 1.433, 1.427], + [57.863, 55.932, 56.962], + [2.468, 0.360, 0.355], + [1.275, 0.316, 0.312], + [2.342, 0.353, 0.360], + [16.237, 1.416, 1.438], + [14.892, 8.576, 9.159], + [0.579, 0.445, 0.479], + [2.761, 0.621, 0.630], + [5.838, 0.722, 0.714], + [5.407, 2.822, 2.790], + [16.400, 3.074, 3.087], + [16.436, 3.055, 3.110], + [1.150, 0.974, 0.969], + [0.383, 0.198, 0.199], + [0.332, 0.159, 0.151], + [0.336, 0.166, 0.161], + [0.482, 0.300, 0.303], + [0.323, 0.151, 0.148], + [0.313, 0.140, 0.145], + [0.301, 0.142, 0.138] + ] +} diff --git a/datafusion-vortex/results/20260505/c6a.xlarge.json b/datafusion-vortex/results/20260505/c6a.xlarge.json new file mode 100644 index 0000000000..96212fe5c3 --- /dev/null +++ b/datafusion-vortex/results/20260505/c6a.xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c6a.xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 214.48, + 
"data_size": 15269997296, + "result": [ + [0.095,0.001,0.001], + [0.294,0.118,0.118], + [0.368,0.207,0.204], + [0.606,0.202,0.207], + [1.783,1.602,1.609], + [1.597,1.440,1.435], + [0.076,0.001,0.001], + [0.266,0.127,0.127], + [2.126,1.966,1.952], + [2.343,2.128,2.112], + [0.715,0.351,0.343], + [0.767,0.405,0.403], + [1.625,1.201,1.196], + [3.496,1.662,1.918], + [1.746,1.138,1.141], + [1.917,1.756,1.766], + [3.860,3.092,3.051], + [3.811,3.068,3.039], + [14.188,14.738,15.730], + [0.407,0.167,0.185], + [15.198,2.291,2.160], + [30.110,36.377,35.010], + [42.205,40.162,38.771], + [0.144,0.144,0.143], + [2.343,0.441,0.430], + [1.271,0.416,0.378], + [2.310,0.463,0.474], + [28.792,28.452,28.469], + [39.856,38.212,39.387], + [1.448,1.290,1.289], + [2.854,1.035,1.040], + [5.820,1.052,1.035], + [21.409,24.075,8.592], + [30.597,33.675,38.120], + [26.918,21.302,19.043], + [2.485,2.311,2.292], + [0.410,0.194,0.182], + [0.346,0.126,0.123], + [0.352,0.142,0.141], + [0.543,0.303,0.280], + [0.331,0.110,0.110], + [0.310,0.108,0.114], + [0.316,0.101,0.107] + ] +} \ No newline at end of file diff --git a/datafusion-vortex/results/20260505/c8g.4xlarge.json b/datafusion-vortex/results/20260505/c8g.4xlarge.json new file mode 100644 index 0000000000..ac55cb45ff --- /dev/null +++ b/datafusion-vortex/results/20260505/c8g.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c8g.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 70.42, + "data_size": 15269901360, + "result": [ + [0.058, 0.001, 0.001], + [0.221, 0.121, 0.123], + [0.224, 0.125, 0.133], + [0.567, 0.128, 0.131], + [1.030, 0.314, 0.311], + [1.231, 0.402, 0.407], + [0.052, 0.001, 0.001], + [0.224, 0.127, 0.122], + [1.063, 0.396, 0.404], + [1.639, 0.566, 0.609], + [0.655, 0.151, 0.167], + [0.992, 0.161, 0.160], + [1.503, 0.348, 0.354], + [2.757, 0.413, 0.412], 
+ [1.434, 0.355, 0.379], + [0.686, 0.339, 0.341], + [2.707, 0.636, 0.597], + [2.697, 0.620, 0.626], + [3.912, 0.998, 1.044], + [0.339, 0.140, 0.149], + [15.985, 0.618, 0.614], + [18.192, 0.523, 0.528], + [21.971, 0.617, 0.619], + [57.869, 57.269, 56.779], + [2.600, 0.193, 0.198], + [1.413, 0.176, 0.173], + [2.498, 0.196, 0.200], + [16.312, 0.682, 0.701], + [15.384, 6.855, 7.000], + [0.484, 0.403, 0.383], + [2.618, 0.305, 0.300], + [5.743, 0.305, 0.311], + [3.820, 0.837, 0.832], + [16.154, 1.232, 1.247], + [16.173, 1.246, 1.170], + [0.666, 0.468, 0.468], + [0.282, 0.158, 0.154], + [0.255, 0.135, 0.140], + [0.261, 0.151, 0.147], + [0.405, 0.274, 0.281], + [0.240, 0.123, 0.128], + [0.237, 0.125, 0.122], + [0.231, 0.127, 0.125] + ] +}