Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions datafusion-vortex-partitioned/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# DataFusion + Vortex

Partitioned Vortex dataset, converted one-for-one from the 100 ClickBench Parquet files and queried with [`vortex-datafusion-cli`].

[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli

## Cookbook: Generate benchmark results

Follow the same EC2 setup used by [datafusion-partitioned](../datafusion-partitioned/README.md), then run:

```bash
cd ClickBench/datafusion-vortex-partitioned
bash benchmark.sh
```

The shared benchmark harness builds `vortex-datafusion-cli`, downloads the partitioned Parquet files, converts each `partitioned/hits_N.parquet` file into exactly one `vortex/hits_N.vortex` file, and runs the query set.

The `install` script checks out `vortex-datafusion-cli` tag `0.70.0-53.1.0`. CLI tags use `<vortex-version>-<df-version>`, where the first component is the `vortex-datafusion` crate version and the second is the DataFusion/DataFusion CLI version.

You can update/preview the results by running:

```bash
./make-json.sh <machine-name>  # Example: ./make-json.sh c6a.xlarge
```

## Parquet To Vortex Conversion

Each input file is converted independently through `vortex-datafusion-cli`:

```sql
CREATE EXTERNAL TABLE hits_parquet
STORED AS PARQUET
LOCATION 'partitioned/hits_0.parquet'
OPTIONS ('binary_as_string' 'true');

COPY (
SELECT * EXCEPT ("EventDate"),
CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
FROM hits_parquet
) TO 'vortex/hits_0.vortex' STORED AS VORTEX;
```

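`binary_as_string=true` makes DataFusion read the Parquet binary columns as strings, which works around the incorrect logical-type annotation in the source files before the Vortex files are written. In the same step, `EventDate` is cast through `INTEGER` to `DATE`. The produced Vortex files store those columns as strings, so benchmark queries need only the Vortex table registration in `create.sql`.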
3 changes: 1 addition & 2 deletions datafusion-vortex-partitioned/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash
# Thin shim — actual flow is in lib/benchmark-common.sh.
#
# The variables below are exported for that shared harness:
#   BENCH_DOWNLOAD_SCRIPT  name of the shared download helper that fetches the
#                          partitioned Parquet files (./load converts them to
#                          Vortex afterwards)
#   BENCH_DURABLE / BENCH_RESTARTABLE
#                          harness switches; their exact meaning is defined in
#                          lib/benchmark-common.sh
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-partitioned"
export BENCH_DURABLE=yes
export BENCH_RESTARTABLE=no
exec ../lib/benchmark-common.sh
5 changes: 2 additions & 3 deletions datafusion-vortex-partitioned/check
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/bin/bash
set -e

# Smoke test: run a trivial query through the locally built
# vortex-datafusion-cli. Under `set -e`, a missing or broken binary makes
# this script exit non-zero.
DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli
"$DF" -q -c "SELECT 1" >/dev/null
12 changes: 3 additions & 9 deletions datafusion-vortex-partitioned/create.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
-- Register the directory of converted Vortex files as the `hits` table.
-- The string and EventDate conversions are applied when the Vortex files are
-- written, so no Parquet table or view is needed at query time.
CREATE EXTERNAL TABLE hits
STORED AS VORTEX
LOCATION 'vortex';
3 changes: 1 addition & 2 deletions datafusion-vortex-partitioned/data-size
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/bin/bash
set -e

# Print the dataset size in bytes: the sum of the sizes of every *.vortex file
# under vortex/. Exactly one number is printed so callers can capture it with
# $(./data-size). Restricting the search to vortex/ keeps files elsewhere in
# the working directory (e.g. inside the CLI checkout) out of the total.
# `s+0` makes awk print 0 instead of an empty line when nothing matches.
find vortex -name '*.vortex' -printf '%s\n' | awk '{s+=$1} END {print s+0}'
61 changes: 28 additions & 33 deletions datafusion-vortex-partitioned/install
Original file line number Diff line number Diff line change
@@ -1,41 +1,36 @@
#!/bin/bash
# Build vortex-datafusion-cli from source at a pinned tag.
# The whole script is skipped when the release binary already exists, so
# re-running it is cheap.
set -e

if [ ! -x vortex-datafusion-cli/target/release/vortex-datafusion-cli ]; then
    # <vortex-version>-<datafusion-version>
    CLI_TAG=0.70.0-53.1.0

    # Install a Rust toolchain via rustup only when cargo is not available.
    if ! command -v cargo >/dev/null 2>&1; then
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
        bash rust-init.sh -y
    fi
    # Make sure HOME is set (falls back to ~) before sourcing the cargo env.
    export HOME=${HOME:=~}
    # shellcheck disable=SC1091
    source "$HOME/.cargo/env"

    # With less than 12 GiB of RAM and no active swap, add an 8G swapfile —
    # presumably so the LTO release build below does not run out of memory.
    if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then
        if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then
            sudo fallocate -l 8G /swapfile
            sudo chmod 600 /swapfile
            sudo mkswap /swapfile
            sudo swapon /swapfile
        fi
    fi

    sudo apt-get update -y
    sudo apt-get install -y build-essential clang cmake git libclang-dev pkg-config

    # Reuse an existing checkout; only clone on the first run.
    if [ ! -d vortex-datafusion-cli ]; then
        git clone https://github.com/vortex-data/vortex-datafusion-cli.git
    fi
    cd vortex-datafusion-cli
    git fetch --tags
    git checkout "$CLI_TAG"
    # LTO + a single codegen unit: slower build, more optimized binary.
    CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \
        cargo build --release --bin vortex-datafusion-cli
fi
20 changes: 17 additions & 3 deletions datafusion-vortex-partitioned/load
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
#!/bin/bash
# Convert the partitioned ClickBench Parquet files to Vortex, producing exactly
# one vortex/hits_N.vortex for each partitioned/hits_N.parquet (N = 0..99).
set -e

DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli

# The Parquet files are fetched by the benchmark harness (see
# BENCH_DOWNLOAD_SCRIPT in benchmark.sh), presumably into the current
# directory. Move them under partitioned/; the mv is allowed to fail so a
# re-run with the files already in place still works.
mkdir -p partitioned
mv hits_*.parquet partitioned/ 2>/dev/null || true

# Fail fast with a clear message instead of 100 separate conversion errors.
inputs=$(find partitioned -maxdepth 1 -name 'hits_*.parquet' | wc -l)
if [ "$inputs" -ne 100 ]; then
    echo "Expected 100 Parquet files in partitioned/, found $inputs" >&2
    exit 1
fi

# Always rebuild the Vortex directory from scratch.
rm -rf vortex
mkdir -p vortex

# One CLI process per input file, $(nproc) at a time. Each process is limited
# to a single DataFusion partition so every input yields exactly one output
# file. xargs exits non-zero if any conversion fails, which aborts the script
# under `set -e`.
seq 0 99 | xargs -P"$(nproc)" -I{} "$DF" -q \
    -c "SET datafusion.execution.target_partitions = 1;" \
    -c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION 'partitioned/hits_{}.parquet' OPTIONS ('binary_as_string' 'true');" \
    -c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO 'vortex/hits_{}.vortex' STORED AS VORTEX;"

# Verify the one-for-one conversion actually produced all 100 files.
files=$(find vortex -maxdepth 1 -name 'hits_*.vortex' | wc -l)
if [ "$files" -ne 100 ]; then
    echo "Expected 100 Vortex files, found $files" >&2
    exit 1
fi

sync
41 changes: 41 additions & 0 deletions datafusion-vortex-partitioned/make-json.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

# This script converts the raw `result.csv` data from `benchmark.sh` into the
# final json format used by the benchmark dashboard.
#
# usage : ./make-json.sh <machine>
#
# example (save results/<YYYYMMDD>/c6a.4xlarge.json)
# ./make-json.sh c6a.4xlarge
#
# Optional environment:
#   LOAD_TIME  value for "load_time" (default: null)
#   DATA_SIZE  value for "data_size" (default: output of ./data-size, else null)

set -euo pipefail

MACHINE=${1:-}
if [ -z "$MACHINE" ]; then
    echo "usage: $0 <machine>" >&2
    exit 2
fi
if [ ! -r result.csv ]; then
    echo "result.csv not found or not readable; run benchmark.sh first" >&2
    exit 1
fi

DATE=$(date -u +%Y-%m-%d)
YYYYMMDD=${DATE//-/}
mkdir -p "results/${YYYYMMDD}"
OUTPUT_FILE="results/${YYYYMMDD}/${MACHINE}.json"
SYSTEM_NAME="DataFusion (Vortex, partitioned)"
LOAD_TIME=${LOAD_TIME:-null}
DATA_SIZE=${DATA_SIZE:-$(./data-size 2>/dev/null || echo null)}
DATA_SIZE=${DATA_SIZE:-null}

# Read the CSV and build the result array using awk: column 1 is used as the
# query index (expected to run 1..n) and column 3 as the timing; all timings
# that share an index are joined into one JSON array row.
RESULT_ARRAY=$(awk -F, '
    {
        if (!($1 in arr)) n++
        arr[$1] = arr[$1] "," $3
    }
    END {
        for (i = 1; i <= n; i++) {
            sub(/^,/, "", arr[i])
            printf " [%s]", arr[i]
            if (i < n) printf ",\n"
        }
    }' result.csv)

# form the final JSON structure from the template
cat <<EOF > "$OUTPUT_FILE"
{
"system": "$SYSTEM_NAME",
"date": "$DATE",
"machine": "$MACHINE",
"cluster_size": 1,
"proprietary": "no",
"tuned": "no",
"hardware": "cpu",
"tags": ["Rust","column-oriented","embedded","stateless"],
"load_time": $LOAD_TIME,
"data_size": $DATA_SIZE,
"result": [
$RESULT_ARRAY
]
}
EOF
4 changes: 2 additions & 2 deletions datafusion-vortex-partitioned/queries.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPh
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
Expand All @@ -40,4 +40,4 @@ SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventD
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
14 changes: 6 additions & 8 deletions datafusion-vortex-partitioned/query
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
#!/bin/bash
# Reads a SQL query from stdin and runs it via vortex-datafusion-cli.
# Stdout: query result. Stderr: query runtime in fractional seconds.
# On failure the CLI output goes to stderr and the CLI's exit status is
# propagated.
set -e

DF=vortex-datafusion-cli/target/release/vortex-datafusion-cli

# Stage the query in a temp file so it can be passed with -f after create.sql;
# the trap removes it on every exit path.
query=$(cat)
tmp=$(mktemp /tmp/datafusion-vortex.XXXXXX.sql)
trap 'rm -f "$tmp"' EXIT
printf '%s\n' "$query" > "$tmp"

# Capture stdout+stderr and the exit status without tripping `set -e`.
out=$("$DF" -f create.sql -f "$tmp" 2>&1) && status=0 || status=$?

if [ "$status" -ne 0 ]; then
    printf '%s\n' "$out" >&2
    exit "$status"
fi

# Result rows: everything except the CLI's "Elapsed ..." timing lines
# (`|| true` because grep exits 1 when it prints nothing).
printf '%s\n' "$out" | grep -v 'Elapsed' || true
# Runtime: second field of the last "Elapsed" line, i.e. the timing of the
# user's query rather than of the statements in create.sql.
printf '%s\n' "$out" | awk '/Elapsed/ { e = $2 } END { print e }' >&2
57 changes: 57 additions & 0 deletions datafusion-vortex-partitioned/results/20260505/c6a.2xlarge.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"system": "DataFusion (Vortex, partitioned)",
"date": "2026-05-05",
"machine": "c6a.2xlarge",
"cluster_size": 1,
"proprietary": "no",
"tuned": "no",
"hardware": "cpu",
"tags": ["Rust","column-oriented","embedded","stateless"],
"load_time": 108.58,
"data_size": 15328662856,
"result": [
[0.078,0.002,0.002],
[0.170,0.027,0.028],
[0.214,0.072,0.069],
[0.650,0.071,0.070],
[1.367,0.840,0.837],
[1.355,0.785,0.786],
[0.075,0.002,0.002],
[0.179,0.031,0.033],
[1.269,1.045,1.044],
[1.670,1.251,1.237],
[0.770,0.162,0.156],
[1.099,0.192,0.191],
[1.603,0.694,0.682],
[3.280,1.199,1.198],
[1.395,0.667,0.670],
[1.108,0.948,0.942],
[3.158,1.772,1.779],
[3.122,1.778,1.785],
[5.120,3.531,3.507],
[0.313,0.043,0.048],
[15.675,0.906,0.901],
[17.831,0.929,0.926],
[22.767,1.105,1.103],
[22.582,1.521,1.574],
[0.319,0.079,0.075],
[1.603,0.146,0.147],
[0.609,0.081,0.082],
[16.343,1.328,1.365],
[15.637,15.200,15.211],
[0.814,0.656,0.668],
[2.796,0.592,0.595],
[5.885,0.641,0.629],
[3.929,3.007,3.019],
[16.025,3.545,3.512],
[15.999,3.567,3.503],
[1.455,1.293,1.298],
[0.254,0.074,0.073],
[0.203,0.034,0.034],
[0.243,0.024,0.022],
[0.386,0.130,0.129],
[0.247,0.019,0.016],
[0.249,0.015,0.015],
[0.242,0.014,0.015]
]
}
57 changes: 57 additions & 0 deletions datafusion-vortex-partitioned/results/20260505/c6a.4xlarge.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"system": "DataFusion (Vortex, partitioned)",
"date": "2026-05-05",
"machine": "c6a.4xlarge",
"cluster_size": 1,
"proprietary": "no",
"hardware": "cpu",
"tuned": "no",
"tags": ["Rust","column-oriented","embedded","stateless"],
"load_time": 99.94,
"data_size": 15328662856,
"result": [
[0.082, 0.002, 0.002],
[0.152, 0.030, 0.027],
[0.175, 0.059, 0.060],
[0.633, 0.087, 0.089],
[1.306, 0.628, 0.626],
[1.316, 0.603, 0.592],
[0.090, 0.002, 0.002],
[0.166, 0.031, 0.030],
[1.223, 0.786, 0.772],
[1.664, 0.861, 0.871],
[0.729, 0.131, 0.133],
[1.116, 0.148, 0.147],
[1.605, 0.581, 0.578],
[3.174, 1.070, 1.068],
[1.527, 0.610, 0.597],
[0.887, 0.727, 0.715],
[3.174, 1.509, 1.532],
[3.153, 1.510, 1.506],
[4.788, 2.907, 2.827],
[0.313, 0.048, 0.049],
[15.848, 0.537, 0.528],
[17.859, 0.781, 0.772],
[22.900, 0.894, 0.878],
[21.132, 0.858, 0.785],
[0.287, 0.084, 0.091],
[1.607, 0.153, 0.146],
[0.757, 0.088, 0.088],
[16.230, 0.974, 1.011],
[13.690, 8.122, 8.151],
[0.512, 0.369, 0.364],
[2.788, 0.487, 0.487],
[5.882, 0.592, 0.589],
[3.929, 2.653, 2.674],
[15.934, 3.107, 2.890],
[15.951, 2.902, 2.918],
[1.060, 0.930, 0.917],
[0.261, 0.082, 0.082],
[0.208, 0.035, 0.036],
[0.197, 0.024, 0.023],
[0.385, 0.146, 0.146],
[0.251, 0.017, 0.016],
[0.248, 0.019, 0.014],
[0.244, 0.015, 0.015]
]
}
Loading