Skip to content

Commit 0762bd4

Browse files
committed
Update scripts / readme
1 parent b06dd09 commit 0762bd4

File tree

6 files changed

+43
-26
lines changed

6 files changed

+43
-26
lines changed

datafusion-partitioned/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe
2727
```
2828

2929
1. `bash benchmark.sh`
30+
1. `./save-result.sh c6a.4xlarge`
3031

3132
### Known Issues
3233

datafusion-partitioned/benchmark.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ sudo apt-get update -y
1111
sudo apt-get install -y gcc
1212

1313
echo "Install DataFusion main branch"
14-
git clone https://github.com/apache/arrow-datafusion.git
15-
cd arrow-datafusion/
14+
git clone https://github.com/apache/datafusion.git
15+
cd datafusion/
1616
git checkout 51.0.0
1717
CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli
1818
export PATH="`pwd`/target/release:$PATH"

datafusion-partitioned/save-result.sh

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,7 @@
77
#
88
# example (save results/c6a.4xlarge.json)
99
# ./save-result.sh c6a.4xlarge
10-
#
11-
# Example input result.csv, showing three runs for query 1:
12-
# 1,1,0.108
13-
# 1,2,0.108
14-
# 1,3,0.108
15-
#
16-
# Example output output.json, stored in results/<machine.json
17-
# {
18-
# "system": "DataFusion (Parquet, partitioned)",
19-
# "date": "2025-07-11",
20-
# "machine": "c6a.xlarge",
21-
# "cluster_size": 1,
22-
# "proprietary": "no",
23-
# "tuned": "no",
24-
# "tags": ["Rust","column-oriented","embedded","stateless", "lukewarm-cold-run"],
25-
# "load_time": 0,
26-
# "data_size": 14737666736,
27-
# "result": [
28-
# [0.108, 0.108, 0.108],
29-
# ...
30-
# ]
31-
# }
10+
3211
MACHINE=$1
3312
OUTPUT_FILE="results/${MACHINE}.json"
3413
SYSTEM_NAME="DataFusion (Parquet, partitioned)"

datafusion/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ The benchmark should be completed in under an hour. On-demand pricing is $0.6 pe
2727
```
2828

2929
1. `bash benchmark.sh`
30+
1. `./save-result.sh c6a.4xlarge`
3031

3132
### Known Issues
3233

datafusion/benchmark.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ sudo apt-get update -y
1111
sudo apt-get install -y gcc
1212

1313
echo "Install DataFusion main branch"
14-
git clone https://github.com/apache/arrow-datafusion.git
15-
cd arrow-datafusion/
14+
git clone https://github.com/apache/datafusion.git
15+
cd datafusion/
1616
git checkout 51.0.0
1717
CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --package datafusion-cli --bin datafusion-cli
1818
export PATH="`pwd`/target/release:$PATH"

datafusion/save-result.sh

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/bin/bash
2+
3+
# This script converts the raw result.csv data from `benchmark.sh` into the
4+
# final json format used by the benchmark dashboard.
5+
#
6+
# usage : ./save-result.sh <machine>
7+
#
8+
# example (save results/c6a.4xlarge.json)
9+
# ./save-result.sh c6a.4xlarge
10+
11+
MACHINE=$1
12+
OUTPUT_FILE="results/${MACHINE}.json"
13+
SYSTEM_NAME="DataFusion (Parquet, single)"
14+
DATE=$(date +%Y-%m-%d)
15+
16+
17+
# Read the CSV and build the result array using awk
18+
# Print the rows of the JSON "result" array built from a benchmark CSV.
#
# Arguments: $1 - path to the CSV file (defaults to result.csv)
# Outputs:   one " [v1,v2,...]" row per distinct first-field value, looked up
#            by the keys 1..count, rows separated by ",\n"; no final newline
#
# The third field of every line is appended, in file order, to the row keyed
# by the line's first field (presumably query number and run time; confirm
# against benchmark.sh).
build_result_array() {
  awk -F, '
    # Skip blank or short lines so they cannot create a bogus empty row.
    NF >= 3 {
      if ($1 in runs) {
        runs[$1] = runs[$1] "," $3
      } else {
        runs[$1] = $3
        count++   # counted by hand: length(array) is not supported by every awk
      }
    }
    END {
      for (q = 1; q <= count; q++) {
        # Data goes through %s, never into the format string itself.
        printf " [%s]", runs[q]
        if (q < count) printf ",\n"
      }
    }
  ' "${1:-result.csv}"
}

RESULT_ARRAY=$(build_result_array result.csv)
19+
20+
# form the final JSON structure from the template
21+
cat <<EOF > $OUTPUT_FILE
22+
{
23+
"system": "$SYSTEM_NAME",
24+
"date": "$DATE",
25+
"machine": "$MACHINE",
26+
"cluster_size": 1,
27+
"proprietary": "no",
28+
"tuned": "no",
29+
"tags": ["Rust","column-oriented","embedded","stateless"],
30+
"load_time": 0,
31+
"data_size": 14779976446,
32+
"result": [
33+
$RESULT_ARRAY
34+
]
35+
}
36+
EOF

0 commit comments

Comments
 (0)