diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 9294d55..83b481e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -21,24 +21,15 @@ on: workflow_dispatch: inputs: scale_factor: - description: 'Scale factor for benchmark' + description: 'Scale factors to benchmark (comma-separated, e.g. "1,10")' required: false - default: '1' - type: choice - options: - - '0.1' - - '1' - - '10' + default: '1,10' + type: string engines: description: 'Engines to benchmark (comma-separated)' required: false default: 'duckdb,geopandas,sedonadb,spatial_polars' type: string - timeout: - description: 'Query timeout in seconds (default: 60, increase for full benchmark)' - required: false - default: '60' - type: string sedonadb_version: description: 'SedonaDB version (e.g., 1.0.0, leave empty for latest)' required: false @@ -71,12 +62,12 @@ on: sedonadb_nightly: description: 'Use SedonaDB nightly build from Gemfury (ignores version if true)' required: false - default: true + default: false type: boolean duckdb_nightly: description: 'Use DuckDB pre-release/nightly build (ignores version if true)' required: false - default: true + default: false type: boolean concurrency: @@ -85,27 +76,47 @@ concurrency: env: CARGO_TERM_COLOR: always - SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }} BENCHMARK_ENGINES: ${{ github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars' }} - QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }} BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }} # Package versions (empty = latest, can be overridden via workflow_dispatch) SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }} DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }} GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }} SPATIAL_POLARS_VERSION: ${{ github.event.inputs.spatial_polars_version }} - # Nightly build options (default: true) - SEDONADB_NIGHTLY: ${{ 
github.event.inputs.sedonadb_nightly || 'true' }} - DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'true' }} + # Nightly build options (default: false, use stable releases) + SEDONADB_NIGHTLY: ${{ github.event.inputs.sedonadb_nightly || 'false' }} + DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'false' }} + QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '600' }} # Hugging Face dataset for benchmark data HF_DATASET: apache-sedona/spatialbench HF_DATA_VERSION: v0.1.0 jobs: + # Parse scale factors into a JSON array for matrix strategy + parse-scale-factors: + name: Parse Scale Factors + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.parse.outputs.matrix }} + steps: + - name: Parse scale factor input + id: parse + run: | + # Default: "1,10" for automatic runs, or user-provided for workflow_dispatch + INPUT="${{ github.event.inputs.scale_factor || '1,10' }}" + # Convert comma-separated string to JSON array: "1,10" -> ["1","10"] + MATRIX=$(echo "$INPUT" | tr ',' '\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' | jq -R . | jq -s -c .) 
+ echo "matrix=$MATRIX" + echo "matrix=$MATRIX" >> $GITHUB_OUTPUT + # Download benchmark data from Hugging Face download-data: - name: Download Data (SF${{ github.event.inputs.scale_factor || '1' }}) + name: Download Data (SF${{ matrix.scale_factor }}) + needs: parse-scale-factors runs-on: ubuntu-latest + strategy: + matrix: + scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }} steps: - uses: actions/checkout@v6 @@ -113,8 +124,8 @@ jobs: id: cache-data uses: actions/cache@v5 with: - path: benchmark-data-sf${{ env.SCALE_FACTOR }} - key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }} + path: benchmark-data-sf${{ matrix.scale_factor }} + key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }} - name: Setup Python if: steps.cache-data.outputs.cache-hit != 'true' @@ -128,9 +139,11 @@ jobs: - name: Download benchmark data from Hugging Face if: steps.cache-data.outputs.cache-hit != 'true' + env: + SCALE_FACTOR: ${{ matrix.scale_factor }} run: | # Map scale factor to HF folder name - SF="${{ env.SCALE_FACTOR }}" + SF="${{ matrix.scale_factor }}" if [ "$SF" = "0.1" ]; then HF_SF="sf0.1" else @@ -155,66 +168,74 @@ jobs: " # Move data to expected location - mkdir -p benchmark-data-sf${{ env.SCALE_FACTOR }} + mkdir -p benchmark-data-sf${{ matrix.scale_factor }} - SF="${{ env.SCALE_FACTOR }}" + SF="${{ matrix.scale_factor }}" if [ "$SF" = "0.1" ]; then HF_SF="sf0.1" else HF_SF="sf${SF}" fi - cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ env.SCALE_FACTOR }}/ + cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ matrix.scale_factor }}/ echo "Downloaded data structure:" - find benchmark-data-sf${{ env.SCALE_FACTOR }} -type f -name "*.parquet" | head -20 + find benchmark-data-sf${{ matrix.scale_factor }} -type f -name "*.parquet" | head -20 echo "" echo "Directory contents:" - ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/ + ls -la benchmark-data-sf${{ matrix.scale_factor 
}}/ echo "" echo "Total size:" - du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/ + du -sh benchmark-data-sf${{ matrix.scale_factor }}/ - name: Show cached data info if: steps.cache-data.outputs.cache-hit == 'true' run: | echo "Using cached benchmark data" echo "Directory contents:" - ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/ + ls -la benchmark-data-sf${{ matrix.scale_factor }}/ echo "" echo "Total size:" - du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/ + du -sh benchmark-data-sf${{ matrix.scale_factor }}/ + + # ── Per-query benchmark jobs ── + # Each query runs in its own job (separate runner) so that if one query + # OOMs and kills the runner, the remaining queries still execute. + # max-parallel: 2 caps each engine at two concurrent query jobs to avoid + # overloading the CI and to keep results orderly. benchmark-duckdb: - name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }}) - needs: download-data + name: DuckDB ${{ matrix.query }} (SF${{ matrix.scale_factor }}) + needs: [parse-scale-factors, download-data] runs-on: ubuntu-latest if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb') + strategy: + matrix: + scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }} + query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12] + fail-fast: false + max-parallel: 2 # 1 per scale factor steps: - uses: actions/checkout@v6 - name: Restore benchmark data from cache uses: actions/cache/restore@v5 with: - path: benchmark-data-sf${{ env.SCALE_FACTOR }} - key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }} + path: benchmark-data-sf${{ matrix.scale_factor }} + key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }} fail-on-cache-miss: true - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.11' + cache: 'pip' + cache-dependency-path: .github/workflows/benchmark.yml - name: Install dependencies run: | - echo "=== DuckDB
Installation Parameters ===" - echo "DUCKDB_NIGHTLY: ${{ env.DUCKDB_NIGHTLY }}" - echo "DUCKDB_VERSION: ${{ env.DUCKDB_VERSION }}" - echo "======================================" if [ "${{ env.DUCKDB_NIGHTLY }}" = "true" ]; then - # Use --pre to install pre-release dev builds (e.g., 1.4.4.dev48) - # Constraint <1.5.0 ensures we get 1.4.x branch dev builds - pip install "duckdb<1.5.0" --pre pyarrow pandas + pip install duckdb --pre pyarrow pandas elif [ -n "${{ env.DUCKDB_VERSION }}" ]; then pip install "duckdb==${{ env.DUCKDB_VERSION }}" pyarrow pandas else @@ -222,47 +243,57 @@ jobs: fi echo "Installed DuckDB version: $(python -c 'import duckdb; print(duckdb.__version__)')" - - name: Pre-install DuckDB spatial extension + - name: Install DuckDB spatial extension run: | - # Dev builds don't have spatial extension in core_nightly, so always use default repo - python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')" + # INSTALL is a no-op on DuckDB 1.5 stable (spatial bundled natively) but required for nightly builds. 
+ python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); con.execute('LOAD spatial'); print('DuckDB spatial extension installed and loaded')" - - name: Run DuckDB benchmark + - name: Run DuckDB ${{ matrix.query }} run: | python benchmark/run_benchmark.py \ - --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \ + --data-dir benchmark-data-sf${{ matrix.scale_factor }} \ --engines duckdb \ - --timeout ${{ env.QUERY_TIMEOUT }} \ + --queries ${{ matrix.query }} \ --runs ${{ env.BENCHMARK_RUNS }} \ - --scale-factor ${{ env.SCALE_FACTOR }} \ - --output duckdb_results.json + --timeout ${{ env.QUERY_TIMEOUT }} \ + --scale-factor ${{ matrix.scale_factor }} \ + --output duckdb_${{ matrix.query }}_results.json - name: Upload results + if: always() && hashFiles(format('duckdb_{0}_results.json', matrix.query)) != '' uses: actions/upload-artifact@v6 with: - name: duckdb-results-sf${{ env.SCALE_FACTOR }} - path: duckdb_results.json + name: duckdb-${{ matrix.query }}-results-sf${{ matrix.scale_factor }} + path: duckdb_${{ matrix.query }}_results.json retention-days: 30 benchmark-geopandas: - name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1' }}) - needs: download-data + name: GeoPandas ${{ matrix.query }} (SF${{ matrix.scale_factor }}) + needs: [parse-scale-factors, download-data] runs-on: ubuntu-latest if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas') + strategy: + matrix: + scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }} + query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12] + fail-fast: false + max-parallel: 2 steps: - uses: actions/checkout@v6 - name: Restore benchmark data from cache uses: actions/cache/restore@v5 with: - path: benchmark-data-sf${{ env.SCALE_FACTOR }} - key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }} + path: benchmark-data-sf${{ matrix.scale_factor }} + key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ 
matrix.scale_factor }} fail-on-cache-miss: true - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.11' + cache: 'pip' + cache-dependency-path: .github/workflows/benchmark.yml - name: Install dependencies run: | @@ -273,51 +304,56 @@ jobs: fi echo "Installed GeoPandas version: $(python -c 'from importlib.metadata import version; print(version("geopandas"))')" - - name: Run GeoPandas benchmark + - name: Run GeoPandas ${{ matrix.query }} run: | python benchmark/run_benchmark.py \ - --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \ + --data-dir benchmark-data-sf${{ matrix.scale_factor }} \ --engines geopandas \ - --timeout ${{ env.QUERY_TIMEOUT }} \ + --queries ${{ matrix.query }} \ --runs ${{ env.BENCHMARK_RUNS }} \ - --scale-factor ${{ env.SCALE_FACTOR }} \ - --output geopandas_results.json + --timeout ${{ env.QUERY_TIMEOUT }} \ + --scale-factor ${{ matrix.scale_factor }} \ + --output geopandas_${{ matrix.query }}_results.json - name: Upload results + if: always() && hashFiles(format('geopandas_{0}_results.json', matrix.query)) != '' uses: actions/upload-artifact@v6 with: - name: geopandas-results-sf${{ env.SCALE_FACTOR }} - path: geopandas_results.json + name: geopandas-${{ matrix.query }}-results-sf${{ matrix.scale_factor }} + path: geopandas_${{ matrix.query }}_results.json retention-days: 30 benchmark-sedonadb: - name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }}) - needs: download-data + name: SedonaDB ${{ matrix.query }} (SF${{ matrix.scale_factor }}) + needs: [parse-scale-factors, download-data] runs-on: ubuntu-latest if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb') + strategy: + matrix: + scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }} + query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12] + fail-fast: false + max-parallel: 2 steps: - uses: actions/checkout@v6 - name: Restore benchmark data from cache uses: 
actions/cache/restore@v5 with: - path: benchmark-data-sf${{ env.SCALE_FACTOR }} - key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }} + path: benchmark-data-sf${{ matrix.scale_factor }} + key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }} fail-on-cache-miss: true - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.11' + cache: 'pip' + cache-dependency-path: .github/workflows/benchmark.yml - name: Install dependencies run: | - echo "=== SedonaDB Installation Parameters ===" - echo "SEDONADB_NIGHTLY: ${{ env.SEDONADB_NIGHTLY }}" - echo "SEDONADB_VERSION: ${{ env.SEDONADB_VERSION }}" - echo "========================================" if [ "${{ env.SEDONADB_NIGHTLY }}" = "true" ]; then - # Use Gemfury as primary index and --pre to install nightly alpha builds (e.g., 0.3.0a69) pip install "sedonadb[geopandas]" pandas pyarrow pyproj \ --pre \ --index-url https://repo.fury.io/sedona-nightlies/ \ @@ -329,42 +365,52 @@ jobs: fi echo "Installed SedonaDB version: $(python -c 'from importlib.metadata import version; print(version("sedonadb"))')" - - name: Run SedonaDB benchmark + - name: Run SedonaDB ${{ matrix.query }} run: | python benchmark/run_benchmark.py \ - --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \ + --data-dir benchmark-data-sf${{ matrix.scale_factor }} \ --engines sedonadb \ - --timeout ${{ env.QUERY_TIMEOUT }} \ + --queries ${{ matrix.query }} \ --runs ${{ env.BENCHMARK_RUNS }} \ - --scale-factor ${{ env.SCALE_FACTOR }} \ - --output sedonadb_results.json + --timeout ${{ env.QUERY_TIMEOUT }} \ + --scale-factor ${{ matrix.scale_factor }} \ + --output sedonadb_${{ matrix.query }}_results.json - name: Upload results + if: always() && hashFiles(format('sedonadb_{0}_results.json', matrix.query)) != '' uses: actions/upload-artifact@v6 with: - name: sedonadb-results-sf${{ env.SCALE_FACTOR }} - path: sedonadb_results.json + name: sedonadb-${{ matrix.query }}-results-sf${{ matrix.scale_factor }} + 
path: sedonadb_${{ matrix.query }}_results.json retention-days: 30 benchmark-spatial-polars: - name: Benchmark Spatial Polars (SF${{ github.event.inputs.scale_factor || '1' }}) - needs: download-data + name: Spatial Polars ${{ matrix.query }} (SF${{ matrix.scale_factor }}) + needs: [parse-scale-factors, download-data] runs-on: ubuntu-latest if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars') + strategy: + matrix: + scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }} + query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12] + fail-fast: false + max-parallel: 2 steps: - uses: actions/checkout@v6 - name: Restore benchmark data from cache uses: actions/cache/restore@v5 with: - path: benchmark-data-sf${{ env.SCALE_FACTOR }} - key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }} + path: benchmark-data-sf${{ matrix.scale_factor }} + key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }} fail-on-cache-miss: true - name: Setup Python uses: actions/setup-python@v6 with: python-version: '3.11' + cache: 'pip' + cache-dependency-path: .github/workflows/benchmark.yml - name: Install dependencies run: | @@ -375,61 +421,42 @@ jobs: fi echo "Installed Spatial Polars version: $(python -c 'from importlib.metadata import version; print(version("spatial-polars"))')" - - name: Run Spatial Polars benchmark + - name: Run Spatial Polars ${{ matrix.query }} run: | python benchmark/run_benchmark.py \ - --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \ + --data-dir benchmark-data-sf${{ matrix.scale_factor }} \ --engines spatial_polars \ - --timeout ${{ env.QUERY_TIMEOUT }} \ + --queries ${{ matrix.query }} \ --runs ${{ env.BENCHMARK_RUNS }} \ - --scale-factor ${{ env.SCALE_FACTOR }} \ - --output spatial_polars_results.json + --timeout ${{ env.QUERY_TIMEOUT }} \ + --scale-factor ${{ matrix.scale_factor }} \ + --output spatial_polars_${{ matrix.query }}_results.json - name: 
Upload results + if: always() && hashFiles(format('spatial_polars_{0}_results.json', matrix.query)) != '' uses: actions/upload-artifact@v6 with: - name: spatial_polars-results-sf${{ env.SCALE_FACTOR }} - path: spatial_polars_results.json + name: spatial_polars-${{ matrix.query }}-results-sf${{ matrix.scale_factor }} + path: spatial_polars_${{ matrix.query }}_results.json retention-days: 30 summarize-results: - name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }}) - needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb, benchmark-spatial-polars] - if: always() && (needs.benchmark-duckdb.result == 'success' || needs.benchmark-geopandas.result == 'success' || needs.benchmark-sedonadb.result == 'success' || needs.benchmark-spatial-polars.result == 'success') + name: Summarize Results (SF${{ matrix.scale_factor }}) + needs: [parse-scale-factors, benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb, benchmark-spatial-polars] + if: always() && (needs.benchmark-duckdb.result != 'cancelled' || needs.benchmark-geopandas.result != 'cancelled' || needs.benchmark-sedonadb.result != 'cancelled' || needs.benchmark-spatial-polars.result != 'cancelled') runs-on: ubuntu-latest + strategy: + matrix: + scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }} steps: - uses: actions/checkout@v6 - - name: Download DuckDB results - if: needs.benchmark-duckdb.result == 'success' - uses: actions/download-artifact@v7 - with: - name: duckdb-results-sf${{ env.SCALE_FACTOR }} - path: results - continue-on-error: true - - - name: Download GeoPandas results - if: needs.benchmark-geopandas.result == 'success' - uses: actions/download-artifact@v7 - with: - name: geopandas-results-sf${{ env.SCALE_FACTOR }} - path: results - continue-on-error: true - - - name: Download SedonaDB results - if: needs.benchmark-sedonadb.result == 'success' - uses: actions/download-artifact@v7 - with: - name: sedonadb-results-sf${{ env.SCALE_FACTOR }} - path: results - 
continue-on-error: true - - - name: Download Spatial Polars results - if: needs.benchmark-spatial-polars.result == 'success' + - name: Download all results for this scale factor uses: actions/download-artifact@v7 with: - name: spatial_polars-results-sf${{ env.SCALE_FACTOR }} + pattern: '*-results-sf${{ matrix.scale_factor }}' path: results + merge-multiple: true continue-on-error: true - name: Setup Python @@ -443,6 +470,7 @@ jobs: --results-dir results \ --timeout ${{ env.QUERY_TIMEOUT }} \ --runs ${{ env.BENCHMARK_RUNS }} \ + --engines ${{ env.BENCHMARK_ENGINES }} \ --output benchmark_summary.md - name: Display summary @@ -454,7 +482,7 @@ jobs: - name: Upload combined results uses: actions/upload-artifact@v6 with: - name: benchmark-summary-sf${{ env.SCALE_FACTOR }} + name: benchmark-summary-sf${{ matrix.scale_factor }} path: | results/ benchmark_summary.md diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py index e05e237..ce438ae 100755 --- a/benchmark/run_benchmark.py +++ b/benchmark/run_benchmark.py @@ -428,6 +428,7 @@ def run_benchmark( timeout: int, scale_factor: float, runs: int = 3, + output_file: str | None = None, ) -> BenchmarkSuite: """Generic benchmark runner for any engine. @@ -438,6 +439,9 @@ def run_benchmark( If runs > 1 and the first run succeeds, additional runs are performed and the average time is reported for fair comparison. + + If output_file is provided, results are saved incrementally after each + query so that partial results survive if the runner crashes mid-way. 
""" from importlib.metadata import version as pkg_version @@ -483,60 +487,94 @@ def run_benchmark( all_queries = config["queries_getter"]() engine_class = config["class"] - for query_name, query_sql in all_queries.items(): - if queries and query_name not in queries: - continue + # Determine which queries will be run + query_items = [ + (qname, qsql) for qname, qsql in all_queries.items() + if not queries or qname in queries + ] - print(f" Running {query_name}...", end=" ", flush=True) + # Pre-populate all queries as "not_started" so even a total crash + # (e.g. OOM killing the runner) leaves a file showing what was attempted + for query_name, _ in query_items: + suite.results.append(BenchmarkResult( + query=query_name, + engine=engine, + time_seconds=None, + row_count=None, + status="not_started", + error_message=None, + )) + if output_file: + save_results([suite], output_file) - # First run - result = run_query_isolated( - engine_class=engine_class, - engine_name=engine, - data_paths=data_paths, - query_name=query_name, - query_sql=query_sql, - timeout=timeout, - ) + # Install a SIGTERM handler so we flush results if the runner is shutting down + def _sigterm_handler(signum, frame): + print(f"\nReceived signal {signum}, saving partial results...", flush=True) + if output_file: + save_results([suite], output_file) + sys.exit(128 + signum) - # If first run succeeded and we want multiple runs, do additional runs - if result.status == "success" and runs > 1: - run_times = [result.time_seconds] - - for run_num in range(2, runs + 1): - additional_result = run_query_isolated( - engine_class=engine_class, - engine_name=engine, - data_paths=data_paths, - query_name=query_name, - query_sql=query_sql, - timeout=timeout, - ) - if additional_result.status == "success": - run_times.append(additional_result.time_seconds) - else: - # If any subsequent run fails, just use successful runs - break - - # Calculate average of all successful runs - avg_time = round(sum(run_times) / 
len(run_times), 2) - result = BenchmarkResult( - query=query_name, - engine=engine, - time_seconds=avg_time, - row_count=result.row_count, - status="success", - error_message=None, + prev_handler = signal.signal(signal.SIGTERM, _sigterm_handler) + + try: + for idx, (query_name, query_sql) in enumerate(query_items): + print(f" Running {query_name}...", end=" ", flush=True) + + # First run + result = run_query_isolated( + engine_class=engine_class, + engine_name=engine, + data_paths=data_paths, + query_name=query_name, + query_sql=query_sql, + timeout=timeout, ) - print(f"{avg_time}s avg ({len(run_times)} runs, {result.row_count} rows)") - elif result.status == "success": - print(f"{result.time_seconds}s ({result.row_count} rows)") - else: - print(f"{result.status.upper()}: {result.error_message}") - suite.results.append(result) - if result.status == "success": - suite.total_time += result.time_seconds + # If first run succeeded and we want multiple runs, do additional runs + if result.status == "success" and runs > 1: + run_times = [result.time_seconds] + + for run_num in range(2, runs + 1): + additional_result = run_query_isolated( + engine_class=engine_class, + engine_name=engine, + data_paths=data_paths, + query_name=query_name, + query_sql=query_sql, + timeout=timeout, + ) + if additional_result.status == "success": + run_times.append(additional_result.time_seconds) + else: + # If any subsequent run fails, just use successful runs + break + + # Calculate average of all successful runs + avg_time = round(sum(run_times) / len(run_times), 2) + result = BenchmarkResult( + query=query_name, + engine=engine, + time_seconds=avg_time, + row_count=result.row_count, + status="success", + error_message=None, + ) + print(f"{avg_time}s avg ({len(run_times)} runs, {result.row_count} rows)") + elif result.status == "success": + print(f"{result.time_seconds}s ({result.row_count} rows)") + else: + print(f"{result.status.upper()}: {result.error_message}") + + # Replace the 
pre-populated "not_started" entry with the actual result + suite.results[idx] = result + if result.status == "success": + suite.total_time += result.time_seconds + + # Save partial results after each query so they survive crashes + if output_file: + save_results([suite], output_file) + finally: + signal.signal(signal.SIGTERM, prev_handler) return suite @@ -629,7 +667,7 @@ def main(): print(f" {table}: {path}") results = [ - run_benchmark(engine, data_paths, queries, args.timeout, args.scale_factor, args.runs) + run_benchmark(engine, data_paths, queries, args.timeout, args.scale_factor, args.runs, args.output) for engine in engines ] diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py index a52fcc8..74b7a3f 100755 --- a/benchmark/summarize_results.py +++ b/benchmark/summarize_results.py @@ -26,8 +26,20 @@ from pathlib import Path -def load_results(results_dir: str) -> dict: - """Load all JSON result files from a directory.""" +def load_results(results_dir: str, expected_engines: list[str] | None = None) -> dict: + """Load all JSON result files from a directory. + + Supports two layouts: + 1. One file per engine (e.g., duckdb_results.json with all queries) + 2. One file per query (e.g., duckdb_q1_results.json with a single query) + + Per-query files are merged into a single suite per engine. If multiple files + contain results for the same engine, their query results are combined. + + If expected_engines is provided, engines that were expected to run but have + no results file will be included with all queries marked as 'not_started'. + This handles the case where a runner was OOM-killed before uploading results. 
+ """ results = {} results_path = Path(results_dir) @@ -36,7 +48,55 @@ def load_results(results_dir: str) -> dict: data = json.load(f) for suite in data.get("results", []): engine = suite["engine"] - results[engine] = suite + if engine not in results: + results[engine] = suite + else: + # Merge query results from multiple files for the same engine + existing_queries = {r["query"] for r in results[engine].get("results", [])} + for r in suite.get("results", []): + if r["query"] not in existing_queries: + results[engine]["results"].append(r) + existing_queries.add(r["query"]) + elif r.get("status") != "not_started": + # Replace not_started placeholder with actual result + results[engine]["results"] = [ + r if existing["query"] == r["query"] else existing + for existing in results[engine]["results"] + ] + + # For expected engines with no results, create placeholder entries + if expected_engines: + # Determine the full query list from engines that did report results + all_queries = set() + scale_factor = None + for engine_data in results.values(): + if scale_factor is None: + scale_factor = engine_data.get("scale_factor", 1) + for r in engine_data.get("results", []): + all_queries.add(r["query"]) + + # Default to q1-q12 if no engine reported any results + if not all_queries: + all_queries = {f"q{i}" for i in range(1, 13)} + + for engine in expected_engines: + if engine not in results: + results[engine] = { + "engine": engine, + "version": "unknown", + "scale_factor": scale_factor or 1, + "timestamp": datetime.now(timezone.utc).isoformat(), + "results": [ + { + "query": q, + "status": "not_started", + "time_seconds": None, + "row_count": None, + "error_message": "Runner was killed before completing this query (likely OOM)", + } + for q in sorted(all_queries, key=lambda x: int(x[1:])) + ], + } return results @@ -151,29 +211,38 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in row += " ⏱️ TIMEOUT |" elif status == "error": row += " ❌ ERROR 
|" + elif status == "not_started": + row += " 💀 OOM |" else: row += " — |" lines.append(row) - # Win count summary + # Win count and completion summary win_counts = {engine: 0 for engine in engines} + completed_counts = {engine: 0 for engine in engines} + total_queries = len(all_queries) for query in all_queries: winner = get_winner(query, data, engines) if winner: win_counts[winner] += 1 + for engine in engines: + result = data.get(engine, {}).get(query, {}) + if result.get("status") == "success": + completed_counts[engine] += 1 lines.extend([ "", "## 🥇 Performance Summary", "", - "| Engine | Wins |", - "|--------|:----:|", + "| Engine | Completed | Wins |", + "|--------|:---------:|:----:|", ]) for engine in sorted(engines, key=lambda e: win_counts[e], reverse=True): icon_name = engine_icons.get(engine, engine.title()) wins = win_counts[engine] - lines.append(f"| {icon_name} | {wins} |") + completed = completed_counts[engine] + lines.append(f"| {icon_name} | {completed}/{total_queries} | {wins} |") # Detailed results section (collapsible) lines.extend([ @@ -203,6 +272,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "success": "✅", "error": "❌", "timeout": "⏱️", + "not_started": "💀", }.get(status, "❓") lines.append(f"| {query.upper()} | {time_str} | {status_emoji} | {row_str} |") @@ -219,14 +289,24 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in for engine in engines: engine_errors = [] + not_started_queries = [] for query in all_queries: result = data.get(engine, {}).get(query, {}) - if result.get("status") in ("error", "timeout"): + status = result.get("status") + if status in ("error", "timeout"): error_msg = result.get("error_message", "No details available") # Truncate long error messages if len(error_msg) > 200: error_msg = error_msg[:200] + "..." 
engine_errors.append(f"- **{query.upper()}**: `{error_msg}`") + elif status == "not_started": + not_started_queries.append(query.upper()) + + if not_started_queries: + engine_errors.append( + f"- **{', '.join(not_started_queries)}**: " + f"`Could not complete these queries, likely due to OOM (runner was killed)`" + ) if engine_errors: has_errors = True @@ -248,6 +328,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in "| **bold** | Fastest for this query |", "| ⏱️ TIMEOUT | Query exceeded timeout |", "| ❌ ERROR | Query failed |", + "| 💀 OOM | Could not run, likely due to out-of-memory (runner killed) |", "", f"*Generated by [SpatialBench](https://github.com/apache/sedona-spatialbench) on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}*", ]) @@ -289,10 +370,18 @@ def main(): default=3, help="Number of runs per query (for reporting)", ) + parser.add_argument( + "--engines", + type=str, + default=None, + help="Comma-separated list of expected engines (e.g., 'duckdb,geopandas,sedonadb,spatial_polars'). " + "Engines that were expected but have no results will be shown as OOM/runner-killed.", + ) args = parser.parse_args() - results = load_results(args.results_dir) + expected_engines = [e.strip() for e in args.engines.split(",")] if args.engines else None + results = load_results(args.results_dir, expected_engines=expected_engines) if not results: print(f"No results found in {args.results_dir}")