diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9294d55..83b481e 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -21,24 +21,15 @@ on:
   workflow_dispatch:
     inputs:
       scale_factor:
-        description: 'Scale factor for benchmark'
+        description: 'Scale factors to benchmark (comma-separated, e.g. "1,10")'
         required: false
-        default: '1'
-        type: choice
-        options:
-          - '0.1'
-          - '1'
-          - '10'
+        default: '1,10'
+        type: string
       engines:
         description: 'Engines to benchmark (comma-separated)'
         required: false
         default: 'duckdb,geopandas,sedonadb,spatial_polars'
         type: string
-      timeout:
-        description: 'Query timeout in seconds (default: 60, increase for full benchmark)'
-        required: false
-        default: '60'
-        type: string
       sedonadb_version:
         description: 'SedonaDB version (e.g., 1.0.0, leave empty for latest)'
         required: false
@@ -71,12 +62,12 @@ on:
       sedonadb_nightly:
         description: 'Use SedonaDB nightly build from Gemfury (ignores version if true)'
         required: false
-        default: true
+        default: false
         type: boolean
       duckdb_nightly:
         description: 'Use DuckDB pre-release/nightly build (ignores version if true)'
         required: false
-        default: true
+        default: false
         type: boolean
 
 concurrency:
@@ -85,27 +76,47 @@ concurrency:
 
 env:
   CARGO_TERM_COLOR: always
-  SCALE_FACTOR: ${{ github.event.inputs.scale_factor || '1' }}
   BENCHMARK_ENGINES: ${{ github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars' }}
-  QUERY_TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
   BENCHMARK_RUNS: ${{ github.event.inputs.runs || '3' }}
   # Package versions (empty = latest, can be overridden via workflow_dispatch)
   SEDONADB_VERSION: ${{ github.event.inputs.sedonadb_version }}
   DUCKDB_VERSION: ${{ github.event.inputs.duckdb_version }}
   GEOPANDAS_VERSION: ${{ github.event.inputs.geopandas_version }}
   SPATIAL_POLARS_VERSION: ${{ github.event.inputs.spatial_polars_version }}
-  # Nightly build options (default: true)
-  SEDONADB_NIGHTLY: ${{ github.event.inputs.sedonadb_nightly || 'true' }}
-  DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'true' }}
+  # Nightly build options (default: false, use stable releases)
+  SEDONADB_NIGHTLY: ${{ github.event.inputs.sedonadb_nightly || 'false' }}
+  DUCKDB_NIGHTLY: ${{ github.event.inputs.duckdb_nightly || 'false' }}
+  QUERY_TIMEOUT: '600'  # per-query timeout in seconds; fixed because the 'timeout' dispatch input was removed
   # Hugging Face dataset for benchmark data
   HF_DATASET: apache-sedona/spatialbench
   HF_DATA_VERSION: v0.1.0
 
 jobs:
+  # Parse scale factors into a JSON array for matrix strategy
+  parse-scale-factors:
+    name: Parse Scale Factors
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.parse.outputs.matrix }}
+    steps:
+      - name: Parse scale factor input
+        id: parse
+        env:
+          INPUT: ${{ github.event.inputs.scale_factor || '1,10' }}  # via env, never inlined into the script
+        run: |
+          # "1, 10" -> ["1","10"]; only plain numbers pass, since later steps inline the values into shell.
+          MATRIX=$(printf '%s\n' "$INPUT" | tr ',' '\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//;/^$/d' | jq -R . | jq -s -c .)
+          echo "$MATRIX" | jq -e 'length > 0 and all(.[]; test("^[0-9]+([.][0-9]+)?$"))' > /dev/null || { echo "::error::scale_factor must be comma-separated numbers"; exit 1; }
+          echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"
+
   # Download benchmark data from Hugging Face
   download-data:
-    name: Download Data (SF${{ github.event.inputs.scale_factor || '1' }})
+    name: Download Data (SF${{ matrix.scale_factor }})
+    needs: parse-scale-factors
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
     steps:
       - uses: actions/checkout@v6
 
@@ -113,8 +124,8 @@ jobs:
         id: cache-data
         uses: actions/cache@v5
         with:
-          path: benchmark-data-sf${{ env.SCALE_FACTOR }}
-          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+          path: benchmark-data-sf${{ matrix.scale_factor }}
+          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }}
 
       - name: Setup Python
         if: steps.cache-data.outputs.cache-hit != 'true'
@@ -128,9 +139,11 @@ jobs:
 
       - name: Download benchmark data from Hugging Face
         if: steps.cache-data.outputs.cache-hit != 'true'
+        env:
+          SCALE_FACTOR: ${{ matrix.scale_factor }}
         run: |
           # Map scale factor to HF folder name
-          SF="${{ env.SCALE_FACTOR }}"
+          SF="${{ matrix.scale_factor }}"
           if [ "$SF" = "0.1" ]; then
             HF_SF="sf0.1"
           else
@@ -155,66 +168,74 @@ jobs:
           "
 
           # Move data to expected location
-          mkdir -p benchmark-data-sf${{ env.SCALE_FACTOR }}
+          mkdir -p benchmark-data-sf${{ matrix.scale_factor }}
 
-          SF="${{ env.SCALE_FACTOR }}"
+          SF="${{ matrix.scale_factor }}"
           if [ "$SF" = "0.1" ]; then
             HF_SF="sf0.1"
           else
             HF_SF="sf${SF}"
           fi
 
-          cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ env.SCALE_FACTOR }}/
+          cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ matrix.scale_factor }}/
 
           echo "Downloaded data structure:"
-          find benchmark-data-sf${{ env.SCALE_FACTOR }} -type f -name "*.parquet" | head -20
+          find benchmark-data-sf${{ matrix.scale_factor }} -type f -name "*.parquet" | head -20
           echo ""
           echo "Directory contents:"
-          ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/
+          ls -la benchmark-data-sf${{ matrix.scale_factor }}/
           echo ""
           echo "Total size:"
-          du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
+          du -sh benchmark-data-sf${{ matrix.scale_factor }}/
 
       - name: Show cached data info
         if: steps.cache-data.outputs.cache-hit == 'true'
         run: |
           echo "Using cached benchmark data"
           echo "Directory contents:"
-          ls -la benchmark-data-sf${{ env.SCALE_FACTOR }}/
+          ls -la benchmark-data-sf${{ matrix.scale_factor }}/
           echo ""
           echo "Total size:"
-          du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
+          du -sh benchmark-data-sf${{ matrix.scale_factor }}/
+
+  # ── Per-query benchmark jobs ──
+  # Each query runs in its own job (separate runner) so that if one query
+  # OOMs and kills the runner, the remaining queries still execute.
+  # max-parallel: 2 caps each engine at two concurrent query jobs to avoid
+  # overloading the CI and to keep results orderly.
 
   benchmark-duckdb:
-    name: Benchmark DuckDB (SF${{ github.event.inputs.scale_factor || '1' }})
-    needs: download-data
+    name: DuckDB ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+    needs: [parse-scale-factors, download-data]
     runs-on: ubuntu-latest
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb')
+    strategy:
+      matrix:
+        scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+        query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+      fail-fast: false
+      max-parallel: 2  # at most 2 matrix jobs at once (total, not per scale factor)
     steps:
       - uses: actions/checkout@v6
 
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v5
         with:
-          path: benchmark-data-sf${{ env.SCALE_FACTOR }}
-          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+          path: benchmark-data-sf${{ matrix.scale_factor }}
+          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }}
           fail-on-cache-miss: true
 
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
           python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: .github/workflows/benchmark.yml
 
       - name: Install dependencies
         run: |
-          echo "=== DuckDB Installation Parameters ==="
-          echo "DUCKDB_NIGHTLY: ${{ env.DUCKDB_NIGHTLY }}"
-          echo "DUCKDB_VERSION: ${{ env.DUCKDB_VERSION }}"
-          echo "======================================"
           if [ "${{ env.DUCKDB_NIGHTLY }}" = "true" ]; then
-            # Use --pre to install pre-release dev builds (e.g., 1.4.4.dev48)
-            # Constraint <1.5.0 ensures we get 1.4.x branch dev builds
-            pip install "duckdb<1.5.0" --pre pyarrow pandas
+            pip install duckdb --pre pyarrow pandas
           elif [ -n "${{ env.DUCKDB_VERSION }}" ]; then
             pip install "duckdb==${{ env.DUCKDB_VERSION }}" pyarrow pandas
           else
@@ -222,47 +243,57 @@ jobs:
           fi
           echo "Installed DuckDB version: $(python -c 'import duckdb; print(duckdb.__version__)')"
 
-      - name: Pre-install DuckDB spatial extension
+      - name: Install DuckDB spatial extension
         run: |
-          # Dev builds don't have spatial extension in core_nightly, so always use default repo
-          python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')"
+          # INSTALL is a no-op on DuckDB 1.5 stable (spatial bundled natively) but required for nightly builds.
+          python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); con.execute('LOAD spatial'); print('DuckDB spatial extension installed and loaded')"
 
-      - name: Run DuckDB benchmark
+      - name: Run DuckDB ${{ matrix.query }}
         run: |
           python benchmark/run_benchmark.py \
-            --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+            --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
             --engines duckdb \
-            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --queries ${{ matrix.query }} \
             --runs ${{ env.BENCHMARK_RUNS }} \
-            --scale-factor ${{ env.SCALE_FACTOR }} \
-            --output duckdb_results.json
+            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --scale-factor ${{ matrix.scale_factor }} \
+            --output duckdb_${{ matrix.query }}_results.json
 
       - name: Upload results
+        if: always() && hashFiles(format('duckdb_{0}_results.json', matrix.query)) != ''
         uses: actions/upload-artifact@v6
         with:
-          name: duckdb-results-sf${{ env.SCALE_FACTOR }}
-          path: duckdb_results.json
+          name: duckdb-${{ matrix.query }}-results-sf${{ matrix.scale_factor }}
+          path: duckdb_${{ matrix.query }}_results.json
           retention-days: 30
 
   benchmark-geopandas:
-    name: Benchmark GeoPandas (SF${{ github.event.inputs.scale_factor || '1' }})
-    needs: download-data
+    name: GeoPandas ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+    needs: [parse-scale-factors, download-data]
     runs-on: ubuntu-latest
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas')
+    strategy:
+      matrix:
+        scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+        query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+      fail-fast: false
+      max-parallel: 2
     steps:
       - uses: actions/checkout@v6
 
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v5
         with:
-          path: benchmark-data-sf${{ env.SCALE_FACTOR }}
-          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+          path: benchmark-data-sf${{ matrix.scale_factor }}
+          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }}
           fail-on-cache-miss: true
 
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
           python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: .github/workflows/benchmark.yml
 
       - name: Install dependencies
         run: |
@@ -273,51 +304,56 @@ jobs:
           fi
           echo "Installed GeoPandas version: $(python -c 'from importlib.metadata import version; print(version("geopandas"))')"
 
-      - name: Run GeoPandas benchmark
+      - name: Run GeoPandas ${{ matrix.query }}
         run: |
           python benchmark/run_benchmark.py \
-            --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+            --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
             --engines geopandas \
-            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --queries ${{ matrix.query }} \
             --runs ${{ env.BENCHMARK_RUNS }} \
-            --scale-factor ${{ env.SCALE_FACTOR }} \
-            --output geopandas_results.json
+            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --scale-factor ${{ matrix.scale_factor }} \
+            --output geopandas_${{ matrix.query }}_results.json
 
       - name: Upload results
+        if: always() && hashFiles(format('geopandas_{0}_results.json', matrix.query)) != ''
         uses: actions/upload-artifact@v6
         with:
-          name: geopandas-results-sf${{ env.SCALE_FACTOR }}
-          path: geopandas_results.json
+          name: geopandas-${{ matrix.query }}-results-sf${{ matrix.scale_factor }}
+          path: geopandas_${{ matrix.query }}_results.json
           retention-days: 30
 
   benchmark-sedonadb:
-    name: Benchmark SedonaDB (SF${{ github.event.inputs.scale_factor || '1' }})
-    needs: download-data
+    name: SedonaDB ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+    needs: [parse-scale-factors, download-data]
     runs-on: ubuntu-latest
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb')
+    strategy:
+      matrix:
+        scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+        query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+      fail-fast: false
+      max-parallel: 2
     steps:
       - uses: actions/checkout@v6
 
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v5
         with:
-          path: benchmark-data-sf${{ env.SCALE_FACTOR }}
-          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+          path: benchmark-data-sf${{ matrix.scale_factor }}
+          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }}
           fail-on-cache-miss: true
 
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
           python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: .github/workflows/benchmark.yml
 
       - name: Install dependencies
         run: |
-          echo "=== SedonaDB Installation Parameters ==="
-          echo "SEDONADB_NIGHTLY: ${{ env.SEDONADB_NIGHTLY }}"
-          echo "SEDONADB_VERSION: ${{ env.SEDONADB_VERSION }}"
-          echo "========================================"
           if [ "${{ env.SEDONADB_NIGHTLY }}" = "true" ]; then
-            # Use Gemfury as primary index and --pre to install nightly alpha builds (e.g., 0.3.0a69)
             pip install "sedonadb[geopandas]" pandas pyarrow pyproj \
               --pre \
               --index-url https://repo.fury.io/sedona-nightlies/ \
@@ -329,42 +365,52 @@ jobs:
           fi
           echo "Installed SedonaDB version: $(python -c 'from importlib.metadata import version; print(version("sedonadb"))')"
 
-      - name: Run SedonaDB benchmark
+      - name: Run SedonaDB ${{ matrix.query }}
         run: |
           python benchmark/run_benchmark.py \
-            --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+            --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
             --engines sedonadb \
-            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --queries ${{ matrix.query }} \
             --runs ${{ env.BENCHMARK_RUNS }} \
-            --scale-factor ${{ env.SCALE_FACTOR }} \
-            --output sedonadb_results.json
+            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --scale-factor ${{ matrix.scale_factor }} \
+            --output sedonadb_${{ matrix.query }}_results.json
 
       - name: Upload results
+        if: always() && hashFiles(format('sedonadb_{0}_results.json', matrix.query)) != ''
         uses: actions/upload-artifact@v6
         with:
-          name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
-          path: sedonadb_results.json
+          name: sedonadb-${{ matrix.query }}-results-sf${{ matrix.scale_factor }}
+          path: sedonadb_${{ matrix.query }}_results.json
           retention-days: 30
 
   benchmark-spatial-polars:
-    name: Benchmark Spatial Polars (SF${{ github.event.inputs.scale_factor || '1' }})
-    needs: download-data
+    name: Spatial Polars ${{ matrix.query }} (SF${{ matrix.scale_factor }})
+    needs: [parse-scale-factors, download-data]
     runs-on: ubuntu-latest
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars')
+    strategy:
+      matrix:
+        scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
+        query: [q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12]
+      fail-fast: false
+      max-parallel: 2
     steps:
       - uses: actions/checkout@v6
 
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v5
         with:
-          path: benchmark-data-sf${{ env.SCALE_FACTOR }}
-          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
+          path: benchmark-data-sf${{ matrix.scale_factor }}
+          key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ matrix.scale_factor }}
           fail-on-cache-miss: true
 
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
           python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: .github/workflows/benchmark.yml
 
       - name: Install dependencies
         run: |
@@ -375,61 +421,42 @@ jobs:
           fi
           echo "Installed Spatial Polars version: $(python -c 'from importlib.metadata import version; print(version("spatial-polars"))')"
 
-      - name: Run Spatial Polars benchmark
+      - name: Run Spatial Polars ${{ matrix.query }}
         run: |
           python benchmark/run_benchmark.py \
-            --data-dir benchmark-data-sf${{ env.SCALE_FACTOR }} \
+            --data-dir benchmark-data-sf${{ matrix.scale_factor }} \
             --engines spatial_polars \
-            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --queries ${{ matrix.query }} \
             --runs ${{ env.BENCHMARK_RUNS }} \
-            --scale-factor ${{ env.SCALE_FACTOR }} \
-            --output spatial_polars_results.json
+            --timeout ${{ env.QUERY_TIMEOUT }} \
+            --scale-factor ${{ matrix.scale_factor }} \
+            --output spatial_polars_${{ matrix.query }}_results.json
 
       - name: Upload results
+        if: always() && hashFiles(format('spatial_polars_{0}_results.json', matrix.query)) != ''
         uses: actions/upload-artifact@v6
         with:
-          name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
-          path: spatial_polars_results.json
+          name: spatial_polars-${{ matrix.query }}-results-sf${{ matrix.scale_factor }}
+          path: spatial_polars_${{ matrix.query }}_results.json
           retention-days: 30
 
   summarize-results:
-    name: Summarize Results (SF${{ github.event.inputs.scale_factor || '1' }})
-    needs: [benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb, benchmark-spatial-polars]
-    if: always() && (needs.benchmark-duckdb.result == 'success' || needs.benchmark-geopandas.result == 'success' || needs.benchmark-sedonadb.result == 'success' || needs.benchmark-spatial-polars.result == 'success')
+    name: Summarize Results (SF${{ matrix.scale_factor }})
+    needs: [parse-scale-factors, benchmark-duckdb, benchmark-geopandas, benchmark-sedonadb, benchmark-spatial-polars]
+    if: always() && (needs.benchmark-duckdb.result != 'cancelled' || needs.benchmark-geopandas.result != 'cancelled' || needs.benchmark-sedonadb.result != 'cancelled' || needs.benchmark-spatial-polars.result != 'cancelled')
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        scale_factor: ${{ fromJson(needs.parse-scale-factors.outputs.matrix) }}
     steps:
       - uses: actions/checkout@v6
 
-      - name: Download DuckDB results
-        if: needs.benchmark-duckdb.result == 'success'
-        uses: actions/download-artifact@v7
-        with:
-          name: duckdb-results-sf${{ env.SCALE_FACTOR }}
-          path: results
-        continue-on-error: true
-
-      - name: Download GeoPandas results
-        if: needs.benchmark-geopandas.result == 'success'
-        uses: actions/download-artifact@v7
-        with:
-          name: geopandas-results-sf${{ env.SCALE_FACTOR }}
-          path: results
-        continue-on-error: true
-
-      - name: Download SedonaDB results
-        if: needs.benchmark-sedonadb.result == 'success'
-        uses: actions/download-artifact@v7
-        with:
-          name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
-          path: results
-        continue-on-error: true
-
-      - name: Download Spatial Polars results
-        if: needs.benchmark-spatial-polars.result == 'success'
+      - name: Download all results for this scale factor
         uses: actions/download-artifact@v7
         with:
-          name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
+          pattern: '*-results-sf${{ matrix.scale_factor }}'
           path: results
+          merge-multiple: true
         continue-on-error: true
 
       - name: Setup Python
@@ -443,6 +470,7 @@ jobs:
             --results-dir results \
             --timeout ${{ env.QUERY_TIMEOUT }} \
             --runs ${{ env.BENCHMARK_RUNS }} \
+            --engines "$BENCHMARK_ENGINES" \
             --output benchmark_summary.md
 
       - name: Display summary
@@ -454,7 +482,7 @@ jobs:
       - name: Upload combined results
         uses: actions/upload-artifact@v6
         with:
-          name: benchmark-summary-sf${{ env.SCALE_FACTOR }}
+          name: benchmark-summary-sf${{ matrix.scale_factor }}
           path: |
             results/
             benchmark_summary.md
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py
index e05e237..ce438ae 100755
--- a/benchmark/run_benchmark.py
+++ b/benchmark/run_benchmark.py
@@ -428,6 +428,7 @@ def run_benchmark(
     timeout: int,
     scale_factor: float,
     runs: int = 3,
+    output_file: str | None = None,
 ) -> BenchmarkSuite:
     """Generic benchmark runner for any engine.
 
@@ -438,6 +439,9 @@ def run_benchmark(
 
     If runs > 1 and the first run succeeds, additional runs are performed
     and the average time is reported for fair comparison.
+
+    If output_file is provided, results are saved incrementally after each
+    query so that partial results survive if the runner crashes mid-way.
     """
 
     from importlib.metadata import version as pkg_version
@@ -483,60 +487,94 @@ def run_benchmark(
     all_queries = config["queries_getter"]()
     engine_class = config["class"]
 
-    for query_name, query_sql in all_queries.items():
-        if queries and query_name not in queries:
-            continue
+    # Determine which queries will be run
+    query_items = [
+        (qname, qsql) for qname, qsql in all_queries.items()
+        if not queries or qname in queries
+    ]
 
-        print(f"  Running {query_name}...", end=" ", flush=True)
+    # Pre-populate all queries as "not_started" so even a total crash
+    # (e.g. OOM killing the runner) leaves a file showing what was attempted
+    for query_name, _ in query_items:
+        suite.results.append(BenchmarkResult(
+            query=query_name,
+            engine=engine,
+            time_seconds=None,
+            row_count=None,
+            status="not_started",
+            error_message=None,
+        ))
+    if output_file:
+        save_results([suite], output_file)
 
-        # First run
-        result = run_query_isolated(
-            engine_class=engine_class,
-            engine_name=engine,
-            data_paths=data_paths,
-            query_name=query_name,
-            query_sql=query_sql,
-            timeout=timeout,
-        )
+    # Install a SIGTERM handler so we flush results if the runner is shutting down
+    def _sigterm_handler(signum, frame):
+        print(f"\nReceived signal {signum}, saving partial results...", flush=True)
+        if output_file:
+            save_results([suite], output_file)
+        sys.exit(128 + signum)
 
-        # If first run succeeded and we want multiple runs, do additional runs
-        if result.status == "success" and runs > 1:
-            run_times = [result.time_seconds]
-
-            for run_num in range(2, runs + 1):
-                additional_result = run_query_isolated(
-                    engine_class=engine_class,
-                    engine_name=engine,
-                    data_paths=data_paths,
-                    query_name=query_name,
-                    query_sql=query_sql,
-                    timeout=timeout,
-                )
-                if additional_result.status == "success":
-                    run_times.append(additional_result.time_seconds)
-                else:
-                    # If any subsequent run fails, just use successful runs
-                    break
-
-            # Calculate average of all successful runs
-            avg_time = round(sum(run_times) / len(run_times), 2)
-            result = BenchmarkResult(
-                query=query_name,
-                engine=engine,
-                time_seconds=avg_time,
-                row_count=result.row_count,
-                status="success",
-                error_message=None,
+    prev_handler = signal.signal(signal.SIGTERM, _sigterm_handler)
+
+    try:
+        for idx, (query_name, query_sql) in enumerate(query_items):
+            print(f"  Running {query_name}...", end=" ", flush=True)
+
+            # First run
+            result = run_query_isolated(
+                engine_class=engine_class,
+                engine_name=engine,
+                data_paths=data_paths,
+                query_name=query_name,
+                query_sql=query_sql,
+                timeout=timeout,
             )
-            print(f"{avg_time}s avg ({len(run_times)} runs, {result.row_count} rows)")
-        elif result.status == "success":
-            print(f"{result.time_seconds}s ({result.row_count} rows)")
-        else:
-            print(f"{result.status.upper()}: {result.error_message}")
 
-        suite.results.append(result)
-        if result.status == "success":
-            suite.total_time += result.time_seconds
+            # If first run succeeded and we want multiple runs, do additional runs
+            if result.status == "success" and runs > 1:
+                run_times = [result.time_seconds]
+
+                for run_num in range(2, runs + 1):
+                    additional_result = run_query_isolated(
+                        engine_class=engine_class,
+                        engine_name=engine,
+                        data_paths=data_paths,
+                        query_name=query_name,
+                        query_sql=query_sql,
+                        timeout=timeout,
+                    )
+                    if additional_result.status == "success":
+                        run_times.append(additional_result.time_seconds)
+                    else:
+                        # If any subsequent run fails, just use successful runs
+                        break
+
+                # Calculate average of all successful runs
+                avg_time = round(sum(run_times) / len(run_times), 2)
+                result = BenchmarkResult(
+                    query=query_name,
+                    engine=engine,
+                    time_seconds=avg_time,
+                    row_count=result.row_count,
+                    status="success",
+                    error_message=None,
+                )
+                print(f"{avg_time}s avg ({len(run_times)} runs, {result.row_count} rows)")
+            elif result.status == "success":
+                print(f"{result.time_seconds}s ({result.row_count} rows)")
+            else:
+                print(f"{result.status.upper()}: {result.error_message}")
+
+            # Replace the pre-populated "not_started" entry with the actual result
+            suite.results[idx] = result
+            if result.status == "success":
+                suite.total_time += result.time_seconds
+
+            # Save partial results after each query so they survive crashes
+            if output_file:
+                save_results([suite], output_file)
+    finally:
+        signal.signal(signal.SIGTERM, prev_handler)
 
     return suite
 
@@ -629,7 +667,7 @@ def main():
         print(f"  {table}: {path}")
 
     results = [
-        run_benchmark(engine, data_paths, queries, args.timeout, args.scale_factor, args.runs)
+        run_benchmark(engine, data_paths, queries, args.timeout, args.scale_factor, args.runs, args.output)
         for engine in engines
     ]
 
diff --git a/benchmark/summarize_results.py b/benchmark/summarize_results.py
index a52fcc8..74b7a3f 100755
--- a/benchmark/summarize_results.py
+++ b/benchmark/summarize_results.py
@@ -26,8 +26,20 @@
 from pathlib import Path
 
 
-def load_results(results_dir: str) -> dict:
-    """Load all JSON result files from a directory."""
+def load_results(results_dir: str, expected_engines: list[str] | None = None) -> dict:
+    """Load all JSON result files from a directory.
+
+    Supports two layouts:
+    1. One file per engine (e.g., duckdb_results.json with all queries)
+    2. One file per query (e.g., duckdb_q1_results.json with a single query)
+
+    Per-query files are merged into a single suite per engine. If multiple files
+    contain results for the same engine, their query results are combined.
+
+    If expected_engines is provided, engines that were expected to run but have
+    no results file will be included with all queries marked as 'not_started'.
+    This handles the case where a runner was OOM-killed before uploading results.
+    """
     results = {}
     results_path = Path(results_dir)
 
@@ -36,7 +48,55 @@ def load_results(results_dir: str) -> dict:
             data = json.load(f)
             for suite in data.get("results", []):
                 engine = suite["engine"]
-                results[engine] = suite
+                if engine not in results:
+                    results[engine] = suite
+                else:
+                    # Merge query results from multiple files for the same engine
+                    existing_queries = {r["query"] for r in results[engine].get("results", [])}
+                    for r in suite.get("results", []):
+                        if r["query"] not in existing_queries:
+                            results[engine]["results"].append(r)
+                            existing_queries.add(r["query"])
+                        elif r.get("status") != "not_started":
+                            # Replace not_started placeholder with actual result
+                            results[engine]["results"] = [
+                                r if existing["query"] == r["query"] else existing
+                                for existing in results[engine]["results"]
+                            ]
+
+    # For expected engines with no results, create placeholder entries
+    if expected_engines:
+        # Determine the full query list from engines that did report results
+        all_queries = set()
+        scale_factor = None
+        for engine_data in results.values():
+            if scale_factor is None:
+                scale_factor = engine_data.get("scale_factor", 1)
+            for r in engine_data.get("results", []):
+                all_queries.add(r["query"])
+
+        # Default to q1-q12 if no engine reported any results
+        if not all_queries:
+            all_queries = {f"q{i}" for i in range(1, 13)}
+
+        for engine in expected_engines:
+            if engine not in results:
+                results[engine] = {
+                    "engine": engine,
+                    "version": "unknown",
+                    "scale_factor": scale_factor or 1,
+                    "timestamp": datetime.now(timezone.utc).isoformat(),
+                    "results": [
+                        {
+                            "query": q,
+                            "status": "not_started",
+                            "time_seconds": None,
+                            "row_count": None,
+                            "error_message": "Runner was killed before completing this query (likely OOM)",
+                        }
+                        for q in sorted(all_queries, key=lambda x: int(x[1:]))
+                    ],
+                }
 
     return results
 
@@ -151,29 +211,38 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
                 row += " ⏱️ TIMEOUT |"
             elif status == "error":
                 row += " ❌ ERROR |"
+            elif status == "not_started":
+                row += " 💀 OOM |"
             else:
                 row += " — |"
         lines.append(row)
 
-    # Win count summary
+    # Win count and completion summary
     win_counts = {engine: 0 for engine in engines}
+    completed_counts = {engine: 0 for engine in engines}
+    total_queries = len(all_queries)
     for query in all_queries:
         winner = get_winner(query, data, engines)
         if winner:
             win_counts[winner] += 1
+        for engine in engines:
+            result = data.get(engine, {}).get(query, {})
+            if result.get("status") == "success":
+                completed_counts[engine] += 1
 
     lines.extend([
         "",
         "## 🥇 Performance Summary",
         "",
-        "| Engine | Wins |",
-        "|--------|:----:|",
+        "| Engine | Completed | Wins |",
+        "|--------|:---------:|:----:|",
     ])
 
     for engine in sorted(engines, key=lambda e: win_counts[e], reverse=True):
         icon_name = engine_icons.get(engine, engine.title())
         wins = win_counts[engine]
-        lines.append(f"| {icon_name} | {wins} |")
+        completed = completed_counts[engine]
+        lines.append(f"| {icon_name} | {completed}/{total_queries} | {wins} |")
 
     # Detailed results section (collapsible)
     lines.extend([
@@ -203,6 +272,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
                 "success": "✅",
                 "error": "❌",
                 "timeout": "⏱️",
+                "not_started": "💀",
             }.get(status, "❓")
 
             lines.append(f"| {query.upper()} | {time_str} | {status_emoji} | {row_str} |")
@@ -219,14 +289,24 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
 
     for engine in engines:
         engine_errors = []
+        not_started_queries = []
         for query in all_queries:
             result = data.get(engine, {}).get(query, {})
-            if result.get("status") in ("error", "timeout"):
+            status = result.get("status")
+            if status in ("error", "timeout"):
                 error_msg = result.get("error_message", "No details available")
                 # Truncate long error messages
                 if len(error_msg) > 200:
                     error_msg = error_msg[:200] + "..."
                 engine_errors.append(f"- **{query.upper()}**: `{error_msg}`")
+            elif status == "not_started":
+                not_started_queries.append(query.upper())
+
+        if not_started_queries:
+            engine_errors.append(
+                f"- **{', '.join(not_started_queries)}**: "
+                f"`Could not complete these queries, likely due to OOM (runner was killed)`"
+            )
 
         if engine_errors:
             has_errors = True
@@ -248,6 +328,7 @@ def generate_markdown_summary(results: dict, output_file: str, query_timeout: in
         "| **bold** | Fastest for this query |",
         "| ⏱️ TIMEOUT | Query exceeded timeout |",
         "| ❌ ERROR | Query failed |",
+        "| 💀 OOM | Could not run, likely due to out-of-memory (runner killed) |",
         "",
         f"*Generated by [SpatialBench](https://github.com/apache/sedona-spatialbench) on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}*",
     ])
@@ -289,10 +370,18 @@ def main():
         default=3,
         help="Number of runs per query (for reporting)",
     )
+    parser.add_argument(
+        "--engines",
+        type=str,
+        default=None,
+        help="Comma-separated list of expected engines (e.g., 'duckdb,geopandas,sedonadb,spatial_polars'). "
+        "Engines that were expected but have no results will be shown as OOM/runner-killed.",
+    )
 
     args = parser.parse_args()
 
-    results = load_results(args.results_dir)
+    expected_engines = [e.strip() for e in args.engines.split(",") if e.strip()] if args.engines else None  # ignore blank names
+    results = load_results(args.results_dir, expected_engines=expected_engines)
 
     if not results:
         print(f"No results found in {args.results_dir}")