
Commit 15bcb4c

Author: Giacomo Caria (committed)
Merge remote-tracking branch 'upstream/main' into add_boundaries_keyword_resample
2 parents 33a4d41 + e49cfc4 · commit 15bcb4c

File tree

12 files changed: +80, -31 lines


.pre-commit-config.yaml

Lines changed: 13 additions & 12 deletions
@@ -24,24 +24,23 @@ repos:
       - id: rst-inline-touching-normal
       - id: text-unicode-replacement-char
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.13.3
+    rev: v0.14.1
     hooks:
       - id: ruff-check
         args: ["--fix", "--show-fixes"]
       - id: ruff-format
-  # Disabled: blackdoc v0.4.3 has compatibility issues with Python 3.13
-  # Re-enable when blackdoc is updated to support Python 3.13
-  # - repo: https://github.com/keewis/blackdoc
-  #   rev: v0.4.3
-  #   hooks:
-  #     - id: blackdoc
-  #       exclude: "generate_aggregations.py"
-  #       additional_dependencies: ["black==24.8.0"]
+  - repo: https://github.com/keewis/blackdoc
+    rev: v0.4.5
+    hooks:
+      - id: blackdoc
+        exclude: "generate_aggregations.py"
+        # make sure this is the most recent version of black
+        additional_dependencies: ["black==25.9.0"]
   - repo: https://github.com/rbubley/mirrors-prettier
     rev: v3.6.2
     hooks:
       - id: prettier
-        args: [--cache-location=.prettier_cache/cache]
+        args: ["--cache-location=.prettier_cache/cache"]
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.18.2
     hooks:
@@ -61,20 +60,22 @@ repos:
             numpy,
           ]
   - repo: https://github.com/citation-file-format/cff-converter-python
-    rev: ebf0b5e44d67f8beaa1cd13a0d0393ea04c6058d
+    rev: 5295f87c0e261da61a7b919fc754e3a77edd98a7
     hooks:
       - id: validate-cff
   - repo: https://github.com/ComPWA/taplo-pre-commit
     rev: v0.9.3
     hooks:
       - id: taplo-format
         args: ["--option", "array_auto_collapse=false"]
+      - id: taplo-lint
+        args: ["--no-schema"]
   - repo: https://github.com/abravalheri/validate-pyproject
     rev: v0.24.1
     hooks:
       - id: validate-pyproject
         additional_dependencies: ["validate-pyproject-schema-store[all]"]
   - repo: https://github.com/adhtruong/mirrors-typos
-    rev: v1.37.2
+    rev: v1.38.1
     hooks:
       - id: typos

asv_bench/asv.conf.json

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
     // If missing or the empty string, the tool will be automatically
     // determined by looking for tools on the PATH environment
     // variable.
-    "environment_type": "mamba",
+    "environment_type": "rattler",
     "conda_channels": ["conda-forge"],
 
     // timeout in seconds for installing any dependencies in environment
@@ -76,7 +76,7 @@
     // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185
     "build_command": [
         "python -m build",
-        "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
+        "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
     ],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional

asv_bench/benchmarks/combine.py

Lines changed: 16 additions & 0 deletions
@@ -5,6 +5,22 @@
 from . import requires_dask
 
 
+class Concat1d:
+    """Benchmark concatenating large datasets"""
+
+    def setup(self) -> None:
+        self.data_arrays = [
+            xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
+            for _ in range(10)
+        ]
+
+    def time_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+    def peakmem_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+
 class Combine1d:
     """Benchmark concatenating and merging large datasets"""
 
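For readers who want to reproduce the measurement outside of asv, here is a minimal standalone sketch of what ``Concat1d`` exercises (only numpy and xarray are assumed to be installed):

import numpy as np
import xarray as xr

# Ten 4 MiB int8 arrays sharing the dimension "x", mirroring Concat1d.setup.
data_arrays = [
    xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
    for _ in range(10)
]

# The call timed by time_concat and profiled by peakmem_concat.
combined = xr.concat(data_arrays, dim="x")
print(combined.sizes["x"])  # 41943040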

ci/requirements/environment-benchmark.yml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ dependencies:
   - numba
   - numbagg
   - numexpr
+  - py-rattler
   - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105
   - opt_einsum
   - packaging

doc/examples/apply_ufunc_vectorize_1d.ipynb

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
     "\n",
     "### Load data\n",
     "\n",
-    "First lets load an example dataset"
+    "First let's load an example dataset"
    ]
   },
   {

doc/user-guide/hierarchical-data.rst

Lines changed: 1 addition & 1 deletion
@@ -426,7 +426,7 @@ We can use :py:meth:`xarray.DataTree.match` for this:
 We can also subset trees by the contents of the nodes.
 :py:meth:`xarray.DataTree.filter` retains only the nodes of a tree that meet a certain condition.
 For example, we could recreate the Simpson's family tree with the ages of each individual, then filter for only the adults:
-First lets recreate the tree but with an ``age`` data variable in every node:
+First let's recreate the tree but with an ``age`` data variable in every node:
 
 .. jupyter-execute::
 

doc/user-guide/io.rst

Lines changed: 1 addition & 1 deletion
@@ -128,7 +128,7 @@ The backends are tried in order: **netcdf4 → h5netcdf → scipy → pydap →
 .. code-block:: python
 
     # Prefer h5netcdf over netcdf4
-    xr.set_options(netcdf_engine_order=['h5netcdf', 'netcdf4', 'scipy'])
+    xr.set_options(netcdf_engine_order=["h5netcdf", "netcdf4", "scipy"])
 
 See :ref:`options` for more details on configuration options.
 
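A brief usage sketch of the option documented above, assuming the ``netcdf_engine_order`` option described on this page and a hypothetical file name; ``xr.set_options`` can also be used as a context manager so the preference applies only temporarily:

import xarray as xr

# Hypothetical file; the engine preference applies only inside the block.
with xr.set_options(netcdf_engine_order=["h5netcdf", "netcdf4", "scipy"]):
    ds = xr.open_dataset("example.nc")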

doc/whats-new.rst

Lines changed: 9 additions & 1 deletion
@@ -32,12 +32,20 @@ Bug Fixes
 ~~~~~~~~~
 - Fix h5netcdf backend for format=None, use same rule as netcdf4 backend (:pull:`10859`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_
-
 - ``netcdf4`` and ``pydap`` backends now use stricter URL detection to avoid incorrectly claiming
   remote URLs. The ``pydap`` backend now only claims URLs with explicit DAP protocol indicators
   (``dap2://`` or ``dap4://`` schemes, or ``/dap2/`` or ``/dap4/`` in the URL path). This prevents
   both backends from claiming remote Zarr stores and other non-DAP URLs without an explicit
   ``engine=`` argument. (:pull:`10804`). By `Ian Hunt-Isaak <https://github.com/ianhi>`_.
+- Fix indexing with empty arrays for scipy & h5netcdf backends which now resolves to empty slices (:issue:`10867`, :pull:`10870`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_
+
+Performance
+~~~~~~~~~~~
+
+- Speedup and reduce memory usage of :py:func:`concat`. Magnitude of improvement scales
+  with size of the concatenation dimension. By `Deepak Cherian <https://github.com/dcherian>`_.
+  :issue:`10864` :pull:`10866`.
 
 Documentation
 ~~~~~~~~~~~~~

xarray/backends/api.py

Lines changed: 4 additions & 4 deletions
@@ -1420,10 +1420,10 @@ def open_mfdataset(
     chunks : int, dict, 'auto' or None, optional
         Dictionary with keys given by dimension names and values given by chunk sizes.
         In general, these should divide the dimensions of each dataset. If int, chunk
-        each dimension by ``chunks``. By default, chunks will be chosen to load entire
-        input files into memory at once. This has a major impact on performance: please
-        see the full documentation for more details [2]_. This argument is evaluated
-        on a per-file basis, so chunk sizes that span multiple files will be ignored.
+        each dimension by ``chunks``. By default, chunks will be chosen to match the
+        chunks on disk. This may impact performance: please see the full documentation
+        for more details [2]_. This argument is evaluated on a per-file basis, so chunk
+        sizes that span multiple files will be ignored.
     concat_dim : str, DataArray, Index or a Sequence of these or None, optional
         Dimensions to concatenate files along. You only need to provide this argument
         if ``combine='nested'``, and if any of the dimensions along which you want to
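A hedged usage sketch of the ``chunks`` behaviour described in the updated docstring (the glob pattern and the ``time`` dimension are hypothetical, and dask is assumed to be installed):

import xarray as xr

# With the default, chunk sizes are chosen to match the chunking on disk.
ds_default = xr.open_mfdataset("data/*.nc")

# An explicit mapping overrides that default; it is evaluated per file,
# so chunk sizes spanning multiple files are ignored.
ds_explicit = xr.open_mfdataset("data/*.nc", chunks={"time": 100})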

xarray/core/indexing.py

Lines changed: 6 additions & 2 deletions
@@ -1366,15 +1366,19 @@ def _decompose_outer_indexer(
     gains = [
         (
             (np.max(k) - np.min(k) + 1.0) / len(np.unique(k))
-            if isinstance(k, np.ndarray)
+            if isinstance(k, np.ndarray) and k.size != 0
             else 0
         )
         for k in indexer_elems
     ]
     array_index = np.argmax(np.array(gains)) if len(gains) > 0 else None
 
     for i, (k, s) in enumerate(zip(indexer_elems, shape, strict=False)):
-        if isinstance(k, np.ndarray) and i != array_index:
+        if isinstance(k, np.ndarray) and k.size == 0:
+            # empty np.ndarray key is converted to empty slice
+            # see https://github.com/pydata/xarray/issues/10867
+            backend_indexer.append(slice(0, 0))
+        elif isinstance(k, np.ndarray) and i != array_index:
             # np.ndarray key is converted to slice that covers the entire
             # entries of this key.
             backend_indexer.append(slice(np.min(k), np.max(k) + 1))
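The intent of the new branch can be checked with plain numpy, independent of xarray's internals: an empty integer key selects nothing, which is exactly what the empty slice handed to the backend reproduces (a sketch; building ``slice(np.min(k), np.max(k) + 1)`` would fail on an empty key because ``np.min`` has no identity for zero-size arrays):

import numpy as np

arr = np.arange(10)
empty_key = np.array([], dtype=np.intp)

# An empty integer array selects an empty result ...
by_array = arr[empty_key]
# ... and the decomposed indexer now mimics that with an empty slice.
by_slice = arr[slice(0, 0)]

assert by_array.shape == by_slice.shape == (0,)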
