Skip to content

Commit 71bbab0

Browse files
author
Giacomo Caria
committed
apply changes
1 parent b5e4b0e commit 71bbab0

File tree

5 files changed

+109
-2
lines changed

5 files changed

+109
-2
lines changed

xarray/core/common.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,16 @@
66
from contextlib import suppress
77
from html import escape
88
from textwrap import dedent
9-
from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar, Union, overload
9+
from typing import (
10+
TYPE_CHECKING,
11+
Any,
12+
Concatenate,
13+
Literal,
14+
ParamSpec,
15+
TypeVar,
16+
Union,
17+
overload,
18+
)
1019

1120
import numpy as np
1221
import pandas as pd
@@ -925,6 +934,7 @@ def _resample(
925934
offset: pd.Timedelta | datetime.timedelta | str | None,
926935
origin: str | DatetimeLike,
927936
restore_coord_dims: bool | None,
937+
boundaries: Literal["exact", "trim"] | None = None,
928938
**indexer_kwargs: ResampleCompatible | Resampler,
929939
) -> T_Resample:
930940
"""Returns a Resample object for performing resampling operations.
@@ -960,6 +970,11 @@ def _resample(
960970
restore_coord_dims : bool, optional
961971
If True, also restore the dimension order of multi-dimensional
962972
coordinates.
973+
boundaries : {"exact", "trim"}, optional
974+
How to handle boundaries when the data doesn't evenly fit the resampling
975+
frequency. If 'exact', a ValueError will be raised if the data doesn't
976+
evenly fit. If 'trim', incomplete periods are dropped. If None (default),
977+
incomplete periods are kept, exactly as they were before this option existed.
963978
**indexer_kwargs : {dim: freq}
964979
The keyword arguments form of ``indexer``.
965980
One of indexer or indexer_kwargs must be provided.
@@ -1107,8 +1122,45 @@ def _resample(
11071122
grouper: Resampler
11081123
if isinstance(freq, ResampleCompatible):
11091124
grouper = TimeResampler(
1110-
freq=freq, closed=closed, label=label, origin=origin, offset=offset
1125+
freq=freq,
1126+
closed=closed,
1127+
label=label,
1128+
origin=origin,
1129+
offset=offset,
1130+
boundaries=boundaries,
11111131
)
1132+
1133+
# Apply trim logic at the resample level if needed
1134+
if boundaries == "trim":
1135+
# First, get the resampling periods to identify incomplete ones
1136+
from xarray.core.groupby import ResolvedGrouper
1137+
1138+
temp_grouper = ResolvedGrouper(grouper, group, self)
1139+
temp_encoded = temp_grouper.encoded
1140+
1141+
# Count data points in each period
1142+
codes = temp_encoded.codes
1143+
counts = np.bincount(codes.values)
1144+
1145+
if len(counts) > 0:
1146+
# Find the most common count (expected points per period)
1147+
unique_counts, count_frequencies = np.unique(
1148+
counts, return_counts=True
1149+
)
1150+
most_common_count = unique_counts[np.argmax(count_frequencies)]
1151+
1152+
# Identify incomplete periods
1153+
incomplete_periods = counts < most_common_count
1154+
1155+
if np.any(incomplete_periods):
1156+
# Find which data points belong to incomplete periods
1157+
incomplete_codes = np.where(incomplete_periods)[0]
1158+
valid_mask = ~np.isin(codes.values, incomplete_codes)
1159+
1160+
# Filter the data to exclude incomplete periods
1161+
group = group.isel({group.dims[0]: valid_mask})
1162+
# Also update the object to match the filtered group
1163+
self = self.isel({group.dims[0]: valid_mask})
11121164
elif isinstance(freq, Resampler):
11131165
grouper = freq
11141166
else:

xarray/core/dataarray.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7433,6 +7433,7 @@ def resample(
74337433
offset: pd.Timedelta | datetime.timedelta | str | None = None,
74347434
origin: str | DatetimeLike = "start_day",
74357435
restore_coord_dims: bool | None = None,
7436+
boundaries: Literal["exact", "trim"] | None = None,
74367437
**indexer_kwargs: ResampleCompatible | Resampler,
74377438
) -> DataArrayResample:
74387439
"""Returns a Resample object for performing resampling operations.
@@ -7468,6 +7469,11 @@ def resample(
74687469
restore_coord_dims : bool, optional
74697470
If True, also restore the dimension order of multi-dimensional
74707471
coordinates.
7472+
boundaries : {"exact", "trim"}, optional
7473+
How to handle boundaries when the data doesn't evenly fit the resampling
7474+
frequency. If 'exact', a ValueError will be raised if the data doesn't
7475+
evenly fit. If 'trim', incomplete periods are dropped. If None (default),
7476+
incomplete periods are kept, exactly as they were before this option existed.
74717477
**indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler
74727478
The keyword arguments form of ``indexer``.
74737479
One of indexer or indexer_kwargs must be provided.
@@ -7572,6 +7578,7 @@ def resample(
75727578
offset=offset,
75737579
origin=origin,
75747580
restore_coord_dims=restore_coord_dims,
7581+
boundaries=boundaries,
75757582
**indexer_kwargs,
75767583
)
75777584

xarray/core/dataset.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10371,6 +10371,7 @@ def resample(
1037110371
offset: pd.Timedelta | datetime.timedelta | str | None = None,
1037210372
origin: str | DatetimeLike = "start_day",
1037310373
restore_coord_dims: bool | None = None,
10374+
boundaries: Literal["exact", "trim"] | None = None,
1037410375
**indexer_kwargs: ResampleCompatible | Resampler,
1037510376
) -> DatasetResample:
1037610377
"""Returns a Resample object for performing resampling operations.
@@ -10406,6 +10407,11 @@ def resample(
1040610407
restore_coord_dims : bool, optional
1040710408
If True, also restore the dimension order of multi-dimensional
1040810409
coordinates.
10410+
boundaries : {"exact", "trim"}, optional
10411+
How to handle boundaries when the data doesn't evenly fit the resampling
10412+
frequency. If 'exact', a ValueError will be raised if the data doesn't
10413+
evenly fit. If 'trim', incomplete periods are dropped. If None (default),
10414+
incomplete periods are kept, exactly as they were before this option existed.
1040910415
**indexer_kwargs : str, datetime.timedelta, pd.Timedelta, pd.DateOffset, or Resampler
1041010416
The keyword arguments form of ``indexer``.
1041110417
One of indexer or indexer_kwargs must be provided.
@@ -10438,6 +10444,7 @@ def resample(
1043810444
offset=offset,
1043910445
origin=origin,
1044010446
restore_coord_dims=restore_coord_dims,
10447+
boundaries=boundaries,
1044110448
**indexer_kwargs,
1044210449
)
1044310450

xarray/groupers.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,13 +484,19 @@ class TimeResampler(Resampler):
484484
- 'end_day': `origin` is the ceiling midnight of the last day
485485
offset : pd.Timedelta, datetime.timedelta, or str, default is None
486486
An offset timedelta added to the origin.
487+
boundaries : {"exact", "trim"}, optional
488+
How to handle boundaries when the data doesn't evenly fit the resampling
489+
frequency. If 'exact', a ValueError will be raised if the data doesn't
490+
evenly fit. If 'trim', incomplete periods are dropped. If None (default),
491+
incomplete periods are kept, exactly as they were before this option existed.
487492
"""
488493

489494
freq: ResampleCompatible
490495
closed: SideOptions | None = field(default=None)
491496
label: SideOptions | None = field(default=None)
492497
origin: str | DatetimeLike = field(default="start_day")
493498
offset: pd.Timedelta | datetime.timedelta | str | None = field(default=None)
499+
boundaries: Literal["exact", "trim"] | None = field(default=None, kw_only=True)
494500

495501
index_grouper: CFTimeGrouper | pd.Grouper = field(init=False, repr=False)
496502
group_as_index: pd.Index = field(init=False, repr=False)
@@ -502,6 +508,7 @@ def reset(self) -> Self:
502508
label=self.label,
503509
origin=self.origin,
504510
offset=self.offset,
511+
boundaries=self.boundaries,
505512
)
506513

507514
def _init_properties(self, group: T_Group) -> None:
@@ -566,6 +573,22 @@ def factorize(self, group: T_Group) -> EncodedGroups:
566573
self._init_properties(group)
567574
full_index, first_items, codes_ = self._get_index_and_items()
568575
sbins = first_items.values.astype(np.int64)
576+
577+
# Handle boundaries parameter for exact checking
578+
if self.boundaries == "exact":
579+
# Check if data evenly fits the resampling frequency
580+
counts = np.bincount(codes_)
581+
expected_points = len(group) // len(first_items)
582+
incomplete_periods = counts < expected_points
583+
584+
if np.any(incomplete_periods):
585+
raise ValueError(
586+
f"Data does not evenly fit the resampling frequency. "
587+
f"Expected {expected_points} points per period, but found periods with "
588+
f"{counts[incomplete_periods]} points. Use boundaries='trim' "
589+
f"to handle incomplete periods."
590+
)
591+
569592
group_indices: GroupIndices = tuple(
570593
list(itertools.starmap(slice, pairwise(sbins))) + [slice(sbins[-1], None)]
571594
)

xarray/tests/test_groupby.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2081,6 +2081,24 @@ def test_resample_skipna(self) -> None:
20812081
expected = DataArray([np.nan, 1, 1], [("time", times[::4])])
20822082
assert_identical(result, expected)
20832083

2084+
def test_resample_boundaries(self) -> None:
    """Check every mode of the ``boundaries`` option of ``resample``.

    31 daily samples resampled at "7D" give four full 7-day periods and
    one trailing 3-day period, so the data does not fit evenly.  The
    first 28 samples alone fit exactly into four periods.
    """
    # 31 daily samples whose values equal their position (0-30), so each
    # period mean is easy to predict by hand.
    times = pd.date_range("2000-01-01", periods=31, freq="D")
    array = DataArray(np.arange(31), [("time", times)])
    full_period_means = np.array([3.0, 10.0, 17.0, 24.0])

    # Default (boundaries=None): the trailing 3-day period is kept, so the
    # pre-existing behaviour must be unchanged by the new option.
    result_default = array.resample(time="7D").mean()
    assert len(result_default.time) == 5
    np.testing.assert_array_equal(
        result_default.values, np.append(full_period_means, 29.0)
    )

    # boundaries="trim": the incomplete trailing period is dropped.
    result_trim = array.resample(time="7D", boundaries="trim").mean()
    assert len(result_trim.time) == 4
    np.testing.assert_array_equal(result_trim.values, full_period_means)

    # boundaries="exact": uneven data must be rejected ...
    with pytest.raises(
        ValueError, match="Data does not evenly fit the resampling frequency"
    ):
        array.resample(time="7D", boundaries="exact").mean()

    # ... while data that fits evenly (28 days -> 4 x 7 days) must be
    # accepted and resampled as usual.
    even_array = array.isel(time=slice(28))
    result_exact = even_array.resample(time="7D", boundaries="exact").mean()
    assert len(result_exact.time) == 4
    np.testing.assert_array_equal(result_exact.values, full_period_means)
2101+
20842102
def test_upsample(self) -> None:
20852103
times = pd.date_range("2000-01-01", freq="6h", periods=5)
20862104
array = DataArray(np.arange(5), [("time", times)])

0 commit comments

Comments
 (0)