diff --git a/CHANGELOG.md b/CHANGELOG.md index d6d7e539f..4cc1cdaa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Changelogs for this project are recorded in this file since v0.2.0. ### Added * Allow parallel computation of DTW barycenters and plug it in `TimeSeriesKMeans`. +* `PiecewiseAggregateApproximation.segment_indices` exposes the start/end indices of each PAA segment in the original time series ([#441](https://github.com/tslearn-team/tslearn/issues/441)). ### Changed diff --git a/tests/test_piecewise.py b/tests/test_piecewise.py index 1e3dbaad8..e4b3c69b1 100644 --- a/tests/test_piecewise.py +++ b/tests/test_piecewise.py @@ -24,6 +24,37 @@ def test_paa(): paa_est.distance_paa(paa_repr[0], paa_repr[1])) +def test_paa_segment_indices(): + # Regression test for #441: expose PAA segment boundaries so callers can + # map paa_data[i] back to the original-series index range it summarises. + paa = PiecewiseAggregateApproximation(n_segments=3) + data = [[-1., 2., 0.1, -1., 1., -1.]] + # Before fitting, segment_indices must raise NotFittedError (consistent + # with distance / transform). + np.testing.assert_raises(NotFittedError, paa.segment_indices) + + paa_data = paa.fit_transform(data) + seg_idx = paa.segment_indices() + + # Shape and dtype contract. + assert seg_idx.shape == (3, 2) + assert np.issubdtype(seg_idx.dtype, np.integer) + + # The boundaries must reproduce the means stored in paa_data — this is the + # property a user actually relies on when locating "where changes occur". + arr = np.asarray(data, dtype=float) + for i_seg, (start, end) in enumerate(seg_idx): + np.testing.assert_allclose( + paa_data[0, i_seg, 0], arr[0, start:end].mean() + ) + + # Non-divisible length: trailing samples are dropped, like transform does. 
+ paa2 = PiecewiseAggregateApproximation(n_segments=3) + paa2.fit([[1., 2., 3., 4., 5., 6., 7.]]) # sz=7, n_segments=3 -> sz_seg=2 + seg_idx2 = paa2.segment_indices() + np.testing.assert_array_equal(seg_idx2, [[0, 2], [2, 4], [4, 6]]) + + def test_sax(): unfitted_sax = SymbolicAggregateApproximation(n_segments=3, alphabet_size_avg=2) diff --git a/tslearn/piecewise/piecewise.py b/tslearn/piecewise/piecewise.py index 32faea603..85e69bbb5 100644 --- a/tslearn/piecewise/piecewise.py +++ b/tslearn/piecewise/piecewise.py @@ -258,6 +258,48 @@ def inverse_transform(self, X): X = check_dims(X) return inv_transform_paa(X, original_size=self._X_fit_dims_[1]) + def segment_indices(self): + """Return the start/end indices of each PAA segment in the original + time series. + + These are the boundaries used when transforming a fitted-length time + series into its PAA representation: segment ``i`` of the PAA output + is the mean of ``ts[start_i:end_i]`` in the original series. + + Returns + ------- + numpy.ndarray of shape (n_segments, 2), dtype=int + ``[[start_0, end_0], [start_1, end_1], ...]`` segment ranges in the + original time-series index. ``end_i`` is exclusive and matches the + half-open convention used by :meth:`transform` (which slices + ``X[i_ts, start:end, :]``). + + Examples + -------- + >>> paa = PiecewiseAggregateApproximation(n_segments=3) + >>> _ = paa.fit([[-1., 2., 0.1, -1., 1., -1.]]) + >>> paa.segment_indices() + array([[0, 2], + [2, 4], + [4, 6]]) + + Notes + ----- + The segment width matches what :meth:`transform` uses internally: + ``sz_segment = sz_fit // n_segments``. Trailing samples beyond + ``n_segments * sz_segment`` are dropped, as in :meth:`transform`, so + the indices stay consistent with ``paa_data``. Call :meth:`fit` first: + an unfitted estimator is expected to raise ``NotFittedError``. + """ + self._is_fitted() + sz_fit = int(self._X_fit_dims_[1]) + # Match _transform's segment-width convention so callers can map + # paa_data[i_seg] back to ts[start_i:end_i] without off-by-one. 
+ sz_segment = sz_fit // self.n_segments + starts = numpy.arange(self.n_segments, dtype=int) * sz_segment + ends = starts + sz_segment + return numpy.stack([starts, ends], axis=1) + def _more_tags(self): tags = super()._more_tags() tags.update({