Skip to content

Commit 629d533

Browse files
authored
Use float64 in Jenks natural breaks internals (#1100) (#1101)
* Use float64 in Jenks natural breaks internals (#1100) The Jenks matrices and bin edge array used float32, causing the naive variance formula (sum_squares - sum*sum/w) to lose all significant digits when data had a large offset relative to its spread. Changed lower_class_limits, var_combinations, val cast, and kclass to float64. * Add regression test for Jenks float32 precision (#1100) test_natural_breaks_large_offset_1100: five tight clusters offset by 100,000 must be separated into 5 distinct classes. With float32 internals, the variance calculation lost all signal and merged clusters. * Retrigger CI
1 parent 71b0c23 commit 629d533

2 files changed

Lines changed: 37 additions & 4 deletions

File tree

xrspatial/classify.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -547,12 +547,12 @@ def quantile(agg: xr.DataArray,
547547
def _run_numpy_jenks_matrices(data, n_classes):
548548
n_data = data.shape[0]
549549
lower_class_limits = np.zeros(
550-
(n_data + 1, n_classes + 1), dtype=np.float32
550+
(n_data + 1, n_classes + 1), dtype=np.float64
551551
)
552552
lower_class_limits[1, 1:n_classes + 1] = 1.0
553553

554554
var_combinations = np.zeros(
555-
(n_data + 1, n_classes + 1), dtype=np.float32
555+
(n_data + 1, n_classes + 1), dtype=np.float64
556556
)
557557
var_combinations[2:n_data + 1, 1:n_classes + 1] = np.inf
558558

@@ -568,7 +568,7 @@ def _run_numpy_jenks_matrices(data, n_classes):
568568
lower_class_limit = l - m
569569
i4 = lower_class_limit - 1
570570

571-
val = np.float32(data[i4])
571+
val = data[i4]
572572

573573
# here we're estimating variance for each potential classing
574574
# of the data, for each potential number of classes. `w`
@@ -610,7 +610,7 @@ def _run_jenks(data, n_classes):
610610
lower_class_limits, _ = _run_numpy_jenks_matrices(data, n_classes)
611611

612612
k = data.shape[0]
613-
kclass = np.zeros(n_classes + 1, dtype=np.float32)
613+
kclass = np.zeros(n_classes + 1, dtype=np.float64)
614614
kclass[0] = data[0]
615615
kclass[-1] = data[-1]
616616
count_num = n_classes

xrspatial/tests/test_classify.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,39 @@ def test_natural_breaks_cupy_matches_numpy():
488488
)
489489

490490

491+
def test_natural_breaks_large_offset_1100():
492+
"""Jenks should separate tight clusters even when data has a large offset.
493+
494+
Regression test for #1100: float32 internals caused the variance
495+
formula to lose all significant digits for offset data.
496+
"""
497+
rng = np.random.default_rng(0)
498+
centers = np.array([100_000, 100_010, 100_020, 100_030, 100_040])
499+
data = np.concatenate([c + rng.uniform(-1, 1, 200) for c in centers])
500+
agg = xr.DataArray(data.reshape(10, 100))
501+
502+
result = natural_breaks(agg, k=5, num_sample=None)
503+
result_data = result.data
504+
if hasattr(result_data, 'compute'):
505+
result_data = result_data.compute()
506+
507+
# All 5 classes should be present
508+
unique_classes = np.unique(result_data[~np.isnan(result_data)])
509+
assert len(unique_classes) == 5, (
510+
f"Expected 5 classes, got {len(unique_classes)}: {unique_classes}"
511+
)
512+
513+
# Each center's 200 points should be in a single class
514+
flat = result_data.ravel()
515+
for i, center in enumerate(centers):
516+
start = i * 200
517+
end = start + 200
518+
chunk_classes = np.unique(flat[start:end])
519+
assert len(chunk_classes) == 1, (
520+
f"Center {center}: expected 1 class, got {len(chunk_classes)}"
521+
)
522+
523+
491524
@dask_array_available
492525
def test_natural_breaks_dask_matches_numpy():
493526
elevation = np.arange(100, dtype=np.float64).reshape(10, 10)

0 commit comments

Comments
 (0)