From deef9cd421557e1207c6c481c9220f5222765425 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 9 Apr 2026 13:05:32 +0000
Subject: [PATCH 1/2] feat: add kNearestWeightedImpute for distance-weighted
KNN imputation
Adds Imputation.kNearestWeightedImpute to FSharp.Stats.ML, addressing the
weighted KNN imputation request in #318. The new function accepts a
pluggable distance metric and a distanceToWeight converter, allowing both
inverse-Euclidean and similarity-based (e.g. Pearson correlation) weighting
strategies.
Changes:
- src/FSharp.Stats/ML/Imputation.fs: new kNearestWeightedImpute function
- tests/FSharp.Stats.Tests/Imputation.fs: 7 new tests (2 for kNearestImpute, 5 for kNearestWeightedImpute; 1200/1200 pass)
- tests/.../FSharp.Stats.Tests.fsproj: register new test file
Closes #318
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
src/FSharp.Stats/ML/Imputation.fs | 58 +++++++++
.../FSharp.Stats.Tests.fsproj | 1 +
tests/FSharp.Stats.Tests/Imputation.fs | 111 ++++++++++++++++++
3 files changed, 170 insertions(+)
create mode 100644 tests/FSharp.Stats.Tests/Imputation.fs
diff --git a/src/FSharp.Stats/ML/Imputation.fs b/src/FSharp.Stats/ML/Imputation.fs
index 2ff7ea84..b5625d7d 100644
--- a/src/FSharp.Stats/ML/Imputation.fs
+++ b/src/FSharp.Stats/ML/Imputation.fs
@@ -99,6 +99,64 @@ module Imputation =
tmpArr.[index]
+    /// <summary>
+    /// Imputation by distance-weighted k-nearest neighbour.
+    /// Missing values are replaced by a weighted average of the k nearest neighbours,
+    /// where each neighbour's contribution is scaled by a user-supplied weight derived
+    /// from its distance to the incomplete row.
+    /// </summary>
+    /// <param name="distanceMetric">
+    /// Distance function between two float arrays.
+    /// Use <c>DistanceMetrics.Array.euclideanNaNSquared</c> (the default in
+    /// <see cref="kNearestImpute"/>) to skip NaN positions when measuring distance.
+    /// </param>
+    /// <param name="distanceToWeight">
+    /// Converts a raw distance value into a non-negative weight.
+    /// For Euclidean-style metrics use an inverse such as <c>fun d -&gt; 1.0 / d</c>; a +infinity weight (d = 0) marks an exact match.
+    /// For similarity measures (e.g. Pearson correlation) pass <c>id</c> directly,
+    /// or its reciprocal if you stored it as a distance.
+    /// </param>
+    /// <param name="k">Number of nearest neighbours to consider; values below 1 yield nan.</param>
+    /// <param name="data">Complete rows used as the neighbour pool (rows with missing values are excluded upstream by <see cref="imputeBy"/>).</param>
+    /// <param name="arr">The row containing the missing value to impute.</param>
+    /// <param name="index">Column index of the missing value within <paramref name="arr"/>.</param>
+    /// <returns>Weighted mean of the neighbours' values at <paramref name="index"/>; the plain mean of the exact matches if any weight is +infinity; the plain mean of all neighbours if the weights sum to zero; nan if no neighbour is available.</returns>
+    /// <example>
+    /// <code>
+    /// // Inverse-distance weighted KNN; d = 0 gives a +infinity weight, so exact matches decide the result
+    /// let isMissing = System.Double.IsNaN
+    /// let invDistWeight d = 1.0 / d
+    /// let imputer = Imputation.kNearestWeightedImpute DistanceMetrics.Array.euclideanNaNSquared invDistWeight 3
+    /// let imputed = Imputation.imputeBy imputer isMissing rawData
+    /// </code>
+    /// </example>
+    let kNearestWeightedImpute
+        (distanceMetric: DistanceMetrics.Distance<float[]>)
+        (distanceToWeight: float -> float)
+        k
+        : MatrixBaseImputation<float[], float> =
+        fun data arr index ->
+            let pool = data |> Array.ofSeq
+            let n = min k pool.Length
+            // Covers an empty pool as well as non-positive k (Array.take rejects negative counts).
+            if n <= 0 then
+                nan
+            else
+                // (weight, candidate value) pairs of the n rows closest to arr.
+                let weighted =
+                    pool
+                    |> Array.map (fun row -> distanceMetric row arr, row)
+                    |> Array.sortBy fst
+                    |> Array.take n
+                    |> Array.map (fun (d, row: float[]) -> distanceToWeight d, row.[index])
+                // Infinite weights (e.g. 1/d at d = 0) would give inf/inf = nan, so exact matches decide alone.
+                let exact = weighted |> Array.filter (fun (w, _) -> System.Double.IsPositiveInfinity w)
+                let totalWeight = weighted |> Array.sumBy fst
+                if exact.Length > 0 then exact |> Array.averageBy snd
+                elif totalWeight = 0.0 then weighted |> Array.averageBy snd
+                else (weighted |> Array.sumBy (fun (w, v) -> w * v)) / totalWeight
+
+
/// Imputes column-wise by vector-based imputation
///
///
diff --git a/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj b/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
index dfcdb5fd..8350818c 100644
--- a/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
+++ b/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
@@ -29,6 +29,7 @@
+
diff --git a/tests/FSharp.Stats.Tests/Imputation.fs b/tests/FSharp.Stats.Tests/Imputation.fs
new file mode 100644
index 00000000..c1f36a3d
--- /dev/null
+++ b/tests/FSharp.Stats.Tests/Imputation.fs
@@ -0,0 +1,111 @@
+module ImputationTests
+
+open Expecto
+open FSharp.Stats
+open FSharp.Stats.ML
+
+/// Tolerance for floating-point comparisons (not referenced by the tests in this file, which use Accuracy.high).
+let eps = 1e-9
+
+[<Tests>]
+let imputationTests =
+    testList "Imputation" [
+
+        testList "kNearestImpute" [
+            // Baseline: unweighted k-nearest-neighbour mean, calling the imputer directly.
+            test "imputes single missing value using nearest neighbour mean" {
+                // Row 0: [1; 2; 3], Row 1: [2; 4; 6], Row 2: [4; 8; 12]
+                // Query: [1; nan; 3] — missing at index 1
+                // Euclidean (NaN-skipped) distances from [1;__;3] to rows:
+                //   Row 0 [1;2;3]:  d² = (1-1)² + (3-3)² = 0
+                //   Row 1 [2;4;6]:  d² = (1-2)² + (3-6)² = 10
+                //   Row 2 [4;8;12]: d² = (1-4)² + (3-12)² = 90
+                // k=2 nearest: [1;2;3] and [2;4;6] → mean of index 1 = (2+4)/2 = 3
+                let data = [| [|1.0; 2.0; 3.0|]; [|2.0; 4.0; 6.0|]; [|4.0; 8.0; 12.0|] |]
+                let query = [| 1.0; nan; 3.0 |]
+                let imputer = Imputation.kNearestImpute 2
+                let result = imputer data query 1
+                Expect.floatClose Accuracy.high result 3.0 "kNN-2 unweighted mean should be 3.0"
+            }
+
+            test "imputes correctly when k equals dataset size" {
+                let data = [| [|0.0; 10.0|]; [|4.0; 20.0|] |]
+                let query = [| 1.0; nan |]
+                // k=2, mean of 10 and 20 = 15
+                let imputer = Imputation.kNearestImpute 2
+                let result = imputer data query 1
+                Expect.floatClose Accuracy.high result 15.0 "mean of 10 and 20 should be 15"
+            }
+        ]
+
+        testList "kNearestWeightedImpute" [
+            // Distance-weighted variant; every case uses euclideanNaNSquared as the metric.
+            test "with k=1 returns the nearest neighbour value unchanged" {
+                // Only one neighbour → weight doesn't matter
+                let data = [| [|0.0; 10.0|]; [|10.0; 99.0|] |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / (d + System.Double.Epsilon)
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 1
+                let result = imputer data query 1
+                // Nearest is [0; 10], its value at index 1 is 10.0
+                Expect.floatClose Accuracy.high result 10.0 "nearest neighbour value should be 10.0"
+            }
+
+            test "inverse-distance weighting biases result toward closer neighbour" {
+                // Data: [0; 10] and [4; 20]
+                // Query: [1; nan]
+                // Distances (euclideanNaNSquared, skip NaN):
+                //   d²([0;10], [1;_]) = (1-0)² = 1.0
+                //   d²([4;20], [1;_]) = (1-4)² = 9.0
+                // Weights: w1=1/1=1, w2=1/9
+                // Weighted mean = (1*10 + (1/9)*20) / (1 + 1/9)
+                //               = (10 + 20/9) / (10/9)
+                //               = (110/9) / (10/9) = 11.0
+                let data = [| [|0.0; 10.0|]; [|4.0; 20.0|] |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / d
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 2
+                let result = imputer data query 1
+                Expect.floatClose Accuracy.high result 11.0 "weighted average should be 11.0"
+            }
+
+            test "equal distances yield simple mean regardless of weight function" {
+                // If all neighbours are equidistant the weighted mean equals the simple mean
+                // [0;10] and [2;20] are both d²=1 from query [1;nan]
+                let data = [| [|0.0; 10.0|]; [|2.0; 20.0|] |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / d
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 2
+                let result = imputer data query 1
+                // Both weights = 1/1 = 1 → equal weights → mean = (10+20)/2 = 15
+                Expect.floatClose Accuracy.high result 15.0 "equal-distance weighted mean should be 15.0"
+            }
+
+            test "returns nan when dataset is empty" {
+                let data : float[][] = [| |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / (d + System.Double.Epsilon)
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 3
+                let result = imputer data query 1
+                Expect.isTrue (System.Double.IsNaN result) "should return nan for empty dataset"
+            }
+
+            test "imputeBy with kNearestWeightedImpute replaces nans in matrix" {
+                let isMissing = System.Double.IsNaN
+                let rawData : seq<float[]> =
+                    seq {
+                        yield [| 0.0; 10.0; 100.0 |]
+                        yield [| 4.0; 20.0; 200.0 |]
+                        yield [| 1.0; nan; 150.0 |]
+                    }
+                let invDist d = 1.0 / (d + System.Double.Epsilon)
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 2
+                let result = Imputation.imputeBy imputer isMissing rawData
+                // The third row's missing value at index 1 should be imputed
+                let imputed = result.[2].[1]
+                Expect.isFalse (System.Double.IsNaN imputed) "imputed value should not be nan"
+                // Verify it lies in the plausible range [10, 20]
+                Expect.isTrue (imputed >= 10.0 && imputed <= 20.0) "imputed value should be between 10 and 20"
+            }
+        ]
+    ]
From eb30f29ce2ef91f36d8189bde138f21e1e55006e Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 9 Apr 2026 13:05:35 +0000
Subject: [PATCH 2/2] ci: trigger checks