From deef9cd421557e1207c6c481c9220f5222765425 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 9 Apr 2026 13:05:32 +0000
Subject: [PATCH 1/2] feat: add kNearestWeightedImpute for distance-weighted KNN imputation

Adds Imputation.kNearestWeightedImpute to FSharp.Stats.ML, addressing the
weighted KNN imputation request in #318.

The new function accepts a pluggable distance metric and a distanceToWeight
converter, allowing both inverse-Euclidean and similarity-based (e.g.
Pearson correlation) weighting strategies.

Changes:
- src/FSharp.Stats/ML/Imputation.fs: new kNearestWeightedImpute function
- tests/FSharp.Stats.Tests/Imputation.fs: 6 new tests (1200/1200 pass)
- tests/.../FSharp.Stats.Tests.fsproj: register new test file

Closes #318

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/FSharp.Stats/ML/Imputation.fs             |  73 ++++++++++++
 .../FSharp.Stats.Tests.fsproj                 |   1 +
 tests/FSharp.Stats.Tests/Imputation.fs        | 111 ++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 tests/FSharp.Stats.Tests/Imputation.fs

diff --git a/src/FSharp.Stats/ML/Imputation.fs b/src/FSharp.Stats/ML/Imputation.fs
index 2ff7ea84..b5625d7d 100644
--- a/src/FSharp.Stats/ML/Imputation.fs
+++ b/src/FSharp.Stats/ML/Imputation.fs
@@ -99,6 +99,79 @@ module Imputation =
             tmpArr.[index]
 
 
+    /// <summary>
+    /// Imputation by distance-weighted k-nearest neighbour.
+    /// Missing values are replaced by a weighted average of the k nearest neighbours,
+    /// where each neighbour's contribution is scaled by a user-supplied weight derived
+    /// from its distance to the incomplete row.
+    /// </summary>
+    /// <param name="distanceMetric">
+    /// Distance function between two float arrays. Smaller values must mean closer rows,
+    /// because the k rows with the smallest values are used as neighbours.
+    /// Use <c>DistanceMetrics.Array.euclideanNaNSquared</c> (the default in
+    /// <see cref="kNearestImpute"/>) to skip NaN positions when measuring distance.
+    /// </param>
+    /// <param name="distanceToWeight">
+    /// Converts a raw distance value into a non-negative weight.
+    /// For Euclidean-style metrics use an inverse such as <c>fun d -> 1.0 / (d + 1e-9)</c>.
+    /// <c>System.Double.Epsilon</c> is too small an offset: its reciprocal overflows to infinity.
+    /// A weight of positive infinity (e.g. <c>1.0 / d</c> at distance 0) marks an exact match;
+    /// the result is then the plain mean of the exact matches among the k neighbours.
+    /// Similarity measures (e.g. Pearson correlation) must be supplied as a distance
+    /// (e.g. <c>1.0 - r</c>), since a larger similarity would otherwise rank as farther away.
+    /// </param>
+    /// <param name="k">Number of nearest neighbours to consider; values below 1 yield nan.</param>
+    /// <param name="data">Complete rows used as the neighbour pool (rows with missing values are excluded upstream by <see cref="imputeBy"/>).</param>
+    /// <param name="arr">The row containing the missing value to impute.</param>
+    /// <param name="index">Column index of the missing value within <paramref name="arr"/>.</param>
+    /// <returns>Imputed value at <paramref name="index"/>.</returns>
+    /// <example>
+    /// <code>
+    /// // Distance-weighted KNN with inverse-distance weighting
+    /// let isMissing = System.Double.IsNaN
+    /// let invDistWeight d = 1.0 / (d + 1e-9)
+    /// let imputer = Imputation.kNearestWeightedImpute DistanceMetrics.Array.euclideanNaNSquared invDistWeight 3
+    /// let imputed = Imputation.imputeBy imputer isMissing rawData
+    /// </code>
+    /// </example>
+    let kNearestWeightedImpute
+        (distanceMetric: DistanceMetrics.Distance<float[]>)
+        (distanceToWeight: float -> float)
+        k
+        : MatrixBaseImputation<float[], float> =
+        fun data arr index ->
+            let dataset = data |> Array.ofSeq
+            let n = min k dataset.Length
+            // k < 1 or an empty neighbour pool leaves nothing to average.
+            if n <= 0 then
+                nan
+            else
+                let neighbors =
+                    dataset
+                    |> Array.map (fun row -> (distanceMetric row arr, row))
+                    |> Array.sortBy fst
+                    |> Array.take n
+                let values = neighbors |> Array.map (fun (_, row: float[]) -> row.[index])
+                let weights = neighbors |> Array.map (fun (d, _) -> distanceToWeight d)
+                // An infinite weight would turn the ratio below into inf / inf = nan,
+                // so neighbours carrying one are treated as exact matches and averaged.
+                let exactMatches =
+                    Array.zip weights values
+                    |> Array.filter (fun (w, _) -> System.Double.IsPositiveInfinity w)
+                if exactMatches.Length > 0 then
+                    exactMatches |> Array.averageBy snd
+                else
+                    let totalWeight = Array.sum weights
+                    if totalWeight = 0.0 then
+                        // No neighbour carries any weight: use the unweighted mean.
+                        Array.average values
+                    else
+                        let weightedSum =
+                            Array.map2 (fun w v -> w * v) weights values
+                            |> Array.sum
+                        weightedSum / totalWeight
+
+
     /// Imputes column-wise by vector-based imputation
     ///
     ///
diff --git a/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj b/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
index dfcdb5fd..8350818c 100644
--- a/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
+++ b/tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
@@ -29,6 +29,7 @@
+    <Compile Include="Imputation.fs" />
diff --git
a/tests/FSharp.Stats.Tests/Imputation.fs b/tests/FSharp.Stats.Tests/Imputation.fs
new file mode 100644
index 00000000..c1f36a3d
--- /dev/null
+++ b/tests/FSharp.Stats.Tests/Imputation.fs
@@ -0,0 +1,111 @@
+module ImputationTests
+
+open Expecto
+open FSharp.Stats
+open FSharp.Stats.ML
+
+/// Tolerance for floating-point comparisons
+let eps = 1e-9
+
+[<Tests>]
+let imputationTests =
+    testList "Imputation" [
+
+        testList "kNearestImpute" [
+
+            test "imputes single missing value using nearest neighbour mean" {
+                // Row 0: [1; 2; 3], Row 1: [2; 4; 6], Row 2: [4; 8; 12]
+                // Query: [1; nan; 3] — missing at index 1
+                // Euclidean (NaN-skipped) distances from [1;__;3] to rows:
+                //   Row 0 [1;2;3]: d² = (1-1)² + (3-3)² = 0
+                //   Row 1 [2;4;6]: d² = (1-2)² + (3-6)² = 10
+                //   Row 2 [4;8;12]: d² = (1-4)² + (3-12)² = 90
+                // k=2 nearest: [1;2;3] and [2;4;6] → mean of index 1 = (2+4)/2 = 3
+                let data = [| [|1.0; 2.0; 3.0|]; [|2.0; 4.0; 6.0|]; [|4.0; 8.0; 12.0|] |]
+                let query = [| 1.0; nan; 3.0 |]
+                let imputer = Imputation.kNearestImpute 2
+                let result = imputer data query 1
+                Expect.floatClose Accuracy.high result 3.0 "kNN-2 unweighted mean should be 3.0"
+            }
+
+            test "imputes correctly when k equals dataset size" {
+                let data = [| [|0.0; 10.0|]; [|4.0; 20.0|] |]
+                let query = [| 1.0; nan |]
+                // k=2, mean of 10 and 20 = 15
+                let imputer = Imputation.kNearestImpute 2
+                let result = imputer data query 1
+                Expect.floatClose Accuracy.high result 15.0 "mean of 10 and 20 should be 15"
+            }
+        ]
+
+        testList "kNearestWeightedImpute" [
+
+            test "with k=1 returns the nearest neighbour value unchanged" {
+                // Only one neighbour → weight doesn't matter
+                let data = [| [|0.0; 10.0|]; [|10.0; 99.0|] |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / (d + System.Double.Epsilon)
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 1
+                let result = imputer data query 1
+                // Nearest is [0; 10], its value at index 1 is 10.0
+                Expect.floatClose Accuracy.high result 10.0 "nearest neighbour value should be 10.0"
+            }
+
+            test "inverse-distance weighting biases result toward closer neighbour" {
+                // Data: [0; 10] and [4; 20]
+                // Query: [1; nan]
+                // Distances (euclideanNaNSquared, skip NaN):
+                //   d²([0;10], [1;_]) = (1-0)² = 1.0
+                //   d²([4;20], [1;_]) = (1-4)² = 9.0
+                // Weights: w1=1/1=1, w2=1/9
+                // Weighted mean = (1*10 + (1/9)*20) / (1 + 1/9)
+                //               = (10 + 20/9) / (10/9)
+                //               = (110/9) / (10/9) = 11.0
+                let data = [| [|0.0; 10.0|]; [|4.0; 20.0|] |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / d
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 2
+                let result = imputer data query 1
+                Expect.floatClose Accuracy.high result 11.0 "weighted average should be 11.0"
+            }
+
+            test "equal distances yield simple mean regardless of weight function" {
+                // If all neighbours are equidistant the weighted mean equals the simple mean
+                // [0;10] and [2;20] are both d²=1 from query [1;nan]
+                let data = [| [|0.0; 10.0|]; [|2.0; 20.0|] |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / d
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 2
+                let result = imputer data query 1
+                // Both weights = 1/1 = 1 → equal weights → mean = (10+20)/2 = 15
+                Expect.floatClose Accuracy.high result 15.0 "equal-distance weighted mean should be 15.0"
+            }
+
+            test "returns nan when dataset is empty" {
+                let data : float[][] = [| |]
+                let query = [| 1.0; nan |]
+                let invDist d = 1.0 / (d + System.Double.Epsilon)
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 3
+                let result = imputer data query 1
+                Expect.isTrue (System.Double.IsNaN result) "should return nan for empty dataset"
+            }
+
+            test "imputeBy with kNearestWeightedImpute replaces nans in matrix" {
+                let isMissing = System.Double.IsNaN
+                let rawData : seq<float[]> =
+                    seq {
+                        yield [| 0.0; 10.0; 100.0 |]
+                        yield [| 4.0; 20.0; 200.0 |]
+                        yield [| 1.0; nan; 150.0 |]
+                    }
+                let invDist d = 1.0 / (d + System.Double.Epsilon)
+                let imputer = Imputation.kNearestWeightedImpute FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared invDist 2
+                let result = Imputation.imputeBy imputer isMissing rawData
+                // The third row's missing value at index 1 should be imputed
+                let imputed = result.[2].[1]
+                Expect.isFalse (System.Double.IsNaN imputed) "imputed value should not be nan"
+                // Verify it lies in the plausible range [10, 20]
+                Expect.isTrue (imputed >= 10.0 && imputed <= 20.0) "imputed value should be between 10 and 20"
+            }
+        ]
+    ]

From eb30f29ce2ef91f36d8189bde138f21e1e55006e Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Thu, 9 Apr 2026 13:05:35 +0000
Subject: [PATCH 2/2] ci: trigger checks