Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions src/FSharp.Stats/ML/Imputation.fs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,64 @@ module Imputation =
tmpArr.[index]


/// <summary>
/// Imputation by distance-weighted k-nearest neighbour.
/// Missing values are replaced by a weighted average of the k nearest neighbours,
/// where each neighbour's contribution is scaled by a user-supplied weight derived
/// from its distance to the incomplete row.
/// </summary>
/// <remarks>
/// Special cases, evaluated in this order:
/// 1. If <paramref name="k"/> is not positive or <paramref name="data"/> is empty, <c>nan</c> is returned.
/// 2. If at least one of the k nearest neighbours receives a weight of positive infinity
///    (e.g. an inverse-distance weight for a neighbour at distance zero), those neighbours are
///    treated as exact matches and the result is the plain mean of their values. Without this
///    rule the weighted average would evaluate to infinity / infinity = <c>nan</c>.
/// 3. If the weights sum to exactly zero, the result is the unweighted mean of the k neighbours.
/// </remarks>
/// <param name="distanceMetric">
/// Distance function between two float arrays.
/// Use <c>DistanceMetrics.Array.euclideanNaNSquared</c> (the default in
/// <see cref="kNearestImpute"/>) to skip NaN positions when measuring distance.
/// </param>
/// <param name="distanceToWeight">
/// Converts a raw distance value into a non-negative weight.
/// For Euclidean-style metrics use an inverse such as <c>fun d -> 1.0 / d</c> or
/// <c>fun d -> 1.0 / (d + System.Double.Epsilon)</c>. Note that both evaluate to positive
/// infinity for a distance of zero (the reciprocal of <c>System.Double.Epsilon</c> overflows);
/// such neighbours are handled as exact matches, see the remarks.
/// For similarity measures (e.g. Pearson correlation) pass <c>id</c> directly,
/// or its reciprocal if you stored it as a distance.
/// </param>
/// <param name="k">Number of nearest neighbours to consider. Values larger than the number of rows in <paramref name="data"/> are clamped to that number.</param>
/// <param name="data">Complete rows used as the neighbour pool (rows with missing values are excluded upstream by <see cref="imputeBy"/>).</param>
/// <param name="arr">The row containing the missing value to impute.</param>
/// <param name="index">Column index of the missing value within <paramref name="arr"/>.</param>
/// <returns>Imputed value at <paramref name="index"/>, or <c>nan</c> if no neighbour is available.</returns>
/// <example>
/// <code>
/// // Distance-weighted KNN with inverse-distance weighting
/// let isMissing = System.Double.IsNaN
/// let invDistWeight d = 1.0 / (d + System.Double.Epsilon)
/// let imputer = Imputation.kNearestWeightedImpute DistanceMetrics.Array.euclideanNaNSquared invDistWeight 3
/// let imputed = Imputation.imputeBy imputer isMissing rawData
/// </code>
/// </example>
let kNearestWeightedImpute
    (distanceMetric: DistanceMetrics.Distance<float[]>)
    (distanceToWeight: float -> float)
    k
    : MatrixBaseImputation<float[],float> =
    fun data arr index ->
        let dataset = data |> Array.ofSeq
        let n = min k dataset.Length
        // n <= 0 covers both an empty neighbour pool and a non-positive k
        // (a negative count would make Array.take throw).
        if n <= 0 then
            nan
        else
            let neighbors =
                dataset
                |> Array.map (fun row -> (distanceMetric row arr, row))
                |> Array.sortBy fst
                |> Array.take n
            let values  = neighbors |> Array.map (fun (_, row: float[]) -> row.[index])
            let weights = neighbors |> Array.map (fun (d, _) -> distanceToWeight d)
            // Neighbours with an infinite weight (typically distance 0 under inverse-distance
            // weighting) would turn the weighted mean into inf / inf = nan, so they are
            // resolved separately as exact matches.
            let exactMatches =
                Array.zip weights values
                |> Array.filter (fun (w, _) -> System.Double.IsPositiveInfinity w)
            if exactMatches.Length > 0 then
                exactMatches |> Array.averageBy snd
            else
                let totalWeight = Array.sum weights
                if totalWeight = 0.0 then
                    // No usable weighting information: fall back to the unweighted mean.
                    Array.average values
                else
                    let weightedSum =
                        Array.map2 (fun w v -> w * v) weights values
                        |> Array.sum
                    weightedSum / totalWeight


/// <summary>Imputes column-wise by vector-based imputation</summary>
/// <remarks></remarks>
/// <param name="impute"></param>
Expand Down
1 change: 1 addition & 0 deletions tests/FSharp.Stats.Tests/FSharp.Stats.Tests.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
<Compile Include="Testing.fs" />
<Compile Include="Optimization.fs" />
<Compile Include="SummaryStats.fs" />
<Compile Include="Imputation.fs" />
<Compile Include="Main.fs" />
</ItemGroup>
<ItemGroup>
Expand Down
111 changes: 111 additions & 0 deletions tests/FSharp.Stats.Tests/Imputation.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
module ImputationTests

open Expecto
open FSharp.Stats
open FSharp.Stats.ML

/// Tolerance for floating-point comparisons.
/// NOTE(review): no test visible in this file references this value; the assertions
/// use Expecto's Accuracy.high instead. Confirm whether it is still needed.
let eps = 1e-9

[<Tests>]
let imputationTests =

    // Builds a distance-weighted kNN imputer on top of the NaN-skipping squared
    // Euclidean distance, which every weighted test below relies on.
    let weightedImputer weightOf k =
        Imputation.kNearestWeightedImpute
            FSharp.Stats.DistanceMetrics.Array.euclideanNaNSquared
            weightOf
            k

    testList "Imputation" [

        testList "kNearestImpute" [

            testCase "imputes single missing value using nearest neighbour mean" <| fun () ->
                // Neighbour pool: [1; 2; 3], [2; 4; 6], [4; 8; 12]
                // Incomplete row: [1; nan; 3] (value missing at index 1)
                // Squared distances over the non-NaN positions (indices 0 and 2):
                //   to [1; 2; 3]  : (1-1)^2 + (3-3)^2  = 0
                //   to [2; 4; 6]  : (1-2)^2 + (3-6)^2  = 10
                //   to [4; 8; 12] : (1-4)^2 + (3-12)^2 = 90
                // The two closest rows hold 2 and 4 at index 1, so the expected mean is 3.
                let pool = [| [|1.0; 2.0; 3.0|]; [|2.0; 4.0; 6.0|]; [|4.0; 8.0; 12.0|] |]
                let incomplete = [| 1.0; nan; 3.0 |]
                let imputer = Imputation.kNearestImpute 2
                let actual = imputer pool incomplete 1
                Expect.floatClose Accuracy.high actual 3.0 "kNN-2 unweighted mean should be 3.0"

            testCase "imputes correctly when k equals dataset size" <| fun () ->
                // With k = 2 both rows take part: (10 + 20) / 2 = 15
                let pool = [| [|0.0; 10.0|]; [|4.0; 20.0|] |]
                let incomplete = [| 1.0; nan |]
                let imputer = Imputation.kNearestImpute 2
                let actual = imputer pool incomplete 1
                Expect.floatClose Accuracy.high actual 15.0 "mean of 10 and 20 should be 15"
        ]

        testList "kNearestWeightedImpute" [

            testCase "with k=1 returns the nearest neighbour value unchanged" <| fun () ->
                // A single neighbour is its own weighted mean, whatever the weight is.
                // The closest row is [0; 10], which holds 10.0 at index 1.
                let pool = [| [|0.0; 10.0|]; [|10.0; 99.0|] |]
                let incomplete = [| 1.0; nan |]
                let imputer = weightedImputer (fun d -> 1.0 / (d + System.Double.Epsilon)) 1
                let actual = imputer pool incomplete 1
                Expect.floatClose Accuracy.high actual 10.0 "nearest neighbour value should be 10.0"

            testCase "inverse-distance weighting biases result toward closer neighbour" <| fun () ->
                // Pool rows [0; 10] and [4; 20], incomplete row [1; nan].
                // Squared distances on index 0 only (index 1 is NaN in the query):
                //   (1-0)^2 = 1  and  (1-4)^2 = 9
                // Weights: 1/1 = 1 and 1/9
                // Weighted mean = (1*10 + (1/9)*20) / (1 + 1/9)
                //               = (110/9) / (10/9)
                //               = 11
                let pool = [| [|0.0; 10.0|]; [|4.0; 20.0|] |]
                let incomplete = [| 1.0; nan |]
                let imputer = weightedImputer (fun d -> 1.0 / d) 2
                let actual = imputer pool incomplete 1
                Expect.floatClose Accuracy.high actual 11.0 "weighted average should be 11.0"

            testCase "equal distances yield simple mean regardless of weight function" <| fun () ->
                // [0; 10] and [2; 20] both lie at squared distance 1 from [1; nan],
                // so both weights are 1/1 = 1 and the weighted mean collapses to
                // the plain mean (10 + 20) / 2 = 15.
                let pool = [| [|0.0; 10.0|]; [|2.0; 20.0|] |]
                let incomplete = [| 1.0; nan |]
                let imputer = weightedImputer (fun d -> 1.0 / d) 2
                let actual = imputer pool incomplete 1
                Expect.floatClose Accuracy.high actual 15.0 "equal-distance weighted mean should be 15.0"

            testCase "returns nan when dataset is empty" <| fun () ->
                // No neighbours are available, so nothing can be imputed.
                let pool : float[][] = [| |]
                let incomplete = [| 1.0; nan |]
                let imputer = weightedImputer (fun d -> 1.0 / (d + System.Double.Epsilon)) 3
                let actual = imputer pool incomplete 1
                Expect.isTrue (System.Double.IsNaN actual) "should return nan for empty dataset"

            testCase "imputeBy with kNearestWeightedImpute replaces nans in matrix" <| fun () ->
                let missing = System.Double.IsNaN
                // Two complete rows plus one row whose middle value is missing.
                let rows : seq<float[]> =
                    seq {
                        yield [| 0.0; 10.0; 100.0 |]
                        yield [| 4.0; 20.0; 200.0 |]
                        yield [| 1.0; nan; 150.0 |]
                    }
                let imputer = weightedImputer (fun d -> 1.0 / (d + System.Double.Epsilon)) 2
                let filled = Imputation.imputeBy imputer missing rows
                // The gap sits in the third row at index 1.
                let imputed = filled.[2].[1]
                Expect.isFalse (System.Double.IsNaN imputed) "imputed value should not be nan"
                // A weighted mean of 10 and 20 has to stay within [10, 20].
                Expect.isTrue (10.0 <= imputed && imputed <= 20.0) "imputed value should be between 10 and 20"
        ]
    ]
Loading