From bbb12dab1a1fec0be2433e11a58a0014027f8304 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 16 Jun 2026 19:26:19 +0000
Subject: [PATCH 1/2] Add dependency-free DER scoring to core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add aai_cli/core/der.py, a pure-Python diarization error rate scorer
mirroring core/wer.py's shape (frozen Score, score, pooled). It computes
the standard NIST/pyannote DER — missed / false-alarm / speaker-confusion
time over reference speech time — by partitioning the shared timeline at
every segment boundary and optimally mapping speaker labels via exact
permutation search (diarization speaker counts are small).

No new dependency: pyannote.metrics pulls numpy/scipy/pandas, and the
lighter PyPI options still pull numpy or compile a C++ extension, whereas
the current eval stack (jiwer) has neither numpy nor scipy.

Not yet wired into `assembly eval` — DER needs reference speaker timing
(RTTM-style segments) that the current text-only dataset path doesn't
carry; that integration is a separate change.
---
 aai_cli/core/der.py | 157 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_der.py   | 110 +++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+)
 create mode 100644 aai_cli/core/der.py
 create mode 100644 tests/test_der.py

diff --git a/aai_cli/core/der.py b/aai_cli/core/der.py
new file mode 100644
index 00000000..13626b6b
--- /dev/null
+++ b/aai_cli/core/der.py
@@ -0,0 +1,157 @@
+"""Diarization error rate (DER) scoring for `assembly eval`, dependency-free.
+
+The companion to :mod:`aai_cli.core.wer`: where WER scores *what* was said, DER
+scores *who* spoke when. It is the standard NIST/pyannote metric — the fraction
+of reference speech time that is misattributed — computed here in plain Python so
+adding diarization metrics costs no new dependency (``pyannote.metrics`` drags in
+numpy/scipy/pandas; the lighter PyPI options still pull numpy or compile a C++
+extension). No SDK, no Rich: the command layer owns all rendering.
+
+Both reference and hypothesis are sequences of speaker-labelled :class:`Segment`s.
+Speaker labels are arbitrary, so the speakers are optimally mapped one-to-one
+before errors are counted (an exact search — diarization speaker counts are
+small). The timeline is partitioned at every segment boundary into atomic
+intervals; within each, the per-interval NIST tally (missed / false-alarm /
+confusion) is summed, all weighted by reference *speaker*-time so overlapping
+speech is counted once per concurrent speaker.
+"""
+
+from __future__ import annotations
+
+import itertools
+from collections.abc import Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Segment:
+    """A stretch of speech (``start`` to ``end``, in seconds) attributed to one speaker."""
+
+    speaker: str
+    start: float
+    end: float
+
+
+@dataclass(frozen=True)
+class Score:
+    """Diarization error against ``speech`` seconds of reference speech.
+
+    The three NIST components are kept separately (so a caller can show the
+    breakdown) and ``errors`` is their sum; pooled across files for corpus DER.
+    """
+
+    missed: float
+    false_alarm: float
+    confusion: float
+    speech: float
+
+    @property
+    def errors(self) -> float:
+        """Total misattributed time: missed + false-alarm + speaker-confusion seconds."""
+        return self.missed + self.false_alarm + self.confusion
+
+    @property
+    def der(self) -> float:
+        """Diarization error rate: error time over reference speech time.
+
+        The caller guarantees a reference with speech (an empty reference makes
+        DER undefined, the same contract :class:`wer.Score` keeps for ``words``).
+        """
+        return self.errors / self.speech
+
+
+def _boundaries(reference: Sequence[Segment], hypothesis: Sequence[Segment]) -> list[float]:
+    """The sorted, de-duplicated segment endpoints that partition the timeline.
+
+    Between two consecutive boundaries every segment is wholly present or wholly
+    absent, so each atomic interval has a fixed set of active speakers.
+    """
+    return sorted({point for seg in (*reference, *hypothesis) for point in (seg.start, seg.end)})
+
+
+def _active(segments: Sequence[Segment], start: float, end: float) -> set[str]:
+    """The distinct speakers whose segment covers the atomic interval ``[start, end)``."""
+    return {seg.speaker for seg in segments if seg.start <= start and seg.end >= end}
+
+
+def _speakers(segments: Sequence[Segment]) -> list[str]:
+    """The distinct speaker labels in ``segments``, in a deterministic order."""
+    return sorted({seg.speaker for seg in segments})
+
+
+def _weight(weights: list[list[float]], row: int, col: int) -> float:
+    """The matrix weight at ``(row, col)``, or 0 outside it — the zero-padding
+    that lets the larger side's unmatched speakers cost nothing."""
+    if row < len(weights) and col < len(weights[0]):
+        return weights[row][col]
+    return 0.0
+
+
+def _max_weight_assignment(weights: list[list[float]]) -> float:
+    """The greatest total weight of a one-to-one row-to-column assignment.
+
+    Exact search: the matrix is zero-padded to ``size x size`` and every
+    permutation (speaker mapping) is tried. Diarization files have few speakers,
+    so the factorial search stays cheap, and padding sidesteps the orientation
+    branch a rectangular search would need.
+    """
+    size = max(len(weights), len(weights[0]))
+    return max(
+        sum(_weight(weights, row, col) for row, col in enumerate(perm))
+        for perm in itertools.permutations(range(size))
+    )
+
+
+def _correct_time(
+    reference: Sequence[Segment],
+    hypothesis: Sequence[Segment],
+    cooccurrence: dict[tuple[str, str], float],
+) -> float:
+    """Correctly attributed speech time under the optimal speaker mapping.
+
+    ``cooccurrence[(ref, hyp)]`` is how long that reference and hypothesis
+    speaker were concurrently active; the best one-to-one mapping maximises the
+    matched total (an empty hypothesis maps to nothing, so it scores 0).
+    """
+    ref_speakers, hyp_speakers = _speakers(reference), _speakers(hypothesis)
+    if not ref_speakers:
+        return 0.0
+    weights = [[cooccurrence.get((ref, hyp), 0.0) for hyp in hyp_speakers] for ref in ref_speakers]
+    return _max_weight_assignment(weights)
+
+
+def score(reference: Sequence[Segment], hypothesis: Sequence[Segment]) -> Score:
+    """Score a hypothesis diarization against a reference.
+
+    Walks the shared timeline once, tallying missed speech (reference speakers
+    with no hypothesis speaker to cover them), false alarms (the reverse), and
+    co-occurrence per speaker pair; speaker confusion is then the matched
+    overlap that the optimal mapping could *not* account for.
+    """
+    cooccurrence: dict[tuple[str, str], float] = {}
+    missed = false_alarm = matched = speech = 0.0
+    boundaries = _boundaries(reference, hypothesis)
+    for start, end in itertools.pairwise(boundaries):
+        duration = end - start
+        ref_active = _active(reference, start, end)
+        hyp_active = _active(hypothesis, start, end)
+        speech += duration * len(ref_active)
+        missed += duration * max(0, len(ref_active) - len(hyp_active))
+        false_alarm += duration * max(0, len(hyp_active) - len(ref_active))
+        matched += duration * min(len(ref_active), len(hyp_active))
+        for ref in ref_active:
+            for hyp in hyp_active:
+                cooccurrence[(ref, hyp)] = cooccurrence.get((ref, hyp), 0.0) + duration
+    confusion = matched - _correct_time(reference, hypothesis, cooccurrence)
+    return Score(missed=missed, false_alarm=false_alarm, confusion=confusion, speech=speech)
+
+
+def pooled(scores: list[Score]) -> Score:
+    """Corpus-level score: error and speech time summed across files (DER is then
+    total error time over total reference time, not a mean of per-file rates)."""
+    return Score(
+        missed=sum(item.missed for item in scores),
+        false_alarm=sum(item.false_alarm for item in scores),
+        confusion=sum(item.confusion for item in scores),
+        speech=sum(item.speech for item in scores),
+    )
diff --git a/tests/test_der.py b/tests/test_der.py
new file mode 100644
index 00000000..f909db59
--- /dev/null
+++ b/tests/test_der.py
@@ -0,0 +1,110 @@
+"""DER scoring (`aai_cli.core.der`): timeline tally, optimal speaker mapping, pooling."""
+
+import dataclasses
+
+import pytest
+
+from aai_cli.core import der
+
+
+def _assign(obj, attribute, value):
+    setattr(obj, attribute, value)
+
+
+def seg(speaker: str, start: float, end: float) -> der.Segment:
+    return der.Segment(speaker, start, end)
+
+
+def test_score_is_an_immutable_value():
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        _assign(der.Score(missed=0.0, false_alarm=0.0, confusion=0.0, speech=1.0), "missed", 1.0)
+
+
+def test_errors_and_der_combine_the_components():
+    score = der.Score(missed=1.0, false_alarm=2.0, confusion=3.0, speech=10.0)
+    assert score.errors == 6.0
+    assert score.der == 0.6
+
+
+def test_identical_diarization_scores_zero():
+    ref = [seg("A", 0, 10), seg("B", 10, 20)]
+    score = der.score(ref, ref)
+    assert score == der.Score(missed=0.0, false_alarm=0.0, confusion=0.0, speech=20.0)
+    assert score.der == 0.0
+
+
+def test_speaker_labels_are_mapped_optimally():
+    # Same timing, relabelled and reversed: the optimal 1:1 mapping recovers it,
+    # so a correct scorer reports no error despite none of the labels matching.
+    ref = [seg("A", 0, 10), seg("B", 10, 20)]
+    hyp = [seg("spk_1", 10, 20), seg("spk_0", 0, 10)]
+    assert der.score(ref, hyp).der == 0.0
+
+
+def test_missing_hypothesis_is_all_missed_speech():
+    score = der.score([seg("A", 0, 10)], [])
+    assert score == der.Score(missed=10.0, false_alarm=0.0, confusion=0.0, speech=10.0)
+    assert score.der == 1.0
+
+
+def test_hypothesis_speech_outside_the_reference_is_false_alarm():
+    score = der.score([seg("A", 0, 10)], [seg("X", 0, 15)])
+    assert score == der.Score(missed=0.0, false_alarm=5.0, confusion=0.0, speech=10.0)
+    assert score.der == 0.5
+
+
+def test_no_reference_speech_is_pure_false_alarm():
+    # An empty reference: every hypothesis second is a false alarm and there is
+    # no speech to map against (DER itself is undefined, but the tally holds).
+    score = der.score([], [seg("X", 0, 10)])
+    assert score == der.Score(missed=0.0, false_alarm=10.0, confusion=0.0, speech=0.0)
+
+
+def test_one_hypothesis_speaker_split_across_two_references_is_confusion():
+    # A single hypothesis speaker covers both reference speakers; the mapping can
+    # only credit the one it overlaps most (B, 10s), the rest (A, 3s) is confusion.
+    ref = [seg("A", 0, 3), seg("B", 3, 13)]
+    hyp = [seg("X", 0, 13)]
+    score = der.score(ref, hyp)
+    assert score == der.Score(missed=0.0, false_alarm=0.0, confusion=3.0, speech=13.0)
+    assert score.der == pytest.approx(3 / 13)
+
+
+def test_one_reference_speaker_split_across_two_hypotheses_is_confusion():
+    # The mirror case (more hypothesis than reference speakers): only one of the
+    # two hypothesis speakers can be mapped to A, the other 5s is confusion.
+    ref = [seg("A", 0, 10)]
+    hyp = [seg("X", 0, 5), seg("Y", 5, 10)]
+    score = der.score(ref, hyp)
+    assert score == der.Score(missed=0.0, false_alarm=0.0, confusion=5.0, speech=10.0)
+    assert score.der == 0.5
+
+
+def test_optimal_mapping_beats_a_greedy_one():
+    # Co-occurrence is (A,X)=10, (A,Y)=9, (B,X)=8. Greedy takes A↔X (10) and is
+    # then stuck with B↔Y (0) for 10 correct; the optimal A↔Y + B↔X gives 17
+    # correct, so confusion is 27-17=10, not 27-10=17. Pins the max-assignment.
+    ref = [seg("A", 0, 19), seg("B", 19, 27)]
+    hyp = [seg("X", 0, 10), seg("Y", 10, 19), seg("X", 19, 27)]
+    score = der.score(ref, hyp)
+    assert score == der.Score(missed=0.0, false_alarm=0.0, confusion=10.0, speech=27.0)
+    assert score.der == pytest.approx(10 / 27)
+
+
+def test_overlapping_reference_speakers_count_per_speaker():
+    # [5,10) has two reference speakers talking at once, so it contributes 2x its
+    # 5s of wall-clock to reference speech (15s wall-clock -> 20s speaker-time).
+    ref = [seg("A", 0, 10), seg("B", 5, 15)]
+    score = der.score(ref, ref)
+    assert score == der.Score(missed=0.0, false_alarm=0.0, confusion=0.0, speech=20.0)
+
+
+def test_pooled_sums_components_for_corpus_der():
+    total = der.pooled(
+        [
+            der.Score(missed=1.0, false_alarm=2.0, confusion=3.0, speech=10.0),
+            der.Score(missed=0.0, false_alarm=1.0, confusion=1.0, speech=30.0),
+        ]
+    )
+    assert total == der.Score(missed=1.0, false_alarm=3.0, confusion=4.0, speech=40.0)
+    assert total.der == 0.2

From 1873402cfb65fe46ef4c164a17d406236e43a8aa Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 16 Jun 2026 19:32:24 +0000
Subject: [PATCH 2/2] Add Segment immutability and disjoint-speaker DER tests

Kill two mutation-gate survivors on core/der.py: assert Segment is frozen,
and cover the case where reference and hypothesis each carry an unmatched
speaker (so the optimal mapping must weigh a non-co-occurring pair). These
tests were validated by the full gate but missed the prior commit's stale
staging snapshot.
---
 tests/test_der.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_der.py b/tests/test_der.py
index f909db59..c4a1b614 100644
--- a/tests/test_der.py
+++ b/tests/test_der.py
@@ -20,6 +20,11 @@ def test_score_is_an_immutable_value():
         _assign(der.Score(missed=0.0, false_alarm=0.0, confusion=0.0, speech=1.0), "missed", 1.0)
 
 
+def test_segment_is_an_immutable_value():
+    with pytest.raises(dataclasses.FrozenInstanceError):
+        _assign(seg("A", 0, 10), "speaker", "B")
+
+
 def test_errors_and_der_combine_the_components():
     score = der.Score(missed=1.0, false_alarm=2.0, confusion=3.0, speech=10.0)
     assert score.errors == 6.0
@@ -80,6 +85,17 @@ def test_one_reference_speaker_split_across_two_hypotheses_is_confusion():
     assert score.der == 0.5
 
 
+def test_disjoint_extra_speakers_are_missed_and_false_alarm():
+    # A<->X overlap perfectly; reference B (10s) has no hypothesis (missed) and
+    # hypothesis Y (10s) has no reference (false alarm). The leftover B/Y pair
+    # never co-occurs, so the mapping cannot credit it as correct.
+    ref = [seg("A", 0, 10), seg("B", 10, 20)]
+    hyp = [seg("X", 0, 10), seg("Y", 20, 30)]
+    score = der.score(ref, hyp)
+    assert score == der.Score(missed=10.0, false_alarm=10.0, confusion=0.0, speech=20.0)
+    assert score.der == 1.0
+
+
 def test_optimal_mapping_beats_a_greedy_one():
     # Co-occurrence is (A,X)=10, (A,Y)=9, (B,X)=8. Greedy takes A↔X (10) and is
     # then stuck with B↔Y (0) for 10 correct; the optimal A↔Y + B↔X gives 17