Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 70 additions & 10 deletions haystack/components/evaluators/document_ndcg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from math import log2
from typing import Any

from haystack import Document, component
from haystack import Document, component, default_to_dict


@component
Expand Down Expand Up @@ -34,6 +34,51 @@ class DocumentNDCGEvaluator:
```
"""

def __init__(self, document_comparison_field: str = "content") -> None:
    """
    Create a DocumentNDCGEvaluator component.

    :param document_comparison_field:
        The Document field to use for comparison. Possible options:
        - `"content"`: uses `doc.content`
        - `"id"`: uses `doc.id`
        - A `meta.` prefix followed by a key name: uses `doc.meta["<key>"]`
            (e.g. `"meta.file_id"`, `"meta.page_number"`)
        Nested keys are supported (e.g. `"meta.source.url"`).
        The value is stored as given and is not validated here; an unsupported
        value only raises a `ValueError` once `_get_comparison_value` is called
        on a document.
    """
    # Stored verbatim: it is read by `_get_comparison_value` and serialized by `to_dict`.
    self.document_comparison_field = document_comparison_field

def _get_comparison_value(self, doc: Document) -> Any:
"""
Extract the comparison value from a document based on the configured field.
"""
if self.document_comparison_field == "content":
return doc.content
if self.document_comparison_field == "id":
return doc.id
if self.document_comparison_field.startswith("meta."):
parts = self.document_comparison_field[5:].split(".")
value = doc.meta
for part in parts:
if not isinstance(value, dict) or part not in value:
return None
value = value[part]
return value
msg = (
f"Unsupported document_comparison_field: '{self.document_comparison_field}'. "
"Use 'content', 'id', or 'meta.<key>'."
)
raise ValueError(msg)

def to_dict(self) -> dict[str, Any]:
    """
    Serialize this evaluator to a dictionary.

    The only init parameter passed on for serialization is `document_comparison_field`.

    :returns:
        The dictionary built by `default_to_dict` for this component.
    """
    init_parameters = {"document_comparison_field": self.document_comparison_field}
    return default_to_dict(self, **init_parameters)

@component.output_types(score=float, individual_scores=list[float])
def run(
self, ground_truth_documents: list[list[Document]], retrieved_documents: list[list[Document]]
Expand Down Expand Up @@ -78,7 +123,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document]
The retrieved_documents to validate.

:raises ValueError:
If the ground_truth_documents or the retrieved_documents are an empty a list.
If the ground_truth_documents or the retrieved_documents are an empty list.
If the length of ground_truth_documents and retrieved_documents differs.
If any list of documents in ground_truth_documents contains a mix of documents with and without a score.
"""
Expand All @@ -95,8 +140,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document]
msg = "Either none or all documents in each list of ground_truth_documents must have a score."
raise ValueError(msg)

@staticmethod
def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float:
def calculate_dcg(self, gt_docs: list[Document], ret_docs: list[Document]) -> float:
Comment thread
Kunal-Somani marked this conversation as resolved.
"""
Calculate the discounted cumulative gain (DCG) of the retrieved documents.

Expand All @@ -109,24 +153,40 @@ def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float:
documents based on the ground truth documents.
"""
dcg = 0.0
relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs}
# Build lookup from comparison value -> relevance score, skipping documents
# whose comparison value cannot be determined (e.g. missing meta key)
relevant_value_to_score: dict[Any, float] = {}
for doc in gt_docs:
value = self._get_comparison_value(doc)
if value is not None:
relevant_value_to_score[value] = doc.score if doc.score is not None else 1

for i, doc in enumerate(ret_docs):
if doc.id in relevant_id_to_score: # TODO Related to https://github.com/deepset-ai/haystack/issues/8412
dcg += relevant_id_to_score[doc.id] / log2(i + 2) # i + 2 because i is 0-indexed
value = self._get_comparison_value(doc)
if value is not None and value in relevant_value_to_score:
dcg += relevant_value_to_score[value] / log2(i + 2) # i + 2 because i is 0-indexed
return dcg

@staticmethod
def calculate_idcg(gt_docs: list[Document]) -> float:
def calculate_idcg(self, gt_docs: list[Document]) -> float:
"""
Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents.

Ground truth documents whose comparison value cannot be determined (e.g. missing meta key)
are excluded, since they can never be matched in `calculate_dcg` either. Including them here
would inflate the IDCG and make it impossible for NDCG to reach 1.0 for a perfect retrieval.

:param gt_docs:
The ground truth documents.
:returns:
The ideal discounted cumulative gain (IDCG) of the ground truth documents.
"""
# Filter out documents that cannot be matched, consistent with calculate_dcg
matchable_docs = [doc for doc in gt_docs if self._get_comparison_value(doc) is not None]

idcg = 0.0
for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)):
for i, doc in enumerate(
sorted(matchable_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)
):
# If the document has a score, use it; otherwise, use 1 for binary relevance.
Comment thread
Kunal-Somani marked this conversation as resolved.
relevance = doc.score if doc.score is not None else 1
idcg += relevance / log2(i + 2) # i + 2 because i is 0-indexed
Expand Down
Comment thread
Kunal-Somani marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
upgrade:
- |
``DocumentNDCGEvaluator`` now matches documents by their ``content`` field
by default instead of their auto-generated ``id``. Previously, ground
truth and retrieved documents were matched only if they had identical
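``id`` values, which rarely happened in practice since an auto-generated ID is a
hash of the Document's content together with its other fields (such as ``meta``
and ``embedding``), so documents with the same content can still get different
IDs. As a result, NDCG scores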
computed with this evaluator may change for existing pipelines. To keep
the previous ``id``-based matching behavior, pass
``document_comparison_field="id"`` when constructing the evaluator.
enhancements:
- |
Added ``document_comparison_field`` parameter to ``DocumentNDCGEvaluator``,
consistent with ``DocumentMAPEvaluator``, ``DocumentMRREvaluator``, and
``DocumentRecallEvaluator``. Users can now match documents by ``"content"``,
``"id"``, or any ``"meta.<key>"`` field when calculating NDCG scores.
154 changes: 153 additions & 1 deletion test/components/evaluators/test_document_ndcg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pytest

from haystack import Document
from haystack import Document, default_from_dict
from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator


Expand Down Expand Up @@ -201,3 +201,155 @@ def test_calculate_idcg_empty():
gt_docs = []
idcg = evaluator.calculate_idcg(gt_docs)
assert idcg == 0


def test_to_dict_default():
    """A default-constructed evaluator serializes with the ``"content"`` comparison field."""
    expected = {
        "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator",
        "init_parameters": {"document_comparison_field": "content"},
    }
    assert DocumentNDCGEvaluator().to_dict() == expected


def test_to_dict_custom_field():
    """A non-default comparison field is carried over into the serialized init parameters."""
    expected = {
        "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator",
        "init_parameters": {"document_comparison_field": "id"},
    }
    serialized = DocumentNDCGEvaluator(document_comparison_field="id").to_dict()
    assert serialized == expected


def test_from_dict():
    """Deserializing restores the configured comparison field on the new instance."""
    serialized = {
        "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator",
        "init_parameters": {"document_comparison_field": "id"},
    }
    restored = default_from_dict(DocumentNDCGEvaluator, serialized)
    assert restored.document_comparison_field == "id"


def test_run_with_id_comparison():
    """With ``id`` comparison, documents match on their id even when the content differs."""
    ground_truth = [Document(id="doc1", content="France"), Document(id="doc2", content="Paris")]
    retrieved = [
        Document(id="doc1", content="different text"),
        Document(id="doc3", content="Germany"),
        Document(id="doc2", content="also different"),
    ]

    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
    result = evaluator.run(ground_truth_documents=[ground_truth], retrieved_documents=[retrieved])

    # Hits at ranks 1 and 3 with binary relevance:
    # DCG = 1/log2(2) + 1/log2(4) = 1.5, IDCG = 1/log2(2) + 1/log2(3) ~= 1.6309.
    expected = pytest.approx(0.9197, abs=1e-4)
    assert result["individual_scores"][0] == expected
    assert result["score"] == expected


def test_run_with_id_comparison_no_match():
    """Identical content must not count as a match when documents are compared by ``id``."""
    ground_truth = [[Document(id="doc1", content="France")]]
    retrieved = [[Document(id="doc99", content="France")]]

    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
    result = evaluator.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved)

    # The ids differ ("doc1" vs "doc99"), so nothing retrieved is relevant.
    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_with_meta_comparison():
    """With a ``meta.<key>`` comparison field, documents match on that metadata value."""
    ground_truth = [
        Document(content="France", meta={"file_id": "f1"}),
        Document(content="Paris", meta={"file_id": "f2"}),
    ]
    retrieved = [
        Document(content="different", meta={"file_id": "f1"}),
        Document(content="irrelevant", meta={"file_id": "f99"}),
        Document(content="also different", meta={"file_id": "f2"}),
    ]

    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id")
    result = evaluator.run(ground_truth_documents=[ground_truth], retrieved_documents=[retrieved])

    # Relevant file ids appear at ranks 1 and 3; both ground truth documents are unscored.
    expected = pytest.approx(0.9197, abs=1e-4)
    assert result["individual_scores"][0] == expected
    assert result["score"] == expected


def test_run_with_nested_meta_comparison():
    """A dotted ``meta`` path is resolved through nested metadata dictionaries."""
    ground_truth = [[Document(content="x", meta={"source": {"url": "https://a.com"}})]]
    retrieved = [[Document(content="z", meta={"source": {"url": "https://a.com"}})]]

    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.source.url")
    result = evaluator.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved)

    assert result["individual_scores"] == [1.0]
    assert result["score"] == 1.0


def test_run_with_meta_missing_key_treated_as_no_match():
    """A retrieved document lacking the configured meta key cannot match any ground truth."""
    ground_truth = [[Document(content="France", meta={"file_id": "f1"})]]
    retrieved = [[Document(content="France", meta={})]]  # same content, but no file_id

    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id")
    result = evaluator.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved)

    assert result["individual_scores"] == [0.0]
    assert result["score"] == 0.0


def test_run_with_id_comparison_with_scores():
    """Graded relevance scores on the ground truth are honoured when comparing by ``id``."""
    ground_truth_specs = [
        ("doc1", "foo", 3),
        ("doc2", "bar", 2),
        ("doc3", "baz", 3),
        ("doc6", "qux", 2),
        ("doc7", "quux", 3),
        ("doc8", "corge", 2),
    ]
    retrieved_specs = [("doc1", "x"), ("doc2", "y"), ("doc3", "z"), ("doc4", "w"), ("doc5", "v")]

    ground_truth = [
        Document(id=doc_id, content=text, score=relevance) for doc_id, text, relevance in ground_truth_specs
    ]
    retrieved = [Document(id=doc_id, content=text) for doc_id, text in retrieved_specs]

    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
    result = evaluator.run(ground_truth_documents=[ground_truth], retrieved_documents=[retrieved])

    # Only doc1, doc2 and doc3 are retrieved; three scored ground truth documents are missed.
    expected = pytest.approx(0.6592, abs=1e-4)
    assert result["individual_scores"][0] == expected
    assert result["score"] == expected


def test_unsupported_comparison_field_raises():
    """An unsupported comparison field is accepted at construction but rejected when running."""
    evaluator = DocumentNDCGEvaluator(document_comparison_field="embedding")
    ground_truth = [[Document(content="France")]]
    retrieved = [[Document(content="France")]]

    with pytest.raises(ValueError, match="Unsupported document_comparison_field"):
        evaluator.run(ground_truth_documents=ground_truth, retrieved_documents=retrieved)


def test_run_with_meta_missing_key_can_still_reach_perfect_ndcg():
    """
    Guard against IDCG being inflated by ground truth documents that can never match.

    A ground truth document without the configured meta key has no comparison value,
    so it has to be left out of the IDCG as well; otherwise a perfect retrieval of
    the matchable documents could not reach an NDCG of 1.0.
    """
    matchable = Document(content="France", meta={"file_id": "f1"})
    unmatchable = Document(content="unmatchable", meta={})  # no file_id -> cannot be matched
    retrieved = [Document(content="France", meta={"file_id": "f1"})]

    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id")
    result = evaluator.run(
        ground_truth_documents=[[matchable, unmatchable]],
        retrieved_documents=[retrieved],
    )

    # The single matchable document is retrieved at rank 1, so the score must be exactly 1.0.
    assert result["individual_scores"] == [1.0]
    assert result["score"] == 1.0
Loading