From 41a96ee4d6597e452394a38fe798528f7f823137 Mon Sep 17 00:00:00 2001 From: Kunal Somani Date: Mon, 15 Jun 2026 15:06:43 +0530 Subject: [PATCH 1/4] feat: add document_comparison_field to DocumentNDCGEvaluator --- .../components/evaluators/document_ndcg.py | 102 ++++++++++---- ...ld-to-ndcg-evaluator-f9abbbd556f49c04.yaml | 8 ++ .../evaluators/test_document_ndcg.py | 133 +++++++++++++++++- 3 files changed, 217 insertions(+), 26 deletions(-) create mode 100644 releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml diff --git a/haystack/components/evaluators/document_ndcg.py b/haystack/components/evaluators/document_ndcg.py index ca88292f6c..d3e7e4b43d 100644 --- a/haystack/components/evaluators/document_ndcg.py +++ b/haystack/components/evaluators/document_ndcg.py @@ -5,35 +5,80 @@ from math import log2 from typing import Any -from haystack import Document, component +from haystack import Document, component, default_to_dict @component class DocumentNDCGEvaluator: """ - Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents. + Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents. - Each question can have multiple ground truth documents and multiple retrieved documents. - If the ground truth documents have relevance scores, the NDCG calculation uses these scores. - Otherwise, it assumes binary relevance of all ground truth documents. + Each question can have multiple ground truth documents and multiple retrieved documents. + If the ground truth documents have relevance scores, the NDCG calculation uses these scores. + Otherwise, it assumes binary relevance of all ground truth documents. - Usage example: + Usage example: ```python - from haystack import Document - from haystack.components.evaluators import DocumentNDCGEvaluator - - evaluator = DocumentNDCGEvaluator() - result = evaluator.run( - ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]], - retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]], - ) - print(result["individual_scores"]) - # [0.8869] - print(result["score"]) - # 0.8869 + from haystack import Document + from haystack.components.evaluators import DocumentNDCGEvaluator + + evaluator = DocumentNDCGEvaluator() + result = evaluator.run( + ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]], + retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]], + ) + print(result["individual_scores"]) + # [0.8869] + print(result["score"]) + # 0.8869 ``` """ + def __init__(self, document_comparison_field: str = "content") -> None: + """ + Create a DocumentNDCGEvaluator component. + + :param document_comparison_field: + The Document field to use for comparison. Possible options: + - `"content"`: uses `doc.content` + - `"id"`: uses `doc.id` + - A `meta.` prefix followed by a key name: uses `doc.meta[""]` + (e.g. `"meta.file_id"`, `"meta.page_number"`) + Nested keys are supported (e.g. `"meta.source.url"`). + """ + self.document_comparison_field = document_comparison_field + + def _get_comparison_value(self, doc: Document) -> Any: + """ + Extract the comparison value from a document based on the configured field. + """ + if self.document_comparison_field == "content": + return doc.content + if self.document_comparison_field == "id": + return doc.id + if self.document_comparison_field.startswith("meta."): + parts = self.document_comparison_field[5:].split(".") + value = doc.meta + for part in parts: + if not isinstance(value, dict) or part not in value: + return None + value = value[part] + return value + msg = ( + f"Unsupported document_comparison_field: '{self.document_comparison_field}'. " + "Use 'content', 'id', or 'meta.'." + ) + raise ValueError(msg) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict(self, document_comparison_field=self.document_comparison_field) + @component.output_types(score=float, individual_scores=list[float]) def run( self, ground_truth_documents: list[list[Document]], retrieved_documents: list[list[Document]] @@ -78,7 +123,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document] The retrieved_documents to validate. :raises ValueError: - If the ground_truth_documents or the retrieved_documents are an empty a list. + If the ground_truth_documents or the retrieved_documents are an empty list. If the length of ground_truth_documents and retrieved_documents differs. If any list of documents in ground_truth_documents contains a mix of documents with and without a score. """ @@ -95,8 +140,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document] msg = "Either none or all documents in each list of ground_truth_documents must have a score." raise ValueError(msg) - @staticmethod - def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float: + def calculate_dcg(self, gt_docs: list[Document], ret_docs: list[Document]) -> float: """ Calculate the discounted cumulative gain (DCG) of the retrieved documents. @@ -104,15 +148,24 @@ def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float: The ground truth documents. :param ret_docs: The retrieved documents. + :param document_comparison_field: + The Document field used to match retrieved documents against ground truth documents. :returns: The discounted cumulative gain (DCG) of the retrieved documents based on the ground truth documents. """ dcg = 0.0 - relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs} + # Build lookup from comparison value -> relevance score + relevant_value_to_score: dict[Any, float] = {} + for doc in gt_docs: + value = self._get_comparison_value(doc) + if value is not None: + relevant_value_to_score[value] = doc.score if doc.score is not None else 1 + for i, doc in enumerate(ret_docs): - if doc.id in relevant_id_to_score: # TODO Related to https://github.com/deepset-ai/haystack/issues/8412 - dcg += relevant_id_to_score[doc.id] / log2(i + 2) # i + 2 because i is 0-indexed + value = self._get_comparison_value(doc) + if value is not None and value in relevant_value_to_score: + dcg += relevant_value_to_score[value] / log2(i + 2) # i + 2 because i is 0-indexed return dcg @staticmethod @@ -127,7 +180,6 @@ def calculate_idcg(gt_docs: list[Document]) -> float: """ idcg = 0.0 for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)): - # If the document has a score, use it; otherwise, use 1 for binary relevance. relevance = doc.score if doc.score is not None else 1 idcg += relevance / log2(i + 2) # i + 2 because i is 0-indexed return idcg diff --git a/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml new file mode 100644 index 0000000000..5d7ab2e18e --- /dev/null +++ b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml @@ -0,0 +1,8 @@ +enhancements: + - | + Added ``document_comparison_field`` parameter to ``DocumentNDCGEvaluator``, + consistent with ``DocumentMAPEvaluator``, ``DocumentMRREvaluator``, and + ``DocumentRecallEvaluator``. Users can now match documents by ``"content"``, + ``"id"``, or any ``"meta."`` field when calculating NDCG scores, + resolving a known limitation referenced in + `#8412 `_. diff --git a/test/components/evaluators/test_document_ndcg.py b/test/components/evaluators/test_document_ndcg.py index 1b300eb4d8..408c736e7d 100644 --- a/test/components/evaluators/test_document_ndcg.py +++ b/test/components/evaluators/test_document_ndcg.py @@ -4,7 +4,7 @@ import pytest -from haystack import Document +from haystack import Document, default_from_dict from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator @@ -201,3 +201,134 @@ def test_calculate_idcg_empty(): gt_docs = [] idcg = evaluator.calculate_idcg(gt_docs) assert idcg == 0 + + +def test_to_dict_default(): + evaluator = DocumentNDCGEvaluator() + data = evaluator.to_dict() + assert data == { + "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator", + "init_parameters": {"document_comparison_field": "content"}, + } + + +def test_to_dict_custom_field(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + data = evaluator.to_dict() + assert data == { + "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator", + "init_parameters": {"document_comparison_field": "id"}, + } + + +def test_from_dict(): + data = { + "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator", + "init_parameters": {"document_comparison_field": "id"}, + } + evaluator = default_from_dict(DocumentNDCGEvaluator, data) + assert evaluator.document_comparison_field == "id" + + +def test_run_with_id_comparison(): + # Documents with same content but different IDs — id comparison + # must match on id, not content + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + result = evaluator.run( + ground_truth_documents=[[Document(id="doc1", content="France"), Document(id="doc2", content="Paris")]], + retrieved_documents=[ + [ + Document(id="doc1", content="different text"), + Document(id="doc3", content="Germany"), + Document(id="doc2", content="also different"), + ] + ], + ) + assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4) + assert result["score"] == pytest.approx(0.9197, abs=1e-4) + + +def test_run_with_id_comparison_no_match(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + result = evaluator.run( + ground_truth_documents=[[Document(id="doc1", content="France")]], + retrieved_documents=[[Document(id="doc99", content="France")]], + ) + # Same content, different ID — should NOT match when comparing by id + assert result["individual_scores"] == [0.0] + assert result["score"] == 0.0 + + +def test_run_with_meta_comparison(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id") + result = evaluator.run( + ground_truth_documents=[ + [Document(content="France", meta={"file_id": "f1"}), Document(content="Paris", meta={"file_id": "f2"})] + ], + retrieved_documents=[ + [ + Document(content="different", meta={"file_id": "f1"}), + Document(content="irrelevant", meta={"file_id": "f99"}), + Document(content="also different", meta={"file_id": "f2"}), + ] + ], + ) + assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4) + assert result["score"] == pytest.approx(0.9197, abs=1e-4) + + +def test_run_with_nested_meta_comparison(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.source.url") + result = evaluator.run( + ground_truth_documents=[[Document(content="x", meta={"source": {"url": "https://a.com"}})]], + retrieved_documents=[[Document(content="z", meta={"source": {"url": "https://a.com"}})]], + ) + assert result["individual_scores"] == [1.0] + assert result["score"] == 1.0 + + +def test_run_with_meta_missing_key_treated_as_no_match(): + # Documents missing the meta key should not match anything + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id") + result = evaluator.run( + ground_truth_documents=[[Document(content="France", meta={"file_id": "f1"})]], + retrieved_documents=[[Document(content="France", meta={})]], + ) + assert result["individual_scores"] == [0.0] + assert result["score"] == 0.0 + + +def test_run_with_id_comparison_with_scores(): + # Verify that relevance scores are honoured when comparing by id + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + result = evaluator.run( + ground_truth_documents=[ + [ + Document(id="doc1", content="foo", score=3), + Document(id="doc2", content="bar", score=2), + Document(id="doc3", content="baz", score=3), + Document(id="doc6", content="qux", score=2), + Document(id="doc7", content="quux", score=3), + Document(id="doc8", content="corge", score=2), + ] + ], + retrieved_documents=[ + [ + Document(id="doc1", content="x"), + Document(id="doc2", content="y"), + Document(id="doc3", content="z"), + Document(id="doc4", content="w"), + Document(id="doc5", content="v"), + ] + ], + ) + assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4) + assert result["score"] == pytest.approx(0.6592, abs=1e-4) + + +def test_unsupported_comparison_field_raises(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="embedding") + with pytest.raises(ValueError, match="Unsupported document_comparison_field"): + evaluator.run( + ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[Document(content="France")]] + ) From b40158aa68efe24931555bbf4b9fd3a98c05161f Mon Sep 17 00:00:00 2001 From: Kunal Somani Date: Mon, 15 Jun 2026 15:20:52 +0530 Subject: [PATCH 2/4] chore: simplify release note RST syntax --- ...t-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml index 5d7ab2e18e..5f66487ce5 100644 --- a/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml +++ b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml @@ -4,5 +4,5 @@ enhancements: consistent with ``DocumentMAPEvaluator``, ``DocumentMRREvaluator``, and ``DocumentRecallEvaluator``. Users can now match documents by ``"content"``, ``"id"``, or any ``"meta."`` field when calculating NDCG scores, - resolving a known limitation referenced in - `#8412 `_. + resolving a known limitation where documents were previously matched by + hardcoded ``id`` comparison instead of configurable field comparison. From 5881b799f87f2a06951d6e3c40df9da1ea2c03b3 Mon Sep 17 00:00:00 2001 From: Kunal Somani Date: Fri, 19 Jun 2026 15:48:48 +0530 Subject: [PATCH 3/4] fix: address review comments - fix IDCG/DCG mismatch, docstring formatting, add upgrade note --- .../components/evaluators/document_ndcg.py | 21 +++++++++++++------ ...ld-to-ndcg-evaluator-f9abbbd556f49c04.yaml | 14 ++++++++++--- .../evaluators/test_document_ndcg.py | 21 +++++++++++++++++++ 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/haystack/components/evaluators/document_ndcg.py b/haystack/components/evaluators/document_ndcg.py index d3e7e4b43d..d395d3fb83 100644 --- a/haystack/components/evaluators/document_ndcg.py +++ b/haystack/components/evaluators/document_ndcg.py @@ -1,3 +1,4 @@ +# haystack/components/evaluators/document_ndcg.py # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 @@ -148,14 +149,13 @@ def calculate_dcg(self, gt_docs: list[Document], ret_docs: list[Document]) -> fl The ground truth documents. :param ret_docs: The retrieved documents. - :param document_comparison_field: - The Document field used to match retrieved documents against ground truth documents. :returns: The discounted cumulative gain (DCG) of the retrieved documents based on the ground truth documents. """ dcg = 0.0 - # Build lookup from comparison value -> relevance score + # Build lookup from comparison value -> relevance score, skipping documents + # whose comparison value cannot be determined (e.g. missing meta key) relevant_value_to_score: dict[Any, float] = {} for doc in gt_docs: value = self._get_comparison_value(doc) @@ -168,18 +168,27 @@ def calculate_dcg(self, gt_docs: list[Document], ret_docs: list[Document]) -> fl dcg += relevant_value_to_score[value] / log2(i + 2) # i + 2 because i is 0-indexed return dcg - @staticmethod - def calculate_idcg(gt_docs: list[Document]) -> float: + def calculate_idcg(self, gt_docs: list[Document]) -> float: """ Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents. + Ground truth documents whose comparison value cannot be determined (e.g. missing meta key) + are excluded, since they can never be matched in `calculate_dcg` either. Including them here + would inflate the IDCG and make it impossible for NDCG to reach 1.0 for a perfect retrieval. + :param gt_docs: The ground truth documents. :returns: The ideal discounted cumulative gain (IDCG) of the ground truth documents. """ + # Filter out documents that cannot be matched, consistent with calculate_dcg + matchable_docs = [doc for doc in gt_docs if self._get_comparison_value(doc) is not None] + idcg = 0.0 - for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)): + for i, doc in enumerate( + sorted(matchable_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True) + ): + # If the document has a score, use it; otherwise, use 1 for binary relevance. relevance = doc.score if doc.score is not None else 1 idcg += relevance / log2(i + 2) # i + 2 because i is 0-indexed return idcg diff --git a/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml index 5f66487ce5..57b45d3150 100644 --- a/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml +++ b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml @@ -1,8 +1,16 @@ +upgrade: + - | + ``DocumentNDCGEvaluator`` now matches documents by their ``content`` field + by default instead of their auto-generated ``id``. Previously, ground + truth and retrieved documents were matched only if they had identical + ``id`` values, which rarely happened in practice since IDs are generated + independently for each Document instance. As a result, NDCG scores + computed with this evaluator may change for existing pipelines. To keep + the previous ``id``-based matching behavior, pass + ``document_comparison_field="id"`` when constructing the evaluator. enhancements: - | Added ``document_comparison_field`` parameter to ``DocumentNDCGEvaluator``, consistent with ``DocumentMAPEvaluator``, ``DocumentMRREvaluator``, and ``DocumentRecallEvaluator``. Users can now match documents by ``"content"``, - ``"id"``, or any ``"meta."`` field when calculating NDCG scores, - resolving a known limitation where documents were previously matched by - hardcoded ``id`` comparison instead of configurable field comparison. + ``"id"``, or any ``"meta."`` field when calculating NDCG scores. diff --git a/test/components/evaluators/test_document_ndcg.py b/test/components/evaluators/test_document_ndcg.py index 408c736e7d..3ae49595ae 100644 --- a/test/components/evaluators/test_document_ndcg.py +++ b/test/components/evaluators/test_document_ndcg.py @@ -332,3 +332,24 @@ def test_unsupported_comparison_field_raises(): evaluator.run( ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[Document(content="France")]] ) + + +def test_run_with_meta_missing_key_can_still_reach_perfect_ndcg(): + """ + Regression test for the IDCG/DCG inflation bug: ground truth documents that + cannot be matched (missing the configured meta key) must be excluded from + IDCG too, otherwise NDCG can never reach 1.0 even for a perfect retrieval. + """ + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id") + result = evaluator.run( + ground_truth_documents=[ + [ + Document(content="France", meta={"file_id": "f1"}), + Document(content="unmatchable", meta={}), # no file_id -> cannot be matched + ] + ], + retrieved_documents=[[Document(content="France", meta={"file_id": "f1"})]], + ) + # Perfect retrieval of the one matchable document should yield NDCG of exactly 1.0 + assert result["individual_scores"] == [1.0] + assert result["score"] == 1.0 From c0948f087773b4d4cadd814ff7c76e16358bfb69 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Fri, 19 Jun 2026 14:12:48 +0200 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: bogdankostic --- .../components/evaluators/document_ndcg.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/haystack/components/evaluators/document_ndcg.py b/haystack/components/evaluators/document_ndcg.py index d395d3fb83..14a8463205 100644 --- a/haystack/components/evaluators/document_ndcg.py +++ b/haystack/components/evaluators/document_ndcg.py @@ -1,4 +1,3 @@ -# haystack/components/evaluators/document_ndcg.py # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 @@ -12,26 +11,26 @@ @component class DocumentNDCGEvaluator: """ - Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents. + Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents. - Each question can have multiple ground truth documents and multiple retrieved documents. - If the ground truth documents have relevance scores, the NDCG calculation uses these scores. - Otherwise, it assumes binary relevance of all ground truth documents. + Each question can have multiple ground truth documents and multiple retrieved documents. + If the ground truth documents have relevance scores, the NDCG calculation uses these scores. + Otherwise, it assumes binary relevance of all ground truth documents. - Usage example: + Usage example: ```python - from haystack import Document - from haystack.components.evaluators import DocumentNDCGEvaluator - - evaluator = DocumentNDCGEvaluator() - result = evaluator.run( - ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]], - retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]], - ) - print(result["individual_scores"]) - # [0.8869] - print(result["score"]) - # 0.8869 + from haystack import Document + from haystack.components.evaluators import DocumentNDCGEvaluator + + evaluator = DocumentNDCGEvaluator() + result = evaluator.run( + ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]], + retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]], + ) + print(result["individual_scores"]) + # [0.8869] + print(result["score"]) + # 0.8869 ``` """