deepset-ai · Kunal-Somani · Jun 15, 2026 · Jun 15, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -5,7 +5,7 @@
 from math import log2
 from typing import Any
 
-from haystack import Document, component
+from haystack import Document, component, default_to_dict
 
 
 @component
@@ -34,6 +34,51 @@ class DocumentNDCGEvaluator:
     ```
     """
 
+    def __init__(self, document_comparison_field: str = "content") -> None:
+        """
+        Create a DocumentNDCGEvaluator component.
+
+        :param document_comparison_field:
+            The Document field whose value identifies a document when ground truth and
+            retrieved documents are compared. Possible options:
+            - `"content"`: uses `doc.content` (the default)
+            - `"id"`: uses `doc.id`
+            - A `meta.` prefix followed by a key name: uses `doc.meta["<key>"]`
+              (e.g. `"meta.file_id"`, `"meta.page_number"`)
+              Nested keys are supported (e.g. `"meta.source.url"`).
+            The value is stored as given and is not validated here; an unsupported value
+            raises a `ValueError` only when a comparison value is extracted from a document.
+        """
+        self.document_comparison_field = document_comparison_field
+
+    def _get_comparison_value(self, doc: Document) -> Any:
+        """
+        Extract the comparison value from a document based on the configured field.
+        """
+        if self.document_comparison_field == "content":
+            return doc.content
+        if self.document_comparison_field == "id":
+            return doc.id
+        if self.document_comparison_field.startswith("meta."):
+            parts = self.document_comparison_field[5:].split(".")
+            value = doc.meta
+            for part in parts:
+                if not isinstance(value, dict) or part not in value:
+                    return None
+                value = value[part]
+            return value
+        msg = (
+            f"Unsupported document_comparison_field: '{self.document_comparison_field}'. "
+            "Use 'content', 'id', or 'meta.<key>'."
+        )
+        raise ValueError(msg)
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(self, document_comparison_field=self.document_comparison_field)
+
     @component.output_types(score=float, individual_scores=list[float])
     def run(
         self, ground_truth_documents: list[list[Document]], retrieved_documents: list[list[Document]]
@@ -78,7 +123,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document]
             The retrieved_documents to validate.
 
         :raises ValueError:
-            If the ground_truth_documents or the retrieved_documents are an empty a list.
+            If the ground_truth_documents or the retrieved_documents are an empty list.
             If the length of ground_truth_documents and retrieved_documents differs.
             If any list of documents in ground_truth_documents contains a mix of documents with and without a score.
         """
@@ -95,8 +140,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document]
                 msg = "Either none or all documents in each list of ground_truth_documents must have a score."
                 raise ValueError(msg)
 
-    @staticmethod
-    def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float:
+    def calculate_dcg(self, gt_docs: list[Document], ret_docs: list[Document]) -> float:
         """
         Calculate the discounted cumulative gain (DCG) of the retrieved documents.
 
@@ -109,24 +153,40 @@ def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float:
             documents based on the ground truth documents.
         """
         dcg = 0.0
-        relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs}
+        # Build lookup from comparison value -> relevance score, skipping documents
+        # whose comparison value cannot be determined (e.g. missing meta key)
+        relevant_value_to_score: dict[Any, float] = {}
+        for doc in gt_docs:
+            value = self._get_comparison_value(doc)
+            if value is not None:
+                relevant_value_to_score[value] = doc.score if doc.score is not None else 1
+
         for i, doc in enumerate(ret_docs):
-            if doc.id in relevant_id_to_score:  # TODO Related to https://github.com/deepset-ai/haystack/issues/8412
-                dcg += relevant_id_to_score[doc.id] / log2(i + 2)  # i + 2 because i is 0-indexed
+            value = self._get_comparison_value(doc)
+            if value is not None and value in relevant_value_to_score:
+                dcg += relevant_value_to_score[value] / log2(i + 2)  # i + 2 because i is 0-indexed
         return dcg
 
-    @staticmethod
-    def calculate_idcg(gt_docs: list[Document]) -> float:
+    def calculate_idcg(self, gt_docs: list[Document]) -> float:
         """
         Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents.
 
+        Ground truth documents whose comparison value cannot be determined (e.g. missing meta key)
+        are excluded, since they can never be matched in `calculate_dcg` either. Including them here
+        would inflate the IDCG and make it impossible for NDCG to reach 1.0 for a perfect retrieval.
+
         :param gt_docs:
             The ground truth documents.
         :returns:
             The ideal discounted cumulative gain (IDCG) of the ground truth documents.
         """
+        # Filter out documents that cannot be matched, consistent with calculate_dcg
+        matchable_docs = [doc for doc in gt_docs if self._get_comparison_value(doc) is not None]
+
         idcg = 0.0
-        for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)):
+        for i, doc in enumerate(
+            sorted(matchable_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)
+        ):
             # If the document has a score, use it; otherwise, use 1 for binary relevance.
             relevance = doc.score if doc.score is not None else 1
             idcg += relevance / log2(i + 2)  # i + 2 because i is 0-indexed

@@ -0,0 +1,16 @@
+upgrade:
+  - |
+    ``DocumentNDCGEvaluator`` now matches documents by their ``content`` field
+    by default instead of their auto-generated ``id``. Previously, ground
+    truth and retrieved documents were matched only if they had identical
+    ``id`` values, which rarely happened in practice because an auto-generated
+    ``id`` is a hash of the Document's fields, so any difference (for example
+    in ``meta``) yields a different ``id``. As a result, NDCG scores may change. To keep
+    the previous ``id``-based matching behavior, pass
+    ``document_comparison_field="id"`` when constructing the evaluator.
+enhancements:
+  - |
+    Added ``document_comparison_field`` parameter to ``DocumentNDCGEvaluator``,
+    consistent with ``DocumentMAPEvaluator``, ``DocumentMRREvaluator``, and
+    ``DocumentRecallEvaluator``. Users can now match documents by ``"content"``,
+    ``"id"``, or any ``"meta.<key>"`` field when calculating NDCG scores.
@@ -4,7 +4,7 @@
 
 import pytest
 
-from haystack import Document
+from haystack import Document, default_from_dict
 from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator
 
 
@@ -201,3 +201,155 @@ def test_calculate_idcg_empty():
     gt_docs = []
     idcg = evaluator.calculate_idcg(gt_docs)
     assert idcg == 0
+
+
+def test_to_dict_default():
+    evaluator = DocumentNDCGEvaluator()
+    data = evaluator.to_dict()
+    assert data == {
+        "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator",
+        "init_parameters": {"document_comparison_field": "content"},
+    }
+
+
+def test_to_dict_custom_field():
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
+    data = evaluator.to_dict()
+    assert data == {
+        "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator",
+        "init_parameters": {"document_comparison_field": "id"},
+    }
+
+
+def test_from_dict():
+    data = {
+        "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator",
+        "init_parameters": {"document_comparison_field": "id"},
+    }
+    evaluator = default_from_dict(DocumentNDCGEvaluator, data)
+    assert evaluator.document_comparison_field == "id"
+
+
+def test_run_with_id_comparison():
+    # Documents with same content but different IDs — id comparison
+    # must match on id, not content
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
+    result = evaluator.run(
+        ground_truth_documents=[[Document(id="doc1", content="France"), Document(id="doc2", content="Paris")]],
+        retrieved_documents=[
+            [
+                Document(id="doc1", content="different text"),
+                Document(id="doc3", content="Germany"),
+                Document(id="doc2", content="also different"),
+            ]
+        ],
+    )
+    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
+    assert result["score"] == pytest.approx(0.9197, abs=1e-4)
+
+
+def test_run_with_id_comparison_no_match():
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
+    result = evaluator.run(
+        ground_truth_documents=[[Document(id="doc1", content="France")]],
+        retrieved_documents=[[Document(id="doc99", content="France")]],
+    )
+    # Same content, different ID — should NOT match when comparing by id
+    assert result["individual_scores"] == [0.0]
+    assert result["score"] == 0.0
+
+
+def test_run_with_meta_comparison():
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id")
+    result = evaluator.run(
+        ground_truth_documents=[
+            [Document(content="France", meta={"file_id": "f1"}), Document(content="Paris", meta={"file_id": "f2"})]
+        ],
+        retrieved_documents=[
+            [
+                Document(content="different", meta={"file_id": "f1"}),
+                Document(content="irrelevant", meta={"file_id": "f99"}),
+                Document(content="also different", meta={"file_id": "f2"}),
+            ]
+        ],
+    )
+    assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
+    assert result["score"] == pytest.approx(0.9197, abs=1e-4)
+
+
+def test_run_with_nested_meta_comparison():
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.source.url")
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="x", meta={"source": {"url": "https://a.com"}})]],
+        retrieved_documents=[[Document(content="z", meta={"source": {"url": "https://a.com"}})]],
+    )
+    assert result["individual_scores"] == [1.0]
+    assert result["score"] == 1.0
+
+
+def test_run_with_meta_missing_key_treated_as_no_match():
+    # Documents missing the meta key should not match anything
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id")
+    result = evaluator.run(
+        ground_truth_documents=[[Document(content="France", meta={"file_id": "f1"})]],
+        retrieved_documents=[[Document(content="France", meta={})]],
+    )
+    assert result["individual_scores"] == [0.0]
+    assert result["score"] == 0.0
+
+
+def test_run_with_id_comparison_with_scores():
+    # Verify that relevance scores are honoured when comparing by id
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="id")
+    result = evaluator.run(
+        ground_truth_documents=[
+            [
+                Document(id="doc1", content="foo", score=3),
+                Document(id="doc2", content="bar", score=2),
+                Document(id="doc3", content="baz", score=3),
+                Document(id="doc6", content="qux", score=2),
+                Document(id="doc7", content="quux", score=3),
+                Document(id="doc8", content="corge", score=2),
+            ]
+        ],
+        retrieved_documents=[
+            [
+                Document(id="doc1", content="x"),
+                Document(id="doc2", content="y"),
+                Document(id="doc3", content="z"),
+                Document(id="doc4", content="w"),
+                Document(id="doc5", content="v"),
+            ]
+        ],
+    )
+    assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4)
+    assert result["score"] == pytest.approx(0.6592, abs=1e-4)
+
+
+def test_unsupported_comparison_field_raises():
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="embedding")
+    with pytest.raises(ValueError, match="Unsupported document_comparison_field"):
+        evaluator.run(
+            ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[Document(content="France")]]
+        )
+
+
+def test_run_with_meta_missing_key_can_still_reach_perfect_ndcg():
+    """
+    Regression test for the IDCG/DCG inflation bug: ground truth documents that
+    cannot be matched (missing the configured meta key) must be excluded from
+    IDCG too, otherwise NDCG can never reach 1.0 even for a perfect retrieval.
+    """
+    evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id")
+    result = evaluator.run(
+        ground_truth_documents=[
+            [
+                Document(content="France", meta={"file_id": "f1"}),
+                Document(content="unmatchable", meta={}),  # no file_id -> cannot be matched
+            ]
+        ],
+        retrieved_documents=[[Document(content="France", meta={"file_id": "f1"})]],
+    )
+    # Perfect retrieval of the one matchable document should yield NDCG of exactly 1.0
+    assert result["individual_scores"] == [1.0]
+    assert result["score"] == 1.0