diff --git a/haystack/components/evaluators/document_ndcg.py b/haystack/components/evaluators/document_ndcg.py index ca88292f6c..14a8463205 100644 --- a/haystack/components/evaluators/document_ndcg.py +++ b/haystack/components/evaluators/document_ndcg.py @@ -5,7 +5,7 @@ from math import log2 from typing import Any -from haystack import Document, component +from haystack import Document, component, default_to_dict @component @@ -34,6 +34,51 @@ class DocumentNDCGEvaluator: ``` """ + def __init__(self, document_comparison_field: str = "content") -> None: + """ + Create a DocumentNDCGEvaluator component. + + :param document_comparison_field: + The Document field to use for comparison. Possible options: + - `"content"`: uses `doc.content` + - `"id"`: uses `doc.id` + - A `meta.` prefix followed by a key name: uses `doc.meta["<key>"]` + (e.g. `"meta.file_id"`, `"meta.page_number"`) + Nested keys are supported (e.g. `"meta.source.url"`). + """ + self.document_comparison_field = document_comparison_field + + def _get_comparison_value(self, doc: Document) -> Any: + """ + Extract the comparison value from a document based on the configured field. + """ + if self.document_comparison_field == "content": + return doc.content + if self.document_comparison_field == "id": + return doc.id + if self.document_comparison_field.startswith("meta."): + parts = self.document_comparison_field[5:].split(".") + value = doc.meta + for part in parts: + if not isinstance(value, dict) or part not in value: + return None + value = value[part] + return value + msg = ( + f"Unsupported document_comparison_field: '{self.document_comparison_field}'. " + "Use 'content', 'id', or 'meta.'." + ) + raise ValueError(msg) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
+ """ + return default_to_dict(self, document_comparison_field=self.document_comparison_field) + @component.output_types(score=float, individual_scores=list[float]) def run( self, ground_truth_documents: list[list[Document]], retrieved_documents: list[list[Document]] @@ -78,7 +123,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document] The retrieved_documents to validate. :raises ValueError: - If the ground_truth_documents or the retrieved_documents are an empty a list. + If the ground_truth_documents or the retrieved_documents are an empty list. If the length of ground_truth_documents and retrieved_documents differs. If any list of documents in ground_truth_documents contains a mix of documents with and without a score. """ @@ -95,8 +140,7 @@ def validate_inputs(gt_docs: list[list[Document]], ret_docs: list[list[Document] msg = "Either none or all documents in each list of ground_truth_documents must have a score." raise ValueError(msg) - @staticmethod - def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float: + def calculate_dcg(self, gt_docs: list[Document], ret_docs: list[Document]) -> float: """ Calculate the discounted cumulative gain (DCG) of the retrieved documents. @@ -109,24 +153,40 @@ def calculate_dcg(gt_docs: list[Document], ret_docs: list[Document]) -> float: documents based on the ground truth documents. """ dcg = 0.0 - relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs} + # Build lookup from comparison value -> relevance score, skipping documents + # whose comparison value cannot be determined (e.g. 
missing meta key) + relevant_value_to_score: dict[Any, float] = {} + for doc in gt_docs: + value = self._get_comparison_value(doc) + if value is not None: + relevant_value_to_score[value] = doc.score if doc.score is not None else 1 + for i, doc in enumerate(ret_docs): - if doc.id in relevant_id_to_score: # TODO Related to https://github.com/deepset-ai/haystack/issues/8412 - dcg += relevant_id_to_score[doc.id] / log2(i + 2) # i + 2 because i is 0-indexed + value = self._get_comparison_value(doc) + if value is not None and value in relevant_value_to_score: + dcg += relevant_value_to_score[value] / log2(i + 2) # i + 2 because i is 0-indexed return dcg - @staticmethod - def calculate_idcg(gt_docs: list[Document]) -> float: + def calculate_idcg(self, gt_docs: list[Document]) -> float: """ Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents. + Ground truth documents whose comparison value cannot be determined (e.g. missing meta key) + are excluded, since they can never be matched in `calculate_dcg` either. Including them here + would inflate the IDCG and make it impossible for NDCG to reach 1.0 for a perfect retrieval. + :param gt_docs: The ground truth documents. :returns: The ideal discounted cumulative gain (IDCG) of the ground truth documents. """ + # Filter out documents that cannot be matched, consistent with calculate_dcg + matchable_docs = [doc for doc in gt_docs if self._get_comparison_value(doc) is not None] + idcg = 0.0 - for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)): + for i, doc in enumerate( + sorted(matchable_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True) + ): # If the document has a score, use it; otherwise, use 1 for binary relevance. 
relevance = doc.score if doc.score is not None else 1 idcg += relevance / log2(i + 2) # i + 2 because i is 0-indexed diff --git a/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml new file mode 100644 index 0000000000..57b45d3150 --- /dev/null +++ b/releasenotes/notes/add-document-comparison-field-to-ndcg-evaluator-f9abbbd556f49c04.yaml @@ -0,0 +1,16 @@ +upgrade: + - | + ``DocumentNDCGEvaluator`` now matches documents by their ``content`` field + by default instead of their auto-generated ``id``. Previously, ground + truth and retrieved documents were matched only if they had identical + ``id`` values, which rarely happened in practice since IDs are generated + independently for each Document instance. As a result, NDCG scores + computed with this evaluator may change for existing pipelines. To keep + the previous ``id``-based matching behavior, pass + ``document_comparison_field="id"`` when constructing the evaluator. +enhancements: + - | + Added ``document_comparison_field`` parameter to ``DocumentNDCGEvaluator``, + consistent with ``DocumentMAPEvaluator``, ``DocumentMRREvaluator``, and + ``DocumentRecallEvaluator``. Users can now match documents by ``"content"``, + ``"id"``, or any ``"meta.<key>"`` field when calculating NDCG scores. 
diff --git a/test/components/evaluators/test_document_ndcg.py b/test/components/evaluators/test_document_ndcg.py index 1b300eb4d8..3ae49595ae 100644 --- a/test/components/evaluators/test_document_ndcg.py +++ b/test/components/evaluators/test_document_ndcg.py @@ -4,7 +4,7 @@ import pytest -from haystack import Document +from haystack import Document, default_from_dict from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator @@ -201,3 +201,155 @@ def test_calculate_idcg_empty(): gt_docs = [] idcg = evaluator.calculate_idcg(gt_docs) assert idcg == 0 + + +def test_to_dict_default(): + evaluator = DocumentNDCGEvaluator() + data = evaluator.to_dict() + assert data == { + "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator", + "init_parameters": {"document_comparison_field": "content"}, + } + + +def test_to_dict_custom_field(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + data = evaluator.to_dict() + assert data == { + "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator", + "init_parameters": {"document_comparison_field": "id"}, + } + + +def test_from_dict(): + data = { + "type": "haystack.components.evaluators.document_ndcg.DocumentNDCGEvaluator", + "init_parameters": {"document_comparison_field": "id"}, + } + evaluator = default_from_dict(DocumentNDCGEvaluator, data) + assert evaluator.document_comparison_field == "id" + + +def test_run_with_id_comparison(): + # Documents with same content but different IDs — id comparison + # must match on id, not content + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + result = evaluator.run( + ground_truth_documents=[[Document(id="doc1", content="France"), Document(id="doc2", content="Paris")]], + retrieved_documents=[ + [ + Document(id="doc1", content="different text"), + Document(id="doc3", content="Germany"), + Document(id="doc2", content="also different"), + ] + ], + ) + assert result["individual_scores"][0] == 
pytest.approx(0.9197, abs=1e-4) + assert result["score"] == pytest.approx(0.9197, abs=1e-4) + + +def test_run_with_id_comparison_no_match(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + result = evaluator.run( + ground_truth_documents=[[Document(id="doc1", content="France")]], + retrieved_documents=[[Document(id="doc99", content="France")]], + ) + # Same content, different ID — should NOT match when comparing by id + assert result["individual_scores"] == [0.0] + assert result["score"] == 0.0 + + +def test_run_with_meta_comparison(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id") + result = evaluator.run( + ground_truth_documents=[ + [Document(content="France", meta={"file_id": "f1"}), Document(content="Paris", meta={"file_id": "f2"})] + ], + retrieved_documents=[ + [ + Document(content="different", meta={"file_id": "f1"}), + Document(content="irrelevant", meta={"file_id": "f99"}), + Document(content="also different", meta={"file_id": "f2"}), + ] + ], + ) + assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4) + assert result["score"] == pytest.approx(0.9197, abs=1e-4) + + +def test_run_with_nested_meta_comparison(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.source.url") + result = evaluator.run( + ground_truth_documents=[[Document(content="x", meta={"source": {"url": "https://a.com"}})]], + retrieved_documents=[[Document(content="z", meta={"source": {"url": "https://a.com"}})]], + ) + assert result["individual_scores"] == [1.0] + assert result["score"] == 1.0 + + +def test_run_with_meta_missing_key_treated_as_no_match(): + # Documents missing the meta key should not match anything + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id") + result = evaluator.run( + ground_truth_documents=[[Document(content="France", meta={"file_id": "f1"})]], + retrieved_documents=[[Document(content="France", meta={})]], + ) + assert result["individual_scores"] 
== [0.0] + assert result["score"] == 0.0 + + +def test_run_with_id_comparison_with_scores(): + # Verify that relevance scores are honoured when comparing by id + evaluator = DocumentNDCGEvaluator(document_comparison_field="id") + result = evaluator.run( + ground_truth_documents=[ + [ + Document(id="doc1", content="foo", score=3), + Document(id="doc2", content="bar", score=2), + Document(id="doc3", content="baz", score=3), + Document(id="doc6", content="qux", score=2), + Document(id="doc7", content="quux", score=3), + Document(id="doc8", content="corge", score=2), + ] + ], + retrieved_documents=[ + [ + Document(id="doc1", content="x"), + Document(id="doc2", content="y"), + Document(id="doc3", content="z"), + Document(id="doc4", content="w"), + Document(id="doc5", content="v"), + ] + ], + ) + assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4) + assert result["score"] == pytest.approx(0.6592, abs=1e-4) + + +def test_unsupported_comparison_field_raises(): + evaluator = DocumentNDCGEvaluator(document_comparison_field="embedding") + with pytest.raises(ValueError, match="Unsupported document_comparison_field"): + evaluator.run( + ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[Document(content="France")]] + ) + + +def test_run_with_meta_missing_key_can_still_reach_perfect_ndcg(): + """ + Regression test for the IDCG/DCG inflation bug: ground truth documents that + cannot be matched (missing the configured meta key) must be excluded from + IDCG too, otherwise NDCG can never reach 1.0 even for a perfect retrieval. 
+ """ + evaluator = DocumentNDCGEvaluator(document_comparison_field="meta.file_id") + result = evaluator.run( + ground_truth_documents=[ + [ + Document(content="France", meta={"file_id": "f1"}), + Document(content="unmatchable", meta={}), # no file_id -> cannot be matched + ] + ], + retrieved_documents=[[Document(content="France", meta={"file_id": "f1"})]], + ) + # Perfect retrieval of the one matchable document should yield NDCG of exactly 1.0 + assert result["individual_scores"] == [1.0] + assert result["score"] == 1.0