Merge pull request #125 from majiayu000/fix/mhc-empty-and-low-similarity-issues-7-8

woodthom2 · web-flow · commit 216f30b805d3 · 2026-01-15T09:59:52.000Z
fix: skip empty MHC items and add similarity threshold (closes #7, #8)
diff --git a/src/harmony/matching/default_matcher.py b/src/harmony/matching/default_matcher.py
@@ -80,7 +80,8 @@ def match_instruments(
         texts_cached_vectors: dict[str, List[float]] = {}, batch_size: int = 1000, max_batches: int = 2000,
         is_negate: bool = True,
         clustering_algorithm: str = "affinity_propagation",
-        num_clusters_for_kmeans: int = None
+        num_clusters_for_kmeans: int = None,
+        mhc_min_similarity: float = 0.0
 ) -> MatchResult:
     for instrument in instruments:
         for question in instrument.questions:
@@ -98,5 +99,6 @@ def match_instruments(
         texts_cached_vectors=texts_cached_vectors,
         is_negate=is_negate,
         clustering_algorithm=clustering_algorithm,
-        num_clusters_for_kmeans=num_clusters_for_kmeans
+        num_clusters_for_kmeans=num_clusters_for_kmeans,
+        mhc_min_similarity=mhc_min_similarity
     )
diff --git a/src/harmony/matching/matcher.py b/src/harmony/matching/matcher.py
@@ -609,7 +609,8 @@ def match_instruments_with_function(
         texts_cached_vectors: dict[str, List[float]] = {},
         is_negate: bool = True,
         clustering_algorithm: ClusteringAlgorithm = ClusteringAlgorithm.affinity_propagation,
-        num_clusters_for_kmeans: int = None
+        num_clusters_for_kmeans: int = None,
+        mhc_min_similarity: float = 0.0
 ) -> MatchResult:
 
     all_questions: List[Question] = []
@@ -675,17 +676,31 @@ def match_instruments_with_function(
     if vectors_pos.size > 0 and len(mhc_embeddings) > 0:
         similarities_mhc = cosine_similarity(vectors_pos, mhc_embeddings)
         ctrs = {}
-        top_mhc_match_ids = np.argmax(similarities_mhc, axis=1)
-        for idx, mhc_item_idx in enumerate(top_mhc_match_ids):
-            question_text = mhc_questions[mhc_item_idx].question_text
-            if not question_text or len(question_text.strip()) < 3:
+
+        # Build mask of valid MHC questions (non-empty text)
+        valid_mhc_mask = np.array([
+            bool(mhc_questions[i].question_text and len(mhc_questions[i].question_text.strip()) >= 3)
+            for i in range(len(mhc_questions))
+        ])
+
+        for idx in range(len(all_questions)):
+            # Get similarities for this question, masking out invalid MHC items
+            masked_similarities = np.where(valid_mhc_mask, similarities_mhc[idx], -np.inf)
+            mhc_item_idx = int(np.argmax(masked_similarities))
+            strength_of_match = similarities_mhc[idx, mhc_item_idx]
+
+            # Skip if no valid MHC items or similarity is below threshold
+            if masked_similarities[mhc_item_idx] == -np.inf:
                 continue
+            if strength_of_match < mhc_min_similarity:
+                continue
+
+            question_text = mhc_questions[mhc_item_idx].question_text
             if all_questions[idx].instrument_id not in ctrs:
                 ctrs[all_questions[idx].instrument_id] = Counter()
             for topic in mhc_all_metadatas[mhc_item_idx]["topics"]:
                 ctrs[all_questions[idx].instrument_id][topic] += 1
             all_questions[idx].nearest_match_from_mhc_auto = mhc_questions[mhc_item_idx].model_dump()
-            strength_of_match = similarities_mhc[idx, mhc_item_idx]
             all_questions[idx].topics_strengths = {topic: float(strength_of_match)}
 
         instrument_to_category = {}
diff --git a/tests/test_mhc_filtering.py b/tests/test_mhc_filtering.py
@@ -0,0 +1,221 @@
+'''
+MIT License
+
+Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
+Project: Harmony (https://harmonydata.ac.uk)
+Maintainer: Thomas Wood (https://fastdatascience.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+'''
+
+import sys
+import unittest
+
+sys.path.append("../src")
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+from harmony import match_instruments
+from harmony.schemas.requests.text import Instrument, Question
+
+
+model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
+
+def create_mhc_question_with_text(text):
+    """Create a Question object, bypassing validation for empty text (simulates MHC data)"""
+    q = Question.model_construct(question_text=text)
+    return q
+
+
+class TestMhcEmptyFiltering(unittest.TestCase):
+    """Tests for Issue #7: Remove empty items from MHC"""
+
+    def test_empty_mhc_questions_are_skipped(self):
+        """Verify that empty MHC questions are not matched"""
+        questions_en = [
+            Question(question_text="I feel anxious and worried about things"),
+            Question(question_text="I have trouble sleeping at night"),
+        ]
+        instrument_en = Instrument(questions=questions_en)
+
+        # MHC data with empty questions (using model_construct to bypass validation)
+        mhc_metadata = [
+            {'topics': ['anxiety']},
+            {'topics': ['empty_topic']},  # This has empty text
+            {'topics': ['sleep disorders']},
+        ]
+
+        mhc_questions = [
+            create_mhc_question_with_text("Do you feel nervous or anxious?"),
+            create_mhc_question_with_text(""),  # Empty question - should be skipped
+            create_mhc_question_with_text("Do you have difficulty sleeping?"),
+        ]
+
+        mhc_embeddings = model.encode(np.asarray([
+            "Do you feel nervous or anxious?",
+            "placeholder",  # Will be masked out
+            "Do you have difficulty sleeping?",
+        ]))
+
+        match_response = match_instruments(
+            [instrument_en],
+            mhc_questions=mhc_questions,
+            mhc_embeddings=mhc_embeddings,
+            mhc_all_metadatas=mhc_metadata,
+            mhc_min_similarity=0.3
+        )
+
+        self.assertEqual(2, len(match_response.questions))
+
+        # Verify no question matched to the empty MHC item
+        for q in match_response.questions:
+            if q.nearest_match_from_mhc_auto:
+                self.assertNotEqual("", q.nearest_match_from_mhc_auto.get("question_text", ""))
+
+    def test_whitespace_only_mhc_questions_are_skipped(self):
+        """Verify that whitespace-only MHC questions are not matched"""
+        questions_en = [Question(question_text="I feel depressed")]
+        instrument_en = Instrument(questions=questions_en)
+
+        mhc_metadata = [
+            {'topics': ['whitespace']},
+            {'topics': ['depression']},
+        ]
+
+        mhc_questions = [
+            create_mhc_question_with_text("   "),  # Whitespace only - should be skipped
+            create_mhc_question_with_text("Do you feel depressed or sad?"),
+        ]
+
+        mhc_embeddings = model.encode(np.asarray([
+            "placeholder",
+            "Do you feel depressed or sad?",
+        ]))
+
+        match_response = match_instruments(
+            [instrument_en],
+            mhc_questions=mhc_questions,
+            mhc_embeddings=mhc_embeddings,
+            mhc_all_metadatas=mhc_metadata,
+            mhc_min_similarity=0.3
+        )
+
+        # Should match to the valid depression question
+        if match_response.questions[0].nearest_match_from_mhc_auto:
+            matched_text = match_response.questions[0].nearest_match_from_mhc_auto.get("question_text", "")
+            self.assertIn("depressed", matched_text.lower())
+
+
+class TestMhcSimilarityThreshold(unittest.TestCase):
+    """Tests for Issue #8: Don't match to MHC items if similarity is too low"""
+
+    def test_low_similarity_no_match(self):
+        """Verify that questions with low similarity to MHC are not matched"""
+        # Unrelated question
+        questions = [Question(question_text="I lost my car keys")]
+        instrument = Instrument(questions=questions)
+
+        mhc_metadata = [
+            {'topics': ['eating disorders']},
+            {'topics': ['anxiety']},
+        ]
+
+        mhc_questions_as_text = [
+            "Do you worry about your weight?",
+            "Do you feel anxious?",
+        ]
+
+        mhc_embeddings = model.encode(np.asarray(mhc_questions_as_text))
+        mhc_questions = [Question(question_text=t) for t in mhc_questions_as_text]
+
+        # Use high threshold to ensure no match
+        match_response = match_instruments(
+            [instrument],
+            mhc_questions=mhc_questions,
+            mhc_embeddings=mhc_embeddings,
+            mhc_all_metadatas=mhc_metadata,
+            mhc_min_similarity=0.8
+        )
+
+        # Should not have MHC match due to low similarity
+        self.assertIsNone(match_response.questions[0].nearest_match_from_mhc_auto)
+
+    def test_high_similarity_match(self):
+        """Verify that questions with high similarity to MHC are matched"""
+        questions = [Question(question_text="I feel nervous and anxious")]
+        instrument = Instrument(questions=questions)
+
+        mhc_metadata = [
+            {'topics': ['anxiety']},
+        ]
+
+        mhc_questions_as_text = [
+            "Do you feel nervous or anxious?",
+        ]
+
+        mhc_embeddings = model.encode(np.asarray(mhc_questions_as_text))
+        mhc_questions = [Question(question_text=t) for t in mhc_questions_as_text]
+
+        # Use low threshold to allow match
+        match_response = match_instruments(
+            [instrument],
+            mhc_questions=mhc_questions,
+            mhc_embeddings=mhc_embeddings,
+            mhc_all_metadatas=mhc_metadata,
+            mhc_min_similarity=0.3
+        )
+
+        # Should have MHC match
+        self.assertIsNotNone(match_response.questions[0].nearest_match_from_mhc_auto)
+
+    def test_threshold_filters_unrelated(self):
+        """Verify that mhc_min_similarity threshold filters unrelated questions"""
+        # Completely unrelated question about cooking
+        questions = [Question(question_text="How do I make a chocolate cake?")]
+        instrument = Instrument(questions=questions)
+
+        mhc_metadata = [
+            {'topics': ['depression']},
+        ]
+
+        mhc_questions_as_text = [
+            "Have you felt hopeless about the future?",
+        ]
+
+        mhc_embeddings = model.encode(np.asarray(mhc_questions_as_text))
+        mhc_questions = [Question(question_text=t) for t in mhc_questions_as_text]
+
+        # Use explicit threshold of 0.5 to filter unrelated
+        match_response = match_instruments(
+            [instrument],
+            mhc_questions=mhc_questions,
+            mhc_embeddings=mhc_embeddings,
+            mhc_all_metadatas=mhc_metadata,
+            mhc_min_similarity=0.5
+        )
+
+        # Should not have MHC match - cooking and depression are unrelated
+        self.assertIsNone(match_response.questions[0].nearest_match_from_mhc_auto)
+
+
+if __name__ == '__main__':
+    unittest.main()