fix(resolver): make resolver cache thread-safe with per-identifier locking

LalatenduMohanty · claude · LalatenduMohanty · commit 03b4d1782a2e · 2026-04-06T08:52:05.000-04:00
Add per-identifier locks to _find_cached_candidates() and return defensive copies from _get_cached_candidates() to prevent concurrent threads from corrupting cached candidate lists during parallel builds. A single global lock would serialize all resolution work, so a per-identifier scheme is used instead — threads resolving different packages proceed concurrently while threads resolving the same package wait for the first to populate the cache. Closes: #1024 Co-Authored-By: Claude <claude@anthropic.com> Signed-off-by: Lalatendu Mohanty <lmohanty@redhat.com>
diff --git a/src/fromager/resolver.py b/src/fromager/resolver.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import re
+import threading
 import typing
 from collections.abc import Iterable
 from operator import attrgetter
@@ -423,6 +424,8 @@ def get_project_from_pypi(
 
 class BaseProvider(ExtrasProvider):
     resolver_cache: typing.ClassVar[ResolverCache] = {}
+    _cache_locks: typing.ClassVar[dict[str, threading.Lock]] = {}
+    _meta_lock: typing.ClassVar[threading.Lock] = threading.Lock()
     provider_description: typing.ClassVar[str]
 
     def __init__(
@@ -552,22 +555,62 @@ def get_dependencies(self, candidate: Candidate) -> list[Requirement]:
         # return candidate.dependencies
         return []
 
+    def _get_identifier_lock(self, identifier: str) -> threading.Lock:
+        """Get or create a per-identifier lock for thread-safe cache access.
+
+        Uses a short-lived meta-lock to protect the lock dict itself.
+        The per-identifier lock ensures threads resolving different packages
+        proceed concurrently, while threads resolving the same package
+        wait for the first to populate the cache.
+        """
+        with self._meta_lock:
+            if identifier not in self._cache_locks:
+                self._cache_locks[identifier] = threading.Lock()
+            return self._cache_locks[identifier]
+
     def _get_cached_candidates(self, identifier: str) -> list[Candidate]:
-        """Get list of cached candidates for identifier and provider
+        """Get a copy of cached candidates for identifier and provider.
 
         The method always returns a list. If the cache did not have an entry
-        before, a new empty list is stored in the cache and returned to the
-        caller. The caller can mutate the list in place to update the cache.
+        before, a new empty list is stored in the cache. A copy is returned
+        so callers cannot accidentally corrupt the cache.
+
+        Must be called under the per-identifier lock from _get_identifier_lock.
         """
         cls = type(self)
         provider_cache = cls.resolver_cache.setdefault(identifier, {})
         candidate_cache = provider_cache.setdefault((cls, self.cache_key), [])
-        return candidate_cache
+        return list(candidate_cache)
+
+    def _set_cached_candidates(
+        self, identifier: str, candidates: list[Candidate]
+    ) -> None:
+        """Store candidates in the cache for identifier and provider.
+
+        Must be called under the per-identifier lock from _get_identifier_lock.
+        """
+        cls = type(self)
+        provider_cache = cls.resolver_cache.setdefault(identifier, {})
+        provider_cache[(cls, self.cache_key)] = list(candidates)
 
     def _find_cached_candidates(self, identifier: str) -> Candidates:
-        """Find candidates with caching"""
-        cached_candidates: list[Candidate] = []
-        if self.use_cache_candidates:
+        """Find candidates with caching.
+
+        Uses a per-identifier lock so threads resolving different packages
+        proceed concurrently, while threads resolving the same package
+        wait for the first to populate the cache.
+        """
+        if not self.use_cache_candidates:
+            candidates = list(self.find_candidates(identifier))
+            logger.debug(
+                "%s: got %i unfiltered candidates, ignoring cache",
+                identifier,
+                len(candidates),
+            )
+            return candidates
+
+        lock = self._get_identifier_lock(identifier)
+        with lock:
             cached_candidates = self._get_cached_candidates(identifier)
             if cached_candidates:
                 logger.debug(
@@ -576,22 +619,15 @@ def _find_cached_candidates(self, identifier: str) -> Candidates:
                     len(cached_candidates),
                 )
                 return cached_candidates
-        candidates = list(self.find_candidates(identifier))
-        if self.use_cache_candidates:
-            # mutate list object in-place
-            cached_candidates[:] = candidates
+
+            candidates = list(self.find_candidates(identifier))
+            self._set_cached_candidates(identifier, candidates)
             logger.debug(
                 "%s: cache %i unfiltered candidates",
                 identifier,
                 len(candidates),
             )
-        else:
-            logger.debug(
-                "%s: got %i unfiltered candidates, ignoring cache",
-                identifier,
-                len(candidates),
-            )
-        return candidates
+            return candidates
 
     def _get_no_match_error_message(
         self, identifier: str, requirements: RequirementsMap
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
@@ -1,5 +1,7 @@
 import datetime
 import re
+import threading
+import time
 import typing
 
 import pytest
@@ -11,6 +13,7 @@
 
 from fromager import constraints, resolver
 from fromager.__main__ import main as fromager
+from fromager.candidate import Candidate
 
 _hydra_core_simple_response = """
 <!DOCTYPE html>
@@ -153,10 +156,8 @@ def test_provider_cache_key_pypi(pypi_hydra_resolver: typing.Any) -> None:
     resolver_cache = resolver.BaseProvider.resolver_cache
     assert req.name in resolver_cache
     assert (resolver.PyPIProvider, provider.cache_key) in resolver_cache[req.name]
-    # mutated in place
-    assert provider._get_cached_candidates(req.name) is req_cache
+    # _get_cached_candidates returns a defensive copy, not the same object
     assert len(provider._get_cached_candidates(req.name)) == 7
-    assert len(req_cache) == 7
 
 
 def test_provider_cache_key_gitlab(gitlab_decile_resolver: typing.Any) -> None:
@@ -1278,3 +1279,95 @@ def test_cli_package_resolver(
     assert "- PyPI versions: 1.2.2, 1.3.1+local, 1.3.2, 2.0.0a1" in result.stdout
     assert "- only wheels on PyPI: 1.3.1+local, 2.0.0a1" in result.stdout
     assert "- missing from Fromager: 1.3.1+local, 2.0.0a1" in result.stdout
+
+
+def _make_candidate(name: str, version: str) -> Candidate:
+    """Create a minimal Candidate for testing."""
+    return Candidate(
+        name=name, version=Version(version), url="https://example.com", is_sdist=False
+    )
+
+
+class _StubProvider(resolver.BaseProvider):
+    """Minimal BaseProvider subclass for cache tests."""
+
+    provider_description = "stub"
+
+    @property
+    def cache_key(self) -> str:
+        return "stub-key"
+
+    def find_candidates(self, identifier: str) -> list[Candidate]:
+        return []
+
+
+def test_get_cached_candidates_returns_defensive_copy() -> None:
+    """Mutating the list returned by _get_cached_candidates must not corrupt the cache."""
+    provider = _StubProvider()
+    identifier = "test-pkg"
+
+    # Seed the cache directly so the test doesn't depend on the aliasing bug
+    resolver.BaseProvider.resolver_cache[identifier] = {
+        (type(provider), provider.cache_key): [_make_candidate("test-pkg", "1.0.0")]
+    }
+
+    # Get candidates again and mutate the returned list
+    first = provider._get_cached_candidates(identifier)
+    first.append(_make_candidate("test-pkg", "2.0.0"))
+
+    # The cache should not reflect the caller's mutation
+    second = provider._get_cached_candidates(identifier)
+    assert len(second) == 1, (
+        "_get_cached_candidates should return a defensive copy, "
+        "not a direct reference to the internal cache"
+    )
+    assert second[0].version == Version("1.0.0")
+
+
+def test_find_cached_candidates_thread_safe() -> None:
+    """Concurrent threads must not bypass the cache and call find_candidates multiple times."""
+    call_count = 0
+    call_count_lock = threading.Lock()
+
+    class _SlowProvider(resolver.BaseProvider):
+        """Provider with a slow find_candidates to expose thread races."""
+
+        provider_description = "slow"
+
+        @property
+        def cache_key(self) -> str:
+            return "slow-key"
+
+        def find_candidates(self, identifier: str) -> list[Candidate]:
+            nonlocal call_count
+            with call_count_lock:
+                call_count += 1
+            time.sleep(0.2)
+            return [_make_candidate(identifier, "1.0.0")]
+
+    barrier = threading.Barrier(4)
+
+    def resolve_in_thread(provider: _SlowProvider, ident: str) -> None:
+        barrier.wait(timeout=5)
+        list(provider._find_cached_candidates(ident))
+
+    providers = [_SlowProvider() for _ in range(4)]
+    threads = [
+        threading.Thread(
+            target=resolve_in_thread,
+            args=(thread_provider, "shared-pkg"),
+            name=f"resolver-{i}",
+        )
+        for i, thread_provider in enumerate(providers)
+    ]
+
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join(timeout=10)
+
+    assert call_count == 1, (
+        f"find_candidates() was called {call_count} times; expected 1. "
+        "Without thread-safe caching, multiple threads bypass the cache "
+        "and redundantly call find_candidates()."
+    )