From 67541ae3886b035ceeba82616767030da1965080 Mon Sep 17 00:00:00 2001
From: Lalatendu Mohanty <lmohanty@redhat.com>
Date: Mon, 6 Apr 2026 00:40:13 -0400
Subject: [PATCH 1/2] fix(resolver): make resolver cache thread-safe with
 per-identifier locking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add per-identifier locks to _find_cached_candidates() and return
defensive copies from _get_cached_candidates() to prevent concurrent
threads from corrupting cached candidate lists during parallel builds.

A single global lock would serialize all resolution work, so a
per-identifier scheme is used instead — threads resolving different
packages proceed concurrently while threads resolving the same
package wait for the first to populate the cache.

Closes: #1024
Co-Authored-By: Claude <claude@anthropic.com>
Signed-off-by: Lalatendu Mohanty <lmohanty@redhat.com>
---
 src/fromager/resolver.py | 84 ++++++++++++++++++++++++++--------
 tests/test_resolver.py   | 99 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 162 insertions(+), 21 deletions(-)

diff --git a/src/fromager/resolver.py b/src/fromager/resolver.py
index 2ee09115..43366c1c 100644
--- a/src/fromager/resolver.py
+++ b/src/fromager/resolver.py
@@ -10,6 +10,7 @@
 import logging
 import os
 import re
+import threading
 import typing
 from collections.abc import Iterable
 from operator import attrgetter
@@ -422,7 +423,21 @@ def get_project_from_pypi(
 
 
 class BaseProvider(ExtrasProvider):
+    """Base class for Fromager's dependency resolver (resolvelib + extras).
+
+    Subclasses implement ``find_candidates``, ``cache_key``, and
+    ``provider_description`` to list versions from PyPI, a version map, etc.
+
+    Candidate lists are cached per package in one global dict, with a lock per
+    package so parallel work on different packages does not clash.
+
+    ``find_matches`` keeps only versions that fit the requirements and
+    constraints, then picks newest first.
+    """
+
     resolver_cache: typing.ClassVar[ResolverCache] = {}
+    _cache_locks: typing.ClassVar[dict[str, threading.Lock]] = {}
+    _meta_lock: typing.ClassVar[threading.Lock] = threading.Lock()
     provider_description: typing.ClassVar[str]
 
     def __init__(
@@ -552,22 +567,62 @@ def get_dependencies(self, candidate: Candidate) -> list[Requirement]:
         # return candidate.dependencies
         return []
 
+    def _get_identifier_lock(self, identifier: str) -> threading.Lock:
+        """Get or create a per-identifier lock for thread-safe cache access.
+
+        Uses a short-lived meta-lock to protect the lock dict itself.
+        The per-identifier lock ensures threads resolving different packages
+        proceed concurrently, while threads resolving the same package
+        wait for the first to populate the cache.
+        """
+        with self._meta_lock:
+            if identifier not in self._cache_locks:
+                self._cache_locks[identifier] = threading.Lock()
+            return self._cache_locks[identifier]
+
     def _get_cached_candidates(self, identifier: str) -> list[Candidate]:
-        """Get list of cached candidates for identifier and provider
+        """Get a copy of cached candidates for identifier and provider.
 
         The method always returns a list. If the cache did not have an entry
-        before, a new empty list is stored in the cache and returned to the
-        caller. The caller can mutate the list in place to update the cache.
+        before, a new empty list is stored in the cache. A copy is returned
+        so callers cannot accidentally corrupt the cache.
+
+        Must be called under the per-identifier lock from _get_identifier_lock.
         """
         cls = type(self)
         provider_cache = cls.resolver_cache.setdefault(identifier, {})
         candidate_cache = provider_cache.setdefault((cls, self.cache_key), [])
-        return candidate_cache
+        return list(candidate_cache)
+
+    def _set_cached_candidates(
+        self, identifier: str, candidates: list[Candidate]
+    ) -> None:
+        """Store candidates in the cache for identifier and provider.
+
+        Must be called under the per-identifier lock from _get_identifier_lock.
+        """
+        cls = type(self)
+        provider_cache = cls.resolver_cache.setdefault(identifier, {})
+        provider_cache[(cls, self.cache_key)] = list(candidates)
 
     def _find_cached_candidates(self, identifier: str) -> Candidates:
-        """Find candidates with caching"""
-        cached_candidates: list[Candidate] = []
-        if self.use_cache_candidates:
+        """Find candidates with caching.
+
+        Uses a per-identifier lock so threads resolving different packages
+        proceed concurrently, while threads resolving the same package
+        wait for the first to populate the cache.
+        """
+        if not self.use_cache_candidates:
+            candidates = list(self.find_candidates(identifier))
+            logger.debug(
+                "%s: got %i unfiltered candidates, ignoring cache",
+                identifier,
+                len(candidates),
+            )
+            return candidates
+
+        lock = self._get_identifier_lock(identifier)
+        with lock:
             cached_candidates = self._get_cached_candidates(identifier)
             if cached_candidates:
                 logger.debug(
@@ -576,22 +631,15 @@ def _find_cached_candidates(self, identifier: str) -> Candidates:
                     len(cached_candidates),
                 )
                 return cached_candidates
-        candidates = list(self.find_candidates(identifier))
-        if self.use_cache_candidates:
-            # mutate list object in-place
-            cached_candidates[:] = candidates
+
+            candidates = list(self.find_candidates(identifier))
+            self._set_cached_candidates(identifier, candidates)
             logger.debug(
                 "%s: cache %i unfiltered candidates",
                 identifier,
                 len(candidates),
             )
-        else:
-            logger.debug(
-                "%s: got %i unfiltered candidates, ignoring cache",
-                identifier,
-                len(candidates),
-            )
-        return candidates
+            return candidates
 
     def _get_no_match_error_message(
         self, identifier: str, requirements: RequirementsMap
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
index 690a1288..a548a2cc 100644
--- a/tests/test_resolver.py
+++ b/tests/test_resolver.py
@@ -1,5 +1,7 @@
 import datetime
 import re
+import threading
+import time
 import typing
 
 import pytest
@@ -11,6 +13,7 @@
 
 from fromager import constraints, resolver
 from fromager.__main__ import main as fromager
+from fromager.candidate import Candidate
 
 _hydra_core_simple_response = """
 <!DOCTYPE html>
@@ -153,10 +156,8 @@ def test_provider_cache_key_pypi(pypi_hydra_resolver: typing.Any) -> None:
     resolver_cache = resolver.BaseProvider.resolver_cache
     assert req.name in resolver_cache
     assert (resolver.PyPIProvider, provider.cache_key) in resolver_cache[req.name]
-    # mutated in place
-    assert provider._get_cached_candidates(req.name) is req_cache
+    # _get_cached_candidates returns a defensive copy, not the same object
     assert len(provider._get_cached_candidates(req.name)) == 7
-    assert len(req_cache) == 7
 
 
 def test_provider_cache_key_gitlab(gitlab_decile_resolver: typing.Any) -> None:
@@ -1278,3 +1279,95 @@ def test_cli_package_resolver(
     assert "- PyPI versions: 1.2.2, 1.3.1+local, 1.3.2, 2.0.0a1" in result.stdout
     assert "- only wheels on PyPI: 1.3.1+local, 2.0.0a1" in result.stdout
     assert "- missing from Fromager: 1.3.1+local, 2.0.0a1" in result.stdout
+
+
+def _make_candidate(name: str, version: str) -> Candidate:
+    """Create a minimal Candidate for testing."""
+    return Candidate(
+        name=name, version=Version(version), url="https://example.com", is_sdist=False
+    )
+
+
+class _StubProvider(resolver.BaseProvider):
+    """Minimal BaseProvider subclass for cache tests."""
+
+    provider_description = "stub"
+
+    @property
+    def cache_key(self) -> str:
+        return "stub-key"
+
+    def find_candidates(self, identifier: str) -> list[Candidate]:
+        return []
+
+
+def test_get_cached_candidates_returns_defensive_copy() -> None:
+    """Mutating the list returned by _get_cached_candidates must not corrupt the cache."""
+    provider = _StubProvider()
+    identifier = "test-pkg"
+
+    # Seed the cache directly so the test doesn't depend on the aliasing bug
+    resolver.BaseProvider.resolver_cache[identifier] = {
+        (type(provider), provider.cache_key): [_make_candidate("test-pkg", "1.0.0")]
+    }
+
+    # Get candidates again and mutate the returned list
+    first = provider._get_cached_candidates(identifier)
+    first.append(_make_candidate("test-pkg", "2.0.0"))
+
+    # The cache should not reflect the caller's mutation
+    second = provider._get_cached_candidates(identifier)
+    assert len(second) == 1, (
+        "_get_cached_candidates should return a defensive copy, "
+        "not a direct reference to the internal cache"
+    )
+    assert second[0].version == Version("1.0.0")
+
+
+def test_find_cached_candidates_thread_safe() -> None:
+    """Concurrent threads must not bypass the cache and call find_candidates multiple times."""
+    call_count = 0
+    call_count_lock = threading.Lock()
+
+    class _SlowProvider(resolver.BaseProvider):
+        """Provider with a slow find_candidates to expose thread races."""
+
+        provider_description = "slow"
+
+        @property
+        def cache_key(self) -> str:
+            return "slow-key"
+
+        def find_candidates(self, identifier: str) -> list[Candidate]:
+            nonlocal call_count
+            with call_count_lock:
+                call_count += 1
+            time.sleep(0.2)
+            return [_make_candidate(identifier, "1.0.0")]
+
+    barrier = threading.Barrier(4)
+
+    def resolve_in_thread(provider: _SlowProvider, ident: str) -> None:
+        barrier.wait(timeout=5)
+        list(provider._find_cached_candidates(ident))
+
+    providers = [_SlowProvider() for _ in range(4)]
+    threads = [
+        threading.Thread(
+            target=resolve_in_thread,
+            args=(thread_provider, "shared-pkg"),
+            name=f"resolver-{i}",
+        )
+        for i, thread_provider in enumerate(providers)
+    ]
+
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join(timeout=10)
+
+    assert call_count == 1, (
+        f"find_candidates() was called {call_count} times; expected 1. "
+        "Without thread-safe caching, multiple threads bypass the cache "
+        "and redundantly call find_candidates()."
+    )

From 8218e5be45e427ea8bb07266c1fec7a6227855e0 Mon Sep 17 00:00:00 2001
From: Lalatendu Mohanty <lmohanty@redhat.com>
Date: Mon, 6 Apr 2026 10:34:49 -0400
Subject: [PATCH 2/2] fix(resolver): treat empty candidate list as valid cache
 entry

The old truthiness check `if cached_candidates:` treated an empty
list as a cache miss, silently discarding valid results. In practice
this has no effect because find_matches raises ResolverException on
empty candidates, terminating the resolution before a second lookup
can occur. But the cache should not discard data it already computed.

Use None as the "not yet cached" sentinel instead.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Lalatendu Mohanty <lmohanty@redhat.com>
---
 src/fromager/resolver.py | 14 ++++++++------
 tests/test_resolver.py   |  4 +++-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/fromager/resolver.py b/src/fromager/resolver.py
index 43366c1c..c55f7d33 100644
--- a/src/fromager/resolver.py
+++ b/src/fromager/resolver.py
@@ -580,18 +580,20 @@ def _get_identifier_lock(self, identifier: str) -> threading.Lock:
                 self._cache_locks[identifier] = threading.Lock()
             return self._cache_locks[identifier]
 
-    def _get_cached_candidates(self, identifier: str) -> list[Candidate]:
+    def _get_cached_candidates(self, identifier: str) -> list[Candidate] | None:
         """Get a copy of cached candidates for identifier and provider.
 
-        The method always returns a list. If the cache did not have an entry
-        before, a new empty list is stored in the cache. A copy is returned
-        so callers cannot accidentally corrupt the cache.
+        Returns None if no entry exists in the cache, or a copy of the cached
+        list (which may be empty). A copy is returned so callers cannot
+        accidentally corrupt the cache.
 
         Must be called under the per-identifier lock from _get_identifier_lock.
         """
         cls = type(self)
         provider_cache = cls.resolver_cache.setdefault(identifier, {})
-        candidate_cache = provider_cache.setdefault((cls, self.cache_key), [])
+        candidate_cache = provider_cache.get((cls, self.cache_key))
+        if candidate_cache is None:
+            return None
         return list(candidate_cache)
 
     def _set_cached_candidates(
@@ -624,7 +626,7 @@ def _find_cached_candidates(self, identifier: str) -> Candidates:
         lock = self._get_identifier_lock(identifier)
         with lock:
             cached_candidates = self._get_cached_candidates(identifier)
-            if cached_candidates:
+            if cached_candidates is not None:
                 logger.debug(
                     "%s: use %i cached candidates",
                     identifier,
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
index a548a2cc..9768319d 100644
--- a/tests/test_resolver.py
+++ b/tests/test_resolver.py
@@ -147,7 +147,7 @@ def test_provider_cache_key_pypi(pypi_hydra_resolver: typing.Any) -> None:
     provider = pypi_hydra_resolver.provider
     assert provider.cache_key == "https://pypi.org/simple/"
     req_cache = provider._get_cached_candidates(req.name)
-    assert req_cache == []
+    assert req_cache is None
 
     result = pypi_hydra_resolver.resolve([req])
     candidate = result.mapping[req.name]
@@ -1313,10 +1313,12 @@ def test_get_cached_candidates_returns_defensive_copy() -> None:
 
     # Get candidates again and mutate the returned list
     first = provider._get_cached_candidates(identifier)
+    assert first is not None
     first.append(_make_candidate("test-pkg", "2.0.0"))
 
     # The cache should not reflect the caller's mutation
     second = provider._get_cached_candidates(identifier)
+    assert second is not None
     assert len(second) == 1, (
         "_get_cached_candidates should return a defensive copy, "
         "not a direct reference to the internal cache"