From 67541ae3886b035ceeba82616767030da1965080 Mon Sep 17 00:00:00 2001 From: Lalatendu Mohanty Date: Mon, 6 Apr 2026 00:40:13 -0400 Subject: [PATCH 1/2] fix(resolver): make resolver cache thread-safe with per-identifier locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add per-identifier locks to _find_cached_candidates() and return defensive copies from _get_cached_candidates() to prevent concurrent threads from corrupting cached candidate lists during parallel builds. A single global lock would serialize all resolution work, so a per-identifier scheme is used instead — threads resolving different packages proceed concurrently while threads resolving the same package wait for the first to populate the cache. Closes: #1024 Co-Authored-By: Claude Signed-off-by: Lalatendu Mohanty --- src/fromager/resolver.py | 84 ++++++++++++++++++++++++++-------- tests/test_resolver.py | 99 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 21 deletions(-) diff --git a/src/fromager/resolver.py b/src/fromager/resolver.py index 2ee09115..43366c1c 100644 --- a/src/fromager/resolver.py +++ b/src/fromager/resolver.py @@ -10,6 +10,7 @@ import logging import os import re +import threading import typing from collections.abc import Iterable from operator import attrgetter @@ -422,7 +423,21 @@ def get_project_from_pypi( class BaseProvider(ExtrasProvider): + """Base class for Fromager's dependency resolver (resolvelib + extras). + + Subclasses implement ``find_candidates``, ``cache_key``, and + ``provider_description`` to list versions from PyPI, a version map, etc. + + Candidate lists are cached per package in one global dict, with a lock per + package so parallel work on different packages does not clash. + + ``find_matches`` keeps only versions that fit the requirements and + constraints, then picks newest first. + """ + resolver_cache: typing.ClassVar[ResolverCache] = {} + _cache_locks: typing.ClassVar[dict[str, threading.Lock]] = {} + _meta_lock: typing.ClassVar[threading.Lock] = threading.Lock() provider_description: typing.ClassVar[str] def __init__( @@ -552,22 +567,62 @@ def get_dependencies(self, candidate: Candidate) -> list[Requirement]: # return candidate.dependencies return [] + def _get_identifier_lock(self, identifier: str) -> threading.Lock: + """Get or create a per-identifier lock for thread-safe cache access. + + Uses a short-lived meta-lock to protect the lock dict itself. + The per-identifier lock ensures threads resolving different packages + proceed concurrently, while threads resolving the same package + wait for the first to populate the cache. + """ + with self._meta_lock: + if identifier not in self._cache_locks: + self._cache_locks[identifier] = threading.Lock() + return self._cache_locks[identifier] + def _get_cached_candidates(self, identifier: str) -> list[Candidate]: - """Get list of cached candidates for identifier and provider + """Get a copy of cached candidates for identifier and provider. The method always returns a list. If the cache did not have an entry - before, a new empty list is stored in the cache and returned to the - caller. The caller can mutate the list in place to update the cache. + before, a new empty list is stored in the cache. A copy is returned + so callers cannot accidentally corrupt the cache. + + Must be called under the per-identifier lock from _get_identifier_lock. """ cls = type(self) provider_cache = cls.resolver_cache.setdefault(identifier, {}) candidate_cache = provider_cache.setdefault((cls, self.cache_key), []) - return candidate_cache + return list(candidate_cache) + + def _set_cached_candidates( + self, identifier: str, candidates: list[Candidate] + ) -> None: + """Store candidates in the cache for identifier and provider. + + Must be called under the per-identifier lock from _get_identifier_lock. + """ + cls = type(self) + provider_cache = cls.resolver_cache.setdefault(identifier, {}) + provider_cache[(cls, self.cache_key)] = list(candidates) def _find_cached_candidates(self, identifier: str) -> Candidates: - """Find candidates with caching""" - cached_candidates: list[Candidate] = [] - if self.use_cache_candidates: + """Find candidates with caching. + + Uses a per-identifier lock so threads resolving different packages + proceed concurrently, while threads resolving the same package + wait for the first to populate the cache. + """ + if not self.use_cache_candidates: + candidates = list(self.find_candidates(identifier)) + logger.debug( + "%s: got %i unfiltered candidates, ignoring cache", + identifier, + len(candidates), + ) + return candidates + + lock = self._get_identifier_lock(identifier) + with lock: cached_candidates = self._get_cached_candidates(identifier) if cached_candidates: logger.debug( @@ -576,22 +631,15 @@ def _find_cached_candidates(self, identifier: str) -> Candidates: len(cached_candidates), ) return cached_candidates - candidates = list(self.find_candidates(identifier)) - if self.use_cache_candidates: - # mutate list object in-place - cached_candidates[:] = candidates + + candidates = list(self.find_candidates(identifier)) + self._set_cached_candidates(identifier, candidates) logger.debug( "%s: cache %i unfiltered candidates", identifier, len(candidates), ) - else: - logger.debug( - "%s: got %i unfiltered candidates, ignoring cache", - identifier, - len(candidates), - ) - return candidates + return candidates def _get_no_match_error_message( self, identifier: str, requirements: RequirementsMap diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 690a1288..a548a2cc 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -1,5 +1,7 @@ import datetime import re +import threading +import time import typing import pytest @@ -11,6 +13,7 @@ from fromager import constraints, resolver from fromager.__main__ import main as fromager +from fromager.candidate import Candidate _hydra_core_simple_response = """ @@ -153,10 +156,8 @@ def test_provider_cache_key_pypi(pypi_hydra_resolver: typing.Any) -> None: resolver_cache = resolver.BaseProvider.resolver_cache assert req.name in resolver_cache assert (resolver.PyPIProvider, provider.cache_key) in resolver_cache[req.name] - # mutated in place - assert provider._get_cached_candidates(req.name) is req_cache + # _get_cached_candidates returns a defensive copy, not the same object assert len(provider._get_cached_candidates(req.name)) == 7 - assert len(req_cache) == 7 def test_provider_cache_key_gitlab(gitlab_decile_resolver: typing.Any) -> None: @@ -1278,3 +1279,95 @@ def test_cli_package_resolver( assert "- PyPI versions: 1.2.2, 1.3.1+local, 1.3.2, 2.0.0a1" in result.stdout assert "- only wheels on PyPI: 1.3.1+local, 2.0.0a1" in result.stdout assert "- missing from Fromager: 1.3.1+local, 2.0.0a1" in result.stdout + + +def _make_candidate(name: str, version: str) -> Candidate: + """Create a minimal Candidate for testing.""" + return Candidate( + name=name, version=Version(version), url="https://example.com", is_sdist=False + ) + + +class _StubProvider(resolver.BaseProvider): + """Minimal BaseProvider subclass for cache tests.""" + + provider_description = "stub" + + @property + def cache_key(self) -> str: + return "stub-key" + + def find_candidates(self, identifier: str) -> list[Candidate]: + return [] + + +def test_get_cached_candidates_returns_defensive_copy() -> None: + """Mutating the list returned by _get_cached_candidates must not corrupt the cache.""" + provider = _StubProvider() + identifier = "test-pkg" + + # Seed the cache directly so the test doesn't depend on the aliasing bug + resolver.BaseProvider.resolver_cache[identifier] = { + (type(provider), provider.cache_key): [_make_candidate("test-pkg", "1.0.0")] + } + + # Get candidates again and mutate the returned list + first = provider._get_cached_candidates(identifier) + first.append(_make_candidate("test-pkg", "2.0.0")) + + # The cache should not reflect the caller's mutation + second = provider._get_cached_candidates(identifier) + assert len(second) == 1, ( + "_get_cached_candidates should return a defensive copy, " + "not a direct reference to the internal cache" + ) + assert second[0].version == Version("1.0.0") + + +def test_find_cached_candidates_thread_safe() -> None: + """Concurrent threads must not bypass the cache and call find_candidates multiple times.""" + call_count = 0 + call_count_lock = threading.Lock() + + class _SlowProvider(resolver.BaseProvider): + """Provider with a slow find_candidates to expose thread races.""" + + provider_description = "slow" + + @property + def cache_key(self) -> str: + return "slow-key" + + def find_candidates(self, identifier: str) -> list[Candidate]: + nonlocal call_count + with call_count_lock: + call_count += 1 + time.sleep(0.2) + return [_make_candidate(identifier, "1.0.0")] + + barrier = threading.Barrier(4) + + def resolve_in_thread(provider: _SlowProvider, ident: str) -> None: + barrier.wait(timeout=5) + list(provider._find_cached_candidates(ident)) + + providers = [_SlowProvider() for _ in range(4)] + threads = [ + threading.Thread( + target=resolve_in_thread, + args=(thread_provider, "shared-pkg"), + name=f"resolver-{i}", + ) + for i, thread_provider in enumerate(providers) + ] + + for t in threads: + t.start() + for t in threads: + t.join(timeout=10) + + assert call_count == 1, ( + f"find_candidates() was called {call_count} times; expected 1. " + "Without thread-safe caching, multiple threads bypass the cache " + "and redundantly call find_candidates()." + ) From 8218e5be45e427ea8bb07266c1fec7a6227855e0 Mon Sep 17 00:00:00 2001 From: Lalatendu Mohanty Date: Mon, 6 Apr 2026 10:34:49 -0400 Subject: [PATCH 2/2] fix(resolver): treat empty candidate list as valid cache entry The old truthiness check `if cached_candidates:` treated an empty list as a cache miss, silently discarding valid results. In practice this has no effect because find_matches raises ResolverException on empty candidates, terminating the resolution before a second lookup can occur. But the cache should not discard data it already computed. Use None as the "not yet cached" sentinel instead. Co-Authored-By: Claude Opus 4.6 Signed-off-by: Lalatendu Mohanty --- src/fromager/resolver.py | 14 ++++++++------ tests/test_resolver.py | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/fromager/resolver.py b/src/fromager/resolver.py index 43366c1c..c55f7d33 100644 --- a/src/fromager/resolver.py +++ b/src/fromager/resolver.py @@ -580,18 +580,20 @@ def _get_identifier_lock(self, identifier: str) -> threading.Lock: self._cache_locks[identifier] = threading.Lock() return self._cache_locks[identifier] - def _get_cached_candidates(self, identifier: str) -> list[Candidate]: + def _get_cached_candidates(self, identifier: str) -> list[Candidate] | None: """Get a copy of cached candidates for identifier and provider. - The method always returns a list. If the cache did not have an entry - before, a new empty list is stored in the cache. A copy is returned - so callers cannot accidentally corrupt the cache. + Returns None if no entry exists in the cache, or a copy of the cached + list (which may be empty). A copy is returned so callers cannot + accidentally corrupt the cache. Must be called under the per-identifier lock from _get_identifier_lock. """ cls = type(self) provider_cache = cls.resolver_cache.setdefault(identifier, {}) - candidate_cache = provider_cache.setdefault((cls, self.cache_key), []) + candidate_cache = provider_cache.get((cls, self.cache_key)) + if candidate_cache is None: + return None return list(candidate_cache) def _set_cached_candidates( @@ -624,7 +626,7 @@ def _find_cached_candidates(self, identifier: str) -> Candidates: lock = self._get_identifier_lock(identifier) with lock: cached_candidates = self._get_cached_candidates(identifier) - if cached_candidates: + if cached_candidates is not None: logger.debug( "%s: use %i cached candidates", identifier, diff --git a/tests/test_resolver.py b/tests/test_resolver.py index a548a2cc..9768319d 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -147,7 +147,7 @@ def test_provider_cache_key_pypi(pypi_hydra_resolver: typing.Any) -> None: provider = pypi_hydra_resolver.provider assert provider.cache_key == "https://pypi.org/simple/" req_cache = provider._get_cached_candidates(req.name) - assert req_cache == [] + assert req_cache is None result = pypi_hydra_resolver.resolve([req]) candidate = result.mapping[req.name] @@ -1313,10 +1313,12 @@ def test_get_cached_candidates_returns_defensive_copy() -> None: # Get candidates again and mutate the returned list first = provider._get_cached_candidates(identifier) + assert first is not None first.append(_make_candidate("test-pkg", "2.0.0")) # The cache should not reflect the caller's mutation second = provider._get_cached_candidates(identifier) + assert second is not None assert len(second) == 1, ( "_get_cached_candidates should return a defensive copy, " "not a direct reference to the internal cache"