From 93675d476e63536e1be813e1d80e4cfca9c67d79 Mon Sep 17 00:00:00 2001
From: etonlels
Date: Wed, 1 Jul 2026 10:28:39 -0600
Subject: [PATCH] fix: make `Config.fingerprint` deterministic across processes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`Config.fingerprint` is `crc32(pickle.dumps(config.dict()))` and is used as
part of the on-disk model cache key. Config fields such as `linter.rules` are
`set`s, and set/frozenset iteration order is not stable across Python
processes. Pickling them directly therefore produces different bytes each
run, so the fingerprint — and every cache entry keyed by it — changes on
every invocation.

The practical effect is that the model definition cache never hits across
processes: every model is re-parsed on each load. On a 45-project monorepo
(~1340 models) this made a warm-cache load re-parse everything through the
fork pool; load time drops from ~74s to ~44s once the cache actually hits.

Fix by canonicalizing the config dict before hashing: recursively sort
set/frozenset members into lists so serialization is order-stable while
preserving contents. Lists/tuples/dicts are recursed into; other values are
unchanged. Set members that cannot be ordered against each other fall back
to ordering by type name and repr instead of raising, and namedtuples are
rebuilt field by field.

Co-Authored-By: OpenCode google-vertex/claude-opus-4-8@default
Signed-off-by: etonlels
---
Note: hunks were amended during review; the blob ids on the `index` lines
predate those amendments.

 sqlmesh/core/config/root.py | 35 ++++++++++++++++-
 tests/core/test_config.py   | 76 +++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/sqlmesh/core/config/root.py b/sqlmesh/core/config/root.py
index 211d271b01..b36b7dadc1 100644
--- a/sqlmesh/core/config/root.py
+++ b/sqlmesh/core/config/root.py
@@ -76,6 +76,35 @@ def validate_regex_key_dict(value: t.Dict[str | re.Pattern, t.Any]) -> t.Dict[re
     return compile_regex_mapping(value)
 
 
+def _canonicalize(obj: object) -> object:
+    """Recursively convert an object into a canonical, order-stable form for hashing.
+
+    ``set``/``frozenset`` iteration order is not stable across Python processes, so
+    pickling them directly yields non-deterministic bytes. That makes any hash derived
+    from the pickle (e.g. ``Config.fingerprint``) change run-to-run, which silently
+    invalidates on-disk caches keyed by the fingerprint. Sorting set members into a
+    list restores determinism while preserving contents.
+
+    Members that cannot be ordered against each other (e.g. ``{1, "a"}``) fall back to
+    ordering by type name and ``repr`` rather than raising. Dict keys are left as-is.
+    """
+    if isinstance(obj, (set, frozenset)):
+        members = [_canonicalize(member) for member in obj]
+        try:
+            return sorted(members)  # type: ignore[type-var]
+        except TypeError:
+            # Mixed or unorderable members must not make fingerprinting raise.
+            return sorted(members, key=lambda m: (type(m).__qualname__, repr(m)))
+    if isinstance(obj, dict):
+        return {k: _canonicalize(v) for k, v in obj.items()}
+    if isinstance(obj, tuple) and hasattr(obj, "_fields"):
+        # Namedtuples take their fields positionally, not as a single iterable.
+        return type(obj)(*map(_canonicalize, obj))
+    if isinstance(obj, (list, tuple)):
+        return type(obj)(map(_canonicalize, obj))
+    return obj
+
+
 if t.TYPE_CHECKING:
     from sqlmesh.core._typing import Self
 
@@ -364,4 +393,8 @@ def dialect(self) -> t.Optional[str]:
 
     @property
     def fingerprint(self) -> str:
-        return str(zlib.crc32(pickle.dumps(self.dict(exclude={"loader", "notification_targets"}))))
+        return str(
+            zlib.crc32(
+                pickle.dumps(_canonicalize(self.dict(exclude={"loader", "notification_targets"})))
+            )
+        )
diff --git a/tests/core/test_config.py b/tests/core/test_config.py
index 7af556d6a3..5285d5274c 100644
--- a/tests/core/test_config.py
+++ b/tests/core/test_config.py
@@ -1572,3 +1572,76 @@ def test_load_configs_in_dbt_project_without_config_py(tmp_path: Path):
     # model_defaults
     assert config.model_defaults.dialect == "duckdb"  # from dbt profiles.yml
     assert config.model_defaults.start == "2020-01-01"  # from sqlmesh.yaml
+
+
+def test_canonicalize_sorts_sets() -> None:
+    from sqlmesh.core.config.root import _canonicalize
+
+    assert _canonicalize({3, 1, 2}) == [1, 2, 3]
+    assert _canonicalize(frozenset(["b", "a", "c"])) == ["a", "b", "c"]
+
+
+def test_canonicalize_recurses_into_containers() -> None:
+    from sqlmesh.core.config.root import _canonicalize
+
+    assert _canonicalize({"rules": {"z", "a"}, "nested": [{3, 1}, ("x", {"q", "b"})]}) == {
+        "rules": ["a", "z"],
+        "nested": [[1, 3], ("x", ["b", "q"])],
+    }
+
+
+def test_canonicalize_preserves_non_set_values_and_types() -> None:
+    from sqlmesh.core.config.root import _canonicalize
+
+    assert _canonicalize({"a": 1, "b": [1, 2, 3]}) == {"a": 1, "b": [1, 2, 3]}
+    # list/tuple types are preserved (not coerced into each other)
+    assert isinstance(_canonicalize((1, 2)), tuple)
+    assert isinstance(_canonicalize([1, 2]), list)
+
+
+def test_canonicalize_handles_unorderable_members_and_namedtuples() -> None:
+    from collections import namedtuple
+
+    from sqlmesh.core.config.root import _canonicalize
+
+    # mixed-type members cannot be compared with `<`; fall back instead of raising
+    assert _canonicalize({1, "a"}) == [1, "a"]
+
+    Pair = namedtuple("Pair", ["left", "right"])
+    assert _canonicalize(Pair({2, 1}, "x")) == Pair([1, 2], "x")
+
+
+def test_config_fingerprint_is_deterministic_across_processes() -> None:
+    """Config.fingerprint keys the on-disk model cache and must be stable across runs.
+
+    Set/frozenset iteration order depends on PYTHONHASHSEED, so a config containing a
+    set-valued field (e.g. linter.rules) would otherwise hash differently in each
+    process, silently invalidating the cache. Run the same config in two subprocesses
+    with different hash seeds and assert the fingerprint matches.
+    """
+    import os
+    import subprocess
+    import sys
+
+    program = (
+        "from sqlmesh.core.config import Config, ModelDefaultsConfig\n"
+        "from sqlmesh.core.config.linter import LinterConfig\n"
+        "config = Config(\n"
+        " model_defaults=ModelDefaultsConfig(dialect='duckdb'),\n"
+        " linter=LinterConfig(\n"
+        " enabled=True,\n"
+        " rules={'ruleA', 'ruleB', 'ruleC', 'ruleD', 'ruleE', 'ruleF'},\n"
+        " ),\n"
+        ")\n"
+        "print(config.fingerprint)\n"
+    )
+
+    def _fingerprint(hashseed: str) -> str:
+        env = {**os.environ, "PYTHONHASHSEED": hashseed}
+        result = subprocess.run(
+            [sys.executable, "-c", program],
+            capture_output=True,
+            text=True,
+            env=env,
+            check=True,
+        )
+        return result.stdout.strip()
+
+    assert _fingerprint("0") == _fingerprint("12345")