Skip to content

Commit 007aeef

Browse files
etonlels and OpenCode google-vertex/claude-opus-4-8@default
committed
fix: make Config.fingerprint deterministic across processes
`Config.fingerprint` is `crc32(pickle.dumps(config.dict()))` and is used as part of the on-disk model cache key. Config fields such as `linter.rules` are `set`s, and set/frozenset iteration order is not stable across Python processes (string hashes are randomized per process unless `PYTHONHASHSEED` is fixed). Pickling them directly therefore produces different bytes each run, so the fingerprint — and every cache entry keyed by it — changes on every invocation.

The practical effect is that the model definition cache never hits across processes: every model is re-parsed on each load. On a 45-project monorepo (~1340 models) this made a warm-cache load re-parse everything through the fork pool; load time drops from ~74s to ~44s once the cache actually hits.

Fix by canonicalizing the config dict before hashing: recursively sort set/frozenset members into lists so serialization is order-stable while preserving contents. Lists/tuples/dicts are recursed into; other values are unchanged.

Co-Authored-By: OpenCode google-vertex/claude-opus-4-8@default <noreply@opencode.ai>
Signed-off-by: etonlels <etonlels@gmail.com>
1 parent 13787d7 commit 007aeef

2 files changed

Lines changed: 86 additions & 1 deletion

File tree

sqlmesh/core/config/root.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,24 @@ def validate_regex_key_dict(value: t.Dict[str | re.Pattern, t.Any]) -> t.Dict[re
7676
return compile_regex_mapping(value)
7777

7878

79+
def _canonicalize(obj: t.Any) -> t.Any:
    """Recursively convert an object into a canonical, order-stable form for hashing.

    ``set``/``frozenset`` iteration order is not stable across Python processes, so
    pickling them directly yields non-deterministic bytes. That makes any hash derived
    from the pickle (e.g. ``Config.fingerprint``) change run-to-run, which silently
    invalidates on-disk caches keyed by the fingerprint. Sorting set members into a
    list restores determinism while preserving contents.

    Args:
        obj: Any value. Sets, frozensets, dicts, lists and tuples are walked
            recursively; every other value is returned untouched.

    Returns:
        * ``set``/``frozenset`` -> a sorted ``list`` of the canonicalized members.
        * ``dict`` -> a plain ``dict`` with the same keys, in the same insertion
          order, and canonicalized values (keys themselves are not modified).
        * ``list``/``tuple`` -> the same type with canonicalized items; namedtuples
          are rebuilt field by field.
        * anything else -> ``obj`` itself.
    """
    if isinstance(obj, (set, frozenset)):
        members = [_canonicalize(member) for member in obj]
        try:
            return sorted(members)
        except TypeError:
            # Members that do not order against each other (mixed types such as
            # {1, "a"}, plain Enum members, ...) would otherwise crash the caller.
            # Fall back to a key built from the type's location and the repr; this is
            # only as stable as the member's repr, which holds for builtins and enums.
            return sorted(
                members,
                key=lambda member: (
                    type(member).__module__,
                    type(member).__qualname__,
                    repr(member),
                ),
            )
    if isinstance(obj, dict):
        return {key: _canonicalize(value) for key, value in obj.items()}
    if isinstance(obj, tuple):
        items = [_canonicalize(item) for item in obj]
        if hasattr(obj, "_fields"):
            # namedtuple constructors take one positional argument per field rather
            # than a single iterable, so the items must be unpacked.
            return type(obj)(*items)
        return type(obj)(items)
    if isinstance(obj, list):
        return type(obj)(map(_canonicalize, obj))
    return obj
95+
96+
7997
if t.TYPE_CHECKING:
8098
from sqlmesh.core._typing import Self
8199

@@ -364,4 +382,8 @@ def dialect(self) -> t.Optional[str]:
364382

365383
@property
def fingerprint(self) -> str:
    """Return the CRC32 of the pickled, canonicalized config as a decimal string.

    The ``loader`` and ``notification_targets`` fields are left out of the dict
    that gets hashed. The dict is passed through ``_canonicalize`` before
    pickling so that set-valued fields serialize in a stable order.
    """
    hashed_fields = self.dict(exclude={"loader", "notification_targets"})
    payload = pickle.dumps(_canonicalize(hashed_fields))
    return str(zlib.crc32(payload))

tests/core/test_config.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1572,3 +1572,66 @@ def test_load_configs_in_dbt_project_without_config_py(tmp_path: Path):
15721572
# model_defaults
15731573
assert config.model_defaults.dialect == "duckdb" # from dbt profiles.yml
15741574
assert config.model_defaults.start == "2020-01-01" # from sqlmesh.yaml
1575+
1576+
1577+
def test_canonicalize_sorts_sets() -> None:
    """A set or frozenset comes back as a list in ascending order."""
    from sqlmesh.core.config.root import _canonicalize

    from_set = _canonicalize({3, 1, 2})
    assert from_set == [1, 2, 3]

    from_frozenset = _canonicalize(frozenset(["b", "a", "c"]))
    assert from_frozenset == ["a", "b", "c"]
1582+
1583+
1584+
def test_canonicalize_recurses_into_containers() -> None:
    """Sets nested inside dict values, lists and tuples are sorted as well."""
    from sqlmesh.core.config.root import _canonicalize

    raw = {"rules": {"z", "a"}, "nested": [{3, 1}, ("x", {"q", "b"})]}
    expected = {
        "rules": ["a", "z"],
        "nested": [[1, 3], ("x", ["b", "q"])],
    }

    assert _canonicalize(raw) == expected
1591+
1592+
1593+
def test_canonicalize_preserves_non_set_values_and_types() -> None:
    """Values that contain no sets are returned equal, with list/tuple types intact."""
    from sqlmesh.core.config.root import _canonicalize

    plain = {"a": 1, "b": [1, 2, 3]}
    assert _canonicalize(plain) == {"a": 1, "b": [1, 2, 3]}

    # list/tuple types are preserved (not coerced into each other)
    as_tuple = _canonicalize((1, 2))
    assert isinstance(as_tuple, tuple)
    as_list = _canonicalize([1, 2])
    assert isinstance(as_list, list)
1600+
1601+
1602+
def test_config_fingerprint_is_deterministic_across_processes() -> None:
    """Config.fingerprint must come out identical in processes with different hash seeds.

    The fingerprint keys the on-disk model cache. Set/frozenset iteration order
    depends on PYTHONHASHSEED, so a config with a set-valued field (e.g.
    linter.rules) would otherwise produce a different fingerprint per process and
    silently invalidate the cache. The same config is built in two subprocesses,
    each with its own PYTHONHASHSEED, and the printed fingerprints are compared.
    """
    import subprocess
    import sys

    # Source of the child program; it prints the fingerprint on stdout.
    program = (
        "from sqlmesh.core.config import Config, ModelDefaultsConfig\n"
        "from sqlmesh.core.config.linter import LinterConfig\n"
        "config = Config(\n"
        " model_defaults=ModelDefaultsConfig(dialect='duckdb'),\n"
        " linter=LinterConfig(\n"
        " enabled=True,\n"
        " rules={'ruleA', 'ruleB', 'ruleC', 'ruleD', 'ruleE', 'ruleF'},\n"
        " ),\n"
        ")\n"
        "print(config.fingerprint)\n"
    )

    def run_with_seed(hashseed: str) -> str:
        # Inherit the current environment and override only the hash seed.
        child_env = {**os.environ, "PYTHONHASHSEED": hashseed}
        completed = subprocess.run(
            [sys.executable, "-c", program],
            capture_output=True,
            text=True,
            env=child_env,
            check=True,
        )
        return completed.stdout.strip()

    first = run_with_seed("0")
    second = run_with_seed("12345")
    assert first == second

0 commit comments

Comments
 (0)