Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion sqlmesh/core/config/root.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@ def validate_regex_key_dict(value: t.Dict[str | re.Pattern, t.Any]) -> t.Dict[re
return compile_regex_mapping(value)


def _canonicalize(obj: object) -> object:
"""Recursively convert an object into a canonical, order-stable form for hashing.

``set``/``frozenset`` iteration order is not stable across Python processes, so
pickling them directly yields non-deterministic bytes. That makes any hash derived
from the pickle (e.g. ``Config.fingerprint``) change run-to-run, which silently
invalidates on-disk caches keyed by the fingerprint. Sorting set members into a
list restores determinism while preserving contents.
"""
if isinstance(obj, (set, frozenset)):
return sorted(map(_canonicalize, obj)) # type: ignore[type-var]
if isinstance(obj, dict):
return {k: _canonicalize(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return type(obj)(map(_canonicalize, obj))
return obj


if t.TYPE_CHECKING:
from sqlmesh.core._typing import Self

Expand Down Expand Up @@ -364,4 +382,8 @@ def dialect(self) -> t.Optional[str]:

@property
def fingerprint(self) -> str:
    """Return a CRC32-based fingerprint of this config as a decimal string.

    The config is dumped to a dict (without the ``loader`` and
    ``notification_targets`` fields), canonicalized so that set-valued fields
    pickle to the same bytes in every process, pickled, and checksummed.
    """
    config_dict = self.dict(exclude={"loader", "notification_targets"})
    # Pickle the canonical form only: pickling the raw dict would make the
    # checksum depend on per-process set iteration order.
    return str(zlib.crc32(pickle.dumps(_canonicalize(config_dict))))
63 changes: 63 additions & 0 deletions tests/core/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1572,3 +1572,66 @@ def test_load_configs_in_dbt_project_without_config_py(tmp_path: Path):
# model_defaults
assert config.model_defaults.dialect == "duckdb" # from dbt profiles.yml
assert config.model_defaults.start == "2020-01-01" # from sqlmesh.yaml


def test_canonicalize_sorts_sets() -> None:
    """Both ``set`` and ``frozenset`` inputs come back as sorted lists."""
    from sqlmesh.core.config.root import _canonicalize

    from_set = _canonicalize({3, 1, 2})
    assert from_set == [1, 2, 3]

    from_frozenset = _canonicalize(frozenset(["b", "a", "c"]))
    assert from_frozenset == ["a", "b", "c"]


def test_canonicalize_recurses_into_containers() -> None:
    """Sets nested inside dicts, lists and tuples are sorted as well."""
    from sqlmesh.core.config.root import _canonicalize

    nested_input = {"rules": {"z", "a"}, "nested": [{3, 1}, ("x", {"q", "b"})]}
    expected = {
        "rules": ["a", "z"],
        "nested": [[1, 3], ("x", ["b", "q"])],
    }

    assert _canonicalize(nested_input) == expected


def test_canonicalize_preserves_non_set_values_and_types() -> None:
    """Values without sets pass through unchanged and keep their container type."""
    from sqlmesh.core.config.root import _canonicalize

    set_free = {"a": 1, "b": [1, 2, 3]}
    assert _canonicalize(set_free) == {"a": 1, "b": [1, 2, 3]}

    # A tuple must stay a tuple and a list must stay a list.
    as_tuple = _canonicalize((1, 2))
    as_list = _canonicalize([1, 2])
    assert isinstance(as_tuple, tuple)
    assert isinstance(as_list, list)


def test_config_fingerprint_is_deterministic_across_processes() -> None:
    """Config.fingerprint must not vary with the interpreter's hash seed.

    The fingerprint keys the on-disk model cache. Iteration order of sets and
    frozensets follows PYTHONHASHSEED, so a set-valued config field (such as
    linter.rules) could produce a different fingerprint in every process and
    silently invalidate that cache. The same config is therefore built in two
    child interpreters started with different hash seeds, and the printed
    fingerprints are compared.
    """
    import subprocess
    import sys

    script = (
        "from sqlmesh.core.config import Config, ModelDefaultsConfig\n"
        "from sqlmesh.core.config.linter import LinterConfig\n"
        "config = Config(\n"
        " model_defaults=ModelDefaultsConfig(dialect='duckdb'),\n"
        " linter=LinterConfig(\n"
        " enabled=True,\n"
        " rules={'ruleA', 'ruleB', 'ruleC', 'ruleD', 'ruleE', 'ruleF'},\n"
        " ),\n"
        ")\n"
        "print(config.fingerprint)\n"
    )

    def _fingerprint_with_seed(seed: str) -> str:
        # Inherit the parent's environment and override only the hash seed.
        child_env = dict(os.environ)
        child_env["PYTHONHASHSEED"] = seed
        completed = subprocess.run(
            [sys.executable, "-c", script],
            env=child_env,
            check=True,
            text=True,
            capture_output=True,
        )
        return completed.stdout.strip()

    first = _fingerprint_with_seed("0")
    second = _fingerprint_with_seed("12345")
    assert first == second