diff --git a/librarian/storage/database.py b/librarian/storage/database.py
index e3a423f..c58e956 100644
--- a/librarian/storage/database.py
+++ b/librarian/storage/database.py
@@ -12,7 +12,7 @@
 import threading
 from collections.abc import Generator
 from contextlib import contextmanager
-from datetime import datetime
+from datetime import date, datetime
 from typing import Any
 
 import sqlite_vec
@@ -30,6 +30,33 @@
 logger = logging.getLogger(__name__)
 
 
+def _json_default(value: Any) -> str:
+    """Encode a value the stdlib JSON encoder rejects as a string.
+
+    Passed as `default=` to `json.dumps` when persisting document metadata.
+    Obsidian and other markdown frontmatter commonly contain `YYYY-MM-DD`
+    values which PyYAML parses into `datetime.date`. These are stored as ISO
+    strings; round-tripped values come back as strings (acceptable because
+    metadata is informational, not queried as dates).
+
+    Args:
+        value: Object that `json.dumps` could not serialize natively.
+
+    Returns:
+        `value.isoformat()` for dates and datetimes, `str(value)` otherwise.
+    """
+    # `datetime` subclasses `date`, so one check covers both.
+    if isinstance(value, date):
+        return value.isoformat()
+    # Best-effort fallback: never fail a write over exotic metadata, but log
+    # it so the lossy stringification does not go unnoticed.
+    logger.warning(
+        "Stringifying non-JSON metadata value of type %s",
+        type(value).__name__,
+    )
+    return str(value)
+
+
 def get_effective_embedding_dimension() -> int:
     """Get the embedding dimension based on configured provider."""
     if EMBEDDING_PROVIDER == "openai":
@@ -286,7 +313,9 @@ def insert_document(self, document: Document) -> int:
                     document.path,
                     document.title,
                     document.content,
-                    json.dumps(document.metadata) if document.metadata else None,
+                    json.dumps(document.metadata, default=_json_default)
+                    if document.metadata
+                    else None,
                     document.file_mtime,
                     document.asset_type.value,
                 ),
@@ -312,7 +341,9 @@ def update_document(self, document: Document) -> None:
                     document.path,
                     document.title,
                     document.content,
-                    json.dumps(document.metadata) if document.metadata else None,
+                    json.dumps(document.metadata, default=_json_default)
+                    if document.metadata
+                    else None,
                     document.file_mtime,
                     document.asset_type.value,
                     document.id,
diff --git a/tests/test_database.py b/tests/test_database.py
new file mode 100644
index 0000000..904c3ad
--- /dev/null
+++ b/tests/test_database.py
@@ -0,0 +1,69 @@
+"""Regression tests for librarian.storage.database."""
+
+from datetime import date, datetime
+from pathlib import Path
+
+import pytest
+
+from librarian.storage.database import Database
+from librarian.types import AssetType, Document
+
+
+@pytest.fixture
+def db(tmp_path: Path) -> Database:
+    """Fresh Database instance per test, isolated to a tmp file."""
+    return Database(db_path=str(tmp_path / "test.db"))
+
+
+class TestMetadataSerialization:
+    """JSON-serialize document metadata containing types that YAML frontmatter
+    emits but the stdlib json encoder doesn't know about."""
+
+    def test_insert_with_date_metadata(self, db: Database) -> None:
+        """A `date` in metadata (from `last_push: 2026-05-19` frontmatter) must
+        not crash on insert. Regression: previously raised
+        `TypeError: Object of type date is not JSON serializable`."""
+        doc = Document(
+            id=None,
+            path="/note-with-date.md",
+            title="Note",
+            content="body",
+            metadata={
+                "last_push": date(2026, 5, 19),
+                "updated_at": datetime(2026, 5, 19, 12, 30, 45),
+                "tags": ["repo", "MOC"],
+            },
+            file_mtime=0.0,
+            asset_type=AssetType.TEXT,
+        )
+        doc_id = db.insert_document(doc)
+        assert doc_id is not None
+
+        got = db.get_document_by_path("/note-with-date.md")
+        assert got is not None
+        # Dates round-trip as ISO strings; metadata is informational, not queried as dates.
+        assert got.metadata["last_push"] == "2026-05-19"
+        assert got.metadata["updated_at"] == "2026-05-19T12:30:45"
+        assert got.metadata["tags"] == ["repo", "MOC"]
+
+    def test_update_with_date_metadata(self, db: Database) -> None:
+        """update_document() must also handle date metadata (parallel call site)."""
+        doc = Document(
+            id=None,
+            path="/n.md",
+            title="N",
+            content="",
+            metadata={"tag": "a"},
+            file_mtime=0.0,
+            asset_type=AssetType.TEXT,
+        )
+        db.insert_document(doc)
+
+        loaded = db.get_document_by_path("/n.md")
+        assert loaded is not None
+        loaded.metadata = {"last_push": date(2026, 5, 19)}
+        db.update_document(loaded)
+
+        got = db.get_document_by_path("/n.md")
+        assert got is not None
+        assert got.metadata["last_push"] == "2026-05-19"