From aa1108b6acd8498a0c6d47396aaddd1734cd91ec Mon Sep 17 00:00:00 2001 From: Patrick Kelly Date: Wed, 20 May 2026 08:36:50 -0400 Subject: [PATCH 1/2] fix(storage): JSON-serialize date/datetime metadata from YAML frontmatter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Obsidian and other markdown frontmatter commonly contain `YYYY-MM-DD` values that PyYAML parses into `datetime.date`, e.g.: --- last_push: 2026-05-19 --- When `MarkdownParser` extracted the frontmatter and `Database.insert_document` ran `json.dumps(document.metadata)`, this crashed with: TypeError: Object of type date is not JSON serializable Add a small `_json_default` fallback that converts `date` / `datetime` to ISO strings (and falls back to `str()` for anything else). Round-tripped values come back as strings — acceptable because metadata is informational and not queried as dates. Includes a regression test that fails before this change and passes after, covering both `insert_document` and `update_document`. Co-Authored-By: Claude Opus 4.7 (1M context) --- librarian/storage/database.py | 23 ++++++++++-- tests/test_database.py | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 tests/test_database.py diff --git a/librarian/storage/database.py b/librarian/storage/database.py index e3a423f..e4bf89a 100644 --- a/librarian/storage/database.py +++ b/librarian/storage/database.py @@ -12,7 +12,7 @@ import threading from collections.abc import Generator from contextlib import contextmanager -from datetime import datetime +from datetime import date, datetime from typing import Any import sqlite_vec @@ -30,6 +30,19 @@ logger = logging.getLogger(__name__) +def _json_default(value: Any) -> str: + """JSON fallback for types YAML frontmatter emits but stdlib json can't encode. + + Obsidian and other markdown frontmatter commonly contain `YYYY-MM-DD` values + which PyYAML parses into `datetime.date`. Stored as ISO strings; round-tripped + values come back as strings (acceptable because metadata is informational, + not queried as dates). + """ + if isinstance(value, (date, datetime)): + return value.isoformat() + return str(value) + + def get_effective_embedding_dimension() -> int: """Get the embedding dimension based on configured provider.""" if EMBEDDING_PROVIDER == "openai": @@ -286,7 +299,9 @@ def insert_document(self, document: Document) -> int: document.path, document.title, document.content, - json.dumps(document.metadata) if document.metadata else None, + json.dumps(document.metadata, default=_json_default) + if document.metadata + else None, document.file_mtime, document.asset_type.value, ), @@ -312,7 +327,9 @@ def update_document(self, document: Document) -> None: document.path, document.title, document.content, - json.dumps(document.metadata) if document.metadata else None, + json.dumps(document.metadata, default=_json_default) + if document.metadata + else None, document.file_mtime, document.asset_type.value, document.id, diff --git a/tests/test_database.py b/tests/test_database.py new file mode 100644 index 0000000..904c3ad --- /dev/null +++ b/tests/test_database.py @@ -0,0 +1,69 @@ +"""Regression tests for librarian.storage.database.""" + +from datetime import date, datetime +from pathlib import Path + +import pytest + +from librarian.storage.database import Database +from librarian.types import AssetType, Document + + +@pytest.fixture +def db(tmp_path: Path) -> Database: + """Fresh Database instance per test, isolated to a tmp file.""" + return Database(db_path=str(tmp_path / "test.db")) + + +class TestMetadataSerialization: + """JSON-serialize document metadata containing types that YAML frontmatter + emits but the stdlib json encoder doesn't know about.""" + + def test_insert_with_date_metadata(self, db: Database) -> None: + """A `date` in metadata (from `last_push: 2026-05-19` frontmatter) must + not crash on insert. Regression: previously raised + `TypeError: Object of type date is not JSON serializable`.""" + doc = Document( + id=None, + path="/note-with-date.md", + title="Note", + content="body", + metadata={ + "last_push": date(2026, 5, 19), + "updated_at": datetime(2026, 5, 19, 12, 30, 45), + "tags": ["repo", "MOC"], + }, + file_mtime=0.0, + asset_type=AssetType.TEXT, + ) + doc_id = db.insert_document(doc) + assert doc_id is not None + + got = db.get_document_by_path("/note-with-date.md") + assert got is not None + # Dates round-trip as ISO strings; metadata is informational, not queried as dates. + assert got.metadata["last_push"] == "2026-05-19" + assert got.metadata["updated_at"].startswith("2026-05-19T12:30:45") + assert got.metadata["tags"] == ["repo", "MOC"] + + def test_update_with_date_metadata(self, db: Database) -> None: + """update_document() must also handle date metadata (parallel call site).""" + doc = Document( + id=None, + path="/n.md", + title="N", + content="", + metadata={"tag": "a"}, + file_mtime=0.0, + asset_type=AssetType.TEXT, + ) + db.insert_document(doc) + + loaded = db.get_document_by_path("/n.md") + assert loaded is not None + loaded.metadata = {"last_push": date(2026, 5, 19)} + db.update_document(loaded) + + got = db.get_document_by_path("/n.md") + assert got is not None + assert got.metadata["last_push"] == "2026-05-19" From 66477380cfeb44575433cbaacd7ac9ae8d8ebe47 Mon Sep 17 00:00:00 2001 From: Mateo Torres Date: Thu, 21 May 2026 10:42:36 -0300 Subject: [PATCH 2/2] Update librarian/storage/database.py --- librarian/storage/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librarian/storage/database.py b/librarian/storage/database.py index e4bf89a..c58e956 100644 --- a/librarian/storage/database.py +++ b/librarian/storage/database.py @@ -38,7 +38,7 @@ def _json_default(value: Any) -> str: values come back as strings (acceptable because metadata is informational, not queried as dates). """ - if isinstance(value, (date, datetime)): + if isinstance(value, date | datetime): return value.isoformat() return str(value)