From 46e4a8b54ac852568d0bb20f804548f5f7c8271b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Contreras=20Guill=C3=A9n?= Date: Mon, 15 Jun 2026 19:10:01 +0200 Subject: [PATCH] fix(worker): no traceback on malformed persisted submit; pyfly v26.06.104 (clean errors) - ExtractionWorker._process now reconstructs the typed request (_build_request) INSIDE the try/except, so an invalid stored schema/options is marked permanent_error and logged cleanly instead of escaping to the poll loop's logger.exception() (raw traceback). - Upgrade pyfly to the latest released v26.06.104: expected 4xx errors log at WARNING without a stack trace (CQRS + request log), and the CLI prints clean one-line errors. Release v26.6.11. --- CHANGELOG.md | 22 +++++++++ pyproject.toml | 6 +-- src/flydocs/__init__.py | 2 +- .../core/services/workers/job_worker.py | 49 ++++++++++--------- uv.lock | 8 +-- 5 files changed, 57 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39b0acf..93c8ed8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project uses **CalVer `YY.M.PP`** (PEP 440 may normalise patch numbers for the Python wheel — e.g. `26.06.00` → `26.6.0`). +## [26.6.11] - 2026-06-15 + +### Fixed + +- **A malformed persisted extraction no longer dumps a raw traceback in the + worker log.** `ExtractionWorker._process` reconstructed the typed request + (`_build_request` → pydantic `model_validate`) *outside* its `try/except`, so an + invalid stored `schema_json`/`options_json` raised a `ValidationError` that + escaped to the poll loop's `logger.exception(...)` and printed a full stack + trace. The reconstruction now runs inside the guarded block, where + `_is_permanent()` classifies it as terminal and the job is marked + `permanent_error` and logged cleanly — no traceback. + +### Changed + +- **Upgraded pyfly to `v26.06.104`** for framework-level clean error reporting: + expected client/domain faults (validation, business-rule, auth — the 4xx + family) are now logged at WARNING without a stack trace across the CQRS handlers + and the web request log, and the pyfly CLI prints a clean `Error: ...` line + instead of a traceback (`--debug` / `PYFLY_DEBUG` restores the full trace). + Dependency pin and floor moved to `v26.06.104` / `>=26.6.104`. + ## [26.6.10] - 2026-06-15 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 6da3838..f0d5e87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "flydocs" # CalVer YY.MM.PP -- bumped per release. Note that PEP 440 normalises # ``26.05.01`` -> ``26.5.1`` in the built wheel filename. -version = "26.6.10" +version = "26.6.11" description = "Pure-multimodal Intelligent Document Processing service: structured fields + bounding boxes, validation, authenticity checks, LLM judge, and a business-rule engine. Sync + queue-backed async APIs over fireflyframework-pyfly and -agentic. Part of Firefly OperationOS, platform-agnostic by design." readme = "README.md" requires-python = ">=3.13" @@ -19,7 +19,7 @@ dependencies = [ # so a fresh ``uv sync`` is enough to boot the full stack. The ``web`` # extra declares starlette + uvicorn, which the worker health server # imports directly; the floor carries ``pyfly.actuator.install_health_indicators``. - "pyfly[fastapi,web,observability,security,data-relational,postgresql,eda,redis,client,scheduling,cli]>=26.6.103", + "pyfly[fastapi,web,observability,security,data-relational,postgresql,eda,redis,client,scheduling,cli]>=26.6.104", # GenAI metaframework -- FireflyAgent with multimodal content (BinaryContent/ImageUrl) # over pydantic-ai. Pulls in the OpenAI / Anthropic / Bedrock providers via pydantic-ai-slim. @@ -131,7 +131,7 @@ override-dependencies = [ # (vestigial) ./vendor clone + Dockerfile BuildKit context for pyfly are now # no-ops — the path-rewrite sed no longer matches a git source — exactly as for # agentic; they can be removed in a later cleanup. -pyfly = { git = "https://github.com/fireflyframework/fireflyframework-pyfly.git", tag = "v26.06.103" } +pyfly = { git = "https://github.com/fireflyframework/fireflyframework-pyfly.git", tag = "v26.06.104" } fireflyframework-agentic = { git = "https://github.com/fireflyframework/fireflyframework-agentic.git", tag = "v26.05.30" } [tool.hatch.build.targets.wheel] diff --git a/src/flydocs/__init__.py b/src/flydocs/__init__.py index c4e06f6..3454fe6 100644 --- a/src/flydocs/__init__.py +++ b/src/flydocs/__init__.py @@ -24,4 +24,4 @@ `PromptRegistry`). """ -__version__ = "26.6.10" +__version__ = "26.6.11" diff --git a/src/flydocs/core/services/workers/job_worker.py b/src/flydocs/core/services/workers/job_worker.py index 889e90b..6504db3 100644 --- a/src/flydocs/core/services/workers/job_worker.py +++ b/src/flydocs/core/services/workers/job_worker.py @@ -241,30 +241,35 @@ async def _process(self, extraction_id: str) -> None: extraction_id=row.id, attempt=attempts, ) - request = self._build_request(row) - # Capture the original intent BEFORE we mutate the request: we - # need to know whether the caller wanted bbox refinement so we - # can publish the post-processing event afterwards, even if we - # skip the inline node below. - wants_bbox_refine = bool(getattr(request.options.stages, "bbox_refine", False)) - if wants_bbox_refine: - # Architectural decision: on the async path, skip the inline - # bbox_refine node entirely. The dedicated BboxRefineWorker - # picks up the post-processing event we publish below and - # grounds bboxes there. Running both wastes minutes of CPU - # and LLM tokens on duplicate work — and when the inline - # step times out (which it does on multi-PDF bundles) the - # pipeline framework marks the node as failed, which is - # misleading because the out-of-band path recovers - # transparently. The :class:`BboxRefiner` is idempotent - # (already-grounded fields are skipped on re-run), so even - # if both paths execute the work won't double up — but - # bypassing inline saves the latency outright. - stages_skipped = request.options.stages.model_copy(update={"bbox_refine": False}) - options_skipped = request.options.model_copy(update={"stages": stages_skipped}) - request = request.model_copy(update={"options": options_skipped}) started = time.monotonic() try: + # Reconstruct the typed request from the persisted row INSIDE the + # try: a malformed schema/options payload makes pydantic raise a + # ValidationError, which _is_permanent() treats as a terminal + # failure (marked permanent_error) — instead of escaping to the + # poll loop's logger.exception() and dumping a raw traceback. + request = self._build_request(row) + # Capture the original intent BEFORE we mutate the request: we + # need to know whether the caller wanted bbox refinement so we + # can publish the post-processing event afterwards, even if we + # skip the inline node below. + wants_bbox_refine = bool(getattr(request.options.stages, "bbox_refine", False)) + if wants_bbox_refine: + # Architectural decision: on the async path, skip the inline + # bbox_refine node entirely. The dedicated BboxRefineWorker + # picks up the post-processing event we publish below and + # grounds bboxes there. Running both wastes minutes of CPU + # and LLM tokens on duplicate work — and when the inline + # step times out (which it does on multi-PDF bundles) the + # pipeline framework marks the node as failed, which is + # misleading because the out-of-band path recovers + # transparently. The :class:`BboxRefiner` is idempotent + # (already-grounded fields are skipped on re-run), so even + # if both paths execute the work won't double up — but + # bypassing inline saves the latency outright. + stages_skipped = request.options.stages.model_copy(update={"bbox_refine": False}) + options_skipped = request.options.model_copy(update={"stages": stages_skipped}) + request = request.model_copy(update={"options": options_skipped}) result = await asyncio.wait_for( self._orchestrator.execute(request, extraction_id=row.id), timeout=self._settings.async_timeout_s, diff --git a/uv.lock b/uv.lock index 53a1388..3188806 100644 --- a/uv.lock +++ b/uv.lock @@ -1483,7 +1483,7 @@ security = [ [[package]] name = "flydocs" -version = "26.6.10" +version = "26.6.11" source = { editable = "." } dependencies = [ { name = "aiosqlite" }, @@ -1539,7 +1539,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.10.0" }, { name = "pydantic-ai-slim", extras = ["anthropic", "openai", "bedrock"], specifier = ">=1.56.0" }, { name = "pydantic-settings", specifier = ">=2.7.0" }, - { name = "pyfly", extras = ["fastapi", "web", "observability", "security", "data-relational", "postgresql", "eda", "redis", "client", "scheduling", "cli"], git = "https://github.com/fireflyframework/fireflyframework-pyfly.git?tag=v26.06.103" }, + { name = "pyfly", extras = ["fastapi", "web", "observability", "security", "data-relational", "postgresql", "eda", "redis", "client", "scheduling", "cli"], git = "https://github.com/fireflyframework/fireflyframework-pyfly.git?tag=v26.06.104" }, { name = "pymupdf", specifier = ">=1.24" }, { name = "pypdf", specifier = ">=4.3.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1" }, @@ -3942,8 +3942,8 @@ wheels = [ [[package]] name = "pyfly" -version = "26.6.103" -source = { git = "https://github.com/fireflyframework/fireflyframework-pyfly.git?tag=v26.06.103#67244e4fc23ce8bee1f40b0f819c2c8858161804" } +version = "26.6.104" +source = { git = "https://github.com/fireflyframework/fireflyframework-pyfly.git?tag=v26.06.104#a921a67fd0184b2caf05813bd3f7e6048e22b976" } dependencies = [ { name = "pydantic" }, { name = "pyyaml" },