From 9b7ec103a8cd5bcd84c9317f7bc73265c1e69a76 Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Thu, 25 Jun 2026 16:29:12 -0700 Subject: [PATCH] Harden backfill_original_filenames against per-row failures (#1391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The backfill runs on every container start over the whole artifact dataset. It had no per-row error isolation, so a single malformed row would propagate its exception out of the command and abort the entire run — and since docker-entrypoint.sh has no `set -e`, that would fail silently (traceback prints, startup continues to runserver), leaving every row's original_*_filename unset. The most likely trigger is a null `date` or `title`, which makes Artifact.generate_filename() raise (date.year / title.title()). This is defensive insurance, not a fix for an observed failure — the 2.25.0 prod backfill did populate the legacy rows correctly. - Process each (artifact, file field) pair inside its own try/except; log and count the row, then continue. Extract the row logic into _backfill_row(). - Surface an errored-row count in the summary log line. - Regression test: a row with a null date (generate_filename raises) no longer aborts the batch; a good legacy row in the same run is still backfilled. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../commands/backfill_original_filenames.py | 135 +++++++++++------- website/tests/test_artifact.py | 34 +++++ 2 files changed, 118 insertions(+), 51 deletions(-) diff --git a/website/management/commands/backfill_original_filenames.py b/website/management/commands/backfill_original_filenames.py index 1ef875e9..b6e6a816 100644 --- a/website/management/commands/backfill_original_filenames.py +++ b/website/management/commands/backfill_original_filenames.py @@ -47,27 +47,39 @@ def handle(self, *args, **options): total_updated = 0 total_skipped = 0 + total_errors = 0 for model in self.MODELS: - updated, skipped = self._backfill_model(model, dry_run) + updated, skipped, errors = self._backfill_model(model, dry_run) total_updated += updated total_skipped += skipped + total_errors += errors verb = "Would update" if dry_run else "Updated" _logger.info( f"backfill_original_filenames: {verb} {total_updated} filename " f"field(s); skipped {total_skipped} (already standardized — original " - f"name unrecoverable)." + f"name unrecoverable); {total_errors} row(s) errored and were skipped." ) _logger.debug("Completed backfill_original_filenames.py") def _backfill_model(self, model, dry_run): """Backfill both file fields for one concrete artifact model. - Returns a ``(num_updated, num_skipped)`` tuple counting individual - filename fields touched / deliberately left blank. + Returns a ``(num_updated, num_skipped, num_errors)`` tuple counting + individual filename fields touched / deliberately left blank / skipped + because the row raised. + + Each row is processed inside its own try/except: this iterates the + whole production dataset, and a single malformed row (e.g. a null + ``date`` or ``title`` makes ``generate_filename`` raise) must not abort + the entire backfill and leave every later row untouched. The entrypoint + has no ``set -e``, so an aborted run would fail silently (the traceback + prints but startup continues) — this per-row isolation is defensive + insurance against that. """ num_updated = 0 num_skipped = 0 + num_errors = 0 for file_attr, original_attr in self.FILE_FIELDS: # Only rows that have a file but no captured original name yet. candidates = ( @@ -80,52 +92,73 @@ def _backfill_model(self, model, dry_run): candidates = candidates.prefetch_related("authors") for artifact in candidates: - file_field = getattr(artifact, file_attr) - if not file_field: - continue - - current_basename = os.path.basename(file_field.name) - current_no_ext = os.path.splitext(current_basename)[0] - standardized_no_ext = Artifact.generate_filename(artifact) - - # Treat the file as already-standardized when its name equals the - # standardized scheme OR is a uniquified variant of it. When a - # standardized name collides on disk, ensure_filename_is_unique() - # (fileutils.py) appends "-" — e.g. - # "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name - # still STARTS WITH the standardized base. Matching only on exact - # equality would misread those as never-renamed and record the - # standardized+suffix name as the "original" — a false positive. - already_standardized = ( - current_no_ext == standardized_no_ext - or current_no_ext.startswith(standardized_no_ext + "-") - ) - if already_standardized: - # Already renamed — the original upload name is gone. - _logger.debug( - f"Skipping {model.__name__} id={artifact.pk} {file_attr}=" - f"'{current_basename}': already standardized." + try: + if self._backfill_row(model, artifact, file_attr, + original_attr, dry_run): + num_updated += 1 + else: + num_skipped += 1 + except Exception: + # Log and move on — never let one row kill the batch. + _logger.exception( + "backfill_original_filenames: skipping %s id=%s %s due " + "to an error", model.__name__, + getattr(artifact, "pk", "?"), file_attr, ) - num_skipped += 1 - continue - - # Never renamed: the current on-disk name is the original. - if dry_run: - _logger.debug( - f"[dry-run] Would set {original_attr}='{current_basename}' " - f"for {model.__name__} id={artifact.pk} '{artifact.title}'" - ) - else: - # Write directly via the queryset so this stays a pure data - # backfill — no file-rename / thumbnail side effects from the - # model's save(). - model.objects.filter(pk=artifact.pk).update( - **{original_attr: current_basename} - ) - _logger.debug( - f"Set {original_attr}='{current_basename}' for " - f"{model.__name__} id={artifact.pk} '{artifact.title}'" - ) - num_updated += 1 + num_errors += 1 + + return num_updated, num_skipped, num_errors + + def _backfill_row(self, model, artifact, file_attr, original_attr, dry_run): + """Backfill one (artifact, file field) pair. + + Returns True if the original name was (or would be) recorded, False if + the row was deliberately skipped because its file is already + standardized. Raises on malformed data — the caller isolates that. + """ + file_field = getattr(artifact, file_attr) + if not file_field: + return False + + current_basename = os.path.basename(file_field.name) + current_no_ext = os.path.splitext(current_basename)[0] + standardized_no_ext = Artifact.generate_filename(artifact) + + # Treat the file as already-standardized when its name equals the + # standardized scheme OR is a uniquified variant of it. When a + # standardized name collides on disk, ensure_filename_is_unique() + # (fileutils.py) appends "-" — e.g. + # "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name still + # STARTS WITH the standardized base. Matching only on exact equality + # would misread those as never-renamed and record the standardized+ + # suffix name as the "original" — a false positive. + already_standardized = ( + current_no_ext == standardized_no_ext + or current_no_ext.startswith(standardized_no_ext + "-") + ) + if already_standardized: + # Already renamed — the original upload name is gone. + _logger.debug( + f"Skipping {model.__name__} id={artifact.pk} {file_attr}=" + f"'{current_basename}': already standardized." + ) + return False - return num_updated, num_skipped + # Never renamed: the current on-disk name is the original. + if dry_run: + _logger.debug( + f"[dry-run] Would set {original_attr}='{current_basename}' " + f"for {model.__name__} id={artifact.pk} '{artifact.title}'" + ) + else: + # Write directly via the queryset so this stays a pure data + # backfill — no file-rename / thumbnail side effects from the + # model's save(). + model.objects.filter(pk=artifact.pk).update( + **{original_attr: current_basename} + ) + _logger.debug( + f"Set {original_attr}='{current_basename}' for " + f"{model.__name__} id={artifact.pk} '{artifact.title}'" + ) + return True diff --git a/website/tests/test_artifact.py b/website/tests/test_artifact.py index dac88ea5..582cc48b 100644 --- a/website/tests/test_artifact.py +++ b/website/tests/test_artifact.py @@ -296,6 +296,40 @@ def test_uniquified_standardized_name_is_not_backfilled(self): talk.refresh_from_db() self.assertIsNone(talk.original_pdf_filename) + def test_one_bad_row_does_not_abort_the_batch(self): + """ + The backfill runs on every container start over the whole dataset, so a + single malformed row must not abort the run and leave every other row + untouched. A null ``date`` makes ``generate_filename`` raise + (``date.year``); before per-row isolation that exception propagated out + of the command. Here the bad row is skipped and a good legacy row is + still backfilled. + """ + # Malformed: no authors (so save() skips the rename and accepts the + # null date) + a file, which makes it a backfill candidate that raises. + bad = TalkFactory( + title="Bad Row", forum_name="CHI", date=None, + pdf_file=_pdf("bad_upload.pdf"), + ) + # A good legacy candidate with a non-standard (never-renamed) name. + good = TalkFactory( + title="Good Row", forum_name="CHI", date=date(2019, 1, 1), + pdf_file=_pdf("good_upload_final.pdf"), + ) + good_name = os.path.basename(good.pdf_file.name) + Talk.objects.filter(pk__in=[bad.pk, good.pk]).update( + original_pdf_filename=None + ) + + # Must not raise despite the malformed row... + call_command("backfill_original_filenames") + + good.refresh_from_db() + bad.refresh_from_db() + # ...and the good row is still processed. + self.assertEqual(good.original_pdf_filename, good_name) + self.assertIsNone(bad.original_pdf_filename) + class OriginalUploadFilenamesDisplayTests(SimpleTestCase): """ArtifactAdmin.original_upload_filenames read-only display (issue #1391)."""