From 9b7ec103a8cd5bcd84c9317f7bc73265c1e69a76 Mon Sep 17 00:00:00 2001
From: Jon Froehlich <jonf@cs.uw.edu>
Date: Thu, 25 Jun 2026 16:29:12 -0700
Subject: [PATCH] Harden backfill_original_filenames against per-row failures
 (#1391)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The backfill runs on every container start over the whole artifact dataset.
It had no per-row error isolation, so a single malformed row would propagate
its exception out of the command and abort the entire run — and since
docker-entrypoint.sh has no `set -e`, that would fail silently (traceback
prints, startup continues to runserver), leaving every row's original_*_filename
unset. The most likely trigger is a null `date` or `title`, which makes
Artifact.generate_filename() raise (date.year / title.title()).

This is defensive insurance, not a fix for an observed failure — the 2.25.0
prod backfill did populate the legacy rows correctly.

- Process each (artifact, file field) pair inside its own try/except; log and
  count the row, then continue. Extract the row logic into _backfill_row().
- Surface an errored-row count in the summary log line.
- Regression test: a row with a null date (generate_filename raises) no longer
  aborts the batch; a good legacy row in the same run is still backfilled.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../commands/backfill_original_filenames.py   | 135 +++++++++++-------
 website/tests/test_artifact.py                |  34 +++++
 2 files changed, 118 insertions(+), 51 deletions(-)

diff --git a/website/management/commands/backfill_original_filenames.py b/website/management/commands/backfill_original_filenames.py
index 1ef875e9..b6e6a816 100644
--- a/website/management/commands/backfill_original_filenames.py
+++ b/website/management/commands/backfill_original_filenames.py
@@ -47,27 +47,39 @@ def handle(self, *args, **options):
 
         total_updated = 0
         total_skipped = 0
+        total_errors = 0
         for model in self.MODELS:
-            updated, skipped = self._backfill_model(model, dry_run)
+            updated, skipped, errors = self._backfill_model(model, dry_run)
             total_updated += updated
             total_skipped += skipped
+            total_errors += errors
 
         verb = "Would update" if dry_run else "Updated"
         _logger.info(
             f"backfill_original_filenames: {verb} {total_updated} filename "
             f"field(s); skipped {total_skipped} (already standardized — original "
-            f"name unrecoverable)."
+            f"name unrecoverable); {total_errors} row(s) errored and were skipped."
         )
         _logger.debug("Completed backfill_original_filenames.py")
 
     def _backfill_model(self, model, dry_run):
         """Backfill both file fields for one concrete artifact model.
 
-        Returns a ``(num_updated, num_skipped)`` tuple counting individual
-        filename fields touched / deliberately left blank.
+        Returns a ``(num_updated, num_skipped, num_errors)`` tuple counting
+        individual filename fields touched / deliberately left blank / skipped
+        because the row raised.
+
+        Each row is processed inside its own try/except: this iterates the
+        whole production dataset, and a single malformed row (e.g. a null
+        ``date`` or ``title`` makes ``generate_filename`` raise) must not abort
+        the entire backfill and leave every later row untouched. The entrypoint
+        has no ``set -e``, so an aborted run would fail silently (the traceback
+        prints but startup continues) — this per-row isolation is defensive
+        insurance against that.
         """
         num_updated = 0
         num_skipped = 0
+        num_errors = 0
         for file_attr, original_attr in self.FILE_FIELDS:
             # Only rows that have a file but no captured original name yet.
             candidates = (
@@ -80,52 +92,73 @@ def _backfill_model(self, model, dry_run):
             candidates = candidates.prefetch_related("authors")
 
             for artifact in candidates:
-                file_field = getattr(artifact, file_attr)
-                if not file_field:
-                    continue
-
-                current_basename = os.path.basename(file_field.name)
-                current_no_ext = os.path.splitext(current_basename)[0]
-                standardized_no_ext = Artifact.generate_filename(artifact)
-
-                # Treat the file as already-standardized when its name equals the
-                # standardized scheme OR is a uniquified variant of it. When a
-                # standardized name collides on disk, ensure_filename_is_unique()
-                # (fileutils.py) appends "-<timestamp>" — e.g.
-                # "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name
-                # still STARTS WITH the standardized base. Matching only on exact
-                # equality would misread those as never-renamed and record the
-                # standardized+suffix name as the "original" — a false positive.
-                already_standardized = (
-                    current_no_ext == standardized_no_ext
-                    or current_no_ext.startswith(standardized_no_ext + "-")
-                )
-                if already_standardized:
-                    # Already renamed — the original upload name is gone.
-                    _logger.debug(
-                        f"Skipping {model.__name__} id={artifact.pk} {file_attr}="
-                        f"'{current_basename}': already standardized."
+                try:
+                    if self._backfill_row(model, artifact, file_attr,
+                                          original_attr, dry_run):
+                        num_updated += 1
+                    else:
+                        num_skipped += 1
+                except Exception:
+                    # Log and move on — never let one row kill the batch.
+                    _logger.exception(
+                        "backfill_original_filenames: skipping %s id=%s %s due "
+                        "to an error", model.__name__,
+                        getattr(artifact, "pk", "?"), file_attr,
                     )
-                    num_skipped += 1
-                    continue
-
-                # Never renamed: the current on-disk name is the original.
-                if dry_run:
-                    _logger.debug(
-                        f"[dry-run] Would set {original_attr}='{current_basename}' "
-                        f"for {model.__name__} id={artifact.pk} '{artifact.title}'"
-                    )
-                else:
-                    # Write directly via the queryset so this stays a pure data
-                    # backfill — no file-rename / thumbnail side effects from the
-                    # model's save().
-                    model.objects.filter(pk=artifact.pk).update(
-                        **{original_attr: current_basename}
-                    )
-                    _logger.debug(
-                        f"Set {original_attr}='{current_basename}' for "
-                        f"{model.__name__} id={artifact.pk} '{artifact.title}'"
-                    )
-                num_updated += 1
+                    num_errors += 1
+
+        return num_updated, num_skipped, num_errors
+
+    def _backfill_row(self, model, artifact, file_attr, original_attr, dry_run):
+        """Backfill one (artifact, file field) pair.
+
+        Returns True if the original name was (or would be) recorded, False if
+        the row was deliberately skipped because its file is already
+        standardized. Raises on malformed data — the caller isolates that.
+        """
+        file_field = getattr(artifact, file_attr)
+        if not file_field:
+            return False
+
+        current_basename = os.path.basename(file_field.name)
+        current_no_ext = os.path.splitext(current_basename)[0]
+        standardized_no_ext = Artifact.generate_filename(artifact)
+
+        # Treat the file as already-standardized when its name equals the
+        # standardized scheme OR is a uniquified variant of it. When a
+        # standardized name collides on disk, ensure_filename_is_unique()
+        # (fileutils.py) appends "-<timestamp>" — e.g.
+        # "Lee_Talk_CHI2021-1782399772.42.pdf" — so the on-disk name still
+        # STARTS WITH the standardized base. Matching only on exact equality
+        # would misread those as never-renamed and record the standardized+
+        # suffix name as the "original" — a false positive.
+        already_standardized = (
+            current_no_ext == standardized_no_ext
+            or current_no_ext.startswith(standardized_no_ext + "-")
+        )
+        if already_standardized:
+            # Already renamed — the original upload name is gone.
+            _logger.debug(
+                f"Skipping {model.__name__} id={artifact.pk} {file_attr}="
+                f"'{current_basename}': already standardized."
+            )
+            return False
 
-        return num_updated, num_skipped
+        # Never renamed: the current on-disk name is the original.
+        if dry_run:
+            _logger.debug(
+                f"[dry-run] Would set {original_attr}='{current_basename}' "
+                f"for {model.__name__} id={artifact.pk} '{artifact.title}'"
+            )
+        else:
+            # Write directly via the queryset so this stays a pure data
+            # backfill — no file-rename / thumbnail side effects from the
+            # model's save().
+            model.objects.filter(pk=artifact.pk).update(
+                **{original_attr: current_basename}
+            )
+            _logger.debug(
+                f"Set {original_attr}='{current_basename}' for "
+                f"{model.__name__} id={artifact.pk} '{artifact.title}'"
+            )
+        return True
diff --git a/website/tests/test_artifact.py b/website/tests/test_artifact.py
index dac88ea5..582cc48b 100644
--- a/website/tests/test_artifact.py
+++ b/website/tests/test_artifact.py
@@ -296,6 +296,40 @@ def test_uniquified_standardized_name_is_not_backfilled(self):
         talk.refresh_from_db()
         self.assertIsNone(talk.original_pdf_filename)
 
+    def test_one_bad_row_does_not_abort_the_batch(self):
+        """
+        The backfill runs on every container start over the whole dataset, so a
+        single malformed row must not abort the run and leave every other row
+        untouched. A null ``date`` makes ``generate_filename`` raise
+        (``date.year``); before per-row isolation that exception propagated out
+        of the command. Here the bad row is skipped and a good legacy row is
+        still backfilled.
+        """
+        # Malformed: no authors (so save() skips the rename and accepts the
+        # null date) + a file, which makes it a backfill candidate that raises.
+        bad = TalkFactory(
+            title="Bad Row", forum_name="CHI", date=None,
+            pdf_file=_pdf("bad_upload.pdf"),
+        )
+        # A good legacy candidate with a non-standard (never-renamed) name.
+        good = TalkFactory(
+            title="Good Row", forum_name="CHI", date=date(2019, 1, 1),
+            pdf_file=_pdf("good_upload_final.pdf"),
+        )
+        good_name = os.path.basename(good.pdf_file.name)
+        Talk.objects.filter(pk__in=[bad.pk, good.pk]).update(
+            original_pdf_filename=None
+        )
+
+        # Must not raise despite the malformed row...
+        call_command("backfill_original_filenames")
+
+        good.refresh_from_db()
+        bad.refresh_from_db()
+        # ...and the good row is still processed.
+        self.assertEqual(good.original_pdf_filename, good_name)
+        self.assertIsNone(bad.original_pdf_filename)
+
 
 class OriginalUploadFilenamesDisplayTests(SimpleTestCase):
     """ArtifactAdmin.original_upload_filenames read-only display (issue #1391)."""