From bd4bf5229ce88a5d78b2f9206277a864c3b22113 Mon Sep 17 00:00:00 2001
From: Jon Froehlich <jonf@cs.uw.edu>
Date: Thu, 25 Jun 2026 18:42:59 -0700
Subject: [PATCH] Re-standardize legacy artifact filenames (#1401)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Many bulk-imported talk/poster/publication files were never renamed to the
Author_TitleInTitleCase_VenueYear scheme (they never went through an authored
Artifact.save(), and the historical fix-up one-shots are commented out of the
entrypoint). #1391 already captured their original names into original_*_filename
on prod, so re-standardizing is now safe (provenance preserved).

- New command restandardize_artifact_filenames: for each Talk/Poster/Publication
  with authors + a title + a date whose files aren't standardized, call
  artifact.save() — reusing the existing, now-correct rename of pdf_file,
  raw_file, and thumbnail on disk AND in the DB. Per-row try/except (one bad
  row can't abort the batch), --dry-run, summary log.
  - Uses a suffix-tolerant "needs standardizing" gate (not
    Artifact.do_filenames_need_updating): a standardized name that collided on
    disk gets a "-<timestamp>" suffix, which exact-match comparison would treat
    as needing rename forever — churning duplicate-name artifacts' filenames on
    every deploy. The gate treats "Name-<suffix>" as already standardized, so
    the command is idempotent.
  - artifact.save() with no update_fields leaves original_*_filename untouched,
    so the #1391 provenance survives the rename.
- Wired into docker-entrypoint.sh as step 4.10b (after the 4.7b backfill, so
  originals are captured before any rename). Idempotent; safe every start.
- serve_pdf: before the difflib fuzzy guess, add an exact fallback matching the
  requested basename against Publication.original_pdf_filename and redirecting
  to the current file — so stale external links to renamed publication PDFs
  resolve exactly.
- Tests: rename pdf+raw on disk+DB preserving original_* and idempotent;
  already-standardized untouched; malformed (null-date) row skipped without
  aborting the batch; serve_pdf original-filename redirect. Full suite: 556 OK.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docker-entrypoint.sh                          |   5 +
 .../restandardize_artifact_filenames.py       | 168 ++++++++++++++++++
 website/tests/test_artifact.py                |  98 ++++++++++
 website/tests/test_serve_pdf.py               |  47 ++++-
 website/views/serve_pdf.py                    |  23 ++-
 5 files changed, 339 insertions(+), 2 deletions(-)
 create mode 100644 website/management/commands/restandardize_artifact_filenames.py
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 05d7ffeb..fc2e7594 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -163,6 +163,11 @@ echo "4.10 Running 'python manage.py setup_admin_groups' to create/refresh the E
 echo "******************************************"
 python manage.py setup_admin_groups
 
+echo "****************** STEP 4.10b/5: docker-entrypoint.sh ************************"
+echo "4.10b Running 'python manage.py restandardize_artifact_filenames' to rename legacy talk/poster/pub files to the standardized scheme (#1401)"
+echo "******************************************"
+python manage.py restandardize_artifact_filenames
+
 # echo "****************** STEP 4.3/5: docker-entrypoint.sh ************************"
 # echo "4.3 Running 'python manage.py rename_person_images' to rename person images"
 # echo "******************************************"
diff --git a/website/management/commands/restandardize_artifact_filenames.py b/website/management/commands/restandardize_artifact_filenames.py
new file mode 100644
index 00000000..693a9bcc
--- /dev/null
+++ b/website/management/commands/restandardize_artifact_filenames.py
@@ -0,0 +1,168 @@
+import os
+import logging
+
+from django.core.management.base import BaseCommand
+
+from website.models import Artifact, Talk, Poster, Publication
+
+# This retrieves a Python logging instance (or creates it)
+_logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    help = (
+        "Re-standardizes legacy talk/poster/publication filenames that were "
+        "never renamed to the Author_TitleInTitleCase_VenueYear scheme (issue "
+        "#1401). Production has many such rows (bulk-imported, so they never "
+        "went through an authored Artifact.save()). This reuses the existing, "
+        "now-correct rename path: when Artifact.do_filenames_need_updating() is "
+        "True it calls artifact.save(), which renames the pdf_file, raw_file, "
+        "and thumbnail on disk AND in the DB together. The original upload name "
+        "is preserved (it was captured into original_*_filename by "
+        "backfill_original_filenames / #1391 before this runs). Idempotent: "
+        "once a row is standardized the check returns False, so re-runs do "
+        "nothing. Safe to run on every container start."
+    )
+
+    # The concrete artifact models this covers. Posters are already
+    # standardized in prod, but included for completeness/future data.
+    MODELS = (Talk, Poster, Publication)
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="Report what would be renamed without touching disk or DB.",
+        )
+
+    def handle(self, *args, **options):
+        dry_run = options["dry_run"]
+        _logger.debug(
+            f"Running restandardize_artifact_filenames.py (dry_run={dry_run}) "
+            f"to rename legacy artifact files to the standardized scheme."
+        )
+
+        total_renamed = 0
+        total_skipped = 0
+        total_errors = 0
+        for model in self.MODELS:
+            renamed, skipped, errors = self._restandardize_model(model, dry_run)
+            total_renamed += renamed
+            total_skipped += skipped
+            total_errors += errors
+
+        verb = "Would rename" if dry_run else "Renamed"
+        _logger.info(
+            f"restandardize_artifact_filenames: {verb} {total_renamed} "
+            f"artifact(s); skipped {total_skipped} (already standardized or no "
+            f"usable name); {total_errors} row(s) errored and were skipped."
+        )
+        _logger.debug("Completed restandardize_artifact_filenames.py")
+
+    def _restandardize_model(self, model, dry_run):
+        """Re-standardize one concrete artifact model.
+
+        Returns a ``(num_renamed, num_skipped, num_errors)`` tuple. Each row is
+        processed inside its own try/except so a single malformed row (e.g. a
+        null ``date``/``title``, which makes ``generate_filename`` raise via
+        ``date.year`` / ``title.title()``) can't abort the batch and leave the
+        rest of the dataset untouched. The entrypoint has no ``set -e``, so an
+        aborted run would fail silently.
+        """
+        num_renamed = 0
+        num_skipped = 0
+        num_errors = 0
+        # prefetch_related('authors') because the rename path reads the first
+        # author's last name (generate_filename) and the rename only fires when
+        # authors exist.
+        for artifact in model.objects.prefetch_related("authors").all():
+            try:
+                if self._restandardize_row(artifact, dry_run):
+                    num_renamed += 1
+                else:
+                    num_skipped += 1
+            except Exception:
+                _logger.exception(
+                    "restandardize_artifact_filenames: skipping %s id=%s due to "
+                    "an error", model.__name__, getattr(artifact, "pk", "?"),
+                )
+                num_errors += 1
+
+        return num_renamed, num_skipped, num_errors
+
+    def _restandardize_row(self, artifact, dry_run):
+        """Re-standardize one artifact's files if needed.
+
+        Returns True if it was (or would be) renamed, False if skipped. Raises
+        on malformed data — the caller isolates that.
+        """
+        model_name = type(artifact).__name__
+
+        # A standardized name needs an author, a title, and a date. Without
+        # them generate_filename() can't produce a sensible name (and would
+        # raise on null date/title), so leave the row untouched rather than
+        # rename it to something degraded.
+        if not artifact.title or not artifact.date:
+            _logger.debug(
+                f"Skipping {model_name} id={artifact.pk}: missing title/date."
+            )
+            return False
+        if not artifact.authors.exists():
+            # Artifact.save() only renames when authors exist (it needs the
+            # first author's last name), so a save() here would be a no-op.
+            _logger.debug(
+                f"Skipping {model_name} id={artifact.pk}: no authors."
+            )
+            return False
+
+        if not self._needs_restandardizing(artifact):
+            return False
+
+        if dry_run:
+            new_name = Artifact.generate_filename(artifact)
+            _logger.debug(
+                f"[dry-run] Would re-standardize {model_name} id={artifact.pk} "
+                f"to '{new_name}' (pdf='{artifact.pdf_file.name if artifact.pdf_file else None}', "
+                f"raw='{artifact.raw_file.name if artifact.raw_file else None}')"
+            )
+            return True
+
+        # Reuse the canonical rename path: save() renames pdf_file, raw_file,
+        # and thumbnail on disk and in the DB, and leaves original_*_filename
+        # untouched (it only captures on a new upload, not a no-update_fields
+        # save), so the provenance recorded by #1391 is preserved.
+        artifact.save()
+        _logger.debug(
+            f"Re-standardized {model_name} id={artifact.pk} to "
+            f"'{Artifact.generate_filename(artifact)}'."
+        )
+        return True
+
+    @staticmethod
+    def _needs_restandardizing(artifact):
+        """Whether the pdf_file/raw_file need standardizing, tolerant of the
+        uniqueness suffix.
+
+        This is NOT ``Artifact.do_filenames_need_updating``: that compares the
+        basename to ``generate_filename`` for exact equality, so a file whose
+        standardized name collided on disk and got a ``-<timestamp>`` suffix
+        (``ensure_filename_is_unique``) reads as "needs updating" forever and
+        would be re-renamed on every run — churning duplicate-name artifacts'
+        filenames on every deploy. Here a name that equals the standardized
+        base OR is a ``-<suffix>`` variant of it counts as already standardized,
+        which keeps the command idempotent.
+        """
+        standardized = Artifact.generate_filename(artifact)
+        for file_attr in ("pdf_file", "raw_file"):
+            file_field = getattr(artifact, file_attr)
+            if not file_field:
+                continue
+            current_no_ext = os.path.splitext(
+                os.path.basename(file_field.name))[0]
+            is_standardized = (
+                current_no_ext == standardized
+                or current_no_ext.startswith(standardized + "-")
+            )
+            if not is_standardized:
+                return True
+        return False
diff --git a/website/tests/test_artifact.py b/website/tests/test_artifact.py
index 582cc48b..4269910d 100644
--- a/website/tests/test_artifact.py
+++ b/website/tests/test_artifact.py
@@ -4,6 +4,8 @@
 from datetime import date
 from unittest.mock import MagicMock, patch
 
+from django.core.files.base import ContentFile
+from django.core.files.storage import default_storage
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.core.management import call_command
 from django.test import SimpleTestCase
@@ -357,3 +359,99 @@ def test_pdf_only_omits_raw_row(self):
     def test_placeholder_when_nothing_recorded(self):
         html = self._render(pdf=None, raw=None)
         self.assertIn("Not recorded", html)
+
+
+# --- #1401: re-standardize legacy filenames -------------------------------
+
+
+class RestandardizeArtifactFilenamesTests(DatabaseTestCase):
+    """
+    Tests for the restandardize_artifact_filenames command (#1401), which
+    renames legacy never-renamed files to the Author_Title_Venue scheme by
+    reusing Artifact.save(). See also the #1391 capture tests above.
+    """
+
+    def _legacy_talk(self, last_name="Kim", title="My Talk", year=2019,
+                     base="Original_Upload_v2"):
+        """
+        Build a talk with authors but NON-standard files actually present on
+        disk (so the os.rename in save() has something to rename). The factory
+        auto-standardizes on create, so we drop real files at non-standard
+        paths and repoint the row at them, mimicking a never-renamed import.
+        Returns (talk, pdf_basename, raw_basename).
+        """
+        person = self.make_person(last_name=last_name)
+        talk = TalkFactory(title=title, forum_name="CHI",
+                           date=date(year, 1, 1), authors=[person])
+        pdf_name = default_storage.save(
+            f"talks/{base}.pdf", ContentFile(b"%PDF-1.4 x"))
+        raw_name = default_storage.save(
+            f"talks/{base}.pptx", ContentFile(b"PKx"))
+        Talk.objects.filter(pk=talk.pk).update(
+            pdf_file=pdf_name, raw_file=raw_name,
+            original_pdf_filename=os.path.basename(pdf_name),
+            original_raw_filename=os.path.basename(raw_name),
+        )
+        talk.refresh_from_db()
+        return talk, os.path.basename(pdf_name), os.path.basename(raw_name)
+
+    def test_renames_pdf_and_raw_preserving_original_and_idempotent(self):
+        talk, pdf_base, raw_base = self._legacy_talk()
+        orig_pdf = talk.original_pdf_filename
+        orig_raw = talk.original_raw_filename
+
+        call_command("restandardize_artifact_filenames")
+
+        talk.refresh_from_db()
+        # Both files renamed away from the original toward the standardized
+        # scheme (which contains the author last name), on disk and in the DB.
+        self.assertNotIn("Original_Upload", talk.pdf_file.name)
+        self.assertNotIn("Original_Upload", talk.raw_file.name)
+        self.assertIn("Kim", os.path.basename(talk.pdf_file.name))
+        self.assertIn("Kim", os.path.basename(talk.raw_file.name))
+        self.assertTrue(default_storage.exists(talk.pdf_file.name))
+        self.assertTrue(default_storage.exists(talk.raw_file.name))
+        # Provenance preserved (not clobbered by the rename).
+        self.assertEqual(talk.original_pdf_filename, orig_pdf)
+        self.assertEqual(talk.original_raw_filename, orig_raw)
+
+        # Idempotent: a second run leaves the now-standardized names alone.
+        pdf_after = talk.pdf_file.name
+        raw_after = talk.raw_file.name
+        call_command("restandardize_artifact_filenames")
+        talk.refresh_from_db()
+        self.assertEqual(talk.pdf_file.name, pdf_after)
+        self.assertEqual(talk.raw_file.name, raw_after)
+
+    def test_already_standardized_talk_is_untouched(self):
+        # A normally-created talk is auto-standardized on save, so the command
+        # should find nothing to do and leave its filename unchanged.
+        person = self.make_person(last_name="Lee")
+        talk = TalkFactory(title="Standard Talk", forum_name="CHI",
+                           date=date(2021, 1, 1), authors=[person])
+        before = talk.pdf_file.name
+
+        call_command("restandardize_artifact_filenames")
+
+        talk.refresh_from_db()
+        self.assertEqual(talk.pdf_file.name, before)
+
+    def test_malformed_row_is_skipped_and_batch_continues(self):
+        # A null-date talk can't form a standardized name; it must be skipped
+        # (not renamed, no crash) while a good legacy row in the same run is
+        # still re-standardized.
+        bad = TalkFactory(title="Bad Row", forum_name="CHI", date=None,
+                          pdf_file=_pdf("bad_upload.pdf"))
+        bad_name = bad.pdf_file.name
+        good, _, _ = self._legacy_talk(last_name="Park", title="Good Talk",
+                                       base="Good_Original_v1")
+
+        # Must not raise despite the malformed row.
+        call_command("restandardize_artifact_filenames")
+
+        good.refresh_from_db()
+        bad.refresh_from_db()
+        self.assertIn("Park", os.path.basename(good.pdf_file.name))
+        self.assertNotIn("Good_Original", good.pdf_file.name)
+        # The malformed row is left exactly as it was.
+        self.assertEqual(bad.pdf_file.name, bad_name)
diff --git a/website/tests/test_serve_pdf.py b/website/tests/test_serve_pdf.py
index a7c2f990..62477fbd 100644
--- a/website/tests/test_serve_pdf.py
+++ b/website/tests/test_serve_pdf.py
@@ -4,6 +4,17 @@
 
 from django.test import SimpleTestCase
 
+from website.models import Publication
+from website.tests.base import DatabaseTestCase
+
+
+def _miss_original_fallback(mock_pub):
+    """Stub the #1401 original_pdf_filename fallback chain
+    (``filter(...).exclude(...).exclude(...).first()``) to miss, so tests
+    exercising the exact/fuzzy/404 paths fall through it as before."""
+    qs = mock_pub.objects.filter.return_value
+    qs.exclude.return_value.exclude.return_value.first.return_value = None
+
 
 class ServePdfTests(SimpleTestCase):
     """
@@ -56,9 +67,13 @@ def test_uses_iendswith_not_icontains(self):
             return_value=None,
         ):
             MockPub.objects.filter.return_value.first.return_value = None
+            _miss_original_fallback(MockPub)
             with self.assertRaises(Http404):
                 serve_pdf(MagicMock(), ".pdf")
-            MockPub.objects.filter.assert_called_with(pdf_file__iendswith=".pdf")
+            # assert_any_call (not assert_called_with): the original-filename
+            # fallback issues a later filter() call, so the iendswith call is
+            # no longer the most recent one.
+            MockPub.objects.filter.assert_any_call(pdf_file__iendswith=".pdf")
 
     def test_no_exact_match_uses_fuzzy_redirect(self):
         """
@@ -72,6 +87,7 @@ def test_no_exact_match_uses_fuzzy_redirect(self):
             return_value="publications/Froehlich2018Updated.pdf",
         ):
             MockPub.objects.filter.return_value.first.return_value = None
+            _miss_original_fallback(MockPub)
             response = serve_pdf(MagicMock(), "Froehlich2018Old.pdf")
         self.assertEqual(response.status_code, 302)
         self.assertEqual(response.url, "/media/publications/Froehlich2018Updated.pdf")
@@ -85,5 +101,34 @@ def test_no_exact_no_fuzzy_returns_404(self):
             return_value=None,
         ):
             MockPub.objects.filter.return_value.first.return_value = None
+            _miss_original_fallback(MockPub)
             with self.assertRaises(Http404):
                 serve_pdf(MagicMock(), "NoSuchPaper.pdf")
+
+
+class ServePdfOriginalFilenameFallbackTests(DatabaseTestCase):
+    """
+    DB-backed test for the #1401 original_pdf_filename fallback: after a
+    publication PDF is re-standardized, a stale external link to its old
+    (original upload) filename must still resolve. serve_pdf matches the
+    requested name against the captured ``original_pdf_filename`` and redirects
+    to the current file — an exact resolution, before the difflib guess.
+    """
+
+    def test_old_filename_redirects_to_current_via_original_pdf_filename(self):
+        from website.views.serve_pdf import serve_pdf
+        pub = self.make_publication(title="Gamifying Green", year=2013)
+        # Simulate a re-standardized pub: current file is the standardized
+        # name; the old upload name is preserved in original_pdf_filename.
+        Publication.objects.filter(pk=pub.pk).update(
+            pdf_file="publications/Froehlich_GamifyingGreen_CHI2013.pdf",
+            original_pdf_filename="Gamifying_Green_yY7Jx99.pdf",
+        )
+
+        response = serve_pdf(MagicMock(), "Gamifying_Green_yY7Jx99.pdf")
+
+        self.assertEqual(response.status_code, 302)
+        self.assertEqual(
+            response.url,
+            "/media/publications/Froehlich_GamifyingGreen_CHI2013.pdf",
+        )
diff --git a/website/views/serve_pdf.py b/website/views/serve_pdf.py
index b67afcca..f1778bd4 100644
--- a/website/views/serve_pdf.py
+++ b/website/views/serve_pdf.py
@@ -45,7 +45,28 @@ def serve_pdf(request, filename):
         response['Content-Disposition'] = f'inline;filename={basename}'
         return response
 
-    # No exact match — try fuzzy fallback for stale external links.
+    # No exact match. Before the fuzzy guess, try an EXACT match on the
+    # captured original upload name (#1391/#1401): when a publication's PDF was
+    # re-standardized (Author_Title_Venue), its previous on-disk basename was
+    # saved to ``original_pdf_filename``. A stale external link still points at
+    # that old name, so match it there and redirect to the current file. This
+    # resolves renamed-paper links exactly, rather than relying on the difflib
+    # similarity guess below.
+    requested_basename = os.path.basename(filename)
+    renamed = (Publication.objects
+               .filter(original_pdf_filename__iexact=requested_basename)
+               .exclude(pdf_file="")
+               .exclude(pdf_file__isnull=True)
+               .first())
+    if renamed is not None:
+        current_basename = os.path.basename(renamed.pdf_file.name)
+        _logger.debug(
+            f"{filename} matched original_pdf_filename of pub id={renamed.pk}; "
+            f"redirecting to /media/publications/{current_basename}"
+        )
+        return redirect(f'/media/publications/{current_basename}')
+
+    # Still no match — try fuzzy fallback for stale external links.
     _logger.debug(f"{filename} not found exactly; trying fuzzy match")
     closest = get_closest_filename_from_database(filename, 0.7)
     if closest: