From bd4bf5229ce88a5d78b2f9206277a864c3b22113 Mon Sep 17 00:00:00 2001 From: Jon Froehlich Date: Thu, 25 Jun 2026 18:42:59 -0700 Subject: [PATCH] Re-standardize legacy artifact filenames (#1401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many bulk-imported talk/poster/publication files were never renamed to the Author_TitleInTitleCase_VenueYear scheme (they never went through an authored Artifact.save(), and the historical fix-up one-shots are commented out of the entrypoint). #1391 already captured their original names into original_*_filename on prod, so re-standardizing is now safe (provenance preserved). - New command restandardize_artifact_filenames: for each Talk/Poster/Publication with authors + a title + a date whose files aren't standardized, call artifact.save() — reusing the existing, now-correct rename of pdf_file, raw_file, and thumbnail on disk AND in the DB. Per-row try/except (one bad row can't abort the batch), --dry-run, summary log. - Uses a suffix-tolerant "needs standardizing" gate (not Artifact.do_filenames_need_updating): a standardized name that collided on disk gets a "-" suffix, which exact-match comparison would treat as needing rename forever — churning duplicate-name artifacts' filenames on every deploy. The gate treats "Name-" as already standardized, so the command is idempotent. - artifact.save() with no update_fields leaves original_*_filename untouched, so the #1391 provenance survives the rename. - Wired into docker-entrypoint.sh as step 4.10b (after the 4.7b backfill, so originals are captured before any rename). Idempotent; safe every start. - serve_pdf: before the difflib fuzzy guess, add an exact fallback matching the requested basename against Publication.original_pdf_filename and redirecting to the current file — so stale external links to renamed publication PDFs resolve exactly. - Tests: rename pdf+raw on disk+DB preserving original_* and idempotent; already-standardized untouched; malformed (null-date) row skipped without aborting the batch; serve_pdf original-filename redirect. Full suite: 556 OK. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker-entrypoint.sh | 5 + .../restandardize_artifact_filenames.py | 168 ++++++++++++++++++ website/tests/test_artifact.py | 98 ++++++++++ website/tests/test_serve_pdf.py | 47 ++++- website/views/serve_pdf.py | 23 ++- 5 files changed, 339 insertions(+), 2 deletions(-) create mode 100644 website/management/commands/restandardize_artifact_filenames.py diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index 05d7ffeb..fc2e7594 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -163,6 +163,11 @@ echo "4.10 Running 'python manage.py setup_admin_groups' to create/refresh the E echo "******************************************" python manage.py setup_admin_groups +echo "****************** STEP 4.10b/5: docker-entrypoint.sh ************************" +echo "4.10b Running 'python manage.py restandardize_artifact_filenames' to rename legacy talk/poster/pub files to the standardized scheme (#1401)" +echo "******************************************" +python manage.py restandardize_artifact_filenames + # echo "****************** STEP 4.3/5: docker-entrypoint.sh ************************" # echo "4.3 Running 'python manage.py rename_person_images' to rename person images" # echo "******************************************" diff --git a/website/management/commands/restandardize_artifact_filenames.py b/website/management/commands/restandardize_artifact_filenames.py new file mode 100644 index 00000000..693a9bcc --- /dev/null +++ b/website/management/commands/restandardize_artifact_filenames.py @@ -0,0 +1,168 @@ +import os +import logging + +from django.core.management.base import BaseCommand + +from website.models import Artifact, Talk, Poster, Publication + +# This retrieves a Python logging instance (or creates it) +_logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = ( + "Re-standardizes legacy talk/poster/publication filenames that were " + "never renamed to the Author_TitleInTitleCase_VenueYear scheme (issue " + "#1401). Production has many such rows (bulk-imported, so they never " + "went through an authored Artifact.save()). This reuses the existing, " + "now-correct rename path: when Artifact.do_filenames_need_updating() is " + "True it calls artifact.save(), which renames the pdf_file, raw_file, " + "and thumbnail on disk AND in the DB together. The original upload name " + "is preserved (it was captured into original_*_filename by " + "backfill_original_filenames / #1391 before this runs). Idempotent: " + "once a row is standardized the check returns False, so re-runs do " + "nothing. Safe to run on every container start." + ) + + # The concrete artifact models this covers. Posters are already + # standardized in prod, but included for completeness/future data. + MODELS = (Talk, Poster, Publication) + + def add_arguments(self, parser): + parser.add_argument( + "--dry-run", + action="store_true", + help="Report what would be renamed without touching disk or DB.", + ) + + def handle(self, *args, **options): + dry_run = options["dry_run"] + _logger.debug( + f"Running restandardize_artifact_filenames.py (dry_run={dry_run}) " + f"to rename legacy artifact files to the standardized scheme." + ) + + total_renamed = 0 + total_skipped = 0 + total_errors = 0 + for model in self.MODELS: + renamed, skipped, errors = self._restandardize_model(model, dry_run) + total_renamed += renamed + total_skipped += skipped + total_errors += errors + + verb = "Would rename" if dry_run else "Renamed" + _logger.info( + f"restandardize_artifact_filenames: {verb} {total_renamed} " + f"artifact(s); skipped {total_skipped} (already standardized or no " + f"usable name); {total_errors} row(s) errored and were skipped." + ) + _logger.debug("Completed restandardize_artifact_filenames.py") + + def _restandardize_model(self, model, dry_run): + """Re-standardize one concrete artifact model. + + Returns a ``(num_renamed, num_skipped, num_errors)`` tuple. Each row is + processed inside its own try/except so a single malformed row (e.g. a + null ``date``/``title``, which makes ``generate_filename`` raise via + ``date.year`` / ``title.title()``) can't abort the batch and leave the + rest of the dataset untouched. The entrypoint has no ``set -e``, so an + aborted run would fail silently. + """ + num_renamed = 0 + num_skipped = 0 + num_errors = 0 + # prefetch_related('authors') because the rename path reads the first + # author's last name (generate_filename) and the rename only fires when + # authors exist. + for artifact in model.objects.prefetch_related("authors").all(): + try: + if self._restandardize_row(artifact, dry_run): + num_renamed += 1 + else: + num_skipped += 1 + except Exception: + _logger.exception( + "restandardize_artifact_filenames: skipping %s id=%s due to " + "an error", model.__name__, getattr(artifact, "pk", "?"), + ) + num_errors += 1 + + return num_renamed, num_skipped, num_errors + + def _restandardize_row(self, artifact, dry_run): + """Re-standardize one artifact's files if needed. + + Returns True if it was (or would be) renamed, False if skipped. Raises + on malformed data — the caller isolates that. + """ + model_name = type(artifact).__name__ + + # A standardized name needs an author, a title, and a date. Without + # them generate_filename() can't produce a sensible name (and would + # raise on null date/title), so leave the row untouched rather than + # rename it to something degraded. + if not artifact.title or not artifact.date: + _logger.debug( + f"Skipping {model_name} id={artifact.pk}: missing title/date." + ) + return False + if not artifact.authors.exists(): + # Artifact.save() only renames when authors exist (it needs the + # first author's last name), so a save() here would be a no-op. + _logger.debug( + f"Skipping {model_name} id={artifact.pk}: no authors." + ) + return False + + if not self._needs_restandardizing(artifact): + return False + + if dry_run: + new_name = Artifact.generate_filename(artifact) + _logger.debug( + f"[dry-run] Would re-standardize {model_name} id={artifact.pk} " + f"to '{new_name}' (pdf='{artifact.pdf_file.name if artifact.pdf_file else None}', " + f"raw='{artifact.raw_file.name if artifact.raw_file else None}')" + ) + return True + + # Reuse the canonical rename path: save() renames pdf_file, raw_file, + # and thumbnail on disk and in the DB, and leaves original_*_filename + # untouched (it only captures on a new upload, not a no-update_fields + # save), so the provenance recorded by #1391 is preserved. + artifact.save() + _logger.debug( + f"Re-standardized {model_name} id={artifact.pk} to " + f"'{Artifact.generate_filename(artifact)}'." + ) + return True + + @staticmethod + def _needs_restandardizing(artifact): + """Whether the pdf_file/raw_file need standardizing, tolerant of the + uniqueness suffix. + + This is NOT ``Artifact.do_filenames_need_updating``: that compares the + basename to ``generate_filename`` for exact equality, so a file whose + standardized name collided on disk and got a ``-`` suffix + (``ensure_filename_is_unique``) reads as "needs updating" forever and + would be re-renamed on every run — churning duplicate-name artifacts' + filenames on every deploy. Here a name that equals the standardized + base OR is a ``-`` variant of it counts as already standardized, + which keeps the command idempotent. + """ + standardized = Artifact.generate_filename(artifact) + for file_attr in ("pdf_file", "raw_file"): + file_field = getattr(artifact, file_attr) + if not file_field: + continue + current_no_ext = os.path.splitext( + os.path.basename(file_field.name))[0] + is_standardized = ( + current_no_ext == standardized + or current_no_ext.startswith(standardized + "-") + ) + if not is_standardized: + return True + return False diff --git a/website/tests/test_artifact.py b/website/tests/test_artifact.py index 582cc48b..4269910d 100644 --- a/website/tests/test_artifact.py +++ b/website/tests/test_artifact.py @@ -4,6 +4,8 @@ from datetime import date from unittest.mock import MagicMock, patch +from django.core.files.base import ContentFile +from django.core.files.storage import default_storage from django.core.files.uploadedfile import SimpleUploadedFile from django.core.management import call_command from django.test import SimpleTestCase @@ -357,3 +359,99 @@ def test_pdf_only_omits_raw_row(self): def test_placeholder_when_nothing_recorded(self): html = self._render(pdf=None, raw=None) self.assertIn("Not recorded", html) + + +# --- #1401: re-standardize legacy filenames ------------------------------- + + +class RestandardizeArtifactFilenamesTests(DatabaseTestCase): + """ + Tests for the restandardize_artifact_filenames command (#1401), which + renames legacy never-renamed files to the Author_Title_Venue scheme by + reusing Artifact.save(). See also the #1391 capture tests above. + """ + + def _legacy_talk(self, last_name="Kim", title="My Talk", year=2019, + base="Original_Upload_v2"): + """ + Build a talk with authors but NON-standard files actually present on + disk (so the os.rename in save() has something to rename). The factory + auto-standardizes on create, so we drop real files at non-standard + paths and repoint the row at them, mimicking a never-renamed import. + Returns (talk, pdf_basename, raw_basename). + """ + person = self.make_person(last_name=last_name) + talk = TalkFactory(title=title, forum_name="CHI", + date=date(year, 1, 1), authors=[person]) + pdf_name = default_storage.save( + f"talks/{base}.pdf", ContentFile(b"%PDF-1.4 x")) + raw_name = default_storage.save( + f"talks/{base}.pptx", ContentFile(b"PKx")) + Talk.objects.filter(pk=talk.pk).update( + pdf_file=pdf_name, raw_file=raw_name, + original_pdf_filename=os.path.basename(pdf_name), + original_raw_filename=os.path.basename(raw_name), + ) + talk.refresh_from_db() + return talk, os.path.basename(pdf_name), os.path.basename(raw_name) + + def test_renames_pdf_and_raw_preserving_original_and_idempotent(self): + talk, pdf_base, raw_base = self._legacy_talk() + orig_pdf = talk.original_pdf_filename + orig_raw = talk.original_raw_filename + + call_command("restandardize_artifact_filenames") + + talk.refresh_from_db() + # Both files renamed away from the original toward the standardized + # scheme (which contains the author last name), on disk and in the DB. + self.assertNotIn("Original_Upload", talk.pdf_file.name) + self.assertNotIn("Original_Upload", talk.raw_file.name) + self.assertIn("Kim", os.path.basename(talk.pdf_file.name)) + self.assertIn("Kim", os.path.basename(talk.raw_file.name)) + self.assertTrue(default_storage.exists(talk.pdf_file.name)) + self.assertTrue(default_storage.exists(talk.raw_file.name)) + # Provenance preserved (not clobbered by the rename). + self.assertEqual(talk.original_pdf_filename, orig_pdf) + self.assertEqual(talk.original_raw_filename, orig_raw) + + # Idempotent: a second run leaves the now-standardized names alone. + pdf_after = talk.pdf_file.name + raw_after = talk.raw_file.name + call_command("restandardize_artifact_filenames") + talk.refresh_from_db() + self.assertEqual(talk.pdf_file.name, pdf_after) + self.assertEqual(talk.raw_file.name, raw_after) + + def test_already_standardized_talk_is_untouched(self): + # A normally-created talk is auto-standardized on save, so the command + # should find nothing to do and leave its filename unchanged. + person = self.make_person(last_name="Lee") + talk = TalkFactory(title="Standard Talk", forum_name="CHI", + date=date(2021, 1, 1), authors=[person]) + before = talk.pdf_file.name + + call_command("restandardize_artifact_filenames") + + talk.refresh_from_db() + self.assertEqual(talk.pdf_file.name, before) + + def test_malformed_row_is_skipped_and_batch_continues(self): + # A null-date talk can't form a standardized name; it must be skipped + # (not renamed, no crash) while a good legacy row in the same run is + # still re-standardized. + bad = TalkFactory(title="Bad Row", forum_name="CHI", date=None, + pdf_file=_pdf("bad_upload.pdf")) + bad_name = bad.pdf_file.name + good, _, _ = self._legacy_talk(last_name="Park", title="Good Talk", + base="Good_Original_v1") + + # Must not raise despite the malformed row. + call_command("restandardize_artifact_filenames") + + good.refresh_from_db() + bad.refresh_from_db() + self.assertIn("Park", os.path.basename(good.pdf_file.name)) + self.assertNotIn("Good_Original", good.pdf_file.name) + # The malformed row is left exactly as it was. + self.assertEqual(bad.pdf_file.name, bad_name) diff --git a/website/tests/test_serve_pdf.py b/website/tests/test_serve_pdf.py index a7c2f990..62477fbd 100644 --- a/website/tests/test_serve_pdf.py +++ b/website/tests/test_serve_pdf.py @@ -4,6 +4,17 @@ from django.test import SimpleTestCase +from website.models import Publication +from website.tests.base import DatabaseTestCase + + +def _miss_original_fallback(mock_pub): + """Stub the #1401 original_pdf_filename fallback chain + (``filter(...).exclude(...).exclude(...).first()``) to miss, so tests + exercising the exact/fuzzy/404 paths fall through it as before.""" + qs = mock_pub.objects.filter.return_value + qs.exclude.return_value.exclude.return_value.first.return_value = None + class ServePdfTests(SimpleTestCase): """ @@ -56,9 +67,13 @@ def test_uses_iendswith_not_icontains(self): return_value=None, ): MockPub.objects.filter.return_value.first.return_value = None + _miss_original_fallback(MockPub) with self.assertRaises(Http404): serve_pdf(MagicMock(), ".pdf") - MockPub.objects.filter.assert_called_with(pdf_file__iendswith=".pdf") + # assert_any_call (not assert_called_with): the original-filename + # fallback issues a later filter() call, so the iendswith call is + # no longer the most recent one. + MockPub.objects.filter.assert_any_call(pdf_file__iendswith=".pdf") def test_no_exact_match_uses_fuzzy_redirect(self): """ @@ -72,6 +87,7 @@ def test_no_exact_match_uses_fuzzy_redirect(self): return_value="publications/Froehlich2018Updated.pdf", ): MockPub.objects.filter.return_value.first.return_value = None + _miss_original_fallback(MockPub) response = serve_pdf(MagicMock(), "Froehlich2018Old.pdf") self.assertEqual(response.status_code, 302) self.assertEqual(response.url, "/media/publications/Froehlich2018Updated.pdf") @@ -85,5 +101,34 @@ def test_no_exact_no_fuzzy_returns_404(self): return_value=None, ): MockPub.objects.filter.return_value.first.return_value = None + _miss_original_fallback(MockPub) with self.assertRaises(Http404): serve_pdf(MagicMock(), "NoSuchPaper.pdf") + + +class ServePdfOriginalFilenameFallbackTests(DatabaseTestCase): + """ + DB-backed test for the #1401 original_pdf_filename fallback: after a + publication PDF is re-standardized, a stale external link to its old + (original upload) filename must still resolve. serve_pdf matches the + requested name against the captured ``original_pdf_filename`` and redirects + to the current file — an exact resolution, before the difflib guess. + """ + + def test_old_filename_redirects_to_current_via_original_pdf_filename(self): + from website.views.serve_pdf import serve_pdf + pub = self.make_publication(title="Gamifying Green", year=2013) + # Simulate a re-standardized pub: current file is the standardized + # name; the old upload name is preserved in original_pdf_filename. + Publication.objects.filter(pk=pub.pk).update( + pdf_file="publications/Froehlich_GamifyingGreen_CHI2013.pdf", + original_pdf_filename="Gamifying_Green_yY7Jx99.pdf", + ) + + response = serve_pdf(MagicMock(), "Gamifying_Green_yY7Jx99.pdf") + + self.assertEqual(response.status_code, 302) + self.assertEqual( + response.url, + "/media/publications/Froehlich_GamifyingGreen_CHI2013.pdf", + ) diff --git a/website/views/serve_pdf.py b/website/views/serve_pdf.py index b67afcca..f1778bd4 100644 --- a/website/views/serve_pdf.py +++ b/website/views/serve_pdf.py @@ -45,7 +45,28 @@ def serve_pdf(request, filename): response['Content-Disposition'] = f'inline;filename={basename}' return response - # No exact match — try fuzzy fallback for stale external links. + # No exact match. Before the fuzzy guess, try an EXACT match on the + # captured original upload name (#1391/#1401): when a publication's PDF was + # re-standardized (Author_Title_Venue), its previous on-disk basename was + # saved to ``original_pdf_filename``. A stale external link still points at + # that old name, so match it there and redirect to the current file. This + # resolves renamed-paper links exactly, rather than relying on the difflib + # similarity guess below. + requested_basename = os.path.basename(filename) + renamed = (Publication.objects + .filter(original_pdf_filename__iexact=requested_basename) + .exclude(pdf_file="") + .exclude(pdf_file__isnull=True) + .first()) + if renamed is not None: + current_basename = os.path.basename(renamed.pdf_file.name) + _logger.debug( + f"{filename} matched original_pdf_filename of pub id={renamed.pk}; " + f"redirecting to /media/publications/{current_basename}" + ) + return redirect(f'/media/publications/{current_basename}') + + # Still no match — try fuzzy fallback for stale external links. _logger.debug(f"{filename} not found exactly; trying fuzzy match") closest = get_closest_filename_from_database(filename, 0.7) if closest: