Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ echo "4.10 Running 'python manage.py setup_admin_groups' to create/refresh the E
echo "******************************************"
python manage.py setup_admin_groups

echo "****************** STEP 4.10b/5: docker-entrypoint.sh ************************"
echo "4.10b Running 'python manage.py restandardize_artifact_filenames' to rename legacy talk/poster/pub files to the standardized scheme (#1401)"
echo "******************************************"
python manage.py restandardize_artifact_filenames

# echo "****************** STEP 4.3/5: docker-entrypoint.sh ************************"
# echo "4.3 Running 'python manage.py rename_person_images' to rename person images"
# echo "******************************************"
Expand Down
168 changes: 168 additions & 0 deletions website/management/commands/restandardize_artifact_filenames.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import os
import logging

from django.core.management.base import BaseCommand

from website.models import Artifact, Talk, Poster, Publication

# This retrieves a Python logging instance (or creates it)
_logger = logging.getLogger(__name__)


class Command(BaseCommand):
help = (
"Re-standardizes legacy talk/poster/publication filenames that were "
"never renamed to the Author_TitleInTitleCase_VenueYear scheme (issue "
"#1401). Production has many such rows (bulk-imported, so they never "
"went through an authored Artifact.save()). This reuses the existing, "
"now-correct rename path: when Artifact.do_filenames_need_updating() is "
"True it calls artifact.save(), which renames the pdf_file, raw_file, "
"and thumbnail on disk AND in the DB together. The original upload name "
"is preserved (it was captured into original_*_filename by "
"backfill_original_filenames / #1391 before this runs). Idempotent: "
"once a row is standardized the check returns False, so re-runs do "
"nothing. Safe to run on every container start."
)

# The concrete artifact models this covers. Posters are already
# standardized in prod, but included for completeness/future data.
MODELS = (Talk, Poster, Publication)

def add_arguments(self, parser):
parser.add_argument(
"--dry-run",
action="store_true",
help="Report what would be renamed without touching disk or DB.",
)

def handle(self, *args, **options):
dry_run = options["dry_run"]
_logger.debug(
f"Running restandardize_artifact_filenames.py (dry_run={dry_run}) "
f"to rename legacy artifact files to the standardized scheme."
)

total_renamed = 0
total_skipped = 0
total_errors = 0
for model in self.MODELS:
renamed, skipped, errors = self._restandardize_model(model, dry_run)
total_renamed += renamed
total_skipped += skipped
total_errors += errors

verb = "Would rename" if dry_run else "Renamed"
_logger.info(
f"restandardize_artifact_filenames: {verb} {total_renamed} "
f"artifact(s); skipped {total_skipped} (already standardized or no "
f"usable name); {total_errors} row(s) errored and were skipped."
)
_logger.debug("Completed restandardize_artifact_filenames.py")

def _restandardize_model(self, model, dry_run):
"""Re-standardize one concrete artifact model.

Returns a ``(num_renamed, num_skipped, num_errors)`` tuple. Each row is
processed inside its own try/except so a single malformed row (e.g. a
null ``date``/``title``, which makes ``generate_filename`` raise via
``date.year`` / ``title.title()``) can't abort the batch and leave the
rest of the dataset untouched. The entrypoint has no ``set -e``, so an
aborted run would fail silently.
"""
num_renamed = 0
num_skipped = 0
num_errors = 0
# prefetch_related('authors') because the rename path reads the first
# author's last name (generate_filename) and the rename only fires when
# authors exist.
for artifact in model.objects.prefetch_related("authors").all():
try:
if self._restandardize_row(artifact, dry_run):
num_renamed += 1
else:
num_skipped += 1
except Exception:
_logger.exception(
"restandardize_artifact_filenames: skipping %s id=%s due to "
"an error", model.__name__, getattr(artifact, "pk", "?"),
)
num_errors += 1

return num_renamed, num_skipped, num_errors

def _restandardize_row(self, artifact, dry_run):
"""Re-standardize one artifact's files if needed.

Returns True if it was (or would be) renamed, False if skipped. Raises
on malformed data — the caller isolates that.
"""
model_name = type(artifact).__name__

# A standardized name needs an author, a title, and a date. Without
# them generate_filename() can't produce a sensible name (and would
# raise on null date/title), so leave the row untouched rather than
# rename it to something degraded.
if not artifact.title or not artifact.date:
_logger.debug(
f"Skipping {model_name} id={artifact.pk}: missing title/date."
)
return False
if not artifact.authors.exists():
# Artifact.save() only renames when authors exist (it needs the
# first author's last name), so a save() here would be a no-op.
_logger.debug(
f"Skipping {model_name} id={artifact.pk}: no authors."
)
return False

if not self._needs_restandardizing(artifact):
return False

if dry_run:
new_name = Artifact.generate_filename(artifact)
_logger.debug(
f"[dry-run] Would re-standardize {model_name} id={artifact.pk} "
f"to '{new_name}' (pdf='{artifact.pdf_file.name if artifact.pdf_file else None}', "
f"raw='{artifact.raw_file.name if artifact.raw_file else None}')"
)
return True

# Reuse the canonical rename path: save() renames pdf_file, raw_file,
# and thumbnail on disk and in the DB, and leaves original_*_filename
# untouched (it only captures on a new upload, not a no-update_fields
# save), so the provenance recorded by #1391 is preserved.
artifact.save()
_logger.debug(
f"Re-standardized {model_name} id={artifact.pk} to "
f"'{Artifact.generate_filename(artifact)}'."
)
return True

@staticmethod
def _needs_restandardizing(artifact):
"""Whether the pdf_file/raw_file need standardizing, tolerant of the
uniqueness suffix.

This is NOT ``Artifact.do_filenames_need_updating``: that compares the
basename to ``generate_filename`` for exact equality, so a file whose
standardized name collided on disk and got a ``-<timestamp>`` suffix
(``ensure_filename_is_unique``) reads as "needs updating" forever and
would be re-renamed on every run — churning duplicate-name artifacts'
filenames on every deploy. Here a name that equals the standardized
base OR is a ``-<suffix>`` variant of it counts as already standardized,
which keeps the command idempotent.
"""
standardized = Artifact.generate_filename(artifact)
for file_attr in ("pdf_file", "raw_file"):
file_field = getattr(artifact, file_attr)
if not file_field:
continue
current_no_ext = os.path.splitext(
os.path.basename(file_field.name))[0]
is_standardized = (
current_no_ext == standardized
or current_no_ext.startswith(standardized + "-")
)
if not is_standardized:
return True
return False
98 changes: 98 additions & 0 deletions website/tests/test_artifact.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from datetime import date
from unittest.mock import MagicMock, patch

from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.management import call_command
from django.test import SimpleTestCase
Expand Down Expand Up @@ -357,3 +359,99 @@ def test_pdf_only_omits_raw_row(self):
def test_placeholder_when_nothing_recorded(self):
html = self._render(pdf=None, raw=None)
self.assertIn("Not recorded", html)


# --- #1401: re-standardize legacy filenames -------------------------------


class RestandardizeArtifactFilenamesTests(DatabaseTestCase):
"""
Tests for the restandardize_artifact_filenames command (#1401), which
renames legacy never-renamed files to the Author_Title_Venue scheme by
reusing Artifact.save(). See also the #1391 capture tests above.
"""

def _legacy_talk(self, last_name="Kim", title="My Talk", year=2019,
base="Original_Upload_v2"):
"""
Build a talk with authors but NON-standard files actually present on
disk (so the os.rename in save() has something to rename). The factory
auto-standardizes on create, so we drop real files at non-standard
paths and repoint the row at them, mimicking a never-renamed import.
Returns (talk, pdf_basename, raw_basename).
"""
person = self.make_person(last_name=last_name)
talk = TalkFactory(title=title, forum_name="CHI",
date=date(year, 1, 1), authors=[person])
pdf_name = default_storage.save(
f"talks/{base}.pdf", ContentFile(b"%PDF-1.4 x"))
raw_name = default_storage.save(
f"talks/{base}.pptx", ContentFile(b"PKx"))
Talk.objects.filter(pk=talk.pk).update(
pdf_file=pdf_name, raw_file=raw_name,
original_pdf_filename=os.path.basename(pdf_name),
original_raw_filename=os.path.basename(raw_name),
)
talk.refresh_from_db()
return talk, os.path.basename(pdf_name), os.path.basename(raw_name)

def test_renames_pdf_and_raw_preserving_original_and_idempotent(self):
talk, pdf_base, raw_base = self._legacy_talk()
orig_pdf = talk.original_pdf_filename
orig_raw = talk.original_raw_filename

call_command("restandardize_artifact_filenames")

talk.refresh_from_db()
# Both files renamed away from the original toward the standardized
# scheme (which contains the author last name), on disk and in the DB.
self.assertNotIn("Original_Upload", talk.pdf_file.name)
self.assertNotIn("Original_Upload", talk.raw_file.name)
self.assertIn("Kim", os.path.basename(talk.pdf_file.name))
self.assertIn("Kim", os.path.basename(talk.raw_file.name))
self.assertTrue(default_storage.exists(talk.pdf_file.name))
self.assertTrue(default_storage.exists(talk.raw_file.name))
# Provenance preserved (not clobbered by the rename).
self.assertEqual(talk.original_pdf_filename, orig_pdf)
self.assertEqual(talk.original_raw_filename, orig_raw)

# Idempotent: a second run leaves the now-standardized names alone.
pdf_after = talk.pdf_file.name
raw_after = talk.raw_file.name
call_command("restandardize_artifact_filenames")
talk.refresh_from_db()
self.assertEqual(talk.pdf_file.name, pdf_after)
self.assertEqual(talk.raw_file.name, raw_after)

def test_already_standardized_talk_is_untouched(self):
# A normally-created talk is auto-standardized on save, so the command
# should find nothing to do and leave its filename unchanged.
person = self.make_person(last_name="Lee")
talk = TalkFactory(title="Standard Talk", forum_name="CHI",
date=date(2021, 1, 1), authors=[person])
before = talk.pdf_file.name

call_command("restandardize_artifact_filenames")

talk.refresh_from_db()
self.assertEqual(talk.pdf_file.name, before)

def test_malformed_row_is_skipped_and_batch_continues(self):
# A null-date talk can't form a standardized name; it must be skipped
# (not renamed, no crash) while a good legacy row in the same run is
# still re-standardized.
bad = TalkFactory(title="Bad Row", forum_name="CHI", date=None,
pdf_file=_pdf("bad_upload.pdf"))
bad_name = bad.pdf_file.name
good, _, _ = self._legacy_talk(last_name="Park", title="Good Talk",
base="Good_Original_v1")

# Must not raise despite the malformed row.
call_command("restandardize_artifact_filenames")

good.refresh_from_db()
bad.refresh_from_db()
self.assertIn("Park", os.path.basename(good.pdf_file.name))
self.assertNotIn("Good_Original", good.pdf_file.name)
# The malformed row is left exactly as it was.
self.assertEqual(bad.pdf_file.name, bad_name)
47 changes: 46 additions & 1 deletion website/tests/test_serve_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@

from django.test import SimpleTestCase

from website.models import Publication
from website.tests.base import DatabaseTestCase


def _miss_original_fallback(mock_pub):
"""Stub the #1401 original_pdf_filename fallback chain
(``filter(...).exclude(...).exclude(...).first()``) to miss, so tests
exercising the exact/fuzzy/404 paths fall through it as before."""
qs = mock_pub.objects.filter.return_value
qs.exclude.return_value.exclude.return_value.first.return_value = None


class ServePdfTests(SimpleTestCase):
"""
Expand Down Expand Up @@ -56,9 +67,13 @@ def test_uses_iendswith_not_icontains(self):
return_value=None,
):
MockPub.objects.filter.return_value.first.return_value = None
_miss_original_fallback(MockPub)
with self.assertRaises(Http404):
serve_pdf(MagicMock(), ".pdf")
MockPub.objects.filter.assert_called_with(pdf_file__iendswith=".pdf")
# assert_any_call (not assert_called_with): the original-filename
# fallback issues a later filter() call, so the iendswith call is
# no longer the most recent one.
MockPub.objects.filter.assert_any_call(pdf_file__iendswith=".pdf")

def test_no_exact_match_uses_fuzzy_redirect(self):
"""
Expand All @@ -72,6 +87,7 @@ def test_no_exact_match_uses_fuzzy_redirect(self):
return_value="publications/Froehlich2018Updated.pdf",
):
MockPub.objects.filter.return_value.first.return_value = None
_miss_original_fallback(MockPub)
response = serve_pdf(MagicMock(), "Froehlich2018Old.pdf")
self.assertEqual(response.status_code, 302)
self.assertEqual(response.url, "/media/publications/Froehlich2018Updated.pdf")
Expand All @@ -85,5 +101,34 @@ def test_no_exact_no_fuzzy_returns_404(self):
return_value=None,
):
MockPub.objects.filter.return_value.first.return_value = None
_miss_original_fallback(MockPub)
with self.assertRaises(Http404):
serve_pdf(MagicMock(), "NoSuchPaper.pdf")


class ServePdfOriginalFilenameFallbackTests(DatabaseTestCase):
"""
DB-backed test for the #1401 original_pdf_filename fallback: after a
publication PDF is re-standardized, a stale external link to its old
(original upload) filename must still resolve. serve_pdf matches the
requested name against the captured ``original_pdf_filename`` and redirects
to the current file — an exact resolution, before the difflib guess.
"""

def test_old_filename_redirects_to_current_via_original_pdf_filename(self):
from website.views.serve_pdf import serve_pdf
pub = self.make_publication(title="Gamifying Green", year=2013)
# Simulate a re-standardized pub: current file is the standardized
# name; the old upload name is preserved in original_pdf_filename.
Publication.objects.filter(pk=pub.pk).update(
pdf_file="publications/Froehlich_GamifyingGreen_CHI2013.pdf",
original_pdf_filename="Gamifying_Green_yY7Jx99.pdf",
)

response = serve_pdf(MagicMock(), "Gamifying_Green_yY7Jx99.pdf")

self.assertEqual(response.status_code, 302)
self.assertEqual(
response.url,
"/media/publications/Froehlich_GamifyingGreen_CHI2013.pdf",
)
23 changes: 22 additions & 1 deletion website/views/serve_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,28 @@ def serve_pdf(request, filename):
response['Content-Disposition'] = f'inline;filename={basename}'
return response

# No exact match — try fuzzy fallback for stale external links.
# No exact match. Before the fuzzy guess, try an EXACT match on the
# captured original upload name (#1391/#1401): when a publication's PDF was
# re-standardized (Author_Title_Venue), its previous on-disk basename was
# saved to ``original_pdf_filename``. A stale external link still points at
# that old name, so match it there and redirect to the current file. This
# resolves renamed-paper links exactly, rather than relying on the difflib
# similarity guess below.
requested_basename = os.path.basename(filename)
renamed = (Publication.objects
.filter(original_pdf_filename__iexact=requested_basename)
.exclude(pdf_file="")
.exclude(pdf_file__isnull=True)
.first())
if renamed is not None:
current_basename = os.path.basename(renamed.pdf_file.name)
_logger.debug(
f"{filename} matched original_pdf_filename of pub id={renamed.pk}; "
f"redirecting to /media/publications/{current_basename}"
)
return redirect(f'/media/publications/{current_basename}')

# Still no match — try fuzzy fallback for stale external links.
_logger.debug(f"{filename} not found exactly; trying fuzzy match")
closest = get_closest_filename_from_database(filename, 0.7)
if closest:
Expand Down
Loading